Dataset

library(poweRlaw)
library(fitdistrplus)
library(Pareto)
library(ggplot2)
library(dplyr)
library(tidyr)

# Read in data from excel
all_data = readxl::read_excel("all-epidemics.xlsx", n_max = 540) |>
    dplyr::select(!starts_with("...")) |>
    bind_rows(tribble(
        ~Location, ~`Start Year`, ~`End Year`, ~`# deaths (thousands)`, ~`World Population (thousands)`, ~Disease,
        "HIV/AIDS", 1981, 2022, 40e3, 4.61e6, "AIDS", # Estimated deaths to end 2022
        "COVID-19", 2020, 2022, 20e3, 7.84e6, "Coronavirus", # Excess deaths to end 2022
    )) |>
    mutate(
        start = `Start Year`,
        length = `End Year` - `Start Year` + 1,
        intensity = `Intensity (deaths per mil/year)` / 1000,
        intensity2 = `# deaths (thousands)` / (`World Population (thousands)` * length),
        prop_dead = `# deaths (thousands)` / `World Population (thousands)`,
    )
glimpse(all_data)
## Rows: 541
## Columns: 15
## $ Location                           <chr> "China, Kwangsi", "China, Shansi", …
## $ `Start Year`                       <dbl> 1500, 1504, 1506, 1507, 1510, 1511,…
## $ `End Year`                         <dbl> 1500, 1504, 1506, 1541, 1510, 1511,…
## $ `# deaths (thousands)`             <dbl> -999.0, -999.0, -999.0, 300.0, -999…
## $ `World Population (thousands)`     <dbl> 463230, 463230, 463230, 463230, 463…
## $ `Relative Epidemic size (per mil)` <dbl> -2.156596075, -2.156596075, -2.1565…
## $ `Intensity (deaths per mil/year)`  <dbl> -2.156596075, -2.156596075, -2.1565…
## $ Disease                            <chr> "Unknown", "Unknown", "Unknown", "S…
## $ References                         <chr> "McNeill, 1998", "McNeill, 1998", "…
## $ Notes                              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start                              <dbl> 1500, 1504, 1506, 1507, 1510, 1511,…
## $ length                             <dbl> 1, 1, 1, 35, 1, 1, 1, 1, 1, 1, 7, 2…
## $ intensity                          <dbl> -2.156596e-03, -2.156596e-03, -2.15…
## $ intensity2                         <dbl> -2.156596e-03, -2.156596e-03, -2.15…
## $ prop_dead                          <dbl> -2.156596e-03, -2.156596e-03, -2.15…

A total of 541 epidemics are recorded in the dataset. 395 started between 1600 and 1944.

Notes on dataset construction

  • inclusion criteria
    • Independence: epidemic cannot end in the same year that marks the start of a subsequent epidemic of the same disease, irrespective of their occurrence location.
      • Epidemics recorded in the literature that occurred in the same period of time were merged into a single epidemic (e.g., several plague events in Europe in the 17th and 18th centuries). This first independence condition ensures that the analyses only focus on epidemics associated with a new or reemerging pathogen after a previous epidemic has ended in the human population (e.g., due to the reemergence of zoonoses from a natural reservoir).
    • only non-active epidemic included
    • if ended by introducing vaccines or treatments then excluded
  • possible European bias
    • the worst pandemics listed seem to be quite heavily European, with Asia underrepresented in particular
    • but also better data collection in Europe
    • and some Euro-centric source (specifically V. Harding, “The Dead and the Living in Paris and London, 1500–1670”)

Classifying diseases

This could really do with someone else looking over it.

classify_transmission = function(disease) {
    disease = case_when(
        stringr::str_starts(disease, "Typhus") ~ "Typhus",
        stringr::str_starts(disease, "Smallpox") ~ "Smallpox",
        .default = disease
    )
    case_match(
        disease,
        c("Tuberculosis", "Encephalitis", "Smallpox", "Influenza", "Coronavirus", "Measles", "Pneumonia", "Whooping Cough", "Diphtheria", "Meningitis", "Rubella", "Scarlet Fever") ~ "Respiratory",
        c("Dengue", "Plague", "Typhus", "Yellow Fever", "Sleeping Sickness", "Visceral Leishmaniasis", "Relapsing fever") ~ "Vector",
        c("Polio", "Cholera", "Dysentery", "Cocolitzli", "Typhoid") ~ "Fecal-oral",
        c("AIDS", "Ebola", "Hemorragic Fever") ~ "Bodily fluids",
    )
}

Data exploration

Failed non-linear Fitting

# fit dataset with a poisson regression model
# predict number of pandemics per year
# use a spline with 5 degrees of freedom
data_for_fit = all_data |>
    count(start) |>
    complete(start = 1500:2019, fill = list(n = 0))
fit = glm(n ~ splines::ns(start, df = 5), data = data_for_fit, family = "poisson")

# plot fit and data with prediction interval
ggplot(data_for_fit, aes(start, n)) +
    geom_line(aes(y = predict(fit, data_for_fit, type = "response")), colour = "red") +
    geom_point() +
    geom_ribbon(
        aes(ymin = predict(fit, data_for_fit, type = "response", se.fit = TRUE)$fit - 1.96 * predict(fit, data_for_fit, type = "response", se.fit = TRUE)$se.fit,
            ymax = predict(fit, data_for_fit, type = "response", se.fit = TRUE)$fit + 1.96 * predict(fit, data_for_fit, type = "response", se.fit = TRUE)$se.fit),
        fill = "grey", alpha = 0.5
    ) #+
    scale_y_log10()

all_data |>
    mutate(
        decade = floor(start / 10) * 10,
    ) |>
    count(decade) |>
    filter(decade <= 1950) |>
    ggplot(aes(decade, n)) +
    geom_line() +
    geom_point() +
    # best fit line with poisson regression
    geom_smooth(
        method = "glm",
        method.args = list(family = "poisson")
    )

Overall trend, by choice of threshold

  • Below changes the min threshold for inclusion
  • Possible the least bad pandemics are being better recorded since 1900
thresholds = 10^c(-5,-6,-8,-15)
all_data |>
    mutate(
        decade = floor(start / 10) * 10,
        `# deaths (thousands)` = pmax(`# deaths (thousands)`, 1e-3),
        prop_dead = `# deaths (thousands)` / `World Population (thousands)`,
    ) |>
    reframe(
        threshold = thresholds,
        count = purrr::map_dbl(thresholds, ~sum(prop_dead >= .x)),
        .by = decade
    ) |>
    # put 0 for threshold / decade combination if missing
    complete(decade = seq(from = 1500, to = 2019, by = 10), threshold, fill = list(count = 0)) |>
    mutate(threshold = as.factor(threshold)) |>
    ggplot(aes(x = decade, y = count, colour = threshold)) +
    geom_point() +
    geom_line()

Growth in diseases

Looks exponential

  • Doubling time: ~190 years
  • Corresponds with general growth in many metrics
all_data |>
    mutate(
        decade = floor(start / 10) * 10,
    ) |>
    count(decade) |>
    filter(decade <= 1950) |>
    ggplot(aes(decade, n)) +
    geom_line() +
    geom_point() +
    # best fit line with poisson regression
    geom_smooth(method = "glm", method.args = list(family = "poisson"))
## `geom_smooth()` using formula = 'y ~ x'

Dominanted by respiratory

  • Caveat: early causes often unknown
  • Doubling time: ~100 years
all_data |>
    # filter(prop_dead > 1e-6, start >= 1600) |>
    mutate(transmission = classify_transmission(Disease)) |>
    count(decade = floor(start / 10) * 10, transmission) |>
    filter(decade < 1950) |>
    # filter(is.na(transmission)) |>
    # count(Disease)
    # pivot_wider(names_from = transmission, values_from = n) |>
    # print(n=Inf)
    complete(decade = seq(from = 1500, to = 1945, by = 10), transmission, fill = list(n = 0)) |>
    ggplot(aes(x = decade, y = n, colour = transmission)) +
    geom_point() +
    geom_line() +
    # poisson regression
    geom_smooth(method = "glm", method.args = list(family = "poisson")) +
    theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Only worst diseases

Threshold at 1 in 100,000 of the population killed. 80,000 deaths today or 10,000 in 1800.

all_data |>
    filter(prop_dead > 1e-5) |>
    mutate(transmission = classify_transmission(Disease)) |>
    count(decade = floor(start / 20) * 20, transmission) |>
    filter(decade < 1950) |>
    # filter(is.na(transmission)) |>
    # count(Disease)
    # pivot_wider(names_from = transmission, values_from = n) |>
    # print(n=Inf)
    complete(decade = seq(from = 1500, to = 1945, by = 20), transmission, fill = list(n = 0)) |>
    ggplot(aes(x = decade, y = n, colour = transmission)) +
    geom_point() +
    geom_line() +
    # poisson regression
    # geom_smooth(method = "glm", method.args = list(family = "quasipoisson")) +
    theme_minimal()

Big thing is vector born diseases around 1800. Lets look at those.

all_data |>
    mutate(transmission = classify_transmission(Disease)) |>
    filter(
        prop_dead > 1e-5,
        transmission == "Vector",
        between(start, 1780, 1840),
    ) |>
    DT::datatable()

Shrink in diseases

All diseases have decreased

all_data |>
    mutate(transmission = classify_transmission(Disease)) |>
    count(decade = floor(start / 10) * 10, transmission) |>
    filter(decade >= 1900) |>
    complete(decade = seq(from = 1900, to = 2020, by = 10), transmission, fill = list(n = 0)) |>
    ggplot(aes(x = decade, y = n, colour = transmission)) +
    geom_point() +
    geom_line() +
    # poisson regression
    # geom_smooth(method = "glm", method.args = list(family = "poisson")) +
    theme_minimal()