library(poweRlaw)
library(fitdistrplus)
library(Pareto)
library(ggplot2)
library(dplyr)
library(tidyr)
# Read in data from excel
all_data = readxl::read_excel("all-epidemics.xlsx", n_max = 540) |>
dplyr::select(!starts_with("...")) |>
bind_rows(tribble(
~Location, ~`Start Year`, ~`End Year`, ~`# deaths (thousands)`, ~`World Population (thousands)`, ~Disease,
"HIV/AIDS", 1981, 2022, 40e3, 4.61e6, "AIDS", # Estimated deaths to end 2022
"COVID-19", 2020, 2022, 20e3, 7.84e6, "Coronavirus", # Excess deaths to end 2022
)) |>
mutate(
start = `Start Year`,
length = `End Year` - `Start Year` + 1,
intensity = `Intensity (deaths per mil/year)` / 1000,
intensity2 = `# deaths (thousands)` / (`World Population (thousands)` * length),
prop_dead = `# deaths (thousands)` / `World Population (thousands)`,
)
glimpse(all_data)
## Rows: 541
## Columns: 15
## $ Location <chr> "China, Kwangsi", "China, Shansi", …
## $ `Start Year` <dbl> 1500, 1504, 1506, 1507, 1510, 1511,…
## $ `End Year` <dbl> 1500, 1504, 1506, 1541, 1510, 1511,…
## $ `# deaths (thousands)` <dbl> -999.0, -999.0, -999.0, 300.0, -999…
## $ `World Population (thousands)` <dbl> 463230, 463230, 463230, 463230, 463…
## $ `Relative Epidemic size (per mil)` <dbl> -2.156596075, -2.156596075, -2.1565…
## $ `Intensity (deaths per mil/year)` <dbl> -2.156596075, -2.156596075, -2.1565…
## $ Disease <chr> "Unknown", "Unknown", "Unknown", "S…
## $ References <chr> "McNeill, 1998", "McNeill, 1998", "…
## $ Notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start <dbl> 1500, 1504, 1506, 1507, 1510, 1511,…
## $ length <dbl> 1, 1, 1, 35, 1, 1, 1, 1, 1, 1, 7, 2…
## $ intensity <dbl> -2.156596e-03, -2.156596e-03, -2.15…
## $ intensity2 <dbl> -2.156596e-03, -2.156596e-03, -2.15…
## $ prop_dead <dbl> -2.156596e-03, -2.156596e-03, -2.15…
A total of 541 epidemics are recorded in the dataset. 395 started between 1600 and 1944.
This could really do with someone else looking over it.
classify_transmission = function(disease) {
disease = case_when(
stringr::str_starts(disease, "Typhus") ~ "Typhus",
stringr::str_starts(disease, "Smallpox") ~ "Smallpox",
.default = disease
)
case_match(
disease,
c("Tuberculosis", "Encephalitis", "Smallpox", "Influenza", "Coronavirus", "Measles", "Pneumonia", "Whooping Cough", "Diphtheria", "Meningitis", "Rubella", "Scarlet Fever") ~ "Respiratory",
c("Dengue", "Plague", "Typhus", "Yellow Fever", "Sleeping Sickness", "Visceral Leishmaniasis", "Relapsing fever") ~ "Vector",
c("Polio", "Cholera", "Dysentery", "Cocolitzli", "Typhoid") ~ "Fecal-oral",
c("AIDS", "Ebola", "Hemorragic Fever") ~ "Bodily fluids",
)
}
# fit dataset with a poisson regression model
# predict number of pandemics per year
# use a spline with 5 degrees of freedom
data_for_fit = all_data |>
count(start) |>
complete(start = 1500:2019, fill = list(n = 0))
fit = glm(n ~ splines::ns(start, df = 5), data = data_for_fit, family = "poisson")
# plot fit and data with prediction interval
ggplot(data_for_fit, aes(start, n)) +
geom_line(aes(y = predict(fit, data_for_fit, type = "response")), colour = "red") +
geom_point() +
geom_ribbon(
aes(ymin = predict(fit, data_for_fit, type = "response", se.fit = TRUE)$fit - 1.96 * predict(fit, data_for_fit, type = "response", se.fit = TRUE)$se.fit,
ymax = predict(fit, data_for_fit, type = "response", se.fit = TRUE)$fit + 1.96 * predict(fit, data_for_fit, type = "response", se.fit = TRUE)$se.fit),
fill = "grey", alpha = 0.5
) #+
scale_y_log10()
all_data |>
mutate(
decade = floor(start / 10) * 10,
) |>
count(decade) |>
filter(decade <= 1950) |>
ggplot(aes(decade, n)) +
geom_line() +
geom_point() +
# best fit line with poisson regression
geom_smooth(
method = "glm",
method.args = list(family = "poisson")
)
thresholds = 10^c(-5,-6,-8,-15)
all_data |>
mutate(
decade = floor(start / 10) * 10,
`# deaths (thousands)` = pmax(`# deaths (thousands)`, 1e-3),
prop_dead = `# deaths (thousands)` / `World Population (thousands)`,
) |>
reframe(
threshold = thresholds,
count = purrr::map_dbl(thresholds, ~sum(prop_dead >= .x)),
.by = decade
) |>
# put 0 for threshold / decade combination if missing
complete(decade = seq(from = 1500, to = 2019, by = 10), threshold, fill = list(count = 0)) |>
mutate(threshold = as.factor(threshold)) |>
ggplot(aes(x = decade, y = count, colour = threshold)) +
geom_point() +
geom_line()
all_data |>
mutate(
decade = floor(start / 10) * 10,
) |>
count(decade) |>
filter(decade <= 1950) |>
ggplot(aes(decade, n)) +
geom_line() +
geom_point() +
# best fit line with poisson regression
geom_smooth(method = "glm", method.args = list(family = "poisson"))
## `geom_smooth()` using formula = 'y ~ x'
all_data |>
# filter(prop_dead > 1e-6, start >= 1600) |>
mutate(transmission = classify_transmission(Disease)) |>
count(decade = floor(start / 10) * 10, transmission) |>
filter(decade < 1950) |>
# filter(is.na(transmission)) |>
# count(Disease)
# pivot_wider(names_from = transmission, values_from = n) |>
# print(n=Inf)
complete(decade = seq(from = 1500, to = 1945, by = 10), transmission, fill = list(n = 0)) |>
ggplot(aes(x = decade, y = n, colour = transmission)) +
geom_point() +
geom_line() +
# poisson regression
geom_smooth(method = "glm", method.args = list(family = "poisson")) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Threshold at 1 in 100,000 of the population killed. 80,000 deaths today or 10,000 in 1800.
all_data |>
filter(prop_dead > 1e-5) |>
mutate(transmission = classify_transmission(Disease)) |>
count(decade = floor(start / 20) * 20, transmission) |>
filter(decade < 1950) |>
# filter(is.na(transmission)) |>
# count(Disease)
# pivot_wider(names_from = transmission, values_from = n) |>
# print(n=Inf)
complete(decade = seq(from = 1500, to = 1945, by = 20), transmission, fill = list(n = 0)) |>
ggplot(aes(x = decade, y = n, colour = transmission)) +
geom_point() +
geom_line() +
# poisson regression
# geom_smooth(method = "glm", method.args = list(family = "quasipoisson")) +
theme_minimal()
Big thing is vector born diseases around 1800. Lets look at those.
all_data |>
mutate(transmission = classify_transmission(Disease)) |>
filter(
prop_dead > 1e-5,
transmission == "Vector",
between(start, 1780, 1840),
) |>
DT::datatable()
all_data |>
mutate(transmission = classify_transmission(Disease)) |>
count(decade = floor(start / 10) * 10, transmission) |>
filter(decade >= 1900) |>
complete(decade = seq(from = 1900, to = 2020, by = 10), transmission, fill = list(n = 0)) |>
ggplot(aes(x = decade, y = n, colour = transmission)) +
geom_point() +
geom_line() +
# poisson regression
# geom_smooth(method = "glm", method.args = list(family = "poisson")) +
theme_minimal()