November, 2020
Replication code for empirical example in the Appendix of “How do populations aggregate?” by Dennis M. Feehan and Elizabeth Wrigley-Field
NOTE: in order to run this code, you will have to download the life tables from the United States Mortality Database. This requires that you register (which is free).
The file you need is the bundled archive called lifetables.zip.
This analysis is based on the version of lifetables.zip
that was downloaded from the USMD website on March 21, 2019.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2 ✓ purrr 0.3.4
✓ tibble 3.0.4 ✓ dplyr 1.0.2
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.0
── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(HMDHFDplus)
library(patchwork)
library(here)
here() starts at /Users/dennis/Dropbox/aggregation/aggregation-code-release
Create directories for data and output
dir.create(here("raw-data"), showWarnings=FALSE)
dir.create(here("out"), showWarnings=FALSE)
# this should be whichever directory has the raw USMD data
usmd.data.dir <- here("raw-data")
# this is where we'll save outputs
out.dir <- here("out")
This chunk will ask you for your US Mortality Database username and password. Then it will download the lifetables for you.
usmd_uname <- readline("Your username:")
feehan@berkeley.edu
usmd_pwd <- readline("Your password:")
trixie
httr::GET(url = "https://usa.mortality.org/uploads/lifetables/lifetables.zip",
httr::authenticate(password=usmd_pwd, user=usmd_uname),
httr::write_disk(file.path(usmd.data.dir, 'lifetables.zip'),
overwrite = TRUE))
Response [https://usa.mortality.org/uploads/lifetables/lifetables.zip]
Date: 2020-12-16 18:35
Status: 200
Content-Type: application/zip
Size: 93.5 MB
<ON DISK> /Users/dennis/Dropbox/aggregation/aggregation-code-release/raw-data/lifetables.zip
Load the data
zipped_csv_names <- grep('\\.csv$', unzip(file.path(usmd.data.dir, 'lifetables.zip'), list=TRUE)$Name,
ignore.case=TRUE, value=TRUE)
state_codes <- c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL",
"GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD",
"ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ",
"NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN",
"TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")
# quick function to get the life table for a state
get_lt <- function(code, type='b', fmt="1x1", zipfn = file.path('raw-data', 'lifetables.zip')) {
fullpath <- glue::glue("lifetables/States/{code}/{code}_{type}ltper_{fmt}.csv")
this_lt <- read_csv(unz(zipfn, fullpath),
col_types='ccdcdddddddd') %>%
mutate(Age = HMDHFDplus::age2int(Age))
return(this_lt)
}
# read all the lifetables in
lts_both <- setNames(map(state_codes, ~get_lt(.x, 'b')), state_codes)
lts_male <- setNames(map(state_codes, ~get_lt(.x, 'm')), state_codes)
lts_female <- setNames(map(state_codes, ~get_lt(.x, 'f')), state_codes)
lts <- bind_rows(bind_rows(lts_both), bind_rows(lts_male), bind_rows(lts_female))
# save this to avoid having to re-load the data all the time
# (this is not necessary for replication, so commenting it out)
#write_csv(lts, file=file.path(out.dir, 'usmd_lts.csv'))
Number of records
lts %>% filter(Year == 2015) %>% nrow()
[1] 16983
```r
# quick function to get the life table for a state
get_lt <- function(code, type='b', fmt=\1x1\, zipfn = file.path('raw-data', 'lifetables.zip')) {
fullpath <- glue::glue(\lifetables/States/{code}/{code}_{type}ltper_{fmt}.csv\)
this_lt <- read_csv(unz(zipfn, fullpath),
col_types='ccdcdddddddd') %>%
mutate(Age = HMDHFDplus::age2int(Age))
return(this_lt)
}
# read all the lifetables in
lts_both <- setNames(map(state_codes, ~get_lt(.x, 'b')), state_codes)
lts_male <- setNames(map(state_codes, ~get_lt(.x, 'm')), state_codes)
lts_female <- setNames(map(state_codes, ~get_lt(.x, 'f')), state_codes)
lts <- bind_rows(bind_rows(lts_both), bind_rows(lts_male), bind_rows(lts_female))
# save this to avoid having to re-load the data all the time
# (this is not necessary for replication, so commenting it out)
#write_csv(lts, file=file.path(out.dir, 'usmd_lts.csv'))
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
In these life tables, mx is not exactly equal to dx / Lx. We'll add a column that is.
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxubHRzIDwtIGx0cyAlPiUgbXV0YXRlKGR4Lm92ZXIuTHggPSBkeC9MeClcbmBgYCJ9 -->
```r
lts <- lts %>% mutate(dx.over.Lx = dx/Lx)
Go through and calculate reldiff by age and sex when aggregation is incorrect
reldiffs <- lts %>%
filter(Year == 2015) %>%
split(list(.$Sex, .$Age)) %>%
map_df(function(asdat) {
## only look at states with at least 1 death
asdat <- asdat %>% filter(dx > 0)
direct.agg.mx <- sum(asdat$dx) / sum(asdat$Lx)
# harmonic mean of rates, weighted by deaths (correct)
hm.mx.d <- hmean(asdat$dx.over.Lx, asdat$dx)
# arithmetic mean of rates, weighted by nLx (correct)
am.mx.L <- weighted.mean(asdat$dx.over.Lx, asdat$Lx)
# arithmetic mean of rates, weighted by deaths (incorrect)
am.mx <- weighted.mean(asdat$dx.over.Lx, asdat$dx)
# arithmetic mean of rates, unweighted (incorrect)
uam.mx <- mean(asdat$dx.over.Lx)
# arithmetic mean of death rates, weighted by exposure, squared
am.m.d2 <- weighted.mean(asdat$dx.over.Lx, asdat$Lx)^2
# arithmetic mean of squared death rates, weighted by exposure
am.m2.d <- weighted.mean(asdat$dx.over.Lx^2, asdat$Lx)
# appropriately weighted cv2.mx
w.cv2.mx <- (am.m2.d/am.m.d2) - 1
# the weighted cv (not squared)
w.cv.mx <- sqrt(w.cv2.mx)
# diff in arithmetic mean weighted by deaths, relative to true aggregate value
reldiff <- (am.mx - direct.agg.mx) / direct.agg.mx
# diff in unweighted arithmetic mean, relative to true aggregate value
reldiff2 <- (uam.mx - direct.agg.mx) / direct.agg.mx
return(tibble(age=asdat$Age[1],
sex=asdat$Sex[1],
n=nrow(asdat),
direct=direct.agg.mx,
# correct harmonic mean
hm=hm.mx.d,
# arithmetic mean with wrong weights
am=am.mx,
# arithmetic mean with no weights
uam=uam.mx,
reldiff.am = reldiff,
reldiff.uam = reldiff2,
w.cv.mx = w.cv.mx,
w.cv2.mx = w.cv2.mx))
}) %>%
mutate(sex = dplyr::recode(sex,
'b'='Both sexes',
'm'='Males',
'f'='Females'))
fig.h <- 5
fig.w <- 8
am.reldiffs <- ggplot(reldiffs) +
geom_point(aes(x=age, y=reldiff.am, color=sex)) +
expand_limits(y=c(0,.6)) +
theme_minimal() +
ylab("Relative error") +
ggtitle("Relative error\nusing arithmetic mean\nweighted by deaths")
#ggsave(plot=am.reldiffs, filename='am-reldiffs.pdf', height=fig.h, width=fig.w)
am.reldiffs
uam.reldiffs <- ggplot(reldiffs) +
geom_point(aes(x=age, y=reldiff.uam, color=sex)) +
expand_limits(y=c(0,.6)) +
theme_minimal() +
ylab("Relative error") +
ggtitle("Relative error\nusing arithmetic mean\nunweighted")
#ggsave(plot=uam.reldiffs, filename='uam-reldiffs.pdf', height=fig.h, width=fig.w)
uam.reldiffs
fig.h <- 5
fig.w <- 8
relerr.plot <- (uam.reldiffs | am.reldiffs) + plot_annotation(tag_levels = 'A')
ggsave(plot=relerr.plot, filename=file.path(out.dir, 'relerrs.pdf'), height=fig.h, width=fig.w)
Plot the actual aggregate death rates
agg.pseudous <- ggplot(reldiffs) +
geom_point(aes(x=age, y=direct, color=sex)) +
theme_minimal() +
scale_y_log10() +
ylab("nMx") +
ggtitle("True aggregate\nage-specific death rate\n(logged)")
#ggsave(plot=agg.pseudous, filename='agg-pseudo-us.pdf', height=fig.h, width=fig.w)
agg.pseudous
Plot the variance in means
cv.pseudous <- ggplot(reldiffs) +
geom_point(aes(x=age, y=w.cv.mx, color=sex)) +
theme_minimal() +
ylab("CV") +
ggtitle("Coefficient of variation in\ndeath rates\nacross sub-populations")
cv.pseudous
```r
cv.pseudous <- ggplot(reldiffs) +
geom_point(aes(x=age, y=w.cv.mx, color=sex)) +
theme_minimal() +
ylab(\CV\) +
ggtitle(\Coefficient of variation in\ndeath rates\nacross sub-populations\)
cv.pseudous
<!-- rnb-source-end -->
<!-- rnb-plot-begin -->
<img src="" />
<!-- rnb-plot-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
Directly compare rel-sd and error in second strategy
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuY3YyLnJlbGVyci5wbG90IDwtIGdncGxvdChyZWxkaWZmcykgK1xuICAjZ2VvbV9wb2ludChhZXMoeD13LmN2Mi5teCwgeT1yZWxkaWZmLmFtLCBjb2xvcj1sb2coZGlyZWN0KSkpICtcbiAgZ2VvbV9hYmxpbmUoaW50ZXJjZXB0PTAsIHNsb3BlPTEsIGNvbG9yPSdncmV5JykgK1xuICBnZW9tX3BvaW50KGFlcyh4PXcuY3YyLm14LCB5PXJlbGRpZmYuYW0pKSArXG4gICN4bGltKDAsLjEpICsgeWxpbSgwLC4xKSArXG4gIHhsYWIoXCJTcXVhcmVkIGNvZWZmaWNpZW50IG9mIHZhcmlhdGlvbiBpbiBkZWF0aCByYXRlc1wiKSArXG4gIHlsYWIoXCJSZWxhdGl2ZSBlcnJvclxcbnVzaW5nIGFyaXRobWV0aWMgbWVhblxcbndlaWdodGVkIGJ5IGRlYXRoc1wiKSArXG4gIHRoZW1lX21pbmltYWwoKSArXG4gIGNvb3JkX2VxdWFsKClcblxuY3YyLnJlbGVyci5wbG90XG5gYGAifQ== -->
```r
cv2.relerr.plot <- ggplot(reldiffs) +
#geom_point(aes(x=w.cv2.mx, y=reldiff.am, color=log(direct))) +
geom_abline(intercept=0, slope=1, color='grey') +
geom_point(aes(x=w.cv2.mx, y=reldiff.am)) +
#xlim(0,.1) + ylim(0,.1) +
xlab("Squared coefficient of variation in death rates") +
ylab("Relative error\nusing arithmetic mean\nweighted by deaths") +
theme_minimal() +
coord_equal()
cv2.relerr.plot
fig.h <- 5
fig.w <- 5
ggsave(plot=cv2.relerr.plot,
filename=file.path(out.dir, 'cv2-relerr.pdf'), height=fig.h, width=fig.w)