date = "2025-04-10"
)
options(scipen = 999, digits = 2)
# update with your own working directory/filepath here:
setwd("~/Desktop/Papers/TESS LGBT-Data/do/replication")
data <- read_dta("data/merged_LGBT_data_clean.dta")
# FIG 1 ******************************************
# minor data cleaning
data <- data %>%
mutate(
region_num = haven::as_factor(REGION4),
anotherrace = case_when(
aaa == 1  |
aian == 1 |
mena == 1 |
nhpi == 1 |
another == 1 ~ 1,
TRUE ~ 0
),
sexual_orientation_collapsed = case_when(
sexual_orientation %in% c(1, 2) ~ sexual_orientation,
sexual_orientation %in% c(3, 4) ~ 3
),
age_collapsed = case_when(
AGE4 %in% c(1, 2, 3) ~ AGE4,
AGE4 == 4 ~ 3
)
)
# data subset
data_amab <- data %>%  filter(sex_at_birth == 1)
data_afab <- data %>%  filter(sex_at_birth == 2)
# MODEL ESTIMATION
results <- list()
vars <- c("sex_at_birth", "region_num", "white", "black", "hispanic", "anotherrace", "sexual_orientation_collapsed",
"age_collapsed", "employment")
i <- 1
# full sample
for (var in vars) {
var_sym <- sym(var)
base <- data %>%
filter(!is.na(gender_writein_flag), !is.na(EXP_term_type), !is.na(!!var_sym))
levs <- base %>%
pull(!!var_sym) %>%
unique()
for (lvl in levs) {
d <- base %>% filter(!!var_sym == lvl)
fit <- glm(gender_writein_flag ~ EXP_term_type, data = d, family = binomial(link="logit"))
ame <- avg_comparisons(
fit,
variables = "EXP_term_type",
comparison = "dydx"
)
est <- ame$estimate
lb  <- ame$conf.low
ub  <- ame$conf.high
pval <- ame$p.value
results[[i]] <- data.frame(
group   = as.character(lvl),
subgroup= var,
term    = names(coef(fit))[2],
est      = est,
lb  = lb,
ub = ub,
pval    = pval,
pull    = "Full Sample",
stringsAsFactors = FALSE
)
i <- i + 1
}
}
# Repeat for AFAB only (sex_at_birth == 2)
for (var in vars[-1]) {
var_sym <- sym(var)
base <- data_afab %>%
filter(!is.na(gender_writein_flag), !is.na(EXP_term_type), !is.na(!!var_sym))
levs <- base %>%
pull(!!var_sym) %>%
unique()
for (lvl in levs) {
d <- base %>% filter(!!var_sym == lvl)
fit <- glm(gender_writein_flag ~ EXP_term_type, data = d, family = binomial(link="logit"))
ame <- avg_comparisons(
fit,
variables = "EXP_term_type",
comparison = "dydx"
)
est <- ame$estimate
lb  <- ame$conf.low
ub  <- ame$conf.high
pval <- ame$p.value
results[[i]] <- data.frame(
group   = as.character(lvl),
subgroup= var,
term    = names(coef(fit))[2],
est      = est,
lb  = lb,
ub = ub,
pval    = pval,
pull    = "AFAB",
stringsAsFactors = FALSE
)
i <- i + 1
}
}
# Repeat for AMAB only (sex_at_birth == 1)
for (var in vars[-1]) {
var_sym <- sym(var)
base <- data_amab %>%
filter(!is.na(gender_writein_flag), !is.na(EXP_term_type), !is.na(!!var_sym))
levs <- base %>%
pull(!!var_sym) %>%
unique()
for (lvl in levs) {
d <- base %>% filter(!!var_sym == lvl)
fit <- glm(gender_writein_flag ~ EXP_term_type, data = d, family = binomial(link="logit"))
ame <- avg_comparisons(
fit,
variables = "EXP_term_type",
comparison = "dydx"
)
est <- ame$estimate
lb  <- ame$conf.low
ub  <- ame$conf.high
pval <- ame$p.value
results[[i]] <- data.frame(
group   = as.character(lvl),
subgroup= var,
term    = names(coef(fit))[2],
est      = est,
lb  = lb,
ub = ub,
pval    = pval,
pull    = "AMAB",
stringsAsFactors = FALSE
)
i <- i + 1
}
}
# FIGURE CREATION
df <- do.call(rbind.data.frame, results)
df <- df %>%
filter(!(subgroup %in% c("white", "black", "hispanic", "anotherrace") & group == "0"))
df <- df %>%
mutate(label_vars = case_when(
group == "2" & subgroup == "sex_at_birth" ~ "AFAB",
group == "1" & subgroup == "sex_at_birth" ~ "AMAB",
group == "South" & subgroup == "region_num" ~ "South",
group == "Northeast" & subgroup == "region_num" ~ "Northeast",
group == "West" & subgroup == "region_num" ~ "West",
group == "Midwest" & subgroup == "region_num" ~ "Midwest",
group == "1" & subgroup == "white" ~ "White/European American",
group == "1" & subgroup == "hispanic" ~ "Hispanic/Latino",
group == "1" & subgroup == "black" ~ "Black/African American",
group == "1" & subgroup == "anotherrace" ~ "Another race/origin",
group == "1" & subgroup == "sexual_orientation_collapsed" ~ "Bisexual",
group == "2" & subgroup == "sexual_orientation_collapsed" ~ "Lesbian/Gay",
group == "3" & subgroup == "sexual_orientation_collapsed" ~ "Another Sexual Orientation",
group == "1" & subgroup == "age_collapsed" ~ "18-29",
group == "2" & subgroup == "age_collapsed" ~ "30-44",
group == "3" & subgroup == "age_collapsed" ~ "45+",
group == "1" & subgroup == "employment" ~ "Employed",
group == "2" & subgroup == "employment" ~ "Unemployed",
group == "3" & subgroup == "employment" ~ "Not in labor force",
TRUE ~ ""
))
df <- df %>%
mutate(section = case_when(
subgroup == "sex_at_birth" ~ "Sex",
subgroup == "region_num" ~ "Region",
subgroup == "white" ~ "Race/Origin",
subgroup == "hispanic" ~ "Race/Origin",
subgroup == "black" ~ "Race/Origin",
subgroup == "anotherrace" ~ "Race/Origin",
subgroup == "sexual_orientation_collapsed" ~ "Sexual Orientation",
subgroup == "employment" ~ "Employment",
subgroup == "age_collapsed" ~ "Age",
TRUE ~ ""
))
df$pull <- factor(df$pull, levels = (rev(c(
"Full Sample", "AFAB", "AMAB"
))))
df$section <- factor(df$section, levels = c(
"Sex", "Region", "Sexual Orientation", "Race/Origin", "Age", "Employment"
))
df$label_vars <- factor(df$label_vars, levels = rev(c(
"AFAB", "AMAB",
"South",  "West", "Midwest", "Northeast",
"Bisexual", "Lesbian/Gay", "Another Sexual Orientation",
"White/European American", "Hispanic/Latino", "Black/African American", "Another race/origin",
"18-29", "30-44", "45+",
"Employed", "Unemployed", "Not in labor force"
)))
df$significance <- ifelse(df$pval <= 0.05, TRUE, FALSE)
ggplot(df, aes(x = est, y = label_vars, color = pull)) +
geom_point(size = 2, position = position_dodgev(height = 0.6)) +
geom_errorbarh(aes(xmin = lb, xmax = ub), height = 0, position = position_dodgev(height = 0.6)) +
geom_vline(xintercept = 0, linewidth = 0.6) +
facet_grid(section ~ ., scales = "free_y", space = "free_y", switch = "y") +
scale_color_discrete(name = NULL, breaks = c("Full Sample","AFAB","AMAB")) +
scale_color_manual(
name = NULL,
breaks = c("Full Sample", "AFAB", "AMAB"),
values = c(
"Full Sample" = "black",
"AFAB" = "grey40",
"AMAB" = "grey70"
)
) +
scale_x_continuous(limits = c(-0.19, 0.19)) +
labs(x = "Average Marginal Effect (AME)", y = NULL) +
theme_minimal()
ggplot(df, aes(x = est, y = label_vars, color = pull, group = pull)) +
geom_point(size = 2, position = pd) +
geom_errorbarh(
aes(xmin = lb, xmax = ub),
height = 0,
position = pd
) +
geom_vline(xintercept = 0, linewidth = 0.6) +
facet_grid(
section ~ .,
scales = "free_y",
space = "free_y",
switch = "y"
) +
scale_color_manual(
name = NULL,
breaks = c("Full Sample", "AFAB", "AMAB"),
values = c(
"Full Sample" = "black",
"AFAB" = "grey40",
"AMAB" = "grey70"
)
) +
scale_x_continuous(limits = c(-0.19, 0.19)) +
labs(x = "Average Marginal Effect (AME)", y = NULL) +
theme_minimal()
pd <- position_dodge(width = 0.6)
ggplot(df, aes(x = est, y = label_vars, color = pull, group = pull)) +
geom_point(size = 2, position = pd) +
geom_errorbarh(
aes(xmin = lb, xmax = ub),
height = 0,
position = pd
) +
geom_vline(xintercept = 0, linewidth = 0.6) +
facet_grid(
section ~ .,
scales = "free_y",
space = "free_y",
switch = "y"
) +
scale_color_manual(
name = NULL,
breaks = c("Full Sample", "AFAB", "AMAB"),
values = c(
"Full Sample" = "black",
"AFAB" = "grey40",
"AMAB" = "grey70"
)
) +
scale_x_continuous(limits = c(-0.19, 0.19)) +
labs(x = "Average Marginal Effect (AME)", y = NULL) +
theme_minimal()
# SET UP ******************************************
# Install groundhog if not already installed
if (!requireNamespace("groundhog", quietly = TRUE)) {
install.packages("groundhog")
}
# File: 04 - Figure 1 replication
# Authors: 		Tessa Holtzman and Aliya Saperstein
# Date: 			February 4, 2026
#Project: 		Formatting the Two-Step Gender Measure: Experimental Insights from the United States
# Description: 	This file contains code to reproduce figure 1.
# SET UP ******************************************
# Install groundhog if not already installed
if (!requireNamespace("groundhog", quietly = TRUE)) {
install.packages("groundhog")
}
# Load groundhog
library(groundhog)
# Load specific package versions as of 2025-10-01
groundhog.library(c("tidyverse", "readr", "haven", "marginaleffects"),
date = "2025-04-10"
)
options(scipen = 999, digits = 2)
# update with your own working directory/filepath here:
setwd("~/Desktop/Papers/TESS LGBT-Data/do/replication")
data <- read_dta("data/merged_LGBT_data_clean.dta")
# FIG 1 ******************************************
# minor data cleaning
data <- data %>%
mutate(
region_num = haven::as_factor(REGION4),
anotherrace = case_when(
aaa == 1  |
aian == 1 |
mena == 1 |
nhpi == 1 |
another == 1 ~ 1,
TRUE ~ 0
),
sexual_orientation_collapsed = case_when(
sexual_orientation %in% c(1, 2) ~ sexual_orientation,
sexual_orientation %in% c(3, 4) ~ 3
),
age_collapsed = case_when(
AGE4 %in% c(1, 2, 3) ~ AGE4,
AGE4 == 4 ~ 3
)
)
# data subset
data_amab <- data %>%  filter(sex_at_birth == 1)
data_afab <- data %>%  filter(sex_at_birth == 2)
# MODEL ESTIMATION
results <- list()
vars <- c("sex_at_birth", "region_num", "white", "black", "hispanic", "anotherrace", "sexual_orientation_collapsed",
"age_collapsed", "employment")
i <- 1
# full sample
for (var in vars) {
var_sym <- sym(var)
base <- data %>%
filter(!is.na(gender_writein_flag), !is.na(EXP_term_type), !is.na(!!var_sym))
levs <- base %>%
pull(!!var_sym) %>%
unique()
for (lvl in levs) {
d <- base %>% filter(!!var_sym == lvl)
fit <- glm(gender_writein_flag ~ EXP_term_type, data = d, family = binomial(link="logit"))
ame <- avg_comparisons(
fit,
variables = "EXP_term_type",
comparison = "dydx"
)
est <- ame$estimate
lb  <- ame$conf.low
ub  <- ame$conf.high
pval <- ame$p.value
results[[i]] <- data.frame(
group   = as.character(lvl),
subgroup= var,
term    = names(coef(fit))[2],
est      = est,
lb  = lb,
ub = ub,
pval    = pval,
pull    = "Full Sample",
stringsAsFactors = FALSE
)
i <- i + 1
}
}
# Repeat for AFAB only (sex_at_birth == 2)
for (var in vars[-1]) {
var_sym <- sym(var)
base <- data_afab %>%
filter(!is.na(gender_writein_flag), !is.na(EXP_term_type), !is.na(!!var_sym))
levs <- base %>%
pull(!!var_sym) %>%
unique()
for (lvl in levs) {
d <- base %>% filter(!!var_sym == lvl)
fit <- glm(gender_writein_flag ~ EXP_term_type, data = d, family = binomial(link="logit"))
ame <- avg_comparisons(
fit,
variables = "EXP_term_type",
comparison = "dydx"
)
est <- ame$estimate
lb  <- ame$conf.low
ub  <- ame$conf.high
pval <- ame$p.value
results[[i]] <- data.frame(
group   = as.character(lvl),
subgroup= var,
term    = names(coef(fit))[2],
est      = est,
lb  = lb,
ub = ub,
pval    = pval,
pull    = "AFAB",
stringsAsFactors = FALSE
)
i <- i + 1
}
}
# Repeat for AMAB only (sex_at_birth == 1)
for (var in vars[-1]) {
var_sym <- sym(var)
base <- data_amab %>%
filter(!is.na(gender_writein_flag), !is.na(EXP_term_type), !is.na(!!var_sym))
levs <- base %>%
pull(!!var_sym) %>%
unique()
for (lvl in levs) {
d <- base %>% filter(!!var_sym == lvl)
fit <- glm(gender_writein_flag ~ EXP_term_type, data = d, family = binomial(link="logit"))
ame <- avg_comparisons(
fit,
variables = "EXP_term_type",
comparison = "dydx"
)
est <- ame$estimate
lb  <- ame$conf.low
ub  <- ame$conf.high
pval <- ame$p.value
results[[i]] <- data.frame(
group   = as.character(lvl),
subgroup= var,
term    = names(coef(fit))[2],
est      = est,
lb  = lb,
ub = ub,
pval    = pval,
pull    = "AMAB",
stringsAsFactors = FALSE
)
i <- i + 1
}
}
# FIGURE CREATION
df <- do.call(rbind.data.frame, results)
df <- df %>%
filter(!(subgroup %in% c("white", "black", "hispanic", "anotherrace") & group == "0"))
df <- df %>%
mutate(label_vars = case_when(
group == "2" & subgroup == "sex_at_birth" ~ "AFAB",
group == "1" & subgroup == "sex_at_birth" ~ "AMAB",
group == "South" & subgroup == "region_num" ~ "South",
group == "Northeast" & subgroup == "region_num" ~ "Northeast",
group == "West" & subgroup == "region_num" ~ "West",
group == "Midwest" & subgroup == "region_num" ~ "Midwest",
group == "1" & subgroup == "white" ~ "White/European American",
group == "1" & subgroup == "hispanic" ~ "Hispanic/Latino",
group == "1" & subgroup == "black" ~ "Black/African American",
group == "1" & subgroup == "anotherrace" ~ "Another race/origin",
group == "1" & subgroup == "sexual_orientation_collapsed" ~ "Bisexual",
group == "2" & subgroup == "sexual_orientation_collapsed" ~ "Lesbian/Gay",
group == "3" & subgroup == "sexual_orientation_collapsed" ~ "Another Sexual Orientation",
group == "1" & subgroup == "age_collapsed" ~ "18-29",
group == "2" & subgroup == "age_collapsed" ~ "30-44",
group == "3" & subgroup == "age_collapsed" ~ "45+",
group == "1" & subgroup == "employment" ~ "Employed",
group == "2" & subgroup == "employment" ~ "Unemployed",
group == "3" & subgroup == "employment" ~ "Not in labor force",
TRUE ~ ""
))
df <- df %>%
mutate(section = case_when(
subgroup == "sex_at_birth" ~ "Sex",
subgroup == "region_num" ~ "Region",
subgroup == "white" ~ "Race/Origin",
subgroup == "hispanic" ~ "Race/Origin",
subgroup == "black" ~ "Race/Origin",
subgroup == "anotherrace" ~ "Race/Origin",
subgroup == "sexual_orientation_collapsed" ~ "Sexual Orientation",
subgroup == "employment" ~ "Employment",
subgroup == "age_collapsed" ~ "Age",
TRUE ~ ""
))
df$pull <- factor(df$pull, levels = (rev(c(
"Full Sample", "AFAB", "AMAB"
))))
df$section <- factor(df$section, levels = c(
"Sex", "Region", "Sexual Orientation", "Race/Origin", "Age", "Employment"
))
df$label_vars <- factor(df$label_vars, levels = rev(c(
"AFAB", "AMAB",
"South",  "West", "Midwest", "Northeast",
"Bisexual", "Lesbian/Gay", "Another Sexual Orientation",
"White/European American", "Hispanic/Latino", "Black/African American", "Another race/origin",
"18-29", "30-44", "45+",
"Employed", "Unemployed", "Not in labor force"
)))
df$significance <- ifelse(df$pval <= 0.05, TRUE, FALSE)
pd <- position_dodge(width = 0.6) # stagger estimates
ggplot(df, aes(x = est, y = label_vars, color = pull, group = pull)) +
geom_point(size = 2, position = pd) +
geom_errorbarh(
aes(xmin = lb, xmax = ub),
height = 0,
position = pd
) +
geom_vline(xintercept = 0, linewidth = 0.6) +
facet_grid(
section ~ .,
scales = "free_y",
space = "free_y",
switch = "y"
) +
scale_color_manual(
name = NULL,
breaks = c("Full Sample", "AFAB", "AMAB"),
values = c(
"Full Sample" = "black",
"AFAB" = "grey40",
"AMAB" = "grey70"
)
) +
scale_x_continuous(limits = c(-0.19, 0.19)) +
labs(x = "Average Marginal Effect (AME)", y = NULL) +
theme_minimal()
ggsave(
filename = "output/ame_plot.png",
plot = last_plot(),
width = 10,
height = 10,
units = "in",
dpi = 300,
bg = "white"
)
