From 743f99866bc886ca159421cd4ab615c38a6fd884 Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Sun, 3 May 2026 20:45:25 -0400 Subject: [PATCH 1/4] add panache --- chapters/panache.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 chapters/panache.toml diff --git a/chapters/panache.toml b/chapters/panache.toml new file mode 100644 index 0000000..9c4b986 --- /dev/null +++ b/chapters/panache.toml @@ -0,0 +1,13 @@ +flavor = "quarto" + +[format] +wrap = "sentence" + +[linters] +r = "jarl" + +[formatters] +r = "air" + +[lint.rules] +missing-chunk-labels = false From 94ed0f3f6f136b2d2d28e6b8574d898bcc7785f8 Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Sun, 3 May 2026 20:49:03 -0400 Subject: [PATCH 2/4] run panache lint --fix and format --- chapters/00-setup.qmd | 1 + chapters/01-casual-to-causal.qmd | 94 ++-- chapters/02-whole-game.qmd | 183 ++++--- chapters/03-po-counterfactuals.qmd | 522 ++++++++++++++------ chapters/04-dags.qmd | 151 ++++-- chapters/05-not-just-a-stats-problem.qmd | 172 ++++--- chapters/06-stats-models-ci.qmd | 170 ++++--- chapters/07-prep-data.qmd | 116 +++-- chapters/08-propensity-scores.qmd | 324 +++++++++--- chapters/09-evaluating-ps.qmd | 186 ++++--- chapters/10-estimands.qmd | 307 ++++++++---- chapters/11-outcome-model.qmd | 74 +-- chapters/12-other-exposures.qmd | 33 +- chapters/13-g-comp.qmd | 150 ++++-- chapters/14-interaction.qmd | 126 +++-- chapters/15-missingness-and-measurement.qmd | 280 +++++++---- chapters/16-sensitivity.qmd | 356 +++++++++---- chapters/17-mediation.qmd | 1 + chapters/18-longitudinal.qmd | 1 + chapters/19-time-to-event.qmd | 1 + chapters/20-doubly-robust.qmd | 1 + chapters/21-machine-learning.qmd | 1 + chapters/22-iv-and-friends.qmd | 1 + chapters/23-diff-in-diff.qmd | 1 + chapters/24-evidence.qmd | 4 +- chapters/future/time_varying_gcomp.qmd | 117 +++-- 26 files changed, 2299 insertions(+), 1074 deletions(-) diff --git a/chapters/00-setup.qmd b/chapters/00-setup.qmd index 4e6f544..d770f97 100644 --- a/chapters/00-setup.qmd +++ b/chapters/00-setup.qmd @@ -1,5 +1,6 @@ ```{r} #| include: false + source(here::here("R/ggdag-mask.R")) source(here::here("R/setup.R")) library(tidyverse) diff --git a/chapters/01-casual-to-causal.qmd b/chapters/01-casual-to-causal.qmd index 2502021..d42d303 100644 --- a/chapters/01-casual-to-causal.qmd +++ b/chapters/01-casual-to-causal.qmd @@ -6,6 +6,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("complete") ``` @@ -30,11 +31,14 @@ However, most discussed how such a cause might be justified by previous research ```{r} #| label: "fig-word-ranking" -#| fig.cap: "Rankings of causal strength of root words used by researchers. Root words with more Strong rankings have stronger causal implications than those with many None or Weak rankings. Data from @haber_causal_language." -#| fig.height: 9 +#| fig-cap: "Rankings of causal strength of root words used by researchers. Root words with more Strong rankings have stronger causal implications than those with many None or Weak rankings. Data from @haber_causal_language." +#| fig-height: 9 #| echo: false -rankings <- read_csv(here::here("data/word_rankings.csv"), show_col_types = FALSE) |> +rankings <- read_csv( + here::here("data/word_rankings.csv"), + show_col_types = FALSE +) |> janitor::clean_names() lvls <- rankings |> @@ -53,7 +57,9 @@ rankings |> mutate(rank = n / sum(n)) |> ungroup() |> drop_na(rating) |> - mutate(rating = factor(rating, levels = c("None", "Weak", "Moderate", "Strong"))) |> + mutate( + rating = factor(rating, levels = c("None", "Weak", "Moderate", "Strong")) + ) |> ggplot(aes(x = rank, y = root_word, fill = rating)) + geom_col(position = position_fill(reverse = TRUE)) + scale_fill_viridis_d(direction = -1) + @@ -81,7 +87,7 @@ rankings |> Instead of clear questions with obvious assumptions and goals, we end up with "Schrödinger's causal inference": -> Our results suggest that "Schrödinger's causal inference," — where studies avoid stating (or even explicitly deny) an interest in estimating causal effects yet are otherwise embedded with causal intent, inference, implications, and recommendations — is common. +> Our results suggest that "Schrödinger's causal inference," --- where studies avoid stating (or even explicitly deny) an interest in estimating causal effects yet are otherwise embedded with causal intent, inference, implications, and recommendations --- is common. > > --- @haber_causal_language @@ -126,7 +132,9 @@ Since the coronavirus is similar to other respiratory diseases, we had many publ Descriptive statistics of cases by region were vital for deciding local policies and the strength of those policies. A great example of a more complex descriptive analysis during the pandemic was an [ongoing report by the Financial Times of expected deaths vs. observed deaths](https://www.ft.com/content/a2901ce8-5eb7-4633-b89c-cbdf5b386938) in various countries and regions[^01-casual-to-causal-1]. -While the calculation of expected deaths is slightly more sophisticated than most descriptive statistics, it provided a tremendous amount of information about current deaths without needing to untangle causal effects (e.g., were they due to COVID-19 directly? Inaccessible healthcare? Cardiovascular events post-COVID?). +While the calculation of expected deaths is slightly more sophisticated than most descriptive statistics, it provided a tremendous amount of information about current deaths without needing to untangle causal effects (e.g., were they due to COVID-19 directly? +Inaccessible healthcare? +Cardiovascular events post-COVID?). In this (simplified) recreation of their plot from July 2020, you can see the staggering effect of the pandemic's early months. [^01-casual-to-causal-1]: John Burn-Murdoch was responsible for many of these presentations and gave a [fascinating talk on the subject](https://cloud.rstudio.com/resources/rstudioglobal-2021/reporting-on-and-visualising-the-pandemic/). @@ -138,6 +146,7 @@ In this (simplified) recreation of their plot from July 2020, you can see the st #| fig-cap: "2020 excess deaths vs. historical expected deaths from any cause. Data from the Financial Times." #| echo: false #| message: false + ft_excess_deaths <- read_csv(here::here("data/ft_excess_deaths.csv")) |> mutate(year = factor(year)) @@ -190,9 +199,21 @@ ggplot( Here are some other great examples of descriptive analyses. -- **Deforestation around the world**. Our World in Data [@owidforestsanddeforestation] is a data journalism organization that produces thoughtful, usually descriptive reports on various topics. In this report, they present data visualizations of both absolute change in forest coverage (forest transitions) and relative change (deforestation or reforestation), using basic statistics and forestry theory to present helpful information about the state of forests over time. -- **The prevalence of chlamydial and gonococcal infections** [@Miller2004]. Measuring the prevalence of disease (how many people currently have a disease, usually expressed as a ratio per number of people) is helpful for basic public health (resources, prevention, education) and scientific understanding. In this study, the authors conducted a complex survey meant to be representative of all high schools in the United States (the target population); they used survey weights to address a variety of factors related to their question, then calculated prevalence ratios and other statistics. As we'll see, weights are helpful in causal inference for the same reason: targeting a particular population. That said, not all weighting techniques are causal in nature, and they were not here. -- **Estimating race and ethnicity-specific hysterectomy inequalities** [@Gartner2020]. Descriptive techniques also help us understand disparities in areas like economics and epidemiology. In this study, the authors asked: Does the risk of hysterectomy differ by racial or ethnic background? Although the analysis is stratified by a key variable, it's still descriptive. Another interesting aspect of this paper is the authors' work ensuring the research answered questions about the right target population. Their analysis combined several data sources to better estimate the true population prevalence (instead of the prevalence among those in hospitals, as commonly presented). They also adjusted for the prevalence of hysterectomy, e.g., they calculated the incidence (new case) rate only among those who could actually have a hysterectomy (e.g., they hadn't had one yet). +- **Deforestation around the world**. + Our World in Data [@owidforestsanddeforestation] is a data journalism organization that produces thoughtful, usually descriptive reports on various topics. + In this report, they present data visualizations of both absolute change in forest coverage (forest transitions) and relative change (deforestation or reforestation), using basic statistics and forestry theory to present helpful information about the state of forests over time. +- **The prevalence of chlamydial and gonococcal infections** [@Miller2004]. + Measuring the prevalence of disease (how many people currently have a disease, usually expressed as a ratio per number of people) is helpful for basic public health (resources, prevention, education) and scientific understanding. + In this study, the authors conducted a complex survey meant to be representative of all high schools in the United States (the target population); they used survey weights to address a variety of factors related to their question, then calculated prevalence ratios and other statistics. + As we'll see, weights are helpful in causal inference for the same reason: targeting a particular population. + That said, not all weighting techniques are causal in nature, and they were not here. +- **Estimating race and ethnicity-specific hysterectomy inequalities** [@Gartner2020]. + Descriptive techniques also help us understand disparities in areas like economics and epidemiology. + In this study, the authors asked: Does the risk of hysterectomy differ by racial or ethnic background? + Although the analysis is stratified by a key variable, it's still descriptive. + Another interesting aspect of this paper is the authors' work ensuring the research answered questions about the right target population. + Their analysis combined several data sources to better estimate the true population prevalence (instead of the prevalence among those in hospitals, as commonly presented). + They also adjusted for the prevalence of hysterectomy, e.g., they calculated the incidence (new case) rate only among those who could actually have a hysterectomy (e.g., they hadn't had one yet). #### Validity @@ -271,13 +292,14 @@ Without question, some of the predictive value of this model stems from the caus Here are other good examples from the predictive space: -- Some of the most exciting work in predictive modeling is in industry. - Netflix regularly shares details on their modeling success and novel strategies in their [research blog](https://research.netflix.com/). - They also recently published a paper reviewing their use of deep learning models for recommender systems (in this case, recommending shows and movies to users) [@steck2021]. - The authors explain their experimentation with models, the details of those models, and many of the challenges they faced, resulting in a practical guide on using such models. +- Some of the most exciting work in predictive modeling is in industry. + Netflix regularly shares details on their modeling success and novel strategies in their [research blog](https://research.netflix.com/). + They also recently published a paper reviewing their use of deep learning models for recommender systems (in this case, recommending shows and movies to users) [@steck2021]. + The authors explain their experimentation with models, the details of those models, and many of the challenges they faced, resulting in a practical guide on using such models. -- In early 2020, researchers experienced with predictive and prognostic modeling in health research published a review of models for diagnosis and prognosis of COVID-19 [@Wynants2020]. - This review is interesting not just for its breadth but also the astounding number of models that were rated as poor quality: "\[232\] models were rated at high or unclear risk of bias, mostly because of non-representative selection of control patients, exclusion of patients who had not experienced the event of interest by the end of the study, high risk of model overfitting, and unclear reporting." This research is also a [living review](https://www.covprecise.org/). +- In early 2020, researchers experienced with predictive and prognostic modeling in health research published a review of models for diagnosis and prognosis of COVID-19 [@Wynants2020]. + This review is interesting not just for its breadth but also the astounding number of models that were rated as poor quality: "\[232\] models were rated at high or unclear risk of bias, mostly because of non-representative selection of control patients, exclusion of patients who had not experienced the event of interest by the end of the study, high risk of model overfitting, and unclear reporting." + This research is also a [living review](https://www.covprecise.org/). #### Validity @@ -339,7 +361,7 @@ He was right: contaminated water was a mechanism for cholera transmission. Yet, he didn't have enough information to explain how: *Vibrio cholerae*, the bacteria responsible for cholera, wasn't identified until nearly thirty years later. ::: -#### Examples +### Examples We'll see many examples of causal inference in this book, but let's continue with an example related to COVID-19. As the pandemic continued and tools like vaccines and anti-viral treatments became available, policies like universal masking also began to change. @@ -354,19 +376,20 @@ The authors found that districts that continued masking saw a drastically lower Here are a few other interesting examples: -- Netflix regularly uses causal inference in their work. - In 2022, they published a [blog post summarizing some causal tasks](https://netflixtechblog.com/a-survey-of-causal-inference-applications-at-netflix-b62d25175e6f) they have engaged with. - One interesting example is localization. - Netflix, being worldwide, localizes content through subtitles and dubbing. - Randomized experiments were a bad idea because they meant withholding content from users, so researchers at Netflix used several approaches to understand the value of localization while addressing potential confounding. - One example is studying the impact of pandemic-related delays in dubbing. - Researchers used synthetic controls (@sec-did) to simulate the impact on viewership with and without these delays. - Presumably, the timing of the pandemic-related delays was unrelated to many factors that would typically be related to dubbing processes, thus reducing some of the potential confounding. - -- The Tuskegee Study is one of modern history's most infamous examples of medical abuse. - It is commonly pointed to as a source of distrust in the medical community from Black Americans. - Health economics researchers used a variation of difference-in-difference techniques to assess the effect of the Tuskegee Study on distrust and life expectancy in older Black men [@Alsan2018]. - The results are important and disturbing: "We find that the disclosure of the study in 1972 is correlated with increases in medical mistrust and mortality and decreases in both outpatient and inpatient physician interactions for older black men. Our estimates imply life expectancy at age 45 for black men fell by up to 1.5 years in response to the disclosure, accounting for approximately 35% of the 1980 life expectancy gap between black and white men and 25% of the gap between black men and women." +- Netflix regularly uses causal inference in their work. + In 2022, they published a [blog post summarizing some causal tasks](https://netflixtechblog.com/a-survey-of-causal-inference-applications-at-netflix-b62d25175e6f) they have engaged with. + One interesting example is localization. + Netflix, being worldwide, localizes content through subtitles and dubbing. + Randomized experiments were a bad idea because they meant withholding content from users, so researchers at Netflix used several approaches to understand the value of localization while addressing potential confounding. + One example is studying the impact of pandemic-related delays in dubbing. + Researchers used synthetic controls (@sec-did) to simulate the impact on viewership with and without these delays. + Presumably, the timing of the pandemic-related delays was unrelated to many factors that would typically be related to dubbing processes, thus reducing some of the potential confounding. + +- The Tuskegee Study is one of modern history's most infamous examples of medical abuse. + It is commonly pointed to as a source of distrust in the medical community from Black Americans. + Health economics researchers used a variation of difference-in-difference techniques to assess the effect of the Tuskegee Study on distrust and life expectancy in older Black men [@Alsan2018]. + The results are important and disturbing: "We find that the disclosure of the study in 1972 is correlated with increases in medical mistrust and mortality and decreases in both outpatient and inpatient physician interactions for older black men. + Our estimates imply life expectancy at age 45 for black men fell by up to 1.5 years in response to the disclosure, accounting for approximately 35% of the 1980 life expectancy gap between black and white men and 25% of the gap between black men and women." #### Validity @@ -446,6 +469,7 @@ We've pulled out the *cause*, the *effect*, the *subject* (for whom?), and the * #| fig-cap: "Example of diagraming a causal claim." #| fig-height: 2 #| label: fig-diagram-1 + knitr::include_graphics("../images/sentence-diagram-1.png") ``` @@ -459,11 +483,13 @@ The causal claim here could be that smoking causes lung cancer. #| label: fig-diagram-2 #| fig-height: 2 #| fig-cap: "Diagram of the causal claim \"smoking causes lung cancer\"." + knitr::include_graphics("../images/sentence-diagram-2.png") ``` Let's get more specific. -A study was published in *JAMA* (the Journal of the American Medical Association) in 2005 titled "Effect of Smoking Reduction on Lung Cancer Risk." This study concluded: "Among individuals who smoke 15 or more cigarettes per day, smoking reduction by 50% significantly reduces the risk of lung cancer". +A study was published in *JAMA* (the Journal of the American Medical Association) in 2005 titled "Effect of Smoking Reduction on Lung Cancer Risk." +This study concluded: "Among individuals who smoke 15 or more cigarettes per day, smoking reduction by 50% significantly reduces the risk of lung cancer". [@godtfredsen2005effect] The study describes the time frame studied as 5-10 years. Let's diagram this causal claim. Here, we assume that the eligibility criteria and the target population for the estimated causal effect are the same (individuals who smoke 15 or more cigarettes per day); this need not always be the case. @@ -473,16 +499,19 @@ In @sec-estimands, we will discuss other potential target populations. #| echo: false #| fig-cap: "Example diagram of a more specific causal claim based on results from @godtfredsen2005effect." #| label: fig-diagram-3 + knitr::include_graphics("../images/sentence-diagram-3.png") ``` -Translating this idea into asking good causal questions, we can map the following terms that you will see throughout this book to these diagrams: *exposure* (the cause), *outcome* (the effect), *eligibility criteria* (for whom?), *time zero* (when did the participants begin to be followed?), *target population*, (who can we estimate an outcome effect for?) and *follow-up period* (when?). +Translating this idea into asking good causal questions, we can map the following terms that you will see throughout this book to these diagrams: *exposure* (the cause), *outcome* (the effect), *eligibility criteria* (for whom?), *time zero* (when did the participants begin to be followed?), *target population*, (who can we estimate an outcome effect for?) +and *follow-up period* (when?). ```{r} #| echo: false #| label: fig-diagram-4 #| fig-height: 2 #| fig-cap: "Example diagram mapped to causal analysis terminology" + knitr::include_graphics("../images/sentence-diagram-4.png") ``` @@ -491,5 +520,4 @@ Let's return to the smoking example. Our initial question was: *Does smoking cause lung cancer?*; The evidence in the study shows that for people who smoke 15+ cigarettes a day, reducing smoking by 50% reduces the risk of lung cancer over 5-10 years. Does the answer match the question? Not quite. -Let's update our question to match what the study actually showed: *For people who smoke 15+ cigarettes a day, does reducing smoking by 50% reduce the lung cancer risk over 5-10 years?* -Honing this skill — asking answerable causal questions — is essential and one we will discuss throughout this book. +Let's update our question to match what the study actually showed: *For people who smoke 15+ cigarettes a day, does reducing smoking by 50% reduce the lung cancer risk over 5-10 years?* Honing this skill --- asking answerable causal questions --- is essential and one we will discuss throughout this book. diff --git a/chapters/02-whole-game.qmd b/chapters/02-whole-game.qmd index a595f63..0aae797 100644 --- a/chapters/02-whole-game.qmd +++ b/chapters/02-whole-game.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("complete") ``` @@ -11,12 +12,12 @@ status("complete") In this chapter, we'll analyze data using techniques we learn in this book. We'll play the [whole game](https://www.gse.harvard.edu/news/uk/09/01/education-bat-seven-principles-educators) of causal analysis using a few key steps: -1. Specify a causal question -2. Draw our assumptions using a causal diagram -3. Model our assumptions -4. Diagnose our models -5. Estimate the causal effect -6. Conduct sensitivity analysis on the effect estimate +1. Specify a causal question +2. Draw our assumptions using a causal diagram +3. Model our assumptions +4. Diagnose our models +5. Estimate the causal effect +6. Conduct sensitivity analysis on the effect estimate We'll focus on the broader ideas behind each step and what they look like all together; however, we don't expect you to fully digest each idea. We'll spend the rest of the book taking up each step in detail. @@ -56,42 +57,39 @@ The study's authors found that net uptake was similar between the groups and tha There are several reasons we might not be able to conduct a new randomized trial to estimate the effect of bed net use on malaria risk, including ethics, cost, and time. We have substantial, robust evidence in favor of bed net use, but let's consider some conditions where observational causal inference could help. -- Imagine we are at a time before trials on this subject, and let's say people have started to use bed nets for this purpose on their own. - Our goal may still be to conduct a randomized trial, but we can answer questions more quickly with observed data. - In addition, this study's results might guide trials' design or intermediary policy suggestions. +- Imagine we are at a time before trials on this subject, and let's say people have started to use bed nets for this purpose on their own. + Our goal may still be to conduct a randomized trial, but we can answer questions more quickly with observed data. + In addition, this study's results might guide trials' design or intermediary policy suggestions. -- Sometimes, it is also not ethical to conduct a trial. - An example of this in malaria research is a question that arose in the study of bed net effectiveness: does malaria control in early childhood result in delayed immunity to the disease, resulting in severe malaria or death later in life? - Since we now know bed net use is very effective, *withholding* nets would be unethical. - A recent observational study found that the benefits of bed net use in childhood on all-cause mortality persist into adulthood [@Fink2022]. +- Sometimes, it is also not ethical to conduct a trial. + An example of this in malaria research is a question that arose in the study of bed net effectiveness: does malaria control in early childhood result in delayed immunity to the disease, resulting in severe malaria or death later in life? + Since we now know bed net use is very effective, *withholding* nets would be unethical. + A recent observational study found that the benefits of bed net use in childhood on all-cause mortality persist into adulthood [@Fink2022]. -- We may also want to estimate a different effect or the effect for another population than in previous trials. - For example, both randomized and observational studies helped us better understand that insecticide-based nets improve malaria resistance in the entire community, not just among those who use nets, so long as net usage is high enough [@howard2000; @hawley2003]. +- We may also want to estimate a different effect or the effect for another population than in previous trials. + For example, both randomized and observational studies helped us better understand that insecticide-based nets improve malaria resistance in the entire community, not just among those who use nets, so long as net usage is high enough [@howard2000; @hawley2003]. As we'll see in @sec-strat-outcome and @sec-g-comp, the causal inference techniques that we'll discuss in this book are often beneficial even when we're able to randomize. When we conduct an observational study, it's still helpful to think through the randomized trial we would run were it possible. -The trial we're trying to emulate in this causal analysis is the **target trial**. Considering the target trial helps us make our causal question more precise. +The trial we're trying to emulate in this causal analysis is the **target trial**. +Considering the target trial helps us make our causal question more precise. We'll use this framework more explicitly in @sec-designs, but for now, let's consider the causal question posed earlier: does using a bed net (a mosquito net) reduce the risk of malaria? This question is relatively straightforward, but it is still vague. As we saw in @sec-causal-question, we need to clarify some key areas: -- **What do we mean by "bed net"?** - There are several types of nets: untreated bed nets, insecticide-treated bed nets, and newer long-lasting insecticide-treated bed nets. +- **What do we mean by "bed net"?** There are several types of nets: untreated bed nets, insecticide-treated bed nets, and newer long-lasting insecticide-treated bed nets. -- **Risk compared to what?** - Are we, for instance, comparing insecticide-treated bed nets to *no* net? - Untreated nets? - Or are we comparing a new type of net, like long-lasting insecticide-treated bed nets, to nets that are already in use? +- **Risk compared to what?** Are we, for instance, comparing insecticide-treated bed nets to *no* net? + Untreated nets? + Or are we comparing a new type of net, like long-lasting insecticide-treated bed nets, to nets that are already in use? -- **Risk as defined by what?** - Whether or not a person contracted malaria? - Whether a person died of malaria? +- **Risk as defined by what?** Whether or not a person contracted malaria? + Whether a person died of malaria? -- **Risk among whom?** - What population are we trying to apply this knowledge to? - Who is it practical to include in our study? - Who might we need to exclude? +- **Risk among whom?** What population are we trying to apply this knowledge to? + Who is it practical to include in our study? + Who might we need to exclude? We will use simulated data to answer a more specific question: Does using insecticide-treated bed nets compared to no nets decrease the risk of contracting malaria after 1 year? In this particular data, [simulated by Dr. Andrew Heiss](https://evalsp21.classes.andrewheiss.com/example/matching-ipw/#program-background): @@ -148,20 +146,21 @@ The distribution of malaria risk appears to be quite different by net usage. ```{r} #| label: fig-malaria-risk-density -#| fig.cap: > +#| fig-cap: > #| A density plot of malaria risk for those who did and did not use nets. #| The risk of malaria is lower for those who use nets. + library(tidyverse) library(causalworkshop) net_data |> ggplot(aes(malaria_risk, fill = net)) + - geom_density(color = NA, alpha = .8) + geom_density(color = NA, alpha = 0.8) ``` ```{r} #| echo = FALSE means <- net_data |> - group_by(net) |> + group_by(net) |> summarize(malaria_risk = mean(malaria_risk)) |> pull(malaria_risk) ``` @@ -171,7 +170,7 @@ The mean difference in malaria risk is about `r round(means[[1]] - means[[2]], d ```{r} net_data |> - group_by(net) |> + group_by(net) |> summarize(malaria_risk = mean(malaria_risk)) ``` @@ -197,8 +196,8 @@ Here's the DAG that we're proposing for this question. ```{r} #| label: fig-net-data-dag #| echo: false -#| fig.width: 7 -#| fig.cap: > +#| fig-width: 7 +#| fig-cap: > #| A proposed causal diagram of the effect of bed net use on malaria. #| This directed acyclic graph (DAG) states our assumption that bed net #| use causes a reduction in malaria risk. It also says that we assume: @@ -208,6 +207,7 @@ Here's the DAG that we're proposing for this question. #| people in a household; eligibility for the free net programs is impacted #| by income and the number of people in a household; and health is impacted #| by income. + library(ggdag, warn.conflicts = FALSE) library(ggokabeito) mosquito_dag <- dagify( @@ -272,10 +272,10 @@ In DAGs, each point represents a variable, and each arrow represents a cause. In other words, this diagram declares what we think the causal relationships are between these variables. In @fig-net-data-dag, we're saying that we believe: -- Malaria risk is causally impacted by net usage, income, health, temperature, and insecticide resistance. -- Net usage is causally impacted by income, health, temperature, eligibility for the free net program, and the number of people in a household. -- Eligibility for the free net programs is determined by income and the number of people in a household. -- Health is causally impacted by income. +- Malaria risk is causally impacted by net usage, income, health, temperature, and insecticide resistance. +- Net usage is causally impacted by income, health, temperature, eligibility for the free net program, and the number of people in a household. +- Eligibility for the free net programs is determined by income and the number of people in a household. +- Health is causally impacted by income. You may agree or disagree with some of these assertions. That's a good thing! @@ -297,13 +297,14 @@ The association between bed net use and malaria risk is a mixture of all of thes ```{r} #| label: fig-net-data-confounding #| echo: false -#| fig.width: 14 -#| fig.height: 10 -#| fig.cap: > +#| fig-width: 14 +#| fig-height: 10 +#| fig-cap: > #| In the proposed DAG, there are eight open pathways that contribute to the #| causal effect seen in the naive regression: the true effect (in green) of #| net usage on malaria risk and seven other confounding pathways (in orange). #| The naive estimate is wrong because it is a composite of all these effects. + glyph <- function(data, params, size) { data$shape <- 15 data$size <- 12 @@ -320,8 +321,18 @@ mosquito_dag |> ), effects = factor(effects, c("true effect", "confounding effect")) ) |> - ggplot(aes(x = x, y = y, xend = xend, yend = yend, color = effects, alpha = path)) + - geom_dag_edges(aes(edge_alpha = path, edge_colour = effects), show.legend = FALSE) + + ggplot(aes( + x = x, + y = y, + xend = xend, + yend = yend, + color = effects, + alpha = path + )) + + geom_dag_edges( + aes(edge_alpha = path, edge_colour = effects), + show.legend = FALSE + ) + geom_dag_point( data = function(.x) dplyr::filter(.x, is.na(path)), key_glyph = glyph @@ -347,13 +358,13 @@ mosquito_dag |> scale_alpha_manual( drop = FALSE, values = c("open path" = 1), - na.value = .5, + na.value = 0.5, breaks = "open path" ) + ggraph::scale_edge_alpha_manual( drop = FALSE, values = c("open path" = 1), - na.value = .5, + na.value = 0.5, breaks = "open path" ) + scale_color_okabe_ito( @@ -373,7 +384,9 @@ mosquito_dag |> When we calculate a naive linear regression that only includes net usage and malaria risk, the effect we see is incorrect because the seven other confounding pathways in @fig-net-data-confounding distort it. In DAG terminology, we need to *block* these open pathways that distort the causal estimate we're after. -(We can block paths through several techniques, including stratification, matching, weighting, and more. We'll see several methods throughout the book.) Luckily, by specifying a DAG, we can precisely determine the variables we need to control for. +(We can block paths through several techniques, including stratification, matching, weighting, and more. +We'll see several methods throughout the book.) +Luckily, by specifying a DAG, we can precisely determine the variables we need to control for. For this DAG, we need to control for three variables: `r glue::glue_collapse(as.list(dagitty::adjustmentSets(mosquito_dag))[[1]], sep = ", ", last = ", and ")`. These three variables are a *minimal adjustment set*, the minimum set (or sets) of variables you need to block all confounding pathways. We'll discuss adjustment sets further in @sec-dags. @@ -448,8 +461,9 @@ Here's the distribution of the propensity score by group, created by `geom_mirro ```{r} #| label: fig-mirror-histogram-net-data-unweighted -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the propensity scores of those who used nets (top, blue) versus those who did not use nets (bottom, orange). The range of propensity scores is similar between groups, with those who used nets slightly to the left of those who didn't, but the shapes of the distribution are different. + library(halfmoon) ggplot(net_data_wts, aes(.fitted)) + geom_mirror_histogram( @@ -464,8 +478,9 @@ The weighted propensity score creates a pseudo-population where the distribution ```{r} #| label: fig-mirror-histogram-net-data-weighted -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the propensity scores of those who used nets (top, blue) versus those who did not use nets (bottom, orange). The shaded region represents the unweighted distribution, and the lighter colored region represents the weighted distributions. The ATE weights up-weight the groups to be similar in range and shape of the distribution of propensity scores. + ggplot(net_data_wts, aes(.fitted)) + geom_mirror_histogram( aes(group = net), @@ -474,7 +489,7 @@ ggplot(net_data_wts, aes(.fitted)) + geom_mirror_histogram( aes(fill = net, weight = wts), bins = 50, - alpha = .5 + alpha = 0.5 ) + scale_y_continuous(labels = abs) + labs(x = "propensity score") @@ -497,8 +512,9 @@ We'll calculate the SMDs with `tidy_smd()` then plot them with `geom_love()`, bo ```{r} #| label: fig-love-plot-net-data -#| fig.cap: > +#| fig-cap: > #| A love plot representing the standardized mean differences (SMD) between exposure groups of three confounders: temperature, income, and health. Before weighting, there are considerable differences in the groups. After weighting, the confounders are much more balanced between groups. + plot_df <- tidy_smd( net_data_wts, c(income, health, temperature), @@ -527,8 +543,9 @@ We'll also discuss several other types of weights that are less prone to this is ```{r} #| label: fig-ate-density-net-data -#| fig.cap: > +#| fig-cap: > #| A density plot of the average treatment effect (ATE) weights. The plot is skewed, with higher values towards 8. This may indicate a problem with the model, but the weights aren't so extreme to destabilize the variance of the estimate. + net_data_wts |> ggplot(aes(wts)) + geom_density(fill = "#CC79A7", color = NA, alpha = 0.8) @@ -630,12 +647,12 @@ bootstrapped_net_data <- bootstraps( bootstrapped_net_data ``` -The result is a nested data frame: each `splits` object contains metadata that rsample uses to subset the bootstrap samples for each of the 1,000 samples. +The result is a nested data frame: each `splits` object contains metadata that rsample uses to subset the bootstrap samples for each of the 1,000 samples. We actually have 1,001 rows because `apparent = TRUE` keeps a copy of the original data frame, as well, which is needed for some types of confidence interval calculations. Next, we'll run `fit_ipw()` 1,001 times to create a distribution for `estimate`. At its heart, the calculation we're doing is -``` r +```r fit_ipw(bootstrapped_net_data$splits[[n]]) ``` @@ -663,18 +680,21 @@ Now we have a distribution of estimates: #| label: fig-bootstrap-estimates-net-data #| message: false #| warning: false -#| fig.cap: > +#| fig-cap: > #| "A histogram of 1,000 bootstrapped estimates of the effect of net use on malaria risk. The spread of these estimates accounts for the dependency and uncertainty in the use of IPW weights." + ipw_results |> # remove original data set results - filter(id != "Apparent") |> + filter(id != "Apparent") |> mutate( estimate = map_dbl( boot_fits, # pull the `estimate` for `netTRUE` for each fit - \(.fit) .fit |> - filter(term == "netTRUE") |> - pull(estimate) + \(.fit) { + .fit |> + filter(term == "netTRUE") |> + pull(estimate) + } ) ) |> ggplot(aes(estimate)) + @@ -720,15 +740,20 @@ We'll use tipr to calculate this answer for 5 scenarios, where the mean differen ```{r} #| echo: false + options(tipr.verbose = FALSE) ``` ```{r} #| label: fig-tip-coef-net -#| fig.cap: > +#| fig-cap: > #| A tipping point analysis under several confounding scenarios where the unmeasured confounder is a normally-distributed continuous variable. The line represents the strength of confounding necessary to tip the upper confidence interval of the causal effect estimate to 0. The x-axis represents the coefficient of the confounder-outcome relationship adjusted for the exposure and the set of measured confounders. The y-axis represents the scaled mean difference of the confounder between exposure groups. + library(tipr) -tipping_points <- tip_coef(boot_estimate$.upper, exposure_confounder_effect = 1:5) +tipping_points <- tip_coef( + boot_estimate$.upper, + exposure_confounder_effect = 1:5 +) tipping_points |> ggplot(aes(confounder_outcome_effect, exposure_confounder_effect)) + @@ -752,9 +777,9 @@ Let's say that in our simulated data, an unnamed ethnic group in the unnamed cou For historical reasons, bed net use in this group is also very high. We don't have this variable in `net_data`, but let's say we know from the literature that in this sample, we can estimate at: -1. People with this genetic resistance have, on average, a lower malaria risk by about 10. -2. About 26% of people who use nets in our study have this genetic resistance. -3. About 5% of people who don't use nets have this genetic resistance. +1. People with this genetic resistance have, on average, a lower malaria risk by about 10. +2. About 26% of people who use nets in our study have this genetic resistance. +3. About 5% of people who don't use nets have this genetic resistance. With this amount of information, we can use tipr to adjust the estimates we calculated for the unmeasured confounder. We'll use `adjust_coef_with_binary()` to calculate the adjusted estimates. @@ -780,11 +805,22 @@ The true effect of net use on malaria is about -10, and the true DAG that genera ```{r} #| label: fig-net-data-true-dag #| echo: false -#| fig.cap: > +#| fig-cap: > #| The true causal diagram for `net_data`. This DAG is identical to the one we proposed with one addition: genetic resistance to malaria causally reduces the risk of malaria and impacts net use. It's thus a confounder and a part of the minimal adjustment set required to get an unbiased effect estimate. In otherwords, by not including it, we've calculated the wrong effect. + mosquito_dag_full <- dagify( - malaria_risk ~ net + income + health + temperature + insecticide_resistance + genetic_resistance, - net ~ income + health + temperature + eligible + household + genetic_resistance, + malaria_risk ~ net + + income + + health + + temperature + + insecticide_resistance + + genetic_resistance, + net ~ income + + health + + temperature + + eligible + + household + + genetic_resistance, eligible ~ income + household, health ~ income, exposure = "net", @@ -843,6 +879,7 @@ mosquito_dag_full |> ```{r} #| include: false + fit_ipw_full <- function(.split, ...) { # get bootstrapped data frame .df <- as.data.frame(.split) @@ -893,12 +930,12 @@ We'll explore these techniques and others in @sec-sensitivity. To calculate this effect, we: -1. Specified a causal question (for the average treatment effect) -2. Drew our assumptions using a causal diagram (using DAGs) -3. Modeled our assumptions (using propensity score weighting) -4. Diagnosed our models (by checking confounder balance after weighting) -5. Estimated the causal effect (using inverse probability weighting) -6. Conducted sensitivity analysis on the effect estimate (using tipping point analysis) +1. Specified a causal question (for the average treatment effect) +2. Drew our assumptions using a causal diagram (using DAGs) +3. Modeled our assumptions (using propensity score weighting) +4. Diagnosed our models (by checking confounder balance after weighting) +5. Estimated the causal effect (using inverse probability weighting) +6. Conducted sensitivity analysis on the effect estimate (using tipping point analysis) Throughout the rest of the book, we'll follow these broad steps in examples from many domains. We'll dive more deeply into propensity score techniques, explore other methods for estimating causal effects, and, most importantly, make sure, over and over again, that the assumptions we're making are reasonable---even if we'll never know for sure. diff --git a/chapters/03-po-counterfactuals.qmd b/chapters/03-po-counterfactuals.qmd index ee8f985..e46447c 100644 --- a/chapters/03-po-counterfactuals.qmd +++ b/chapters/03-po-counterfactuals.qmd @@ -4,16 +4,12 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("complete") ``` -> | Two roads diverged in a yellow wood, -> | And sorry I could not travel both -> | And be one traveler, long I stood -> | And looked down one as far as I could -> | To where it bent in the undergrowth -> | --- Robert Frost +> \| Two roads diverged in a yellow wood, \| And sorry I could not travel both \| And be one traveler, long I stood \| And looked down one as far as I could \| To where it bent in the undergrowth \| --- Robert Frost In 2022, Ice-T, best known as an American rapper and Fin on *Law and Order: SVU*, co-authored a book titled *Split Decision: Life Stories* [@split]. In Split Decision, Ice-T recounts his dramatic journey from a life of crime to fame and success, contrasting it with the fate of his former crime partner and co-author, Spike. @@ -92,15 +88,15 @@ The outcome we're interested in is whether or not the individual goes to jail. Let's assume an exposure has two levels: -- $X=continue$ if you continue doing jewel heists +- $X=continue$ if you continue doing jewel heists -- $X=quit$ if you quit doing jewel heists +- $X=quit$ if you quit doing jewel heists Under this scenario, there are two potential outcomes: -- $Y(continue)$, the potential outcome if $X=continue$ +- $Y(continue)$, the potential outcome if $X=continue$ -- $Y(quit)$, the potential outcome if $X=quit$ +- $Y(quit)$, the potential outcome if $X=quit$ Only *one* of these potential outcomes will be realized, the factual one corresponding to the exposure that actually occurred. Therefore, only one potential outcome is observable for each individual. @@ -110,8 +106,8 @@ In the case of a binary exposure, this leaves one potential outcome as *observab Our causal effect of interest is often some difference in potential outcomes, such as $Y(continue) - Y(quit)$ (say, the difference in the probabilities of going to jail the following year if you continue heists vs. quitting heists). In the case of Ice-T and Spike, we're interested in their individual causal effects: -- $Y_{Ice-T}(continue) - Y_{Ice-T}(quit)$ -- $Y_{Spike}(continue) - Y_{Spike}(quit)$ +- $Y_{Ice-T}(continue) - Y_{Ice-T}(quit)$ +- $Y_{Spike}(continue) - Y_{Spike}(quit)$ Here, we're missing $Y_{Ice-T}(continue)$ and $Y_{Spike}(quit)$, so we can't calculate these values. In practice, we need to use observed data as proxies for the missing potential outcomes, and we usually average them over a particular population. @@ -153,6 +149,7 @@ data <- data |> #| label: tbl-po #| tbl-cap: "Potential Outcomes Simulation: The causal effect of eating chocolate (versus vanilla) ice cream on happiness. In this simulation, each individual has a known outcome for each exposure. Since we know each potential outcome, we can calculate the individual causal effects on happiness of eating chocolate versus vanilla ice cream." #| code-fold: true + library(gt) data |> gt() |> @@ -160,7 +157,9 @@ data |> id = "ID", y_chocolate = md("$$Y_{\\text{id}}(\\text{chocolate})$$"), y_vanilla = md("$$Y_{\\text{id}}(\\text{vanilla})$$"), - causal_effect = md("$$Y_{\\text{id}}(\\text{chocolate}) - Y_{\\text{id}}(\\text{vanilla})$$") + causal_effect = md( + "$$Y_{\\text{id}}(\\text{chocolate}) - Y_{\\text{id}}(\\text{vanilla})$$" + ) ) |> fmt_markdown( columns = c(y_chocolate, y_vanilla, causal_effect) @@ -194,7 +193,8 @@ data |> In reality, we cannot observe both potential outcomes at any given moment; each individual in our study can only eat one flavor of ice cream at the time the study is conducted[^03-po-counterfactuals-1]. Suppose we randomly gave one flavor or the other to each participant. -Now, what we *observe* is shown in @tbl-obs. We only know one potential outcome (the one related to the exposure the participant received). +Now, what we *observe* is shown in @tbl-obs. +We only know one potential outcome (the one related to the exposure the participant received). We don't know the other one, and consequently, we don't know the individual causal effect. [^03-po-counterfactuals-1]: Ice cream swirls were prohibited per the study protocol. @@ -210,7 +210,9 @@ data_observed <- data |> # a binomial distribution with a probability of 0.5 for # being in either group exposure = if_else( - rbinom(n(), 1, 0.5) == 1, "chocolate", "vanilla" + rbinom(n(), 1, 0.5) == 1, + "chocolate", + "vanilla" ), observed_outcome = case_when( exposure == "chocolate" ~ y_chocolate, @@ -223,6 +225,7 @@ data_observed <- data |> #| label: tbl-obs #| tbl-cap: "Potential Outcomes Simulation: The observed exposure is the only potential outcome we know. We don't know the missing potential outcome, and thus, we cannot calculate the individual causal effect. We need to use the observed exposure and outcome to estimate the effect of eating chocolate (versus vanilla) ice cream on happiness. Unfortunately, we can't do it at the individual level." #| code-fold: true + avg_chocolate <- data_observed |> filter(exposure == "chocolate") |> pull(observed_outcome) |> @@ -245,7 +248,9 @@ data_observed |> id = "ID", y_chocolate = md("$$Y_{\\text{id}}(\\text{chocolate})$$"), y_vanilla = md("$$Y_{\\text{id}}(\\text{vanilla})$$"), - causal_effect = md("$$Y_{\\text{id}}(\\text{chocolate}) - Y_{\\text{id}}(\\text{vanilla})$$") + causal_effect = md( + "$$Y_{\\text{id}}(\\text{chocolate}) - Y_{\\text{id}}(\\text{vanilla})$$" + ) ) |> fmt_markdown(columns = c(y_chocolate, y_vanilla, causal_effect)) |> sub_missing( @@ -300,11 +305,11 @@ That's only one way to say it. [There are a lot of variations worldwide](https://en.wikipedia.org/wiki/Apples_and_oranges). Here are some other things people should not try to compare: -- Cheese and chalk (UK English) -- Apples and pears (German) -- Potatoes and sweet potatoes (Latin American Spanish) -- Grandmothers and toads (Serbian) -- Horses and donkeys (Hindi) +- Cheese and chalk (UK English) +- Apples and pears (German) +- Potatoes and sweet potatoes (Latin American Spanish) +- Grandmothers and toads (Serbian) +- Horses and donkeys (Hindi) ::: For the first three-fourths or so of the book, we'll deal with so-called **unconfoundedness** methods. @@ -330,6 +335,7 @@ When this assumption holds, we can treat the vanilla group as the proxy for the ```{r} #| include: false + library(glue) prepare_plot_data <- function( @@ -349,7 +355,13 @@ prepare_plot_data <- function( names_to = "potential_outcome", values_to = "happiness" ) |> - mutate(observed = if_else(exposure == potential_outcome, "observed", "unobserved")) |> + mutate( + observed = if_else( + exposure == potential_outcome, + "observed", + "unobserved" + ) + ) |> mutate( potential_outcome = potential_outcome_transform(potential_outcome), exposure = transform_exposure(exposure) @@ -365,13 +377,17 @@ compute_avg_labels <- function(plot_data, group_vars) { mutate(y_id = min(plot_data$y_id) - 1) } -add_avg_layers <- function(avg_labels, observed_col = "grey50", unobserved_col = "grey50") { +add_avg_layers <- function( + avg_labels, + observed_col = "grey50", + unobserved_col = "grey50" +) { list( geom_point( data = avg_labels |> filter(observed == "unobserved"), mapping = aes(x = happiness, y = y_id), size = 4, - shape = 23, + shape = 23, fill = "white", color = unobserved_col, inherit.aes = FALSE @@ -395,34 +411,46 @@ po_theme <- theme( ) ``` - ```{r} #| label: fig-po #| fig-cap: "The average potential outcomes by observed exposure group. Under exchangeability, the exposure groups have nothing to do with the potential outcomes. Had the vanilla group received chocolate, their potential outcomes would have been about the same on average as the chocolate group, and vice versa. Exchangeability allows us to use each group as a counterfactual for the other." #| message: false #| code-fold: true + plot_data <- data_observed |> select(starts_with("y"), exposure) |> mutate(id = row_number()) |> prepare_plot_data( pivot_prefix = "y_", potential_outcome_transform = \(x) paste0("potential outcome: y(", x, ")"), - transform_exposure = \(exp) if_else(exp == "vanilla", "actually ate vanilla", "actually ate chocolate"), + transform_exposure = \(exp) { + if_else( + exp == "vanilla", + "actually ate vanilla", + "actually ate chocolate" + ) + }, id_assignment = FALSE ) # Compute group averages and add label text -avg_labels <- compute_avg_labels(plot_data, c("potential_outcome", "exposure", "observed")) |> +avg_labels <- compute_avg_labels( + plot_data, + c("potential_outcome", "exposure", "observed") +) |> mutate( exposure_lbl = str_replace_all(exposure, "actually ate ", ""), po_lbl = str_replace_all(potential_outcome, "potential outcome: ", ""), - label = glue("Avg {po_lbl}\n({exposure_lbl} group, {observed})") |> str_wrap(19) + label = glue("Avg {po_lbl}\n({exposure_lbl} group, {observed})") |> + str_wrap(19) ) # Prepare an annotation for ID 3 id_annotation <- plot_data |> filter(id == 3) |> - mutate(label = glue("Potential outcomes\nfor ID 3 ({observed})") |> str_wrap(15)) + mutate( + label = glue("Potential outcomes\nfor ID 3 ({observed})") |> str_wrap(15) + ) # Exchangeability annotation between group averages exchangeability_annotation <- tibble( @@ -431,15 +459,27 @@ exchangeability_annotation <- tibble( y = 1, yend = 0.5, potential_outcome = "potential outcome: y(chocolate)", - label = str_wrap("For exchangeability to hold, these\ngroup averages should be similar", 19) + label = str_wrap( + "For exchangeability to hold, these\ngroup averages should be similar", + 19 + ) ) ggplot(plot_data, aes(happiness, y_id, color = observed, shape = observed)) + geom_point(aes(fill = observed), size = 3, alpha = 0.8) + - add_avg_layers(avg_labels, observed_col = ggokabeito::palette_okabe_ito(1), unobserved_col = ggokabeito::palette_okabe_ito(2)) + + add_avg_layers( + avg_labels, + observed_col = ggokabeito::palette_okabe_ito(1), + unobserved_col = ggokabeito::palette_okabe_ito(2) + ) + geom_curve( data = id_annotation, - mapping = aes(x = happiness + 2.5, xend = happiness + 0.5, y = y_id + 2, yend = y_id), + mapping = aes( + x = happiness + 2.5, + xend = happiness + 0.5, + y = y_id + 2, + yend = y_id + ), curvature = -0.2, arrow = arrow(length = unit(0.02, "npc")), inherit.aes = FALSE, @@ -472,7 +512,7 @@ ggplot(plot_data, aes(happiness, y_id, color = observed, shape = observed)) + size = 4, label.size = NA ) + - facet_wrap(~ potential_outcome) + + facet_wrap(~potential_outcome) + scale_y_continuous( breaks = c(unique(plot_data$y_id), min(plot_data$y_id) - 1), labels = c(unique(plot_data$id), expression(bold("Avg"))) @@ -483,11 +523,17 @@ ggplot(plot_data, aes(happiness, y_id, color = observed, shape = observed)) + ) + scale_fill_manual( name = NULL, - values = c("observed" = ggokabeito::palette_okabe_ito(1), "unobserved" = "white") + values = c( + "observed" = ggokabeito::palette_okabe_ito(1), + "unobserved" = "white" + ) ) + scale_color_manual( name = NULL, - values = c("observed" = ggokabeito::palette_okabe_ito(1), "unobserved" = ggokabeito::palette_okabe_ito(2)) + values = c( + "observed" = ggokabeito::palette_okabe_ito(1), + "unobserved" = ggokabeito::palette_okabe_ito(2) + ) ) + scale_x_continuous( breaks = seq(0, 12, by = 2.5), @@ -520,7 +566,9 @@ mix_up <- function(flavor) { data_observed <- data |> mutate( exposure = if_else( - rbinom(n(), 1, 0.5) == 1, "chocolate", "vanilla" + rbinom(n(), 1, 0.5) == 1, + "chocolate", + "vanilla" ), exposure = mix_up(exposure), observed_outcome = case_when( @@ -586,7 +634,8 @@ As we see in @fig-po-confounding, our groups are no longer exchangeable; they do #| label: "fig-po-confounding" #| fig-cap: "The average potential outcomes by observed exposure group in the presence of confounding. The groups are no longer exchangeable because the flavor assignment is no longer independent of the potential outcomes." #| code-fold: true -#| message: false +#| message: false + data_observed_exch |> select(starts_with("y"), exposure) |> pivot_longer( @@ -594,16 +643,26 @@ data_observed_exch |> names_prefix = "y_", names_to = "potential_outcome", values_to = "happiness" - ) |> + ) |> mutate( observed = if_else(exposure == potential_outcome, "observed", "unobserved"), potential_outcome = paste0("potential outcome: y(", potential_outcome, ")"), - exposure = if_else(exposure == "vanilla", "actually ate\nvanilla", "actually ate\nchocolate") + exposure = if_else( + exposure == "vanilla", + "actually ate\nvanilla", + "actually ate\nchocolate" + ) ) |> - ggplot(aes(happiness, exposure, color = observed, fill = observed, shape = observed)) + + ggplot(aes( + happiness, + exposure, + color = observed, + fill = observed, + shape = observed + )) + stat_summary( - fun = "mean", - size = 3.5, + fun = "mean", + size = 3.5, geom = "point", shape = 23, position = position_nudge(y = 0.033) @@ -614,57 +673,76 @@ data_observed_exch |> aes(label = round(after_stat(x), 1)), vjust = 1.8, show.legend = FALSE - ) + - facet_wrap(~ potential_outcome) + + ) + + facet_wrap(~potential_outcome) + theme( panel.grid.major.y = element_blank(), panel.border = element_rect(color = "grey40", fill = NA, linewidth = 0.8), axis.title.y = element_blank() - ) + + ) + labs( - y = "actual exposure", + y = "actual exposure", color = NULL, shape = NULL, fill = NULL - ) + - coord_cartesian(clip = "off") + + ) + + coord_cartesian(clip = "off") + scale_shape_manual(values = c(19, 21)) + - scale_fill_manual(values = c(observed = ggokabeito::palette_okabe_ito(1), unobserved = "white")) + - scale_x_continuous(breaks = seq(0, 12, by = 2.5), limits = c(-2, 12)) + scale_fill_manual( + values = c( + observed = ggokabeito::palette_okabe_ito(1), + unobserved = "white" + ) + ) + + scale_x_continuous(breaks = seq(0, 12, by = 2.5), limits = c(-2, 12)) ``` What can we do when exchangeability is violated? Throughout the book, we'll devote a lot of time to this problem. The heart of the solution, though, is that we can sometimes still achieve exchangeability within levels of another variable. -This is called **conditional exchangeability**: $Y(x) \perp\!\!\!\perp X \mid Z$. In this case, we need exchangeability within levels of `prefer_chocolate`. +This is called **conditional exchangeability**: $Y(x) \perp\!\!\!\perp X \mid Z$. +In this case, we need exchangeability within levels of `prefer_chocolate`. ```{r} #| label: fig-po-cond-exch #| fig-cap: "The average potential outcomes by observed exposure group in the presence of confounding. We can still achieve *conditional* exchangeability within levels of the confounder. Here, we also start to see the limits of our sample size, as the potential outcomes, which would be valid in higher numbers, start to fail." #| code-fold: true #| message: false + data_observed_exch |> - mutate(prefer_chocolate = if_else( - prefer_chocolate, - "prefers\nchocolate", - "prefers\nvanilla" - )) |> + mutate( + prefer_chocolate = if_else( + prefer_chocolate, + "prefers\nchocolate", + "prefers\nvanilla" + ) + ) |> pivot_longer( starts_with("y"), names_prefix = "y_", names_to = "potential_outcome", values_to = "happiness" - ) |> + ) |> mutate( observed = if_else(exposure == potential_outcome, "observed", "unobserved"), potential_outcome = paste0("potential outcome: y(", potential_outcome, ")"), - exposure = if_else(exposure == "vanilla", "actually ate\nvanilla", "actually ate\nchocolate") + exposure = if_else( + exposure == "vanilla", + "actually ate\nvanilla", + "actually ate\nchocolate" + ) ) |> - - ggplot(aes(happiness, exposure, color = observed, fill = observed, shape = observed)) + + + ggplot(aes( + happiness, + exposure, + color = observed, + fill = observed, + shape = observed + )) + stat_summary( - fun = "mean", - size = 3.5, + fun = "mean", + size = 3.5, geom = "point", shape = 23, position = position_nudge(y = 0.033) @@ -675,23 +753,28 @@ data_observed_exch |> aes(label = round(after_stat(x), 1)), vjust = 1.8, show.legend = FALSE - ) + - facet_grid(prefer_chocolate ~ potential_outcome) + + ) + + facet_grid(prefer_chocolate ~ potential_outcome) + theme( panel.grid.major.y = element_blank(), panel.border = element_rect(color = "grey40", fill = NA, linewidth = 0.8), axis.title.y = element_blank() - ) + + ) + labs( - y = "actual exposure", + y = "actual exposure", color = NULL, shape = NULL, fill = NULL - ) + - coord_cartesian(clip = "off") + + ) + + coord_cartesian(clip = "off") + scale_shape_manual(values = c(19, 21)) + - scale_fill_manual(values = c(observed = ggokabeito::palette_okabe_ito(1), unobserved = "white")) + - scale_x_continuous(breaks = seq(0, 12, by = 2.5), limits = c(-2, 12)) + scale_fill_manual( + values = c( + observed = ggokabeito::palette_okabe_ito(1), + unobserved = "white" + ) + ) + + scale_x_continuous(breaks = seq(0, 12, by = 2.5), limits = c(-2, 12)) ``` ::: callout-warning @@ -702,7 +785,8 @@ We'd achieve better exchangeability as our sample size increases, but this quick ### Positivity The positivity assumption states that every individual has a non-zero probability of receiving each level of exposure. -Mathematically, this means that $P(X = x) > 0$ for all $x$. In other words, we assume that there is no one for whom one or more levels of exposure are impossible. +Mathematically, this means that $P(X = x) > 0$ for all $x$. +In other words, we assume that there is no one for whom one or more levels of exposure are impossible. We need this assumption because it defines the potential outcome for a given exposure level. If someone is never, under any circumstances, exposed to chocolate, then the potential outcome for `y(chocolate)` isn't defined for that person. We can't use them to provide information about this potential outcome. @@ -719,7 +803,8 @@ Stochastic violations are chance occurrences where you don't have any observatio In the example where 80% of participants chose the ice cream that would make them happiest, it's feasible that, given our low sample size, we might end up with people who only choose chocolate. Naturally, we can only calculate the effect of vanilla vs. chocolate if we have observations of vanilla. -A nuance of positivity is that it needs to hold within levels of all covariates required for exchangeability: $P(X = x \mid Z = z) > 0$ for all $x$ and $z$. Even if the flavors vary, we also need variability within levels of `prefer_chocolate`. +A nuance of positivity is that it needs to hold within levels of all covariates required for exchangeability: $P(X = x \mid Z = z) > 0$ for all $x$ and $z$. +Even if the flavors vary, we also need variability within levels of `prefer_chocolate`. That can also fail by chance. ```{r} @@ -802,21 +887,32 @@ For those with a vanilla allergy, `y_vanilla` is not defined, as in @fig-po-pos. #| fig-cap: "Potential outcomes in the presence of a structural positivity violation. When potential outcomes can't occur, they are not defined. Those who have an allergy to vanilla do not have a counterpart `y(vanilla)`." #| code-fold: true #| warning: false + plot_data <- data_observed_struc |> mutate(is_missing_y_vanilla = is.na(y_vanilla)) |> select(id, starts_with("y"), exposure, is_missing_y_vanilla) |> prepare_plot_data( pivot_prefix = "y_", potential_outcome_transform = \(x) paste0("y(", x, ")"), - transform_exposure = \(exp) if_else(exp == "vanilla", "actually ate vanilla", "actually ate chocolate"), + transform_exposure = \(exp) { + if_else( + exp == "vanilla", + "actually ate vanilla", + "actually ate chocolate" + ) + }, id_assignment = FALSE ) -avg_labels <- compute_avg_labels(plot_data, c("potential_outcome", "exposure", "observed")) |> +avg_labels <- compute_avg_labels( + plot_data, + c("potential_outcome", "exposure", "observed") +) |> mutate( exposure_lbl = str_replace_all(exposure, "actually ate ", ""), po_lbl = str_replace_all(potential_outcome, "potential outcome: ", ""), - label = glue("Avg {po_lbl}\n({exposure_lbl} group, {observed})") |> str_wrap(19) + label = glue("Avg {po_lbl}\n({exposure_lbl} group, {observed})") |> + str_wrap(19) ) # Missing y(vanilla) points on the chocolate side @@ -888,7 +984,7 @@ ggplot( size = 4.5, label.size = NA ) + - facet_wrap(~ potential_outcome) + + facet_wrap(~potential_outcome) + scale_y_continuous( breaks = c(unique(plot_data$y_id), min(plot_data$y_id) - 1), labels = c(unique(plot_data$id), expression(bold("Avg"))) @@ -931,18 +1027,22 @@ What else would it be? If you think this issue through, though, you'll see that this assumption can be violated easily for any given exposure. Let's consider two common cases: -- **Poorly-defined exposure**: For each exposure value, there is a difference between subjects when delivering that exposure. Put another way, multiple treatment versions exist. Instead, we need a *well-defined exposure*. -- **Interference**: The outcome (technically all potential outcomes, regardless of whether they are observed) for any subject depends on another subject's exposure; Instead, we need *no interference*. +- **Poorly-defined exposure**: For each exposure value, there is a difference between subjects when delivering that exposure. + Put another way, multiple treatment versions exist. + Instead, we need a *well-defined exposure*. +- **Interference**: The outcome (technically all potential outcomes, regardless of whether they are observed) for any subject depends on another subject's exposure; Instead, we need *no interference*. ::: callout-tip -Consistency is sometimes called the **stable-unit-treatment-value assumption** or **SUTVA** [@imbens2015causal]. +Consistency is sometimes called the **stable-unit-treatment-value assumption** or **SUTVA** [@imbens2015causal]. However, causal consistency is a distinct idea from *statistical* consistency, which is a property where an estimator moves closer to the truth as the sample size increases. ::: #### Poorly-defined exposures Consistency violations are common when an exposure is poorly defined. -They occur in everything from surgeries (say if one doctor is more experienced with a surgical procedure than another) to income (are all income sources dollar-for-dollar the same? Is the lottery the same as a weekly paycheck?) to education (do years of education have the same effect across school quality?), and many others [@Rehkopf2016]. +They occur in everything from surgeries (say if one doctor is more experienced with a surgical procedure than another) to income (are all income sources dollar-for-dollar the same? +Is the lottery the same as a weekly paycheck?) +to education (do years of education have the same effect across school quality?), and many others [@Rehkopf2016]. Suppose there were two containers of chocolate ice cream, one of which was spoiled. Exposure to "chocolate" could mean different things depending on where the individual's scoop came from (regular chocolate ice cream or spoiled chocolate ice cream); we are lumping them all together under a single term. @@ -996,6 +1096,7 @@ We just want `y(chocolate, spoiled = FALSE)`. #| label: "fig-po-cons" #| fig-cap: "Potential outcomes under a consistency violation. Consistency allows us to treat the observed data as the factual outcome. Here, we're treating the chocolate group as representative of `y(chocolate)`, but this isn't true. The data represent a mixture of `y(chocolate, spoiled = FALSE)` and `y(chocolate, spoiled = TRUE)`, different potential outcomes." #| code-fold: true + plot_data <- data_observed_poorly_defined |> mutate(is_spoiled = exposure_unobserved == "chocolate (spoiled)") |> pivot_longer( @@ -1006,7 +1107,7 @@ plot_data <- data_observed_poorly_defined |> ) |> filter( !(is_spoiled & potential_outcome == "chocolate"), - !(is_spoiled == FALSE & potential_outcome == "spoiled_chocolate") + !(!is_spoiled & potential_outcome == "spoiled_chocolate") ) |> mutate( potential_outcome = case_when( @@ -1020,7 +1121,10 @@ plot_data <- data_observed_poorly_defined |> arrange(id) |> mutate(y_id = dense_rank(id)) -avg_labels <- compute_avg_labels(plot_data, c("potential_outcome", "exposure", "observed")) +avg_labels <- compute_avg_labels( + plot_data, + c("potential_outcome", "exposure", "observed") +) # Annotations for spoiled chocolate spoiled_annotation <- plot_data |> @@ -1038,17 +1142,26 @@ spoiled_arrows <- plot_data |> ) flawed_avg_annotation <- avg_labels |> - filter(potential_outcome == "y(chocolate)", exposure == "chocolate", observed == "observed") |> + filter( + potential_outcome == "y(chocolate)", + exposure == "chocolate", + observed == "observed" + ) |> mutate( label = "This is an average of\nboth potential outcomes", x = happiness + 2.25, - y = y_id + .9, + y = y_id + 0.9, happiness = happiness + 0.25, y_id = y_id + 0.15 ) unspoiled_annotation <- plot_data |> - filter(!is_spoiled, exposure == "chocolate", potential_outcome == "y(chocolate)", observed == "observed") |> + filter( + !is_spoiled, + exposure == "chocolate", + potential_outcome == "y(chocolate)", + observed == "observed" + ) |> slice(1) |> mutate( label = "But theirs\nwasn't", @@ -1057,7 +1170,12 @@ unspoiled_annotation <- plot_data |> ) unspoiled_arrows <- plot_data |> - filter(!is_spoiled, exposure == "chocolate", potential_outcome == "y(chocolate)", observed == "observed") |> + filter( + !is_spoiled, + exposure == "chocolate", + potential_outcome == "y(chocolate)", + observed == "observed" + ) |> mutate( xend = happiness + 0.35, yend = y_id - c(0.1, 0.2), @@ -1083,7 +1201,12 @@ ggplot(plot_data, aes(x = happiness, y = y_id)) + alpha = 0.8 ) + geom_point( - data = plot_data |> filter(is_spoiled, potential_outcome == "y(chocolate)", observed == "observed"), + data = plot_data |> + filter( + is_spoiled, + potential_outcome == "y(chocolate)", + observed == "observed" + ), size = 3, shape = 21, fill = ggokabeito::palette_okabe_ito(7), @@ -1091,7 +1214,12 @@ ggplot(plot_data, aes(x = happiness, y = y_id)) + alpha = 0.8 ) + geom_point( - data = plot_data |> filter(is_spoiled, potential_outcome == "y(chocolate)", observed == "unobserved"), + data = plot_data |> + filter( + is_spoiled, + potential_outcome == "y(chocolate)", + observed == "unobserved" + ), size = 3, shape = 21, fill = "white", @@ -1150,7 +1278,7 @@ ggplot(plot_data, aes(x = happiness, y = y_id)) + size = 4, label.size = NA ) + - facet_wrap(~ potential_outcome) + + facet_wrap(~potential_outcome) + scale_y_continuous( breaks = c(unique(plot_data$y_id), min(plot_data$y_id) - 1), labels = c(unique(plot_data$id), expression(bold("Avg"))) @@ -1160,7 +1288,8 @@ ggplot(plot_data, aes(x = happiness, y = y_id)) + po_theme ``` -We can imagine other ways in which slight variations in the treatment occur: high-quality and low-quality brands of vanilla ice cream are categorized as "vanilla." One person eats in the morning and another in the afternoon. +We can imagine other ways in which slight variations in the treatment occur: high-quality and low-quality brands of vanilla ice cream are categorized as "vanilla." +One person eats in the morning and another in the afternoon. One has a spoonful, and one has three bowls full. There will almost always be *some* consistency violation in this sense; the question for us is whether or not that violation is meaningful regarding the potential outcomes we see. If two brands of ice cream produce the same happiness, this variation doesn't matter. @@ -1182,6 +1311,7 @@ Now we're back to the right answer because we've correctly separated the potenti #| label: "fig-po-const-defined" #| fig-cap: "Potential outcomes under a consistency violation. Now we've correctly linked the observed data to their underlying potential outcome, making them consistent." #| code-fold: true + plot_data <- data_observed_poorly_defined |> pivot_longer( cols = starts_with("y"), @@ -1190,15 +1320,30 @@ plot_data <- data_observed_poorly_defined |> values_to = "happiness" ) |> mutate( - potential_outcome = if_else(potential_outcome == "spoiled_chocolate", "chocolate (spoiled)", potential_outcome), - observed = if_else(exposure_unobserved == potential_outcome, "observed", "unobserved"), - potential_outcome = if_else(potential_outcome == "chocolate (spoiled)", "spoiled_chocolate", potential_outcome), + potential_outcome = if_else( + potential_outcome == "spoiled_chocolate", + "chocolate (spoiled)", + potential_outcome + ), + observed = if_else( + exposure_unobserved == potential_outcome, + "observed", + "unobserved" + ), + potential_outcome = if_else( + potential_outcome == "chocolate (spoiled)", + "spoiled_chocolate", + potential_outcome + ), potential_outcome = paste0("y(", potential_outcome, ")") ) |> arrange(id) |> mutate(y_id = dense_rank(id)) -avg_labels <- compute_avg_labels(plot_data, c("potential_outcome", "exposure_unobserved", "observed")) +avg_labels <- compute_avg_labels( + plot_data, + c("potential_outcome", "exposure_unobserved", "observed") +) ggplot(plot_data, aes(x = happiness, y = y_id, color = exposure_unobserved)) + geom_point( @@ -1215,22 +1360,27 @@ ggplot(plot_data, aes(x = happiness, y = y_id, color = exposure_unobserved)) + fill = "white", alpha = 0.8 ) + - geom_point( + geom_point( data = avg_labels |> filter(observed == "unobserved"), - aes(x = happiness, y = y_id, color = exposure_unobserved), - size = 4, - shape = 23, - fill = "white", - inherit.aes = FALSE - ) + - geom_point( - data = avg_labels |> filter(observed == "observed"), - aes(x = happiness, y = y_id, fill = exposure_unobserved, color = exposure_unobserved), - size = 4, + aes(x = happiness, y = y_id, color = exposure_unobserved), + size = 4, + shape = 23, + fill = "white", + inherit.aes = FALSE + ) + + geom_point( + data = avg_labels |> filter(observed == "observed"), + aes( + x = happiness, + y = y_id, + fill = exposure_unobserved, + color = exposure_unobserved + ), + size = 4, shape = 23, - inherit.aes = FALSE + inherit.aes = FALSE ) + - facet_wrap(~ potential_outcome) + + facet_wrap(~potential_outcome) + scale_y_continuous( breaks = c(unique(plot_data$y_id), min(plot_data$y_id) - 1), labels = c(unique(plot_data$id), expression(bold("Avg"))) @@ -1273,10 +1423,14 @@ set.seed(37) data_observed_interf <- data |> mutate( exposure = if_else( - rbinom(n(), 1, 0.5) == 1, "chocolate", "vanilla" + rbinom(n(), 1, 0.5) == 1, + "chocolate", + "vanilla" ), exposure_partner = if_else( - rbinom(n(), 1, 0.5) == 1, "chocolate", "vanilla" + rbinom(n(), 1, 0.5) == 1, + "chocolate", + "vanilla" ), observed_outcome = case_when( exposure == "chocolate" & exposure_partner == "chocolate" ~ @@ -1303,13 +1457,13 @@ Similarly, the problem is that we're estimating the wrong potential outcome (@fi Interference and poorly defined exposures are different manifestations of the same assumption violation. The potential outcomes we are estimating are not consistent with the causal question we're asking. In this case, we have three counterfactuals: the flavor an individual had with the exposure their partner *didn't* have, and the flavor an individual *didn't* have with the two flavors their partner could have. -The averages we calculated don't seem to be estimating any of these combinations. - +The averages we calculated don't seem to be estimating any of these combinations. ```{r} #| label: "fig-const-interf" #| fig-cap: "Potential outcomes under a consistency violation. This time, the vanilla group's `y(vanilla)` and the chocolate group's `y(chocolate)` are not correct: they are a mixture of effects due to interference by the partner's flavor." #| code-fold: true + plot_data <- data_observed_interf |> pivot_longer( cols = starts_with("y_"), @@ -1320,7 +1474,11 @@ plot_data <- data_observed_interf |> mutate( potential_outcome = paste0("y(", str_remove(po_combination, "_.*"), ")"), observed = po_combination == paste0(exposure, "_", exposure_partner), - flavor_match = if_else(exposure == exposure_partner, "Same Flavors", "Different Flavors"), + flavor_match = if_else( + exposure == exposure_partner, + "Same Flavors", + "Different Flavors" + ), y_id = dense_rank(id) ) |> filter( @@ -1333,7 +1491,10 @@ flavor_annotation <- plot_data |> summarize( x = max(happiness) + 0.5, y = mean(y_id) - 2, - label = str_wrap("Different exposure pairs yield different potential outcomes", 20), + label = str_wrap( + "Different exposure pairs yield different potential outcomes", + 20 + ), potential_outcome = "y(chocolate)" ) @@ -1342,7 +1503,7 @@ flavor_arrows <- plot_data |> filter(observed, potential_outcome == "y(chocolate)", id %in% c(3, 5)) |> mutate( xend = happiness, - yend = y_id - .3, + yend = y_id - 0.3, x = flavor_annotation$x, y = flavor_annotation$y, potential_outcome = "y(chocolate)" @@ -1356,7 +1517,10 @@ unobserved_annotation <- plot_data |> summarize( x = max(happiness) + 0.75, y = mean(y_id) - 0.5, - label = str_wrap("There are two unobserved potential outcomes for the opposite exposure", 20), + label = str_wrap( + "There are two unobserved potential outcomes for the opposite exposure", + 20 + ), potential_outcome = "y(vanilla)" ) @@ -1365,7 +1529,7 @@ unobserved_arrows <- plot_data |> filter(!observed, id == unobserved_id, potential_outcome == "y(vanilla)") |> mutate( xend = happiness + c(0.35, 0.15), - yend = y_id - c(0.1, 0.2), + yend = y_id - c(0.1, 0.2), x = unobserved_annotation$x, y = unobserved_annotation$y, potential_outcome = "y(vanilla)" @@ -1376,8 +1540,11 @@ chocolate_unobserved_annotation <- plot_data |> filter(id == 6, potential_outcome == "y(chocolate)", !observed) |> summarize( x = min(happiness) - 2.5, - y = mean(y_id) + .75, - label = str_wrap("There is an unobserved potential outcome for the partner's other flavor", 15), + y = mean(y_id) + 0.75, + label = str_wrap( + "There is an unobserved potential outcome for the partner's other flavor", + 15 + ), potential_outcome = "y(chocolate)" ) @@ -1460,7 +1627,7 @@ ggplot(plot_data, aes(x = happiness, y = y_id)) + size = 3, label.size = NA ) + - facet_wrap(~ potential_outcome) + + facet_wrap(~potential_outcome) + scale_y_continuous( breaks = unique(plot_data$y_id), labels = unique(plot_data$id) @@ -1510,7 +1677,9 @@ set.seed(11) partners <- tibble( partner_id = 1:5, exposure = if_else( - rbinom(5, 1, 0.5) == 1, "chocolate", "vanilla" + rbinom(5, 1, 0.5) == 1, + "chocolate", + "vanilla" ) ) partners_observed <- data |> @@ -1549,9 +1718,24 @@ The key is to think about what you want to estimate and try to represent the pot In @sec-whole-game, we estimated the causal effect of mosquito nets on malaria risk. Let's consider the causal assumptions for this question and how they might have been violated in a real-life analysis. -- **Exchangeability**: We saw what can happen with this problem when an unmeasured confounder (in this case, genetic resistance to malaria) is present. Thinking about exchangeability is one of the more demanding and more common tasks in causal inference with observational data. We'll discuss this in-depth in @sec-dags, and what to do if we think (or know) we're wrong in @sec-sensitivity. We should also be concerned that we haven't measured the covariates we need well or are missing values (@sec-missingness). -- **Positivity**: A positivity violation would occur if any household would *always* or *never* use a bed net. A situation like this is feasible in real life. Someone may be allergic to the material, or one type of net may be unapproved in a region. As mentioned, we also need positivity to hold within the levels of the combination confounders in the analysis. Is it possible, for instance, that someone of low economic status in cold weather (when mosquitoes are not very active) and good health would never use a bed net because it wasn't worth the expense? -- **Consistency**: We already saw an example of the well-defined exposure problem: a "bed net" can mean many things. We decided we were comparing insecticide-treated bed nets compared to no nets, but we may need more. Does the manufacturer matter? The type of insecticide? The amount of insecticide? We need to think about the effect on the potential outcomes each of these variations might have. Interference is a real problem for bed-net research, particularly for insecticidal nets. Notably, we used households as the unit of observation, which would reduce some of the local interference. We could also consider households that are geographically dispersed from one another. +- **Exchangeability**: We saw what can happen with this problem when an unmeasured confounder (in this case, genetic resistance to malaria) is present. + Thinking about exchangeability is one of the more demanding and more common tasks in causal inference with observational data. + We'll discuss this in-depth in @sec-dags, and what to do if we think (or know) we're wrong in @sec-sensitivity. + We should also be concerned that we haven't measured the covariates we need well or are missing values (@sec-missingness). +- **Positivity**: A positivity violation would occur if any household would *always* or *never* use a bed net. + A situation like this is feasible in real life. + Someone may be allergic to the material, or one type of net may be unapproved in a region. + As mentioned, we also need positivity to hold within the levels of the combination confounders in the analysis. + Is it possible, for instance, that someone of low economic status in cold weather (when mosquitoes are not very active) and good health would never use a bed net because it wasn't worth the expense? +- **Consistency**: We already saw an example of the well-defined exposure problem: a "bed net" can mean many things. + We decided we were comparing insecticide-treated bed nets compared to no nets, but we may need more. + Does the manufacturer matter? + The type of insecticide? + The amount of insecticide? + We need to think about the effect on the potential outcomes each of these variations might have. + Interference is a real problem for bed-net research, particularly for insecticidal nets. + Notably, we used households as the unit of observation, which would reduce some of the local interference. + We could also consider households that are geographically dispersed from one another. ::: ## When do study designs support causal inference? {#sec-designs} @@ -1594,25 +1778,27 @@ Non-randomized (observational) studies have none of these guarantees, even ideal Like a realistic randomized trial, observational studies require careful design and execution to better meet the assumptions necessary for causal inference. @tbl-assump-solved summarizes how well ideal randomized trials, realistic randomized trials, and observational studies each meet the exchangeability, positivity, and consistency criteria. -| Assumption | Ideal Randomized Trial | Realistic Randomized Trial | Observational Study | -|----------------------|-----------------|-----------------|-----------------| -| Consistency (Well defined exposure) | `r emo::ji("smile")` | `r emo::ji("shrug")` | `r emo::ji("shrug")` | -| Consistency (No interference) | `r emo::ji("shrug")` | `r emo::ji("shrug")` | `r emo::ji("shrug")` | -| Positivity | `r emo::ji("smile")` | `r emo::ji("smile")` | `r emo::ji("shrug")` | -| Exchangeability | `r emo::ji("smile")` | `r emo::ji("shrug")` | `r emo::ji("shrug")` | + | Assumption | Ideal Randomized Trial | Realistic Randomized Trial | Observational Study | + | ----------------------------------- | ---------------------- | -------------------------- | -------------------- | + | Consistency (Well defined exposure) | `r emo::ji("smile")` | `r emo::ji("shrug")` | `r emo::ji("shrug")` | + | Consistency (No interference) | `r emo::ji("shrug")` | `r emo::ji("shrug")` | `r emo::ji("shrug")` | + | Positivity | `r emo::ji("smile")` | `r emo::ji("smile")` | `r emo::ji("shrug")` | + | Exchangeability | `r emo::ji("smile")` | `r emo::ji("shrug")` | `r emo::ji("shrug")` | -: Assumptions solved by study design. `r emo::ji("smile")` indicates it is solved by default, `r emo::ji("shrug")` indicates that it is *solvable* but not solved by default. {#tbl-assump-solved} + : Assumptions solved by study design. + `r emo::ji("smile")` indicates it is solved by default, `r emo::ji("shrug")` indicates that it is *solvable* but not solved by default. + {#tbl-assump-solved} The design of a causal analysis requires a clear causal question. We then can map this question to a *protocol*, consisting of the following seven elements which comprise the target trial framework, as defined by @hernan2016using: -- **Eligibility criteria**: Who or what should be included in the study? -- **Exposure definition**: When eligible, what precise exposure will units under study receive? -- **Assignment procedures**: How will eligible units be assigned to an exposure? -- **Follow-up period**: When does follow-up start and end? -- **Outcome definition**: What precise outcomes will be measured? -- **Causal contrast of interest**: Which causal estimand will be estimated? -- **Analysis plan**: What data manipulation and statistical procedures will be applied to the data to estimate the causal contrast of interest? +- **Eligibility criteria**: Who or what should be included in the study? +- **Exposure definition**: When eligible, what precise exposure will units under study receive? +- **Assignment procedures**: How will eligible units be assigned to an exposure? +- **Follow-up period**: When does follow-up start and end? +- **Outcome definition**: What precise outcomes will be measured? +- **Causal contrast of interest**: Which causal estimand will be estimated? +- **Analysis plan**: What data manipulation and statistical procedures will be applied to the data to estimate the causal contrast of interest? Recall our diagrams from @sec-diag (@fig-diagram-4); several of these protocol elements can be mapped to these diagrams when we are attempting to define our causal question. @@ -1621,6 +1807,7 @@ Recall our diagrams from @sec-diag (@fig-diagram-4); several of these protocol e #| label: fig-diagram-4-again #| fig-height: 2 #| fig-cap: "Example diagram mapped to causal analysis terminology" + knitr::include_graphics("../images/sentence-diagram-4.png") ``` @@ -1635,22 +1822,29 @@ This is the idea behind target trial emulation: you specify a target trial (the Whether the design is a randomized trial or observational study, using a protocol can improve our likelihood of meeting the causal assumptions. In @tbl-protocol, we map these elements to the corresponding assumption they can address. -| Assumption | Eligibility Criteria | Exposure Definition | Assignment Procedures | Follow-up Period | Outcome Definition | Causal contrast | Analysis Plan | -|---------|---------|---------|---------|---------|---------|---------|---------| -| Consistency (Well-defined exposure) | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | | | | `r emo::ji("heavy_check_mark")` | -| Consistency (No interference) | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | | `r emo::ji("heavy_check_mark")` | | `r emo::ji("heavy_check_mark")` | -| Positivity | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | | | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | -| Exchangeability | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | - -: Mapping assumptions to elements of a study protocol {#tbl-protocol} - -- **Eligibility criteria**: Eligibility criteria are beneficial for preventing positivity violations (by excluding those who can never have one of the exposures) but are also helpful for consistency and exchangeability. You can, for instance, exclude people who are likely to affect each other's outcomes or only include people without extreme characteristics. -- **Exposure definition**: A precise exposure definition is particularly useful for improving consistency since it encourages a well-defined exposure and can better control the potential of interference. A precise exposure also helps identify factors related to exchangeability and positivity. -- **Assignment procedures**: In randomized assignment, exchangeability and positivity are met. Understanding how an exposure occurs also improves exchangeability and positivity in non-randomized studies. As we've seen, we can also use assignment procedures to prevent interference, and likewise, they can help improve a well-defined exposure since the assignment mechanism needs to be precise. -- **Follow-up period**: We'll discuss the issues that defining a follow-up period solves in more detail in @sec-longitudinal, but it helps with exchangeability by ensuring the follow-up time is comparable for all exposure levels. It's also closely related to consistency because a well-defined exposure may require a time element. -- **Outcome definition**: Precisely defining the outcome helps identify prognostic factors, improving precision in a randomized trial but also improving exchangeability in an observational study. -- **Causal contrast of interest**: As we'll see in @sec-estimands, some causal contrasts are less strict about exchangeability and positivity than others. -- **Analysis plan**: A clear analysis plan improves all aspects of meeting the causal assumptions in observational studies. For instance, we must be clear about variables we must adjust to meet exchangeability. Additionally, while we can never confirm the assumptions of a causal method are met, we can provide evidence through exploratory analysis (@sec-data-causal) or probe the consequences of violating them with sensitivity analysis (@sec-sensitivity). + | Assumption | Eligibility Criteria | Exposure Definition | Assignment Procedures | Follow-up Period | Outcome Definition | Causal contrast | Analysis Plan | + | ----------------------------------- | ------------------------------- | ------------------------------- | ------------------------------- | ------------------------------- | ------------------------------- | ------------------------------- | ------------------------------- | + | Consistency (Well-defined exposure) | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | | | | `r emo::ji("heavy_check_mark")` | + | Consistency (No interference) | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | | `r emo::ji("heavy_check_mark")` | | `r emo::ji("heavy_check_mark")` | + | Positivity | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | | | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | + | Exchangeability | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | `r emo::ji("heavy_check_mark")` | + + : Mapping assumptions to elements of a study protocol {#tbl-protocol} + +- **Eligibility criteria**: Eligibility criteria are beneficial for preventing positivity violations (by excluding those who can never have one of the exposures) but are also helpful for consistency and exchangeability. + You can, for instance, exclude people who are likely to affect each other's outcomes or only include people without extreme characteristics. +- **Exposure definition**: A precise exposure definition is particularly useful for improving consistency since it encourages a well-defined exposure and can better control the potential of interference. + A precise exposure also helps identify factors related to exchangeability and positivity. +- **Assignment procedures**: In randomized assignment, exchangeability and positivity are met. + Understanding how an exposure occurs also improves exchangeability and positivity in non-randomized studies. + As we've seen, we can also use assignment procedures to prevent interference, and likewise, they can help improve a well-defined exposure since the assignment mechanism needs to be precise. +- **Follow-up period**: We'll discuss the issues that defining a follow-up period solves in more detail in @sec-longitudinal, but it helps with exchangeability by ensuring the follow-up time is comparable for all exposure levels. + It's also closely related to consistency because a well-defined exposure may require a time element. +- **Outcome definition**: Precisely defining the outcome helps identify prognostic factors, improving precision in a randomized trial but also improving exchangeability in an observational study. +- **Causal contrast of interest**: As we'll see in @sec-estimands, some causal contrasts are less strict about exchangeability and positivity than others. +- **Analysis plan**: A clear analysis plan improves all aspects of meeting the causal assumptions in observational studies. + For instance, we must be clear about variables we must adjust to meet exchangeability. + Additionally, while we can never confirm the assumptions of a causal method are met, we can provide evidence through exploratory analysis (@sec-data-causal) or probe the consequences of violating them with sensitivity analysis (@sec-sensitivity). What would a target trial and emulation look like for the ice cream example? Imagine that we have a database that has all the information we need. @@ -1660,17 +1854,17 @@ In @tbl-target-trial, we outline some ideas for a randomized trial of this quest Notably, the target trial we can emulate is usually a so-called pragmatic trial---a trial *without* blinding. That is because it's often not possible to blind someone's exposure from themselves. -| Protocol Step | Description | Target Trial | Emulation | -|-------------|-------------|--------------------|--------------------------| -| Eligibility criteria | Who should be included in the study? | Inclusion: Age 18 to 65. Exclusion: No lactose intolerance or allergy to any ingredients; entered store within the week of the study. | Same as target trial. | -| Exposure definition | When eligible, what precise exposure will units under study receive? | 100g vanilla or chocolate ice cream in a bowl, both Don and Jerzy brand ice cream. | Same as target trial. | -| Assignment procedures | How will eligible units be assigned to an exposure? | Participants are randomized with a 50% probability of either flavor. The assignment is non-blinded. | Participants are assigned the flavor consistent with their data, e.g., the flavor they chose. Randomization is emulated using baseline covariates. | -| Follow-up period | When does follow-up start and end? | Start: When eligibility criteria are met and flavor is assigned; End: 30 minutes after flavor assignment. | Same as target trial. | -| Outcome definition | What precise outcomes will be measured? | Happiness (1-10) as measured by the gold-standard tool. | Same as target trial. | -| Causal contrast of interest | Which causal estimand will be estimated? | Average Treatment Effect (ATE). | Same as target trial. | -| Analysis plan | What data manipulation and statistical procedures will be applied to the data to estimate the causal contrast of interest? | ATE will be calculated using inverse probability weighting, weighted for baseline happiness, age, income, education, physical activity, self-rated physical health, self-rated mental health, quality of relationships, and preference of flavor. | ATE will be calculated using inverse probability weighting, weighted for confounders (baseline happiness, age, preference of flavor) and additional baseline prognostic variables (age, income, education, physical activity, self-rated physical health, self-rated mental health, and quality of relationships). | - -: A protocol of a target trial of ice cream flavor on happiness and a corresponding protocol for an observational study that emulates the target trial {#tbl-target-trial} + | Protocol Step | Description | Target Trial | Emulation | + | --------------------------- | -------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | + | Eligibility criteria | Who should be included in the study? | Inclusion: Age 18 to 65. Exclusion: No lactose intolerance or allergy to any ingredients; entered store within the week of the study. | Same as target trial. | + | Exposure definition | When eligible, what precise exposure will units under study receive? | 100g vanilla or chocolate ice cream in a bowl, both Don and Jerzy brand ice cream. | Same as target trial. | + | Assignment procedures | How will eligible units be assigned to an exposure? | Participants are randomized with a 50% probability of either flavor. The assignment is non-blinded. | Participants are assigned the flavor consistent with their data, e.g., the flavor they chose. Randomization is emulated using baseline covariates. | + | Follow-up period | When does follow-up start and end? | Start: When eligibility criteria are met and flavor is assigned; End: 30 minutes after flavor assignment. | Same as target trial. | + | Outcome definition | What precise outcomes will be measured? | Happiness (1-10) as measured by the gold-standard tool. | Same as target trial. | + | Causal contrast of interest | Which causal estimand will be estimated? | Average Treatment Effect (ATE). | Same as target trial. | + | Analysis plan | What data manipulation and statistical procedures will be applied to the data to estimate the causal contrast of interest? | ATE will be calculated using inverse probability weighting, weighted for baseline happiness, age, income, education, physical activity, self-rated physical health, self-rated mental health, quality of relationships, and preference of flavor. | ATE will be calculated using inverse probability weighting, weighted for confounders (baseline happiness, age, preference of flavor) and additional baseline prognostic variables (age, income, education, physical activity, self-rated physical health, self-rated mental health, and quality of relationships). | + + : A protocol of a target trial of ice cream flavor on happiness and a corresponding protocol for an observational study that emulates the target trial {#tbl-target-trial} The protocol in @tbl-target-trial helps us meet the causal assumptions in a variety of ways. For instance, we are more likely to have positivity without people with allergies or sensitivities. diff --git a/chapters/04-dags.qmd b/chapters/04-dags.qmd index 2693147..c67db9b 100644 --- a/chapters/04-dags.qmd +++ b/chapters/04-dags.qmd @@ -4,20 +4,23 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("complete") ``` ## Visualizing Causal Assumptions -> When we try to pick out anything by itself we find that it is bound fast by a thousand invisible cords that cannot be broken, to everything in the universe. -- John Muir +> When we try to pick out anything by itself we find that it is bound fast by a thousand invisible cords that cannot be broken, to everything in the universe. +> -- John Muir **Causal diagrams** are a tool to visualize your assumptions about the causal structure of the questions you're trying to answer. In a randomized experiment, the causal structure is quite simple. While there may be many causes of an outcome, the only cause of the exposure is the randomization process itself (we hope!). In many non-randomized settings, however, the structure of your question can be a complex web of causality. Causal diagrams help communicate what we think this structure looks like. -In addition to being open about what we think the causal structure is, causal diagrams have incredible mathematical properties that allow us to identify a way to estimate unbiased causal effects even with observational data. In other words, if correctly specified, they help us achieve exchangeability. +In addition to being open about what we think the causal structure is, causal diagrams have incredible mathematical properties that allow us to identify a way to estimate unbiased causal effects even with observational data. +In other words, if correctly specified, they help us achieve exchangeability. Causal diagrams are also increasingly common. Data collected as a review of causal diagrams in applied health research papers show a drastic increase in use over time [@Tennant2021]. @@ -27,6 +30,7 @@ Data collected as a review of causal diagrams in applied health research papers #| echo: false #| warning: false #| fig-cap: "Percentage of health research papers using causal diagrams over time." + dag_data <- readxl::read_xlsx(here::here("data", "dag_data.xlsx")) dag_data <- dag_data |> @@ -57,7 +61,7 @@ dag_data |> mutate(pct = n / sum(n)) |> filter(used_dag, year < max(year)) |> ggplot(aes(year, pct)) + - geom_line(color = "#0072B2", linewidth = .9) + + geom_line(color = "#0072B2", linewidth = 0.9) + scale_y_continuous( name = "percent of papers", labels = scales::label_percent() @@ -87,11 +91,12 @@ Here, we are saying that `x` causes `y`. #| fig-width: 3 #| fig-height: 2 #| fig-cap: "A causal directed acyclic graph (DAG). DAGs depict causal relationships. In this DAG, the assumption is that `x` causes `y`." + library(ggdag) dagify(y ~ x, coords = time_ordered_coords()) |> ggdag() + theme_dag() + - expand_plot(expand_x = expansion(c(.2, .2))) + expand_plot(expand_x = expansion(c(0.2, 0.2))) ``` If we're interested in the causal effect of `x` on `y`, we're trying to estimate a numeric representation of that arrow. @@ -103,6 +108,7 @@ There are three types of paths you'll see in DAGs: **forks**, **chains**, and ** #| code-fold: true #| label: fig-dag-path-types #| fig-cap: "Three types of causal relationships: forks, chains, and colliders. The direction of the arrows and the relationships of interest dictate which type of path a series of variables is. Forks represent a mutual cause, chains represent direct causes, and colliders represent a mutual descendant." + coords <- list(x = c(x = 0, y = 2, q = 1), y = c(x = 0, y = 0, q = 1)) fork <- dagify( @@ -128,7 +134,10 @@ collider <- dagify( coords = coords ) -dag_flows <- map(list(fork = fork, chain = chain, collider = collider), tidy_dagitty) |> +dag_flows <- map( + list(fork = fork, chain = chain, collider = collider), + tidy_dagitty +) |> map("data") |> list_rbind(names_to = "dag") |> mutate(dag = factor(dag, levels = c("fork", "chain", "collider"))) @@ -175,9 +184,9 @@ This question defines which paths we're interested in and which we're not. These three types of paths have different implications for the statistical relationship between `x` and `y`. If we only look at the correlation between the two variables under these assumptions: -1. In the fork, `x` and `y` will be associated, despite there being no arrow from `x` to `y`. -2. In the chain, `x` and `y` are related only through `q`. -3. In the collider, `x` and `y` will *not* be related. +1. In the fork, `x` and `y` will be associated, despite there being no arrow from `x` to `y`. +2. In the chain, `x` and `y` are related only through `q`. +3. In the collider, `x` and `y` will *not* be related. Paths that transmit association are called **open paths**. Paths that do not transmit association are called **closed paths**. @@ -209,12 +218,13 @@ Within levels of `q`, however, `x` and `y` are unrelated. #| label: fig-confounder-scatter #| message: false #| fig-cap: "Two scatterplots of the relationship between `x` and `y`. With forks, the relationship is biased by `q`. When accounting for `q`, we see the true null relationship." + set.seed(123) library(patchwork) n <- 1000 ### q -q <- rbinom(n, size = 1, prob = .35) +q <- rbinom(n, size = 1, prob = 0.35) ### ### x @@ -229,13 +239,13 @@ confounder_data <- tibble(x, y, q = as.factor(q)) p1 <- confounder_data |> ggplot(aes(x, y)) + - geom_point(alpha = .2) + + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE, color = "black") + facet_wrap(~"not adjusting for `q`\n(biased)") p2 <- confounder_data |> ggplot(aes(x, y, color = q)) + - geom_point(alpha = .2) + + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE) + facet_wrap(~"adjusting for `q`\n(unbiased)") @@ -261,6 +271,7 @@ Neither of these effects is due to bias, but each answers a different research q #| label: fig-mediator-scatter #| message: false #| fig-cap: "Two scatterplots of the relationship between `x` and `y`. With chains, whether and how we should account for `q` depends on the research question. Without doing so, we see the impact of the total effect of `x` and `y`, including the indirect effect via `q`. When accounting for `q`, we see the direct (null) effect of `x` on `y`." + ### x x <- rnorm(n) ### @@ -279,13 +290,13 @@ mediator_data <- tibble(x, y, q = as.factor(q)) p1 <- mediator_data |> ggplot(aes(x, y)) + - geom_point(alpha = .2) + + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE, color = "black") + facet_wrap(~"not adjusting for `q`\n(total effect)") p2 <- mediator_data |> ggplot(aes(x, y, color = q)) + - geom_point(alpha = .2) + + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE) + facet_wrap(~"adjusting for `q`\n(direct effect)") @@ -310,7 +321,7 @@ We end up with a biased effect of `x` on `y`. #| label: fig-collider-scatter #| message: false #| fig-cap: "Two scatterplots of the relationship between `x` and `y`. The unadjusted relationship between the two is unbiased. When accounting for `q`, we open a colliding backdoor path and bias the relationship between `x` and `y`." -#| + ### x x <- rnorm(n) ### @@ -329,13 +340,13 @@ collider_data <- tibble(x, y, q = as.factor(q)) p1 <- collider_data |> ggplot(aes(x, y)) + - geom_point(alpha = .2) + + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE, color = "black") + facet_wrap(~"not adjusting for `q`\n(unbiased)") p2 <- collider_data |> ggplot(aes(x, y, color = q)) + - geom_point(alpha = .2) + + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE) + facet_wrap(~"adjusting for `q`\n(biased)") @@ -376,8 +387,7 @@ t2 <- collider |> collider_t$data <- bind_rows(collider_t$data, t2) collider_t |> - mutate(deemphasize = (name %in% c("x", "y") & time == - "time point 2")) |> + mutate(deemphasize = (name %in% c("x", "y") & time == "time point 2")) |> ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + geom_dag_edges(edge_width = 1, edge_color = "grey85") + geom_dag_point(aes(color = deemphasize), show.legend = FALSE) + @@ -449,6 +459,7 @@ We can diagram this using the method described in @sec-diag (@fig-diagram-podcas #| fig-cap: "A sentence diagram for the question: Does listening to a comedy podcast the morning before an exam improve graduate student test scores? The population is graduate students. The start time is morning, and the outcome time is after the exam." #| fig-height: 2 #| label: fig-diagram-podcast + knitr::include_graphics("../images/podcast-diagram.png") ``` @@ -461,6 +472,7 @@ This is just like the type of formula we specify for most regression models in R ```{r} #| eval: false + dagify( effect1 ~ cause1 + cause2 + cause3, effect2 ~ cause1 + cause4, @@ -482,17 +494,20 @@ dagify( In the code above, we assume that: -- a graduate student's mood, sense of humor, and how prepared they feel for the exam could influence whether they listened to a podcast the morning of the test -- their mood and how prepared they are also influence their exam score +- a graduate student's mood, sense of humor, and how prepared they feel for the exam could influence whether they listened to a podcast the morning of the test +- their mood and how prepared they are also influence their exam score Notice we *do not* see podcast in the exam equation; this means that we assume that there is *no* causal relationship between podcast and the exam score. There are some other useful arguments you'll often find yourself supplying to `dagify()`: -- `exposure` and `outcome`: Telling ggdag the variables that are the exposure and outcome of your research question is required for many of the most valuable queries we can make of DAGs. -- `latent`: This argument lets us tell ggdag that some variables in the DAG are unmeasured. `latent` helps identify valid adjustment sets with the data we actually have. -- `coords`: Coordinates for the variables. You can choose between algorithmic or manual layouts, as discussed below. We'll use `time_ordered_coords()` here. -- `labels`: A character vector of labels for the variables. +- `exposure` and `outcome`: Telling ggdag the variables that are the exposure and outcome of your research question is required for many of the most valuable queries we can make of DAGs. +- `latent`: This argument lets us tell ggdag that some variables in the DAG are unmeasured. + `latent` helps identify valid adjustment sets with the data we actually have. +- `coords`: Coordinates for the variables. + You can choose between algorithmic or manual layouts, as discussed below. + We'll use `time_ordered_coords()` here. +- `labels`: A character vector of labels for the variables. Let's create a DAG object, `podcast_dag`, with some of these attributes, then visualize the DAG with `ggdag()`. `ggdag()` returns a ggplot object, so we can add additional layers to the plot, like themes. @@ -503,6 +518,7 @@ Let's create a DAG object, `podcast_dag`, with some of these attributes, then vi #| fig-width: 4 #| fig-height: 4 #| warning: false + podcast_dag <- dagify( podcast ~ mood + humor + prepared, exam ~ mood + prepared, @@ -550,13 +566,13 @@ theme_set( You don't need to specify coordinates to ggdag. If you don't, it uses algorithms designed for automatic layouts. -There are many such algorithms, and they focus on different aspects of the layout, e.g., the shape, the space between the nodes, minimizing how many edges cross, etc. -These layout algorithms usually have a component of randomness, so it's good to use a seed if you want to get the same result. +There are many such algorithms, and they focus on different aspects of the layout, e.g., the shape, the space between the nodes, minimizing how many edges cross, etc. These layout algorithms usually have a component of randomness, so it's good to use a seed if you want to get the same result. ```{r} #| fig-width: 4 #| fig-height: 4 #| fig-align: center + # no coordinates specified set.seed(123) pod_dag <- dagify( @@ -575,6 +591,7 @@ We can also ask for a specific layout, e.g., the popular Sugiyama algorithm for #| fig-width: 4 #| fig-height: 4 #| fig-align: center + pod_dag |> ggdag(layout = "sugiyama", text_size = 2.8) ``` @@ -591,6 +608,7 @@ We know that's not the case: listening to the podcast happened before taking the #| fig-width: 4 #| fig-height: 4 #| fig-align: center + pod_dag |> ggdag(layout = "time_ordered", text_size = 2.8) ``` @@ -607,11 +625,14 @@ We've specified the DAG for this question and told ggdag what the exposure and o According to the DAG, there is no direct causal relationship between listening to a podcast and exam scores. Are there any other open paths? `ggdag_paths()` takes a DAG and visualizes the open paths. -In @fig-paths-podcast, we see two open paths: `podcast <- mood -> exam"` and `podcast <- prepared -> exam`. These are both forks---*confounding pathways*. Since there is no causal relationship between listening to a podcast and exam scores, the only open paths are *backdoor* paths, these two confounding pathways. +In @fig-paths-podcast, we see two open paths: `podcast <- mood -> exam"` and `podcast <- prepared -> exam`. +These are both forks---*confounding pathways*. +Since there is no causal relationship between listening to a podcast and exam scores, the only open paths are *backdoor* paths, these two confounding pathways. ```{r} #| label: fig-paths-podcast #| fig-cap: "`ggdag_paths()` visualizes open paths in a DAG. There are two open paths in `podcast_dag`: the fork from `mood` and the fork from `prepared`." + podcast_dag |> # show the whole dag as a light gray "shadow" # rather than just the paths @@ -661,6 +682,7 @@ Any arrows coming out of adjusted variables are removed from the DAG because the #| fig-height: 4 #| fig-align: center #| fig-cap: "A visualization of the minimal adjustment set for the podcast-exam DAG. If this DAG is correct, two variables are required to block the backdoor paths: `mood` and `prepared`." + ggdag_adjustment_set( podcast_dag, use_text = FALSE, @@ -685,6 +707,7 @@ Here's a condensed version of what `ggdag_adjustment_set()` is doing: #| fig-width: 4.5 #| fig-height: 4.5 #| fig-align: center + podcast_dag_tidy |> # add adjustment sets to data dag_adjustment_sets() |> @@ -718,6 +741,7 @@ Full adjustment sets are every combination of variables that result in a valid s #| fig-height: 5 #| fig-align: center #| fig-cap: "All valid adjustment sets for `podcast_dag`." + ggdag_adjustment_set( podcast_dag, use_text = FALSE, @@ -760,6 +784,7 @@ In contrast, the estimate that adjusted for the two variables as suggested by `g #| label: fig-dag-sim #| fig-cap: "Forest plot of simulated data based on the DAG described in @fig-dag-podcast." #| code-fold: true + ## Model that does not close backdoor paths library(broom) unadjusted_model <- lm(exam ~ podcast, sim_data) |> @@ -797,6 +822,7 @@ Let's say that, not knowing the true DAG (@fig-dag-podcast), we drew @fig-dag-po #| fig-height: 4 #| warning: false #| code-fold: true + podcast_dag_wrong <- dagify( podcast ~ humor + prepared, exam ~ prepared, @@ -831,6 +857,7 @@ Now, neither estimate is right. #| label: fig-dag-sim-wrong #| fig-cap: "Forest plot of simulated data based on the DAG described in @fig-dag-podcast. However, we've analyzed it using the adjustment set from @fig-dag-podcast-wrong, giving us the wrong answer." #| code-fold: true + ## Model that does not close backdoor paths library(broom) unadjusted_model <- lm(exam ~ podcast, sim_data) |> @@ -877,6 +904,7 @@ Let's take a look at @fig-podcast_dag2. #| fig-width: 5 #| fig-height: 4 #| fig-cap: "An expanded version of `podcast_dag` that includes two additional variables: `skills_course`, representing a College Skills Course, and `alertness`." + podcast_dag2 <- dagify( podcast ~ mood + humor + skills_course, alertness ~ mood, @@ -902,9 +930,15 @@ ggdag(podcast_dag2, use_labels = TRUE, use_text = FALSE) ```{r} #| echo: false + paths <- paths(podcast_dag2) -open_paths <- glue::glue_collapse(glue::glue("`{paths$paths[paths$open]}`"), sep = ", ", last = ", and") -adj_sets <- unclass(adjustmentSets(podcast_dag2)) |> map_chr(\(.x) glue::glue('{unlist(glue::glue_collapse(.x, sep = " + "))}')) +open_paths <- glue::glue_collapse( + glue::glue("`{paths$paths[paths$open]}`"), + sep = ", ", + last = ", and" +) +adj_sets <- unclass(adjustmentSets(podcast_dag2)) |> + map_chr(\(.x) glue::glue('{unlist(glue::glue_collapse(.x, sep = " + "))}')) adj_sets <- glue::glue("`{adj_sets}`") ``` @@ -915,6 +949,7 @@ Now there are *three* backdoor paths we need to close: `r open_paths`. #| fig-width: 11 #| fig-height: 4.5 #| fig-cap: "Three open paths in `podcast_dag2`. Since there is no effect of `podcast` on `exam`, all three are backdoor paths that must be closed to get the correct effect." + ggdag_paths(podcast_dag2, use_labels = TRUE, use_text = FALSE, shadow = TRUE) ``` @@ -930,6 +965,7 @@ Notably, `prepared` and `alertness` could happen at the same time or even after #| fig-width: 7 #| fig-height: 8 #| fig-cap: "Valid minimal adjustment sets that will close the backdoor paths in @fig-podcast_dag2-paths." + ggdag_adjustment_set(podcast_dag2, use_labels = TRUE, use_text = FALSE) ``` @@ -956,6 +992,7 @@ The true result of `exam` is missing for those who didn't show up; by studying t #| fig-width: 4.5 #| fig-height: 3.5 #| fig-cap: "Another variant of `podcast_dag`, this time including the inherent stratification on those who appear for the exam. There is still no direct effect of `podcast` on `exam`, but there is an indirect effect via `showed_up`." + podcast_dag3 <- dagify( podcast ~ mood + humor + prepared, exam ~ mood + prepared + showed_up, @@ -995,6 +1032,7 @@ Unfortunately, we cannot calculate the total effect of `podcast` on `exam` becau #| fig-height: 4 #| warning: false #| fig-cap: "The adjustment set for `podcast_dag3` given that the data are inherently conditioned on showing up to the exam. In this case, there is no way to recover an unbiased estimate of the total effect of `podcast` on `exam`." + podcast_dag3 |> adjust_for("showed_up") |> ggdag_adjustment_set(use_text = FALSE, use_labels = TRUE) @@ -1009,6 +1047,7 @@ We can't calculate the total effect because we are missing the indirect effect, #| fig-height: 4 #| warning: false #| fig-cap: "The adjustment set for `podcast_dag3` when targeting a different effect. There is one minimal adjustment set that we can use to estimate the direct effect of `podcast` on `exam`." + podcast_dag3 |> adjust_for("showed_up") |> ggdag_adjustment_set(effect = "direct", use_text = FALSE, use_labels = TRUE) @@ -1024,6 +1063,7 @@ It's called M-bias because it looks like an M when arranged top to bottom. #| fig-width: 4 #| fig-height: 4 #| fig-cap: "A DAG representing M-Bias, a situation where a collider predates the exposure and outcome." + m_bias() |> ggdag() ``` @@ -1050,6 +1090,7 @@ As above, there are no open paths in this subset of the DAG. #| fig-width: 5.5 #| fig-height: 4 #| fig-cap: "A reconfiguration of @fig-dag-podcast where `mood` is a collider on an M-shaped path." + podcast_dag4 <- dagify( podcast ~ u1, exam ~ u2, @@ -1088,6 +1129,7 @@ There is no way to close this open path. #| fig-width: 5.5 #| fig-height: 4.5 #| fig-cap: "The adjustment set where `mood` is a collider. If we control for `mood` and don't know about or have the unmeasured causes of `mood`, we have no means of closing the backdoor path opened by adjusting for a collider." + podcast_dag4 |> adjust_for("mood") |> ggdag_adjustment_set(use_labels = TRUE, use_text = FALSE) @@ -1106,6 +1148,7 @@ This arrangement is sometimes called butterfly or bowtie bias, again because of #| fig-width: 5 #| fig-height: 4 #| fig-cap: "In butterfly bias, `mood` is both a collider and a confounder. Controlling for the bias induced by `mood` opens a new pathway because we've also conditioned on a collider. We can't properly close all backdoor paths without either `u1` or `u2`." + butterfly_bias(x = "podcast", y = "exam", m = "mood", a = "u1", b = "u2") |> ggdag(use_text = FALSE, use_labels = TRUE) ``` @@ -1128,6 +1171,7 @@ Let's add a variable, `grader_mood`, to the original DAG. #| fig-width: 5 #| fig-height: 4 #| fig-cap: "A DAG containing a cause of the exposure that is not the cause of the outcome (`humor`) and a cause of the outcome that is not a cause of the exposure (`grader_mood`)." + podcast_dag5 <- dagify( podcast ~ mood + humor + prepared, exam ~ mood + prepared + grader_mood, @@ -1194,12 +1238,12 @@ An example of when this can occur is a case-control study of cancer. Someone *with* cancer may be more motivated to ruminate on their past exposures than someone *without* cancer. So, their memory about a given exposure may be more refined than someone without. - ```{r} #| label: fig-error_dag #| fig-width: 5.5 #| fig-height: 4 #| fig-cap: "A DAG representing measurement error in observing the exposure and outcome. In this case, the outcome impacts the participant's memory of the exposure, also known as recall bias." + error_dag <- dagify( exposure_observed ~ exposure_real + exposure_error, outcome_observed ~ outcome_real + outcome_error, @@ -1247,6 +1291,7 @@ We'll discuss estimands in detail in [Chapter -@sec-estimands]. #| echo: false #| message: false #| tbl-cap: "A table of DAG properties in applied health research. Number of nodes and arcs are the median number of variables and arrows in the analyzed DAGs, while the Node to Arc ratio is their ratio. Saturation proportion is the proportion of all possible arrows going forward in time to other included variables. Fully saturated DAGs are those that include all such arrows. The researchers also analyzed whether studies reported their estimands and adjustment sets." + library(gtsummary) library(gt) dag_data_used <- dag_data |> @@ -1262,7 +1307,15 @@ dag_data_used |> report_adjset = report_adjset == "Yes", across(c(saturated, reported_estimand, report_adjset), as_yes_no) ) |> - select(nodes, arcs, ratio, saturation, saturated, reported_estimand, report_adjset) |> + select( + nodes, + arcs, + ratio, + saturation, + saturated, + reported_estimand, + report_adjset + ) |> tbl_summary( label = list( nodes ~ "Number of Nodes", @@ -1346,12 +1399,18 @@ It's tempting to visualize that relationship like this: #| fig-width: 4.5 #| fig-height: 3.5 #| fig-cap: "A conceptual diagram representing the reciprocal relationship between A/C use and global temperature because of global warming. Feedback loops are useful mental shorthands to describe variables that impact each other over time compactly, but they are not true causal diagrams." + dagify( ac_use ~ global_temp, global_temp ~ ac_use, labels = c(ac_use = "A/C use", global_temp = "Global\ntemperature") ) |> - ggdag(layout = "circle", edge_type = "arc", use_text = FALSE, use_labels = TRUE) + ggdag( + layout = "circle", + edge_type = "arc", + use_text = FALSE, + use_labels = TRUE + ) ``` From a DAG perspective, this is a problem because of the *A* part of *DAG*: it's cyclic! @@ -1366,6 +1425,7 @@ The real DAG looks something like this: #| fig-width: 5 #| fig-height: 3.5 #| fig-cap: "A DAG showing the relationship between A/C use and global temperature over time. The true causal relationship in a feedback loop goes *forward*." + dagify( global_temp_2000 ~ ac_use_1990 + global_temp_1990, ac_use_2000 ~ ac_use_1990 + global_temp_1990, @@ -1481,6 +1541,7 @@ From the DAG, it would appear that the entire design is invalid! #| fig-width: 4.5 #| fig-height: 3.2 #| fig-cap: "A DAG representing a matched case-control study. In such a study, selection is determined by outcome status and any matched confounders. Selection into the study is thus a collider. Since it is inherently stratified on who is actually in the study, such data are limited in the types of causal effects they can estimate." + dagify( outcome ~ confounder + exposure, selection ~ outcome + confounder, @@ -1551,6 +1612,7 @@ You can think of `p` as a mismeasured version of `q`; it will seldom wholly cont #| fig-width: 4.5 #| fig-height: 3.2 #| fig-cap: "A DAG with a confounder, `q`, and a proxy confounder, `p`. The true adjustment set is `q`. Since `p` causes `q`, it contains information about `q` and can reduce the bias if we don't have `q` measured." + dagify( y ~ x + q, x ~ q, @@ -1577,9 +1639,11 @@ In practice, such a case is probably not relevant. There is *effectively* no arrow. ::: {.callout-note} -Put another way, the assumption is saying that at least one unit under study has an individual causal effect. If we have a variable `X` that has an arrow leading to `Y`, we are saying that for at least one person, there is a change in the potential outcome `Y(X = x)` for the values of `X` under study. In terms of @tbl-po, `y_chocolate - y_vanilla` would be 0 for every individual. +Put another way, the assumption is saying that at least one unit under study has an individual causal effect. +If we have a variable `X` that has an arrow leading to `Y`, we are saying that for at least one person, there is a change in the potential outcome `Y(X = x)` for the values of `X` under study. +In terms of @tbl-po, `y_chocolate - y_vanilla` would be 0 for every individual. -When not a single unit under study has an individual causal effect, we call it a **sharp null**. +When not a single unit under study has an individual causal effect, we call it a **sharp null**. ::: The more significant point, though, is that you should feel confident to add an arrow. @@ -1599,6 +1663,7 @@ The saturated DAG given this ordering is: #| fig-height: 4 #| code-fold: true #| fig-cap: "A saturated version of `podcast_dag`: variables have all possible arrows going forward to other variables over time." + podcast_dag_sat <- dagify( podcast ~ mood + humor + prepared, exam ~ mood + prepared + humor, @@ -1624,7 +1689,7 @@ podcast_dag_sat <- dagify( ) curvatures <- rep(0, 8) -curvatures[1] <- .25 +curvatures[1] <- 0.25 podcast_dag_sat |> tidy_dagitty() |> @@ -1649,6 +1714,7 @@ Let's prune those two. #| fig-width: 5.5 #| fig-height: 4 #| fig-cap: "A pruned version of @fig-podcast_dag_sat: we've removed implausible arrows from the fully saturated DAGs." + podcast_dag_pruned <- dagify( podcast ~ mood + humor + prepared, exam ~ mood + prepared, @@ -1728,9 +1794,18 @@ Finally, we recommend checking your DAG for robustness. You can never verify the correctness of your DAG under most conditions, but you can use the implications in your DAG to support it. Three types of robustness checks can be helpful depending on the circumstances. -1. **Negative controls** [@Lipsitch2010]. These come in two flavors: negative exposure controls and negative outcome controls. The idea is to find something associated with one but not the other, e.g., the outcome but not the exposure, so there should be no effect. Since there should be no effect, you now have a measurement for how well you control for *other* effects (e.g., the difference from null). Ideally, the confounders for negative controls are similar to the research question. -2. **DAG-data consistency** [@Textor2016]. Negative controls are an implication of your DAG. An extension of this idea is that there are *many* such implications. Because blocking a path removes statistical dependencies from that path, you can check those assumptions in several places in your DAG. -3. **Alternate adjustment sets**. Adjustment sets should give roughly the same answer because, outside of random and measurement errors, they are all sets that block backdoor paths. If more than one adjustment set seems reasonable, you can use that as a sensitivity analysis by checking multiple models. +1. **Negative controls** [@Lipsitch2010]. + These come in two flavors: negative exposure controls and negative outcome controls. + The idea is to find something associated with one but not the other, e.g., the outcome but not the exposure, so there should be no effect. + Since there should be no effect, you now have a measurement for how well you control for *other* effects (e.g., the difference from null). + Ideally, the confounders for negative controls are similar to the research question. +2. **DAG-data consistency** [@Textor2016]. + Negative controls are an implication of your DAG. + An extension of this idea is that there are *many* such implications. + Because blocking a path removes statistical dependencies from that path, you can check those assumptions in several places in your DAG. +3. **Alternate adjustment sets**. + Adjustment sets should give roughly the same answer because, outside of random and measurement errors, they are all sets that block backdoor paths. + If more than one adjustment set seems reasonable, you can use that as a sensitivity analysis by checking multiple models. We'll discuss these in detail in [Chapter -@sec-sensitivity]. The caveat here is that these should be complementary to your initial DAG, not a way of *replacing* it. diff --git a/chapters/05-not-just-a-stats-problem.qmd b/chapters/05-not-just-a-stats-problem.qmd index 1f72893..2a635d7 100644 --- a/chapters/05-not-just-a-stats-problem.qmd +++ b/chapters/05-not-just-a-stats-problem.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("complete") ``` @@ -23,6 +24,7 @@ In the plots in @fig-anscombe, each data set has remarkably similar summary stat #| label: fig-anscombe #| message: false #| fig-cap: "Anscombe's Quartet, a set of four datasets with nearly identical summary statistics. Anscombe's point was that one must visualize the data to understand it." + library(quartets) anscombe_quartet |> @@ -49,6 +51,7 @@ datasaurus_dozen |> #| message: false #| fig-cap: "The Datasaurus Dozen, a set of datasets with nearly identical summary statistics. The Datasaurus Dozen is a modern version of Anscombe's Quartet. It's actually a baker's dozen, but who's counting?" #| fig-height: 8 + datasaurus_dozen |> ggplot(aes(x, y)) + geom_point() + @@ -67,6 +70,7 @@ The difference is the causal structure that generated each dataset. #| label: fig-causal_quartet_hidden #| message: false #| fig-cap: "The Causal Quartet, four data sets with nearly identical summary statistics and visualizations. The causal structure of each dataset is different, and data alone cannot tell us which is which." + causal_quartet |> # hide the dataset names mutate(dataset = as.integer(factor(dataset))) |> @@ -91,6 +95,7 @@ Likewise, the correlation between `exposure` and `covariate` is no help: they're #| label: tbl-quartet_lm #| code-fold: true #| tbl-cap: "The causal quartet, with the estimated effect of `exposure` on `outcome` with and without adjustment for `covariate`. The unadjusted estimate is identical for all four datasets, as is the correlation between `exposure` and `covariate`. The adjusted estimate varies. Without background knowledge, it's not clear which is right." + library(gt) effects <- causal_quartet |> nest_by(dataset = as.integer(factor(dataset))) |> @@ -126,6 +131,7 @@ Even the reverse technique, *excluding* a variable when it's *less* than ten per #| label: tbl-quartet_ten_percent #| code-fold: true #| tbl-cap: "The percent change in the coefficient for `exposure` when including `covariate` in the model." + effects |> mutate(percent_change = scales::percent((ate_x - ate_xz) / ate_x)) |> select(dataset, percent_change) |> @@ -152,6 +158,7 @@ causal_quartet |> #| label: fig-causal_quartet_covariate #| message: false #| fig-cap: "The scaled relationship between `exposure` and `covariate`. We still do not have enough information to determine whether `covariate` is a confounder, mediator, or collider." + causal_quartet |> # hide the dataset names mutate(dataset = as.integer(factor(dataset))) |> @@ -180,6 +187,7 @@ In fact, it's a mathematical artifact of the data generating process. #| label: fig-causal_quartet_covariate_unscaled #| message: false #| fig-cap: "@fig-causal_quartet_covariate, unscaled" + causal_quartet |> # hide the dataset names mutate(dataset = as.integer(factor(dataset))) |> @@ -200,6 +208,7 @@ In 3, it's a mediator (it depends on the research question). #| label: fig-causal_quartet #| message: false #| fig-cap: "The Causal Quartet, revealed. The first and last datasets are types of collider bias; we should *not* control for `covariate.` In the second dataset, `covariate` is a confounder, and we *should* control for it. In the third dataset, `covariate` is a mediator, and we should control for it if we want the direct effect, but not if we want the total effect." + causal_quartet |> ggplot(aes(exposure, outcome)) + geom_point() + @@ -226,6 +235,7 @@ Once we compile a DAG for each dataset, we only need to query the DAG for the co #| - "The DAG for dataset 2, where `covariate` (c) is a confounder. `covariate` is a mutual cause of `exposure` (e) and `outcome` (o), representing a backdoor path, so we *must* adjust for it to get the right answer." #| - "The DAG for dataset 3, where `covariate` (c) is a mediator. `covariate` is a descendant of `exposure` (e) and a cause of `outcome` (o). The path through `covariate` is the indirect path, and the path through `exposure` is the direct path. We should adjust for `covariate` if we want the direct effect, but not if we want the total effect." #| - "The DAG for dataset 4, where `covariate` (c) is a collider via M-Bias. Although `covariate` happens before both `outcome` (o) and `exposure` (e), it's still a collider. We should *not* adjust for `covariate`, particularly since we can't control for the bias via `u1` and `u2`, which are unmeasured." + library(ggdag) d_coll <- quartet_collider(x = "e", y = "o", z = "c") @@ -249,11 +259,13 @@ p_coll <- d_coll |> coord_cartesian(clip = "off") + theme(legend.position = "bottom") + ggtitle("(1) Collider") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate", na.value = "grey70") @@ -270,11 +282,13 @@ p_conf <- d_conf |> coord_cartesian(clip = "off") + theme(legend.position = "bottom") + ggtitle("(2) Confounder") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate", na.value = "grey70") p_med <- d_med |> @@ -290,11 +304,13 @@ p_med <- d_med |> coord_cartesian(clip = "off") + theme(legend.position = "bottom") + ggtitle("(3) Mediator") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate", na.value = "grey70") @@ -311,11 +327,13 @@ p_m_bias <- d_mbias |> coord_cartesian(clip = "off") + ggtitle("(4) M-bias") + theme(legend.position = "bottom") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate", na.value = "grey70") @@ -334,12 +352,13 @@ For dataset 3, it depends on which mediation effect we want: adjusted for the di #| label: tbl-quartets_true_effects #| echo: false #| tbl-cap: "The data generating mechanism and true causal effects in each dataset. Sometimes, the unadjusted effect is the same, and sometimes it is not, depending on the mechanism and question." + tibble::tribble( - ~`Data generating mechanism`, ~`Correct causal model`, ~`Correct causal effect`, - "(1) Collider", "outcome ~ exposure", "1", - "(2) Confounder", "outcome ~ exposure; covariate", "0.5", - "(3) Mediator", "Direct effect: outcome ~ exposure; covariate, Total Effect: outcome ~ exposure", "Direct effect: 0, Total effect: 1", - "(4) M-Bias", "outcome ~ exposure", "1" + ~`Data generating mechanism` , ~`Correct causal model` , ~`Correct causal effect` , + "(1) Collider" , "outcome ~ exposure" , "1" , + "(2) Confounder" , "outcome ~ exposure; covariate" , "0.5" , + "(3) Mediator" , "Direct effect: outcome ~ exposure; covariate, Total Effect: outcome ~ exposure" , "Direct effect: 0, Total effect: 1" , + "(4) M-Bias" , "outcome ~ exposure" , "1" ) |> gt() ``` @@ -374,15 +393,25 @@ Only control for variables that precede the outcome. #| code-fold: true #| fig-width: 4 #| fig-height: 3.75 + d_coll <- quartet_time_collider( - x0 = "e0", x1 = "e1", x2 = "e2", x3 = "e3", - y1 = "o1", y2 = "o2", y3 = "o3", - z1 = "c1", z2 = "c2", z3 = "c3" + x0 = "e0", + x1 = "e1", + x2 = "e2", + x3 = "e3", + y1 = "o1", + y2 = "o2", + y3 = "o3", + z1 = "c1", + z2 = "c2", + z3 = "c3" ) d_coll |> tidy_dagitty() |> - mutate(covariate = if_else(label == "c3", "covariate\n(follow-up)", NA_character_)) |> + mutate( + covariate = if_else(label == "c3", "covariate\n(follow-up)", NA_character_) + ) |> ggplot( aes(x = x, y = y, xend = xend, yend = yend) ) + @@ -394,17 +423,27 @@ d_coll |> theme(legend.position = "bottom") + geom_vline(xintercept = c(2.6, 3.25, 3.6, 4.25), lty = 2, color = "grey60") + annotate("label", x = 2.925, y = 0.97, label = "baseline", color = "grey50") + - annotate("label", x = 3.925, y = 0.97, label = "follow-up", color = "grey50") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + annotate( + "label", + x = 3.925, + y = 0.97, + label = "follow-up", + color = "grey50" + ) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate\n(follow-up)", na.value = "grey70") d_coll |> tidy_dagitty() |> - mutate(covariate = if_else(label == "c2", "covariate\n(baseline)", NA_character_)) |> + mutate( + covariate = if_else(label == "c2", "covariate\n(baseline)", NA_character_) + ) |> ggplot( aes(x = x, y = y, xend = xend, yend = yend) ) + @@ -416,12 +455,20 @@ d_coll |> theme(legend.position = "bottom") + geom_vline(xintercept = c(2.6, 3.25, 3.6, 4.25), lty = 2, color = "grey60") + annotate("label", x = 2.925, y = 0.97, label = "baseline", color = "grey50") + - annotate("label", x = 3.925, y = 0.97, label = "follow-up", color = "grey50") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + annotate( + "label", + x = 3.925, + y = 0.97, + label = "follow-up", + color = "grey50" + ) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate\n(baseline)", na.value = "grey70") ``` @@ -443,16 +490,16 @@ Even though `covariate_baseline` is only in the adjustment set for the second da #| label: tbl-quartet_time_adjusted #| code-fold: true #| tbl-cap: "The adjusted effect of `exposure_baseline` on `outcome_followup` in each dataset. The effect adjusted for `covariate_baseline` is correct for three out of four datasets." + causal_quartet_time |> nest_by(dataset) |> mutate( - adjusted_effect = - coef( - lm( - outcome_followup ~ exposure_baseline + covariate_baseline, - data = data - ) - )[2] + adjusted_effect = coef( + lm( + outcome_followup ~ exposure_baseline + covariate_baseline, + data = data + ) + )[2] ) |> bind_cols(tibble(truth = c(1, 0.5, 1, 1))) |> select(-data, dataset) |> @@ -512,11 +559,10 @@ causal_quartet |> data, lm(outcome ~ exposure, data = data) ), - rmse2 = - get_rmse( - data, - lm(outcome ~ exposure + covariate, data = data) - ), + rmse2 = get_rmse( + data, + lm(outcome ~ exposure + covariate, data = data) + ), rmse_diff = rmse2 - rmse1, r_squared1 = get_r_squared(lm(outcome ~ exposure, data = data)), r_squared2 = get_r_squared(lm(outcome ~ exposure + covariate, data = data)), @@ -548,7 +594,9 @@ In @fig-quartet_confounder, we see that `covariate` is a confounder. If this DAG represents the complete causal structure for `outcome`, the model `outcome ~ exposure + covariate` will give an unbiased estimate of the effect on `outcome` for `exposure`, assuming we've met other assumptions of the modeling process. The adjustment set for `covariate`'s effect on `outcome` is empty, and `exposure` is not a collider, so controlling for it does not induce bias[^06-not-just-a-stats-problem-4]. But look again. -`exposure` is a mediator for `covariate`'s effect on `outcome`; some of the total effect is mediated through `outcome`, while there is also a direct effect of `covariate` on `outcome`. Both estimates are unbiased, but they are different *types* of estimates. The effect of `exposure` on `outcome` is the *total effect* of that relationship, while the effect of `covariate` on `outcome` is the *direct effect*. +`exposure` is a mediator for `covariate`'s effect on `outcome`; some of the total effect is mediated through `outcome`, while there is also a direct effect of `covariate` on `outcome`. +Both estimates are unbiased, but they are different *types* of estimates. +The effect of `exposure` on `outcome` is the *total effect* of that relationship, while the effect of `covariate` on `outcome` is the *direct effect*. [^06-not-just-a-stats-problem-4]: Additionally, OLS produces a *collapsible* effect. Other effects, like the odds and hazards ratios, are *non-collapsible*, meaning that the conditional odds or hazards ratio might differ from its marginal version, even when there is no confounding. @@ -560,6 +608,7 @@ But look again. #| fig-cap: "The DAG for dataset 2, where `covariate` is a confounder. If you look closely, you'll realize that, from the perspective of the effect of `covariate` on the `outcome`, `exposure` is a *mediator*." #| fig-width: 3 #| fig-height: 2.75 + p_conf + ggtitle(NULL) ``` @@ -578,6 +627,7 @@ Now, we have a situation where `covariate` not only answers a different type of #| fig-cap: "A modification of the DAG for dataset 2, where `covariate` is a confounder. Now, the relationship between `covariate` and `outcome` is confounded by `q`, a variable not necessary to calculate the unbiased effect of `exposure` on `outcome`." #| fig-width: 3.5 #| fig-height: 3 + coords <- list( x = c(X = 1.75, Z = 1, Y = 3, Q = 0), y = c(X = 1.1, Z = 1.5, Y = 1, Q = 1) @@ -605,11 +655,13 @@ p_conf2 <- d_conf2 |> theme_dag() + coord_cartesian(clip = "off") + theme(legend.position = "none") + - guides(color = guide_legend( - title = NULL, - keywidth = unit(1.4, "mm"), - override.aes = list(size = 3.4, shape = 15) - )) + + guides( + color = guide_legend( + title = NULL, + keywidth = unit(1.4, "mm"), + override.aes = list(size = 3.4, shape = 15) + ) + ) + scale_color_discrete(breaks = "covariate", na.value = "grey70") p_conf2 diff --git a/chapters/06-stats-models-ci.qmd b/chapters/06-stats-models-ci.qmd index 1a92aab..07a8cb9 100644 --- a/chapters/06-stats-models-ci.qmd +++ b/chapters/06-stats-models-ci.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("complete") ``` @@ -51,10 +52,14 @@ dag1 <- dagify( ) ggdag(dag1, use_text = FALSE, use_edges = FALSE) + - geom_dag_text(aes(label = label), nudge_y = c(-.05, -.05, -.05), color = "black") + + geom_dag_text( + aes(label = label), + nudge_y = c(-0.05, -0.05, -0.05), + color = "black" + ) + geom_dag_edges_arc(curvature = c(0.07, 0)) + theme_dag() + - ylim(c(.2, -.2)) + ylim(c(0.2, -0.2)) ``` Let's simulate some data that matches this data-generating process. @@ -88,8 +93,7 @@ satisfaction1 <- tibble( y1 = y0, # in practice, we will only see one of these # observed - satisfaction = (1 - update_frequency) * y0 + - update_frequency * y1, + satisfaction = (1 - update_frequency) * y0 + update_frequency * y1, observed_potential_outcome = case_when( update_frequency == 0 ~ "y0", update_frequency == 1 ~ "y1" @@ -118,6 +122,7 @@ Now, let's try to estimate the effect of the `update_frequency` on the `satisfac ```{r} #| message: false #| warning: false + satisfaction1 |> group_by(update_frequency) |> summarise(avg_satisfaction = mean(satisfaction)) @@ -132,6 +137,7 @@ In this case, there is only one such set: `customer_type`. ```{r} #| message: false #| warning: false + satisfaction_strat <- satisfaction1 |> group_by(customer_type, update_frequency) |> summarise( @@ -148,6 +154,7 @@ Now, we're much closer to the right answer: within levels of customer type, ther ```{r} #| message: false #| warning: false + satisfaction_strat_est <- satisfaction_strat |> pivot_wider( names_from = update_frequency, @@ -177,6 +184,7 @@ Some customers overlap well with the company's business hours, while some don't; #| code-fold: true #| warning: false #| fig-cap: "A causal diagram of the relationship between frequency of software updates and customer satisfaction. The frequency of updates does not cause customer satisfaction, and the relationship is confounded by their mutual causes, customer type, and business hours. The effect of business hours on customer satisfaction is entirely mediated by availability of customer service." + dag2 <- dagify( satisfaction ~ customer_service + customer_type, customer_service ~ business_hours, @@ -194,7 +202,7 @@ dag2 <- dagify( ggdag(dag2, use_text = FALSE) + geom_dag_text( aes(label = label), - nudge_y = c(-.35, -.35, .35, .35, .35), + nudge_y = c(-0.35, -0.35, 0.35, 0.35, 0.35), color = "black" ) + theme_dag() @@ -217,11 +225,9 @@ satisfaction2 <- tibble( # Weekly (0) vs Daily (1) update_frequency = rbinom(n, 1, p_exposure), # More likely during business hours - customer_service_prob = business_hours * 0.9 + - (1 - business_hours) * 0.2, + customer_service_prob = business_hours * 0.9 + (1 - business_hours) * 0.2, customer_service = rbinom(n, 1, prob = customer_service_prob), - satisfaction = 70 + 10 * customer_type + - 15 * customer_service + rnorm(n), + satisfaction = 70 + 10 * customer_type + 15 * customer_service + rnorm(n), ) |> mutate( satisfaction = as.numeric(scale(satisfaction)), @@ -253,6 +259,7 @@ Within combinations of `customer_type` and `business_hours`, the update frequenc ```{r} #| message: false #| warning: false + satisfaction2_strat <- satisfaction2 |> group_by(customer_type, business_hours, update_frequency) |> summarise( @@ -281,6 +288,7 @@ The answers are slightly different because of chance differences between the var ```{r} #| message: false #| warning: false + satisfaction2 |> group_by(customer_type, customer_service, update_frequency) |> summarise( @@ -322,10 +330,14 @@ dag3 <- dagify( ) ggdag(dag3, use_text = FALSE, use_edges = FALSE) + - geom_dag_text(aes(label = label), nudge_y = c(-.05, -.05, -.05), color = "black") + + geom_dag_text( + aes(label = label), + nudge_y = c(-0.05, -0.05, -0.05), + color = "black" + ) + geom_dag_edges_arc(curvature = c(0.07, 0)) + theme_dag() + - ylim(c(.2, -.2)) + ylim(c(0.2, -0.2)) ``` Organizations with more users get more updates and have slightly lower satisfaction scores. @@ -353,6 +365,7 @@ If we still want to use `group_by` and `summarize()`, we could bin the continuou ```{r} #| message: false #| warning: false + satisfaction3_strat <- satisfaction3 |> mutate(num_users_q = ntile(num_users, 5)) |> group_by(num_users_q, update_frequency) |> @@ -370,6 +383,7 @@ Let's get the overall mean: ```{r} #| message: false #| warning: false + satisfaction3_strat |> ungroup() |> pivot_wider( @@ -385,10 +399,13 @@ As opposed to binary and categorical confounders, grouping by bins for continuou ## What would happen if we change the number of bins? -Let's see what happens if we increase the number of bins. In the figure below, we've changed the number of bins from 5 in the example in the text to range from 3 to 20. Notice as we increase the number of bins the bias decreases. +Let's see what happens if we increase the number of bins. +In the figure below, we've changed the number of bins from 5 in the example in the text to range from 3 to 20. +Notice as we increase the number of bins the bias decreases. ```{r} #| code-fold: true + update_bins <- function(bins) { satisfaction3 |> mutate(num_users_q = ntile(num_users, bins)) |> @@ -434,7 +451,8 @@ satisfaction3 |> summarise(estimate = mean(daily - weekly)) ``` -As with many good things, however, there is a limit to the utility of increasing the number of bins. For example, let's see what happens if we try to have 30 bins. +As with many good things, however, there is a limit to the utility of increasing the number of bins. +For example, let's see what happens if we try to have 30 bins. ```{r} satisfaction3 |> @@ -452,7 +470,12 @@ satisfaction3 |> summarise(estimate = mean(daily - weekly)) ``` -The estimate is `NA` because some of our bins didn't have anyone in one of the exposure groups, making their difference inestimable. Now, this analysis violates our *positivity* assumption. This is a stochastic violation; it has to do with our sample size, `r scales::comma(n)`, and the number of bins, 30. By chance, we ended up with at least one of the 30 bins without anyone in one of the exposure groups, making our causal effect inestimable. This non-parametric method, while flexible, has limitations due to the sample size. Parametric models are useful because they allow us to extrapolate under certain assumptions, which makes them more efficient (assuming our assumptions are true, let's learn more in @sec-parametric). +The estimate is `NA` because some of our bins didn't have anyone in one of the exposure groups, making their difference inestimable. +Now, this analysis violates our *positivity* assumption. +This is a stochastic violation; it has to do with our sample size, `r scales::comma(n)`, and the number of bins, 30. +By chance, we ended up with at least one of the 30 bins without anyone in one of the exposure groups, making our causal effect inestimable. +This non-parametric method, while flexible, has limitations due to the sample size. +Parametric models are useful because they allow us to extrapolate under certain assumptions, which makes them more efficient (assuming our assumptions are true, let's learn more in @sec-parametric). ::: @@ -462,8 +485,6 @@ You can also think of it as a type of non-parametric approach. We aren't using any parameterization from a statistical model to restrict the form of the variables the way we might with, say, a linear regression. (This is only partially true for continuous confounders because it's not practical to stratify by all values of the continuous variable). - - Stratification can be powerful for simple problems or when you have lots of data because you can sometimes avoid model misspecification problems. However, with many confounders (especially continuous ones), we quickly encounter the curse of dimensionality, making it impractical because we have too few observations by combinations of confounder levels. @@ -509,7 +530,9 @@ Modeling this well requires an understanding of the nature of the relationship b ## Functional form in parametric models -In the text we simulated the relationship between the outcome and the confounders to be linear, that is, it exactly met the assumptions underlying `lm()`, so we got the right answer when we fit our parameteric model. What would happen if our simulation did not match the assumptions underlying `lm()`? Let's take a look. +In the text we simulated the relationship between the outcome and the confounders to be linear, that is, it exactly met the assumptions underlying `lm()`, so we got the right answer when we fit our parameteric model. +What would happen if our simulation did not match the assumptions underlying `lm()`? +Let's take a look. ```{r} set.seed(11) @@ -519,7 +542,7 @@ satisfaction4 <- tibble( # Larger customers more likely to have daily updates update_frequency = rbinom(n, 1, plogis(num_users / 100)), # non-linear relationship between satisfaction and number of users - satisfaction = 70 - 0.001 * (num_users-300)^2 - 0.001 * (num_users - 300)^3 + satisfaction = 70 - 0.001 * (num_users - 300)^2 - 0.001 * (num_users - 300)^3 ) |> mutate( satisfaction = as.numeric(scale(satisfaction)), @@ -532,7 +555,8 @@ ggplot(satisfaction4, aes(x = num_users, y = satisfaction)) + geom_line() ``` -In the figure above we see that now there is a non-linear relationship between our confounder, number of users, and our outcome, satisfaction. Let's see what happens if we fit an (incorrect) parameteric model to these data. +In the figure above we see that now there is a non-linear relationship between our confounder, number of users, and our outcome, satisfaction. +Let's see what happens if we fit an (incorrect) parameteric model to these data. ```{r} lm( @@ -545,7 +569,11 @@ lm( ``` -Our estimates are far from the truth (which should be zero); the truth is not even contained in the confidence interval. What went wrong? Our parametric model assumed that the functional form of the relationship between the number of users and satisfaction was linear, but we generated it non-linearly. There is a solution that still allows for the use of a parametric model; if we knew the true functional form, we could use that. Let's see how that looks. +Our estimates are far from the truth (which should be zero); the truth is not even contained in the confidence interval. +What went wrong? +Our parametric model assumed that the functional form of the relationship between the number of users and satisfaction was linear, but we generated it non-linearly. +There is a solution that still allows for the use of a parametric model; if we knew the true functional form, we could use that. +Let's see how that looks. ```{r} lm( @@ -557,7 +585,10 @@ lm( select(estimate, starts_with("conf")) ``` -Beautiful! Now, this model was fit *exactly* as the data were generated, and again, we ended up with the exact right answer. In the real world, we often do not know the data-generating mechanism, but we can still fit flexible parametric models. A great way to do this is through natural cubic splines. +Beautiful! +Now, this model was fit *exactly* as the data were generated, and again, we ended up with the exact right answer. +In the real world, we often do not know the data-generating mechanism, but we can still fit flexible parametric models. +A great way to do this is through natural cubic splines. ```{r} lm( @@ -599,7 +630,8 @@ Outcome regression can work very well when we meet the assumptions of the estima OLS, for instance, can be very beneficial if we understand the relationships between the outcome and the variables in the regression, and we believe the assumptions of the model, particularly linearity. It's very efficient, statistically (meaning we'll get a small standard error). We'll also get nominally correct confidence intervals without needing to bootstrap (@sec-appendix-bootstrap). -Scientists and other people who analyze data are also usually familiar with linear regression, making it easier for many to understand what you did to calculate the causal effect. In fact, you could say that when there is a linear relationship between the outcome and exposure, and we meet the causal assumptions laid out in @sec-assump, *correlation is causation*. +Scientists and other people who analyze data are also usually familiar with linear regression, making it easier for many to understand what you did to calculate the causal effect. +In fact, you could say that when there is a linear relationship between the outcome and exposure, and we meet the causal assumptions laid out in @sec-assump, *correlation is causation*. So, why don't we always use outcome models to calculate causal effects? First, we may be more confident in modeling the exposure instead of the outcome (as in inverse probability models, for instance). @@ -609,7 +641,11 @@ For example, using a propensity score method can be more statistically efficient Second, it can sometimes be challenging to get the estimate we are targeting with outcome models---an answer to a precise question---something we will probe more deeply in @sec-estimands. Relatedly, outcome models give us **conditional effects**. -In other words, when describing the estimated coefficient, we often say something like "a one-unit change in the exposure results in a `coefficient` change in the outcome *holding all other variables in the model constant*". In causal inference, we are often interested in **marginal effects**. Mathematically, this means that we want to average the effect of interest across the distribution of factors in a particular population for which we are trying to estimate a causal effect. In the case where the outcome is continuous, the effect is linear, and there are no interactions between the exposure effect and other factors about the population, the distinction between a conditional and a marginal effect is largely semantic. The estimates will be identical. +In other words, when describing the estimated coefficient, we often say something like "a one-unit change in the exposure results in a `coefficient` change in the outcome *holding all other variables in the model constant*". +In causal inference, we are often interested in **marginal effects**. +Mathematically, this means that we want to average the effect of interest across the distribution of factors in a particular population for which we are trying to estimate a causal effect. +In the case where the outcome is continuous, the effect is linear, and there are no interactions between the exposure effect and other factors about the population, the distinction between a conditional and a marginal effect is largely semantic. +The estimates will be identical. If there *is* an interaction in the model, that is, if the exposure has a different impact on the outcome depending on some other factor, we no longer have a single coefficient to interpret. We may want to estimate a marginal effect, taking into account the distribution of that factor in the population of interest. @@ -622,17 +658,17 @@ For free customers, daily updates *decrease* satisfaction by 5 points. The effect of changing the update frequency is heterogeneous, depending on customer type. Whether increasing the update frequency to daily for everyone is beneficial depends on the distribution of premium vs. free customers. -- If 50% of the customers are premium and 50% are free, the average effect of switching to daily updates would be: +- If 50% of the customers are premium and 50% are free, the average effect of switching to daily updates would be: - $(0.5 * 5) + (0.5 * -5) = 0$ + $(0.5 * 5) + (0.5 * -5) = 0$ -- If 100% of the customers are premium, the average effect would be: +- If 100% of the customers are premium, the average effect would be: - $(1 * 5) + (0 * -5) = 5$ + $(1 * 5) + (0 * -5) = 5$ -- If 100% of the customers are free, the average effect would be: +- If 100% of the customers are free, the average effect would be: - $(0 * 5) + (1 * -5) = -5$ + $(0 * 5) + (1 * -5) = -5$ Marginalization tells us the average effect for the distribution of covariates in the data. Of course, we might want to estimate the causal effect *by* customer type; we'll discuss interaction effects in depth in @sec-interaction. @@ -656,19 +692,28 @@ As we've seen, it's possible to do causal inference with simple methods like str For the rest of the book, however, we will focus on other causal methods, which allow us more flexibility in answering the questions we want to ask. Here's a brief summary of some of the unconfoundedness methods we'll cover and what they do. -- *Unconfoundedness methods* - - **Inverse probability weighting** (propensity score weighting): Using a propensity score (predicted probability of treatment), we reweight units to create a pseudo-population where exchangeability holds. Extends to time-varying treatments. - - **Matching** (propensity score matching and other methods): Find treated and untreated units with similar propensity scores (or other measures of similarity) to match, creating a subpopulation where exchangeability holds. - - **G-computation** (also called standardization or marginal effects): Fit an outcome model but marginalize to get a marginal effect estimate. Extends to time-varying treatments. - - **Doubly robust methods**: Fit models for both the outcome and treatment. Using doubly robust methods, only one of these models needs to be correct for the estimate to be correct. Doubly robust methods also allow us to use machine learning algorithms. We'll discuss **targeted learning (TMLE)** and **augmented propensity scores**. +- *Unconfoundedness methods* + - **Inverse probability weighting** (propensity score weighting): Using a propensity score (predicted probability of treatment), we reweight units to create a pseudo-population where exchangeability holds. + Extends to time-varying treatments. + - **Matching** (propensity score matching and other methods): Find treated and untreated units with similar propensity scores (or other measures of similarity) to match, creating a subpopulation where exchangeability holds. + - **G-computation** (also called standardization or marginal effects): Fit an outcome model but marginalize to get a marginal effect estimate. + Extends to time-varying treatments. + - **Doubly robust methods**: Fit models for both the outcome and treatment. + Using doubly robust methods, only one of these models needs to be correct for the estimate to be correct. + Doubly robust methods also allow us to use machine learning algorithms. + We'll discuss **targeted learning (TMLE)** and **augmented propensity scores**. While the book focuses primarily on unconfoundedness methods, we later cover methods that make other assumptions (@sec-iv-friends and @sec-did). Here's a brief summary of when we might want to explore these methods instead of trying to achieve exchangeability: -- **Instrumental variables**: There’s a variable (the instrument) that affects the treatment but does not directly affect the outcome except through the treatment. Because it's effectively random, we can use it to estimate a type of causal effect. -- **Regression discontinuity**: There’s a cutoff or threshold that determines who gets the treatment, and individuals just above or below the threshold are comparable. Regression discontinuity is closely related to instruments. -- **Difference-in-differences**: The treated and untreated groups would have followed the same trend over time in the absence of the treatment (they have *parallel trends*). If the two groups would have been identical sans treatment, we can use the untreated as a counterfactual for the treated. -- **Synthetic controls**: A weighted combination of untreated units can closely approximate the treated unit’s outcome without the treatment. Synthetic controls are closely related to difference-in-differences. +- **Instrumental variables**: There's a variable (the instrument) that affects the treatment but does not directly affect the outcome except through the treatment. + Because it's effectively random, we can use it to estimate a type of causal effect. +- **Regression discontinuity**: There's a cutoff or threshold that determines who gets the treatment, and individuals just above or below the threshold are comparable. + Regression discontinuity is closely related to instruments. +- **Difference-in-differences**: The treated and untreated groups would have followed the same trend over time in the absence of the treatment (they have *parallel trends*). + If the two groups would have been identical sans treatment, we can use the untreated as a counterfactual for the treated. +- **Synthetic controls**: A weighted combination of untreated units can closely approximate the treated unit's outcome without the treatment. + Synthetic controls are closely related to difference-in-differences. ### Causal methods in randomized trials {#sec-ci-rct} @@ -704,10 +749,10 @@ satisfaction_randomized <- tibble( update_frequency = rbinom(n, 1, 0.5), # More likely during business hours customer_service_prob = business_hours * - 0.9 + (1 - business_hours) * 0.2, + 0.9 + + (1 - business_hours) * 0.2, customer_service = rbinom(n, 1, prob = customer_service_prob), - satisfaction = 70 + 10 * customer_type + - 15 * customer_service + rnorm(n), + satisfaction = 70 + 10 * customer_type + 15 * customer_service + rnorm(n), ) |> mutate( satisfaction = as.numeric(scale(satisfaction)), @@ -732,25 +777,28 @@ satisfaction_randomized <- tibble( plot_estimates <- function(d) { unadj_model <- lm(satisfaction ~ update_frequency, data = d) |> tidy(conf.int = TRUE) |> - mutate(term = if_else( - term == "update_frequencydaily", - "update_frequency", - term - )) |> + mutate( + term = if_else( + term == "update_frequencydaily", + "update_frequency", + term + ) + ) |> filter(term == "update_frequency") |> mutate(model = "unadjusted") adj_model <- lm( - satisfaction ~ update_frequency + business_hours + - customer_type, + satisfaction ~ update_frequency + business_hours + customer_type, data = d ) |> tidy(conf.int = TRUE) |> - mutate(term = if_else( - term == "update_frequencydaily", - "update_frequency", - term - )) |> + mutate( + term = if_else( + term == "update_frequencydaily", + "update_frequency", + term + ) + ) |> filter(term == "update_frequency") |> mutate(model = "direct\nadjustment") @@ -778,14 +826,16 @@ plot_estimates <- function(d) { ) models <- bind_rows(unadj_model, adj_model, psw_model) |> - mutate(model = factor( - model, - levels = c( - "unadjusted", - "direct\nadjustment", - "inverse\nprobability\nweighting" + mutate( + model = factor( + model, + levels = c( + "unadjusted", + "direct\nadjustment", + "inverse\nprobability\nweighting" + ) ) - )) + ) models |> select(model, estimate, std.error, starts_with("conf")) |> diff --git a/chapters/07-prep-data.qmd b/chapters/07-prep-data.qmd index a030e34..b8bd524 100644 --- a/chapters/07-prep-data.qmd +++ b/chapters/07-prep-data.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("polishing") ``` @@ -80,6 +81,7 @@ Let's begin by diagramming this causal question (@fig-seven-diag). #| fig-cap: "Diagram of the causal question \"Is there a relationship between whether there were Extra Magic Hours in the morning at Magic Kingdom and the average posted wait time for the Seven Dwarfs Mine Train the same day between 9 AM and 10 AM in 2018?\"" #| label: fig-seven-diag #| warning: false + knitr::include_graphics(here::here("images/emm-diagram.png")) ``` @@ -102,11 +104,12 @@ For example's sake, though, we'll keep it simple. #| code-fold: true #| message: false #| warning: false -#| fig.cap: > +#| fig-cap: > #| Proposed DAG for the relationship between Extra Magic Hours #| in the morning at a particular park and the average posted wait #| time between 9 AM and 10 AM. #| Here, we believe 1) Extra Magic Hours impacts average wait time and 2) both Extra Magic Hours and average wait time are determined by the time the park closes, historic high temperatures, and ticket season. + library(ggdag) library(ggokabeito) @@ -136,7 +139,7 @@ dagify( ggplot( aes(x, y, xend = xend, yend = yend, color = status) ) + - geom_dag_edges_arc(curvature = c(rep(0, 5), .3)) + + geom_dag_edges_arc(curvature = c(rep(0, 5), 0.3)) + geom_dag_point() + geom_dag_label_repel(seed = 1630) + scale_color_okabe_ito(na.value = "grey90") + @@ -150,17 +153,17 @@ Since we're not, we need to rely on previously collected observational data and Here, our observations are *days*. @tbl-tt-7dwarfs maps each element of the causal question to elements of the target trial protocol. -| Protocol Step | Description | Target Trial | Emulation | -|------------------|------------------|------------------|--------------------| -| Eligibility criteria | Which days should be included in the study? | Days must be from 2018. | Same as target trial. | -| Exposure definition | When eligible, what precise exposure will days under study receive? | Exposed: Magic Kingdom had Extra Magic Hours in the morning. Otherwise, unexposed. | Same as target trial. | -| Assignment procedures | How will eligible days be assigned to an exposure? | Days are randomized with a 50% probability of having Extra Magic Hours in the morning. The assignment is non-blinded. | Days are assigned the exposure consistent with their data, e.g., whether or not there were Extra Magic Hours that morning. Randomization is emulated using adjustment for confounding. | -| Follow-up period | When does follow-up start and end? | Start: When the park opens the day of the exposure; End: at 10 AM on the same day. | Same as target trial. | -| Outcome definition | What precise outcomes will be measured? | The average posted wait time for the Seven Dwarfs Mine Train between 9 AM and 10 AM on the same day. | Same as target trial. | -| Causal contrast of interest | Which causal estimand will be estimated? | Average Treatment Effect (ATE). | Same as target trial. | -| Analysis plan | What data manipulation and statistical procedures will be applied to the data to estimate the causal contrast of interest? | ATE will be calculated using inverse probability weighting, weighted for historic high temperature, ticket season, and park close time. | Same as target trial. In this case, the variables are confounders, and the adjustment set was determined by assuming the causal structure presented in @fig-dag-magic. | + | Protocol Step | Description | Target Trial | Emulation | + | --------------------------- | -------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | Eligibility criteria | Which days should be included in the study? | Days must be from 2018. | Same as target trial. | + | Exposure definition | When eligible, what precise exposure will days under study receive? | Exposed: Magic Kingdom had Extra Magic Hours in the morning. Otherwise, unexposed. | Same as target trial. | + | Assignment procedures | How will eligible days be assigned to an exposure? | Days are randomized with a 50% probability of having Extra Magic Hours in the morning. The assignment is non-blinded. | Days are assigned the exposure consistent with their data, e.g., whether or not there were Extra Magic Hours that morning. Randomization is emulated using adjustment for confounding. | + | Follow-up period | When does follow-up start and end? | Start: When the park opens the day of the exposure; End: at 10 AM on the same day. | Same as target trial. | + | Outcome definition | What precise outcomes will be measured? | The average posted wait time for the Seven Dwarfs Mine Train between 9 AM and 10 AM on the same day. | Same as target trial. | + | Causal contrast of interest | Which causal estimand will be estimated? | Average Treatment Effect (ATE). | Same as target trial. | + | Analysis plan | What data manipulation and statistical procedures will be applied to the data to estimate the causal contrast of interest? | ATE will be calculated using inverse probability weighting, weighted for historic high temperature, ticket season, and park close time. | Same as target trial. In this case, the variables are confounders, and the adjustment set was determined by assuming the causal structure presented in @fig-dag-magic. | -: A protocol of a target trial of the effect of Extra Magic Morning on average posted wait times and a corresponding protocol for an observational study that emulates the target trial {#tbl-tt-7dwarfs} + : A protocol of a target trial of the effect of Extra Magic Morning on average posted wait times and a corresponding protocol for an observational study that emulates the target trial {#tbl-tt-7dwarfs} ## Data wrangling and target trials @@ -169,16 +172,16 @@ In a randomized trial, many of these actions are part of the trial design and da In a target trial emulation, we often need to apply those actions ourselves to the data we are preparing to answer causal questions. @tbl-dplyr presents the type of actions (here, functions in the tidyverse) we might need to take. -| Target trial protocol element | tidyverse function | -|------------------------------|------------------------------------------| -| Eligibility criteria | `filter()` | -| Exposure definition | `mutate()` | -| Assignment procedures | `mutate()`, `select()` | -| Follow-up period | `mutate()`, `pivot_longer()`, `pivot_wider()` | -| Outcome definition | `mutate()` | -| Analysis plan | `select()`, `mutate()`, `*_join()` | + | Target trial protocol element | tidyverse function | + | ----------------------------- | --------------------------------------------- | + | Eligibility criteria | `filter()` | + | Exposure definition | `mutate()` | + | Assignment procedures | `mutate()`, `select()` | + | Follow-up period | `mutate()`, `pivot_longer()`, `pivot_wider()` | + | Outcome definition | `mutate()` | + | Analysis plan | `select()`, `mutate()`, `*_join()` | -: Mapping elements of target trial protocols to commonly used tidyverse functions {#tbl-dplyr} + : Mapping elements of target trial protocols to commonly used tidyverse functions {#tbl-dplyr} We need to manipulate both the `seven_dwarfs_train` dataset and the `parks_metadata_raw` dataset to answer our causal question. Let's start with the `seven_dwarfs_train` data set. @@ -209,6 +212,7 @@ The distribution of the wait times is quite wide, with actual times appearing sh ```{r} #| warning: false + seven_dwarfs_train |> pivot_longer( starts_with("wait_minutes"), @@ -242,6 +246,7 @@ Our eligibility criteria state that we need to restrict our analysis to days in ```{r} #| message: false #| warning: false + seven_dwarfs_9 <- seven_dwarfs_train |> # eligibility criteria filter(year(park_date) == 2018) |> @@ -366,14 +371,15 @@ There were many more regular ticket days in the summer and more peak ticket days ```{r} #| label: fig-ticket-season #| fig-cap: "Ticket season by month. The proportion of days of each ticket prices varies across the year. There were many more regular ticket days in the summer and more peak ticket days in March, May, and December. Additionally, there were no peak tickets in August or September or value tickets in June, July, or December." + ticket_season_by_month |> ggplot(aes(month, n, fill = park_ticket_season)) + - geom_col(position = "fill", alpha = .8) + + geom_col(position = "fill", alpha = 0.8) + labs( y = "proportion of days", x = NULL, fill = "ticket season" - ) + + ) + theme(panel.grid.major.x = element_blank()) ``` @@ -390,15 +396,16 @@ Some early times don't happen in the summer, and some late times don't happen in ```{r} #| label: fig-close-time #| fig-cap: "Close time for the Magic Kingdom park by month. The close time varies across the year. Earlier close times occur more often in the late fall and winter. There are months where some close times don't occur in any days, such as early times in the summer and late times in the late fall." + parks_metadata |> count_by_month(park_close) |> ggplot(aes(month, n, fill = ordered(park_close))) + - geom_col(position = "fill", alpha = .85) + + geom_col(position = "fill", alpha = 0.85) + labs( y = "proportion of days", x = NULL, fill = "close time" - ) + + ) + theme(panel.grid.major.x = element_blank()) ``` @@ -407,6 +414,7 @@ Disney World is in Florida, so it never gets particularly cold, but it gets hot ```{r} #| label: fig-high-temp #| fig-cap: "Historic high temperature in farenheit for Walt Disney World by month. Being in a warm climate, the park never gets particularly cold, but the temperature does vary quite a bit throughout the year, with very hot summers." + parks_metadata |> mutate( month = month( @@ -416,7 +424,7 @@ parks_metadata |> ) ) |> ggplot(aes(month, park_temperature_high)) + - geom_jitter(height = 0, width = .15, alpha = .5) + + geom_jitter(height = 0, width = 0.15, alpha = 0.5) + labs( y = "historic high\ntemperature (F)", x = NULL @@ -475,7 +483,10 @@ Days with Extra Magic Mornings were also slightly cooler, but not by much. ```{r} #| label: tbl-unweighted-gtsummary -#| tbl-cap: A descriptive table of Extra Magic Morning in the touringplans dataset. This table shows the distributions of these variables in the observed population. +#| tbl-cap: A descriptive table of Extra Magic Morning in the touringplans +#| dataset. This table shows the distributions of these variables in the +#| observed population. + library(gtsummary) library(labelled) seven_dwarfs_9 |> @@ -483,11 +494,11 @@ seven_dwarfs_9 |> park_ticket_season = "Ticket Season", park_close = "Close Time", park_temperature_high = "Historic High Temperature" - ) |> + ) |> mutate( park_close = as.character(park_close), park_extra_magic_morning = factor( - park_extra_magic_morning, + park_extra_magic_morning, labels = c("No Extra Magic Hours", "Extra Magic Hours") ) ) |> @@ -495,7 +506,7 @@ seven_dwarfs_9 |> by = park_extra_magic_morning, include = c( park_ticket_season, - park_close, + park_close, park_temperature_high ) ) |> @@ -553,8 +564,9 @@ Here, we're grouping by the values of the `exposure` as we have them assigned an ```{r} #| eval: false -dataset |> - group_by(exposure, exposure_type) |> + +dataset |> + group_by(exposure, exposure_type) |> summarize(...) ``` @@ -577,6 +589,7 @@ Both exposure levels span the majority of the covariate space, but none of the d ```{r} #| label: fig-close #| fig-cap: "Distribution of Magic Kingdom park closing time by whether the date had Extra Magic Hours in the morning. Some close times did not have days with Extra Magic Hours, a potential positivity violation." + ggplot( seven_dwarfs_9, aes( @@ -585,7 +598,7 @@ ggplot( fill = factor(park_extra_magic_morning) ) ) + - geom_bar(position = "fill", alpha = .8) + + geom_bar(position = "fill", alpha = 0.8) + labs( fill = "Extra Magic Morning", x = "Time of Park Close" @@ -599,6 +612,7 @@ We'll explore both in detail later on. ```{r} #| message: false + library(hms) seven_dwarfs_9 |> count(park_close, park_extra_magic_morning) |> @@ -617,6 +631,7 @@ Examining @fig-temp, very few days in the exposed group have maximum temperature ```{r} #| label: fig-temp #| fig-cap: "Distribution of historic high temperature at Magic Kingdom by whether the date had Extra Magic Hours in the morning. Only 1 day with Extra Magic Hours had an historic high temperature of less than 60 degress Fahrenheit." + library(halfmoon) ggplot( seven_dwarfs_9, @@ -626,7 +641,7 @@ ggplot( fill = factor(park_extra_magic_morning) ) ) + - geom_mirror_histogram(bins = 20, alpha = .8) + + geom_mirror_histogram(bins = 20, alpha = 0.8) + scale_y_continuous(labels = abs) + labs( fill = "Extra Magic Morning", @@ -650,6 +665,7 @@ Examining @fig-ticket, we do not see any positivity violations. ```{r} #| label: fig-ticket #| fig-cap: "Distribution of ticket season by whether the date had Extra Magic Hours in the morning. As days with and without Extra Magic Hours happen across all three levels, there don't appear to be positivity violations." + ggplot( seven_dwarfs_9, aes( @@ -658,7 +674,7 @@ ggplot( fill = factor(park_extra_magic_morning) ) ) + - geom_bar(position = "dodge", alpha = .8) + + geom_bar(position = "dodge", alpha = 0.8) + labs( fill = "Extra Magic Morning", x = "Magic Kingdom Ticket Season" @@ -676,18 +692,23 @@ Let's start by discretizing the `park_temperature_high` variable, cutting it int #| label: fig-positivity #| fig-cap: "Check for positivity violations across three confounders: historic high temperature, park close time, and ticket season." #| fig-width: 9 + prop_exposed <- seven_dwarfs_9 |> ## cut park_temperature_high into tertiles - mutate(park_temperature_high_bin = cut( + mutate( + park_temperature_high_bin = cut( park_temperature_high, breaks = 3 - )) |> + ) + ) |> ## bin park close time - mutate(park_close_bin = case_when( - hour(park_close) < 19 & hour(park_close) > 12 ~ "(1) early", - hour(park_close) >= 19 & hour(park_close) < 24 ~ "(2) standard", - hour(park_close) >= 24 | hour(park_close) < 12 ~ "(3) late" - )) |> + mutate( + park_close_bin = case_when( + hour(park_close) < 19 & hour(park_close) > 12 ~ "(1) early", + hour(park_close) >= 19 & hour(park_close) < 24 ~ "(2) standard", + hour(park_close) >= 24 | hour(park_close) < 12 ~ "(3) late" + ) + ) |> group_by( park_close_bin, park_temperature_high_bin, @@ -695,7 +716,7 @@ prop_exposed <- seven_dwarfs_9 |> ) |> ## calculate the proportion exposed in each bin summarize( - prop_exposed = mean(park_extra_magic_morning), + prop_exposed = mean(park_extra_magic_morning), .groups = "drop" ) |> complete( @@ -708,14 +729,14 @@ prop_exposed <- seven_dwarfs_9 |> prop_exposed |> ggplot( aes( - x = park_close_bin, - y = park_temperature_high_bin, + x = park_close_bin, + y = park_temperature_high_bin, fill = prop_exposed ) ) + geom_tile() + - scale_fill_viridis_c(begin = .1, end = .9) + - facet_wrap(~ park_ticket_season) + + scale_fill_viridis_c(begin = 0.1, end = 0.9) + + facet_wrap(~park_ticket_season) + labs( y = "Historic High Temperature (F)", x = "Magic Kingdom Park Close Time", @@ -736,6 +757,7 @@ We also have nine combinations that were never exposed. #| label: tbl-positivity-check #| tbl-cap: "Among binned combinations of confounders, 10 were either always exposed or never exposed. These represent potential positivity violations." #| code-fold: true + library(gt) prop_exposed |> filter(prop_exposed %in% c(1, 0)) |> diff --git a/chapters/08-propensity-scores.qmd b/chapters/08-propensity-scores.qmd index ed6c790..a3323a0 100644 --- a/chapters/08-propensity-scores.qmd +++ b/chapters/08-propensity-scores.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("polishing") ``` @@ -15,7 +16,7 @@ As presented in @sec-data-causal, the causal question we'd like to answer is: ** #| code-fold: true #| message: false #| warning: false -#| fig.cap: > +#| fig-cap: > #| Proposed DAG for the relationship between Extra Magic Hours #| in the morning at a particular park and the average wait #| time between 9 AM and 10 AM. @@ -60,13 +61,18 @@ dag_plot <- dag |> coord_cartesian(clip = "off") dag_plot + - geom_dag_edges_arc(curvature = c(rep(0, 5), .3, 0)) + + geom_dag_edges_arc(curvature = c(rep(0, 5), 0.3, 0)) + geom_dag_label_repel(seed = 1630) ``` -Now that we've done some exploratory analysis of these data, how should we go about answering this question? We know from @sec-dags that we have three backdoor paths to close. Each path has one variable, resulting in three confounders: the historic high temperature on the day, the time the park closed, and the ticket season: value, regular, or peak. +Now that we've done some exploratory analysis of these data, how should we go about answering this question? +We know from @sec-dags that we have three backdoor paths to close. +Each path has one variable, resulting in three confounders: the historic high temperature on the day, the time the park closed, and the ticket season: value, regular, or peak. -We also know that we have several ways we can close these paths. Stratification isn't a good solution here because of the curse of dimensionality; we had better use a statistical model. But which relationship should we model? Consider @fig-dag-close-paths-1, @fig-dag-close-paths-2, and @fig-dag-close-paths-3. +We also know that we have several ways we can close these paths. +Stratification isn't a good solution here because of the curse of dimensionality; we had better use a statistical model. +But which relationship should we model? +Consider @fig-dag-close-paths-1, @fig-dag-close-paths-2, and @fig-dag-close-paths-3. ```{r} #| label: fig-dag-close-paths @@ -76,8 +82,9 @@ We also know that we have several ways we can close these paths. Stratification #| - "Shall we close the paths going into the exposure?" #| - "Or the paths going into the outcome?" #| - "Or both?" + dag_plot + - geom_dag_edges_arc(curvature = c(rep(0, 5), .3, 0), edge_color = "grey80") + + geom_dag_edges_arc(curvature = c(rep(0, 5), 0.3, 0), edge_color = "grey80") + geom_dag_edges_arc( data = \(.x) filter(.x, to == "x"), curvature = c(0, 0, 0), @@ -85,10 +92,10 @@ dag_plot + ) dag_plot + - geom_dag_edges_arc(curvature = c(rep(0, 5), .3, 0), edge_color = "grey80") + + geom_dag_edges_arc(curvature = c(rep(0, 5), 0.3, 0), edge_color = "grey80") + geom_dag_edges_arc( data = \(.x) filter(.x, name != "x", to == "y"), - curvature = c(0, 0, .3), + curvature = c(0, 0, 0.3), edge_width = 1.1 ) @@ -99,18 +106,42 @@ dag_plot + ) + geom_dag_edges_arc( data = \(.x) filter(.x, name != "x"), - curvature = c(rep(0, 5), .3), + curvature = c(rep(0, 5), 0.3), edge_width = 1.1 ) ``` -First, we could model the relationship between the confounders and the exposure, as in @fig-dag-close-paths-1. A class of techniques using the **propensity score**, the probability of exposure, let us do this. Second, we could model the relationship between the confounders and the outcome, as in @fig-dag-close-paths-2. A class of techniques using the **outcome model** allows us to do this. We saw an outcome model in @sec-strat-outcome, and we'll see another approach in @sec-g-comp. Both propensity score and outcome model approaches can get us the right answer as long as we've got the right DAG and have modeled the relationships correctly. Alternatively, we could model both sets of relationships as in @fig-dag-close-paths-2. A class of techniques called **doubly robust** estimators allows us to do this, which we'll revisit in @sec-dr and @sec-causal-ml. - -For the next several chapters, we'll take up the class of techniques we can use to close the paths via @fig-dag-close-paths-1: propensity scores. First, consider what it would look like if Disney randomized the Extra Magic Mornings assignment. Let's say each day has a 0.5 probability of being assigned Extra Magic Mornings, so about half the days had them and half didn't. In this case, the arrows highlighted in @fig-dag-close-paths-1 wouldn't exist, but the arrows highlighted in @fig-dag-close-paths-2 would: we've intervened on the exposure---whether or not a day has Extra Magic Hours---not on the outcome---the posted wait times. The historic high temperature on the day, the time the park closed, and the ticket season still affect the posted wait times, but not Extra Magic Hours. Here, the probability of exposure---the propensity score---is 0.5 for every day. In other words, in an experiment, the propensity score is known. A given day is randomly assigned with `rbinom(1, 1, 0.5)`. - -Let's say Disney conducted a more complex experiment: they randomized the exposure *within levels of the three variables of the DAG*. For instance, let's say every day was granted a baseline probability of 0.5. In this experiment, historic temperatures >= 80 degrees Fahrenheit reduced the probability of exposure by 0.1, a park closure time of less than 10 PM increased the probability by 0.2, and being in peak ticket season decreased the probability by 0.3. A day with a historic temperature of 86 degrees that closed at 9:00 PM in peak ticket season, the day would be randomly assigned Extra Magic Hours in the morning with a probability of $0.5 - 0.1 + 0.2 - 0.25 = 0.35$, or `rbinom(1, 1, 0.35)`. This design is still a randomized experiment, but it's randomized within levels of the covariates. It gives us conditional exchangeability, much like we have to assume in non-randomized data: days are exchangeable within levels of the covariates. - -In our case, though, these probabilities are not known. What if we could find the "hidden experiment" in this non-randomized data? Remember, *someone* decided to put Extra Magic Hours on certain days. What was that decision process? If we could use a combination of domain knowledge---to understand what factors went into treatment assignment that also affect the outcome---and statistics---to estimate these conditional probabilities---then perhaps we could use those probabilities to achieve exchangeability and calculate an unbiased causal effect. @rosenbaum1983central showed that conditioning on propensity scores in observational studies can lead to unbiased estimates of the exposure effect as long as the assumptions discussed in @sec-assump hold. +First, we could model the relationship between the confounders and the exposure, as in @fig-dag-close-paths-1. +A class of techniques using the **propensity score**, the probability of exposure, let us do this. +Second, we could model the relationship between the confounders and the outcome, as in @fig-dag-close-paths-2. +A class of techniques using the **outcome model** allows us to do this. +We saw an outcome model in @sec-strat-outcome, and we'll see another approach in @sec-g-comp. +Both propensity score and outcome model approaches can get us the right answer as long as we've got the right DAG and have modeled the relationships correctly. +Alternatively, we could model both sets of relationships as in @fig-dag-close-paths-2. +A class of techniques called **doubly robust** estimators allows us to do this, which we'll revisit in @sec-dr and @sec-causal-ml. + +For the next several chapters, we'll take up the class of techniques we can use to close the paths via @fig-dag-close-paths-1: propensity scores. +First, consider what it would look like if Disney randomized the Extra Magic Mornings assignment. +Let's say each day has a 0.5 probability of being assigned Extra Magic Mornings, so about half the days had them and half didn't. +In this case, the arrows highlighted in @fig-dag-close-paths-1 wouldn't exist, but the arrows highlighted in @fig-dag-close-paths-2 would: we've intervened on the exposure---whether or not a day has Extra Magic Hours---not on the outcome---the posted wait times. +The historic high temperature on the day, the time the park closed, and the ticket season still affect the posted wait times, but not Extra Magic Hours. +Here, the probability of exposure---the propensity score---is 0.5 for every day. +In other words, in an experiment, the propensity score is known. +A given day is randomly assigned with `rbinom(1, 1, 0.5)`. + +Let's say Disney conducted a more complex experiment: they randomized the exposure *within levels of the three variables of the DAG*. +For instance, let's say every day was granted a baseline probability of 0.5. +In this experiment, historic temperatures >= 80 degrees Fahrenheit reduced the probability of exposure by 0.1, a park closure time of less than 10 PM increased the probability by 0.2, and being in peak ticket season decreased the probability by 0.3. +A day with a historic temperature of 86 degrees that closed at 9:00 PM in peak ticket season, the day would be randomly assigned Extra Magic Hours in the morning with a probability of $0.5 - 0.1 + 0.2 - 0.25 = 0.35$, or `rbinom(1, 1, 0.35)`. +This design is still a randomized experiment, but it's randomized within levels of the covariates. +It gives us conditional exchangeability, much like we have to assume in non-randomized data: days are exchangeable within levels of the covariates. + +In our case, though, these probabilities are not known. +What if we could find the "hidden experiment" in this non-randomized data? +Remember, *someone* decided to put Extra Magic Hours on certain days. +What was that decision process? +If we could use a combination of domain knowledge---to understand what factors went into treatment assignment that also affect the outcome---and statistics---to estimate these conditional probabilities---then perhaps we could use those probabilities to achieve exchangeability and calculate an unbiased causal effect. +@rosenbaum1983central showed that conditioning on propensity scores in observational studies can lead to unbiased estimates of the exposure effect as long as the assumptions discussed in @sec-assump hold. ## Building propensity score models {#sec-building-models} @@ -121,11 +152,13 @@ A logistic regression with exposure as the predicted value and the confounders a Below is pseudo-code for using `glm()` to fit a propensity score model using logistic regression. The first argument is the model, with the exposure on the left side and the confounders on the right. The `data` argument takes the data frame, and the `family = binomial()` argument denotes that the model should be fit using logistic regression (as opposed to a different generalized linear model, although other links are sometimes used in propensity score modeling). -You've likely fit models like this before, but the key details are that we are predicting the probability of exposure (instead of something around the outcome; we'll get to that!) and that the predictors are the confounders determined from the DAG. +You've likely fit models like this before, but the key details are that we are predicting the probability of exposure (instead of something around the outcome; we'll get to that!) +and that the predictors are the confounders determined from the DAG. As we saw in @sec-ci-rct, if we had any predictors of the outcome that weren't related to the exposure, we'd probably want to include those, too, for precision's sake. ```{r} #| eval: false + glm( # predict the probability of treatment exposure ~ confounder_1 + confounder_2, @@ -135,17 +168,19 @@ glm( ) ``` -We can extract the propensity scores by pulling out the predictions on the probability scale using `predict()` or `fitted()`. However, -using the `augment()` function from the [{`broom`}](https://broom.tidymodels.org/) package, we can extract these propensity scores and add them to our original data frame in one step, so we'll use that approach. +We can extract the propensity scores by pulling out the predictions on the probability scale using `predict()` or `fitted()`. +However, using the `augment()` function from the [{`broom`}](https://broom.tidymodels.org/) package, we can extract these propensity scores and add them to our original data frame in one step, so we'll use that approach. The predictions will be on the *linear logit* scale by default. Setting the argument `type.predict` to `"response"` indicates that we want to extract the predicted values on the *probability* scale. -The `data` argument contains the original data frame; if we leave this blank, we'll only get back a data frame with the variables in the propensity score model. However, we need the outcome, too, so it's handy to use the whole data frame, even though there are no new days in the data set. +The `data` argument contains the original data frame; if we leave this blank, we'll only get back a data frame with the variables in the propensity score model. +However, we need the outcome, too, so it's handy to use the whole data frame, even though there are no new days in the data set. This code will output a new data frame consisting of all components in `df` with six additional columns corresponding to the logistic regression model that was fit. The `.fitted` column is the propensity score. A convenient detail about broom is that the columns returned from its functions are consistent across models in R, so the code in this chapter will work for many types of broom output. ```{r} #| eval: false + glm( exposure ~ confounder_1 + confounder_2, data = df, @@ -171,7 +206,8 @@ seven_dwarfs_9 <- seven_dwarfs_train_2018 |> filter(wait_hour == 9) ps_mod <- glm( - park_extra_magic_morning ~ park_ticket_season + park_close + + park_extra_magic_morning ~ park_ticket_season + + park_close + park_temperature_high, data = seven_dwarfs_9, family = binomial() @@ -181,21 +217,45 @@ seven_dwarfs_9_with_ps <- ps_mod |> augment(type.predict = "response", data = seven_dwarfs_9) ``` -First, we might ask how well this model performs. On the one hand, model statistics can't tell us how well we've adjsuted for confounding. However, we do have expectations about how the model behaves. For instance, the prediction model should be **well-calibrated**. This means that if we look at all the days with a predicted probability of 0.3, about 30% of those days should have Extra Magic Mornings. We can check this by grouping the data into bins of propensity scores and calculating the observed proportion of days with Extra Magic Mornings in each bin. @fig-calibration-ps shows a calibration plot for our propensity score model. The points are close to the 45-degree line, indicating that the model is well-calibrated. Logistic regression models tend to be well-calibrated, so this is a good sign. However, we have few points towards the higher end of the propensity score, so we are more uncertain about these. Calibration can be particularly useful for machine learning models, which can be poorly calibrated. If we find that our model is showing issues in this respect, `{propensity}` can calibrate the model using logistic regression with `ps_calibrate()`. +First, we might ask how well this model performs. +On the one hand, model statistics can't tell us how well we've adjsuted for confounding. +However, we do have expectations about how the model behaves. +For instance, the prediction model should be **well-calibrated**. +This means that if we look at all the days with a predicted probability of 0.3, about 30% of those days should have Extra Magic Mornings. +We can check this by grouping the data into bins of propensity scores and calculating the observed proportion of days with Extra Magic Mornings in each bin. +@fig-calibration-ps shows a calibration plot for our propensity score model. +The points are close to the 45-degree line, indicating that the model is well-calibrated. +Logistic regression models tend to be well-calibrated, so this is a good sign. +However, we have few points towards the higher end of the propensity score, so we are more uncertain about these. +Calibration can be particularly useful for machine learning models, which can be poorly calibrated. +If we find that our model is showing issues in this respect, `{propensity}` can calibrate the model using logistic regression with `ps_calibrate()`. ```{r} #| label: fig-calibration-ps #| fig-cap: "Calibration plot for the propensity score model. The points are close to the 45-degree line, indicating that the model is well-calibrated. However, we have few points towards the higher end of the propensity score, so we are more uncertain about these." #| warning: false + library(halfmoon) plot_model_calibration(ps_mod, method = "windowed") ``` -Another common check of model performance is the receiver operating characteristic curve (ROC). The ROC curve plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) at various threshold settings. The ROC is a measure of discrimination: how well the model can distinguish between days with and without Extra Magic Mornings. This means that if we randomly select one day with Extra Magic Mornings and one day without, the model will assign a higher propensity score to the day with Extra Magic Mornings more often than not. A 45 degree line indicates no discrimination (random chance), while a curve that hugs the top left corner indicates perfect discrimination. The area under the curve (AUC) is a summary measure of discrimination, with values ranging from 0.5 (no discrimination) to 1 (perfect discrimination). In prediction modeling, the goal is often to maximize discrimination. However, in a causal model, perfect discrimination would indicate a violation of positivity: some days would have a 0% chance of Extra Magic Mornings, and some would have a 100% chance. Some discrimination, howver, indicates we have identified differences between the groups. There is no clear cutoff for what is a "good" AUC is for causal inference (a randomized trial, for instance, will have an expected AUC of 0.5 because the only thing to discriminate between groups is group assignment), but values between 0.6 and 0.8 usually indicate that we've picked up enough signal from the counfounders to use for adjustment but not so much that we have positivity issues. As opposed to prediction modeling, though, we are not trying to optimize the AUC to this range; we are using it as a diagnostic. We'll revisit AUC and ROC curves in @sec-eval-ps-model. +Another common check of model performance is the receiver operating characteristic curve (ROC). +The ROC curve plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) at various threshold settings. +The ROC is a measure of discrimination: how well the model can distinguish between days with and without Extra Magic Mornings. +This means that if we randomly select one day with Extra Magic Mornings and one day without, the model will assign a higher propensity score to the day with Extra Magic Mornings more often than not. +A 45 degree line indicates no discrimination (random chance), while a curve that hugs the top left corner indicates perfect discrimination. +The area under the curve (AUC) is a summary measure of discrimination, with values ranging from 0.5 (no discrimination) to 1 (perfect discrimination). +In prediction modeling, the goal is often to maximize discrimination. +However, in a causal model, perfect discrimination would indicate a violation of positivity: some days would have a 0% chance of Extra Magic Mornings, and some would have a 100% chance. +Some discrimination, howver, indicates we have identified differences between the groups. +There is no clear cutoff for what is a "good" AUC is for causal inference (a randomized trial, for instance, will have an expected AUC of 0.5 because the only thing to discriminate between groups is group assignment), but values between 0.6 and 0.8 usually indicate that we've picked up enough signal from the counfounders to use for adjustment but not so much that we have positivity issues. +As opposed to prediction modeling, though, we are not trying to optimize the AUC to this range; we are using it as a diagnostic. +We'll revisit AUC and ROC curves in @sec-eval-ps-model. ```{r} #| label: fig-roc-ps #| fig-cap: "ROC curve for the propensity score model. The curve is above the 45-degree line, indicating that the model has some discrimination. The AUC is 0.65, indicating that we've picked up enough signal from the confounders to use for adjustment but not so much that we have extreme positivity issues." + seven_dwarfs_9_with_ps |> check_model_roc_curve(park_extra_magic_morning, .fitted) |> plot_model_roc_curve() @@ -223,6 +283,7 @@ There was likewise a low probability (18.4%) on January 5, but this day *did* ha #| tbl-cap: > #| The first six observations in the `seven_dwarfs_9_with_ps` dataset, including their propensity scores in the `.fitted` column. #| code-fold: true + library(gt) seven_dwarfs_9_with_ps |> select( @@ -257,10 +318,11 @@ We'll also tweak the y-axis labels to use absolute values (rather than negative ```{r} #| label: fig-mirrored-ps -#| fig.cap: > +#| fig-cap: > #| Mirrored histograms of estimated propensity scores for #| Extra Magic Hour days (exposed group, top) and days without #| Extra Magic hours (unexposed group, bottom) + ggplot( seven_dwarfs_9_with_ps, aes(.fitted, fill = factor(park_extra_magic_morning)) @@ -270,16 +332,26 @@ ggplot( labs(x = "propensity score", fill = "extra magic morning") ``` -We'll explore how to assess a propensity score model after applying a method (like weighting or matching) in more detail in @sec-eval-ps-model; however, let's think about what we're seeing from the perspective of the causal assumptions we need to make for valid inferences. In @sec-explore-assump, we saw that we might have issues with both exchangeability and positivity. We were pretty confident that causal consistency wasn't an issue for this question, but looking at the raw propensity scores (before we apply weighting or matching) can give us some insight into the other two assumptions. While data will never be able to prove our assumptions are right or wrong, it does give us some evidence. +We'll explore how to assess a propensity score model after applying a method (like weighting or matching) in more detail in @sec-eval-ps-model; however, let's think about what we're seeing from the perspective of the causal assumptions we need to make for valid inferences. +In @sec-explore-assump, we saw that we might have issues with both exchangeability and positivity. +We were pretty confident that causal consistency wasn't an issue for this question, but looking at the raw propensity scores (before we apply weighting or matching) can give us some insight into the other two assumptions. +While data will never be able to prove our assumptions are right or wrong, it does give us some evidence. -For positivity and exchangeability, we're looking for two things: **overlap** and **balance**. Overlap refers to overlapping ranges of the propensity scores by exposure group, sometimes called **common support**. Overlap is related to positivity: if there are regions of the propensity score where only one exposure group has observations, we might have a positivity violation. The other thing to look for is balance in the populations. In a randomized setting, we expect the distributions between the two groups to be approximately the same because their likelihood of exposure is unrelated to the baseline covariates. This balance is another perspective on the choice of the word "exchangeable": we should be able to reassign the two groups to the other exposure and still get the right answer. +For positivity and exchangeability, we're looking for two things: **overlap** and **balance**. +Overlap refers to overlapping ranges of the propensity scores by exposure group, sometimes called **common support**. +Overlap is related to positivity: if there are regions of the propensity score where only one exposure group has observations, we might have a positivity violation. +The other thing to look for is balance in the populations. +In a randomized setting, we expect the distributions between the two groups to be approximately the same because their likelihood of exposure is unrelated to the baseline covariates. +This balance is another perspective on the choice of the word "exchangeable": we should be able to reassign the two groups to the other exposure and still get the right answer. -@fig-sim-ps shows simulated scenarios of the kinds of distributions we will tend to see with good, moderate, and poor overlap and balance. Poor balance and overlap can worsen bias and variance and make us less confident in the causal assumptions we need to make. +@fig-sim-ps shows simulated scenarios of the kinds of distributions we will tend to see with good, moderate, and poor overlap and balance. +Poor balance and overlap can worsen bias and variance and make us less confident in the causal assumptions we need to make. ```{r} #| label: fig-sim-ps #| fig-cap: "Simulated distributions of propensity scores. We look for two qualities in mirrored histograms: overlap and balance, related to positivity and exchangeability, respectively." #| code-fold: true + library(patchwork) set.seed(2025) @@ -336,7 +408,10 @@ plots_balance$moderate <- plots_balance$moderate + (plots_balance$good + plots_balance$moderate + plots_balance$poor) ``` -@fig-mirrored-ps gives us a perspective on these assumptions reduced to a single dimension (the propensity score). We definitely see both overlap and balance problems. Clearly, the distribution is different between the groups in terms of shape and number of days. Let's also look at the range of the propensity score by group. +@fig-mirrored-ps gives us a perspective on these assumptions reduced to a single dimension (the propensity score). +We definitely see both overlap and balance problems. +Clearly, the distribution is different between the groups in terms of shape and number of days. +Let's also look at the range of the propensity score by group. ```{r} seven_dwarfs_9_with_ps |> @@ -344,7 +419,8 @@ seven_dwarfs_9_with_ps |> reframe(range = range(.fitted)) ``` -The range between groups seems pretty close. One helpful way to look at the tails is to check the number of *unexposed* observations below the lowest probability for the *exposed* group and the number of *exposed* observations above the highest probability for the *unexposed* group. +The range between groups seems pretty close. +One helpful way to look at the tails is to check the number of *unexposed* observations below the lowest probability for the *exposed* group and the number of *exposed* observations above the highest probability for the *unexposed* group. ```{r} seven_dwarfs_9_with_ps |> @@ -361,22 +437,28 @@ seven_dwarfs_9_with_ps |> count(support, .drop = FALSE) ``` -However, looking at @fig-mirrored-ps, we see some areas of sparsity, implying some combinations have positivity violations. For instance, many more days are without Extra Magic Mornings with a probability below 10%. +However, looking at @fig-mirrored-ps, we see some areas of sparsity, implying some combinations have positivity violations. +For instance, many more days are without Extra Magic Mornings with a probability below 10%. ```{r} seven_dwarfs_9_with_ps |> - count(park_extra_magic_morning, low_prob = .fitted <= .1) + count(park_extra_magic_morning, low_prob = .fitted <= 0.1) ``` -We're likely seeing a combination of structural positivity (some days will never or always get Extra Magic Hours per Disney's decision-making process) and stochastic violations. Since we have a limited number of days in the year and only about 17% of days get Extra Magic Hours in the morning, we expect some sparsity. For instance, @fig-random-days-ps is what the graph would look like if Extra Magic Mornings were randomized with the same proportions of exposed and unexposed days. (If you try this out with different seeds, you'll see that these data are prone to such random violations.) Propensity score methods are more susceptible to issues with positivity violations than outcome model-based methods and some doubly robust methods; we'll have to keep an eye on this. +We're likely seeing a combination of structural positivity (some days will never or always get Extra Magic Hours per Disney's decision-making process) and stochastic violations. +Since we have a limited number of days in the year and only about 17% of days get Extra Magic Hours in the morning, we expect some sparsity. +For instance, @fig-random-days-ps is what the graph would look like if Extra Magic Mornings were randomized with the same proportions of exposed and unexposed days. +(If you try this out with different seeds, you'll see that these data are prone to such random violations.) +Propensity score methods are more susceptible to issues with positivity violations than outcome model-based methods and some doubly robust methods; we'll have to keep an eye on this. ```{r} #| label: fig-random-days-ps #| code-fold: true #| fig-cap: "A simulation of randomizing Extra Magic Mornings. There is no relationship between the covariates and the exposure except chance. Because of the small sample size and lower proportion of exposed days, we need to be cautious about interpreting the results of these distributions." + set.seed(2025) seven_dwarfs_9 |> - mutate(randomized_emm = rbinom(n(), 1, .17)) |> + mutate(randomized_emm = rbinom(n(), 1, 0.17)) |> glm( randomized_emm ~ park_ticket_season + park_close + park_temperature_high, data = _, @@ -391,7 +473,8 @@ seven_dwarfs_9 |> labs(x = "propensity score", fill = "extra magic morning") ``` -What can we do about these problems? Although we need good domain knowledge to base our assumptions on (and perhaps better exclusion criteria for days that are structurally unable to receive Extra Magic Hours), using the information in the propensity score can help with both exchangeability and, with some methods, positivity. +What can we do about these problems? +Although we need good domain knowledge to base our assumptions on (and perhaps better exclusion criteria for days that are structurally unable to receive Extra Magic Hours), using the information in the propensity score can help with both exchangeability and, with some methods, positivity. ## Using the propensity scores {#sec-using-ps} @@ -401,14 +484,18 @@ There are many ways to incorporate the propensity score into an analysis. Commonly used techniques include stratification (estimating the causal effect within propensity score strata), matching, weighting, and direct covariate adjustment (including the propensity score as a covariate in the outcome model). This section will focus on **matching** and **weighting**. -Matching and weighting are two different ways of creating populations with better confounder balance. In matching, we create a *sub*-population by selecting a subgroup of observations where we hope exchangeability holds. In weighting, we re-weight the observations to create a *pseudo*-population where we hope exchangeability holds. +Matching and weighting are two different ways of creating populations with better confounder balance. +In matching, we create a *sub*-population by selecting a subgroup of observations where we hope exchangeability holds. +In weighting, we re-weight the observations to create a *pseudo*-population where we hope exchangeability holds. ## Matching {#sec-matching} Matching is an intuitive way to create a population where we can make apples-to-apples comparisons. -Imagine that we start with an exposed observation. In an infinite population, we could handpick an unexposed observation, the only difference being the exposure status. +Imagine that we start with an exposed observation. +In an infinite population, we could handpick an unexposed observation, the only difference being the exposure status. In other words, we match two observations that have identical confounder values but opposite exposure values. -This is called **exact matching**. Exact matching works well for very large data or very limited numbers (and values) of confounders, but it becomes increasingly complex to find such a match when the number and continuity of confounders increase. +This is called **exact matching**. +Exact matching works well for very large data or very limited numbers (and values) of confounders, but it becomes increasingly complex to find such a match when the number and continuity of confounders increase. This is where the propensity score, a summary measure of all of the confounders, comes into play. The `{MatchIt}` package is one of the most flexible tools for matching in R. @@ -416,13 +503,16 @@ Let's match similar days with `matchit()`. (If we didn't include the pre-computed propensity score in the `distance` argument, `matchit()` would have refit the logistic regression for us.) There were 60 days in 2018 when the Magic Kingdom had Extra Magic Morning hours. For each of these 60 exposed days, `matchit()` found a comparable unexposed day, by implementing a nearest-neighbor match using the constructed propensity score. -Examining the output, we also see that the default target estimand is an "ATT," the average treatment effect among the treated. We will discuss this and several other estimands in @sec-estimands, but the important thing to know for know is that `matchit()` is going to keep all the days with Extra Magic Morning and may discard some days that were unexposed. +Examining the output, we also see that the default target estimand is an "ATT," the average treatment effect among the treated. +We will discuss this and several other estimands in @sec-estimands, but the important thing to know for know is that `matchit()` is going to keep all the days with Extra Magic Morning and may discard some days that were unexposed. ```{r} library(MatchIt) ps_logit_scale <- predict(ps_mod) matchit_obj <- matchit( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_9_with_ps, # match on the propensity score we fit on the logit scale # TODO: @Lucy, should we supply this on the logit scale or probability scale? @@ -441,7 +531,9 @@ matched_data <- get_matches(matchit_obj) |> matched_data ``` -The `subclass` column tells us which days are matched. For instance, for `subclass == 1`, we have a pair of days, one with and one without Extra Magic Mornings. Their propensity scores are the same. +The `subclass` column tells us which days are matched. +For instance, for `subclass == 1`, we have a pair of days, one with and one without Extra Magic Mornings. +Their propensity scores are the same. ```{r} matched_data |> @@ -449,7 +541,9 @@ matched_data |> select(park_date, park_extra_magic_morning, .fitted) ``` -If we look closer at their covariates, we can see why. These are not exact matches---the temperature and park close variables are slightly different---but we can see these are both regular ticket season days with cooler historic temperatures and later close times. Do they seem like good counterfactuals for one another? +If we look closer at their covariates, we can see why. +These are not exact matches---the temperature and park close variables are slightly different---but we can see these are both regular ticket season days with cooler historic temperatures and later close times. +Do they seem like good counterfactuals for one another? ```{r} matched_data |> @@ -465,7 +559,8 @@ matched_data |> select(park_date, park_extra_magic_morning, .fitted) ``` -Their actual variables, however, are not as close. There is about a 20 degree difference in the historical temperature, and the park close time, while earlier for both, is a little further apart than pair 1. +Their actual variables, however, are not as close. +There is about a 20 degree difference in the historical temperature, and the park close time, while earlier for both, is a little further apart than pair 1. ```{r} matched_data |> @@ -473,7 +568,9 @@ matched_data |> select(park_date, park_temperature_high, park_ticket_season, park_close) ``` -We might also want to know which days *weren't* matched. Since we kept all the days with Extra Magic Hours, we know all of the days that were dropped did not have them. We can check which days aren't in the matched data with an anti-join. +We might also want to know which days *weren't* matched. +Since we kept all the days with Extra Magic Hours, we know all of the days that were dropped did not have them. +We can check which days aren't in the matched data with an anti-join. ```{r} seven_dwarfs_9_with_ps |> @@ -481,13 +578,21 @@ seven_dwarfs_9_with_ps |> select(park_date, park_extra_magic_morning, .fitted) ``` -We may feel that there is still a lot of valuable statistical information in these dropped data. We could gain extra statistical precision by using more than one match for each day with Extra Magic Hours. Previously, we used $1:1$ matching, but `matchit()` also supports $1:k$ matching with the `ratio` argument. For instance, for `ratio = 2`, we would get two matches for every day with Extra Magic Mornings, resulting in a sample size of 180. +We may feel that there is still a lot of valuable statistical information in these dropped data. +We could gain extra statistical precision by using more than one match for each day with Extra Magic Hours. +Previously, we used $1:1$ matching, but `matchit()` also supports $1:k$ matching with the `ratio` argument. +For instance, for `ratio = 2`, we would get two matches for every day with Extra Magic Mornings, resulting in a sample size of 180. -However, we need to be cautious about adding extra matches when data are limited. The more days we try to match, the worse the match will get. For instance, when we try to find four matches for each day with Extra Magic Mornings, the day the propensity scores for `subclass == 1` start to get pretty different for the later matches. This is a bias-variance trade-off: for more matches, we get better precision, but we make it harder to find good matches, which may increase bias. +However, we need to be cautious about adding extra matches when data are limited. +The more days we try to match, the worse the match will get. +For instance, when we try to find four matches for each day with Extra Magic Mornings, the day the propensity scores for `subclass == 1` start to get pretty different for the later matches. +This is a bias-variance trade-off: for more matches, we get better precision, but we make it harder to find good matches, which may increase bias. ```{r} matchit( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_9_with_ps, distance = ps_logit_scale, ratio = 4 @@ -498,11 +603,20 @@ matchit( select(park_date, park_extra_magic_morning, .fitted) ``` -We can control the quality of the match by asking `matchit()` to only match observations within a certain distance away. In other words, we can ask not to match observations with propensity scores farther apart than we'd like. We control this by setting a **caliper**. The caliper is a dynamic distance, on the logit scale, that we accept as the maximum difference between two observations that can be matched. It is dynamic in that sense that whatever value we give to the `caliper` argument is multiplied by the standard deviation of the propensity score. However, let's try 1:2 matching with a caliper of 0.2. Matching with a caliper gives us 178 days, two days fewer than $60 + 60*2$. Two of the days with Extra Magic Mornings only received one matched control instead of two. +We can control the quality of the match by asking `matchit()` to only match observations within a certain distance away. +In other words, we can ask not to match observations with propensity scores farther apart than we'd like. +We control this by setting a **caliper**. +The caliper is a dynamic distance, on the logit scale, that we accept as the maximum difference between two observations that can be matched. +It is dynamic in that sense that whatever value we give to the `caliper` argument is multiplied by the standard deviation of the propensity score. +However, let's try 1:2 matching with a caliper of 0.2. +Matching with a caliper gives us 178 days, two days fewer than $60 + 60*2$. +Two of the days with Extra Magic Mornings only received one matched control instead of two. ```{r} mtchs <- matchit( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_9_with_ps, distance = ps_logit_scale, ratio = 2, @@ -516,7 +630,8 @@ mtchs |> filter(n < 3) ``` -One important point is that by setting a caliper, we may be changing the population about whom we are making inferences depending on which and how many observations are dropped. We'll revisit this in @sec-estimands. +One important point is that by setting a caliper, we may be changing the population about whom we are making inferences depending on which and how many observations are dropped. +We'll revisit this in @sec-estimands. ## Weighting {#sec-weighting} @@ -528,13 +643,16 @@ We can calculate many different weights depending on the target estimand of inte In this section, we will focus on the Average Treatment Effect (ATE) weights, commonly referred to as inverse probability weights. The weight is constructed as follows: each observation is weighted by the *inverse* of the probability of receiving the exposure it actually received. -$$w_{ATE} = \frac{X}{p} + \frac{(1 - X)}{1 - p}$$ +$$ +w_{ATE} = \frac{X}{p} + \frac{(1 - X)}{1 - p} +$$ For example, if observation 1 had a very high likelihood of being exposed given their pre-exposure covariates ($p = 0.9$), but they in fact were *not* exposed, their weight would be 10 ($w_1 = 1 / (1 - 0.9)$). Likewise, if observation 2 had a very high likelihood of being exposed given their pre-exposure covariates ($p = 0.9$), and they *were* exposed, their weight would be 1.1 ($w_2 = 1 / 0.9$). Intuitively, we give more weight to observations that, based on their measured confounders, appear to have helpful information for constructing a counterfactual---we would have predicted that they were exposed, but by chance, they were not, or vice versa. -The `{propensity}` package calculates a variety of propensity score weights with functions named to follow the pattern `wt_estimand()`. To calculate the ATE, we use `wt_ate()`, which we provide the fitted propensity score and the observed exposure values. +The `{propensity}` package calculates a variety of propensity score weights with functions named to follow the pattern `wt_estimand()`. +To calculate the ATE, we use `wt_ate()`, which we provide the fitted propensity score and the observed exposure values. ```{r} library(propensity) @@ -543,13 +661,19 @@ seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> mutate(w_ate = wt_ate(.fitted, park_extra_magic_morning)) ``` -@tbl-df-wt shows the weights in the first six rows. For instance, January 1 did not have Extra Magic Hours, and it only had a probability of $1 - 0.3 = 0.7$ of *not* having them. Therefore, this isn't a particularly surprising day. It gets a weight of 1.4. January 5, however, is more surprising: it has a probability of receiving Extra Magic Hours of 0.18, but it did, in fact, have them. That makes it a good counterfactual for these other days that didn't, so it gets a weight of 5.4. +@tbl-df-wt shows the weights in the first six rows. +For instance, January 1 did not have Extra Magic Hours, and it only had a probability of $1 - 0.3 = 0.7$ of *not* having them. +Therefore, this isn't a particularly surprising day. +It gets a weight of 1.4. +January 5, however, is more surprising: it has a probability of receiving Extra Magic Hours of 0.18, but it did, in fact, have them. +That makes it a good counterfactual for these other days that didn't, so it gets a weight of 5.4. ```{r} #| label: tbl-df-wt #| tbl-cap: > #| The first six observations in the `seven_dwarfs_9_with_wt` dataset, including their propensity scores in the `.fitted` column and weight in the `w_ate` column. #| code-fold: true + seven_dwarfs_9_with_wt |> select( park_date, @@ -576,11 +700,14 @@ seven_dwarfs_9_with_wt |> ::: {.callout-note} ## WeightIt -If you like the feel of MatchIt, it has a cousin package called `{WeightIt}` with the same design principles and many useful features. We'll focus on propensity, but WeightIt is easy to use if you are familiar with MatchIt. +If you like the feel of MatchIt, it has a cousin package called `{WeightIt}` with the same design principles and many useful features. +We'll focus on propensity, but WeightIt is easy to use if you are familiar with MatchIt. ```{r} wt_it <- WeightIt::weightit( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_9_with_ps, ps = ".fitted" ) @@ -592,12 +719,19 @@ head(wt_it$weights) ::: -A day that had a very low predicted probability of Extra Magic Hours in the morning will receive a high weight if it did, in fact, have them. The minimum of ATE weights is 1, but the maximum is unbounded. The closer the probability of receiving the observed exposure is to 0, the higher the weight will be. That means we need to be cautious about **extreme weights**. Extreme weights are weights that add undue information to the resulting outcome model. Extreme weights tend to *destabilize* our estimate, resulting in worse precision and potentially worse bias. @fig-wts-distr shows the distribution of the ATE weights. +A day that had a very low predicted probability of Extra Magic Hours in the morning will receive a high weight if it did, in fact, have them. +The minimum of ATE weights is 1, but the maximum is unbounded. +The closer the probability of receiving the observed exposure is to 0, the higher the weight will be. +That means we need to be cautious about **extreme weights**. +Extreme weights are weights that add undue information to the resulting outcome model. +Extreme weights tend to *destabilize* our estimate, resulting in worse precision and potentially worse bias. +@fig-wts-distr shows the distribution of the ATE weights. ```{r} #| label: fig-wts-distr #| fig-cap: "The distribution of Average Treatment Effect weights. ATE weights have a lower bound of 1 and no upper bound. Here, we see a spike near one but many weights much higher in the right tail." #| message: false + seven_dwarfs_9_with_wt |> ggplot(aes(w_ate)) + geom_histogram() + @@ -605,12 +739,15 @@ seven_dwarfs_9_with_wt |> xlab("ATE Weights") ``` -Indeed, we have several days with weights over 10 (@tbl-extreme-wts). April 27, for instance, is being treated like almost twenty days! It might be a good counterfactual for days without Extra Magic Hours, but a weight that high will add more variance than it will reduce bias. +Indeed, we have several days with weights over 10 (@tbl-extreme-wts). +April 27, for instance, is being treated like almost twenty days! +It might be a good counterfactual for days without Extra Magic Hours, but a weight that high will add more variance than it will reduce bias. ```{r} #| label: tbl-extreme-wts #| tbl-cap: "Days with ATE weights over 10. These days are upweighted in our population because of their low probability of occurring, but the higher weights get, the more instability we introduce." #| code-fold: true + seven_dwarfs_9_with_wt |> filter(w_ate > 10) |> select( @@ -634,12 +771,18 @@ seven_dwarfs_9_with_wt |> ) ``` -We can mitigate some of the instability of extreme weights by using a stabilization factor: the proportions of the exposed and unexposed. Instead of inverting the probability of the received exposure, we use the proportion in the numerator. Stabilization has an interesting effect on the weights. First, it improves the variance by making the spread of the weights smaller. Second, it creates weights that have a mean of 1. In other words, the pseudo-population is approximately the same size as the original population. +We can mitigate some of the instability of extreme weights by using a stabilization factor: the proportions of the exposed and unexposed. +Instead of inverting the probability of the received exposure, we use the proportion in the numerator. +Stabilization has an interesting effect on the weights. +First, it improves the variance by making the spread of the weights smaller. +Second, it creates weights that have a mean of 1. +In other words, the pseudo-population is approximately the same size as the original population. ```{r} #| label: fig-stbl-wts #| fig-cap: "The distribution of stabilized ATE weights. Their distribution is much more restricted than the unstabilized weights, and they average to a weight of 1." #| message: false + seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> mutate(stbl_wts = wt_ate(.fitted, park_extra_magic_morning, stabilize = TRUE)) @@ -659,20 +802,36 @@ seven_dwarfs_9_with_wt |> xlab("Stabilized ATE Weights") ``` -Another set of techniques that are used to address extreme weights (and poor overlap) are **trimming** and **truncation**. Trimming is when we set an acceptable range of the propensity score and drop the observations that fall outside of that range from the analysis. When trimming, you should refit the propensity score model to improve the fit and calibration. Truncation is when, instead of dropping observations, we *truncate* any value outside of the acceptable range to the minimum or maximum of the range. Truncating is also sometimes called **Winsorizing**. Note that sometimes, authors use "trim" and "truncate" interchangeably or even with the opposite meanings, so be clear about what you mean and make sure you understand what other analysts mean, too. +Another set of techniques that are used to address extreme weights (and poor overlap) are **trimming** and **truncation**. +Trimming is when we set an acceptable range of the propensity score and drop the observations that fall outside of that range from the analysis. +When trimming, you should refit the propensity score model to improve the fit and calibration. +Truncation is when, instead of dropping observations, we *truncate* any value outside of the acceptable range to the minimum or maximum of the range. +Truncating is also sometimes called **Winsorizing**. +Note that sometimes, authors use "trim" and "truncate" interchangeably or even with the opposite meanings, so be clear about what you mean and make sure you understand what other analysts mean, too. -propensity provides helper functions for managing these processes. `ps_trim()` will trim the observations, and `ps_trunc()` will truncate them. `ps_refit()` will refit the propensity score on only the non-trimmed observations. There are a few things to note here. First, we're using an adaptive method to decide the best range to trim the scores at; this approach optimizes the variance of the resulting observations. Second, we're using `ps_refit()` on the trimmed propensity scores to recalculate the propensity score without the trimmed observations. Third, in `ps_trunc()`, we're truncating propensity scores to *under* the 1st percentile *to* the 1st percentile. Researchers will commonly truncate to the 1st and 99th percentiles, but since our highest propensity score is about 0.50, these won't produce extreme weights, so we leave them alone. +propensity provides helper functions for managing these processes. +`ps_trim()` will trim the observations, and `ps_trunc()` will truncate them. +`ps_refit()` will refit the propensity score on only the non-trimmed observations. +There are a few things to note here. +First, we're using an adaptive method to decide the best range to trim the scores at; this approach optimizes the variance of the resulting observations. +Second, we're using `ps_refit()` on the trimmed propensity scores to recalculate the propensity score without the trimmed observations. +Third, in `ps_trunc()`, we're truncating propensity scores to *under* the 1st percentile *to* the 1st percentile. +Researchers will commonly truncate to the 1st and 99th percentiles, but since our highest propensity score is about 0.50, these won't produce extreme weights, so we leave them alone. ```{r} seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_wt |> mutate( trimmed_ps = ps_trim(.fitted, method = "adaptive") |> ps_refit(ps_mod), - trunc_ps = ps_trunc(.fitted, method = "pctl", lower = .01, upper = 1) + trunc_ps = ps_trunc(.fitted, method = "pctl", lower = 0.01, upper = 1) ) ``` -Trimming and truncating affect our sample differently. In trimming, we have fewer observations afterward. We can see which observations were trimmed with `is_unit_trimmed()`. Only observations in the lower range of the original propensity score were trimmed. Their values for `trimmed_ps` are `NA` because they were not included in the model when we used `ps_refit()`. +Trimming and truncating affect our sample differently. +In trimming, we have fewer observations afterward. +We can see which observations were trimmed with `is_unit_trimmed()`. +Only observations in the lower range of the original propensity score were trimmed. +Their values for `trimmed_ps` are `NA` because they were not included in the model when we used `ps_refit()`. ```{r} seven_dwarfs_9_with_wt |> @@ -687,6 +846,7 @@ You can see the overlap in this subset is slightly improved (@fig-dist-ps-trimme #| fig-cap: "The distribution of propensity scores by exposure group after trimming. Trimming improved the overlap." #| message: false #| warning: false + ggplot( seven_dwarfs_9_with_wt, aes(trimmed_ps, fill = factor(park_extra_magic_morning)) @@ -696,7 +856,8 @@ ggplot( labs(x = "propensity score", fill = "extra magic morning") ``` -In truncation, we are not removing observations but forcing some observations to be within the acceptable range. All the truncated observations (which we found with `is_unit_truncated()`) now have the same value in `trunc_ps`, equal to the 1st percentile of `.fitted`. +In truncation, we are not removing observations but forcing some observations to be within the acceptable range. +All the truncated observations (which we found with `is_unit_truncated()`) now have the same value in `trunc_ps`, equal to the 1st percentile of `.fitted`. ```{r} seven_dwarfs_9_with_wt |> @@ -704,13 +865,15 @@ seven_dwarfs_9_with_wt |> select(park_date, park_extra_magic_morning, .fitted, trunc_ps) ``` -You can see how the truncation on the left side of the plot forces overlap (@fig-dist-ps-trunc). Truncation doesn't discard units (improving the sample size over trimming), but this forced change of propensity scores can be unintuitive. +You can see how the truncation on the left side of the plot forces overlap (@fig-dist-ps-trunc). +Truncation doesn't discard units (improving the sample size over trimming), but this forced change of propensity scores can be unintuitive. ```{r} #| label: fig-dist-ps-trunc #| fig-cap: "The distribution of propensity scores by exposure group after truncating. Truncation improved the overlap, as well." #| message: false #| warning: false + ggplot( seven_dwarfs_9_with_wt, aes(trunc_ps, fill = factor(park_extra_magic_morning)) @@ -720,19 +883,26 @@ ggplot( labs(x = "propensity score", fill = "extra magic morning") ``` -We can then use trimmed or truncated weights to calculate weights. In fact, we can combine these approaches with stabilized weights. Let's calculate stabilized weights on the truncated propensity scores. (We can also use truncated or trimmed weights with matching and a caliper, but we won't show that here.) @fig-sbl-trunc-wts shows the distribution of weights after truncating and stabilizing. +We can then use trimmed or truncated weights to calculate weights. +In fact, we can combine these approaches with stabilized weights. +Let's calculate stabilized weights on the truncated propensity scores. +(We can also use truncated or trimmed weights with matching and a caliper, but we won't show that here.) +@fig-sbl-trunc-wts shows the distribution of weights after truncating and stabilizing. ```{r} #| label: fig-sbl-trunc-wts #| fig-cap: "The distribution of stabilized weights where the propensity score was truncated to the 1st percentile. The extreme weights we saw before are slightly improved." #| message: false #| warning: false + seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_wt |> - mutate(trunc_stbl_wt = wt_ate( - trunc_ps, - park_extra_magic_morning, - stabilize = TRUE - )) + mutate( + trunc_stbl_wt = wt_ate( + trunc_ps, + park_extra_magic_morning, + stabilize = TRUE + ) + ) seven_dwarfs_9_with_wt |> ggplot(aes(trunc_stbl_wt)) + @@ -741,9 +911,15 @@ seven_dwarfs_9_with_wt |> xlab("Truncated and Stabilized ATE Weights") ``` -Truncation and trimming, like using a caliper, may change the population we are making inferences about. We'll investigate this further in @sec-estimands. +Truncation and trimming, like using a caliper, may change the population we are making inferences about. +We'll investigate this further in @sec-estimands. -You may have noticed that extreme weights are often a positivity problem. The days that were trimmed, for instance, mainly were days without Extra Magic Hours that had a low predicted probability of receiving them. Once we've fit the propensity score, we can probe the trimmed or truncated results to better understand why we need to modify them in the first place. Here, it seems the trimmed observations were all warm days in the value ticket season with later closing hours. Perhaps these days are structurally unable to receive Extra Magic Hours per Disney's requirements. We'd want to determine if that was the case, as we may wish to modify our exclusion criteria rather than dynamically removing observations based on their propensity score. +You may have noticed that extreme weights are often a positivity problem. +The days that were trimmed, for instance, mainly were days without Extra Magic Hours that had a low predicted probability of receiving them. +Once we've fit the propensity score, we can probe the trimmed or truncated results to better understand why we need to modify them in the first place. +Here, it seems the trimmed observations were all warm days in the value ticket season with later closing hours. +Perhaps these days are structurally unable to receive Extra Magic Hours per Disney's requirements. +We'd want to determine if that was the case, as we may wish to modify our exclusion criteria rather than dynamically removing observations based on their propensity score. ```{r} seven_dwarfs_9_with_wt |> @@ -758,9 +934,13 @@ seven_dwarfs_9_with_wt |> ::: {.callout-note} ## When should I use matching vs weighting? -Weighting is statistically more efficient than matching, and we recommend using it over matching when possible. However, matching has a distinct advantage: it's easy to understand. Someone with a statistical background might be comfortable interpreting results from a weighted analysis, but a stakeholder with a different background may not understand the pseudo-population or why its sample size can be a non-integer value. So, if you have a lot of data and think it will help improve the interpretation of the analysis, matching can be a good option. +Weighting is statistically more efficient than matching, and we recommend using it over matching when possible. +However, matching has a distinct advantage: it's easy to understand. +Someone with a statistical background might be comfortable interpreting results from a weighted analysis, but a stakeholder with a different background may not understand the pseudo-population or why its sample size can be a non-integer value. +So, if you have a lot of data and think it will help improve the interpretation of the analysis, matching can be a good option. We'll also present some ways to present weighted populations in @sec-estimands that may help stakeholders understand the analysis better, getting the best of both worlds. ::: -Now that we've applied the propensity score via weighting and matching, it's time to ask: do these approaches improve the balance we saw in @fig-mirrored-ps? Let's turn to techniques for probing the results of propensity score techniques. +Now that we've applied the propensity score via weighting and matching, it's time to ask: do these approaches improve the balance we saw in @fig-mirrored-ps? +Let's turn to techniques for probing the results of propensity score techniques. diff --git a/chapters/09-evaluating-ps.qmd b/chapters/09-evaluating-ps.qmd index 453209c..9267df3 100644 --- a/chapters/09-evaluating-ps.qmd +++ b/chapters/09-evaluating-ps.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("polishing") ``` @@ -34,13 +35,13 @@ seven_dwarfs_9_with_ps <- park_ticket_season + park_close + park_temperature_high, data = seven_dwarfs_9, family = binomial() - ) |> + ) |> augment(type.predict = "response", data = seven_dwarfs_9) seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> mutate( w_ate = wt_ate(.fitted, park_extra_magic_morning), park_extra_magic_morning = factor(park_extra_magic_morning) - ) + ) ``` Once we have weights, we can use them in `geom_mirror_histogram()`'s `weight` argument to visualize the weighted distributions of the propensity score, as shown in @fig-ps-weighted-histogram. @@ -49,18 +50,19 @@ Once we have weights, we can use them in `geom_mirror_histogram()`'s `weight` ar #| label: fig-ps-weighted-histogram #| fig-cap: "Weighted propensity score distributions for days with Extra Magic Morning hours compared to those without." #| message: false + ggplot( seven_dwarfs_9_with_wt, aes(.fitted, group = park_extra_magic_morning) ) + geom_mirror_histogram( aes(fill = factor(park_extra_magic_morning), weight = w_ate) - ) + + ) + scale_y_continuous(labels = abs) + labs( x = "propensity score", fill = "Extra Magic Morning" - ) + ) ``` We also might want to look at the weighted and unweighted distributions side-by-side, as shown in @fig-ps-weighted-unweighted-comparison. @@ -69,12 +71,13 @@ We can speed up data wrangling and plotting with `plot_mirror_distributions()`. ```{r} #| label: fig-ps-weighted-unweighted-comparison #| fig-cap: "Weighted and unweighted propensity score distributions for days with and without Extra Magic Morning hours." + seven_dwarfs_9_with_wt |> plot_mirror_distributions( .exposure = park_extra_magic_morning, .var = .fitted, .weights = w_ate - ) + + ) + labs(fill = "extra magic morning") ``` @@ -88,12 +91,13 @@ In a causal model, however, it means that, after weighting, there is no way to d ```{r} #| label: fig-roc-ps-weighted #| fig-cap: "A weighted and unweighted. ROC curve for the propensity score model. The observed curve is above the 45-degree line, indicating that the model has some discrimination. The AUC is 0.65, indicating that we've picked up enough signal from the confounders to use for adjustment, but not so much that we have extreme positivity issues. Once we've weighted the curve, it traces the 45-degree line, indicating that, after weighting, the discrimination is no better than random. The weighted AUC is 0.50, exactly as good as random." + seven_dwarfs_9_with_wt |> check_model_roc_curve( park_extra_magic_morning, .fitted, .weights = w_ate - ) |> + ) |> plot_model_roc_curve() ``` @@ -128,13 +132,14 @@ The weighted sample will give us the precision of an equivalent 160-day sample, ```{r} #| message: false + library(scales) seven_dwarfs_9_with_wt |> summarize( n = n(), ess_ate = ess(w_ate), percent_ess_ate = percent(ess_ate / n) - ) + ) ``` It can also be instructive to look at the ESS by exposure group (note that we don't expect the grouped ESS to sum to the total ESS; the ESS is a measure of the efficiency given the variation in the weights, and this variation differs across the three calculations). @@ -149,7 +154,7 @@ seven_dwarfs_9_with_wt |> n = n(), ess_ate = ess(w_ate), percent_ess_ate = percent(ess_ate / n) - ) + ) ``` We can see that clearly in the distribution of the weights: @@ -157,6 +162,7 @@ We can see that clearly in the distribution of the weights: ```{r} #| label: fig-ate-weights-grouped #| fig-cap: "Distribution of ATE weights stratified by exposure group. The days with Extra Magic Morning have a wider distribution of weights, indicating greater variation, which in turn leads to a lower effective sample size. The x-axis is on the log-10 scale to better visualize the distribution." + ggplot( seven_dwarfs_9_with_wt, aes(x = as.numeric(w_ate), fill = park_extra_magic_morning) @@ -176,14 +182,14 @@ The more we trim and truncate, though, the more likely we are to be answering qu ```{r} seven_dwarfs_9_with_wt |> mutate( - trunc_ps = ps_trunc(.fitted, method = "pctl", lower = .01, upper = .99), + trunc_ps = ps_trunc(.fitted, method = "pctl", lower = 0.01, upper = 0.99), w_ate_trunc = wt_ate(trunc_ps, park_extra_magic_morning) - ) |> + ) |> summarize( n = n(), ess_ate_trunc = ess(w_ate_trunc), percent_ess_ate = percent(ess_ate_trunc / n) - ) + ) ``` ## Variable-level diagnostics @@ -217,6 +223,7 @@ In R, the `{halfmoon}` package has a function `check_balance()` that will calcul ```{r} #| eval: false + library(halfmoon) balance_metrics <- check_balance( @@ -238,7 +245,7 @@ balance_metrics <- .vars = c(park_ticket_season, park_close, park_temperature_high), .exposure = park_extra_magic_morning, .weights = w_ate - ) + ) balance_metrics ``` @@ -282,6 +289,7 @@ Some people suggest a cut-off of 0.1 as a measure of good balance (an SMD of 0.1 ```{r} #| label: fig-love-smd #| fig-cap: "Love plot showing absolute standardized mean differences before and after weighting." + ggplot( data = smds, aes( @@ -289,7 +297,7 @@ ggplot( y = variable, group = method, color = method - ) + ) ) + geom_love() ``` @@ -299,6 +307,7 @@ The variance ratios, on the other hand, don't show as much improvement; while mo ```{r} #| label: fig-love-vr #| fig-cap: "Love plot showing variance ratios before and after weighting." + ggplot( data = vrs, aes( @@ -306,7 +315,7 @@ ggplot( y = variable, group = method, color = method - ) + ) ) + geom_love(vline_xintercept = 1) ``` @@ -317,6 +326,7 @@ ggplot( ```{r} #| label: fig-balance-helper #| fig-cap: "Balance metrics using the `plot_balance()` helper function." + balance_metrics |> filter(metric %in% c("smd", "vr")) |> plot_balance(facet_scales = "free_x") @@ -336,27 +346,28 @@ SMDs can't tell us if this is improved. ```{r} #| label: fig-boxplot -#| fig.cap: "Unweighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not." +#| fig-cap: "Unweighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not." + ggplot( seven_dwarfs_9_with_wt, aes( x = factor(park_extra_magic_morning), y = park_temperature_high, color = park_extra_magic_morning - ) + ) ) + - geom_jitter(width = .12, height = 0, alpha = .5) + + geom_jitter(width = 0.12, height = 0, alpha = 0.5) + geom_boxplot( outlier.color = NA, fill = NA, - width = .3, + width = 0.3, color = "black" - ) + + ) + labs( color = "Extra magic morning", y = "Historic temperature high", x = NULL - ) + ) ``` Now let's look at a weighted version. @@ -365,8 +376,9 @@ The tails of the distribution, while improved, still show some imbalance. ```{r} #| label: fig-weighted-boxplot -#| fig.cap: "Weighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not after incorporating the propensity score weight (ATE weight)." +#| fig-cap: "Weighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not after incorporating the propensity score weight (ATE weight)." #| warning: false + ggplot( seven_dwarfs_9_with_wt, aes( @@ -374,37 +386,37 @@ ggplot( y = park_temperature_high, color = park_extra_magic_morning, weight = w_ate - ) + ) ) + - geom_jitter(width = .12, height = 0, alpha = .5) + + geom_jitter(width = 0.12, height = 0, alpha = 0.5) + geom_boxplot( outlier.color = NA, fill = NA, - width = .3, + width = 0.3, color = "black" - ) + + ) + labs( color = "Extra magic morning", y = "Historic temperature high", x = NULL - ) + ) ``` Many other geoms in ggplot2 accept a `weight` argument that can be useful for visualizing weighted populations. Make good use of your exploratory data analysis skills by treating the weighted pseudo-population as a real sample! -- `geom_bar()` -- `geom_boxplot()` -- `geom_contour()` -- `geom_count()` -- `geom_density()` -- `geom_dotplot()` -- `geom_freqpoly()` -- `geom_hex()` -- `geom_histogram()` -- `geom_quantile()` -- `geom_smooth()` -- `geom_violin()` +- `geom_bar()` +- `geom_boxplot()` +- `geom_contour()` +- `geom_count()` +- `geom_density()` +- `geom_dotplot()` +- `geom_freqpoly()` +- `geom_hex()` +- `geom_histogram()` +- `geom_quantile()` +- `geom_smooth()` +- `geom_violin()` ## The empirical cumulative distribution function @@ -415,42 +427,42 @@ In a balanced population, we expect the lines to overlap across the distribution ```{r} #| label: fig-ecdf -#| fig.cap: "Unweighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (blue) compared to those that did not (orange)." +#| fig-cap: "Unweighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (blue) compared to those that did not (orange)." ggplot( seven_dwarfs_9_with_wt, aes( x = park_temperature_high, color = factor(park_extra_magic_morning) - ) + ) ) + geom_ecdf() + labs( x = "Historic temperature high", y = "Proportion <= x", color = "Extra Magic Morning" - ) + ) ``` `geom_ecdf()` allows for the additional `weight` argument to display a weighted eCDF plot. ```{r} #| label: fig-weighted-ecdf -#| fig.cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (blue) compared to those that did not (orange) after incorporating the propensity score weight (ATE)." +#| fig-cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (blue) compared to those that did not (orange) after incorporating the propensity score weight (ATE)." ggplot( seven_dwarfs_9_with_wt, aes( x = park_temperature_high, color = factor(park_extra_magic_morning) - ) + ) ) + geom_ecdf(aes(weights = w_ate)) + labs( x = "Historic temperature high", y = "Proportion <= x", color = "Extra Magic Morning" - ) + ) ``` Examining @fig-weighted-ecdf reveals a few things. @@ -473,12 +485,13 @@ Unlike polynomial terms (like $x^2$ or $x^3$), which can become extreme at the b ```{r} seven_dwarfs_9_with_ps <- glm( - park_extra_magic_morning ~ park_ticket_season + park_close + - # refit model with a spline + park_extra_magic_morning ~ park_ticket_season + + park_close + + # refit model with a spline splines::ns(park_temperature_high, df = 5), data = seven_dwarfs_9, family = binomial() - ) |> + ) |> augment(type.predict = "response", data = seven_dwarfs_9) seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> mutate(w_ate = wt_ate(.fitted, park_extra_magic_morning)) @@ -488,21 +501,21 @@ Now, let's see how that impacts the weighted eCDF plot. ```{r} #| label: fig-weighted-ecdf-2 -#| fig.cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours compared to those that did not after incorporating the propensity score weight where we modeled historic high temperature flexibly with a spline." +#| fig-cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours compared to those that did not after incorporating the propensity score weight where we modeled historic high temperature flexibly with a spline." ggplot( seven_dwarfs_9_with_wt, aes( x = park_temperature_high, color = factor(park_extra_magic_morning) - ) + ) ) + geom_ecdf(aes(weights = w_ate)) + labs( color = "Extra Magic Morning", x = "Historic temperature high", y = "Proportion <= x" - ) + ) ``` Now in @fig-weighted-ecdf-2, the lines appear to overlap better across the whole space. @@ -569,16 +582,16 @@ wecdf_0 <- tibble(temp = all_temps) |> ecdf_val = map_dbl(temp, \(t) { sum(wt_data_0$w_ate[wt_data_0$park_temperature_high <= t]) / sum(wt_data_0$w_ate) - }) - ) + }) + ) wecdf_1 <- tibble(temp = all_temps) |> mutate( ecdf_val = map_dbl(temp, \(t) { sum(wt_data_1$w_ate[wt_data_1$park_temperature_high <= t]) / sum(wt_data_1$w_ate) - }) - ) + }) + ) # Find weighted KS point wt_distances <- abs(wecdf_0$ecdf_val - wecdf_1$ecdf_val) @@ -593,20 +606,24 @@ ks_weighted <- tibble( ) ks_both <- bind_rows(ks_unweighted, ks_weighted) |> - mutate(weight_status = factor(weight_status, levels = c("Unweighted", "Weighted"))) + mutate( + weight_status = factor(weight_status, levels = c("Unweighted", "Weighted")) + ) plot_data <- seven_dwarfs_9_with_wt |> select(park_temperature_high, park_extra_magic_morning, w_ate) |> mutate( Unweighted = 1, Weighted = as.numeric(w_ate) - ) |> + ) |> pivot_longer( cols = c(Unweighted, Weighted), names_to = "weight_status", values_to = "weight" - ) |> - mutate(weight_status = factor(weight_status, levels = c("Unweighted", "Weighted"))) + ) |> + mutate( + weight_status = factor(weight_status, levels = c("Unweighted", "Weighted")) + ) ggplot( plot_data, @@ -614,50 +631,50 @@ ggplot( x = park_temperature_high, color = factor(park_extra_magic_morning), weight = weight - ) + ) ) + - geom_ecdf(linewidth = 1, alpha = .5) + + geom_ecdf(linewidth = 1, alpha = 0.5) + geom_segment( data = ks_both, aes(x = x, xend = x, y = y0, yend = y1), color = "#DB5461", - linewidth = .8, + linewidth = 0.8, linetype = "solid", inherit.aes = FALSE - ) + + ) + geom_point( data = ks_both, aes(x = x, y = y0), color = "#DB5461", size = 2, inherit.aes = FALSE - ) + + ) + geom_point( data = ks_both, aes(x = x, y = y1), color = "#DB5461", size = 2, inherit.aes = FALSE - ) + + ) + geom_label( data = ks_both, aes( x = x, y = (y0 + y1) / 2, label = paste0("KS = ", round(ks, 3)) - ), + ), color = "#DB5461", size = 3.5, fontface = "bold", hjust = 1.1, inherit.aes = FALSE - ) + + ) + facet_wrap(~weight_status) + labs( x = "Historic temperature high", y = "Proportion <= x", color = "Extra Magic Morning" - ) + + ) + theme(legend.position = "bottom") ``` @@ -679,6 +696,7 @@ Balance is also commonly assessed among interaction terms and squares of continu ```{r} #| fig-width: 9 + balance_metrics_joint <- seven_dwarfs_9_with_wt |> mutate(park_close = as.numeric(park_close)) |> @@ -688,7 +706,7 @@ balance_metrics_joint <- .weights = w_ate, interactions = TRUE, squares = TRUE - ) + ) balance_metrics_joint |> filter(metric %in% c("smd", "vr")) |> @@ -723,8 +741,11 @@ Let's expand on that modification and create a more flexible propensity score mo ```{r} ps_improved_model <- glm( park_extra_magic_morning ~ - park_ticket_season + park_close + park_temperature_high + - splines::ns(park_temperature_high, df = 5) + I(as.numeric(park_close)^2) + + park_ticket_season + + park_close + + park_temperature_high + + splines::ns(park_temperature_high, df = 5) + + I(as.numeric(park_close)^2) + park_ticket_season:park_close + park_ticket_season:park_temperature_high + park_close:park_temperature_high, @@ -736,7 +757,7 @@ seven_dwarfs_9_with_improved <- seven_dwarfs_9_with_wt |> mutate( .fitted_improved = predict(ps_improved_model, type = "response"), w_ate_improved = wt_ate(.fitted_improved, park_extra_magic_morning) - ) + ) ``` Now let's check the balance with both our original and improved models: @@ -746,6 +767,7 @@ Now let's check the balance with both our original and improved models: #| fig-cap: "Balance comparison between original and improved propensity score models." #| fig-height: 9 #| fig-width: 8.5 + balance_comparison <- seven_dwarfs_9_with_improved |> mutate(park_close = as.numeric(park_close)) |> check_balance( @@ -754,7 +776,7 @@ balance_comparison <- seven_dwarfs_9_with_improved |> .weights = c(w_ate, w_ate_improved), squares = TRUE, interactions = TRUE - ) + ) balance_comparison |> plot_balance() + @@ -774,7 +796,7 @@ seven_dwarfs_9_with_improved |> ess_ate_improved = ess(w_ate_improved), percent_ess_original = percent(ess_ate_original / n), percent_ess_improved = percent(ess_ate_improved / n) - ) + ) ``` While our improved model achieves slightly better balance, it may come at the cost of a lower effective sample size due to more extreme weights. @@ -816,13 +838,14 @@ Let's check the balance achieved by these optimization-based weights and compare #| fig-cap: "Balance comparison between propensity score weights and optimization-based balancing weights." #| fig-height: 9 #| fig-width: 8 + balance_opt_comparison <- seven_dwarfs_9_with_opt |> mutate(park_close = as.numeric(park_close)) |> check_balance( .vars = c(park_ticket_season, park_close, park_temperature_high), .exposure = park_extra_magic_morning, .weights = c(w_ate, w_opt) - ) + ) balance_opt_comparison |> plot_balance() + @@ -860,7 +883,7 @@ seven_dwarfs_9_with_opt |> ess_opt = ess(w_opt), percent_ess_ate = percent(ess_ate / n), percent_ess_opt = percent(ess_opt / n) - ) + ) ``` This class of optimization-based weights often yields a better effective sample size because they directly minimize weight variability while satisfying the specified balance constraints. @@ -898,7 +921,7 @@ balance_all_comparison <- seven_dwarfs_9_with_all |> .vars = c(park_ticket_season, park_close, park_temperature_high), .exposure = park_extra_magic_morning, .weights = c(w_ate, w_opt, w_energy) - ) + ) balance_all_comparison |> filter(metric == "smd") |> @@ -912,7 +935,7 @@ seven_dwarfs_9_with_all |> summarize( ess_energy = ess(w_energy), percent_ess_energy = percent(ess_energy / n()) - ) + ) ``` ### Don't use p-values @@ -985,6 +1008,7 @@ In @fig-ps-paradox-caliper, we see that as we tighten the caliper (resulting in #| label: fig-ps-paradox-caliper #| fig-cap: "Balance metrics across different caliper widths in propensity score matching, demonstrating the propensity score paradox." #| code-fold: true + library(MatchIt) caliper_widths <- c(0.25, 0.1, 0.05, 0.025, 0.01) @@ -996,7 +1020,7 @@ check_balance_by_cal <- function(caliper) { method = "nearest", caliper = caliper, distance = "glm" - ) + ) matched_data <- match.data(match_obj) @@ -1005,14 +1029,14 @@ check_balance_by_cal <- function(caliper) { check_balance( .vars = c(park_ticket_season, park_close, park_temperature_high), .exposure = park_extra_magic_morning - ) + ) balance |> filter(metric == "smd") |> mutate( caliper = caliper, n_matched = nrow(matched_data) - ) + ) } balance_by_caliper <- map(caliper_widths, check_balance_by_cal) |> @@ -1028,7 +1052,7 @@ ggplot( labs( x = "caliper", y = "smd" - ) + + ) + theme(legend.position = "right") ``` diff --git a/chapters/10-estimands.qmd b/chapters/10-estimands.qmd index 639999f..cee62ed 100644 --- a/chapters/10-estimands.qmd +++ b/chapters/10-estimands.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("polishing") ``` @@ -20,18 +21,23 @@ You can think of the **estimand** as a glossy picture of a beautiful cake we are #| label: fig-cake-est #| echo: false #| fig-cap: "The estimand, estimator, and estimate are all important parts of the statistical estimation process. The estimand is like the perfect cake in the cookbook. The estimator is like the recipe we use to make that cake. The estimate is the cake we pull out of the oven... sometimes to our great dismay." + knitr::include_graphics(here::here("chapters/images/cake_est.png")) ``` So far, we have primarily focused on the average treatment effect, the effect of the exposure of interest averaged across the whole population. The **estimand** here is the expected value of the difference in potential outcomes across all individuals: -$$E[Y(1) - Y(0)]$$ +$$ +E[Y(1) - Y(0)] +$$ The **estimator** we use depends on the method we've chosen. For example, in an A/B test or randomized controlled trial, our estimator could be the average outcome among those who received exposure A minus the average outcome among those who received exposure B. -$$\sum_{i=1}^{N}\frac{Y_i\times X_i}{N_{\textrm{A}}} - \frac{Y_i\times (1-X_i)}{N_{\textrm{B}}}$$ +$$ +\sum_{i=1}^{N}\frac{Y_i\times X_i}{N_{\textrm{A}}} - \frac{Y_i\times (1-X_i)}{N_{\textrm{B}}} +$$ Where $X$ is an indicator for exposure A ($X = 1$ for exposure A and $X = 0$ for exposure B), $N_A$ is the total number in group A and $N_B$ is the total number in group B such that $N_A + N_B = N$. Let's motivate this example a bit more. @@ -69,8 +75,7 @@ ab |> n_a = sum(x), n_b = sum(1 - x), estimate = sum( - (y * x) / n_a - - y * (1 - x) / n_b + (y * x) / n_a - y * (1 - x) / n_b ) ) ``` @@ -93,13 +98,13 @@ Because $X$, the exposure, was randomly assigned, this estimator results in an u ```{r} #| include: false + estimate <- ab |> summarize( n_a = sum(x), n_b = sum(1 - x), estimate = sum( - (y * x) / n_a - - y * (1 - x) / n_b + (y * x) / n_a - y * (1 - x) / n_b ) ) |> pull(estimate) @@ -125,8 +130,7 @@ ab |> n_a = sum(x), n_b = sum(1 - x), estimate = sum( - (y * x) / n_a - - y * (1 - x) / n_b + (y * x) / n_a - y * (1 - x) / n_b ) ) ``` @@ -139,7 +143,9 @@ We could extend the unweighted estimator to use weighted means to *estimate* our This version of inverse probability weighting is sometimes called the Hajek estimator. While we'll focus on using regression for outcome models, this simple extension of the unweighted estimator shows how we can modify an estimator to adapt to different estimands. -$$\frac{\sum_{i=1}^NY_i\times X_i\times w_i}{\sum_{i=1}^NX_i\times w_i}-\frac{\sum_{i=1}^NY_i\times(1-X_i)\times w_i}{\sum_{i=1}^N(1-X_i)\times w_i}$$ +$$ +\frac{\sum_{i=1}^NY_i\times X_i\times w_i}{\sum_{i=1}^NX_i\times w_i}-\frac{\sum_{i=1}^NY_i\times(1-X_i)\times w_i}{\sum_{i=1}^N(1-X_i)\times w_i} +$$ ## Estimating treatment effects with specific targets in mind @@ -152,9 +158,12 @@ A common estimand is the average treatment effect (ATE). The target population is the *total sample* or population of interest. The estimand here is the expected value of the difference in potential outcomes across all individuals: -$$E[Y(1) - Y(0)]$$ +$$ +E[Y(1) - Y(0)] +$$ -An example research question is "Should a policy be applied to all eligible patients?" [@greifer2021choosing]. +An example research question is "Should a policy be applied to all eligible patients?" +[@greifer2021choosing]. Most randomized controlled trials are designed with the ATE as the target estimand. The estimator we applied to `ab` earlier will this work. @@ -163,7 +172,9 @@ In a non-randomized setting, you can estimate the ATE using full matching. Each observation in the exposed group is matched to at least one observation in the control group (and vice versa) without replacement. We'll discuss estimands with matching further in @sec-matching-estimands) Alternatively, the following inverse probability weight will allow you to estimate the ATE using propensity score weighting. -$$w_{ATE} = \frac{X}{p} + \frac{(1 - X)}{1 - p}$$ +$$ +w_{ATE} = \frac{X}{p} + \frac{(1 - X)}{1 - p} +$$ In other words, the weight is one over the propensity score for those in the exposure group and one over one minus the propensity score for the control group. Intuitively, this weight equates to the inverse probability of being in the exposure group in which you actually were. @@ -172,8 +183,7 @@ That is to say, matching or weighting on the propensity score is only the first It's using the matched or weighted data in the outcome model that will then generate an estimate of the ATE. Let's dig deeper into this causal estimand using the Touring Plans data. -Recall that in @sec-using-ps, we examined the relationship between whether there were "Extra Magic Hours" in the morning and the average wait time for the Seven Dwarfs Mine Train the same day between 9 A.M. -and 10 A.M. +Recall that in @sec-using-ps, we examined the relationship between whether there were "Extra Magic Hours" in the morning and the average wait time for the Seven Dwarfs Mine Train the same day between 9 A.M. and 10 A.M. Let's reconstruct our data set, `seven_dwarfs`, and fit the same propensity score model specified previously. ```{r} @@ -182,13 +192,16 @@ library(touringplans) seven_dwarfs <- seven_dwarfs_train_2018 |> filter(wait_hour == 9) |> - mutate(park_extra_magic_morning = factor( - park_extra_magic_morning, - labels = c("No Magic Hours", "Extra Magic Hours") - )) + mutate( + park_extra_magic_morning = factor( + park_extra_magic_morning, + labels = c("No Magic Hours", "Extra Magic Hours") + ) + ) seven_dwarfs_with_ps <- glm( - park_extra_magic_morning ~ park_ticket_season + park_close + + park_extra_magic_morning ~ park_ticket_season + + park_close + park_temperature_high, data = seven_dwarfs, family = binomial() @@ -202,7 +215,10 @@ We will use the `tbl_summary()` function from the `{gtsummary}` package to do so ```{r} #| label: tbl-unweighted-gtsummary -#| tbl-cap: A descriptive table of Extra Magic Morning in the touringplans dataset. This table shows the distributions of these variables in the observed population. +#| tbl-cap: A descriptive table of Extra Magic Morning in the touringplans +#| dataset. This table shows the distributions of these variables in the +#| observed population. + library(gtsummary) library(labelled) seven_dwarfs_with_ps <- seven_dwarfs_with_ps |> @@ -223,8 +239,7 @@ tbl_summary( From @tbl-unweighted-gtsummary, 294 days did not have Extra Magic Hours in the morning, and 60 did. We also see that 30% of the Extra Magic Mornings were during peak season compared to 20% of the non-Extra Magic Mornings. -Additionally, days with Extra Magic Mornings were more likely to close at 6 P.M. -(18:00:00) compared to non-magic hour mornings. +Additionally, days with Extra Magic Mornings were more likely to close at 6 P.M. (18:00:00) compared to non-magic hour mornings. The median high temperature on days with Extra Magic Hour mornings is slightly lower (83 degrees) than non-Extra Magic Hour morning days. Now, let's construct our propensity score weight to estimate the ATE. @@ -241,8 +256,9 @@ Let's look at the distribution of these weights. ```{r} #| label: fig-sd-ate-hist -#| fig.cap: > +#| fig-cap: > #| A histogram of the average treatment effect (ATE) weights for whether or not a day had Extra Magic Hours. ATE weights can range from 1 to infinity, so paying attention to the actual range of weights is important. + ggplot(seven_dwarfs_wts, aes(x = w_ate)) + geom_histogram(bins = 50) ``` @@ -292,7 +308,8 @@ finite_sample_wts <- glm( finite_sample_wts |> summarize( - effect = sum(y * x * wts) / sum(x * wts) - + effect = sum(y * x * wts) / + sum(x * wts) - sum(y * (1 - x) * wts) / sum((1 - x) * wts) ) ``` @@ -307,6 +324,7 @@ Let's rerun this simulation many times at different sample sizes to explore the #| message: false #| cache: true #| code-fold: true + sim <- function(n) { ## create a simulated dataset finite_sample <- tibble( @@ -326,7 +344,8 @@ sim <- function(n) { mutate(wts = wt_ate(.fitted, x)) bias <- finite_sample_wts |> summarize( - effect = sum(y * x * wts) / sum(x * wts) - + effect = sum(y * x * wts) / + sum(x * wts) - sum(y * (1 - x) * wts) / sum((1 - x) * wts) ) |> pull() @@ -362,7 +381,8 @@ It isn't until a sample size larger than 10,000 that we see this bias disappear. Estimands that utilize unbounded weights (i.e., that theoretically can be infinitely large) are more likely to suffer from finite sample bias. The likelihood of falling into finite sample bias depends on:\ (1) the estimand you have chosen (i.e., are the weights bounded?)\ -(2) the distribution of the covariates in the exposed and unexposed groups (i.e., is there good overlap? Potential positivity violations, when there is poor overlap, are the regions where weights can become too large)\ +(2) the distribution of the covariates in the exposed and unexposed groups (i.e., is there good overlap? +Potential positivity violations, when there is poor overlap, are the regions where weights can become too large)\ (3) the sample size. ::: @@ -378,9 +398,12 @@ Even though we are not doing a survey analysis, the same techniques are helpful ```{r} #| label: tbl-weighted-gtsummary -#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the Average Treatment Effect Weights. This table shows the distributions of these variables in the pseudo-population created by these weights. +#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the +#| Average Treatment Effect Weights. This table shows the distributions of +#| these variables in the pseudo-population created by these weights. #| message: false #| warning: false + library(survey) library(halfmoon) hdr <- paste0( @@ -418,8 +441,9 @@ As we saw in @sec-eval-ps-model, the `{halfmoon}` package includes a function `g ```{r} #| label: fig-sd-mirror-hist-ate -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the distribution of propensity scores between exposure groups. The dark bars represent the unweighted distribution, and the colored bars represent the distribution weighted by the average treatment effect (ATE) weight. + library(halfmoon) ggplot(seven_dwarfs_wts, aes(.fitted, group = park_extra_magic_morning)) + geom_mirror_histogram(bins = 50) + @@ -443,7 +467,9 @@ We can also see that after weighting, the two distributions look similar; the sh Recall that our estimand is: -$$E[Y(1) - Y(0)]$$ +$$ +E[Y(1) - Y(0)] +$$ To use data to estimate averages of potential outcomes, we need to meet the three causal assumptions discussed in @sec-assump: exchangeability, positivity, and consistency. This estimand also tells us what part of our data we need to make these assumptions *about*. @@ -457,15 +483,21 @@ Another estimand we've seen---the default estimand in MatchIt---is the average t The target population for the ATT is the *exposed* (treated) population. This causal estimand conditions on those in the exposed group: -$$E[Y(1) - Y(0) | X = 1]$$ +$$ +E[Y(1) - Y(0) | X = 1] +$$ -Example research questions where the ATT is of interest could include "Should we stop our marketing campaign to those currently receiving it?" or "Should medical providers stop recommending treatment for those currently receiving it?" [@greifer2021choosing] +Example research questions where the ATT is of interest could include "Should we stop our marketing campaign to those currently receiving it?" +or "Should medical providers stop recommending treatment for those currently receiving it?" +[@greifer2021choosing] The ATT is a common target estimand when matching; here, all exposed observations are included and matched to control observations, some of which may be discarded. Alternatively, we can estimate the ATT via weighting. The ATT weight is estimated by: -$$w_{ATT} = X + \frac{(1 - X)p}{1 - p}$$ +$$ +w_{ATT} = X + \frac{(1 - X)p}{1 - p} +$$ In other words, the treated group receives a weight of 1, while the untreated group receives weights equal to their odds of being *treated*. Why do these weights work to target this population? @@ -506,6 +538,7 @@ For instance, for the ATE, the tilting function keeps the target population as-i #| fig-cap: "The tilting function of the ATE and the behavior of the weights for each group over the range of the propensity score (x-axis). The y-axis represents the value of the tilting function scalar (top panel) or the weights corresponding to the propensity score value (bottom two panels). For the ATE, the tilting function is $1$, so the weights remain unchanged. ATE weights can range from 0 to infinity for both groups." #| code-fold: true #| fig-width: 3 + ps <- seq(0.01, 0.99, by = 0.01) df <- tibble(ps = ps) |> @@ -528,7 +561,8 @@ df <- tibble(ps = ps) |> ) plot_df <- bind_rows( - df |> select(ps, starts_with("h_")) |> + df |> + select(ps, starts_with("h_")) |> pivot_longer( -ps, names_to = "estimand", @@ -579,7 +613,7 @@ plot_weight_properties <- function(plot_df, estimand_type) { y = NULL ) + theme_minimal() + - xlim(.01, .99) + + xlim(0.01, 0.99) + expand_limits(y = c(0, 1)) } @@ -593,6 +627,7 @@ For the ATT, the tilting function tries to make the population more like the tre #| fig-cap: "The tilting function of the ATT and the behavior of the weights for each group over the range of the propensity score (x-axis). The y-axis represents the value of the tilting function scalar (top panel) or the weights corresponding to the propensity score value (bottom two panels). For the ATT, the tilting function is $p$, the propensity score, so the weights are 1 for the treated and the odds of being treated for the controls. As an untreated observation approaches a propensity score of 1, the odds go to infinity." #| code-fold: true #| fig-width: 3 + plot_weight_properties(plot_df, "att") ``` @@ -600,8 +635,9 @@ Let's add the ATT weights to the `seven_dwarfs_wts` data frame and look at their ```{r} #| label: fig-sd-att-hist -#| fig.cap : > +#| fig-cap: > #| A histogram of the average treatment effect among the treated (ATT) weights. The range of the ATT weights in this example is more stable than the ATE weights: the range is much smaller. + seven_dwarfs_wts <- seven_dwarfs_wts |> mutate(w_att = wt_att(.fitted, park_extra_magic_morning)) @@ -620,9 +656,13 @@ Let's look at the weighted table to understand the pseudo-population we've creat ```{r} #| label: tbl-weighted-att -#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the Average Treatment Effect among the Treated Weights. This table shows the distributions of these variables in the pseudo-population created by these weights. +#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the +#| Average Treatment Effect among the Treated Weights. This table shows the +#| distributions of these variables in the pseudo-population created by these +#| weights. #| message: false #| warning: false + seven_dwarfs_svy <- svydesign( ids = ~1, data = seven_dwarfs_wts, @@ -647,8 +687,9 @@ We can again create a mirrored histogram to observe the weighted pseudo-populati ```{r} #| label: fig-sd-mirror-hist-att -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the distribution of propensity scores between exposure groups. The dark bars represent the unweighted distribution, and the colored bars represent the distribution weighted by the average treatment effect among the treated (ATT) weight. + ggplot(seven_dwarfs_wts, aes(.fitted, group = park_extra_magic_morning)) + geom_mirror_histogram(bins = 50) + geom_mirror_histogram( @@ -670,7 +711,9 @@ The ATT is easier to estimate when there are many more observations in the unexp The ATT and other estimands have an advantage over the ATE: because we only making inferences for a subset of the population, *the causal assumptions we need to make only need to be met for that subset of the population*. Since the estimand for the ATT is: -$$E[Y(1) - Y(0) | X = 1]$$ +$$ +E[Y(1) - Y(0) | X = 1] +$$ We have a weaker assumption because it only applies to the treated group as opposed to the whole population. For exchangeability, we are no longer trying to estimate the potential outcome `y(1)` for the untreated; we are only making inferences for the treated and have already observed `y(1)` for that group. @@ -683,15 +726,21 @@ The target population to estimate the average treatment effect among the unexpos We'll use both ATU and ATC to refer to this estimand. This causal estimand conditions on those in the unexposed group. -$$E[Y(1) - Y(0) | X = 0]$$ +$$ +E[Y(1) - Y(0) | X = 0] +$$ -Example research questions where the ATU is of interest could include "Should we send our marketing campaign to those not currently receiving it?" or "Should medical providers begin recommending treatment for those not currently receiving it?" [@greifer2021choosing] +Example research questions where the ATU is of interest could include "Should we send our marketing campaign to those not currently receiving it?" +or "Should medical providers begin recommending treatment for those not currently receiving it?" +[@greifer2021choosing] The ATU can be also estimated via matching; here, all unexposed observations are included and matched to exposed observations, some of which may be discarded. Alternatively, the ATU can be estimated via weighting. The ATU weight is estimated by: -$$w_{ATU} = \frac{X(1-p)}{p}+ (1-X)$$ +$$ +w_{ATU} = \frac{X(1-p)}{p}+ (1-X) +$$ This is the reverse of the ATT: the unexposed group gets a weight of 1, while the exposed group receives weights equal to the odds of being *untreated*. We're targeting the untreated group by using the probability of being untreated as a tilting function. @@ -717,6 +766,7 @@ Thus, the untreated get weights of 1, and the treated get weights based on the o #| fig-cap: "The tilting function of the ATU and the behavior of the weights for each group over the range of the propensity score (x-axis). The y-axis represents the value of the tilting function scalar (top panel) or the weights corresponding to the propensity score value (bottom two panels). For the ATU, the tilting function is $1-p$, the propensity score, so the weights are 1 for the untreated and the odds of being untreated for the treated. As a treated observation approaches a propensity score of 0, the odds go to infinity." #| code-fold: true #| fig-width: 3 + plot_weight_properties(plot_df, "atc") ``` @@ -724,8 +774,9 @@ Let's add the ATU weights to the `seven_dwarfs_wts` data frame and look at their ```{r} #| label: fig-sd-atu-hist -#| fig.cap : > +#| fig-cap: > #| A histogram of the average treatment effect among the unexposed (ATU) weights. The range of the ATU weights in this example is very similar to the ATE weights. + seven_dwarfs_wts <- seven_dwarfs_wts |> mutate(w_atu = wt_atu(.fitted, park_extra_magic_morning)) @@ -741,7 +792,10 @@ Now, let's take a look at the weighted table. ```{r} #| label: tbl-weighted-atu -#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the Average Treatment Effect among the Unexposed Weights. This table shows the distributions of these variables in the pseudo-population created by these weights. +#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the +#| Average Treatment Effect among the Unexposed Weights. This table shows the +#| distributions of these variables in the pseudo-population created by these +#| weights. #| warning: false seven_dwarfs_svy <- svydesign( @@ -764,8 +818,9 @@ We can again create a mirrored histogram to observe the weighted pseudo-populati ```{r} #| label: fig-sd-mirror-hist-atu -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the distribution of propensity scores between exposure groups. The dark bars represent the unweighted distribution, and the colored bars represent the distribution weighted by the average treatment effect among the unexposed (ATU) weight. + ggplot(seven_dwarfs_wts, aes(.fitted, group = park_extra_magic_morning)) + geom_mirror_histogram(bins = 50) + geom_mirror_histogram( @@ -794,10 +849,13 @@ In other words, we now have a positivity assumption of $0 > p$: there needs to b The target population to estimate the average treatment effect among the evenly matchable (ATM) is the evenly matchable. This causal estimand conditions on those deemed "evenly matchable" by some distance metric. -$$E[Y(1) - Y(0) | M_d = 1]$$ +$$ +E[Y(1) - Y(0) | M_d = 1] +$$ Here, $d$ denotes a distance measure, and $M_d=1$ indicates that a unit is evenly matchable under that distance measure [@samuels2017aspects; @d2018improving]. -Example research questions about the ATM could include "Should those at clinical equipoise receive the exposure?" [@greifer2021choosing]. +Example research questions about the ATM could include "Should those at clinical equipoise receive the exposure?" +[@greifer2021choosing]. In medicine, those at **clinical equipoise** are those about whom we are genuinely uncertain how to treat. This is very often the part of the population about which we want to make inferences because we want to make the best decision in the face of that uncertainty. @@ -808,7 +866,9 @@ This type of matching is often done via a caliper, where observations are only m Alternatively, the ATM can be estimated via weighting. The ATM weight is estimated by: -$$w_{ATM} = \frac{\min \{p, 1-p\}}{Xp + (1-X)(1-p)}$$ +$$ +w_{ATM} = \frac{\min \{p, 1-p\}}{Xp + (1-X)(1-p)} +$$ The tilting function for the ATM is $min\{p, 1-p\}$. @@ -837,6 +897,7 @@ Below 0.5, they have a weight of 1 and are downweighted as they approach a proba #| fig-cap: "The tilting function of the ATM and the behavior of the weights for each group over the range of the propensity score (x-axis). The y-axis represents the value of the tilting function scalar (top panel) or the weights corresponding to the propensity score value (bottom two panels). For the ATM, the tilting function is $min\\{p, 1-p\\}$. The untreated have weights of 1 above 0.5 and are downweighted as the propensity score approaches 0. The reverse is true for the treated." #| code-fold: true #| fig-width: 3 + plot_weight_properties(plot_df, "atm") ``` @@ -844,8 +905,9 @@ Let's add the ATM weights to the `seven_dwarfs_wts` data frame and look at their ```{r} #| label: fig-sd-atm-hist -#| fig.cap: > +#| fig-cap: > #| A histogram of the average treatment effect among the evenly matchable (ATM) weights for whether or not a day had Extra Magic Hours. ATM weights can only range from 0 to 1, so they are always stable. + seven_dwarfs_wts <- seven_dwarfs_wts |> mutate(w_atm = wt_atm(.fitted, park_extra_magic_morning)) @@ -861,7 +923,10 @@ Now, let's take a look at the weighted table. ```{r} #| label: tbl-weighted-atm -#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the Average Treatment Effect among the Evenly Matchable Weights. This table shows the distributions of these variables in the pseudo-population created by these weights. +#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the +#| Average Treatment Effect among the Evenly Matchable Weights. This table +#| shows the distributions of these variables in the pseudo-population created +#| by these weights. #| warning: false seven_dwarfs_svy <- svydesign( @@ -886,8 +951,9 @@ We can again create a mirrored histogram to observe the ATM-weighted pseudo-popu ```{r} #| label: fig-sd-mirror-hist-atm -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the distribution of propensity scores between exposure groups. The dark bars represent the unweighted distribution, and the colored bars represent the distribution weighted by the average treatment effect among the evenly matchable (ATM) weights. + ggplot(seven_dwarfs_wts, aes(.fitted, group = park_extra_magic_morning)) + geom_mirror_histogram(bins = 50) + geom_mirror_histogram( @@ -912,13 +978,16 @@ This is often more feasible simply because the confounding in the middle regions The ATM estimates a particular population at equipoise (the evenly matchable), but there are several ways we could consider equipoise. A related target population is to estimate the average treatment effect among the overlap population (ATO). -Example research questions where the ATO is of interest are the same as those for the ATM, such as "Should those at clinical equipoise receive the exposure?" [@greifer2021choosing]. +Example research questions where the ATO is of interest are the same as those for the ATM, such as "Should those at clinical equipoise receive the exposure?" +[@greifer2021choosing]. Again, these weights are similar to the ATM weights but are slightly attenuated, yielding improved variance properties. The ATO weight is estimated by: -$$w_{ATO} = X(1-p) + (1-X)p$$ +$$ +w_{ATO} = X(1-p) + (1-X)p +$$ The tilting function for the ATO is $p(1-p)$[^10-estimands-1]. @@ -947,6 +1016,7 @@ We give each group weights equal to the probability of being in the other group! #| fig-cap: "The tilting function of the ATO and the behavior of the weights for each group over the range of the propensity score (x-axis). The y-axis represents the value of the tilting function scalar (top panel) or the weights corresponding to the propensity score value (bottom two panels). For the ATO, the tilting function is $p(1-p)$. The weights are linearly downweighted towards a probability of 0 for the group the observation falls in." #| code-fold: true #| fig-width: 3 + plot_weight_properties(plot_df, "ato") ``` @@ -954,8 +1024,9 @@ Let's add the ATO weights to the `seven_dwarfs_wts` data frame and look at their ```{r} #| label: fig-sd-ato-hist -#| fig.cap: > +#| fig-cap: > #| A histogram of the average treatment effect among the overlap population (ATO) weights for whether or not a day had Extra Magic Hours. Like ATM weights, ATO weights can range from 0 to 1, so they are always stable. ATO weights also have improved variance properties compared to the ATM weights. + seven_dwarfs_wts <- seven_dwarfs_wts |> mutate(w_ato = wt_ato(.fitted, park_extra_magic_morning)) @@ -977,6 +1048,7 @@ Let's revisit our finite sample bias simulation, adding the ATO weights to exami #| message: false #| cache: true #| code-fold: true + sim <- function(n) { ## create a simulated dataset finite_sample <- tibble( @@ -999,9 +1071,11 @@ sim <- function(n) { ) bias <- finite_sample_wts |> summarize( - effect_ate = sum(y * x * wts_ate) / sum(x * wts_ate) - + effect_ate = sum(y * x * wts_ate) / + sum(x * wts_ate) - sum(y * (1 - x) * wts_ate) / sum((1 - x) * wts_ate), - effect_ato = sum(y * x * wts_ato) / sum(x * wts_ato) - + effect_ato = sum(y * x * wts_ato) / + sum(x * wts_ato) - sum(y * (1 - x) * wts_ato) / sum((1 - x) * wts_ato) ) tibble( @@ -1038,7 +1112,10 @@ Now, let's take a look at the weighted table. ```{r} #| label: tbl-weighted-ato -#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the Average Treatment Effect among the Overlap Population Weights. This table shows the distributions of these variables in the pseudo-population created by these weights. +#| tbl-cap: A descriptive table of Extra Magic Morning hours weighted by the +#| Average Treatment Effect among the Overlap Population Weights. This table +#| shows the distributions of these variables in the pseudo-population created +#| by these weights. #| warning: false seven_dwarfs_svy <- svydesign( @@ -1061,8 +1138,9 @@ We can again create a mirrored histogram to observe the ATO-weighted pseudo-popu ```{r} #| label: fig-sd-mirror-hist-ato -#| fig.cap: > +#| fig-cap: > #| A mirrored histogram of the distribution of propensity scores between exposure groups. The dark bars represent the unweighted distribution, and the colored bars represent the distribution weighted by the average treatment effect among the overlap population (ATO) weight. + ggplot(seven_dwarfs_wts, aes(.fitted, group = park_extra_magic_morning)) + geom_mirror_histogram(bins = 50) + geom_mirror_histogram( @@ -1089,6 +1167,7 @@ In @fig-ato-love, all three covariates have an SMD of 0 when weighted by ATO wei ```{r} #| label: fig-ato-love #| fig-cap: "A Love plot of ticket season, park close time, and historic high temperature in the original dataset and with ATO weights. The ATO weights balance perfectly on the mean for all confounders." + seven_dwarfs_wts |> mutate(park_close = as.numeric(park_close)) |> tidy_smd( @@ -1116,12 +1195,12 @@ It may be helpful to look at the changes between the populations when trimming o MatchIt can easily target different estimands with the `estimand` argument. Note that you need to use full matching for the ATE and a caliper for the ATM. -| estimand | MatchIt command | -|----------|-------------------------------------------------| -| ATE | `matchit(…, method = "full", estimand = "ATE")` | -| ATT | `matchit(…, estimand = "ATT")` | -| ATC | `matchit(…, estimand = "ATC")` | -| ATM | `matchit(…, caliper = )` | + | estimand | MatchIt command | + | -------- | ----------------------------------------------- | + | ATE | `matchit(…, method = "full", estimand = "ATE")` | + | ATT | `matchit(…, estimand = "ATT")` | + | ATC | `matchit(…, estimand = "ATC")` | + | ATM | `matchit(…, caliper = )` | The intuition for how weights target different populations comes down to the tilting function: we scale the weights to represent the population we're interested in. The intuition for matching is more straightforward: it comes down to who we drop and don't drop. @@ -1147,6 +1226,7 @@ In matching, we control the level of overlap we find acceptable with a caliper. #| echo: false #| fig-width: 4 #| fig-height: 4 + set.seed(10) p <- tibble( x = c(runif(10, 0, 0.9), runif(10, 0.1, 1)), @@ -1248,62 +1328,73 @@ Since other estimands besides the ATE weaken causal assumptions, they might be m Below is a table summarizing the estimands and methods for estimating them (including R functions), along with sample questions extended from Table 2 in @greifer2021choosing. -+----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ -| Estimand | Target population | Example Research Question | Matching Method | Weighting Method | -+==========+==================================+======================================================================================================================================================+======================================+==================+ -| ATE | Full population | Should we have Extra Magic Hours *all* mornings to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Full matching | ATE Weights | -| | | | | | -| | | Should a specific policy be applied to all eligible observations? | Fine stratification | `wt_ate()` | -| | | | | | -| | | | `matchit(…, estimand = "ATE")` | | | -+----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ -| ATT | Exposed (treated) observations | Should we stop Extra Magic Hours to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Pair matching without a caliper | ATT weights | -| | | | | | -| | | Should we stop our marketing campaign to those currently receiving it? | Full matching | `wt_att()` | -| | | | | | -| | | Should medical providers stop recommending treatment for those currently receiving it? | Fine stratification | | -| | | | | | -| | | | `matchit(…, estimand = "ATT")` | | -+----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ -| ATU | Unexposed (control) observations | Should we add Extra Magic Hours for all days to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Pair matching without a caliper | ATU weights | -| | | | | | -| | | Should we extend our marketing campaign to those not receiving it? | Full matching | `wt_atu()` | -| | | | | | -| | | Should medical providers extend treatment to those not currently receiving it? | Fine stratification | | -| | | | | | -| | | | `matchit(…, estimand = "ATC")` | | -+----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ -| ATM | Evenly matchable | Are there some days we should change whether we are offering Extra Magic Hours to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Caliper matching | ATM weights | -| | | | | | -| | | Is there an effect of the exposure for some observations? | `matchit(…, caliper = 0.1)` | `wt_atm()` | -| | | | | | -| | | Should those at clinical equipoise receive treatment? | Coarsened exact matching | | -| | | | | | -| | | | Cardinality matching | | -| | | | | | -| | | | `matchit(…, method = "cardinality")` | | -+----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ -| ATO | Overlap population | Same as ATM | | ATO weights | -| | | | | | -| | | | | `wt_ato()` | -+----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ + +----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ + | Estimand | Target population | Example Research Question | Matching Method | Weighting Method | + +==========+==================================+======================================================================================================================================================+======================================+==================+ + | ATE | Full population | Should we have Extra Magic Hours *all* mornings to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Full matching | ATE Weights | + | | | | | | + | | | Should a specific policy be applied to all eligible observations? | Fine stratification | `wt_ate()` | + | | | | | | + | | | | `matchit(…, estimand = "ATE")` | | + +----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ + | ATT | Exposed (treated) observations | Should we stop Extra Magic Hours to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Pair matching without a caliper | ATT weights | + | | | | | | + | | | Should we stop our marketing campaign to those currently receiving it? | Full matching | `wt_att()` | + | | | | | | + | | | Should medical providers stop recommending treatment for those currently receiving it? | Fine stratification | | + | | | | | | + | | | | `matchit(…, estimand = "ATT")` | | + +----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ + | ATU | Unexposed (control) observations | Should we add Extra Magic Hours for all days to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Pair matching without a caliper | ATU weights | + | | | | | | + | | | Should we extend our marketing campaign to those not receiving it? | Full matching | `wt_atu()` | + | | | | | | + | | | Should medical providers extend treatment to those not currently receiving it? | Fine stratification | | + | | | | | | + | | | | `matchit(…, estimand = "ATC")` | | + +----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ + | ATM | Evenly matchable | Are there some days we should change whether we are offering Extra Magic Hours to change the wait time for Seven Dwarfs Mine Train between 5–6 P.M.? | Caliper matching | ATM weights | + | | | | | | + | | | Is there an effect of the exposure for some observations? | `matchit(…, caliper = 0.1)` | `wt_atm()` | + | | | | | | + | | | Should those at clinical equipoise receive treatment? | Coarsened exact matching | | + | | | | | | + | | | | Cardinality matching | | + | | | | | | + | | | | `matchit(…, method = "cardinality")` | | + +----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ + | ATO | Overlap population | Same as ATM | | ATO weights | + | | | | | | + | | | | | `wt_ato()` | + +----------+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+------------------+ ### Effective Sample Size -As discussed in @sec-eval-ps-model, effective sample size (ESS) is a way to understand how efficient various estimates are. It measures how many independent observations would give the same variance as your weighted sample. ESS is useful both as a general check on the efficiency of your chosen method and estimation process, as well as a tool for choosing between methods when multiple approaches address your research question. +As discussed in @sec-eval-ps-model, effective sample size (ESS) is a way to understand how efficient various estimates are. +It measures how many independent observations would give the same variance as your weighted sample. +ESS is useful both as a general check on the efficiency of your chosen method and estimation process, as well as a tool for choosing between methods when multiple approaches address your research question. -When you have more than one set of weights or matching schemes that addresses your research question, comparing the ESS of each can help you decide which to use. All else being equal, pick the one with the highest ESS. `ess()` and `plot_ess()` in `{halfmoon}` can help with this. In @fig-ess, we see that the ATE and ATU weights have a much lower ESS than the other weights, which are all relatively similar, with the ATO weights having the highest ESS. +When you have more than one set of weights or matching schemes that addresses your research question, comparing the ESS of each can help you decide which to use. +All else being equal, pick the one with the highest ESS. +`ess()` and `plot_ess()` in `{halfmoon}` can help with this. +In @fig-ess, we see that the ATE and ATU weights have a much lower ESS than the other weights, which are all relatively similar, with the ATO weights having the highest ESS. ```{r} #| label: fig-ess #| fig-cap: "A plot of the effective sample size (ESS) for each set of weights. The ESS is the number of observations that would give the same variance as the weighted sample. The bars are relative to the original sample size in percent." #| fig-width: 5 + library(halfmoon) seven_dwarfs_wts |> plot_ess(.weights = starts_with("w_")) ``` -The fundamental tension in looking at balance vs. precision is a bias-variance trade-off. The observed sample size is the highest, but it also has the worst balance and may not address our research question. The ATO has the best balance and the highest ESS, but it addresses a particular sort of research question that may not suit our needs. Had we randomized the exposure, we would have had the best balance and the highest ESS, at least on average and with a high enough (actual) sample size. However, in observational data, we often have to make trade-offs between bias and variance. While weighting reduces our effective sample size, it's often a necessary trade-off: without it, we'd get a precisely-estimated wrong answer! +The fundamental tension in looking at balance vs. precision is a bias-variance trade-off. +The observed sample size is the highest, but it also has the worst balance and may not address our research question. +The ATO has the best balance and the highest ESS, but it addresses a particular sort of research question that may not suit our needs. +Had we randomized the exposure, we would have had the best balance and the highest ESS, at least on average and with a high enough (actual) sample size. +However, in observational data, we often have to make trade-offs between bias and variance. +While weighting reduces our effective sample size, it's often a necessary trade-off: without it, we'd get a precisely-estimated wrong answer! ## What estimand does multivariable linear regression target? @@ -1321,8 +1412,10 @@ The default is for the ATE, but we can also target other estimands. ```{r} library(lmw) implied_weights <- lmw( - ~ park_extra_magic_morning + park_ticket_season + park_close + - park_temperature_high, + ~ park_extra_magic_morning + + park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_with_ps, treat = "park_extra_magic_morning" ) diff --git a/chapters/11-outcome-model.qmd b/chapters/11-outcome-model.qmd index 57291fe..f0c007b 100644 --- a/chapters/11-outcome-model.qmd +++ b/chapters/11-outcome-model.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("polishing") ``` @@ -16,6 +17,7 @@ For example, re-performing the matching as we did in @sec-using-ps, we can extra ```{r} #| message: false #| warning: false + library(broom) library(touringplans) library(MatchIt) @@ -24,7 +26,9 @@ seven_dwarfs_9 <- seven_dwarfs_train_2018 |> filter(wait_hour == 9) m <- matchit( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_9 ) matched_data <- get_matches(m) @@ -50,13 +54,16 @@ We will use the ATT weights so the analysis matches that for matching above. ```{r} #| message: false #| warning: false + library(propensity) propensity_model <- glm( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, - data = seven_dwarfs_9, - family = binomial() - ) + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, + data = seven_dwarfs_9, + family = binomial() +) seven_dwarfs_9_with_ps <- propensity_model |> augment(type.predict = "response", data = seven_dwarfs_9) @@ -102,9 +109,9 @@ Causal inference with `group_by()` and `summarize()` works just fine now, since There are three ways to estimate the uncertainty: -1. A bootstrap -2. A sandwich estimator that only takes into account the outcome model -3. A sandwich estimator that takes into account the propensity score model +1. A bootstrap +2. A sandwich estimator that only takes into account the outcome model +3. A sandwich estimator that takes into account the propensity score model The first option can be computationally intensive, but should get you the correct estimates. The second option is computationally the easiest, but tends to overestimate the variability. @@ -112,7 +119,7 @@ There are not many current solutions in R (aside from coding it up yourself) for ### The bootstrap {#sec-bootstrap} -1. Create a function to run your analysis once on a sample of your data +1. Create a function to run your analysis once on a sample of your data ```{r} fit_ipw <- function(.split, ...) { @@ -121,7 +128,9 @@ fit_ipw <- function(.split, ...) { # fit propensity score model propensity_model <- glm( - park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, + park_extra_magic_morning ~ park_ticket_season + + park_close + + park_temperature_high, data = seven_dwarfs_9, family = binomial() ) @@ -129,11 +138,13 @@ fit_ipw <- function(.split, ...) { # calculate inverse probability weights .df <- propensity_model |> augment(type.predict = "response", data = .df) |> - mutate(wts = wt_att( - .fitted, - park_extra_magic_morning, - exposure_type = "binary" - )) + mutate( + wts = wt_att( + .fitted, + park_extra_magic_morning, + exposure_type = "binary" + ) + ) # fit correctly bootstrapped ipw model lm( @@ -145,11 +156,12 @@ fit_ipw <- function(.split, ...) { } ``` -2. Use {rsample} to bootstrap our causal effect +2. Use {rsample} to bootstrap our causal effect ```{r} #| message: false #| warning: false + library(rsample) # fit ipw model to bootstrapped samples @@ -172,9 +184,11 @@ ipw_results |> mutate( estimate = map_dbl( boot_fits, - \(.fit) .fit |> - filter(term == "park_extra_magic_morning") |> - pull(estimate) + \(.fit) { + .fit |> + filter(term == "park_extra_magic_morning") |> + pull(estimate) + } ) ) |> ggplot(aes(estimate)) + @@ -182,7 +196,7 @@ ipw_results |> theme_minimal() ``` -3. Pull out the causal effect +3. Pull out the causal effect ```{r} # get t-based CIs @@ -202,6 +216,7 @@ Using the `sandwich` function, we can get the robust estimate for the variance f ```{r} #| message: false #| warning: false + library(sandwich) weighted_mod <- lm( wait_minutes_posted_avg ~ park_extra_magic_morning, @@ -232,6 +247,7 @@ To do this, we need to create a design object, like we did when fitting weighted ```{r} #| message: false #| warning: false + library(survey) des <- svydesign( @@ -262,7 +278,7 @@ results We can also collect the results in a data frame. ```{r} -results |> +results |> as.data.frame() ``` @@ -283,8 +299,8 @@ In the case of a binary outcome, we calculate average probabilities for each tre Let's call these `p_untreated` and `p_treated`. When we're working with these probabilities, calculating the risk difference and risk ratio is simple: -- **Risk difference**: `p_treated - p_untreated` -- **Risk ratio**: `p_treated / p_untreated` +- **Risk difference**: `p_treated - p_untreated` +- **Risk ratio**: `p_treated / p_untreated` ::: callout-note By "risk", we mean the risk of an outcome. @@ -295,7 +311,7 @@ A more general way to think about these is as the difference in or ratio of the The odds for a probability is calculated as `p / (1 - p)`, so the odds ratio is: -- **Odds ratio**: `(p_treated / (1 - p_treated)) / (p_untreated / (1 - p_untreated))` +- **Odds ratio**: `(p_treated / (1 - p_treated)) / (p_untreated / (1 - p_untreated))` When outcomes are rare, `(1 - p)` approaches 1, and odds ratios approximate risk ratios. The rarer the outcome, the closer the approximation. @@ -384,6 +400,7 @@ In other words, the effect estimate of `exposure` on `outcome` should be the sam #| fig-height: 4 #| fig-align: "center" #| fig-cap: "A DAG showing the causal relationship between `outcome`, `exposure`, and `covariate`. `exposure` and `covariate` both cause `outcome`, but there is no relationship between `exposure` and `covariate`. In a logistic regression, the odds ratio for exposure will be non-collapsible over strata of covariate." + library(ggdag) dagify( outcome ~ exposure + covariate, @@ -407,6 +424,7 @@ outcome <- rbinom(n, 1, plogis(-0.5 + exposure + 2 * covariate)) ```{r} #| echo: false + odds_ratio <- function(tbl) { or <- (tbl[2, 2] * tbl[1, 1]) / (tbl[1, 2] * tbl[2, 1]) round(or, digits = 2) @@ -422,10 +440,10 @@ marginal_table <- table(exposure, outcome) marginal_or <- odds_ratio(marginal_table) marginal_rr <- risk_ratio(marginal_table) conditional_tables <- table(exposure, outcome, covariate) -conditional_or_0 <- odds_ratio(conditional_tables[, , 1]) -conditional_or_1 <- odds_ratio(conditional_tables[, , 2]) -conditional_rr_0 <- risk_ratio(conditional_tables[, , 1]) -conditional_rr_1 <- risk_ratio(conditional_tables[, , 2]) +conditional_or_0 <- odds_ratio(conditional_tables[,, 1]) +conditional_or_1 <- odds_ratio(conditional_tables[,, 2]) +conditional_rr_0 <- risk_ratio(conditional_tables[,, 1]) +conditional_rr_1 <- risk_ratio(conditional_tables[,, 2]) ``` First, let's look at the relationship between `exposure` and `outcome` among everyone. diff --git a/chapters/12-other-exposures.qmd b/chapters/12-other-exposures.qmd index b318e20..99aba0d 100644 --- a/chapters/12-other-exposures.qmd +++ b/chapters/12-other-exposures.qmd @@ -6,6 +6,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("wip") ``` @@ -29,10 +30,10 @@ Here's our DAG: #| code-fold: true #| message: false #| warning: false -#| fig.cap: > -#| Proposed DAG for the relationship between posted wait -#| in the morning at a particular park and the average -#| wait time between 5 pm and 6 pm. +#| fig-cap: > +#| Proposed DAG for the relationship between posted wait +#| in the morning at a particular park and the average +#| wait time between 5 pm and 6 pm. library(tidyverse) library(ggdag) @@ -66,7 +67,10 @@ dagify( ggplot( aes(x, y, xend = xend, yend = yend, color = status) ) + - geom_dag_edges_arc(curvature = c(rep(0, 7), .2, 0, .2, .2, 0), edge_colour = "grey70") + + geom_dag_edges_arc( + curvature = c(rep(0, 7), 0.2, 0, 0.2, 0.2, 0), + edge_colour = "grey70" + ) + geom_dag_point() + geom_dag_label_repel(seed = 1602) + scale_color_okabe_ito(na.value = "grey90") + @@ -94,13 +98,13 @@ This is the only minimal adjustment set in the DAG, as well. The confounders precede the exposure and outcome, and (by definition) the exposure precedes the outcome. The average posted wait time is, in theory, a manipulable exposure because the park could post a time different from what they expect. - The model is similar to the binary exposure case, but we need to use linear regression, as the posted time is a continuous variable. Since we're not using probabilities, we'll calculate denominators for our weights from a normal density. We then calculate the denominator using the `dnorm()` function, which calculates the normal density for the `exposure`, using `.fitted` as the mean and `mean(.sigma)` as the SD. ```{r} #| eval: false + lm( exposure ~ confounder_1 + confounder_2, data = df @@ -127,6 +131,7 @@ Bounded weights like the ATO (which are bounded to 0 and 1) do not have this pro ```{r} #| eval: false + # for continuous exposures lm( exposure ~ 1, @@ -176,7 +181,10 @@ We'll fit a model using `lm()` for `wait_minutes_posted_avg` with our covariates library(broom) denominator_model <- lm( wait_minutes_posted_avg ~ - park_close + park_extra_magic_morning + park_temperature_high + park_ticket_season, + park_close + + park_extra_magic_morning + + park_temperature_high + + park_ticket_season, data = wait_times ) @@ -196,8 +204,9 @@ When we only use the inverted values of `denominator`, we end up with several ex ```{r} #| label: fig-hist-sd-unstable -#| fig.cap: > +#| fig-cap: > #| A histogram of the inverse probability weights for posted waiting time. Weights for continuous exposures are prone to extreme values, which can unstabilize estimates and variance. + denominators |> mutate(wts = 1 / denominator) |> ggplot(aes(wts)) + @@ -242,8 +251,9 @@ If the mean is far from 1, we may have issues with model misspecification or pos ```{r} #| label: fig-hist-sd-stable -#| fig.cap: > +#| fig-cap: > #| A histogram of the stabilized inverse probability weights for posted waiting time. These weights are much more reasonable and will allow the outcome model to behave better. + ggplot(wait_times_wts, aes(swts)) + geom_histogram(fill = "#E69F00", color = "white", bins = 50) + scale_x_log10(name = "weights") @@ -254,8 +264,9 @@ Is this a problem, or is this a valid data point? ```{r} #| label: fig-stabilized-wts-scatter -#| fig.cap: > +#| fig-cap: > #| A scatter of the stabilized inverse probability weights for posted waiting time vs. posted waiting times. Days with more values of `wait_minutes_posted_avg` farther from the mean appear to be downweighted, with a few exceptions. The most unusual weight is for June 23, 2018. + ggplot(wait_times_wts, aes(wait_minutes_posted_avg, swts)) + geom_point(size = 3, color = "grey80", alpha = 0.7) + geom_point( @@ -301,6 +312,7 @@ We don't know why the posted time was so high (the actual time was much lower), ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` @@ -314,4 +326,3 @@ rnorm(5) ### Diagnostics with many categories ### Fitting the outcome model again - diff --git a/chapters/13-g-comp.qmd b/chapters/13-g-comp.qmd index 33f4d3b..1f51cb8 100644 --- a/chapters/13-g-comp.qmd +++ b/chapters/13-g-comp.qmd @@ -4,27 +4,40 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("wip") ``` ## The Parametric G-Formula -The parametric g-formula provides a direct approach to causal estimation through outcome modeling. Rather than reweighting observations to balance confounders, as we have done so far in this book, we model the outcome as a function of exposure and covariates, then use that model to predict what would happen under different exposure scenarios. By averaging these predictions across the population, we obtain estimates of causal effects. +The parametric g-formula provides a direct approach to causal estimation through outcome modeling. +Rather than reweighting observations to balance confounders, as we have done so far in this book, we model the outcome as a function of exposure and covariates, then use that model to predict what would happen under different exposure scenarios. +By averaging these predictions across the population, we obtain estimates of causal effects. -We start with fitting a conditional outcome model. Importantly, this means we must correctly specify the functional form of the outcome model. If our model is misspecified, our causal estimates will be biased. The outcome model must contain all variables in the adjustment set identified from the causal diagram. Like weighting and matching-based methods, valid inference is predicated on the assumption that this adjustment set is sufficient to identify the causal effect of interest. Once this model is fit, it can be used to generate predictions under counterfactual scenarios in which the exposure is set to the levels of interest. The resulting predicted outcomes are then averaged to create estimates of the expected outcome under each hypothetical intervention. The approach is called 'parametric' because we assume a specific parametric model (e.g., linear or logistic regression) for the outcome. +We start with fitting a conditional outcome model. +Importantly, this means we must correctly specify the functional form of the outcome model. +If our model is misspecified, our causal estimates will be biased. +The outcome model must contain all variables in the adjustment set identified from the causal diagram. +Like weighting and matching-based methods, valid inference is predicated on the assumption that this adjustment set is sufficient to identify the causal effect of interest. +Once this model is fit, it can be used to generate predictions under counterfactual scenarios in which the exposure is set to the levels of interest. +The resulting predicted outcomes are then averaged to create estimates of the expected outcome under each hypothetical intervention. +The approach is called 'parametric' because we assume a specific parametric model (e.g., linear or logistic regression) for the outcome. -This approach has several advantages: it naturally handles multiple exposure comparisons, accommodates complex relationships through flexible modeling, and can be more efficient when outcomes are easier to model than exposure assignment. In this chapter, we will begin with the single exposure time point setting, demonstrating the core logic of the g-formula with binary and continuous exposures. +This approach has several advantages: it naturally handles multiple exposure comparisons, accommodates complex relationships through flexible modeling, and can be more efficient when outcomes are easier to model than exposure assignment. +In this chapter, we will begin with the single exposure time point setting, demonstrating the core logic of the g-formula with binary and continuous exposures. ### A Single Exposure Time Point Example -Let's again consider whether Extra Magic Morning hours influence the average posted wait time for the Seven Dwarfs Mine Train ride between 9 and 10am as described in @sec-data and assume the same data generating mechanism proposed in @fig-dag-magic-hours-wait. To implement the parametric g-formula we will (1) fit an outcome model with the exposure and all confounders, (2) create two versions of the dataset in which each observation is set to be exposed or unexposed, (3) generate predicted outcomes for each version using the fitted model, (4) average those predictions to construct marginal means by exposure, and (5) take the difference in marginal means to estimate the average treatment effect. +Let's again consider whether Extra Magic Morning hours influence the average posted wait time for the Seven Dwarfs Mine Train ride between 9 and 10am as described in @sec-data and assume the same data generating mechanism proposed in @fig-dag-magic-hours-wait. +To implement the parametric g-formula we will (1) fit an outcome model with the exposure and all confounders, (2) create two versions of the dataset in which each observation is set to be exposed or unexposed, (3) generate predicted outcomes for each version using the fitted model, (4) average those predictions to construct marginal means by exposure, and (5) take the difference in marginal means to estimate the average treatment effect. Let's first fit the outcome model that includes the exposure (`park_extra_magic_morning`) and all confounders (the historic high temperature on the day `park_temperature_high`, the time the park closed `park_close`, and the ticket season `park_ticket_season`). ```{r} #| message: false #| warning: false + library(tidyverse) library(broom) library(touringplans) @@ -42,27 +55,38 @@ gcomp_model <- lm( ) ``` -We then duplicate the `seven_dwarfs_9` dataset for each unique level of exposure (in this case two: one for the exposed and one for the unexposed). One copy assigns every park date as if Extra Magic Morning hours occurred and the other assigns every park date as if it did not. These counterfactual datasets represent the hypothetical worlds under which the exposure is uniformly set to each value. +We then duplicate the `seven_dwarfs_9` dataset for each unique level of exposure (in this case two: one for the exposed and one for the unexposed). +One copy assigns every park date as if Extra Magic Morning hours occurred and the other assigns every park date as if it did not. +These counterfactual datasets represent the hypothetical worlds under which the exposure is uniformly set to each value. ```{r} exposed <- seven_dwarfs_9 |> mutate(park_extra_magic_morning = 1) unexposed <- seven_dwarfs_9 |> mutate(park_extra_magic_morning = 0) ``` -To build intuition for what these counterfactual datasets look like, it helps to examine them directly. Each row in the original data is "cloned" twice, once into a world where Extra Magic Morning hours occurred, and once into a world where they did not. The covariates are identical across the two clones; only the exposure value differs. +To build intuition for what these counterfactual datasets look like, it helps to examine them directly. +Each row in the original data is "cloned" twice, once into a world where Extra Magic Morning hours occurred, and once into a world where they did not. +The covariates are identical across the two clones; only the exposure value differs. ```{r} bind_rows( exposed |> mutate(clone = "exposed"), unexposed |> mutate(clone = "unexposed") ) |> - select(park_date, clone, park_extra_magic_morning, - park_ticket_season, park_temperature_high) |> + select( + park_date, + clone, + park_extra_magic_morning, + park_ticket_season, + park_temperature_high + ) |> arrange(park_date) |> slice_head(n = 6) ``` -Using the previously fitted outcome model, predicted outcomes are generated for each of these counterfactual datasets and averaged. These averages (marginal means) represent the expected posted wait time if everyone were exposed or if no one were exposed. We can then take the difference between these marginal means to estimate the causal effect of Extra Magic Morning on posted wait times. +Using the previously fitted outcome model, predicted outcomes are generated for each of these counterfactual datasets and averaged. +These averages (marginal means) represent the expected posted wait time if everyone were exposed or if no one were exposed. +We can then take the difference between these marginal means to estimate the causal effect of Extra Magic Morning on posted wait times. ```{r} pred_exposed <- gcomp_model |> @@ -103,9 +127,14 @@ bind_cols( ) ``` -Each point represents a single park date predicted under the exposed and unexposed scenarios. Because we are using a linear model, the two sets of predictions are parallel (in other words, the gap between them is constant across the range of temperature). That gap, averaged across all park dates, is the estimated average treatment effect. Notice that we are not plugging in a single "representative" covariate value; we are generating predictions for every observation at their actual covariate values and then averaging. This is what it means to marginalize over the covariate distribution, and it is what distinguishes g-computation from simply extracting a coefficient from the outcome model. +Each point represents a single park date predicted under the exposed and unexposed scenarios. +Because we are using a linear model, the two sets of predictions are parallel (in other words, the gap between them is constant across the range of temperature). +That gap, averaged across all park dates, is the estimated average treatment effect. +Notice that we are not plugging in a single "representative" covariate value; we are generating predictions for every observation at their actual covariate values and then averaging. +This is what it means to marginalize over the covariate distribution, and it is what distinguishes g-computation from simply extracting a coefficient from the outcome model. -The `{marginaleffects}` package makes some of these mechanics easy and help with uncertainty quantification. For example, rather than creating the separate counterfactual datasets, we can calculate the average treatment effect directly using the `average_comparisons` function. +The `{marginaleffects}` package makes some of these mechanics easy and help with uncertainty quantification. +For example, rather than creating the separate counterfactual datasets, we can calculate the average treatment effect directly using the `average_comparisons` function. ```{r} library(marginaleffects) @@ -117,23 +146,38 @@ ate <- avg_comparisons( ate ``` -It is worth pausing here to connect back to the framework from @sec-estimands. The estimand is the same whether we use IPW or an outcome model: $E[Y(1)−Y(0)]$, the average treatment effect. What differs is the -estimator, the recipe we use to get there. When we used IPW in @sec-outcome-model, we reweighted the observed data to balance confounders across exposure groups. Above, with g-computation we fit the outcome model to directly estimate the potential outcomes under each exposure level and average their difference. Because these are different recipes for the same cake, we should not be surprised if they produce somewhat different estimates in practice. Each relies on its own modeling assumptions: IPW depends on a correctly specified propensity score model, while the outcome model depends on a correctly specified model for the outcome. When those models are wrong in different ways, the estimates will diverge. +It is worth pausing here to connect back to the framework from @sec-estimands. +The estimand is the same whether we use IPW or an outcome model: $E[Y(1)−Y(0)]$, the average treatment effect. +What differs is the estimator, the recipe we use to get there. +When we used IPW in @sec-outcome-model, we reweighted the observed data to balance confounders across exposure groups. +Above, with g-computation we fit the outcome model to directly estimate the potential outcomes under each exposure level and average their difference. +Because these are different recipes for the same cake, we should not be surprised if they produce somewhat different estimates in practice. +Each relies on its own modeling assumptions: IPW depends on a correctly specified propensity score model, while the outcome model depends on a correctly specified model for the outcome. +When those models are wrong in different ways, the estimates will diverge. ::: {.callout-note} ## Under the Hood: The Delta Method -When `{marginaleffects}` computes uncertainty estimates for quantities like average treatment effects, it relies on the **delta method**, a technique for approximating the variance of a transformation of estimated parameters. If you have a vector of coefficients $\hat{\boldsymbol{\beta}}$ with known variance-covariance matrix $\Sigma$, and you want the variance of some function $g(\hat{\boldsymbol{\beta}})$ (like a predicted probability or a marginal effect), the delta method gives you: +When `{marginaleffects}` computes uncertainty estimates for quantities like average treatment effects, it relies on the **delta method**, a technique for approximating the variance of a transformation of estimated parameters. +If you have a vector of coefficients $\hat{\boldsymbol{\beta}}$ with known variance-covariance matrix $\Sigma$, and you want the variance of some function $g(\hat{\boldsymbol{\beta}})$ (like a predicted probability or a marginal effect), the delta method gives you: -$$\text{Var}\left[g(\hat{\boldsymbol{\beta}})\right] \approx \nabla -g(\hat{\boldsymbol{\beta}})^\top \, \Sigma \, \nabla g(\hat{\boldsymbol{\beta}})$$ +$$ +\text{Var}\left[g(\hat{\boldsymbol{\beta}})\right] \approx \nabla +g(\hat{\boldsymbol{\beta}})^\top \, \Sigma \, \nabla g(\hat{\boldsymbol{\beta}}) +$$ -where $\nabla g$ is the gradient of $g$ with respect to $\boldsymbol{\beta}$. `{marginaleffects}` handles this gradient computation automatically and propagates it through $\Sigma$ to produce standard errors. +where $\nabla g$ is the gradient of $g$ with respect to $\boldsymbol{\beta}$. +`{marginaleffects}` handles this gradient computation automatically and propagates it through $\Sigma$ to produce standard errors. ::: ### Targeting Different Estimands {#sec-gcomp-target} -As we discussed in @sec-estimands, the ATE is not the only causal estimand of interest. G-computation makes it straightforward to target different estimands by changing which covariate distribution we average over. For the ATE, we average predictions over the full observed sample. For the ATT, we average only over the covariate distribution of the exposed (in our case, days that actually had Extra Magic Morning hours). For the ATC, we average only over the unexposed. The outcome model is the same in all three cases; what changes is whose covariate values we use when generating predictions. +As we discussed in @sec-estimands, the ATE is not the only causal estimand of interest. +G-computation makes it straightforward to target different estimands by changing which covariate distribution we average over. +For the ATE, we average predictions over the full observed sample. +For the ATT, we average only over the covariate distribution of the exposed (in our case, days that actually had Extra Magic Morning hours). +For the ATC, we average only over the unexposed. +The outcome model is the same in all three cases; what changes is whose covariate values we use when generating predictions. We can target each of these estimands directly in `avg_comparisons` using the `newdata` argument: ```{r} @@ -157,7 +201,10 @@ avg_comparisons( newdata = filter(seven_dwarfs_9, park_extra_magic_morning == 0) ) ``` -In this case, because our outcome model contains no interaction terms between the exposure and the covariates, the three estimates will be identical. A linear model without interactions assumes a constant treatment effect across all covariate values, so it does not matter whose covariate distribution we average over, we always recover the same gap. To see differences between the ATE, ATT, and ATC from g-computation, the outcome model needs to allow the effect of the exposure to vary with covariates, for example by including an interaction term like `park_extra_magic_morning * park_ticket_season`. Let's refit our model to see these differences. +In this case, because our outcome model contains no interaction terms between the exposure and the covariates, the three estimates will be identical. +A linear model without interactions assumes a constant treatment effect across all covariate values, so it does not matter whose covariate distribution we average over, we always recover the same gap. +To see differences between the ATE, ATT, and ATC from g-computation, the outcome model needs to allow the effect of the exposure to vary with covariates, for example by including an interaction term like `park_extra_magic_morning * park_ticket_season`. +Let's refit our model to see these differences. ```{r} gcomp_model_int <- lm( @@ -191,11 +238,16 @@ avg_comparisons( ) ``` -Once at least one interaction has been specified, the three estimates can differ because the exposed and unexposed days tend to have different covariate distributions. The ATT asks what the effect was for the kinds of days that actually had Extra Magic Hours; the ATC asks what the effect would be for the kinds of days that did not. If the treatment effect were constant across all covariate values, the three estimates would coincide. When they differ, that divergence is itself informative about treatment effect heterogeneity across the population. +Once at least one interaction has been specified, the three estimates can differ because the exposed and unexposed days tend to have different covariate distributions. +The ATT asks what the effect was for the kinds of days that actually had Extra Magic Hours; the ATC asks what the effect would be for the kinds of days that did not. +If the treatment effect were constant across all covariate values, the three estimates would coincide. +When they differ, that divergence is itself informative about treatment effect heterogeneity across the population. ### Continuous Exposures -The same strategy applies when the exposure is continuous. For instance, imagine estimating how the actual wait time at 9am would change if the posted wait time at 8am were set to 60 minutes rather than 30 minutes. The procedure outlined above is identical, except that the exposure is set to the chosen numerical values for the counterfactual datasets. +The same strategy applies when the exposure is continuous. +For instance, imagine estimating how the actual wait time at 9am would change if the posted wait time at 8am were set to 60 minutes rather than 30 minutes. +The procedure outlined above is identical, except that the exposure is set to the chosen numerical values for the counterfactual datasets. Let's start by arranging the data so that the actual wait time at 9 AM can be modeled using the posted wait time at 8 AM and the same confounders idenfied above. @@ -213,7 +265,8 @@ wait_times <- eight |> drop_na(wait_minutes_actual_avg) ``` -We fit the outcome model the same way as for the binary exposure, as the outcome in both cases is continuous. Note now since we have a continuous exposure we can flexibly fit this, for example by using a natural spline. +We fit the outcome model the same way as for the binary exposure, as the outcome in both cases is continuous. +Note now since we have a continuous exposure we can flexibly fit this, for example by using a natural spline. ```{r} fit_wait <- lm( @@ -233,15 +286,14 @@ To compare posted wait times of 60 and 30 minutes, two counterfactual datasets a ```{r} high <- wait_times |> mutate(wait_minutes_posted_avg = 60) -low <- wait_times |> mutate(wait_minutes_posted_avg = 30) - +low <- wait_times |> mutate(wait_minutes_posted_avg = 30) ``` We then obtain predicted outcomes for each counterfactual dataset and then average these and take the difference: ```{r} pred_high <- fit_wait |> augment(newdata = high) |> select(pred_high = .fitted) -pred_low <- fit_wait |> augment(newdata = low) |> select(pred_low = .fitted) +pred_low <- fit_wait |> augment(newdata = low) |> select(pred_low = .fitted) bind_cols(pred_high, pred_low) |> summarize( @@ -266,13 +318,19 @@ comparison_30_60 ::: {.callout-warning} ## Positivity, Extrapolation, and the Price of Relying Only on the Outcome Model -G-computation handles poor overlap differently from IPW. When positivity is violated (or nearly so), IPW weights become extreme, destabilizing the estimate. G-computation sidesteps this by extrapolating from the outcome model into regions of the covariate space where one exposure group is rare. This can be an advantage if the model is correctly specified, but it also comes at a cost if not as it can mask potential problems. +G-computation handles poor overlap differently from IPW. +When positivity is violated (or nearly so), IPW weights become extreme, destabilizing the estimate. +G-computation sidesteps this by extrapolating from the outcome model into regions of the covariate space where one exposure group is rare. +This can be an advantage if the model is correctly specified, but it also comes at a cost if not as it can mask potential problems. -To see this, consider a toy dataset with poor overlap. Here, the confounder `z` strongly predicts treatment, so many observations have propensity scores near 0 or 1. The true outcome model includes a nonlinear (cubic) relationship between `z` and `y`. +To see this, consider a toy dataset with poor overlap. +Here, the confounder `z` strongly predicts treatment, so many observations have propensity scores near 0 or 1. +The true outcome model includes a nonlinear (cubic) relationship between `z` and `y`. ```{r} #| label: fig-positivity-setup #| code-fold: true + set.seed(1) n <- 500 @@ -284,11 +342,13 @@ sim_poor <- tibble( ) ``` -The true ATE is 2. Let's first examine the overlap. +The true ATE is 2. +Let's first examine the overlap. ```{r} #| label: fig-positivity-overlap #| fig-cap: "Distribution of the propensity score under poor overlap. Many observations have scores near 0 or 1, leaving little common support." + library(propensity) ps_model <- glm(x ~ z, data = sim_poor, family = binomial()) @@ -311,6 +371,7 @@ Now let's see what happens to the IPW weights. ```{r} #| label: fig-positivity-weights #| fig-cap: "ATE weights under poor overlap. Some weights are extremely large, which will destabilize the IPW estimate." + ggplot(sim_poor, aes(x = as.numeric(w_ate))) + geom_histogram(bins = 50) + labs(x = "ATE Weight") @@ -321,7 +382,8 @@ The IPW estimate is unstable as a result. ```{r} sim_poor |> summarize( - ipw_ate = sum(y * x * w_ate) / sum(x * w_ate) - + ipw_ate = sum(y * x * w_ate) / + sum(x * w_ate) - sum(y * (1 - x) * w_ate) / sum((1 - x) * w_ate) ) ``` @@ -331,7 +393,7 @@ Now let's try g-computation with a correctly specified outcome model, i.e., one ```{r} gcomp_correct <- lm(y ~ x + z + I(z^3), data = sim_poor) -exposed_sim <- sim_poor |> mutate(x = 1) +exposed_sim <- sim_poor |> mutate(x = 1) unexposed_sim <- sim_poor |> mutate(x = 0) bind_cols( @@ -341,9 +403,12 @@ bind_cols( summarize(ate = mean(pred_1 - pred_0)) ``` -With a correctly specified model, g-computation recovers an estimate close to the true ATE of 2, even under poor overlap. It achieves this by extrapolating the fitted outcome surface into regions of the covariate space where one group is rare, borrowing strength from the model rather than from observed counterparts. +With a correctly specified model, g-computation recovers an estimate close to the true ATE of 2, even under poor overlap. +It achieves this by extrapolating the fitted outcome surface into regions of the covariate space where one group is rare, borrowing strength from the model rather than from observed counterparts. + +However, this extrapolation only works if the outcome model is correctly specified. +If we misspecify the model by omitting the cubic term, g-computation will extrapolate the wrong surface into exactly those regions where we have no data to detect the error. -However, this extrapolation only works if the outcome model is correctly specified. If we misspecify the model by omitting the cubic term, g-computation will extrapolate the wrong surface into exactly those regions where we have no data to detect the error. ```{r} gcomp_wrong <- lm(y ~ x + z, data = sim_poor) @@ -354,12 +419,15 @@ bind_cols( summarize(ate = mean(pred_1 - pred_0)) ``` -The misspecified model produces a noticeably biased estimate. The bias is concentrated in the tails of the covariate distribution, exactly where overlap is poor and where we are relying most heavily on the model to fill in the gaps. We can see this by plotting the fitted outcome surfaces from the two models against the true cubic relationship. +The misspecified model produces a noticeably biased estimate. +The bias is concentrated in the tails of the covariate distribution, exactly where overlap is poor and where we are relying most heavily on the model to fill in the gaps. +We can see this by plotting the fitted outcome surfaces from the two models against the true cubic relationship. ```{r} #| label: fig-positivity-surfaces #| fig-cap: "Fitted outcome surfaces from the correctly specified (cubic) and misspecified (linear) outcome models, overlaid on the observed data. In the tails where overlap is poor, the linear model extrapolates the wrong surface." #| code-fold: true + z_seq <- tibble(z = seq(min(sim_poor$z), max(sim_poor$z), length.out = 200)) surfaces <- bind_rows( @@ -367,14 +435,14 @@ surfaces <- bind_rows( mutate( x = 1, correct = predict(gcomp_correct, newdata = mutate(z_seq, x = 1)), - wrong = predict(gcomp_wrong, newdata = mutate(z_seq, x = 1)), + wrong = predict(gcomp_wrong, newdata = mutate(z_seq, x = 1)), group = "Treated" ), z_seq |> mutate( x = 0, correct = predict(gcomp_correct, newdata = mutate(z_seq, x = 0)), - wrong = predict(gcomp_wrong, newdata = mutate(z_seq, x = 0)), + wrong = predict(gcomp_wrong, newdata = mutate(z_seq, x = 0)), group = "Untreated" ) ) @@ -383,7 +451,8 @@ ggplot() + geom_point( data = sim_poor, aes(x = z, y = y, color = factor(x)), - alpha = 0.3, size = 0.8 + alpha = 0.3, + size = 0.8 ) + geom_line( data = surfaces, @@ -393,7 +462,8 @@ ggplot() + geom_line( data = surfaces, aes(x = z, y = wrong, color = group), - linewidth = 1, linetype = "dashed" + linewidth = 1, + linetype = "dashed" ) + labs( x = "z", @@ -403,5 +473,9 @@ ggplot() + ) ``` -The takeaway is that g-computation and IPW face a fundamental tradeoff when overlap is poor. IPW makes its reliance on overlap explicit (i.e., the weights blow up and the analyst is forced to confront the problem). G-computation absorbs the problem silently, producing a stable estimate that may nonetheless be wrong. Neither method can conjure causal information that is not in the data; g-computation simply moves the assumption from the propensity score model to the outcome model. When overlap is poor, getting the functional form of the outcome model right in the tails matters, and it is precisely in those regions that we have the least data to check it. +The takeaway is that g-computation and IPW face a fundamental tradeoff when overlap is poor. +IPW makes its reliance on overlap explicit (i.e., the weights blow up and the analyst is forced to confront the problem). +G-computation absorbs the problem silently, producing a stable estimate that may nonetheless be wrong. +Neither method can conjure causal information that is not in the data; g-computation simply moves the assumption from the propensity score model to the outcome model. +When overlap is poor, getting the functional form of the outcome model right in the tails matters, and it is precisely in those regions that we have the least data to check it. ::: diff --git a/chapters/14-interaction.qmd b/chapters/14-interaction.qmd index f9ae0c9..c2e1683 100644 --- a/chapters/14-interaction.qmd +++ b/chapters/14-interaction.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("wip") ``` @@ -11,23 +12,28 @@ status("wip") ```{r} #| echo: false #| message: false + library(ggdag) ``` - -As discussed in @sec-estimands, causal estimands such as the ATE, ATT, and ATC are all defined as expectations of the same unit-level contrast, $(Y(1)-Y(0))$, but taken over different covariate distributions. This distinction becomes meaningful only when the individual treatment effect varies across units. +As discussed in @sec-estimands, causal estimands such as the ATE, ATT, and ATC are all defined as expectations of the same unit-level contrast, $(Y(1)-Y(0))$, but taken over different covariate distributions. +This distinction becomes meaningful only when the individual treatment effect varies across units. ::: {.callout-tip} -Note here we denote $\mathbf{Z}$ in bold. This is to indicate that this represents several covariates (in practice, all of the confounders), rather than a single variable +Note here we denote $\mathbf{Z}$ in bold. +This is to indicate that this represents several covariates (in practice, all of the confounders), rather than a single variable ::: -As defined in @sec-po, $Y(1)$ and $Y(0)$ denote potential outcomes under exposure $(X=1)$ and control $(X=0)$. $\mathbf{Z}$ is a vector of pre-treatment covariates. The individual treatment effect can be written as +As defined in @sec-po, $Y(1)$ and $Y(0)$ denote potential outcomes under exposure $(X=1)$ and control $(X=0)$. +$\mathbf{Z}$ is a vector of pre-treatment covariates. +The individual treatment effect can be written as $$ \tau(Z)=Y(1)-Y(0). $$ -Heterogeneous treatment effects arise when $\tau(Z)$ is not constant in $Z$. In that case, we see the various estimands defined in @sec-estimand, for example +Heterogeneous treatment effects arise when $\tau(Z)$ is not constant in $Z$. +In that case, we see the various estimands defined in @sec-estimand, for example $$ \text{ATE}=E[\tau(Z)], \quad @@ -35,41 +41,59 @@ $$ \text{ATC}=E[\tau(Z)\mid X=0]. $$ -The above are different averages of the same function $\tau(Z)$. Any difference between them is driven by two things: 1. variation in treatment effects across covariate profiles, and 2. differences in the distribution of those covariate profiles across treated and untreated units. +The above are different averages of the same function $\tau(Z)$. +Any difference between them is driven by two things: 1. +variation in treatment effects across covariate profiles, and 2. +differences in the distribution of those covariate profiles across treated and untreated units. ## Interaction terms as a parameterization of heterogeneity -Suppose we fit a simple regression model to understand the relationship between an exposure and outcome. In other words, we model the conditional mean of the observed outcome $Y$ as a function of treatment $X$ and confounders $\mathbf{Z}$. +Suppose we fit a simple regression model to understand the relationship between an exposure and outcome. +In other words, we model the conditional mean of the observed outcome $Y$ as a function of treatment $X$ and confounders $\mathbf{Z}$. -$$E[Y\mid X,Z]=\beta_0+\beta_1X+\boldsymbol\beta^\top \textbf{Z}$$ +$$ +E[Y\mid X,Z]=\beta_0+\beta_1X+\boldsymbol\beta^\top \textbf{Z} +$$ Under this specification, +$$ +E[Y\mid X=1,\mathbf{Z}]-E[Y\mid X=0,\mathbf{Z}]=\beta_1. +$$ -$$E[Y\mid X=1,\mathbf{Z}]-E[Y\mid X=0,\mathbf{Z}]=\beta_1.$$ - -This means the treatment effect is assumed to be the same for every unit, regardless of covariate values. In other words, after adjusting for $\mathbf{Z}$, the model assumes the effect of treatment does not vary across the population. +This means the treatment effect is assumed to be the same for every unit, regardless of covariate values. +In other words, after adjusting for $\mathbf{Z}$, the model assumes the effect of treatment does not vary across the population. -That can be a strong assumption. In many settings, treatment effects may differ across subgroups. To allow for this possibility, suppose one variable in the confounder set $\mathbf{Z}$ modifies the treatment effect (let's call this $Z_1$). We can include an interaction between treatment and that covariate: +That can be a strong assumption. +In many settings, treatment effects may differ across subgroups. +To allow for this possibility, suppose one variable in the confounder set $\mathbf{Z}$ modifies the treatment effect (let's call this $Z_1$). +We can include an interaction between treatment and that covariate: $$ E[Y \mid X,\mathbf{Z}] = \beta_0 + \beta_1 X + \beta_2 Z_1 + \beta_3(X \times Z_1) + \boldsymbol{\beta}_4^{\top}\mathbf{W}, $$ -where $\mathbf{W}$ contains the remaining covariates in $\mathbf{Z}$ excluding $Z_1$. Then the conditional treatment effect becomes - +where $\mathbf{W}$ contains the remaining covariates in $\mathbf{Z}$ excluding $Z_1$. +Then the conditional treatment effect becomes -$$\tau(Z_1)= E[Y \mid X=1,\mathbf{Z}] - E[Y \mid X=0,\mathbf{Z}] =\beta_1 + \beta_3 Z_1.$$ +$$ +\tau(Z_1)= E[Y \mid X=1,\mathbf{Z}] - E[Y \mid X=0,\mathbf{Z}] =\beta_1 + \beta_3 Z_1. +$$ -Now the treatment effect depends on $Z_1$. If $\beta_3=0$, the treatment effect is constant, otherwise the treatment effect changes across values of $Z_1$. +Now the treatment effect depends on $Z_1$. +If $\beta_3=0$, the treatment effect is constant, otherwise the treatment effect changes across values of $Z_1$. -This is one way regression models represent heterogeneous treatment effects. The interaction term formalizes the claim that the effect of treatment differs across levels of a covariate. +This is one way regression models represent heterogeneous treatment effects. +The interaction term formalizes the claim that the effect of treatment differs across levels of a covariate. -An equivalent way to represent the same idea is to fit separate regression models within strata of $Z_1$ (or if $Z_1$ is categorical, fitting the model within each category). Separate models allow the treatment coefficient to differ across groups. The interaction model is often preferred because it estimates those subgroup-specific effects within a single unified framework while using all available data simultaneously. +An equivalent way to represent the same idea is to fit separate regression models within strata of $Z_1$ (or if $Z_1$ is categorical, fitting the model within each category). +Separate models allow the treatment coefficient to differ across groups. +The interaction model is often preferred because it estimates those subgroup-specific effects within a single unified framework while using all available data simultaneously. ## Heterogeneity in G-computation -G-computation makes the link between conditional effects and marginal estimands explicit. After fitting an outcome model, we generate two predictions for each unit: +G-computation makes the link between conditional effects and marginal estimands explicit. +After fitting an outcome model, we generate two predictions for each unit: $$ \hat Y_i(1)=\widehat{E}[Y\mid X=1,Z_i], \quad @@ -82,16 +106,22 @@ $$ \hat\tau_i=\hat Y_i(1)-\hat Y_i(0). $$ -When interactions are present, $\hat\tau_i$ varies across units because it depends on $Z_i$. The ATE is then +When interactions are present, $\hat\tau_i$ varies across units because it depends on $Z_i$. +The ATE is then $$ \widehat{\text{ATE}}=\frac{1}{n}\sum_{i=1}^n \hat\tau_i. $$ -The ATT and ATC are obtained by averaging over treated or untreated subsets instead. Let's return to the example from @sec-gcomp-target. Once again, we are interested in understanding the relationship between whether a particular day offers Extra Magic Morning hours and the average posted wait time at the Seven Dwarfs Mine Train ride between 9 and 10am. Here, we allow the effect of Extra Magic Morning to vary by ticket season through an interaction term. Once treatment effects are allowed to differ across covariate profiles, g-computation averages different unit-level effects over different target populations, and the ATE, ATT, and ATC need not coincide. +The ATT and ATC are obtained by averaging over treated or untreated subsets instead. +Let's return to the example from @sec-gcomp-target. +Once again, we are interested in understanding the relationship between whether a particular day offers Extra Magic Morning hours and the average posted wait time at the Seven Dwarfs Mine Train ride between 9 and 10am. +Here, we allow the effect of Extra Magic Morning to vary by ticket season through an interaction term. +Once treatment effects are allowed to differ across covariate profiles, g-computation averages different unit-level effects over different target populations, and the ATE, ATT, and ATC need not coincide. ```{r} #| message: false #| warning: false + library(tidyverse) library(broom) library(touringplans) @@ -131,15 +161,21 @@ avg_comparisons( ) ``` -Once interaction is introduced, these estimands can differ because they average the same heterogeneous response surface over different covariate distributions. Because g-computation focuses on modeling the *outcome* we must specify every interaction we think exists in order to accurately estimate treatment effect heterogeneity. +Once interaction is introduced, these estimands can differ because they average the same heterogeneous response surface over different covariate distributions. +Because g-computation focuses on modeling the *outcome* we must specify every interaction we think exists in order to accurately estimate treatment effect heterogeneity. ## Heterogeneity in IPW -Inverse probability weighting targets the same estimands through a different mechanism. Rather than modeling $E[Y\mid X,\mathbf{Z}]$ directly, IPW reweights the data so treatment assignment is independent of $Z$. +Inverse probability weighting targets the same estimands through a different mechanism. +Rather than modeling $E[Y\mid X,\mathbf{Z}]$ directly, IPW reweights the data so treatment assignment is independent of $Z$. -Unlike g-computation, IPW does not require us to specify an explicit regression model for the outcome surface. The weighting scheme determines which covariate distribution is used to average those heterogeneous unit-level effects. Just as with g-computation, ATE, ATT, and ATC can differ when treatment effects vary across covariate profiles, but we didn't need to specify the interaction terms explicitly to see these differences. IPW captures heterogeneity implicitly through the weighting and averaging target, not by estimating subgroup-specific treatment effects. +Unlike g-computation, IPW does not require us to specify an explicit regression model for the outcome surface. +The weighting scheme determines which covariate distribution is used to average those heterogeneous unit-level effects. +Just as with g-computation, ATE, ATT, and ATC can differ when treatment effects vary across covariate profiles, but we didn't need to specify the interaction terms explicitly to see these differences. +IPW captures heterogeneity implicitly through the weighting and averaging target, not by estimating subgroup-specific treatment effects. -Let's look at the same example as above. As in @sec-outcome-model, we first estimate the propensity score, the probability of receiving Extra Magic Morning conditional on observed covariates: +Let's look at the same example as above. +As in @sec-outcome-model, we first estimate the propensity score, the probability of receiving Extra Magic Morning conditional on observed covariates: ```{r} propensity_model <- glm( @@ -157,6 +193,7 @@ We then pull the propensity scores from this model and estimate the weights acco ```{r} #| message: false #| warning: false + library(propensity) seven_dwarfs_9_with_ps <- propensity_model |> augment(type.predict = "response", data = seven_dwarfs_9) @@ -165,7 +202,8 @@ seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> mutate( w_ate = wt_ate(.fitted, park_extra_magic_morning), w_att = wt_att(.fitted, park_extra_magic_morning), - w_atc = wt_atc(.fitted, park_extra_magic_morning)) + w_atc = wt_atc(.fitted, park_extra_magic_morning) + ) ``` Finally we get our estimates by fitting the weighted outcome models: @@ -198,19 +236,25 @@ lm( ## Effect modification versus interaction - -Up to this point, we have used interaction terms in regression models to represent heterogeneous treatment effects: the effect of the exposure varies across levels of a covariate $\mathbf{Z}$. That is often pracitioners mean when they ask whether an effect "differs by subgroup." It can be useful to distinguish two related but different ideas: **effect modification** and **interaction** [@weinberg2007; @VanderWeele2009; @Nilsson2021; @hernan2021]. These terms are often used interchangeably, but they answer different causal questions. **Effect modification** asks whether the causal effect of treatment $X$ on outcome $Y$ differs across levels of some pre-treatment variable $Z$. In our notation, this is the question: +Up to this point, we have used interaction terms in regression models to represent heterogeneous treatment effects: the effect of the exposure varies across levels of a covariate $\mathbf{Z}$. +That is often pracitioners mean when they ask whether an effect "differs by subgroup." +It can be useful to distinguish two related but different ideas: **effect modification** and **interaction** [@weinberg2007; @VanderWeele2009; @Nilsson2021; @hernan2021]. +These terms are often used interchangeably, but they answer different causal questions. +**Effect modification** asks whether the causal effect of treatment $X$ on outcome $Y$ differs across levels of some pre-treatment variable $Z$. +In our notation, this is the question: $$ E[Y(1)-Y(0)\mid Z=z_1] \neq E[Y(1)-Y(0)\mid Z=z_0]. $$ +Here, $Z$ is used to define subgroups. +We are interested in how the effect of $X$ changes across observed strata of $Z$, not in intervening on $Z$ itself. -Here, $Z$ is used to define subgroups. We are interested in how the effect of $X$ changes across observed strata of $Z$, not in intervening on $Z$ itself. - -For the Seven Dwarfs example, $Z$ might be `park_ticket_season`. Then the question is whether Extra Magic Morning changes posted wait time differently during peak season versus off-season. +For the Seven Dwarfs example, $Z$ might be `park_ticket_season`. +Then the question is whether Extra Magic Morning changes posted wait time differently during peak season versus off-season. -**Causal interaction**, by contrast, treats both variables as exposures of causal interest. Instead of asking whether the effect of $X$ varies across strata of $Z$, it asks whether the **joint effect** of changing both $X$ and $Z$ differs from what would be expected based on their separate effects. +**Causal interaction**, by contrast, treats both variables as exposures of causal interest. +Instead of asking whether the effect of $X$ varies across strata of $Z$, it asks whether the **joint effect** of changing both $X$ and $Z$ differs from what would be expected based on their separate effects. Formally, causal interaction compares counterfactual contrasts such as @@ -218,11 +262,17 @@ $$ E[Y(x_1,z_1)-Y(x_0,z_1)] \neq E[Y(x_1,z_0)-Y(x_0,z_0)]. $$ -This may look similar to effect modification, but the interpretation is different. In effect modification, $Z$ is a subgroup-defining characteristic. In causal interaction, $Z$ is itself something we conceptually intervene on. That distinction matters because some variables are natural subgroup indicators but not plausible intervention targets. For example, season, age group, or baseline risk category may be useful effect modifiers, even though we would not think of "assigning" people to them in the same way we assign an exposure. +This may look similar to effect modification, but the interpretation is different. +In effect modification, $Z$ is a subgroup-defining characteristic. +In causal interaction, $Z$ is itself something we conceptually intervene on. +That distinction matters because some variables are natural subgroup indicators but not plausible intervention targets. +For example, season, age group, or baseline risk category may be useful effect modifiers, even though we would not think of "assigning" people to them in the same way we assign an exposure. -In the Seven Dwarfs setting, asking whether the effect of Extra Magic Morning differs by `park_ticket_season` is naturally an effect-modification question: does the policy work differently in busy versus less busy periods? That is different from asking about the joint causal effect of changing both Extra Magic Morning access and ticket season itself. +In the Seven Dwarfs setting, asking whether the effect of Extra Magic Morning differs by `park_ticket_season` is naturally an effect-modification question: does the policy work differently in busy versus less busy periods? +That is different from asking about the joint causal effect of changing both Extra Magic Morning access and ticket season itself. -In practice, many regression models with interaction terms are used to study effect modification rather than interaction in the stricter causal sense. The same algebraic model can serve either purpose, but interpretation depends on the underlying question. +In practice, many regression models with interaction terms are used to study effect modification rather than interaction in the stricter causal sense. +The same algebraic model can serve either purpose, but interpretation depends on the underlying question. For most policy and applied settings, the subgroup question is the relevant one: **who benefits more, who benefits less, and how does the average effect depend on population composition?** That is why heterogeneous treatment effects and effect modification are central to causal inference, transportability, and targeting interventions. @@ -266,9 +316,8 @@ ggdag(conventional_dag) + theme_dag() ``` -HEAD -This DAG shows that X, Q, and Z all affect Y, but it doesn't explicitly indicate whether X and Q interact. The interaction, if present, would be implicit in the functional form of how X and Q jointly affect Y in a statistical model. - +HEAD This DAG shows that X, Q, and Z all affect Y, but it doesn't explicitly indicate whether X and Q interact. +The interaction, if present, would be implicit in the functional form of how X and Q jointly affect Y in a statistical model. ### Explicit interaction nodes @@ -366,4 +415,3 @@ The choice depends on your goals. If you want to make interactions explicit within a single diagram, the explicit interaction node approach may be most practical. If you're conducting a detailed analysis of interaction mechanisms, IDAGs provide a more formal framework. For most purposes, conventional DAGs suffice, with interactions handled in the statistical modeling stage. - diff --git a/chapters/15-missingness-and-measurement.qmd b/chapters/15-missingness-and-measurement.qmd index 60f9b71..453a081 100644 --- a/chapters/15-missingness-and-measurement.qmd +++ b/chapters/15-missingness-and-measurement.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("wip") ``` @@ -54,6 +55,7 @@ For simplicity, we've removed the confounders from this DAG. #| fig-cap: "A DAG showing the relationship between posted and actual wait times, with additional information about measurement. The mismeasured versions of the two wait time variables are represented as separate nodes. The mismeasured versions are caused by the true values and an unknown mechanism that worsens measurement. When conducting a causal analysis with mismeasured variables, we use them as proxies for the true values." #| code-fold: true #| message: false + library(ggdag) glyph <- function(data, params, size) { @@ -65,7 +67,10 @@ glyph <- function(data, params, size) { show_edge_color <- function(...) { list( theme(legend.position = "bottom"), - ggokabeito::scale_edge_color_okabe_ito(name = NULL, breaks = ~ .x[!is.na(.x)]), + ggokabeito::scale_edge_color_okabe_ito( + name = NULL, + breaks = ~ .x[!is.na(.x)] + ), guides(color = "none") ) } @@ -85,15 +90,32 @@ edges_with_aes <- function(..., edge_color = "grey85", shadow = TRUE) { ) } -ggdag2 <- function(.dag, ..., order = 1:9, seed = 1633, box.padding = 3.4, edges = geom_dag_edges_link(edge_color = "grey85")) { +ggdag2 <- function( + .dag, + ..., + order = 1:9, + seed = 1633, + box.padding = 3.4, + edges = geom_dag_edges_link(edge_color = "grey85") +) { ggplot( .dag, aes_dag(...) ) + edges + geom_dag_point(key_glyph = glyph) + - geom_dag_text_repel(aes(label = label), size = 3.8, seed = seed, color = "#494949", box.padding = box.padding) + - ggokabeito::scale_color_okabe_ito(order = order, na.value = "grey90", breaks = ~ .x[!is.na(.x)]) + + geom_dag_text_repel( + aes(label = label), + size = 3.8, + seed = seed, + color = "#494949", + box.padding = box.padding + ) + + ggokabeito::scale_color_okabe_ito( + order = order, + na.value = "grey90", + breaks = ~ .x[!is.na(.x)] + ) + theme_dag() + theme(legend.position = "none") + coord_cartesian(clip = "off") @@ -147,7 +169,10 @@ dagify( theme( legend.key.spacing.x = unit(4, "points"), legend.key.size = unit(1, "points"), - legend.text = element_text(size = rel(1.25), margin = margin(l = -10.5, b = 2.6)), + legend.text = element_text( + size = rel(1.25), + margin = margin(l = -10.5, b = 2.6) + ), legend.box.margin = margin(b = 20), strip.text = element_blank() ) @@ -178,6 +203,7 @@ The extent of this error depends on how well the measured version correlates wit #| layout-ncol: 2 #| fig-width: 4 #| fig-height: 4 + labels <- c( "actual" = "actual\nwait", "actual_star" = "measured\nactual", @@ -239,7 +265,7 @@ x <- rnorm(n) y <- x + rnorm(n) # bad measurement of x u <- rnorm(n) -x_measured <- .01 * x + u +x_measured <- 0.01 * x + u cor(x, x_measured) lm(y ~ x_measured) ``` @@ -285,6 +311,7 @@ This is called *dependent, non-differential* measurement error. #| layout-ncol: 2 #| fig-width: 4 #| fig-height: 4 + labels <- c( "actual" = "actual wait", "actual_star" = "measured\nactual", @@ -329,6 +356,7 @@ Let's expand @fig-meas-err-dag-dep-1 to include an arrow from posted time to how #| fig-cap: "When the true value of posted wait times affects the measurement of both measured variables, it becomes a confounder for them. When the backdoor path is due to either the exposure or the outcome, it's classified as differential measurement error." #| code-fold: true #| fig-width: 7.5 + labels <- c( "actual" = "actual wait", "actual_star" = "measured\nactual", @@ -407,6 +435,7 @@ mismeasured_model <- lm(outcome ~ exposure * confounder) #| code-fold: true #| label: tbl-confounder-me #| tbl-cap: "The coefficients of interaction terms between a confounder and the exposure. In one mode, the confounder is measured correctly. In another, it is mismeasured differentially by the outcome. In addition to not closing the backdoor path completely, this type of mismeasurement often appears as an interaction between the exposure and confounder, even when such an interaction does not exist." + library(gt) library(broom) pull_interaction <- function(mdl) { @@ -450,6 +479,7 @@ The unknown mechanism is random, but the mechanism related to posted wait times #| fig-cap: "A DAG representing the missingness structure of actual wait time. In this DAG, missingness is caused by posted wait times and an unknown mechanism that affects whether or not actual wait times are measured. Missingness in actual wait times is represented as a separate node, a missingness indicator." #| code-fold: true #| fig-width: 5 + labels <- c( "actual" = "actual wait", "actual_missing" = "missingness\nin actual", @@ -475,14 +505,18 @@ missing_dag |> theme( legend.key.spacing.x = unit(4, "points"), legend.key.size = unit(1, "points"), - legend.text = element_text(size = rel(1.25), margin = margin(l = -10.5, b = 2.6)), + legend.text = element_text( + size = rel(1.25), + margin = margin(l = -10.5, b = 2.6) + ), legend.box.margin = margin(b = 20), strip.text = element_blank() ) ``` However, in this simple DAG, conditioning on missingness does not open a backdoor path between `actual` and `posted`. -(It does bias the relationship between `unknown` and `posted`, but we don't care about that relationship even if we could estimate it.) The only open path in @fig-missing-dag-2 is the one from `posted` to `actual`. +(It does bias the relationship between `unknown` and `posted`, but we don't care about that relationship even if we could estimate it.) +The only open path in @fig-missing-dag-2 is the one from `posted` to `actual`. ```{r} #| label: fig-missing-dag-2 @@ -490,6 +524,7 @@ However, in this simple DAG, conditioning on missingness does not open a backdoo #| code-fold: true #| fig-width: 4 #| fig-height: 4 + missing_dag |> tidy_dagitty() |> add_missing() |> @@ -528,6 +563,7 @@ In this case, there is no way to close the backdoor paths opened by conditioning #| layout-ncol: 2 #| fig-width: 5 #| fig-height: 5 + labels <- c( "actual" = "actual wait", "actual_missing" = "missingness\nin actual", @@ -561,7 +597,7 @@ missing_dag |> ) + show_edge_color() + facet_wrap(~set) + - expand_plot(expand_x = expansion(c(.2, .2))) + + expand_plot(expand_x = expansion(c(0.2, 0.2))) + theme(strip.text = element_blank()) ``` @@ -576,6 +612,7 @@ Each of the DAGs represents a simple but differing structure of missingness. #| label: fig-missing-dags-sim #| fig-cap: "5 DAGs where `a` is `actual`, `p` is `posted`, `u` is `unknown`, and `m` is `missing`. Each DAG represents a slightly different missingness mechanism. In DAGs 1-3, the actual wait time values have missingness; in DAGs 4-5, some posted wait times are missing. The causal structure of missingness impacts what we can estimate." #| code-fold: true + library(patchwork) define_dag <- function(..., tag, title) { @@ -585,7 +622,7 @@ define_dag <- function(..., tag, title) { exposure = "p", outcome = "a" ) |> - ggdag(size = .7) + + ggdag(size = 0.7) + labs(title = paste0(tag, ": ", title)) + theme_dag() + theme(plot.title = element_text(size = 12)) + @@ -641,6 +678,7 @@ In DAG 4, we can calculate the mean of `actual` and the causal effect but not th #| label: fig-recoverables #| fig-cap: "A forest plot of the results of three different effects for data simulated from each DAG in @fig-missing-dags-sim. In the non-missing results, we can see what the effect should be for the sample. Each simulated dataset has 365 rows with missingness in either actual or posted wait times. For each of the DAGs, we're limited in what we can estimate correctly." #| code-fold: true + set.seed(123) posted <- rnorm(365, mean = 30, sd = 5) # create an effect where an hour of posted time creates 50 min of actual time @@ -648,30 +686,36 @@ coef <- 50 / 60 actual <- coef * posted + rnorm(365, mean = 0, sd = 2) posted_60 <- posted / 60 -missing_dag_1 <- rbinom(365, 1, .3) |> +missing_dag_1 <- rbinom(365, 1, 0.3) |> as.logical() -missing_dag_2 <- if_else(posted_60 > .50, rbinom(365, 1, .95), 0) |> +missing_dag_2 <- if_else(posted_60 > 0.50, rbinom(365, 1, 0.95), 0) |> as.logical() -missing_dag_3 <- if_else(actual > 22, rbinom(365, 1, .99), 0) |> +missing_dag_3 <- if_else(actual > 22, rbinom(365, 1, 0.99), 0) |> as.logical() # the same structure, but it's `posted` that gets the resulting missingness missing_dag_4 <- missing_dag_2 missing_dag_5 <- missing_dag_3 -fit_stats <- function(dag, actual, posted_60, missing_by = NULL, missing_for = "actual") { +fit_stats <- function( + dag, + actual, + posted_60, + missing_by = NULL, + missing_for = "actual" +) { if (!is.null(missing_by) & missing_for == "actual") { actual[missing_by] <- NA } - + if (!is.null(missing_by) & missing_for == "posted") { posted_60[missing_by] <- NA } - + t_actual <- t.test(actual) t_posted <- t.test(posted_60 * 60) mdl <- lm(actual ~ posted_60) mdl_confints <- confint(mdl) - + tibble( dag = dag, mean_actual_estimate = as.numeric(t_actual$estimate), @@ -696,13 +740,29 @@ dag_stats <- bind_rows( fit_stats("DAG 1", actual, posted_60, missing_by = missing_dag_1), fit_stats("DAG 2", actual, posted_60, missing_by = missing_dag_2), fit_stats("DAG 3", actual, posted_60, missing_by = missing_dag_3), - fit_stats("DAG 4", actual, posted_60, missing_by = missing_dag_4, missing_for = "posted"), - fit_stats("DAG 5", actual, posted_60, missing_by = missing_dag_5, missing_for = "posted"), + fit_stats( + "DAG 4", + actual, + posted_60, + missing_by = missing_dag_4, + missing_for = "posted" + ), + fit_stats( + "DAG 5", + actual, + posted_60, + missing_by = missing_dag_5, + missing_for = "posted" + ), ) dag_stats |> mutate( - true_value = if_else(dag == "No missingness", "True value", "Observed value"), + true_value = if_else( + dag == "No missingness", + "True value", + "Observed value" + ), dag = factor(dag, levels = c(paste("DAG", 5:1), "No missingness")), stat = factor( stat, @@ -712,7 +772,13 @@ dag_stats |> ) |> ggplot(aes(color = true_value)) + geom_point(aes(estimate, dag)) + - geom_segment(aes(x = lower, xend = upper, y = dag, yend = dag, group = stat)) + + geom_segment(aes( + x = lower, + xend = upper, + y = dag, + yend = dag, + group = stat + )) + facet_wrap(~stat, scales = "free_x") + labs(y = NULL, color = NULL) ``` @@ -727,9 +793,12 @@ In the grand tradition of statisticians being bad at naming things, you'll also In the case of causal models, we can explain these ideas by the causal structure of missingness and the availability of variables and values related to that structure in your data. -- **Missing completely at random (MCAR)**: there are missing values, but the causes of missingness are such that the missingness process is unrelated to the causal structure of your question. In other words, the only problem with missingness is a reduction in sample size. -- **Missing at random (MAR)**: the causes of missingness are related to the causal structure of the research problem, but it only depends on the variables and values in the data that we've actually observed. -- **Missing not at random (MNAR)**: the causes of missingness are related to the causal structure of the research problem, but this process is related to values we are missing. A classic example is when a variable's missingness is impacted by itself, e.g., higher values of `x` are more likely to be missing in `x`. We don't have that information because, by definition, it's missing. +- **Missing completely at random (MCAR)**: there are missing values, but the causes of missingness are such that the missingness process is unrelated to the causal structure of your question. + In other words, the only problem with missingness is a reduction in sample size. +- **Missing at random (MAR)**: the causes of missingness are related to the causal structure of the research problem, but it only depends on the variables and values in the data that we've actually observed. +- **Missing not at random (MNAR)**: the causes of missingness are related to the causal structure of the research problem, but this process is related to values we are missing. + A classic example is when a variable's missingness is impacted by itself, e.g., higher values of `x` are more likely to be missing in `x`. + We don't have that information because, by definition, it's missing. These terms don't always tell you what to do next, so we'll avoid them in favor of explicitly describing the missingness generation process. ::: @@ -747,20 +816,21 @@ In [Chapter -@sec-sensitivity], we'll also discuss sensitivity analyses for miss ## Regression Calibration -Sometimes, you have a well-measured version of a variable for a subset of observations and a version with more measurement error for a more significant proportion of the dataset. Sometimes people call this a **validation set**. +Sometimes, you have a well-measured version of a variable for a subset of observations and a version with more measurement error for a more significant proportion of the dataset. +Sometimes people call this a **validation set**. When this is the case, you can use a simple approach called *regression calibration* to predict the value of the well-measured version for more observations in the dataset. This technique's name refers to the fact that you're recalibrating the variable that you've observed more of, given the subset of values you have for the well-measured version. But other than that, it's just a prediction model that includes the version of the variable you have more observations of and other variables you find essential to the measurement process. As we know, the actual wait times have a lot of missingness. What if we considered posted wait times a proxy for actual wait times? -In this case, we could redo the analysis of Extra Magic Morning's effect on the calibrated version of actual wait times. +In this case, we could redo the analysis of Extra Magic Morning's effect on the calibrated version of actual wait times. -First, we'll fit a model to predict `wait_minutes_actual_avg` using `wait_minutes_posted_avg`. Then, instead of using `wait_minutes_posted_avg` we'll use the calibrated value from that model in it's place. +First, we'll fit a model to predict `wait_minutes_actual_avg` using `wait_minutes_posted_avg`. +Then, instead of using `wait_minutes_posted_avg` we'll use the calibrated value from that model in it's place. When fitting regression calibration models, it is crucial to include all variables that will be in subsequent models in the same form they will be there (for example, if we have a confounder in our final model that is fit using a spline, we need this same confounder, fit using a spline, in our calibration model). - ```{r} library(splines) library(touringplans) @@ -768,35 +838,44 @@ library(broom) calib_model <- lm( wait_minutes_actual_avg ~ - wait_minutes_posted_avg * wait_hour + - park_extra_magic_morning + - park_temperature_high + park_close + park_ticket_season, - data = seven_dwarfs_train_2018) + wait_minutes_posted_avg * + wait_hour + + park_extra_magic_morning + + park_temperature_high + + park_close + + park_ticket_season, + data = seven_dwarfs_train_2018 +) seven_dwarves_calib <- calib_model |> augment(newdata = seven_dwarfs_train_2018) |> rename(wait_minutes_posted_calib = .fitted) ``` - ```{r} #| echo: false + library(propensity) -fit_ipw_effect <- function(.fmla, .data = seven_dwarfs, .trt = "park_extra_magic_morning", .outcome_fmla = wait_minutes_posted_calib ~ park_extra_magic_morning) { +fit_ipw_effect <- function( + .fmla, + .data = seven_dwarfs, + .trt = "park_extra_magic_morning", + .outcome_fmla = wait_minutes_posted_calib ~ park_extra_magic_morning +) { .trt_var <- rlang::ensym(.trt) - + # fit propensity score model propensity_model <- glm( .fmla, data = .data, family = binomial() ) - + # calculate ATE weights .df <- propensity_model |> augment(type.predict = "response", data = .data) |> mutate(w_ate = wt_ate(.fitted, !!.trt_var, exposure_type = "binary")) - + # fit ipw model lm(.outcome_fmla, data = .df, weights = w_ate) |> tidy() |> @@ -806,7 +885,8 @@ fit_ipw_effect <- function(.fmla, .data = seven_dwarfs, .trt = "park_extra_magic effect_calib <- fit_ipw_effect( park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season, + park_close + + park_ticket_season, .outcome_fmla = wait_minutes_posted_calib ~ park_extra_magic_morning, .data = seven_dwarves_calib |> filter(wait_hour == 9) ) |> @@ -814,7 +894,12 @@ effect_calib <- fit_ipw_effect( ``` Fitting this model with the IPW estimator results in an effect of `r effect_calib`, marginally attenuated compared to the value we saw when using the uncalibrated `wait_minutes_posted_avg` in [Chapter -@sec-outcome-model]. -Another approach could be to **impute** the values. Where `wait_minutes_actual_avg` is available, we'll use that. If it's `NA`, we'll use the calibrated value. In practice, this uses the same calibration model as above, but instead of giving the calibrated values to all observations, we only impute them for the observations missing in the validation set. This method is sometimes called *regression imputation* or *deterministic imputation*. It is deterministic in the sense that we are just giving every observation with missing data a single predicted value and treating it as fixed rather than adding any variability (we'll see a *stocastic* imptuation method next, where we introduce some variability). +Another approach could be to **impute** the values. +Where `wait_minutes_actual_avg` is available, we'll use that. +If it's `NA`, we'll use the calibrated value. +In practice, this uses the same calibration model as above, but instead of giving the calibrated values to all observations, we only impute them for the observations missing in the validation set. +This method is sometimes called *regression imputation* or *deterministic imputation*. +It is deterministic in the sense that we are just giving every observation with missing data a single predicted value and treating it as fixed rather than adding any variability (we'll see a *stocastic* imptuation method next, where we introduce some variability). ```{r} seven_dwarves_reg_impute <- calib_model |> @@ -834,7 +919,8 @@ seven_dwarves_reg_impute <- calib_model |> effect_reg_impute <- fit_ipw_effect( park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season, + park_close + + park_ticket_season, .outcome_fmla = wait_minutes_actual_impute ~ park_extra_magic_morning, .data = seven_dwarves_reg_impute |> filter(wait_hour == 9) ) |> @@ -850,25 +936,35 @@ Be sure to include the fitting of this model in your bootstrap to get the correc ## Multiple Imputation {#sec-imputation} -Regression calibration provides a straightforward way to address measurement error and missingness by predicting values with a single model and plugging those predictions into the analysis. However, this plug-in approach is generally inefficient. If we correctly estimate uncertainty (for example, by bootstrapping both the calibration and outcome models), we often end up with wide confidence intervals, reflecting the fact that we've relied on a single imputed value. +Regression calibration provides a straightforward way to address measurement error and missingness by predicting values with a single model and plugging those predictions into the analysis. +However, this plug-in approach is generally inefficient. +If we correctly estimate uncertainty (for example, by bootstrapping both the calibration and outcome models), we often end up with wide confidence intervals, reflecting the fact that we've relied on a single imputed value. -Multiple imputation (MI) is an often more efficient alternative. Instead of producing one "best guess" for the missing value, MI draws predicted values from a distribution, capturing the uncertainty inherent in the missing data. This process allows for better estimates of both summary statistics (like the mean of the imputed variable) and downstream conditional effects (like treatment effects when the imputed variable appears in the outcome model). Through this process, we will generate multiple imputed datasets. The default is often 5, but if a large portion of your data is missing, more imputations may be necessary to achieve stable results. A common rule of thumb is that the number of imputations should roughly match the percentage of incomplete cases. +Multiple imputation (MI) is an often more efficient alternative. +Instead of producing one "best guess" for the missing value, MI draws predicted values from a distribution, capturing the uncertainty inherent in the missing data. +This process allows for better estimates of both summary statistics (like the mean of the imputed variable) and downstream conditional effects (like treatment effects when the imputed variable appears in the outcome model). +Through this process, we will generate multiple imputed datasets. +The default is often 5, but if a large portion of your data is missing, more imputations may be necessary to achieve stable results. +A common rule of thumb is that the number of imputations should roughly match the percentage of incomplete cases. There are a few critical modeling considerations when using MI in causal analyses: -1. Your imputation model *must include all variables* that appear in your final outcome model and your propensity score model. This includes variables that confound the treatment–outcome relationship, the treatment, *and the outcome itself* (when the outcome is not the target of the imputation). Leaving out key variables, especially the outcome, can lead to biased estimates. [@d2024behind] -2. As with regression calibration, *use the same functional forms in your imputation model as you'll use later*. If a variable is modeled with splines or interactions downstream, those need to be mirrored in the imputation step. +1. Your imputation model *must include all variables* that appear in your final outcome model and your propensity score model. + This includes variables that confound the treatment--outcome relationship, the treatment, *and the outcome itself* (when the outcome is not the target of the imputation). + Leaving out key variables, especially the outcome, can lead to biased estimates. + [@d2024behind] +2. As with regression calibration, *use the same functional forms in your imputation model as you'll use later*. + If a variable is modeled with splines or interactions downstream, those need to be mirrored in the imputation step. In a typical causal analysis, we proceed as follows: 1. Impute multiple complete datasets using an imputation model that includes the outcome and all covariates. 2. Estimate the treatment effect within each dataset: - * Fit the propensity score model. - * Compute inverse probability weights. - * Fit the weighted outcome model. + - Fit the propensity score model. + - Compute inverse probability weights. + - Fit the weighted outcome model. 3. Pool the estimates using Rubin's rules (see @tip-rubin) to obtain an overall treatment effect and valid standard errors. - ::: {#tip-rubin .callout-tip} ### Rubin's Rules @@ -877,30 +973,35 @@ After performing the analysis separately on each of the $m$ imputed datasets: Let: -- $\hat{Q}_i$: the estimate from the $i$-th imputed dataset -- $U_i$: the variance of $\hat{Q}_i$ -- $\bar{Q} = \frac{1}{m} \sum_{i=1}^m \hat{Q}_i$: the pooled estimate -- $\bar{U} = \frac{1}{m} \sum_{i=1}^m U_i$: the average within-imputation variance +- $\hat{Q}_i$: the estimate from the $i$-th imputed dataset\ +- $U_i$: the variance of $\hat{Q}_i$\ +- $\bar{Q} = \frac{1}{m} \sum_{i=1}^m \hat{Q}_i$: the pooled estimate\ +- $\bar{U} = \frac{1}{m} \sum_{i=1}^m U_i$: the average within-imputation variance\ - $B = \frac{1}{m - 1} \sum_{i=1}^m (\hat{Q}_i - \bar{Q})^2$: the between-imputation variance Then the total variance is: -$$T = \bar{U} + \left(1 + \frac{1}{m}\right) B$$ +$$ +T = \bar{U} + \left(1 + \frac{1}{m}\right) B +$$ The standard error of the pooled estimate is $\sqrt{T}$ and this can be used to construct confidence intervals or perform hypothesis testing. ::: +This approach accounts for uncertainty in both the imputation and the treatment effect estimation steps. +It is particularly helpful when missingness affects variables involved in the treatment assignment or outcome processes, cases where complete-case analysis or single imputation can yield biased or inefficient estimates. -This approach accounts for uncertainty in both the imputation and the treatment effect estimation steps. It is particularly helpful when missingness affects variables involved in the treatment assignment or outcome processes, cases where complete-case analysis or single imputation can yield biased or inefficient estimates. - -In practice, multiple imputation is often implemented using the MICE (Multivariate Imputation by Chained Equations) algorithm, which iteratively imputes each variable with missing values using models conditional on the others. In R, we often use the `{mice}` package to implement the imputation. +In practice, multiple imputation is often implemented using the MICE (Multivariate Imputation by Chained Equations) algorithm, which iteratively imputes each variable with missing values using models conditional on the others. +In R, we often use the `{mice}` package to implement the imputation. -Let's use the same example as above, but rather than just performing a single regression imputation, we will perform the stochastic (multiple) imputation 10 times. First, we need to impute our data. +Let's use the same example as above, but rather than just performing a single regression imputation, we will perform the stochastic (multiple) imputation 10 times. +First, we need to impute our data. ```{r} #| message: false #| warning: false + library(mice) seven_dwarfs_to_impute_data <- seven_dwarfs_train_2018 |> @@ -926,7 +1027,7 @@ seven_dwarfs_mi <- mice( seven_dwarfs_to_impute_data, m = 10, # we are doing 10 imputations predictorMatrix = predictor_matrix, - method = "pmm", # predictive mean matching + method = "pmm", # predictive mean matching seed = 1, print = FALSE ) @@ -938,42 +1039,53 @@ Then we can collect our imputed datasets into a list using the `complete` functi seven_dwarfs_mi_data <- complete(seven_dwarfs_mi, action = "all") ``` - -Finally, let's write a functcion to fit the IPW effect across these datasets. We want to preserve the treatment effect as well as the standard error for applying Rubin's Rules. +Finally, let's write a functcion to fit the IPW effect across these datasets. +We want to preserve the treatment effect as well as the standard error for applying Rubin's Rules. ```{r} -fit_ipw_effect <- function(.fmla, .data = seven_dwarfs, .trt = "park_extra_magic_morning", .outcome_fmla = wait_minutes_posted_calib ~ park_extra_magic_morning) { +fit_ipw_effect <- function( + .fmla, + .data = seven_dwarfs, + .trt = "park_extra_magic_morning", + .outcome_fmla = wait_minutes_posted_calib ~ park_extra_magic_morning +) { .trt_var <- rlang::ensym(.trt) - + # fit propensity score model propensity_model <- glm( .fmla, data = .data, family = binomial() ) - + # calculate ATE weights .df <- propensity_model |> augment(type.predict = "response", data = .data) |> mutate(w_ate = wt_ate(.fitted, !!.trt_var, exposure_type = "binary")) - + # fit outcome model - outcome_model <- lm(.outcome_fmla, data = .df, weights = w_ate) - + outcome_model <- lm(.outcome_fmla, data = .df, weights = w_ate) + # fit ipw effect ipw_output <- ipw(propensity_model, outcome_model, .df) - return(c(estimate = ipw_output$estimates$estimate, - std.err = ipw_output$estimates$std.err)) + return(c( + estimate = ipw_output$estimates$estimate, + std.err = ipw_output$estimates$std.err + )) } ``` ```{r} -effect_mi_all <- map(seven_dwarfs_mi_data, ~fit_ipw_effect( - park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season, - .outcome_fmla = wait_minutes_actual_avg ~ park_extra_magic_morning, - .data = .x |> filter(wait_hour == 9) -)) |> +effect_mi_all <- map( + seven_dwarfs_mi_data, + ~ fit_ipw_effect( + park_extra_magic_morning ~ park_temperature_high + + park_close + + park_ticket_season, + .outcome_fmla = wait_minutes_actual_avg ~ park_extra_magic_morning, + .data = .x |> filter(wait_hour == 9) + ) +) |> bind_rows() ``` @@ -988,19 +1100,17 @@ To get the final effect, we are going to average the `estimate` column and to ge ```{r} effect_mi_all |> summarise( - effect_mi = mean(estimate), # pooled estimate - u_bar = mean(std.err^2), # average within-imputation variance - b = var(estimate), # between-imputation variance - t_var = u_bar + (1 + 1/n()) * b, # total variance - se_mi = sqrt(t_var) # pooled standard error + effect_mi = mean(estimate), # pooled estimate + u_bar = mean(std.err^2), # average within-imputation variance + b = var(estimate), # between-imputation variance + t_var = u_bar + (1 + 1 / n()) * b, # total variance + se_mi = sqrt(t_var) # pooled standard error ) |> select(effect_mi, se_mi) ``` Again, this effect is attenuated compared to what we saw in [Chapter -@sec-outcome-model] (although the standard error is quite large, so the differences may not be meaningful). - - ::: callout-tip The effects of missingness on results and the impact of complete case analyses and multiple imputation can be deeply unintuitive. When you add measurement error and other types of bias, it can be nearly impossible to reason about. @@ -1008,9 +1118,11 @@ A partial solution to this problem is to offload some of the reasoning from your We recommend writing down the causal mechanisms you think are involved in your research question and then using simulation to probe different strategies. -1. Create a DAG that includes the missingness and mismeasurement generation process and any other types of bias you think are essential. -2. Simulate data that match this process. Often, you'll want to simulate it to match different assumptions, such as the strength of mismeasurement or missingness related to variables in the DAG. -3. Check the results under different analysis strategies, such as complete-case analysis vs. imputation. You may also want to calculate the nominal coverage for the confidence intervals (the proportion of confidence intervals attained through your simulation that contain the true value; e.g., for 95% confidence intervals, 95% of the confidence intervals from your simulation should contain the true result). +1. Create a DAG that includes the missingness and mismeasurement generation process and any other types of bias you think are essential. +2. Simulate data that match this process. + Often, you'll want to simulate it to match different assumptions, such as the strength of mismeasurement or missingness related to variables in the DAG. +3. Check the results under different analysis strategies, such as complete-case analysis vs. imputation. + You may also want to calculate the nominal coverage for the confidence intervals (the proportion of confidence intervals attained through your simulation that contain the true value; e.g., for 95% confidence intervals, 95% of the confidence intervals from your simulation should contain the true result). As with our general suggestion about DAGs, if you are unsure about the correct DAG, you should check to see how these results differ depending on the specification. ::: diff --git a/chapters/16-sensitivity.qmd b/chapters/16-sensitivity.qmd index e06c426..09e8793 100644 --- a/chapters/16-sensitivity.qmd +++ b/chapters/16-sensitivity.qmd @@ -4,12 +4,14 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("wip") ``` ```{r} #| include: false + library(ggdag) library(touringplans) library(ggokabeito) @@ -45,9 +47,22 @@ Let's consider the DAG we introduced in @fig-dag-magic. #| average wait time and 2) both Extra Magic Hours and average wait time #| are determined by the time the park closes, historic high temperatures, #| and ticket season. + coord_dag <- list( - x = c(park_ticket_season = 0, park_close = 0, park_temperature_high = -1, park_extra_magic_morning = 1, wait_minutes_posted_avg = 2), - y = c(park_ticket_season = -1, park_close = 1, park_temperature_high = 0, park_extra_magic_morning = 0, wait_minutes_posted_avg = 0) + x = c( + park_ticket_season = 0, + park_close = 0, + park_temperature_high = -1, + park_extra_magic_morning = 1, + wait_minutes_posted_avg = 2 + ), + y = c( + park_ticket_season = -1, + park_close = 1, + park_temperature_high = 0, + park_extra_magic_morning = 0, + wait_minutes_posted_avg = 0 + ) ) labels <- c( @@ -59,8 +74,13 @@ labels <- c( ) emm_wait_dag <- dagify( - wait_minutes_posted_avg ~ park_extra_magic_morning + park_close + park_ticket_season + park_temperature_high, - park_extra_magic_morning ~ park_temperature_high + park_close + park_ticket_season, + wait_minutes_posted_avg ~ park_extra_magic_morning + + park_close + + park_ticket_season + + park_temperature_high, + park_extra_magic_morning ~ park_temperature_high + + park_close + + park_ticket_season, coords = coord_dag, labels = labels, exposure = "park_extra_magic_morning", @@ -68,7 +88,7 @@ emm_wait_dag <- dagify( ) curvatures <- rep(0, 7) -curvatures[5] <- .3 +curvatures[5] <- 0.3 emm_wait_dag |> tidy_dagitty() |> @@ -78,7 +98,12 @@ emm_wait_dag |> ) + geom_dag_edges_arc(curvature = curvatures, edge_color = "grey80") + geom_dag_point() + - geom_dag_text_repel(aes(label = label), size = 3.8, seed = 1630, color = "#494949") + + geom_dag_text_repel( + aes(label = label), + size = 3.8, + seed = 1630, + color = "#494949" + ) + scale_color_okabe_ito(na.value = "grey90") + theme_dag() + theme(legend.position = "none") + @@ -100,8 +125,12 @@ Let's say, though, that we had used @fig-dag-magic-missing instead, which is mis #| in the morning at a particular park and the average wait #| time between 9 am and 10 am. #| This DAG has no arrows from park close time and historical temperature to Extra Magic Hours. + emm_wait_dag_missing <- dagify( - wait_minutes_posted_avg ~ park_extra_magic_morning + park_close + park_ticket_season + park_temperature_high, + wait_minutes_posted_avg ~ park_extra_magic_morning + + park_close + + park_ticket_season + + park_temperature_high, park_extra_magic_morning ~ park_ticket_season, coords = coord_dag, labels = labels, @@ -111,12 +140,15 @@ emm_wait_dag_missing <- dagify( # produces below: # park_ticket_season, park_close + park_ticket_season, park_temperature_high + park_ticket_season, or park_close + park_temperature_high + park_ticket_season -adj_sets <- unclass(dagitty::adjustmentSets(emm_wait_dag_missing, type = "all")) |> +adj_sets <- unclass(dagitty::adjustmentSets( + emm_wait_dag_missing, + type = "all" +)) |> map_chr(\(.x) glue::glue('{unlist(glue::glue_collapse(.x, sep = " + "))}')) |> glue::glue_collapse(sep = ", ", last = ", or ") curvatures <- rep(0, 5) -curvatures[3] <- .3 +curvatures[3] <- 0.3 emm_wait_dag_missing |> tidy_dagitty() |> @@ -126,7 +158,12 @@ emm_wait_dag_missing |> ) + geom_dag_edges_arc(curvature = curvatures, edge_color = "grey80") + geom_dag_point() + - geom_dag_text_repel(aes(label = label), size = 3.8, seed = 1630, color = "#494949") + + geom_dag_text_repel( + aes(label = label), + size = 3.8, + seed = 1630, + color = "#494949" + ) + scale_color_okabe_ito(na.value = "grey90") + theme_dag() + theme(legend.position = "none") + @@ -148,11 +185,17 @@ The difference in these results implies that there is something off about the ca #| label: tbl-alt-sets #| tbl-cap: "A table of ATE estimates from the IPW estimator. Each estimate was calculated for one of the valid adjustment sets for the DAG. The estimates are sorted by effect size in order. If the DAG is right and all the data well measured, different adjustment sets should give roughly the same answer." #| code-fold: true + seven_dwarfs <- touringplans::seven_dwarfs_train_2018 |> filter(wait_hour == 9) # we'll use `.data` and `.trt` later -fit_ipw_effect <- function(.fmla, .data = seven_dwarfs, .trt = "park_extra_magic_morning", .outcome_fmla = wait_minutes_posted_avg ~ park_extra_magic_morning) { +fit_ipw_effect <- function( + .fmla, + .data = seven_dwarfs, + .trt = "park_extra_magic_morning", + .outcome_fmla = wait_minutes_posted_avg ~ park_extra_magic_morning +) { .trt_var <- rlang::ensym(.trt) # fit propensity score model @@ -179,7 +222,8 @@ effects <- list( park_extra_magic_morning ~ park_close + park_ticket_season, park_extra_magic_morning ~ park_temperature_high + park_ticket_season, park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season + park_close + + park_ticket_season ) |> map_dbl(fit_ipw_effect) @@ -206,9 +250,9 @@ A negative control is either an exposure (negative exposure control) or outcome In their article, they reference standard controls in bench science. In a lab experiment, any of these actions should lead to a null effect: -1. Leave out an essential ingredient. -2. Inactivate the hypothesized active ingredient. -3. Check for an effect that would be impossible by the hypothesized outcome. +1. Leave out an essential ingredient. +2. Inactivate the hypothesized active ingredient. +3. Check for an effect that would be impossible by the hypothesized outcome. There's nothing unique to lab work here; these scientists merely probe the logical implications of their understanding and hypotheses. To find a good negative control, you usually need to extend your DAG to include more of the causal structure surrounding your question. @@ -243,6 +287,7 @@ So, given the DAG, our adjustment set is any combination of the confounders (as #| 63 days before the day's wait time we are examining. #| Because of the long period, there should be no effect. #| Similarly, the DAG also has earlier confounders related to day `i - 63`. + labels <- c( x63 = "Extra Magic\nMorning (i-63)", x = "Extra Magic\nMorning (i)", @@ -286,13 +331,17 @@ We'll use `lag()` from dplyr to get those variables. ```{r} #| eval: false + n_days_lag <- 63 distinct_emm <- seven_dwarfs_train_2018 |> filter(wait_hour == 9) |> arrange(park_date) |> transmute( park_date, - prev_park_extra_magic_morning = lag(park_extra_magic_morning, n = n_days_lag), + prev_park_extra_magic_morning = lag( + park_extra_magic_morning, + n = n_days_lag + ), prev_park_temperature_high = lag(park_temperature_high, n = n_days_lag), prev_park_close = lag(park_close, n = n_days_lag), prev_park_ticket_season = lag(park_ticket_season, n = n_days_lag) @@ -306,13 +355,17 @@ seven_dwarfs_train_2018_lag <- seven_dwarfs_train_2018 |> ```{r} #| echo: false + calculate_coef <- function(n_days_lag) { distinct_emm <- seven_dwarfs_train_2018 |> filter(wait_hour == 9) |> arrange(park_date) |> transmute( park_date, - prev_park_extra_magic_morning = lag(park_extra_magic_morning, n = n_days_lag), + prev_park_extra_magic_morning = lag( + park_extra_magic_morning, + n = n_days_lag + ), prev_park_temperature_high = lag(park_temperature_high, n = n_days_lag), prev_park_close = lag(park_close, n = n_days_lag), prev_park_ticket_season = lag(park_ticket_season, n = n_days_lag) @@ -324,10 +377,13 @@ calculate_coef <- function(n_days_lag) { drop_na(prev_park_extra_magic_morning) fit_ipw_effect( - prev_park_extra_magic_morning ~ prev_park_temperature_high + prev_park_close + prev_park_ticket_season, + prev_park_extra_magic_morning ~ prev_park_temperature_high + + prev_park_close + + prev_park_ticket_season, .data = seven_dwarfs_train_2018_lag, .trt = "prev_park_extra_magic_morning", - .outcome_fmla = wait_minutes_posted_avg ~ prev_park_extra_magic_morning + park_extra_magic_morning + .outcome_fmla = wait_minutes_posted_avg ~ prev_park_extra_magic_morning + + park_extra_magic_morning ) } @@ -344,17 +400,21 @@ If these results are accurate, it implies that we have some residual confounding ```{r} #| label: fig-sens-i-63 #| fig-cap: > -#| A scatterplot with a smoothed regression of the relationship between wait times on day `i` and whether there were Extra Magic Hours on day `i - n`, where `n` represents the number of days previous to day `i`. We expect this relationship to rapidly approach the null, but the effect hovers above null for quite some time. This lingering effect implies we have some residual confounding present. +#| A scatterplot with a smoothed regression of the relationship between wait times on day `i` and whether there were Extra Magic Hours on day `i - n`, where `n` represents the number of days previous to day `i`. We expect this relationship to rapidly approach the null, but the effect hovers above null for quite some time. This lingering effect implies we have some residual confounding present. #| code-fold: true #| warning: false #| message: false + coefs <- purrr::map_dbl(1:63, calculate_coef) ggplot(tibble(coefs = coefs, x = 1:63), aes(x = x, y = coefs)) + geom_hline(yintercept = 0) + geom_point() + geom_smooth(se = FALSE) + - labs(y = "difference in wait times (minutes)\n on day (i) for EMM on day (i - n)", x = "day (i - n)") + labs( + y = "difference in wait times (minutes)\n on day (i) for EMM on day (i - n)", + x = "day (i - n)" + ) ``` #### Negative outcomes @@ -373,20 +433,22 @@ seven_dwarfs_sim <- seven_dwarfs_train_2018 |> mutate( # we scale each variable and add a bit of random noise # to simulate reasonable Universal wait times - wait_time_universal = - park_temperature_high / 150 + - as.numeric(park_close) / 1500 + - as.integer(factor(park_ticket_season)) / 1000 + - rnorm(n(), 5, 5) + wait_time_universal = park_temperature_high / + 150 + + as.numeric(park_close) / 1500 + + as.integer(factor(park_ticket_season)) / 1000 + + rnorm(n(), 5, 5) ) ``` ```{r} #| echo: false + wait_universal <- seven_dwarfs_sim |> fit_ipw_effect( park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season, + park_close + + park_ticket_season, .data = _, .outcome_fmla = wait_time_universal ~ park_extra_magic_morning ) |> @@ -404,24 +466,26 @@ seven_dwarfs_sim2 <- seven_dwarfs_train_2018 |> wait_minutes_posted_avg = wait_minutes_posted_avg + u, park_extra_magic_morning = if_else( u > 10, - rbinom(1, 1, .1), + rbinom(1, 1, 0.1), park_extra_magic_morning ), - wait_time_universal = - park_temperature_high / 150 + - as.numeric(park_close) / 1500 + - as.integer(factor(park_ticket_season)) / 1000 + - u + - rnorm(n(), 5, 5) + wait_time_universal = park_temperature_high / + 150 + + as.numeric(park_close) / 1500 + + as.integer(factor(park_ticket_season)) / 1000 + + u + + rnorm(n(), 5, 5) ) ``` ```{r} #| echo: false + disney <- seven_dwarfs_sim2 |> fit_ipw_effect( park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season, + park_close + + park_ticket_season, .data = _ ) |> round(2) @@ -429,7 +493,8 @@ disney <- seven_dwarfs_sim2 |> universal <- seven_dwarfs_sim2 |> fit_ipw_effect( park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season, + park_close + + park_ticket_season, .data = _, .outcome_fmla = wait_time_universal ~ park_extra_magic_morning ) |> @@ -468,7 +533,8 @@ There is a correlation between the park's close time and ticket season. ```{r} #| label: fig-conditional-ind #| fig-cap: > -#| A plot of the estimates and 95% confidence intervals of the correlations between the residuals resulting from a regression of variables in the DAG that should have no relationship. While two relationships appear null, park close time and ticket season seem to be correlated, suggesting we have misspecified the DAG. One source of this misspecification may be missing arrows between the variables. Notably, the adjustment sets are identical with and without this arrow. +#| A plot of the estimates and 95% confidence intervals of the correlations between the residuals resulting from a regression of variables in the DAG that should have no relationship. While two relationships appear null, park close time and ticket season seem to be correlated, suggesting we have misspecified the DAG. One source of this misspecification may be missing arrows between the variables. Notably, the adjustment sets are identical with and without this arrow. + test_conditional_independence( emm_wait_dag, data = seven_dwarfs_train_2018 |> @@ -505,6 +571,7 @@ Let's look at an example that is more likely to be misspecified, where we remove ```{r} #| echo: false + labels <- c( park_extra_magic_morning = "Extra Magic\nMorning", wait_minutes_posted_avg = "Average\nwait", @@ -516,8 +583,10 @@ labels <- c( ```{r} emm_wait_dag2 <- dagify( - wait_minutes_posted_avg ~ park_extra_magic_morning + park_close + - park_ticket_season + park_temperature_high, + wait_minutes_posted_avg ~ park_extra_magic_morning + + park_close + + park_ticket_season + + park_temperature_high, park_extra_magic_morning ~ park_temperature_high, coords = coord_dag, labels = labels, @@ -535,7 +604,8 @@ In @fig-conditional-ind-misspec, we see an additional association between ticket ```{r} #| label: fig-conditional-ind-misspec #| fig-cap: > -#| A plot of the estimates and 95% confidence intervals of the correlations between the residuals resulting from a regression of variables in the DAG that should have no relationship. While two relationships appear null, park close time and ticket season seem to be correlated, suggesting we have misspecified the DAG. One source of this misspecification may be missing arrows between the variables. +#| A plot of the estimates and 95% confidence intervals of the correlations between the residuals resulting from a regression of variables in the DAG that should have no relationship. While two relationships appear null, park close time and ticket season seem to be correlated, suggesting we have misspecified the DAG. One source of this misspecification may be missing arrows between the variables. + test_conditional_independence( emm_wait_dag2, data = seven_dwarfs_train_2018 |> @@ -558,6 +628,7 @@ These are called *equivalent* DAGs because their implications are the same. ```{r} #| eval: false + ggdag_equivalent_dags(emm_wait_dag2) ``` @@ -566,17 +637,42 @@ ggdag_equivalent_dags(emm_wait_dag2) #| code-fold: true #| fig-width: 9 #| fig-cap: > -#| Equivalent DAGs for the likely misspecified version of @fig-dag-magic. -#| These two DAGs produce the same set of implied conditional independencies. -#| The difference between them is only the direction of the arrow between -#| historic high temperature and Extra Magic Hours. +#| Equivalent DAGs for the likely misspecified version of @fig-dag-magic. +#| These two DAGs produce the same set of implied conditional independencies. +#| The difference between them is only the direction of the arrow between +#| historic high temperature and Extra Magic Hours. + curvatures <- rep(0, 10) -curvatures[c(4, 9)] <- .25 +curvatures[c(4, 9)] <- 0.25 ggdag_equivalent_dags(emm_wait_dag2, use_edges = FALSE, use_text = FALSE) + - geom_dag_edges_arc(data = function(x) distinct(x), curvature = curvatures, edge_color = "grey80") + - geom_dag_edges_link(data = function(x) filter(x, (name == "park_extra_magic_morning" & to == "park_temperature_high") | (name == "park_temperature_high" & to == "park_extra_magic_morning")), edge_color = "black") + - geom_dag_text_repel(aes(label = label), data = function(x) filter(x, label %in% c("Extra Magic\nMorning", "Historic high\ntemperature")), box.padding = 15, seed = 12, color = "#494949") + + geom_dag_edges_arc( + data = function(x) distinct(x), + curvature = curvatures, + edge_color = "grey80" + ) + + geom_dag_edges_link( + data = function(x) { + filter( + x, + (name == "park_extra_magic_morning" & to == "park_temperature_high") | + (name == "park_temperature_high" & to == "park_extra_magic_morning") + ) + }, + edge_color = "black" + ) + + geom_dag_text_repel( + aes(label = label), + data = function(x) { + filter( + x, + label %in% c("Extra Magic\nMorning", "Historic high\ntemperature") + ) + }, + box.padding = 15, + seed = 12, + color = "#494949" + ) + theme_dag() ``` @@ -586,6 +682,7 @@ While technical, this connection can condense the visualization to a single DAG ```{r} #| eval: false + ggdag_equivalent_class(emm_wait_dag2, use_text = FALSE, use_labels = TRUE) ``` @@ -594,18 +691,35 @@ ggdag_equivalent_class(emm_wait_dag2, use_text = FALSE, use_labels = TRUE) #| code-fold: true #| fig-width: 5 #| fig-cap: > -#| An alternative way of visualizing @fig-equiv-dag where all the equivalent -#| DAGs are condensed to a single version where the *reversible* edges are denoted -#| with edges without arrows. +#| An alternative way of visualizing @fig-equiv-dag where all the equivalent +#| DAGs are condensed to a single version where the *reversible* edges are denoted +#| with edges without arrows. + curvatures <- rep(0, 4) -curvatures[3] <- .25 +curvatures[3] <- 0.25 emm_wait_dag2 |> node_equivalent_class() |> ggdag(use_edges = FALSE, use_text = FALSE) + - geom_dag_edges_arc(data = function(x) filter(x, !reversable), curvature = curvatures, edge_color = "grey90") + + geom_dag_edges_arc( + data = function(x) filter(x, !reversable), + curvature = curvatures, + edge_color = "grey90" + ) + geom_dag_edges_link(data = function(x) filter(x, reversable), arrow = NULL) + - geom_dag_text_repel(aes(label = label), data = function(x) filter(x, label %in% c("Extra Magic\nMorning", "Historic high\ntemperature")), box.padding = 16, seed = 12, size = 5, color = "#494949") + + geom_dag_text_repel( + aes(label = label), + data = function(x) { + filter( + x, + label %in% c("Extra Magic\nMorning", "Historic high\ntemperature") + ) + }, + box.padding = 16, + seed = 12, + size = 5, + color = "#494949" + ) + theme_dag() ``` @@ -647,7 +761,7 @@ In this case, we're considering a different causal structure. ```{r} #| label: fig-dag-extra-days #| fig-cap: > -#| An expansion of @fig-dag-magic, which now includes two new variables on their own backdoor paths: whether or not it's a holiday and/or a weekend. +#| An expansion of @fig-dag-magic, which now includes two new variables on their own backdoor paths: whether or not it's a holiday and/or a weekend. #| code-fold: true labels <- c( @@ -661,8 +775,17 @@ labels <- c( ) emm_wait_dag3 <- dagify( - wait_minutes_posted_avg ~ park_extra_magic_morning + park_close + park_ticket_season + park_temperature_high + is_weekend + is_holiday, - park_extra_magic_morning ~ park_temperature_high + park_close + park_ticket_season + is_weekend + is_holiday, + wait_minutes_posted_avg ~ park_extra_magic_morning + + park_close + + park_ticket_season + + park_temperature_high + + is_weekend + + is_holiday, + park_extra_magic_morning ~ park_temperature_high + + park_close + + park_ticket_season + + is_weekend + + is_holiday, park_close ~ is_weekend + is_holiday, coords = time_ordered_coords(), labels = labels, @@ -671,7 +794,7 @@ emm_wait_dag3 <- dagify( ) curvatures <- rep(0, 13) -curvatures[11] <- .25 +curvatures[11] <- 0.25 emm_wait_dag3 |> tidy_dagitty() |> @@ -681,7 +804,12 @@ emm_wait_dag3 |> ) + geom_dag_edges_arc(curvature = curvatures, edge_color = "grey80") + geom_dag_point() + - geom_dag_text_repel(aes(label = label), size = 3.8, seed = 16301, color = "#494949") + + geom_dag_text_repel( + aes(label = label), + size = 3.8, + seed = 16301, + color = "#494949" + ) + scale_color_okabe_ito(na.value = "grey90") + theme_dag() + theme(legend.position = "none") + @@ -723,10 +851,16 @@ Both Extra Magic Morning hours and posted wait times are associated with whether ```{r} #| label: tbl-days #| tbl-cap: > -#| The descriptive associations between the two new variables, holiday and weekend, and the exposure and outcome. The average posted waiting time differs on both holidays and weekends, as do the occurrences of Extra Magic Hours. While we can't determine a confounding relationship from descriptive statistics alone, this adds to the evidence that these are confounders. +#| The descriptive associations between the two new variables, holiday and weekend, and the exposure and outcome. The average posted waiting time differs on both holidays and weekends, as do the occurrences of Extra Magic Hours. While we can't determine a confounding relationship from descriptive statistics alone, this adds to the evidence that these are confounders. #| code-fold: true + tbl_data_days <- seven_dwarfs_with_days |> - select(wait_minutes_posted_avg, park_extra_magic_morning, is_weekend, is_holiday) + select( + wait_minutes_posted_avg, + park_extra_magic_morning, + is_weekend, + is_holiday + ) library(labelled) var_label(tbl_data_days) <- list( @@ -753,11 +887,16 @@ gtsummary::tbl_merge(list(tbl1, tbl2), c("Weekend", "Holiday")) ```{r} #| echo: false + ipw_results_with_days <- fit_ipw_effect( park_extra_magic_morning ~ park_temperature_high + - park_close + park_ticket_season + is_weekend + is_holiday, + park_close + + park_ticket_season + + is_weekend + + is_holiday, .data = seven_dwarfs_with_days -) |> round(2) +) |> + round(2) ``` When we refit the IPW estimator, we get `r ipw_results_with_days` minutes, slightly bigger than we got without the two new confounders. @@ -781,9 +920,9 @@ We can take this further using quantitative bias analysis, which uses mathematic Sensitivity analyses for unmeasured confounding are important tools in observational studies to assess how robust findings are to potential unmeasured factors [@d2022sensitivity]. These analyses rely on three key components: -1) the observed exposure-outcome effect after adjusting for measured confounders,\ -2) the estimated relationship between a hypothetical unmeasured confounder and the exposure, and\ -3) the estimated relationship between that unmeasured confounder and the outcome. +1) the observed exposure-outcome effect after adjusting for measured confounders,\ +2) the estimated relationship between a hypothetical unmeasured confounder and the exposure, and\ +3) the estimated relationship between that unmeasured confounder and the outcome. By specifying plausible values for these relationships, researchers can quantify how much the observed effect might change if such an unmeasured confounder existed. @@ -804,18 +943,33 @@ The three key components above are described by 1) the arrow between 'Extra Magi #| average wait time and 2) both Extra Magic Hours and average wait time #| are determined by the time the park closes, historic high temperatures, #| and ticket season. + curvatures <- rep(0, 7) curvatures[5] <- 0.3 emm_wait_dag |> tidy_dagitty() |> node_status() |> - mutate(linetype = if_else(name == "park_temperature_high", "dashed", "solid")) |> + mutate( + linetype = if_else(name == "park_temperature_high", "dashed", "solid") + ) |> ggplot( - aes(x, y, xend = xend, yend = yend, color = status, edge_linetype = linetype) + aes( + x, + y, + xend = xend, + yend = yend, + color = status, + edge_linetype = linetype + ) ) + geom_dag_edges_arc(curvature = curvatures, edge_color = "grey80") + geom_dag_point() + - geom_dag_text_repel(aes(label = label), size = 3.8, seed = 1630, color = "#494949") + + geom_dag_text_repel( + aes(label = label), + size = 3.8, + seed = 1630, + color = "#494949" + ) + scale_color_okabe_ito(na.value = "grey90") + theme_dag() + theme(legend.position = "none") + @@ -834,15 +988,15 @@ While these analyses cannot prove the absence of unmeasured confounding, they pr The first componenet, the observed exposure-outcome effect, is the proposed causal effect of interest, i.e. the effect you would like to perform a sensitivity analysis on. The effect itself will depend on the choice of outcome model, which in turn often depends on the distribution of the outcome and the desired effect measure: -1. For continuous outcomes: Linear models or generalized linear models (GLMs) with Gaussian distribution and identity link are used, typically estimating a coefficient. +1. For continuous outcomes: Linear models or generalized linear models (GLMs) with Gaussian distribution and identity link are used, typically estimating a coefficient. -2. For binary outcomes, we have a few choices: +2. For binary outcomes, we have a few choices: -- GLMs with binomial distribution and log link -- GLMs with Poisson distribution and log link -- GLMs with binomial distribution and logit link These estimate coefficients, which can be exponentiated to obtain risk ratios (log link models) or odds ratios (logit link models). +- GLMs with binomial distribution and log link +- GLMs with Poisson distribution and log link +- GLMs with binomial distribution and logit link These estimate coefficients, which can be exponentiated to obtain risk ratios (log link models) or odds ratios (logit link models). -3. For time-to-event outcomes: Cox proportional hazards models are used, with the hazard ratio obtained by exponentiating the coefficient. +3. For time-to-event outcomes: Cox proportional hazards models are used, with the hazard ratio obtained by exponentiating the coefficient. Let's use the analysis from @tbl-alt-sets where we only adjusted for 'Time park closed' and 'Ticket season'. According to @fig-dag-magic-sens, we know 'Historic high temperature' is also a confounder, but it is *unmeasured* so we cannot include it in our practical adjustment set. @@ -852,18 +1006,18 @@ This resulted in an observed effect of `r round(effects[2], 2)`. The relationship between an unmeasured confounder and the exposure can be characterized in three ways: -1. For a binary unmeasured confounder: +1. For a binary unmeasured confounder: -- Prevalence of the unmeasured confounder in the exposed group -- Prevalence of the unmeasured confounder in the unexposed group +- Prevalence of the unmeasured confounder in the exposed group +- Prevalence of the unmeasured confounder in the unexposed group -2. For a continuous unmeasured confounder (assuming a normal distribution and unit variance): +2. For a continuous unmeasured confounder (assuming a normal distribution and unit variance): -- Difference in means of the unmeasured confounder between exposed and unexposed groups +- Difference in means of the unmeasured confounder between exposed and unexposed groups -3. Distribution-agnostic approach: +3. Distribution-agnostic approach: -- Partial $R^2$, representing the proportion of variation in the exposure explained by the unmeasured confounder after accounting for measured confounders +- Partial $R^2$, representing the proportion of variation in the exposure explained by the unmeasured confounder after accounting for measured confounders These characterizations allow researchers to specify the unmeasured confounder-exposure relationship in sensitivity analyses, accommodating different types of confounders and levels of knowledge about their distribution. @@ -879,14 +1033,15 @@ Hold on to this number; we'll use it in conjunction with the next section for ou The relationship between an unmeasured confounder and the outcome can be quantified in two main ways: -1. Coefficient-based approach: Estimate the coefficient for an unmeasured confounder in a fully adjusted outcome model. - You can also estimate the exponentiated coefficient (risk ratio, odds ratio, or hazard ratio) +1. Coefficient-based approach: Estimate the coefficient for an unmeasured confounder in a fully adjusted outcome model. + You can also estimate the exponentiated coefficient (risk ratio, odds ratio, or hazard ratio) -2. Distribution-agnostic approach (for continuous outcomes): Use partial $R^2$, representing the proportion of variation in the outcome explained by the unmeasured confounder after accounting for the exposure and measured confounders +2. Distribution-agnostic approach (for continuous outcomes): Use partial $R^2$, representing the proportion of variation in the outcome explained by the unmeasured confounder after accounting for the exposure and measured confounders Let's do the coeffient-based approach. In our case, we need to estimate what we think the coefficient bewteen our standardized 'Historic high temperature' variable and our outcome after adjusting for our exposure as well as the other measured confounders (in this case ticket season and the time the park closed). -Another way to describe this effect in the context of this problem is: "How would the average posted wait time change if we changed the historic high temperature by one standard deviation, after adjusting for whether there were extra magic morning hours, the park close time, and the ticket season?" Let's suppose we think this would change by -2.3 minutes. +Another way to describe this effect in the context of this problem is: "How would the average posted wait time change if we changed the historic high temperature by one standard deviation, after adjusting for whether there were extra magic morning hours, the park close time, and the ticket season?" +Let's suppose we think this would change by -2.3 minutes. That is, if the historic high temperature is one standard deviation unit higher (in our scenario, 9 degrees warmer), we expect this to decrease the average posted wait time by 2.3 minutes. For a mathematical explanation of these quantities, see @d2022sensitivity. @@ -902,19 +1057,20 @@ For example, to adjust (`action`) a coefficient (`effect`) with a binary unmeasu Below is a copy of the table included in @lucy2022tipr about this package. -| category | Function term | Use | -|----------------|-----------------|---------------------------------------| -| **action** | `adjust` | These functions adjust observed effects, requiring both the unmeasured \| confounder-exposure relationship and unmeasured confounder-outcome relationship to be specified. | -| | `tip` | These functions tip observed effects. Only one relationship, either the unmeasured confounder-exposure relationship or unmeasured confounder-outcome relationship needs to be specified. | -| **effect** | `coef` | These functions specify an observed coefficient from a linear, log-linear, logistic, or Cox proportional hazards model | -| | `rr` | These functions specify an observed relative risk | -| | `or` | These functions specify an observed odds ratio | -| | `hr` | These functions specify an observed hazard ratio | -| **what** | `continuous` | These functions specify an unmeasured standardized Normally distributed confounder. These functions will include the parameters `exposure_confounder_effect` and `confounder_outcome_effect` | -| | `binary` | These functions specify an unmeasured binary confounder. These functions will include the parameters `exposed_confounder_prev`, `unexposed_confounder_prev`, and `confounder_outcome_effect` | -| | `r2` | These functions specify an unmeasured confounder parameterized by specifying the percent of variation in the exposure / outcom explained by the unmeasured confounder. These functions will include the parameters `confounder_exposure_r2` and `outcome_exposure_r2` | - -: Grammar of `tipr` functions. {#tbl-sens} + | category | Function term | Use | + | ---------- | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | + | **action** | `adjust` | These functions adjust observed effects, requiring both the unmeasured \ | confounder-exposure relationship and unmeasured confounder-outcome relationship to be specified. | + | | `tip` | These functions tip observed effects. Only one relationship, either the unmeasured confounder-exposure relationship or unmeasured confounder-outcome relationship needs to be specified. | + | **effect** | `coef` | These functions specify an observed coefficient from a linear, log-linear, logistic, or Cox proportional hazards model | + | | `rr` | These functions specify an observed relative risk | + | | `or` | These functions specify an observed odds ratio | + | | `hr` | These functions specify an observed hazard ratio | + | **what** | `continuous` | These functions specify an unmeasured standardized Normally distributed confounder. These functions will include the parameters `exposure_confounder_effect` and `confounder_outcome_effect` | + | | `binary` | These functions specify an unmeasured binary confounder. These functions will include the parameters `exposed_confounder_prev`, `unexposed_confounder_prev`, and `confounder_outcome_effect` | + | | `r2` | These functions specify an unmeasured confounder parameterized by specifying the percent of variation in the exposure / outcom explained by the unmeasured confounder. These functions will include the parameters `confounder_exposure_r2` and `outcome_exposure_r2` | + + : Grammar of `tipr` functions. + {#tbl-sens} You can find full documentation here: [r-causal.github.io/tipr/](https://r-causal.github.io/tipr/) @@ -928,7 +1084,7 @@ Let's plug in the quantities we established above for each of the three paramete library(tipr) adjust_coef( effect_observed = 6.58, - exposure_confounder_effect = -.17, + exposure_confounder_effect = -0.17, confounder_outcome_effect = -2.3 ) ``` diff --git a/chapters/17-mediation.qmd b/chapters/17-mediation.qmd index 62b51bd..12b47c9 100644 --- a/chapters/17-mediation.qmd +++ b/chapters/17-mediation.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/18-longitudinal.qmd b/chapters/18-longitudinal.qmd index 0f88c7a..1f6c064 100644 --- a/chapters/18-longitudinal.qmd +++ b/chapters/18-longitudinal.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/19-time-to-event.qmd b/chapters/19-time-to-event.qmd index 5b0f607..7161bb1 100644 --- a/chapters/19-time-to-event.qmd +++ b/chapters/19-time-to-event.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/20-doubly-robust.qmd b/chapters/20-doubly-robust.qmd index c77917b..8c61473 100644 --- a/chapters/20-doubly-robust.qmd +++ b/chapters/20-doubly-robust.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/21-machine-learning.qmd b/chapters/21-machine-learning.qmd index 25252bf..929c64d 100644 --- a/chapters/21-machine-learning.qmd +++ b/chapters/21-machine-learning.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/22-iv-and-friends.qmd b/chapters/22-iv-and-friends.qmd index d028086..3f2930a 100644 --- a/chapters/22-iv-and-friends.qmd +++ b/chapters/22-iv-and-friends.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/23-diff-in-diff.qmd b/chapters/23-diff-in-diff.qmd index 7782909..dfa8a53 100644 --- a/chapters/23-diff-in-diff.qmd +++ b/chapters/23-diff-in-diff.qmd @@ -4,6 +4,7 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` diff --git a/chapters/24-evidence.qmd b/chapters/24-evidence.qmd index ff58d00..25db519 100644 --- a/chapters/24-evidence.qmd +++ b/chapters/24-evidence.qmd @@ -4,11 +4,13 @@ ```{r} #| echo: false + # TODO: remove when first edition complete status("unstarted") ``` -> "Circumstantial evidence is a very tricky thing. It may seem to point very straight to one thing, but if you shift your own point of view a little, you may find it pointing in an equally uncompromising manner to something entirely different” --- Sherlock Holmes +> "Circumstantial evidence is a very tricky thing. +> It may seem to point very straight to one thing, but if you shift your own point of view a little, you may find it pointing in an equally uncompromising manner to something entirely different" --- Sherlock Holmes ## Triangulation diff --git a/chapters/future/time_varying_gcomp.qmd b/chapters/future/time_varying_gcomp.qmd index e60ed10..256df48 100644 --- a/chapters/future/time_varying_gcomp.qmd +++ b/chapters/future/time_varying_gcomp.qmd @@ -1,17 +1,25 @@ - ### Time-varying Settings -The parameteric g-formula can naturally extend to scenarios when the exposure, covariates, and outcomes change over time. It is implemented through the following sequence of steps: +The parameteric g-formula can naturally extend to scenarios when the exposure, covariates, and outcomes change over time. +It is implemented through the following sequence of steps: 1. The time-ordered structure of the variables is laid out by drawing the appropriate DAG, which makes explicit the ordering of baseline covariates, time-varying covariates, treatment, and outcome. -2. For each time point after baseline, a parametric model is specified for every variable whose value evolves over time. Each model expresses the variable at that moment as a function of the variables that precede it in the graph. Linear models are commonly used for continuous variables and logistic regressions for binary variables, although any suitable parametric specification can be used. +2. For each time point after baseline, a parametric model is specified for every variable whose value evolves over time. + Each model expresses the variable at that moment as a function of the variables that precede it in the graph. + Linear models are commonly used for continuous variables and logistic regressions for binary variables, although any suitable parametric specification can be used. -3. Using the observed baseline covariates for each individual, predicted values for time-varying confounders and outcomes are generated sequentially using the fitted models. At each time point, the exposure is set according to the intervention regime being evaluated (rather than using its observed or predicted value). Predictions from one time point become inputs to the model for the next time point, propagating the intervention's effects forward through time. +3. Using the observed baseline covariates for each individual, predicted values for time-varying confounders and outcomes are generated sequentially using the fitted models. + At each time point, the exposure is set according to the intervention regime being evaluated (rather than using its observed or predicted value). + Predictions from one time point become inputs to the model for the next time point, propagating the intervention's effects forward through time. -4. The predicted outcomes under the intervention are averaged across all individuals in the sample to obtain the expected outcome under that intervention regime. This process is repeated for each intervention of interest (including the natural course where exposures remain at their observed values). The desired causal contrast is obtained by comparing these averages. +4. The predicted outcomes under the intervention are averaged across all individuals in the sample to obtain the expected outcome under that intervention regime. + This process is repeated for each intervention of interest (including the natural course where exposures remain at their observed values). + The desired causal contrast is obtained by comparing these averages. -Returning to the Seven Dwarfs Mine Train data, suppose the park is considering an operational policy that guarantees the posted wait at 8am never exceeds thirty minutes. The policy is expected to influence congestion later in the morning, since the early-morning posted wait affects how quickly queues accumulate. We therefore pose the following question: +Returning to the Seven Dwarfs Mine Train data, suppose the park is considering an operational policy that guarantees the posted wait at 8am never exceeds thirty minutes. +The policy is expected to influence congestion later in the morning, since the early-morning posted wait affects how quickly queues accumulate. +We therefore pose the following question: > What would the average actual wait time at 10am have been if, for every park date, the posted wait at 8am were set to thirty minutes, compared with what was actually observed? @@ -22,7 +30,7 @@ Below is a proposed DAG to answer this causal question. #| code-fold: true #| message: false #| warning: false -#| fig.cap: > +#| fig-cap: > #| Time-ordered DAG representing the effect of setting the 8am posted wait #| to a fixed value on the actual wait at 10am. Baseline covariates influence #| each wait time; the posted wait at 8am influences the posted wait at 9am, @@ -32,24 +40,29 @@ library(ggdag) library(ggokabeito) coord_dag <- list( - x = c(Season = -1, close = -1, weather = -1, - wait8 = 0, wait9 = 1, wait10 = 2), - y = c(Season = 1, close = 0, weather = -1, - wait8 = 0, wait9 = 0, wait10 = 0) + x = c( + Season = -1, + close = -1, + weather = -1, + wait8 = 0, + wait9 = 1, + wait10 = 2 + ), + y = c(Season = 1, close = 0, weather = -1, wait8 = 0, wait9 = 0, wait10 = 0) ) labels <- c( Season = "Ticket Season", weather = "Historic high temperature", - close = "Time park closed", - wait8 = "8am posted wait", - wait9 = "9am posted wait", - wait10 = "10am actual wait" + close = "Time park closed", + wait8 = "8am posted wait", + wait9 = "9am posted wait", + wait10 = "10am actual wait" ) dagify( - wait8 ~ Season + weather + close, - wait9 ~ wait8 + Season + weather + close, + wait8 ~ Season + weather + close, + wait9 ~ wait8 + Season + weather + close, wait10 ~ wait9 + wait8 + Season + weather + close, coords = coord_dag, labels = labels, @@ -59,7 +72,7 @@ dagify( tidy_dagitty() |> node_status() |> ggplot(aes(x, y, xend = xend, yend = yend, color = status)) + - geom_dag_edges_arc(curvature = c(rep(0, 6), .3)) + + geom_dag_edges_arc(curvature = c(rep(0, 6), 0.3)) + geom_dag_point() + geom_dag_label_repel(seed = 1630) + scale_color_okabe_ito(na.value = "grey90") + @@ -81,7 +94,9 @@ dagify( ) ``` -Because each park date includes hourly posted waits across the morning, these repeated measurements form a simple longitudinal sequence. Weather, ticket season, Extra Magic Morning, and closing time remain fixed for the date, but posted waits change from hour to hour and depend on what happened earlier. The posted wait at 8am influences the posted wait at 9am, and both influence the actual wait at 10am, so an intervention applied at 8am propagates forward through the subsequent hours. +Because each park date includes hourly posted waits across the morning, these repeated measurements form a simple longitudinal sequence. +Weather, ticket season, Extra Magic Morning, and closing time remain fixed for the date, but posted waits change from hour to hour and depend on what happened earlier. +The posted wait at 8am influences the posted wait at 9am, and both influence the actual wait at 10am, so an intervention applied at 8am propagates forward through the subsequent hours. To set up this example, let's reshape the 8am and 9am posted waits so that both appear on the same row, along with the baseline park-level features. @@ -89,9 +104,13 @@ To set up this example, let's reshape the 8am and 9am posted waits so that both df <- seven_dwarfs_train_2018 |> filter(wait_hour %in% c(8, 9)) |> select( - park_date, wait_hour, wait_minutes_posted_avg, - park_extra_magic_morning, park_ticket_season, - park_close, park_temperature_high + park_date, + wait_hour, + wait_minutes_posted_avg, + park_extra_magic_morning, + park_ticket_season, + park_close, + park_temperature_high ) wide <- df |> @@ -133,7 +152,8 @@ Now let's fit a second model that describes the 10am actual wait in terms of the ```{r} m10 <- lm( wait_minutes_actual_avg ~ - wait_9 + wait_8 + + wait_9 + + wait_8 + park_extra_magic_morning + park_ticket_season + park_close + @@ -142,15 +162,21 @@ m10 <- lm( ) ``` -To evaluate an intervention, suppose that all park dates begin with an 8am posted wait of thirty minutes. The longitudinal g-formula uses the fitted models to reconstruct the rest of the morning under this counterfactual scenario. The observed data are first modified so that the 8am wait is replaced by the intervention value. The 9am posted wait is then generated from its model using this modified 8am value, and the 10am actual wait is generated from its model using both the modified 8am value and the model-based value of the 9am wait. This sequence reproduces the evolving process under the intervention. +To evaluate an intervention, suppose that all park dates begin with an 8am posted wait of thirty minutes. +The longitudinal g-formula uses the fitted models to reconstruct the rest of the morning under this counterfactual scenario. +The observed data are first modified so that the 8am wait is replaced by the intervention value. +The 9am posted wait is then generated from its model using this modified 8am value, and the 10am actual wait is generated from its model using both the modified 8am value and the model-based value of the 9am wait. +This sequence reproduces the evolving process under the intervention. ```{r} # Observed: use observed 8am waits natural <- three |> mutate(wait_9_pred = predict(m9, newdata = pick(everything()))) |> mutate( - wait_10_pred = predict(m10, - newdata = mutate(pick(everything()), wait_9 = wait_9_pred)) + wait_10_pred = predict( + m10, + newdata = mutate(pick(everything()), wait_9 = wait_9_pred) + ) ) # Intervention: set all 8am waits to 30 @@ -158,8 +184,10 @@ intervention <- three |> mutate(wait_8 = 30) |> mutate(wait_9_pred = predict(m9, newdata = pick(everything()))) |> mutate( - wait_10_pred = predict(m10, - newdata = mutate(pick(everything()), wait_9 = wait_9_pred)) + wait_10_pred = predict( + m10, + newdata = mutate(pick(everything()), wait_9 = wait_9_pred) + ) ) # Estimate causal effect @@ -170,9 +198,14 @@ tibble( ) ``` -The average of `wait_10` across all dates gives the estimated expected outcome under the intervention. Repeating the same procedure with the observed values of `wait_8` yields the expected outcome under the natural course. Their difference measures the effect of lowering the early-morning posted wait to thirty minutes. +The average of `wait_10` across all dates gives the estimated expected outcome under the intervention. +Repeating the same procedure with the observed values of `wait_8` yields the expected outcome under the natural course. +Their difference measures the effect of lowering the early-morning posted wait to thirty minutes. -To estimate the uncertainty around this effect we can use the bootstrap. Following the convention outlined in @sec-bootstrap, we wrap the entire procedure (including fitting both models) into a single function. Because the intervention's effect at 10am flows through predictions from the 9am model, both models must be refit on each resample; fitting only the outcome model would produce confidence intervals that are too narrow. Here are the steps: +To estimate the uncertainty around this effect we can use the bootstrap. +Following the convention outlined in @sec-bootstrap, we wrap the entire procedure (including fitting both models) into a single function. +Because the intervention's effect at 10am flows through predictions from the 9am model, both models must be refit on each resample; fitting only the outcome model would produce confidence intervals that are too narrow. +Here are the steps: 1. Create a function to run the time-varying g-formula once on a sample of your data: @@ -193,7 +226,8 @@ fit_gformula_tv <- function(.split, ...) { # fit model for 10am actual wait m10 <- lm( wait_minutes_actual_avg ~ - wait_9 + wait_8 + + wait_9 + + wait_8 + park_extra_magic_morning + park_ticket_season + park_close + @@ -205,8 +239,10 @@ fit_gformula_tv <- function(.split, ...) { natural <- .df |> mutate(wait_9_pred = predict(m9, newdata = pick(everything()))) |> mutate( - wait_10_pred = predict(m10, - newdata = mutate(pick(everything()), wait_9 = wait_9_pred)) + wait_10_pred = predict( + m10, + newdata = mutate(pick(everything()), wait_9 = wait_9_pred) + ) ) # intervention: set all 8am waits to 30 @@ -214,8 +250,10 @@ fit_gformula_tv <- function(.split, ...) { mutate(wait_8 = 30) |> mutate(wait_9_pred = predict(m9, newdata = pick(everything()))) |> mutate( - wait_10_pred = predict(m10, - newdata = mutate(pick(everything()), wait_9 = wait_9_pred)) + wait_10_pred = predict( + m10, + newdata = mutate(pick(everything()), wait_9 = wait_9_pred) + ) ) tibble( @@ -230,6 +268,7 @@ fit_gformula_tv <- function(.split, ...) { ```{r} #| message: false #| warning: false + library(rsample) set.seed(1) bootstrapped_dwarfs_tv <- bootstraps( @@ -249,9 +288,11 @@ gformula_tv_results |> mutate( estimate = map_dbl( boot_fits, - \(.fit) .fit |> - filter(term == "ate") |> - pull(estimate) + \(.fit) { + .fit |> + filter(term == "ate") |> + pull(estimate) + } ) ) |> ggplot(aes(estimate)) + From 5e0786812c48327c9d6ebcb4758f1ad7755858a4 Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Sun, 3 May 2026 21:00:27 -0400 Subject: [PATCH 3/4] fix lints --- .github/workflows/panache.yaml | 24 +++++++++++++++++++++ chapters/03-po-counterfactuals.qmd | 2 +- chapters/05-not-just-a-stats-problem.qmd | 2 +- chapters/08-propensity-scores.qmd | 2 +- chapters/14-interaction.qmd | 2 +- chapters/15-missingness-and-measurement.qmd | 4 ++-- 6 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/panache.yaml diff --git a/.github/workflows/panache.yaml b/.github/workflows/panache.yaml new file mode 100644 index 0000000..894884d --- /dev/null +++ b/.github/workflows/panache.yaml @@ -0,0 +1,24 @@ +name: panache + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + panache: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: posit-dev/setup-air@v1 + # Lint disabled until panache has a fix for + # `[lint.rules] undefined-reference-label = false` being ignored + # (book-wide false positives on subfig refs and cross-file refs we + # cannot suppress). + - uses: jolars/panache-action@v1 + with: + path: chapters/ + lint: "false" diff --git a/chapters/03-po-counterfactuals.qmd b/chapters/03-po-counterfactuals.qmd index e46447c..eaa370b 100644 --- a/chapters/03-po-counterfactuals.qmd +++ b/chapters/03-po-counterfactuals.qmd @@ -79,7 +79,7 @@ Instead of relying on a single individual, we often rely on many. We could conduct an experiment in which we randomize many individuals to leave criminal life (or not) and see how this impacts their outcomes on average (of course, this randomized trial presents some ethical issues, which is why observational data like Ice-T and Spike's are interesting). In any case, we must rely on statistical techniques to help construct these unobservable counterfactuals from observed data. -## Potential outcomes +## Potential outcomes {#sec-po} Factual outcomes and counterfactual outcomes are two realizations of **potential outcomes**. Before some cause occurs, the potential outcomes are all the things that could happen depending on what you are exposed to. diff --git a/chapters/05-not-just-a-stats-problem.qmd b/chapters/05-not-just-a-stats-problem.qmd index 2a635d7..5fa5552 100644 --- a/chapters/05-not-just-a-stats-problem.qmd +++ b/chapters/05-not-just-a-stats-problem.qmd @@ -586,7 +586,7 @@ causal_quartet |> Relatedly, model coefficients for variables *other* than those of the causes we're interested in can be difficult to interpret. In a model with `outcome ~ exposure + covariate`, it's tempting to present the coefficient of `covariate` as well as `exposure`. -The problem, as discussed @sec-pred-or-explain, is that the causal structure for the effect of `covariate` on `outcome` may differ from that of `exposure` on `outcome`. +The problem, as discussed in @sec-pred-or-explain, is that the causal structure for the effect of `covariate` on `outcome` may differ from that of `exposure` on `outcome`. Let's consider a variation of the quartet DAGs with other variables. First, let's start with the confounder DAG. diff --git a/chapters/08-propensity-scores.qmd b/chapters/08-propensity-scores.qmd index a3323a0..eb90167 100644 --- a/chapters/08-propensity-scores.qmd +++ b/chapters/08-propensity-scores.qmd @@ -12,7 +12,7 @@ status("polishing") As presented in @sec-data-causal, the causal question we'd like to answer is: **is there a relationship between whether there were "Extra Magic Hours" in the morning at Magic Kingdom and the average wait time for an attraction called the "Seven Dwarfs Mine Train" the same day between 9 AM and 10 AM in 2018?** Below is a proposed DAG for this question. ```{r} -#| label: fig-dag-magic-hours-wait2 +#| label: fig-dag-magic-hours-wait #| code-fold: true #| message: false #| warning: false diff --git a/chapters/14-interaction.qmd b/chapters/14-interaction.qmd index c2e1683..cfe8eb1 100644 --- a/chapters/14-interaction.qmd +++ b/chapters/14-interaction.qmd @@ -33,7 +33,7 @@ $$ $$ Heterogeneous treatment effects arise when $\tau(Z)$ is not constant in $Z$. -In that case, we see the various estimands defined in @sec-estimand, for example +In that case, we see the various estimands defined in @sec-estimands, for example $$ \text{ATE}=E[\tau(Z)], \quad diff --git a/chapters/15-missingness-and-measurement.qmd b/chapters/15-missingness-and-measurement.qmd index 453a081..28e8401 100644 --- a/chapters/15-missingness-and-measurement.qmd +++ b/chapters/15-missingness-and-measurement.qmd @@ -703,11 +703,11 @@ fit_stats <- function( missing_by = NULL, missing_for = "actual" ) { - if (!is.null(missing_by) & missing_for == "actual") { + if (!is.null(missing_by) && missing_for == "actual") { actual[missing_by] <- NA } - if (!is.null(missing_by) & missing_for == "posted") { + if (!is.null(missing_by) && missing_for == "posted") { posted_60[missing_by] <- NA } From 618268a7e9f8bf9ae49aedfde9f73f2247c5faf6 Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Mon, 4 May 2026 11:25:20 -0400 Subject: [PATCH 4/4] update readme --- README.Rmd | 9 +++++++++ README.md | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/README.Rmd b/README.Rmd index e75dec4..b358dc4 100644 --- a/README.Rmd +++ b/README.Rmd @@ -31,3 +31,12 @@ remotes::install_deps(dependencies = TRUE) ``` We use [Quarto](https://quarto.org/) to render this book. + +## Formatting and linting + +This project uses [panache](https://panache.bz/) to format and lint the Quarto documents. For the code cells, panache uses [air](https://posit-dev.github.io/air/) and [jarl](https://jarl.etiennebacher.com/) for formatting and linting, respectively. + +```bash +panache format chapters/ +panache lint chapters/ +``` diff --git a/README.md b/README.md index 01ef116..1f88176 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ # Causal Inference in R + This repository contains the source code for the book *Causal Inference @@ -20,3 +21,16 @@ remotes::install_deps(dependencies = TRUE) ``` We use [Quarto](https://quarto.org/) to render this book. + +## Formatting and linting + +This project uses [panache](https://panache.bz/) to format and lint the +Quarto documents. For the code cells, panache uses +[air](https://posit-dev.github.io/air/) and +[jarl](https://jarl.etiennebacher.com/) for formatting and linting, +respectively. + +``` bash +panache format chapters/ +panache lint chapters/ +```