2 Lab I: Tidyverse

2.0.1 Preparation

## Packages
library(tidyverse)
library(dplyr)
library(readxl) # Package to read Excel data
library(stargazer)

2.1 Join the data sets.

Join the cases and vaccination data by date and state.
- Case data: United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv
- Vaccination data: COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv
- Other state variables: USstates.xlsx
Add the USstates.xlsx data and limit your dataframe to the states listed in USstates.xlsx.
How do you know if your merge was successful?

ANSWER: Looking at the dataframe is useful. Look at individual states - do the variables that are supposed to be the same (e.g. population) the same over the time period? Do the case numbers look reasonable?

## Load data 
# https://data.cdc.gov/Case-Surveillance/United-States-COVID-19-Cases-and-Deaths-by-State-o/9mfq-cb36
cases <- read.csv("Data/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")

# Vax data 
  # "COVID-19 Vaccinations in the United States,Jurisdiction"
  # https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-Jurisdi/unsk-b7fc
vax <- read.csv("Data/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv")

df <- cases %>% 
  left_join(vax, by = c("submission_date" = "Date",
                        "state" = "Location"))

# Add state variables
stFacts <- read_excel("Data/USstates.xlsx", sheet = "data") %>% 
  dplyr::rename("trump2020Pct" = "trump2020_percent")

df <- df %>% 
  filter(state %in% stFacts$stateAbbr == 1) %>%
  left_join(stFacts, by = c("state" = "stateAbbr"))

2.2 Create and add the following four variables to your dataframe: density (based on sqMiles and pop2019), deaths per capita (based on new_death), cases per capita (based on new_case) and vaccinated percent (based on Series_Complete_12PlusPop_Pct).

For a specific day (based on submission_date), show the top five states ranked by deaths per capita and calculate the average vaccinated per capita and the mean, minimum and maximum deaths per capita.

df <- df %>% 
  mutate("deathsPC" = 100000*new_death/pop2019, 
         "casesPC"  = 100000*new_case/pop2019,
         "vaxedPct"  = Series_Complete_12PlusPop_Pct/100,
         "density"   = pop2019/(1000000*sqMiles))

# Pick a day
DATE <- "10/18/2021"

df_day <- df %>% 
  filter(submission_date == DATE) %>%
  arrange(desc(deathsPC))

# Look at data (check for negative deaths etc)
df_day %>% 
  dplyr::select(submission_date, state, new_case, new_death, deathsPC, casesPC) %>% 
  slice(1:5)

##   submission_date state new_case new_death deathsPC casesPC
## 1      10/18/2021    ID     1290        42      2.4      72
## 2      10/18/2021    AK      691        16      2.2      94
## 3      10/18/2021    MT      426        19      1.8      40
## 4      10/18/2021    AL      961        84      1.7      20
## 5      10/18/2021    WV      689        26      1.5      38

df_day %>% 
  dplyr::summarize(meanVaxedPct = mean(vaxedPct),
            meanDeath = mean(deathsPC),
            minDeath = min(deathsPC),
            maxDeath = max(deathsPC))

##   meanVaxedPct meanDeath minDeath maxDeath
## 1         0.64      0.46        0      2.4

2.3 Estimate three regression models with deaths per capita on your selected day as the dependent variable.

Your first model will have only Trump 2020 percent as an independent variable.
Your second model will add vaccinated percent as an independent variable.
Your third model will add density.
Before you estimate the models, write down your expectations about what will happen in these models.

ANSWER: See table below. The key idea here is omitted variable bias. If we only have Trump percent, then vaccination rates are omitted. For October 1st, 2021, the Trump variable becomes insignificant when vaccination rates are included. One complication, for which I suspect there is no clear answer, is whether vaccination rates are a post-treatment variable. What do you think?

## Models
ols.1 <- lm(deathsPC ~ trump2020Pct, data = df_day)
summary(ols.1)

## 
## Call:
## lm(formula = deathsPC ~ trump2020Pct, data = df_day)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8697 -0.2935 -0.1092  0.0997  1.6572 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    -0.519      0.315   -1.65   0.1058   
## trump2020Pct    1.987      0.622    3.19   0.0025 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.53 on 49 degrees of freedom
## Multiple R-squared:  0.172,  Adjusted R-squared:  0.155 
## F-statistic: 10.2 on 1 and 49 DF,  p-value: 0.00246

ols.2 <- lm(deathsPC ~ trump2020Pct + vaxedPct, data = df_day)
summary(ols.2)

## 
## Call:
## lm(formula = deathsPC ~ trump2020Pct + vaxedPct, data = df_day)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8504 -0.3076 -0.0117  0.1775  1.6685 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.029      0.726    2.79  0.00746 ** 
## trump2020Pct    0.219      0.721    0.30  0.76236    
## vaxedPct       -2.627      0.691   -3.80  0.00041 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.47 on 48 degrees of freedom
## Multiple R-squared:  0.364,  Adjusted R-squared:  0.337 
## F-statistic: 13.7 on 2 and 48 DF,  p-value: 1.94e-05

ols.3 <- lm(deathsPC ~ trump2020Pct + vaxedPct + density, data = df_day)
summary(ols.3)

## 
## Call:
## lm(formula = deathsPC ~ trump2020Pct + vaxedPct + density, data = df_day)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8395 -0.3063 -0.0134  0.1699  1.6674 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.110      0.890    2.37  0.02194 *  
## trump2020Pct    0.120      0.956    0.13  0.90028    
## vaxedPct       -2.671      0.751   -3.56  0.00086 ***
## density        -8.772     54.952   -0.16  0.87386    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.47 on 47 degrees of freedom
## Multiple R-squared:  0.364,  Adjusted R-squared:  0.323 
## F-statistic: 8.97 on 3 and 47 DF,  p-value: 8.33e-05

2.4 Assess specific vaccines

Create and add vaccinated percent by state for each of the Pfizer, Modern and Janssen (which is Johnson and Johnson) vaccines.
Use pop2019 for population
Use Series_Complete_Moderna_18Plus, Series_Complete_Janssen_18Plus and Series_Complete_Pfizer_18Plus for the vaccination totals.
Estimate a model in which deaths per capita is a function of all three vaccination rates.
Explain what the results mean, especially in light of the results above for overall vaccination results.
Explain how one would compare the efficacy of the individual vaccines (e.g., whether the Moderna vaccine works better than the Johnson and Johnson vaccine).

ANSWER: The major issue here is multicollinearity. For October 1, 2021, the overall vaccination rate was statistically significant (see models 1 - 3 in the table) yet each of the different vaccines was insignificant (see model 4). The vaccines are multicollinear (which can be assessed with an auxilliary regression) so the loss of power is not terribly surprising. An F-test whether all the specific vaccine variables equal zero is reported for model 4 and, broadly consistent with the results in models 1 -3 we can say that we can reject the null that all individual vaccines have zero effect. To assess whether Moderna is better than Johnson and Johnson we would do an F-test where our restricted equation adds those two vaccines. If doing so causes a substantial reduction in fit we would reject the null that the effects of those two vaccines are equal. (And we would want to take such results with a grain of salt given endogeneity in which states have which vaccines.)

df <- df %>% 
  mutate("vaxedPctModerna"  = Series_Complete_Moderna_18Plus/(100*pop2019),
         "vaxedPctJans"  = Series_Complete_Janssen_18Plus/(100*pop2019),
         "vaxedPctPfizer"  = Series_Complete_Pfizer_18Plus/(100*pop2019))

df_day <- df %>% 
  filter(submission_date == DATE)

ols.4 <- lm(deathsPC ~ vaxedPctJans + vaxedPctModerna +
             vaxedPctPfizer, data = df_day)

summary(ols.4)

## 
## Call:
## lm(formula = deathsPC ~ vaxedPctJans + vaxedPctModerna + vaxedPctPfizer, 
##     data = df_day)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0240 -0.3373 -0.0304  0.1689  1.5567 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.393      0.653    3.67  0.00062 ***
## vaxedPctJans     469.840   1077.009    0.44  0.66466    
## vaxedPctModerna -156.194    541.375   -0.29  0.77422    
## vaxedPctPfizer  -676.480    305.101   -2.22  0.03148 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.52 on 47 degrees of freedom
## Multiple R-squared:  0.228,  Adjusted R-squared:  0.179 
## F-statistic: 4.63 on 3 and 47 DF,  p-value: 0.00643

stargazer(ols.1, ols.2, ols.3, ols.4,
          type = "html", # FOR PDF - the "asis" above makes it work
          keep.stat = c("n","ser", "rsq", "f"),
          report = "vcst",
          column.labels = c("Model 1", "Model 2", "Model 3", "Model 4"),
          digits = 3,
          dep.var.caption = "", dep.var.labels.include = FALSE)


	Model 1	Model 2	Model 3	Model 4
	(1)	(2)	(3)	(4)

trump2020Pct	2.000	0.220	0.120
	(0.620)	(0.720)	(0.960)
	t = 3.200	t = 0.300	t = 0.130

vaxedPct		-2.600	-2.700
		(0.690)	(0.750)
		t = -3.800	t = -3.600

density			-8.800
			(55.000)
			t = -0.160

vaxedPctJans				470.000
				(1,077.000)
				t = 0.440

vaxedPctModerna				-156.000
				(541.000)
				t = -0.290

vaxedPctPfizer				-676.000
				(305.000)
				t = -2.200

Constant	-0.520	2.000	2.100	2.400
	(0.320)	(0.730)	(0.890)	(0.650)
	t = -1.600	t = 2.800	t = 2.400	t = 3.700


Observations	51	51	51	51
R²	0.170	0.360	0.360	0.230
Residual Std. Error	0.530 (df = 49)	0.470 (df = 48)	0.470 (df = 47)	0.520 (df = 47)
F Statistic	10.000^*** (df = 1; 49)	14.000^*** (df = 2; 48)	9.000^*** (df = 3; 47)	4.600^*** (df = 3; 47)

Note:	p<0.1; p<0.05; p<0.01