2 9/23 | Lab I: Tidyverse & OLS Review

# Preparation
library(tidyverse)
library(readr)
library(readxl)
library(stargazer)
library(car)

2.1 Join the data sets.

Join the cases and vaccination data by date and state.
- Case data: United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv
- Vaccination data: COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv
- Other state variables: USstates.xlsx
Add the USstates.xlsx data and limit your dataframe to the states listed in USstates.xlsx.
How do you know if your merge was successful?

## Loading Case Data
case <- read_csv("Data/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")

## Loading Vax Data 
vax <- read_csv("Data/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv")

## Joining Case & Vax
case_vax <- case %>%
  left_join(vax, by = c("submission_date" = "Date", "state" = "Location"))

## Adding in state variables
states <- read_excel("Data/USstates.xlsx")

## Joining all datasets to create df
df <- case_vax %>%
  filter(state %in% states$stateAbbr) %>%
  left_join(states, by = c("state" = "stateAbbr")) %>%
  janitor::clean_names() ## making variable names lowercase with underscores

## Checking for only states
count(tibble(unique(df$state)))

## # A tibble: 1 × 1
##       n
##   <int>
## 1    51

## View(df)

To use janitor::clean_names() above, be sure to install the janitor package with install.packages("janitor")

2.2 Create and add the following variables to your dataframe density, deaths per capita, cases per capita and vaccinated percent.

For a specific day, show the top five states ranked by deaths per capita and calculate the average vaccinated per capita and the mean, minimum and maximum deaths per capita.

## Creating Variables
df <- df %>% 
  mutate("deaths_pc" = 100000*new_death/pop2019, 
         "cases_pc"  = 100000*new_case/pop2019,
         "vaxed_pct"  = series_complete_12plus_pop_pct/100,
         "density"   = pop2019/(1000000*sq_miles))

## Filtering to specific day
df_day <- df %>%
  filter(submission_date == "10/1/2021")

## Top five states by deaths per capita
df_day %>%
  arrange(desc(deaths_pc)) %>%
  dplyr::select(submission_date, state, deaths_pc, cases_pc, vaxed_pct, density) %>%
  slice(1:5)

## # A tibble: 5 × 6
##   submission_date state deaths_pc cases_pc vaxed_pct   density
##   <chr>           <chr>     <dbl>    <dbl>     <dbl>     <dbl>
## 1 10/1/2021       WV         2.90     91.8     0.465 0.0000746
## 2 10/1/2021       OH         2.81     53.3     0.588 0.000286 
## 3 10/1/2021       GA         1.49     31.5     0.537 0.000185 
## 4 10/1/2021       ID         1.34     99.5     0     0.0000216
## 5 10/1/2021       ND         1.31     93.3     0.529 0.0000110

## Mean vaxed per capita and mean, min, max of deaths per capita
df_day %>%
  summarise(mean_vaxed = mean(vaxed_pct),
            mean_deaths = mean(deaths_pc),
            min_deaths = min(deaths_pc),
            max_deaths = max(deaths_pc))

## # A tibble: 1 × 4
##   mean_vaxed mean_deaths min_deaths max_deaths
##        <dbl>       <dbl>      <dbl>      <dbl>
## 1      0.625       0.551          0       2.90

2.3 Estimate three regression models with deaths per capita on your selected day as the dependent variable.

Your first model will have only Trump 2020 percent as an independent variable.
Your second model will add vaccinated percent as an independent variable.
Your third model will add density.
Before you estimate the models, write down your expectations about what will happen in these models.

## Trump percent model
ols.1 <- lm(deaths_pc ~ trump2020_percent, data = df_day)
summary(ols.1)

## 
## Call:
## lm(formula = deaths_pc ~ trump2020_percent, data = df_day)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.88445 -0.31999 -0.09294  0.17142  2.18890 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -0.2412     0.3422  -0.705   0.4843  
## trump2020_percent   1.6103     0.6759   2.382   0.0211 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.573 on 49 degrees of freedom
## Multiple R-squared:  0.1038, Adjusted R-squared:  0.08552 
## F-statistic: 5.676 on 1 and 49 DF,  p-value: 0.02112

## Trump and Vax model
ols.2 <- lm(deaths_pc ~ trump2020_percent + vaxed_pct, data = df_day)
summary(ols.2)

## 
## Call:
## lm(formula = deaths_pc ~ trump2020_percent + vaxed_pct, data = df_day)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8757 -0.2596 -0.1054  0.1314  2.1706 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         1.5915     0.8539   1.864   0.0685 .
## trump2020_percent   0.3252     0.8514   0.382   0.7041  
## vaxed_pct          -1.9208     0.8264  -2.324   0.0244 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5489 on 48 degrees of freedom
## Multiple R-squared:  0.1945, Adjusted R-squared:  0.1609 
## F-statistic: 5.795 on 2 and 48 DF,  p-value: 0.005569

## Trump, Vax, and Density Model
ols.3 <- lm(deaths_pc ~ trump2020_percent + vaxed_pct + density, data = df_day)
summary(ols.3)

## 
## Call:
## lm(formula = deaths_pc ~ trump2020_percent + vaxed_pct + density, 
##     data = df_day)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8859 -0.2580 -0.1098  0.1301  2.1695 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         1.5157     1.0494   1.444   0.1553  
## trump2020_percent   0.4184     1.1311   0.370   0.7131  
## vaxed_pct          -1.8785     0.8991  -2.089   0.0421 *
## density             8.2056    64.6874   0.127   0.8996  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5546 on 47 degrees of freedom
## Multiple R-squared:  0.1948, Adjusted R-squared:  0.1434 
## F-statistic: 3.789 on 3 and 47 DF,  p-value: 0.01628

The results of ols.1, using October 1st, 2021 as the day, indicate that area with higher support for Donald Trump have more deaths per capita, specifically, a one percentage increase in Trump’s 2020 vote share, on average, increases deaths per capita by 1.6. This is a classic case of omitted variable bias, however. Once we control for the percent vaxed in an area, the trump2020_percent loses statistical significance. We can see in the second model that the coefficient on the Trump vote share variable greatly decreases and is no longer statistically significant at conventional thresholds. vaxed_pct is statistically significant at the \(p<.05\) level and indicates that a one percent increase in an areas vaccination rate corresponds, on average, with a 1.9 decrease in deaths per capita, holding trump2020_percent constant. This result stands even when controlling for an areas density as shown in model 3.

2.4 Assess specific vaccines

Create and add vaccinated percent by state for each of the Pfizer, Modern and Janssen (which is Johnson and Johnson) vaccines.
Use pop2019 for population
Use Series_Complete_Moderna_18Plus, Series_Complete_Janssen_18Plus and Series_Complete_Pfizer_18Plus for the vaccination totals.
Estimate a model in which deaths per capita is a function of all three vaccination rates.
Explain what the results mean, especially in light of the results above for overall vaccination results.
Explain how one would compare the efficacy of the individual vaccines (e.g., whether the Moderna vaccine works better than the Johnson and Johnson vaccine). Many answers would work here.

## Creating vaxed percent variables for different vaccines
df <- df %>% 
  mutate(vaxed_pct_moderna  = series_complete_moderna_18plus/(100*pop2019),
         vaxed_pct_jans  = series_complete_janssen_18plus/(100*pop2019),
         vaxed_pct_pfizer  = series_complete_pfizer_18plus/(100*pop2019))

## Filtering to day
df_day = df %>% 
  filter(submission_date == "10/1/2021")

## Running Model
ols.4 <- lm(deaths_pc ~ vaxed_pct_jans + vaxed_pct_moderna +
             vaxed_pct_pfizer, data = df_day)
summary(ols.4)

## 
## Call:
## lm(formula = deaths_pc ~ vaxed_pct_jans + vaxed_pct_moderna + 
##     vaxed_pct_pfizer, data = df_day)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93880 -0.29403 -0.05261  0.15284  2.13378 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          2.373      0.679   3.496  0.00104 **
## vaxed_pct_jans    -570.304   1174.009  -0.486  0.62938   
## vaxed_pct_moderna -359.090    586.494  -0.612  0.54331   
## vaxed_pct_pfizer  -321.407    333.845  -0.963  0.34060   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5544 on 47 degrees of freedom
## Multiple R-squared:  0.1954, Adjusted R-squared:  0.1441 
## F-statistic: 3.806 on 3 and 47 DF,  p-value: 0.01599

**Lab I OLS Results**

	Model 1	Model 2	Model 3	Model 4
	(1)	(2)	(3)	(4)

trump2020_percent	1.610	0.325	0.418
	(0.676)	(0.851)	(1.131)
	t = 2.382	t = 0.382	t = 0.370

vaxed_pct		-1.921	-1.879
		(0.826)	(0.899)
		t = -2.324	t = -2.089

density			8.206
			(64.687)
			t = 0.127

vaxed_pct_jans				-570.304
				(1,174.009)
				t = -0.486

vaxed_pct_moderna				-359.090
				(586.494)
				t = -0.612

vaxed_pct_pfizer				-321.407
				(333.845)
				t = -0.963

Constant	-0.241	1.591	1.516	2.374
	(0.342)	(0.854)	(1.049)	(0.679)
	t = -0.705	t = 1.864	t = 1.444	t = 3.496


Observations	51	51	51	51
R²	0.104	0.194	0.195	0.195
Residual Std. Error	0.573 (df = 49)	0.549 (df = 48)	0.555 (df = 47)	0.554 (df = 47)
F Statistic	5.676^** (df = 1; 49)	5.795^*** (df = 2; 48)	3.789^** (df = 3; 47)	3.806^** (df = 3; 47)

Note:	p<0.1; p<0.05; p<0.01

The results are no longer statistically significance due to multicollinearity. There is a high degree of correlation between the rates each vaccine were administered which is contributing to the large standard errors. The p-value of the \(F\)-statistic tells us the probability that we would see a result this extreme given all of the coefficients on the vaccines are actually zero, reinforcing our findings above.

This is not necessary for this lab, but to assess the efficacy of one vaccine over another, we could conduct an \(F\) test. Remember, the formula for F-tests is: \[F_{q, N-k} = \frac{(R^2_{Unrestricted}-R^2_{Restricted})\setminus q}{(1-R^2_{Unrestricted}) \setminus (N-k)}\]

As an example, I will evaluate if \(\beta_1 = \beta_2\) in the following model: \(deaths_pc_{it} = \beta_0 + \beta_1jans_{it} + \beta_2moderna_{it} + \beta_3pfizer_{it} + \epsilon_i\) which is ols.4 estimated above that examines the relationship between the rate of vaccination for each vaccine and deaths per capita in state \(i\) and day \(t\). We can do this manually, but library(car) makes it easy. See the code below:

## F Test 
linearHypothesis(ols.4, c("vaxed_pct_jans = vaxed_pct_moderna"))

## Linear hypothesis test
## 
## Hypothesis:
## vaxed_pct_jans - vaxed_pct_moderna = 0
## 
## Model 1: restricted model
## Model 2: deaths_pc ~ vaxed_pct_jans + vaxed_pct_moderna + vaxed_pct_pfizer
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     48 14.451                           
## 2     47 14.445  1 0.0056829 0.0185 0.8924

We can see that the \(p\)-value is not statistically significant, meaning that we cannot reject the null hypothesis that \(\beta_1\) is statistically significantly different than \(\beta_2\).

2.5 Optional Lab 1b

This lab is optional and just for some additional practice.

2.5.1 Load the World Values Survey data from “Data/WV7_small.csv” and the CountryCode data from “Data/Country codes for WVS wave 7.csv”.

The codebook for the World Values Survey data is available at http://www.worldvaluessurvey.org/WVSDocumentationWV7.jsp

The variables in WV_small were created as follows:

Satisfaction with your life from 1 (completely dissatisfied) to 10 (completely satisfied) (V23)
Income: a country-specific scale ranging from 1 (lowest income category) to 10 (highest income category) (V239 in the dataset)
Education: a scale ranging from 1 (no formal education) to 9 (a degree from a university) (V248 in the dataset)
Country: based on V2 in the dataset. See “Country codes for WVS wave 6.csv”
Conservatism: self-identified political conservatism from 1 (most liberal) to 10 (most conservative) (V95 in the data set)
Male: a dummy variable indicating male (V240)
Religious: Indicating how often the individual attends religious services ranging from 1 (almost never) to 7 (more than once a week) (this is a re-coding of V145 that had the polarity reversed, but is harder to interpret)
Marital: Marital status 1 = Married 2 = Living together as married 3 = Divorced 4 = Separated 5 = Widowed 6 = Single
Survey year: year survey taken
Birth year: respondent’s year of birth

## World Values Dataset
wv <- read_csv("Data/WV7_small.csv")

## Country Codes
cc <- read_csv("Data/Country codes for WVS wave 7.csv")

2.5.2 Join the country name from the CountryCode data to the World Values data. The country code is V2 in CountryCode and B_Country in the World Values data. Create two data objects of the countries in and not in the World Values data. Display the first 10 countries for each list.

## Left Join
wv <- wv %>%
  left_join(cc, by = c("B_COUNTRY" = "V2"))

## Showing countries in
countries_in_wv <- tibble(unique(wv$country))
countries_in_wv[1:10,]

## # A tibble: 10 × 1
##    `unique(wv$country)`
##    <chr>               
##  1 Ethiopia            
##  2 Zimbabwe            
##  3 Tajikistan          
##  4 Pakistan            
##  5 Bangladesh          
##  6 Nigeria             
##  7 Myanmar             
##  8 Kyrgyzstan          
##  9 Nicaragua           
## 10 Viet Nam

## Anti Join
countries_out <- cc %>%
  anti_join(wv, by = c("V2" = "B_COUNTRY")) %>%
  filter(V2 > 0)

## Countries not in wv
countries_out %>%
  slice(1:10)

## # A tibble: 10 × 2
##       V2 country       
##    <dbl> <chr>         
##  1     8 Albania       
##  2    12 Algeria       
##  3    16 American Samoa
##  4    24 Angola        
##  5    28 Antigua       
##  6    31 Azerbaijan    
##  7    40 Austria       
##  8    48 Bahrain       
##  9    51 Armenia       
## 10    52 Barbados

2.5.3 Using pipes, calculate the percent immigrant in every country. Show the highest 10 and lowest 10 countries. (Be sure to think through what the variable in the data set means.)

## Recoding Immigrant Variable
wv$Immigrant <- wv$Immigrant-1

## 10 Highest
wv %>%
  group_by(country) %>%
  dplyr::summarize(pct_imm = mean(Immigrant, na.rm=T), n=n())%>%
  arrange(desc(pct_imm)) %>%
  slice(1:10)

## # A tibble: 10 × 3
##    country       pct_imm     n
##    <chr>           <dbl> <int>
##  1 Andorra         0.723  1004
##  2 Macao           0.417  1023
##  3 Hong Kong       0.261  2075
##  4 New Zealand     0.227  1057
##  5 China           0.221  3036
##  6 Cyprus (G)      0.22   1000
##  7 Germany         0.139  1528
##  8 Jordan          0.138  1203
##  9 Greece          0.106  1200
## 10 United States   0.106  2596

## 10 Lowest
## Percent immigrant in each country
wv %>%
  group_by(country) %>%
  dplyr::summarize(pct_imm = mean(Immigrant, na.rm=T), n=n())%>%
  arrange(pct_imm) %>%
  slice(1:10)

## # A tibble: 10 × 3
##    country     pct_imm     n
##    <chr>         <dbl> <int>
##  1 Bangladesh 0         1200
##  2 Iraq       0         1200
##  3 Mexico     0         1739
##  4 Myanmar    0         1200
##  5 Thailand   0         1500
##  6 Viet Nam   0         1200
##  7 Egypt      0.000833  1200
##  8 Indonesia  0.000938  3200
##  9 Romania    0.00159   1257
## 10 Ethiopia   0.00163   1230

2.5.4 We’re now going to give some examples of the kind of data we can calculate. Think about how you would do this in Excel (but don’t actually do it!) and then calculate in R. Calculate the percent of immigrants who are men, by country and show the highest and lowest 10 countries.

## 10 Highest
wv %>%
  filter(Male==1) %>%
  group_by(country) %>%
  dplyr::summarize(pct_imm = mean(Immigrant, na.rm=T), n=n()) %>%
  arrange(desc(pct_imm)) %>%
  slice(1:10)

## # A tibble: 10 × 3
##    country       pct_imm     n
##    <chr>           <dbl> <int>
##  1 Andorra         0.784   495
##  2 Macao           0.443   572
##  3 Hong Kong       0.257  1123
##  4 China           0.223  1667
##  5 New Zealand     0.220   594
##  6 Cyprus (G)      0.191   518
##  7 Jordan          0.154   596
##  8 Germany         0.145   785
##  9 Greece          0.111   637
## 10 United States   0.103  1206

## 10 Lowest
wv %>%
  filter(Male==1) %>%
  group_by(country) %>%
  dplyr::summarize(pct_imm = mean(Immigrant, na.rm=T), n=n()) %>%
  arrange(pct_imm) %>%
  slice(1:10)

## # A tibble: 10 × 3
##    country    pct_imm     n
##    <chr>        <dbl> <int>
##  1 Bangladesh 0         608
##  2 Egypt      0         579
##  3 Iraq       0         592
##  4 Mexico     0         866
##  5 Myanmar    0         599
##  6 Nigeria    0         604
##  7 Thailand   0         793
##  8 Viet Nam   0         655
##  9 Indonesia  0.00114  1754
## 10 Romania    0.00132   760

2.5.5 Group by marital status and country and show the percent of people in each category who are men. Show the results for a country of your choice and briefly discuss what it means when the percentages in each category are not roughly 50 percent. Recall that the coding for the Marital variable is 1 = Married 2 = Living together as married 3 = Divorced 4 = Separated 5 = Widowed 6 = Single.

wv %>%
  group_by(country, Marital) %>%
  dplyr::summarize(male_pct = mean(Male, na.rm=T), n=n()) %>%
  filter(country == "United States")

## # A tibble: 7 × 4
## # Groups:   country [1]
##   country       Marital male_pct     n
##   <chr>           <dbl>    <dbl> <int>
## 1 United States       1    0.431  1296
## 2 United States       2    0.548   197
## 3 United States       3    0.493   278
## 4 United States       4    0.5      54
## 5 United States       5    0.560    84
## 6 United States       6    0.477   677
## 7 United States      NA    0.6      10

2.5.6 Come up with your own alternatives: think about some possible subset of people and some information we have about them and see if you can write code to capture that information.

wv %>%
  group_by(country, Income) %>%
  dplyr::summarize(imm_pct = mean(Immigrant, na.rm=T), n=n()) %>%
  filter(country == "United States")

## # A tibble: 11 × 4
## # Groups:   country [1]
##    country       Income imm_pct     n
##    <chr>          <dbl>   <dbl> <int>
##  1 United States      1  0.106    123
##  2 United States      2  0.072    128
##  3 United States      3  0.0871   268
##  4 United States      4  0.0867   373
##  5 United States      5  0.110    599
##  6 United States      6  0.111    468
##  7 United States      7  0.110    370
##  8 United States      8  0.142    151
##  9 United States      9  0.111     36
## 10 United States     10  0.227     22
## 11 United States     NA  0.136     58