2 Lab I: Introduction to R & RStudio

2.1 Intro to R

# Author: QSS Ch. 1 script with edits by Mark Richardson & Benjamin Reese
# Date: 08/24/2023
# Purpose: Introduction to R - GOVT 8001 Lab I

#### Arithmetic Operations ####
# R evaluates each expression and prints the result. Lines that start with
# "##" record the output printed for the expression directly above them.

5 + 3 # addition
## [1] 8
5 - 3 # subtraction
## [1] 2
5 / 3 # division
## [1] 1.666667
5 ^ 3 # exponentiation
## [1] 125
5 * (10 - 3) # the parenthesized difference is evaluated first
## [1] 35
sqrt(4) # square root, computed by calling a function
## [1] 2
#### Objects ####
# Values are stored in named objects with the assignment operator `<-`.
# Typing an object's name (or calling print() on it) displays its value.

result <- 5 + 3
result
## [1] 8
print(result)
## [1] 8
# Assigning to an existing name overwrites the previous value.
result <- 5 - 3
result
## [1] 2
## R is case sensitive so we get an error: `Result` has not been created yet.
## try() still prints the error message, but keeps the rest of the script
## running when the whole file is sourced or run in one go.
try(Result)
## Error in try(Result) : object 'Result' not found
# Objects can hold character strings as well as numbers.
kosuke <- "instructor"
kosuke
## [1] "instructor"
kosuke <- "instructor and author"
kosuke
## [1] "instructor and author"
# `Result` and `result` are two separate objects with separate values.
Result <- 5
Result + 2
## [1] 7
result
## [1] 2
# class() reports what kind of object a name refers to.
class(result)
## [1] "numeric"
Result
## [1] 5
class(Result)
## [1] "numeric"
class(sqrt)
## [1] "function"
# The sum of a single number is that number.
sum(result)
## [1] 2
sum(Result)
## [1] 5
#### Vectors ####

# Creating vectors
# c() combines individual values, or whole vectors, into a single vector.

world.pop <- c(2525779, 3026003, 3691173, 4449049, 5320817, 6127700, 6916183)
world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
pop.first <- c(2525779, 3026003, 3691173)
pop.second <- c(4449049, 5320817, 6127700, 6916183)
pop.all <- c(pop.first, pop.second) # joining the two pieces reproduces world.pop
pop.all
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
# Accessing elements of a vector
# Square brackets index by position; a negative index drops that position.

world.pop[2] # the second element
## [1] 3026003
world.pop[c(2, 4)] # the second and fourth elements
## [1] 3026003 4449049
world.pop[c(4, 2)] # same elements, returned in the order requested
## [1] 4449049 3026003
world.pop[-3] # everything except the third element
## [1] 2525779 3026003 4449049 5320817 6127700 6916183
# Arithmetic operations on a vector
# Arithmetic is applied to every element of the vector.

# NOTE(review): the name suggests world.pop is recorded in thousands, so that
# dividing by 1000 gives millions -- confirm against the data source.
pop.million <- world.pop / 1000
pop.million
## [1] 2525.779 3026.003 3691.173 4449.049 5320.817 6127.700 6916.183
pop.rate <- world.pop / world.pop[1] # each value relative to the first one
pop.rate
## [1] 1.000000 1.198047 1.461400 1.761456 2.106604 2.426063 2.738238
# world.pop has 7 elements, so [-1] drops the first and [-7] drops the last.
# The two length-6 results pair every value with the one before it, and their
# difference is the change between consecutive elements.
pop.increase <- world.pop[-1] - world.pop[-7]
pop.increase
## [1] 500224 665170 757876 871768 806883 788483
# Express each change as a percentage of the earlier of the two values.
percent.increase <- (pop.increase / world.pop[-7]) * 100
percent.increase
## [1] 19.80474 21.98180 20.53212 19.59448 15.16464 12.86752
# Can replace individual elements (better way is to use round())
# round() only prints rounded values here; percent.increase is not modified.
# The indexed assignment after it overwrites just the first two elements.

round(percent.increase)
## [1] 20 22 21 20 15 13
percent.increase[c(1, 2)] <- c(20, 22)
percent.increase
## [1] 20.00000 22.00000 20.53212 19.59448 15.16464 12.86752
#### Functions ####
# A function takes its inputs (arguments) in parentheses and returns a result.

length(world.pop)  # number of elements
## [1] 7
min(world.pop)     # smallest value
## [1] 2525779
max(world.pop)     # largest value
## [1] 6916183
range(world.pop)   # smallest and largest values together
## [1] 2525779 6916183
mean(world.pop)    # arithmetic mean
## [1] 4579529
sum(world.pop) / length(world.pop) # the same mean, computed by hand
## [1] 4579529
# seq() builds a regular sequence; here, one value every 10 years.
year <- seq(from = 1950, to = 2010, by = 10)
year
## [1] 1950 1960 1970 1980 1990 2000 2010
# Named arguments can be given in any order...
seq(to = 2010, by = 10, from = 1950)
## [1] 1950 1960 1970 1980 1990 2000 2010
# ...while unnamed arguments are matched by position.
seq(1950, 2010, 10)
## [1] 1950 1960 1970 1980 1990 2000 2010
# A negative step counts down.
seq(2010, 1950, -10)
## [1] 2010 2000 1990 1980 1970 1960 1950
seq(from = 2010, to = 1950, by = -10)
## [1] 2010 2000 1990 1980 1970 1960 1950
# The colon operator gives a sequence in steps of 1, upward or downward.
2008:2012
## [1] 2008 2009 2010 2011 2012
2012:2008
## [1] 2012 2011 2010 2009 2008
# names() reads or sets the labels attached to a vector's elements.
# A vector that has no labels returns NULL.
names(world.pop) 
## NULL
names(world.pop) <- year # the numeric years are stored as character labels
names(world.pop)
## [1] "1950" "1960" "1970" "1980" "1990" "2000" "2010"
world.pop # printing now shows each label above its value
##    1950    1960    1970    1980    1990    2000    2010 
## 2525779 3026003 3691173 4449049 5320817 6127700 6916183
#### Saving data and loading data ####
 
# Create a data set (Table 1.2)
# tibble() from the tibble package is the tidyverse equivalent of data.frame()
# world.pop carries the years as element names at this point; they show up as
# the row labels of the data frame (see the printed rows further down).
UNpop <- data.frame(world.pop = world.pop,
                    year = year)

# Get basic information about the data set

names(UNpop) # column names
## [1] "world.pop" "year"
nrow(UNpop) # number of rows
## [1] 7
ncol(UNpop) # number of columns
## [1] 2
dim(UNpop) # rows and columns together
## [1] 7 2
summary(UNpop) # summary statistics for every column
##    world.pop            year     
##  Min.   :2525779   Min.   :1950  
##  1st Qu.:3358588   1st Qu.:1965  
##  Median :4449049   Median :1980  
##  Mean   :4579529   Mean   :1980  
##  3rd Qu.:5724258   3rd Qu.:1995  
##  Max.   :6916183   Max.   :2010
# `$` extracts a single column as a vector.
UNpop$world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
# Square brackets take [rows, columns]; leaving one side empty keeps all of it.
UNpop[, "world.pop"] # extract the column called "world.pop"
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
UNpop[c(1, 2, 3, 5), ]   # extract rows 1, 2, 3, and 5 (and all columns)
##      world.pop year
## 1950   2525779 1950
## 1960   3026003 1960
## 1970   3691173 1970
## 1990   5320817 1990
UNpop[1:3, "year"]   # extract the first three rows of the "year" column
## [1] 1950 1960 1970
# Every other element of the column: positions 1, 3, 5, ... up to the last row.
UNpop$world.pop[seq(from = 1, to = nrow(UNpop), by = 2)]
## [1] 2525779 3691173 5320817 6916183
# File paths and working directory

getwd() # print the current working directory (the path differs by machine)
## [1] "C:/Users/17176/Documents/GOVT701"
#### Getting Help: mean() example ####

# Append a missing value (NA) to the population figures to see how mean()
# behaves when the data are incomplete.
world.pop <- c(UNpop$world.pop, NA)
world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183      NA
mean(world.pop) # a single NA makes the result NA
## [1] NA
## Use Question Marks to see help documentation
?mean

# na.rm = TRUE drops the missing value before the mean is computed.
mean(world.pop, na.rm = TRUE)
## [1] 4579529

2.2 Intro to library(tidyverse)

# Packages
# A package is installed once (the commented-out install lines below) and then
# loaded with library() in every session that uses it.

## install.packages("devtools") # install the package
library(devtools) # load the package

## install a package from github
## devtools::install_github("kosukeimai/qss-package", build_vignettes = TRUE)
library(qss) ## loading in qss
## You may need to allow R to update/install additional packages

## Loading in tidyverse
## install.packages("tidyverse")
## The tidyverse functions used below (glimpse(), slice(), select(), %>%, ...)
## become available once this is loaded.
library(tidyverse)

## Loading in a Dataset
## This replaces the UNpop object in the workspace with the copy shipped in
## the qss package.
data(UNpop, package = "qss")

## Number of Rows and Columns - Base R
dim(UNpop)
## [1] 7 2
## Number of observations, number of variables, and initial observations - tidyverse
glimpse(UNpop)
## Rows: 7
## Columns: 2
## $ year      <int> 1950, 1960, 1970, 1980, 1990, 2000, 2010
## $ world.pop <int> 2525779, 3026003, 3691173, 4449049, 5320817, 6127700, 6916183
## First 6 rows
head(UNpop)
##   year world.pop
## 1 1950   2525779
## 2 1960   3026003
## 3 1970   3691173
## 4 1980   4449049
## 5 1990   5320817
## 6 2000   6127700
## Last 6 Rows
tail(UNpop)
##   year world.pop
## 2 1960   3026003
## 3 1970   3691173
## 4 1980   4449049
## 5 1990   5320817
## 6 2000   6127700
## 7 2010   6916183
## Selecting A Variable - Base R
UNpop$world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
## subset all rows for the column called "world.pop" from the UNpop data
UNpop[, "world.pop"]
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
## subset the first three rows (and all columns)
UNpop[c(1, 2, 3),]
##   year world.pop
## 1 1950   2525779
## 2 1960   3026003
## 3 1970   3691173
## subset the first three rows of the "year" column
UNpop[1:3, "year"]
## [1] 1950 1960 1970
## Now with tidyverse

## Subset the first three rows of UNpop with tidyverse
## slice() picks rows by position
slice(UNpop, 1:3)
##   year world.pop
## 1 1950   2525779
## 2 1960   3026003
## 3 1970   3691173
## Extract/subset the world.pop variable (column)
## select() picks columns by name; the result is printed as a one-column
## table rather than a bare vector
select(UNpop, world.pop)
##   world.pop
## 1   2525779
## 2   3026003
## 3   3691173
## 4   4449049
## 5   5320817
## 6   6127700
## 7   6916183
## Base R subset the first three rows of the year variable
UNpop[1:3, "year"]
## [1] 1950 1960 1970
## or in tidyverse, combining slice() and select()
## The inner call runs first: slice the rows, then select the column
select(slice(UNpop, 1:3), year)
##   year
## 1 1950
## 2 1960
## 3 1970
## Basic Data Wrangling with the tidyverse using pipes (i.e., %>%)
## The pipe passes the result on its left to the function on its right, so a
## chain reads top to bottom instead of inside out.

UNpop %>% # take the UNpop data we have loaded, and then...
  slice(1:3) %>% # subset the first three rows, and then...
  select(year) # subset the year column
##   year
## 1 1950
## 2 1960
## 3 1970
## Every other row: inside slice(), n() is the number of rows
UNpop %>%
  slice(seq(1, n(), by = 2)) %>% # using a sequence from 1 to n()
  select(world.pop)
##   world.pop
## 1   2525779
## 2   3691173
## 3   5320817
## 4   6916183
pop.1970 <- UNpop %>% # take the UNpop data and then....
  filter(year == 1970) %>% # subset rows where the year variable is equal to 1970
  select(world.pop) %>% # subset just the world.pop column
  pull() # return a vector, not a data frame

## Print the vector to the console to see it
print(pop.1970)
## [1] 3691173
UNpop.mill <- UNpop %>% # create a new data frame from UNpop
  mutate(world.pop.mill = world.pop / 1000) %>% # create a new variable, world.pop.mill
  select(-world.pop) # drop the original world.pop column

## Adding a variable with if_else
## after.1980 is 1 for the years 1980 and later, and 0 for earlier years
UNpop.mill <- UNpop.mill %>%
  mutate(after.1980 = if_else(year >= 1980, 1, 0))

## Creating a vector of the years of interest
specific.years <- c(1950, 1980, 2000)

## Adding a variable with if_else and %in%
## year.of.interest is 1 when the year appears in specific.years, otherwise 0
UNpop.mill <- UNpop.mill %>%
  mutate(year.of.interest = if_else(year %in% specific.years, 1, 0))

summary(UNpop.mill)
##       year      world.pop.mill   after.1980     year.of.interest
##  Min.   :1950   Min.   :2526   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1965   1st Qu.:3359   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1980   Median :4449   Median :1.0000   Median :0.0000  
##  Mean   :1980   Mean   :4580   Mean   :0.5714   Mean   :0.4286  
##  3rd Qu.:1995   3rd Qu.:5724   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :2010   Max.   :6916   Max.   :1.0000   Max.   :1.0000
mean(UNpop.mill$world.pop.mill)
## [1] 4579.529
## Add a row where the values for all columns are NA
UNpop.mill.wNAs <- UNpop.mill %>%
  add_row(year = NA, world.pop.mill = NA,
          after.1980 = NA,
          year.of.interest = NA)
## Take the mean of world.pop.mill (returns NA)
mean(UNpop.mill.wNAs$world.pop.mill)
## [1] NA
## Take the mean of world.pop.mill (ignores the NA)
mean(UNpop.mill.wNAs$world.pop.mill, na.rm = TRUE)
## [1] 4579.529
## Other Summary Statistics with tidyverse
## summarize() collapses the data to one row holding the requested statistics
UNpop.mill %>%
  summarize(mean.pop = mean(world.pop.mill),
            median.pop = median(world.pop.mill))
##   mean.pop median.pop
## 1 4579.529   4449.049
UNpop.mill %>%
  group_by(after.1980) %>% # create subset group for each value of after.1980
  summarize(mean.pop = mean(world.pop.mill)) # calculate mean for each group
## # A tibble: 2 × 2
##   after.1980 mean.pop
##        <dbl>    <dbl>
## 1          0    3081.
## 2          1    5703.