# 2 Lab I: Introduction to R & RStudio

# 2.1 Intro to R

# Author: QSS Ch. 1 script with edits by Mark Richardson & Benjamin Reese
# Date: 08/24/2023
# Purpose: Introduction to R - GOVT 8001 Lab I

#### Arithmetic Operations ####

# R can be used as a calculator: each expression below is evaluated and its
# value is printed to the console. The recorded console output is shown on the
# lines that start with "##".

5 + 3 # addition

## [1] 8

5 - 3 # subtraction

## [1] 2

5 / 3 # division

## [1] 1.666667

5 ^ 3 # exponentiation: 5 raised to the power 3

## [1] 125

5 * (10 - 3) # the parenthesised subtraction is evaluated before multiplying

## [1] 35

sqrt(4) # square root, computed by calling the function sqrt()

## [1] 2

#### Objects ####

# Store the value of an expression in an object with the assignment operator
# `<-`; typing the object's name (or calling print() on it) displays the value.
result <- 5 + 3
result

## [1] 8

print(result)

## [1] 8

# Assigning to an existing name overwrites the value stored there.
result <- 5 - 3
result

## [1] 2

## R is case sensitive so we get an error: `Result` (capital R) has not been
## created yet, so R reports "object 'Result' not found".
## try() still shows that error message, but lets the remainder of the script
## keep running when the whole file is executed at once.
try(Result)

# Objects can hold character strings as well as numbers.
kosuke <- "instructor"
kosuke

## [1] "instructor"

kosuke <- "instructor and author"
kosuke

## [1] "instructor and author"

# `Result` and `result` are two separate objects with independent values.
Result <- 5
Result + 2

## [1] 7

result

## [1] 2

# class() reports what kind of object a name refers to.
class(result)

## [1] "numeric"

Result

## [1] 5

class(Result)

## [1] "numeric"

# Functions are objects too.
class(sqrt)

## [1] "function"

# The sum of a single number is that number itself.
sum(result)

## [1] 2

sum(Result)

## [1] 5

#### Vectors ####

# Creating vectors

# c() combines its arguments into a single vector.
world.pop <- c(2525779, 3026003, 3691173, 4449049, 5320817, 6127700, 6916183)
world.pop

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183

# c() also joins existing vectors end to end, in the order they are given.
pop.first <- c(2525779, 3026003, 3691173)
pop.second <- c(4449049, 5320817, 6127700, 6916183)
pop.all <- c(pop.first, pop.second)
pop.all

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183

# Accessing elements of a vector

world.pop[2] # the second element

## [1] 3026003

world.pop[c(2, 4)] # the second and fourth elements

## [1] 3026003 4449049

world.pop[c(4, 2)] # the same elements, returned in the order requested

## [1] 4449049 3026003

world.pop[-3] # a negative index drops that element: everything but the third

## [1] 2525779 3026003 4449049 5320817 6127700 6916183

# Arithmetic operations on a vector
# Arithmetic with a single number is applied to every element of the vector.

pop.million <- world.pop / 1000
pop.million

## [1] 2525.779 3026.003 3691.173 4449.049 5320.817 6127.700 6916.183

# Each value expressed relative to the first element.
pop.rate <- world.pop / world.pop[1]
pop.rate

## [1] 1.000000 1.198047 1.461400 1.761456 2.106604 2.426063 2.738238

# Change between consecutive elements: elements 2-7 minus elements 1-6
# (world.pop[-1] drops the first element, world.pop[-7] drops the seventh).
pop.increase <- world.pop[-1] - world.pop[-7]
pop.increase

## [1] 500224 665170 757876 871768 806883 788483

# The same change as a percentage of the earlier value.
percent.increase <- (pop.increase / world.pop[-7]) * 100
percent.increase

## [1] 19.80474 21.98180 20.53212 19.59448 15.16464 12.86752

# round() returns rounded values; percent.increase itself is left unchanged.

round(percent.increase)

## [1] 20 22 21 20 15 13

# Individual elements can also be replaced by assignment. This overwrites the
# first two elements in place (round() above is the better way to round).
percent.increase[c(1, 2)] <- c(20, 22)
percent.increase

## [1] 20.00000 22.00000 20.53212 19.59448 15.16464 12.86752

#### Functions ####

# A function takes one or more inputs (arguments) and returns an output.

length(world.pop) # number of elements

## [1] 7

min(world.pop) # smallest value

## [1] 2525779

max(world.pop) # largest value

## [1] 6916183

range(world.pop) # smallest and largest value together

## [1] 2525779 6916183

mean(world.pop) # arithmetic mean

## [1] 4579529

# The mean computed by hand gives the same result as mean().
sum(world.pop) / length(world.pop)

## [1] 4579529

# seq() builds a regular sequence running `from` a start value `to` an end
# value in steps of `by`.
year <- seq(from = 1950, to = 2010, by = 10)
year

## [1] 1950 1960 1970 1980 1990 2000 2010

# Named arguments can be supplied in any order.
seq(to = 2010, by = 10, from = 1950)

## [1] 1950 1960 1970 1980 1990 2000 2010

# Unnamed arguments are matched by position: from, to, by.
seq(1950, 2010, 10)

## [1] 1950 1960 1970 1980 1990 2000 2010

# A negative `by` produces a decreasing sequence.
seq(2010, 1950, -10)

## [1] 2010 2000 1990 1980 1970 1960 1950

seq(from = 2010, to = 1950, by = -10)

## [1] 2010 2000 1990 1980 1970 1960 1950

# The colon operator is a shortcut for sequences that change by 1.
2008:2012

## [1] 2008 2009 2010 2011 2012

2012:2008

## [1] 2012 2011 2010 2009 2008

# names() returns the element names of a vector; it is NULL until names are set.
names(world.pop)

## NULL

# Assigning to names() labels each element; the years are stored as character
# strings, as the quoted output below shows.
names(world.pop) <- year
names(world.pop)

## [1] "1950" "1960" "1970" "1980" "1990" "2000" "2010"

# A named vector prints with its names above the values.
world.pop

##    1950    1960    1970    1980    1990    2000    2010 
## 2525779 3026003 3691173 4449049 5320817 6127700 6916183

#### Saving data and loading data ####

# Create a data set (Table 1.2)
# data.frame() combines equal-length vectors into a table with one column per
# vector. tibble(), from the tibble package, is the tidyverse equivalent of
# data.frame().
# The row labels 1950, 1960, ... in the printed rows further below match the
# element names given to world.pop earlier in the script.
UNpop <- data.frame(world.pop = world.pop,
                    year = year)

# Get basic information about the data set

names(UNpop) # column names

## [1] "world.pop" "year"

nrow(UNpop) # number of rows

## [1] 7

ncol(UNpop) # number of columns

## [1] 2

dim(UNpop) # number of rows, then number of columns

## [1] 7 2

summary(UNpop) # summary statistics for every column

##    world.pop            year     
##  Min.   :2525779   Min.   :1950  
##  1st Qu.:3358588   1st Qu.:1965  
##  Median :4449049   Median :1980  
##  Mean   :4579529   Mean   :1980  
##  3rd Qu.:5724258   3rd Qu.:1995  
##  Max.   :6916183   Max.   :2010

# `$` extracts a single column as a vector.
UNpop$world.pop

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183

# Square brackets take [rows, columns]; leaving one side empty selects all.
UNpop[, "world.pop"] # extract the column called "world.pop"

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183

UNpop[c(1, 2, 3, 5), ]   # extract rows 1, 2, 3, and 5 (and all columns)

##      world.pop year
## 1950   2525779 1950
## 1960   3026003 1960
## 1970   3691173 1970
## 1990   5320817 1990

UNpop[1:3, "year"]   # extract the first three rows of the "year" column

## [1] 1950 1960 1970

# Every other element of the world.pop column: positions 1, 3, 5, and 7.
UNpop$world.pop[seq(from = 1, to = nrow(UNpop), by = 2)]

## [1] 2525779 3691173 5320817 6916183

# File paths and working directory

# Print the current working directory (useful for confirming it after it has
# been changed). The path shown below is specific to the machine that produced
# this output and will differ on yours.
getwd()

## [1] "C:/Users/17176/Documents/GOVT701"

#### Getting Help: mean() example ####

# Append a missing value (NA) to the population figures. This replaces the
# world.pop vector created earlier in the script.
world.pop <- c(UNpop$world.pop, NA)
world.pop

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183      NA

# With a missing value present, mean() returns NA.
mean(world.pop)

## [1] NA

## Use a question mark before a function name to open its help documentation
?mean

# na.rm = TRUE removes missing values before the mean is computed.
mean(world.pop, na.rm = TRUE)

## [1] 4579529

# 2.2 Intro to `library(tidyverse)`

# Packages

## A package must be installed once, and then loaded with library() in every
## new R session. The install commands are commented out so that they are not
## re-run each time the script is executed.

## install.packages("devtools") # install the package
library(devtools) # load the package

## install a package from github
## devtools::install_github("kosukeimai/qss-package", build_vignettes = TRUE)
library(qss) ## loading in qss
## You may need to allow R to update/install additional packages

## Loading in tidyverse
## install.packages("tidyverse")
library(tidyverse)

## Loading in a Dataset
## This loads the UNpop data set that ships with the qss package, replacing the
## UNpop object built by hand earlier in the script.
data(UNpop, package = "qss")

## Number of Rows and Columns - Base R
dim(UNpop)

## [1] 7 2

## Number of observations, number of variables, and initial observations - tidyverse
glimpse(UNpop)

## Rows: 7
## Columns: 2
## $ year      <int> 1950, 1960, 1970, 1980, 1990, 2000, 2010
## $ world.pop <int> 2525779, 3026003, 3691173, 4449049, 5320817, 6127700, 6916183

## First 6 rows
head(UNpop)

##   year world.pop
## 1 1950   2525779
## 2 1960   3026003
## 3 1970   3691173
## 4 1980   4449049
## 5 1990   5320817
## 6 2000   6127700

## Last 6 Rows
tail(UNpop)

##   year world.pop
## 2 1960   3026003
## 3 1970   3691173
## 4 1980   4449049
## 5 1990   5320817
## 6 2000   6127700
## 7 2010   6916183

## Selecting A Variable - Base R
UNpop$world.pop

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183

## subset all rows for the column called "world.pop" from the UNpop data
UNpop[, "world.pop"]

## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183

## subset the first three rows (and all columns)
UNpop[c(1, 2, 3),]

##   year world.pop
## 1 1950   2525779
## 2 1960   3026003
## 3 1970   3691173

## subset the first three rows of the "year" column
UNpop[1:3, "year"]

## [1] 1950 1960 1970

## Now with tidyverse

## Subset the first three rows of UNpop with tidyverse
## slice() selects rows by position
slice(UNpop, 1:3)

##   year world.pop
## 1 1950   2525779
## 2 1960   3026003
## 3 1970   3691173

## Extract/subset the world.pop variable (column)
## select() keeps the named columns; the result is still a table, not a vector
select(UNpop, world.pop)

##   world.pop
## 1   2525779
## 2   3026003
## 3   3691173
## 4   4449049
## 5   5320817
## 6   6127700
## 7   6916183

## Base R subset the first three rows of the year variable
UNpop[1:3, "year"]

## [1] 1950 1960 1970

## or in tidyverse, combining slice() and select()
## The nested call is evaluated from the inside out: slice() first, then select()
select(slice(UNpop, 1:3), year)

##   year
## 1 1950
## 2 1960
## 3 1970

## Basic Data Wrangling with the tidyverse using pipes (i.e., %>%)
## The pipe passes the result on its left to the function on its right, so a
## chain of steps reads top to bottom instead of inside out.

UNpop %>% # take the UNpop data we have loaded, and then...
  slice(1:3) %>% # subset the first three rows, and then...
  select(year) # subset the year column

##   year
## 1 1950
## 2 1960
## 3 1970

UNpop %>%
  slice(seq(1, n(), by = 2)) %>% # every other row: a sequence from 1 to n(), the number of rows
  select(world.pop)

##   world.pop
## 1   2525779
## 2   3691173
## 3   5320817
## 4   6916183

pop.1970 <- UNpop %>% # take the UNpop data and then....
  filter(year == 1970) %>% # subset rows where the year variable is equal to 1970
  select(world.pop) %>% # subset just the world.pop column
  pull() # return the column as a vector rather than a data frame

## Print the vector to the console to see it
print(pop.1970)

## [1] 3691173

UNpop.mill <- UNpop %>% # create a new data frame from UNpop
  mutate(world.pop.mill = world.pop / 1000) %>% # create a new variable, world.pop.mill
  select(-world.pop) # drop the original world.pop column

## Adding a variable with if_else
## after.1980 is 1 when year is 1980 or later (1980 itself is included), else 0
UNpop.mill <- UNpop.mill %>%
  mutate(after.1980 = if_else(year >= 1980, 1, 0))

## Creating a vector of the years of interest
specific.years <- c(1950, 1980, 2000)

## Adding a variable with if_else and %in%
## year.of.interest is 1 when year matches any value in specific.years, else 0
UNpop.mill <- UNpop.mill %>%
  mutate(year.of.interest = if_else(year %in% specific.years, 1, 0))

summary(UNpop.mill)

##       year      world.pop.mill   after.1980     year.of.interest
##  Min.   :1950   Min.   :2526   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1965   1st Qu.:3359   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1980   Median :4449   Median :1.0000   Median :0.0000  
##  Mean   :1980   Mean   :4580   Mean   :0.5714   Mean   :0.4286  
##  3rd Qu.:1995   3rd Qu.:5724   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :2010   Max.   :6916   Max.   :1.0000   Max.   :1.0000

mean(UNpop.mill$world.pop.mill)

## [1] 4579.529

## Add a row in which the value of every column is NA
UNpop.mill.wNAs <- UNpop.mill %>%
  add_row(year = NA, world.pop.mill = NA,
          after.1980 = NA,
          year.of.interest = NA)
## Take the mean of world.pop.mill (returns NA)
mean(UNpop.mill.wNAs$world.pop.mill)

## [1] NA

## Take the mean of world.pop.mill (ignores the NA)
mean(UNpop.mill.wNAs$world.pop.mill, na.rm = TRUE)

## [1] 4579.529

## Other Summary Statistics with tidyverse
## summarize() collapses the data to one row of summary values
UNpop.mill %>%
  summarize(mean.pop = mean(world.pop.mill),
            median.pop = median(world.pop.mill))

##   mean.pop median.pop
## 1 4579.529   4449.049

UNpop.mill %>%
  group_by(after.1980) %>% # create subset group for each value of after.1980
  summarize(mean.pop = mean(world.pop.mill)) # calculate mean for each group

## # A tibble: 2 × 2
##   after.1980 mean.pop
##        <dbl>    <dbl>
## 1          0    3081.
## 2          1    5703.