# Intro to R
# Author: QSS Ch. 1 script with edits by Mark Richardson & Benjamin Reese
# Date: 08/24/2023
# Purpose: Introduction to R - GOVT 8001 Lab I
#### Arithmetic Operations ####
# R can be used as a calculator: an expression run at the console prints
# its value. Lines beginning with "##" are the recorded console output.
5 + 3
## [1] 8
# NOTE(review): the five outputs below have no matching command in this
# script -- the commands appear to have been dropped; restore them from the
# original lab materials or remove the orphaned output.
## [1] 2
## [1] 1.666667
## [1] 125
## [1] 35
## [1] 2
#### Objects ####
# Store the value of an expression in an object with the assignment
# operator `<-`, then print the object by evaluating its name.
result <- 5 + 3
result
## [1] 8
# NOTE(review): the next two outputs have no matching command in this script.
## [1] 8
## [1] 2
## R is case sensitive so we get an error: `Result` is not the object `result`.
# Wrapped in try() so the error message is still displayed, but the script
# keeps running when executed with source() or Rscript rather than line by line.
try(Result)
# Objects can hold character strings as well as numbers.
kosuke <- "instructor"
kosuke
## [1] "instructor"
# Assigning to an existing name overwrites the previous value.
kosuke <- "instructor and author"
kosuke
## [1] "instructor and author"
# NOTE(review): the outputs below have no matching command in this script.
## [1] 7
## [1] 2
## [1] "numeric"
## [1] 5
## [1] "numeric"
## [1] "function"
## [1] 2
## [1] 5
#### Vectors ####
# Creating vectors
# c() combines its arguments into a single vector.
world.pop <- c(2525779, 3026003, 3691173, 4449049, 5320817, 6127700, 6916183)
world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
# c() also concatenates existing vectors; pop.all ends up equal to world.pop.
pop.first <- c(2525779, 3026003, 3691173)
pop.second <- c(4449049, 5320817, 6127700, 6916183)
pop.all <- c(pop.first, pop.second)
pop.all
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
# Accessing elements of a vector
# Square brackets select elements by position; positions start at 1.
world.pop[2]
## [1] 3026003
# NOTE(review): the next three outputs have no matching command in this script.
## [1] 3026003 4449049
## [1] 4449049 3026003
## [1] 2525779 3026003 4449049 5320817 6127700 6916183
# Arithmetic operations on a vector
# Arithmetic with a single number is applied to every element.
# NOTE(review): the name implies millions, which holds only if world.pop is
# recorded in thousands -- confirm against the data source.
pop.million <- world.pop / 1000
pop.million
## [1] 2525.779 3026.003 3691.173 4449.049 5320.817 6127.700 6916.183
# Ratio of every value to the first element, so the first ratio is 1.
pop.rate <- world.pop / world.pop[1]
pop.rate
## [1] 1.000000 1.198047 1.461400 1.761456 2.106604 2.426063 2.738238
# A negative index drops that element: world.pop[-1] omits the first value and
# world.pop[-7] omits the seventh, so this is each value minus the one before it.
pop.increase <- world.pop[-1] - world.pop[-7]
pop.increase
## [1] 500224 665170 757876 871768 806883 788483
# Express each increase as a percentage of the earlier value.
percent.increase <- (pop.increase / world.pop[-7]) * 100
percent.increase
## [1] 19.80474 21.98180 20.53212 19.59448 15.16464 12.86752
# Can replace individual elements (better way is to use round())
# round() returns rounded values without modifying percent.increase.
round(percent.increase)
## [1] 20 22 21 20 15 13
# Assigning into an index overwrites only the first two elements in place.
percent.increase[c(1, 2)] <- c(20, 22)
percent.increase
## [1] 20.00000 22.00000 20.53212 19.59448 15.16464 12.86752
#### Functions ####
# A function takes inputs (arguments) and returns an output.
# length() returns the number of elements in a vector.
length(world.pop)
## [1] 7
# NOTE(review): the next four outputs have no matching command in this script.
## [1] 2525779
## [1] 6916183
## [1] 2525779 6916183
## [1] 4579529
# The mean computed by hand: total divided by the number of elements.
sum(world.pop) / length(world.pop)
## [1] 4579529
# seq() builds a regular sequence from `from` to `to` in steps of `by`.
year <- seq(from = 1950, to = 2010, by = 10)
year
## [1] 1950 1960 1970 1980 1990 2000 2010
# When arguments are named, the order in which they are given does not matter.
seq(to = 2010, by = 10, from = 1950)
## [1] 1950 1960 1970 1980 1990 2000 2010
# NOTE(review): the next two outputs have no matching command in this script.
## [1] 1950 1960 1970 1980 1990 2000 2010
## [1] 2010 2000 1990 1980 1970 1960 1950
# A negative `by` produces a decreasing sequence.
seq(from = 2010, to = 1950, by = -10)
## [1] 2010 2000 1990 1980 1970 1960 1950
# NOTE(review): the next three outputs have no matching command in this script.
## [1] 2008 2009 2010 2011 2012
## [1] 2012 2011 2010 2009 2008
## NULL
# Attach the years as element names; names() then returns them as strings.
names(world.pop) <- year
names(world.pop)
## [1] "1950" "1960" "1970" "1980" "1990" "2000" "2010"
# NOTE(review): the output below has no matching command in this script.
## 1950 1960 1970 1980 1990 2000 2010
## 2525779 3026003 3691173 4449049 5320817 6127700 6916183
#### Saving data and loading data ####
# Create a data set (Table 1.2)
# tibble(), from the tibble package, is the tidyverse equivalent of data.frame()
# world.pop carries the year names assigned earlier, which is why the rows
# of UNpop print labelled by year below.
UNpop <- data.frame(world.pop = world.pop,
year = year)
# Get basic information about the data set
names(UNpop)
## [1] "world.pop" "year"
# NOTE(review): the outputs from here down to the next command have no
# matching command in this script.
## [1] 7
## [1] 2
## [1] 7 2
## world.pop year
## Min. :2525779 Min. :1950
## 1st Qu.:3358588 1st Qu.:1965
## Median :4449049 Median :1980
## Mean :4579529 Mean :1980
## 3rd Qu.:5724258 3rd Qu.:1995
## Max. :6916183 Max. :2010
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
# Data frames are indexed as [rows, columns]; leaving one side empty selects all.
UNpop[, "world.pop"] # extract the column called "world.pop"
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
UNpop[c(1, 2, 3, 5), ] # extract rows 1, 2, 3, and 5 (and all columns)
## world.pop year
## 1950 2525779 1950
## 1960 3026003 1960
## 1970 3691173 1970
## 1990 5320817 1990
UNpop[1:3, "year"] # extract the first three rows of the "year" column
## [1] 1950 1960 1970
# Take every other element of world.pop: positions 1, 3, 5, ... up to the
# number of rows.
UNpop$world.pop[seq(from = 1, to = nrow(UNpop), by = 2)]
## [1] 2525779 3691173 5320817 6916183
# File paths and working directory
getwd() # print the current working directory
## [1] "C:/Users/17176/Documents/GOVT701"
#### Getting Help: mean() example ####
# Append a missing value (NA) to the population vector; this overwrites the
# earlier world.pop object.
world.pop <- c(UNpop$world.pop, NA)
world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183 NA
# NOTE(review): the output below has no matching command in this script.
## [1] NA
## Use Question Marks to see help documentation
?mean
# na.rm = TRUE drops missing values before the mean is computed.
mean(world.pop, na.rm = TRUE)
## [1] 4579529
#### Intro to the tidyverse ####
# Packages
## install.packages("devtools") # install the package
library(devtools) # load the package
## install a package from github
## devtools::install_github("kosukeimai/qss-package", build_vignettes = TRUE)
library(qss) ## loading in qss
## You may need to allow R to update/install additional packages
## Loading in tidyverse
## install.packages("tidyverse")
library(tidyverse)
## Loading in a Dataset
## data() loads the UNpop data set shipped with the qss package; this replaces
## any UNpop object created earlier in the session.
data(UNpop, package = "qss")
## Number of Rows and Columns - Base R
dim(UNpop)
## [1] 7 2
## Number of observations, number of variables, and initial observations - tidyverse
glimpse(UNpop)
## Rows: 7
## Columns: 2
## $ year <int> 1950, 1960, 1970, 1980, 1990, 2000, 2010
## $ world.pop <int> 2525779, 3026003, 3691173, 4449049, 5320817, 6127700, 6916183
## First 6 rows
head(UNpop)
## year world.pop
## 1 1950 2525779
## 2 1960 3026003
## 3 1970 3691173
## 4 1980 4449049
## 5 1990 5320817
## 6 2000 6127700
## Last 6 Rows
tail(UNpop)
## year world.pop
## 2 1960 3026003
## 3 1970 3691173
## 4 1980 4449049
## 5 1990 5320817
## 6 2000 6127700
## 7 2010 6916183
## Selecting A Variable - Base R
## `$` extracts a single column as a vector.
UNpop$world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
## subset all rows for the column called "world.pop" from the UNpop data
UNpop[, "world.pop"]
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
## subset the first three rows (and all columns)
UNpop[c(1, 2, 3),]
## year world.pop
## 1 1950 2525779
## 2 1960 3026003
## 3 1970 3691173
## subset the first three rows of the "year" column
UNpop[1:3, "year"]
## [1] 1950 1960 1970
## Now with tidyverse
## Subset the first three rows of UNpop with tidyverse
## slice() selects rows by position.
slice(UNpop, 1:3)
## year world.pop
## 1 1950 2525779
## 2 1960 3026003
## 3 1970 3691173
## Extract/subset the world.pop variable (column)
## Unlike `$`, select() returns a data frame (here with one column), not a vector.
select(UNpop, world.pop)
## world.pop
## 1 2525779
## 2 3026003
## 3 3691173
## 4 4449049
## 5 5320817
## 6 6127700
## 7 6916183
## Base R subset the first three rows of the year variable
UNpop[1:3, "year"]
## [1] 1950 1960 1970
## or in tidyverse, combining slice() and select()
## The inner call runs first: slice the rows, then select the column.
select(slice(UNpop, 1:3), year)
## year
## 1 1950
## 2 1960
## 3 1970
## Basic Data Wrangling with the tidyverse using pipes (i.e., %>%)
## The pipe passes the result on its left as the first argument of the call
## on its right, so the steps read top to bottom instead of inside out.
UNpop %>% # take the UNpop data we have loaded, and then...
slice(1:3) %>% # subset the first three rows, and then...
select(year) # subset the year column
## year
## 1 1950
## 2 1960
## 3 1970
## Every other row: n() gives the number of rows, so the sequence is 1, 3, 5, 7.
UNpop %>%
slice(seq(1, n(), by = 2)) %>% # using a sequence from 1 to n()
select(world.pop)
## world.pop
## 1 2525779
## 2 3691173
## 3 5320817
## 4 6916183
## Store the 1970 population as a plain vector.
pop.1970 <- UNpop %>% # take the UNpop data and then....
filter(year == 1970) %>% # subset rows where the year variable is equal to 1970
select(world.pop) %>% # subset just the world.pop column
pull() # return a vector rather than a one-column data frame
## Print the vector to the console to see it
print(pop.1970)
## [1] 3691173
## Build a copy of UNpop with the population rescaled by a factor of 1000.
UNpop.mill <- UNpop %>% # create a new data frame from UNpop
mutate(world.pop.mill = world.pop / 1000) %>% # create a new variable, world.pop.mill
select(-world.pop) # drop the original world.pop column
## Adding a variable with if_else
## after.1980 is 1 when year is 1980 or later and 0 otherwise.
UNpop.mill <- UNpop.mill %>%
mutate(after.1980 = if_else(year >= 1980, 1, 0))
## Creating a vector of the years of interest
specific.years <- c(1950, 1980, 2000)
## Adding a variable with if_else and %in%
## year.of.interest is 1 when year appears in specific.years and 0 otherwise.
UNpop.mill <- UNpop.mill %>%
mutate(year.of.interest = if_else(year %in% specific.years, 1, 0))
summary(UNpop.mill)
## year world.pop.mill after.1980 year.of.interest
## Min. :1950 Min. :2526 Min. :0.0000 Min. :0.0000
## 1st Qu.:1965 1st Qu.:3359 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1980 Median :4449 Median :1.0000 Median :0.0000
## Mean :1980 Mean :4580 Mean :0.5714 Mean :0.4286
## 3rd Qu.:1995 3rd Qu.:5724 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :2010 Max. :6916 Max. :1.0000 Max. :1.0000
mean(UNpop.mill$world.pop.mill)
## [1] 4579.529
## Add a row where the values for all columns are NA
UNpop.mill.wNAs <- UNpop.mill %>%
add_row(year = NA, world.pop.mill = NA,
after.1980 = NA,
year.of.interest = NA)
## Take the mean of world.pop.mill (returns NA)
mean(UNpop.mill.wNAs$world.pop.mill)
## [1] NA
## Take the mean of world.pop.mill (ignores the NA)
mean(UNpop.mill.wNAs$world.pop.mill, na.rm = TRUE)
## [1] 4579.529
## Other Summary Statistics with tidyverse
## summarize() collapses the data to one row holding the requested statistics.
UNpop.mill %>%
summarize(mean.pop = mean(world.pop.mill),
median.pop = median(world.pop.mill))
## mean.pop median.pop
## 1 4579.529 4449.049
## With group_by(), summarize() returns one row per group.
UNpop.mill %>%
group_by(after.1980) %>% # create subset group for each value of after.1980
summarize(mean.pop = mean(world.pop.mill)) # calculate mean for each group
## # A tibble: 2 × 2
## after.1980 mean.pop
## <dbl> <dbl>
## 1 0 3081.
## 2 1 5703.