## country year pop continent lifeExp gdpPercap
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
here
package
The here()
function gets your current working directory and appends any strings passed to the function.
## [1] "/Users/rap168/Documents/GitHub/bootcamp-2019"
## [1] "/Users/rap168/Documents/GitHub/bootcamp-2019/data/gapminder.csv"
## country year pop continent lifeExp gdpPercap
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
gapminder
dataset
## 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
gapminder
datasetgapminder
# gapminder dataset: store country and continent as character vectors
# instead of factors, then print the structure to confirm the new types.
gapminder$country <- as.character(gapminder$country)
gapminder$continent <- as.character(gapminder$continent)
str(gapminder)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: chr "Asia" "Asia" "Asia" "Asia" ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
gapminder
dataset
## [1] 37.47883
## [1] 68.43292
gapminder
dataset
Or we could save ourselves a lot of typing and time
Loops!
for
, while
, and the apply
familyfor
while
apply
family (often preferred to loops because the code is cleaner)
dplyr
and data.table
offer better approaches to loops
for
loop
for
loops repeat a function for all values in a vector – don’t cut and paste!
for (i in vector) { function(i) }
i
is the iterator variable (could be any letter!)
i
for each iteration
for
loop
Create new variables that hold the natural log (log
) of the GDP per capita and of the population - call them log_gdpPercap
and log_pop
## country year pop continent lifeExp gdpPercap gdp
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453 6567086330
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530 7585448670
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007 8758855797
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971 9648014150
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811 9678553274
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134 11697659231
## log_gdpPercap log_pop
## 1 6.658583 15.94675
## 2 6.710344 16.03915
## 3 6.748878 16.14445
## 4 6.728864 16.26115
## 5 6.606625 16.38655
## 6 6.667101 16.51555
# Vectorised computation: log() operates on the whole column at once.
gapminder$vec_log_gdpPercap <- log(gapminder$gdpPercap)
# Check the result against the existing log_gdpPercap column. all.equal()
# compares with a numeric tolerance, which is the safe way to compare
# floating-point values; isTRUE() collapses its result to a single TRUE/FALSE.
isTRUE(all.equal(gapminder$vec_log_gdpPercap, gapminder$log_gdpPercap))
## [1] TRUE
# Print the mean life expectancy for each distinct year as "<year>: <mean>".
years <- unique(gapminder$year)
for (i in years) {
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  mean_le <- mean(gapminder$lifeExp[gapminder$year == i], na.rm = TRUE)
  print(paste0(i, ": ", mean_le))
}
## [1] "1952: 49.0576197183099"
## [1] "1957: 51.5074011267606"
## [1] "1962: 53.6092490140845"
## [1] "1967: 55.6782895774648"
## [1] "1972: 57.6473864788732"
## [1] "1977: 59.5701574647887"
## [1] "1982: 61.5331971830986"
## [1] "1987: 63.2126126760563"
## [1] "1992: 64.160338028169"
## [1] "1997: 65.014676056338"
## [1] "2002: 65.6949225352113"
## [1] "2007: 67.0074225352113"
# Print the mean life expectancy for each distinct continent as
# "<continent>: <mean>".
conts <- unique(gapminder$continent)
for (i in conts) {
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  mean_le <- mean(gapminder$lifeExp[gapminder$continent == i], na.rm = TRUE)
  print(paste0(i, ": ", mean_le))
}
## [1] "Asia: 60.0649032323232"
## [1] "Europe: 71.9036861111111"
## [1] "Africa: 48.8653301282051"
## [1] "Americas: 64.6587366666667"
## [1] "Oceania: 74.3262083333333"
for
loops by defining different iteratorsfor
# Nested for loop: mean life expectancy for every continent/year pair.
# The outer iterator i walks the continents, the inner iterator j the years.
for (i in conts) {
  print(paste0("Continent: ", i))
  for (j in years) {
    in_group <- gapminder$continent == i & gapminder$year == j
    mean_le <- mean(gapminder$lifeExp[in_group], na.rm = TRUE)
    print(paste0(j, ": ", mean_le))
  }
}
## [1] "Continent: Asia"
## [1] "1952: 46.3143939393939"
## [1] "1957: 49.3185442424242"
## [1] "1962: 51.563223030303"
## [1] "1967: 54.66364"
## [1] "1972: 57.3192690909091"
## [1] "1977: 59.6105563636364"
## [1] "1982: 62.6179393939394"
## [1] "1987: 64.8511818181818"
## [1] "1992: 66.5372121212121"
## [1] "1997: 68.0205151515152"
## [1] "2002: 69.2338787878788"
## [1] "2007: 70.7284848484849"
## [1] "Continent: Europe"
## [1] "1952: 64.4085"
## [1] "1957: 66.7030666666667"
## [1] "1962: 68.5392333333333"
## [1] "1967: 69.7376"
## [1] "1972: 70.7750333333333"
## [1] "1977: 71.9377666666667"
## [1] "1982: 72.8064"
## [1] "1987: 73.6421666666667"
## [1] "1992: 74.4401"
## [1] "1997: 75.5051666666667"
## [1] "2002: 76.7006"
## [1] "2007: 77.6486"
## [1] "Continent: Africa"
## [1] "1952: 39.1355"
## [1] "1957: 41.2663461538462"
## [1] "1962: 43.3194423076923"
## [1] "1967: 45.3345384615385"
## [1] "1972: 47.4509423076923"
## [1] "1977: 49.5804230769231"
## [1] "1982: 51.5928653846154"
## [1] "1987: 53.3447884615385"
## [1] "1992: 53.6295769230769"
## [1] "1997: 53.5982692307692"
## [1] "2002: 53.3252307692308"
## [1] "2007: 54.8060384615385"
## [1] "Continent: Americas"
## [1] "1952: 53.27984"
## [1] "1957: 55.96028"
## [1] "1962: 58.39876"
## [1] "1967: 60.41092"
## [1] "1972: 62.39492"
## [1] "1977: 64.39156"
## [1] "1982: 66.22884"
## [1] "1987: 68.09072"
## [1] "1992: 69.56836"
## [1] "1997: 71.15048"
## [1] "2002: 72.42204"
## [1] "2007: 73.60812"
## [1] "Continent: Oceania"
## [1] "1952: 69.255"
## [1] "1957: 70.295"
## [1] "1962: 71.085"
## [1] "1967: 71.31"
## [1] "1972: 71.91"
## [1] "1977: 72.855"
## [1] "1982: 74.29"
## [1] "1987: 75.32"
## [1] "1992: 76.945"
## [1] "1997: 78.19"
## [1] "2002: 79.74"
## [1] "2007: 80.7195"
for
loop exercise!
for
loop exercise!
What is the standard deviation (sd
) for life expectancy for each continent for each year?
for
# loop exercise! Standard deviation of life expectancy for each continent
# in each year, using the same nested structure (i = continent, j = year).
for (i in conts) {
  print(paste0("Continent: ", i))
  for (j in years) {
    in_group <- gapminder$continent == i & gapminder$year == j
    sd_le <- sd(gapminder$lifeExp[in_group], na.rm = TRUE)
    print(paste0(j, ": ", sd_le))
  }
}
## [1] "Continent: Asia"
## [1] "1952: 9.29175069597824"
## [1] "1957: 9.63542861940215"
## [1] "1962: 9.82063194066467"
## [1] "1967: 9.65096458232544"
## [1] "1972: 9.72270004073083"
## [1] "1977: 10.0221969818167"
## [1] "1982: 8.53522140873991"
## [1] "1987: 8.20379188414779"
## [1] "1992: 8.07554897033932"
## [1] "1997: 8.09117060876087"
## [1] "2002: 8.37459538857541"
## [1] "2007: 7.96372447069057"
## [1] "Continent: Europe"
## [1] "1952: 6.36108825405387"
## [1] "1957: 5.29580539238584"
## [1] "1962: 4.30249955966524"
## [1] "1967: 3.79972849846788"
## [1] "1972: 3.2405763693743"
## [1] "1977: 3.12102997680124"
## [1] "1982: 3.21826029893856"
## [1] "1987: 3.16968033940696"
## [1] "1992: 3.20978108986074"
## [1] "1997: 3.10467655135052"
## [1] "2002: 2.92217957861169"
## [1] "2007: 2.9798126601609"
## [1] "Continent: Africa"
## [1] "1952: 5.1515814343277"
## [1] "1957: 5.62012285430095"
## [1] "1962: 5.87536393337021"
## [1] "1967: 6.08267262744012"
## [1] "1972: 6.41625832389558"
## [1] "1977: 6.80819741006083"
## [1] "1982: 7.37594008904693"
## [1] "1987: 7.86408910830706"
## [1] "1992: 9.46107098639753"
## [1] "1997: 9.10338657543333"
## [1] "2002: 9.58649585045544"
## [1] "2007: 9.63078067196179"
## [1] "Continent: Americas"
## [1] "1952: 9.32608188397822"
## [1] "1957: 9.03319227681997"
## [1] "1962: 8.50354373815215"
## [1] "1967: 7.90917103705144"
## [1] "1972: 7.32301680161029"
## [1] "1977: 7.06949561543585"
## [1] "1982: 6.72083381905351"
## [1] "1987: 5.80192884249138"
## [1] "1992: 5.16710380580843"
## [1] "1997: 4.88758389629614"
## [1] "2002: 4.7997054986044"
## [1] "2007: 4.44094763085538"
## [1] "Continent: Oceania"
## [1] "1952: 0.190918830920365"
## [1] "1957: 0.0494974746830535"
## [1] "1962: 0.219203102167821"
## [1] "1967: 0.296984848098351"
## [1] "1972: 0.0282842712474663"
## [1] "1977: 0.898025612106913"
## [1] "1982: 0.636396103067887"
## [1] "1987: 1.4142135623731"
## [1] "1992: 0.869741340859456"
## [1] "1997: 0.905096679918782"
## [1] "2002: 0.890954544295053"
## [1] "2007: 0.729027091403335"
for
loops can be slow…very slow
for
loops can be slow…very slow
apply
family of functions as a faster alternative
apply
family is “loop-hiding”
apply
and its relatives help you write cleaner code, but do not expect much of a speed boost
apply
, lapply
, sapply
apply
, lapply
, sapply
apply
apply(matrix, 1 = row or 2 = column, function)
- Let’s say we want to find the mean for each stat in gapminder
## lifeExp pop gdpPercap
## 5.947444e+01 2.960121e+07 7.215327e+03
apply
versus for
## lifeExp pop gdpPercap
## 5.947444e+01 2.960121e+07 7.215327e+03
## [1] 59.47444
## [1] 29601212
## [1] 7215.327
lapply
and sapply
lapply
and sapply
iterate over the values in a vector or list, rather than rows or columns
lapply
returns a list
sapply
returns a simplified list (i.e., a vector)
sapply
returns results, so always checklapply
and sapply
lapply(vector, function)
## $country
## [1] NA
##
## $year
## [1] 1979.5
##
## $pop
## [1] 29601212
##
## $continent
## [1] NA
##
## $lifeExp
## [1] 59.47444
##
## $gdpPercap
## [1] 7215.327
##
## $gdp
## [1] 186809560507
##
## $log_gdpPercap
## [1] 8.158791
##
## $log_pop
## [1] 15.76611
##
## $vec_log_gdpPercap
## [1] 8.158791
## country year pop continent
## NA 1.979500e+03 2.960121e+07 NA
## lifeExp gdpPercap gdp log_gdpPercap
## 5.947444e+01 7.215327e+03 1.868096e+11 8.158791e+00
## log_pop vec_log_gdpPercap
## 1.576611e+01 8.158791e+00
apply
apply
call
function(x) [function]
to the call–x
becomes the iterator
## [1] 49.05762 51.50740 53.60925 55.67829 57.64739 59.57016 61.53320
## [8] 63.21261 64.16034 65.01468 65.69492 67.00742
while
loopwhile
loop syntaxfor
loop -> while (condition) { function }
# Print the standard deviation of life expectancy for each year before 1987.
# A while loop does not advance on its own, so the iterator is stepped by hand.
i <- 1952  # define the iterator (first year to report)
while (i < 1987) {
  sd_le <- sd(gapminder$lifeExp[gapminder$year == i])
  print(paste(i, sd_le, sep = ": "))
  i <- i + 5  # move to the next year; the years are five apart
}
## [1] "1952: 12.2259557776501"
## [1] "1957: 12.2312861234041"
## [1] "1962: 12.0972450062645"
## [1] "1967: 11.7188577789887"
## [1] "1972: 11.3819531380937"
## [1] "1977: 11.2272293919197"
## [1] "1982: 10.7706178327824"
while
# while loop: standard deviation of life expectancy for 1987 through 2002.
i <- 1987  # define the iterator
while (i <= 2002) {
  sd_le <- sd(gapminder$lifeExp[gapminder$year == i])
  print(paste0(i, ": ", sd_le))
  i <- i + 5  # increase the iterator by the interval between years
}
## [1] "1987: 10.5562851721688"
## [1] "1992: 11.2273795265798"
## [1] "1997: 11.5594390582383"
## [1] "2002: 12.2798227122797"
while
loop cautionary tale
while
loop will continually run if the logical condition is always satisfied!
while
loop without increasing the iterator
if/else
conditional!if
for
and while
, initialize with if
and then detail condition in parentheses## [1] 1992
## [1] 1992
if
statementsyears
set.seed()
## [1] 2002
else
clauserandom_year
<= 1977? NOTHING!else
statement, telling R what to do when the if
condition isn’t met## [1] 1992
# Print random_year when it is later than 1977; otherwise print a message.
# Written with the fallback case first.
if (random_year <= 1977) {
  print("sorry, random year is less than 1977")
} else {
  print(random_year)
}
## [1] 1992
if
and else
# if and else together: draw one year at random; if it is later than 1977
# print it with that year's mean life expectancy, otherwise print a message.
random_year <- sample(years, 1)
if (random_year > 1977) {
  print(paste0(random_year, ": ",
               mean(gapminder$lifeExp[gapminder$year == random_year])))
} else {
  print("sorry, random year is less than 1977")
}
## [1] "sorry, random year is less than 1977"
for
and if/else
togetherif
…else
clause to a for
loopfor
and if/else
togetherWhich continents have a mean life expectancy greater than 70 years?
# For each continent, report whether its mean life expectancy falls below
# the threshold or not.
threshold <- 70
for (i in unique(gapminder$continent)) {
  tmp <- mean(gapminder$lifeExp[gapminder$continent == i])
  # Only the comparison wording differs between the two outcomes, so pick
  # that phrase and build the sentence once.
  print(paste("Mean Life Expectancy in", i, "is",
              if (tmp < threshold) "less than" else "greater than",
              threshold))
}
## [1] "Mean Life Expectancy in Asia is less than 70"
## [1] "Mean Life Expectancy in Europe is greater than 70"
## [1] "Mean Life Expectancy in Africa is less than 70"
## [1] "Mean Life Expectancy in Americas is less than 70"
## [1] "Mean Life Expectancy in Oceania is greater than 70"
for
and if/else
together
Write a for
loop that reports the mean population for years greater than or equal to 1987. Make sure the loop prints a message if the condition is not met!
for
and if/else
# for and if/else together: report the mean population for every year from
# 1987 onwards, and print a message for each earlier year.
# NOTE: the printed output shown after this chunk in the document was
# generated from lifeExp, so it lists life-expectancy means, not populations.
for (i in years) {
  if (i >= 1987) {
    # The exercise asks for population, so average the pop column.
    mean_pop <- mean(gapminder$pop[gapminder$year == i])
    print(paste0(i, ": ", mean_pop))
  } else {
    print("Sorry, year is less than 1987")
  }
}
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "Sorry, year is less than 1987"
## [1] "1987: 63.2126126760563"
## [1] "1992: 64.160338028169"
## [1] "1997: 65.014676056338"
## [1] "2002: 65.6949225352113"
## [1] "2007: 67.0074225352113"
[
## [1] 67.500 69.100 70.300 70.800 71.000 72.500 73.800 74.847 76.070 77.340
## [11] 78.670 79.406
## [1] 67.500 69.100 70.300 70.800 71.000 72.500 73.800 74.847 76.070 77.340
## [11] 78.670 79.406
function
gapminder
#' Print the mean and standard deviation of one variable for one country
#' in a gapminder-style dataset.
#'
#' @param df A data frame with a `country` column and a column named by
#'   `variable`.
#' @param variable Name (a string) of the column to summarise.
#' @param country Value of `df$country` whose rows are summarised.
#' @param label Text used in the printed labels. Defaults to
#'   "Life Expectancy" when `variable` is "lifeExp" and to the column name
#'   otherwise, so other variables are not reported as life expectancy.
#' @return `NULL`, invisibly (the value of `cat()`); called for its output.
report_mean_sd <-
  function(df, variable, country,
           label = if (identical(variable, "lifeExp")) {
             "Life Expectancy"
           } else {
             variable
           }) {
    values <- df[[variable]][df$country == country]
    cat("Country:", country,
        paste0("\nMean ", label, ":"), mean(values),
        paste0("\nSD ", label, ":"), sd(values))
  }
# Example call: report on the lifeExp column for the rows where country is
# "Bulgaria".
report_mean_sd(gapminder, "lifeExp", "Bulgaria")
## Country: Bulgaria
## Mean Life Expectancy: 69.74375
## SD Life Expectancy: 3.55268
gapminder
min
, max
#' Print the minimum and maximum of one variable for one continent.
#'
#' @param df A data frame with a `continent` column and a column named by
#'   `variable`.
#' @param variable Name (a string) of the column to summarise.
#' @param continent Value of `df$continent` whose rows are summarised.
#' @param label Text used in the printed labels. Defaults to
#'   "Life expectancy" when `variable` is "lifeExp" and to the column name
#'   otherwise, so other variables are not reported as life expectancy.
#' @return `NULL`, invisibly (the value of `cat()`); called for its output.
report_stats <-
  function(df, variable, continent,
           label = if (identical(variable, "lifeExp")) {
             "Life expectancy"
           } else {
             variable
           }) {
    values <- df[[variable]][df$continent == continent]
    cat("Continent:", continent,
        paste0("\nMinimum ", label, ":"), min(values),
        paste0("\nMaximum ", label, ":"), max(values))
  }
# Example call: report on the lifeExp column for the rows where continent is
# "Asia".
report_stats(gapminder, "lifeExp", "Asia")
## Continent: Asia
## Minimum Life expectancy: 28.801
## Maximum Life expectancy: 82.603
rmd_exercise_template.Rmd
Day1Part2RExercise_LastnameFirstname.Rmd
.