California energy data
ca_energy_generation.csv
and ca_energy_imports.csv
generation <- read.csv(here::here("data/ca_energy_generation.csv"),
stringsAsFactors = F)
imports <- read.csv(here::here("data/ca_energy_imports.csv"),
stringsAsFactors = F)
str(generation)
## 'data.frame': 168 obs. of 12 variables:
## $ datetime : chr "2019-09-03 00:00:00" "2019-09-03 01:00:00" "2019-09-03 02:00:00" "2019-09-03 03:00:00" ...
## $ biogas : num 239 239 239 239 238 ...
## $ biomass : num 407 406 407 407 406 ...
## $ coal : num 9 9 9 9.83 9.42 ...
## $ geothermal : num 967 970 973 974 975 ...
## $ large_hydro: num 3152 2724 2630 2611 2651 ...
## $ natural_gas: num 9711 9155 8078 7801 7744 ...
## $ nuclear : num 2249 2251 2251 2245 2243 ...
## $ other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ small_hydro: num 402 401 414 360 382 ...
## $ solar : num 0 0 0 0 0 ...
## $ wind : num 2670 2779 2795 2583 2397 ...
class(generation$datetime)
## [1] "character"
class(imports$datetime)
## [1] "character"
lubridate
lubridate
package and the as_datetime
functiondatetime
variable in the imports
dataframelibrary(lubridate)
generation$datetime <- as_datetime(generation$datetime)
class(generation$datetime)
## [1] "POSIXct" "POSIXt"
head(generation$datetime)
## [1] "2019-09-03 00:00:00 UTC" "2019-09-03 01:00:00 UTC"
## [3] "2019-09-03 02:00:00 UTC" "2019-09-03 03:00:00 UTC"
## [5] "2019-09-03 04:00:00 UTC" "2019-09-03 05:00:00 UTC"
lubridate
imports$datetime <- as_datetime(imports$datetime)
head(imports$datetime)
## [1] "2019-09-03 00:00:00 UTC" "2019-09-03 01:00:00 UTC"
## [3] "2019-09-03 02:00:00 UTC" "2019-09-03 03:00:00 UTC"
## [5] "2019-09-03 04:00:00 UTC" "2019-09-03 05:00:00 UTC"
head(generation)
## datetime biogas biomass coal geothermal large_hydro
## 1 2019-09-03 00:00:00 238.9167 407.0833 9.000000 967.2500 3152.500
## 2 2019-09-03 01:00:00 239.0000 406.5000 9.000000 970.3333 2724.333
## 3 2019-09-03 02:00:00 239.0000 406.8333 9.000000 973.1667 2630.000
## 4 2019-09-03 03:00:00 238.9167 407.1667 9.833333 973.8333 2611.250
## 5 2019-09-03 04:00:00 237.9167 406.0833 9.416667 974.6667 2651.333
## 6 2019-09-03 05:00:00 237.1667 408.9167 10.000000 973.3333 2965.500
## natural_gas nuclear other small_hydro solar wind
## 1 9710.833 2248.750 0 401.6667 0 2670.167
## 2 9155.000 2250.583 0 400.9167 0 2778.833
## 3 8078.333 2250.833 0 413.5833 0 2795.250
## 4 7801.250 2245.000 0 359.9167 0 2582.583
## 5 7744.333 2242.833 0 382.0833 0 2397.417
## 6 8624.583 2242.833 0 395.9167 0 2327.000
reshape2
melt
–> make data longdcast
–> make data widerecast
–> melt then cast datageneration
dataframe has several observations per rowhead(generation)
## datetime biogas biomass coal geothermal large_hydro
## 1 2019-09-03 00:00:00 238.9167 407.0833 9.000000 967.2500 3152.500
## 2 2019-09-03 01:00:00 239.0000 406.5000 9.000000 970.3333 2724.333
## 3 2019-09-03 02:00:00 239.0000 406.8333 9.000000 973.1667 2630.000
## 4 2019-09-03 03:00:00 238.9167 407.1667 9.833333 973.8333 2611.250
## 5 2019-09-03 04:00:00 237.9167 406.0833 9.416667 974.6667 2651.333
## 6 2019-09-03 05:00:00 237.1667 408.9167 10.000000 973.3333 2965.500
## natural_gas nuclear other small_hydro solar wind
## 1 9710.833 2248.750 0 401.6667 0 2670.167
## 2 9155.000 2250.583 0 400.9167 0 2778.833
## 3 8078.333 2250.833 0 413.5833 0 2795.250
## 4 7801.250 2245.000 0 359.9167 0 2582.583
## 5 7744.333 2242.833 0 382.0833 0 2397.417
## 6 8624.583 2242.833 0 395.9167 0 2327.000
melt
the generation datamelt(df, id.vars = "id")
id.vars
library(reshape2)
long_gen <- melt(generation, id.vars = "datetime",
variable.name = "source",
value.name = "usage")
head(long_gen)
## datetime source usage
## 1 2019-09-03 00:00:00 biogas 238.9167
## 2 2019-09-03 01:00:00 biogas 239.0000
## 3 2019-09-03 02:00:00 biogas 239.0000
## 4 2019-09-03 03:00:00 biogas 238.9167
## 5 2019-09-03 04:00:00 biogas 237.9167
## 6 2019-09-03 05:00:00 biogas 237.1667
melt
the generation datalong_gen[order(long_gen$datetime)[1:20], ]
## datetime source usage
## 1 2019-09-03 00:00:00 biogas 238.9167
## 169 2019-09-03 00:00:00 biomass 407.0833
## 337 2019-09-03 00:00:00 coal 9.0000
## 505 2019-09-03 00:00:00 geothermal 967.2500
## 673 2019-09-03 00:00:00 large_hydro 3152.5000
## 841 2019-09-03 00:00:00 natural_gas 9710.8333
## 1009 2019-09-03 00:00:00 nuclear 2248.7500
## 1177 2019-09-03 00:00:00 other 0.0000
## 1345 2019-09-03 00:00:00 small_hydro 401.6667
## 1513 2019-09-03 00:00:00 solar 0.0000
## 1681 2019-09-03 00:00:00 wind 2670.1667
## 2 2019-09-03 01:00:00 biogas 239.0000
## 170 2019-09-03 01:00:00 biomass 406.5000
## 338 2019-09-03 01:00:00 coal 9.0000
## 506 2019-09-03 01:00:00 geothermal 970.3333
## 674 2019-09-03 01:00:00 large_hydro 2724.3333
## 842 2019-09-03 01:00:00 natural_gas 9155.0000
## 1010 2019-09-03 01:00:00 nuclear 2250.5833
## 1178 2019-09-03 01:00:00 other 0.0000
## 1346 2019-09-03 01:00:00 small_hydro 400.9167
merge
merge(x, y, by = c("id", "year"))
x
: first dataframey
: second dataframeby
: variables to match (must have common name)merge
argumentsmerge(x, y, by.x = "id", by.y = "cd", all.x = T, all.y = T)
by.x
and by.y
if the dataframes have different variable namesall.x = T
if you want to keep all the observation in the first dataframe (unmatched observations in y
are dropped!)all.y = T
if you want to keep all observations in the second dataframe (umatched observations in x
are dropped!)all = T
) to keep all observations!datetime
merge
to join the generation
and imports
dataframes, using the datetime
variable to matchdatetime
merged_energy <- merge(generation, imports, by = "datetime")
dim(merged_energy)
## [1] 168 13
head(merged_energy)
## datetime biogas biomass coal geothermal large_hydro
## 1 2019-09-03 00:00:00 238.9167 407.0833 9.000000 967.2500 3152.500
## 2 2019-09-03 01:00:00 239.0000 406.5000 9.000000 970.3333 2724.333
## 3 2019-09-03 02:00:00 239.0000 406.8333 9.000000 973.1667 2630.000
## 4 2019-09-03 03:00:00 238.9167 407.1667 9.833333 973.8333 2611.250
## 5 2019-09-03 04:00:00 237.9167 406.0833 9.416667 974.6667 2651.333
## 6 2019-09-03 05:00:00 237.1667 408.9167 10.000000 973.3333 2965.500
## natural_gas nuclear other small_hydro solar wind imports
## 1 9710.833 2248.750 0 401.6667 0 2670.167 7527.333
## 2 9155.000 2250.583 0 400.9167 0 2778.833 6930.333
## 3 8078.333 2250.833 0 413.5833 0 2795.250 7005.833
## 4 7801.250 2245.000 0 359.9167 0 2582.583 7080.583
## 5 7744.333 2242.833 0 382.0833 0 2397.417 7421.667
## 6 8624.583 2242.833 0 395.9167 0 2327.000 7511.667
long_merged_energy
long_merged_energy <- melt(merged_energy, id.vars = "datetime",
variable.name = "source",
value.name = "usage")
head(long_merged_energy)
## datetime source usage
## 1 2019-09-03 00:00:00 biogas 238.9167
## 2 2019-09-03 01:00:00 biogas 239.0000
## 3 2019-09-03 02:00:00 biogas 239.0000
## 4 2019-09-03 03:00:00 biogas 238.9167
## 5 2019-09-03 04:00:00 biogas 237.9167
## 6 2019-09-03 05:00:00 biogas 237.1667
dplyr
is an R package for data manipulation developed by Hadley Wickham with help from lots of other R developersdplyr
verbsselect
–> subset variablesfilter
–> subset observations based on conditionsmutate
–> add new variablessummarize
–> reduce multiple observations to a single value (e.g., find the mean)dplyr
verbs - select
select(gapminder, continent, pop)
select(df, c(1, 3, 10))
select(df, country:pop)
or select(df, 1:3)
-
select(df, -gdpPercap)
dplyr
verbs - select
library(dplyr)
tmp <- select(merged_energy, biogas, biomass, geothermal, solar)
names(tmp)
## [1] "biogas" "biomass" "geothermal" "solar"
tmp <- select(merged_energy, -biogas, -biomass, -geothermal, -solar)
names(tmp)
## [1] "datetime" "coal" "large_hydro" "natural_gas" "nuclear"
## [6] "other" "small_hydro" "wind" "imports"
one_of()
, contains()
, starts_with()
, ends_with()
, matches()
tmp <- select(merged_energy, contains("hydro"), starts_with("bio"))
names(tmp)
## [1] "large_hydro" "small_hydro" "biogas" "biomass"
filter
tmp <- filter(merged_energy, imports > 7000)
nrow(tmp)
## [1] 78
head(tmp)
## datetime biogas biomass coal geothermal large_hydro
## 1 2019-09-03 00:00:00 238.9167 407.0833 9.000000 967.2500 3152.500
## 2 2019-09-03 02:00:00 239.0000 406.8333 9.000000 973.1667 2630.000
## 3 2019-09-03 03:00:00 238.9167 407.1667 9.833333 973.8333 2611.250
## 4 2019-09-03 04:00:00 237.9167 406.0833 9.416667 974.6667 2651.333
## 5 2019-09-03 05:00:00 237.1667 408.9167 10.000000 973.3333 2965.500
## 6 2019-09-03 06:00:00 236.7500 406.0833 9.166667 972.3333 3809.500
## natural_gas nuclear other small_hydro solar wind imports
## 1 9710.833 2248.750 0 401.6667 0.00000 2670.167 7527.333
## 2 8078.333 2250.833 0 413.5833 0.00000 2795.250 7005.833
## 3 7801.250 2245.000 0 359.9167 0.00000 2582.583 7080.583
## 4 7744.333 2242.833 0 382.0833 0.00000 2397.417 7421.667
## 5 8624.583 2242.833 0 395.9167 0.00000 2327.000 7511.667
## 6 9544.583 2244.833 0 407.5000 77.16667 2048.833 8172.667
filter
tmp <- filter(merged_energy, imports > 7000, natural_gas < 7000)
nrow(tmp)
## [1] 20
head(tmp)
## datetime biogas biomass coal geothermal large_hydro
## 1 2019-09-07 03:00:00 240.1667 358.9167 9.166667 964.7500 2413.833
## 2 2019-09-07 06:00:00 238.8333 371.3333 8.500000 971.7500 2649.417
## 3 2019-09-08 00:00:00 236.8333 389.7500 5.416667 970.2500 2697.000
## 4 2019-09-08 01:00:00 237.2500 388.8333 4.750000 969.3333 2654.083
## 5 2019-09-08 02:00:00 237.1667 374.3333 4.083333 969.9167 2217.667
## 6 2019-09-08 03:00:00 237.8333 342.5000 4.000000 972.5000 2090.917
## natural_gas nuclear other small_hydro solar wind imports
## 1 6667.417 2255.083 0 378.4167 0.00000 3809.917 7129.667
## 2 6553.917 2257.083 0 408.5000 67.83333 3643.750 7338.250
## 3 6364.333 2251.167 0 403.7500 0.00000 3628.083 8077.583
## 4 5037.917 2254.000 0 403.3333 0.00000 3118.083 8658.583
## 5 4530.750 2255.583 0 385.0833 0.00000 2928.083 8806.583
## 6 4079.500 2257.500 0 377.5000 0.00000 2726.583 8887.167
mutate
mutate
creates new variablestmp <- mutate(long_merged_energy, log_usage = log(usage))
head(tmp)
## datetime source usage log_usage
## 1 2019-09-03 00:00:00 biogas 238.9167 5.476115
## 2 2019-09-03 01:00:00 biogas 239.0000 5.476464
## 3 2019-09-03 02:00:00 biogas 239.0000 5.476464
## 4 2019-09-03 03:00:00 biogas 238.9167 5.476115
## 5 2019-09-03 04:00:00 biogas 237.9167 5.471920
## 6 2019-09-03 05:00:00 biogas 237.1667 5.468763
mutate
tmp <- mutate(long_merged_energy, log_usage = log(usage), usage2 = usage^2, usage3 = usage^3)
head(tmp)
## datetime source usage log_usage usage2 usage3
## 1 2019-09-03 00:00:00 biogas 238.9167 5.476115 57081.17 13637644
## 2 2019-09-03 01:00:00 biogas 239.0000 5.476464 57121.00 13651919
## 3 2019-09-03 02:00:00 biogas 239.0000 5.476464 57121.00 13651919
## 4 2019-09-03 03:00:00 biogas 238.9167 5.476115 57081.17 13637644
## 5 2019-09-03 04:00:00 biogas 237.9167 5.471920 56604.34 13467116
## 6 2019-09-03 05:00:00 biogas 237.1667 5.468763 56248.03 13340157
summarize
summarize
reduces observations to a single value based on functions - mean
, sum
, sd
, min
, max
, etc.
# total energy consumption
summarize(long_merged_energy, total = sum(usage, na.rm = T))
## total
## 1 5223392
# mean energy consumption
summarize(long_merged_energy, mean_cons = mean(usage, na.rm = T))
## mean_cons
## 1 2590.968
%>%
%>%
operator lets you chain together functions%>%
as “then”# take df then filter it then select these variables
# you do not need to repeat the name of the dataframe!
long_merged_energy %>%
filter(source == "geothermal") %>%
select(-datetime) %>%
mutate(log_usage = log(usage)) %>%
summarize(mean_log_usage = mean(log_usage, na.rm = T))
## mean_log_usage
## 1 6.872053
%>%
.
merged_energy %>%
select(-datetime) %>%
mutate(total_usage = rowSums(., na.rm = T)) %>%
summarize(total_usage = sum(total_usage, na.rm = T))
## total_usage
## 1 5223392
%>%
.
merged_energy %>%
select(-datetime) %>%
mutate(total_usage = rowSums(merged_energy, na.rm = T)) %>%
summarize(total_usage = sum(total_usage, na.rm = T))
## Error: Evaluation error: 'x' must be numeric.
merged_energy
do the following with pipes:total_hydro
that is the sum of the retained hydro variablestotal_hydro
merged_energy %>%
select(contains("hydro")) %>%
mutate(total_hydro = rowSums(., na.rm = T)) %>%
summarize(mean_hydro = mean(total_hydro, na.rm = T))
## mean_hydro
## 1 3387.702
group_by
and summarize
group_by
is a powerful function that allows us to perform operations by groups of observations!long_merged_energy %>%
group_by(source) %>%
summarize(sum_usage = sum(usage, na.rm = T))
## # A tibble: 12 x 2
## source sum_usage
## <fct> <dbl>
## 1 biogas 39427.
## 2 biomass 62890.
## 3 coal 1528.
## 4 geothermal 162114.
## 5 large_hydro 498668.
## 6 natural_gas 1786555.
## 7 nuclear 378618.
## 8 other 0
## 9 small_hydro 70466.
## 10 solar 660315.
## 11 wind 440125.
## 12 imports 1122687.
group_by
versus for
loopsfor
loop we wrote yesterday to find the mean life expectancy by year?dplyr
group_by
versus for
loopsgapminder <- read.csv(here::here("data/gapminder5.csv"))
gapminder %>%
group_by(year) %>%
summarize(mean_le = mean(lifeExp, na.rm = T),
sd_lf = sd(lifeExp, na.rm = T))
## # A tibble: 12 x 3
## year mean_le sd_lf
## <int> <dbl> <dbl>
## 1 1952 49.1 12.2
## 2 1957 51.5 12.2
## 3 1962 53.6 12.1
## 4 1967 55.7 11.7
## 5 1972 57.6 11.4
## 6 1977 59.6 11.2
## 7 1982 61.5 10.8
## 8 1987 63.2 10.6
## 9 1992 64.2 11.2
## 10 1997 65.0 11.6
## 11 2002 65.7 12.3
## 12 2007 67.0 12.1
dplyr
to find the mean usage for small hydro, large hydro, biogas, and biomassmerged_energy %>%
select(datetime, contains("hydro"), contains("bio")) %>%
melt(id.vars = "datetime",
variable.name = "source",
value.name = "usage") %>%
group_by(source) %>%
summarize(mean_usage = mean(usage, na.rm = T))
## # A tibble: 4 x 2
## source mean_usage
## <fct> <dbl>
## 1 large_hydro 2968.
## 2 small_hydro 419.
## 3 biogas 235.
## 4 biomass 374.
dplyr
with *_join
functionsdplyr
’s philosophy is to have more functions with fewer optionsleft_join
- keeps all observations in the first dataframe
merge(all.x = T)
right_join
- keeps all observations in the second dataframe
merge(all.y = T)
full_join
- keeps all observations in both dataframes
merge(all = T)
inner_join
- keeps only those observations that are matched in both datasets
merge(all = F)
merge
and inner_join
tmp <- merge(generation, imports, by = "datetime", all = F)
dim(tmp)
## [1] 168 13
tmp <- inner_join(generation, imports, by = "datetime")
dim(tmp)
## [1] 168 13