Read both California energy datasets. Make sure the datetime
variable is in an appropriate data type (i.e. not character).
dplyr
library(dplyr)
library(lubridate)
generation_dp <- read.csv(here::here("data/ca_energy_generation.csv"),
stringsAsFactors = F)
imports_dp <- read.csv(here::here("data/ca_energy_imports.csv"),
stringsAsFactors = F)
generation_dp <- mutate(generation_dp, datetime = as_datetime(datetime))
imports_dp <- mutate(imports_dp, datetime = as_datetime(datetime))
data.table
library(data.table)
library(lubridate)
generation_dt <- fread(here::here("data/ca_energy_generation.csv"))
imports_dt <- fread(here::here("data/ca_energy_imports.csv"))
generation_dt[,datetime := as_datetime(datetime)]
imports_dt[,datetime := as_datetime(datetime)]
Merge the two datasets and then melt the resulting dataframe/datatable to make it tidy.
dplyr
# We chose to use dplyr::inner_join here to illustrate how this can be done in a single piped operation, but you can also use base::merge
library(reshape2)
long_ca_energy_dp <- generation_dp %>%
inner_join(imports_dp, by = "datetime") %>%
melt(id.vars = "datetime",
variable.name = "source",
value.name = "output")
data.table
all_sources <- merge(generation_dt, imports_dt, by = "datetime")
long_ca_energy_dt <- melt(all_sources,
id.vars = "datetime",
variable.name = "source",
value.name = "output")
# another option: advanced join
long_ca_energy_dt <- melt(generation_dt[imports_dt, on = "datetime"],
id.vars = "datetime",
variable.name = "source",
value.name = "output")
Create a series of new variables:
day
, which is the year-month-day, without the hour. The lubridate
function as_date
will do this.log_output
, which is the natural log of the output.per_output
, which is the percent of daily output represented by each observation. You will need to use group_by
and to create a new variable with the total output for the day. (Make sure to use ungroup()
after this!)Bonus: If you are using dplyr
, try to do this all in one pipe!
dplyr
long_ca_energy_dp <- long_ca_energy_dp %>%
mutate(day = as_date(datetime),
log_output = log(output)) %>%
group_by(day) %>%
mutate(total_daily_output = sum(output, na.rm = T)) %>%
ungroup() %>%
mutate(per_output = output/total_daily_output)
# Check results
long_ca_energy_dp %>% select(day, log_output, per_output) %>% head()
## # A tibble: 6 x 3
## day log_output per_output
## <date> <dbl> <dbl>
## 1 2019-09-03 5.48 0.000293
## 2 2019-09-03 5.48 0.000293
## 3 2019-09-03 5.48 0.000293
## 4 2019-09-03 5.48 0.000293
## 5 2019-09-03 5.47 0.000292
## 6 2019-09-03 5.47 0.000291
data.table
long_ca_energy_dt[,day := as_date(datetime)]
long_ca_energy_dt[,log_output := log(output)]
long_ca_energy_dt[,per_output := output/sum(output, na.rm = TRUE), by = day]
# one command (stylistic/readability choice)
long_ca_energy_dt[,`:=`(day = as_date(datetime),
log_output = log(output),
per_output = output/sum(output, na.rm = TRUE)),
by = day]
dplyr
verb arrange(desc(variable))
to order the data frame so that the largest value of variable
is first. Don’t use desc
and it arranges in ascending order. The data.table
function is setorder
.) Which has the least?dplyr
long_ca_energy_dp %>%
group_by(source) %>%
summarize(mean_hourly = mean(output, na.rm = T)) %>%
arrange(desc(mean_hourly))
## # A tibble: 12 x 2
## source mean_hourly
## <fct> <dbl>
## 1 natural_gas 10634.
## 2 imports 6683.
## 3 solar 3930.
## 4 large_hydro 2968.
## 5 wind 2620.
## 6 nuclear 2254.
## 7 geothermal 965.
## 8 small_hydro 419.
## 9 biomass 374.
## 10 biogas 235.
## 11 coal 9.09
## 12 other 0
long_ca_energy_dp %>%
group_by(source) %>%
summarize(mean_hourly = mean(output, na.rm = T)) %>%
arrange(mean_hourly)
## # A tibble: 12 x 2
## source mean_hourly
## <fct> <dbl>
## 1 other 0
## 2 coal 9.09
## 3 biogas 235.
## 4 biomass 374.
## 5 small_hydro 419.
## 6 geothermal 965.
## 7 nuclear 2254.
## 8 wind 2620.
## 9 large_hydro 2968.
## 10 solar 3930.
## 11 imports 6683.
## 12 natural_gas 10634.
data.table
mean_hrly <- long_ca_energy_dt[,.(mean_hourly = mean(output)), by = source]
mean_hrly[mean_hourly == max(mean_hourly) | mean_hourly == min(mean_hourly)]
## source mean_hourly
## 1: natural_gas 10634.25
## 2: other 0.00
# another option
setorder(mean_hrly, -mean_hourly)
mean_hrly
## source mean_hourly
## 1: natural_gas 10634.254960
## 2: imports 6682.663194
## 3: solar 3930.446429
## 4: large_hydro 2968.263889
## 5: wind 2619.791171
## 6: nuclear 2253.677083
## 7: geothermal 964.964782
## 8: small_hydro 419.438492
## 9: biomass 374.343254
## 10: biogas 234.685516
## 11: coal 9.092758
## 12: other 0.000000
# another option: chained together, no new variable
long_ca_energy_dt[,.(mean_hourly = mean(output)), by = source][mean_hourly %in% c(max(mean_hourly), min(mean_hourly))]
## source mean_hourly
## 1: natural_gas 10634.25
## 2: other 0.00
dplyr
long_ca_energy_dp %>%
filter(output>0) %>%
group_by(day, source) %>%
summarize(mean_daily = mean(output, na.rm = T)) %>%
arrange(desc(mean_daily))
## # A tibble: 77 x 3
## # Groups: day [7]
## day source mean_daily
## <date> <fct> <dbl>
## 1 2019-09-05 natural_gas 15403.
## 2 2019-09-04 natural_gas 14928.
## 3 2019-09-03 natural_gas 13051.
## 4 2019-09-06 natural_gas 12786.
## 5 2019-09-07 natural_gas 7508.
## 6 2019-09-08 solar 7278.
## 7 2019-09-07 solar 7186.
## 8 2019-09-09 solar 7133.
## 9 2019-09-03 imports 7076.
## 10 2019-09-09 imports 7058.
## # ... with 67 more rows
long_ca_energy_dp %>%
filter(output>0) %>%
group_by(day, source) %>%
summarize(mean_daily = mean(output, na.rm = T)) %>%
arrange(mean_daily)
## # A tibble: 77 x 3
## # Groups: day [7]
## day source mean_daily
## <date> <fct> <dbl>
## 1 2019-09-07 coal 6.57
## 2 2019-09-08 coal 7.24
## 3 2019-09-06 coal 8.89
## 4 2019-09-03 coal 8.99
## 5 2019-09-04 coal 9.13
## 6 2019-09-05 coal 9.5
## 7 2019-09-09 coal 13.3
## 8 2019-09-09 biogas 229.
## 9 2019-09-04 biogas 233.
## 10 2019-09-05 biogas 235.
## # ... with 67 more rows
data.table
mean_dly <- long_ca_energy_dt[output > 0,.(mean_daily = mean(output)), by = .(source, day)]
mean_dly[mean_daily %in% c(max(mean_daily), min(mean_daily))]
## source day mean_daily
## 1: coal 2019-09-07 6.569444
## 2: natural_gas 2019-09-05 15403.371528
dplyr
long_ca_energy_dp %>%
filter(output>0) %>%
group_by(source) %>%
summarize(sd_output = sd(output, na.rm = T)) %>%
arrange(desc(sd_output))
## # A tibble: 11 x 2
## source sd_output
## <fct> <dbl>
## 1 natural_gas 5275.
## 2 solar 3941.
## 3 imports 1761.
## 4 wind 1291.
## 5 large_hydro 1054.
## 6 small_hydro 30.7
## 7 biomass 23.0
## 8 geothermal 8.54
## 9 nuclear 6.63
## 10 biogas 4.22
## 11 coal 2.72
long_ca_energy_dp %>%
filter(output>0) %>%
group_by(source) %>%
summarize(sd_output = sd(output, na.rm = T)) %>%
arrange(sd_output)
## # A tibble: 11 x 2
## source sd_output
## <fct> <dbl>
## 1 coal 2.72
## 2 biogas 4.22
## 3 nuclear 6.63
## 4 geothermal 8.54
## 5 biomass 23.0
## 6 small_hydro 30.7
## 7 large_hydro 1054.
## 8 wind 1291.
## 9 imports 1761.
## 10 solar 3941.
## 11 natural_gas 5275.
data.table
var_by_source <- long_ca_energy_dt[output > 0,.(sd_output = sd(output)), by = source]
var_by_source[sd_output %in% c(max(sd_output), min(sd_output))]
## source sd_output
## 1: coal 2.723103
## 2: natural_gas 5275.136615
There are also other ways to find min and max values:
boxplot(output ~ source, data = long_ca_energy_dt,las=2)
The dataset regroup.csv
has information about which sources are considered renewable by the state of California. Use this dataset, along with yourdata manipulation skills, to explore the use of renewable and non-renewable sources. Annotate what your descisions for the analysis.
Hint: Use your merge skills to merge the CA energy data with the regroup
data. Which variable should you join by?