Let’s have a look in some very powerful function in the tidyverse package that are important for data wrangling.
library(tidyverse)
library(lubridate)
library(zoo)
theme_set(theme_light(20))
Daily air quality measurements in New York, May to September 1973. Temp: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport.
df <- airquality %>%
as_tibble() %>%
select(day = Day, month = Month, temp = Temp, wind = Wind)
df
# A tibble: 153 x 4
day month temp wind
<int> <int> <int> <dbl>
1 1 5 67 7.4
2 2 5 72 8
3 3 5 74 12.6
4 4 5 62 11.5
5 5 5 56 14.3
6 6 5 66 14.9
7 7 5 65 8.6
8 8 5 59 13.8
9 9 5 61 20.1
10 10 5 69 8.6
# … with 143 more rows
df %>% count(month, sort = TRUE)
# A tibble: 5 x 2
month n
<int> <int>
1 5 31
2 7 31
3 8 31
4 6 30
5 9 30
df %>% group_by(month) %>%
mutate(days = n())
# A tibble: 153 x 5
# Groups: month [5]
day month temp wind days
<int> <int> <int> <dbl> <int>
1 1 5 67 7.4 31
2 2 5 72 8 31
3 3 5 74 12.6 31
4 4 5 62 11.5 31
5 5 5 56 14.3 31
6 6 5 66 14.9 31
7 7 5 65 8.6 31
8 8 5 59 13.8 31
9 9 5 61 20.1 31
10 10 5 69 8.6 31
# … with 143 more rows
df %>% sample_n(., size = 6) # same as sample_n(df, 6)
# A tibble: 6 x 4
day month temp wind
<int> <int> <int> <dbl>
1 2 7 85 9.2
2 31 5 76 7.4
3 13 7 81 14.9
4 13 9 77 6.3
5 7 5 65 8.6
6 14 8 80 9.7
df %>% distinct(month)
# A tibble: 5 x 1
month
<int>
1 5
2 6
3 7
4 8
5 9
df %>% mutate(cond1 = if_else(condition = temp > 65,
true = "warm",
false = "cold",
missing = "NA")) %>%
mutate(cond2 = if_else(wind >10,1,0))
# A tibble: 153 x 6
day month temp wind cond1 cond2
<int> <int> <int> <dbl> <chr> <dbl>
1 1 5 67 7.4 warm 0
2 2 5 72 8 warm 0
3 3 5 74 12.6 warm 1
4 4 5 62 11.5 cold 1
5 5 5 56 14.3 cold 1
6 6 5 66 14.9 warm 1
7 7 5 65 8.6 cold 0
8 8 5 59 13.8 cold 1
9 9 5 61 20.1 cold 1
10 10 5 69 8.6 warm 0
# … with 143 more rows
df2 <- df %>% mutate(pt = percent_rank(temp),
rt = min_rank(temp))
df2
# A tibble: 153 x 6
day month temp wind pt rt
<int> <int> <int> <dbl> <dbl> <int>
1 1 5 67 7.4 0.138 22
2 2 5 72 8 0.237 37
3 3 5 74 12.6 0.289 45
4 4 5 62 11.5 0.0724 12
5 5 5 56 14.3 0 1
6 6 5 66 14.9 0.118 19
7 7 5 65 8.6 0.105 17
8 8 5 59 13.8 0.0395 7
9 9 5 61 20.1 0.0526 9
10 10 5 69 8.6 0.191 30
# … with 143 more rows
df2 %>% filter(pt > 0.95) %>%
arrange(-pt)
# A tibble: 7 x 6
day month temp wind pt rt
<int> <int> <int> <dbl> <dbl> <int>
1 28 8 97 9.7 1. 153
2 30 8 96 6.3 0.993 152
3 29 8 94 2.3 0.980 150
4 31 8 94 6.3 0.980 150
5 11 6 93 10.9 0.961 147
6 3 9 93 2.8 0.961 147
7 4 9 93 4.6 0.961 147
df %>% mutate(w_class = ntile(wind, 3))
# A tibble: 153 x 5
day month temp wind w_class
<int> <int> <int> <dbl> <int>
1 1 5 67 7.4 1
2 2 5 72 8 1
3 3 5 74 12.6 3
4 4 5 62 11.5 2
5 5 5 56 14.3 3
6 6 5 66 14.9 3
7 7 5 65 8.6 2
8 8 5 59 13.8 3
9 9 5 61 20.1 3
10 10 5 69 8.6 2
# … with 143 more rows
df %>% mutate(w_class = ntile(wind, 3)) %>%
group_by(w_class) %>%
summarise(tavg = mean(temp, na.rm =T))
# A tibble: 3 x 2
w_class tavg
<int> <dbl>
1 1 82.7
2 2 77.1
3 3 73.8
Using lag()
is more convenient than diff()
because for n inputs diff() returns n - 1 outputs.
df %>% mutate(wind_lag = lag(wind),
wind_lead = lead(wind)) %>%
mutate(wind_diff = c(NA,diff(wind)),
wind_diff2 = wind - wind_lag)
# A tibble: 153 x 8
day month temp wind wind_lag wind_lead wind_diff wind_diff2
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5 67 7.4 NA 8 NA NA
2 2 5 72 8 7.4 12.6 0.600 0.600
3 3 5 74 12.6 8 11.5 4.6 4.6
4 4 5 62 11.5 12.6 14.3 -1.10 -1.10
5 5 5 56 14.3 11.5 14.9 2.8 2.8
6 6 5 66 14.9 14.3 8.6 0.600 0.600
7 7 5 65 8.6 14.9 13.8 -6.3 -6.3
8 8 5 59 13.8 8.6 20.1 5.2 5.2
9 9 5 61 20.1 13.8 8.6 6.3 6.3
10 10 5 69 8.6 20.1 6.9 -11.5 -11.5
# … with 143 more rows
df %>% mutate(temp_lag3 = lag(temp,3),
temp_lag5 = lag(temp,5))
# A tibble: 153 x 6
day month temp wind temp_lag3 temp_lag5
<int> <int> <int> <dbl> <int> <int>
1 1 5 67 7.4 NA NA
2 2 5 72 8 NA NA
3 3 5 74 12.6 NA NA
4 4 5 62 11.5 67 NA
5 5 5 56 14.3 72 NA
6 6 5 66 14.9 74 67
7 7 5 65 8.6 62 72
8 8 5 59 13.8 56 74
9 9 5 61 20.1 66 62
10 10 5 69 8.6 65 56
# … with 143 more rows
rollapply()
is a powerful function in the zoo
-package, e.g. to calculate moving averages or using other functions for windowed calculations.
df %>% mutate(temp_5c = rollapply(temp, 5, FUN = "mean", align = "center", fill = NA)) %>%
mutate(temp_5r = rollapply(temp, 5, FUN = "mean", align = "right", fill = "-999"))
# A tibble: 153 x 6
day month temp wind temp_5c temp_5r
<int> <int> <int> <dbl> <dbl> <dbl>
1 1 5 67 7.4 NA -999
2 2 5 72 8 NA -999
3 3 5 74 12.6 66.2 -999
4 4 5 62 11.5 66 -999
5 5 5 56 14.3 64.6 66.2
6 6 5 66 14.9 61.6 66
7 7 5 65 8.6 61.4 64.6
8 8 5 59 13.8 64 61.6
9 9 5 61 20.1 65.6 61.4
10 10 5 69 8.6 66.4 64
# … with 143 more rows
df %>% mutate(wind_max3 = rollapply(wind, 3, FUN = "max", align = "right", fill = NA))
# A tibble: 153 x 5
day month temp wind wind_max3
<int> <int> <int> <dbl> <dbl>
1 1 5 67 7.4 NA
2 2 5 72 8 NA
3 3 5 74 12.6 12.6
4 4 5 62 11.5 12.6
5 5 5 56 14.3 14.3
6 6 5 66 14.9 14.9
7 7 5 65 8.6 14.9
8 8 5 59 13.8 14.9
9 9 5 61 20.1 20.1
10 10 5 69 8.6 20.1
# … with 143 more rows
cv <- function(x) sd(x) / mean(x)
df %>% mutate(wind_cv = rollapply(wind, 5, FUN = "cv", align = "right", fill = NA))
# A tibble: 153 x 5
day month temp wind wind_cv
<int> <int> <int> <dbl> <dbl>
1 1 5 67 7.4 NA
2 2 5 72 8 NA
3 3 5 74 12.6 NA
4 4 5 62 11.5 NA
5 5 5 56 14.3 0.276
6 6 5 66 14.9 0.223
7 7 5 65 8.6 0.203
8 8 5 59 13.8 0.205
9 9 5 61 20.1 0.285
10 10 5 69 8.6 0.366
# … with 143 more rows