Data analysis: Tidy tools

Let’s have a look in some very powerful function in the tidyverse package that are important for data wrangling.


library(tidyverse)
library(lubridate)
library(zoo)
theme_set(theme_light(20))

Load some data

Daily air quality measurements in New York, May to September 1973. Temp: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport.


df <- airquality %>% 
        as_tibble() %>% 
        select(day = Day, month = Month, temp = Temp, wind = Wind)
df


# A tibble: 153 x 4
     day month  temp  wind
   <int> <int> <int> <dbl>
 1     1     5    67   7.4
 2     2     5    72   8  
 3     3     5    74  12.6
 4     4     5    62  11.5
 5     5     5    56  14.3
 6     6     5    66  14.9
 7     7     5    65   8.6
 8     8     5    59  13.8
 9     9     5    61  20.1
10    10     5    69   8.6
# … with 143 more rows

count()


df %>% count(month, sort = TRUE)


# A tibble: 5 x 2
  month     n
  <int> <int>
1     5    31
2     7    31
3     8    31
4     6    30
5     9    30


df %>% group_by(month) %>% 
        mutate(days = n())


# A tibble: 153 x 5
# Groups:   month [5]
     day month  temp  wind  days
   <int> <int> <int> <dbl> <int>
 1     1     5    67   7.4    31
 2     2     5    72   8      31
 3     3     5    74  12.6    31
 4     4     5    62  11.5    31
 5     5     5    56  14.3    31
 6     6     5    66  14.9    31
 7     7     5    65   8.6    31
 8     8     5    59  13.8    31
 9     9     5    61  20.1    31
10    10     5    69   8.6    31
# … with 143 more rows

Take a sample


df %>% sample_n(., size = 6) # same as sample_n(df, 6)


# A tibble: 6 x 4
    day month  temp  wind
  <int> <int> <int> <dbl>
1     2     7    85   9.2
2    31     5    76   7.4
3    13     7    81  14.9
4    13     9    77   6.3
5     7     5    65   8.6
6    14     8    80   9.7

Find distinct values in variable


df %>% distinct(month)


# A tibble: 5 x 1
  month
  <int>
1     5
2     6
3     7
4     8
5     9

If_else


df %>% mutate(cond1 = if_else(condition = temp > 65,
                              true = "warm",
                              false = "cold",
                              missing = "NA")) %>% 
        mutate(cond2 = if_else(wind >10,1,0))


# A tibble: 153 x 6
     day month  temp  wind cond1 cond2
   <int> <int> <int> <dbl> <chr> <dbl>
 1     1     5    67   7.4 warm      0
 2     2     5    72   8   warm      0
 3     3     5    74  12.6 warm      1
 4     4     5    62  11.5 cold      1
 5     5     5    56  14.3 cold      1
 6     6     5    66  14.9 warm      1
 7     7     5    65   8.6 cold      0
 8     8     5    59  13.8 cold      1
 9     9     5    61  20.1 cold      1
10    10     5    69   8.6 warm      0
# … with 143 more rows

Windowed rank functions

Relative and absolute ranks


df2 <- df %>%  mutate(pt = percent_rank(temp),
               rt = min_rank(temp))

df2


# A tibble: 153 x 6
     day month  temp  wind     pt    rt
   <int> <int> <int> <dbl>  <dbl> <int>
 1     1     5    67   7.4 0.138     22
 2     2     5    72   8   0.237     37
 3     3     5    74  12.6 0.289     45
 4     4     5    62  11.5 0.0724    12
 5     5     5    56  14.3 0          1
 6     6     5    66  14.9 0.118     19
 7     7     5    65   8.6 0.105     17
 8     8     5    59  13.8 0.0395     7
 9     9     5    61  20.1 0.0526     9
10    10     5    69   8.6 0.191     30
# … with 143 more rows


df2 %>% filter(pt > 0.95) %>% 
        arrange(-pt)


# A tibble: 7 x 6
    day month  temp  wind    pt    rt
  <int> <int> <int> <dbl> <dbl> <int>
1    28     8    97   9.7 1.      153
2    30     8    96   6.3 0.993   152
3    29     8    94   2.3 0.980   150
4    31     8    94   6.3 0.980   150
5    11     6    93  10.9 0.961   147
6     3     9    93   2.8 0.961   147
7     4     9    93   4.6 0.961   147

n Buckets


df %>% mutate(w_class = ntile(wind, 3))


# A tibble: 153 x 5
     day month  temp  wind w_class
   <int> <int> <int> <dbl>   <int>
 1     1     5    67   7.4       1
 2     2     5    72   8         1
 3     3     5    74  12.6       3
 4     4     5    62  11.5       2
 5     5     5    56  14.3       3
 6     6     5    66  14.9       3
 7     7     5    65   8.6       2
 8     8     5    59  13.8       3
 9     9     5    61  20.1       3
10    10     5    69   8.6       2
# … with 143 more rows


df %>% mutate(w_class = ntile(wind, 3)) %>% 
        group_by(w_class) %>% 
        summarise(tavg = mean(temp, na.rm =T))


# A tibble: 3 x 2
  w_class  tavg
    <int> <dbl>
1       1  82.7
2       2  77.1
3       3  73.8

lag() and lead()

Using lag() is more convenient than diff() because for n inputs diff() returns n - 1 outputs.


df %>% mutate(wind_lag = lag(wind), 
              wind_lead = lead(wind)) %>% 
       mutate(wind_diff = c(NA,diff(wind)),
              wind_diff2 = wind - wind_lag)


# A tibble: 153 x 8
     day month  temp  wind wind_lag wind_lead wind_diff wind_diff2
   <int> <int> <int> <dbl>    <dbl>     <dbl>     <dbl>      <dbl>
 1     1     5    67   7.4     NA         8      NA         NA    
 2     2     5    72   8        7.4      12.6     0.600      0.600
 3     3     5    74  12.6      8        11.5     4.6        4.6  
 4     4     5    62  11.5     12.6      14.3    -1.10      -1.10 
 5     5     5    56  14.3     11.5      14.9     2.8        2.8  
 6     6     5    66  14.9     14.3       8.6     0.600      0.600
 7     7     5    65   8.6     14.9      13.8    -6.3       -6.3  
 8     8     5    59  13.8      8.6      20.1     5.2        5.2  
 9     9     5    61  20.1     13.8       8.6     6.3        6.3  
10    10     5    69   8.6     20.1       6.9   -11.5      -11.5  
# … with 143 more rows


df %>% mutate(temp_lag3 = lag(temp,3),
              temp_lag5 = lag(temp,5))


# A tibble: 153 x 6
     day month  temp  wind temp_lag3 temp_lag5
   <int> <int> <int> <dbl>     <int>     <int>
 1     1     5    67   7.4        NA        NA
 2     2     5    72   8          NA        NA
 3     3     5    74  12.6        NA        NA
 4     4     5    62  11.5        67        NA
 5     5     5    56  14.3        72        NA
 6     6     5    66  14.9        74        67
 7     7     5    65   8.6        62        72
 8     8     5    59  13.8        56        74
 9     9     5    61  20.1        66        62
10    10     5    69   8.6        65        56
# … with 143 more rows

rollapply

rollapply() is a powerful function in the zoo-package, e.g. to calculate moving averages or using other functions for windowed calculations.


df %>% mutate(temp_5c = rollapply(temp, 5, FUN = "mean", align = "center", fill = NA)) %>% 
    mutate(temp_5r = rollapply(temp, 5, FUN = "mean", align = "right", fill = "-999"))


# A tibble: 153 x 6
     day month  temp  wind temp_5c temp_5r
   <int> <int> <int> <dbl>   <dbl>   <dbl>
 1     1     5    67   7.4    NA    -999  
 2     2     5    72   8      NA    -999  
 3     3     5    74  12.6    66.2  -999  
 4     4     5    62  11.5    66    -999  
 5     5     5    56  14.3    64.6    66.2
 6     6     5    66  14.9    61.6    66  
 7     7     5    65   8.6    61.4    64.6
 8     8     5    59  13.8    64      61.6
 9     9     5    61  20.1    65.6    61.4
10    10     5    69   8.6    66.4    64  
# … with 143 more rows


df %>% mutate(wind_max3 = rollapply(wind, 3, FUN = "max", align = "right", fill = NA))


# A tibble: 153 x 5
     day month  temp  wind wind_max3
   <int> <int> <int> <dbl>     <dbl>
 1     1     5    67   7.4      NA  
 2     2     5    72   8        NA  
 3     3     5    74  12.6      12.6
 4     4     5    62  11.5      12.6
 5     5     5    56  14.3      14.3
 6     6     5    66  14.9      14.9
 7     7     5    65   8.6      14.9
 8     8     5    59  13.8      14.9
 9     9     5    61  20.1      20.1
10    10     5    69   8.6      20.1
# … with 143 more rows


cv <- function(x) sd(x) / mean(x)
df %>% mutate(wind_cv = rollapply(wind, 5, FUN = "cv", align = "right", fill = NA))


# A tibble: 153 x 5
     day month  temp  wind wind_cv
   <int> <int> <int> <dbl>   <dbl>
 1     1     5    67   7.4  NA    
 2     2     5    72   8    NA    
 3     3     5    74  12.6  NA    
 4     4     5    62  11.5  NA    
 5     5     5    56  14.3   0.276
 6     6     5    66  14.9   0.223
 7     7     5    65   8.6   0.203
 8     8     5    59  13.8   0.205
 9     9     5    61  20.1   0.285
10    10     5    69   8.6   0.366
# … with 143 more rows