Tidyverse: R code

Principles of graph grammar and tidy data

logo

All code I used to build the slides can be found here. Making data and code available as supplementary material promotes transparency and reproducibility, enabling anyone to reproduce the methodology discussed during the lecture.

Data Structure

Pivoting: ‘long form’

dataEx1 <- readRDS("./data/dataEx1.RDS")


dataEx1.1 <- dataEx1 %>%
  rownames_to_column(var = "Farm") %>%
  pivot_longer(cols = 2:4, names_to = "Income", values_to = "Freq")

Pivoting: ‘wide form’

dataEx2 <- as_tibble(readRDS("./data/dataEx2.RDS"))

dataEx2.2 <- dataEx2 %>%
  pivot_wider(values_from = value, names_from = type)

I <- dataEx2 %>%
  pivot_wider(values_from = value, names_from = type) %>%
  transform(id = str_replace(id,"Ind",""))

Filter

cbp <- readRDS("./data/animal_sim.RDS")

threshold <- with(cbp, mean(phenotype) + 2*sd(phenotype))
cbp %>%
  filter(herd %in% c("A", "E")) %>%
  filter(year == "9" & phenotype > threshold) %>%
  ggplot(aes(y = phenotype, x = sex)) +
  geom_violin() +
  geom_jitter(shape = 1, width = 0.15)

cbp %>%
  filter(herd %in% c("A", "E")) %>%
  filter(year == "9" & (phenotype > threshold | sex == "F")) %>%
  ggplot(aes(y = phenotype, x = sex)) +
  geom_violin() +
  geom_jitter(shape = 1, width = 0.15)

ggsave("filterData2.pdf", width = 4, height = 4)

Distinct

cbp %>%
  distinct(dplyr::across(contains("r")))
## # A tibble: 8,186 × 4
##    father mother year  herd 
##     <dbl>  <dbl> <fct> <fct>
##  1     NA     NA 0     E    
##  2     NA     NA 0     B    
##  3     NA     NA 0     A    
##  4     NA     NA 0     D    
##  5     NA     NA 0     C    
##  6    418    692 1     E    
##  7    461    614 1     B    
##  8    195    524 1     A    
##  9    198    768 1     D    
## 10    122    537 1     A    
## # … with 8,176 more rows
cbp %>%
  distinct(sex)
## # A tibble: 2 × 1
##   sex  
##   <fct>
## 1 M    
## 2 F

Slice

cbp %>% slice(1:3)
## # A tibble: 3 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1     1     NA     NA 0     M          37.7 E    
## 2     2     NA     NA 0     M          35.8 B    
## 3     3     NA     NA 0     M          28.4 A
cbp %>% slice((n()-5L):n())
## # A tibble: 6 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1  9995   8463   8849 9     F          71.3 E    
## 2  9996   8013   8628 9     F          61.9 E    
## 3  9997   8342   8677 9     F          58.4 A    
## 4  9998   8088   8510 9     F          58.2 E    
## 5  9999   8449   8685 9     F          56.7 D    
## 6 10000   8296   8854 9     F          67.1 B
cbp %>% slice_sample(n = 5)
## # A tibble: 5 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1  9516   8410   8794 9     F          58.3 C    
## 2  6592   5280   5501 6     F          47.2 A    
## 3  3973   2372   2794 3     F          35.6 E    
## 4  5629   4404   4834 5     F          45.7 B    
## 5  6441   5318   5840 6     M          56.7 B
cbp %>% slice_min(phenotype, n = 3)
## # A tibble: 3 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1  1331    148    840 1     M          15.3 B    
## 2  1316     62    551 1     M          20.2 D    
## 3  1992    354    523 1     F          20.3 C

Arrange

cbp %>% arrange(desc(phenotype)) %>% slice_head(n=5)
## # A tibble: 5 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1  9069   8366   8725 9     M          82.1 C    
## 2  9979   8305   8994 9     F          80.0 B    
## 3  9492   8346   8985 9     M          79.1 B    
## 4  9934   8470   8776 9     F          78.9 A    
## 5  8650   7117   7726 8     F          78.6 B
cbp %>% arrange(desc(herd)) %>% slice_head(n=5)
## # A tibble: 5 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1     1     NA     NA 0     M          37.7 E    
## 2     8     NA     NA 0     M          33.3 E    
## 3     9     NA     NA 0     M          39.0 E    
## 4    17     NA     NA 0     M          42.5 E    
## 5    23     NA     NA 0     M          51.9 E
cbp %>% arrange(sex, phenotype, herd) %>% slice_head(n=7)
## # A tibble: 7 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1  1992    354    523 1     F          20.3 C    
## 2   618     NA     NA 0     F          20.8 C    
## 3  1830    378    900 1     F          20.8 A    
## 4   557     NA     NA 0     F          21.5 A    
## 5  1663    102    983 1     F          21.9 B    
## 6   800     NA     NA 0     F          22.2 A    
## 7  1645     47    541 1     F          22.9 D
cbp %>% arrange(herd, phenotype, sex) %>% slice_head(n=7)
## # A tibble: 7 × 7
##     ind father mother year  sex   phenotype herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
## 1  1830    378    900 1     F          20.8 A    
## 2   557     NA     NA 0     F          21.5 A    
## 3    95     NA     NA 0     M          22.2 A    
## 4   800     NA     NA 0     F          22.2 A    
## 5  1592    440    667 1     F          23.0 A    
## 6  1192    355    642 1     M          24.7 A    
## 7  1671    448    953 1     F          25.0 A

Add row

df <- tibble(x = 1:4, y = 20:23)
df %>% add_row(x = 3, y =21)
## # A tibble: 5 × 2
##       x     y
##   <dbl> <dbl>
## 1     1    20
## 2     2    21
## 3     3    22
## 4     4    23
## 5     3    21
df %>% add_row(x = 3, y =21, .before = 4)
## # A tibble: 5 × 2
##       x     y
##   <dbl> <dbl>
## 1     1    20
## 2     2    21
## 3     3    22
## 4     3    21
## 5     4    23
df %>% add_row(x = 3, .before = 4)
## # A tibble: 5 × 2
##       x     y
##   <dbl> <int>
## 1     1    20
## 2     2    21
## 3     3    22
## 4     3    NA
## 5     4    23

Pull

df %>% pull(var = x)
## [1] 1 2 3 4
df %>% pull(var = x) %>% sum()
## [1] 10

Select

cbp %>%
  select(ind:mother) %>% slice_head(n=5)
## # A tibble: 5 × 3
##     ind father mother
##   <dbl>  <dbl>  <dbl>
## 1     1     NA     NA
## 2     2     NA     NA
## 3     3     NA     NA
## 4     4     NA     NA
## 5     5     NA     NA
cbp %>%
  select(starts_with(c("p","m"))| ends_with("r")) %>% 
  slice_head(n=5)
## # A tibble: 5 × 4
##   phenotype mother father year 
##       <dbl>  <dbl>  <dbl> <fct>
## 1      37.7     NA     NA 0    
## 2      35.8     NA     NA 0    
## 3      28.4     NA     NA 0    
## 4      33.6     NA     NA 0    
## 5      32.9     NA     NA 0
cbp %>%
  select(starts_with(c("p","m")) & !ends_with("r")) %>% 
  slice_head(n=5)
## # A tibble: 5 × 1
##   phenotype
##       <dbl>
## 1      37.7
## 2      35.8
## 3      28.4
## 4      33.6
## 5      32.9
cbp %>% select(contains("a")) %>% slice_head(n=5)
## # A tibble: 5 × 2
##   father year 
##    <dbl> <fct>
## 1     NA 0    
## 2     NA 0    
## 3     NA 0    
## 4     NA 0    
## 5     NA 0

Relocate

cbp %>%
  relocate(mother, .before = father) %>%
  relocate(year, herd, .after = sex)
## # A tibble: 10,000 × 7
##      ind mother father sex   year  herd  phenotype
##    <dbl>  <dbl>  <dbl> <fct> <fct> <fct>     <dbl>
##  1     1     NA     NA M     0     E          37.7
##  2     2     NA     NA M     0     B          35.8
##  3     3     NA     NA M     0     A          28.4
##  4     4     NA     NA M     0     D          33.6
##  5     5     NA     NA M     0     A          32.9
##  6     6     NA     NA M     0     A          31.6
##  7     7     NA     NA M     0     A          38.8
##  8     8     NA     NA M     0     E          33.3
##  9     9     NA     NA M     0     E          39.0
## 10    10     NA     NA M     0     C          46.0
## # … with 9,990 more rows
cbp %>%
  relocate(where(is.factor), 
           .after = last_col())
## # A tibble: 10,000 × 7
##      ind father mother phenotype year  sex   herd 
##    <dbl>  <dbl>  <dbl>     <dbl> <fct> <fct> <fct>
##  1     1     NA     NA      37.7 0     M     E    
##  2     2     NA     NA      35.8 0     M     B    
##  3     3     NA     NA      28.4 0     M     A    
##  4     4     NA     NA      33.6 0     M     D    
##  5     5     NA     NA      32.9 0     M     A    
##  6     6     NA     NA      31.6 0     M     A    
##  7     7     NA     NA      38.8 0     M     A    
##  8     8     NA     NA      33.3 0     M     E    
##  9     9     NA     NA      39.0 0     M     E    
## 10    10     NA     NA      46.0 0     M     C    
## # … with 9,990 more rows

Mutate

cbp %>% select(sex, phenotype) %>%
  dplyr::mutate(logPheno = log10(phenotype)) %>% 
  slice_head(n=5)
## # A tibble: 5 × 3
##   sex   phenotype logPheno
##   <fct>     <dbl>    <dbl>
## 1 M          37.7     1.58
## 2 M          35.8     1.55
## 3 M          28.4     1.45
## 4 M          33.6     1.53
## 5 M          32.9     1.52
cbp %>% select(sex, phenotype) %>%
  dplyr::mutate(rankPheno = min_rank(desc(phenotype))) %>%
  dplyr::arrange(rankPheno) %>% slice_head(n=5)
## # A tibble: 5 × 3
##   sex   phenotype rankPheno
##   <fct>     <dbl>     <int>
## 1 M          82.1         1
## 2 F          80.0         2
## 3 M          79.1         3
## 4 F          78.9         4
## 5 F          78.6         5
cbp %>%
  dplyr::mutate(dplyr::across(!phenotype, as.factor))%>% 
  dplyr::mutate(phenotype = NULL) %>%
  slice_head(n=5)
## # A tibble: 5 × 6
##   ind   father mother year  sex   herd 
##   <fct> <fct>  <fct>  <fct> <fct> <fct>
## 1 1     <NA>   <NA>   0     M     E    
## 2 2     <NA>   <NA>   0     M     B    
## 3 3     <NA>   <NA>   0     M     A    
## 4 4     <NA>   <NA>   0     M     D    
## 5 5     <NA>   <NA>   0     M     A
cbp %>% select(sex, herd, phenotype) %>%
  dplyr::mutate(herdSex = sex:herd) %>%
  relocate(herdSex, .before = phenotype) %>% 
  slice_head(n=5)
## # A tibble: 5 × 4
##   sex   herd  herdSex phenotype
##   <fct> <fct> <fct>       <dbl>
## 1 M     E     M:E          37.7
## 2 M     B     M:B          35.8
## 3 M     A     M:A          28.4
## 4 M     D     M:D          33.6
## 5 M     A     M:A          32.9

Transmute

cbp %>% select(sex, phenotype) %>%
  dplyr::transmute(logPheno = log10(phenotype)) %>% 
  slice_head(n=5)
## # A tibble: 5 × 1
##   logPheno
##      <dbl>
## 1     1.58
## 2     1.55
## 3     1.45
## 4     1.53
## 5     1.52
cbp %>% select(sex, phenotype) %>%
  dplyr::transmute(rankPheno = min_rank(desc(phenotype))) %>%
  dplyr::arrange(rankPheno) %>% slice_head(n=5)
## # A tibble: 5 × 1
##   rankPheno
##       <int>
## 1         1
## 2         2
## 3         3
## 4         4
## 5         5
cbp %>%
  dplyr::transmute(dplyr::across(!phenotype, as.factor))%>% 
  dplyr::mutate(phenotype = NULL) %>%
  slice_head(n=5)
## # A tibble: 5 × 6
##   ind   father mother year  sex   herd 
##   <fct> <fct>  <fct>  <fct> <fct> <fct>
## 1 1     <NA>   <NA>   0     M     E    
## 2 2     <NA>   <NA>   0     M     B    
## 3 3     <NA>   <NA>   0     M     A    
## 4 4     <NA>   <NA>   0     M     D    
## 5 5     <NA>   <NA>   0     M     A
cbp %>% select(sex, herd, phenotype) %>%
  dplyr::transmute(herdSex = sex:herd) %>%
  slice_head(n=5)
## # A tibble: 5 × 1
##   herdSex
##   <fct>  
## 1 M:E    
## 2 M:B    
## 3 M:A    
## 4 M:D    
## 5 M:A

Across

cbp %>%
  dplyr::mutate(across(phenotype, round, 2))
## # A tibble: 10,000 × 7
##      ind father mother year  sex   phenotype herd 
##    <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct>
##  1     1     NA     NA 0     M          37.7 E    
##  2     2     NA     NA 0     M          35.8 B    
##  3     3     NA     NA 0     M          28.4 A    
##  4     4     NA     NA 0     M          33.6 D    
##  5     5     NA     NA 0     M          32.9 A    
##  6     6     NA     NA 0     M          31.6 A    
##  7     7     NA     NA 0     M          38.8 A    
##  8     8     NA     NA 0     M          33.3 E    
##  9     9     NA     NA 0     M          39.0 E    
## 10    10     NA     NA 0     M          46.0 C    
## # … with 9,990 more rows
# Centering variable
fun <- function(x, na.rm = TRUE){
  x - mean(x, na.rm = na.rm)
}
cbp %>%
  dplyr::mutate(year2 = as.numeric(year)-1) %>%
  dplyr::mutate(across(c(year2, phenotype), fun))
## # A tibble: 10,000 × 8
##      ind father mother year  sex   phenotype herd  year2
##    <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct> <dbl>
##  1     1     NA     NA 0     M        -12.3  E      -4.5
##  2     2     NA     NA 0     M        -14.2  B      -4.5
##  3     3     NA     NA 0     M        -21.6  A      -4.5
##  4     4     NA     NA 0     M        -16.4  D      -4.5
##  5     5     NA     NA 0     M        -17.1  A      -4.5
##  6     6     NA     NA 0     M        -18.4  A      -4.5
##  7     7     NA     NA 0     M        -11.2  A      -4.5
##  8     8     NA     NA 0     M        -16.7  E      -4.5
##  9     9     NA     NA 0     M        -11.0  E      -4.5
## 10    10     NA     NA 0     M         -4.02 C      -4.5
## # … with 9,990 more rows

Rename

cbp %>%
  rename(Pheno = phenotype) %>% slice_head(n=5)
## # A tibble: 5 × 7
##     ind father mother year  sex   Pheno herd 
##   <dbl>  <dbl>  <dbl> <fct> <fct> <dbl> <fct>
## 1     1     NA     NA 0     M      37.7 E    
## 2     2     NA     NA 0     M      35.8 B    
## 3     3     NA     NA 0     M      28.4 A    
## 4     4     NA     NA 0     M      33.6 D    
## 5     5     NA     NA 0     M      32.9 A
cbp %>%
  rename_with(~(toupper(gsub("r", "r2", .x, fixed = TRUE)))) %>%
  slice_head(n=5)
## # A tibble: 5 × 7
##     IND FATHER2 MOTHER2 YEAR2 SEX   PHENOTYPE HER2D
##   <dbl>   <dbl>   <dbl> <fct> <fct>     <dbl> <fct>
## 1     1      NA      NA 0     M          37.7 E    
## 2     2      NA      NA 0     M          35.8 B    
## 3     3      NA      NA 0     M          28.4 A    
## 4     4      NA      NA 0     M          33.6 D    
## 5     5      NA      NA 0     M          32.9 A

Group by

# Centering variable
fun <- function(x, na.rm = TRUE){
  x - mean(x, na.rm = na.rm)
}
cbp %>%
  group_by(herd) %>%
  dplyr::mutate(year2 = as.numeric(year)-1) %>%
  dplyr::mutate(across(c(year2, phenotype), fun))
## # A tibble: 10,000 × 8
## # Groups:   herd [5]
##      ind father mother year  sex   phenotype herd  year2
##    <dbl>  <dbl>  <dbl> <fct> <fct>     <dbl> <fct> <dbl>
##  1     1     NA     NA 0     M        -12.4  E      -4.5
##  2     2     NA     NA 0     M        -14.2  B      -4.5
##  3     3     NA     NA 0     M        -21.6  A      -4.5
##  4     4     NA     NA 0     M        -16.2  D      -4.5
##  5     5     NA     NA 0     M        -17.2  A      -4.5
##  6     6     NA     NA 0     M        -18.4  A      -4.5
##  7     7     NA     NA 0     M        -11.2  A      -4.5
##  8     8     NA     NA 0     M        -16.8  E      -4.5
##  9     9     NA     NA 0     M        -11.1  E      -4.5
## 10    10     NA     NA 0     M         -4.11 C      -4.5
## # … with 9,990 more rows

Statistics Summary

Summarise

s1 <- cbp %>%
  group_by(year, sex, herd) %>%
  summarise(
    mean = mean(phenotype),
    median = median(phenotype),
    min = min(phenotype),
    max = max(phenotype),
    IQR = IQR(phenotype),
    sd = sd(phenotype),
    var = sd^2,
    n = n()
  ) %>%
  dplyr::mutate(across(where(is.numeric), round,1))

s1 %>%
  ggplot(aes(y=mean, x =year)) +
  geom_point(aes(shape =herd, colour = herd)) +
  facet_wrap(~sex) +
  labs(x = "Generation", y = "Milk Yield (kg/day",
       colour = "Herd", shape = "Herd") +
  theme_bw(base_size = 13)

ggsave("milk.pdf", width = 5, height = 3)
s1 %>%
  filter(year %in% c("8","9") & 
           max > 76 & min >40)
## # A tibble: 7 × 11
## # Groups:   year, sex [4]
##   year  sex   herd   mean median   min   max   IQR    sd   var     n
##   <fct> <fct> <fct> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8     F     B      58.3   57.9  41.7  78.6   8.9   6.6  43.2   114
## 2 8     M     C      59.7   59.9  45.3  77.6   7.3   6    35.8   108
## 3 8     M     D      59.2   59.1  45.1  76.7   7.7   6.3  39.5   106
## 4 9     F     E      59.6   60.1  42.9  77.4   7.5   6.3  40.3    99
## 5 9     M     B      61.1   60.7  45    79.1  11.1   7.2  51.9    86
## 6 9     M     C      61.2   61    42.4  82.1   8.5   6.5  41.8   108
## 7 9     M     E      60.8   59.6  45.1  77.7   8.4   6.3  39.9   101
s1 %>%
  filter(year %in% c("8","9") & 
           max > 76 & min >40) %>%
  transmute(CV = sd / mean, 
            CV = scales::percent(CV)) %>%
  dplyr::mutate(across(where(is.numeric), round,1))
## # A tibble: 7 × 3
## # Groups:   year, sex [4]
##   year  sex   CV    
##   <fct> <fct> <chr> 
## 1 8     F     11%   
## 2 8     M     10.05%
## 3 8     M     10.64%
## 4 9     F     11%   
## 5 9     M     11.78%
## 6 9     M     10.62%
## 7 9     M     10.36%
Thiago de Paula Oliveira
Thiago de Paula Oliveira
Consultant Statistician

My research interests include statistical modelling, agriculture, genetics, and sports.