Tidyverse: R code
Principles of graph grammar and tidy data
All code I used to build the slides can be found here. Making data and code available as supplementary material promotes transparency and reproducibility, enabling anyone to reproduce the methodology discussed during the lecture.
Data Structure
Pivoting: ‘long form’
<- readRDS("./data/dataEx1.RDS")
dataEx1
.1 <- dataEx1 %>%
dataEx1rownames_to_column(var = "Farm") %>%
pivot_longer(cols = 2:4, names_to = "Income", values_to = "Freq")
Pivoting: ‘wide form’
<- as_tibble(readRDS("./data/dataEx2.RDS"))
dataEx2
.2 <- dataEx2 %>%
dataEx2pivot_wider(values_from = value, names_from = type)
<- dataEx2 %>%
I pivot_wider(values_from = value, names_from = type) %>%
transform(id = str_replace(id,"Ind",""))
Filter
<- readRDS("./data/animal_sim.RDS")
cbp
<- with(cbp, mean(phenotype) + 2*sd(phenotype))
threshold %>%
cbp filter(herd %in% c("A", "E")) %>%
filter(year == "9" & phenotype > threshold) %>%
ggplot(aes(y = phenotype, x = sex)) +
geom_violin() +
geom_jitter(shape = 1, width = 0.15)
%>%
cbp filter(herd %in% c("A", "E")) %>%
filter(year == "9" & (phenotype > threshold | sex == "F")) %>%
ggplot(aes(y = phenotype, x = sex)) +
geom_violin() +
geom_jitter(shape = 1, width = 0.15)
ggsave("filterData2.pdf", width = 4, height = 4)
Distinct
%>%
cbp distinct(dplyr::across(contains("r")))
## # A tibble: 8,186 × 4
## father mother year herd
## <dbl> <dbl> <fct> <fct>
## 1 NA NA 0 E
## 2 NA NA 0 B
## 3 NA NA 0 A
## 4 NA NA 0 D
## 5 NA NA 0 C
## 6 418 692 1 E
## 7 461 614 1 B
## 8 195 524 1 A
## 9 198 768 1 D
## 10 122 537 1 A
## # … with 8,176 more rows
%>%
cbp distinct(sex)
## # A tibble: 2 × 1
## sex
## <fct>
## 1 M
## 2 F
Slice
%>% slice(1:3) cbp
## # A tibble: 3 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1 NA NA 0 M 37.7 E
## 2 2 NA NA 0 M 35.8 B
## 3 3 NA NA 0 M 28.4 A
%>% slice((n()-5L):n()) cbp
## # A tibble: 6 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 9995 8463 8849 9 F 71.3 E
## 2 9996 8013 8628 9 F 61.9 E
## 3 9997 8342 8677 9 F 58.4 A
## 4 9998 8088 8510 9 F 58.2 E
## 5 9999 8449 8685 9 F 56.7 D
## 6 10000 8296 8854 9 F 67.1 B
%>% slice_sample(n = 5) cbp
## # A tibble: 5 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 9516 8410 8794 9 F 58.3 C
## 2 6592 5280 5501 6 F 47.2 A
## 3 3973 2372 2794 3 F 35.6 E
## 4 5629 4404 4834 5 F 45.7 B
## 5 6441 5318 5840 6 M 56.7 B
%>% slice_min(phenotype, n = 3) cbp
## # A tibble: 3 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1331 148 840 1 M 15.3 B
## 2 1316 62 551 1 M 20.2 D
## 3 1992 354 523 1 F 20.3 C
Arrange
%>% arrange(desc(phenotype)) %>% slice_head(n=5) cbp
## # A tibble: 5 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 9069 8366 8725 9 M 82.1 C
## 2 9979 8305 8994 9 F 80.0 B
## 3 9492 8346 8985 9 M 79.1 B
## 4 9934 8470 8776 9 F 78.9 A
## 5 8650 7117 7726 8 F 78.6 B
%>% arrange(desc(herd)) %>% slice_head(n=5) cbp
## # A tibble: 5 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1 NA NA 0 M 37.7 E
## 2 8 NA NA 0 M 33.3 E
## 3 9 NA NA 0 M 39.0 E
## 4 17 NA NA 0 M 42.5 E
## 5 23 NA NA 0 M 51.9 E
%>% arrange(sex, phenotype, herd) %>% slice_head(n=7) cbp
## # A tibble: 7 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1992 354 523 1 F 20.3 C
## 2 618 NA NA 0 F 20.8 C
## 3 1830 378 900 1 F 20.8 A
## 4 557 NA NA 0 F 21.5 A
## 5 1663 102 983 1 F 21.9 B
## 6 800 NA NA 0 F 22.2 A
## 7 1645 47 541 1 F 22.9 D
%>% arrange(herd, phenotype, sex) %>% slice_head(n=7) cbp
## # A tibble: 7 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1830 378 900 1 F 20.8 A
## 2 557 NA NA 0 F 21.5 A
## 3 95 NA NA 0 M 22.2 A
## 4 800 NA NA 0 F 22.2 A
## 5 1592 440 667 1 F 23.0 A
## 6 1192 355 642 1 M 24.7 A
## 7 1671 448 953 1 F 25.0 A
Add row
<- tibble(x = 1:4, y = 20:23)
df %>% add_row(x = 3, y =21) df
## # A tibble: 5 × 2
## x y
## <dbl> <dbl>
## 1 1 20
## 2 2 21
## 3 3 22
## 4 4 23
## 5 3 21
%>% add_row(x = 3, y =21, .before = 4) df
## # A tibble: 5 × 2
## x y
## <dbl> <dbl>
## 1 1 20
## 2 2 21
## 3 3 22
## 4 3 21
## 5 4 23
%>% add_row(x = 3, .before = 4) df
## # A tibble: 5 × 2
## x y
## <dbl> <int>
## 1 1 20
## 2 2 21
## 3 3 22
## 4 3 NA
## 5 4 23
Pull
%>% pull(var = x) df
## [1] 1 2 3 4
%>% pull(var = x) %>% sum() df
## [1] 10
Select
%>%
cbp select(ind:mother) %>% slice_head(n=5)
## # A tibble: 5 × 3
## ind father mother
## <dbl> <dbl> <dbl>
## 1 1 NA NA
## 2 2 NA NA
## 3 3 NA NA
## 4 4 NA NA
## 5 5 NA NA
%>%
cbp select(starts_with(c("p","m"))| ends_with("r")) %>%
slice_head(n=5)
## # A tibble: 5 × 4
## phenotype mother father year
## <dbl> <dbl> <dbl> <fct>
## 1 37.7 NA NA 0
## 2 35.8 NA NA 0
## 3 28.4 NA NA 0
## 4 33.6 NA NA 0
## 5 32.9 NA NA 0
%>%
cbp select(starts_with(c("p","m")) & !ends_with("r")) %>%
slice_head(n=5)
## # A tibble: 5 × 1
## phenotype
## <dbl>
## 1 37.7
## 2 35.8
## 3 28.4
## 4 33.6
## 5 32.9
%>% select(contains("a")) %>% slice_head(n=5) cbp
## # A tibble: 5 × 2
## father year
## <dbl> <fct>
## 1 NA 0
## 2 NA 0
## 3 NA 0
## 4 NA 0
## 5 NA 0
Relocate
%>%
cbp relocate(mother, .before = father) %>%
relocate(year, herd, .after = sex)
## # A tibble: 10,000 × 7
## ind mother father sex year herd phenotype
## <dbl> <dbl> <dbl> <fct> <fct> <fct> <dbl>
## 1 1 NA NA M 0 E 37.7
## 2 2 NA NA M 0 B 35.8
## 3 3 NA NA M 0 A 28.4
## 4 4 NA NA M 0 D 33.6
## 5 5 NA NA M 0 A 32.9
## 6 6 NA NA M 0 A 31.6
## 7 7 NA NA M 0 A 38.8
## 8 8 NA NA M 0 E 33.3
## 9 9 NA NA M 0 E 39.0
## 10 10 NA NA M 0 C 46.0
## # … with 9,990 more rows
%>%
cbp relocate(where(is.factor),
.after = last_col())
## # A tibble: 10,000 × 7
## ind father mother phenotype year sex herd
## <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 1 NA NA 37.7 0 M E
## 2 2 NA NA 35.8 0 M B
## 3 3 NA NA 28.4 0 M A
## 4 4 NA NA 33.6 0 M D
## 5 5 NA NA 32.9 0 M A
## 6 6 NA NA 31.6 0 M A
## 7 7 NA NA 38.8 0 M A
## 8 8 NA NA 33.3 0 M E
## 9 9 NA NA 39.0 0 M E
## 10 10 NA NA 46.0 0 M C
## # … with 9,990 more rows
Mutate
%>% select(sex, phenotype) %>%
cbp ::mutate(logPheno = log10(phenotype)) %>%
dplyrslice_head(n=5)
## # A tibble: 5 × 3
## sex phenotype logPheno
## <fct> <dbl> <dbl>
## 1 M 37.7 1.58
## 2 M 35.8 1.55
## 3 M 28.4 1.45
## 4 M 33.6 1.53
## 5 M 32.9 1.52
%>% select(sex, phenotype) %>%
cbp ::mutate(rankPheno = min_rank(desc(phenotype))) %>%
dplyr::arrange(rankPheno) %>% slice_head(n=5) dplyr
## # A tibble: 5 × 3
## sex phenotype rankPheno
## <fct> <dbl> <int>
## 1 M 82.1 1
## 2 F 80.0 2
## 3 M 79.1 3
## 4 F 78.9 4
## 5 F 78.6 5
%>%
cbp ::mutate(dplyr::across(!phenotype, as.factor))%>%
dplyr::mutate(phenotype = NULL) %>%
dplyrslice_head(n=5)
## # A tibble: 5 × 6
## ind father mother year sex herd
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1 <NA> <NA> 0 M E
## 2 2 <NA> <NA> 0 M B
## 3 3 <NA> <NA> 0 M A
## 4 4 <NA> <NA> 0 M D
## 5 5 <NA> <NA> 0 M A
%>% select(sex, herd, phenotype) %>%
cbp ::mutate(herdSex = sex:herd) %>%
dplyrrelocate(herdSex, .before = phenotype) %>%
slice_head(n=5)
## # A tibble: 5 × 4
## sex herd herdSex phenotype
## <fct> <fct> <fct> <dbl>
## 1 M E M:E 37.7
## 2 M B M:B 35.8
## 3 M A M:A 28.4
## 4 M D M:D 33.6
## 5 M A M:A 32.9
Transmute
%>% select(sex, phenotype) %>%
cbp ::transmute(logPheno = log10(phenotype)) %>%
dplyrslice_head(n=5)
## # A tibble: 5 × 1
## logPheno
## <dbl>
## 1 1.58
## 2 1.55
## 3 1.45
## 4 1.53
## 5 1.52
%>% select(sex, phenotype) %>%
cbp ::transmute(rankPheno = min_rank(desc(phenotype))) %>%
dplyr::arrange(rankPheno) %>% slice_head(n=5) dplyr
## # A tibble: 5 × 1
## rankPheno
## <int>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
%>%
cbp ::transmute(dplyr::across(!phenotype, as.factor))%>%
dplyr::mutate(phenotype = NULL) %>%
dplyrslice_head(n=5)
## # A tibble: 5 × 6
## ind father mother year sex herd
## <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1 <NA> <NA> 0 M E
## 2 2 <NA> <NA> 0 M B
## 3 3 <NA> <NA> 0 M A
## 4 4 <NA> <NA> 0 M D
## 5 5 <NA> <NA> 0 M A
%>% select(sex, herd, phenotype) %>%
cbp ::transmute(herdSex = sex:herd) %>%
dplyrslice_head(n=5)
## # A tibble: 5 × 1
## herdSex
## <fct>
## 1 M:E
## 2 M:B
## 3 M:A
## 4 M:D
## 5 M:A
Across
%>%
cbp ::mutate(across(phenotype, round, 2)) dplyr
## # A tibble: 10,000 × 7
## ind father mother year sex phenotype herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1 NA NA 0 M 37.7 E
## 2 2 NA NA 0 M 35.8 B
## 3 3 NA NA 0 M 28.4 A
## 4 4 NA NA 0 M 33.6 D
## 5 5 NA NA 0 M 32.9 A
## 6 6 NA NA 0 M 31.6 A
## 7 7 NA NA 0 M 38.8 A
## 8 8 NA NA 0 M 33.3 E
## 9 9 NA NA 0 M 39.0 E
## 10 10 NA NA 0 M 46.0 C
## # … with 9,990 more rows
# Centering variable
<- function(x, na.rm = TRUE){
fun - mean(x, na.rm = na.rm)
x
}%>%
cbp ::mutate(year2 = as.numeric(year)-1) %>%
dplyr::mutate(across(c(year2, phenotype), fun)) dplyr
## # A tibble: 10,000 × 8
## ind father mother year sex phenotype herd year2
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 1 NA NA 0 M -12.3 E -4.5
## 2 2 NA NA 0 M -14.2 B -4.5
## 3 3 NA NA 0 M -21.6 A -4.5
## 4 4 NA NA 0 M -16.4 D -4.5
## 5 5 NA NA 0 M -17.1 A -4.5
## 6 6 NA NA 0 M -18.4 A -4.5
## 7 7 NA NA 0 M -11.2 A -4.5
## 8 8 NA NA 0 M -16.7 E -4.5
## 9 9 NA NA 0 M -11.0 E -4.5
## 10 10 NA NA 0 M -4.02 C -4.5
## # … with 9,990 more rows
Rename
%>%
cbp rename(Pheno = phenotype) %>% slice_head(n=5)
## # A tibble: 5 × 7
## ind father mother year sex Pheno herd
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1 NA NA 0 M 37.7 E
## 2 2 NA NA 0 M 35.8 B
## 3 3 NA NA 0 M 28.4 A
## 4 4 NA NA 0 M 33.6 D
## 5 5 NA NA 0 M 32.9 A
%>%
cbp rename_with(~(toupper(gsub("r", "r2", .x, fixed = TRUE)))) %>%
slice_head(n=5)
## # A tibble: 5 × 7
## IND FATHER2 MOTHER2 YEAR2 SEX PHENOTYPE HER2D
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct>
## 1 1 NA NA 0 M 37.7 E
## 2 2 NA NA 0 M 35.8 B
## 3 3 NA NA 0 M 28.4 A
## 4 4 NA NA 0 M 33.6 D
## 5 5 NA NA 0 M 32.9 A
Group by
# Centering variable
<- function(x, na.rm = TRUE){
fun - mean(x, na.rm = na.rm)
x
}%>%
cbp group_by(herd) %>%
::mutate(year2 = as.numeric(year)-1) %>%
dplyr::mutate(across(c(year2, phenotype), fun)) dplyr
## # A tibble: 10,000 × 8
## # Groups: herd [5]
## ind father mother year sex phenotype herd year2
## <dbl> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 1 NA NA 0 M -12.4 E -4.5
## 2 2 NA NA 0 M -14.2 B -4.5
## 3 3 NA NA 0 M -21.6 A -4.5
## 4 4 NA NA 0 M -16.2 D -4.5
## 5 5 NA NA 0 M -17.2 A -4.5
## 6 6 NA NA 0 M -18.4 A -4.5
## 7 7 NA NA 0 M -11.2 A -4.5
## 8 8 NA NA 0 M -16.8 E -4.5
## 9 9 NA NA 0 M -11.1 E -4.5
## 10 10 NA NA 0 M -4.11 C -4.5
## # … with 9,990 more rows
Statistics Summary
Summarise
<- cbp %>%
s1 group_by(year, sex, herd) %>%
summarise(
mean = mean(phenotype),
median = median(phenotype),
min = min(phenotype),
max = max(phenotype),
IQR = IQR(phenotype),
sd = sd(phenotype),
var = sd^2,
n = n()
%>%
) ::mutate(across(where(is.numeric), round,1))
dplyr
%>%
s1 ggplot(aes(y=mean, x =year)) +
geom_point(aes(shape =herd, colour = herd)) +
facet_wrap(~sex) +
labs(x = "Generation", y = "Milk Yield (kg/day",
colour = "Herd", shape = "Herd") +
theme_bw(base_size = 13)
ggsave("milk.pdf", width = 5, height = 3)
%>%
s1 filter(year %in% c("8","9") &
> 76 & min >40) max
## # A tibble: 7 × 11
## # Groups: year, sex [4]
## year sex herd mean median min max IQR sd var n
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 F B 58.3 57.9 41.7 78.6 8.9 6.6 43.2 114
## 2 8 M C 59.7 59.9 45.3 77.6 7.3 6 35.8 108
## 3 8 M D 59.2 59.1 45.1 76.7 7.7 6.3 39.5 106
## 4 9 F E 59.6 60.1 42.9 77.4 7.5 6.3 40.3 99
## 5 9 M B 61.1 60.7 45 79.1 11.1 7.2 51.9 86
## 6 9 M C 61.2 61 42.4 82.1 8.5 6.5 41.8 108
## 7 9 M E 60.8 59.6 45.1 77.7 8.4 6.3 39.9 101
%>%
s1 filter(year %in% c("8","9") &
> 76 & min >40) %>%
max transmute(CV = sd / mean,
CV = scales::percent(CV)) %>%
::mutate(across(where(is.numeric), round,1)) dplyr
## # A tibble: 7 × 3
## # Groups: year, sex [4]
## year sex CV
## <fct> <fct> <chr>
## 1 8 F 11%
## 2 8 M 10.05%
## 3 8 M 10.64%
## 4 9 F 11%
## 5 9 M 11.78%
## 6 9 M 10.62%
## 7 9 M 10.36%