6.4 Group and Apply Functions
6.4.1 group_by()
group_by
grouping doesn’t change how the data looks, it changes how it acts with the other dplyr verbs.
by_cyl <- mtcars %>% group_by(cyl)
# conflict with plyr, specify which to use
summarise <- dplyr::summarise
summarize <- dplyr::summarize
by_cyl %>%
summarise(
disp = mean(disp),
hp = mean(hp),
count = n() # count the size of each group
)
# A tibble: 3 × 4
cyl disp hp count
<dbl> <dbl> <dbl> <int>
1 4 105. 82.6 11
2 6 183. 122. 7
3 8 353. 209. 14
n()
gives the current group size. Only works inside summarise()
.
Information about the “current” group or variable
These functions return information about the “current” group or “current” variable, so only work inside specific contexts like summarise()
and mutate()
.
n()
gives the current group size.Error in
n()
: ! Must only be used inside data-masking verbs likemutate()
,filter()
, andgroup_by()
.Cause: R’s confused with which summarize function (
dplyr
vs.plyr
) it should use.Fix: Explicitly specify
dplyr::summarise()
cur_group()
gives the group keys, a tibble with one row and one column for each grouping variable.Equivalent to
.BY
indata.table
.cur_group_id()
gives a unique numeric identifier for the current group.Equivalent to
.GBP
indata.table
.cur_group_rows()
gives the row indices for the current group.Equivalent to
.I
indata.table
.cur_column()
gives the name of the current column (inacross()
only).
# prioritize functions in `dplyr` to avoid conflicts with `plyr`
> needs::prioritize(dplyr)
> by_cyl %>% mutate(id = cur_group_id())
# A tibble: 32 × 12
# Groups: cyl [3]
mpg cyl disp hp drat wt qsec vs am gear carb id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 2
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 2
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 2
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 3
6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 2
7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 3
8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 1
9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 1
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 2
# ℹ 22 more rows
# ℹ Use `print(n = ...)` to see more rows
> by_cyl %>% summarise(id = cur_group_id())
# A tibble: 3 × 2
cyl id
<dbl> <int>
1 4 1
2 6 2
3 8 3
> by_cyl %>% summarise(data = cur_group())
# A tibble: 3 × 2
cyl data$cyl
<dbl> <dbl>
1 4 4
2 6 6
3 8 8
> df <- tibble(
g = sample(rep(letters[1:3], 1:3)),
x = runif(6),
y = runif(6)
)
> gf <- df %>% group_by(g)
> gf
# A tibble: 6 × 3
# Groups: g [3]
g x y
<chr> <dbl> <dbl>
1 b 0.0246 0.143
2 c 0.478 0.415
3 a 0.758 0.414
4 c 0.216 0.369
5 c 0.318 0.152
6 b 0.232 0.139
# get row indices for the current group
> gf %>% reframe(row=cur_group_rows())
# A tibble: 6 × 2
g row
<chr> <int>
1 a 3
2 b 1
3 b 6
4 c 2
5 c 4
6 c 5
group_by
then count non-NA
values
group_split()
split data.frame
by groups, good friend with group_by
, i.e. it used grouping structure from group_by()
and therefore is subject to the data mask.
It returns a list of tibbles. Each tibble contains the rows of .tbl
for the associated group and all the columns, including the grouping variables.
ir <- iris %>%
group_by(Species)
# returns a list of groups
ir %>% group_split()
# returns a tibble of group keys
ir %>% group_keys()
# A tibble: 3 x 1
Species
<fct>
1 setosa
2 versicolor
3 virginica
# get group_keys() with index, so you can access each specific group with group info
df %>% group_by(ID) %>% group_keys() %>% as.data.frame() %>% t()
df %>% group_by(ID) %>% group_keys() %>% pull(Species)
# assgin a index column to indicate the group number
df %>% group_by(ID) %>% group_keys() %>% mutate(id = row_number())
# equivalently
df %>% group_by(ID) %>% group_keys() %>% mutate(id = 1:n() )
Group by and count the number of observations in each group with tally
# count the number of rows per group
ir %>% tally(sort=TRUE) # same as ir %>% summarize(count=n())
# A tibble: 3 × 2
Species n
<fct> <int>
1 setosa 50
2 versicolor 50
3 virginica 50
dplyr::row_number(x)
gives every input a unique rank to a vector x
, so that c(10, 20, 20, 30)
would get ranks c(1, 2, 3, 4)
. From smallest to largest.
To rank by multiple columns at once, supply a data frame
x
.Rank by value, ascending:
If want descending, use
row_number(desc(.)
Another use is to add row number to a data frame.
Here, we assign row number to a variable or column name “row_id”.
6.4.2 Subset rows
slice_head(.data, n)
andslice_tail(.data, n)
select the first or lastn
rows in.data
.slice_sample()
randomly selects rows.slice_min(.data, order_by, n)
andslice_max()
select rows with the smallest or largest values of a variable.
6.4.3 Summarize
Scoped verbs (summarize_if
, summarize_at
, summarize_all
) have been superseded by the use of across(.var, .fun)
inside summarize()
.
summarize_at(.tbl, .var, .fun, ...)
\(\rightarrow\) summarize(.tbl, across(.var, .fun) )
.tbl
atbl
object, eg. a grouppedtibble
.var
A list of columns generated by vars(), acharacter
vector of column names, anumeric
vector of column positions, or NULL. 可以用变量名字 (character) 或者位置 (numeric)。- Note that
numeric
position start with the group keys, and then followed by ordinary columns.
- Note that
.fun
A function fun, a quosure style lambda~ fun(.)
or a list of either form....
Additional arguments for the function calls in.funs
.
# `summarize_at` affects variables selected with a character vector or vars()
error_continent_all %>%
group_by(pct, CON) %>%
summarize(across(-r_no, ~mean(.x, na.rm=TRUE)) )
starwars %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
starwars %>%
summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
by_species <- iris %>%
group_by(Species)
by_species %>%
summarise_all(list(min, max))
by_species %>%
summarise(across(everything(), list(min = min, max = max)))
# Count size of group
by_species %>% summarize(count=n())
# Or use `tally()`
by_species %>% tally(sort=TRUE)
Select one group
# Select one element from a list
iris %>%
`[[`("Species")
# select one group
by_species %>% group_split() %>% `[[`(1) # can extract by either position or name
across(.cols = everything(), .fns = NULL, ..., .names = NULL)
makes it easy to apply the same transformation to multiple columns, allowing you to use select()
semantics inside in summarise()
and mutate()
.
.cols
Columns to transform.could use functions to select columns, eg,
starts_with("Sepal")
,where(is.factor)
.fns
Functions to apply to each of the selected columns..names
A glue specification that describes how to name the output columns.
6.4.4 split-apply-combine
Here is a summary of the split-apply-combine approach:
dplyr::group_map()
,dplyr::group_modify()
…split
→purrr::map()
plyr::ddply()
data.table::setDT()
# with `dplyr`
df %>% group_by(type) %>% group_map(~tibble(trivial_func(.)))
# must do `group_split` before your can `map_dfr`
df %>%
split(type) %>% # equivalent to `group_by(type)` then `group_split()`
purrr::map_dfr(trivial_func)
# with `data.table`
library(data.table)
setDT(df)[, trivial_func(.SD), type]
# with `plyr`
plyr::ddply(df, .(type), trivial_func)
Apply regression to each group with purrr::map
# A more realistic example: split a data frame into pieces, fit a
# model to each piece, summarise and extract R^2
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(summary) %>%
map_dbl("r.squared")
# If each element of the output is a data frame, use
# map_dfr to row-bind them together:
mtcars %>%
split(.$cyl) %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map_dfr(~ as.data.frame(t(as.matrix(coef(.)))))
# (if you also want to preserve the variable names see
# the broom package)
6.4.5 dplyr::group_modify()
dplyr::group_modify()
and dplyr::group_map()
are purrr-style functions that can be used to iterate on grouped tibbles.
Note that
purrr::map
,map_dfr
, andmap_dfc
does NOT work on groupped tibbles. You must dogroup_split
to a list, then you can apply the purrr-style functions.Convenient for multistep operations, easy to debug
group_modify
takes in a groupped tibble; returns a groupped tibble and.f
must return a data frame.group_map
takes in a groupped tibble and returns a list- They takes in groupped tibbles, not good at multistep operations, hard to debug because you need to wrap up your whole calculation in one function.
group_modify(.data, .f, ..., .keep = FALSE)
returns a grouped tibble.
.data
A grouped tibble.f
A function, formula, or vector to apply to each group. It MUST return a data frame or a tibble! Matices do not work.A workaround is to use
tibble::enframe
toIf a function, it is used as is. It should have at least 2 formal arguments.
If your costum function has only one argument, you use the formula specification
~f(.x)
.
If a formula, e.g.
~ head(.x)
, it is converted to a function.In the formula, you can use
.
or.x
to refer to the subset of rows of.tbl
for the given group-
.y
to refer to the group key, a one row tibble with one column per grouping variable that identifies the group
-
...
Additional arguments passed on to.f
.keep=FALSE
whether the grouping variables are kept in.x
.- Default to drop> the grouping variable. Be mindful of this when you want to subset columns by position.
cntry_group <- cntry_agg_df %>% group_by(as.numeric(ISO_N3), year)
groups <- cntry_group %>% group_split()
cntry_key <- group_keys(cntry_group)
# use a lambda function which includes a sequence of calculations
cntry_stat <- cntry_group %>%
group_modify(~ {
.x %>% select(var_vec) %>%
map_dfc(cal_stat) %>% # calculate stat
mutate(stat = c("mean", "std")) # assign stat names
}) %>%
ungroup()
When using a lambda function inside group_modify
:
.x
can only be used once at the first function; for later use, use.
.x
and.
are useful when the argument being assigned is NOT the first argument.If you use
.x
instead of.
, you will encounter the following error:Error in xj[i] : invalid subscript type ‘language’
Apply CAPM to each company
# Apply CAPM to each company
# start with `.x`
# clearer logic flow, I prefer this approach ✓
data_group <- data %>% group_by(ISIN)
data_group %>%
# filter(cur_group_id() %in% c(1,2)) %>%
filter(cur_group_id() == 1) %>%
group_modify(~ {
.x %>%
filter(between(p_date, estimation_start, estimation_end)) %>%
lm(eRi~rmrf, data=.) %>%
tidy(.)
})
# ------------------------------------------------
# Equivalently, plug in `.x` in the first func
# one line shorter, more concise
data_group %>%
filter(cur_group_id() == 1) %>%
group_modify(~ {
filter(.x, between(p_date, estimation_start, estimation_end)) %>%
lm(eRi~rmrf, data=.) %>%
tidy(.)
})
filter(cur_group_id() == 1)
select the 1st group to test run your function
✅ Can also define a function and pass to group_modify
- This approach is easier to debug, whereas long sequence of lambda is hard to debug.
factor_model <- function(company_data){
company_data <- company_data %>%
filter(between(p_date, estimation_start, estimation_end))
reg_ml <- lm(eRi~rmrf, data=company_data)
coef_est <- tidy(reg_ml)
return (coef_est)
}
# Debug with one group
groups <- data_group %>% group_split()
factor_model(groups[[1]])
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.114 0.0784 1.45 0.147
2 rmrf 0.280 0.0945 2.97 0.00331
# Apply to all groups
data_group %>%
filter(cur_group_id() %in% c(1,2)) %>%
group_modify(~factor_model(.x))
group_modify
requires at least 2 formal arguments for the function.- Since the custom function
factor_model
has only 1 argument, we must use the lambda formula to pass togroup_modify
.
group_map(.data, .f, ..., .keep = FALSE)
returns a list of results.
More examples of group_modify
Apply a regression
What I like most about
group_modify
is that it automatically adds the group key to the returned results. You don’t need to specify identifiers by yourself then.Calculate summary statistics
group_modify
code snippets
# Apply a regression
iris %>%
group_by(Species) %>%
group_modify(~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)))
#> # A tibble: 6 × 6
#> # Groups: Species [3]
#> Species term estimate std.error statistic p.value
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 setosa (Intercept) 0.803 0.344 2.34 2.38e- 2
#> 2 setosa Sepal.Length 0.132 0.0685 1.92 6.07e- 2
#> 3 versicolor (Intercept) 0.185 0.514 0.360 7.20e- 1
#> 4 versicolor Sepal.Length 0.686 0.0863 7.95 2.59e-10
#> 5 virginica (Intercept) 0.610 0.417 1.46 1.50e- 1
#> 6 virginica Sepal.Length 0.750 0.0630 11.9 6.30e-16
# ---------------------------------------------------------------
# descriptive statistics
# to use group_modify() the lambda must return a data frame
iris %>%
group_by(Species) %>%
group_modify(~ {
quantile(.x$Petal.Length, probs = c(0.25, 0.5, 0.75)) %>%
tibble::enframe(name = "prob", value = "quantile")
})
#> # A tibble: 9 × 3
#> # Groups: Species [3]
#> Species prob quantile
#> <fct> <chr> <dbl>
#> 1 setosa 25% 1.4
#> 2 setosa 50% 1.5
#> 3 setosa 75% 1.58
#> 4 versicolor 25% 4
#> 5 versicolor 50% 4.35
#> 6 versicolor 75% 4.6
#> 7 virginica 25% 5.1
#> 8 virginica 50% 5.55
#> 9 virginica 75% 5.88
# `fivenum()` returns min, lower-hinge (Q1), median (Q1),
# upper-hinge (Q3), and max
iris %>%
group_by(Species) %>%
group_modify(~ {
.x %>%
purrr::map_dfc(fivenum) %>%
mutate(nms = c("min", "Q1", "median", "Q3", "max"))
})
#> # A tibble: 15 × 6
#> # Groups: Species [3]
#> Species Sepal.Length Sepal.Width Petal.Length Petal.Width nms
#> <fct> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 setosa 4.3 2.3 1 0.1 min
#> 2 setosa 4.8 3.2 1.4 0.2 Q1
#> 3 setosa 5 3.4 1.5 0.2 median
#> 4 setosa 5.2 3.7 1.6 0.3 Q3
#> 5 setosa 5.8 4.4 1.9 0.6 max
#> 6 versicolor 4.9 2 3 1 min
#> 7 versicolor 5.6 2.5 4 1.2 Q1
#> 8 versicolor 5.9 2.8 4.35 1.3 median
#> 9 versicolor 6.3 3 4.6 1.5 Q3
#> 10 versicolor 7 3.4 5.1 1.8 max
#> 11 virginica 4.9 2.2 4.5 1.4 min
#> 12 virginica 6.2 2.8 5.1 1.8 Q1
#> 13 virginica 6.5 3 5.55 2 median
#> 14 virginica 6.9 3.2 5.9 2.3 Q3
#> 15 virginica 7.9 3.8 6.9 2.5 max
group_walk(.data, .f)
calls .f
for side effects and returns the input .tbl
, invisibly.
Can be used to write dplyr groups to separate files.
# `.x` refers to the group
# `.y` refers to the key
# `.keep = TRUE` keeps the key column
all_df %>%
group_by(ISO_C3) %>%
group_walk(~ write_csv(.x, sprintf("./data/reg_result/bootstrap/%s/pathway_country/country-impact_bootstrap1000_tmp_%s.csv", ssp, .y$ISO_C3) ), .keep = TRUE)
Suset specific groups
data.table
implementation
dplyr
implementation