Follow-Up: Generalizing a Data.Frame Subsetting Function 2

Follow-up: Generalizing a data.frame subsetting function 2

We could add some conditions with case_when

foo2 <- function(dat, study_col, ...) {

dot_cols <- ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)

dat %>%
dplyr::select({{study_col}}, !!! dot_cols) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = across(all_of(str_cols), ~ {
tmp <- n_distinct(.)
case_when(tmp == 1 ~ 1, tmp == n() ~ 2, tmp >1 & tmp < n() ~ 3, TRUE ~ 4)
}) %>%
purrr::reduce(stringr::str_c, collapse="")) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)

}

foo2(dat, study, group, outcome)

Follow-up: Generalizing a data.frame subsetting function

If we want to add 3 dots at the end, capture it as symbols in ensyms, then convert it to string (as_string), make the only change by looping across those columns, create a logical condition with n_distinct and paste (str_c) with reduce, while pasteing the other columns as well (could be simplified in a single across as well) and then use group_split on the final 'grp' column

foo2 <- function(dat, study_col, group_col, outcome_col, ...) {

dot_cols <- ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)

dat %>%
dplyr::select({{study_col}}, {{group_col}}, {{outcome_col}}, !!! dot_cols) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = across(all_of(str_cols), ~ n_distinct(.) == 1) %>%
purrr::reduce(stringr::str_c, collapse=""),
grp = stringr::str_c(n_distinct({{group_col}}) == 1,
n_distinct({{outcome_col}}) == 1, grp)) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)

}

-testing

> out <- foo2(h, case, group, outcome, sample, control)
> out
<list_of<
tbl_df<
case : integer
group : integer
outcome: integer
sample : integer
control: integer
>
>[14]>
[[1]]
# A tibble: 16 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 9 1 1 1 1
2 9 1 1 2 1
3 9 2 1 1 1
4 9 2 1 2 1
5 9 1 2 1 1
6 9 1 2 2 1
7 9 2 2 1 1
8 9 2 2 2 1
9 9 1 1 1 2
10 9 1 1 2 2
11 9 2 1 1 2
12 9 2 1 2 2
13 9 1 2 1 2
14 9 1 2 2 2
15 9 2 2 1 2
16 9 2 2 2 2

[[2]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 1 1 1 1 1
2 1 1 1 2 1
3 1 2 1 1 1
4 1 2 1 2 1
5 1 1 2 1 1
6 1 1 2 2 1
7 1 2 2 1 1
8 1 2 2 2 1

[[3]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 10 1 1 1 1
2 10 2 1 1 1
3 10 1 2 1 1
4 10 2 2 1 1
5 10 1 1 1 2
6 10 2 1 1 2
7 10 1 2 1 2
8 10 2 2 1 2

[[4]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 2 1 1 1 1
2 2 2 1 1 1
3 2 1 2 1 1
4 2 2 2 1 1

[[5]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 12 1 1 1 1
2 12 1 1 2 1
3 12 2 1 1 1
4 12 2 1 2 1
5 12 1 1 1 2
6 12 1 1 2 2
7 12 2 1 1 2
8 12 2 1 2 2

[[6]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 4 1 1 1 1
2 4 1 1 2 1
3 4 2 1 1 1
4 4 2 1 2 1

[[7]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 13 1 1 1 1
2 13 2 1 1 1
3 13 1 1 1 2
4 13 2 1 1 2
5 14 1 1 1 1
6 14 2 1 1 1
7 14 1 1 1 2
8 14 2 1 1 2

[[8]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 5 1 1 1 1
2 5 2 1 1 1
3 6 1 1 1 1
4 6 2 1 1 1

[[9]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 11 1 1 1 1
2 11 1 1 2 1
3 11 1 2 1 1
4 11 1 2 2 1
5 11 1 1 1 2
6 11 1 1 2 2
7 11 1 2 1 2
8 11 1 2 2 2

[[10]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 3 1 1 1 1
2 3 1 1 2 1
3 3 1 2 1 1
4 3 1 2 2 1

[[11]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 15 1 1 1 1
2 15 1 1 2 1
3 15 1 1 1 2
4 15 1 1 2 2

[[12]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 7 1 1 1 1
2 7 1 1 2 1

[[13]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 16 1 1 1 1
2 16 1 1 1 2

[[14]]
# A tibble: 1 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 8 1 1 1 1

testing the total rows

> sum(map_dbl(out, nrow))
[1] 81
> nrow(h)
[1] 81

If we want to remove some arguments

foo2 <- function(dat, study_col, ...) {

dot_cols <- ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)

dat %>%
dplyr::select({{study_col}}, !!! dot_cols) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = across(all_of(str_cols), ~ n_distinct(.) == 1) %>%
purrr::reduce(stringr::str_c, collapse="")) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)

}

-testing

>  foo2(h, case, group, outcome, sample, control)
<list_of<
tbl_df<
case : integer
group : integer
outcome: integer
sample : integer
control: integer
>
>[14]>
[[1]]
# A tibble: 16 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 9 1 1 1 1
2 9 1 1 2 1
3 9 2 1 1 1
4 9 2 1 2 1
5 9 1 2 1 1
6 9 1 2 2 1
7 9 2 2 1 1
8 9 2 2 2 1
9 9 1 1 1 2
10 9 1 1 2 2
11 9 2 1 1 2
12 9 2 1 2 2
13 9 1 2 1 2
14 9 1 2 2 2
15 9 2 2 1 2
16 9 2 2 2 2

[[2]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 1 1 1 1 1
2 1 1 1 2 1
3 1 2 1 1 1
4 1 2 1 2 1
5 1 1 2 1 1
6 1 1 2 2 1
7 1 2 2 1 1
8 1 2 2 2 1

[[3]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 10 1 1 1 1
2 10 2 1 1 1
3 10 1 2 1 1
4 10 2 2 1 1
5 10 1 1 1 2
6 10 2 1 1 2
7 10 1 2 1 2
8 10 2 2 1 2

[[4]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 2 1 1 1 1
2 2 2 1 1 1
3 2 1 2 1 1
4 2 2 2 1 1

[[5]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 12 1 1 1 1
2 12 1 1 2 1
3 12 2 1 1 1
4 12 2 1 2 1
5 12 1 1 1 2
6 12 1 1 2 2
7 12 2 1 1 2
8 12 2 1 2 2

[[6]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 4 1 1 1 1
2 4 1 1 2 1
3 4 2 1 1 1
4 4 2 1 2 1

[[7]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 13 1 1 1 1
2 13 2 1 1 1
3 13 1 1 1 2
4 13 2 1 1 2
5 14 1 1 1 1
6 14 2 1 1 1
7 14 1 1 1 2
8 14 2 1 1 2

[[8]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 5 1 1 1 1
2 5 2 1 1 1
3 6 1 1 1 1
4 6 2 1 1 1

[[9]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 11 1 1 1 1
2 11 1 1 2 1
3 11 1 2 1 1
4 11 1 2 2 1
5 11 1 1 1 2
6 11 1 1 2 2
7 11 1 2 1 2
8 11 1 2 2 2

[[10]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 3 1 1 1 1
2 3 1 1 2 1
3 3 1 2 1 1
4 3 1 2 2 1

[[11]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 15 1 1 1 1
2 15 1 1 2 1
3 15 1 1 1 2
4 15 1 1 2 2

[[12]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 7 1 1 1 1
2 7 1 1 2 1

[[13]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 16 1 1 1 1
2 16 1 1 1 2

[[14]]
# A tibble: 1 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 8 1 1 1 1

Generalizing a data.frame subsetting function

If the input argument is unquoted, use {{}}

foo <- function(dat, study_col, group_col, outcome_col) {

fn1 <- function(cond) {
switch(cond,

`1` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) == 1, n_distinct({{outcome_col}}) > 1) %>%
ungroup,
`2` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) > 1, n_distinct({{outcome_col}}) == 1) %>%
ungroup,
`3` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) > 1, n_distinct({{outcome_col}}) > 1) %>%
ungroup,

`4` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) == 1, n_distinct({{outcome_col}}) == 1) %>%
ungroup
) }
purrr::map(1:4, ~ fn1(.x))

}

-testing

> foo(h, study, group, outcome)
[[1]]
# A tibble: 2 x 4
study outcome group time
<chr> <int> <int> <int>
1 a 1 1 0
2 a 2 1 1

[[2]]
# A tibble: 2 x 4
study outcome group time
<chr> <int> <int> <int>
1 b 1 1 0
2 b 1 2 0

[[3]]
# A tibble: 2 x 4
study outcome group time
<chr> <int> <int> <int>
1 c 2 1 0
2 c 3 2 1

[[4]]
# A tibble: 3 x 4
study outcome group time
<chr> <int> <int> <int>
1 d 1 1 0
2 d 1 1 0
3 e 1 1 0

Or use

foo2 <- function(dat, study_col, group_col, outcome_col) {

dat %>%
dplyr::select({{study_col}}, {{group_col}}, {{outcome_col}}) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = stringr::str_c(n_distinct({{group_col}}) == 1,
n_distinct({{outcome_col}}) == 1 )) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)

}

-testing

> foo2(h, study, group, outcome)
<list_of<
tbl_df<
study : character
group : integer
outcome: integer
>
>[4]>
[[1]]
# A tibble: 2 x 3
study group outcome
<chr> <int> <int>
1 c 1 2
2 c 2 3

[[2]]
# A tibble: 2 x 3
study group outcome
<chr> <int> <int>
1 b 1 1
2 b 2 1

[[3]]
# A tibble: 2 x 3
study group outcome
<chr> <int> <int>
1 a 1 1
2 a 1 2

[[4]]
# A tibble: 3 x 3
study group outcome
<chr> <int> <int>
1 d 1 1
2 d 1 1
3 e 1 1

Follow-up: Putting back a missing column from a data.frame into a list of dta.frames

One option is to loop over the list ('LIST'), subset the data based with %in% on the pasteed rows of data and the list element data

LIST2 <-  lapply(LIST, function(x) 
data[do.call(paste, data[names(x)]) %in% do.call(paste, x),])

-checking

> all.equal(DESIRED_LIST, LIST2, check.attributes = FALSE)
[1] TRUE

unexpected error when subsetting data.frame by range of date in R?

I can see couple of problem in OP's code.

Prob#1: The default format expected by as.Date is "%Y-%m-%d" or "%Y/%m/%d". But the formats of the characters (Begin, End columns) used in code is %d.%m.%Y or %s-%m-%Y. Hence default format in function as.Date() will not work. The format argument should specifically provided to as.Date() function.

The correct code to create DATE1 and DATE2 should be:

DATE1 <- as.Date("01-01-1981", format = "%d-%m-%Y")
DATE2 <- as.Date("31-12-2014", , format = "%d-%m-%Y")

Prob#2: The Begin and End columns of dataframe should be changed to as.Date format as well before attempting filter operations.

The format of those 2 columns can be changed as:

joinedData$Begin = as.Date(joinedData$Begin, format = "%d.%m.%Y")
joinedData$End= as.Date(joinedData$End, format = "%d.%m.%Y")

Now, the OP'2 initial approach should work.

Note: Personally I prefer using as.POSIXlt over as.Date

conditional subsetting of all rows of a cluster in a dataframe

We could use n_distinct to create the condition in filter after grouping by 'study'

library(dplyr)
h %>%
group_by(study) %>%
filter(n_distinct(group) == 1, n_distinct(outcome) > 1)
# A tibble: 2 x 3
# Groups: study [1]
study outcome group
<chr> <int> <int>
1 a 1 1
2 a 2 1

Or using base R

subset(h, ave(group, study, FUN = function(x) length(unique(x)))
== 1 & ave(outcome, study, FUN = function(x) length(unique(x)) > 1))
study outcome group
1 a 1 1
2 a 2 1

We could generalize if we wanted

f1 <- function(dat, cond) {

switch(cond,

`1` = dat %>%
group_by(study) %>%
filter(n_distinct(group) == 1, n_distinct(outcome) > 1) %>%
ungroup,
`2` = dat %>%
group_by(study) %>%
filter(n_distinct(group) > 1, n_distinct(outcome) == 1) %>%
ungroup,

`3` = dat %>%
group_by(study) %>%
filter(n_distinct(group) > 1, n_distinct(outcome) > 1) %>%
ungroup,
`4` = dat %>%
group_by(study) %>%
filter(n_distinct(group) == 1, n_distinct(outcome) == 1) %>%
ungroup
)

}

-testing

> f1(h, 1)
# A tibble: 2 x 3
study outcome group
<chr> <int> <int>
1 a 1 1
2 a 2 1
> f1(h, 2)
# A tibble: 2 x 3
study outcome group
<chr> <int> <int>
1 b 1 1
2 b 1 2
> f1(h, 3)
# A tibble: 2 x 3
study outcome group
<chr> <int> <int>
1 c 2 1
2 c 3 2
> f1(h, 4)
# A tibble: 3 x 3
study outcome group
<chr> <int> <int>
1 d 1 1
2 d 1 1
3 e 1 1

How to create a loop which creates multiple subset dataframes from a larger data frame?

Your code works fine. Just remove list so you create a vector of color names and not a list. If you only want distinct values, use unique.

mydata <- data.frame(x = c(1,2,3), y = c('a','b','c'), z = c('red','red','yellow'))

colors <- unique(mydata$z)

for (i in 1:length(colors)) {
assign(paste0("mydata_",i), subset(mydata, z == colors[[i]]))
}

Is there an R package with a generalized class of data.frame in which a column can be an array (or how do I define such a class)?

data frames do allow matrix columns:

m <- diag(4)
v <- 1:4
DF <- data.frame(v, m = I(m))
str(DF)

giving:

'data.frame':   4 obs. of  2 variables:
$ v: int 1 2 3 4
$ m: 'AsIs' num [1:4, 1:4] 1 0 0 0 0 1 0 0 0 0 ...

Update 1

The R aggregate function can create matrix columns. For example,

DF <- data.frame(v = 1:4, g = c(1, 1, 2, 2))
ag <- aggregate(v ~ g, DF, function(x) c(sum = sum(x), mean = mean(x)))
str(ag)

giving:

'data.frame':   2 obs. of  2 variables:
$ g: num 1 2
$ v: num [1:2, 1:2] 3 7 1.5 3.5
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr "sum" "mean"

Update 2

I don't think the aggregation discussed in the comments is nicely supported in R but you can use the following workaround:

m <- matrix(1:16, 4)
v <- c(1, 1, 2, 2)
DF <- data.frame(v, m = I(m))

nr <- nrow(DF)
ag2 <- aggregate(list(sum = 1:nr), DF["v"], function(ix) colSums(DF$m[ix, ]))
str(ag2)

giving:

'data.frame':   2 obs. of  2 variables:
$ v : num 1 2
$ sum: num [1:2, 1:4] 3 7 11 15 19 23 27 31


Related Topics



Leave a reply



Submit