Follow-Up: Generalizing a Data.Frame Subsetting Function 2

Follow-up: Generalizing a data.frame subsetting function 2

We could add some conditions with case_when

foo2 <- function(dat, study_col, ...) {
   
    dot_cols <- ensyms(...)
    str_cols <- purrr::map_chr(dot_cols, rlang::as_string)

    dat %>%
           dplyr::select({{study_col}}, !!! dot_cols) %>%
            dplyr::group_by({{study_col}}) %>%
            dplyr::mutate(grp = across(all_of(str_cols), ~ {
                      tmp <- n_distinct(.)
                      case_when(tmp  == 1 ~ 1, tmp == n() ~ 2, tmp >1 & tmp < n() ~ 3,  TRUE ~ 4)
                      }) %>%
                   purrr::reduce(stringr::str_c, collapse="")) %>%
                dplyr::ungroup(.) %>%
                 dplyr::group_split(grp, .keep = FALSE)
           
     }

foo2(dat, study, group, outcome)

Follow-up: Generalizing a data.frame subsetting function

If we want to add 3 dots at the end, capture it as symbols in ensyms, then convert it to string (as_string), make the only change by looping across those columns, create a logical condition with n_distinct and paste (str_c) with reduce, while pasteing the other columns as well (could be simplified in a single across as well) and then use group_split on the final 'grp' column

foo2 <- function(dat, study_col, group_col, outcome_col, ...) {
   
    dot_cols <- ensyms(...)
    str_cols <- purrr::map_chr(dot_cols, rlang::as_string)

    dat %>%
           dplyr::select({{study_col}}, {{group_col}}, {{outcome_col}}, !!! dot_cols) %>%
            dplyr::group_by({{study_col}}) %>%
            dplyr::mutate(grp = across(all_of(str_cols), ~ n_distinct(.) == 1) %>%
                   purrr::reduce(stringr::str_c, collapse=""),
                          grp =  stringr::str_c(n_distinct({{group_col}}) == 1, 
                n_distinct({{outcome_col}}) == 1, grp)) %>%
                dplyr::ungroup(.) %>%
                 dplyr::group_split(grp, .keep = FALSE)
           
            }

-testing

> out <- foo2(h, case, group, outcome, sample, control)
> out
<list_of<
  tbl_df<
    case   : integer
    group  : integer
    outcome: integer
    sample : integer
    control: integer
  >
>[14]>
[[1]]
# A tibble: 16 x 5
    case group outcome sample control
   <int> <int>   <int>  <int>   <int>
 1     9     1       1      1       1
 2     9     1       1      2       1
 3     9     2       1      1       1
 4     9     2       1      2       1
 5     9     1       2      1       1
 6     9     1       2      2       1
 7     9     2       2      1       1
 8     9     2       2      2       1
 9     9     1       1      1       2
10     9     1       1      2       2
11     9     2       1      1       2
12     9     2       1      2       2
13     9     1       2      1       2
14     9     1       2      2       2
15     9     2       2      1       2
16     9     2       2      2       2

[[2]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     1     1       1      1       1
2     1     1       1      2       1
3     1     2       1      1       1
4     1     2       1      2       1
5     1     1       2      1       1
6     1     1       2      2       1
7     1     2       2      1       1
8     1     2       2      2       1

[[3]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    10     1       1      1       1
2    10     2       1      1       1
3    10     1       2      1       1
4    10     2       2      1       1
5    10     1       1      1       2
6    10     2       1      1       2
7    10     1       2      1       2
8    10     2       2      1       2

[[4]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     2     1       1      1       1
2     2     2       1      1       1
3     2     1       2      1       1
4     2     2       2      1       1

[[5]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    12     1       1      1       1
2    12     1       1      2       1
3    12     2       1      1       1
4    12     2       1      2       1
5    12     1       1      1       2
6    12     1       1      2       2
7    12     2       1      1       2
8    12     2       1      2       2

[[6]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     4     1       1      1       1
2     4     1       1      2       1
3     4     2       1      1       1
4     4     2       1      2       1

[[7]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    13     1       1      1       1
2    13     2       1      1       1
3    13     1       1      1       2
4    13     2       1      1       2
5    14     1       1      1       1
6    14     2       1      1       1
7    14     1       1      1       2
8    14     2       1      1       2

[[8]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     5     1       1      1       1
2     5     2       1      1       1
3     6     1       1      1       1
4     6     2       1      1       1

[[9]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    11     1       1      1       1
2    11     1       1      2       1
3    11     1       2      1       1
4    11     1       2      2       1
5    11     1       1      1       2
6    11     1       1      2       2
7    11     1       2      1       2
8    11     1       2      2       2

[[10]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     3     1       1      1       1
2     3     1       1      2       1
3     3     1       2      1       1
4     3     1       2      2       1

[[11]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    15     1       1      1       1
2    15     1       1      2       1
3    15     1       1      1       2
4    15     1       1      2       2

[[12]]
# A tibble: 2 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     7     1       1      1       1
2     7     1       1      2       1

[[13]]
# A tibble: 2 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    16     1       1      1       1
2    16     1       1      1       2

[[14]]
# A tibble: 1 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     8     1       1      1       1

testing the total rows

> sum(map_dbl(out, nrow))
[1] 81
> nrow(h)
[1] 81

If we want to remove some arguments

foo2 <- function(dat, study_col, ...) {
   
    dot_cols <- ensyms(...)
    str_cols <- purrr::map_chr(dot_cols, rlang::as_string)

    dat %>%
           dplyr::select({{study_col}}, !!! dot_cols) %>%
            dplyr::group_by({{study_col}}) %>%
            dplyr::mutate(grp = across(all_of(str_cols), ~ n_distinct(.) == 1) %>%
                   purrr::reduce(stringr::str_c, collapse="")) %>%
                dplyr::ungroup(.) %>%
                 dplyr::group_split(grp, .keep = FALSE)
           
            }

-testing

>  foo2(h, case, group, outcome, sample, control)
<list_of<
  tbl_df<
    case   : integer
    group  : integer
    outcome: integer
    sample : integer
    control: integer
  >
>[14]>
[[1]]
# A tibble: 16 x 5
    case group outcome sample control
   <int> <int>   <int>  <int>   <int>
 1     9     1       1      1       1
 2     9     1       1      2       1
 3     9     2       1      1       1
 4     9     2       1      2       1
 5     9     1       2      1       1
 6     9     1       2      2       1
 7     9     2       2      1       1
 8     9     2       2      2       1
 9     9     1       1      1       2
10     9     1       1      2       2
11     9     2       1      1       2
12     9     2       1      2       2
13     9     1       2      1       2
14     9     1       2      2       2
15     9     2       2      1       2
16     9     2       2      2       2

[[2]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     1     1       1      1       1
2     1     1       1      2       1
3     1     2       1      1       1
4     1     2       1      2       1
5     1     1       2      1       1
6     1     1       2      2       1
7     1     2       2      1       1
8     1     2       2      2       1

[[3]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    10     1       1      1       1
2    10     2       1      1       1
3    10     1       2      1       1
4    10     2       2      1       1
5    10     1       1      1       2
6    10     2       1      1       2
7    10     1       2      1       2
8    10     2       2      1       2

[[4]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     2     1       1      1       1
2     2     2       1      1       1
3     2     1       2      1       1
4     2     2       2      1       1

[[5]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    12     1       1      1       1
2    12     1       1      2       1
3    12     2       1      1       1
4    12     2       1      2       1
5    12     1       1      1       2
6    12     1       1      2       2
7    12     2       1      1       2
8    12     2       1      2       2

[[6]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     4     1       1      1       1
2     4     1       1      2       1
3     4     2       1      1       1
4     4     2       1      2       1

[[7]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    13     1       1      1       1
2    13     2       1      1       1
3    13     1       1      1       2
4    13     2       1      1       2
5    14     1       1      1       1
6    14     2       1      1       1
7    14     1       1      1       2
8    14     2       1      1       2

[[8]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     5     1       1      1       1
2     5     2       1      1       1
3     6     1       1      1       1
4     6     2       1      1       1

[[9]]
# A tibble: 8 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    11     1       1      1       1
2    11     1       1      2       1
3    11     1       2      1       1
4    11     1       2      2       1
5    11     1       1      1       2
6    11     1       1      2       2
7    11     1       2      1       2
8    11     1       2      2       2

[[10]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     3     1       1      1       1
2     3     1       1      2       1
3     3     1       2      1       1
4     3     1       2      2       1

[[11]]
# A tibble: 4 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    15     1       1      1       1
2    15     1       1      2       1
3    15     1       1      1       2
4    15     1       1      2       2

[[12]]
# A tibble: 2 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     7     1       1      1       1
2     7     1       1      2       1

[[13]]
# A tibble: 2 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1    16     1       1      1       1
2    16     1       1      1       2

[[14]]
# A tibble: 1 x 5
   case group outcome sample control
  <int> <int>   <int>  <int>   <int>
1     8     1       1      1       1

Generalizing a data.frame subsetting function

If the input argument is unquoted, use {{}}

foo <- function(dat, study_col, group_col, outcome_col) {
  
  fn1 <- function(cond) {
           switch(cond, 
         
         `1` = dat %>% 
           group_by({{study_col}}) %>%
           filter(n_distinct({{group_col}}) == 1, n_distinct({{outcome_col}}) > 1) %>%
           ungroup,
         `2` = dat %>% 
           group_by({{study_col}}) %>%
           filter(n_distinct({{group_col}}) > 1, n_distinct({{outcome_col}}) == 1) %>%
      ungroup,
         `3` = dat %>% 
           group_by({{study_col}}) %>%
           filter(n_distinct({{group_col}}) > 1, n_distinct({{outcome_col}}) > 1) %>%
      ungroup,
         
         `4` = dat %>% 
           group_by({{study_col}}) %>%
           filter(n_distinct({{group_col}}) == 1, n_distinct({{outcome_col}}) == 1) %>%
      ungroup
         )  }
     purrr::map(1:4, ~ fn1(.x))

}

-testing

> foo(h, study, group, outcome)
[[1]]
# A tibble: 2 x 4
  study outcome group  time
  <chr>   <int> <int> <int>
1 a           1     1     0
2 a           2     1     1

[[2]]
# A tibble: 2 x 4
  study outcome group  time
  <chr>   <int> <int> <int>
1 b           1     1     0
2 b           1     2     0

[[3]]
# A tibble: 2 x 4
  study outcome group  time
  <chr>   <int> <int> <int>
1 c           2     1     0
2 c           3     2     1

[[4]]
# A tibble: 3 x 4
  study outcome group  time
  <chr>   <int> <int> <int>
1 d           1     1     0
2 d           1     1     0
3 e           1     1     0

Or use

foo2 <- function(dat, study_col, group_col, outcome_col) {

    dat %>%
           dplyr::select({{study_col}}, {{group_col}}, {{outcome_col}}) %>%
           dplyr::group_by({{study_col}}) %>%
          dplyr::mutate(grp = stringr::str_c(n_distinct({{group_col}}) == 1, 
              n_distinct({{outcome_col}}) == 1 ))   %>%
           dplyr::ungroup(.) %>%
           dplyr::group_split(grp, .keep = FALSE)  

}

-testing

> foo2(h, study, group, outcome)
<list_of<
  tbl_df<
    study  : character
    group  : integer
    outcome: integer
  >
>[4]>
[[1]]
# A tibble: 2 x 3
  study group outcome
  <chr> <int>   <int>
1 c         1       2
2 c         2       3

[[2]]
# A tibble: 2 x 3
  study group outcome
  <chr> <int>   <int>
1 b         1       1
2 b         2       1

[[3]]
# A tibble: 2 x 3
  study group outcome
  <chr> <int>   <int>
1 a         1       1
2 a         1       2

[[4]]
# A tibble: 3 x 3
  study group outcome
  <chr> <int>   <int>
1 d         1       1
2 d         1       1
3 e         1       1

Follow-up: Putting back a missing column from a data.frame into a list of dta.frames

One option is to loop over the list ('LIST'), subset the data based with %in% on the pasteed rows of data and the list element data

LIST2 <-  lapply(LIST, function(x) 
    data[do.call(paste, data[names(x)]) %in% do.call(paste, x),])

-checking

> all.equal(DESIRED_LIST, LIST2, check.attributes = FALSE)
[1] TRUE

unexpected error when subsetting data.frame by range of date in R?

I can see couple of problem in OP's code.

Prob#1: The default format expected by as.Date is "%Y-%m-%d" or "%Y/%m/%d". But the formats of the characters (Begin, End columns) used in code is %d.%m.%Y or %s-%m-%Y. Hence default format in function as.Date() will not work. The format argument should specifically provided to as.Date() function.

The correct code to create DATE1 and DATE2 should be:

DATE1 <- as.Date("01-01-1981", format = "%d-%m-%Y")
DATE2 <- as.Date("31-12-2014", , format = "%d-%m-%Y")

Prob#2: The Begin and End columns of dataframe should be changed to as.Date format as well before attempting filter operations.

The format of those 2 columns can be changed as:

joinedData$Begin = as.Date(joinedData$Begin, format = "%d.%m.%Y")
joinedData$End= as.Date(joinedData$End, format = "%d.%m.%Y")

Now, the OP'2 initial approach should work.

Note: Personally I prefer using as.POSIXlt over as.Date

conditional subsetting of all rows of a cluster in a dataframe

We could use n_distinct to create the condition in filter after grouping by 'study'

library(dplyr)
h %>% 
    group_by(study) %>%
    filter(n_distinct(group) == 1, n_distinct(outcome) > 1)
# A tibble: 2 x 3
# Groups:   study [1]
  study outcome group
  <chr>   <int> <int>
1 a           1     1
2 a           2     1

Or using base R

subset(h, ave(group, study, FUN = function(x) length(unique(x)))
      == 1 & ave(outcome, study, FUN = function(x) length(unique(x)) > 1))
  study outcome group
1     a       1     1
2     a       2     1

We could generalize if we wanted

f1 <- function(dat, cond) {

  switch(cond, 

   `1` = dat %>% 
       group_by(study) %>%
       filter(n_distinct(group) == 1, n_distinct(outcome) > 1) %>%
       ungroup,
    `2` = dat %>% 
         group_by(study) %>%
         filter(n_distinct(group) > 1, n_distinct(outcome) == 1) %>%
         ungroup,
         
    `3` =  dat %>% 
         group_by(study) %>%
         filter(n_distinct(group) > 1, n_distinct(outcome) > 1) %>%
         ungroup,
    `4` = dat %>% 
         group_by(study) %>%
         filter(n_distinct(group) == 1, n_distinct(outcome) == 1) %>%
         ungroup
    )

}

-testing

> f1(h, 1)
# A tibble: 2 x 3
  study outcome group
  <chr>   <int> <int>
1 a           1     1
2 a           2     1
> f1(h, 2)
# A tibble: 2 x 3
  study outcome group
  <chr>   <int> <int>
1 b           1     1
2 b           1     2
> f1(h, 3)
# A tibble: 2 x 3
  study outcome group
  <chr>   <int> <int>
1 c           2     1
2 c           3     2
> f1(h, 4)
# A tibble: 3 x 3
  study outcome group
  <chr>   <int> <int>
1 d           1     1
2 d           1     1
3 e           1     1

How to create a loop which creates multiple subset dataframes from a larger data frame?

Your code works fine. Just remove list so you create a vector of color names and not a list. If you only want distinct values, use unique.

mydata <- data.frame(x = c(1,2,3), y = c('a','b','c'), z = c('red','red','yellow'))

colors <- unique(mydata$z)

for (i in 1:length(colors)) {
    assign(paste0("mydata_",i), subset(mydata, z == colors[[i]]))
    }

Is there an R package with a generalized class of data.frame in which a column can be an array (or how do I define such a class)?

data frames do allow matrix columns:

m <- diag(4)
v <- 1:4
DF <- data.frame(v, m = I(m))
str(DF)

giving:

'data.frame':   4 obs. of  2 variables:
 $ v: int  1 2 3 4
 $ m: 'AsIs' num [1:4, 1:4] 1 0 0 0 0 1 0 0 0 0 ...

Update 1

The R aggregate function can create matrix columns. For example,

DF <- data.frame(v = 1:4, g = c(1, 1, 2, 2))
ag <- aggregate(v ~ g, DF, function(x) c(sum = sum(x), mean = mean(x)))
str(ag)

giving:

'data.frame':   2 obs. of  2 variables:
 $ g: num  1 2
 $ v: num [1:2, 1:2] 3 7 1.5 3.5
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : NULL
  .. ..$ : chr  "sum" "mean"

Update 2

I don't think the aggregation discussed in the comments is nicely supported in R but you can use the following workaround:

m <- matrix(1:16, 4)
v <- c(1, 1, 2, 2)
DF <- data.frame(v, m = I(m))

nr <- nrow(DF)
ag2 <- aggregate(list(sum = 1:nr), DF["v"], function(ix) colSums(DF$m[ix, ]))
str(ag2)

giving:

'data.frame':   2 obs. of  2 variables:
 $ v  : num  1 2
 $ sum: num [1:2, 1:4] 3 7 11 15 19 23 27 31

Follow-Up: Generalizing a Data.Frame Subsetting Function 2

Follow-up: Generalizing a data.frame subsetting function 2

Follow-up: Generalizing a data.frame subsetting function

Generalizing a data.frame subsetting function

Follow-up: Putting back a missing column from a data.frame into a list of dta.frames

unexpected error when subsetting data.frame by range of date in R?

conditional subsetting of all rows of a cluster in a dataframe

How to create a loop which creates multiple subset dataframes from a larger data frame?

Is there an R package with a generalized class of data.frame in which a column can be an array (or how do I define such a class)?

Update 1

Update 2

Related Topics

Leave a reply