Follow-up: Generalizing a data.frame subsetting function 2
We could add some conditions with case_when
foo2 <- function(dat, study_col, ...) {
dot_cols <- ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)
dat %>%
dplyr::select({{study_col}}, !!! dot_cols) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = across(all_of(str_cols), ~ {
tmp <- n_distinct(.)
case_when(tmp == 1 ~ 1, tmp == n() ~ 2, tmp >1 & tmp < n() ~ 3, TRUE ~ 4)
}) %>%
purrr::reduce(stringr::str_c, collapse="")) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)
}
foo2(dat, study, group, outcome)
Follow-up: Generalizing a data.frame subsetting function
If we want to add 3 dots at the end, capture it as symbols in ensyms
, then convert it to string (as_string
), make the only change by looping across
those columns, create a logical condition with n_distinct
and paste (str_c
) with reduce
, while pasteing the other columns as well (could be simplified in a single across
as well) and then use group_split
on the final 'grp' column
foo2 <- function(dat, study_col, group_col, outcome_col, ...) {
dot_cols <- ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)
dat %>%
dplyr::select({{study_col}}, {{group_col}}, {{outcome_col}}, !!! dot_cols) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = across(all_of(str_cols), ~ n_distinct(.) == 1) %>%
purrr::reduce(stringr::str_c, collapse=""),
grp = stringr::str_c(n_distinct({{group_col}}) == 1,
n_distinct({{outcome_col}}) == 1, grp)) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)
}
-testing
> out <- foo2(h, case, group, outcome, sample, control)
> out
<list_of<
tbl_df<
case : integer
group : integer
outcome: integer
sample : integer
control: integer
>
>[14]>
[[1]]
# A tibble: 16 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 9 1 1 1 1
2 9 1 1 2 1
3 9 2 1 1 1
4 9 2 1 2 1
5 9 1 2 1 1
6 9 1 2 2 1
7 9 2 2 1 1
8 9 2 2 2 1
9 9 1 1 1 2
10 9 1 1 2 2
11 9 2 1 1 2
12 9 2 1 2 2
13 9 1 2 1 2
14 9 1 2 2 2
15 9 2 2 1 2
16 9 2 2 2 2
[[2]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 1 1 1 1 1
2 1 1 1 2 1
3 1 2 1 1 1
4 1 2 1 2 1
5 1 1 2 1 1
6 1 1 2 2 1
7 1 2 2 1 1
8 1 2 2 2 1
[[3]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 10 1 1 1 1
2 10 2 1 1 1
3 10 1 2 1 1
4 10 2 2 1 1
5 10 1 1 1 2
6 10 2 1 1 2
7 10 1 2 1 2
8 10 2 2 1 2
[[4]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 2 1 1 1 1
2 2 2 1 1 1
3 2 1 2 1 1
4 2 2 2 1 1
[[5]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 12 1 1 1 1
2 12 1 1 2 1
3 12 2 1 1 1
4 12 2 1 2 1
5 12 1 1 1 2
6 12 1 1 2 2
7 12 2 1 1 2
8 12 2 1 2 2
[[6]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 4 1 1 1 1
2 4 1 1 2 1
3 4 2 1 1 1
4 4 2 1 2 1
[[7]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 13 1 1 1 1
2 13 2 1 1 1
3 13 1 1 1 2
4 13 2 1 1 2
5 14 1 1 1 1
6 14 2 1 1 1
7 14 1 1 1 2
8 14 2 1 1 2
[[8]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 5 1 1 1 1
2 5 2 1 1 1
3 6 1 1 1 1
4 6 2 1 1 1
[[9]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 11 1 1 1 1
2 11 1 1 2 1
3 11 1 2 1 1
4 11 1 2 2 1
5 11 1 1 1 2
6 11 1 1 2 2
7 11 1 2 1 2
8 11 1 2 2 2
[[10]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 3 1 1 1 1
2 3 1 1 2 1
3 3 1 2 1 1
4 3 1 2 2 1
[[11]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 15 1 1 1 1
2 15 1 1 2 1
3 15 1 1 1 2
4 15 1 1 2 2
[[12]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 7 1 1 1 1
2 7 1 1 2 1
[[13]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 16 1 1 1 1
2 16 1 1 1 2
[[14]]
# A tibble: 1 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 8 1 1 1 1
testing the total rows
> sum(map_dbl(out, nrow))
[1] 81
> nrow(h)
[1] 81
If we want to remove some arguments
foo2 <- function(dat, study_col, ...) {
dot_cols <- ensyms(...)
str_cols <- purrr::map_chr(dot_cols, rlang::as_string)
dat %>%
dplyr::select({{study_col}}, !!! dot_cols) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = across(all_of(str_cols), ~ n_distinct(.) == 1) %>%
purrr::reduce(stringr::str_c, collapse="")) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)
}
-testing
> foo2(h, case, group, outcome, sample, control)
<list_of<
tbl_df<
case : integer
group : integer
outcome: integer
sample : integer
control: integer
>
>[14]>
[[1]]
# A tibble: 16 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 9 1 1 1 1
2 9 1 1 2 1
3 9 2 1 1 1
4 9 2 1 2 1
5 9 1 2 1 1
6 9 1 2 2 1
7 9 2 2 1 1
8 9 2 2 2 1
9 9 1 1 1 2
10 9 1 1 2 2
11 9 2 1 1 2
12 9 2 1 2 2
13 9 1 2 1 2
14 9 1 2 2 2
15 9 2 2 1 2
16 9 2 2 2 2
[[2]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 1 1 1 1 1
2 1 1 1 2 1
3 1 2 1 1 1
4 1 2 1 2 1
5 1 1 2 1 1
6 1 1 2 2 1
7 1 2 2 1 1
8 1 2 2 2 1
[[3]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 10 1 1 1 1
2 10 2 1 1 1
3 10 1 2 1 1
4 10 2 2 1 1
5 10 1 1 1 2
6 10 2 1 1 2
7 10 1 2 1 2
8 10 2 2 1 2
[[4]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 2 1 1 1 1
2 2 2 1 1 1
3 2 1 2 1 1
4 2 2 2 1 1
[[5]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 12 1 1 1 1
2 12 1 1 2 1
3 12 2 1 1 1
4 12 2 1 2 1
5 12 1 1 1 2
6 12 1 1 2 2
7 12 2 1 1 2
8 12 2 1 2 2
[[6]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 4 1 1 1 1
2 4 1 1 2 1
3 4 2 1 1 1
4 4 2 1 2 1
[[7]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 13 1 1 1 1
2 13 2 1 1 1
3 13 1 1 1 2
4 13 2 1 1 2
5 14 1 1 1 1
6 14 2 1 1 1
7 14 1 1 1 2
8 14 2 1 1 2
[[8]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 5 1 1 1 1
2 5 2 1 1 1
3 6 1 1 1 1
4 6 2 1 1 1
[[9]]
# A tibble: 8 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 11 1 1 1 1
2 11 1 1 2 1
3 11 1 2 1 1
4 11 1 2 2 1
5 11 1 1 1 2
6 11 1 1 2 2
7 11 1 2 1 2
8 11 1 2 2 2
[[10]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 3 1 1 1 1
2 3 1 1 2 1
3 3 1 2 1 1
4 3 1 2 2 1
[[11]]
# A tibble: 4 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 15 1 1 1 1
2 15 1 1 2 1
3 15 1 1 1 2
4 15 1 1 2 2
[[12]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 7 1 1 1 1
2 7 1 1 2 1
[[13]]
# A tibble: 2 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 16 1 1 1 1
2 16 1 1 1 2
[[14]]
# A tibble: 1 x 5
case group outcome sample control
<int> <int> <int> <int> <int>
1 8 1 1 1 1
Generalizing a data.frame subsetting function
If the input argument is unquoted, use {{}}
foo <- function(dat, study_col, group_col, outcome_col) {
fn1 <- function(cond) {
switch(cond,
`1` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) == 1, n_distinct({{outcome_col}}) > 1) %>%
ungroup,
`2` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) > 1, n_distinct({{outcome_col}}) == 1) %>%
ungroup,
`3` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) > 1, n_distinct({{outcome_col}}) > 1) %>%
ungroup,
`4` = dat %>%
group_by({{study_col}}) %>%
filter(n_distinct({{group_col}}) == 1, n_distinct({{outcome_col}}) == 1) %>%
ungroup
) }
purrr::map(1:4, ~ fn1(.x))
}
-testing
> foo(h, study, group, outcome)
[[1]]
# A tibble: 2 x 4
study outcome group time
<chr> <int> <int> <int>
1 a 1 1 0
2 a 2 1 1
[[2]]
# A tibble: 2 x 4
study outcome group time
<chr> <int> <int> <int>
1 b 1 1 0
2 b 1 2 0
[[3]]
# A tibble: 2 x 4
study outcome group time
<chr> <int> <int> <int>
1 c 2 1 0
2 c 3 2 1
[[4]]
# A tibble: 3 x 4
study outcome group time
<chr> <int> <int> <int>
1 d 1 1 0
2 d 1 1 0
3 e 1 1 0
Or use
foo2 <- function(dat, study_col, group_col, outcome_col) {
dat %>%
dplyr::select({{study_col}}, {{group_col}}, {{outcome_col}}) %>%
dplyr::group_by({{study_col}}) %>%
dplyr::mutate(grp = stringr::str_c(n_distinct({{group_col}}) == 1,
n_distinct({{outcome_col}}) == 1 )) %>%
dplyr::ungroup(.) %>%
dplyr::group_split(grp, .keep = FALSE)
}
-testing
> foo2(h, study, group, outcome)
<list_of<
tbl_df<
study : character
group : integer
outcome: integer
>
>[4]>
[[1]]
# A tibble: 2 x 3
study group outcome
<chr> <int> <int>
1 c 1 2
2 c 2 3
[[2]]
# A tibble: 2 x 3
study group outcome
<chr> <int> <int>
1 b 1 1
2 b 2 1
[[3]]
# A tibble: 2 x 3
study group outcome
<chr> <int> <int>
1 a 1 1
2 a 1 2
[[4]]
# A tibble: 3 x 3
study group outcome
<chr> <int> <int>
1 d 1 1
2 d 1 1
3 e 1 1
Follow-up: Putting back a missing column from a data.frame into a list of dta.frames
One option is to loop over the list
('LIST'), subset the data
based with %in%
on the paste
ed rows of data and the list element data
LIST2 <- lapply(LIST, function(x)
data[do.call(paste, data[names(x)]) %in% do.call(paste, x),])
-checking
> all.equal(DESIRED_LIST, LIST2, check.attributes = FALSE)
[1] TRUE
unexpected error when subsetting data.frame by range of date in R?
I can see couple of problem in OP's code.
Prob#1: The default format expected by as.Date
is "%Y-%m-%d" or "%Y/%m/%d"
. But the formats of the characters (Begin
, End
columns) used in code is %d.%m.%Y
or %s-%m-%Y
. Hence default format
in function as.Date()
will not work. The format
argument should specifically provided to as.Date()
function.
The correct code to create DATE1
and DATE2
should be:
DATE1 <- as.Date("01-01-1981", format = "%d-%m-%Y")
DATE2 <- as.Date("31-12-2014", , format = "%d-%m-%Y")
Prob#2: The Begin
and End
columns of dataframe should be changed to as.Date
format as well before attempting filter operations.
The format of those 2 columns can be changed as:
joinedData$Begin = as.Date(joinedData$Begin, format = "%d.%m.%Y")
joinedData$End= as.Date(joinedData$End, format = "%d.%m.%Y")
Now, the OP'2 initial approach should work.
Note: Personally I prefer using as.POSIXlt
over as.Date
conditional subsetting of all rows of a cluster in a dataframe
We could use n_distinct
to create the condition in filter
after grouping by 'study'
library(dplyr)
h %>%
group_by(study) %>%
filter(n_distinct(group) == 1, n_distinct(outcome) > 1)
# A tibble: 2 x 3
# Groups: study [1]
study outcome group
<chr> <int> <int>
1 a 1 1
2 a 2 1
Or using base R
subset(h, ave(group, study, FUN = function(x) length(unique(x)))
== 1 & ave(outcome, study, FUN = function(x) length(unique(x)) > 1))
study outcome group
1 a 1 1
2 a 2 1
We could generalize if we wanted
f1 <- function(dat, cond) {
switch(cond,
`1` = dat %>%
group_by(study) %>%
filter(n_distinct(group) == 1, n_distinct(outcome) > 1) %>%
ungroup,
`2` = dat %>%
group_by(study) %>%
filter(n_distinct(group) > 1, n_distinct(outcome) == 1) %>%
ungroup,
`3` = dat %>%
group_by(study) %>%
filter(n_distinct(group) > 1, n_distinct(outcome) > 1) %>%
ungroup,
`4` = dat %>%
group_by(study) %>%
filter(n_distinct(group) == 1, n_distinct(outcome) == 1) %>%
ungroup
)
}
-testing
> f1(h, 1)
# A tibble: 2 x 3
study outcome group
<chr> <int> <int>
1 a 1 1
2 a 2 1
> f1(h, 2)
# A tibble: 2 x 3
study outcome group
<chr> <int> <int>
1 b 1 1
2 b 1 2
> f1(h, 3)
# A tibble: 2 x 3
study outcome group
<chr> <int> <int>
1 c 2 1
2 c 3 2
> f1(h, 4)
# A tibble: 3 x 3
study outcome group
<chr> <int> <int>
1 d 1 1
2 d 1 1
3 e 1 1
How to create a loop which creates multiple subset dataframes from a larger data frame?
Your code works fine. Just remove list
so you create a vector of color names and not a list. If you only want distinct values, use unique
.
mydata <- data.frame(x = c(1,2,3), y = c('a','b','c'), z = c('red','red','yellow'))
colors <- unique(mydata$z)
for (i in 1:length(colors)) {
assign(paste0("mydata_",i), subset(mydata, z == colors[[i]]))
}
Is there an R package with a generalized class of data.frame in which a column can be an array (or how do I define such a class)?
data frames do allow matrix columns:
m <- diag(4)
v <- 1:4
DF <- data.frame(v, m = I(m))
str(DF)
giving:
'data.frame': 4 obs. of 2 variables:
$ v: int 1 2 3 4
$ m: 'AsIs' num [1:4, 1:4] 1 0 0 0 0 1 0 0 0 0 ...
Update 1
The R aggregate
function can create matrix columns. For example,
DF <- data.frame(v = 1:4, g = c(1, 1, 2, 2))
ag <- aggregate(v ~ g, DF, function(x) c(sum = sum(x), mean = mean(x)))
str(ag)
giving:
'data.frame': 2 obs. of 2 variables:
$ g: num 1 2
$ v: num [1:2, 1:2] 3 7 1.5 3.5
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr "sum" "mean"
Update 2
I don't think the aggregation discussed in the comments is nicely supported in R but you can use the following workaround:
m <- matrix(1:16, 4)
v <- c(1, 1, 2, 2)
DF <- data.frame(v, m = I(m))
nr <- nrow(DF)
ag2 <- aggregate(list(sum = 1:nr), DF["v"], function(ix) colSums(DF$m[ix, ]))
str(ag2)
giving:
'data.frame': 2 obs. of 2 variables:
$ v : num 1 2
$ sum: num [1:2, 1:4] 3 7 11 15 19 23 27 31
Related Topics
Calculate Difference Between Dates by Group in R
Grouping Factor Levels in a Data.Table
Select N Rows Above and Below Match
Extracting HTML Table from a Website in R
Replace Na with Grouped Means in R
Convert Jpg to Greyscale CSV Using R
Mapping Variable to Hexagon Size with Geom_Hex
Add Months of Zero Demand to Zoo Time Series
Dealing with Nan's in Matlab Functions
How to Filter Rows Based on the Previous Row and Keep Previous Row Using Dplyr
How to Substitute Symbols in a Language Object
Obtain Function from Akima::Interp() Matrix
Stargazer Output Appears Below Text - Rmarkdown to PDF
Split Multiple Comma-Separated Column into Separate Rows
How to Plot Charts with Nested Categories Axes