Extracting a Random Sample of Rows in a Data.Frame with a Nested Conditional

Extracting a random sample of rows in a data.frame with a nested conditional

I'd use which() to get the vector of row numbers that satisfy your condition, and then sample from that vector:

iris[ sample( which( iris$Species != "setosa" ) , 2 ) , ]
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#59 6.6 2.9 4.6 1.3 versicolor
#133 6.4 2.8 5.6 2.2 virginica

Random sample of rows with at least one from each condition

You can specify the n for each group here (use 1s if you only want a data frame with nrows == number of groups):

dd <- read.table(header = TRUE, text = 'a   b   c
23 34 Falcons
14 9 Hawks
2 18 Eagles
3 21 Eagles
22 8 Falcons
11 4 Hawks', stringsAsFactors = FALSE)

(n <- setNames(c(1,2,1), unique(dd$c)))
# Falcons Hawks Eagles
# 1 2 1

set.seed(1)
# For every group in dd$c, build a flag vector holding n[group] TRUEs and
# (group size - n[group]) FALSEs, shuffle it with sample(), and let ave()
# write the shuffled flags back to that group's row positions.
# dd$c is character (stringsAsFactors = FALSE above), so ave() returns the
# flags as the strings "TRUE"/"FALSE"; as.logical() turns them back into a
# logical row selector.
dd[as.logical(ave(dd$c, dd$c, FUN = function(x)
sample(rep(c(FALSE, TRUE), c(length(x) - n[x[1]], n[x[1]]))))), ]

# a b c
# 1 23 34 Falcons
# 2 14 9 Hawks
# 4 3 21 Eagles
# 6 11 4 Hawks

Putting this into a function to automate some other things for you

#' Sample a fixed number of rows from each group of a data frame
#'
#' @param data A data frame (or a matrix with column names).
#' @param var Name (or index) of the single column that defines the groups.
#' @param n Number of rows to draw per group. Recycled with `rep_len()` to
#'   the number of groups and matched to the groups in the order used by
#'   `table()` (sorted levels, not order of appearance).
#' @return The selected rows of `data`, in their original row order. Rows
#'   whose group value is `NA` are never selected.
sample_each <- function(data, var, n = 1L) {
  stopifnot(length(var) == 1L, is.numeric(n), length(n) >= 1L)

  # `[[` also works for tibbles, where `data[, var]` would not yield a vector.
  groups <- if (is.data.frame(data)) data[[var]] else data[, var]
  if (is.null(groups)) {
    stop("column '", var, "' not found in `data`", call. = FALSE)
  }

  lvl <- table(groups)
  n_rows <- setNames(as.vector(lvl), names(lvl))
  n1 <- setNames(rep_len(n, length(n_rows)), names(n_rows))
  if (anyNA(n1) || any(n1 < 0) || any(n1 != trunc(n1))) {
    stop("`n` must contain non-negative whole numbers", call. = FALSE)
  }

  # Fail with a readable message instead of rep()'s "invalid 'times'" error.
  # Empty (unused) factor levels are ignored, as they contain no rows.
  too_many <- n_rows > 0 & n1 > n_rows
  if (any(too_many)) {
    stop(
      "`n` exceeds the number of rows in group(s): ",
      paste(names(n_rows)[too_many], collapse = ", "),
      call. = FALSE
    )
  }
  n0 <- n_rows - n1

  idx <- ave(as.character(groups), groups, FUN = function(x) {
    # ave() may call FUN on empty groups (unused factor levels).
    if (length(x) == 0L) {
      return(x)
    }
    key <- x[1]
    flags <- rep(0:1, c(n0[[key]], n1[[key]]))
    # Shuffle by position; avoids sample()'s special case for length-1 input.
    flags[sample.int(length(flags))]
  })

  # which() drops NA flags (rows with a missing group value); drop = FALSE
  # keeps the result rectangular even for a single column.
  data[which(idx == "1"), , drop = FALSE]
}

sample_each(dd, 'c', n = c(1,2,1))
# a b c
# 1 23 34 Falcons
# 3 2 18 Eagles
# 5 22 8 Falcons
# 6 11 4 Hawks

sample_each(mtcars, 'gear', 1)
# mpg cyl disp hp drat wt qsec vs am gear carb
# Valiant 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1
# Merc 280 19.2 6 167.6 123 3.92 3.44 18.30 1 0 4 4
# Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.60 0 1 5 8

sample_each(mtcars, 'gear', c(2,2,5))
# mpg cyl disp hp drat wt qsec vs am gear carb
# Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
# Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
# Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
# Mazda RX4 Wag1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# Hornet Sportabout1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# Merc 2801 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4

Creating a random sample from a dataframe with a nested structure

Very easy with dplyr:

library(dplyr)
iris %>%
group_by(Species) %>%
sample_n(size = 2)

# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1 4.6 3.4 1.4 0.3 setosa
# 2 5.2 3.5 1.5 0.2 setosa
# 3 6.5 2.8 4.6 1.5 versicolor
# 4 5.7 2.8 4.5 1.3 versicolor
# 5 5.8 2.8 5.1 2.4 virginica
# 6 7.7 2.6 6.9 2.3 virginica

You can group by as many columns as you'd like

CO2 %>% group_by(Type, Treatment) %>% sample_n(size = 2)

R - Extract random sample with Conditional using 'Which' in Loop

The group_by and sample_n functions in the dplyr package let you do this easily:

library(dplyr)
subset <- H0_LONG %>%
group_by(Patch) %>%
sample_n(25)

This approach will typically also run faster than a for loop. Note that this code is just another way of writing:

subset <- sample_n(group_by(H0_LONG, Patch), 25)

R - subsetting original data frame: N random observations, 50% of N has ethnicity E and 50% of N has an education E

A simple solution would be to sample 1/4 for each combination, hoping that this combination exists:

n  <- 1e2 / 4
y <- x[c(sample(which(x$et & x$ed), n, TRUE)
, sample(which(!x$et & x$ed), n, TRUE)
, sample(which(x$et & !x$ed), n, TRUE)
, sample(which(!x$et & !x$ed), n, TRUE)),]
table(y)
# ed
#et FALSE TRUE
# FALSE 25 25
# TRUE 25 25

In case there is a combination which does not exist you can get the proportion of each combination with table like:

n  <- 1e2
x <- x[!x$et | x$ed,]
tt <- table(x)
tt <- tt * t(tt)
tt <- tt / rowSums(tt)
tt <- tt / rep(colSums(tt), each=2)
tt <- round(proportions(tt)*n) #Since R 4.0.0: prop.table becomes proportions
#tt <- round(prop.table(tt)*n) #Here the target number might not be reached
y <- x[c(sample(which(!x$et & !x$ed), tt[1], TRUE)
, sample(which(x$et & !x$ed), tt[2], TRUE)
, sample(which(!x$et & x$ed), tt[3], TRUE)
, sample(which(x$et & x$ed), tt[4], TRUE)),]
table(y)
# ed
#et FALSE TRUE
# FALSE 50 0
# TRUE 0 50

Data:

set.seed(7)
n <- 1e4
x <- data.frame(et=sample(c(TRUE,FALSE), n, TRUE, c(.25,.75)), ed=sample(c(TRUE,FALSE), n, TRUE, c(.75,.25)))

Selected randomly-sized, random subsets of rows

filter(lads, 
scenario == "north" & urban_areas_simple == "Primary Urban Areas") %>%
sample_frac(runif(1))

does just that.

The value is guaranteed to be returnable and it can handle stratified sampling from a grouped dataframe with unequal group sizes.

Using sample() to sample from nested lists in R

Two problems:

  • inc.outcomes[[index]] is a problem since index is 60-long here, meaning you are ultimately trying inc.outcomes[[ c(1,1,...,2,2,...,3,3) ]], which is incorrect. [[-indexing takes either a length-1 index (for most uses) or a vector as long as the list is deeply nested. For example, in list(list(1,2),list(3,4))[[ c(1,2) ]] the length-2 index [[c(1,2)]] works because we have 2-deep nested lists. Since inc.outcomes is only 1-deep, we can only use a length-1 index in [[.

  • This means we need to do this by index, i.e. with by = index. (And from this, we need to change from nrow(dt) to .N, but frankly we should be using that anyway even without by=.)

dt[, inc.period := sample(inc.outcomes[[ index[1] ]], .N, replace = TRUE), by = index]
# index iteration inc.period
# <int> <int> <num>
# 1: 1 1 17
# 2: 1 2 17
# 3: 1 3 21
# 4: 1 4 24
# 5: 1 5 3
# 6: 1 6 1
# 7: 1 7 17
# 8: 1 8 0
# 9: 1 9 1
# 10: 1 10 0
# ---
# 51: 3 11 0
# 52: 3 12 0
# 53: 3 13 28
# 54: 3 14 28
# 55: 3 15 9
# 56: 3 16 28
# 57: 3 17 7
# 58: 3 18 28
# 59: 3 19 28
# 60: 3 20 28

My data:

dt <- setDT(structure(list(index = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), iteration = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,  11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L)), row.names = c(NA, -60L), class = c("data.table", "data.frame"), sorted = c("index", "iteration")))

Conditional removal of rows for grouped data in R

Raw data: Remove the default factoring of variables while creating data frames.

df<-data.frame(id = id,year = year, month = month, day = day, value = value, stringsAsFactors = FALSE)

Group the data by id, year, month and get the number of days in each group using the .N internal variable. Next, chain a second step onto the result (similar to %>% in dplyr): keep only the rows where N > 15, group by id and year, and compute the mean rainfall for each group, stored in avg_rainfall.

setDT() converts a data.frame to a data.table by reference.

library("data.table")
setDT(df)[, .(value, .N), by = .(id, year, month)][N > 15, .(avg_rainfall = mean(value)), by = .(id, year)]

# id year avg_rainfall
# 1: Station_1 1950 4.852840
# 2: Station_1 1951 5.138069
# 3: Station_1 1952 4.934006
# 4: Station_2 1950 4.870335
# 5: Station_2 1951 5.179425
# 6: Station_2 1952 5.055026
# 7: Station_3 1950 4.959524
# 8: Station_3 1951 5.049996
# 9: Station_3 1952 4.927548


Related Topics



Leave a reply



Submit