Aggregate by Factor Levels, Keeping Other Variables in the Resulting Data Frame

Aggregate by factor levels, keeping other variables in the resulting data frame

You need to merge the result of aggregate() back onto the original data.frame:

# Compute the minimum `value` per `code`, then join back onto `dat` on both
# columns so the remaining columns of the matching row(s) are kept.
min_by_code <- aggregate(value ~ code, data = dat, FUN = min)
merge(min_by_code, dat, by = c("code", "value"))
## code value index
## 1 HH11 24.1 023434
## 2 HH45 37.2 3377477
## 3 JL03 20.0 1177777

Aggregate a data frame while keeping other variables, with dplyr

Since you only have one unique school per class, you can simply include the school variable in the grouping variables:

# Group on both `school` and `class` so that `school` is carried into the
# summarised output alongside the per-class mean score.
df %>%
  group_by(school, class) %>%
  summarize(mean_score = mean(score))
# # A tibble: 100 x 3
# # Groups: school [?]
# school class mean_score
# <chr> <chr> <dbl>
# 1 A A1 0.000506
# 2 A A10 -0.000275
# 3 A A2 0.00136
# 4 A A3 0.000405
# 5 A A4 -0.00156
# 6 A A5 -0.00214
# 7 A A6 -0.00108
# 8 A A7 -0.000534
# 9 A A8 0.000804
# 10 A A9 0.00106
# # ... with 90 more rows

Here's a data.table equivalent:

library(data.table)
# Convert `df` to a data.table by reference, keyed on the grouping columns,
# then compute the mean score for each (school, class) pair.
setDT(df, key = c("school", "class"))
df[, list(mean_score = mean(score)), by = c("school", "class")]

aggregate with empty factor but keep row

# Create the example dataset: `val` cycles through 1, 2, 3 four times and
# `factor` bins it into A = [1, 2], B = (2, 3], C = (3, 4], so level C is
# declared but never observed.
# Note: cut()'s switch for an ordered factor is `ordered_result = TRUE`; a plain
# (unordered) factor is all that is needed here, so it is not set.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

library(dplyr)

# Sum `val` for every level that actually occurs in the data.
observed_sums <- df2 %>%
  group_by(factor) %>%
  summarise(x = sum(val))

# Start from every declared level so that empty levels survive the join,
# then replace the resulting NAs with 0.
data.frame(factor = levels(df2$factor)) %>%
  left_join(observed_sums, by = "factor") %>%
  mutate(x = coalesce(x, 0L))

# factor x
# 1 A 12
# 2 B 12
# 3 C 0

Or without any package

# Sum `val` per observed level; aggregate() names the columns Group.1 and x.
level_sums <- aggregate(df2$val, list(df2$factor), sum)

# Keep every declared level (all.x = TRUE) so unobserved levels appear with
# x = NA, then replace those NAs with 0.
dd <- merge(data.frame(Group.1 = levels(df2$factor)), level_sums, all.x = TRUE)
dd$x[is.na(dd$x)] <- 0
dd

# Group.1 x
# 1 A 12
# 2 B 12
# 3 C 0

Or using data.table package to check if it's faster

library(data.table)

# assuming you start with a data frame
# `val` cycles through 1, 2, 3 four times; `factor` bins it into A = [1, 2],
# B = (2, 3], C = (3, 4], leaving level C without any observations.
# Note: cut()'s switch for an ordered factor is `ordered_result = TRUE`; a plain
# (unordered) factor is all that is needed here, so it is not set.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    labels = LETTERS[1:3]
  )
)

# Lookup table holding every declared level of `factor`, keyed for the join.
dt_levels <- data.table(factor = levels(df2$factor), key = "factor")

# Convert df2 to a keyed data.table (by reference) and sum `val` per level.
dt_sum <- setDT(df2, key = "factor")[, .(Sum = sum(val)), by = "factor"]

# Join so that every row of dt_levels is kept; levels with no data get NA,
# which is then replaced by 0 in place.
dt_result <- dt_sum[dt_levels]
dt_result[, Sum := ifelse(is.na(Sum), 0, Sum)]

dt_result[]

# factor Sum
# 1: A 12
# 2: B 12
# 3: C 0

Converting numeric values to factor levels with factor levels assigned on the basis of the numeric ordering

Based on the description, it seems like the OP wanted to change the levels to numeric values starting from 1. This can be done using match

# Replace every column of the data frame `c` with a factor whose labels are the
# rank (1, 2, ...) of each value among that column's sorted unique values.
# `c[] <-` keeps the data.frame structure while swapping the columns.
c[] <- lapply(c, function(column) {
  rank_in_column <- match(column, sort(unique(column)))
  factor(rank_in_column)
})
c
# var1 var2
#1 1 4
#2 2 5
#3 3 6
#4 4 7
#5 5 8
#6 1 1
#7 2 2
#8 3 3
#9 5 4
#10 6 5

data

# Example input: a numeric vector with repeated values and a character vector
# of letters (e..i followed by b..f), combined column-wise into data frame `c`.
a <- c(0, 1, 3, 5, 6, 0, 1, 3, 6, 12)
b <- letters[c(5:9, 2:6)]
c <- data.frame(var1 = a, var2 = b)

Based on the code in the comments, another option to replace str_pad is

# Zero-pad the values of `a` to width 2 with sprintf() and keep both columns
# as plain character vectors.
padded <- sprintf("%02d", a)
c <- data.frame(var1 = padded, var2 = b, stringsAsFactors = FALSE)

Label every repeat of a factor as the same number in a new column in order of occurrence across factor levels

Assuming you create the c data frame without the "count" column already in it, you can do the following with dplyr:

# Keep the first row of every (class, set) combination, then number those
# rows 1, 2, ... within each `set`.
makecounter <- c %>%
  group_by(class, set) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  group_by(set) %>%
  mutate(count = seq_along(set))

# Attach the counter to every original row via its (class, set) pair.
final <- left_join(c, makecounter, by = c("class", "set"))

Take difference between two levels of factor variable while retaining other factor variables in R

One option with dplyr would be

library(dplyr)

# Within each Gene/Population pair, subtract the Green coverage from the Blue
# coverage, collapsing the two Color rows into one.
my.df %>%
  group_by(Gene, Population) %>%
  summarize(
    Coverage = Coverage[Color == "Blue"] - Coverage[Color == "Green"]
  )

# A tibble: 4 x 3
# Groups: Gene [?]
# Gene Population Coverage
# <fct> <fct> <dbl>
# 1 A_1 PopA -0.00600
# 2 A_1 PopB -0.420
# 3 A_2 PopA -0.01
# 4 A_2 PopB 0.100

Data

# Example data: 2 genes x 2 populations x 2 colors, one Coverage value each.
# The three grouping columns are factors with explicitly ordered levels.
my.df <- data.frame(
  Gene = factor(rep(c("A_1", "A_2"), each = 4L), levels = c("A_1", "A_2")),
  Population = factor(
    rep(rep(c("PopA", "PopB"), each = 2L), times = 2L),
    levels = c("PopA", "PopB")
  ),
  Color = factor(rep(c("Blue", "Green"), times = 4L), levels = c("Blue", "Green")),
  Coverage = c(0.016, 0.022, 0.1322, 0.552, 0.13, 0.14, 1, 0.9)
)

Counting number of elements in a character column by levels of a factor column in a dataframe

A dplyr solution.

# Drop rows without a product, then count the remaining rows per company.
with_product <- filter(df, !is.na(product))
with_product %>%
  group_by(company) %>%
  count()

# A tibble: 4 × 2
comp n
<fctr> <int>
1 A 2
2 B 2
3 C 3
4 D 1

R use ddply or aggregate

I, too, would recommend data.table here, but since you asked for an aggregate solution, here is one which combines aggregate and merge to get all the columns:

# Find each customer's latest saleDate, then join on the shared columns to
# pull in the full matching row(s) from events22.
latest_sale <- aggregate(saleDate ~ custId, data = events22, FUN = max)
merge(events22, latest_sale)

Or just aggregate if you only want the "custId" and "DelivDate" columns:

# For each custId, return the DelivDate of the row with the latest saleDate.
# The aggregation runs over row indices rather than over saleDate itself:
# which.max() on a group's values yields a position *within that group*, so it
# has to be mapped back to a row of events22 before DelivDate is looked up.
aggregate(
  list(DelivDate = seq_len(nrow(events22))),
  list(custId = events22$custId),
  function(rows) {
    latest <- rows[which.max(events22[["saleDate"]][rows])]
    events22[["DelivDate"]][latest]
  }
)

Finally, here's an option using sqldf:

library(sqldf)
# One row per custId with the maximum saleDate.
# NOTE(review): DelivDate is a bare (non-aggregated) column; which row it is
# taken from depends on the SQL backend -- confirm it is the max(saleDate) row.
latest_sale_query <- "select custId, DelivDate, max(saleDate) `saleDate`
from events22 group by custId"
sqldf(latest_sale_query)

Benchmarks

I'm not a benchmarking or data.table expert, but it surprised me that data.table is not faster here. My suspicion is that the results would be quite different on a larger dataset, say for instance, your 400k lines one. Anyway, here's some benchmarking code modeled after @mnel's answer here so you can do some tests on your actual dataset for future reference.

library(rbenchmark)

First, set up your functions for what you want to benchmark.

# Benchmark candidate (plyr): for each custId, extract the DelivDate of the
# row(s) whose saleDate equals that customer's maximum saleDate.
# Reads the global `events22`; the result is assigned locally and returned
# invisibly as the value of the assignment.
DDPLY <- function() {
  x <- ddply(
    events22, .(custId),
    function(grp) grp[grp$saleDate == max(grp$saleDate), "DelivDate"],
    .inform = TRUE
  )
}
# Benchmark candidate (data.table): per custId, keep the row of the group's
# subset (.SD) with the largest saleDate. Reads the global data.table `dt`.
DATATABLE <- function() {
  x <- dt[, .SD[which.max(saleDate)], by = custId]
}
# Benchmark candidate (base R): compute each customer's latest saleDate with
# aggregate(), then merge on the shared columns to recover the full rows.
# Reads the global `events22`; the merged result is returned invisibly.
AGG1 <- function() {
  latest_sale <- aggregate(saleDate ~ custId, data = events22, FUN = max)
  x <- merge(events22, latest_sale)
}
# Benchmark candidate (base R, aggregate only): for each custId, return the
# DelivDate of the row with the latest saleDate.
# Reads the global `events22`; the result (columns custId, DelivDate) is
# returned invisibly as the value of the assignment.
AGG2 <- function() {
  # Aggregate over row indices rather than over saleDate itself: which.max()
  # on a group's values yields a position *within that group*, which must be
  # mapped back to a row of events22 before DelivDate is looked up.
  x <- aggregate(
    list(DelivDate = seq_len(nrow(events22))),
    list(custId = events22$custId),
    function(rows) {
      latest <- rows[which.max(events22[["saleDate"]][rows])]
      events22[["DelivDate"]][latest]
    }
  )
}
# Benchmark candidate (sqldf): one row per custId with the maximum saleDate.
# NOTE(review): DelivDate is a bare (non-aggregated) column; which row it is
# taken from depends on the SQL backend -- confirm it is the max(saleDate) row.
SQLDF <- function() {
  latest_sale_query <- "select custId, DelivDate, max(saleDate) `saleDate`
from events22 group by custId"
  x <- sqldf(latest_sale_query)
}
# Benchmark candidate (base R, split-apply-combine): split `events22` by
# custId, keep the row with the largest saleDate from each piece, and stack
# the pieces back into one data frame. Reads the global `events22`.
DOCALL <- function() {
  per_customer <- split(events22, events22$custId)
  latest_rows <- lapply(per_customer, function(grp) {
    grp[which.max(grp$saleDate), ]
  })
  do.call(rbind, latest_rows)
}

Second, do the benchmarking.

# Time all six candidates, sort by elapsed time, and show only the first five
# columns of the benchmark table.
timings <- benchmark(
  DDPLY(), DATATABLE(), AGG1(), AGG2(), SQLDF(), DOCALL(),
  order = "elapsed"
)
timings[1:5]
# test replications elapsed relative user.self
# 4 AGG2() 100 0.285 1.000 0.284
# 3 AGG1() 100 0.891 3.126 0.896
# 6 DOCALL() 100 1.202 4.218 1.204
# 2 DATATABLE() 100 1.251 4.389 1.248
# 1 DDPLY() 100 1.254 4.400 1.252
# 5 SQLDF() 100 2.109 7.400 2.108


Related Topics



Leave a reply



Submit