Sum by Distinct Column Value in R

Sum by distinct column value in R

I think the neatest way to do this is in dplyr

library(dplyr)
# Collapse to one row per (shop_id, shop_name, city) and sum every remaining
# column. across(everything(), sum) is the current replacement for the
# superseded summarise_all(sum); everything() here covers all non-grouping
# columns.
shop %>%
  group_by(shop_id, shop_name, city) %>%
  summarise(across(everything(), sum))

How to extract and sum distinct values from a column and create a column with the sum

We group by 'TYPE', get the unique 'SIZE' and return with the sum of those values in summarise

library(dplyr)
# For each TYPE, drop repeated SIZE values and add up what is left.
# na.rm = TRUE keeps a missing SIZE from turning the whole sum into NA.
df1 %>%
  group_by(TYPE) %>%
  summarise(
    Sum = sum(unique(SIZE), na.rm = TRUE)
  )

-output

# A tibble: 1 x 2
TYPE Sum
<chr> <dbl>
1 A 68409188.

data

# Example input: ten rows, all of TYPE "A", with several repeated SIZE values.
df1 <- data.frame(
  TYPE = rep("A", 10L),
  SIZE = c(
    24522145.17, 35359867.65, 35359867.65, 35359867.65, 35359867.65,
    35359867.65, 24522145.17, 35359867.65, 35359867.65, 8527174.786
  ),
  stringsAsFactors = FALSE
)

Sum of unique values in a column

One way using dplyr is to keep, within each Year, only the rows where dist is at most 1100 and the key has not already appeared, and then sum the RR and dist columns.

library(dplyr)

# Within each Year, keep rows whose dist is at most 1100 and whose key is a
# first occurrence, then total RR and dist over the surviving rows.
# Both conditions are evaluated on the full group, so duplicated(key) sees
# every row of the Year, including rows the dist condition rejects.
df %>%
  group_by(Year) %>%
  filter(dist <= 1100, !duplicated(key)) %>%
  summarise(
    RR = sum(RR),
    dist = sum(dist)
  )

To count distinct values, we can use n_distinct

# Count how many different keys remain per Year once rows with dist above
# 1100 have been removed.
df %>%
  filter(dist <= 1100) %>%
  group_by(Year) %>%
  summarise(
    n = n_distinct(key)
  )

Sum by distinct column value in R, ignoring duplicate values

We can wrap with unique before getting the sum

library(dplyr)
# Add per-shop totals as new columns while keeping every original row.
other_shop %>%
  group_by(shop_id, shop_name) %>%
  mutate(
    # unique() de-duplicates by value, so a goal repeated on several rows of
    # the same shop is counted once. NOTE: two different cities that happen to
    # share the same goal would also be counted once.
    shop_total_sale_goal = sum(unique(city_sale_goal), na.rm = TRUE),
    # Profit is summed over all rows of the shop, duplicates included.
    shop_profit = sum(profit, na.rm = TRUE)
  ) %>%
  ungroup()

-output

# A tibble: 6 x 7
# shop_id shop_name city city_sale_goal profit shop_total_sale_goal shop_profit
# <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 1 Shop A London 12 3 21 7
#2 1 Shop A London 12 1 21 7
#3 1 Shop A Paris 9 3 21 7
#4 2 Shop B Cardiff 15 6 15 6
#5 3 Shop C Dublin 10 5 10 14
#6 3 Shop C Dublin 10 9 10 14

Rowsums in R by unique values in multiple columns

You can reshape the data to long format, so that every player (White or Black) gets their own row, and then sum Games for each Player.

library(dplyr)
library(tidyr)

# Stack the White and Black columns into a single Player column (each game row
# then appears once per participant) and total Games per player.
df %>%
  pivot_longer(
    cols = c(White, Black),
    values_to = "Player"
  ) %>%
  group_by(Player) %>%
  summarise(
    Games_total = sum(Games)
  )

# Player Games_total
# <chr> <int>
#1 Anand 23
#2 Carlsen 19
#3 Caruana 27
#4 Giri 24
#5 Grischuk 7
#6 Naka 12

data

# Example input: one row per pairing, with the number of games played.
df <- data.frame(
  White = c("Anand", "Carlsen", "Caruana", "Giri", "Grischuk"),
  Black = c("Caruana", "Naka", "Giri", "Anand", "Carlsen"),
  Games = c(13L, 12L, 14L, 10L, 7L),
  stringsAsFactors = FALSE
)

How to sum project values in column based on unique project ID?

One option is to use group_by %>% do; inside do, you can use distinct to get the unique values per project and sum the result:

# For every (proj_manager, stage) group, add a sum_value column holding the
# total of `value` counted once per distinct (proj_ID, value) pair.
df_test %>%
  group_by(proj_manager, stage) %>%
  do({
    # `.` is the data for the current group; collapse repeated project rows.
    unique_pairs <- distinct(., proj_ID, value)
    mutate(., sum_value = sum(unique_pairs$value))
  })

#Source: local data frame [7 x 5]
#Groups: proj_manager, stage [3]

# proj_manager proj_ID stage value sum_value
# <fctr> <dbl> <fctr> <dbl> <dbl>
#1 Alice 3 A 70 70
#2 Alice 4 C 5 5
#3 Emma 1 B 15 35
#4 Emma 1 B 15 35
#5 Emma 2 B 20 35
#6 Emma 2 B 20 35
#7 Emma 2 B 20 35

How to calculate the sum of distinct observations in R dplyr

We may use replace with unique

library(dplyr)
library(tidyr)
# Reshape the two count columns to long format, then, per id, write the sum of
# the distinct values into the first row of the group and NA into the rest.
df %>%
  pivot_longer(
    c(counts_col1:counts_col2),
    names_to = "strings",
    values_to = "value"
  ) %>%
  # Only id is used for grouping. The former group_by(id, col1, col2) was
  # immediately replaced by this call and had no effect.
  group_by(id) %>%
  mutate(
    # rep(NA_real_, n()) builds an all-NA column for the group; replace()
    # overwrites position 1 with the sum of the group's unique values.
    sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))
  ) %>%
  ungroup()

-output

# A tibble: 12 × 6
col1 col2 id strings value sum_distinct
<chr> <chr> <dbl> <chr> <dbl> <dbl>
1 apple pple 1 counts_col1 100 152
2 apple pple 1 counts_col2 2 NA
3 apple app 1 counts_col1 100 NA
4 apple app 1 counts_col2 50 NA
5 pple app 1 counts_col1 2 NA
6 pple app 1 counts_col2 50 NA
7 banana bananna 2 counts_col1 200 222
8 banana bananna 2 counts_col2 2 NA
9 banana banan 2 counts_col1 200 NA
10 banana banan 2 counts_col2 20 NA
11 bananna banan 2 counts_col1 2 NA
12 bananna banan 2 counts_col2 20 NA

Summing up values in one column based on unique values in another column

In base R you could use ave

# Add two columns computed within each level of B: D is the group sum of C and
# E is the group size. ave() returns a vector as long as C, so each group
# result is repeated on every row of that group.
df[, c("D", "E")] <- sapply(
  list(sum, length),
  function(stat) ave(df$C, df$B, FUN = stat)
)
df
# A B C D E
#1 1 1 5 12 3
#2 2 1 4 12 3
#3 3 1 3 12 3
#4 4 2 1 4 2
#5 5 2 3 4 2

Or using dplyr

library(dplyr)
# Same result with dplyr: per level of B, D is the sum of C and E is the number
# of rows in the group. The result is left grouped by B.
df <- df %>%
  group_by(B) %>%
  mutate(
    D = sum(C),
    E = n()
  )
df
## A tibble: 5 x 5
## Groups: B [2]
# A B C D E
# <int> <int> <int> <int> <int>
#1 1 1 5 12 3
#2 2 1 4 12 3
#3 3 1 3 12 3
#4 4 2 1 4 2
#5 5 2 3 4 2

Sample data

# Sample data: five rows with an id column A, a grouping column B and a value
# column C. header = TRUE (spelled out, since T can be reassigned) makes the
# first line of the text supply the column names.
df <- read.table(text =
"A B C
1 1 5
2 1 4
3 1 3
4 2 1
5 2 3", header = TRUE)

It works just fine with your revised data

# Revised sample data: one row per document, with its newsgroup and a
# frequency. header = TRUE (spelled out, since T can be reassigned) makes the
# first line of the text supply the column names.
df <- read.table(text =
"docIdx newsgroup_ID freq
1 1 768
2 1 125
3 1 29
4 1 51
5 1 198
6 1 34
7 1 64
8 2 35
9 2 70
10 2 45", header = TRUE)

# For each of sum and length, ave() computes the statistic of freq within each
# newsgroup_ID and repeats it on every row of that group; the two resulting
# columns are stored as sum.freq and length.freq.
df[, c("sum.freq", "length.freq")] <- with(
  df,
  sapply(c(sum, length), function(x) ave(freq, newsgroup_ID, FUN = x))
)
# docIdx newsgroup_ID freq sum.freq length.freq
#1 1 1 768 1269 7
#2 2 1 125 1269 7
#3 3 1 29 1269 7
#4 4 1 51 1269 7
#5 5 1 198 1269 7
#6 6 1 34 1269 7
#7 7 1 64 1269 7
#8 8 2 35 150 3
#9 9 2 70 150 3
#10 10 2 45 150 3

Here ave(freq, newsgroup_ID, FUN = x) applies function x to freq by newsgroup_ID.



Related Topics



Leave a reply



Submit