Sum by Distinct Column Value in R

Sum by distinct column value in R

I think the neatest way to do this is in dplyr

library(dplyr)
# Sum every non-grouping column within each shop_id/shop_name/city group.
# across(everything(), sum) is the current replacement for the superseded
# summarise_all(sum); grouping columns are not part of everything() here.
shop %>%
  group_by(shop_id, shop_name, city) %>%
  summarise(across(everything(), sum))

How to extract and sum distinct values from a column and create a column with the sum

We group by 'TYPE', take the unique 'SIZE' values within each group, and return the sum of those values in summarise

library(dplyr)
# For each TYPE, add up every distinct SIZE value exactly once; NA values
# are dropped from the sum.
summarise(
  group_by(df1, TYPE),
  Sum = sum(unique(SIZE), na.rm = TRUE)
)

-output

# A tibble: 1 x 2
  TYPE        Sum
  <chr>     <dbl>
1 A     68409188.

data

# Example input: ten rows, all of TYPE "A", whose SIZE column repeats values
# (only three distinct sizes occur).
df1 <- data.frame(
  TYPE = rep("A", 10L),
  SIZE = c(
    24522145.17, 35359867.65, 35359867.65, 35359867.65, 35359867.65,
    35359867.65, 24522145.17, 35359867.65, 35359867.65, 8527174.786
  ),
  stringsAsFactors = FALSE
)

Sum of unique values in a column

One way using dplyr is to keep, within each Year, only the rows whose dist is at most 1100 and whose key has not already appeared, and then sum the RR and dist columns.

library(dplyr)

# Within each Year, keep rows with dist of at most 1100 whose key has not
# appeared earlier in that Year, then total RR and dist over the kept rows.
# Both conditions are evaluated on the whole group, so a key counts as
# already seen even when its earlier row had dist above 1100.
df %>%
  group_by(Year) %>%
  filter(dist <= 1100, !duplicated(key)) %>%
  summarise(
    RR = sum(RR),
    dist = sum(dist)
  )

To count distinct values, we can use n_distinct

# Count how many different keys occur in each Year among the rows whose
# dist is at most 1100. Years left with no rows after filtering are dropped.
df %>%
  group_by(Year) %>%
  filter(dist <= 1100) %>%
  summarise(n = n_distinct(key))

Sum by distinct column value in R, ignoring duplicate values

We can wrap with unique before getting the sum

library(dplyr)
# Add two per-shop totals to every row: the sale goal counts each distinct
# city_sale_goal once, while profit is summed over all rows of the shop.
# The grouping is removed again at the end.
other_shop %>%
  group_by(shop_id, shop_name) %>%
  mutate(
    shop_total_sale_goal = sum(unique(city_sale_goal), na.rm = TRUE),
    shop_profit = sum(profit, na.rm = TRUE)
  ) %>%
  ungroup()

-output

# A tibble: 6 x 7
#  shop_id shop_name city    city_sale_goal profit shop_total_sale_goal shop_profit
#    <dbl> <chr>     <chr>            <dbl>  <dbl>                <dbl>       <dbl>
#1       1 Shop A    London              12      3                   21           7
#2       1 Shop A    London              12      1                   21           7
#3       1 Shop A    Paris                9      3                   21           7
#4       2 Shop B    Cardiff             15      6                   15           6
#5       3 Shop C    Dublin              10      5                   10          14
#6       3 Shop C    Dublin              10      9                   10          14

Rowsums in R by unique values in multiple columns

You may get the data in long format and then sum the Games for each Player.

library(dplyr)
library(tidyr)

# Stack the White and Black columns into a single Player column, so each
# original row contributes its Games value to both players, then total
# Games for each player.
df %>%
  pivot_longer(
    cols = c("White", "Black"),
    values_to = "Player"
  ) %>%
  group_by(Player) %>%
  summarise(Games_total = sum(Games))

# Player   Games_total
#  <chr>          <int>
#1 Anand             23
#2 Carlsen           19
#3 Caruana           27
#4 Giri              24
#5 Grischuk           7
#6 Naka              12

data

# Example input: five pairings with the player on each side and an integer
# Games count per pairing.
df <- data.frame(
  White = c("Anand", "Carlsen", "Caruana", "Giri", "Grischuk"),
  Black = c("Caruana", "Naka", "Giri", "Anand", "Carlsen"),
  Games = c(13L, 12L, 14L, 10L, 7L),
  stringsAsFactors = FALSE
)

How to sum project values in column based on unique project ID?

One option is to use group_by %>% do; inside do, you can use distinct to get the unique values per project and sum the result:

# For each (proj_manager, stage) group, sum `value` once per distinct
# (proj_ID, value) pair and attach that total to every row of the group.
# Inside do(), `.` refers to the rows of the current group.
# Note: do() is superseded in current dplyr releases.
df_test %>%
  group_by(proj_manager, stage) %>%
  do(mutate(., sum_value = sum(distinct(., proj_ID, value)$value)))

#Source: local data frame [7 x 5]
#Groups: proj_manager, stage [3]

#  proj_manager proj_ID  stage value sum_value
#        <fctr>   <dbl> <fctr> <dbl>     <dbl>
#1        Alice       3      A    70        70
#2        Alice       4      C     5         5
#3         Emma       1      B    15        35
#4         Emma       1      B    15        35
#5         Emma       2      B    20        35
#6         Emma       2      B    20        35
#7         Emma       2      B    20        35

How to calculate the sum of distinct observations in R dplyr

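We may use replace with unique, so that the sum of the distinct values is written on the first row of each id and the remaining rows are NA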

library(dplyr)
library(tidyr)
# Reshape the two count columns to long form, then, for each id, write the
# sum of the distinct counts on the first row of that id and NA elsewhere.
df %>%
  pivot_longer(
    counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  # Group by id only: group_by() replaces any earlier grouping, so an extra
  # group_by(id, col1, col2) placed before this call has no effect.
  group_by(id) %>%
  mutate(
    sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))
  ) %>%
  ungroup()

-output

# A tibble: 12 × 6
   col1    col2       id strings     value sum_distinct
   <chr>   <chr>   <dbl> <chr>       <dbl>        <dbl>
 1 apple   pple        1 counts_col1   100          152
 2 apple   pple        1 counts_col2     2           NA
 3 apple   app         1 counts_col1   100           NA
 4 apple   app         1 counts_col2    50           NA
 5 pple    app         1 counts_col1     2           NA
 6 pple    app         1 counts_col2    50           NA
 7 banana  bananna     2 counts_col1   200          222
 8 banana  bananna     2 counts_col2     2           NA
 9 banana  banan       2 counts_col1   200           NA
10 banana  banan       2 counts_col2    20           NA
11 bananna banan       2 counts_col1     2           NA
12 bananna banan       2 counts_col2    20           NA

Summing up values in one column based on unique values in another column

In base R you could use ave

# Add per-group columns to df: D is the sum of C within each value of B and
# E is the number of rows in each value of B. lapply() returns one vector
# per function, so the result does not depend on sapply()'s input-dependent
# simplification to a matrix.
df[, c("D", "E")] <- with(
  df,
  lapply(list(sum, length), function(x) ave(C, B, FUN = x))
)
df
#  A B C  D E
#1 1 1 5 12 3
#2 2 1 4 12 3
#3 3 1 3 12 3
#4 4 2 1  4 2
#5 5 2 3  4 2

Or using dplyr

library(dplyr)
# Attach the per-B total of C (column D) and the per-B row count (column E)
# to every row. The result is left grouped by B.
df <- df %>%
  group_by(B) %>%
  mutate(
    D = sum(C),
    E = n()
  )
df
## A tibble: 5 x 5
## Groups:   B [2]
#      A     B     C     D     E
#  <int> <int> <int> <int> <int>
#1     1     1     5    12     3
#2     2     1     4    12     3
#3     3     1     3    12     3
#4     4     2     1     4     2
#5     5     2     3     4     2

Sample data

# Sample data: five rows with columns A, B and C, parsed from inline text.
# header = TRUE is spelled out because the shorthand T is an ordinary
# variable that can be reassigned.
df <- read.table(text =
    "A B C
1 1 5
2 1 4
3 1 3
4 2 1
5 2 3", header = TRUE)

It works just fine with your revised data

# Revised sample data: ten rows with columns docIdx, newsgroup_ID and freq,
# parsed from inline text. header = TRUE is spelled out because the
# shorthand T is an ordinary variable that can be reassigned.
df <- read.table(text =
    "docIdx newsgroup_ID  freq
       1            1   768
       2            1   125
       3            1    29
       4            1    51
       5            1   198
       6            1    34
       7            1    64
       8            2    35
       9            2    70
       10           2    45", header = TRUE)

# Add the per-newsgroup sum and row count of freq to every row. lapply()
# yields one vector per function, avoiding sapply()'s input-dependent
# simplification to a matrix.
df[, c("sum.freq", "length.freq")] <- with(
  df,
  lapply(list(sum, length), function(x) ave(freq, newsgroup_ID, FUN = x))
)
#   docIdx newsgroup_ID freq sum.freq length.freq
#1       1            1  768     1269           7
#2       2            1  125     1269           7
#3       3            1   29     1269           7
#4       4            1   51     1269           7
#5       5            1  198     1269           7
#6       6            1   34     1269           7
#7       7            1   64     1269           7
#8       8            2   35      150           3
#9       9            2   70      150           3
#10     10            2   45      150           3

Here ave(freq, newsgroup_ID, FUN = x) applies function x to freq by newsgroup_ID.

Sum by Distinct Column Value in R

Sum by distinct column value in R

How to extract and sum distinct values from a column and create a column with the sum

data

Sum of unique values in a column

Sum by distinct column value in R, ignoring duplicate values

Rowsums in R by unique values in multiple columns

How to sum project values in column based on unique project ID?

How to calculate the sum of distinct observations in R dplyr

Summing up values in one column based on unique values in another column

Sample data

Related Topics

Leave a reply