Calculate Percentage for Each Time Series Observations Per Group in R

Calculate Percentage for each time series observations per Group in R
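The question's df isn't reproduced here, but a hypothetical version consistent with the printed result below would be:

df <- data.frame(
  views = c(1578, 616, 575, 1744, 541, 660, 2906, 629, 643),
  date = rep(c("2015-01-01", "2015-01-02", "2015-01-03"), each = 3),
  article = rep(c("A", "B", "C"), times = 3)
)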

library(dplyr)
df %>% group_by(date) %>% mutate(percentage = views / sum(views))
Source: local data frame [9 x 4]
Groups: date

  views       date article percentage
1  1578 2015-01-01       A  0.5698808
2   616 2015-01-01       B  0.2224630
3   575 2015-01-01       C  0.2076562
4  1744 2015-01-02       A  0.5921902
5   541 2015-01-02       B  0.1837012
6   660 2015-01-02       C  0.2241087
7  2906 2015-01-03       A  0.6955481
8   629 2015-01-03       B  0.1505505
9   643 2015-01-03       C  0.1539014

Or, if multiple identical articles are possible per day:

df %>%
  group_by(date) %>%
  mutate(sum = sum(views)) %>%
  group_by(date, article) %>%
  mutate(percentage = views / sum) %>%
  select(-sum)

Finding percentage in a sub-group using group_by and summarise

Try

library(dplyr)
data %>%
  group_by(month) %>%
  mutate(countT = sum(count)) %>%
  group_by(type, .add = TRUE) %>%   # .add replaces the deprecated add argument in dplyr >= 1.0
  mutate(per = paste0(round(100 * count / countT, 2), '%'))

Or make it simpler without creating additional columns

data %>%
  group_by(month) %>%
  mutate(per = 100 * count / sum(count)) %>%
  ungroup()

We could also use left_join after summarising the sum(count) by 'month', as sketched below.
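A minimal sketch of that approach, assuming the same data with month and count columns:

data %>%
  left_join(data %>%
              group_by(month) %>%
              summarise(countT = sum(count)),
            by = 'month') %>%
  mutate(per = paste0(round(100 * count / countT, 2), '%'))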

Or an option using data.table.

library(data.table)
setkey(setDT(data), month)[data[, list(count = sum(count)), month],
                           per := paste0(round(100 * count / i.count, 2), '%')][]
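Equivalently, a single grouped update avoids the self-join (a sketch, again assuming month and count columns):

library(data.table)
setDT(data)[, per := paste0(round(100 * count / sum(count), 2), '%'), by = month][]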

Summarizing by subgroup percentage in R

Per your comment, if the subgroups are unique you can do

library(dplyr)
group_by(df, group) %>% mutate(percent = value/sum(value))
#   group subgroup value   percent
# 1     A        a     1 0.1250000
# 2     A        b     4 0.5000000
# 3     A        c     2 0.2500000
# 4     A        d     1 0.1250000
# 5     B        a     1 0.1666667
# 6     B        b     2 0.3333333
# 7     B        c     3 0.5000000

Or to remove the value column and add the percent column at the same time, use transmute

group_by(df, group) %>% transmute(subgroup, percent = value/sum(value))
#   group subgroup   percent
# 1     A        a 0.1250000
# 2     A        b 0.5000000
# 3     A        c 0.2500000
# 4     A        d 0.1250000
# 5     B        a 0.1666667
# 6     B        b 0.3333333
# 7     B        c 0.5000000

Using dplyr function to calculate percentage within groups

library(dplyr)

df %>%
  # line below freezes the order of type_n if it is not an ordered factor already
  mutate(type_n = forcats::fct_inorder(type_n)) %>%
  group_by(type_n) %>%
  summarize(n = n(), total = sum(population)) %>%
  mutate(new_col = (n / total) %>% scales::percent(decimal.mark = ",", suffix = ""))

# A tibble: 3 x 4
  type_n     n total new_col
  <fct>  <int> <int> <chr>
1 small      2     7 28,6
2 medium     2    14 14,3
3 large      3    15 20,0
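For a self-contained run, a hypothetical df consistent with this output (any values with the same group sizes and population totals would do):

df <- data.frame(
  type_n = rep(c("small", "medium", "large"), times = c(2, 2, 3)),
  population = c(3, 4, 6, 8, 4, 5, 6)  # group totals: 7, 14 and 15
)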

R - Count observations (strings) in a column and calculate as a percentage of overall observations

Do these steps in the tidyverse:

  • set the working directory to the folder where your 300+ csv files are stored
  • read all 300+ csv file names into temp
  • this assumes each csv file name refers to your date; otherwise you will have to tweak the code a little
  • with map and imap_dfr, as shown below, the same code runs once per file and the result is a single dataframe
setwd('my/path/here')

temp <- list.files(pattern = '\\.csv$')

library(tidyverse)

map(temp, read.csv) %>%
  setNames(gsub('\\.csv$', '', temp)) %>%
  imap_dfr(~ .x %>%
             group_by(item) %>%
             summarise(Percentage = n() / nrow(.x) * 100,   # nrow(.x), not nrow(df): .x is the current file
                       Average_Value = mean(Value),
                       .groups = 'drop') %>%
             mutate(Date = .y))

If instead all your csvs contain a date column, do this

map_dfr(temp, ~ read.csv(.x) %>%
          mutate(total = n()) %>%   # total rows in this file, computed before grouping
          group_by(item, date) %>%
          summarise(Percentage = n() / first(total) * 100,
                    Average_Value = mean(Value),
                    .groups = 'drop'))
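Note that in purrr >= 1.0.0 map_dfr() is superseded; the same row-binding can be written with map() plus list_rbind():

map(temp, ~ read.csv(.x) %>%
      mutate(total = n()) %>%
      group_by(item, date) %>%
      summarise(Percentage = n() / first(total) * 100,
                Average_Value = mean(Value),
                .groups = 'drop')) %>%
  list_rbind()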

How to calculate percentage of missing data in a time series in R dplyr

I assume from the sequence generation in the question's code that the expected observations are one per day between the first and last observed date per ID. Here's a clunky piece-by-piece calculation to count the percentage of missing data.

1. Make a data frame of all expected dates for each ID

library(dplyr)
# df as in the question, but coerce the Date column
df$Date <- as.Date(df$Date)

# Data frame with date ranges per ID
ranges_df <- df %>%
  group_by(ID) %>%
  summarize(dmin = min(Date), dmax = max(Date))

# Data frame with an ID and date for every day expected
alldays <- ranges_df %>%
  group_by(ID) %>%
  do(., data.frame(
    Date = seq(.$dmin, .$dmax, by = '1 day')
  ))

2. JOIN the expected dates table with the observed dates table.

imputed_df <- left_join(alldays, df, by = c("ID", "Date"))

3. Count NAs

imputed_df %>%
  group_by(ID) %>%
  summarize(total = n(),
            missing = sum(is.na(val)),
            percent_missing = missing / total * 100)

result:

# A tibble: 4 x 4
      ID total missing percent_missing
  <fctr> <int>   <int>           <dbl>
1     xx     8       2        25.00000
2    xyz     4       4       100.00000
3     yy    62      57        91.93548
4     zz  4380    4371        99.79452

Assuming that NAs in the original data should be counted as missing data, this will do so.
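As an aside, tidyr::complete() can replace steps 1 and 2 in one call; a sketch assuming the same df:

library(tidyr)

imputed_df <- df %>%
  group_by(ID) %>%
  complete(Date = seq(min(Date), max(Date), by = '1 day'))

The NA count in step 3 then applies unchanged.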

R: Calculate percentage of observations in a column that are below a certain value for panel data

Instead of count, which requires a data.frame/tibble, use sum on a logical vector to get the count: TRUE values are counted as 1 and FALSE as 0.

library(dplyr)
df %>%
  group_by(Product) %>%
  dplyr::summarise(CountDate = n(),
                   SmallSize = sum(Size < 1000000, na.rm = TRUE),
                   .groups = "drop") %>%
  dplyr::mutate(Percent = SmallSize / CountDate)
# A tibble: 3 × 4
  Product CountDate SmallSize Percent
  <chr>       <int>     <int>   <dbl>
1 A               6         2   0.333
2 B               6         3   0.5
3 C               6         1   0.167

Also, we don't need to create both columns; the percentage can be calculated directly with mean

df %>%
  group_by(Product) %>%
  dplyr::summarise(Percent = mean(Size < 1000000, na.rm = TRUE))
# A tibble: 3 × 2
  Product Percent
  <chr>     <dbl>
1 A         0.333
2 B         0.5
3 C         0.167
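The same one-liner in data.table, for comparison:

library(data.table)
setDT(df)[, .(Percent = mean(Size < 1000000, na.rm = TRUE)), by = Product]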

R: Average years in time series per group

To do what you want, you need an additional variable that groups the years together. I used cut() to do that.

library(dplyr)
# Define the cut breaks and labels for each group.
# Each break marks the start of a group; with right = FALSE, cut() builds
# the left-closed intervals [2000, 2004), [2004, 2008), ..., which is the
# grouping wanted here.
year_group_break <- c(2000, 2004, 2008, 2012, 2016, 2020)
year_group_labels <- c("2000-2003", "2004-2007", "2008-2011", "2012-2015", "2016-2019")

data %>%
  # create the year group variable
  mutate(year_group = cut(Year, breaks = year_group_break,
                          labels = year_group_labels,
                          include.lowest = TRUE, right = FALSE)) %>%
  # calculate the total value for each Reporter + Partner in each year group
  group_by(year_group, ReporterName, PartnerName) %>%
  summarize(`TradeValue in 1000 USD` = sum(`TradeValue in 1000 USD`),
            .groups = "drop") %>%
  # calculate the percentage value for each Partner within a Reporter/year group
  group_by(year_group, ReporterName) %>%
  mutate(Percentage = `TradeValue in 1000 USD` / sum(`TradeValue in 1000 USD`)) %>%
  ungroup()

Sample output

   year_group ReporterName PartnerName          `TradeValue in 1000 USD` Percentage
   <fct>      <chr>        <chr>                                   <dbl>      <dbl>
 1 2016-2019  Angola       Canada                               647164.     0.0161
 2 2016-2019  Angola       China                              24517058.     0.609
 3 2016-2019  Angola       Congo, Rep.                          299119.     0.00744
 4 2016-2019  Angola       France                               734551.     0.0183
 5 2016-2019  Angola       India                               3768940.     0.0937
 6 2016-2019  Angola       Indonesia                            575477.     0.0143
 7 2016-2019  Angola       Israel                               452453.     0.0112
 8 2016-2019  Angola       Italy                                468915.     0.0117
 9 2016-2019  Angola       Japan                                264672.     0.00658
10 2016-2019  Angola       Namibia                              327922.     0.00815
11 2016-2019  Angola       Portugal                            1074137.     0.0267
12 2016-2019  Angola       Singapore                            513983.     0.0128
13 2016-2019  Angola       South Africa                        1161852.     0.0289
14 2016-2019  Angola       Spain                               1250555.     0.0311
15 2016-2019  Angola       Thailand                             649626.     0.0161
16 2016-2019  Angola       United Arab Emirates                 884725.     0.0220
17 2016-2019  Angola       United Kingdom                       425617.     0.0106
18 2016-2019  Angola       United States                       1470133.     0.0365
19 2016-2019  Angola       Unspecified                          423009.     0.0105
20 2016-2019  Angola       Uruguay                              320586.     0.00797
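As a quick check of how right = FALSE assigns the boundary years (a year that starts a group belongs to that group, so 2004 rolls over to "2004-2007"):

cut(c(2000, 2003, 2004, 2019), breaks = year_group_break,
    labels = year_group_labels, include.lowest = TRUE, right = FALSE)
# [1] 2000-2003 2000-2003 2004-2007 2016-2019
# Levels: 2000-2003 2004-2007 2008-2011 2012-2015 2016-2019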

