Plotting Cumulative Counts in Ggplot2

Plotting cumulative counts in ggplot2

This does not directly solve the problem with the grouping of lines, but it works as a workaround.

You can add three calls to stat_bin() where you subset your data according to A levels.

# Workaround: one stat_bin() layer per level of A, so every cumulative sum is
# computed on the bin counts of that level only. `x` is the data frame with
# the columns X and A.
# after_stat(cumsum(count)) replaces the deprecated `cumsum(..count..)`
# notation (needs ggplot2 >= 3.3.0).
ggplot(x, aes(x = X, color = A)) +
  stat_bin(data = subset(x, A == "a"), aes(y = after_stat(cumsum(count))), geom = "step") +
  stat_bin(data = subset(x, A == "b"), aes(y = after_stat(cumsum(count))), geom = "step") +
  stat_bin(data = subset(x, A == "c"), aes(y = after_stat(cumsum(count))), geom = "step")

Sample Image

UPDATE - solution using geom_step()

Another possibility is to multiply the values of ..y.. by the number of observations in each level. At the moment, the only way I have found to get this number of observations is to precalculate it before plotting and add it to the original data frame. I named this column len. Then, inside aes() in geom_step(), you map this variable with len=len and define the y values as y=..y.. * len.

library(ggplot2)
library(plyr)

# Reproducible sample data: 200 rows with a randomly drawn level A and a
# standard normal X.
set.seed(123)
x <- data.frame(A = replicate(200, sample(c("a", "b", "c"), 1)), X = rnorm(200))

# Add len = number of observations in each level of A.
df <- ddply(x, .(A), transform, len = length(X))

# The ecdf stat returns a proportion in y; multiplying it by the per-level
# number of observations turns it into a cumulative count.
# after_stat(y * len) is the current spelling of the deprecated `..y.. * len`.
# len is mapped as an aesthetic so that it is still available at that stage
# (recent ggplot2 versions may warn that `len` is an unknown aesthetic).
ggplot(df, aes(x = X, color = A)) +
  geom_step(aes(len = len, y = after_stat(y * len)), stat = "ecdf")

Sample Image

Time-series/cumulative data plots using ggplot2

Does this do what you want?

library(ggplot2)
library(dplyr)
library(tidyr)
library(lubridate)

# Parse the dates, accumulate the Valid and Invalid columns in date order, and
# reshape to long form so that each series is drawn as its own coloured line.
holocron %>%
  mutate(Date = dmy(Date)) %>%
  arrange(Date) %>% # cumsum() below relies on chronological order
  mutate(Valid_Cumulative = cumsum(Valid)) %>%
  mutate(Invalid_Cumulative = cumsum(Invalid)) %>%
  pivot_longer(cols = c(Registers, Valid_Cumulative, Invalid_Cumulative)) %>%
  ggplot(mapping = aes(x = Date, y = value, colour = name)) +
  geom_line()

Sample Image

cumulative plot using ggplot2

Try this:

# Cumulative sum of val against the row position (x = 1:5 assumes that df has
# five rows).
ggplot(data = df, mapping = aes(x = 1:5, y = cumsum(val))) +
  geom_line() +
  geom_point()

Sample Image

Just remove geom_point() if you don't want it.

Edit: Since you need the x-axis labels to be the dates, you can plot with x=1:5 and then relabel the x axis with the dates from the data.frame through the x scale. Taking df:

# Same cumulative plot, with the row positions relabelled as dates.
# x = 1:5 is numeric, so the labels are set on a continuous scale with one
# break per row; a discrete scale (scale_x_discrete) has no breaks for numeric
# data and leaves the axis unlabelled in current ggplot2.
ggplot(data = df, aes(x = 1:5, y = cumsum(val))) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = 1:5, labels = as.character(df$date)) +
  xlab("Date")

Sample Image

Since you say you'll have more than 1 val for "date", you can aggregate them first using plyr, for example.

# library() stops with an error when plyr is not installed; require() would
# only return FALSE and let the ddply() call below fail instead.
library(plyr)
# Collapse multiple rows per date into a single row holding the summed val.
dd <- ddply(df, .(date), summarise, val = sum(val))

Then you can proceed with the same command by replacing x = 1:5 with x = seq_len(nrow(dd)).

Cumulative stacked area plot for counts in ggplot with R

For each organization, you'll want to make sure you have at least one value for counts for the minimum and maximum years. This is so that ggplot2 will fill in the gaps. Also, you'll want to be careful with cumulating sums. So the solution I've shown below adds in a zero count if no value exists for the earliest and last year.

I've added some code so that you can automate the adding of rows for organizations that don't have data for all years of your data.
To incorporate this automated code, you'll want to merge in the complete_dat data frame and change the variable dat to suit your own data.

library(ggplot2)
library(dplyr)
library(tidyr)

# Create sample data
# Column-wise definition of the sample data; not every organization has a row
# for every year.
dat <- tibble(
  organization = c("a", "a", "b", "c", "c", "a", "b"),
  year = c(1990, 1991, 1991, 1992, 1993, 1994, 1995),
  count = c(1, 1, 1, 1, 0, 1, 1)
)
dat
#> # A tibble: 7 x 3
#> organization year count
#> <chr> <dbl> <dbl>
#> 1 a 1990 1
#> 2 a 1991 1
#> 3 b 1991 1
#> 4 c 1992 1
#> 5 c 1993 0
#> 6 a 1994 1
#> 7 b 1995 1

# NOTE incorrect results for comparison
# Sum the counts per organization and year, then plot them directly.
# cumsum() is evaluated inside aes(), so it runs over the whole total column
# rather than within each organization, and organization/year combinations
# without a row are not filled in.
dat %>%
group_by(organization, year) %>%
summarise(total = sum(count)) %>%
ggplot(aes(x = year, y = cumsum(total), fill = organization)) +
geom_area()
#> `summarise()` regrouping output by 'organization' (override with `.groups` argument)


# Fill out all years and organization combinations: expand() returns one row
# for every organization found in dat crossed with each year in 1990:1995.
complete_dat <- tidyr::expand(dat, organization, year = 1990:1995)
complete_dat
#> # A tibble: 18 x 2
#> organization year
#> <chr> <int>
#> 1 a 1990
#> 2 a 1991
#> 3 a 1992
#> 4 a 1993
#> 5 a 1994
#> 6 a 1995
#> 7 b 1990
#> 8 b 1991
#> 9 b 1992
#> 10 b 1993
#> 11 b 1994
#> 12 b 1995
#> 13 c 1990
#> 14 c 1991
#> 15 c 1992
#> 16 c 1993
#> 17 c 1994
#> 18 c 1995

# Update data so that counting works and fills in gaps
# Attach the observed counts to the complete organization/year grid, treat
# missing combinations as zero, and accumulate the counts within each
# organization. The result stays grouped by organization.
final_dat <- complete_dat %>%
  left_join(dat, by = c("organization", "year")) %>%
  replace_na(list(count = 0)) %>%
  arrange(organization, year) %>% # rows must be in year order before cumsum()
  group_by(organization) %>%
  mutate(aggcount = cumsum(count))
final_dat
#> # A tibble: 18 x 4
#> # Groups: organization [3]
#> organization year count aggcount
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 1990 1 1
#> 2 a 1991 1 2
#> 3 a 1992 0 2
#> 4 a 1993 0 2
#> 5 a 1994 1 3
#> 6 a 1995 0 3
#> 7 b 1990 0 0
#> 8 b 1991 1 1
#> 9 b 1992 0 1
#> 10 b 1993 0 1
#> 11 b 1994 0 1
#> 12 b 1995 1 2
#> 13 c 1990 0 0
#> 14 c 1991 0 0
#> 15 c 1992 1 1
#> 16 c 1993 0 1
#> 17 c 1994 0 1
#> 18 c 1995 0 1

# Plot results
# Stacked area chart of the cumulative counts, one band per organization.
ggplot(final_dat, aes(x = year, y = aggcount, fill = organization)) +
  geom_area()

Created on 2020-12-10 by the reprex package (v0.3.0)

How to plot total cumulative row count over time ggplot

Order the data by year and ID before plotting and it will go from the first year to the last and within year the smaller ID first.

# Sample data kept as inline text. read.table(text = ...) parses the string
# directly, so no textConnection() is opened and left unclosed.
x <- 'ID    name      year
73 name73 2021
72 name72 2021
71 name71 2019
70 name70 2017
69 name69 2015
68 name68 2015'
df <- read.table(text = x, header = TRUE)

library(ggplot2)

# Row order: ascending year, ties broken by ascending ID.
i <- with(df, order(year, ID))
ggplot(df[i, ], aes(x = year, y = ID)) +
  geom_line()

Created on 2022-07-08 by the reprex package (v2.0.1)


An alternative, though I am not sure it is what the question is asking for, is to aggregate the IDs by year, keeping the maximum in each year.

The code below does this and pipes to the plot directly, without creating an extra data set in the global environment.

# Keep the largest ID of every year and pipe the result straight into the
# plot, so no intermediate data set is created.
aggregate(ID ~ year, data = df, FUN = max) |>
  ggplot(mapping = aes(x = year, y = ID)) +
  geom_line()

Created on 2022-07-08 by the reprex package (v2.0.1)

How do I plot a running cumulative total from individual records in R?

Using dplyr (because you tagged the question with it) you can do what you want. The main things that need to happen are:

  1. Break out your entries and exits making your population positive and negative.
  2. Get all the dates from your earliest to your last so you can have the desired blocky lines. It is probably possible to do this without every date, but this is easy and requires less thinking.

Code is below

library(dplyr)
library(ggplot2)

# Sample records: a population size with the dates on which it enters and
# leaves the program (two-digit year, month, day).
example.dat <- data.frame(
  Population = c(1000, 2000, 3000),
  Enter.Program = c("15-10-01", "16-05-01", "16-07-01"),
  Leave.Program = c("16-06-01", "16-10-01", "17-08-01")
)

# One row per change: entries add the population, exits subtract it.
changes <- example.dat %>%
  select("Population", "Date" = "Enter.Program") %>%
  bind_rows(
    example.dat %>%
      select("Population", "Date" = "Leave.Program") %>%
      mutate(Population = -1 * Population)
  ) %>%
  mutate(Date = as.Date(Date, "%y-%m-%d"))

# Build every calendar day from the first to the last change, so that the
# running total stays flat between changes and the line is drawn as steps.
startDate <- min(changes$Date)
endDate <- max(changes$Date)
# tibble() replaces data_frame(), which is deprecated.
final <- tibble(Date = seq(startDate, endDate, 1)) %>%
  left_join(changes, by = "Date") %>%
  # Days without a change contribute 0 to the running total.
  mutate(Population = cumsum(ifelse(is.na(Population), 0, Population)))

ggplot(data = final, aes(x = Date, y = Population)) +
  geom_line()

UPDATE

If you don't want to have every date from the earliest to the latest, you can use a blurgh for loop to add the needed rows to get a pretty result. Here we walk through and duplicate each date after the first with the preceding cumulative sum. It's not pretty, but it makes the graph.

library(dplyr)
library(ggplot2)

# Sample records: a population size with the dates on which it enters and
# leaves the program (two-digit year, month, day).
example.dat <- data.frame(
  Population = c(1000, 2000, 3000),
  Enter.Program = c("15-10-01", "16-05-01", "16-07-01"),
  Leave.Program = c("16-06-01", "16-10-01", "17-08-01")
)

# One row per change (entries positive, exits negative), sorted by date and
# turned into a running total.
changes <- example.dat %>%
  select("Population", "Date" = "Enter.Program") %>%
  bind_rows(
    example.dat %>%
      select("Population", "Date" = "Leave.Program") %>%
      mutate(Population = -1 * Population)
  ) %>%
  mutate(Date = as.Date(Date, "%y-%m-%d")) %>%
  arrange(Date) %>%
  mutate(Population = cumsum(Population))

# Before every row after the first, insert a copy of its date carrying the
# previous running total, so the line is drawn as steps. Walking from the last
# row to the second keeps the indices of the rows still to be processed valid.
# rev(seq_len(n)[-1]) is empty when there are fewer than two rows, whereas
# nrow(changes):2 would count upwards in that case.
for (i in rev(seq_len(nrow(changes))[-1])) {
  changes <- bind_rows(
    changes[1:(i - 1), ],
    # tibble() replaces data_frame(), which is deprecated.
    tibble(Population = changes$Population[i - 1], Date = changes$Date[i]),
    changes[i:nrow(changes), ]
  )
}

ggplot(data = changes, aes(x = Date, y = Population)) +
  geom_line()

Cumulative plot in ggplot2

The problem is that when you use cumsum() in the aesthetic, it applies over all values, not just the values within a particular year.

Rather than doing the transformation with ggplot, it would be safer to do the transformation with dplyr first, then plot the results. For example

# Accumulate value within each year before plotting; cumsum() inside aes()
# would run across all years at once.
# linewidth replaces size for line geoms (size is deprecated there since
# ggplot2 3.4.0).
ggplot(
  dat %>% group_by(year) %>% mutate(cv = cumsum(value)),
  aes(x = id.2wk, y = cv, colour = factor(year))
) +
  geom_line(linewidth = 1) +
  geom_point()

Sample Image

Creating 2 y axes in ggplot with count and cumulative count

Try this. With your dummy data you can create the variables for cases and cumulative counts. After computing the scaling factor, you can reshape to long format and sketch the plot with the desired structure. Here is the code, which uses tidyverse functions on the dummy data frame:

library(tidyverse)
#Code
# Number of rows per symptomdate and the running total of those counts.
newdf <- dummy %>%
  group_by(symptomdate) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  mutate(Cum = cumsum(Count))

# Scaling factor: the cumulative series is divided by the largest count so
# that both series are drawn against the primary axis; the secondary axis
# multiplies by the same factor to show the real cumulative values.
sf <- max(newdf[["Count"]])
newdf[["Cum"]] <- newdf[["Cum"]] / sf

# Reshape to long form and draw both series as dodged bars.
newdf %>%
  pivot_longer(cols = -symptomdate) %>%
  ggplot(mapping = aes(x = symptomdate)) +
  geom_bar(
    mapping = aes(y = value, fill = name, group = name),
    stat = "identity",
    position = position_dodge(),
    colour = "black",
    alpha = .6
  ) +
  scale_fill_manual(values = c("blue", "red")) +
  scale_y_continuous(
    name = "Cases",
    sec.axis = sec_axis(~ . * sf, name = "Cum Cases")
  ) +
  labs(fill = "Variable") +
  theme_bw()

Output:

Sample Image

Plotting counts and cumulative numbers in one plot

Something like this?

library(tidyverse)

# Sample data (appears to be dput() output): 20 rows with a binned group_size
# factor, an amount, group_sum (the summed amount of the row's group_size
# level) and count (the number of rows in that level).
dat <- structure(list(group_size = structure(c(
6L, 3L, 3L, 4L, 1L, 2L,
2L, 1L, 3L, 6L, 2L, 6L, 2L, 2L, 1L, 1L, 4L, 1L, 3L, 2L
), .Label = c(
"(0,50]",
"(50,100]", "(100,150]", "(150,200]", "(200,250]", "(250,3e+03]"
), class = "factor"), amount = c(
409, 101, 103, 198, 40, 63,
69, 49, 126, 304, 91, 401, 96, 63, 36, 1, 177, 7, 112, 61
), group_sum = c(
1114,
442, 442, 375, 133, 443, 443, 133, 442, 1114, 443, 1114, 443,
443, 133, 133, 375, 133, 442, 443
), count = c(
3L, 4L, 4L, 2L,
5L, 6L, 6L, 5L, 4L, 3L, 6L, 3L, 6L, 6L, 5L, 5L, 2L, 5L, 4L, 6L
)), row.names = c(NA, -20L), class = c("data.table", "data.frame"))

# Outlined bars for the group sums (red, primary axis) and for the counts
# (blue, multiplied by 10 so they are visible on the same axis); the secondary
# axis divides by 10 again to show the real counts.
dat %>%
  as_tibble() %>%
  ggplot(aes(x = group_size)) +
  geom_col(aes(y = group_sum), position = "identity", color = "red", fill = "transparent") +
  geom_label(
    # One label per group_size level instead of one per row.
    data = dat %>% distinct(group_size, .keep_all = TRUE),
    mapping = aes(y = group_sum, label = group_sum),
    color = "red"
  ) +
  geom_col(aes(y = count * 10), position = "identity", color = "blue", fill = "transparent") +
  geom_label(
    # One label per distinct count value.
    data = dat %>% distinct(count, .keep_all = TRUE),
    mapping = aes(y = count * 10, label = count),
    color = "blue"
  ) +
  # The transformation is passed positionally: the argument was renamed from
  # `trans` to `transform` in ggplot2 3.5.0, and the positional form works
  # with both old and new versions.
  scale_y_continuous(sec.axis = sec_axis(~ . / 10, name = "Count"))

Created on 2022-02-22 by the reprex package (v2.0.0)



Related Topics



Leave a reply



Submit