Cumulative Sum of Values by Month, Filling in for Missing Months

Cumulative sum of values by month, filling in for missing months

This is very similar to other questions, but the best query is still tricky.

Basic query to get the running sum quickly:

SELECT to_char(date_trunc('month', date_added), 'Mon YYYY') AS mon_text
, sum(sum(qty)) OVER (ORDER BY date_trunc('month', date_added)) AS running_sum
FROM tbl
GROUP BY date_trunc('month', date_added)
ORDER BY date_trunc('month', date_added);

The tricky part is to fill in for missing months:

WITH cte AS (
SELECT date_trunc('month', date_added) AS mon, sum(qty) AS mon_sum
FROM tbl
GROUP BY 1
)
SELECT to_char(mon, 'Mon YYYY') AS mon_text
, sum(c.mon_sum) OVER (ORDER BY mon) AS running_sum
FROM (SELECT min(mon) AS min_mon FROM cte) init
, generate_series(init.min_mon, now(), interval '1 month') mon
LEFT JOIN cte c USING (mon)
ORDER BY mon;

The implicit CROSS JOIN LATERAL requires Postgres 9.3+. This starts with the first month in the table.

To start with a given month:

WITH cte AS (
SELECT date_trunc('month', date_added) AS mon, sum(qty) AS mon_sum
FROM tbl
GROUP BY 1
)
SELECT to_char(mon, 'Mon YYYY') AS mon_text
, COALESCE(sum(c.mon_sum) OVER (ORDER BY mon), 0) AS running_sum
FROM generate_series('2015-01-01'::date, now(), interval '1 month') mon
LEFT JOIN cte c USING (mon)
ORDER BY mon;

db<>fiddle here

Old sqlfiddle

Keeping months from different years apart. You did not ask for that, but you'll most likely want it.

Note that the "month" to some degree depends on the time zone setting of the current session! Details:

  • Ignoring time zones altogether in Rails and PostgreSQL

Related:

  • Calculating Cumulative Sum in PostgreSQL
  • PostgreSQL: running count of rows for a query 'by minute'
  • Postgres window function and group by exception

cumsum() by month but repeat the values if there is no data in that month

Use:

#create month period column  for correct ordering
df['months'] = df['date'].dt.to_period('m')
#aggregate month
df1 = df.groupby(['months', 'col1'])['col2'].sum()

#MultiIndex with all possible combinations
mux = pd.MultiIndex.from_product([pd.period_range(df['months'].min(),
df['months'].max(), freq='M'),
df['col1'].unique()], names=df1.index.names)

#add missing values with reindex reshape, cumulative sum
#forward fill missing values and reshape back
df2 = (df1.reindex(mux)
.unstack()
.cumsum()
.ffill()
.stack()
.astype(int)
.reset_index(name='cumsum')
)
print (df2)
months col1 cumsum
0 2016-01 apple 20
1 2016-02 apple 60
2 2016-02 pear 60
3 2016-03 apple 70
4 2016-03 pear 60
5 2016-04 apple 70
6 2016-04 pear 60
7 2016-05 apple 120
8 2016-05 pear 60
9 2016-06 apple 120
10 2016-06 pear 65

Last if necessary convert datetimes to custom strings:

df2['months'] = df2['months'].dt.strftime('%b-%y')
print (df2)
months col1 cumsum
0 Jan-16 apple 20
1 Feb-16 apple 60
2 Feb-16 pear 60
3 Mar-16 apple 70
4 Mar-16 pear 60
5 Apr-16 apple 70
6 Apr-16 pear 60
7 May-16 apple 120
8 May-16 pear 60
9 Jun-16 apple 120
10 Jun-16 pear 65

Fill Missing Dates for Running Total

You need a calendar table here containing all dates. Consider the following approach:

WITH dates AS (
SELECT '2021-05-01' AS Date UNION ALL
SELECT '2021-05-02' UNION ALL
SELECT '2021-05-03' UNION ALL
SELECT '2021-05-04' UNION ALL
SELECT '2021-05-05'
)

SELECT
u.UserID,
d.Date,
SUM(t.Sale) AS DailySale,
SUM(COALESCE(SUM(t.Sale), 0)) OVER (PARTITION BY u.UserID ORDER BY d.Date) AS RunningSale
FROM (SELECT DISTINCT UserID FROM yourTable) u
CROSS JOIN dates d
LEFT JOIN yourTable t
ON t.UserID = u.UserID AND t.Date = d.Date
GROUP BY
u.UserID,
d.Date
ORDER BY
u.UserID,
d.Date

screen capture from demo link below

Demo

Running sum with missing dates

To solve this problem you need to create separate derived tables of all the PeriodYearMonth values (generated using a recursive CTE) and the distinct Reg and PartNo pairs, and CROSS JOIN them to each other to get all combinations of the columns. This combined table can then be LEFT JOINed to the original table to get the ComponentRemovals for each Reg, PartNo and PeriodYearMonth, and these can then be summed using a window function:

WITH CTE AS (
SELECT CONVERT(DATE, CONCAT(REPLACE(MIN(PeriodYearMonth), ' ', '-'), '-01'), 23) AS date,
CONVERT(DATE, CONCAT(REPLACE(MAX(PeriodYearMonth), ' ', '-'), '-01'), 23) AS max_date
FROM vtRelRepComponentsRemovalsByPartNo
UNION ALL
SELECT DATEADD(MONTH, 1, date), max_date
FROM CTE
WHERE date < max_date
)
SELECT FORMAT(CTE.date, 'yyyy MM') AS PeriodYearMonth
, rp.Reg
, rp.PartNo
, COALESCE(v.ComponentRemovals, 0) AS ComponentRemovals
, SUM(COALESCE(v.ComponentRemovals, 0)) OVER (PARTITION BY rp.Reg, rp.PartNo ORDER BY FORMAT(CTE.date, 'yyyy MM')
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS RunningRemovals
FROM CTE
CROSS JOIN (SELECT DISTINCT Reg, PartNo FROM vtRelRepComponentsRemovalsByPartNo) rp
LEFT JOIN vtRelRepComponentsRemovalsByPartNo v
ON v.PeriodYearMonth = FORMAT(CTE.date, 'yyyy MM')
AND v.Reg = rp.Reg
AND v.PartNo = rp.PartNo
ORDER BY rp.Reg, rp.PartNo, CTE.date

Output

PeriodYearMonth     Reg     PartNo              ComponentRemovals   RunningRemovals
2019 09 G-NHVP 109-0740V01-137 0 0
2019 10 G-NHVP 109-0740V01-137 1 1
2019 11 G-NHVP 109-0740V01-137 1 2
2019 12 G-NHVP 109-0740V01-137 1 3
2020 01 G-NHVP 109-0740V01-137 1 4
2019 09 G-NHVR 11-13354P 2 2
2019 10 G-NHVR 11-13354P 0 2
2019 11 G-NHVR 11-13354P 0 2
2019 12 G-NHVR 11-13354P 0 2
2020 01 G-NHVR 11-13354P 0 2
2019 09 OO-NSF 11-13354P 0 0
2019 10 OO-NSF 11-13354P 1 1
2019 11 OO-NSF 11-13354P 0 1
2019 12 OO-NSF 11-13354P 0 1
2020 01 OO-NSF 11-13354P 0 1
2019 09 OY-HMV 11-13354P 0 0
2019 10 OY-HMV 11-13354P 1 1
2019 11 OY-HMV 11-13354P 0 1
2019 12 OY-HMV 11-13354P 0 1
2020 01 OY-HMV 11-13354P 0 1

Demo on SQLFiddle

Note that if all PeriodYearMonth values of interest are present in vtRelRepComponentsRemovalsByPartNo you can simply use a SELECT DISTINCT ... subquery to get those values rather than the recursive CTE e.g.

SELECT d.PeriodYearMonth
, rp.Reg
, rp.PartNo
, COALESCE(v.ComponentRemovals, 0) AS ComponentRemovals
, SUM(COALESCE(v.ComponentRemovals, 0)) OVER (PARTITION BY rp.Reg, rp.PartNo ORDER BY d.PeriodYearMonth
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS RunningRemovals
FROM (SELECT DISTINCT PeriodYearMonth FROM vtRelRepComponentsRemovalsByPartNo) d
CROSS JOIN (SELECT DISTINCT Reg, PartNo FROM vtRelRepComponentsRemovalsByPartNo) rp
LEFT JOIN vtRelRepComponentsRemovalsByPartNo v
ON v.PeriodYearMonth = d.PeriodYearMonth
AND v.Reg = rp.Reg
AND v.PartNo = rp.PartNo
ORDER BY rp.Reg, rp.PartNo, d.PeriodYearMonth

Demo on SQLFiddle

Cumulative sum for months that do and dont exist in a Snowflake table

This is a request that's overly complicated by arbitrary rules, but I'm going to give you an answer that's 90% there.

It uses cross joins to generate the combinations with no data, and a UDF to generate the missing months. Then window functions to get the desired values:

with data as (
select x[4]::string id, x[5]::date action_date, x[6]::string data
from (
select split(value, ' ') x
from table(split_to_table(
$$ 001 2021-01-20 jams
002 2021-01-23 orange
003 2021-01-19 banana
001 2021-04-11 pineap
002 2021-03-01 grape
004 2021-03-01 apple$$, '\n'
)))), range_months as (
select date_trunc(month, min(action_date)) since, max(action_date) until
from data
), all_months as (
select value::date m
from range_months, table(flatten(list_months_between(since, until)))
), all_ids as (
select distinct id
from data
), all_crossed as (
select *
from all_months, all_ids
), left_joined as (
select m, a.id, data, b.action_date action_date
from all_crossed a
left join data b
on a.id=b.id
and date_trunc(month, b.action_date)=a.m
), almost_there as (
select m, id, count(action_date) action_taken_this_month, any_value(data) recent_data
, lag(action_taken_this_month) over(partition by id order by m) previous_action_taken_this_month
from left_joined
group by 1, 2
)

select id, last_day(m) month_end
, action_taken_this_month
, ifnull(sum(previous_action_taken_this_month) over(partition by id order by m), 0) total_actions
, lag(recent_data, 1) over(partition by id order by m) most_recent_data
from almost_there
order by 1, month_end

I wrote the UDF to generate a list of months in Python, but you can rewrite in your favorite language if desired:

create or replace function list_months_between(since date, until date)
returns array
language python
runtime_version='3.8'
packages=('pandas')
handler = 'x'
as
$$
import pandas as pd
def x(since, until):
return pd.date_range(since, until, freq='MS').strftime("%Y-%m-%d").tolist()
$$
;

select list_months_between('2020-01-01', '2020-03-10');

At the end of this series of sub-queries, we get results 90% like desired:

Sample Image

After all this work, I hope you can take this queries and add an extra join to replace some of the nulls on most_recent_data with the last lookup value.

SQL Cumulative Sum not showing when group by values are missing

Use a cross join to generate the rows and then left join to bring in the values:

select m.month, c.category, t.monthly_value,
sum(t.monthly_value) over (partition by c.category order by m.month) as running_monthly_value
from (select distinct month from t) m cross join
(select distinct category from t) c left join
t
on t.month = m.month and t.category = c.category;

SQL Server : cumulative sum, missing dates

This was harder than I thought it would be. Someone may have a simpler solution. Need to fill in the artcode and also consider different ranges on different artcodes.

declare @T table (artcode varchar(10), transdate date, Qty smallmoney, transactionvalue smallmoney);
insert into @T values
('M100', '2010-11-24', 6.00, 179.40)
, ('M100', '2010-11-24', -6.00, -179.4)
, ('M100', '2010-11-25', 100.00, 2900.00)
, ('M100', '2010-11-26', -1.00, -29)
, ('M100', '2010-11-26', -5.00, -145)
, ('M100', '2010-11-26', -1.00, -29)
, ('M100', '2010-11-29', -5.00, -145)
, ('M100', '2010-11-29', -3.00, -87)
, ('M100', '2010-11-29', -1.00, -29)
, ('M101', '2010-11-23', 6.00, 179.40)
, ('M101', '2010-11-25', 100.00, 2900.00)
, ('M101', '2010-11-26', -1.00, -29)
, ('M101', '2010-11-26', -5.00, -145)
, ('M101', '2010-11-26', -1.00, -29)
, ('M101', '2010-11-30', -5.00, -145)
, ('M101', '2010-11-30', -3.00, -87)
, ('M101', '2010-11-30', -1.00, -29);
with limits as
( select t.artcode, min(t.transdate) as startDate, max(t.transdate) as endtDate
from @T t
group by t.artcode
)
, dts as
( select l.artcode, l.startDate as dt, l.startDate, l.endtDate
from limits l
union all
select l.artcode, dateadd(day, 1, l.dt), l.startDate, l.endtDate
from dts l
where dateadd(day, 1, l.dt) <= l.endtDate
)
select distinct dts.artcode, dts.dt
, sum(isnull(t.Qty, 0)) over (partition by dts.artcode order by dts.dt) as Qty
, sum(isnull(t.transactionvalue, 0)) over (partition by dts.artcode order by dts.dt) as transactionvalue
from dts
left join @T t
on t.transdate = dts.dt
and t.artcode = dts.artcode
order by dts.artcode, dts.dt;

artcode dt Qty transactionvalue
---------- ---------- --------------------- ---------------------
M100 2010-11-24 0.00 0.00
M100 2010-11-25 100.00 2900.00
M100 2010-11-26 93.00 2697.00
M100 2010-11-27 93.00 2697.00
M100 2010-11-28 93.00 2697.00
M100 2010-11-29 84.00 2436.00
M101 2010-11-23 6.00 179.40
M101 2010-11-24 6.00 179.40
M101 2010-11-25 106.00 3079.40
M101 2010-11-26 99.00 2876.40
M101 2010-11-27 99.00 2876.40
M101 2010-11-28 99.00 2876.40
M101 2010-11-29 99.00 2876.40
M101 2010-11-30 90.00 2615.40


Related Topics



Leave a reply



Submit