Duplicating Records to Fill Gap Between Dates

Duplicating records to fill gap between dates

I think I have a solution using an incremental approach toward the final result with CTEs:

with mindate as
(
  select min(price_date) as mindate from PRICES_TEST
)
,dates as
(
  select mindate.mindate + row_number() over (order by 1) - 1 as thedate
  from mindate, dual d
  connect by level <= floor(SYSDATE - mindate.mindate) + 1
)
,productdates as
(
  select p.product, d.thedate
  from (select distinct product from PRICES_TEST) p, dates d
)
,ranges as
(
  select
    pd.product,
    pd.thedate,
    (select max(PRICE_DATE) from PRICES_TEST p2
     where p2.product = pd.product and p2.PRICE_DATE <= pd.thedate) as mindate
  from productdates pd
)
select
  r.thedate,
  r.product,
  p.price
from ranges r
inner join PRICES_TEST p on r.mindate = p.price_date and r.product = p.product
order by r.product, r.thedate
  • mindate retrieves the earliest date in the data set
  • dates generates a calendar of dates from that earliest date to today
  • productdates cross-joins every product with every date
  • ranges determines which price date applied at each calendar date
  • the final query links each applicable price date to its actual price; the inner join condition filters out dates for which there is no relevant price date (a hypothetical sample table to try this against is sketched below, after the demo link)

Demo: http://www.sqlfiddle.com/#!4/e528f/126
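A minimal sketch of sample data to try this against (the PRICES_TEST name comes from the query above; these rows are hypothetical):

create table PRICES_TEST (product varchar2(10), price_date date, price number);

insert into PRICES_TEST values ('A', date '2024-01-01', 10);
insert into PRICES_TEST values ('A', date '2024-01-05', 12);

-- With these two rows the query returns one row per day from 2024-01-01
-- through SYSDATE: price 10 for Jan 1 through Jan 4, then 12 from Jan 5 onward.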

Duplicating records to fill gap between dates in Google BigQuery

How can I build a query in Google BigQuery that yields an output like the one below, where a value at a given date is repeated for the dates in between until the next change?

See the example below:

SELECT
  MODIFY_DATE,
  MAX(SKU_TEMP) OVER(PARTITION BY grp) AS SKU,
  MAX(STORE_TEMP) OVER(PARTITION BY grp) AS STORE,
  MAX(STOCK_ON_HAND_TEMP) OVER(PARTITION BY grp) AS STOCK_ON_HAND,
FROM (
  SELECT
    DAY AS MODIFY_DATE, SKU AS SKU_TEMP, STORE AS STORE_TEMP, STOCK_ON_HAND AS STOCK_ON_HAND_TEMP,
    COUNT(SKU) OVER(ORDER BY DAY ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS grp,
  FROM (
    SELECT DATE(DATE_ADD(TIMESTAMP("2016-08-01"), pos - 1, "DAY")) AS DAY
    FROM (
      SELECT ROW_NUMBER() OVER() AS pos, *
      FROM (FLATTEN((
        SELECT SPLIT(RPAD('', 1 + DATEDIFF(TIMESTAMP("2016-08-07"), TIMESTAMP("2016-08-01")), '.'), '') AS h
        FROM (SELECT NULL)), h
      )))
  ) AS DATES
  LEFT JOIN (
    SELECT DATE(MODIFY_DATE) AS MODIFY_DATE, SKU, STORE, STOCK_ON_HAND
    FROM
      (SELECT "2016-08-01" AS MODIFY_DATE, "1120010" AS SKU, 21 AS STORE, 75 AS STOCK_ON_HAND),
      (SELECT "2016-08-05" AS MODIFY_DATE, "1120010" AS SKU, 22 AS STORE, 100 AS STOCK_ON_HAND),
      (SELECT "2016-08-07" AS MODIFY_DATE, "1120011" AS SKU, 23 AS STORE, 40 AS STOCK_ON_HAND),
  ) AS TABLE_WITH_GAPS
  ON TABLE_WITH_GAPS.MODIFY_DATE = DATES.DAY
)
ORDER BY MODIFY_DATE
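Legacy SQL aside: if you can use Standard SQL instead, the calendar subquery above collapses into a single GENERATE_DATE_ARRAY call (a sketch, using the same hard-coded range):

#standardSQL
SELECT day AS MODIFY_DATE
FROM UNNEST(GENERATE_DATE_ARRAY('2016-08-01', '2016-08-07')) AS day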

Duplicate groups of records to fill multiple date gaps in Google BigQuery

Try the query below:


#standardSQL
WITH history AS (
  SELECT '2017-01-01' AS d, 'a' AS product, 'x' AS partner, 10 AS value UNION ALL
  SELECT '2017-01-01' AS d, 'b' AS product, 'x' AS partner, 15 AS value UNION ALL
  SELECT '2017-01-01' AS d, 'a' AS product, 'y' AS partner, 11 AS value UNION ALL
  SELECT '2017-01-01' AS d, 'b' AS product, 'y' AS partner, 16 AS value UNION ALL
  SELECT '2017-01-05' AS d, 'b' AS product, 'x' AS partner, 13 AS value UNION ALL
  SELECT '2017-01-07' AS d, 'a' AS product, 'y' AS partner, 15 AS value UNION ALL
  SELECT '2017-01-07' AS d, 'a' AS product, 'x' AS partner, 15 AS value
),
daterange AS (
  SELECT date_in_range
  FROM UNNEST(GENERATE_DATE_ARRAY('2017-01-01', '2017-01-10')) AS date_in_range
),
temp AS (
  SELECT d, product, partner, value,
    LEAD(d) OVER(PARTITION BY product, partner ORDER BY d) AS next_d
  FROM history
)
SELECT date_in_range, product, partner, value
FROM daterange
JOIN temp
  ON daterange.date_in_range >= PARSE_DATE('%Y-%m-%d', temp.d)
  AND (daterange.date_in_range < PARSE_DATE('%Y-%m-%d', temp.next_d) OR temp.next_d IS NULL)
ORDER BY product, partner, date_in_range
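The join condition implements a half-open interval [d, next_d): each calendar date matches the history row that was in effect on that date, and the OR temp.next_d IS NULL branch keeps each group's latest row active through the end of the range.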

BigQuery/SQL: Filling in gaps as duplicate rows between years

Below is for BigQuery Standard SQL

#standardSQL
WITH history AS (
  SELECT 2012 AS d, 'a' AS product, 'x' AS partner, 10 AS value UNION ALL
  SELECT 2010 AS d, 'b' AS product, 'x' AS partner, 15 AS value UNION ALL
  SELECT 2014 AS d, 'a' AS product, 'y' AS partner, 11 AS value UNION ALL
  SELECT 2012 AS d, 'b' AS product, 'y' AS partner, 16 AS value UNION ALL
  SELECT 2015 AS d, 'b' AS product, 'x' AS partner, 13 AS value UNION ALL
  SELECT 2017 AS d, 'a' AS product, 'y' AS partner, 15 AS value UNION ALL
  SELECT 2017 AS d, 'a' AS product, 'x' AS partner, 15 AS value
),
daterange AS (
  SELECT EXTRACT(YEAR FROM fiscalYear) AS date_in_range
  FROM UNNEST(
    GENERATE_DATE_ARRAY(DATE('2010-01-01'), CURRENT_DATE(), INTERVAL 1 YEAR)
  ) AS fiscalYear
),
history_ext AS (
  SELECT date_in_range, x.product, x.partner, value
  FROM daterange dr
  CROSS JOIN (SELECT DISTINCT product, partner FROM history) x
  LEFT JOIN history h
    ON dr.date_in_range = h.d
    AND STRUCT(h.product, h.partner) = STRUCT(x.product, x.partner)
)
SELECT date_in_range, product, partner,
  COALESCE(
    value,
    LAST_VALUE(value IGNORE NULLS) OVER(PARTITION BY product, partner ORDER BY date_in_range ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING),
    FIRST_VALUE(value IGNORE NULLS) OVER(PARTITION BY product, partner ORDER BY date_in_range ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
  ) AS value
FROM history_ext
ORDER BY product, partner, date_in_range

and returns

Row  date_in_range  product  partner  value
1 2010 a x 10
2 2011 a x 10
3 2012 a x 10
4 2013 a x 10
5 2014 a x 10
6 2015 a x 10
7 2016 a x 10
8 2017 a x 15
9 2018 a x 15
10 2019 a x 15
11 2010 a y 11
12 2011 a y 11
13 2012 a y 11
14 2013 a y 11
15 2014 a y 11
16 2015 a y 11
17 2016 a y 11
18 2017 a y 15
19 2018 a y 15
20 2019 a y 15
21 2010 b x 15
22 2011 b x 15
23 2012 b x 15
24 2013 b x 15
25 2014 b x 15
26 2015 b x 13
27 2016 b x 13
28 2017 b x 13
29 2018 b x 13
30 2019 b x 13
31 2010 b y 16
32 2011 b y 16
33 2012 b y 16
34 2013 b y 16
35 2014 b y 16
36 2015 b y 16
37 2016 b y 16
38 2017 b y 16
39 2018 b y 16
40 2019 b y 16
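Note the three-way COALESCE at work in this output: a year with its own record keeps that value, years after a record carry the last known value forward via LAST_VALUE, and years before a group's first record are back-filled via FIRST_VALUE, which is why product a, partner x shows 10 for 2010 and 2011 even though its first record is 2012.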

Duplicating records between date gaps within a selected time interval in a PySpark DataFrame

Following up on @jxc's comment, I have prepared an answer for this use case.

Following are the code snippets, step by step.

  1. Import the spark SQL functions

    from pyspark.sql import functions as F, Window

  2. Prepare the sample data

    simpleData = ((1,"Available",5,"2020-07"),                                                              
(1,"Available",8,"2020-08"),
(1,"Limited",8,"2020-12"),
(2,"Limited",1,"2020-09"),
(2,"Limited",3,"2020-12")
)


columns= ["product_id", "status", "price", "month"]

  3. Create a DataFrame from the sample data

    df = spark.createDataFrame(data = simpleData, schema = columns)

  4. Add a date column to the DataFrame to get a properly formatted date

    df0 = df.withColumn("date",F.to_date('month','yyyy-MM'))

df0.show()

+----------+---------+-----+-------+----------+
|product_id| status|price| month| date|
+----------+---------+-----+-------+----------+
| 1|Available| 5|2020-07|2020-07-01|
| 1|Available| 8|2020-08|2020-08-01|
| 1| Limited| 8|2020-12|2020-12-01|
| 2| Limited| 1|2020-09|2020-09-01|
| 2| Limited| 3|2020-12|2020-12-01|
+----------+---------+-----+-------+----------+

  5. Create window spec w1 and use the window function lead over w1 to find the next date, then shift it back one month to mark the end of each date sequence:
    w1 = Window.partitionBy('product_id').orderBy('date')
    df1 = df0.withColumn('end_date', F.coalesce(F.add_months(F.lead('date').over(w1), -1), 'date'))

df1.show()

+----------+---------+-----+-------+----------+----------+
|product_id| status|price| month| date| end_date|
+----------+---------+-----+-------+----------+----------+
| 1|Available| 5|2020-07|2020-07-01|2020-07-01|
| 1|Available| 8|2020-08|2020-08-01|2020-11-01|
| 1| Limited| 8|2020-12|2020-12-01|2020-12-01|
| 2| Limited| 1|2020-09|2020-09-01|2020-11-01|
| 2| Limited| 3|2020-12|2020-12-01|2020-12-01|
+----------+---------+-----+-------+----------+----------+

  6. Use months_between(end_date, date) to calculate the number of months between the two dates, use the transform function to iterate through sequence(0, #months) and build a named_struct with date = add_months(date, i) and the price, then use inline_outer to explode the array of structs (note that the whole expression is passed to selectExpr as a string):
    df2 = df1.selectExpr(
        "product_id",
        "status",
        "inline_outer(transform(sequence(0, int(months_between(end_date, date)), 1), i -> (add_months(date, i) as date, price as price)))"
    )

df2.show()

+----------+---------+----------+-----+
|product_id| status| date|price|
+----------+---------+----------+-----+
| 1|Available|2020-07-01| 5|
| 1|Available|2020-08-01| 8|
| 1|Available|2020-09-01| 8|
| 1|Available|2020-10-01| 8|
| 1|Available|2020-11-01| 8|
| 1| Limited|2020-12-01| 8|
| 2| Limited|2020-09-01| 1|
| 2| Limited|2020-10-01| 1|
| 2| Limited|2020-11-01| 1|
| 2| Limited|2020-12-01| 3|
+----------+---------+----------+-----+

  7. Partition the DataFrame on product_id and add a rank column to df3 to number each row; then store the maximum rank per product_id as max_rank in df4:
    w2 = Window.partitionBy('product_id').orderBy('date')
    df3 = df2.withColumn('rank', F.row_number().over(w2))
Schema: DataFrame[product_id: bigint, status: string, date: date, price: bigint, rank: int]
df3.show()
+----------+---------+----------+-----+----+
|product_id| status| date|price|rank|
+----------+---------+----------+-----+----+
| 1|Available|2020-07-01| 5| 1|
| 1|Available|2020-08-01| 8| 2|
| 1|Available|2020-09-01| 8| 3|
| 1|Available|2020-10-01| 8| 4|
| 1|Available|2020-11-01| 8| 5|
| 1| Limited|2020-12-01| 8| 6|
| 2| Limited|2020-09-01| 1| 1|
| 2| Limited|2020-10-01| 1| 2|
| 2| Limited|2020-11-01| 1| 3|
| 2| Limited|2020-12-01| 3| 4|
+----------+---------+----------+-----+----+


df4 = df3.groupBy("product_id").agg(F.max('rank').alias('max_rank'))
Schema: DataFrame[product_id: bigint, max_rank: int]
df4.show()
+----------+--------+
|product_id|max_rank|
+----------+--------+
| 1| 6|
| 2| 4|
+----------+--------+


  8. Join df3 and df4 on product_id to get max_rank (an alternative without this join is sketched at the end of this answer):
    df5 = df3.join(df4, df3.product_id == df4.product_id, "inner") \
        .select(df3.product_id, df3.status, df3.date, df3.price, df3.rank, df4.max_rank)
Schema: DataFrame[product_id: bigint, status: string, date: date, price: bigint, rank: int, max_rank: int]
df5.show()
+----------+---------+----------+-----+----+--------+
|product_id| status| date|price|rank|max_rank|
+----------+---------+----------+-----+----+--------+
| 1|Available|2020-07-01| 5| 1| 6|
| 1|Available|2020-08-01| 8| 2| 6|
| 1|Available|2020-09-01| 8| 3| 6|
| 1|Available|2020-10-01| 8| 4| 6|
| 1|Available|2020-11-01| 8| 5| 6|
| 1| Limited|2020-12-01| 8| 6| 6|
| 2| Limited|2020-09-01| 1| 1| 4|
| 2| Limited|2020-10-01| 1| 2| 4|
| 2| Limited|2020-11-01| 1| 3| 4|
| 2| Limited|2020-12-01| 3| 4| 4|
+----------+---------+----------+-----+----+--------+


  9. Finally, filter the df5 DataFrame with the between function to keep the latest six months of data:
    FinalResultDF = df5.filter(
        F.col('rank').between(
            F.when(F.col('max_rank') > 5, F.col('max_rank') - 5).otherwise(0),
            F.col('max_rank'))) \
        .select(df5.product_id, df5.status, df5.date, df5.price)

FinalResultDF.show(truncate=False)
    +----------+---------+----------+-----+                                               
|product_id|status |date |price|
+----------+---------+----------+-----+
|1 |Available|2020-07-01|5 |
|1 |Available|2020-08-01|8 |
|1 |Available|2020-09-01|8 |
|1 |Available|2020-10-01|8 |
|1 |Available|2020-11-01|8 |
|1 |Limited |2020-12-01|8 |
|2 |Limited |2020-09-01|1 |
|2 |Limited |2020-10-01|1 |
|2 |Limited |2020-11-01|1 |
|2 |Limited |2020-12-01|3 |
+----------+---------+----------+-----+
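As mentioned in step 8, the groupBy-plus-join of steps 7 and 8 can be collapsed into a single window aggregate (a sketch; it yields the same df5 columns without the self-join):

    w3 = Window.partitionBy('product_id')
    df5_alt = df3.withColumn('max_rank', F.max('rank').over(w3))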

How to duplicate rows generating dates between Start Date and End Date in BigQuery?

Below is for BigQuery Standard SQL

#standardSQL
WITH `project.dataset.table` AS (
SELECT 'A' user_name, DATE '2019-07-01' start_date, DATE '2019-07-31' end_date
)
SELECT user_name, start_date, end_date, day
FROM `project.dataset.table`,
UNNEST(GENERATE_DATE_ARRAY(start_date, end_date)) day
ORDER BY user_name, day

with this result:

Row  user_name  start_date  end_date  day
1 A 2019-07-01 2019-07-31 2019-07-01
2 A 2019-07-01 2019-07-31 2019-07-02
3 A 2019-07-01 2019-07-31 2019-07-03
. . .
29 A 2019-07-01 2019-07-31 2019-07-29
30 A 2019-07-01 2019-07-31 2019-07-30
31 A 2019-07-01 2019-07-31 2019-07-31
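GENERATE_DATE_ARRAY also accepts a step, so the same pattern can emit coarser grains, for example one row per week (a sketch; only the UNNEST line changes):

UNNEST(GENERATE_DATE_ARRAY(start_date, end_date, INTERVAL 1 WEEK)) day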

View to fill date gaps but only if no records for a given date

You can use a table-valued function, which will allow you to use parameters, so there is no need to hard-code them.

The table returned includes an extra column Ins; it is 0 for the original rows and 1 for the inserted ones.

Rextester demo

If you prefer a view you can easily extract the logic.

create function myFun(@start date, @end date)
returns @result table (MyName varchar(10), MyDate date, Ins bit) as
begin
  with dates as (
    SELECT DATEADD(DAY, nbr - 1, @start) dt
    FROM (
      SELECT ROW_NUMBER() OVER ( ORDER BY c.object_id ) AS Nbr
      FROM sys.columns c
    ) nbrs
    WHERE nbr - 1 <= DATEDIFF(DAY, @start, @end)
  ), last_avail as (
    select dates.dt, max(t.MyDate) prev_dt
    from dates join t on dates.dt >= t.MyDate
    group by dates.dt
  ), empty as (
    select * from last_avail where dt <> prev_dt
  )
  insert @result (MyName, MyDate, Ins)
  select MyName, MyDate, 0 from t
  union all
  select t.MyName, x.dt, 1
  from empty x join t on x.prev_dt = t.MyDate;

  return;
end

Usage example:

declare @start date, @end date;

set @start = '2017-04-08';
set @end = '2017-04-11';
select * from dbo.myFun(@start, @end) order by MyDate desc, MyName asc;
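One caveat: the calendar CTE can produce at most as many rows as sys.columns holds, so a very long date range could be silently truncated. Cross-joining the catalog view with itself lifts that limit (a sketch; only the derived table nbrs changes):

FROM (
  SELECT ROW_NUMBER() OVER ( ORDER BY c1.object_id ) AS Nbr
  FROM sys.columns c1 CROSS JOIN sys.columns c2
) nbrs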

Attribution

The common table expression that plays the role of a calendar table was found here.

Google BigQuery SQL: How to fill in gaps in a table with dates?

Consider the query below:

with temp as (
  select customer, dates
  from (
    select customer, min(dates) min_date, max(dates) max_date
    from `project.dataset.table`
    group by customer
  ), unnest(generate_date_array(min_date, max_date)) dates
)
select customer, dates,
  first_value(subscription ignore nulls) over win as subscription
from temp a
left join `project.dataset.table` b
using(customer, dates)
window win as (partition by customer order by dates desc rows between current row and unbounded following)
# order by dates, customer
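For reference, the descending FIRST_VALUE window is equivalent to the more common ascending carry-forward form (a sketch; either works in the query above):

last_value(subscription ignore nulls) over (
  partition by customer
  order by dates
  rows between unbounded preceding and current row
) as subscription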

If applied to the sample data in your question, the output fills every missing date for each customer with the most recent known subscription value.


How to complete and fill in gaps between dates in SQL?

A query like this should do the trick:

create table test (yq int, items int);
INSERT INTO test VALUES (20201, 10), (20204, 15), (20213, 25), (20222, 30);

with recursive quarters(q) as (
  select min(yq) as q
  from test
  union all
  select decode(right(q::text, 1), 4, q + 7, q + 1) as q
  from quarters
  where q < (select max(yq) from test)
)
select q as yq,
  decode(items is null, true, lag(items ignore nulls) over (order by q), items) as items
from test t
right join quarters q on t.yq = q.q
order by q;

It uses a recursive CTE to generate the needed range of quarters (the DECODE rolls a quarter key such as 20204 over to 20211, since adding 7 moves Q4 to Q1 of the next year), right joins it with the source data, and then uses the LAG() window function to carry the previous non-NULL items value forward wherever items is NULL.
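The DECODE-on-NULL idiom can also be written with COALESCE, which reads a little more directly (a sketch; same semantics, since LAG(items IGNORE NULLS) returns the most recent non-NULL items before the current quarter):

select q as yq,
  coalesce(items, lag(items ignore nulls) over (order by q)) as items
from test t
right join quarters q on t.yq = q.q
order by q;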


