Duplicating records to fill gap between dates
I think I have a solution using an incremental approach toward the final result with CTE's:
with mindate as
(
select min(price_date) as mindate from PRICES_TEST
)
,dates as
(
select mindate.mindate + row_number() over (order by 1) - 1 as thedate from mindate,
dual d connect by level <= floor(SYSDATE - mindate.mindate) + 1
)
,productdates as
(
select p.product, d.thedate
from (select distinct product from PRICES_TEST) p, dates d
)
,ranges as
(
select
pd.product,
pd.thedate,
(select max(PRICE_DATE) from PRICES_TEST p2
where p2.product = pd.product and p2.PRICE_DATE <= pd.thedate) as mindate
from productdates pd
)
select
r.thedate,
r.product,
p.price
from ranges r
inner join PRICES_TEST p on r.mindate = p.price_date and r.product = p.product
order by r.product, r.thedate
mindate
retrieves the earliest possible date in the data setdates
generates a calendar of dates from earliest possible date to today.productdates
cross joins all possible products with all possible datesranges
determines which price date applied at each date- the final query links which price date applied to the actual price and filters out dates for which there are no relevant price dates via the
inner join
condition
Demo: http://www.sqlfiddle.com/#!4/e528f/126
Duplicating records to fill gap between dates in Google BigQuery
How can I build a query within Google BigQuery that yields an output like the one below? A value at a given date is repeated until the next change for the dates in between
See example below
SELECT
MODIFY_DATE,
MAX(SKU_TEMP) OVER(PARTITION BY grp) AS SKU,
MAX(STORE_TEMP) OVER(PARTITION BY grp) AS STORE,
MAX(STOCK_ON_HAND_TEMP) OVER(PARTITION BY grp) AS STOCK_ON_HAND,
FROM (
SELECT
DAY AS MODIFY_DATE, SKU AS SKU_TEMP, STORE AS STORE_TEMP, STOCK_ON_HAND AS STOCK_ON_HAND_TEMP,
COUNT(SKU) OVER(ORDER BY DAY ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS grp,
FROM (
SELECT DATE(DATE_ADD(TIMESTAMP("2016-08-01"), pos - 1, "DAY")) AS DAY
FROM (
SELECT ROW_NUMBER() OVER() AS pos, *
FROM (FLATTEN((
SELECT SPLIT(RPAD('', 1 + DATEDIFF(TIMESTAMP("2016-08-07"), TIMESTAMP("2016-08-01")), '.'),'') AS h
FROM (SELECT NULL)),h
)))
) AS DATES
LEFT JOIN (
SELECT DATE(MODIFY_DATE) AS MODIFY_DATE, SKU, STORE, STOCK_ON_HAND
FROM
(SELECT "2016-08-01" AS MODIFY_DATE, "1120010" AS SKU, 21 AS STORE, 75 AS STOCK_ON_HAND),
(SELECT "2016-08-05" AS MODIFY_DATE, "1120010" AS SKU, 22 AS STORE, 100 AS STOCK_ON_HAND),
(SELECT "2016-08-07" AS MODIFY_DATE, "1120011" AS SKU, 23 AS STORE, 40 AS STOCK_ON_HAND),
) AS TABLE_WITH_GAPS
ON TABLE_WITH_GAPS.MODIFY_DATE = DATES.DAY
)
ORDER BY MODIFY_DATE
Duplicate groups of records to fill multiple date gaps in Google BigQuery
Try below
#standardSQL
WITH history AS (
SELECT '2017-01-01' AS d, 'a' AS product, 'x' AS partner, 10 AS value UNION ALL
SELECT '2017-01-01' AS d, 'b' AS product, 'x' AS partner, 15 AS value UNION ALL
SELECT '2017-01-01' AS d, 'a' AS product, 'y' AS partner, 11 AS value UNION ALL
SELECT '2017-01-01' AS d, 'b' AS product, 'y' AS partner, 16 AS value UNION ALL
SELECT '2017-01-05' AS d, 'b' AS product, 'x' AS partner, 13 AS value UNION ALL
SELECT '2017-01-07' AS d, 'a' AS product, 'y' AS partner, 15 AS value UNION ALL
SELECT '2017-01-07' AS d, 'a' AS product, 'x' AS partner, 15 AS value
),
daterange AS (
SELECT date_in_range
FROM UNNEST(GENERATE_DATE_ARRAY('2017-01-01', '2017-01-10')) AS date_in_range
),
temp AS (
SELECT d, product, partner, value, LEAD(d) OVER(PARTITION BY product, partner ORDER BY d) AS next_d
FROM history
ORDER BY product, partner, d
)
SELECT date_in_range, product, partner, value
FROM daterange
JOIN temp
ON daterange.date_in_range >= PARSE_DATE('%Y-%m-%d', temp.d)
AND (daterange.date_in_range < PARSE_DATE('%Y-%m-%d', temp.next_d) OR temp.next_d IS NULL)
ORDER BY product, partner, date_in_range
BigQuery/SQL: Filling in gaps as duplicate rows between years
Below is for BigQuery Standard SQL
#standardSQL
WITH history AS (
SELECT 2012 AS d, 'a' AS product, 'x' AS partner, 10 AS value UNION ALL
SELECT 2010 AS d, 'b' AS product, 'x' AS partner, 15 AS value UNION ALL
SELECT 2014 AS d, 'a' AS product, 'y' AS partner, 11 AS value UNION ALL
SELECT 2012 AS d, 'b' AS product, 'y' AS partner, 16 AS value UNION ALL
SELECT 2015 AS d, 'b' AS product, 'x' AS partner, 13 AS value UNION ALL
SELECT 2017 AS d, 'a' AS product, 'y' AS partner, 15 AS value UNION ALL
SELECT 2017 AS d, 'a' AS product, 'x' AS partner, 15 AS value
),
daterange AS (
SELECT EXTRACT(YEAR FROM fiscalYear) AS date_in_range
FROM UNNEST(
GENERATE_DATE_ARRAY(DATE('2010-01-01'), CURRENT_DATE(), INTERVAL 1 YEAR)
) AS fiscalYear),
history_ext AS (
SELECT date_in_range, x.product, x.partner, value
FROM daterange dr
CROSS JOIN (SELECT DISTINCT product, partner FROM history) x
LEFT JOIN history h
ON dr.date_in_range = h.d
AND STRUCT(h.product, h.partner) = STRUCT(x.product, x.partner)
)
SELECT date_in_range, product, partner,
COALESCE(
value,
LAST_VALUE(value IGNORE NULLS) OVER(PARTITION BY product, partner ORDER BY date_in_range ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING),
FIRST_VALUE(value IGNORE NULLS) OVER(PARTITION BY product, partner ORDER BY date_in_range ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
) AS value
FROM history_ext
ORDER BY product, partner, date_in_range
and returns
Row date_in_range product partner value
1 2010 a x 10
2 2011 a x 10
3 2012 a x 10
4 2013 a x 10
5 2014 a x 10
6 2015 a x 10
7 2016 a x 10
8 2017 a x 15
9 2018 a x 15
10 2019 a x 15
11 2010 a y 11
12 2011 a y 11
13 2012 a y 11
14 2013 a y 11
15 2014 a y 11
16 2015 a y 11
17 2016 a y 11
18 2017 a y 15
19 2018 a y 15
20 2019 a y 15
21 2010 b x 15
22 2011 b x 15
23 2012 b x 15
24 2013 b x 15
25 2014 b x 15
26 2015 b x 13
27 2016 b x 13
28 2017 b x 13
29 2018 b x 13
30 2019 b x 13
31 2010 b y 16
32 2011 b y 16
33 2012 b y 16
34 2013 b y 16
35 2014 b y 16
36 2015 b y 16
37 2016 b y 16
38 2017 b y 16
39 2018 b y 16
40 2019 b y 16
duplicating records between date gaps within a selected time interval in a PySpark dataframe
With Respect to the @jxc comment, I have prepared the answer for this use case.
Following is the code snippet.
Import the spark SQL functions
from pyspark.sql import functions as F, Window
Prepare the sample data
simpleData = ((1,"Available",5,"2020-07"),
(1,"Available",8,"2020-08"),
(1,"Limited",8,"2020-12"),
(2,"Limited",1,"2020-09"),
(2,"Limited",3,"2020-12")
)
columns= ["product_id", "status", "price", "month"]
Creating dataframe of sample data
df = spark.createDataFrame(data = simpleData, schema = columns)
Add date column in dataframe to get proper formatted date
df0 = df.withColumn("date",F.to_date('month','yyyy-MM'))
df0.show()
+----------+---------+-----+-------+----------+
|product_id| status|price| month| date|
+----------+---------+-----+-------+----------+
| 1|Available| 5|2020-07|2020-07-01|
| 1|Available| 8|2020-08|2020-08-01|
| 1| Limited| 8|2020-12|2020-12-01|
| 2| Limited| 1|2020-09|2020-09-01|
| 2| Limited| 3|2020-12|2020-12-01|
+----------+---------+-----+-------+----------+
- Creating WinSpec w1 and use Window aggregate function lead to find the next date over(w1), convert it to the previous months to set up date sequences:
w1 = Window.partitionBy('product_id').orderBy('date')
df1 = df0.withColumn('end_date',F.coalesce(F.add_months(F.lead('date').over(w1),-1),'date'))
df1.show()
+----------+---------+-----+-------+----------+----------+
|product_id| status|price| month| date| end_date|
+----------+---------+-----+-------+----------+----------+
| 1|Available| 5|2020-07|2020-07-01|2020-07-01|
| 1|Available| 8|2020-08|2020-08-01|2020-11-01|
| 1| Limited| 8|2020-12|2020-12-01|2020-12-01|
| 2| Limited| 1|2020-09|2020-09-01|2020-11-01|
| 2| Limited| 3|2020-12|2020-12-01|2020-12-01|
+----------+---------+-----+-------+----------+----------+
- Using months_between(end_date, date) to calculate # of months between two dates, and use transform function to iterate through sequence(0, #months), create a named_struct with date=add_months(date,i) and price=IF(i=0,price,price), use inline_outer to explode the array of structs.
df2 = df1.selectExpr("product_id", "status", inline_outer( transform( sequence(0,int(months_between(end_date, date)),1), i -> (add_months(date,i) as date, IF(i=0,price,price) as price) ) ) )
df2.show()
+----------+---------+----------+-----+
|product_id| status| date|price|
+----------+---------+----------+-----+
| 1|Available|2020-07-01| 5|
| 1|Available|2020-08-01| 8|
| 1|Available|2020-09-01| 8|
| 1|Available|2020-10-01| 8|
| 1|Available|2020-11-01| 8|
| 1| Limited|2020-12-01| 8|
| 2| Limited|2020-09-01| 1|
| 2| Limited|2020-10-01| 1|
| 2| Limited|2020-11-01| 1|
| 2| Limited|2020-12-01| 3|
+----------+---------+----------+-----+
- Partitioning the dataframe on
product_id
and adding a rank column indf3
to get row number for each row. Then, Storing the maximum ofrank
column value with new columnmax_rank
for eachproduct_id
and storingmax_rank
in todf4
w2 = Window.partitionBy('product_id').orderBy('date')
df3 = df2.withColumn('rank',F.row_number().over(w2))
Schema: DataFrame[product_id: bigint, status: string, date: date, price: bigint, rank: int]
df3.show()
+----------+---------+----------+-----+----+
|product_id| status| date|price|rank|
+----------+---------+----------+-----+----+
| 1|Available|2020-07-01| 5| 1|
| 1|Available|2020-08-01| 8| 2|
| 1|Available|2020-09-01| 8| 3|
| 1|Available|2020-10-01| 8| 4|
| 1|Available|2020-11-01| 8| 5|
| 1| Limited|2020-12-01| 8| 6|
| 2| Limited|2020-09-01| 1| 1|
| 2| Limited|2020-10-01| 1| 2|
| 2| Limited|2020-11-01| 1| 3|
| 2| Limited|2020-12-01| 3| 4|
+----------+---------+----------+-----+----+
df4 = df3.groupBy("product_id").agg(F.max('rank').alias('max_rank'))
Schema: DataFrame[product_id: bigint, max_rank: int]
df4.show()
+----------+--------+
|product_id|max_rank|
+----------+--------+
| 1| 6|
| 2| 4|
+----------+--------+
- Joining
df3
anddf4
dataframes onproduct_id
getmax_rank
df5 = df3.join(df4,df3.product_id == df4.product_id,"inner") \
.select(df3.product_id,df3.status,df3.date,df3.price,df3.rank,df4.max_rank)
Schema: DataFrame[product_id: bigint, status: string, date: date, price: bigint, rank: int, max_rank: int]
df5.show()
+----------+---------+----------+-----+----+--------+
|product_id| status| date|price|rank|max_rank|
+----------+---------+----------+-----+----+--------+
| 1|Available|2020-07-01| 5| 1| 6|
| 1|Available|2020-08-01| 8| 2| 6|
| 1|Available|2020-09-01| 8| 3| 6|
| 1|Available|2020-10-01| 8| 4| 6|
| 1|Available|2020-11-01| 8| 5| 6|
| 1| Limited|2020-12-01| 8| 6| 6|
| 2| Limited|2020-09-01| 1| 1| 4|
| 2| Limited|2020-10-01| 1| 2| 4|
| 2| Limited|2020-11-01| 1| 3| 4|
| 2| Limited|2020-12-01| 3| 4| 4|
+----------+---------+----------+-----+----+--------+
- Then finally filtering the
df5
dataframe usingbetween
function to get the latest 6 months data.
FinalResultDF = df5.filter(F.col('rank') \
.between(F.when((F.col('max_rank') > 5),(F.col('max_rank')-6)).otherwise(0),F.col('max_rank'))) \
.select(df5.product_id,df5.status,df5.date,df5.price)
FinalResultDF.show(truncate=False)
+----------+---------+----------+-----+
|product_id|status |date |price|
+----------+---------+----------+-----+
|1 |Available|2020-07-01|5 |
|1 |Available|2020-08-01|8 |
|1 |Available|2020-09-01|8 |
|1 |Available|2020-10-01|8 |
|1 |Available|2020-11-01|8 |
|1 |Limited |2020-12-01|8 |
|2 |Limited |2020-09-01|1 |
|2 |Limited |2020-10-01|1 |
|2 |Limited |2020-11-01|1 |
|2 |Limited |2020-12-01|3 |
+----------+---------+----------+-----+
How to duplicate rows generating dates between Start Date and End Date in BigQuery?
Below is for BigQuery Standard SQL
#standardSQL
WITH `project.dataset.table` AS (
SELECT 'A' user_name, DATE '2019-07-01' start_date, DATE '2019-07-31' end_date
)
SELECT user_name, start_date, end_date, day
FROM `project.dataset.table`,
UNNEST(GENERATE_DATE_ARRAY(start_date, end_date)) day
ORDER BY user_name, day
with result
Row user_name start_date end_date day
1 A 2019-07-01 2019-07-31 2019-07-01
2 A 2019-07-01 2019-07-31 2019-07-02
3 A 2019-07-01 2019-07-31 2019-07-03
. . .
29 A 2019-07-01 2019-07-31 2019-07-29
30 A 2019-07-01 2019-07-31 2019-07-30
31 A 2019-07-01 2019-07-31 2019-07-31
View to fill date gaps but only if no records for a given date
You can use a table-valued function, which will allow you to use parameters, so there is no need to hard-code them.
The table returned includes an extra column Ins
; it is 0 for the original data and 1 for the inserted.
Rextester demo
If you prefer a view you can easily extract the logic.
create function myFun(@start date, @end date)
returns @result table (MyName varchar(10), MyDate date, Ins bit) as
begin
;
with dates as (
SELECT DATEADD(DAY, nbr - 1, @start) dt
FROM ( SELECT ROW_NUMBER() OVER ( ORDER BY c.object_id ) AS Nbr
FROM sys.columns c
) nbrs
WHERE nbr - 1 <= DATEDIFF(DAY, @start, @end)
), last_avail as (
select dates.dt, max(t.MyDate) prev_dt
from dates join t on dates.dt >= t.MyDate
group by dates.dt
), empty as (
select * from last_avail where dt <> prev_dt
)
insert @result (MyName, MyDate, Ins)
select MyName, MyDate, 0 from t
union all
select t.MyName, x.dt, 1
from empty x join t on x.prev_dt = t.MyDate;
return;
end
Usage example:
declare @start date, @end date;
set @start = '2017-04-08';
set @end = '2017-04-11';
select * from dbo.myFun(@start, @end) order by MyDate desc, MyName asc;
Attribution
The Common Table Expression which plays the role of a calendar table has been found here.
Google BigQuery SQL: How to fill in gaps in a table with dates?
Consider below
with temp as (
select customer, dates from (
select customer, min(dates) min_date, max(dates) max_date
from `project.dataset.table`
group by customer
), unnest(generate_date_array(min_date, max_date)) dates
)
select customer, dates,
first_value(subscription ignore nulls) over win as subscription
from temp a
left join `project.dataset.table` b
using(customer, dates)
window win as (partition by customer order by dates desc rows between current row and unbounded following)
# order by dates, customer
If to apply to sample data in y our question - output is
How to complete and fill in gaps between dates in SQL?
A query like this should do the trick:
create table test (yq int, items int);
INSERT INTO test Values (20201,10),(20204, 15),(20213, 25),(20222, 30);
with recursive quarters(q) as (
select min(yq) as q
from test
union all
select decode(right(q::text, 1), 4, q + 7, q + 1) as q
from quarters
where q < (select max(yq) from test)
)
select q as yq, decode(items is null, true,
lag(items ignore nulls) over (order by q), items) as items
from test t
right join quarters q
on t.yq = q.q
order by q;
It uses a recursive CTE to generate the quarters range needed, right joins this with the source data, and then uses a LAG() window function to populate the items if the value is NULL.
Related Topics
In How Many Languages Is Null Not Equal to Anything Not Even Null
Calling Shell Script from Pl/Sql, But Shell Gets Executed as Grid User, Not Oracle
Pl/Sql- Get Column Names from a Query
Mod' Is Not a Recognized Built-In Function Name
Counter_Cache Has_Many_Through SQL Optimisation, Reduce Number of SQL Queries
Insert and Update a Record Using Cursors in Oracle
T-Sql Stop or Abort Command in SQL Server
How to Reuse a Large Query Without Repeating It
Entity Framework 6.1 - Create Index with Include Statement
Weighted Average in T-Sql (Like Excel's Sumproduct)
How This SQL Injection Works? Explanation Needed
Bigquery Select _Tables_ from All Tables Within Project
Alter Table to Modify Default Value of Column
How to Set List of Values as Parameter into Hibernate Query
Optimising a Select Query That Runs Slow on Oracle Which Runs Quickly on SQL Server
How to Generate a Permutations or Combinations of N Rows in M Columns
How to Get Second Highest Salary Department Wise Without Using Analytical Functions