How to Fill Missing Dates and Values in Partitioned Data

How to fill missing dates in BigQuery by partition (no backfill)

Consider below approach

select order_date, product, store, ifnull(sales, 0) sales 
from (
  select product, store, order_date 
  from (
    select product, store, min(order_date) start_date, max(order_date) end_date
    from `project.dataset.table`
    group by product, store
  ), unnest(generate_date_array(start_date, end_date)) order_date
)
left join `project.dataset.table` 
using(product, store, order_date)

if applied to sample data in your question - output is

Sample Image

How to fill missing dates in BigQuery?

this should work

with base as (

select 'A' as name,           '01/01/2020' as date,     1.5 as val  union all
select 'A' as name,           '01/03/2020' as date,     2 as val union all
select 'A' as name,           '01/06/2020' as date,     5 as val union all
select 'B' as name,           '01/02/2020' as date,     90 as val union all
select 'B' as name,           '01/07/2020' as date,     10 as val
),

missing_dates as (

select name,dates as date from 
UNNEST(GENERATE_DATE_ARRAY('2019-12-29', '2020-01-09', INTERVAL 1 DAY)) AS dates cross join (select distinct name from base)

), joined as (
select distinct missing_dates.name, missing_dates.date,val 
from  missing_dates 
left join base on missing_dates.name = base.name 
and  parse_date('%m/%d/%Y', base.date) = missing_dates.date

)

select * except(val), 
ifnull(first_value(val ignore nulls) over(partition by name order by date ROWS BETWEEN CURRENT ROW AND
    UNBOUNDED FOLLOWING),1) as va1 
from joined

How to fill missing dates and values in partitioned data?

First, you need to generate the dates. Then you can generate all the combinations of date and name. Finally, fill in the values. Here is an example using cross apply:

with dates as (
      select @MINDATE as thedate
      union all
      select dateadd(day, 1, thedate)
      from dates
      where dateadd(day, 1, thedate) <= getdate()
     )
select thedate, vals.val
from dates cross join
     (select distinct name from hypothetical) h cross apply
     (select top 1 val
      from hypothetical h2
      where h2.name = h.name and h2.date <= dates.thedate
      order by date desc
     ) vals;

Fill in missing dates across multiple partitions (Snowflake)

WITH fake_data AS (
    SELECT * FROM VALUES
        ('A','USD','2020-01-01'::date,3)
        ,('A','USD','2020-01-03'::date,4)
        ,('A','USD','2020-01-04'::date,2)
        ,('A','CAD','2021-01-04'::date,5)
        ,('A','CAD','2021-01-06'::date,6)
        ,('A','CAD','2020-01-07'::date,1)
        ,('B','USD','2019-01-01'::date,3)
        ,('B','USD','2019-01-03'::date,4)
        ,('B','USD','2019-01-04'::date,5)
        ,('B','CAD','2017-01-04'::date,3)
        ,('B','CAD','2017-01-06'::date,2)
        ,('B','CAD','2017-01-07'::date,2)
    d(Name,Currency,Date,Amount)
), partition_ranges AS (
    SELECT name,
        currency, 
        min(date) as min_date, 
        max(date) as max_date,
        datediff('days', min_date, max_date) as span
    FROM fake_data
    GROUP BY 1,2
), huge_range as (
    SELECT ROW_NUMBER() OVER(order by true)-1 as rn
    FROM table(generator(ROWCOUNT => 10000000))
), in_fill as (
    SELECT pr.name,
        pr.currency,
        dateadd('day', hr.rn, pr.min_date) as date
    FROM partition_ranges as pr
    JOIN huge_range as hr ON pr.span >= hr.rn
)
SELECT 
    i.name, 
    i.currency, 
    i.date,
    nvl(d.amount, 0) as amount
from in_fill as i
left join fake_data as d on d.name = i.name and d.currency = i.currency and d.date = i.date
order by 1,2,3;

How to fill missing values for missing dates with value from date before in sql bigquery?

Consider below:

WITH days_by_id AS (
  SELECT id, GENERATE_DATE_ARRAY(MIN(date), MAX(date)) days
    FROM sample
   GROUP BY id
)
SELECT date, id, 
       IFNULL(price, LAST_VALUE(price IGNORE NULLS) OVER (PARTITION BY id ORDER BY date)) AS price
  FROM days_by_id, UNNEST(days) date LEFT JOIN sample USING (id, date);

output :

Sample Image

Expanding the data for missing dates by each group

You can use a recursive query to generate the dates:

SELECT d.id,
       d.dt AS "DATE",
       COALESCE(
         LAST_VALUE(t.amount)
           IGNORE NULLS OVER (PARTITION BY d.id ORDER BY d.dt),
         0
       ) AS amount
FROM   table_name t
       RIGHT OUTER JOIN (
         WITH date_ranges (id, dt, max_dt) AS (
           SELECT id,
                  TRUNC(MIN("DATE"), 'MM'),
                  LAST_DAY(TRUNC(MAX("DATE"), 'MM'))
           FROM   table_name
           GROUP BY id
         UNION ALL
           SELECT id, dt + 1, max_dt
           FROM   date_ranges
           WHERE  dt < max_dt
         )
         SELECT id, dt
         FROM   date_ranges
       ) d
       ON (t.id = d.id AND t."DATE" = d.dt)
ORDER BY id, "DATE"

db<>fiddle here

How to fill in missing dates

Here is a query that would work. Start by cross joining all combinations of dates and users (add filters as needed), then left join the users table and calculate quota using the last_value() function (note that if you are using Snowflake, you must specify "rows between unbounded preceding and current row" as documented here):

with all_dates_users as (
--all combinations of dates and users
select date, user
from dates
cross join (select distinct user_email as user from users)
),
joined as (
--left join users table to the previous
select DU.date, DU.user, U.sent_at, U.user_email, U.score, U.quota
from all_dates_users DU
left join users U on U.sent_at = DU.date and U.user_email = DU.user
)
--calculate quota as previous quota using last_value() function
select date, user, nvl(score, 0) as score, last_value(quota) ignore nulls over (partition by user order by date desc rows between unbounded preceding and current row) as quota
from joined
order by date desc;

Hive SQL query to fill missing date values in table with nearest values between date range

Calculate additionally min and max dates for the whole dataset to determine date range required, also calculate min date per account to check if min date needs fixing. Then add additional step of calculation for both dates: check if it is boundary dates and if they are not as required, assign min and max values accordingly.

In this example Peter start date is 2021-05-24 nad Mary starts with 2021-05-23, so, the range was extended and 2021-05-23 record generated for Peter.
For Mary last date is 2021-05-30, missing rows generated at the end of the range.

with mytable as (--Demo dataset, use your table instead of this
select stack(10, --number of tuples
'Peter',float(50000),'2021-05-24',
'Peter',float(50035),'2021-05-25',
'Peter',float(50035),'2021-05-26',
'Peter',float(50610),'2021-05-28',
'Peter',float(51710),'2021-06-01',
'Peter',float(53028.1),'2021-06-02',
'Peter',float(53916.1),'2021-06-03', -------------end date greater than Mary
'Mary',float(50000),'2021-05-23', ----------------start date Less than Peter
'Mary',float(50035),'2021-05-25',
'Mary',float(53028.1),'2021-05-30'
) as (account_name,available_balance,Date_of_balance)
) --use your table instead of this CTE

select  account_name, available_balance, date_add(Date_of_balance,e.i) as Date_of_balance
from
(select account_name, available_balance, 
        case when min_date < min_date_account and Date_of_balance = min_date_account then min_date 
             else Date_of_balance 
         end Date_of_balance,
        
        case when (next_date is null) and (Date_of_balance = max_date) then Date_of_balance 
             when (Date_of_balance < max_date) then nvl(next_date,date_add(max_date,1)) 
         end as next_date
from
( --Get next_date to generate date range
select account_name,available_balance,Date_of_balance,
       lead(Date_of_balance,1) over (partition by account_name order by Date_of_balance) next_date,
       max(Date_of_balance) over() max_date, --total min and max dates all accounts should align
       min(Date_of_balance) over() min_date, 
       min(Date_of_balance) over(partition by account_name) min_date_account
  from mytable d  --use your table
) s 
) s lateral view outer posexplode(split(space(datediff(next_date,Date_of_balance)-1),'')) e as i,x --generate rows
order by account_name desc, Date_of_balance --this is to have order of rows like in your Converted Table

Result:

account_name    available_balance   date_of_balance
Peter             50000              2021-05-23
Peter             50000              2021-05-24
Peter             50035              2021-05-25
Peter             50035              2021-05-26
Peter             50035              2021-05-27
Peter             50610              2021-05-28
Peter             50610              2021-05-29
Peter             50610              2021-05-30
Peter             50610              2021-05-31
Peter             51710              2021-06-01
Peter             53028.1            2021-06-02
Peter             53916.1            2021-06-03
Mary              50000              2021-05-23
Mary              50000              2021-05-24
Mary              50035              2021-05-25
Mary              50035              2021-05-26
Mary              50035              2021-05-27
Mary              50035              2021-05-28
Mary              50035              2021-05-29
Mary              53028.1            2021-05-30
Mary              53028.1            2021-05-31
Mary              53028.1            2021-06-01
Mary              53028.1            2021-06-02
Mary              53028.1            2021-06-03

Note that lead function calculated differently also, it does not have default value, NULL indicates the end date available