Google Big Query SQL - Get Most Recent Column Value

Google Big Query SQL - Get Most Recent Column Value

-- Most recent row per user_email.
-- ROW_NUMBER() is used instead of RANK(): RANK() assigns 1 to every row tied
-- on the latest `time`, so a user with two rows at the same instant would be
-- returned twice. ROW_NUMBER() numbers tied rows 1, 2, ... so exactly one row
-- per user survives the filter.
-- Which of several tied rows is kept is not defined; add a tie-breaker column
-- to the ORDER BY if that matters.
SELECT
  user_email,
  user_first_name,
  user_last_name,
  time,
  is_deleted
FROM (
  SELECT
    user_email,
    user_first_name,
    user_last_name,
    time,
    is_deleted,
    ROW_NUMBER() OVER (PARTITION BY user_email ORDER BY time DESC) AS rn
  FROM table
)
WHERE rn = 1

Get the attributes of the most recent row in BigQuery?

Just use row_number():

-- For every (ADDRESS, POSTCODE) pair keep only the row with the latest DATE.
-- seqnum numbers the rows of each pair from newest (1) to oldest.
SELECT
  t.*
FROM (
  SELECT
    t.*,
    ROW_NUMBER() OVER (
      PARTITION BY ADDRESS, POSTCODE
      ORDER BY DATE DESC
    ) AS seqnum
  FROM [mytable] t
) t
WHERE
  seqnum = 1;

This is not an aggregation query. You want to filter the rows to get the most recent value.

Get the most recent Timestamp value

I'm going to refer to the timestamp field as ts_field for your example.

To get the latest timestamp, I would run the following query:

-- Latest value of ts_field across the whole table.
SELECT
  MAX(ts_field)
FROM
  `myproject.mydataset.mytable`

If your table is also partitioned on the timestamp field, you can do something like this to scan even fewer bytes:

-- Latest value of ts_field among the rows whose timestamp falls on the
-- current date. Yields NULL when no row matches the date filter.
SELECT
  MAX(ts_field)
FROM
  `myproject.mydataset.mytable`
WHERE
  DATE(ts_field) = CURRENT_DATE()

BigQuery - Get most recent data for each individual user

Use array_agg:

-- Most recent 'gplus' activity per email within the last 90 days.
-- ARRAY_AGG(... ORDER BY time_usec DESC LIMIT 1) keeps only the newest event
-- of each email group; [OFFSET(0)].* expands that single struct into the
-- columns date, event_type and event_name.
SELECT
  email,
  ARRAY_AGG(
    STRUCT(
      TIMESTAMP_MICROS(time_usec) AS date,
      event_type,
      event_name
    )
    ORDER BY time_usec DESC
    LIMIT 1
  )[OFFSET(0)].*
FROM `bqadminreporting.adminlogtracking.activity`
WHERE
  record_type LIKE 'gplus'
  AND time_usec > UNIX_MICROS(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 DAY))
GROUP BY email
ORDER BY email

Test example:

-- Self-contained example: inline sample events, then the newest event per email.
WITH mytable AS (
  SELECT TIMESTAMP '2020-01-30 07:10:19.088 UTC' AS date, 'user1@domain.com' AS email, 'post_change' AS event_type, 'create_post' AS event_name
  UNION ALL SELECT TIMESTAMP '2020-03-03 08:47:25.086485 UTC', 'user1@domain.com', 'coment_change', 'create_comment'
  UNION ALL SELECT TIMESTAMP '2020-03-23 09:10:09.522 UTC', 'user1@domain.com', 'post_change', 'create_post'
  UNION ALL SELECT TIMESTAMP '2020-03-23 09:49:00.337 UTC', 'user1@domain.com', 'plusone_change', 'remove_plusone'
  UNION ALL SELECT TIMESTAMP '2020-03-23 09:48:10.461 UTC', 'user1@domain.com', 'plusone_change', 'add_plusone'
  UNION ALL SELECT TIMESTAMP '2020-01-30 10:04:29.757005 UTC', 'user1@domain.com', 'coment_change', 'create_coment'
  UNION ALL SELECT TIMESTAMP '2020-03-28 08:52:50.711359 UTC', 'user2@domain.com', 'coment_change', 'create_coment'
  UNION ALL SELECT TIMESTAMP '2020-11-08 10:08:09.161325 UTC', 'user2@domain.com', 'coment_change', 'create_coment'
  UNION ALL SELECT TIMESTAMP '2020-04-21 15:28:10.022683 UTC', 'user3@domain.com', 'coment_change', 'create_coment'
  UNION ALL SELECT TIMESTAMP '2020-03-28 09:37:28.738863 UTC', 'user4@domain.com', 'coment_change', 'create_coment'
)
SELECT
  email,
  -- Keep only the newest struct per email and expand it into columns.
  ARRAY_AGG(
    STRUCT(date, event_type, event_name)
    ORDER BY date DESC
    LIMIT 1
  )[OFFSET(0)].*
FROM mytable
GROUP BY email

Sample Image

Google Big Query SQL - Get most recent unique value by date

Below is for BigQuery Standard SQL

#standardSQL
-- Number of distinct uuids with status 'active', per date.
-- Append "ORDER BY date" for chronologically sorted output.
SELECT
  date,
  COUNT(DISTINCT uuid) AS total_active
FROM
  `project.dataset.table`
WHERE
  status = 'active'
GROUP BY
  date

Update to address your "rephrased" question :o)

Below example is using dummy data from your question

#standardSQL
WITH `project.dataset.users` AS (
  -- Dummy status log: one row per (uuid, status, date).
  SELECT 3 AS uuid, 'inactive' AS status, DATE '2018-05-12' AS date
  UNION ALL SELECT 1, 'active', '2018-05-10'
  UNION ALL SELECT 1, 'inactive', '2018-05-08'
  UNION ALL SELECT 2, 'active', '2018-05-08'
  UNION ALL SELECT 3, 'active', '2018-05-04'
  UNION ALL SELECT 2, 'inactive', '2018-04-22'
  UNION ALL SELECT 3, 'inactive', '2018-04-18'
),
dates AS (
  -- One row per calendar day between the earliest and the latest logged date.
  SELECT day
  FROM UNNEST((
    SELECT GENERATE_DATE_ARRAY(MIN(date), MAX(date))
    FROM `project.dataset.users`
  )) AS day
),
active_users AS (
  -- Every 'active' row becomes an interval [first, last]:
  -- last is the day before the same uuid's next status row,
  -- or NULL when no later row exists.
  SELECT
    uuid,
    status,
    date AS first,
    DATE_SUB(next_status.date, INTERVAL 1 DAY) AS last
  FROM (
    SELECT
      uuid,
      date,
      status,
      LEAD(STRUCT(status, date)) OVER (PARTITION BY uuid ORDER BY date) AS next_status
    FROM `project.dataset.users` AS u
  )
  WHERE status = 'active'
)
-- Count the uuids whose active interval covers each day; an open-ended
-- interval (NULL last) is treated as covering the day being tested.
-- Append "ORDER BY day" for chronologically sorted output.
SELECT
  day,
  COUNT(DISTINCT uuid) AS actives
FROM dates AS d
INNER JOIN active_users AS u
  ON day BETWEEN first AND COALESCE(last, day)
GROUP BY day

with result

Row day         actives  
1 2018-05-04 1
2 2018-05-05 1
3 2018-05-06 1
4 2018-05-07 1
5 2018-05-08 2
6 2018-05-09 2
7 2018-05-10 3
8 2018-05-11 3
9 2018-05-12 2

How to get the latest record on big query based on field date

In BigQuery, you can use arrays for this. Assuming that you want the row with the most recent fecha per mpio:

-- Latest row (by fecha) per mpio for the filtered product.
-- mpio is the GROUP BY key, so it is deliberately left out of the struct:
-- carrying it in both places would produce two output columns named mpio
-- once the array is unnested, which BigQuery rejects as duplicate column names.
-- fecha is carried in the struct so the date that won the ordering is part
-- of the returned row.
select * except(ar)
from (
  select
    mpio,
    array_agg(
      struct(fecha, id, producto, precio, cod_dpto, presentacion)
      order by fecha desc limit 1
    ) as ar
  from database
  where
    categoria = "Arrendamiento de tierras"
    and tipo = "Arrendamiento de tierras"
    and producto = "Cabeza de ganado mensual para pastoreo, terreno inclinado/ondulado con agua"
    and presentacion = "Arriendo"
  group by mpio
) as x
-- ar holds exactly one struct per mpio, so this yields one row per mpio.
cross join unnest(x.ar)

A more standard approach is to use window functions:

-- Latest row (by fecha) per mpio for the filtered product, via ROW_NUMBER().
with ranked as (
  select
    fecha,
    id,
    producto,
    precio,
    mpio,
    cod_dpto,
    presentacion,
    -- rn = 1 marks the newest fecha within each mpio.
    row_number() over (partition by mpio order by fecha desc) as rn
  from database
  where
    categoria = "Arrendamiento de tierras"
    and tipo = "Arrendamiento de tierras"
    and producto = "Cabeza de ganado mensual para pastoreo, terreno inclinado/ondulado con agua"
    and presentacion = "Arriendo"
)
select * except(rn)
from ranked
where rn = 1

Select latest N records in BigQuery based on TimeStamp - More Optimized

Use row_number(). For instance, to get the two most recent records for each user_n:

-- Two most recent rows per user_n inside the requested time window.
-- The table path is quoted as a single backtick-delimited identifier; the
-- hyphens in the project and dataset names make the quoting necessary.
SELECT sd.* EXCEPT (seqnum)
FROM (
  SELECT
    sd.*,
    -- seqnum = 1 is the newest row of each user_n.
    ROW_NUMBER() OVER (PARTITION BY user_n ORDER BY timestamp_column DESC) AS seqnum
  FROM `project-id.huge-dataset-id.streaming-data` sd
  -- BETWEEN includes both endpoints of the window.
  WHERE timestamp_column BETWEEN '2015-06-10 14:20' AND '2015-06-10 14:30'
) sd
WHERE seqnum <= 2;

If you want two random rows, use ORDER BY rand() instead.

The timestamp range is included because that is in the question. However, to get the two most recent rows, you can remove it.

Scalable Solution to get latest row for each ID in BigQuery

Quick and dirty option - combine both of your queries into one: first get all records with the latest collection_time (using your second query), and then dedup them using your first query:

-- Returns one row per ID, chosen among the rows that carry that ID's
-- maximum collection_time.
-- NOTE(review): the rn column is part of the output because the outer query
-- selects *.
SELECT * FROM (
-- Number the surviving rows within each ID so duplicates can be dropped.
-- There is no ORDER BY in the window, so which duplicate gets rn = 1 is not
-- determined by this query.
-- NOTE(review): tab1.ID is referenced here, outside the subquery that defines
-- the tab1 alias; this looks like it relies on legacy-SQL name scoping —
-- confirm the dialect before reusing.
SELECT *, ROW_NUMBER() OVER (PARTITION BY tab1.ID) AS rn
FROM (
SELECT tab1.*
FROM mytable AS tab1
INNER JOIN (
-- Latest collection_time per ID.
SELECT ID, MAX(collection_time) AS second_time
FROM mytable GROUP BY ID
) AS tab2
-- Keep only the rows sitting at their ID's latest collection_time.
ON tab1.ID=tab2.ID AND tab1.collection_time=tab2.second_time
)
)
WHERE rn = 1

And with Standard SQL (proposed by S.Mohsen sh)

-- One row per ID, taken from the rows at that ID's maximum collection_time.
WITH myTable AS (
  -- Placeholder data; substitute the real table.
  SELECT
    1 AS ID,
    1 AS collection_time
),
latest_per_id AS (
  -- Latest collection_time for every ID.
  SELECT
    ID,
    MAX(collection_time) AS latest_time
  FROM myTable
  GROUP BY ID
),
joint AS (
  -- Rows of myTable that sit at their ID's latest collection_time.
  SELECT src.*
  FROM myTable AS src
  INNER JOIN latest_per_id AS latest
    ON src.ID = latest.ID
    AND src.collection_time = latest.latest_time
),
numbered AS (
  -- Several rows can share the latest collection_time; number them so that
  -- only one is kept. No ORDER BY is given, so the choice is not determined.
  SELECT
    *,
    ROW_NUMBER() OVER (PARTITION BY ID) AS rn
  FROM joint
)
SELECT * EXCEPT (rn)
FROM numbered
WHERE rn = 1


Related Topics



Leave a reply



Submit