Group Data by the Change of Grouping Column Value in Order

Group data by the change of grouping column value in order

There's a fairly well-known technique for solving this kind of problem, involving the difference of two ROW_NUMBER() calls, like this:

WITH marked AS (
    SELECT
        *,
        grp = ROW_NUMBER() OVER (PARTITION BY product ORDER BY date)
            - ROW_NUMBER() OVER (PARTITION BY product, price ORDER BY date)
    FROM #ph
)
SELECT
    product,
    date_from = MIN(date),
    date_to   = MAX(date),
    price
FROM marked
GROUP BY
    product,
    price,
    grp
ORDER BY
    product,
    MIN(date)

Output:

product  date_from   date_to     price
-------  ----------  ----------  -----
1        2012-01-01  2012-01-04  1
1        2012-01-05  2012-01-08  2
1        2012-01-09  2012-01-12  1
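
For comparison, here is a minimal pandas sketch of the same two-ROW_NUMBER() trick; the sample data is made up to mirror the output above (one product whose price goes 1 -> 2 -> 1), so names and values are illustrative only:

import pandas as pd

# Illustrative data mirroring the output above.
ph = pd.DataFrame({
    "product": 1,
    "date": pd.date_range("2012-01-01", periods=12),
    "price": [1] * 4 + [2] * 4 + [1] * 4,
})
rn_product = ph.groupby("product").cumcount()            # ROW_NUMBER() ... PARTITION BY product
rn_price = ph.groupby(["product", "price"]).cumcount()   # ROW_NUMBER() ... PARTITION BY product, price
ph["grp"] = rn_product - rn_price                        # constant within each consecutive price run
out = (ph.groupby(["product", "price", "grp"], sort=False)
         .agg(date_from=("date", "min"), date_to=("date", "max"))
         .reset_index()
         .drop(columns="grp"))
print(out)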

Set value for group by order

This is a classic gaps-and-islands problem.

There are many solutions. A common one is to use LAG to identify the rows where the value changes, then a conditional windowed COUNT to assign the group numbers:

WITH PrevValues AS (
    SELECT *,
        LAG(COL_1) OVER (ORDER BY COL_2) AS PrevCol1
    FROM YourTable t
)
SELECT COL_1, COL_2, COL_3,
    COUNT(CASE WHEN PrevCol1 = COL_1 THEN NULL ELSE 1 END)
        OVER (ORDER BY COL_2 ROWS UNBOUNDED PRECEDING) AS New_Col_3
FROM PrevValues t


R: group_id by changing row values

We can use rleid (run-length-encoding id) from data.table, which increments by 1 for each element that is not equal to the previous element:

library(data.table)
library(dplyr)
df %>%
    mutate(newcol = rleid(value))

and for the second dataset, it would be

df %>%
    mutate(new = rleid(value, value2))
#   value value2 desired_id new
# 1     a      a          1   1
# 2     a      a          1   1
# 3     a      c          2   2
# 4     b      b          3   3
# 5     b      b          3   3
# 6     b      c          4   4
# 7     a      a          5   5
# 8     a      a          5   5
# 9     a      d          6   6

Or with rle from base R

df$newcol <- with(rle(df$value), rep(seq_along(values), lengths))
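
A rough pandas analogue of the two-column rleid() call, in case the same idea is useful outside R (reusing the example values above): a new group starts whenever value or value2 differs from the previous row.

import pandas as pd

# Same example values as above.
df = pd.DataFrame({
    "value":  list("aaabbbaaa"),
    "value2": ["a", "a", "c", "b", "b", "c", "a", "a", "d"],
})
changed = df[["value", "value2"]].ne(df[["value", "value2"]].shift()).any(axis=1)
df["new"] = changed.cumsum()    # 1 1 2 3 3 4 5 5 6, matching desired_id
print(df)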

Pandas: how to group on column change?

Compare the values with their shifted counterparts using Series.ne, take the cumulative sum with Series.cumsum to build the group identifiers, then aggregate with GroupBy.agg:

g = df["class"].ne(df["class"].shift()).cumsum()
df = (df.groupby(['class', g], sort=False)['speed'].agg(['count', 'mean'])
        .reset_index(level=1, drop=True)
        .reset_index())
print(df)
   class  count       mean
0      1     12   7.500000
1      2      2  26.000000
2      1      5  10.400000
3      2      4  29.250000
4      3      2  42.500000
5      4      1  54.000000
6      5      3  60.666667
7      3      1  40.000000
8      1      3  14.666667
9      2      1  24.000000

reorder rows by group based on another index value in r

Assuming you have a named index vector v,

v <- c(a=2, b=3, c=1)

in base R you could use match to create a temporary id column in the following way, then order your result.

ds1$id <- v[match(substr(ds1$name, 1, 1), names(v))]
ds1[order(ds1$id), -5]
#   name v1 v2 v3
# 7   c1  6  1  2
# 8   c2  5  1  2
# 9   c3  4  1  2
# 1   a1  4  1  2
# 2   a2  1  3  2
# 3   a3  1  5  2
# 4   b1  1  2  2
# 5   b2  1  4  2
# 6   b3  1  3  2
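
A hedged pandas sketch of the same match()/order() idea, using the values from the Data section below (the dict v plays the role of the named vector): map the first letter of name to a rank, then sort by it.

import pandas as pd

# Values taken from the Data section below.
v = {"a": 2, "b": 3, "c": 1}
ds1 = pd.DataFrame({
    "name": ["a1", "a2", "a3", "b1", "b2", "b3", "c1", "c2", "c3"],
    "v1":   [4, 1, 1, 1, 1, 1, 6, 5, 4],
    "v2":   [1, 3, 5, 2, 4, 3, 1, 1, 1],
    "v3":   [2] * 9,
})
ds1["id"] = ds1["name"].str[0].map(v)            # rank looked up from v
print(ds1.sort_values("id").drop(columns="id"))  # c rows first, then a, then b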

Without the "name" column you could split the rows into sequences of length n, sample them, and rbind them back together.

ds2 <- ds1[2:4]  ## generate data w/o "name" column

n <- 3
do.call(rbind, sample(split(ds2, rep(seq(nrow(ds2)/n), each=n))))
#     v1 v2 v3
# 3.7  6  1  2
# 3.8  5  1  2
# 3.9  4  1  2
# 1.1  4  1  2
# 1.2  1  3  2
# 1.3  1  5  2
# 2.4  1  2  2
# 2.5  1  4  2
# 2.6  1  3  2

If the division by n leaves a remainder, as is the case with n = 4, you'll get a warning and the excess rows are assigned to other groups. Whether that is acceptable depends on your use case.
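
Continuing the rough pandas sketch above, the block-shuffling step could look something like this (again assuming the row count divides evenly by n; ds1 refers to the frame built in the earlier sketch):

import numpy as np
import pandas as pd

n = 3
ds2 = ds1.drop(columns=["name", "id"])            # data without the "name" column
block = np.repeat(np.arange(len(ds2) // n), n)    # 0,0,0,1,1,1,2,2,2
order = np.random.permutation(len(ds2) // n)      # random order of the blocks
shuffled = pd.concat(ds2[block == b] for b in order)
print(shuffled)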


Data

ds1 <- structure(list(name = c("a1", "a2", "a3", "b1", "b2", "b3", "c1", 
"c2", "c3"), v1 = c(4L, 1L, 1L, 1L, 1L, 1L, 6L, 5L, 4L), v2 = c(1L,
3L, 5L, 2L, 4L, 3L, 1L, 1L, 1L), v3 = c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), id = c(2, 2, 2, 3, 3, 3, 1, 1, 1)), row.names = c(NA,
-9L), class = "data.frame")

Does the order of columns matter in a group by clause?

No, the order doesn't matter for the GROUP BY clause.

MySQL and SQLite are the only databases I'm aware of that allow you to select columns omitted from the GROUP BY (non-standard and not portable), but the order doesn't matter there either.
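
As a rough analogy in pandas terms (made-up data, purely an illustration rather than a statement about any SQL engine): grouping by the same keys in either order yields the same groups and the same aggregates; only the ordering/nesting of the result differs.

import pandas as pd

# Made-up data purely to illustrate the point.
df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "y", "x"], "v": [10, 20, 30]})
s1 = df.groupby(["a", "b"])["v"].sum()
s2 = df.groupby(["b", "a"])["v"].sum()
print(s1.sort_index().equals(s2.swaplevel().sort_index()))   # True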

Grouping and counting rows by value until it changes

That was interesting :)

;WITH cte AS (
    SELECT Messages.Message, Timestamp,
        ROW_NUMBER() OVER (PARTITION BY Message ORDER BY Timestamp) AS gn,
        ROW_NUMBER() OVER (ORDER BY Timestamp) AS rn
    FROM Messages
), cte2 AS (
    SELECT Message, Timestamp, gn, rn, gn - rn AS gb
    FROM cte
), cte3 AS (
    SELECT Message, MIN(Timestamp) AS Ts, COUNT(1) AS Cnt
    FROM cte2
    GROUP BY Message, gb
)
SELECT Message, Cnt FROM cte3
ORDER BY Ts

Here is the result set:

Message  Cnt
A        2
B        1
A        3
B        1

The query could be shorter, but I post it this way so you can see what's happening. The result is exactly as requested. The most important part is gn - rn: the idea is to number the rows within each partition and, at the same time, number the rows across the whole set; subtracting one from the other gives the 'rank' of each run of consecutive messages. The intermediate query below shows those values for each row.

;WITH cte AS (
    SELECT Messages.Message, Timestamp,
        ROW_NUMBER() OVER (PARTITION BY Message ORDER BY Timestamp) AS gn,
        ROW_NUMBER() OVER (ORDER BY Timestamp) AS rn
    FROM Messages
), cte2 AS (
    SELECT Message, Timestamp, gn, rn, gn - rn AS gb
    FROM cte
)
SELECT * FROM cte2

Message  Timestamp                gn  rn  gb
A        2015-03-29 00:00:00.000   1   1   0
A        2015-03-29 00:01:00.000   2   2   0
B        2015-03-29 00:02:00.000   1   3  -2
A        2015-03-29 00:03:00.000   3   4  -1
A        2015-03-29 00:04:00.000   4   5  -1
A        2015-03-29 00:05:00.000   5   6  -1
B        2015-03-29 00:06:00.000   2   7  -5
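
If it helps to see the same trick outside SQL, here is a minimal pandas sketch of gn - rn, using a made-up message sequence that matches the result set above:

import pandas as pd

# Made-up sequence of messages: A, A, B, A, A, A, B
df = pd.DataFrame({"Message": list("AABAAAB")})
df["gn"] = df.groupby("Message").cumcount()   # row number within each Message
df["rn"] = range(len(df))                     # row number over the whole set
df["gb"] = df["gn"] - df["rn"]                # constant within each consecutive run
print(df.groupby(["Message", "gb"], sort=False).size())   # A 2, B 1, A 3, B 1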

Select one value from a group based on order from other columns

SELECT g, a, b, v
FROM (
    SELECT *,
        @rn := IF(g = @g, @rn + 1, 1) rn,
        @g := g
    FROM (SELECT @g := NULL, @rn := 0) x,
        tab
    ORDER BY g, a DESC, b DESC, v
) X
WHERE rn = 1;

Single pass. All the other solutions look O(n^2) to me.
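
A rough pandas equivalent of the same idea, with a small made-up tab so the sketch is self-contained (only the table and column names come from the query above): sort by g, a descending, b descending, v, then keep the first row of each group.

import pandas as pd

# Hypothetical sample data; only the column names come from the query above.
tab = pd.DataFrame({
    "g": [1, 1, 2, 2],
    "a": [3, 5, 1, 1],
    "b": [4, 2, 7, 7],
    "v": [10, 20, 30, 5],
})
first = (tab.sort_values(["g", "a", "b", "v"], ascending=[True, False, False, True])
            .groupby("g")
            .head(1))    # one row per g, chosen by the sort order
print(first)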

group by pandas dataframe and select latest in each group

Use idxmax in groupby and slice the df with loc:

df.loc[df.groupby('id').date.idxmax()]

    id  product        date
2  220     6647  2014-10-16
5  826     3380  2015-05-19
8  901     4555  2014-11-01

