Group data by the change of grouping column value in order
There's a (more or less) known technique of solving this kind of problem, involving two ROW_NUMBER()
calls, like this:
-- Collapse consecutive rows that share a price (per product, in date order)
-- into one row per run. Inside a run, the per-product row number and the
-- per-(product, price) row number advance together, so their difference
-- stays constant and can serve as the run identifier.
WITH marked AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY product ORDER BY date)
            - ROW_NUMBER() OVER (PARTITION BY product, price ORDER BY date) AS grp
    FROM #ph
)
SELECT
    m.product,
    MIN(m.date) AS date_from,
    MAX(m.date) AS date_to,
    m.price
FROM marked AS m
GROUP BY
    m.product,
    m.price,
    m.grp
ORDER BY
    m.product,
    date_from
Output:
product date_from date_to price
------- ---------- ------------- -----
1 2012-01-01 2012-01-04 1
1 2012-01-05 2012-01-08 2
1 2012-01-09 2012-01-12 1
Set value for group by order
This is a classic gaps-and-islands problem.
There are many solutions. A common one is to use LAG
to identify rows which change value, then a conditional windowed COUNT
to assign the group numbers
-- Step 1: attach the previous row's COL_1 (in COL_2 order) to every row.
WITH PrevValues AS (
    SELECT
        src.*,
        LAG(src.COL_1) OVER (ORDER BY src.COL_2) AS PrevCol1
    FROM YourTable AS src
)
-- Step 2: running count of the rows where COL_1 is not equal to the previous
-- row's value. The first row has no predecessor (PrevCol1 is NULL), so the
-- comparison is not true and it is counted as the start of group 1.
SELECT
    pv.COL_1,
    pv.COL_2,
    pv.COL_3,
    COUNT(CASE WHEN pv.PrevCol1 = pv.COL_1 THEN NULL ELSE 1 END)
        OVER (ORDER BY pv.COL_2 ROWS UNBOUNDED PRECEDING) AS New_Col_3
FROM PrevValues AS pv
db<>fiddle
R: group_id by changing row values
We can use rleid
(run-length-encoding id) from data.table
which basically increments by 1 for each element that is not equal to the previous element
# data.table supplies rleid(); dplyr supplies mutate()
library(data.table)
library(dplyr)
# rleid() gives every run of consecutive equal values its own id: 1, 2, 3, ...
mutate(df, newcol = rleid(value))
and for the second dataset, it would be
# a new id starts whenever either value or value2 differs from the previous row
mutate(df, new = rleid(value, value2))
# value value2 desired_id new
#1 a a 1 1
#2 a a 1 1
#3 a c 2 2
#4 b b 3 3
#5 b b 3 3
#6 b c 4 4
#7 a a 5 5
#8 a a 5 5
#9 a d 6 6
Or with rle
from base R
# rle() yields one entry per run; repeat each run's index by that run's length
df[["newcol"]] <- with(rle(df[["value"]]), rep(seq_along(lengths), times = lengths))
Pandas: how to group on column change?
Compare each value with the shifted (previous) value for inequality, take the Series.cumsum of the result to label consecutive runs, and then aggregate with GroupBy.agg:
# True wherever "class" differs from the previous row; the cumulative sum
# turns those change points into a run id shared by consecutive equal rows.
run_id = df["class"].ne(df["class"].shift()).cumsum()
# Aggregate speed per (class, run) without sorting the groups.
per_run = df.groupby(["class", run_id], sort=False)["speed"].agg(["count", "mean"])
# Drop the helper run-id level, then turn "class" back into a regular column.
df = per_run.droplevel(1).reset_index()
print(df)
class count mean
0 1 12 7.500000
1 2 2 26.000000
2 1 5 10.400000
3 2 4 29.250000
4 3 2 42.500000
5 4 1 54.000000
6 5 3 60.666667
7 3 1 40.000000
8 1 3 14.666667
9 2 1 24.000000
reorder rows by group based on another index value in r
Assuming you have a named index vector v
,
# named vector: the name is the group prefix, the value its desired position
v <- c(a = 2, b = 3, c = 1)
in base R you could use match
to create a temporary id
column in the following way, then order
your result.
# look up each row's sort key via the first character of its name
prefix <- substr(ds1$name, 1, 1)
ds1$id <- v[match(prefix, names(v))]
# sort by that key and leave out the fifth column (the temporary id)
ds1[order(ds1$id), -5]
# name v1 v2 v3
# 7 c1 6 1 2
# 8 c2 5 1 2
# 9 c3 4 1 2
# 1 a1 4 1 2
# 2 a2 1 3 2
# 3 a3 1 5 2
# 4 b1 1 2 2
# 5 b2 1 4 2
# 6 b3 1 3 2
Without the "name" column you could split the rows into sequences (seq) of length n, sample them and rbind them back together.
ds2 <- ds1[2:4]  # columns 2 to 4 only, i.e. the data without the "name" column
n <- 3           # number of consecutive rows that form one block
# block number for each row: 1,1,1,2,2,2,... (n copies of each)
block_id <- rep(seq(nrow(ds2) / n), each = n)
# split into blocks, shuffle the list of blocks, and stack them again
do.call(rbind, sample(split(ds2, block_id)))
# v1 v2 v3
# 3.7 6 1 2
# 3.8 5 1 2
# 3.9 4 1 2
# 1.1 4 1 2
# 1.2 1 3 2
# 1.3 1 5 2
# 2.4 1 2 2
# 2.5 1 4 2
# 2.6 1 3 2
When the division by n has a remainder, as is the case with n=4, you'll get a warning, and the excess lines are assigned to other groups. Don't know if that suffices for you?
Data
# Example data: nine rows named a1..c3, three integer value columns, and the
# numeric sort key "id" (2 for a*, 3 for b*, 1 for c*).
ds1 <- data.frame(
  name = c("a1", "a2", "a3", "b1", "b2", "b3", "c1", "c2", "c3"),
  v1 = c(4L, 1L, 1L, 1L, 1L, 1L, 6L, 5L, 4L),
  v2 = c(1L, 3L, 5L, 2L, 4L, 3L, 1L, 1L, 1L),
  v3 = rep(2L, 9),
  id = c(2, 2, 2, 3, 3, 3, 1, 1, 1),
  stringsAsFactors = FALSE
)
Does the order of columns matter in a group by clause?
No, the order doesn't matter for the GROUP BY clause.
MySQL and SQLite are the only databases I'm aware of that allow you to select columns which are omitted from the group by (non-standard, not portable) but the order doesn't matter there either.
Grouping and counting rows by value until it changes
That was interesting :)
-- Count how many times each Message repeats consecutively (in Timestamp order).
;WITH numbered AS (
    -- gn: position within the rows of the same Message; rn: position overall
    SELECT
        m.Message,
        m.Timestamp,
        ROW_NUMBER() OVER (PARTITION BY m.Message ORDER BY m.Timestamp) AS gn,
        ROW_NUMBER() OVER (ORDER BY m.Timestamp) AS rn
    FROM Messages AS m
),
keyed AS (
    -- gn - rn is the grouping key used below to separate the runs
    SELECT
        Message,
        Timestamp,
        gn,
        rn,
        gn - rn AS gb
    FROM numbered
),
runs AS (
    -- one row per run: its first timestamp and its length
    SELECT
        Message,
        MIN(Timestamp) AS Ts,
        COUNT(*) AS Cnt
    FROM keyed
    GROUP BY
        Message,
        gb
)
SELECT
    Message,
    Cnt
FROM runs
ORDER BY Ts
Here is the result set:
Message Cnt
A 2
B 1
A 3
B 1
The query may be shorter but I post it that way so you can see what's happening.
The result is exactly as requested. The most important part is gn - rn:
the idea is to number the rows within each partition and, at the same time, number the rows in the whole set; if you then subtract one from the other you'll get the 'rank' of each group.
-- Show the intermediate numbering: gn (per Message), rn (overall) and their
-- difference gb for every row.
;WITH numbered AS (
    SELECT
        m.Message,
        m.Timestamp,
        ROW_NUMBER() OVER (PARTITION BY m.Message ORDER BY m.Timestamp) AS gn,
        ROW_NUMBER() OVER (ORDER BY m.Timestamp) AS rn
    FROM Messages AS m
),
keyed AS (
    SELECT
        Message,
        Timestamp,
        gn,
        rn,
        gn - rn AS gb
    FROM numbered
)
SELECT
    Message,
    Timestamp,
    gn,
    rn,
    gb
FROM keyed
Message Timestamp gn rn gb
A 2015-03-29 00:00:00.000 1 1 0
A 2015-03-29 00:01:00.000 2 2 0
B 2015-03-29 00:02:00.000 1 3 -2
A 2015-03-29 00:03:00.000 3 4 -1
A 2015-03-29 00:04:00.000 4 5 -1
A 2015-03-29 00:05:00.000 5 6 -1
B 2015-03-29 00:06:00.000 2 7 -5
Select one value from a group based on order from other columns
-- Return, for every group g, the single row that sorts first by
-- a DESC, b DESC, v ASC.
-- Uses ROW_NUMBER() (window functions, MySQL 8.0+) instead of @user variables:
-- MySQL documents the evaluation order of expressions that assign user
-- variables within one statement as undefined, and assigning them inside
-- expressions is deprecated, so a variable-based ranking is not guaranteed
-- to follow the ORDER BY.
-- Rows whose g is NULL are ranked together as one partition.
SELECT
    ranked.g,
    ranked.a,
    ranked.b,
    ranked.v
FROM (
    SELECT
        t.g,
        t.a,
        t.b,
        t.v,
        ROW_NUMBER() OVER (
            PARTITION BY t.g
            ORDER BY t.a DESC, t.b DESC, t.v
        ) AS rn
    FROM tab AS t
) AS ranked
WHERE ranked.rn = 1;
Single pass. All the other solutions look O(n^2) to me.
group by pandas dataframe and select latest in each group
use idxmax
in groupby
and slice df
with loc
# idxmax() returns, per id, the index label of the row holding the largest date;
# .loc then selects exactly those rows from df.
df.loc[df.groupby("id")["date"].idxmax()]
id product date
2 220 6647 2014-10-16
5 826 3380 2015-05-19
8 901 4555 2014-11-01
Related Topics
Difference between '->>' and '->' in Postgres SQL
Has Anyone Had Any Success in Unit Testing SQL Stored Procedures
How to Drop Multiple Tables in Postgresql Using a Wildcard
How to Select SQL Results Based on Multiple Tables
Maintaining Referential Integrity - Good or Bad
Sqlite: Preventing Duplicate Rows
Which One Have Better Performance:Derived Tables or Temporary Tables
Diff Between Top 1 1 and Select 1 in SQL Select Query
How to Get Node Name and Values from an Xml Variable in T-Sql
SQL Server: Invalid Column Name
Set Database from Single User Mode to Multi User
Why Does SQL Server Keep Executing After Raiserror When Xact_Abort Is On
How to Repair a Corrupted Mptt Tree (Nested Set) in the Database Using SQL
Bulk Insert, SQL Server 2000, Unix Linebreaks
Fastest Way Merge Two SQLite Databases