Make a Group_Indices Based on Several Columns

Using igraph, get the cluster membership, then map it onto the names:

library(igraph)

# convert to a graph: columns 2 and 3 (G1, G2) form the edge list,
# column 1 (id) comes along as an edge attribute; then get cluster membership ids
g <- graph_from_data_frame(df1[, c(2, 3, 1)])
myGroups <- components(g)$membership

myGroups
# A B C D E F Z X Y W V U s T
# 1 1 2 3 4 4 1 1 1 2 2 2 3 4

# then map on names
df1$group <- myGroups[df1$G1]


df1
# id G1 G2 group
# 1 1 A Z 1
# 2 2 A X 1
# 3 3 B X 1
# 4 4 B Y 1
# 5 5 C W 2
# 6 6 C V 2
# 7 7 C U 2
# 8 8 D s 3
# 9 9 E T 4
# 10 10 F T 4
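For reference, an input df1 consistent with the printed output above (reconstructed here, so treat the exact values as an assumption) would be:

# hypothetical input, inferred from the output shown above
df1 <- data.frame(id = 1:10,
                  G1 = c("A", "A", "B", "B", "C", "C", "C", "D", "E", "F"),
                  G2 = c("Z", "X", "X", "Y", "W", "V", "U", "s", "T", "T"))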

R tidyverse: unique identifier for union of the sets of two columns

Perhaps we can use igraph here as well, treating the class1/class2 pairs as edges and taking connected components:

library(dplyr)
library(purrr)
library(igraph)

# build a graph from the class1/class2 pairs and extract cluster membership
df %>%
  select(-id) %>%
  graph_from_data_frame() %>%
  components() %>%
  pluck("membership") -> cls

# index the named membership vector by class1
df %>%
  mutate(group_size = cls[class1])
# A tibble: 6 x 4
# id class1 class2 group_size
# <dbl> <chr> <chr> <dbl>
#1 1 A L1 1
#2 2 A L1 1
#3 3 B L1 1
#4 4 B L2 1
#5 5 C L3 2
#6 6 D L4 3

How to create group indices for nested groups in R

dplyr

Using cumsum and !duplicated with dplyr

df %>%
  group_by(id) %>%
  mutate(daynum = cumsum(!duplicated(dayweek)))


# A tibble: 13 x 3
# Groups: id [2]
id dayweek daynum
<dbl> <dbl> <int>
1 1 1 1
2 1 1 1
3 1 4 2
4 1 4 2
5 1 5 3
6 1 5 3
7 2 1 1
8 2 1 1
9 2 2 2
10 2 2 2
11 2 3 3
12 2 3 3
13 2 3 3

tapply from base R

unlist(tapply(df$dayweek, df$id, function(x) cumsum(!duplicated(x))))

1 1 2 2 3 3 1 1 2 2 3 3 3
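The cumsum(!duplicated(x)) idiom works because !duplicated(x) is TRUE exactly at the first occurrence of each value, so the running sum increases whenever a new value appears. A quick standalone check, with values made up for illustration:

x <- c(1, 1, 4, 4, 5)
!duplicated(x)          # TRUE FALSE TRUE FALSE TRUE
cumsum(!duplicated(x))  # 1 1 2 2 3

Note that this counts distinct values seen so far, so it matches a day index only when equal dayweek values are contiguous within each id.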

Assign unique ID based on two columns

We can do this in base R without any group-by operation:

df$ID <- cumsum(!duplicated(df[1:2]))
df
# School Student Year ID
#1 A 10 1999 1
#2 A 10 2000 1
#3 A 20 1999 2
#4 A 20 2000 2
#5 A 20 2001 2
#6 B 10 1999 3
#7 B 10 2000 3

NOTE: this assumes that 'School' and 'Student' are ordered, i.e. identical pairs are contiguous; otherwise a pair that reappears later would inherit the ID of the most recent new pair.
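If the pairs are not contiguous, one order-independent base R alternative (a sketch, not part of the original answer) is to match a composite key against its unique values:

# hypothetical variant: IDs follow order of first appearance,
# and contiguity of the School/Student pairs is not required
key <- paste(df$School, df$Student, sep = "_")  # "_" assumed absent from the values
df$ID <- match(key, unique(key))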


Or using tidyverse

library(dplyr)
df %>%
  mutate(ID = group_indices_(df, .dots = c("School", "Student")))
# School Student Year ID
#1 A 10 1999 1
#2 A 10 2000 1
#3 A 20 1999 2
#4 A 20 2000 2
#5 A 20 2001 2
#6 B 10 1999 3
#7 B 10 2000 3

As @radek mentioned, in recent versions (dplyr 0.8.0) we get a notification that group_indices_ is deprecated; use group_indices instead:

df %>%
  mutate(ID = group_indices(., School, Student))
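In dplyr 1.0.0 and later group_indices() is itself deprecated; a sketch of the current idiom, using cur_group_id() inside a grouped mutate:

library(dplyr)
df %>%
  group_by(School, Student) %>%
  mutate(ID = cur_group_id()) %>%
  ungroup()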

Apply multiple functions to multiple groupby columns

The second half of the currently accepted answer is outdated and has two deprecations. First and most important, you can no longer pass a dictionary of dictionaries to the agg groupby method. Second, never use .ix.

If you want to work with two separate columns at the same time, I would suggest using the apply method, which implicitly passes a DataFrame to the applied function. Let's use a DataFrame similar to the one above:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(4, 4), columns=list('abcd'))
df['group'] = [0, 0, 1, 1]
df

a b c d group
0 0.418500 0.030955 0.874869 0.145641 0
1 0.446069 0.901153 0.095052 0.487040 0
2 0.843026 0.936169 0.926090 0.041722 1
3 0.635846 0.439175 0.828787 0.714123 1

A dictionary mapping column names to aggregation functions is still a perfectly good way to perform an aggregation.

df.groupby('group').agg({'a': ['sum', 'max'],
                         'b': 'mean',
                         'c': 'sum',
                         'd': lambda x: x.max() - x.min()})

a b c d
sum max mean sum <lambda>
group
0 0.864569 0.446069 0.466054 0.969921 0.341399
1 1.478872 0.843026 0.687672 1.754877 0.672401

If you don't like that ugly lambda column name, you can use a normal function and supply a custom name to the special __name__ attribute like this:

def max_min(x):
    return x.max() - x.min()

max_min.__name__ = 'Max minus Min'

df.groupby('group').agg({'a': ['sum', 'max'],
                         'b': 'mean',
                         'c': 'sum',
                         'd': max_min})

a b c d
sum max mean sum Max minus Min
group
0 0.864569 0.446069 0.466054 0.969921 0.341399
1 1.478872 0.843026 0.687672 1.754877 0.672401

Using apply and returning a Series

Now, if you have multiple columns that need to interact together, you cannot use agg, which implicitly passes a Series to the aggregating function. With apply, the entire group is passed into the function as a DataFrame.

I recommend making a single custom function that returns a Series of all the aggregations. Use the Series index as labels for the new columns:

def f(x):
    d = {}
    d['a_sum'] = x['a'].sum()
    d['a_max'] = x['a'].max()
    d['b_mean'] = x['b'].mean()
    d['c_d_prodsum'] = (x['c'] * x['d']).sum()
    return pd.Series(d, index=['a_sum', 'a_max', 'b_mean', 'c_d_prodsum'])

df.groupby('group').apply(f)

a_sum a_max b_mean c_d_prodsum
group
0 0.864569 0.446069 0.466054 0.173711
1 1.478872 0.843026 0.687672 0.630494

If you are in love with MultiIndexes, you can still return a Series with one like this:

def f_mi(x):
    d = []
    d.append(x['a'].sum())
    d.append(x['a'].max())
    d.append(x['b'].mean())
    d.append((x['c'] * x['d']).sum())
    return pd.Series(d, index=[['a', 'a', 'b', 'c_d'],
                               ['sum', 'max', 'mean', 'prodsum']])

df.groupby('group').apply(f_mi)

a b c_d
sum max mean prodsum
group
0 0.864569 0.446069 0.466054 0.173711
1 1.478872 0.843026 0.687672 0.630494

Group-specific ID numbers using group_indices or similar in R

Here's one way to do it:

library(dplyr)

# number the unique Site/Day rows within each Site, then join back on
df <- df %>%
  left_join(unique(df) %>% group_by(Site) %>% mutate(Day.Number = 1:n()))

head(df)
# Site Day Day.Number
# 1 A 21 1
# 2 A 21 1
# 3 A 21 1
# 4 A 21 1
# 5 A 21 1
# 6 A 21 1
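A join-free variant of the same idea (a sketch assuming the same df with Site and Day columns) numbers each Day by its first appearance within Site:

library(dplyr)
df %>%
  group_by(Site) %>%
  mutate(Day.Number = match(Day, unique(Day))) %>%
  ungroup()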

Group rows if the value of a column appears in the other column

An option would be to replace the values based on the intersecting elements and then do the aggregation:

# rows whose col1 value also appears somewhere in col2
i1 <- df$col1 %in% df$col2
# replace those col1 values with the col1 of the row whose col2 matches
df$col1[i1] <- df$col1[match(df$col1[i1], df$col2)]
aggregate(col2 ~ col1, unique(df), FUN = toString)
# col1 col2
#1 R1 R10
#2 R2 R4, R5, R6, R7, R9

Or with tidyverse

library(dplyr)
library(stringr)
df %>%
  group_by(col1 = case_when(col1 %in% intersect(col1, col2) ~ "R2",
                            TRUE ~ col1)) %>%
  distinct() %>%
  summarise(col2 = toString(col2))
# A tibble: 2 x 2
# col1 col2
# <chr> <chr>
#1 R1 R10
#2 R2 R4, R5, R6, R7, R9
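Note that "R2" is hard-coded for this particular data; a more general tidyverse version that mirrors the base R replacement above could look like this (an assumption, not part of the original answer, and like the base R version it handles only one level of replacement):

library(dplyr)
df %>%
  mutate(col1 = ifelse(col1 %in% col2, col1[match(col1, col2)], col1)) %>%
  distinct() %>%
  group_by(col1) %>%
  summarise(col2 = toString(col2))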

R Find the groups of tuples

You can leverage igraph to find the different network clusters:

library(igraph)

# build an undirected graph from the id1/id2 pairs
g <- graph_from_data_frame(df, FALSE)
# connected components give each vertex its cluster id
cg <- clusters(g)$membership
# map the membership back onto the rows via id1
df$id3 <- cg[df$id1]
df

output:

   id1 id2 id3
1 1 a 1
2 1 b 1
3 2 a 1
4 2 c 1
5 3 c 1
6 3 d 1
7 4 x 2
8 4 y 2
9 5 y 2
10 5 z 2
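To list the members of each cluster directly, the named membership vector can be split by group (a small usage sketch; the commented output assumes the same df as above):

split(names(cg), cg)
# $`1`
# [1] "1" "2" "3" "a" "b" "c" "d"
#
# $`2`
# [1] "4" "5" "x" "y" "z"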

Test condition of two columns on groups

Perhaps this would help:

library(dplyr)
reprex %>%
  group_by(id, number) %>%
  mutate(check = length(intersect(classification, `classification-1`)) > 0)

Or, if we need to check all the unique elements: after grouping by 'id' and 'number', get the unique elements of both classification and classification-1, and check whether they are equal with setequal.

reprex %>%
  group_by(id, number) %>%
  mutate(check = setequal(sort(unique(classification)),
                          sort(unique(`classification-1`))))
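As an aside, base R's setequal() already ignores order and duplicated elements, so the sort()/unique() calls above are mostly defensive:

setequal(c(2, 1, 1), c(1, 2))
# [1] TRUE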

Creating new variable based on two other columns containing categories with tidyverse

It's a bit hard to understand what you need the functions to do, but here is a guess :)

library(dplyr)
library(tidyr)

df_subs <- data.frame(sex = c("Male", "Female", "Female"),
                      patient.class = c("Not_Admitted",
                                        "ORDINARY ADMISSION",
                                        "ORDINARY ADMISSION"))

# Question 1
df_subs %>%
  mutate(sex_patient_class = case_when(
    sex == "Female" & patient.class == "Not_Admitted" ~ "female_not_admitted",
    sex == "Female" & patient.class == "ORDINARY ADMISSION" ~ "female_admitted",
    sex == "Male" & patient.class == "Not_Admitted" ~ "male_not_admitted",
    sex == "Male" & patient.class == "ORDINARY ADMISSION" ~ "male_admitted"
  ))

# Question 2
df_subs %>%
  unite(sex_patient_class, sex, patient.class, sep = "_")

# Question 3
df_subs %>%
  mutate(group_indices = group_indices(., patient.class, sex))

