Extract Elements Common in All Column Groups

Extract elements common in all column groups

First, split the df$ID by Month and use intersect to find elements common in each sub-group.

Reduce(intersect, split(df$ID, df$Month))
#[1] 4 6

If you want to subset the corresponding data.frame, do

df[df$ID %in% Reduce(intersect, split(df$ID, df$Month)),]

Find common values between all groups

You can use Reduce, i.e.

Reduce(intersect, split(df$jahr, df$reg))
#[1] 3

How to find common things from all three column in r?

In R, you can use Reduce + intersect to get common values from all the columns.

common_genes <- Reduce(intersect, df)
common_genes
#[1] "gene1" "gene3"

write.csv(data.frame(common_genes), 'common_genes.csv', row.names = FALSE)

Sample data

df <- data.frame(a = c('gene1', 'gene3', 'gene4', 'gene2'), 
b = c('gene3', 'gene2', 'gene5', 'gene1'),
c = c('gene6', 'gene3', 'gene1', 'gdene7'))

df
# a b c
#1 gene1 gene3 gene6
#2 gene3 gene2 gene3
#3 gene4 gene5 gene1
#4 gene2 gene1 gdene7

obtaining 3 most common elements of groups, concatenating ties, and ignoring less common values

You can set n to any positive integer and the function will still work. Your question asks for the 3 most common elements, but this answer is more general and accepts any positive integer.

Using base R:

# Summarise the most common values of `col2` within each group of `col1`.
#
# Within every group the distinct values of `col2` are ranked by how often
# they occur. Values that share the same frequency form one tier and are
# pasted into a single comma-separated cell. When a group has more than one
# distinct frequency, the least frequent tier is discarded. Tiers are then
# kept from the most frequent downwards until they cover at least `n` values;
# unused cells are filled with NA.
#
# Arguments:
#   data  data frame holding both columns.
#   n     positive whole number: how many most-common elements to report per
#         group (also the number of `m*` columns in the result).
#   col1  name of the grouping column.
#   col2  name of the column whose values are counted.
#
# Returns a data.frame with one row per group and the character columns
# `group`, `m1` ... `m<n>` and `count`. `count` is the number of non-missing
# `col2` values in the group (stored as character because it is combined with
# the labels); it is NA for a group without any non-missing value.
myfun <- function(data, n = 3, col1, col2) {
  stopifnot(is.numeric(n), length(n) == 1L, n >= 1, n == round(n))

  summarise_group <- function(x) {
    val <- factor(x[[col2]])   # factor() drops NA and unused levels
    freq <- tabulate(val)      # occurrences of each level of val
    # distinct positive frequencies, most common first
    tiers <- sort(unique(freq[freq > 0]), decreasing = TRUE)

    # Empty group or only missing values: one NA per `m*` column plus an NA
    # count, so the row has the same length (n + 1) as every other row.
    if (length(tiers) == 0L) {
      return(rep(NA_character_, n + 1L))
    }

    # With several distinct frequencies the least common tier is ignored;
    # a single tier (everything tied) is kept as it is.
    if (length(tiers) > 1L) {
      tiers <- tiers[-length(tiers)]
    }

    labels <- vapply(
      tiers,
      function(f) paste(levels(val)[freq == f], collapse = ","),
      character(1)
    )
    # Count tied values directly from the frequency table rather than by
    # re-splitting the pasted label, which miscounts values that contain a
    # comma or are empty strings.
    sizes <- vapply(tiers, function(f) sum(freq == f), integer(1))

    # Keep tiers up to the first one at which at least n values are covered.
    reached <- which(cumsum(sizes) >= n)
    if (length(reached) > 0L) {
      labels <- labels[seq_len(reached[1L])]
    }

    # Every tier holds at least one value, so at most n tiers survive;
    # pad the remainder with NA.
    labels <- c(labels, rep(NA_character_, n - length(labels)))

    c(labels, sum(freq))
  }

  rows <- lapply(split(data, data[[col1]]), summarise_group)

  if (length(rows) == 0L) {
    # No groups at all: build an empty result with the regular columns.
    mat <- matrix(character(0), nrow = 0L, ncol = n + 1L)
    groups <- character(0)
  } else {
    mat <- do.call(rbind, rows)   # one row per group, named after the group
    groups <- rownames(mat)
  }

  out <- data.frame(group = groups, mat, stringsAsFactors = FALSE)
  colnames(out) <- c("group", paste0("m", seq_len(n)), "count")
  rownames(out) <- NULL
  out
}

Output:

# example1:
myfun(df, 3, 'col1', 'col2')
# group m1 m2 m3 count
# 1 group1 4 3 2 10
# 2 group2 2 NA NA 10
# 3 group3 4 NA NA 10
# 4 group4 4, 5 NA NA 10
# 5 group5 4, 5 NA NA 10

# example 2
myfun(df3, 3, 'col1', 'col2')
# group m1 m2 m3 count
# 1 group1 4 3 2, 6 12
# 2 group2 2 NA NA 12
# 3 group3 4 5, 7 NA 12
# 4 group4 4, 5 NA NA 12
# 5 group5 4, 5 NA NA 12
# 6 group6 4, 3, 2, 1 NA NA 12
# 7 group7 16 14, 42 NA 16

Create character data instead of numeric data by assigning letters to column 3 of example 1 data df.

set.seed(1L)
df$col3 <- sample( letters, 50, TRUE )
myfun(df, 3, 'col1', 'col3')
# group m1 m2 m3 count
# 1 group1 x <NA> <NA> 10
# 2 group2 j,u <NA> <NA> 10
# 3 group3 a,d,f,g,i,j,k,q,w,y <NA> <NA> 10
# 4 group4 m <NA> <NA> 10
# 5 group5 u <NA> <NA> 10

How to find values shared between groups in a data frame?

Or you can group by val and then check whether the number of distinct exp for that val is equal to the data frame level number of distinct exp:

df %>% 
group_by(val) %>%
mutate(shared = n_distinct(exp) == n_distinct(.$exp))
# notice the first exp refers to exp for each group while .$exp refers
# to the overall exp column in the data frame

# A tibble: 12 x 3
# Groups: val [6]
# exp val shared
# <fct> <dbl> <lgl>
# 1 A 10 TRUE
# 2 A 20 FALSE
# 3 A 15 TRUE
# 4 A 10 TRUE
# 5 B 10 TRUE
# 6 B 15 TRUE
# 7 B 99 FALSE
# 8 B 2 FALSE
# 9 C 15 TRUE
#10 C 20 FALSE
#11 C 10 TRUE
#12 C 4 FALSE

Select rows with common ids in grouped data frame

In base R, we can split id_name by group_name, find the ids common to all groups, and then subset the data frame:

subset(test, id_name %in% Reduce(intersect, split(id_name, group_name)))

# group_name id_name varA varB
# <chr> <chr> <dbl> <chr>
# 1 groupA id_1 1 a
# 2 groupA id_2 4 f
# 3 groupA id_4 6 x
# 4 groupA id_4 6 h
# 5 groupB id_1 2 s
# 6 groupB id_2 13 y
# 7 groupB id_4 14 t
# 8 groupC id_1 3 d
# 9 groupC id_2 7 j
#10 groupC id_4 9 l

Using similar concept in tidyverse, it would be

library(tidyverse)
test %>%
filter(id_name %in% (test %>%
group_split(group_name) %>%
map(~pull(., id_name)) %>%
reduce(intersect)))

How to find common elements from multiple vectors?

There might be a cleverer way to go about this, but

intersect(intersect(a,b),c)

will do the job.

EDIT: More cleverly, and more conveniently if you have a lot of arguments:

Reduce(intersect, list(a,b,c))

python group by two columns, extract first element by one index

First, let:

agg_df = Data.groupby(['id','company']).size()

Assuming you want the result from the first entry for each group of elements having the same value for the zeroth level of the MultiIndex, and that each group is sorted by the first index level as you prefer. (After the updated comment, this appears to be the desired output)

# Distinct values of the zeroth index level (keys of a dict built from the
# (level0, level1) index tuples).
unique_zeroth_level = dict(agg_df.index.values).keys()
# First value within each zeroth-level group. Series.select() was removed in
# pandas 1.0, so take the cross-section for the key with xs() instead.
group_first_vals = [
    agg_df.xs(idx_val, level=0).iloc[0]
    for idx_val in unique_zeroth_level]

Assuming you're asking for the unique elements of the zeroth level of the resulting MultiIndex

In this particular case, since the returned result is a Series, you can make use of a trick using unstack:

agg_df.unstack(level=0).columns.values

or use a dict constructor

dict(agg_df.index.values).keys()

Assuming you want the result for (1, 'a') and (2, 'd') in particular, and that you want to access them by the index values (not just as a consequence of those being the lexicographically first entries in their respective groups)

# .ix was removed in pandas 1.0; .loc performs the same label-based lookup.
agg_df.loc[[(1, 'a'), (2, 'd')]]


Related Topics



Leave a reply



Submit