Subtract Pairs of Columns Based on Matching Column

Subtract pairs of columns based on matching column

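DF is not reproduced in the source; a hypothetical data frame consistent with the outputs below (the second column is a placeholder guess, since every answer drops it with DF[, -2]) could be:

## hypothetical reconstruction of DF, derived from the diff and percentage outputs shown below
DF <- data.frame(FILE  = rep(c("A", "B", "C"), each = 2),
                 DATE  = rep(c("before", "after"), times = 3),  # placeholder column, dropped via DF[, -2]
                 COUNT = c(10, 25, 5, 30, 30, 50))
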
Using aggregate in base R:

> aggregate(.~FILE, data= DF[, -2], FUN=diff)
  FILE COUNT
1    A    15
2    B    25
3    C    20

Using ddply in plyr

> ddply(DF[,-2], .(FILE), summarize, DIFFERENCE=diff(COUNT))
  FILE DIFFERENCE
1    A         15
2    B         25
3    C         20

with data.table

> # library(data.table)
> DT <- data.table(DF)
> DT[, diff(COUNT), by=FILE]
   FILE V1
1:    A 15
2:    B 25
3:    C 20

with by

> with(DF, by(COUNT, FILE, diff))
FILE: A
[1] 15
-----------------------------------------------------------------------------
FILE: B
[1] 25
-----------------------------------------------------------------------------
FILE: C
[1] 20

with tapply

> tapply(DF$COUNT, DF$FILE, diff)
 A  B  C
15 25 20

with summaryBy from doBy package

> # library(doBy)
> summaryBy(COUNT~FILE, FUN=diff, data=DF)
  FILE COUNT.diff
1    A         15
2    B         25
3    C         20

Update
As a percentage:

> aggregate(.~FILE, data= DF[, -2], function(x) (x[1]/x[2])*100)
  FILE    COUNT
1    A 40.00000
2    B 16.66667
3    C 60.00000
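
Not part of the original answers: a dplyr sketch that produces both summaries in one pass, assuming exactly two rows per FILE in the original row order:

## dplyr equivalent (assumption: two rows per FILE, in order)
library(dplyr)
DF %>%
  group_by(FILE) %>%
  summarise(DIFFERENCE = diff(COUNT),
            PERCENT    = first(COUNT) / last(COUNT) * 100)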

Column pair subtraction - Pandas

MultiIndex comes in handy here, as it allows for a relatively easy way to reshape and manipulate the dataframe:

Set the customer id as the index, and reshape the remaining columns into a MultiIndex:

temp = df.set_index('customer_id')
temp.columns = temp.columns.str.split(r'_(q\d+)_', expand = True)

temp
            count_sessions      purchases
                        q4   q1        q4   q1
                      2021 2022      2021 2022
customer_id
203                    100  110        12   11
484                    210  215        21   18
582                    409  400        35   32

Iterate through the first level of the columns in a list comprehension and compute the differences; in this case we can tell that 2022 trails 2021 for each section, so we'll multiply our outcome by -1 to flip it:

import numpy as np   # needed for np.subtract.reduce below
import pandas as pd

keys = temp.columns.get_level_values(0).unique()
outcome = [temp[key].agg(np.subtract.reduce, axis = 1).mul(-1)
           for key in keys]
outcome = pd.concat(outcome, axis = 1, keys = keys)
outcome.add_suffix('_diff').reset_index()

   customer_id  count_sessions_diff  purchases_diff
0          203                   10              -1
1          484                    5              -3
2          582                   -9              -3

Subtract paired column

I am assuming that for every Case there is exactly one Control and vice versa. It seemed easiest to transform your Pairing data.frame to align the Case and the Control. Once that is done, you can just build the data frame you want.

## First, recreate your data
Pairing = read.table(text="Match Pos
Control_70 001_A01
Control_56 001_A02
Case_70 001_A03
Case_56 001_A04
Control_21 001_A05
Case_21 001_A06",
header=TRUE)

matures.cpm.spike.batch = read.table(text=" 001_A01_S1 001_A02_S2 001_A03_S3 001_A04_S4 001_A05_S5 001_A06_S6
hsa-let-7a-5p 16.566813 11.415796 12.400252 22.701457 8.864882 20.442599
hsa-let-7b-5p 15.574190 11.107133 12.196465 17.954547 8.527478 25.788286
hsa-let-7c-5p 5.976763 4.372978 5.984685 9.821348 6.341252 7.480211
hsa-let-7d-3p 16.508818 10.697730 11.001534 18.375286 7.583910 24.974774
hsa-let-7d-5p 13.273824 5.134547 9.456675 11.567230 7.096485 13.294108
hsa-let-7f-5p 13.900711 9.804384 11.481614 20.002110 7.878241 17.295909",
header=TRUE)

## Build Matches to replace your Pairing
Control = Pairing[grep("Control", Pairing$Match),]
Control = Control[order(Control$Match),]
Case = Pairing[grep("Case", Pairing$Match),]
Case = Case[order(Case$Match),]
Matches = cbind(Control, Case)

## Uses Matches to build the desired data.frame
Diffs = data.frame(matures.cpm.spike.batch[, Matches[1,4]] -
                     matures.cpm.spike.batch[, Matches[1,2]])
colnames(Diffs)[1] = sub("Control", "Diff", Matches[1,1])
for(i in 2:nrow(Matches)) {
    Diffs[,i] = matures.cpm.spike.batch[, Matches[i,4]] -
                  matures.cpm.spike.batch[, Matches[i,2]]
    colnames(Diffs)[i] = sub("Control", "Diff", Matches[i,1])
}

## Result
    Diff_21   Diff_56   Diff_70
1 11.577717 11.285661 -4.166561
2 17.260808  6.847414 -3.377725
3  1.138959  5.448370  0.007922
4 17.390864  7.677556 -5.507284
5  6.197623  6.432683 -3.817149
6  9.417668 10.197726 -2.419097
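
An equivalent, loop-free sketch (not part of the original answer). It assumes each Pos value occurs as a substring of exactly one column name of matures.cpm.spike.batch; in the recreated data the column names gain a leading X from read.table and carry an _S suffix, so grep() is used to match them:

## loop-free variant; grep() resolves each Pos prefix (e.g. "001_A03") to its full column name ("X001_A03_S3")
ctrl_idx <- sapply(as.character(Matches[, 2]), grep, names(matures.cpm.spike.batch))
case_idx <- sapply(as.character(Matches[, 4]), grep, names(matures.cpm.spike.batch))
Diffs2 <- matures.cpm.spike.batch[, case_idx] - matures.cpm.spike.batch[, ctrl_idx]
colnames(Diffs2) <- sub("Control", "Diff", Matches[, 1])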

Subtracting columns with the same prefix based on different suffixes

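df is not reproduced in the source; a hypothetical reconstruction, with values read off the base R output further below, would be:

## hypothetical reconstruction of df (values taken from the output shown later)
df <- data.frame(Date. = c("1/1/18.", "1/2/18"),
                 A_H. = c(4, 5), B_H. = c(6, 7), C_H. = c(7, 3), D_H. = c(6, 5),
                 A_L. = c(3, 6), B_L. = c(2, 3), C_L. = c(2, 1), D_L. = c(4, 4))
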
One tidyverse possibility could be:

bind_cols(df %>%
            gather(var, val, -matches("(_L)|(Date)")) %>%
            select(Date., var, val),
          df %>%
            gather(var, val, -matches("(_H)|(Date)")) %>%
            select(Date., var, val)) %>%
  mutate(res1 = paste0(var, "_", var1),
         res2 = val - val1) %>%
  select(Date., res1, res2) %>%
  spread(res1, res2) %>%
  left_join(df, by = c("Date." = "Date."))

    Date. A_H._A_L. B_H._B_L. C_H._C_L. D_H._D_L. A_H. B_H. C_H. D_H. A_L. B_L. C_L. D_L.
1 1/1/18.         1         4         5         2    4    6    7    6    3    2    2    4
2  1/2/18        -1         4         2         1    5    7    3    5    6    3    1    4

It first transforms the _H and _L columns, separately, from wide to long format. In the second step, it binds the two results together by columns. After this step, it creates the names of the new variables by combining the names of the _H and _L columns and subtracts the _L columns from the _H columns. Finally, it spreads the new variables with the desired results back to the original wide format and joins them with the original df on "Date.".

Or:

df %>%
  gather(var, val, -Date.) %>%
  mutate(temp = gsub("_.*$", "", var)) %>%
  arrange(Date., temp) %>%
  group_by(temp = gl(length(var)/2, 2)) %>%
  mutate(res1 = paste(var, collapse = "_"),
         res2 = val - lead(val)) %>%
  na.omit() %>%
  ungroup() %>%
  select(Date., res1, res2) %>%
  spread(res1, res2) %>%
  left_join(df, by = c("Date." = "Date."))

In this case, it first transforms the data from wide to long format, excluding the "Date." column. Second, it extracts the characters before _ from the variable names and arranges the data by "Date." and this prefix. Third, it assigns a factor level to every two rows and groups by it. Fourth, it combines the variable names and performs the subtraction. Finally, it returns the data to the original wide format and joins it with the original df.

Or with base R you can try something like:

res <- df[, grepl("_H", colnames(df))] - df[, grepl("_L", colnames(df))]

colnames(res) <- paste(colnames(df[, grepl("_H", colnames(df))]),
                       colnames(df[, grepl("_L", colnames(df))]), sep = "_")

cbind(df, res)

    Date. A_H. B_H. C_H. D_H. A_L. B_L. C_L. D_L. A_H._A_L. B_H._B_L. C_H._C_L. D_H._D_L.
1 1/1/18.    4    6    7    6    3    2    2    4         1         4         5         2
2  1/2/18    5    7    3    5    6    3    1    4        -1         4         2         1

Here it first identifies the columns that contain _H and _L. Second, it subtracts the _L columns from the _H columns. Third, it combines the column names of the _H and _L columns and assigns them to the result as column names. Finally, it binds the result to the original df.

Select the most common value of a column based on matched pairs from two columns using `ddply`

Similar to Ronak's approach

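The vectors from, to and interaction_type come from the question and are not shown here; a hypothetical set, chosen to be consistent with the tibble printed below, might be:

## hypothetical input vectors (not the question's data)
from             <- c("A", "A", "B", "B", "C", "C", "B")
to               <- c("B", "D", "C", "D", "A", "D", "A")
interaction_type <- c("like", "share", "like", "comment", "like", "like", "dislike")
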
library(dplyr)
dat <- data.frame(from, to, interaction_type, stringsAsFactors = F)
dat %>%
  mutate(
    pair = purrr::pmap_chr(
      .l = list(from = from, to = to),
      .f = function(from, to) paste(sort(c(from, to)), collapse = "")
    )
  ) %>%
  group_by(pair) %>%
  filter(n() == max(n()) & row_number() == 1) %>%
  ungroup() %>%
  select(-pair)
# A tibble: 6 x 3
  from  to    interaction_type
  <chr> <chr> <chr>
1 A     B     like
2 A     D     share
3 B     C     like
4 B     D     comment
5 C     A     like
6 C     D     like

mutate column in R: subtract values from column based on another column condition

You can group by id and then calculate values relative to the baseline value for each id. For the outlier filtering, I've selected rows where values is less than the overall 25th percentile or greater than the overall 75th percentile, which seemed to be what you were aiming for. However, you can, of course, tweak this to meet your specific needs.

library(tidyverse)

test %>%
  group_by(id) %>%
  mutate(v_baseline = values - values[baseline == "Y"]) %>%
  ungroup %>%
  filter(values < quantile(values, probs = 0.25) |
           values > quantile(values, probs = 0.75))
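
The question's test data frame is not reproduced here; a hypothetical frame of the expected shape (one baseline == "Y" row per id), on which the pipeline above would run, could be:

## hypothetical example input: one baseline row per id
test <- data.frame(id       = rep(c("p1", "p2"), each = 4),
                   baseline = rep(c("Y", "N", "N", "N"), times = 2),
                   values   = c(10, 11, 30, 2, 8, 9, 25, 1))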

Subtract rows of data frames column-wise preserving multiple factor column

IIUC, you have columns of the same name in df_1 and df_2 (e.g. int_samp_X for some integer X), and you want the difference of the matching columns, matched by gene_nm (e.g. df_1[df_1$gene_nm == 'A', 'int_samp_1'] - df_2[df_2$gene_nm == 'A', 'int_samp_1']).

We can use the tidyverse family of packages to solve this problem, notably, dplyr and purrr.

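df_1 and df_2 are not reproduced in the source; a trimmed, hypothetical reconstruction (a few rows only, with values read off the merged output below) might look like:

## hypothetical, trimmed reconstruction of the inputs (values taken from df_3 below)
df_1 <- data.frame(pep_seq    = c("ababababba", "dfsfsfsfds", "dfgthrgfgf", "sfngegebser"),
                   int_samp_1 = c("24242424", NA, NA, NA),
                   int_samp_2 = c("5342353", "14532556", "797899", NA),
                   int_samp_3 = c(NA, NA, "314534", "4353455"),
                   gene_nm    = c("A", "A", "B", "C"),
                   stringsAsFactors = TRUE)   # the answer assumes the int_samp columns are factors
df_2 <- data.frame(gene_nm    = c("A", "B", "C"),
                   int_samp_1 = c("2421432", "24242424", NA),
                   int_samp_2 = c(NA, "5342353", "14532556"),
                   int_samp_3 = c("11351", NA, NA),
                   stringsAsFactors = TRUE)
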
First, merge df_1 and df_2 with a left_join, to ensure all of the per-peptide rows in df_1 are preserved when matched against the gene-level rows in df_2:

library(tidyverse)

df_3 <- df_1 %>% left_join(df_2, by = "gene_nm")

df_3
pep_seq int_samp_1.x int_samp_2.x int_samp_3.x gene_nm int_samp_1.y int_samp_2.y int_samp_3.y
1 Subtract Pairs of Columns Based on Matching Columna 2421432 NA 11351 A 2421432 NA 11351
2 ababababba 24242424 5342353 NA A 2421432 NA 11351
3 dfsfsfsfds NA 14532556 NA A 2421432 NA 11351
4 xbbcbcncncc 4684757849 43566 NA A 2421432 NA 11351
5 fbbdsgffhhh NA 46367367 1354151345 A 2421432 NA 11351
6 dggdgdgegeggerr 10485040 768769769 1351351354 A 2421432 NA 11351
7 dfgthrgfgf NA 797899 314534 B 24242424 5342353 NA
8 wegregegg 6849400 NA 1535 B 24242424 5342353 NA
9 egegegergewge 40300 NA 3145354 B 24242424 5342353 NA
10 sfngegebser NA NA 4353455 C NA 14532556 NA
11 qegqeefbew NA 686899 324535 C NA 14532556 NA
12 qegqetegqt NA 7898979 3543445 C NA 14532556 NA
13 qwtqtewr 556456466 678568 34535 C NA 14532556 NA
14 etghsfrgf 4646456466 NA 34535534 C NA 14532556 NA
15 sfsdfbdfbergeagaegr 246464266 68886 NA C NA 14532556 NA
16 wasfqertsdfaefwe 4564242646 488 NA C NA 14532556 NA

Then map over the column names of interest, taking the difference from each column-pair. (Note that you'll need to convert the int_samp columns from factor to numeric first.)

Update (per OP comments):
To convert NA to 0 before computing differences, we can use mutate_if() and replace(), adding the following to the method chain:

mutate_if(is.numeric,  funs(replace(., is.na(.), 0)))

Finally, join back to df_1:

var_names <- df_1 %>% select(starts_with("int_samp")) %>% names()

var_names # [1] "int_samp_1" "int_samp_2" "int_samp_3"

var_names %>%
  map_dfc(~df_3 %>%
            mutate_at(vars(matches(.x)), funs(as.numeric(as.character(.)))) %>%
            mutate_if(is.numeric, funs(replace(., is.na(.), 0))) %>%
            select(matches(.x)) %>%
            reduce(`-`)) %>%
  set_names(paste0(var_names, "_diff")) %>%
  bind_cols(df_1)

Output:

   int_samp_1_diff int_samp_2_diff int_samp_3_diff pep_seq             int_samp_1 int_samp_2 int_samp_3 gene_nm
<dbl> <dbl> <dbl> <fct> <fct> <fct> <fct> <fct>
1 0. 0. 0. Subtract Pairs of Columns Based on Matching Columna 2421432 NA 11351 A
2 21820992. 5342353. -11351. ababababba 24242424 5342353 NA A
3 -2421432. 14532556. -11351. dfsfsfsfds NA 14532556 NA A
4 4682336417. 43566. -11351. xbbcbcncncc 4684757849 43566 NA A
5 -2421432. 46367367. 1354139994. fbbdsgffhhh NA 46367367 1354151345 A
6 8063608. 768769769. 1351340003. dggdgdgegeggerr 10485040 768769769 1351351354 A
7 -24242424. -4544454. 314534. dfgthrgfgf NA 797899 314534 B
8 -17393024. -5342353. 1535. wegregegg 6849400 NA 1535 B
9 -24202124. -5342353. 3145354. egegegergewge 40300 NA 3145354 B
10 0. -14532556. 4353455. sfngegebser NA NA 4353455 C
11 0. -13845657. 324535. qegqeefbew NA 686899 324535 C
12 0. -6633577. 3543445. qegqetegqt NA 7898979 3543445 C
13 556456466. -13853988. 34535. qwtqtewr 556456466 678568 34535 C
14 4646456466. -14532556. 34535534. etghsfrgf 4646456466 NA 34535534 C
15 246464266. -14463670. 0. sfsdfbdfbergeagaegr 246464266 68886 NA C
16 4564242646. -14532068. 0. wasfqertsdfaefwe 4564242646 488 NA C

Note: This answer is largely a derivation from akrun's answer here.


