R Apply() Function on Specific Dataframe Columns

R Apply() function on specific dataframe columns

Using an example data.frame and example function (just +1 to all values)

# Example function: adds one to every value it is given.
A <- function(x) {
  x + 1
}

# Example data: nine columns (X1..X9), each holding the values 1 to 4.
wifi <- data.frame(replicate(n = 9, expr = 1:4))
wifi

#  X1 X2 X3 X4 X5 X6 X7 X8 X9
#1  1  1  1  1  1  1  1  1  1
#2  2  2  2  2  2  2  2  2  2
#3  3  3  3  3  3  3  3  3  3
#4  4  4  4  4  4  4  4  4  4

# apply() with MARGIN = 2 passes each of columns 4-9 to A and returns a matrix;
# columns 1-3 are kept as they are and the transformed columns are placed
# next to them.
data.frame(wifi[1:3], apply(X = wifi[4:9], MARGIN = 2, FUN = A))
# or
cbind(wifi[1:3], apply(X = wifi[4:9], MARGIN = 2, FUN = A))

#  X1 X2 X3 X4 X5 X6 X7 X8 X9
#1  1  1  1  2  2  2  2  2  2
#2  2  2  2  3  3  3  3  3  3
#3  3  3  3  4  4  4  4  4  4
#4  4  4  4  5  5  5  5  5  5

Or even:

# lapply() treats columns 4-9 as a list, applies A to each one and returns a
# list of transformed columns, which is combined with the untouched columns 1-3.
data.frame(wifi[1:3], lapply(X = wifi[4:9], FUN = A))
# or
cbind(wifi[1:3], lapply(X = wifi[4:9], FUN = A))

#  X1 X2 X3 X4 X5 X6 X7 X8 X9
#1  1  1  1  2  2  2  2  2  2
#2  2  2  2  3  3  3  3  3  3
#3  3  3  3  4  4  4  4  4  4
#4  4  4  4  5  5  5  5  5  5

R: Apply function on specific columns preserving the rest of the dataframe

If you only want to do a computation on one or a few columns, you can use transform or simply index the column manually:

# With transform(): recompute column A and get the whole data frame back.
df <- data.frame(A = 1:10, B = 1:10)
df <- transform(df, A = A * 1000)

# Manually: overwrite column A in place; column B is left untouched.
df <- data.frame(A = 1:10, B = 1:10)
df[["A"]] <- df[["A"]] * 1000

apply a custom function across certain columns in a dataframe in R

Using the across function from dplyr:

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

# Build the example data for one group: ten rows numbered 1-10 with five date
# columns of consecutive days, plus the day of the year of date5.
make_date_data <- function(group_name) {
  # Daily sequence between two "YYYY-MM-DD" strings (both ends included).
  day_seq <- function(first, last) {
    seq(from = ymd(first), to = ymd(last), by = "days")
  }

  data.frame(
    name = group_name,
    number = as.numeric(1:10),
    date1 = day_seq("2019-07-01", "2019-07-10"),
    date2 = day_seq("2019-07-02", "2019-07-11"),
    date3 = day_seq("2019-06-29", "2019-07-08"),
    date4 = day_seq("2019-07-03", "2019-07-12"),
    date5 = day_seq("2019-07-05", "2019-07-14")
  ) %>%
    mutate(yday = yday(date5))
}

# The two groups hold the same values and differ only in their name.
date_data1 <- make_date_data("groupA")
date_data2 <- make_date_data("groupB")

# Stack both groups into a single tibble.
date_data <- bind_rows(date_data1, date_data2) %>%
  as_tibble()

# Per group, compute the absolute mean number of whole days between each of
# date1..date4 and date5.
# The columns are selected by name rather than by position: inside a grouped
# data frame positional indices are counted without the grouping column, so
# `2:5` silently points at different columns if the grouping or the column
# order changes.
date_data %>%
  group_by(name) %>%
  summarise(across(
    .cols = date1:date4,
    .fns = ~ abs(mean(interval(.x, date5) %/% days(1))),
    .names = "diff_{.col}_date5"
  ))
#> # A tibble: 2 × 5
#>   name   diff_date1_date5 diff_date2_date5 diff_date3_date5 diff_date4_date5
#>   <chr>             <dbl>            <dbl>            <dbl>            <dbl>
#> 1 groupA                4                3                6                2
#> 2 groupB                4                3                6                2

Created on 2021-11-11 by the reprex package (v2.0.1)

R Apply user-defined function to selected columns of a dataframe

# Apply SSD (passing the extra argument p) to each of columns 4-1003, flatten
# the per-column results into one vector and return it as a one-row data frame.
# (The original line was missing its final closing parenthesis.)
as.data.frame(t(unlist(lapply(df[, 4:1003], SSD, p = p))))

Apply a function to multiple columns in R

You can use the dplyr package to apply a function to several columns:

library(dplyr)
library(lubridate)

# Run dmy() on every column whose name matches "Date"; all other columns are
# returned unchanged.
mutate(df, across(.cols = matches("Date"), .fns = dmy))

Alternatively, you can use base R's apply family, but you have to select the relevant columns first. This can be done using grep:

# Select the columns whose name contains "Date" and parse each one with dmy().
# lapply() is used instead of apply(): apply() coerces its input to a matrix
# and collects the results in a plain matrix, so the Date class returned by
# dmy() is dropped and the columns end up as bare numbers.
date_cols <- grep("Date", colnames(df))
df[date_cols] <- lapply(df[date_cols], dmy)

apply() function to only certain columns

1) mutate/across With dplyr one can use mutate/across. The first argument of across defines which columns to use and the second is the function to apply to each such column. The right hand side of the formula is the body of the function and dot is the argument to the function. We use + to convert the logical result to numeric.

library(dplyr)

# For each column whose name starts with "id", flag the rows where the value
# is below beep; the unary + turns the logical result into 0/1.
mutate(df, across(.cols = starts_with("id"), .fns = ~ +(. < beep)))
##    name id1 id2 id3 id4 id5 beep
## 1  Mary   0   1   1   1   0   15
## 2  John   0   1   1   0   0   20
## 3 Peter   0   0   0   0   1   23

2) modify_if The purrr package has a function which will modify only columns satisfying the condition defined by the second argument. It supports the same shorthand for functions as in (1).

library(purrr)

# Only the columns selected by the logical vector (names starting with "id")
# are modified; each becomes 0/1 depending on whether it is below df$beep.
modify_if(.x = df, .p = startsWith(names(df), "id"), .f = ~ +(. < df$beep))

##    name id1 id2 id3 id4 id5 beep
## 1  Mary   0   1   1   1   0   15
## 2  John   0   1   1   0   0   20
## 3 Peter   0   0   0   0   1   23

3) replace This is basically the same as another answer but uses grep and replace instead. No packages are used.

# Positions of the columns whose name starts with "id".
ix <- grep(pattern = "^id", x = names(df))
# Swap those columns for their 0/1 comparison against beep.
replace(x = df, list = ix, values = +(df[ix] < df$beep))
##    name id1 id2 id3 id4 id5 beep
## 1  Mary   0   1   1   1   0   15
## 2  John   0   1   1   0   0   20
## 3 Peter   0   0   0   0   1   23

4) modifyList The modifyList function replaces the columns in the first argument with the columns in the second argument using name matching. Both arguments must be lists or data frames (not matrices).

# Positions of the columns whose name starts with "id".
ix <- grep(pattern = "^id", x = names(df))
# The comparison yields a logical matrix; it is converted to a data frame
# (modifyList needs a list-like second argument) and turned into 0/1 with +.
modifyList(x = df, val = +as.data.frame(df[ix] < df$beep))
##    name id1 id2 id3 id4 id5 beep
## 1  Mary   0   1   1   1   0   15
## 2  John   0   1   1   0   0   20
## 3 Peter   0   0   0   0   1   23

(This used to be in the lattice package but now it is in utils which is part of base R.)

How do I apply a function to specific columns in a dataframe and replace the original columns?

Several ways to do that.

If you work with voluminous data, I think data.table is the best approach (will bring you flexibility, speed and memory efficiency)

data.table

You can use := (the update-by-reference operator) together with lapply to apply lubridate::ymd to all columns listed in .SDcols:

library(data.table)
# Convert to a data.table in place so that := can update columns by reference.
setDT(my.medical.data)

# Names of the columns to convert: every column whose name ends in "_date".
# (The original call had the endsWith() arguments reversed, misspelled the
# object name, and produced a logical vector where := needs column names.)
cols_to_change <- grep("_date$", colnames(my.medical.data), value = TRUE)

# Parse each selected column with lubridate::ymd and assign the results back
# to the same columns; the parentheses make := use the names stored in
# cols_to_change.
my.medical.data[
  ,
  (cols_to_change) := lapply(.SD, lubridate::ymd),
  .SDcols = cols_to_change
]

base R

A standard lapply can also help. You could try something like that (I did not test it)

# Base R version; assumes my.medical.data is a plain data.frame (not converted
# with setDT). The column names are looked up here so the snippet stands on its
# own, and list-style indexing is used so each column is passed to ymd() as a
# vector.
date_cols <- grep("_date$", names(my.medical.data), value = TRUE)
my.medical.data[date_cols] <- lapply(my.medical.data[date_cols], lubridate::ymd)

How to apply a function on selected dataframe columns and (depending on the outcome) return either the initial value or return 0?

The dplyr package makes this doable. Here is the first part (replacing with zero those values in the Sample columns where the Reference value is not zero and the Sample value is not more than twice the Reference value).

library(dplyr)
# Example data: a mass column, three sample columns and a reference column.
data <- structure(
  list(
    `Mass values` = c(50, 51, 52, 53),
    `Sample 1` = c(6000, 8500, 3600, 6324),
    `Sample 2` = c(5866, 56547, 7876, 5486),
    `Sample x` = c(36546, 346346, 56856, 565676),
    Reference = c(18000, 0, 96799, 68786)
  ),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Rewrite only the Sample columns, comparing each value with Reference:
#   - Reference == 0           -> keep the original value
#   - value / Reference > 2    -> keep the original value
#   - otherwise                -> 0
data1 <- mutate(
  data,
  across(
    .cols = starts_with("Sample"),
    .fns = ~ case_when(
      Reference == 0 ~ .x,
      TRUE ~ if_else(.x / Reference > 2, .x, 0)
    )
  )
)

# A tibble: 4 x 5
  `Mass values` `Sample 1` `Sample 2` `Sample x` Reference
          <dbl>      <dbl>      <dbl>      <dbl>     <dbl>
1            50          0          0      36546     18000
2            51       8500      56547     346346         0
3            52          0          0          0     96799
4            53          0          0     565676     68786

Removing the zeros is a little trickier. Here are two ways.

# Replace every 0 with NA, column by column.
# na_if() works on vectors; since dplyr 1.1.0 passing a whole data frame as
# its first argument is an error, so it is applied to each column via across().
data1 %>% mutate(across(everything(), ~ na_if(.x, 0)))
# A tibble: 4 x 5
  `Mass values` `Sample 1` `Sample 2` `Sample x` Reference
          <dbl>      <dbl>      <dbl>      <dbl>     <dbl>
1            50         NA         NA      36546     18000
2            51       8500      56547     346346        NA
3            52         NA         NA         NA     96799
4            53         NA         NA     565676     68786

# Pivot longer and remove rows with zero.
library(tidyr)
# Stack the Sample columns into name/value pairs (one row per sample and mass
# value), then keep only the rows whose value is not zero.
data1 %>%
  pivot_longer(
    cols = starts_with("Sample"),
    names_to = "Sample",
    values_to = "value"
  ) %>%
  filter(value != 0)

# A tibble: 5 x 4
  `Mass values` Reference Sample    value
          <dbl>     <dbl> <chr>     <dbl>
1            50     18000 Sample x  36546
2            51         0 Sample 1   8500
3            51         0 Sample 2  56547
4            51         0 Sample x 346346
5            53     68786 Sample x 565676

R Apply() Function on Specific Dataframe Columns