Mutate with Case_When and Contains

mutate with case_when and contains

We can use grepl

# Label each row by a partial match on column `b`: a case-sensitive "Bl"
# match gives "Group1", a case-insensitive "re" match gives "Group2".
df %>%
  mutate(
    group = case_when(
      grepl("Bl", b) ~ "Group1",
      grepl("re", b, ignore.case = TRUE) ~ "Group2"
    )
  )
# a b group
#1 1 Black Group1
#2 2 Green Group2
#3 3 Green Group2
#4 4 Green Group2
#5 5 Red Group2
#6 6 Green Group2
#7 7 Black Group1
#8 8 Black Group1
#9 9 Green Group2
#10 10 Green Group2
#11 1 Green Group2
#12 2 Green Group2
#13 3 Blue Group1
#14 4 Red Group2
#15 5 Blue Group1
#16 6 Red Group2
#17 7 Blue Group1
#18 8 Blue Group1
#19 9 Black Group1
#20 10 Black Group1

case_when with partial string match and contains()

I think you are doing it backwards. Put case_when inside pmap_chr instead of the other way around:

library(dplyr)
library(purrr)
library(stringr)

# For every row, gather the values of all columns whose name contains
# "status" and label the row "Exclude" when any of them matches "Exempt"
# (case-insensitively); otherwise label it "Completed".
problem %>%
  mutate(
    final = pmap_chr(
      select(., contains("status")),
      ~ case_when(
        any(str_detect(c(...), "(?i)Exempt")) ~ "Exclude",
        TRUE ~ "Completed"
      )
    )
  )

For each pmap iteration (each row of problem dataset), we want to use case_when to check if there exists the string Exempt. (?i) in str_detect makes it case insensitive. This is the same as writing str_detect(c(...), regex("Exempt", ignore_case = TRUE))

Output:

# A tibble: 4 x 5
person status1 status2 status3 final
<chr> <chr> <chr> <chr> <chr>
1 Corey 7EXEMPT exempt EXEMPTED Exclude
2 Sibley Completed Completed Completed Completed
3 Justin Completed Completed Completed Completed
4 Ruth Pending Pending ExempT - 14 Exclude

case_when using contains instead of declaring each variable to evaluate

We could wrap with if_any

library(dplyr)
library(stringr)
# Columns to test: A1_num, B1_num, C1_num, A4_num, B4_num, C4_num.
nm1 <- str_c(LETTERS[1:3], rep(c(1, 4), each = 3), "_num")

df <- df %>%
  mutate(
    # Row-wise flags: does any of the selected columns satisfy the condition?
    s1 = if_any(all_of(nm1), ~ . > 2),
    n1 = if_any(all_of(nm1), ~ . == 2),
    r1 = if_any(all_of(nm1), ~ . < 2),
    # The first flag that is TRUE decides the label.
    Manipulation1 = case_when(
      s1 ~ "support",
      n1 ~ "neither",
      r1 ~ "reject",
      TRUE ~ NA_character_
    ),
    # Remove the temporary helper flags.
    s1 = NULL,
    n1 = NULL,
    r1 = NULL
  )

Combine mutate case_when() for columns that start_with() to replace certain characters

Here is one possibility using case_when and grepl:

# In every column whose name starts with "col": keep NA as NA, convert
# values ending in S, M or D to their number, and set everything else to 0.
df1 %>%
  mutate(
    across(
      starts_with("col"),
      ~ case_when(
        is.na(.) ~ NA_real_,
        grepl("[SMD]$", .) ~ parse_number(.),
        TRUE ~ 0
      )
    )
  )

# A tibble: 3 x 7
position correction col1 col2 col3 col4 col5
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 100 62M89S NA NA NA 62 89
2 200 8M1D55M88S NA 8 1 55 88
3 300 1S25M1P36M89S 1 25 0 36 89

Use mutate case_when() in a specific range of columns in dplyr

dplyr's c_across is very handy for operations like this:

# Row by row, report 'YES' when any of col1..col4 contains an 'S'
# (col5 is deliberately outside the range), otherwise 'NO'.
df1 %>%
  rowwise() %>%
  mutate(
    inner_S = if (any(grepl('S', c_across(col1:col4)))) 'YES' else 'NO'
  )

position correction col1 col2 col3 col4 col5 inner_S
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 100 62M89S NA NA NA 62M 89S NO
2 200 8M1D55M88S NA 8M 1D 55M 88S NO
3 300 1S25M1S36M89S 1S 25M 1S 36M 89S YES

R mutate across with function, case_when and data masking to parse timestamps

The function lubridate::fast_strptime accepts several formats, which are tried in turn until one of them parses successfully.

library(dplyr)
library(lubridate)

# Parse every column whose name matches "Time". The formats are tried in
# turn, so columns mixing both layouts are handled.
# `lt = FALSE` makes fast_strptime() return POSIXct instead of its default
# POSIXlt, which is the date-time class that belongs in a data frame column.
df %>%
  mutate(
    across(
      matches("Time"),
      ~ fast_strptime(
        .x,
        format = c("%Y-%m-%d %H:%M:%S %z", "%d/%m/%Y %H:%M"),
        tz = "UTC",
        lt = FALSE
      )
    )
  )

##> p_id ActualStartTime ActualEndTime
##> 1 1 2020-05-21 18:04:36 2020-05-21 18:29:42
##> 2 2 2020-09-21 14:14:00 2020-09-21 14:19:00
##> 3 3 2020-08-18 09:11:08 2020-08-18 09:14:26
##> 4 4 2020-10-12 21:25:00 2020-10-12 21:29:00
##> 5 5 2020-11-09 17:02:00 2020-11-09 17:06:00
##> 6 6 2020-05-16 09:50:58 2020-05-16 09:56:10

case_when and grepl to mutate a new variable and take certain character strings

Assuming you want to keep either integers or decimal numbers along with "UNKNOWN", we can use grepl as follows:

df <- data.frame(MIX = results, stringsAsFactors = FALSE)
df$output <- df$MIX

# Keep a value only if it is a plain integer or decimal number, or the
# literal 'UNKNOWN'; blank out everything else.
# The decimal part is an optional group, so single-digit values such as "5"
# are kept too (the earlier pattern required at least two digits).
df$output[!(grepl('^\\d+(\\.\\d+)?$', df$MIX) | df$MIX == 'UNKNOWN')] <- ''

df
# MIX output
#1 500 500
#2 500.0 500.0
#3 60 60
#4 60.0 60.0
#5 UNKNOWN UNKNOWN
#6 450Other
#7 300-301
#8 ZZZ 12
#9 800/900
#10 500WWW500

Use mutate_at with contains to apply function on multiple columns

Use matches

library(dplyr)
# matches() takes a single regular expression (its second positional
# argument is ignore.case), so use alternation to select the columns whose
# name contains "a" or "b".
df %>%
  mutate_at(vars(matches('a|b')), sqrt)

Or specify the match as a character vector as the documentation says

match - A character vector. If length > 1, the union of the matches is taken.

# contains() accepts a character vector; the union of the matches is used,
# so sqrt is applied to every column whose name contains 'a' or 'b'.
df %>%
  mutate_at(
    vars(contains(match = c('a', 'b'))),
    sqrt
  )
ab ba c dc
1 1.000000 2.449490 11 16
2 1.414214 2.645751 12 17
3 1.732051 2.828427 13 18
4 2.000000 3.000000 14 19
5 2.236068 3.162278 15 20

_at/_all are deprecated in favor of across

# matches() takes a single regular expression (its second positional
# argument is ignore.case), so use alternation to select the columns whose
# name contains "a" or "b".
df %>%
  mutate(across(matches('a|b'), sqrt))

-output

        ab       ba  c dc
1 1.000000 2.449490 11 16
2 1.414214 2.645751 12 17
3 1.732051 2.828427 13 18
4 2.000000 3.000000 14 19
5 2.236068 3.162278 15 20

Use case_when and startsWith to selectively mutate by row

According to ?startsWith

x -vector of character string whose “starts” are considered.

So, startsWith expects the class to be character and here it is factor class. Converting it to character class would solve the issue

library(dplyr)
# startsWith() needs character input, so convert `a` first; values that
# start with "a" are labelled "letter", everything else "number".
df %>%
  mutate(
    b = case_when(
      startsWith(as.character(a), "a") ~ "letter",
      TRUE ~ "number"
    )
  )
# a b
#1 abc letter
#2 123 number
#3 abc letter
#4 123 number

Before R 4.0.0, the default behavior of data.frame was stringsAsFactors = TRUE (since R 4.0.0 the default is FALSE). If we specify stringsAsFactors = FALSE, the 'a' column will be character class


Another option is str_detect to create a logical expression by checking if the character from the start (^) of the string is a digit ([0-9])

library(stringr)
library(dplyr)
# str_detect() is FALSE/TRUE, so adding 1 gives index 1 ("letter") when `a`
# does not start with a digit and index 2 ("number") when it does.
df %>%
  mutate(
    b = c("letter", "number")[1 + str_detect(a, "^[0-9]")]
  )
# a b
#1 abc letter
#2 123 number
#3 abc letter
#4 123 number


Related Topics



Leave a reply



Submit