Apply Tidyr::Separate Over Multiple Columns

Apply tidyr::separate over multiple columns

You could feed a customized separate_() call into Reduce().

# Split one delimited column into <col>_col_1, <col>_col_2, ... columns.
#
# Intended as the reducer for Reduce(): it is called as sep(data, col).
#
# ...: forwarded unchanged to tidyr::separate_(). The first argument must be
#      the data frame and the second the name (a string) of the column to
#      split; any further arguments (e.g. sep =) are passed through as well.
# Returns whatever separate_() returns for those arguments.
sep <- function(...) {
    dots <- list(...)
    # str_count() is vectorised: it yields one count of digit runs per row.
    counts <- stringr::str_count(dots[[1]][[dots[[2]]]], "\\d+")
    # Size the output for the widest row. `1:n` with a vector `n` would use
    # only the first row's count; max(0L, ...) also copes with empty input.
    n <- max(0L, counts, na.rm = TRUE)
    tidyr::separate_(..., into = sprintf("%s_col_%d", dots[[2]], seq_len(n)))
}

# The pipe passes df as the first unnamed argument of Reduce(). Because f and
# x are supplied by name, df is matched to `init`, so the call evaluates as
# sep(sep(df, "a"), "b").
df %>% Reduce(f = sep, x = c("a", "b"))
#   a_col_1 a_col_2 a_col_3 b_col_1 b_col_2 b_col_3
# 1    5312    2020    1212     345     982     284

Otherwise, cSplit will do it too.

# Every column name of df is passed as a column to split; no separator is
# supplied, so cSplit() falls back to its default.
splitstackshape::cSplit(df, names(df))
#     a_1  a_2  a_3 b_1 b_2 b_3
# 1: 5312 2020 1212 345 982 284

Tidy method to split multiple columns using tidyr::separate

Could try:

library(tidyverse)

# Split every column of df on " of " into <col>_attempted / <col>_landed and
# bind the resulting two-column pieces back together side by side.
names(df) %>%
  map(
    function(x) {
      df %>%
        # x holds a column name as a string. all_of() selects the column it
        # names; a bare select(x) is ambiguous (it would pick a column that is
        # literally called "x") and is deprecated for external vectors.
        select(all_of(x)) %>%
        separate(x,
                 into = paste0(x, c("_attempted", "_landed")),
                 sep = " of ")
    }
  ) %>%
  bind_cols()

Output:

# A tibble: 6 x 10
  A_attempted A_landed B_attempted B_landed C_attempted C_landed D_attempted D_landed E_attempted E_landed
  <chr>       <chr>    <chr>       <chr>    <chr>       <chr>    <chr>       <chr>    <chr>       <chr>   
1 3           5        2           2        10          21       0           0        8           16      
2 1           2        2           4        3           14       0           0        3           15      
3 1           3        0           1        11          34       0           0        10          32      
4 1           3        0           0        10          35       0           0        6           28      
5 3           4        0           0        16          53       0           0        13          49      
6 2           7        0           0        17          62       0           0        9           48

As OP suggests we can indeed avoid the last step with map_dfc:

# Same split as above, but map_dfc() column-binds the pieces itself.
# all_of(.x) selects the column whose name is stored in .x; a bare select(.x)
# relies on the deprecated, ambiguous use of an external vector in a selection.
names(df) %>%
  map_dfc(~ df %>%
             select(all_of(.x)) %>%
             separate(.x,
                      into = paste0(.x, c("_attempted", "_landed")),
                      sep = " of ")
           )

tidyr: separate column while retaining delimiter in the first column

You can use tidyr::extract with capture groups.

# Two capture groups, one per output column:
#   '(.*_)'  - greedy, so it runs up to and including the last underscore that
#              is still followed by word characters; the delimiter therefore
#              stays in `strain`.
#   '(\\w+)' - the remaining word characters, which become the new `sample`.
tidyr::extract(duplicates, sample, c("strain", "sample"), '(.*_)(\\w+)')

#   strain sample
#1    a_1_     b1
#2   a1_2_     b1
#3 a1_c_1_     b2

The same regex can also be used with strcapture in base R -

# `proto` supplies the names of the output columns and their types: each
# capture group is converted to the type of the matching proto element
# (both are character here).
strcapture('(.*_)(\\w+)', duplicates$sample, 
           proto = list(strain = character(), sample = character()))

How to use separate in tidyverse to split a column?

We can use the `extra` argument. Also, by default, `sep` is interpreted as a regular expression - according to the ?separate documentation:

sep - If character, sep is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.

and . is a metacharacter which can match any character. Therefore, we may need to either escape (\\.) or place it in square brackets ([.]). Also, based on the dput, the column is a list, which should be unnested first before doing the separate

library(dplyr)
library(tidyr)
# Keep the needed columns, flatten the list column so each element gets its
# own row, then split on a literal dot ("\\." escapes the regex metacharacter).
# Rows with NA in any column are dropped at the end.
jimma3 %>%
  select(Enterdateofexam2, Enterdayofexam, UniqueKey, MEDICALRECORD) %>%
  unnest(Enterdateofexam2) %>%
  separate(
    col = Enterdateofexam2,
    into = c("day", "month"),
    sep = "\\.",
    extra = "merge",
    convert = TRUE
  ) %>%
  na.omit()

-output

# A tibble: 6 x 5
    day month Enterdayofexam UniqueKey MEDICALRECORD
  <int> <int> <chr>          <chr>     <chr>        
1     7     6 1              530       577207       
2     8     6 2              530       577207       
3     9     6 3              530       577207       
4     2    12 1              531       575333       
5     3    12 2              531       575333       
6     4    12 3              531       575333

Basically, with sep = ".", the unescaped dot is treated as a regular expression that matches any character, so the value is split at every single character; that is why the warning appeared.

data

jimma3 <- structure(list(Enterdateofexam2 = list(c("", "7.06"), c("", "8.06"
), c("", "9.06"), c("", "2.12"), c("", "3.12"), c("", "4.12")), 
    Enterdayofexam = c("1", "2", "3", "1", "2", "3"), UniqueKey = c("530", 
    "530", "530", "531", "531", "531"), MEDICALRECORD = c("577207", 
    "577207", "577207", "575333", "575333", "575333")), row.names = c(NA, 
-6L), class = c("tbl_df", "tbl", "data.frame"))

Split multiple columns into multiple columns using r

I suggest a reshape2 solution taking care of not knowing the number of parts:

> dput(pz1)
structure(list(id = c("HG00096", "HG00097", "HG00098", "HG00099"
), sub = c("GBR", "GBR", "GBR", "GBR"), HLA_A1 = c("01:01:01:01/01:01:01:02N", 
"03:01:01:01/03:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N", 
"03:01:01:01"), HLA_A2 = c("29:02:01", "30:08:01", "29:02:01", 
"30:08:01"), HLA_B1 = c("08:01:01/08:19N", "09:02:01/08:19N", 
"08:01:01/08:19N", "09:02:01/08:19N"), HLA_B2 = c("44:03:01/44:03:03/44:03:04", 
"44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04"
), HLA_C1 = c("07:01:01/07:01:02", "07:01:01/07:01:02", "07:09:01/07:01:02", 
"07:08:01/07:01:02")), .Names = c("id", "sub", "HLA_A1", "HLA_A2", 
"HLA_B1", "HLA_B2", "HLA_C1"), row.names = c(NA, -4L), class = "data.frame")

add this function:

# Load reshape2 from the default library paths; a hard-coded lib.loc only
# resolves on the machine it was written on.
library(reshape2)

# Split one "/"-delimited column of `df` into as many new columns as the
# longest entry needs.
#
# df:  data frame (or tibble) holding the column to split.
# col: the column to split, given as a single position or a single name.
# Returns the data frame produced by reshape2::colsplit(), with columns named
# <column>_A, <column>_B, ... (one per part of the longest entry).
getIt <- function(df, col) {
  # [[ ]] always yields a plain vector, whereas df[, col] stays a data frame
  # for tibbles; as.character() lets factor columns through strsplit().
  values <- as.character(df[[col]])
  # names(df)[col] is NA when col is already a name, so only index by position.
  col_name <- if (is.character(col)) col else names(df)[col]
  # Largest number of parts in any row; vapply() keeps the result an integer.
  n_parts <- max(0L, vapply(strsplit(values, split = "/"), length, integer(1)))
  reshape2::colsplit(
    string = values,
    pattern = "/",
    names = paste0(col_name, "_", LETTERS[seq_len(n_parts)])
  )
}

after you have this function you can easily do:

> getIt(pz1,3)
     HLA_A1_A     HLA_A1_B HLA_A1_C
1 01:01:01:01 01:01:01:02N         
2 03:01:01:01 03:01:01:02N         
3 01:01:01:01 01:01:01:02N   01:22N
4 03:01:01:01

and a simple cbind with the original dataframe (with or without the original columns) :

> cbind(pz1[,1:2],getIt(pz1,3),getIt(pz1,4),getIt(pz1,5),getIt(pz1,6))
       id sub    HLA_A1_A     HLA_A1_B HLA_A1_C HLA_A2_A HLA_B1_A HLA_B1_B HLA_B2_A HLA_B2_B HLA_B2_C
1 HG00096 GBR 01:01:01:01 01:01:01:02N          29:02:01 08:01:01   08:19N 44:03:01 44:03:03 44:03:04
2 HG00097 GBR 03:01:01:01 03:01:01:02N          30:08:01 09:02:01   08:19N 44:03:01 44:03:03 44:03:04
3 HG00098 GBR 01:01:01:01 01:01:01:02N   01:22N 29:02:01 08:01:01   08:19N 44:03:01 44:03:03 44:03:04
4 HG00099 GBR 03:01:01:01                       30:08:01 09:02:01   08:19N 44:03:01 44:03:03 44:03:04

How do I separate a string with different (& repeated) separators into multiple columns?

There are many good answers already; here is one more variation:

# Replace every punctuation character with a space, then separate. separate()
# is called without `sep`, so its default (any run of non-alphanumeric
# characters) splits on the inserted spaces.
  df %>% 
    mutate(game=str_replace_all(game,"[:punct:]"," ")) %>% 
    separate(col = game,into = c("year", "day", "month", "monthday", "site", "team", "decision", "runs1", "runs2"))

Separate a String using Tidyr's separate into Multiple Columns and then Create a New Column with Counts

We can try with str_count

library(stringr)
# Split Goal into Goal1..Goal4 while keeping the original column
# (remove = FALSE), so the goals can still be counted as "commas + 1" before
# the original column is dropped.
df %>%
  separate(
    col = Goal,
    into = paste0("Goal", 1:4),
    sep = ",",
    remove = FALSE
  ) %>%
  mutate(Count = 1 + str_count(Goal, ",")) %>%
  select(-Goal)
#  Name             Goal1          Goal2             Goal3         Goal4 Count
#  <chr>             <chr>          <chr>             <chr>         <chr> <dbl>
#1  John Go back to school Learn to drive     Learn to cook          <NA>     3
#2 Chris Go back to school      Get a job Learn a new Skill Learn to cook     4
#3  Andy    Learn to drive  Learn to Cook              <NA>          <NA>     2

using separate_rows in tidyr over many columns when some columns do not have delimiters

Ok, the easiest solution might be installing the development version of tidyr (0.8.3.9000) since it seems to be fixed there. Use devtools::install_github("tidyverse/tidyr") to achieve that.

However, for a workaround for those who can't update or don't want to use a prerelease version of the package, we can count the required number of separators in each row and fill the missing values in the columns with separators. That lets separate_rows work and creates empty strings, which we then replace back with NA.

library(tidyverse)
# Example data: C is NA for Toyota although B holds two values there, and the
# last row is NA in every column.
A <- c("Acura", "BMW", "Toyota", NA)
B <- c("1993;2004;2010", "2013", "2003;2011", NA)
C <- c("Blue;Black;Gold", "Silver", NA, NA)
df <- data.frame(A = A, B = B, C = C, stringsAsFactors = FALSE)

df %>%
  # seps: a string made of as many ";" as B contains in that row
  mutate(seps = str_pad("", width = str_count(B, ";"), pad = ";")) %>%
  # fill NA in B / C with that run of separators, so both columns split into
  # the same number of pieces
  mutate_at(vars(B, C), ~ coalesce(., seps)) %>%
  separate_rows(B, C, sep = ";") %>%
  # the filled-in pieces come out as empty strings; turn them back into NA
  mutate_at(vars(B, C), ~ str_replace(., "^$", NA_character_))
# Note: the helper column `seps` is still part of the result shown below.
#>        A    B      C seps
#> 1  Acura 1993   Blue   ;;
#> 2  Acura 2004  Black   ;;
#> 3  Acura 2010   Gold   ;;
#> 4    BMW 2013 Silver     
#> 5 Toyota 2003   <NA>    ;
#> 6 Toyota 2011   <NA>    ;
#> 7   <NA> <NA>   <NA> <NA>

Created on 2019-07-01 by the reprex package (v0.3.0)

tidyr use separate_rows over multiple columns

You can use a pipe. Note that sep = ", " is automatically detected.

# Each separate_rows() call expands one column on its own, so rows whose b and
# c hold different numbers of values still work (see a = 1 and a = 2 below).
d %>% separate_rows(b) %>% separate_rows(c)
#   a     b      c
# 1 1 name1  name7
# 2 1 name2  name7
# 3 1 name3  name7
# 4 2 name4  name8
# 5 2 name4  name9
# 6 3 name5 name10
# 7 3 name6 name10

Note: Using tidyr version 0.6.0, where the %>% operator is included in the package.

Update: Following @doscendodiscimus's comment, we could use a for() loop and reassign d in each iteration. This way we can handle as many columns as we like. Because the column names are held in a character vector, we need to switch to the standard evaluation version, separate_rows_.

# Names of the columns to expand, as strings.
cols <- c("b", "c")
# Split one column per pass, overwriting d so that each pass works on the
# result of the previous one.
for(col in cols) {
    d <- separate_rows_(d, col)
}

which gives the updated d

  a     b      c
1 1 name1  name7
2 1 name2  name7
3 1 name3  name7
4 2 name4  name8
5 2 name4  name9
6 3 name5 name10
7 3 name6 name10

Apply Tidyr::Separate Over Multiple Columns