Apply Tidyr::Separate Over Multiple Columns

Apply tidyr::separate over multiple columns

You could feed a customized separate_() call into Reduce().

# Split one delimited column of a data frame into numbered columns.
#
# Designed to be used as the `f` of Reduce(): the first argument is the data
# frame accumulated so far, the second is the name (a string) of the column to
# split, and any further arguments are forwarded to tidyr::separate().
# The new columns are named "<col>_col_<i>".
sep <- function(...) {
  split_column <- function(data, col, ...) {
    # str_count() returns one count per row, so take the largest; `1:n` on the
    # whole vector would silently use only the first row's count.
    n_parts <- max(stringr::str_count(data[[col]], "\\d+"), na.rm = TRUE)
    # separate() with all_of() replaces the deprecated separate_().
    tidyr::separate(
      data,
      tidyr::all_of(col),
      into = sprintf("%s_col_%d", col, seq_len(n_parts)),
      ...
    )
  }
  split_column(...)
}

# `df` is the starting value (`init`); sep() is then applied once per name in `x`.
Reduce(f = sep, x = c("a", "b"), init = df)
# a_col_1 a_col_2 a_col_3 b_col_1 b_col_2 b_col_3
# 1 5312 2020 1212 345 982 284

Otherwise, cSplit will do it too.

# Split every column of `df`, using cSplit()'s default separator.
splitstackshape::cSplit(indt = df, splitCols = names(df))
# a_1 a_2 a_3 b_1 b_2 b_3
# 1: 5312 2020 1212 345 982 284

Tidy method to split multiple columns using tidyr::separate

Could try:

library(tidyverse)

# For every column of `df`, split "<attempted> of <landed>" into two columns
# named after the source column, then bind the pieces back side by side.
names(df) %>%
  map(function(col_name) {
    df %>%
      # all_of() marks col_name as a string holding a column name, so a column
      # that happens to be called `col_name` can never be picked by mistake.
      select(all_of(col_name)) %>%
      separate(
        all_of(col_name),
        into = paste0(col_name, c("_attempted", "_landed")),
        sep = " of "
      )
  }) %>%
  bind_cols()

Output:

# A tibble: 6 x 10
A_attempted A_landed B_attempted B_landed C_attempted C_landed D_attempted D_landed E_attempted E_landed
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 3 5 2 2 10 21 0 0 8 16
2 1 2 2 4 3 14 0 0 3 15
3 1 3 0 1 11 34 0 0 10 32
4 1 3 0 0 10 35 0 0 6 28
5 3 4 0 0 16 53 0 0 13 49
6 2 7 0 0 17 62 0 0 9 48

As OP suggests we can indeed avoid the last step with map_dfc:

# Same per-column split, but map_dfc() column-binds the results directly.
# all_of() marks .x as a string holding a column name rather than a bare
# column reference.
names(df) %>%
  map_dfc(
    ~ df %>%
      select(all_of(.x)) %>%
      separate(
        all_of(.x),
        into = paste0(.x, c("_attempted", "_landed")),
        sep = " of "
      )
  )

tidyr: separate column while retaining delimiter in the first column

You can use tidyr::extract with capture groups.

# Group 1 keeps everything up to and including the last "_"; group 2 is the rest.
tidyr::extract(
  duplicates,
  col = sample,
  into = c("strain", "sample"),
  regex = "(.*_)(\\w+)"
)

# strain sample
#1 a_1_ b1
#2 a1_2_ b1
#3 a1_c_1_ b2

The same regex can also be used with strcapture in base R -

# `proto` supplies the names and types of the columns built from the two groups.
strcapture(
  pattern = "(.*_)(\\w+)",
  x = duplicates$sample,
  proto = list(strain = character(), sample = character())
)

How to use separate in tidyverse to split a column?

We can use the `extra` argument. Also, by default `sep` is interpreted as a regular expression. According to the `?separate` documentation:

sep - If character, sep is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.

and . is a metacharacter which can match any character. Therefore, we may need to either escape (\\.) or place it in square brackets ([.]). Also, based on the dput, the column is a list, which should be unnested first before doing the separate

library(dplyr)
library(tidyr)
jimma3 %>%
  select(Enterdateofexam2, Enterdayofexam, UniqueKey, MEDICALRECORD) %>%
  # Flatten the list column so that each element gets its own row.
  unnest(Enterdateofexam2) %>%
  # "\\." matches a literal dot; a bare "." would match any character.
  separate(
    Enterdateofexam2,
    into = c("day", "month"),
    sep = "\\.",
    convert = TRUE,
    extra = "merge"
  ) %>%
  na.omit()

-output

# A tibble: 6 x 5
day month Enterdayofexam UniqueKey MEDICALRECORD
<int> <int> <chr> <chr> <chr>
1 7 6 1 530 577207
2 8 6 2 530 577207
3 9 6 3 530 577207
4 2 12 1 531 575333
5 3 12 2 531 575333
6 4 12 3 531 575333

Basically, with sep = ".", the dot is treated as a regex wildcard, so the string is split at every single character; that is why the warning popped up.

data

# Example data: `Enterdateofexam2` is a list column whose elements each hold an
# empty string followed by a "day.month" string.
jimma3 <- structure(
  list(
    Enterdateofexam2 = lapply(
      c("7.06", "8.06", "9.06", "2.12", "3.12", "4.12"),
      function(exam_date) c("", exam_date)
    ),
    Enterdayofexam = rep(c("1", "2", "3"), times = 2),
    UniqueKey = rep(c("530", "531"), each = 3),
    MEDICALRECORD = rep(c("577207", "575333"), each = 3)
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)

Split multiple columns into multiple columns using r

I suggest a reshape2 solution taking care of not knowing the number of parts:

> dput(pz1)
structure(list(id = c("HG00096", "HG00097", "HG00098", "HG00099"
), sub = c("GBR", "GBR", "GBR", "GBR"), HLA_A1 = c("01:01:01:01/01:01:01:02N",
"03:01:01:01/03:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N",
"03:01:01:01"), HLA_A2 = c("29:02:01", "30:08:01", "29:02:01",
"30:08:01"), HLA_B1 = c("08:01:01/08:19N", "09:02:01/08:19N",
"08:01:01/08:19N", "09:02:01/08:19N"), HLA_B2 = c("44:03:01/44:03:03/44:03:04",
"44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04"
), HLA_C1 = c("07:01:01/07:01:02", "07:01:01/07:01:02", "07:09:01/07:01:02",
"07:08:01/07:01:02")), .Names = c("id", "sub", "HLA_A1", "HLA_A2",
"HLA_B1", "HLA_B2", "HLA_C1"), row.names = c(NA, -4L), class = "data.frame")

add this function:

library("reshape2", lib.loc="~/R/win-library/3.3")

# Split one "/"-delimited column of `df` into as many columns as its longest
# entry needs.
#
# df  : a data frame (or tibble).
# col : the column to split, given either as a position or as a name.
#
# Returns the data frame produced by reshape2::colsplit(), with columns named
# "<column>_A", "<column>_B", ...
getIt <- function(df, col) {
  # names(df)[col] only works for positions, so resolve a name separately.
  col_name <- if (is.character(col)) col else names(df)[col]
  # [[ ]] returns a plain vector for data frames and tibbles alike;
  # as.character() guards against factor columns, which strsplit() rejects.
  values <- as.character(df[[col]])
  # Largest number of parts in any row; at least one so empty input still works.
  n_parts <- max(lengths(strsplit(values, split = "/", fixed = TRUE)), 1L)
  reshape2::colsplit(
    string = values,
    pattern = "/",
    names = paste0(col_name, "_", LETTERS[seq_len(n_parts)])
  )
}

after you have this function you can easily do:

> getIt(pz1,3)
HLA_A1_A HLA_A1_B HLA_A1_C
1 01:01:01:01 01:01:01:02N
2 03:01:01:01 03:01:01:02N
3 01:01:01:01 01:01:01:02N 01:22N
4 03:01:01:01

and a simple cbind with the original dataframe (with or without the original columns) :

> cbind(pz1[,1:2],getIt(pz1,3),getIt(pz1,4),getIt(pz1,5),getIt(pz1,6))
id sub HLA_A1_A HLA_A1_B HLA_A1_C HLA_A2_A HLA_B1_A HLA_B1_B HLA_B2_A HLA_B2_B HLA_B2_C
1 HG00096 GBR 01:01:01:01 01:01:01:02N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
2 HG00097 GBR 03:01:01:01 03:01:01:02N 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04
3 HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04
4 HG00099 GBR 03:01:01:01 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04

How do I separate a string with different (& repeated) separators into multiple columns?

There are many good answers already; here is one more variation.

# Replace all punctuation with a space, then separate into the named columns.
df %>%
  mutate(game = str_replace_all(game, "[:punct:]", " ")) %>%
  separate(
    col = game,
    into = c(
      "year", "day", "month", "monthday", "site",
      "team", "decision", "runs1", "runs2"
    )
  )

Separate a String using Tidyr's separate into Multiple Columns and then Create a New Column with Counts

We can try with str_count

library(stringr)
df %>%
  # Keep `Goal` (remove = FALSE) so its separators can still be counted below.
  separate(
    col = Goal,
    into = paste0("Goal", 1:4),
    sep = ",",
    remove = FALSE
  ) %>%
  # The number of goals is one more than the number of commas.
  mutate(Count = str_count(Goal, ",") + 1) %>%
  select(-Goal)
# Name Goal1 Goal2 Goal3 Goal4 Count
# <chr> <chr> <chr> <chr> <chr> <dbl>
#1 John Go back to school Learn to drive Learn to cook <NA> 3
#2 Chris Go back to school Get a job Learn a new Skill Learn to cook 4
#3 Andy Learn to drive Learn to Cook <NA> <NA> 2

using separate_rows in tidyr over many columns when some columns do not have delimiters

Ok, the easiest solution might be installing the development version of tidyr (0.8.3.9000) since it seems to be fixed there. Use devtools::install_github("tidyverse/tidyr") to achieve that.

However, for a workaround for those who can't update or don't want to use a prerelease version of the package, we can count the required number of separators in each row and fill the missing values in the columns with separators. That lets separate_rows work and creates empty strings, which we then replace back with NA.

library(tidyverse)
A <- c("Acura", "BMW", "Toyota", NA)
B <- c("1993;2004;2010", "2013", "2003;2011", NA)
C <- c("Blue;Black;Gold", "Silver", NA, NA)
df <- data.frame(A = A, B = B, C = C, stringsAsFactors = FALSE)

df %>%
  # Build, per row, a string holding as many ";" as column B contains.
  mutate(seps = str_pad("", width = str_count(B, ";"), pad = ";")) %>%
  # Fill missing B/C values with those separators so separate_rows() can run.
  mutate_at(vars(B, C), ~ coalesce(.x, seps)) %>%
  separate_rows(B, C, sep = ";") %>%
  # Turn the empty strings created by the filler back into NA.
  mutate_at(vars(B, C), ~ str_replace(.x, "^$", NA_character_))
#> A B C seps
#> 1 Acura 1993 Blue ;;
#> 2 Acura 2004 Black ;;
#> 3 Acura 2010 Gold ;;
#> 4 BMW 2013 Silver
#> 5 Toyota 2003 <NA> ;
#> 6 Toyota 2011 <NA> ;
#> 7 <NA> <NA> <NA> <NA>

Created on 2019-07-01 by the reprex package (v0.3.0)

tidyr use separate_rows over multiple columns

You can use a pipe. Note that `sep` does not have to be given: the default `sep` of separate_rows() is a regular expression that already matches ", ".

# Split `b` first and then `c`, one separate_rows() call per column.
d %>%
  separate_rows(b) %>%
  separate_rows(c)
# a b c
# 1 1 name1 name7
# 2 1 name2 name7
# 3 1 name3 name7
# 4 2 name4 name8
# 5 2 name4 name9
# 6 3 name5 name10
# 7 3 name6 name10

Note: Using tidyr version 0.6.0, where the %>% operator is included in the package.


Update: Using @doscendodiscimus comment, we could use a for() loop and reassign d in each iteration. This way we can have as many columns as we like. We will use a character vector of column names, so we'll need to switch to the standard evaluation version, separate_rows_.

cols <- c("b", "c")
# separate_rows_() is deprecated; all_of() lets separate_rows() accept a
# column name held in a string, so `d` is still re-split once per column.
for (col in cols) {
  d <- tidyr::separate_rows(d, tidyr::all_of(col))
}

which gives the updated d

  a     b      c
1 1 name1 name7
2 1 name2 name7
3 1 name3 name7
4 2 name4 name8
5 2 name4 name9
6 3 name5 name10
7 3 name6 name10


Related Topics



Leave a reply



Submit