Separate a Column of a Dataframe in Undefined Number of Columns with R/Tidyverse

Separate a column of a dataframe in undefined number of columns with R/tidyverse

First count the maximum number of pieces any value can be split into, then pass that many column names to separate().

# Largest number of pieces any value of `x` splits into: one more than its
# count of literal dots. `na.rm = TRUE` keeps a missing value in `x` from
# turning `nmax` into NA, which would make `seq_len()` fail below.
nmax <- max(stringr::str_count(df$x, "\\."), na.rm = TRUE) + 1
# Split `x` on literal dots into col1..col<nmax>; rows with fewer pieces are
# padded with NA on the right.
tidyr::separate(
  df, x,
  into = paste0("col", seq_len(nmax)),
  sep = "\\.",
  fill = "right"
)

#  col1 col2 col3
#1    a <NA> <NA>
#2    a    b <NA>
#3    a    b    c
#4    a    b    d
#5    a    d <NA>

How to use tidyr::separate when the number of needed variables is unknown

We could use cSplit

library(splitstackshape)
# Split the comma-delimited `to` column of `dat` into separate columns.
cSplit(dat, splitCols = "to", sep = ",")

tidyr: Separate a column into a variable number of columns

First reshape the data to long format with separate_rows(), then split each entry into two columns with separate(). Next, number the entries within each original row, and finally reshape the data back to wide format with pivot_wider().

library(dplyr)
library(tidyr)

data %>%
  # Tag each original row so its pieces can be regrouped after splitting.
  mutate(id = row_number()) %>%
  # One row per comma-separated entry of `variables`.
  separate_rows(variables, sep = ",") %>%
  # Split each entry at ":" into its two parts.
  separate(variables, into = c("question", "time"), sep = ":") %>%
  group_by(id) %>%
  # Overwrite `time` with the entry's position within its original row.
  mutate(time = row_number()) %>%
  ungroup() %>%
  # Back to one row per `id`, with one `pos_<question>` column per question.
  pivot_wider(
    names_from = question,
    values_from = time,
    names_prefix = "pos_"
  ) %>%
  select(-id)

# A tibble: 3 x 5
#  pos_q1 pos_q2 pos_q3 pos_q4 pos_q5
#   <int>  <int>  <int>  <int>  <int>
#1      1      2      3      4      5
#2      2      1      3      5      4
#3      1      2     NA     NA      3

How to use separate in tidyverse to split a column?

We can use the extra argument. Also, by default, sep is interpreted as a regular expression, according to the ?separate documentation:

sep - If character, sep is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.

and . is a metacharacter which can match any character. Therefore, we may need to either escape (\\.) or place it in square brackets ([.]). Also, based on the dput, the column is a list, which should be unnested first before doing the separate

library(dplyr)
library(tidyr)
jimma3 %>%
  select(Enterdateofexam2, Enterdayofexam, UniqueKey, MEDICALRECORD) %>%
  # `Enterdateofexam2` is a list column: expand it to one row per element.
  unnest(Enterdateofexam2) %>%
  # Split on a literal dot; `convert = TRUE` type-converts the new columns.
  separate(
    Enterdateofexam2,
    into = c("day", "month"),
    sep = "\\.",
    convert = TRUE,
    extra = "merge"
  ) %>%
  # Drop every row that contains an NA.
  na.omit()

-output

# A tibble: 6 x 5
    day month Enterdayofexam UniqueKey MEDICALRECORD
  <int> <int> <chr>          <chr>     <chr>        
1     7     6 1              530       577207       
2     8     6 2              530       577207       
3     9     6 3              530       577207       
4     2    12 1              531       575333       
5     3    12 2              531       575333       
6     4    12 3              531       575333

Basically, with sep = ".", the regular expression . matches every single character, so the string is split at every character; that is why the warning appeared.

data

# Example data (dput output). `Enterdateofexam2` is a list column whose
# elements are length-2 character vectors (an empty string and a "day.month"
# string); the other three columns are character vectors. The class attribute
# marks the object as a tibble.
jimma3 <- structure(list(Enterdateofexam2 = list(c("", "7.06"), c("", "8.06"
), c("", "9.06"), c("", "2.12"), c("", "3.12"), c("", "4.12")), 
    Enterdayofexam = c("1", "2", "3", "1", "2", "3"), UniqueKey = c("530", 
    "530", "530", "531", "531", "531"), MEDICALRECORD = c("577207", 
    "577207", "577207", "575333", "575333", "575333")), row.names = c(NA, 
-6L), class = c("tbl_df", "tbl", "data.frame"))

How to split a column into multiple (non equal) columns in R

We could use cSplit from splitstackshape

library(splitstackshape)
# Split the comma-delimited `Col1` into Col1_1, Col1_2, ... columns.
cSplit(DF, splitCols = "Col1", sep = ",")

-output

cSplit(DF, "Col1",",")
   Col1_1 Col1_2 Col1_3 Col1_4
1:      a      b      c   <NA>
2:      a      b   <NA>   <NA>
3:      a      b      c      d

Splitting string column into many using tidyverse

Something like this:

library(tidyverse)

df1 %>%
  # Remember which original row each value came from.
  mutate(id = row_number()) %>%
  # One row per individual value of `X`.
  separate_rows(X) %>%
  group_by(id) %>%
  # Flag every (id, value) pair that is present.
  mutate(Y = "yes") %>%
  # One column per distinct value of `X`; absent combinations become "no".
  # pivot_wider() is the current replacement for the superseded spread();
  # `names_sort = TRUE` keeps the new columns in sorted order, as spread()
  # did (requires tidyr >= 1.1.0). The result stays grouped by `id`.
  pivot_wider(
    names_from = X,
    values_from = Y,
    values_fill = list(Y = "no"),
    names_sort = TRUE
  )

# A tibble: 4 x 4
# Groups:   id [4]
     id A     B     C    
  <int> <chr> <chr> <chr>
1     1 yes   yes   no   
2     2 yes   yes   yes  
3     3 yes   no    no   
4     4 yes   no    yes

Keep the last n columns only outputted by separate by delimiter

One way in base R is to split each string at "/" and select the last 3 elements of each resulting vector.

# Split each path on "/" and keep its last three components, one row per path
# (columns V1, V2, V3). vapply() with a `character(3)` template always yields
# a 3-row matrix and fails loudly on a path with fewer than three components,
# whereas sapply() would silently return a list in that case.
as.data.frame(t(vapply(
  strsplit(as.character(example.df$path), "/"),
  tail,
  FUN.VALUE = character(3),
  n = 3
)))

#         V1        V2                          V3
#1 location1 categoryA eyoshdzjow_random_image.txt
#2 location2 categoryB jdugnbtudg_random_image.txt

Using tidyverse, we can get the data in long format, select last 3 entries in each row and get the data in wide format.

library(tidyverse)

example.df %>%
  # Number the rows so each path's components can be regrouped.
  mutate(row = row_number()) %>%
  # One row per "/"-separated path component.
  separate_rows(path, sep = "/") %>%
  group_by(row) %>%
  # Keep the last three components of each path ...
  slice((n() - 2):n()) %>%
  # ... and label them in order.
  mutate(cols = c("location", "category", "filename")) %>%
  # One column per label, one row per original path.
  pivot_wider(names_from = cols, values_from = path) %>%
  ungroup() %>%
  select(-row)

# A tibble: 2 x 3
#  location  category  filename                   
#  <chr>     <chr>     <chr>                      
#1 location1 categoryA eyoshdzjow_random_image.txt
#2 location2 categoryB jdugnbtudg_random_image.txt

Or similar concept as base R but using tidyverse

example.df %>%
  mutate(
    # Last three "/"-separated components of each path, named dir1..dir3 so
    # that unnest_wider() takes its column names from them.
    temp = map(str_split(path, "/"), function(parts) {
      last <- tail(parts, 3)
      setNames(last, paste0("dir", seq_along(last)))
    })
  ) %>%
  unnest_wider(temp) %>%
  # Drop the original column by name. Naming the pieces (rather than renaming
  # all columns through `names_repair`) leaves any other columns of
  # `example.df` untouched.
  select(-path)

separate column with unknown name

You could use strsplit().

# Extract the packed column with `[[`, which yields a plain character vector
# for both data.frames and tibbles. (`df[, -1]` stays a one-column tibble when
# the tibble package is loaded, and gsub() cannot process that correctly.)
# The asterisks are stripped before splitting.
packed <- gsub("\\*", "", df[[2]])
# In the example data every string starts with a space, so the first piece of
# each split is empty; `[, -1]` removes that empty column. `drop = FALSE`
# keeps a matrix even when there is only one row.
split <- do.call(rbind, strsplit(packed, " "))[, -1, drop = FALSE]
df1 <- data.frame(df[[1]], split)
# Convert every column to numeric (via character, in case of factors).
df1[] <- lapply(df1, function(x) as.numeric(as.character(x)))
# Column names: the first column's name followed by the dot-separated pieces
# of the packed column's name.
names(df1) <- unlist(strsplit(names(df), split = ".", fixed = TRUE))

> df1
   header   ST adk fumC gyrB icd mdh purA recA
1       1   10  10   11    4   8   8    8    2
2       2   48   6   11    4   8   8    8    2
3       3   58   6    4    4  16  24    8   14
4       4   88   6    4   12   1  20   12    7
5       5  117  20   45   41  43   5   32    2
6       6 7036 526    7    1   1   8   71    6
7       7  101  43   41   15  18  11    7    6
8       8 3595 112   11    5  12   8   88   86
9       9  117  20   45   41  43   5   32    2
10     10  744  10   11  135   8   8    8    2

Data

# Example data (dput output): 10 rows. `header` is 1:10; the second column
# holds space-separated values with a leading space (some values carry a
# trailing `*`), and its dot-separated name lists the target column names.
df <-structure(list(header = 1:10, ST.adk.fumC.gyrB.icd.mdh.purA.recA = c(" 10 10 11 4 8 8 8 2", 
                                                                     " 48 6 11 4 8 8 8 2", " 58 6 4 4 16 24 8 14", " 88* 6* 4 12 1 20 12 7", 
                                                                     " 117 20 45 41 43 5 32 2", " 7036 526 7 1 1 8 71 6", " 101 43 41 15 18 11 7 6", 
                                                                     " 3595 112 11 5 12 8 88 86", " 117 20 45 41 43 5 32 2", " 744 10 11 135 8 8 8 2"
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))

R: Split Variable Column into multiple (unbalanced) columns by comma

From Ananda's splitstackshape package:

# Split the comma-delimited `Events` column into Events_1, Events_2, ...
cSplit(df, splitCols = "Events", sep = ",")
#    Name Age Number First      Events_1 Events_2 Events_3 Events_4
#1: Karen  24      8     0  Triathlon/IM Marathon      10k       5k
#2:  Kurt  39      2     0 Half-Marathon      10k       NA       NA
#3: Leah   18      0     1            NA       NA       NA       NA

Or with tidyr:

# Split `Events` on commas into Events_1..Events_4; `extra = "drop"` discards
# any pieces beyond the fourth.
separate(
  df, "Events",
  into = paste0("Events_", 1:4),
  sep = ",",
  extra = "drop"
)
#   Name Age Number               Events_1 Events_2 Events_3 Events_4 First
#1 Karen  24      8           Triathlon/IM Marathon      10k       5k     0
#2  Kurt  39      2          Half-Marathon      10k     <NA>     <NA>     0
#3 Leah   18      0                     NA     <NA>     <NA>     <NA>     1

With the data.table package:

# Convert `df` to a data.table by reference, add Events_1..Events_4 from the
# comma-split pieces of `Events`, then return the table without the original
# `Events` column. `with = FALSE` is spelled out because `F` is an ordinary,
# reassignable variable rather than a reserved word.
setDT(df)[, paste0("Events_", 1:4) := tstrsplit(Events, ",")][
  , -"Events", with = FALSE]
#    Name Age Number First               Events_1 Events_2 Events_3 Events_4
#1: Karen  24      8     0           Triathlon/IM Marathon      10k       5k
#2:  Kurt  39      2     0          Half-Marathon      10k       NA       NA
#3: Leah   18      0     1                     NA       NA       NA       NA

Data

# Example data (dput output): 3 rows. `Name` and `Events` are factors;
# `Events` holds comma-separated event names padded with leading spaces, and
# the third row's value is the padded literal string "NA", not a missing value.
df <- structure(list(Name = structure(1:3, .Label = c("Karen", "Kurt", 
"Leah "), class = "factor"), Age = c(24L, 39L, 18L), Number = c(8L, 
2L, 0L), Events = structure(c(3L, 2L, 1L), .Label = c("               NA", 
"         Half-Marathon,10k", "     Triathlon/IM,Marathon,10k,5k"
), class = "factor"), First = c(0L, 0L, 1L)), .Names = c("Name", 
"Age", "Number", "Events", "First"), class = "data.frame", row.names = c(NA, 
-3L))

Separate a Column of a Dataframe in Undefined Number of Columns with R/Tidyverse