Separate a Column of a Dataframe in Undefined Number of Columns with R/Tidyverse

Separate a column of a dataframe in undefined number of columns with R/tidyverse

You can first count the maximum number of pieces the column splits into, and then pass that many column names to separate.

# Largest number of "."-separated pieces found in any row of df$x.
# na.rm = TRUE keeps a missing value in df$x from turning nmax into NA,
# which seq_len() below would reject.
nmax <- max(stringr::str_count(df$x, "\\."), na.rm = TRUE) + 1
# Split x into col1 .. col<nmax>; fill = "right" pads shorter rows with NA
# on the right-hand side.
tidyr::separate(
  df, x,
  into = paste0("col", seq_len(nmax)),
  sep = "\\.",
  fill = "right"
)

# col1 col2 col3
#1 a <NA> <NA>
#2 a b <NA>
#3 a b c
#4 a b d
#5 a d <NA>

How to use tidyr::separate when the number of needed variables is unknown

We could use cSplit

library(splitstackshape) 
# Split the "to" column of dat on commas.
cSplit(dat, splitCols = "to", sep = ",")

tidyr: Separate a column into a variable number of columns

You can first get the data into long format with separate_rows, then use separate to split each entry into its own columns, create a row-number column within each original row, and finally reshape the data back to wide format.

library(dplyr)
library(tidyr)

# Tag each original row with an id, explode the comma-separated entries into
# one row each, then number the entries within each id and reshape to wide.
data %>%
  mutate(id = row_number()) %>%
  separate_rows(variables, sep = ",") %>%
  separate(variables, into = c("question", "time"), sep = ":") %>%
  group_by(id) %>%
  # the parsed time value is overwritten by the entry's position within its id
  mutate(time = row_number()) %>%
  ungroup() %>%
  pivot_wider(
    names_from = question,
    values_from = time,
    names_prefix = "pos_"
  ) %>%
  select(-id)

# A tibble: 3 x 5
# pos_q1 pos_q2 pos_q3 pos_q4 pos_q5
# <int> <int> <int> <int> <int>
#1 1 2 3 4 5
#2 2 1 3 5 4
#3 1 2 NA NA 3

How to use separate in tidyverse to split a column?

We can use the extra argument. Also, by default, sep is interpreted as a regular expression - according to the ?separate documentation:

sep - If character, sep is interpreted as a regular expression. The default value is a regular expression that matches any sequence of non-alphanumeric values.

and . is a metacharacter which can match any character. Therefore, we may need to either escape (\\.) or place it in square brackets ([.]). Also, based on the dput, the column is a list, which should be unnested first before doing the separate

library(dplyr)
library(tidyr)
# Flatten the list column first, then split each value on a literal "."
# into day and month, and finally drop the rows that contain NA.
jimma3 %>%
  select(Enterdateofexam2, Enterdayofexam, UniqueKey, MEDICALRECORD) %>%
  unnest(Enterdateofexam2) %>%
  separate(
    Enterdateofexam2,
    into = c("day", "month"),
    sep = "\\.",
    convert = TRUE,
    extra = "merge"
  ) %>%
  na.omit()

-output

# A tibble: 6 x 5
day month Enterdayofexam UniqueKey MEDICALRECORD
<int> <int> <chr> <chr> <chr>
1 7 6 1 530 577207
2 8 6 2 530 577207
3 9 6 3 530 577207
4 2 12 1 531 575333
5 3 12 2 531 575333
6 4 12 3 531 575333

Basically, with sep = ".", the unescaped . matches any character, so the string is split at every single character, and that is why the warning popped up

data

# Example data: a 6-row tibble. Enterdateofexam2 is a list column whose
# elements are length-2 character vectors (an empty string followed by a
# value such as "7.06"); the other three columns are character vectors.
jimma3 <- structure(list(Enterdateofexam2 = list(c("", "7.06"), c("", "8.06"
), c("", "9.06"), c("", "2.12"), c("", "3.12"), c("", "4.12")),
Enterdayofexam = c("1", "2", "3", "1", "2", "3"), UniqueKey = c("530",
"530", "530", "531", "531", "531"), MEDICALRECORD = c("577207",
"577207", "577207", "575333", "575333", "575333")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))

How to split a column into multiple (non equal) columns in R

We could use cSplit from splitstackshape

library(splitstackshape)
# Split Col1 of DF on commas.
cSplit(DF, splitCols = "Col1", sep = ",")

-output

cSplit(DF, "Col1",",")
Col1_1 Col1_2 Col1_3 Col1_4
1: a b c <NA>
2: a b <NA> <NA>
3: a b c d

Splitting string column into many using tidyverse

Something like this:

library(tidyverse)

# One row per (original row, value of X), flagged "yes"; reshaping to wide
# then fills every value missing for a row with "no".
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
# spread()'s alphabetical ordering of the new columns.
df1 %>%
  mutate(id = row_number()) %>%
  separate_rows(X) %>%
  # the grouping by id is kept on purpose so the result prints as a grouped
  # tibble; append ungroup() if that is not wanted
  group_by(id) %>%
  mutate(Y = "yes") %>%
  pivot_wider(
    names_from = X,
    values_from = Y,
    values_fill = list(Y = "no"),
    names_sort = TRUE
  )

# A tibble: 4 x 4
# Groups: id [4]
id A B C
<int> <chr> <chr> <chr>
1 1 yes yes no
2 2 yes yes yes
3 3 yes no no
4 4 yes no yes

Keep the last n columns only outputted by separate by delimiter

One way in base R is to split string at "/" and select last 3 elements from each list.

# Split every path on "/" and keep its last three components, one row per path.
# vapply() is used instead of sapply() so the result is always a 3-row
# character matrix; a path with fewer than three components now raises an
# error instead of silently producing a list.
as.data.frame(t(vapply(
  strsplit(as.character(example.df$path), "/"),
  tail,
  FUN.VALUE = character(3),
  n = 3
)))

# V1 V2 V3
#1 location1 categoryA eyoshdzjow_random_image.txt
#2 location2 categoryB jdugnbtudg_random_image.txt

Using tidyverse, we can get the data in long format, select last 3 entries in each row and get the data in wide format.

library(tidyverse)

# Number the rows, put one path component per row, keep the last three
# components of each original row, label them, and reshape back to wide.
example.df %>%
  mutate(row = row_number()) %>%
  separate_rows(path, sep = "/") %>%
  group_by(row) %>%
  slice((n() - 2):n()) %>%
  mutate(cols = c("location", "category", "filename")) %>%
  pivot_wider(names_from = cols, values_from = path) %>%
  ungroup() %>%
  select(-row)

# A tibble: 2 x 3
# location category filename
# <chr> <chr> <chr>
#1 location1 categoryA eyoshdzjow_random_image.txt
#2 location2 categoryB jdugnbtudg_random_image.txt

Or similar concept as base R but using tidyverse

# Keep the last three "/"-separated pieces of each path in a list column
# (temp), then widen that list column into separate columns.
example.df %>%
mutate(temp = map(str_split(path, "/"), tail, 3)) %>%
# NOTE(review): the names_repair function appears to rename every output
# column to dir0, dir1, ..., so dir0 would be the original path column that
# is dropped on the next line. This presumably assumes path is the only
# column of example.df - confirm before reusing on wider data.
unnest_wider(temp, names_repair = ~paste0("dir", seq_along(.) - 1)) %>%
select(-dir0)

separate column with unknown name

You could use strsplit().

# The packed column is addressed by position because its name is unknown.
# `[[` is used rather than `df[, -1]` / `df[, 1]`: it returns a plain vector
# for both data.frames and tibbles, whereas `[` on a tibble returns a
# one-column tibble that gsub() cannot process element by element.
# Every string starts with a space, so the first piece of each split is ""
# and is removed with [, -1]; drop = FALSE keeps the matrix shape even when
# there is only a single row.
split <- do.call(
  rbind,
  strsplit(gsub("\\*", "", df[[2]]), " ")
)[, -1, drop = FALSE]
df1 <- data.frame(df[[1]], split)
# Convert every column to numeric (via character, in case of factors).
df1[] <- lapply(df1, function(x) as.numeric(as.character(x)))
# The name of the packed column holds the individual column names joined by
# ".", so splitting all of names(df) on a literal "." yields the full header.
names(df1) <- unlist(strsplit(names(df), split = ".", fixed = TRUE))

> df1
header ST adk fumC gyrB icd mdh purA recA
1 1 10 10 11 4 8 8 8 2
2 2 48 6 11 4 8 8 8 2
3 3 58 6 4 4 16 24 8 14
4 4 88 6 4 12 1 20 12 7
5 5 117 20 45 41 43 5 32 2
6 6 7036 526 7 1 1 8 71 6
7 7 101 43 41 15 18 11 7 6
8 8 3595 112 11 5 12 8 88 86
9 9 117 20 45 41 43 5 32 2
10 10 744 10 11 135 8 8 8 2

Data

# Example data: 10 rows with an integer header column and one character
# column whose name joins eight field names with "." and whose values hold
# eight space-separated numbers each (with a leading space; some values in
# the fourth row carry a trailing "*").
df <-structure(list(header = 1:10, ST.adk.fumC.gyrB.icd.mdh.purA.recA = c(" 10 10 11 4 8 8 8 2", 
" 48 6 11 4 8 8 8 2", " 58 6 4 4 16 24 8 14", " 88* 6* 4 12 1 20 12 7",
" 117 20 45 41 43 5 32 2", " 7036 526 7 1 1 8 71 6", " 101 43 41 15 18 11 7 6",
" 3595 112 11 5 12 8 88 86", " 117 20 45 41 43 5 32 2", " 744 10 11 135 8 8 8 2"
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))

R: Split Variable Column into multiple (unbalanced) columns by comma

From Ananda's splitstackshape package:

# Split the Events column of df on commas.
cSplit(df, splitCols = "Events", sep = ",")
# Name Age Number First Events_1 Events_2 Events_3 Events_4
#1: Karen 24 8 0 Triathlon/IM Marathon 10k 5k
#2: Kurt 39 2 0 Half-Marathon 10k NA NA
#3: Leah 18 0 1 NA NA NA NA

Or with tidyr:

# Split Events on commas into Events_1 .. Events_4; extra = "drop" discards
# any pieces beyond the fourth.
separate(
  df, "Events",
  into = paste0("Events_", 1:4),
  sep = ",",
  extra = "drop"
)
# Name Age Number Events_1 Events_2 Events_3 Events_4 First
#1 Karen 24 8 Triathlon/IM Marathon 10k 5k 0
#2 Kurt 39 2 Half-Marathon 10k <NA> <NA> 0
#3 Leah 18 0 NA <NA> <NA> <NA> 1

With the data.table package:

# setDT() converts df to a data.table by reference; := then adds
# Events_1 .. Events_4 from the comma-split pieces of Events, and the final
# [ returns the table without the original Events column.
# with = FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
setDT(df)[
  , paste0("Events_", 1:4) := tstrsplit(Events, ",")
][
  , -"Events", with = FALSE
]
# Name Age Number First Events_1 Events_2 Events_3 Events_4
#1: Karen 24 8 0 Triathlon/IM Marathon 10k 5k
#2: Kurt 39 2 0 Half-Marathon 10k NA NA
#3: Leah 18 0 1 NA NA NA NA

Data

# Example data: a 3-row data.frame. Name and Events are factors; note that
# the first Events level is the literal string " NA" (with a leading space),
# not a missing value, and the other Events levels also start with a space.
df <- structure(list(Name = structure(1:3, .Label = c("Karen", "Kurt", 
"Leah "), class = "factor"), Age = c(24L, 39L, 18L), Number = c(8L,
2L, 0L), Events = structure(c(3L, 2L, 1L), .Label = c(" NA",
" Half-Marathon,10k", " Triathlon/IM,Marathon,10k,5k"
), class = "factor"), First = c(0L, 0L, 1L)), .Names = c("Name",
"Age", "Number", "Events", "First"), class = "data.frame", row.names = c(NA,
-3L))


Related Topics



Leave a reply



Submit