How to Use Tidyr::Separate When the Number of Needed Variables Is Unknown

How to use tidyr::separate when the number of needed variables is unknown

We could use cSplit

library(splitstackshape) 
# Split the comma-delimited 'to' column of dat with cSplit's default settings.
cSplit(indt = dat, splitCols = "to", sep = ",")

Separate a column of a dataframe in undefined number of columns with R/tidyverse

You can first count the number of columns it can take and then use separate.

# Number of pieces in the widest value: one more than its number of dots.
# na.rm = TRUE skips missing values of x when counting; without it a single
# NA would make nmax NA and seq_len() below would fail.
nmax <- max(stringr::str_count(df$x, "\\."), na.rm = TRUE) + 1
# fill = "right" pads rows that have fewer pieces with NA on the right.
tidyr::separate(
  df,
  x,
  into = paste0("col", seq_len(nmax)),
  sep = "\\.",
  fill = "right"
)

#  col1 col2 col3
#1    a <NA> <NA>
#2    a    b <NA>
#3    a    b    c
#4    a    b    d
#5    a    d <NA>

tidyr: Separate a column into a variable number of columns

You can first get the data into long format with separate_rows, then separate each entry into different columns, create a row-number column within each original row, and finally get the data back into wide format.

library(dplyr)
library(tidyr)

# Long format: one row per "question:time" entry, tagged with the id of the
# row it came from.
long_data <- data %>%
  mutate(id = row_number()) %>%
  separate_rows(variables, sep = ",") %>%
  separate(variables, into = c("question", "time"), sep = ":")

# Overwrite time with the position of each question within its original row.
ranked <- long_data %>%
  group_by(id) %>%
  mutate(time = row_number()) %>%
  ungroup()

# Wide format: one pos_<question> column per question; the helper id is dropped.
ranked %>%
  pivot_wider(
    names_from = question,
    values_from = time,
    names_prefix = "pos_"
  ) %>%
  select(-id)

# A tibble: 3 x 5
#  pos_q1 pos_q2 pos_q3 pos_q4 pos_q5
#   <int>  <int>  <int>  <int>  <int>
#1      1      2      3      4      5
#2      2      1      3      5      4
#3      1      2     NA     NA      3

How to generate a given number of columns in r for separate function?

You almost got it. Try this:

# The widest value has (number of ";" delimiters) + 1 pieces; NAs are skipped.
delim_counts <- str_count(Applicant_data$Assignee_DWPI, ";")
mycols <- max(delim_counts, na.rm = TRUE) + 1

# Name the new columns "1", "2", ..., one per piece.
separate(
  Applicant_data,
  Assignee_DWPI,
  into = as.character(seq_len(mycols)),
  sep = " ; "
)

If you have no more than 26 columns, you can also use letters as the column names:

# `letters` is a built-in character vector ("a".."z"), not a function, so it
# is indexed with `[` to take the first mycols names.
separate(Applicant_data, Assignee_DWPI, letters[seq_len(mycols)], sep = " ; ")

separate column with unknown name

You could use strsplit().

# Take the packed column by position with `[[`, which returns a plain vector
# for both data frames and tibbles. `df[, -1]` stays a one-column tibble when
# tibble is loaded, and gsub() would then see a list, not a character vector.
packed <- gsub("\\*", "", df[[2]])
# Each value starts with a space, so splitting on " " yields an empty first
# piece; drop that column. drop = FALSE keeps a matrix even for a single row.
parts <- do.call(rbind, strsplit(packed, " "))[, -1, drop = FALSE]
df1 <- data.frame(df[[1]], parts)
# Convert every column to numeric (via character, in case of factors).
df1[] <- lapply(df1, function(x) as.numeric(as.character(x)))
# The original column names already list the fields, separated by dots.
names(df1) <- unlist(strsplit(names(df), split = ".", fixed = TRUE))

> df1
   header   ST adk fumC gyrB icd mdh purA recA
1       1   10  10   11    4   8   8    8    2
2       2   48   6   11    4   8   8    8    2
3       3   58   6    4    4  16  24    8   14
4       4   88   6    4   12   1  20   12    7
5       5  117  20   45   41  43   5   32    2
6       6 7036 526    7    1   1   8   71    6
7       7  101  43   41   15  18  11    7    6
8       8 3595 112   11    5  12   8   88   86
9       9  117  20   45   41  43   5   32    2
10     10  744  10   11  135   8   8    8    2

Data

# Example input: an id column plus one packed column whose dot-separated name
# lists the fields stored, space-separated, in each value.
df <- structure(
  list(
    header = 1:10,
    ST.adk.fumC.gyrB.icd.mdh.purA.recA = c(
      " 10 10 11 4 8 8 8 2",
      " 48 6 11 4 8 8 8 2",
      " 58 6 4 4 16 24 8 14",
      " 88* 6* 4 12 1 20 12 7",
      " 117 20 45 41 43 5 32 2",
      " 7036 526 7 1 1 8 71 6",
      " 101 43 41 15 18 11 7 6",
      " 3595 112 11 5 12 8 88 86",
      " 117 20 45 41 43 5 32 2",
      " 744 10 11 135 8 8 8 2"
    )
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)

Using separate to split uneven number of variables in a column

With separate from the tidyr package:

library(tidyr)
# Target column names, alternating country.N / player.count.N for N = 1..3.
new_names <- sprintf('%s.%s', rep(c('country','player.count'),3), rep(1:3, each=2))

# Split country_data into those columns using separate()'s default separator.
country_info %>%
  separate(country_data, into = new_names)

the result:

  country.1 player.count.1 country.2 player.count.2 country.3 player.count.3
1    France              4   Morroco              8     Italy              2
2  Scotland              6    Mexico              2      <NA>           <NA>
3  Scotland              2      <NA>           <NA>      <NA>           <NA>

By default, separate splits on any run of non-alphanumeric characters, so it automatically recognizes : and | as characters on which it has to separate. If you want to separate on a specific character, you need to specify that with the sep argument. In this case you could use sep = '[:|]'. This also prevents misbehavior of the automatic detection when there are missing values (see discussion in the comments).

With sprintf you paste together the two vectors rep(c('country','player.count'),3) and rep(1:3, each=2) into a vector of column names, where %s.%s tells sprintf to treat the two vectors as string vectors and paste them together with a dot as separator. See ?sprintf for more info. The each argument tells rep not to repeat the whole vector a number of times, but to repeat each element of the vector a number of times.

tidyr:: gather multiple columns different types

I assume your expected output is incomplete as I don't see any entries for ID = 2 and ID = 3.

You could do the following

# Stack every column except ID into key/value pairs (k, v), split the key on
# "_" into three parts, discard the first part, then spread the third part
# (ss) back out into columns holding v.
df %>%
  gather(key = k, value = v, -ID) %>%
  separate(col = k, into = c("tmp", "X_num", "ss"), sep = "_") %>%
  select(-tmp) %>%
  spread(key = ss, value = v)
#  ID X_num abc xyz
#1  1     1   1   1
#2  1     2   2   2
#3  1     3   2   1
#4  2     1   1   2
#5  2     2   1   0
#6  2     3   1  NA
#7  3     1   1   2
#8  3     2   1   1
#9  3     3  NA   0

Extracting many variables from a single column in R

Try with this:

library(dplyr) # must be version >= 1.0.0
library(stringr)

# Extra arguments are passed inside explicit functions rather than through
# across()'s `...`, which is deprecated since dplyr 1.1.0.
Original %>%
  # 1. Remove the list punctuation [ ] ' from every column.
  mutate(across(
    everything(),
    function(col) str_remove_all(col, pattern = "\\[|\\]|\\'")
  )) %>%
  # 2. Split every column on commas, giving list-columns.
  mutate(across(
    everything(),
    function(col) str_split(col, pattern = ",")
  )) %>%
  # 3. Unnest the list-columns.
  tidyr::unnest(everything()) %>%
  # 4. Trim the whitespace left around each piece.
  mutate(across(everything(), str_trim)) %>%
  # 5. Convert the columns that hold numbers.
  mutate(across(c(CustNum, Amounts, Number), as.numeric))

# A tibble: 8 x 5
  CustNum Sales Amounts Number Identifier
    <dbl> <chr>   <dbl>  <dbl> <chr>     
1       0 1000       10      1 A         
2       0 345         2      2 A         
3       0 Zero        0      3 A         
4       0 56         98      4 A         
5       1 987        57      4 B         
6       1 879        25      3 B         
7       1 325        52      2 B         
8       1 4568       75      1 B

Basically:

Remove [ ] '
Split by ,
Unnest the lists
Trim out unnecessary spaces
Set to numeric where necessary

How to Use Tidyr::Separate When the Number of Needed Variables Is Unknown