R: Pulling Data from One Column to Create New Columns

R: Pulling data from one column to create new columns

using dplyr and tidyr.

First we insert a "." in the sample code, next we separate sample into 3 columns.

library(dplyr)
library(tidyr)

df %>%
mutate(sample = paste0(substring(df$sample, 1, 1), ".", substring(df$sample, 2))) %>%
separate(sample, into = c("tissue", "plant", "stage"), remove = FALSE)

sample tissue plant stage
1 P.10.1 P 10 1
2 P.11.2 P 11 2
3 S.1.1 S 1 1
4 S.3.3 S 3 3

data:

df <- structure(list(sample = c("P10.1", "P11.2", "S1.1", "S3.3")), 
.Names = "sample",
class = "data.frame",
row.names = c(NA, -4L))

Creating new columns using data in one column and fill with data

In base R, we can use table

out <-table(dt$col, dt$col)

-output

out
a b c d e f
a 1 0 0 0 0 0
b 0 1 0 0 0 0
c 0 0 1 0 0 0
d 0 0 0 1 0 0
e 0 0 0 0 1 0
f 0 0 0 0 0 1

Or use diag

 `dimnames<-`(diag(nrow(dt)), list(dt$col, dt$col))

How to create new columns based on values and names of existing columns in R?

I believe this is the result you're looking for; this solution uses tidyverse functions.

library(tidyverse)

# create empty tibble to store results
df.out <- tibble()

# loop over years
for (i in unique(df$Year)) {

this.year <- df %>%
# grab number of rows for this year
filter(Year == i) %>%
# only grab the weather columns for this specific year
select(Experiment, Site, Year, contains(paste0("_", i, "_"))) %>%
# this function uses a regex to rename columns with "_" and the current year by removing the part of the name with "_" then 4 numbers
rename_with(function(x) {str_replace(x, "\\_[0-9][0-9][0-9][0-9]", "")}, contains(paste0("_", i, "_")))

# add this year to your output tibble
df.out <- df.out %>%
bind_rows(this.year)

}

final data frame

R - Create new column based on substring from another column with conditions

There is probably a more efficient way to do this, but we could do a series of ifelse statements using case_when from tidyverse. First, I remove any rows that just end with ;s__. Then, in the series of statements, I check to if a given taxonomic level is present, then if so, then return that in the desired format. Then, that is repeated across all taxonomic levels.

library(tidyverse)

output <- input_data %>%
mutate(taxon = trimws(taxon, whitespace = ";s__")) %>%
mutate(taxon_main = case_when(str_detect(taxon, "s__") ~ trimws(str_replace_all(str_extract(taxon, "(?<=g__).*"), ";s_", ""), whitespace = '_'),
!str_detect(taxon, "s__") & str_detect(taxon, "g__")~ str_replace_all(str_extract(taxon, "g__.*"), "__", "_"),
!str_detect(taxon, "g__") & str_detect(taxon, "f__") ~ str_replace_all(str_extract(taxon, "f__.*"), "__", "_"),
!str_detect(taxon, "f__") & str_detect(taxon, "o__")~ str_replace_all(str_extract(taxon, "o__.*"), "__", "_"),
!str_detect(taxon, "o__") & str_detect(taxon, "c__")~ str_replace_all(str_extract(taxon, "c__.*"), "__", "_"),
!str_detect(taxon, "c__") & str_detect(taxon, "p__")~ str_replace_all(str_extract(taxon, "p__.*"), "__", "_"),
!str_detect(taxon, "p__") & str_detect(taxon, "k__")~ str_replace_all(str_extract(taxon, "k__.*"), "__", "_"),
TRUE ~ NA_character_))

Output

output %>% select(taxon_main)

taxon_main
1 Lactobacillus_crispatus
2 g_Anaerococcus
3 f_Comamonadaceae
4 f_Lachnospiraceae
5 Bosea_massiliensis
6 Acinetobacter_baumannii
7 f_Methylophilaceae

Or you could also use separate first, which will make the code less reliant on using a lot of stringr. We can clean up before using separate, such as only having one underscore and remove extra s__. Then, we can go through the ifelse statements, and then we can bind back to the original taxon column and drop all the other columns, except for taxon_main.

input_data %>%
mutate(taxon = trimws(taxon, whitespace = ";s__"),
taxon = str_replace_all(taxon, ";s__", ";"),
taxon = str_replace_all(taxon, "__", "_")) %>%
separate(taxon, sep = ";", into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) %>%
mutate(taxon_main = case_when(!is.na(Species) ~ paste(str_extract(Genus, "(?<=g_).*"), Species, sep = "_"),
is.na(Species) & !is.na(Genus) ~ Genus,
is.na(Genus) & !is.na(Family) ~ Family,
is.na(Family) & !is.na(Order) ~ Order,
is.na(Order) & !is.na(Class) ~ Class,
is.na(Class) & !is.na(Phylum) ~ Phylum,
is.na(Phylum) & !is.na(Kingdom) ~ Kingdom
)) %>%
bind_cols(input_data,.) %>%
select(taxon_main, taxon)

Output

               taxon_main                                                                                                                     taxon
1 Lactobacillus_crispatus k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__crispatus
2 g_Anaerococcus k__Bacteria;p__Firmicutes;c__Tissierellia;o__Tissierellales;f__Peptoniphilaceae;g__Anaerococcus;s__
3 f_Comamonadaceae k__Bacteria;p__Proteobacteria;c__Betap__Proteobacteria;o__Burkholderiales;f__Comamonadaceae
4 f_Lachnospiraceae k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae
5 Bosea_massiliensis k__Bacteria;p__Proteobacteria;c__Alphap__Proteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Bosea;s__massiliensis
6 Acinetobacter_baumannii k__Bacteria;p__Proteobacteria;c__Gammap__Proteobacteria;o__Pseudomonadales;f__Moraxellaceae;g__Acinetobacter;s__baumannii
7 f_Methylophilaceae k__Bacteria;p__Proteobacteria;c__Betap__Proteobacteria;o__Nitrosomonadales;f__Methylophilaceae

R Creating new columns based on factors in another column

Try this

df %>% group_by(category) %>% mutate(id = row_number()) %>% ungroup() %>% pivot_wider(id_cols = id, names_from = category, values_from = cond) %>% select(-id) 

Taking variable names out of column and creating new columns in R

This can be accomplished a few ways. Might be a good opportunity to get to know the tidyverse:

library(tidyverse)
new.df <- spread(old.df, response, response)

This is an unusual use of tidyr::spread(). In this case, it constructs new column names from the values in "response", and also fills those columns with the values in "response". The fill argument can be used to change what goes in the resulting blank cells.

How to get R to create new column (named from left part of string in old column), and then put right part of string from old column into new column

Starting with

quux <- structure(list(oldColumn1 = c("COLOR: RED", "COLOR: RED", "COLOR: BLUE", "COLOR: GREEN", "COLOR: BLUE")), class = "data.frame", row.names = c(NA, -5L))

The naive approach would be

data.frame(COLOR = trimws(sub("COLOR:", "", quux$oldColumn1)))
# COLOR
# 1 RED
# 2 RED
# 3 BLUE
# 4 GREEN
# 5 BLUE

But I'm assuming you have a more generic need. Let's assume that you have some more things to parse out of that, such as

quux <- structure(list(oldColumn1 = c("COLOR: RED", "COLOR: RED", "COLOR: BLUE", "COLOR: GREEN", "COLOR: BLUE", "SIZE: 1", "SIZE: 3", "SIZE: 5")), class = "data.frame", row.names = c(NA, -8L))
quux
# oldColumn1
# 1 COLOR: RED
# 2 COLOR: RED
# 3 COLOR: BLUE
# 4 COLOR: GREEN
# 5 COLOR: BLUE
# 6 SIZE: 1
# 7 SIZE: 3
# 8 SIZE: 5

then we can generalize it with

tmp <- strcapture("(.*)\\s*:\\s*(.*)", quux$oldColumn1, list(k="", v=""))
tmp$ign <- ave(rep(1L, nrow(tmp)), tmp$k, FUN = seq_along)
reshape2::dcast(tmp, ign ~ k, value.var = "v")[,-1,drop=FALSE]
# COLOR SIZE
# 1 RED 1
# 2 RED 3
# 3 BLUE 5
# 4 GREEN <NA>
# 5 BLUE <NA>

--

Edit: alternative with updated data:

do.call(cbind, lapply(dat, function(X) {
nm <- sub(":.*", "", X[1])
out <- data.frame(trimws(sub(".*:", "", X)))
names(out) <- nm
out
}))
# COLOR SIZE DESIGNSTYLE
# 1 RED LARGE STYLED
# 2 RED MEDIUM ORIGINAL MAKER
# 3 BLUE XLARGE COUTURE
# 4 GREEN MEDIUM COUTURE
# 5 BLUE SMALL STYLED

create new column based on existing pattern R

We may use regex_left_join

library(data.table)
library(fuzzyjoin)
regex_left_join(tableRules, data.table(DIMENSION = listDimPoss),
by = c("object_name" = "DIMENSION"))
object_name DIMENSION
1 instr_asset_row instr_asset
2 functional_cat functional_cat
3 ref_sector_second ref_sector


Related Topics



Leave a reply



Submit