concatenate values across columns in data.table, row by row
You can use do.call(), using .SDcols to supply the columns.
x[, key_ := do.call(paste, c(.SD, sep = "_")), .SDcols = names(x)]
.SDcols = names(x) supplies all the columns of x. You can supply any vector of names or column numbers there.
concatenate names and values across columns in data.table, row by row
We could paste
the corresponding names with Map
d[,
.(x, y, labs = do.call(paste, c(Map(function(u, v)
paste0(v, ": ", u), .SD, labs_to_get), sep = ", "))),
.SDcols = labs_to_get
]
-output
x y labs
<num> <num> <char>
1: 1 3 x: 1, y: 3
2: 2 4 x: 2, y: 4
or another option is write.dcf
d[, labs := do.call(paste,
c(as.list(setdiff(capture.output(write.dcf(.SD)), "")),
sep = ", ")), 1:nrow(d)]
> d
x y labs
<num> <num> <char>
1: 1 3 x: 1, y: 3
2: 2 4 x: 2, y: 4
Or use apply
to loop over the rows
d[, labs := apply(.SD, 1, \(x) paste(names(x), x, sep = ": ",
collapse = ", ")), .SDcols = labs_to_get]
Or using tidyverse
library(dplyr)
library(purrr)
library(stringr)
# Build one "name: value" string per selected column, then join those strings
# row by row with ", ". base::do.call() is used in place of purrr::invoke(),
# which is deprecated since purrr 1.0.0; the argument list is unchanged.
d %>%
  mutate(labs = do.call(str_c, c(across(all_of(labs_to_get),
    ~ str_c(cur_column(), ": ", .x)), sep = ", ")))
x y labs
<num> <num> <char>
1: 1 3 x: 1, y: 3
2: 2 4 x: 2, y: 4
Row-wise sort then concatenate across specific columns of data frame
My first thought would've been to do this:
dt[, new_var := paste(sort(.SD), collapse = ", "), by = 1:nrow(dt)]
But you could make your function work with a couple of simple modifications:
# Collect every value passed through `...` into one vector, put it in
# ascending order (order() keeps NAs, placing them last) and join the
# elements into a single ", "-separated string.
f <- function(...) {
  vals <- c(...)
  ranked <- order(vals)
  paste(vals[ranked], collapse = ", ")
}
dt[, new_var := do.call(function(...) mapply(f, ...), .SD)]
How can I create this column in my datatable with the concatenated two values, separated with a dash?
Populate your table first, then add the column with its Expression
property set to automatically populate it from the other two:
Dim table As New DataTable
'...
Using reader = command.ExecuteReader()
table.Load(reader)
End Using
'...
table.Columns.Add("RoleDescriptionAndFirstName",
GetType(String),
"RoleDescription + ' - ' + FirstName")
Done! The column itself will perform the concatenation and any changes to the data in the other columns will be automatically propagated to the new column too.
Concatenate row-wise across specific columns of dataframe
Try
data$id <- paste(data$F, data$E, data$D, data$C, sep="_")
instead. The beauty of vectorized code is that you do not need row-by-row loops, or loop-equivalent *apply functions.
Edit Even better is
data <- within(data, id <- paste(F, E, D, C, sep=""))
How to combine two or more columns of data table to one column?
You may use tidyr::unite
-
dt.test <- tidyr::unite(dt.test, date, year:hour, sep = '-')
dt.test
# date id type
#1: 2018-01-01-00 8750 ist
#2: 2018-01-02-01 3048 plan
#3: 2018-01-03-02 3593 ist
#4: 2018-01-04-03 8475 plan
Fast concatenation of data.table columns into one string column
C to the rescue!
Stealing some code from data.table we can write a C function that works way faster (and could be parallelized to be even faster).
First make sure you have a working C++ toolchain with:
library(inline)
fx <- inline::cfunction( signature(x = "integer", y = "numeric" ) , '
return ScalarReal( INTEGER(x)[0] * REAL(y)[0] ) ;
' )
fx( 2L, 5 ) #Should return 10
Then this should work (assuming integer-only data, but the code can be extended to other types):
library(inline)
library(data.table)
library(stringi)
header <- "
//Taken from https://github.com/Rdatatable/data.table/blob/master/src/fwrite.c
static inline void reverse(char *upp, char *low)
{
upp--;
while (upp>low) {
char tmp = *upp;
*upp = *low;
*low = tmp;
upp--;
low++;
}
}
void writeInt32(int *col, size_t row, char **pch)
{
char *ch = *pch;
int x = col[row];
if (x == INT_MIN) {
*ch++ = 'N';
*ch++ = 'A';
} else {
if (x<0) { *ch++ = '-'; x=-x; }
// Avoid log() for speed. Write backwards then reverse when we know how long.
char *low = ch;
do { *ch++ = '0'+x%10; x/=10; } while (x>0);
reverse(ch, low);
}
*pch = ch;
}
//end of copied code
"
# worker_fun(x, preallocated_target, columns, start_row, end_row)
#
# For every row in start_row..end_row (1-based, inclusive) writes the
# comma-separated decimal representation of the selected integer columns of
# `x` into the matching element of `preallocated_target`, and returns
# `preallocated_target`.
#
#   x                   list / data.table whose selected columns are integer
#   preallocated_target character vector that receives the result (modified
#                       in place)
#   columns             1-based integer indices of the columns to concatenate
#   start_row, end_row  integer scalars; nothing is written if end_row < start_row
#
# Invalid row ranges, column indices, column types or column lengths raise an
# R error instead of reading or writing out of bounds.
worker_fun <- inline::cfunction( signature(x = "list", preallocated_target = "character", columns = "integer", start_row = "integer", end_row = "integer"), includes = header , "
  const int first_row = INTEGER(start_row)[0];
  const int last_row = INTEGER(end_row)[0];
  const R_xlen_t num_columns = XLENGTH(columns);
  const int * _columns = INTEGER(columns);

  // NA_integer_ is INT_MIN, so this also rejects a missing start_row.
  if (first_row < 1) {
    Rf_error(\"start_row must be a positive integer\");
  }
  if (last_row > XLENGTH(preallocated_target)) {
    Rf_error(\"end_row exceeds the length of preallocated_target\");
  }

  // Validate every requested column once, before any output is produced.
  for (R_xlen_t c = 0; c < num_columns; ++c) {
    const int idx = _columns[c];
    if (idx < 1 || idx > XLENGTH(x)) {
      Rf_error(\"column index %d is out of range\", idx);
    }
    SEXP col = VECTOR_ELT(x, idx - 1);
    if (TYPEOF(col) != INTSXP) {
      Rf_error(\"column %d is not of type integer\", idx);
    }
    if (XLENGTH(col) < last_row) {
      Rf_error(\"column %d has fewer than end_row elements\", idx);
    }
  }

  // writeInt32 emits at most 11 characters per value (sign + 10 digits) and
  // each value is preceded by at most one separator, so 12 bytes per column
  // is always enough. R_alloc memory is reclaimed by R when the call returns.
  const size_t buffer_len = num_columns > 0 ? (size_t) num_columns * 12 : 1;
  char *buffer = R_alloc(buffer_len, sizeof(char));

  for (R_xlen_t i = (R_xlen_t) first_row - 1; i < last_row; ++i) {
    char *buf_pos = buffer;
    for (R_xlen_t c = 0; c < num_columns; ++c) {
      if (c > 0) {
        *buf_pos++ = ',';
      }
      writeInt32(INTEGER(VECTOR_ELT(x, _columns[c] - 1)), (size_t) i, &buf_pos);
    }
    SET_STRING_ELT(preallocated_target, i, Rf_mkCharLen(buffer, (int) (buf_pos - buffer)));
  }
  return preallocated_target;
" )
#Test with the same data
RowCount <- 5e6
DT <- data.table(x = "foo",
y = "bar",
a = sample.int(9, RowCount, TRUE),
b = sample.int(9, RowCount, TRUE),
c = sample.int(9, RowCount, TRUE),
d = sample.int(9, RowCount, TRUE),
e = sample.int(9, RowCount, TRUE),
f = sample.int(9, RowCount, TRUE))
## Generate an expression to paste an arbitrary list of columns together
ConcatCols <- list("a","b","c","d","e","f")
## Do it 3x as many times
ConcatCols <- c(ConcatCols,ConcatCols,ConcatCols)
ptm <- proc.time()
preallocated_target <- character(RowCount)
column_indices <- sapply(ConcatCols, FUN = function(x) { which(colnames(DT) == x )})
x <- worker_fun(DT, preallocated_target, column_indices, as.integer(1), as.integer(RowCount))
DT[, State := preallocated_target]
proc.time() - ptm
While your (integer only) example runs in about 20s on my PC, this runs in ~5s and can be easily parallelized.
Some things to note:
- The code is not production ready - a lot of sanity checks should be made on the function inputs (especially checking if all columns are the same length, checking column types, preallocated_target size etc.)
- The function puts its output into a preallocated character vector, this is non-standard and ugly (R usually does not have pass-by-reference semantics) but allows for parallelization (see below).
- The last two parameters are the start and end rows to be processed; once again, this is for parallelization
- The function accepts column indices not column names. All columns have to be of type integer.
- Except for the input data.table and preallocated_target the inputs have to be integers
- Compilation time for the function is not included (as you should compile it beforehand - maybe even make a package)
Parallelization
EDIT: The approach below would actually fail due to the way clusterExport
and R string storage work. Parallelization thus probably needs to be done in C as well, similarly to the way it is achieved in data.table.
Since you cannot pass inline-compiled functions across R processes, parallelization requires some more work. To be able to use the above function in parallel, you either need to compile it separately with the R compiler and use dyn.load
OR wrap it in a package OR use a forking backend for parallel (I don't have one, forking works only on UNIX).
Running in parallel would then look something like (not tested):
# Block-wise parallel driver for worker_fun: the rows of DT are split into
# consecutive blocks of `block_size` rows and each block is handed to a
# cluster worker.
library(parallel) # detectCores, makeCluster, clusterExport, parLapply, ...

no_cores <- detectCores()
# Initiate cluster
cl <- makeCluster(no_cores)
# Preallocate target and prepare params
num_elements <- length(DT[[1]])
preallocated_target <- character(num_elements)
block_size <- 4096L # No of rows processed at once. Adjust for best performance
# vapply() guarantees an integer vector and fails loudly if a name does not
# match exactly one column.
column_indices <- vapply(ConcatCols, FUN = function(x) { which(colnames(DT) == x) }, FUN.VALUE = integer(1))
num_blocks <- ceiling(num_elements / block_size)
clusterExport(cl,
              c("DT", "preallocated_target", "column_indices", "num_elements", "block_size"))
clusterEvalQ(cl, <CODE TO LOAD THE NATIVE FUNCTION HERE>)
parLapply(cl, seq_len(num_blocks),
          function(block_id)
          {
            # worker_fun treats end_row as inclusive, so block k covers rows
            # (k - 1) * block_size + 1 .. k * block_size; the last block is
            # clipped to the number of rows.
            first_row <- (block_id - 1L) * block_size + 1L
            last_row <- min(num_elements, block_id * block_size)
            # The native code reads the row bounds with INTEGER(), so they
            # must be passed as integers rather than doubles.
            throw_away <-
              worker_fun(DT, preallocated_target, column_indices,
                         as.integer(first_row), as.integer(last_row))
            return(NULL)
          })
stopCluster(cl)
data table string concatenation of SD columns for by group values
You can concatenate all columns using lapply.
dt[, lapply(.SD, paste0, collapse=" "), by = ID]
## ID a b
## 1: 1 a b c A B C
## 2: 2 d e f g D E F G
## 3: 3 h i j H I J
Using newline characters as the collapse argument instead of " "
does work, but does not print as you seem to expect in your desired output.
dt[, lapply(.SD, paste0, collapse="\n"), by = ID]
## ID a b
## 1: 1 a\nb\nc A\nB\nC
## 2: 2 d\ne\nf\ng D\nE\nF\nG
## 3: 3 h\ni\nj H\nI\nJ
As pointed out in the comments by @Frank, the question has been changed to have "," as a separator instead of "\n". Of course you can just change the collapse argument to ",". If you want to have a space as well (", "), then the solution by @DavidArenburg is preferable.
dt[, lapply(.SD, paste0, collapse=","), by = ID]
dt[, lapply(.SD, toString), by = ID]
R: Concatenating a string to every column in a row (excluding col 1, variable col length, the string is held in col 1 of each row)
Use lapply
to iterate over each column and paste V1
column to them.
df[-1] <- lapply(df[-1], function(x) stringr::str_c(df$V1, x))
df
# V1 V2 V3 V4 V5
#1 Name1 Name1String111 Name1String112 <NA> <NA>
#2 Name2 Name2String121 Name2String122 Name2String123 <NA>
#3 Name3 Name3String131 Name3String132 Name3String133 Name3String134
data
df <- structure(list(V1 = c("Name1", "Name2", "Name3"), V2 = c("String111",
"String121", "String131"), V3 = c("String112", "String122", "String132"
), V4 = c(NA, "String123", "String133"), V5 = c(NA, NA, "String134")),
class = "data.frame", row.names = c(NA, -3L))
Concatenate column names in data.table based on conditions
We can group by the rows, unlist the Subset of Data.table (.SD), subset the names of the dataset, paste the elements together and assign (:=) to 'newCol'
nm1 <- names(dt)[-4]
dt[, newCol := toString(nm1[unlist(.SD)]) ,by = 1:nrow(dt),.SDcols = nm1]
Or another option is melt
to 'long' format and then do a join
dt[melt(dt[, n := seq_len(.N)], id.var = c("n", "PASTE"))[,
toString(variable[value]), n], on = "n"]
Related Topics
How to Count Sequences of Ones in a Logical Vector
R- Plot Numbers Instead of Points
Rcurl: Url.Exists Returns False When Url Does Exist
Applying Over a Vector of Functions
Object Not Found Error with Ggplot2
Condition Filter in Dplyr Based on Shiny Input
How to Always Suppress Messages in R
How to Plot X-Axis Labels and Bars Between Tick Marks in Ggplot2 Bar Plot
Create Fillable PDF Textbox via R
How to Convert a Hex String to Text in R
Find the Nearest X,Y Coordinate Using R
Compute Only Diagonals of Matrix Multiplication in R
How to Insert Appendix After References in Rmd Using Rstudio
How to Convert List of List into a Tibble (Dataframe)
Finding the Index of First Changes in the Elements of a Vector