refer to range of columns by name in R
A column number can be identified from a column name within a data frame as follows:
which(colnames(mydf)=="a")
where mydf is a data frame and a is the name of the column the column number is required for.
(Source)
This can be used to create a column range:
firstcol = which(colnames(x)=="a")
lastcol = which(colnames(x)=="b")
mydf[c(firstcol:lastcol)]
Select a consecutive range of data.frame columns using names of beginning and end columns
Use c()
function, then it works when using column names
> A <- data.frame(ID1=c(1,1),ID2=c(5,5),ID3=c(01901,01902))
> A
# ID1 ID2 ID3
# 1 1 5 1901
# 2 1 5 1902
> b <- A[,c(2:3)]
> b
# ID2 ID3
# 1 5 1901
# 2 5 1902
> b1 <- A[,c("ID2","ID3")]
> b1
# ID2 ID3
# 1 5 1901
# 2 5 1902
> b2 <- A[,2:3]
> b2
# ID2 ID3
# 1 5 1901
# 2 5 1902
How to select range of columns in a dataframe based on their name and not their indexes?
It looks like you can accomplish this with a subset
:
> df <- data.frame(c1=1:6, c2=2:7, c3=3:8, c4=4:9, c5=5:10, c6=6:11)
> rownames(df) <- c('r1', 'r2', 'r3', 'r4', 'r5', 'r6')
> subset(df, select=c1:c4)
c1 c2 c3 c4
r1 1 2 3 4
r2 2 3 4 5
r3 3 4 5 6
r4 4 5 6 7
r5 5 6 7 8
r6 6 7 8 9
> subset(df, select=c1:c2)
c1 c2
r1 1 2
r2 2 3
r3 3 4
r4 4 5
r5 5 6
r6 6 7
If you want to subset by row name range, this hack would do:
> gRI <- function(df, rName) {which(match(rNames, rName) == 1)}
> df[gRI(df,"r2"):gRI(df,"r4"),]
c1 c2 c3 c4 c5 c6
r2 2 3 4 5 6 7
r3 3 4 5 6 7 8
r4 4 5 6 7 8 9
Print a selected range of column names
Short answer
When using data.table
objects, it seems you cannot implicitely refer to columns if you do not use comma inside []
.
library(data.table)
# with data.frame
colnames(iris[3:ncol(iris)])
#> [1] "Petal.Length" "Petal.Width" "Species"
colnames(iris[,3:ncol(iris)])
#> [1] "Petal.Length" "Petal.Width" "Species"
# with data.table
t <- as.data.table(iris)
colnames(t[3:ncol(t)]) # here you get the colnames of the rows 3 to 5
#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
colnames(t[,3:ncol(t)])
#> [1] "Petal.Length" "Petal.Width" "Species"
For your t
object
I do not know how you have built the t
object. I had to remove .internal.selfref = < pointer:0x0000022fd86f1ef0 >,
from the output of dput(head(t))
, to get a R object of classes data.table
and data.frame
. This object works fine with print(colnames(t[,3:ncol(t)]))
(note the comma). I do not use data.table
so I guess you need to use the syntax with the comma if you want to refer to columns of an object of class data.table
.
library(data.table)
t <- structure(
list(
year = c(1949L, 1949L, 1949L, 1949L, 1949L, 1949L),
month = c(1L, 1L, 1L, 1L, 1L, 1L),
day = 1:6,
`Tmoy_AGEN-LA GARENNE` = c(6.9, 5.9, 2.9, 2, 7.9, 7.2),
Tmoy_AUXERRE = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_BALE-MULHOUSE` = c(2.8, 1.3, 2.6, 1.6, 2.5, -0.7),
Tmoy_BESANCON = c(7.6, 2, 3.6, 3.1, 4, 3.3),
`Tmoy_BIARRITZ-PAYS-BASQUE` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_BORDEAUX-MERIGNAC` = c(9.4, 5.5, 6.2, 5.5, 8.9, 3.5),
Tmoy_BOURGES = c(7.6, 4.2, 3.7, 3.1, 6.6, 6.2),
`Tmoy_BREST-GUIPAVAS` = c(5.5, 5, 5.5, 7.8, 8.3, 8.9),
`Tmoy_CHAMBERY-AIX` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_CLERMONT-FD` = c(6.3, 3.8, 5.4, 0.9, 5.5, 5.3),
Tmoy_COGNAC = c(8.6, 4.8, 5.4, 5.3, 8.3, 5.6),
`Tmoy_DIJON-LONGVIC` = c(5.7, 3.8, 3.9, 0.3, 4.5, 4.6),
Tmoy_DINARD = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_GRENOBLE-ST GEOIRS` = c(5.2, 5.3, -0.2, -0.9, 3.9, 1.2),
`Tmoy_LILLE-LESQUIN` = c(5.4, 3.7, 2.8, 3.8, 6.1, 5.5),
Tmoy_LUXEUIL = c(5.9, 1.6, 3.4, -0.6, 3, -1.5),
`Tmoy_LYON-BRON` = c(6.6, 5, 4.2, 1, 5.7, 4.1),
Tmoy_MARIGNANE = c(7.2, 12.3, 5.6, 3.2, 7, 6.3),
`Tmoy_METZ-FRESCATY` = c(4.6, 3.9, 2.6, 1, 4, 4.3),
Tmoy_MONTELIMAR = c(4.9, 9.8, 2.4, 4.6, 8.2, 7.6),
`Tmoy_NANTES-BOUGUENAIS` = c(8.5, 4.9, 4.6, 6.2, 8.2, 5.6),
Tmoy_NICE = c(7.6, 10.5, 7.2, 8.7, 7.6, 11.4),
`Tmoy_NIMES-COURBESSAC` = c(6.2, 10.7, 4.9, 5, 8.9, 10.1),
`Tmoy_PARIS-MONTSOURIS` = c(7.4, 4.4, 3.8, 3.6, 8.1, 6.4),
`Tmoy_PAU-UZEIN` = c(8.3, 6.1, 4.7, 4.8, 9.4, 7.3),
Tmoy_PERPIGNAN = c(8.1, 9.4, 6.5, 7.9, 9.6, 10.6),
`Tmoy_REIMS-PRUNAY` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_ROUEN-BOOS` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_ST ETIENNE-BOUTHEON` = c(7, 4.4, 3.2, 0.9, 7.3, 5.3),
`Tmoy_TOULOUSE-BLAGNAC` = c(5.8, 6, 2.5, 3.1, 7.7, 7.7),
Tmoy_TOURS = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)
),
row.names = c(NA, -6L),
class = c("data.table", "data.frame"),
sorted = c("year", "month", "day")
)
Output:
> colnames(t[,3:ncol(t)]) # note the addition of the comma
[1] "day" "Tmoy_AGEN-LA GARENNE" "Tmoy_AUXERRE"
[4] "Tmoy_BALE-MULHOUSE" "Tmoy_BESANCON" "Tmoy_BIARRITZ-PAYS-BASQUE"
[7] "Tmoy_BORDEAUX-MERIGNAC" "Tmoy_BOURGES" "Tmoy_BREST-GUIPAVAS"
[10] "Tmoy_CHAMBERY-AIX" "Tmoy_CLERMONT-FD" "Tmoy_COGNAC"
[13] "Tmoy_DIJON-LONGVIC" "Tmoy_DINARD" "Tmoy_GRENOBLE-ST GEOIRS"
[16] "Tmoy_LILLE-LESQUIN" "Tmoy_LUXEUIL" "Tmoy_LYON-BRON"
[19] "Tmoy_MARIGNANE" "Tmoy_METZ-FRESCATY" "Tmoy_MONTELIMAR"
[22] "Tmoy_NANTES-BOUGUENAIS" "Tmoy_NICE" "Tmoy_NIMES-COURBESSAC"
[25] "Tmoy_PARIS-MONTSOURIS" "Tmoy_PAU-UZEIN" "Tmoy_PERPIGNAN"
[28] "Tmoy_REIMS-PRUNAY" "Tmoy_ROUEN-BOOS" "Tmoy_ST ETIENNE-BOUTHEON"
[31] "Tmoy_TOULOUSE-BLAGNAC" "Tmoy_TOURS"
Referencing a range of columns in dplyr
I think the most dplyr
-esque way would be:
df %>%
filter(rowSums(select_(., 'A2:B1')) > 10)
Which gives:
# txt A1 A2 B1 B2
#1 ala 6 9 12 23
R - refer to column names rather than column index when using lapply with data frame
You can use sapply()
as follows. The problem in this example is that you cannot set ranges of columns by name easily.
cols <- c("A", "B", "D", "F", "G", "H")
df[,cols] <- sapply(df[,cols], \(x) (5:1)[x])
The easiest way to select by a range of columns is to use eval_select()
to return their positions by number. But if you do this, you might as well just use the dplyr
solution. This is essentially an under the hood look at it.
library(tidyselect)
col_pos <- eval_select(expr(c(A:B, D, F:H)), df)
df[,col_pos] <- sapply(df[,col_pos], \(x) (5:1)[x])
How to use a range for columns instead of names for pmax / pmin
Here's an option that does one function-call on all rows, all columns at once.
foo %>%
mutate(maxcol = do.call(pmax, subset(., select = a:e)))
# a b c d e f g h i j k l m n o p q r s t u v w x y z maxcol
# 1 1 4 9 2 4 4 1 10 2 3 10 4 7 1 10 9 8 2 8 9 5 1 9 1 10 9 9
# 2 5 2 5 3 5 2 8 8 5 8 2 3 6 10 9 3 5 8 7 4 6 9 8 5 8 3 5
# 3 10 9 6 1 7 10 6 4 4 7 6 6 2 7 5 5 4 1 10 7 3 10 5 10 1 7 10
# 4 8 1 4 8 9 3 3 9 10 1 8 5 8 4 4 8 6 10 5 2 9 5 7 7 3 1 9
# 5 2 10 2 9 8 9 9 6 7 5 9 2 5 5 7 4 2 5 4 8 4 6 6 2 9 6 10
You can select some or all of the columns using the colon notation, even arbitrary columns:
foo %>%
mutate(maxcol = do.call(pmax, subset(., select = c(a:e,g))))
# a b c d e f g h i j k l m n o p q r s t u v w x y z maxcol
# 1 1 4 9 2 4 4 1 10 2 3 10 4 7 1 10 9 8 2 8 9 5 1 9 1 10 9 9
# 2 5 2 5 3 5 2 8 8 5 8 2 3 6 10 9 3 5 8 7 4 6 9 8 5 8 3 8
# 3 10 9 6 1 7 10 6 4 4 7 6 6 2 7 5 5 4 1 10 7 3 10 5 10 1 7 10
# 4 8 1 4 8 9 3 3 9 10 1 8 5 8 4 4 8 6 10 5 2 9 5 7 7 3 1 9
# 5 2 10 2 9 8 9 9 6 7 5 9 2 5 5 7 4 2 5 4 8 4 6 6 2 9 6 10
The reason this should be preferred over the other answers (which are generally using allegedly idiomatic methods) is because:
- in Dom's answer, the
max
function is called once for each row of the frame; R's vectorized ops are not being used, this is inefficient and should be avoided if possible; - in akrun's answer,
pmax
is being called once for each column of the frame, which in this case might sound worse but actually closer to the best one can do. My answer is closest to akrun's in that we areselect
ing data within themutate
.
If you'd prefer to use dplyr::select
over base::subset
, it needs to be broken out as
foo %>%
mutate(maxcol = select(., a:e, g) %>% do.call(pmax, .))
I think this is demonstrated a little better with benchmarks. Using the provided 5x26 frame, we see a clear improvement:
set.seed(42)
foo <- data.frame(sapply(letters, function(x) x = sample(1:10,5)))
microbenchmark::microbenchmark(
Dom = {
foo %>%
rowwise() %>%
summarise(max= max(c_across(a:z)))
},
akr = {
foo %>%
mutate(maxcol = reduce(select(., a:z), pmax))
},
r2 = {
foo %>%
mutate(maxcol = do.call(pmax, subset(., select = a:z)))
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# Dom 6.6561 7.15260 7.61574 7.38345 7.90375 11.0387 100
# akr 4.2849 4.69920 4.96278 4.86110 5.18130 7.0908 100
# r2 2.3290 2.49285 2.68671 2.59180 2.78960 4.7086 100
Let's try with a slightly larger 5000x26:
set.seed(42)
foo <- data.frame(sapply(letters, function(x) x = sample(1:10,5000,replace=TRUE)))
microbenchmark::microbenchmark(
Dom = {
foo %>%
rowwise() %>%
summarise(max= max(c_across(a:z)))
},
akr = {
foo %>%
mutate(maxcol = reduce(select(., a:z), pmax))
},
r2 = {
foo %>%
mutate(maxcol = do.call(pmax, subset(., select = a:z)))
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# Dom 515.6437 563.6060 763.97348 811.45815 883.00115 1775.2366 100
# akr 4.6660 5.1619 11.92847 5.74050 6.50625 293.7444 100
# r2 2.9253 3.4371 4.24548 3.71845 4.27380 14.0958 100
This last one definitely shows a consequence of using rowwise
. The relative performance between akrun's answer and this one is almost identical to 5 rows, reinforcing the premise that column-wise is better than row-wise (and all-at-once is faster than both).
(This can also be done with purrr::invoke
, if truly desired, though it does not speed it up:
library(purrr)
foo %>%
mutate(maxcol = invoke(pmax, subset(., select = a:z)))
### microbenchmark(...)
# Unit: milliseconds
# expr min lq mean median uq max neval
# Dom 7.8292 8.40275 9.02813 8.97345 9.38500 12.4368 100
# akr 4.9622 5.28855 8.78909 5.60090 6.11790 309.2607 100
# r2base 2.5521 2.74635 3.01949 2.90415 3.21060 4.6512 100
# r2purrr 2.5063 2.77510 3.11206 2.93415 3.33015 5.2403 100
use column-name range in data.table like in dplyrs select
What I asked for is possible in the development-version of data.table (1.9.5) as can be seen in the new feature number 17. quoting:
- .SDcols and with=FALSE understand colA:colB form now. That is, DT[, lapply(.SD, sum), by=V1, .SDcols=V4:V6] and DT[, V5:V7,
with=FALSE] works as intended. This is quite useful for interactive
use. Closes #748.
the installation of the development-version is explained here
Thanks for pointing this out @AnandaMahto and @Arun!
also using select_vars(names(dt), w:y)
from dplyr is probably better than names(select(dt, w:y))
as pointed out by @shadow, because it gives more options to select the names, is more readable and conveys the intent more clearly.
Select multiple ranges of columns using column names in data.table
We can use the range part in .SDcols
and then append the other column by concatenating
dt[, c(list(a= a), .SD) , .SDcols = c:d]
If there are multiple ranges, we create a sequence of ranges by match
, and then get the corresponding column names
i1 <- match(c("c", "f"), names(dt))
j1 <- match(c("d", "h"), names(dt))
nm1 <- c("a", names(dt)[unlist(Map(`:`, i1, j1))], "j")
dt[, ..nm1]
# a c d f g h j
#1: 1 11 16 26 31 36 46
#2: 2 12 17 27 32 37 47
#3: 3 13 18 28 33 38 48
#4: 4 14 19 29 34 39 49
#5: 5 15 20 30 35 40 50
Also, the dplyr
methods can be used within the data.table
dt[, select(.SD, a, c:d, f:h, j)]
# a c d f g h j
#1: 1 11 16 26 31 36 46
#2: 2 12 17 27 32 37 47
#3: 3 13 18 28 33 38 48
#4: 4 14 19 29 34 39 49
#5: 5 15 20 30 35 40 50
Related Topics
How to Convert List of List into a Tibble (Dataframe)
Find the Nearest X,Y Coordinate Using R
As.Posixct Gives an Unexpected Timezone
R - Data Frame - Convert to Sparse Matrix
Arrow() in Ggplot2 No Longer Supported
Font Awesome in R, Loaded But Not Found by Waffle
Tiny Plot Output from Sankeynetwork (Networkd3) in Firefox
Fuzzyjoin Two Data Frames Using Data.Table
Maintaining an Input/Output Log in R
Assign Names to Vector Entries Without Assigning the Vector a Variable Name
Aggregating Values on a Data Tree with R
Why Doesn't "+" Operate on Characters in R
Remove Text Inside Brackets, Parens, And/Or Braces
Select Rows in a Dataframe in R Based on Values in One Row
Unexpected Behaviour with Argument Defaults
How to Plot X-Axis Labels and Bars Between Tick Marks in Ggplot2 Bar Plot
Adding Annotation (Segment/Arrow) in Only Certain Facet Ggplot