Refer to Range of Columns by Name in R

refer to range of columns by name in R

A column number can be identified from a column name within a data frame as follows:

which(colnames(mydf)=="a")

where mydf is a data frame and "a" is the name of the column whose position (column number) is required.

(Source)

This can be used to create a column range:

# Look up the positions of the first and last columns of the range by name.
# Both lookups must run against the same data frame that is subset below.
firstcol <- which(colnames(mydf) == "a")
lastcol <- which(colnames(mydf) == "b")

# Select every column from "a" through "b" (inclusive) by position.
mydf[firstcol:lastcol]

Select a consecutive range of data.frame columns using names of beginning and end columns

Use the c() function; the selection then also works with column names:

> A <- data.frame(ID1=c(1,1),ID2=c(5,5),ID3=c(01901,01902))
> A
# ID1 ID2 ID3
# 1 1 5 1901
# 2 1 5 1902

> b <- A[,c(2:3)]
> b
# ID2 ID3
# 1 5 1901
# 2 5 1902

> b1 <- A[,c("ID2","ID3")]
> b1
# ID2 ID3
# 1 5 1901
# 2 5 1902

> b2 <- A[,2:3]
> b2
# ID2 ID3
# 1 5 1901
# 2 5 1902

How to select range of columns in a dataframe based on their name and not their indexes?

It looks like you can accomplish this with a subset:

> df <- data.frame(c1=1:6, c2=2:7, c3=3:8, c4=4:9, c5=5:10, c6=6:11)
> rownames(df) <- c('r1', 'r2', 'r3', 'r4', 'r5', 'r6')
> subset(df, select=c1:c4)
c1 c2 c3 c4
r1 1 2 3 4
r2 2 3 4 5
r3 3 4 5 6
r4 4 5 6 7
r5 5 6 7 8
r6 6 7 8 9
> subset(df, select=c1:c2)
c1 c2
r1 1 2
r2 2 3
r3 3 4
r4 4 5
r5 5 6
r6 6 7

If you want to subset by row name range, this hack would do:

> # gRI returns the position of the row of df whose row name equals rName.
> gRI <- function(df, rName) {which(rownames(df) == rName)}
> df[gRI(df,"r2"):gRI(df,"r4"),]
c1 c2 c3 c4 c5 c6
r2 2 3 4 5 6 7
r3 3 4 5 6 7 8
r4 4 5 6 7 8 9

Print a selected range of column names

Short answer

When using data.table objects, it seems you cannot implicitly refer to columns unless you use a comma inside [].

library(data.table)
# with data.frame
colnames(iris[3:ncol(iris)])
#> [1] "Petal.Length" "Petal.Width" "Species"
colnames(iris[,3:ncol(iris)])
#> [1] "Petal.Length" "Petal.Width" "Species"
# with data.table
t <- as.data.table(iris)
colnames(t[3:ncol(t)]) # here you get the colnames of the rows 3 to 5
#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
colnames(t[,3:ncol(t)])
#> [1] "Petal.Length" "Petal.Width" "Species"

For your t object

I do not know how you built the t object. I had to remove .internal.selfref = < pointer:0x0000022fd86f1ef0 >, from the output of dput(head(t)) to get an R object of classes data.table and data.frame. This object works fine with print(colnames(t[,3:ncol(t)])) (note the comma). I do not use data.table myself, so I guess you need the syntax with the comma whenever you want to refer to columns of an object of class data.table.

    library(data.table)
t <- structure(
list(
year = c(1949L, 1949L, 1949L, 1949L, 1949L, 1949L),
month = c(1L, 1L, 1L, 1L, 1L, 1L),
day = 1:6,
`Tmoy_AGEN-LA GARENNE` = c(6.9, 5.9, 2.9, 2, 7.9, 7.2),
Tmoy_AUXERRE = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_BALE-MULHOUSE` = c(2.8, 1.3, 2.6, 1.6, 2.5, -0.7),
Tmoy_BESANCON = c(7.6, 2, 3.6, 3.1, 4, 3.3),
`Tmoy_BIARRITZ-PAYS-BASQUE` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_BORDEAUX-MERIGNAC` = c(9.4, 5.5, 6.2, 5.5, 8.9, 3.5),
Tmoy_BOURGES = c(7.6, 4.2, 3.7, 3.1, 6.6, 6.2),
`Tmoy_BREST-GUIPAVAS` = c(5.5, 5, 5.5, 7.8, 8.3, 8.9),
`Tmoy_CHAMBERY-AIX` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_CLERMONT-FD` = c(6.3, 3.8, 5.4, 0.9, 5.5, 5.3),
Tmoy_COGNAC = c(8.6, 4.8, 5.4, 5.3, 8.3, 5.6),
`Tmoy_DIJON-LONGVIC` = c(5.7, 3.8, 3.9, 0.3, 4.5, 4.6),
Tmoy_DINARD = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_GRENOBLE-ST GEOIRS` = c(5.2, 5.3, -0.2, -0.9, 3.9, 1.2),
`Tmoy_LILLE-LESQUIN` = c(5.4, 3.7, 2.8, 3.8, 6.1, 5.5),
Tmoy_LUXEUIL = c(5.9, 1.6, 3.4, -0.6, 3, -1.5),
`Tmoy_LYON-BRON` = c(6.6, 5, 4.2, 1, 5.7, 4.1),
Tmoy_MARIGNANE = c(7.2, 12.3, 5.6, 3.2, 7, 6.3),
`Tmoy_METZ-FRESCATY` = c(4.6, 3.9, 2.6, 1, 4, 4.3),
Tmoy_MONTELIMAR = c(4.9, 9.8, 2.4, 4.6, 8.2, 7.6),
`Tmoy_NANTES-BOUGUENAIS` = c(8.5, 4.9, 4.6, 6.2, 8.2, 5.6),
Tmoy_NICE = c(7.6, 10.5, 7.2, 8.7, 7.6, 11.4),
`Tmoy_NIMES-COURBESSAC` = c(6.2, 10.7, 4.9, 5, 8.9, 10.1),
`Tmoy_PARIS-MONTSOURIS` = c(7.4, 4.4, 3.8, 3.6, 8.1, 6.4),
`Tmoy_PAU-UZEIN` = c(8.3, 6.1, 4.7, 4.8, 9.4, 7.3),
Tmoy_PERPIGNAN = c(8.1, 9.4, 6.5, 7.9, 9.6, 10.6),
`Tmoy_REIMS-PRUNAY` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_ROUEN-BOOS` = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
`Tmoy_ST ETIENNE-BOUTHEON` = c(7, 4.4, 3.2, 0.9, 7.3, 5.3),
`Tmoy_TOULOUSE-BLAGNAC` = c(5.8, 6, 2.5, 3.1, 7.7, 7.7),
Tmoy_TOURS = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)
),
row.names = c(NA, -6L),
class = c("data.table", "data.frame"),
sorted = c("year", "month", "day")
)

Output:

> colnames(t[,3:ncol(t)]) # note the addition of the comma
[1] "day" "Tmoy_AGEN-LA GARENNE" "Tmoy_AUXERRE"
[4] "Tmoy_BALE-MULHOUSE" "Tmoy_BESANCON" "Tmoy_BIARRITZ-PAYS-BASQUE"
[7] "Tmoy_BORDEAUX-MERIGNAC" "Tmoy_BOURGES" "Tmoy_BREST-GUIPAVAS"
[10] "Tmoy_CHAMBERY-AIX" "Tmoy_CLERMONT-FD" "Tmoy_COGNAC"
[13] "Tmoy_DIJON-LONGVIC" "Tmoy_DINARD" "Tmoy_GRENOBLE-ST GEOIRS"
[16] "Tmoy_LILLE-LESQUIN" "Tmoy_LUXEUIL" "Tmoy_LYON-BRON"
[19] "Tmoy_MARIGNANE" "Tmoy_METZ-FRESCATY" "Tmoy_MONTELIMAR"
[22] "Tmoy_NANTES-BOUGUENAIS" "Tmoy_NICE" "Tmoy_NIMES-COURBESSAC"
[25] "Tmoy_PARIS-MONTSOURIS" "Tmoy_PAU-UZEIN" "Tmoy_PERPIGNAN"
[28] "Tmoy_REIMS-PRUNAY" "Tmoy_ROUEN-BOOS" "Tmoy_ST ETIENNE-BOUTHEON"
[31] "Tmoy_TOULOUSE-BLAGNAC" "Tmoy_TOURS"

Referencing a range of columns in dplyr

I think the most dplyr-esque way would be:

# Keep the rows whose values in columns A2 through B1 sum to more than 10.
# select() takes the bare column range directly; the string-based select_()
# is deprecated in current dplyr.
df %>%
  filter(rowSums(select(., A2:B1)) > 10)

Which gives:

#  txt A1 A2 B1 B2
#1 ala 6 9 12 23

R - refer to column names rather than column index when using lapply with data frame

You can use sapply() as follows. The problem in this example is that you cannot set ranges of columns by name easily.

cols <- c("A", "B", "D", "F", "G", "H")

df[,cols] <- sapply(df[,cols], \(x) (5:1)[x])

The easiest way to select by a range of columns is to use eval_select() to return their positions by number. But if you do this, you might as well just use the dplyr solution. This is essentially an under the hood look at it.

library(tidyselect)

# Resolve the selection to the integer positions of the matching columns.
# expr() is an rlang function and is not attached by library(tidyselect),
# so it is called with its namespace.
col_pos <- eval_select(rlang::expr(c(A:B, D, F:H)), df)

# Recode only the selected columns: each value x is replaced by (5:1)[x].
df[, col_pos] <- sapply(df[, col_pos], \(x) (5:1)[x])

How to use a range for columns instead of names for pmax / pmin

Here's an option that does one function-call on all rows, all columns at once.

foo %>%
mutate(maxcol = do.call(pmax, subset(., select = a:e)))
# a b c d e f g h i j k l m n o p q r s t u v w x y z maxcol
# 1 1 4 9 2 4 4 1 10 2 3 10 4 7 1 10 9 8 2 8 9 5 1 9 1 10 9 9
# 2 5 2 5 3 5 2 8 8 5 8 2 3 6 10 9 3 5 8 7 4 6 9 8 5 8 3 5
# 3 10 9 6 1 7 10 6 4 4 7 6 6 2 7 5 5 4 1 10 7 3 10 5 10 1 7 10
# 4 8 1 4 8 9 3 3 9 10 1 8 5 8 4 4 8 6 10 5 2 9 5 7 7 3 1 9
# 5 2 10 2 9 8 9 9 6 7 5 9 2 5 5 7 4 2 5 4 8 4 6 6 2 9 6 10

You can select some or all of the columns using the colon notation, even arbitrary columns:

foo %>%
mutate(maxcol = do.call(pmax, subset(., select = c(a:e,g))))
# a b c d e f g h i j k l m n o p q r s t u v w x y z maxcol
# 1 1 4 9 2 4 4 1 10 2 3 10 4 7 1 10 9 8 2 8 9 5 1 9 1 10 9 9
# 2 5 2 5 3 5 2 8 8 5 8 2 3 6 10 9 3 5 8 7 4 6 9 8 5 8 3 8
# 3 10 9 6 1 7 10 6 4 4 7 6 6 2 7 5 5 4 1 10 7 3 10 5 10 1 7 10
# 4 8 1 4 8 9 3 3 9 10 1 8 5 8 4 4 8 6 10 5 2 9 5 7 7 3 1 9
# 5 2 10 2 9 8 9 9 6 7 5 9 2 5 5 7 4 2 5 4 8 4 6 6 2 9 6 10

The reason this should be preferred over the other answers (which are generally using allegedly idiomatic methods) is because:

  • in Dom's answer, the max function is called once for each row of the frame; R's vectorized ops are not being used, which is inefficient and should be avoided if possible;
  • in akrun's answer, pmax is being called once for each column of the frame, which in this case might sound worse but is actually closer to the best one can do. My answer is closest to akrun's in that we are selecting data within the mutate.

If you'd prefer to use dplyr::select over base::subset, it needs to be broken out as

foo %>%
mutate(maxcol = select(., a:e, g) %>% do.call(pmax, .))

I think this is demonstrated a little better with benchmarks. Using the provided 5x26 frame, we see a clear improvement:

set.seed(42)
foo <- data.frame(sapply(letters, function(x) x = sample(1:10,5)))
microbenchmark::microbenchmark(
Dom = {
foo %>%
rowwise() %>%
summarise(max= max(c_across(a:z)))
},
akr = {
foo %>%
mutate(maxcol = reduce(select(., a:z), pmax))
},
r2 = {
foo %>%
mutate(maxcol = do.call(pmax, subset(., select = a:z)))
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# Dom 6.6561 7.15260 7.61574 7.38345 7.90375 11.0387 100
# akr 4.2849 4.69920 4.96278 4.86110 5.18130 7.0908 100
# r2 2.3290 2.49285 2.68671 2.59180 2.78960 4.7086 100

Let's try with a slightly larger 5000x26:

set.seed(42)
foo <- data.frame(sapply(letters, function(x) x = sample(1:10,5000,replace=TRUE)))
microbenchmark::microbenchmark(
Dom = {
foo %>%
rowwise() %>%
summarise(max= max(c_across(a:z)))
},
akr = {
foo %>%
mutate(maxcol = reduce(select(., a:z), pmax))
},
r2 = {
foo %>%
mutate(maxcol = do.call(pmax, subset(., select = a:z)))
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# Dom 515.6437 563.6060 763.97348 811.45815 883.00115 1775.2366 100
# akr 4.6660 5.1619 11.92847 5.74050 6.50625 293.7444 100
# r2 2.9253 3.4371 4.24548 3.71845 4.27380 14.0958 100

This last one definitely shows a consequence of using rowwise. The relative performance between akrun's answer and this one is almost identical to 5 rows, reinforcing the premise that column-wise is better than row-wise (and all-at-once is faster than both).

This can also be done with purrr::invoke, if truly desired, though it does not speed it up:

library(purrr)
foo %>%
mutate(maxcol = invoke(pmax, subset(., select = a:z)))

### microbenchmark(...)
# Unit: milliseconds
# expr min lq mean median uq max neval
# Dom 7.8292 8.40275 9.02813 8.97345 9.38500 12.4368 100
# akr 4.9622 5.28855 8.78909 5.60090 6.11790 309.2607 100
# r2base 2.5521 2.74635 3.01949 2.90415 3.21060 4.6512 100
# r2purrr 2.5063 2.77510 3.11206 2.93415 3.33015 5.2403 100

use column-name range in data.table like in dplyrs select

What I asked for is possible in the development version of data.table (1.9.5), as can be seen in new feature number 17. Quoting:


  1. .SDcols and with=FALSE understand colA:colB form now. That is, DT[, lapply(.SD, sum), by=V1, .SDcols=V4:V6] and DT[, V5:V7, with=FALSE] works as intended. This is quite useful for interactive use. Closes #748.

the installation of the development-version is explained here

Thanks for pointing this out @AnandaMahto and @Arun!

Also, using select_vars(names(dt), w:y) from dplyr is probably better than names(select(dt, w:y)), as pointed out by @shadow, because it gives more options to select the names, is more readable and conveys the intent more clearly.

Select multiple ranges of columns using column names in data.table

We can use the range part in .SDcols and then append the other column by concatenating

dt[, c(list(a= a), .SD) , .SDcols = c:d]

If there are multiple ranges, we create a sequence of ranges by match, and then get the corresponding column names

# Positions of the first column of each range ("c" and "f") within dt.
i1 <- match(c("c", "f"), names(dt))
# Positions of the last column of each range ("d" and "h") within dt.
j1 <- match(c("d", "h"), names(dt))
# Expand every start:end pair into its run of positions, turn those positions
# back into column names, and put the single columns "a" and "j" around them.
nm1 <- c("a", names(dt)[unlist(Map(`:`, i1, j1))], "j")
# The `..` prefix makes data.table treat nm1 as a variable holding column
# names rather than as a column of dt.
dt[, ..nm1]
# a c d f g h j
#1: 1 11 16 26 31 36 46
#2: 2 12 17 27 32 37 47
#3: 3 13 18 28 33 38 48
#4: 4 14 19 29 34 39 49
#5: 5 15 20 30 35 40 50

Also, the dplyr methods can be used within the data.table

dt[, select(.SD, a, c:d, f:h, j)]
# a c d f g h j
#1: 1 11 16 26 31 36 46
#2: 2 12 17 27 32 37 47
#3: 3 13 18 28 33 38 48
#4: 4 14 19 29 34 39 49
#5: 5 15 20 30 35 40 50


Related Topics



Leave a reply



Submit