Get Value of Last Non-Na Row Per Column in Data.Table

Get value of last non-NA row per column in data.table

If the dataset is data.table, loop through the Subset of Data.table (.SD), subset the non-NA element (x[!is.na(x)]) and extract the last element among those with tail.

df1[, lapply(.SD, function(x) tail(x[!is.na(x)],1))]
# a b c
#1: 63 57 4

How to get value of last non-NA column

You can use max.col with ties.method set as "last" to get last non-NA value in each row.

test$val <- test[cbind(1:nrow(test), max.col(!is.na(test), ties.method = 'last'))]
test

# date a b c val
#1 2020-01-01 4 NA NA 4
#2 2020-01-02 3 2 NA 2
#3 2020-01-03 4 1 5 5

Getting the position of the the last non-NA value in a row in an R data.table

We can use max.col :

max.col(!is.na(dt[, -1]), ties.method = 'last') * +(rowSums(!is.na(dt[,-1])) > 0)
#[1] 4 2 3 0

creating a table of last non-NA values that only stores last non-NA values

maybe such a solution will be useful

tidyverse

df <- data.frame(
id = c(rep('a', 5), rep('b', 5), rep('c', 5)),
year = c(seq(2011, 2015), seq(2011, 2015), seq(2011, 2015)),
x = c(c(1:3, NA, NA), c(1:5), rep(NA, 5)),
y = c(c(NA, NA, 3, 4, NA), c(NA, 2, 3, 4, NA), rep(NA, 5)),
z = c(c(1:5), c(1:4, NA), rep(NA, 5))
)

library(tidyverse)
df %>%
pivot_longer(-c(id, year), values_drop_na = TRUE) %>%
group_by(id, name) %>%
slice_tail(n = 1) %>%
ungroup() %>%
pivot_wider(id_cols = c(id, year), names_from = name, values_from = value)
#> # A tibble: 5 x 5
#> id year x y z
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 a 2013 3 NA NA
#> 2 a 2014 NA 4 NA
#> 3 a 2015 NA NA 5
#> 4 b 2015 5 NA NA
#> 5 b 2014 NA 4 4

Created on 2021-11-24 by the reprex package (v2.0.1)

data.table


library(data.table)
library(magrittr)

melt(data = setDT(df), id.vars = c("id", "year"), na.rm = TRUE) %>%
.[order(id, year), last(.SD), by = list(id, variable)] %>%
dcast(formula = id + year ~ variable)

#> id year x y z
#> 1: a 2013 3 NA NA
#> 2: a 2014 NA 4 NA
#> 3: a 2015 NA NA 5
#> 4: b 2014 NA 4 4
#> 5: b 2015 5 NA NA

Created on 2021-11-24 by the reprex package (v2.0.1)

Get the latest non-NA value based on date column by group

library(dplyr)
library(tidyr)

df1 %>%
mutate(date = as.Date(date)) %>%
group_by(country_name) %>%
arrange(date) %>%
select(-date) %>%
fill(everything()) %>%
slice(n())

#> # A tibble: 2 x 4
#> # Groups: country_name [2]
#> country_name column_1 column_2 column_3
#> <chr> <dbl> <int> <dbl>
#> 1 UK 0.5 3 NA
#> 2 US 10 3 7.3

Data:

read.table(text = "country_name  date         column_1 column_2 column_3
US 2016-11-02 7.5 NA NA
US 2017-09-12 NA NA 9
US 2017-09-19 NA 8 10
US 2020-02-10 10 NA NA
US 2021-03-10 NA NA 7.3
US 2021-05-02 NA 3 NA
UK 2016-11-02 NA 2 NA
UK 2017-09-12 0.5 3 NA",
header = T, stringsAsFactors = F) -> df1

Return years of first and last non-NA values for each column R

We can reshape into 'long' format and then do a group by the 'name' and summarise to get the min and max 'year'

library(dplyr)
library(tidyr)
library(tibble)
df1 %>%
select(-group) %>%
pivot_longer(cols = starts_with('value'), values_drop_na = TRUE) %>%
group_by(name) %>%
summarise(start = min(year), end = max(year)) %>%
column_to_rownames('name')
# start end
#value1 2001 2002
#value2 2000 2002
#value3 2000 2001

Or with melt from data.table

library(data.table)
melt(setDT(df1), id.var = c('year', 'group'), na.rm = TRUE)[,
.(start = min(year), end = max(year)), .(variable)]

Or we could also make use of summarise_at

df1 %>%
summarise_at(vars(starts_with('value')), ~
list(range(year[!is.na(.)]))) %>%
unnest(everything()) %>%
pivot_longer(everything())

data

df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), 
year = c(2000L, 2001L, 2002L, 2000L, 2001L, 2002L, 2000L,
2001L, 2002L), value1 = c(NA, 8L, 4L, NA, 9L, 1L, NA, 9L,
NA), value2 = c(3L, 3L, 3L, NA, NA, NA, 5L, 5L, 5L), value3 = c(4L,
4L, NA, 1L, 1L, NA, NA, NA, NA)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))

Selecting non `NA` values from duplicate rows with `data.table` -- when having more than one grouping variable

Here some data.table-based solutions.

setDT(df_id_year_and_type)

method 1

na.omit(df_id_year_and_type, cols="type") drops NA rows based on column type.
unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE) finds all the groups.
And by joining them (using the last match: mult="last"), we obtain the desired output.

na.omit(df_id_year_and_type, cols="type"
)[unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE),
on=c('id', 'year'),
mult="last"]

# id year type
# <num> <num> <char>
# 1: 1 2002 A
# 2: 2 2008 B
# 3: 3 2010 D
# 4: 3 2013 <NA>
# 5: 4 2020 C
# 6: 5 2009 A
# 7: 6 2010 B
# 8: 6 2012 <NA>

method 2

df_id_year_and_type[df_id_year_and_type[, .I[which.max(cumsum(!is.na(type)))], .(id, year)]$V1,]

method 3

(likely slower because of [ overhead)

df_id_year_and_type[, .SD[which.max(cumsum(!is.na(type)))], .(id, year)]

Extracting latest non-NA value in data frame based on grouping

We can create a function which gives us the latest non-NA value based on Day for each Vn column

get_last_non_NA_value <- function(x) {
x[which.max(cumsum(!is.na(x)))]
}

and then apply that function for each Year and ID

library(dplyr)

df %>%
group_by(Year, ID) %>%
summarise_at(vars(V1:V2), funs(get_last_non_NA_value(.[order(Day)])))

# Year ID V1 V2
# <int> <int> <int> <int>
#1 2003 1102 3 8
#2 2003 1103 5 10
#3 2003 1104 9 100
#4 2018 1102 3 6
#5 2018 1103 7 NA
#6 2018 1104 5 100

EDIT

If we also want to extract corresponding Day for each value, we can change the function to return both values as comma-separated string

get_last_non_NA_value <- function(x, y) {
ind <- which.max(cumsum(!is.na(x[order(y)])))
paste(x[ind], y[ind], sep = ",")
}

and then use cSplit to separate these comma separated values into different columns.

library(dplyr)
library(splitstackshape)
cols <- c("V1", "V2")

df %>%
group_by(Year, ID) %>%
summarise_at(cols, funs(get_last_non_NA_value(., Day))) %>%
cSplit(cols) %>%
rename_at(vars(contains("_1")), funs(sub("_1", "_last_value", .))) %>%
rename_at(vars(contains("_2")), funs(sub("_2", "_days", .)))

# Year ID V1_last_value V1_days V2_last_value V2_days
#1: 2003 1102 3 35 8 40
#2: 2003 1103 5 35 10 40
#3: 2003 1104 9 40 100 35
#4: 2018 1102 3 50 6 50
#5: 2018 1103 7 50 NA 50
#6: 2018 1104 5 49 100 50

Note that rename_at part renames the columns for better understanding of what value it holds, you can skip that part if you are not interested in renaming columns.

data

df <- structure(list(Year = c(2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2018L, 2018L, 2018L, 2018L), Day = c(35L, 35L, 35L, 40L, 40L,
40L, 49L, 50L, 50L, 50L), ID = c(1102L, 1103L, 1104L, 1102L,
1103L, 1104L, 1104L, 1102L, 1103L, 1104L), V1 = c(3L, 5L, 8L,
NA, NA, 9L, 5L, 3L, 7L, NA), V2 = c(6L, NA, 100L, 8L, 10L, NA,
NA, 6L, NA, 100L)), .Names = c("Year", "Day", "ID", "V1", "V2"
), class = "data.frame", row.names = c(NA, -10L))

Extract last non-missing value in row with data.table

Here's another way:

dat[, res := NA_character_]
for (v in rev(names(dat))[-1]) dat[is.na(res), res := get(v)]

X1 X2 X3 X4 X5 res
1: u NA NA NA NA u
2: f q NA NA NA q
3: f b w NA NA w
4: k g h NA NA h
5: u b r NA NA r
6: f q w x t t
7: u g h i e e
8: u q r n t t

Benchmarks Using the same data as @alexis_laz and making (apparently) superficial changes to the functions, I see different results. Just showing them here in case anyone is curious. Alexis' answer (with small modifications) still comes out ahead.

Functions:

alex = function(x, ans = rep_len(NA, length(x[[1L]])), wh = seq_len(length(x[[1L]]))){
if(!length(wh)) return(ans)
ans[wh] = as.character(x[[length(x)]])[wh]
Recall(x[-length(x)], ans, wh[is.na(ans[wh])])
}

alex2 = function(x){
x[, res := NA_character_]
wh = x[, .I]
for (v in (length(x)-1):1){
if (!length(wh)) break
set(x, j="res", i=wh, v = x[[v]][wh])
wh = wh[is.na(x$res[wh])]
}
x$res
}

frank = function(x){
x[, res := NA_character_]
for(v in rev(names(x))[-1]) x[is.na(res), res := get(v)]
return(x$res)
}

frank2 = function(x){
x[, res := NA_character_]
for(v in rev(names(x))[-1]) x[is.na(res), res := .SD, .SDcols=v]
x$res
}

Example data and benchmark:

DAT1 = as.data.table(lapply(ceiling(seq(0, 1e4, length.out = 1e2)), 
function(n) c(rep(NA, n), sample(letters, 3e5 - n, TRUE))))
DAT2 = copy(DAT1)
DAT3 = as.list(copy(DAT1))
DAT4 = copy(DAT1)

library(microbenchmark)
microbenchmark(frank(DAT1), frank2(DAT2), alex(DAT3), alex2(DAT4), times = 30)

Unit: milliseconds
expr min lq mean median uq max neval
frank(DAT1) 850.05980 909.28314 985.71700 979.84230 1023.57049 1183.37898 30
frank2(DAT2) 88.68229 93.40476 118.27959 107.69190 121.60257 346.48264 30
alex(DAT3) 98.56861 109.36653 131.21195 131.20760 149.99347 183.43918 30
alex2(DAT4) 26.14104 26.45840 30.79294 26.67951 31.24136 50.66723 30

Select set of columns so that each row has at least one non-NA entry

Using a while loop, this should work to get the minimum set of variables with at least one non-NA per row.

best <- function(df){
best <- which.max(colSums(sapply(df, complete.cases)))
while(any(rowSums(sapply(df[best], complete.cases)) == 0)){
best <- c(best, which.max(sapply(df[is.na(df[best]), ], \(x) sum(complete.cases(x)))))
}
best
}

testing

best(df)
#d c
#4 3

df[best(df)]
# d c
#1 1 1
#2 1 NA
#3 1 NA
#4 1 NA
#5 NA 1

First, select the column with the least NAs (stored in best). Then, update the vector with the column that has the highest number of non-NA rows on the remaining rows (where best has still NAs), until you get every rows with a complete case.



Related Topics



Leave a reply



Submit