Get Value of Last Non-Na Row Per Column in Data.Table

Get value of last non-NA row per column in data.table

If the dataset is data.table, loop through the Subset of Data.table (.SD), subset the non-NA element (x[!is.na(x)]) and extract the last element among those with tail.

df1[, lapply(.SD, function(x) tail(x[!is.na(x)],1))]
#   a  b c
#1: 63 57 4

How to get value of last non-NA column

You can use max.col with ties.method set as "last" to get last non-NA value in each row.

test$val <- test[cbind(1:nrow(test), max.col(!is.na(test), ties.method = 'last'))]
test

#        date a  b  c val
#1 2020-01-01 4 NA NA   4
#2 2020-01-02 3  2 NA   2
#3 2020-01-03 4  1  5   5

Getting the position of the the last non-NA value in a row in an R data.table

We can use max.col :

max.col(!is.na(dt[, -1]), ties.method = 'last') * +(rowSums(!is.na(dt[,-1])) > 0)
#[1] 4 2 3 0

creating a table of last non-NA values that only stores last non-NA values

maybe such a solution will be useful

tidyverse

df <- data.frame(
  id = c(rep('a', 5), rep('b', 5), rep('c', 5)),
  year = c(seq(2011, 2015), seq(2011, 2015), seq(2011, 2015)),
  x = c(c(1:3, NA, NA), c(1:5), rep(NA, 5)),
  y = c(c(NA, NA, 3, 4, NA), c(NA, 2, 3, 4, NA), rep(NA, 5)),
  z = c(c(1:5), c(1:4, NA), rep(NA, 5))
)

library(tidyverse)
df %>% 
  pivot_longer(-c(id, year), values_drop_na = TRUE) %>% 
  group_by(id, name) %>% 
  slice_tail(n = 1) %>% 
  ungroup() %>% 
  pivot_wider(id_cols = c(id, year), names_from = name, values_from = value)
#> # A tibble: 5 x 5
#>   id     year     x     y     z
#>   <chr> <int> <dbl> <dbl> <dbl>
#> 1 a      2013     3    NA    NA
#> 2 a      2014    NA     4    NA
#> 3 a      2015    NA    NA     5
#> 4 b      2015     5    NA    NA
#> 5 b      2014    NA     4     4

^{Created on 2021-11-24 by the reprex package (v2.0.1)}

data.table


library(data.table)
library(magrittr)

melt(data = setDT(df), id.vars = c("id", "year"), na.rm = TRUE) %>% 
  .[order(id, year), last(.SD), by = list(id, variable)] %>% 
  dcast(formula = id + year ~ variable)

#>    id year  x  y  z
#> 1:  a 2013  3 NA NA
#> 2:  a 2014 NA  4 NA
#> 3:  a 2015 NA NA  5
#> 4:  b 2014 NA  4  4
#> 5:  b 2015  5 NA NA

^{Created on 2021-11-24 by the reprex package (v2.0.1)}

Get the latest non-NA value based on date column by group

library(dplyr)
library(tidyr)

df1 %>% 
  mutate(date = as.Date(date)) %>% 
  group_by(country_name) %>%
  arrange(date) %>%
  select(-date) %>% 
  fill(everything()) %>% 
  slice(n())

#> # A tibble: 2 x 4
#> # Groups:   country_name [2]
#>   country_name column_1 column_2 column_3
#>   <chr>           <dbl>    <int>    <dbl>
#> 1 UK                0.5        3     NA  
#> 2 US               10          3      7.3

Data:

read.table(text = "country_name  date         column_1 column_2 column_3
                   US            2016-11-02   7.5      NA       NA      
                   US            2017-09-12   NA       NA       9       
                   US            2017-09-19   NA       8        10      
                   US            2020-02-10   10       NA       NA      
                   US            2021-03-10   NA       NA       7.3     
                   US            2021-05-02   NA       3        NA      
                   UK            2016-11-02   NA       2        NA      
                   UK            2017-09-12   0.5      3        NA", 
           header = T, stringsAsFactors = F) -> df1

Return years of first and last non-NA values for each column R

We can reshape into 'long' format and then do a group by the 'name' and summarise to get the min and max 'year'

library(dplyr)
library(tidyr)
library(tibble)
df1 %>%
   select(-group) %>%
   pivot_longer(cols  = starts_with('value'), values_drop_na = TRUE) %>% 
   group_by(name) %>%
   summarise(start = min(year), end = max(year))  %>%
   column_to_rownames('name')
#        start  end
#value1  2001 2002
#value2  2000 2002
#value3  2000 2001

Or with melt from data.table

library(data.table)
melt(setDT(df1), id.var = c('year', 'group'), na.rm = TRUE)[,
     .(start = min(year), end = max(year)), .(variable)]

Or we could also make use of summarise_at

df1 %>%
    summarise_at(vars(starts_with('value')), ~ 
       list(range(year[!is.na(.)]))) %>% 
    unnest(everything()) %>% 
    pivot_longer(everything())

data

df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), 
    year = c(2000L, 2001L, 2002L, 2000L, 2001L, 2002L, 2000L, 
    2001L, 2002L), value1 = c(NA, 8L, 4L, NA, 9L, 1L, NA, 9L, 
    NA), value2 = c(3L, 3L, 3L, NA, NA, NA, 5L, 5L, 5L), value3 = c(4L, 
    4L, NA, 1L, 1L, NA, NA, NA, NA)), class = "data.frame", 
    row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9"))

Selecting non `NA` values from duplicate rows with `data.table` -- when having more than one grouping variable

Here some data.table-based solutions.

setDT(df_id_year_and_type)

method 1

na.omit(df_id_year_and_type, cols="type") drops NA rows based on column type.
unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE) finds all the groups.
And by joining them (using the last match: mult="last"), we obtain the desired output.

na.omit(df_id_year_and_type, cols="type"
        )[unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE), 
          on=c('id', 'year'), 
          mult="last"]

#       id  year   type
#    <num> <num> <char>
# 1:     1  2002      A
# 2:     2  2008      B
# 3:     3  2010      D
# 4:     3  2013   <NA>
# 5:     4  2020      C
# 6:     5  2009      A
# 7:     6  2010      B
# 8:     6  2012   <NA>

method 2

df_id_year_and_type[df_id_year_and_type[, .I[which.max(cumsum(!is.na(type)))], .(id, year)]$V1,]

method 3

(likely slower because of [ overhead)

df_id_year_and_type[, .SD[which.max(cumsum(!is.na(type)))], .(id, year)]

Extracting latest non-NA value in data frame based on grouping

We can create a function which gives us the latest non-NA value based on Day for each Vn column

get_last_non_NA_value <- function(x) {
   x[which.max(cumsum(!is.na(x)))]
}

and then apply that function for each Year and ID

library(dplyr)

df %>%
  group_by(Year, ID) %>%
  summarise_at(vars(V1:V2), funs(get_last_non_NA_value(.[order(Day)])))

#    Year  ID    V1    V2
#  <int> <int> <int> <int>
#1  2003  1102     3     8
#2  2003  1103     5    10
#3  2003  1104     9   100
#4  2018  1102     3     6
#5  2018  1103     7    NA
#6  2018  1104     5   100

EDIT

If we also want to extract corresponding Day for each value, we can change the function to return both values as comma-separated string

get_last_non_NA_value <- function(x, y) {
   ind <- which.max(cumsum(!is.na(x[order(y)])))
   paste(x[ind], y[ind], sep = ",")
}

and then use cSplit to separate these comma separated values into different columns.

library(dplyr)
library(splitstackshape)
cols <- c("V1", "V2")

df %>%
 group_by(Year, ID) %>%
 summarise_at(cols, funs(get_last_non_NA_value(., Day))) %>%
 cSplit(cols) %>%
 rename_at(vars(contains("_1")), funs(sub("_1", "_last_value", .))) %>%
 rename_at(vars(contains("_2")), funs(sub("_2", "_days", .)))

#   Year   ID V1_last_value V1_days V2_last_value V2_days
#1: 2003 1102             3      35             8      40
#2: 2003 1103             5      35            10      40
#3: 2003 1104             9      40           100      35
#4: 2018 1102             3      50             6      50
#5: 2018 1103             7      50            NA      50
#6: 2018 1104             5      49           100      50

Note that rename_at part renames the columns for better understanding of what value it holds, you can skip that part if you are not interested in renaming columns.

data

df <- structure(list(Year = c(2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 
2018L, 2018L, 2018L, 2018L), Day = c(35L, 35L, 35L, 40L, 40L, 
40L, 49L, 50L, 50L, 50L), ID = c(1102L, 1103L, 1104L, 1102L, 
1103L, 1104L, 1104L, 1102L, 1103L, 1104L), V1 = c(3L, 5L, 8L, 
NA, NA, 9L, 5L, 3L, 7L, NA), V2 = c(6L, NA, 100L, 8L, 10L, NA, 
NA, 6L, NA, 100L)), .Names = c("Year", "Day", "ID", "V1", "V2"
), class = "data.frame", row.names = c(NA, -10L))

Extract last non-missing value in row with data.table

Here's another way:

dat[, res := NA_character_]
for (v in rev(names(dat))[-1]) dat[is.na(res), res := get(v)]

   X1 X2 X3 X4 X5 res
1:  u NA NA NA NA   u
2:  f  q NA NA NA   q
3:  f  b  w NA NA   w
4:  k  g  h NA NA   h
5:  u  b  r NA NA   r
6:  f  q  w  x  t   t
7:  u  g  h  i  e   e
8:  u  q  r  n  t   t

Benchmarks Using the same data as @alexis_laz and making (apparently) superficial changes to the functions, I see different results. Just showing them here in case anyone is curious. Alexis' answer (with small modifications) still comes out ahead.

Functions:

alex = function(x, ans = rep_len(NA, length(x[[1L]])), wh = seq_len(length(x[[1L]]))){
    if(!length(wh)) return(ans)
    ans[wh] = as.character(x[[length(x)]])[wh]
    Recall(x[-length(x)], ans, wh[is.na(ans[wh])])
}   

alex2 = function(x){
    x[, res := NA_character_]
    wh = x[, .I]
    for (v in (length(x)-1):1){
      if (!length(wh)) break
      set(x, j="res", i=wh, v = x[[v]][wh])
      wh = wh[is.na(x$res[wh])]
    }
    x$res
}

frank = function(x){
    x[, res := NA_character_]
    for(v in rev(names(x))[-1]) x[is.na(res), res := get(v)]
    return(x$res)       
}

frank2 = function(x){
    x[, res := NA_character_]
    for(v in rev(names(x))[-1]) x[is.na(res), res := .SD, .SDcols=v]
    x$res
}

Example data and benchmark:

DAT1 = as.data.table(lapply(ceiling(seq(0, 1e4, length.out = 1e2)), 
                     function(n) c(rep(NA, n), sample(letters, 3e5 - n, TRUE))))
DAT2 = copy(DAT1)
DAT3 = as.list(copy(DAT1))
DAT4 = copy(DAT1)

library(microbenchmark)
microbenchmark(frank(DAT1), frank2(DAT2), alex(DAT3), alex2(DAT4), times = 30)

Unit: milliseconds
         expr       min        lq      mean    median         uq        max neval
  frank(DAT1) 850.05980 909.28314 985.71700 979.84230 1023.57049 1183.37898    30
 frank2(DAT2)  88.68229  93.40476 118.27959 107.69190  121.60257  346.48264    30
   alex(DAT3)  98.56861 109.36653 131.21195 131.20760  149.99347  183.43918    30
  alex2(DAT4)  26.14104  26.45840  30.79294  26.67951   31.24136   50.66723    30

Select set of columns so that each row has at least one non-NA entry

Using a while loop, this should work to get the minimum set of variables with at least one non-NA per row.

best <- function(df){
  best <- which.max(colSums(sapply(df, complete.cases)))
  while(any(rowSums(sapply(df[best], complete.cases)) == 0)){
    best <- c(best, which.max(sapply(df[is.na(df[best]), ], \(x) sum(complete.cases(x)))))
  }
  best
}

testing

best(df)
#d c 
#4 3

df[best(df)]
#   d  c
#1  1  1
#2  1 NA
#3  1 NA
#4  1 NA
#5 NA  1

First, select the column with the least NAs (stored in best). Then, update the vector with the column that has the highest number of non-NA rows on the remaining rows (where best has still NAs), until you get every rows with a complete case.

Get Value of Last Non-Na Row Per Column in Data.Table