Get value of last non-NA row per column in data.table
If the dataset is data.table
, loop through the Subset of Data.table (.SD
), subset the non-NA element (x[!is.na(x)]
) and extract the last element among those with tail
.
df1[, lapply(.SD, function(x) tail(x[!is.na(x)],1))]
# a b c
#1: 63 57 4
How to get value of last non-NA column
You can use max.col
with ties.method
set as "last"
to get last non-NA value in each row.
test$val <- test[cbind(1:nrow(test), max.col(!is.na(test), ties.method = 'last'))]
test
# date a b c val
#1 2020-01-01 4 NA NA 4
#2 2020-01-02 3 2 NA 2
#3 2020-01-03 4 1 5 5
Getting the position of the the last non-NA value in a row in an R data.table
We can use max.col
:
max.col(!is.na(dt[, -1]), ties.method = 'last') * +(rowSums(!is.na(dt[,-1])) > 0)
#[1] 4 2 3 0
creating a table of last non-NA values that only stores last non-NA values
maybe such a solution will be useful
tidyverse
df <- data.frame(
id = c(rep('a', 5), rep('b', 5), rep('c', 5)),
year = c(seq(2011, 2015), seq(2011, 2015), seq(2011, 2015)),
x = c(c(1:3, NA, NA), c(1:5), rep(NA, 5)),
y = c(c(NA, NA, 3, 4, NA), c(NA, 2, 3, 4, NA), rep(NA, 5)),
z = c(c(1:5), c(1:4, NA), rep(NA, 5))
)
library(tidyverse)
df %>%
pivot_longer(-c(id, year), values_drop_na = TRUE) %>%
group_by(id, name) %>%
slice_tail(n = 1) %>%
ungroup() %>%
pivot_wider(id_cols = c(id, year), names_from = name, values_from = value)
#> # A tibble: 5 x 5
#> id year x y z
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 a 2013 3 NA NA
#> 2 a 2014 NA 4 NA
#> 3 a 2015 NA NA 5
#> 4 b 2015 5 NA NA
#> 5 b 2014 NA 4 4
Created on 2021-11-24 by the reprex package (v2.0.1)
data.table
library(data.table)
library(magrittr)
melt(data = setDT(df), id.vars = c("id", "year"), na.rm = TRUE) %>%
.[order(id, year), last(.SD), by = list(id, variable)] %>%
dcast(formula = id + year ~ variable)
#> id year x y z
#> 1: a 2013 3 NA NA
#> 2: a 2014 NA 4 NA
#> 3: a 2015 NA NA 5
#> 4: b 2014 NA 4 4
#> 5: b 2015 5 NA NA
Created on 2021-11-24 by the reprex package (v2.0.1)
Get the latest non-NA value based on date column by group
library(dplyr)
library(tidyr)
df1 %>%
mutate(date = as.Date(date)) %>%
group_by(country_name) %>%
arrange(date) %>%
select(-date) %>%
fill(everything()) %>%
slice(n())
#> # A tibble: 2 x 4
#> # Groups: country_name [2]
#> country_name column_1 column_2 column_3
#> <chr> <dbl> <int> <dbl>
#> 1 UK 0.5 3 NA
#> 2 US 10 3 7.3
Data:
read.table(text = "country_name date column_1 column_2 column_3
US 2016-11-02 7.5 NA NA
US 2017-09-12 NA NA 9
US 2017-09-19 NA 8 10
US 2020-02-10 10 NA NA
US 2021-03-10 NA NA 7.3
US 2021-05-02 NA 3 NA
UK 2016-11-02 NA 2 NA
UK 2017-09-12 0.5 3 NA",
header = T, stringsAsFactors = F) -> df1
Return years of first and last non-NA values for each column R
We can reshape into 'long' format and then do a group by the 'name' and summarise
to get the min
and max
'year'
library(dplyr)
library(tidyr)
library(tibble)
df1 %>%
select(-group) %>%
pivot_longer(cols = starts_with('value'), values_drop_na = TRUE) %>%
group_by(name) %>%
summarise(start = min(year), end = max(year)) %>%
column_to_rownames('name')
# start end
#value1 2001 2002
#value2 2000 2002
#value3 2000 2001
Or with melt
from data.table
library(data.table)
melt(setDT(df1), id.var = c('year', 'group'), na.rm = TRUE)[,
.(start = min(year), end = max(year)), .(variable)]
Or we could also make use of summarise_at
df1 %>%
summarise_at(vars(starts_with('value')), ~
list(range(year[!is.na(.)]))) %>%
unnest(everything()) %>%
pivot_longer(everything())
data
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L),
year = c(2000L, 2001L, 2002L, 2000L, 2001L, 2002L, 2000L,
2001L, 2002L), value1 = c(NA, 8L, 4L, NA, 9L, 1L, NA, 9L,
NA), value2 = c(3L, 3L, 3L, NA, NA, NA, 5L, 5L, 5L), value3 = c(4L,
4L, NA, 1L, 1L, NA, NA, NA, NA)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
Selecting non `NA` values from duplicate rows with `data.table` -- when having more than one grouping variable
Here some data.table-based solutions.
setDT(df_id_year_and_type)
method 1
na.omit(df_id_year_and_type, cols="type")
drops NA
rows based on column type
.unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE)
finds all the groups.
And by joining them (using the last match: mult="last"
), we obtain the desired output.
na.omit(df_id_year_and_type, cols="type"
)[unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE),
on=c('id', 'year'),
mult="last"]
# id year type
# <num> <num> <char>
# 1: 1 2002 A
# 2: 2 2008 B
# 3: 3 2010 D
# 4: 3 2013 <NA>
# 5: 4 2020 C
# 6: 5 2009 A
# 7: 6 2010 B
# 8: 6 2012 <NA>
method 2
df_id_year_and_type[df_id_year_and_type[, .I[which.max(cumsum(!is.na(type)))], .(id, year)]$V1,]
method 3
(likely slower because of [
overhead)
df_id_year_and_type[, .SD[which.max(cumsum(!is.na(type)))], .(id, year)]
Extracting latest non-NA value in data frame based on grouping
We can create a function which gives us the latest non-NA value based on Day
for each Vn
column
get_last_non_NA_value <- function(x) {
x[which.max(cumsum(!is.na(x)))]
}
and then apply that function for each Year
and ID
library(dplyr)
df %>%
group_by(Year, ID) %>%
summarise_at(vars(V1:V2), funs(get_last_non_NA_value(.[order(Day)])))
# Year ID V1 V2
# <int> <int> <int> <int>
#1 2003 1102 3 8
#2 2003 1103 5 10
#3 2003 1104 9 100
#4 2018 1102 3 6
#5 2018 1103 7 NA
#6 2018 1104 5 100
EDIT
If we also want to extract corresponding Day
for each value, we can change the function to return both values as comma-separated string
get_last_non_NA_value <- function(x, y) {
ind <- which.max(cumsum(!is.na(x[order(y)])))
paste(x[ind], y[ind], sep = ",")
}
and then use cSplit
to separate these comma separated values into different columns.
library(dplyr)
library(splitstackshape)
cols <- c("V1", "V2")
df %>%
group_by(Year, ID) %>%
summarise_at(cols, funs(get_last_non_NA_value(., Day))) %>%
cSplit(cols) %>%
rename_at(vars(contains("_1")), funs(sub("_1", "_last_value", .))) %>%
rename_at(vars(contains("_2")), funs(sub("_2", "_days", .)))
# Year ID V1_last_value V1_days V2_last_value V2_days
#1: 2003 1102 3 35 8 40
#2: 2003 1103 5 35 10 40
#3: 2003 1104 9 40 100 35
#4: 2018 1102 3 50 6 50
#5: 2018 1103 7 50 NA 50
#6: 2018 1104 5 49 100 50
Note that rename_at
part renames the columns for better understanding of what value it holds, you can skip that part if you are not interested in renaming columns.
data
df <- structure(list(Year = c(2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2018L, 2018L, 2018L, 2018L), Day = c(35L, 35L, 35L, 40L, 40L,
40L, 49L, 50L, 50L, 50L), ID = c(1102L, 1103L, 1104L, 1102L,
1103L, 1104L, 1104L, 1102L, 1103L, 1104L), V1 = c(3L, 5L, 8L,
NA, NA, 9L, 5L, 3L, 7L, NA), V2 = c(6L, NA, 100L, 8L, 10L, NA,
NA, 6L, NA, 100L)), .Names = c("Year", "Day", "ID", "V1", "V2"
), class = "data.frame", row.names = c(NA, -10L))
Extract last non-missing value in row with data.table
Here's another way:
dat[, res := NA_character_]
for (v in rev(names(dat))[-1]) dat[is.na(res), res := get(v)]
X1 X2 X3 X4 X5 res
1: u NA NA NA NA u
2: f q NA NA NA q
3: f b w NA NA w
4: k g h NA NA h
5: u b r NA NA r
6: f q w x t t
7: u g h i e e
8: u q r n t t
Benchmarks Using the same data as @alexis_laz and making (apparently) superficial changes to the functions, I see different results. Just showing them here in case anyone is curious. Alexis' answer (with small modifications) still comes out ahead.
Functions:
alex = function(x, ans = rep_len(NA, length(x[[1L]])), wh = seq_len(length(x[[1L]]))){
if(!length(wh)) return(ans)
ans[wh] = as.character(x[[length(x)]])[wh]
Recall(x[-length(x)], ans, wh[is.na(ans[wh])])
}
alex2 = function(x){
x[, res := NA_character_]
wh = x[, .I]
for (v in (length(x)-1):1){
if (!length(wh)) break
set(x, j="res", i=wh, v = x[[v]][wh])
wh = wh[is.na(x$res[wh])]
}
x$res
}
frank = function(x){
x[, res := NA_character_]
for(v in rev(names(x))[-1]) x[is.na(res), res := get(v)]
return(x$res)
}
frank2 = function(x){
x[, res := NA_character_]
for(v in rev(names(x))[-1]) x[is.na(res), res := .SD, .SDcols=v]
x$res
}
Example data and benchmark:
DAT1 = as.data.table(lapply(ceiling(seq(0, 1e4, length.out = 1e2)),
function(n) c(rep(NA, n), sample(letters, 3e5 - n, TRUE))))
DAT2 = copy(DAT1)
DAT3 = as.list(copy(DAT1))
DAT4 = copy(DAT1)
library(microbenchmark)
microbenchmark(frank(DAT1), frank2(DAT2), alex(DAT3), alex2(DAT4), times = 30)
Unit: milliseconds
expr min lq mean median uq max neval
frank(DAT1) 850.05980 909.28314 985.71700 979.84230 1023.57049 1183.37898 30
frank2(DAT2) 88.68229 93.40476 118.27959 107.69190 121.60257 346.48264 30
alex(DAT3) 98.56861 109.36653 131.21195 131.20760 149.99347 183.43918 30
alex2(DAT4) 26.14104 26.45840 30.79294 26.67951 31.24136 50.66723 30
Select set of columns so that each row has at least one non-NA entry
Using a while
loop, this should work to get the minimum set of variables with at least one non-NA per row.
best <- function(df){
best <- which.max(colSums(sapply(df, complete.cases)))
while(any(rowSums(sapply(df[best], complete.cases)) == 0)){
best <- c(best, which.max(sapply(df[is.na(df[best]), ], \(x) sum(complete.cases(x)))))
}
best
}
testing
best(df)
#d c
#4 3
df[best(df)]
# d c
#1 1 1
#2 1 NA
#3 1 NA
#4 1 NA
#5 NA 1
First, select the column with the least NAs (stored in best
). Then, update the vector with the column that has the highest number of non-NA rows on the remaining rows (where best has still NAs), until you get every rows with a complete case.
Related Topics
Grouped Bar Graph Custom Colours
How to Merge Two Data Frame Based on Partial String Match with R
R: How to Create Grid-Graphics
Reshape Data from Long to Wide Format - More Than One Variable
Visual Bug When Changing Robinson Projection's Central Meridian with Ggplot2
Count Number of Distinct Values in a Vector
Code Folding for Individual Chunks in R Markdown
Filling in the Area Under a Line Graph in Ggplot2: Geom_Area()
R: "Make" Not Found When Installing a R-Package from Local Tar.Gz
Assignment to Empty Index (Empty Square Brackets X[]<-) on Lhs
R Table Function - How to Remove 0 Counts
Error in Install.Packages:Type =="Both" Cannot Be Used with 'Repos =Null'
Why Are Probabilities and Response in Ksvm in R Not Consistent