R: Find First Non-Na Observation in Data.Table Column by Group

R: find first non-NA observation in data.table column by group

Here's one way:

# Among the rows with a non-missing Petal.Width, flag the first row of each
# Species with 1L and every later row with 0L. Rows where Petal.Width is NA
# are left out of the assignment.
DT[
  !is.na(Petal.Width),
  first := c(1L, integer(.N - 1L)),
  by = Species
]

Using dplyr::first() to choose the first non-NA value

Use na.omit() to drop the missing values before calling first(). Compare:

# Plain first() returns the leading element even when it is missing.
x <- c(NA, 11, 22)
first(x)
# [1] NA

# Dropping the NAs beforehand makes it return the first observed value.
first(na.omit(x))
# [1] 11

Using example data:

# Blank out two observations so that one group starts with NA and another
# ends with NA, then attach each group's first and last non-missing value
# to every row of that group. The result stays grouped by `group`.
d %>%
  mutate(
    value = case_when(
      year == 2000 & group == 2 ~ NA_integer_,
      year == 2002 & group == 3 ~ NA_integer_,
      TRUE ~ value
    )
  ) %>%
  group_by(group) %>%
  mutate(
    first = dplyr::first(na.omit(value)),
    last = dplyr::last(na.omit(value))
  )

# # A tibble: 9 x 5
# # Groups:   group [3]
#   group  year value first  last
#   <int> <dbl> <int> <int> <int>
# 1     1  2000     3     3     4
# 2     1  2001     8     3     4
# 3     1  2002     4     3     4
# 4     2  2000    NA     9     1
# 5     2  2001     9     9     1
# 6     2  2002     1     9     1
# 7     3  2000     5     5     9
# 8     3  2001     9     5     9
# 9     3  2002    NA     5     9

Selecting non `NA` values from duplicate rows with `data.table` -- when having more than one grouping variable

Here are some data.table-based solutions.

# Convert df_id_year_and_type to a data.table by reference (no copy), so the
# data.table syntax used by the methods below applies to it.
setDT(df_id_year_and_type)

method 1

na.omit(df_id_year_and_type, cols="type") drops NA rows based on column type.
unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE) finds all the groups.
And by joining them (using the last match: mult="last"), we obtain the desired output.

# Look up every distinct (id, year) pair among the rows that have a known
# `type`. mult = "last" keeps only the last matching row per pair; pairs
# without any non-NA `type` still appear in the result, with type = NA.
na.omit(df_id_year_and_type, cols = "type")[
  unique(df_id_year_and_type[, .(id, year)], fromLast = TRUE),
  on = .(id, year),
  mult = "last"
]

#       id  year   type
#    <num> <num> <char>
# 1:     1  2002      A
# 2:     2  2008      B
# 3:     3  2010      D
# 4:     3  2013   <NA>
# 5:     4  2020      C
# 6:     5  2009      A
# 7:     6  2010      B
# 8:     6  2012   <NA>

method 2

# Inner call: for each (id, year) group, which.max() on the running count of
# non-NA `type` values gives the position of the last non-NA entry (position
# 1 when the whole group is NA); .I maps it to a row number of the full table.
# Outer call: subset the table to those row numbers.
df_id_year_and_type[
  df_id_year_and_type[,
    .I[which.max(cumsum(!is.na(type)))],
    by = .(id, year)
  ]$V1
]

method 3

(likely slower because of the overhead of calling `[` on `.SD` once per group)

# Per (id, year) group, keep the row holding the last non-NA `type` (the
# first row when the group has none) by subsetting .SD directly.
df_id_year_and_type[,
  .SD[which.max(cumsum(!is.na(type)))],
  by = .(id, year)
]

R - Find first non zero elements per groups in data.table

Here's a data.table approach:

# Reshape to long format: the columns named "1" .. "36" become rows, with
# the former column name kept as a character `period` column.
dat_long <- melt(
  dat,
  measure.vars = as.character(seq_len(36L)),
  variable.name = "period",
  variable.factor = FALSE
)

# Keep only rows with a positive value, then take the smallest period number
# within each State / Maturing / Soil combination.
res <- dat_long[
  value > 0,
  .(Earliest = min(as.integer(period))),
  by = .(State, Maturing, Soil)
]

res
#    State Maturing  Soil Earliest
# 1:    PR    Early  CLAY       26
# 2:    PR   Medium  CLAY       26
# 3:    PR     Late  SILT       26
# 4:    PR     Late  CLAY       26
# 5:    PR    Early  SILT       26
# 6:    PR   Medium  SILT       26
# 7:    PR     Late SANDY       26
# 8:    PR   Medium SANDY       27
# 9:    RS    Early  SILT       27
# 10:    RS    Early  CLAY       27
# 11:    RS   Medium SANDY       27
# 12:    RS   Medium  SILT       27
# 13:    RS   Medium  CLAY       27
# 14:    RS     Late SANDY       27
# 15:    RS     Late  SILT       27
# 16:    RS     Late  CLAY       27
# 17:    RS    Early SANDY       28
# 18:    PR    Early SANDY       30

Bottom line: convert your data to long format and the computation becomes very easy (and will most likely be more efficient in long format).

Select first non-NA value using R

We can use `first()` on the non-NA elements after grouping:

library(dplyr)
# Within each ID, set `value` to the first non-missing entry of `test`.
# The result remains grouped by ID.
df <- df %>%
  group_by(ID) %>%
  mutate(value = first(test[!is.na(test)]))

Need help finding a fast method to identify first non-missing observation per variable

This may be a case where melting the dataset and then casting is faster, since there are only 3 results per group.

Using @chinsoon12's dataset, I get 2-3 seconds with OP's original solutions vs. 0.4 s with melt and cast. If you don't mind keeping the data molten (i.e., long), that is around 0.2 seconds which is about 10x faster than the original.

# Melt and cast: go to long format, keep the first non-NA row for every
# (grp, variable) pair, then spread back to one column per variable.
dcast(
  melt(DT, id.vars = "grp")[
    !is.na(value),
    .SD[1],
    by = .(grp, variable)
  ],
  grp ~ variable
)

# Melt only: same selection, but the result is left in long format.
melt(DT, id.vars = "grp")[
  !is.na(value),
  .SD[1],
  by = .(grp, variable)
]

# Intermediate-variable approach: drop NA values once, then pick the first
# row of each (grp, variable) pair by row number (.I) before casting.
molten_DT <- na.omit(melt(DT, id.vars = "grp"), cols = "value")
dcast(
  molten_DT[
    molten_DT[, .I[1], by = .(grp, variable)]$V1,
  ],
  grp ~ variable
)

library(data.table)
library(microbenchmark)

# @chinsoon12's dataset: a reproducible (seeded) table with ngrp groups of
# avgNr rows each and nc value columns, every cell randomly NA or 1, plus a
# `grp` column identifying the group of each row.
# NOTE(review): the trailing numbers on ngrp and nc are presumably the sizes
# of the full-scale data -- confirm.
set.seed(0L)
ngrp <- 1000L #502540
avgNr <- 3L
nc <- 1000L #1019
DT <- data.table(
  as.data.table(matrix(sample(c(NA,1), ngrp*avgNr*nc, TRUE), nrow=ngrp*avgNr, ncol=nc)),
  grp=rep(1:ngrp, each=avgNr))

# Time three per-column, per-group approaches.
# NOTE(review): firstnonmiss_1/2/3 are not defined anywhere in this excerpt;
# they must exist before these lines run (presumably the original solutions
# being compared against -- confirm).
system.time(DT[, lapply(.SD, firstnonmiss_1), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_2), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_3), by = grp])
# Compare melt-then-cast with melt-only, 20 evaluations of each expression.
microbenchmark(melt_and_cast = {
  dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)], grp ~ variable)
  },melt_1 = {
    melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]
  }
,times = 20)

Fill data.table with NA-values, using the nearest non-na-observation

# Fill each missing Value with the Value of the row of the same Type whose
# Range_val is nearest, via a rolling join of the NA rows onto the non-NA
# rows. Only the rows where Value is NA are assigned.
dt.tst[
  is.na(Value),
  Value := dt.tst[!is.na(Value)][
    dt.tst[is.na(Value)],
    on = .(Type, Range_val),
    roll = "nearest"
  ][["Value"]]
]

output

dt.tst

#     Type Range_val Value
#  1:    A         0 0.987
#  2:    A      1000 0.987
#  3:    A      2000 0.987
#  4:    A      3000 0.987
#  5:    A      4000 0.987
#  6:    A      5000 0.987
#  7:    A      6000 1.056
#  8:    A      7000 1.056
#  9:    A      8000 1.056
# 10:    A      9000 1.056
# 11:    A     10000 1.056
# 12:    A     11000 1.056
# 13:    A     12000 1.056
# 14:    A     13000 1.563
# 15:    A     14000 1.563
# 16:    A     15000 1.563
# 17:    A     16000 1.563
# 18:    A     17000 1.563
# 19:    A     18000 1.563
# 20:    A     19000 1.563
# 21:    A     20000 1.563
# 22:    B         0 1.987
# 23:    B      1000 1.987
# 24:    B      2000 1.987
# 25:    B      3000 1.987
# 26:    B      4000 1.987
# 27:    B      5000 1.987
# 28:    B      6000 2.138
# 29:    B      7000 2.138
# 30:    B      8000 2.138
# 31:    B      9000 2.138
# 32:    B     10000 2.138
# 33:    B     11000 2.138
# 34:    B     12000 2.138
# 35:    B     13000 2.089
# 36:    B     14000 2.089
# 37:    B     15000 2.089
# 38:    B     16000 2.089
# 39:    B     17000 2.089
# 40:    B     18000 2.089
# 41:    B     19000 2.089
# 42:    B     20000 2.089

How to get value of last non-NA column

You can use max.col() with ties.method set to "last" to get the last non-NA value in each row.

# For every row, store in `val` the value found in the right-most non-NA
# column. max.col() on the logical "is not NA" matrix returns one column
# index per row, and ties.method = "last" makes it the last TRUE column.
# cbind() pairs each row number with that column, so the two-column matrix
# index pulls exactly one cell per row.
# seq_len() is used instead of 1:nrow(test) so that a 0-row `test` gives an
# empty index rather than the bogus row numbers c(1, 0).
# NOTE(review): matrix-indexing a data frame goes through as.matrix(), so with
# mixed column types (e.g. a date column next to numbers) `val` comes back as
# character -- confirm that is acceptable.
# NOTE(review): every column takes part in the search, so a row whose only
# non-NA cell is the date would get the date as `val` -- confirm such rows
# cannot occur.
test$val <- test[cbind(
  seq_len(nrow(test)),
  max.col(!is.na(test), ties.method = "last")
)]
test

#        date a  b  c val
#1 2020-01-01 4 NA NA   4
#2 2020-01-02 3  2 NA   2
#3 2020-01-03 4  1  5   5

R: Find First Non-Na Observation in Data.Table Column by Group