dplyr: How to Find the First Non-Missing String by Groups

summarise() gives one entry per group; here we find the first non-missing string using which():

data %>%
  group_by(group) %>%
  summarise(first_non_missing = names[which(!is.na(names))[1]])

gives

  group first_non_missing
  <chr> <chr>
1 A     fred
2 B     josh

If you want to keep all of the rows, replace summarise with mutate, as in the sketch below.
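A minimal sketch of the mutate() variant, using hypothetical example data (the original data isn't shown) reconstructed to match the output above:

library(dplyr)

# hypothetical example data, chosen to reproduce the output above
data <- tibble::tibble(
  group = c("A", "A", "B", "B"),
  names = c(NA, "fred", "josh", NA)
)

data %>%
  group_by(group) %>%
  mutate(first_non_missing = names[which(!is.na(names))[1]]) %>%
  ungroup()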

dplyr::first() to choose the first non-NA value

Wrap the vector in na.omit(); compare:

first(c(NA, 11, 22))
# [1] NA

first(na.omit(c(NA, 11, 22)))
# [1] 11
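The example data d used below isn't shown in the original answer; here is a hypothetical reconstruction consistent with the printed output (the two values that get overwritten with NA are arbitrary):

# hypothetical reconstruction of d; the two cells replaced with NA below are arbitrary
d <- tibble::tibble(
  group = rep(1:3, each = 3),
  year  = rep(c(2000, 2001, 2002), times = 3),
  value = c(3L, 8L, 4L, 7L, 9L, 1L, 5L, 9L, 2L)
)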

Using example data:

d %>%
  mutate(
    value = case_when(
      group == 2 & year == 2000 ~ NA_integer_,
      group == 3 & year == 2002 ~ NA_integer_,
      TRUE ~ value)) %>%
  group_by(group) %>%
  mutate(
    first = dplyr::first(na.omit(value)),
    last = dplyr::last(na.omit(value)))

# # A tibble: 9 x 5
# # Groups:   group [3]
#   group  year value first  last
#   <int> <dbl> <int> <int> <int>
# 1     1  2000     3     3     4
# 2     1  2001     8     3     4
# 3     1  2002     4     3     4
# 4     2  2000    NA     9     1
# 5     2  2001     9     9     1
# 6     2  2002     1     9     1
# 7     3  2000     5     5     9
# 8     3  2001     9     5     9
# 9     3  2002    NA     5     9

R: How to find the first non-zero element in a dataframe by group

library(data.table)
setDT(df)

df[, if (1 %in% Flag) head(.SD, which.max(Flag == 1) - 1), by = ID]

#      ID       date Flag
#  1: ABC 2018-03-21   NA
#  2: ABC 2018-03-17    0
#  3: ABC 2018-03-12    0
#  4: ABC 2018-03-10    0
#  5: DEF 2018-03-24   NA
#  6: DEF 2018-03-21    0
#  7: DEF 2018-03-20    0
#  8: DEF 2018-03-14    0
#  9: DEF 2018-03-13    0
# 10: DEF 2018-03-12    0
# 11: DEF 2018-03-11    0
# 12: DEF 2018-03-10    0
# 13: DEF 2018-03-09    0

Or in dplyr (same result):

library(dplyr)

df %>%
  group_by(ID) %>%
  filter(1 %in% Flag) %>%
  slice(1:(which.max(Flag == 1) - 1))
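Both versions hinge on the same trick: applied to a logical vector, which.max() returns the index of the first TRUE (NAs are ignored), so the rows kept are exactly those before the first 1 in Flag. A quick illustration:

Flag <- c(NA, 0, 0, 1, 0)
which.max(Flag == 1)  # index of the first TRUE
# [1] 4
# head(.SD, 4 - 1) therefore keeps the three rows before the first 1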

Data used:

df <- fread("
ID date Flag
ABC 2018-03-21 NA
ABC 2018-03-17 0
ABC 2018-03-12 0
ABC 2018-03-10 0
ABC 2018-03-09 1
ABC 2018-03-08 0
ABC 2018-03-07 1
DEF 2018-03-24 NA
DEF 2018-03-21 0
DEF 2018-03-20 0
DEF 2018-03-14 0
DEF 2018-03-13 0
DEF 2018-03-12 0
DEF 2018-03-11 0
DEF 2018-03-10 0
DEF 2018-03-09 0
DEF 2018-03-08 1
DEF 2018-03-07 0
DEF 2018-03-06 0
DEF 2018-03-05 1
")

Benchmark output (relative timings, lower is better; the data.table approaches ry0 and ry1 are the fastest):

# Unit: relative
#  expr       min       lq     mean   median       uq       max neval
#   ry0 1.0000000 1.000000 1.000000 1.000000 1.000000 1.0000000   100
#   ry1 0.9039601 1.005675 1.107913 1.007259 1.013925 0.9834608   100
#   ry2 4.1922470 4.119451 3.833156 4.054261 4.064153 2.1996109   100
#   mkr 2.7526006 2.860652 2.734473 2.851795 2.780521 1.4623569   100
#   www 5.8029974 5.601037 5.293515 5.588397 5.372007 1.5343666   100
#   leb 6.8563589 6.548586 6.687608 6.461585 6.991874 2.2607231   100
#   mm1 1.8219038 1.782887 1.464588 1.791532 1.669813 0.2896809   100
#   mm2 6.0007823 5.806987 5.393869 5.679563 5.672251 1.7103423   100
#   mm3 2.1094639 2.372948 2.899198 2.437456 2.270863 1.8811060   100

Benchmark code:

df <- read.table(text="ID     date        Flag
ABC 2018-03-21 NA
ABC 2018-03-17 0
ABC 2018-03-12 0
ABC 2018-03-10 0
ABC 2018-03-09 1
ABC 2018-03-08 0
ABC 2018-03-07 1
DEF 2018-03-24 NA
DEF 2018-03-21 0
DEF 2018-03-20 0
DEF 2018-03-14 0
DEF 2018-03-13 0
DEF 2018-03-12 0
DEF 2018-03-11 0
DEF 2018-03-10 0
DEF 2018-03-09 0
DEF 2018-03-08 1
DEF 2018-03-07 0
DEF 2018-03-06 0
DEF 2018-03-05 1
FOO 1983-01-01 NA
FOO 1983-01-02 NA
FOO 1983-01-02 0
FOO 1983-01-02 0", header=TRUE, stringsAsFactors=FALSE)

df <- setDF(rbindlist(replicate(1e4, df, simplify = FALSE)))

dt <- as.data.table(df)
microbenchmark::microbenchmark(
  ry0 = dt[, if (1 %in% Flag) head(.SD, which.max(Flag == 1) - 1), by = ID],
  ry1 = dt[, if (1 %in% Flag) .SD[1:(which.max(Flag == 1) - 1)], by = ID],
  ry2 = df %>%
    group_by(ID) %>%
    filter(1 %in% Flag) %>%
    slice(1:(which.max(Flag == 1) - 1)),
  mkr = df %>%
    group_by(ID) %>%
    filter(cumsum(!is.na(Flag) & Flag == 1) == 0),
  www = df %>%
    mutate(Flag2 = ifelse(is.na(Flag), 0, Flag)) %>%
    group_by(ID) %>%
    filter(cumsum(Flag2) < 1) %>%
    ungroup() %>%
    select(-Flag2),
  leb = do.call(rbind, lapply(
    split(df, df["ID"]),
    function(.)
      if (!1 %in% .$Flag) NULL
      else .[1:(which.max(.$Flag %in% 1) - 1), ])),
  mm1 = df %>%
    group_by(ID) %>%
    slice(seq_len(match(1, Flag, nomatch = 1) - 1)),
  mm2 = do.call(rbind, by(df, df$ID, function(x) head(x, match(1, x$Flag, nomatch = 1) - 1))),
  mm3 = df[ave(as.logical(df$Flag), df$ID, FUN = function(x) {
    y <- match(TRUE, x) - 1
    z <- logical(length(x))
    if (is.na(y)) z
    else { z[seq_len(y)] <- TRUE; z }
  }), ],
  unit = "relative",
  times = 100
)

Need help finding a fast method to identify first non-missing observation per variable

This may be a case where melting the dataset and then casting is faster, since there are only about 3 rows per group.

Using @chinsoon12's dataset, I get 2-3 seconds with the OP's original solutions vs. 0.4 s with melt and cast. If you don't mind keeping the data molten (i.e., long), melting alone takes around 0.2 seconds, which is about 10x faster than the original.

# melt and cast
dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)],
      grp ~ variable)

# melt only
melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]

# approach with intermediate variables:
molten_DT <- na.omit(melt(DT, id.vars = 'grp'), 'value')
dcast(molten_DT[molten_DT[, .I[1], by = .(grp, variable)]$V1, ], grp ~ variable)
library(data.table)
library(microbenchmark)

# @chinsoon12's dataset
set.seed(0L)
ngrp <- 1000L   # 502540
avgNr <- 3L
nc <- 1000L     # 1019
DT <- data.table(
  as.data.table(matrix(sample(c(NA, 1), ngrp * avgNr * nc, TRUE),
                       nrow = ngrp * avgNr, ncol = nc)),
  grp = rep(1:ngrp, each = avgNr))

# firstnonmiss_1/2/3 are the OP's original functions (not shown here)
system.time(DT[, lapply(.SD, firstnonmiss_1), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_2), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_3), by = grp])

microbenchmark(
  melt_and_cast = {
    dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)],
          grp ~ variable)
  },
  melt_1 = {
    melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]
  },
  times = 20)

How to take non-missing value associated with max index for each group using summarize_all

Instead of looking at all of num, we can look only at the entries where the corresponding variable is not NA:

a %>%
  group_by(grp) %>%
  summarize_all(funs(na.omit(.)[which.max(num[!is.na(.)])]))
# A tibble: 4 x 7
#   grp   Sepal.Length Sepal.Width Petal.Length Petal.Width Species   num
#   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <fct>   <int>
# 1 a              4.7         3.2          1.3         0.2 setosa      3
# 2 b              5           3.6          1.4         0.2 setosa      5
# 3 c              4.6         3.4          1.4         0.3 setosa      7
# 4 d              4.9         3.1          1.5         0.1 setosa     10
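Note that funs() is soft-deprecated as of dplyr 0.8.0; the equivalent call with a lambda is:

a %>%
  group_by(grp) %>%
  summarize_all(~ na.omit(.)[which.max(num[!is.na(.)])])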

Count non-NA values by group

You can use this:

mydf %>% group_by(col_1) %>% summarise(non_na_count = sum(!is.na(col_2)))

# A tibble: 2 x 2
   col_1 non_na_count
  <fctr>        <int>
1      A            1
2      B            2
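A hypothetical mydf that reproduces this output:

mydf <- data.frame(
  col_1 = factor(c("A", "A", "B", "B", "B")),  # grouping column
  col_2 = c(1, NA, 2, 3, NA)                   # one non-NA in A, two in B
)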

Count non-`NA` of several columns by group using summarize and across from dplyr

I hope this is what you are looking for:

library(dplyr)

d %>%
  group_by(ID) %>%
  summarise(across(Col1:Col3, ~ sum(!is.na(.x)), .names = "non-{.col}"))

# A tibble: 3 x 4
     ID `non-Col1` `non-Col2` `non-Col3`
  <dbl>      <int>      <int>      <int>
1     1          3          2          3
2     2          2          0          2
3     3          1          1          0

Or, if you would like to select the columns by a shared string, you can use this:

d %>%
  group_by(ID) %>%
  summarise(across(contains("Col"), ~ sum(!is.na(.x)), .names = "non-{.col}"))

Find the index position of the first non-NA value in an R vector?

Use a combination of is.na and which to find the non-NA index locations.

NonNAindex <- which(!is.na(z))
firstNonNA <- min(NonNAindex)

# set the next 3 observations to NA
is.na(z) <- seq(firstNonNA, length.out = 3)
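A worked example with a hypothetical z:

z <- c(NA, NA, 4, 7, NA, 2)

NonNAindex <- which(!is.na(z))  # 3 4 6
firstNonNA <- min(NonNAindex)   # 3

is.na(z) <- seq(firstNonNA, length.out = 3)  # blanks out positions 3:5
z
# [1] NA NA NA NA NA  2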

Replace NA with previous or next value, by group, using dplyr

library(tidyr)  # fill() is part of tidyr

ps1 %>%
  group_by(userID) %>%
  # fill(color, age, gender) %>%   # the default direction is "down"
  fill(color, age, gender, .direction = "downup")  # "down", "up", "downup", or "updown"

Which gives you:

Source: local data frame [9 x 4]
Groups: userID [3]

  userID  color    age gender
   <dbl> <fctr> <fctr> <fctr>
1     21   blue   3yrs      F
2     21   blue   2yrs      F
3     21    red   2yrs      M
4     22   blue   3yrs      F
5     22   blue   3yrs      F
6     22   blue   3yrs      F
7     23    red   4yrs      F
8     23    red   4yrs      F
9     23   gold   4yrs      F

