Earliest Date for Each Id in R

Earliest Date for each id in R

We can use data.table: convert the 'data.frame' to a 'data.table' (setDT(data_full)), order the rows by 'e_date', and then, grouped by 'id', take the first row of each group (head(.SD, 1L)).

library(data.table)
# Rows are sorted by e_date before grouping, so the first row of each id
# group is the one with that id's earliest date.
setDT(data_full)[order(e_date), .SD[1L], by = id]

Or using dplyr: after grouping by 'id', sort the rows by 'e_date' with arrange (assuming it is of Date class) and take the first row of each group with slice.

library(dplyr)
# Sort the whole table by e_date, then take the first row of every id group;
# the result is still grouped by id.
data_full %>%
  arrange(e_date) %>%
  group_by(id) %>%
  slice(1L)

If we need a base R option, ave can be used

# Keep every row whose e_date equals the earliest e_date of its id (tied
# earliest rows are all kept). The dates are compared as plain numbers because
# ave() is documented for numeric input and writes FUN's result back into a
# copy of its first argument, so a logical-valued FUN does not come back as a
# usable logical row index.
data_full[
  with(
    data_full,
    as.numeric(e_date) == ave(as.numeric(e_date), id, FUN = min)
  ),
]

For each ID return the earliest date from the start column and the latest date from the end column in r

Or with first and last:

library(dplyr)
# One output row per ID: startDate from the group's first row and endDate from
# its last row.
# NOTE(review): first()/last() pick by row position, so this is the earliest /
# latest date only if the rows are already in chronological order within each
# ID - confirm against the input data.
data %>%
  group_by(ID) %>%
  summarise(
    startDate = first(startDate),
    endDate = last(endDate)
  )
# A tibble: 2 x 3
ID startDate endDate
* <dbl> <chr> <chr>
1 1 2018-01-31 2019-07-09
2 2 2002-06-07 2002-10-02

Is there an R function to subset and remove earliest date associated with an ID?

I would use dplyr to remove the earliest date for each group. I'm providing some data here.

library(dplyr)

# Example data: six rows, three for each ID (1 and 2), with a POSIXct `time`
# column given as seconds since the epoch.
df <- structure(list(ID = c(1, 1, 1, 2, 2, 2), time = structure(c(1325485800,
1325487600, 1325489400, 1325491200, 1325493000, 1325494800), class = c("POSIXct",
"POSIXt"), tzone = "")), class = "data.frame", row.names = c(NA,
-6L))

# Drop the row holding the earliest `time` within each ID; the result stays
# grouped by ID. The rows to keep are computed with setdiff() instead of a
# negative index because which.min() returns integer(0) when every `time` in a
# group is NA, and slice(-integer(0)) would then discard the whole group
# rather than leave it untouched.
df.updated <- df %>%
  dplyr::group_by(ID) %>%
  dplyr::slice(setdiff(seq_len(dplyr::n()), which.min(time)))

Be sure to provide data when asking a question to give a good reproducible example. You can do this through dput(head(df)) to provide some of your data (as usually it only takes a little data to solve an issue).

In R is there a way to extract the row with the earliest date per ID if it meets a condition (is equal to 1) and the latest date if it does not?

df %>%
  group_by(ID) %>% # one result row per ID
  # For every row, store the position of the row that should be kept:
  #   response == 1 -> position of the earliest follow_up_date in the group
  #   response == 0 -> position of the latest follow_up_date in the group
  mutate(index = case_when(
    response == 1 ~ which.min(follow_up_date),
    response == 0 ~ which.max(follow_up_date)
  )) %>%
  # The response of the group's first row decides which position is used.
  slice(first(index)) %>%
  select(-index) # drop the helper column

or with data.table

library(data.table)
setDT(df)
# For each ID, look at the response of the group's first row: 0 selects the
# row with the latest follow_up_date, anything else the earliest. .I turns the
# within-group position into a row number of the full table.
keep_rows <- df[, .I[ifelse(response[1] == 0,
                            which.max(follow_up_date),
                            which.min(follow_up_date))],
                by = ID]$V1
df[keep_rows]

Find the earliest and latest date within each row in R

We can use pmax and pmin on the 'date' columns to return the earliest and latest date for each row

library(dplyr)
# pmax()/pmin() compare the selected columns element by element, i.e. across
# each row; na.rm = TRUE ignores missing entries, so a row is NA only when all
# of its 'date' columns are NA.
df %>%
  mutate(
    max_date = do.call(
      pmax,
      c(select(., starts_with("date")), na.rm = TRUE)
    ),
    min_date = do.call(
      pmin,
      c(select(., starts_with("date")), na.rm = TRUE)
    )
  )
# ID Other_columns date_column date_column2 date_column3 max_date min_date
#1 1 numeric 2019-11-04 19:33:50 2019-11-05 15:33:50 2019-11-05 16:33:50 2019-11-05 16:33:50 2019-11-04 19:33:50
#2 2 numeric <NA> 2019-11-04 17:20:10 2019-11-09 19:12:50 2019-11-09 19:12:50 2019-11-04 17:20:10
#3 3 numeric 2019-11-07 20:33:50 <NA> 2019-11-04 18:31:50 2019-11-07 20:33:50 2019-11-04 18:31:50
#4 4 <NA> <NA> <NA> <NA> <NA> <NA>

Or another option with rowwise with c_across

# Row by row, gather the 'date' columns with c_across(), convert them to
# POSIXct and take the largest / smallest non-missing value.
df %>%
  rowwise() %>%
  mutate(
    max_date = max(
      as.POSIXct(c_across(starts_with("date"))),
      na.rm = TRUE
    ),
    min_date = min(
      as.POSIXct(c_across(starts_with("date"))),
      na.rm = TRUE
    )
  )

-output

# A tibble: 4 x 7
# Rowwise:
# ID Other_columns date_column date_column2 date_column3 max_date min_date
# <int> <chr> <chr> <chr> <chr> <dttm> <dttm>
#1 1 numeric 2019-11-04 19:33:50 2019-11-05 15:33:50 2019-11-05 16:33:50 2019-11-05 16:33:50 2019-11-04 19:33:50
#2 2 numeric <NA> 2019-11-04 17:20:10 2019-11-09 19:12:50 2019-11-09 19:12:50 2019-11-04 17:20:10
#3 3 numeric 2019-11-07 20:33:50 <NA> 2019-11-04 18:31:50 2019-11-07 20:33:50 2019-11-04 18:31:50
#4 4 <NA> <NA> <NA> <NA> NA NA NA NA

data

# Example data: four rows with three date_* columns stored as character
# strings (not date-times); row 4 is NA in every column except ID.
df <- structure(list(ID = 1:4, Other_columns = c("numeric", "numeric", 
"numeric", NA), date_column = c("2019-11-04 19:33:50", NA, "2019-11-07 20:33:50",
NA), date_column2 = c("2019-11-05 15:33:50", "2019-11-04 17:20:10",
NA, NA), date_column3 = c("2019-11-05 16:33:50", "2019-11-09 19:12:50",
"2019-11-04 18:31:50", NA)), class = "data.frame", row.names = c(NA,
-4L))


How to create a new column with the year of the first date for each id in R

A possible solution:

library(tidyverse)
library(lubridate)

# Give every row the year of the first date listed for its id.
# NOTE(review): first() picks by row position, so this is the earliest date
# only if the rows are sorted by date within each id - confirm.
df %>%
  group_by(id) %>%
  mutate(year = year(first(date))) %>%
  ungroup()

#> # A tibble: 4 × 3
#> id date year
#> <chr> <dttm> <dbl>
#> 1 A 2017-12-26 09:01:30 2017
#> 2 A 2018-01-01 09:06:40 2017
#> 3 B 2017-12-30 09:04:50 2017
#> 4 B 2018-02-02 09:01:00 2017

Group by id and drug (with dates within 100 days of each other) and take the earliest and latest date

You could create a new grouping by using cumsum:

library(dplyr)

mydata %>%
  group_by(Id, drug) %>%
  # Gap in days to the previous row of the same Id/drug; NA on the first row
  # of each group.
  mutate(Diff = difftime(Date, lag(Date), units = "days")) %>%
  # Every gap of more than 100 days bumps the running counter, which splits an
  # Id/drug combination into separate episodes. The NA gap of a first row is
  # treated as 0 days so it never starts a new episode by itself.
  group_by(
    Id, drug,
    grp = cumsum(coalesce(Diff, as.difftime(0, units = "days")) > 100)
  ) %>%
  # One row per episode with its first and last date.
  summarise(
    startDate = min(as.Date(Date), na.rm = TRUE),
    endDate = max(as.Date(Date), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  select(-grp)

This returns

# A tibble: 4 x 4
Id drug startDate endDate
<dbl> <chr> <date> <date>
1 1 A 2000-01-01 2000-01-05
2 1 A 2000-05-13 2000-05-17
3 1 B 2000-02-02 2000-02-14
4 1 C 2000-05-16 2000-05-20

Select row with most recent date by group

You can try

library(dplyr)
# Parse the month/day/year strings as dates and keep, for each ID, the row
# with the most recent one.
df %>%
  group_by(ID) %>%
  slice(which.max(as.Date(date, format = "%m/%d/%Y")))

data

# Example data: IDs 1 to 3 with three rows each; `date` is a character column
# in month/day/year form, which is why it is parsed with '%m/%d/%Y'.
df <- data.frame(ID= rep(1:3, each=3), date=c('02/20/1989',
'03/14/2001', '02/25/1990', '04/20/2002', '02/04/2005', '02/01/2008',
'08/22/2011','08/20/2009', '08/25/2010' ), stringsAsFactors=FALSE)


Related Topics



Leave a reply



Submit