Earliest Date for Each Id in R

Earliest Date for each id in R

We can use data.table. Convert the 'data.frame' to a 'data.table' (setDT(data_full)), order the rows by 'e_date', and then, grouped by 'id', take the first row of each group (head(.SD, 1L)).

library(data.table)
# Sort by e_date, then keep the first (earliest) row of every id group.
setDT(data_full)[order(e_date), .SD[1L], by = id]

Or using dplyr, after grouping by 'id', arrange the 'e_date' (assuming it is of Date class) and get the first row with slice.

library(dplyr)
# Sort the whole table by date; the first row inside each id group is then
# that id's earliest record.
data_full %>%
    arrange(e_date) %>%
    group_by(id) %>%
    slice(1L)

If we need a base R option, ave can be used

# ave() hands back a vector of the same type as its first argument, so the
# dates (assumed to be Date/POSIXct) are passed as plain numbers and the 0/1
# result is compared with 1 to build a row selector.
# ties.method = "first" marks exactly one row per id even when the earliest
# date is duplicated; which() drops rows whose date is NA.
earliest_rows <- with(data_full, ave(
  as.numeric(e_date), id,
  FUN = function(x) rank(x, ties.method = "first") == 1
))
data_full[which(earliest_rows == 1), ]

For each ID return the earliest date from the start column and the latest date from the end column in r

Or with first and last:

library(dplyr)
# first()/last() pick by position, so each column is ordered explicitly:
# the earliest start and latest end are returned regardless of how the rows
# happen to be sorted.
data %>%
  group_by(ID) %>%
  summarise(
    startDate = first(startDate, order_by = startDate),
    endDate = last(endDate, order_by = endDate)
  )
# A tibble: 2 x 3
     ID startDate  endDate   
* <dbl> <chr>      <chr>     
1     1 2018-01-31 2019-07-09
2     2 2002-06-07 2002-10-02

Is there an R function to subset and remove earliest date associated with an ID?

I would use dplyr to remove the earliest date for each group. I'm providing some data here.

library(dplyr)

# Example data: six timestamps 30 minutes (1800 s) apart, three per ID.
df <- data.frame(
  ID = rep(c(1, 2), each = 3),
  time = .POSIXct(1325485800 + 1800 * (0:5), tz = "")
)

# For every ID, drop the single row that holds the minimum time.
df.updated <- slice(group_by(df, ID), -which.min(time))

Be sure to provide data when asking a question to give a good reproducible example. You can do this through dput(head(df)) to provide some of your data (as usually it only takes a little data to solve an issue).

In R is there a way to extract the row with the earliest date per ID if it meets a condition (is equal to 1) and the latest date if it does not?

# Per ID: keep the earliest follow-up row when the group's first row has
# response 1, or the latest follow-up row when it has response 0.
df %>% 
  group_by(ID) %>% # group by ID
  mutate(index = case_when(response == 1 ~ which.min(follow_up_date),  # position of the earliest date if response == 1
                           response == 0 ~ which.max(follow_up_date))) %>% # position of the latest date if response == 0
  slice(first(index)) %>% # keep the row at the position stored in the group's first index value
  select(-index)

or with data.table

library(data.table)
setDT(df)
# Inner call: per ID, the row number (.I) of the latest follow-up when the
# group's first response is 0, else of the earliest follow-up.
# Outer call: subset df to those row numbers.
df[
  df[, .I[ifelse(response[1] == 0,
                 which.max(follow_up_date),
                 which.min(follow_up_date))],
     by = ID]$V1
]

Find the earliest and latest date within each row in R

We can use pmax and pmin on the 'date' columns to return the earliest and latest date for each row

library(dplyr)
 # Row-wise latest/earliest value across all columns whose name starts with
 # "date": the selected columns are spliced into pmax()/pmin() as arguments.
 df %>%
   mutate(
     max_date = do.call(
       pmax, c(select(., starts_with("date")), list(na.rm = TRUE))
     ),
     min_date = do.call(
       pmin, c(select(., starts_with("date")), list(na.rm = TRUE))
     )
   )
#  ID Other_columns         date_column        date_column2        date_column3            max_date            min_date
#1  1       numeric 2019-11-04 19:33:50 2019-11-05 15:33:50 2019-11-05 16:33:50 2019-11-05 16:33:50 2019-11-04 19:33:50
#2  2       numeric                <NA> 2019-11-04 17:20:10 2019-11-09 19:12:50 2019-11-09 19:12:50 2019-11-04 17:20:10
#3  3       numeric 2019-11-07 20:33:50                <NA> 2019-11-04 18:31:50 2019-11-07 20:33:50 2019-11-04 18:31:50
#4  4          <NA>                <NA>                <NA>                <NA>                <NA>                <NA>

Or another option with rowwise with c_across

# Row by row, convert the "date*" columns to POSIXct and take their
# maximum and minimum, ignoring missing values.
df %>%
  rowwise() %>%
  mutate(
    max_date = max(as.POSIXct(c_across(starts_with("date"))), na.rm = TRUE),
    min_date = min(as.POSIXct(c_across(starts_with("date"))), na.rm = TRUE)
  )

-output

# A tibble: 4 x 7
# Rowwise: 
#     ID Other_columns date_column         date_column2        date_column3        max_date            min_date           
#  <int> <chr>         <chr>               <chr>               <chr>               <dttm>              <dttm>             
#1     1 numeric       2019-11-04 19:33:50 2019-11-05 15:33:50 2019-11-05 16:33:50 2019-11-05 16:33:50 2019-11-04 19:33:50
#2     2 numeric       <NA>                2019-11-04 17:20:10 2019-11-09 19:12:50 2019-11-09 19:12:50 2019-11-04 17:20:10
#3     3 numeric       2019-11-07 20:33:50 <NA>                2019-11-04 18:31:50 2019-11-07 20:33:50 2019-11-04 18:31:50
#4     4 <NA>          <NA>                <NA>                <NA>                NA NA               NA NA

data

# Example data: three character date-time columns with scattered NAs.
df <- data.frame(
  ID = 1:4,
  Other_columns = c("numeric", "numeric", "numeric", NA),
  date_column = c("2019-11-04 19:33:50", NA, "2019-11-07 20:33:50", NA),
  date_column2 = c("2019-11-05 15:33:50", "2019-11-04 17:20:10", NA, NA),
  date_column3 = c("2019-11-05 16:33:50", "2019-11-09 19:12:50",
                   "2019-11-04 18:31:50", NA),
  stringsAsFactors = FALSE
)

How to create a new column with the year of the first date for each id in R

A possible solution:

library(tidyverse)
library(lubridate)

# Give every row the year of the first date listed for its id.
df %>%
  group_by(id) %>%
  mutate(year = year(first(date))) %>%
  ungroup()

#> # A tibble: 4 × 3
#>   id    date                 year
#>   <chr> <dttm>              <dbl>
#> 1 A     2017-12-26 09:01:30  2017
#> 2 A     2018-01-01 09:06:40  2017
#> 3 B     2017-12-30 09:04:50  2017
#> 4 B     2018-02-02 09:01:00  2017

Group by id and drug (with dates within 100 days of each other) and take the earliest and latest date

You could create a new grouping by using cumsum:

library(dplyr)

# Start a new run whenever the gap to the previous row of the same Id/drug
# exceeds 100 days, then report the first and last date of every run.
# NOTE(review): lag() compares adjacent rows, so this presumably expects the
# rows to be in date order within each Id/drug - confirm.
mydata %>%
  group_by(Id, drug) %>%
  mutate(Diff = difftime(Date, lag(Date), units = "days")) %>%
  # The first row of a group has no predecessor (NA gap); count it as 0 days.
  group_by(
    Id, drug,
    grp = cumsum(coalesce(Diff, as.difftime(0, units = "days")) > 100)
  ) %>%
  # Spell out TRUE: the shorthand T is an ordinary variable that can be
  # reassigned.
  summarise(startDate = min(as.Date(Date), na.rm = TRUE),
            endDate = max(as.Date(Date), na.rm = TRUE),
            .groups = "drop") %>%
  select(-grp)

This returns

# A tibble: 4 x 4
     Id drug  startDate  endDate   
  <dbl> <chr> <date>     <date>    
1     1 A     2000-01-01 2000-01-05
2     1 A     2000-05-13 2000-05-17
3     1 B     2000-02-02 2000-02-14
4     1 C     2000-05-16 2000-05-20

Select row with most recent date by group

You can try

library(dplyr)
# Parse the month/day/year strings and keep each ID's latest row.
group_by(df, ID) %>%
  slice(which.max(as.Date(date, format = "%m/%d/%Y")))

data

# Example data: three IDs with three month/day/year date strings each.
df <- data.frame(
  ID = rep(seq_len(3), each = 3),
  date = c(
    "02/20/1989", "03/14/2001", "02/25/1990",
    "04/20/2002", "02/04/2005", "02/01/2008",
    "08/22/2011", "08/20/2009", "08/25/2010"
  ),
  stringsAsFactors = FALSE
)

Earliest Date for Each Id in R