Generate Id for Each Group with Repeated and Missing Observations

generate id for each group with repeated and missing observations

An option using data.table:

# Number the consecutive runs of `week` separately for each `individ`;
# setDT() converts `dt` by reference and `:=` adds the column in place.
setDT(dt)[, id_week := rleid(week), by = individ]

Repeating a value within each ID when there are multiple value options in R

A data.table option using first + na.omit

# Per `id`, copy the first non-missing `height` into every row of the group.
setDT(data)[, height_filled := first(na.omit(height)), by = id]

gives

   id height height_filled
1:  1    150           150
2:  1     NA           150
3:  1     NA           150
4:  2     NA           148
5:  2    148           148
6:  3     NA           152
7:  3    152           152
8:  3    151           152
9:  3     NA           152

A base R option using ave

# Per `id`, fill `height_filled` with the first non-missing `height`.
# `[1L]` (rather than head(., 1)) yields NA for a group whose heights are all
# missing; a zero-length result would make ave() stop with
# "replacement has length zero".
transform(
  data,
  height_filled = ave(height, id, FUN = function(x) x[!is.na(x)][1L])
)

gives

  id height height_filled
1  1    150           150
2  1     NA           150
3  1     NA           150
4  2     NA           148
5  2    148           148
6  3     NA           152
7  3    152           152
8  3    151           152
9  3     NA           152

How to fill missing values grouped on id and based on time period from index date

You could make a small function (f, below) to handle each value column.

Make a grouped ID, and generate a rowid (this is only to retain your original order)

# Tag each row with its original position, sort by registration date, then
# group by ID so the per-ID steps that follow see rows in date order.
dat <- group_by(
  arrange(
    mutate(dat, rowid = row_number()),
    registration_dat
  ),
  ID
)

Make a function that takes a df and a val column, and returns an updated df with val fixed

# Fill a missing `val` in the last row of each group of `df` from the previous
# non-missing row of that group, provided the two registration dates are less
# than 365 apart (days, for Date columns).
#
# df:  data frame with a `registration_dat` column; assumed to be grouped by
#      the caller (e.g. by ID), since "last row" and lag() are per group.
# val: unquoted name of the value column to fix.
#
# Returns `df` with `val` updated. Row order is not preserved: rows whose
# `val` is missing and that are not last in their group come first.
f <- function(df, val) {
  # Re-sort on entry: the bind_rows() below moves rows around, so when f() is
  # applied to several columns in turn, the input of the later calls is no
  # longer in date order and lag() / "last row" would refer to the wrong
  # observations.
  df <- df %>% arrange(registration_dat)

  # Missing values that are not in the group's last row are left untouched.
  untouched <- df %>%
    filter(is.na({{ val }}) & row_number() != n())

  # Among the non-missing rows plus the last row, lag() is the previous
  # non-missing observation of the group.
  filled <- df %>%
    filter(!is.na({{ val }}) | row_number() == n()) %>%
    mutate(
      {{ val }} := if_else(
        is.na({{ val }}) & registration_dat - lag(registration_dat) < 365,
        lag({{ val }}),
        {{ val }}
      )
    )

  bind_rows(untouched, filled)
}

Apply the function to the columns of interest

# Fix both value columns, one after the other.
dat <- dat %>%
  f(value1) %>%
  f(value2)

If you want, recover the original order

# Put the rows back in their original order and drop the helper column.
select(arrange(dat, rowid), -rowid)

Output:

      ID registration_dat value1 value2
   <int> <date>            <int>  <int>
 1     1 2020-03-04           33     25
 2     1 2019-05-06           33     25
 3     1 2019-01-02           32     21
 4     3 2021-10-31           NA     NA
 5     3 2018-10-12           33     NA
 6     3 2018-10-10           25     35
 7     4 2020-01-02           32     83
 8     4 2019-10-31           32     83
 9     4 2019-09-20           33     56
10     8 2019-12-12           32     46
11     8 2019-10-31           NA     43
12     8 2019-08-12           32     46

Update:

The OP wants only the final row (i.e., the row with the latest registration_dat) per ID. With 3 million rows and 14 value columns, I would use data.table and do something like this:

library(data.table)

# Return the first row of `df`, with a missing `value` filled from the next
# row that has a non-missing `value` and lies within 365 of the first row's
# `registration_dat` (days, for Date columns).
#
# df: data.table with `registration_dat` and `value` columns; assumed to be
#     sorted by `registration_dat` in descending order by the caller, so that
#     the first row is the most recent one.
f <- function(df) {
  # Keep only rows at most 365 older than the first (most recent) one.
  df <- df[df[1L, registration_dat] - registration_dat <= 365]
  # Fill only when the most recent value is actually missing; an observed
  # value must not be overwritten by an older one.
  if (is.na(df[1L, value])) {
    older <- df[-1L, value]
    # `[1L]` gives NA when no non-missing candidate exists.
    df[1L, value := older[!is.na(older)][1L]]
  }
  df[1L]
}

# Reshape to long form, apply f() to each (ID, variable) group with rows in
# descending date order, then spread the variables back into columns.
dcast(
  melt(setDT(dat), id.vars = c("ID", "registration_dat"))[
    order(-registration_dat),
    f(.SD),
    by = .(ID, variable)
  ],
  formula = ID + registration_dat ~ variable,
  value.var = "value"
)

Output:

      ID registration_dat value1 value2
   <int>           <Date>  <int>  <int>
1:     1       2020-03-04     33     25
2:     3       2021-10-31     NA     NA
3:     4       2020-01-02     32     83
4:     8       2019-12-12     32     43

Get duplicate values within groups

We can group by 'household', 'children' and filter the rows where the number of rows is greater than 1

library(dplyr)
# Keep only the (household, children) combinations that occur more than once.
ungroup(
  filter(
    group_by(village, household, children),
    n() > 1
  )
)

-output

# A tibble: 2 x 2
#  household children
#      <dbl> <chr>   
#1         1 A001    
#2         1 A001

Or using base R with duplicated

# Flag a row when an identical row occurs earlier or later in `village`.
is_repeated <- duplicated(village) | duplicated(village, fromLast = TRUE)
village[is_repeated, ]
#  household children
#1         1     A001
#3         1     A001

Filling the missing values within each id in r

We can group by 'id' and fill

library(dplyr)
library(tidyr)
# Within each `id`, fill missing `score` values downwards first, then upwards.
ungroup(
  fill(
    group_by(df, id),
    score,
    .direction = "downup"
  )
)

Repeat a set of ID's for every n rows

Instead of each, try using times:

# Cycle seq(bloc_len) over the rows of `question_data`; `length.out` fixes the
# result at exactly one id per row.
n_rows <- nrow(question_data)
question_data[["id"]] <- rep(
  seq(bloc_len),
  times = n_rows %/% bloc_len,
  length.out = n_rows
)

How to create missing value for repeated measurement data?

Using tidyr, this is a one liner. You use the complete function, which creates rows with each combination of the columns passed to it, filling the rest of the rows with NA:

library(tidyr)
# One row per combination of `id` and `age`; combinations absent from `m`
# get NA in the remaining columns.
tidyr::complete(data = m, id, age)

Source: local data frame [18 x 3]

      id   age    IQ
   (dbl) (dbl) (dbl)
1      1     2     3
2      1     3     4
3      1     4     5
4      1     5     4
5      1     6    NA
6      1     8    NA
7      2     2    NA
8      2     3     6
9      2     4    NA
10     2     5    NA
11     2     6     5
12     2     8    NA
13     3     2     3
14     3     3    NA
15     3     4    NA
16     3     5     8
17     3     6    NA
18     3     8    10

Filling missing values from other rows in group (including duplicates)

We may need complete here. After grouping by 'group', use complete to get the combinations of unique non-NA 'value' for each 'group' and 'ID'

library(dplyr)
library(tidyr)
library(stringr)
# For every group, pair each ID with each distinct non-missing value seen in
# that group, drop the rows that still contain NA, and restore the column
# order of `df1`.
df1 %>%
  group_by(group) %>%
  complete(ID, value = unique(value[!is.na(value)])) %>%
  na.omit() %>%
  select(names(df1))
# A tibble: 15 x 3
# Groups:   group [3]
#      ID group value
#   <int> <chr> <chr>
# 1     1 A     blue 
# 2     2 A     blue 
# 3     3 A     blue 
# 4     4 B     green
# 5     4 B     red  
# 6     5 B     green
# 7     5 B     red  
# 8     6 B     green
# 9     6 B     red  
#10     7 C     blue 
#11     7 C     green
#12     8 C     blue 
#13     8 C     green
#14     9 C     blue 
#15     9 C     green

Update

With the new dataset, we can do

# Same idea for two linked columns: glue `value` and `specific_value` into one
# key, expand that key per group and ID, split it again, and finally look up
# each ID's `dataversion`.
df2 %>%
  group_by(group) %>%
  mutate(valnew = str_c(value, specific_value, sep = ":")) %>%
  select(-value, -specific_value, -dataversion) %>%
  complete(ID, valnew = unique(valnew[!is.na(valnew)])) %>%
  filter(!is.na(valnew)) %>%
  separate(valnew, into = c("value", "specific_value"), sep = ":") %>%
  left_join(
    # One lookup row per ID (its first occurrence), so the join cannot
    # duplicate rows; the key is named explicitly instead of being guessed.
    df2 %>%
      select(ID, dataversion) %>%
      distinct(ID, .keep_all = TRUE),
    by = "ID"
  ) %>%
  select(names(df2))
# A tibble: 15 x 5
# Groups:   group [3]
#      ID group value specific_value dataversion
#   <int> <chr> <chr> <chr>          <chr>      
# 1     1 A     blue  sky_blue       version1   
# 2     2 A     blue  sky_blue       version2   
# 3     3 A     blue  sky_blue       version1   
# 4     4 B     green forest_green   version1   
# 5     4 B     red   scarlet        version1   
# 6     5 B     green forest_green   version2   
# 7     5 B     red   scarlet        version2   
# 8     6 B     green forest_green   <NA>       
# 9     6 B     red   scarlet        <NA>       
#10     7 C     blue  royal_blue     version2   
#11     7 C     green lime_green     version2   
#12     8 C     blue  royal_blue     version1   
#13     8 C     green lime_green     version1   
#14     9 C     blue  royal_blue     version1   
#15     9 C     green lime_green     version1

data

# Example input `df1`: 10 rows with columns ID (integer), group ("A"/"B"/"C")
# and value (character, NA where missing). ID 4 appears twice.
df1 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L), 
    group = c("A", "A", "A", "B", "B", "B", "B", "C", "C", "C"
    ), value = c("blue", NA, NA, "green", "red", NA, NA, "blue", 
    "green", NA)), row.names = c("1", "2", "3", "4", "5", "6", 
"7", "8", "9", "10"), class = "data.frame")

# Example input `df2`: the same ID / group / value columns plus specific_value
# and dataversion (both character, NA where missing). ID 4 appears twice.
df2 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L), 
    group = c("A", "A", "A", "B", "B", "B", "B", "C", "C", "C"
    ), value = c("blue", NA, NA, "green", "red", NA, NA, "blue", 
    "green", NA), specific_value = c("sky_blue", NA, NA, "forest_green", 
    "scarlet", NA, NA, "royal_blue", "lime_green", NA), dataversion = c("version1", 
    "version2", "version1", "version1", "version1", "version2", 
    NA, "version2", "version1", "version1")), class = "data.frame",
    row.names = c(NA, 
-10L))

Generate Id for Each Group with Repeated and Missing Observations