Generate Id for Each Group with Repeated and Missing Observations

generate id for each group with repeated and missing observations

An option using data.table:

# Within each `individ`, give every run of consecutive identical `week`
# values its own running id (1, 2, 3, ...).
setDT(dt)[, id_week := rleid(week), by = individ]

Repeating a value within each ID when there are multiple value options in R

A data.table option using first + na.omit

# For each id, take the first non-missing height and repeat it on every row
# of that id.
setDT(data)[, height_filled := first(na.omit(height)), by = id]

gives

   id height height_filled
1: 1 150 150
2: 1 NA 150
3: 1 NA 150
4: 2 NA 148
5: 2 148 148
6: 3 NA 152
7: 3 152 152
8: 3 151 152
9: 3 NA 152

A base R option using ave

# For each id, repeat the first non-missing height on every row of that id.
# Indexing with [1L] (rather than head(na.omit(x), 1)) returns NA when an id
# has no observed height at all; a zero-length result would make ave() fail
# with "replacement has length zero" for such a group.
transform(
  data,
  height_filled = ave(height, id, FUN = function(x) x[!is.na(x)][1L])
)

gives

  id height height_filled
1 1 150 150
2 1 NA 150
3 1 NA 150
4 2 NA 148
5 2 148 148
6 3 NA 152
7 3 152 152
8 3 151 152
9 3 NA 152

How to fill missing values grouped on id and based on time period from index date

You could make a small function (f, below) to handle each value column.

  1. Generate a rowid (this is only to retain your original order), sort by registration_dat, and group by ID
dat <- dat %>%
  # Remember the incoming row order so it can be restored at the end.
  mutate(rowid = row_number()) %>%
  # Oldest registrations first; the fill step relies on this ordering.
  arrange(registration_dat) %>%
  group_by(ID)

  1. Make a function that takes a df and a val column, and returns an updated df with val fixed
# Fill a missing `val` on the last (most recent) row of each group with the
# previous non-missing observation, provided that observation was registered
# less than 365 days earlier.
#
# df:  data frame with a `registration_dat` column; as used here it is grouped
#      by ID, so row_number(), n() and lag() work per ID.
# val: unquoted name of the value column to fix.
#
# Returns df with `val` updated. Rows come back in a different order (rows
# with a missing `val` that are not last in their group are placed first), so
# callers that care about row order must restore it themselves.
f <- function(df, val) {
  # lag() and row_number() == n() are only meaningful when rows are in
  # ascending date order. The bind_rows() below breaks that order, so sort on
  # every call instead of trusting the caller; otherwise a second call (for
  # another column) would pick the wrong "previous" row.
  df <- df %>% arrange(registration_dat)

  # Missing values that are not on a group's last row are left untouched.
  untouched <- df %>%
    filter(is.na({{ val }}) & row_number() != n())

  # Among the observed values plus the last row, lag() is the most recent
  # earlier observation; use it only if the last row is missing and recent.
  filled <- df %>%
    filter(!is.na({{ val }}) | row_number() == n()) %>%
    mutate(
      {{ val }} := if_else(
        is.na({{ val }}) & registration_dat - lag(registration_dat) < 365,
        lag({{ val }}),
        {{ val }}
      )
    )

  bind_rows(untouched, filled)
}

  1. Apply the function to the columns of interest
# Fix each value column in turn; every call returns the updated data frame.
dat <- f(dat, value1)
dat <- f(dat, value2)

  1. If you want, recover the original order
# Put the rows back in their original order, then drop the helper column.
dat %>%
  arrange(rowid) %>%
  select(-rowid)

Output:

      ID registration_dat value1 value2
<int> <date> <int> <int>
1 1 2020-03-04 33 25
2 1 2019-05-06 33 25
3 1 2019-01-02 32 21
4 3 2021-10-31 NA NA
5 3 2018-10-12 33 NA
6 3 2018-10-10 25 35
7 4 2020-01-02 32 83
8 4 2019-10-31 32 83
9 4 2019-09-20 33 56
10 8 2019-12-12 32 46
11 8 2019-10-31 NA 43
12 8 2019-08-12 32 46

Update:

The OP wants only the final row (i.e., the row with the last registration_dat) per ID. With 3 million rows and 14 value columns, I would use data.table and do something like this:

library(data.table)

# Reduce one (ID, variable) slice to its newest row, filling a missing value
# on that row from the most recent older non-missing value within 365 days.
#
# df: data.table with `registration_dat` and `value` columns, already sorted
#     newest first (the caller orders by -registration_dat).
#
# Returns a one-row data.table: the newest row, with `value` filled if it was
# missing and an eligible older observation exists.
f <- function(df) {
  # Keep only rows registered at most 365 days before the newest (first) row.
  df <- df[df[1L, registration_dat] - registration_dat <= 365]

  # Fill only when the newest value is actually missing. Assigning
  # unconditionally would replace an observed value with an older one, or
  # with NA when no older observation exists.
  if (is.na(df$value[1L])) {
    older <- df$value[-1L]
    # [1L] on an empty vector yields NA of the right type, i.e. "no fill".
    df[1L, value := older[!is.na(older)][1L]]
  }

  df[1L]
}

# Reshape to long form, collapse every (ID, variable) slice to its newest row
# with f(), then spread the variables back into columns.
dcast(
  melt(setDT(dat), id.vars = c("ID", "registration_dat"))[
    order(-registration_dat),
    f(.SD),
    by = .(ID, variable)
  ],
  formula = ID + registration_dat ~ variable,
  value.var = "value"
)

Output:

      ID registration_dat value1 value2
<int> <Date> <int> <int>
1: 1 2020-03-04 33 25
2: 3 2021-10-31 NA NA
3: 4 2020-01-02 32 83
4: 8 2019-12-12 32 43

Get duplicate values within groups

We can group by 'household', 'children' and filter the rows where the number of rows is greater than 1

library(dplyr)
# Keep only the household/children combinations that occur more than once.
village %>%
  group_by(household, children) %>%
  filter(n() > 1) %>%
  ungroup()

-output

# A tibble: 2 x 2
# household children
# <dbl> <chr>
#1 1 A001
#2 1 A001

Or using base R with duplicated

# Scanning from both ends flags every member of a duplicated set, including
# its first occurrence.
village[duplicated(village) | duplicated(village, fromLast = TRUE), ]
# household children
#1 1 A001
#3 1 A001

Filling the missing values within each id in r

We can group by 'id' and fill

library(dplyr)
library(tidyr)
# Within each id, fill missing scores downwards first and then upwards.
df %>%
  group_by(id) %>%
  fill(score, .direction = "downup") %>%
  ungroup()

Repeat a set of ID's for every n rows

Instead of each, try using times:

# Cycle through the ids 1..bloc_len until every row of question_data has one.
question_data$id <- rep(
  seq(bloc_len),
  times = nrow(question_data) %/% bloc_len,
  length.out = nrow(question_data)
)

How to create missing value for repeated measurement data?

Using tidyr, this is a one liner. You use the complete function, which creates rows with each combination of the columns passed to it, filling the rest of the rows with NA:

library(tidyr)
# One row for every id/age combination; combinations absent from m get NA in
# the remaining columns.
complete(data = m, id, age)

Source: local data frame [18 x 3]

id age IQ
(dbl) (dbl) (dbl)
1 1 2 3
2 1 3 4
3 1 4 5
4 1 5 4
5 1 6 NA
6 1 8 NA
7 2 2 NA
8 2 3 6
9 2 4 NA
10 2 5 NA
11 2 6 5
12 2 8 NA
13 3 2 3
14 3 3 NA
15 3 4 NA
16 3 5 8
17 3 6 NA
18 3 8 10

Filling missing values from other rows in group (including duplicates)

We may need complete here. After grouping by 'group', use complete to get the combinations of unique non-NA 'value' for each 'group' and 'ID'

library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
  group_by(group) %>%
  # Pair every ID in the group with each non-missing value seen in that group.
  complete(ID, value = unique(value[!is.na(value)])) %>%
  # Drop the leftover rows that still contain NA.
  na.omit() %>%
  # Restore the original column order.
  select(all_of(names(df1)))
# A tibble: 15 x 3
# Groups: group [3]
# ID group value
# <int> <chr> <chr>
# 1 1 A blue
# 2 2 A blue
# 3 3 A blue
# 4 4 B green
# 5 4 B red
# 6 5 B green
# 7 5 B red
# 8 6 B green
# 9 6 B red
#10 7 C blue
#11 7 C green
#12 8 C blue
#13 8 C green
#14 9 C blue
#15 9 C green

Update

With the new dataset, we can do

df2 %>%
  group_by(group) %>%
  # Treat value and specific_value as one unit so they are completed together.
  mutate(valnew = str_c(value, specific_value, sep = ":")) %>%
  select(-value, -specific_value, -dataversion) %>%
  # Pair every ID in the group with each non-missing combined value.
  complete(ID, valnew = unique(valnew[!is.na(valnew)])) %>%
  filter(!is.na(valnew)) %>%
  separate(valnew, into = c("value", "specific_value"), sep = ":") %>%
  # Row marker: the join below can multiply rows for an ID that occurs more
  # than once in df2, and this lets us keep only the first match.
  mutate(rn = row_number()) %>%
  left_join(select(df2, ID, dataversion)) %>%
  filter(!duplicated(rn)) %>%
  # Restore the original columns and their order (this also drops rn).
  select(all_of(names(df2)))
# A tibble: 15 x 5
# Groups: group [3]
# ID group value specific_value dataversion
# <int> <chr> <chr> <chr> <chr>
# 1 1 A blue sky_blue version1
# 2 2 A blue sky_blue version2
# 3 3 A blue sky_blue version1
# 4 4 B green forest_green version1
# 5 4 B red scarlet version1
# 6 5 B green forest_green version2
# 7 5 B red scarlet version2
# 8 6 B green forest_green <NA>
# 9 6 B red scarlet <NA>
#10 7 C blue royal_blue version2
#11 7 C green lime_green version2
#12 8 C blue royal_blue version1
#13 8 C green lime_green version1
#14 9 C blue royal_blue version1
#15 9 C green lime_green version1

data

# Example data: ten rows across three groups, with missing `value` entries.
df1 <- data.frame(
  ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L),
  group = rep(c("A", "B", "C"), times = c(3L, 4L, 3L)),
  value = c(
    "blue", NA, NA, "green", "red", NA, NA, "blue", "green", NA
  ),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)

# Example data for the update: df1 plus a specific_value and a dataversion
# column, both containing missing entries.
df2 <- data.frame(
  ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L),
  group = rep(c("A", "B", "C"), times = c(3L, 4L, 3L)),
  value = c(
    "blue", NA, NA, "green", "red", NA, NA, "blue", "green", NA
  ),
  specific_value = c(
    "sky_blue", NA, NA, "forest_green", "scarlet", NA, NA,
    "royal_blue", "lime_green", NA
  ),
  dataversion = c(
    "version1", "version2", "version1", "version1", "version1",
    "version2", NA, "version2", "version1", "version1"
  ),
  stringsAsFactors = FALSE
)


Related Topics



Leave a reply



Submit