generate id for each group with repeated and missing observations
An option using data.table:
# Within each `individ`, number the runs of consecutive identical `week` values.
setDT(dt)[, id_week := rleid(week), by = individ]
Repeating a value within each ID when there are multiple value options in R
A data.table option using `first` + `na.omit`:
# For every `id`, copy the first non-missing `height` into `height_filled`.
setDT(data)[, height_filled := first(na.omit(height)), by = id]
gives
id height height_filled
1: 1 150 150
2: 1 NA 150
3: 1 NA 150
4: 2 NA 148
5: 2 148 148
6: 3 NA 152
7: 3 152 152
8: 3 151 152
9: 3 NA 152
A base R option using ave
# Base R equivalent: within each `id`, replace every `height` with the
# group's first non-missing value.
transform(
  data,
  height_filled = ave(
    height, id,
    # Indexing with [1L] yields NA when a group has no observed height at
    # all; a zero-length result could not be assigned back by ave().
    FUN = function(x) x[!is.na(x)][1L]
  )
)
gives
id height height_filled
1 1 150 150
2 1 NA 150
3 1 NA 150
4 2 NA 148
5 2 148 148
6 3 NA 152
7 3 152 152
8 3 151 152
9 3 NA 152
How to fill missing values grouped on id and based on time period from index date
You could make a small function (`f`, below) to handle each value column.
- Add a `rowid` (this is only to retain your original order), sort by date, and group by ID
# Tag each row with its original position, order by date, then group by ID.
dat <- mutate(dat, rowid = row_number())
dat <- arrange(dat, registration_dat)
dat <- group_by(dat, ID)
- Make a function that takes a `df` and a `val` column, and returns an updated `df` with `val` fixed
# Fill a missing `val` on the last (latest-dated) row of each group with the
# previous non-missing observation, provided the gap between the two
# registration dates is below 365.
#
# df:  data frame, expected to be grouped (e.g. by ID) and to contain a
#      `registration_dat` column -- presumably a Date, so the gap is in days.
# val: unquoted name of the value column to fix.
#
# Returns the same rows with `val` updated. Row order is NOT preserved (the
# two filtered pieces are stacked), so keep a row id if order matters.
f <- function(df, val) {
  # Re-sort on every call: bind_rows() below moves rows around, so a second
  # call for another column would otherwise compute lag() on rows that are no
  # longer in date order and could pick up an older value.
  df <- df %>% arrange(registration_dat)
  bind_rows(
    # Missing values on non-final rows are passed through untouched.
    df %>% filter(is.na({{ val }}) & row_number() != n()),
    # Non-missing rows plus the final row: here lag() is the previous
    # non-missing observation.
    df %>%
      filter(!is.na({{ val }}) | row_number() == n()) %>%
      mutate(
        {{ val }} := if_else(
          is.na({{ val }}) & registration_dat - lag(registration_dat) < 365,
          lag({{ val }}),
          {{ val }}
        )
      )
  )
}
- Apply the function to the columns of interest
# Fix each value column in turn.
dat <- f(dat, value1)
dat <- f(dat, value2)
- If you want, recover the original order
# Sort back into the original row order, then drop the helper column.
dat %>% arrange(rowid) %>% select(-rowid)
Output:
ID registration_dat value1 value2
<int> <date> <int> <int>
1 1 2020-03-04 33 25
2 1 2019-05-06 33 25
3 1 2019-01-02 32 21
4 3 2021-10-31 NA NA
5 3 2018-10-12 33 NA
6 3 2018-10-10 25 35
7 4 2020-01-02 32 83
8 4 2019-10-31 32 83
9 4 2019-09-20 33 56
10 8 2019-12-12 32 46
11 8 2019-10-31 NA 43
12 8 2019-08-12 32 46
Update:
The OP wants only the final row (i.e., the latest registration_dat) per ID. With 3 million rows and 14 value columns, I would use data.table and do something like this:
library(data.table)
# Reduce one group to its first row, filling that row's `value` if missing.
#
# df: data.table with `registration_dat` and `value` columns whose first row
#     is the reference row (the caller sorts newest-first).
#
# Returns a one-row data.table: the first row, whose `value` -- when missing --
# is replaced by the first non-missing `value` among the later rows that lie
# within 365 of the first row's registration_dat (NA if there is none).
f <- function(df) {
  # Keep only the rows within the 365 window of the first row.
  df <- df[df[1, registration_dat] - registration_dat <= 365]
  # Only fill when the first row is actually missing; an observed value must
  # not be overwritten by an older one (or by NA).
  if (is.na(df$value[1L])) {
    fill <- df[-1][!is.na(value)][1, value]
    df[1, value := fill]
  }
  df[1]
}
# Reshape to long form, reduce every (ID, variable) group to a single row via
# f(), then spread the variables back out into columns.
dcast(
  melt(setDT(dat), id.vars = c("ID", "registration_dat"))[
    order(-registration_dat),
    f(.SD),
    by = .(ID, variable)
  ],
  ID + registration_dat ~ variable,
  value.var = "value"
)
Output:
ID registration_dat value1 value2
<int> <Date> <int> <int>
1: 1 2020-03-04 33 25
2: 3 2021-10-31 NA NA
3: 4 2020-01-02 32 83
4: 8 2019-12-12 32 43
Get duplicate values within groups
We can group by 'household', 'children' and filter
the rows where the number of rows is greater than 1
library(dplyr)
# Keep only the household/children combinations that occur more than once.
village %>%
  group_by(household, children) %>%
  filter(n() > 1) %>%
  ungroup()
-output
# A tibble: 2 x 2
# household children
# <dbl> <chr>
#1 1 A001
#2 1 A001
Or using base R with `duplicated`:
# Keep every row that has an identical copy elsewhere: the first duplicated()
# call flags repeats after their first occurrence, the fromLast = TRUE call
# flags repeats before their last occurrence, so together they flag all copies.
# NOTE(review): this compares all columns of `village`, not just
# household/children -- equivalent only if those are the only columns.
village[duplicated(village)|duplicated(village, fromLast = TRUE),]
# household children
#1 1 A001
#3 1 A001
Filling the missing values within each id in r
We can group by 'id' and fill
library(dplyr)
library(tidyr)
# Within each id, fill missing scores downwards first, then upwards.
df %>%
  group_by(id) %>%
  fill(score, .direction = "downup") %>%
  ungroup()
Repeat a set of ID's for every n rows
Instead of `each`, try using `times`:
# Cycle the ids 1..bloc_len down the rows. When both `times` and `length.out`
# are supplied, rep() gives `length.out` priority, so the result always has
# exactly nrow(question_data) elements, even when that is not a multiple of
# bloc_len. seq_len() is used so a zero bloc_len cannot produce c(1, 0).
question_data$id <- rep(
  seq_len(bloc_len),
  times = nrow(question_data) %/% bloc_len,
  length.out = nrow(question_data)
)
How to create missing value for repeated measurement data?
Using tidyr, this is a one-liner. The `complete()` function creates a row for every combination of the columns passed to it, filling the remaining columns of the new rows with NA:
library(tidyr)
# Expand `m` so that every id/age combination has a row.
complete(m, id, age)
Source: local data frame [18 x 3]
id age IQ
(dbl) (dbl) (dbl)
1 1 2 3
2 1 3 4
3 1 4 5
4 1 5 4
5 1 6 NA
6 1 8 NA
7 2 2 NA
8 2 3 6
9 2 4 NA
10 2 5 NA
11 2 6 5
12 2 8 NA
13 3 2 3
14 3 3 NA
15 3 4 NA
16 3 5 8
17 3 6 NA
18 3 8 10
Filling missing values from other rows in group (including duplicates)
We may need `complete` here. After grouping by 'group', use `complete` to get all combinations of 'ID' and the unique non-NA 'value's within each 'group'
library(dplyr)
library(tidyr)
library(stringr)
# Per group, pair every ID with each non-missing value seen in that group,
# drop the leftover NA rows, and restore the original column order.
df1 %>%
  group_by(group) %>%
  complete(ID, value = unique(value[!is.na(value)])) %>%
  na.omit() %>%
  select(names(df1))
# A tibble: 15 x 3
# Groups: group [3]
# ID group value
# <int> <chr> <chr>
# 1 1 A blue
# 2 2 A blue
# 3 3 A blue
# 4 4 B green
# 5 4 B red
# 6 5 B green
# 7 5 B red
# 8 6 B green
# 9 6 B red
#10 7 C blue
#11 7 C green
#12 8 C blue
#13 8 C green
#14 9 C blue
#15 9 C green
Update: with the new dataset, we can do
# Same expansion as for df1, but value and specific_value must travel together
# and dataversion has to be re-attached per ID afterwards.
df2 %>%
group_by(group) %>%
# Pack value and specific_value into one "value:specific_value" key so the
# pair is expanded as a unit.
mutate(valnew = str_c(value, specific_value, sep=":")) %>%
select(-value, -specific_value, -dataversion) %>%
# Within each group, give every ID each non-missing key seen in that group.
complete(ID, valnew = unique(valnew[!is.na(valnew)])) %>%
filter(!is.na(valnew)) %>%
# Unpack the key back into its two columns.
separate(valnew, into = c('value', 'specific_value'), sep=":") %>%
# rn numbers the rows so that extra matches created by the join can be dropped.
mutate(rn = row_number()) %>%
# No `by` is given, so the join uses the columns the two tables share.
left_join(df2 %>%
select(ID, dataversion)) %>%
# Keep only the first joined row for each rn.
filter(!duplicated(rn)) %>%
select(names(df2))
# A tibble: 15 x 5
# Groups: group [3]
# ID group value specific_value dataversion
# <int> <chr> <chr> <chr> <chr>
# 1 1 A blue sky_blue version1
# 2 2 A blue sky_blue version2
# 3 3 A blue sky_blue version1
# 4 4 B green forest_green version1
# 5 4 B red scarlet version1
# 6 5 B green forest_green version2
# 7 5 B red scarlet version2
# 8 6 B green forest_green <NA>
# 9 6 B red scarlet <NA>
#10 7 C blue royal_blue version2
#11 7 C green lime_green version2
#12 8 C blue royal_blue version1
#13 8 C green lime_green version1
#14 9 C blue royal_blue version1
#15 9 C green lime_green version1
data
# Example input for the first answer: ID, group, and a partly missing value.
df1 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L),
group = c("A", "A", "A", "B", "B", "B", "B", "C", "C", "C"
), value = c("blue", NA, NA, "green", "red", NA, NA, "blue",
"green", NA)), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10"), class = "data.frame")
# Example input for the update: df1's columns plus specific_value and dataversion.
df2 <- structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 8L, 9L),
group = c("A", "A", "A", "B", "B", "B", "B", "C", "C", "C"
), value = c("blue", NA, NA, "green", "red", NA, NA, "blue",
"green", NA), specific_value = c("sky_blue", NA, NA, "forest_green",
"scarlet", NA, NA, "royal_blue", "lime_green", NA), dataversion = c("version1",
"version2", "version1", "version1", "version1", "version2",
NA, "version2", "version1", "version1")), class = "data.frame",
row.names = c(NA,
-10L))
Related Topics
Highlight a Single "Bar" in Ggplot
Modify Spacing Between Key Glyphs in Vertical Legend Whilst Keeping Key Glyph Border
Backports 1.1.1 Package Fails to Install
The Difference Between & and && in R
Mapping Variable to Hexagon Size with Geom_Hex
Retain Numerical Precision in an R Data Frame
How to Convert a Numeric Value into a Date Value
How to Print on a Series of Graphs Pairwise Comparisons Bars and Effect Size Value
Wordcloud Package: Get "Error in Strwidth(…):Invalid 'Cex' Value"
R: Removing Duplicate Elements in a Vector
Vector of Cumulative Sums in R
Merge Data.Frames with Duplicates
Dealing with Nan's in Matlab Functions
Transform One Column from Categoric to Binary, Keep the Rest
Test If Element Is in a List and Return 0 or 1
Accessing Functions with a Dot in Their Name (Eg. "As.Vector") Using Rpy2
R - Random Forest and More Than 53 Categories
How to Use User Input to Obtain a Data.Frame from My Environment in Shiny