Last Observation Carried Forward In a data frame?
This already exists:
library(zoo)
na.locf(data.frame(rep("a",4), 1:4,1:4, c(1,NA,NA,NA)))
R carry forward last observation n times by group
You can create and/or use your fiscal year (June - May) as one of the group by
criteria in your na.locf
solution
#show data before calculations
data.frame(dat)
#demo FY calculation
dat[, FY := year(MONTH) + as.numeric(month(MONTH) >= 6)]
#actual code
dat[, Portfolio_Monthly := zoo::na.locf(Portfolio_Monthly, na.rm=FALSE),
by=list(PERMNO, year(MONTH) + as.numeric(month(MONTH) >= 6))]
#show results
data.frame(dat)
sample data:
library(data.table)
set.seed(0L)
dat <- data.table(PERMNO=rep(LETTERS[1:12], each=20),
MONTH=rep(seq(as.Date("2000-01-01"), by="1 month", length.out=20), 12),
Portfolio_Monthly=NA_real_)
for (i in sample(1:dat[,.N], 5)) {
set(dat, i, 3L, rnorm(1))
}
setorder(dat, PERMNO, MONTH)
Last observation carried forward and last observation carried backward in R
This should work :
library(tidyverse)
df <- data.frame(ID=c(1,1,1,1,1,2,2,2,3,3), values=c(NA, NA, 12, 13, NA, 5, NA, NA, NA, 1))
df2 <- data.frame(ID=c(1,1,1,1,1,2,2,2,3,3), values=c(12, 12, 12, 13, 13, 5, 5, 5, 1, 1))
df <- df %>%
group_by(ID) %>%
fill(values, .direction = "downup") %>%
fill(values, .direction = "updown")
R -- Carry last observation forward n times
This is not the prettiest thing, but here's how I handle problems like this:
library(data.table)
data <- data.table(data)
data[, rowid:=1:.N, keyby = id]
## flag_a
data[, flag_a_min:=min(rowid[!is.na(flag_a)]), keyby = id]
data[, flag_a_max:=flag_a_min+2]
data[rowid <=flag_a_max & rowid >= flag_a_min, flag_a:=min(na.omit(flag_a))]
## flag_b
data[, flag_b_min:=min(rowid[!is.na(flag_b)]), keyby = id]
data[, flag_b_max:=flag_b_min+2]
data[rowid <=flag_b_max & rowid >= flag_b_min, flag_b:=min(na.omit(flag_b))]
## clean up
data[, c("rowid", "flag_a_min", "flag_a_max", "flag_b_min", "flag_b_max"):=NULL]
> data
id flag_a flag_b flag_a_desired_output flag_b_desired_output
1: 1 NA NA NA NA
2: 1 NA NA NA NA
3: 1 1 NA 1 NA
4: 1 1 1 1 1
5: 1 1 1 1 1
6: 1 NA 1 NA 1
7: 1 NA NA NA NA
8: 1 NA NA NA NA
9: 1 NA NA NA NA
10: 1 NA NA NA NA
11: 2 NA NA NA NA
12: 2 1 NA 1 NA
13: 2 1 NA 1 NA
14: 2 1 1 1 1
15: 2 NA 1 NA 1
16: 2 NA 1 NA 1
17: 2 NA NA NA NA
18: 2 NA NA NA NA
Carrying forward last observation with a limit, by group, using data.table
Note that normally when dealing with longer stretches of NA's one either fills them all or none of them and na.locf already handles that using the maxgap argument which only fills gaps that are no longer than specified. The idea is that interpolation is only reliable over short stretches so you should not interpolate at all over longer ones. Nevertheless, the following shows how to implement the scheme in the question but consider whether you should change strategy and use maxgap instead.
1) Compute na.locf using na.locf0 giving locf and also create a grouping variable for stretches of NA's and non-NAs, g. Then for each run of NA's take the first two elements of the na.locf column and fill the remainder with NA's from value. This does not overwrite df so it can be used in a pipeline without side effects.
library(data.table)
library(zoo)
df[, .(values, locf = na.locf0(values), g = rleid(is.na(values))), by = country][
, .(values = c(head(locf, 2), tail(values, -2))), by = .(country, g)][
, .(country, values)]
giving:
country values
1: USA 2
2: USA 1
3: USA 1
4: USA 1
5: USA NA
6: FR 2
7: FR 1
8: FR 2
9: FR 2
10: FR 2
2) A slightly modified formulation that still uses the same basic idea is the following. It also does not overwrite.
library(data.table)
library(zoo)
# like na.locf0 but only specifies vector, x, and limit to fill, k
na.locf2 <- function(x, k) {
nalocf <- na.locf0(x)
f <- function(ix) c(head(nalocf[ix], k), tail(x[ix], -k))
unlist(tapply(seq_along(x), rleid(is.na(x)), f))
}
df[, .(values = na.locf2(values, 2)), by = country]
giving:
country values
1: USA 2
2: USA 1
3: USA 1
4: USA 1
5: USA NA
6: FR 2
7: FR 1
8: FR 2
9: FR 2
10: FR 2
Last observation carried forward (LOCF) for all NA in R
You could use fill
from tidyr
.
tidyr::fill(df, everything())
# Value1 Value2 Value3 Group
#1 100 56 82 Group1
#2 150 56 97 Group2
#3 150 66 97 Group3
#4 120 66 97 Group4
Or with zoo::na.locf
df[] <- lapply(df, zoo::na.locf)
data
df <- structure(list(Value1 = c(100L, 150L, NA, 120L), Value2 = c(56L,
NA, 66L, NA), Value3 = c(82L, 97L, NA, NA), Group = structure(1:4,
.Label = c("Group1", "Group2", "Group3", "Group4"), class = "factor")),
class = "data.frame", row.names = c(NA, -4L))
Carry last Factor observation forward and backward in group of rows in R
One option is after grouping by 'ID', subset the 'Name' that are not NA
and not a blank (nzchar(Name)
), get the last observation (tail(...)
) and assign (:=
) it to 'Name'.
setDT(DF)[, Name := tail(Name[!is.na(Name) & nzchar(Name)], 1), by = ID]
DF
# ID Name
#1: 1 JAY
#2: 1 JAY
#3: 1 JAY
#4: 2 LAY
#5: 2 LAY
#6: 2 LAY
#7: 3 KAY
#8: 3 KAY
#9: 3 KAY
If the 'Name' column is factor
change the nzchar(Name)
to nzchar(as.character(Name))
Or specify the logical vector in the 'i' and assign (:=
) the last observation (Name[.N]
) to 'Name' after grouping by 'ID'
setDT(DF)[!is.na(Name) & nzchar(Name), Name := Name[.N], ID]
Note: For the second solution to work, the 'Name' should be character
class.
Carry Last Observation Forward by ID in R
1) Firstly note that the value
column is a character column with "NA"
values, not NA
values so lets fix that first in ##. Then create a wrapper function na.locf.na
which uses na.locf
in the zoo package and is the same except it defaults to na.rm = FALSE
. Finally use ave
to apply na.locf
by id
:
library(zoo)
data2 <- transform(data, value = as.numeric(value)) ##
na.locf.na <- function(x, na.rm = FALSE, ...) na.locf(x, na.rm = na.rm, ...)
transform(data2, value = ave(value, id, FUN = na.locf.na))
2) or this compact alternative using fn from the gsubfn package to represent na.locf.na
inline in a more compact manner:
library(zoo)
library(gsubfn)
transform(data2, value = fn$ave(value, id, FUN = ~ na.locf(x, na.rm = FALSE)))
In either of these two cases the result is:
id day value
1 1 0 NA
2 1 1 1
3 1 2 1
4 1 3 1
5 1 4 1
6 1 5 1
7 1 6 1
8 2 0 NA
9 2 1 NA
10 2 2 NA
11 2 3 1
12 2 4 1
13 2 5 1
14 2 6 1
15 2 7 1
16 2 8 1
3) We could alternately use dplyr together with zoo using na.locf.na
from above:
library(zoo)
library(dplyr)
data2 <- data %>% mutate(value = as.numeric(value)) # fix value column
data2 %>% group_by(id) %>% mutate(value = na.locf.na(value))
If the dplyr from CRAN does not work here try the one from github:
library(devtools)
install_github("hadley/dplyr")
REVISIONS Reorganized presentation and added alternatives.
Last observation carried forward conditional on value and colums
This is the use of accumulate2
ie
df%>%
group_by(ID)%>%
mutate(d = unlist(accumulate2(var,cond[-1],function(z,x,y) if(y) min(z,x,na.rm=TRUE) else max(z,x,na.rm=TRUE))))
# A tibble: 15 x 5
# Groups: ID [3]
ID date cond var d
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 1 1
2 1 2 0 NA 1
3 1 3 0 2 2
4 1 4 1 0 0
5 1 5 0 NA 0
6 2 1 0 NA NA
7 2 2 0 3 3
8 2 3 0 NA 3
9 2 4 1 0 0
10 2 5 0 NA 0
11 3 1 0 2 2
12 3 2 0 NA 2
13 3 3 0 1 2
14 3 4 0 NA 2
15 3 5 0 NA 2
Carry last observation forward (na.locf?) - but only when another column value changes
You can use fill
:
df %>%
mutate(sum = rowSums(!is.na(select(., DK0004612884:DK0002025170))),
across(DK0004612884:DK0002025170, ~ case_when(sum > 0 ~ replace_na(.x, 0)))) %>%
fill(DK0004612884:DK0002025170)
output
Date DK0004612884 DK0002025170 ym sum
1 2021-01-29 14.48 0.58 2021-1 2
2 2021-02-01 14.48 0.58 2021-2 0
3 2021-02-02 14.48 0.58 2021-2 0
4 2021-02-03 14.48 0.58 2021-2 0
5 2021-02-04 14.48 0.58 2021-2 0
6 2021-02-05 14.48 0.58 2021-2 0
7 2021-02-08 14.48 0.58 2021-2 0
8 2021-02-09 14.48 0.58 2021-2 0
9 2021-02-10 14.48 0.58 2021-2 0
10 2021-02-11 14.48 0.58 2021-2 0
11 2021-02-12 14.48 0.58 2021-2 0
12 2021-02-15 14.48 0.58 2021-2 0
13 2021-02-16 14.48 0.58 2021-2 0
14 2021-02-17 14.48 0.58 2021-2 0
15 2021-02-18 14.48 0.58 2021-2 0
16 2021-02-19 14.48 0.58 2021-2 0
17 2021-02-22 14.48 0.58 2021-2 0
18 2021-02-23 14.48 0.58 2021-2 0
19 2021-02-24 14.48 0.58 2021-2 0
20 2021-02-25 14.48 0.58 2021-2 0
21 2021-02-26 14.15 0.00 2021-2 1
22 2021-03-01 14.15 0.00 2021-3 0
23 2021-03-02 14.15 0.00 2021-3 0
24 2021-03-03 14.15 0.00 2021-3 0
25 2021-03-04 14.15 0.00 2021-3 0
26 2021-03-05 14.15 0.00 2021-3 0
27 2021-03-08 14.15 0.00 2021-3 0
28 2021-03-09 14.15 0.00 2021-3 0
29 2021-03-10 14.15 0.00 2021-3 0
30 2021-03-11 14.15 0.00 2021-3 0
31 2021-03-12 14.15 0.00 2021-3 0
Related Topics
Removing/Replacing Brackets from R String Using Gsub
R: Raster Mosaic from List of Rasters
Ggplot: Line Plot for Discrete X-Axis
Plot Weighted Frequency Matrix
Getting Stargazer Column Labels to Print on Two or Three Lines
Numerical Triple Integration in R
Find Second Highest Value on a Raster Stack in R
Do Not Open Rstudio Internal Browser After Knitting
R Markdown Add Tag to Head of HTML Output
R - Error When Using Geturl from Curl After Site Was Changed
Spread with Duplicate Identifiers for Rows
How to Simulate Bimodal Distribution
Change Plot Panel in Multipanel Plot in R