Replace Na with Previous and Next Rows Mean in R

Replace NA with previous and next rows mean in R

Or you may try na.approx from package zoo: "Missing values (NAs) are replaced by linear interpolation"

library(zoo)
x <- c(56, NA, 70, 96)
na.approx(x)
# [1] 56 63 70 96

This also works if you have more than one consecutive NA:

vals <- c(1, NA, NA, 7, NA, 10)
na.approx(vals)
# [1] 1.0 3.0 5.0 7.0 8.5 10.0

na.approx is based on the base function approx, which may be used instead:

vals <- c(1, NA, NA, 7, NA, 10)
xout <- seq_along(vals)
x <- xout[!is.na(vals)]
y <- vals[!is.na(vals)]

approx(x = x, y = y, xout = xout)$y
# [1] 1.0 3.0 5.0 7.0 8.5 10.0

Replace NA with previous or next value, by group, using dplyr

library(tidyr) #fill is part of tidyr

ps1 %>%
group_by(userID) %>%
#fill(color, age, gender) %>% #default direction down
fill(color, age, gender, .direction = "downup")

Which gives you:

Source: local data frame [9 x 4]
Groups: userID [3]

userID color age gender
<dbl> <fctr> <fctr> <fctr>
1 21 blue 3yrs F
2 21 blue 2yrs F
3 21 red 2yrs M
4 22 blue 3yrs F
5 22 blue 3yrs F
6 22 blue 3yrs F
7 23 red 4yrs F
8 23 red 4yrs F
9 23 gold 4yrs F

Replace NA with previous occurrence

Another option with dplyr and tidyr:

library(dplyr); library(tidyr);
df %>% group_by(Product_id) %>% fill(Product_Weight)

Source: local data frame [5 x 4]
Groups: Product_id [3]

Product_id Product_Weight Product_Name Shop_Name
(fctr) (int) (fctr) (fctr)
1 A 10 xxxx Walmart
2 A 10 xxxx Walmart
3 B 12 yyyy Target
4 C 11 zzzz Target
5 C 11 zzzz Target

The result is sorted by Product_id though.

Replace NA values if last and next non-NA value are the same

You can fill forwards and backwards, then set the rows where they don't match to NA.

library(zoo)
library(dplyr)

df %>%
mutate_if(is.factor, as.character) %>%
group_by(ID) %>%
mutate(result = na.locf(with_missing, fromLast = T),
result = ifelse(result == na.locf(with_missing), result, NA))

# ID with_missing desired_result result
# 1 1 a a a
# 2 1 a a a
# 3 1 <NA> a a
# 4 1 <NA> a a
# 5 1 a a a
# 6 1 a a a
# 7 2 a a a
# 8 2 a a a
# 9 2 <NA> <NA> <NA>
# 10 2 b b b
# 11 2 b b b
# 12 2 b b b
# 13 3 a a a
# 14 3 <NA> <NA> <NA>
# 15 3 <NA> <NA> <NA>
# 16 3 <NA> <NA> <NA>
# 17 3 c c c
# 18 3 c c c
# 19 4 b b b
# 20 4 <NA> <NA> <NA>
# 21 4 a a a
# 22 4 a a a
# 23 4 a a a
# 24 4 a a a
# 25 5 a a a
# 26 5 <NA> a a
# 27 5 <NA> a a
# 28 5 <NA> a a
# 29 5 <NA> a a
# 30 5 a a a
# 31 6 a a a
# 32 6 a b a
# 33 6 <NA> b <NA>
# 34 6 b b b
# 35 6 a a a
# 36 6 a a a
# 37 7 a a a
# 38 7 a a a
# 39 7 <NA> a a
# 40 7 <NA> a a
# 41 7 a a a
# 42 7 a a a
# 43 8 a a a
# 44 8 a a a
# 45 8 <NA> <NA> <NA>
# 46 8 b b b
# 47 8 b b b
# 48 8 b b b
# 49 9 a a a
# 50 9 <NA> <NA> <NA>
# 51 9 <NA> <NA> <NA>
# 52 9 <NA> <NA> <NA>
# 53 9 c c c
# 54 9 c c c
# 55 10 b b b
# 56 10 <NA> <NA> <NA>
# 57 10 a a a
# 58 10 a a a
# 59 10 a a a
# 60 10 a a a

Replace value with previous row value

Does this work:

library(dplyr)
library(tidyr)
df %>% mutate(DSWP10 = as.numeric(na_if(DSWP10, '.'))) %>% fill(DSWP10, .direction = 'up')
# A tibble: 7 x 2
Date DSWP10
<chr> <dbl>
1 07/01/2015 2.1
2 06/01/2015 1.99
3 05/01/2015 1.99
4 04/01/2015 1.99
5 03/01/2015 1.98
6 02/01/2015 1.95
7 01/01/2015 1.95

R: replace NA that is preceded and followed by non-na values

The following function does what the question asks for.

meanNA <- function(x){
na <- is.na(x)
prev <- c(na[-1], FALSE)
nxt <- c(FALSE, na[-length(x)])
x[na] <- rowMeans(cbind(x[prev], x[nxt]), na.rm = TRUE)
is.na(x) <- is.nan(x)
x
}

df[] <- lapply(df, meanNA)

df
# t2 t3 t4
#1 0.00 0.00 0.00
#2 0.00 0.00 -0.02
#3 0.02 0.00 0.01
#4 0.04 0.00 0.00
#5 0.04 -0.01 0.00
#6 NA 0.03 -0.02

R: replace NAs with mean of two most contiguous values

We can use the zoo::na.locf() function for this:

x <- c(3,4,8,10,NA,NA,NA,8,10,10,NA,22)
y <- c(1,6,3,5,NA,44,23,NA,NA,5,34,33)
df <- data.frame(x,y)

contiguous_mean <- function(vec) {
return( (zoo::na.locf(vec) + zoo::na.locf(vec, fromLast = TRUE)) / 2 )
}

apply(df, 2, contiguous_mean)

# x y
# [1,] 3 1.0
# [2,] 4 6.0
# [3,] 8 3.0
# [4,] 10 5.0
# [5,] 9 24.5
# [6,] 9 44.0
# [7,] 9 23.0
# [8,] 8 14.0
# [9,] 10 14.0
# [10,] 10 5.0
# [11,] 16 34.0
# [12,] 22 33.0

Here, "locf" stands for last observation carried forward, which replaces NA values with the last observed value; with the fromLast argument, you can use the closest previous observation, or the closest subsequent observation. We want the mean of the last previous observed value and the next subsequent observed value, so we just divide by two the sum of the result with fromLast as TRUE and FALSE.

Update: Leading or trailing NAs

G. Grothendieck makes the excellent suggestion to use na.locf0() rather than na.locf() to make use of the na.rm = FALSE default of the former. When the initial or last values are not NA, the two approaches would be equivalent, but when your columns start or end with NAs, then we need na.locf0(). Here's a demonstration:

z <- c(NA, 1, 2, NA, 3)
contiguous_mean <- function(vec) {
return( (zoo::na.locf(vec) + zoo::na.locf(vec, fromLast = TRUE)) / 2 )
}
contiguous_mean2 <- function(vec) {
return( (zoo::na.locf0(vec) + zoo::na.locf0(vec, fromLast = TRUE)) / 2 )
}
## When no leading or trailing NAs, they are equivalent:
all.equal(apply(df, 2, contiguous_mean), apply(df, 2, contiguous_mean2))
# [1] TRUE
## However, when there *are* leading or trailing NAs, the first approach
## causes bad recycling:
contiguous_mean2(z) ## New version
# [1] NA 1.0 2.0 2.5 3.0
contiguous_mean(z) ## Old version
# [1] 1.0 1.5 2.0 3.0 2.0
# Warning message:
# In zoo::na.locf(vec) + zoo::na.locf(vec, fromLast = TRUE) :
# longer object length is not a multiple of shorter object length


Related Topics



Leave a reply



Submit