Fill Missing Values Rowwise (Right/Left)

Fill missing values rowwise (right / left)

We can `gather` the data into 'long' format, `fill` the values grouped by row number, and then `spread` back to 'wide' format:

library(tidyverse)

# Reshape to long form, fill downwards within each original row, reshape back.
d %>%
  # keep the row names as a column so rows can be grouped and later rebuilt
  rownames_to_column(var = "rn") %>%
  gather(key = key, value = val, -rn) %>%
  group_by(rn) %>%
  fill(val, .direction = "down") %>%
  spread(key = key, value = val) %>%
  ungroup() %>%
  # the helper row-name column is no longer needed
  select(-rn)
# A tibble: 5 x 3
# c1 c2 c3
# <chr> <chr> <chr>
#1 a a a
#2 1 2 3
#3 2 2 4
#4 3 4 4
#5 4 5 6

Another option, without reshaping, is to fill row-wise with `na.locf`:

library(zoo)

# Row-wise last-observation-carried-forward without reshaping.
d %>%
  mutate(c1 = as.character(c1)) %>%
  # na.rm = FALSE keeps leading NAs in place; with the default (TRUE) they are
  # dropped, so rows could come back with different sets of columns
  pmap_dfr(., ~ na.locf(c(...), na.rm = FALSE) %>%
    as.list() %>%
    as_tibble())

Also, `na.locf` runs column-wise, so the data can be transposed, passed to `na.locf` directly, and transposed back:

# na.locf() works down columns, so transpose, fill, and transpose back.
# na.rm = FALSE keeps leading NAs; the default (TRUE) can drop them, which
# changes the dimensions and makes the `d[] <-` assignment no longer line up.
d[] <- t(na.locf(t(d), na.rm = FALSE))
d
# c1 c2 c3
#1 a a a
#2 1 2 3
#3 2 2 4
#4 3 4 4
#5 4 5 6

As @G.Grothendieck mentioned in the comments, in order to take care of elements that are NA at the beginning of a row, use na.locf0 instead of na.locf.

Filling NA row values with nearest right side row value in R

Update

As there was a lot of confusion about the expected output, I am updating the answer, as suggested by @DavidArenburg, with a tidyverse solution:

library(dplyr)
library(tidyr)

# Shift the non-NA values of every row to the left-most columns.
df %>%
  # add_rownames() is deprecated in dplyr; rownames_to_column() creates the
  # same "rowname" column
  tibble::rownames_to_column() %>%
  gather(variable, value, -rowname) %>%
  filter(!is.na(value)) %>%
  group_by(rowname) %>%
  # position of each remaining value within its row
  mutate(indx = row_number()) %>%
  select(-variable) %>%
  spread(indx, value)

# rowname `1` `2`
#* <chr> <dbl> <dbl>
#1 BAKERY_Total 28 84.04
#2 CHICKEN_PUFF 16 88.24
#3 VEG_PUFF 12 78.43

Another solution could be

library(data.table)
# Drop the NAs from each row and keep what is left as a one-row data frame.
temp <- apply(df, 1, function(x) data.frame(matrix(x[!is.na(x)], nrow = 1)))
# fill = TRUE pads the shorter rows with NA. It is spelled out because `T` is
# an ordinary variable that can be reassigned.
rbindlist(temp, fill = TRUE)

Previous Answer

If I have understood you correctly, you are trying to replace NA values in a row with the nearest non-NA value to their right in the same row.

We can use na.locf with fromLast set as TRUE

# Fill each row from the right (fromLast = TRUE); na.rm = FALSE keeps NAs that
# have no value to their right. TRUE/FALSE are spelled out because `T` and `F`
# are ordinary variables that can be reassigned.
t(apply(df, 1, function(x) na.locf(x, fromLast = TRUE, na.rm = FALSE)))

# c1 c2 c3 c4 c5
#VEG_PUFF 12 12 78.43 78.43 78.43
#CHICKEN_PUFF 16 16 88.24 88.24 NA
#BAKERY_Total 28 28 28.00 84.04 84.04

R: fill missing value with prior values

Using tidyr we can use fill(data, vars):

library(tidyr)
# Replace each NA in `county` with the previous non-missing value above it.
fill(data = d, county, .direction = "down")

Fill missing values with previous values by row using dplyr

One solution is to use the na.locf function from the zoo package together with purrr::pmap in a row-wise operation. na.locf takes the most recent non-NA value and replaces all the subsequent NA values with it. Just as a reminder, c(...) in both solutions captures all values of V1:V4 in each row in every iteration. However, I excluded the id column in both, as it is not involved in our calculations.

library(zoo)
library(purrr)

# For every row: collect the values, drop the first one (id) and carry the last
# non-NA value forward; mutate() then writes the filled columns back.
df %>%
  mutate(pmap_df(., function(...) {
    row_values <- c(...)
    na.locf(row_values[-1])
  }))

id V1 V2 V3 V4
1 01 1 1 1 1
2 02 2 1 1 1
3 03 3 1 1 1
4 04 4 1 2 2

Or we can use the coalesce function from dplyr. We can replace every NA value in each row with the last non-NA value, something we did earlier with na.locf. However, this solution is a bit verbose:

# For every row: find the last non-NA value and use it for all NAs in that row.
df %>%
  mutate(pmap_df(., function(...) {
    row_values <- c(...)
    observed <- row_values[!is.na(row_values)]
    coalesce(row_values, observed[length(observed)])
  }))

id V1 V2 V3 V4
1 01 1 1 1 1
2 02 2 1 1 1
3 03 3 1 1 1
4 04 4 1 2 2

Or you could also use this:

library(purrr)

# Fill every NA with the last non-NA value of its own row.
# - do.call(coalesce, rev(df[-1])) scans the value columns right to left and
#   yields one value per row; do.call() stands in for purrr::invoke(), which is
#   deprecated.
# - The outer coalesce() fills element by element, so each NA receives the value
#   from its own row. A positional replace() here would hand out the values in
#   order of the NA slots and trigger a length-mismatch warning.
df %>%
  mutate(across(!id, ~ coalesce(.x, do.call(coalesce, rev(df[-1])))))

id V1 V2 V3 V4
1 01 1 1 1 1
2 02 2 1 1 1
3 03 3 1 1 1
4 04 4 1 2 2

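The warning message comes from `replace()`: the data contain 6 NA values in total, whereas applying `dplyr::coalesce` across the columns yields one element per row, i.e. only 4 replacement values. Note that `replace()` assigns these values to the NA slots by position rather than by row, so check the result before ignoring the warning.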

Rowwise duplicate to missing for second degree neighbors

To get the desired output we could do:

# Within each row, set every value that duplicated() flags as a repeat to NA.
# apply(..., 1, ...) returns its per-row results as columns, hence the t().
df1 <- t(apply(df, 1, function(x) replace(x, duplicated(x), NA)))

# Build replacement column labels from the de-duplicated values.
x <- df1 %>%
as_tibble() %>%
pivot_longer(
everything()
) %>%
# number the occurrences of each distinct value (0-based) so that repeated NAs
# can be told apart
group_by(value) %>%
mutate(id = row_number()-1,
# wrap every value as "X.<value>."; an NA becomes the string "X.NA."
value = paste0("X.",value,"."),
# second and later NAs get the label "NA..<id>"
value = ifelse(value == "X.NA." & id > 0, paste0(NA, "..", id), value),
# the first NA (id 0) is turned back into a real NA
value = ifelse(value == "X.NA.", NA, value)) %>%
select(-id) %>%
# str_replace() swaps only the first space in each label for a dot
mutate(value = str_replace(value, " ", ".")) %>%
pivot_wider(
names_from = name,
values_from = value
)

# NOTE(review): x is a tibble, not a character vector; this presumably relies
# on it holding a single row of labels - confirm against the input data
colnames(df1) <- x

df1
     X.Ashanti. X.Brong.Ahafo. X.Central. X.Eastern. X.Western. <NA> NA..1 NA..2 X.Northern. X.Volta. NA..3
[1,] "Ashanti" "Brong Ahafo" "Central" "Eastern" "Western" NA NA NA "Northern" "Volta" NA

Fill in NA column values with the last value that was not NA (na.locf by column)

Apply na.locf row-wise:

# Carry the last non-NA value forward along every row. na.rm = FALSE keeps
# leading NAs so each row retains its full length; t() restores the row-by-column
# layout that apply() transposes.
DF[] <- t(apply(DF, MARGIN = 1, FUN = function(row) zoo::na.locf(row, na.rm = FALSE)))
DF
# A tibble: 20 x 7
# toberevised ...2 ...3 ...4 ...5 ...6 ...7
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 [Money amounts are in th… UNITED ST… UNITED ST… UNITED STATES … UNITED STATES … UNITED STATES … UNITED STATES…
# 2 NA NA NA NA NA NA NA
# 3 NA NA NA Size of adjust… Size of adjust… Size of adjust… Size of adjus…
# 4 NA NA NA NA NA NA NA
# 5 Item All retur… Under 50000 75000 100000 200000
# 6 NA NA $50,000 [… under under under or more
# 7 NA NA NA 75000 100000 200000 200000
# 8 NA 1 2 3 4 5 6
# 9 NA NA NA NA NA NA NA
#10 Number of returns 135257620 92150166 18221115 10499106 10797979 3589254
#11 Number of joint returns 52607676 20743943 11329459 8296546 9193700 3044028
#12 Number with paid prepare… 80455243 53622647 11025624 6260725 6678965 2867282
#13 Number of exemptions 273738434 159649737 44189517 28555195 30919226 10424759
#14 Adjusted gross income (A… 7364640131 1797097083 1119634632 905336768 1429575727 2112995921
#15 Salaries and wages in AG… 114060887 75422766 16299827 9520214 9782173 3035907
#16 Salaries and wages in AG… 5161583318 1541276272 896339313 721137490 1083175205 919655038
#17 Taxable interest: Number 59553985 28527550 10891905 7636612 9092673 3405245
#18 Taxable interest: Amount 161324824 39043002 16353293 12852148 23160862 69915518
#19 Ordinary dividends: Num… 31158675 13174923 5255958 4095938 5824522 2807334
#20 Ordinary dividends: Amou… 164247298 23867893 12810282 11524298 25842394 90202431

As suggested by @G. Grothendieck na.locf0 is a better candidate here.

# Same row-wise fill using zoo::na.locf0 on each row.
DF[] <- t(apply(DF, MARGIN = 1, FUN = function(row) zoo::na.locf0(row)))

How to show names of missing variables rowwise?

Without a great view of what your data looks like, it is difficult to assess. However, you may try the sapply() function. This function can loop through variables in a data frame and return a list object, which is quite flexible in terms of what it stores. Here is an example that might fit your scenario:

# construct silly data.frame
temp <- data.frame(
  "a" = 1:10, "aa" = rep(1:5, 2), "b" = rnorm(10),
  "c" = sample(c("good", "bad", "ugly"), 10, replace = TRUE)
)
# build in some missing values
temp$a[c(1, 5)] <- NA
temp$b[c(3, 7, 9)] <- NA
temp$c[c(2, 5)] <- NA
# take a peek at the data
temp
# construct empty list to store names of missing vars
missingVars <- list()
# loop through observations; seq_len() yields an empty sequence for a zero-row
# data frame, whereas 1:nrow(temp) would iterate over c(1, 0)
for (i in seq_len(nrow(temp))) {
  # subset to one row data set
  obs.row <- temp[i, ]
  # one TRUE/FALSE per variable; vapply() is the type-stable form of sapply()
  # and always returns a logical vector
  is.missing <- vapply(obs.row, is.na, logical(1))
  # fill in missing var list with names of variables that are missing
  missingVars[[paste0("obs.", i)]] <- names(obs.row)[is.missing]
}

This should work given what you have described. You can then extract the names of the missing variables either by using the row number:

# positional lookup: the first element of the list
missingVars[[1]]

or by using the name of the list element:

# lookup by element name
missingVars[["obs.1"]]

Both would extract the names of the missing variables for the first observation.

Replace NAs with previous day value for returns

You can replace NAs with the previous values at the start of your pipe using fill() like this:

library(tidyverse)

df %>%
  # carry the previous MDAXClosing value into the NA rows
  fill(MDAXClosing, .direction = "down") %>%
  dplyr::mutate(
    Date = as.Date(Date, format = "%d.%m.%Y"),
    # integer index of the 1-week bin each date falls into
    week = cut.Date(Date, breaks = "1 week", labels = FALSE)
  ) %>%
  dplyr::group_by(Underlying, week) %>%
  # log of first over last value within each Underlying/week group
  dplyr::summarise(
    Stockreturn = log(dplyr::first(ClosingPrice) / dplyr::last(ClosingPrice)),
    MDAXreturn = log(dplyr::first(MDAXClosing) / dplyr::last(MDAXClosing))
  )

# A tibble: 3 x 4
# Groups: Underlying [1]
Underlying week Stockreturn MDAXreturn
<chr> <int> <dbl> <dbl>
1 DE0005089031 1 0.0354 0.0472
2 DE0005089031 2 0.117 0.0226
3 DE0005089031 3 -0.00780 0.0184

MDAXreturn can be computed in the same summarise statement as Stockreturn.

Data

# Sample data for the example: one row per Underlying and Date.
# Date is stored as a "dd.mm.yyyy" string; MDAXClosing is NA on some dates.
df <- tibble::tribble(
~Underlying, ~Date, ~ClosingPrice, ~MDAXClosing,
"DE0005089031", "04.01.2016", 49.501, 20256.14,
"DE0005089031", "05.01.2016", 49.7855, 20228.06,
"DE0005089031", "06.01.2016", 49.0595, 19989.88,
"DE0005089031", "07.01.2016", 47.7785, 19537.39,
"DE0005089031", "08.01.2016", 47.7435, 19321.93,
"DE0005089031", "09.01.2016", 47.816, NA,
"DE0005089031", "10.01.2016", 47.777, NA,
"DE0005089031", "11.01.2016", 48.8095, 19219.43,
"DE0005089031", "12.01.2016", 48.9545, 19627.76,
"DE0005089031", "13.01.2016", 48.0195, 19587.69,
"DE0005089031", "14.01.2016", 47.146, 19296.48,
"DE0005089031", "15.01.2016", 43.558, 18789.76,
"DE0005089031", "16.01.2016", 43.4, NA,
"DE0005089031", "17.01.2016", 43.4, NA,
"DE0005089031", "18.01.2016", 44.4815, 18662.69,
"DE0005089031", "19.01.2016", 45.6485, 19029.23,
"DE0005089031", "20.01.2016", 44.83, 18322.99
)

Replacing missing values

One dplyr and tidyr possibility could be:

# Extrapolate missing Sales per quarter, compounding by 1.05 per missing period.
df %>%
  group_by(quarter = substr(Period, 5, 6)) %>%
  # seed every missing value with the quarter's last observed Sales
  mutate(Sales_temp = replace_na(Sales, last(na.omit(Sales)))) %>%
  # treat the missing and the observed rows of each quarter separately
  group_by(quarter, na = is.na(Sales)) %>%
  mutate(
    # cumulative 1.05 growth factor along the rows of the group
    Sales_temp = Sales_temp * cumprod(rep(1.05, n())),
    Sales = coalesce(Sales, Sales_temp)
  ) %>%
  ungroup() %>%
  # keep only the first two columns
  select(1:2)

Period Sales
<chr> <dbl>
1 1999Q1 353.
2 1999Q2 426.
3 1999Q3 358.
4 1999Q4 364.
5 2000Q1 303.
6 2000Q2 394.
7 2000Q3 435.
8 2000Q4 388.
9 2001Q1 318.
10 2001Q2 414.
11 2001Q3 457.
12 2001Q4 407.
13 2002Q1 334.
14 2002Q2 435.
15 2002Q3 480.
16 2002Q4 428.
17 2003Q1 351.
18 2003Q2 456.
19 2003Q3 504.
20 2003Q4 449.

Or with just dplyr:

# Same extrapolation using dplyr only: if_else() seeds the missing values.
df %>%
  group_by(quarter = substr(Period, 5, 6)) %>%
  # seed every missing value with the quarter's last observed Sales
  mutate(Sales_temp = if_else(is.na(Sales), last(na.omit(Sales)), Sales)) %>%
  # treat the missing and the observed rows of each quarter separately
  group_by(quarter, na = is.na(Sales)) %>%
  mutate(
    # cumulative 1.05 growth factor along the rows of the group
    Sales_temp = Sales_temp * cumprod(rep(1.05, n())),
    Sales = coalesce(Sales, Sales_temp)
  ) %>%
  ungroup() %>%
  # keep only the first two columns
  select(1:2)


Related Topics



Leave a reply



Submit