Replace <NA> in a Factor Column

Replace NA in a factor column

1) addNA: If fac is a factor, addNA(fac) is the same factor but with NA added as a level. See ?addNA.

To force the NA level to be 88:

# addNA() returns the factor with NA appended as an extra (last) level.
facna <- addNA(fac)
# Relabel the levels: the original ones keep their names, the NA level becomes 88.
levels(facna) <- c(levels(fac), 88)

giving:

> facna
[1] 1 2 3 3 4 88 2 4 88 3
Levels: 1 2 3 4 88

1a) This can be written in a single line as follows:

# Call the replacement function `levels<-` directly, so the relabelled factor
# is returned as a value without an intermediate variable.
`levels<-`(addNA(fac), c(levels(fac), 88))

2) factor It can also be done in one line using the various arguments of factor like this:

# Rebuild the factor keeping NA as a level (exclude = NULL) and relabel the
# levels so that the NA level is shown as 88.
factor(fac, levels = levels(addNA(fac)), labels = c(levels(fac), 88), exclude = NULL)

2a) or equivalently:

# Same idea, but the level set is spelled out as c(levels(fac), NA) instead of
# being taken from addNA().
factor(fac, levels = c(levels(fac), NA), labels = c(levels(fac), 88), exclude = NULL)

3) ifelse Another approach is:

# Work on the character values: substitute 88 for the missing entries, then
# rebuild the factor with 88 appended to the original levels.
factor(ifelse(is.na(fac), 88, paste(fac)), levels = c(levels(fac), 88))

4) forcats The forcats package has a function for this:

library(forcats)

# Give the missing values an explicit level named "88".
# NOTE(review): forcats 1.0.0 deprecated fct_explicit_na() in favour of
# fct_na_value_to_level(); confirm which one the installed version expects.
fct_explicit_na(fac, "88")
## [1] 1 2 3 3 4 88 2 4 88 3
## Levels: 1 2 3 4 88

Note: We used the following for input fac

# Ten-element factor with levels "1" to "4" and two missing entries.
fac <- factor(c(1L, 2L, 3L, 3L, 4L, NA, 2L, 4L, NA, 3L), levels = 1:4)

Update: Have improved (1) and added (1a). Later added (4).

Replace NA in a series of variables by a factor level

If these are already existing factors, you can use forcats::fct_explicit_na():

library(dplyr)
library(forcats)

# Make sample data vars factors
dat <- dat %>%
  mutate(across(starts_with("s_"), as.factor))

# Add 'No' as factor level.
# The extra argument is supplied through an explicit function rather than
# through across()'s `...`, which dplyr 1.1.0 deprecated.
dat %>%
  mutate(across(starts_with("s_"), function(x) fct_explicit_na(x, "No")))

# A tibble: 10 x 6
id x s_0 s_1 s_2 s_3
<dbl> <dbl> <fct> <fct> <fct> <fct>
1 1 5 75 A 4 110
2 2 9 36 No No 921
3 3 11 13 B 7 769
4 4 11 34 C 2 912
5 5 11 No C No 835
6 6 13 39 No 4 No
7 7 14 45 B 4 577
8 8 19 42 D 6 No
9 9 20 4 No 7 577
10 10 13 28 No 3 573

R: replacing NA within factor variables as 0

For factor variables you need to first include the new level (0) in the data if it is not already present.

See this example -

# Sample data: a factor, an integer, a character and a numeric column;
# `a`, `c` and `d` each contain one missing value.
df <- data.frame(
  a = factor(c(1, NA, 2, 5)),
  b = 1:4,
  c = c("a", "b", "c", NA),
  d = c(1, 2, NA, 1)
)

# A factor can only store values that are among its levels, so register "0"
# as an additional level of `a` first.
levels(df$a) <- append(levels(df$a), "0")

# Overwrite every missing cell of the data frame with 0.
df[is.na(df)] <- 0
df
# a b c d
#1 1 1 a 1
#2 0 2 b 2
#3 2 3 c 0
#4 5 4 0 1

str(df)
#'data.frame': 4 obs. of 4 variables:
# $ a: Factor w/ 4 levels "1","2","5","0": 1 4 2 3
# $ b: int 1 2 3 4
# $ c: chr "a" "b" "c" "0"
# $ d: num 1 2 0 1

Cannot replace NA with 0 for a factor in R

Factors need to be treated differently. If you have a data frame, add the new level to the factor before changing the values.

# One-column data frame whose factor column contains a missing value.
pastP <- data.frame(CODE = factor(c(3, 4, NA, 1)))
# "0" has to exist as a level before it can be stored in the factor.
levels(pastP$CODE) <- append(levels(pastP$CODE), "0")
# Overwrite the missing cell with the new level.
pastP[is.na(pastP)] <- 0

If you have a tibble you can use fct_explicit_na -

library(dplyr)
library(forcats)

# One-column tibble whose factor column contains a missing value.
pastP <- tibble(CODE = factor(c(3, 4, NA, 1)))

# Turn the missing value into an explicit "0" level.
mutate(pastP, CODE = fct_explicit_na(CODE, na_level = "0"))

# CODE
# <fct>
#1 3
#2 4
#3 0
#4 1

Replace NA values with modal value for factor variables in dplyr


  library(dplyr)
library(tidyr)

# Work out the most frequent value of `result` by hand, then pass it to
# tidyr::replace_na() as the fill value for that column.
most_value <- names(head(sort(table(df$result), decreasing = TRUE), 1))
replace_na(df, list(result = most_value))
#> id result
#> 1 1 a
#> 2 2 a
#> 3 3 a
#> 4 4 b
#> 5 5 a

Apply dynamically to multiple columns

  # Do it across multiple columns - still relying on a small helper function.
# most(): name of the most frequent value in `x`, as a character string.
# table() leaves NA out of the counts by default, so missing values never win.
most <- function(x) {
  counts <- sort(table(x), decreasing = TRUE)
  names(head(counts, 1))
}
# Join df to itself on "id" to get two copies of the result column
# (result.x and result.y) to demonstrate the multi-column case.
multiple_column <- left_join(df, df, by = "id")
multiple_column
#> id result.x result.y
#> 1 1 a a
#> 2 2 a a
#> 3 3 a a
#> 4 4 b b
#> 5 5 <NA> <NA>

# For every column whose name starts with "result", replace each NA with the
# value most() returns for that column and keep the other entries as they are.
multiple_column %>%
mutate(across(.cols = starts_with("result"), .fns = function(x) {
if_else(is.na(x), most(x), x)
}))
#> id result.x result.y
#> 1 1 a a
#> 2 2 a a
#> 3 3 a a
#> 4 4 b b
#> 5 5 a a

Created on 2021-04-24 by the reprex package (v2.0.0)

Replacing NA values by nearest value and factor

An option using a rolling join to the nearest value in data.table:

# Names of the value columns to fill: V1 ... V6.
cols <- paste0("V", 1L:6L)
# Fill the gaps one column at a time; x holds the column name as a string.
for (x in cols) {
# For the rows where column x is NA, assign with := the value found by joining
# those rows (.SD) onto the rows where x is not NA, matching on factorID and
# rolling to the nearest RDate. The "x." prefix picks the column from the
# non-NA table rather than from the rows being filled.
DT[is.na(get(x)), (x) :=
DT[!is.na(get(x))][.SD, on=.(factorID, RDate), roll="nearest", get(paste0("x.",x))]]
}

output:

   factorID       Date RDate  V1  V2  V3   V4  V5  V6
1: 1 1989-02-06 6976 318 351 172 570 260 108
2: 1 1989-05-13 7072 77 305 591 835 801 550
3: 1 1989-05-29 7088 77 305 591 835 801 550
4: 1 1989-06-14 7104 252 305 286 835 271 85
5: 2 1989-02-06 6976 236 389 323 2078 908 373
6: 2 1989-05-13 7072 77 62 591 2001 801 550
7: 2 1989-05-29 7088 55 62 410 2001 801 550
8: 2 1989-06-14 7104 351 508 456 1618 780 421

data:

library(data.table)
# Example input read from inline text: two factorID groups observed on the same
# four dates, with NA marking the gaps in V1..V6.
DT <- fread("factorID Date RDate V1 V2 V3 V4 V5 V6
1 1989-02-06 6976 318 351 172 570 260 108
1 1989-05-13 7072 77 NA 591 NA 801 550
1 1989-05-29 7088 NA NA NA NA NA NA
1 1989-06-14 7104 252 305 286 835 271 85
2 1989-02-06 6976 236 389 323 2078 908 373
2 1989-05-13 7072 77 NA 591 NA 801 550
2 1989-05-29 7088 55 62 410 2001 NA NA
2 1989-06-14 7104 351 508 456 1618 780 421")

Note that for factorID=1 and V2, 1989-06-14 is the nearest date with a non-missing value to both 1989-05-13 and 1989-05-29, and hence 305 is used to fill these NA rows.

Replace factor value by NA if condition

We can use replace and avoid steps 2 and 4. It keeps the factor column as a factor and doesn't coerce the factor to integer, as ifelse does (unless the factor is first converted to character).

library(dplyr)
# replace() sets PayeeID_Hash to NA on the rows where Payee == "0";
# droplevels() then removes any levels that are no longer in use.
x %>%
mutate(PayeeID_Hash = droplevels(replace(PayeeID_Hash, Payee == "0", NA)))
# A tibble: 3 x 2
# Payee PayeeID_Hash
# <fct> <fct>
#1 <NA> <NA>
#2 0 <NA>
#3 x 0xB672841


Related Topics



Leave a reply



Submit