Replace <Na> in a Factor Column

Replace NA in a factor column

1) addNA If fac is a factor, addNA(fac) is the same factor but with NA added as a level. See ?addNA

To force the NA level to be 88:

facna <- addNA(fac)
levels(facna) <- c(levels(fac), 88)

giving:

> facna
 [1] 1  2  3  3  4  88 2  4  88 3 
Levels: 1 2 3 4 88

1a) This can be written in a single line as follows:

`levels<-`(addNA(fac), c(levels(fac), 88))

2) factor It can also be done in one line using the various arguments of factor like this:

factor(fac, levels = levels(addNA(fac)), labels = c(levels(fac), 88), exclude = NULL)

2a) or equivalently:

factor(fac, levels = c(levels(fac), NA), labels = c(levels(fac), 88), exclude = NULL)

3) ifelse Another approach is:

factor(ifelse(is.na(fac), 88, paste(fac)), levels = c(levels(fac), 88))

4) forcats The forcats package has a function for this:

library(forcats)

# NOTE(review): fct_explicit_na() is reported as deprecated since forcats 1.0.0
# in favour of fct_na_value_to_level(fac, "88") — confirm against the installed
# forcats version.
fct_explicit_na(fac, "88")
## [1] 1  2  3  3  4  88 2  4  88 3 
## Levels: 1 2 3 4 88

Note: We used the following for input fac

fac <- structure(c(1L, 2L, 3L, 3L, 4L, NA, 2L, 4L, NA, 3L), .Label = c("1", 
"2", "3", "4"), class = "factor")

Update: Have improved (1) and added (1a). Later added (4).

Replace NA in a series of variables by a factor level

If these are already existing factors, you can use forcats::fct_explicit_na():

library(dplyr)
library(forcats)

# Make sample data vars factors
dat <- dat %>%
  mutate(across(starts_with("s_"), as.factor))

# Add 'No' as factor level.
# The extra argument is supplied via an anonymous function rather than through
# across()'s `...`, which is deprecated as of dplyr 1.1.0.
dat %>%
  mutate(across(starts_with("s_"), function(x) fct_explicit_na(x, "No")))

# A tibble: 10 x 6
      id     x s_0   s_1   s_2   s_3  
   <dbl> <dbl> <fct> <fct> <fct> <fct>
 1     1     5 75    A     4     110  
 2     2     9 36    No    No    921  
 3     3    11 13    B     7     769  
 4     4    11 34    C     2     912  
 5     5    11 No    C     No    835  
 6     6    13 39    No    4     No   
 7     7    14 45    B     4     577  
 8     8    19 42    D     6     No   
 9     9    20 4     No    7     577  
10    10    13 28    No    3     573

R: replacing NA within factor variables as 0

For factor variables you need to first include the new level (0) in the data if it is not already present.

See this example -

# Example data: `a` is a factor containing an NA; `c` and `d` also contain NA.
df <- data.frame(a = factor(c(1, NA, 2, 5)), b = 1:4, 
                 c = c('a', 'b', 'c', NA), d = c(1, 2, NA, 1))

# Append "0" to the levels of factor `a` so it can hold that value
levels(df$a) <- c(levels(df$a), 0)
# Replace every NA in the data frame (all columns, not only `a`) with 0
df[is.na(df)] <- 0
df
#  a b c d
#1 1 1 a 1
#2 0 2 b 2
#3 2 3 c 0
#4 5 4 0 1

str(df)
#'data.frame':  4 obs. of  4 variables:
# $ a: Factor w/ 4 levels "1","2","5","0": 1 4 2 3
# $ b: int  1 2 3 4
# $ c: chr  "a" "b" "c" "0"
# $ d: num  1 2 0 1

Cannot replace NA with 0 for a factor in R

Factors need to be treated differently. If you have a data frame, add the additional level to the factor before changing the values.

pastP <- data.frame(CODE = factor(c(3, 4, NA, 1)))
levels(pastP$CODE) <- c(levels(pastP$CODE), 0)
pastP[is.na(pastP)] <- 0

If you have a tibble you can use fct_explicit_na -

library(dplyr)
library(forcats)

pastP <- tibble(CODE = factor(c(3, 4, NA, 1)))

pastP %>% mutate(CODE = fct_explicit_na(CODE, '0'))

#  CODE 
#  <fct>
#1 3    
#2 4    
#3 0    
#4 1

Replace NA values with modal value for factor variables in dplyr

  library(dplyr)
  library(tidyr)
  
  # manually get the most frequent values and tidyr::replace_na 
  most_value <- table(df$result) %>% sort(decreasing = TRUE) %>%
    head(1) %>% names()
  df %>% replace_na(list(result = most_value))
#>   id result
#> 1  1      a
#> 2  2      a
#> 3  3      a
#> 4  4      b
#> 5  5      a

Dynamically apply to multiple columns

  # Do it across multiple columns, using a small helper function.
  # most(x): name of the most frequent value in x. table() leaves out NA by
  # default, so missing values are not counted; the result is the table name,
  # i.e. a character string rather than a factor.
  most <- function(x) {
    table(x) %>% sort(decreasing = TRUE) %>% head(1) %>% names()
  }
  multiple_column <- left_join(df, df, by = "id")
  multiple_column
#>   id result.x result.y
#> 1  1        a        a
#> 2  2        a        a
#> 3  3        a        a
#> 4  4        b        b
#> 5  5     <NA>     <NA>
  
  multiple_column %>%
    mutate(across(.cols = starts_with("result"), .fns = function(x) {
      if_else(is.na(x), most(x), x)
    }))
#>   id result.x result.y
#> 1  1        a        a
#> 2  2        a        a
#> 3  3        a        a
#> 4  4        b        b
#> 5  5        a        a

^{Created on 2021-04-24 by the reprex package (v2.0.0)}

Replacing NA values by nearest value and factor

An option using nearest rolling from data.table:

# Names of the columns whose NA values should be filled: "V1" ... "V6".
cols <- paste0("V", 1L:6L)
for (x in cols) {
    # For the rows where column x is NA, update that column in place (:=).
    # The replacement values come from a join of those rows (.SD) against the
    # rows where x is not NA: exact match on factorID, nearest match on RDate
    # (roll = "nearest" applies to the last join column). The "x." prefix
    # picks the column from the non-NA lookup table rather than from .SD.
    DT[is.na(get(x)), (x) := 
        DT[!is.na(get(x))][.SD, on=.(factorID, RDate), roll="nearest", get(paste0("x.",x))]]
}

output:

   factorID       Date RDate  V1  V2  V3   V4  V5  V6
1:        1 1989-02-06  6976 318 351 172  570 260 108
2:        1 1989-05-13  7072  77 305 591  835 801 550
3:        1 1989-05-29  7088  77 305 591  835 801 550
4:        1 1989-06-14  7104 252 305 286  835 271  85
5:        2 1989-02-06  6976 236 389 323 2078 908 373
6:        2 1989-05-13  7072  77  62 591 2001 801 550
7:        2 1989-05-29  7088  55  62 410 2001 801 550
8:        2 1989-06-14  7104 351 508 456 1618 780 421

data:

library(data.table)
DT <- fread("factorID   Date         RDate   V1   V2   V3   V4   V5   V6
1          1989-02-06   6976    318  351  172  570  260  108
1          1989-05-13   7072    77   NA   591  NA   801  550
1          1989-05-29   7088    NA   NA   NA   NA   NA   NA
1          1989-06-14   7104    252  305  286  835  271  85
2          1989-02-06   6976    236  389  323  2078 908  373
2          1989-05-13   7072    77   NA   591  NA   801  550
2          1989-05-29   7088    55   62   410  2001 NA   NA
2          1989-06-14   7104    351  508  456  1618 780  421")

Note that for factorID=1 and V2, 1989-06-14 is the nearest date to both 1989-05-13 and 1989-05-29, and hence 305 should be used to fill these NA rows.

Replace factor value by NA if condition

We can use replace() and avoid steps 2 and 4. It keeps the column as a factor and, unlike ifelse(), does not coerce the factor to integer (ifelse() does so unless the factor is first converted to character).

library(dplyr)
x %>%
   mutate(PayeeID_Hash = droplevels(replace(PayeeID_Hash, Payee == "0", NA)))
# A tibble: 3 x 2
#  Payee PayeeID_Hash
#  <fct> <fct>       
#1 <NA>  <NA>        
#2 0     <NA>        
#3 x     0xB672841

Replace <Na> in a Factor Column