Replace Na with Mode Based on Id Attribute

Replace NA with mode based on ID attribute

We can use na.aggregate from library(zoo) and specify FUN as Mode. Since this is a group-by operation, we can do it with data.table: convert the 'data.frame' to a 'data.table' (setDT(df1)) and, grouped by 'id', apply na.aggregate.

library(data.table)
library(zoo)
# Convert df1 to a data.table, then, separately for each id, let
# na.aggregate() fill the NAs of att with the value returned by Mode().
setDT(df1)[, att:= na.aggregate(att, FUN=Mode), by = id]
df1
# id att
#1: 1 v
#2: 1 v
#3: 1 v
#4: 1 c
#5: 2 c
#6: 2 v
#7: 2 c
#8: 2 c

A similar option with dplyr

library(dplyr)
df1 %>%
group_by(id) %>%
mutate(att = na.aggregate(att, FUN=Mode))

NOTE: Mode is the function from the OP's post. This also assumes that 'att' is of class character.

Suggestion to replace NA with the mode

# Fill the missing entries of `x` with its most frequent non-missing value.
# On ties, the value that appears first in `x` is used.
na_replace_Mode <- function(x) {
  candidates <- unique(na.omit(x))
  counts <- tabulate(match(x, candidates))
  most_common <- candidates[which.max(counts)]
  missing <- is.na(x)
  x[missing] <- most_common
  x
}

transform(df, var_1 = ave(var_1, group, FUN = na_replace_Mode))

group var_1
1 cake 1
2 cake 2
3 cake 1
4 cake 3
5 cake 1
6 cookie 8
7 cookie 9
8 cookie 8
9 cookie 7
10 cookie 8

You could also do:

# Most frequent non-missing value of `x`; on ties, the value seen first wins.
Mode <- function(x) {
  observed <- na.omit(x)
  levels_seen <- unique(observed)
  freq <- tabulate(match(observed, levels_seen))
  levels_seen[which.max(freq)]
}

# Within each group, fill the missing values of var_1 with that group's mode.
# replace_na() takes the vector to repair first and the fill value second, so
# the non-missing entries of var_1 are kept unchanged.
df %>%
  group_by(group) %>%
  mutate(var_1 = replace_na(var_1, Mode(var_1)))
# A tibble: 10 x 2
# Groups: group [2]
group var_1
<chr> <dbl>
1 cake 1
2 cake 2
3 cake 1
4 cake 3
5 cake 1
6 cookie 8
7 cookie 9
8 cookie 8
9 cookie 7
10 cookie 8

How can I fill NA-values in a data frame column based on the values from an other column?

Here is a base R solution. First define a function for Mode (Taken from here) and then apply it to your data frame, i.e.

# Return the most frequent value of `x` (the first one seen on ties).
#
# x:     an atomic vector.
# na.rm: if TRUE (the default), missing values are ignored, so the result is
#        usable as a fill value even when NA is the most common entry;
#        pass FALSE to count NA like any other value.
Mode <- function(x, na.rm = TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# ave() applies the function to the F2 values of each F1 group; inside it,
# replace() overwrites the NA positions of that group with Mode() of the group.
df$F2 <- with(df, ave(F2, F1, FUN = function(i) replace(i, is.na(i), Mode(i))))

df
# F1 F2
#1 A C
#2 B D
#3 A C
#4 A C
#5 B D

R: Randomly Replace Values with NA

You could replace random elements in lapply.

# Fix the RNG state so the sampled positions are reproducible.
set.seed(42)
# Column by column: draw .1*length(x) positions at random and set them to NA,
# then reassemble the list of columns into a data frame.
r1 <- as.data.frame(lapply(dat, \(x) replace(x, sample(length(x), .1*length(x)), NA)))

r1
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 NA 7 NA 10 3 11 4 4 NA 7
# 2 6 6 8 8 4 11 NA 8 10 9
# 3 1 12 4 5 12 3 10 3 11 1
# 4 3 10 6 2 11 NA 3 11 2 11
# 5 8 NA 10 12 5 7 2 9 4 10
# 6 12 4 9 12 9 2 7 9 8 8
# 7 7 5 9 4 2 12 12 3 4 4
# 8 12 5 3 1 6 1 4 7 6 NA
# 9 4 6 12 NA 5 8 4 4 6 7
# 10 3 2 11 3 NA 5 4 NA 2 4

mean(is.na(r1))
# [1] 0.1

However, this replaces .1 of the values in each column with NA. If we want each cell to be replaced with NA with a probability of .1, we could use apply with MARGIN=1:2, i.e. over both rows and columns.

# Fix the RNG state so the draws are reproducible.
set.seed(42)
# Probability with which a single cell is turned into NA.
p <- .1
# MARGIN 1:2 visits every cell; each cell keeps its value with probability
# 1 - p or becomes NA with probability p, independently of the other cells.
r2 <- as.data.frame(apply(dat, 1:2, \(x) sample(c(x, NA), 1, prob=c((1 - p), p))))

r2
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 NA 7 NA 10 3 11 4 4 12 7
# 2 NA 6 8 8 4 11 NA 8 10 9
# 3 1 NA NA 5 12 3 10 3 11 1
# 4 3 10 NA 2 NA 9 3 11 2 NA
# 5 8 12 10 12 5 7 2 9 4 NA
# 6 12 NA 9 12 NA 2 7 9 8 8
# 7 7 NA 9 4 2 12 12 3 4 4
# 8 12 5 NA 1 6 1 4 7 6 12
# 9 4 6 12 NA NA 8 4 4 6 7
# 10 3 2 11 3 3 5 4 8 2 4
mean(is.na(r2))
# [1] 0.16

If it's possible to coerce the data with as.matrix, you could treat it like a vector

# Fix the RNG state so the sampled cells are reproducible.
set.seed(42)
m <- as.matrix(dat)
# Index the matrix as one long vector and set .1*length(m) randomly chosen
# cells to NA.
m[sample(seq_along(m), .1*length(m))] <- NA

m
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# [1,] 6 7 1 10 3 11 4 NA 12 7
# [2,] 6 6 8 8 4 11 10 8 10 9
# [3,] 1 12 4 5 12 3 10 3 11 1
# [4,] 3 10 NA 2 11 9 3 NA 2 11
# [5,] 8 12 NA 12 5 7 NA 9 4 10
# [6,] 12 4 9 12 9 2 7 9 8 8
# [7,] 7 5 9 4 NA 12 12 3 4 4
# [8,] 12 NA 3 1 6 1 4 7 6 12
# [9,] 4 6 12 3 NA 8 4 4 NA 7
# [10,] 3 2 11 3 3 5 4 8 2 NA

mean(is.na(m))
# [1] 0.1

and coerce back to "data.frame".

dat_na <- as.data.frame(m) |> type.convert(as.is=TRUE)

The type.convert takes care of getting back classes like "numeric" and "character", since matrices can only have one mode. Note that you may lose attributes in the process.


Data:

dat <- structure(list(X1 = c(6L, 6L, 1L, 3L, 8L, 12L, 7L, 12L, 4L, 3L
), X2 = c(7L, 6L, 12L, 10L, 12L, 4L, 5L, 5L, 6L, 2L), X3 = c(1L,
8L, 4L, 6L, 10L, 9L, 9L, 3L, 12L, 11L), X4 = c(10L, 8L, 5L, 2L,
12L, 12L, 4L, 1L, 3L, 3L), X5 = c(3L, 4L, 12L, 11L, 5L, 9L, 2L,
6L, 5L, 3L), X6 = c(11L, 11L, 3L, 9L, 7L, 2L, 12L, 1L, 8L, 5L
), X7 = c(4L, 10L, 10L, 3L, 2L, 7L, 12L, 4L, 4L, 4L), X8 = c(4L,
8L, 3L, 11L, 9L, 9L, 3L, 7L, 4L, 8L), X9 = c(12L, 10L, 11L, 2L,
4L, 8L, 4L, 6L, 6L, 2L), X10 = c(7L, 9L, 1L, 11L, 10L, 8L, 4L,
12L, 7L, 4L)), class = "data.frame", row.names = c(NA, -10L))

How to replace NaN values by Zeroes in a column of a Pandas Dataframe?

I believe DataFrame.fillna() will do this for you.

Link to Docs for a dataframe and for a Series.

Example:

In [7]: df
Out[7]:
0 1
0 NaN NaN
1 -0.494375 0.570994
2 NaN NaN
3 1.876360 -0.229738
4 NaN NaN

In [8]: df.fillna(0)
Out[8]:
0 1
0 0.000000 0.000000
1 -0.494375 0.570994
2 0.000000 0.000000
3 1.876360 -0.229738
4 0.000000 0.000000

To fill the NaNs in only one column, select just that column. In this case I'm using inplace=True to actually change the contents of df.

In [12]: df[1].fillna(0, inplace=True)
Out[12]:
0 0.000000
1 0.570994
2 0.000000
3 -0.229738
4 0.000000
Name: 1

In [13]: df
Out[13]:
0 1
0 NaN 0.000000
1 -0.494375 0.570994
2 NaN 0.000000
3 1.876360 -0.229738
4 NaN 0.000000

EDIT:

To avoid a SettingWithCopyWarning, use the built in column-specific functionality:

df.fillna({1:0}, inplace=True)

R replace missing values if all are missing within a group

Does this work:

library(dplyr)
df %>% group_by(ID) %>% mutate(val2 = case_when(all(is.na(val2)) ~ val1, TRUE ~ val2))
# A tibble: 6 x 3
# Groups: ID [2]
ID val1 val2
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 2
3 A 3 3
4 B 4 NA
5 B 5 2
6 B 6 3

How to replace NA with mean by group / subset?

Not my own technique; I saw it on the boards a while back:

dat <- read.table(text = "id    taxa        length  width
101 collembola 2.1 0.9
102 mite 0.9 0.7
103 mite 1.1 0.8
104 collembola NA NA
105 collembola 1.5 0.5
106 mite NA NA", header=TRUE)

library(plyr)
# Replace every NA in `x` with the mean of the non-missing values of `x`.
impute.mean <- function(x) {
  fill <- mean(x, na.rm = TRUE)
  x[is.na(x)] <- fill
  x
}
dat2 <- ddply(dat, ~ taxa, transform, length = impute.mean(length),
width = impute.mean(width))

dat2[order(dat2$id), ] #plyr orders by group so we have to reorder

Edit A non plyr approach with a for loop:

# For every numeric column, replace each NA with the mean of that column
# taken over the rows that share the same `taxa` value.
# vapply() (rather than sapply()) always yields a logical vector, so which()
# also works when `dat` has no columns.
for (i in which(vapply(dat, is.numeric, logical(1)))) {
  for (j in which(is.na(dat[, i]))) {
    # rows belonging to the same taxa as the row being filled
    same_taxa <- dat[, "taxa"] == dat[j, "taxa"]
    dat[j, i] <- mean(dat[same_taxa, i], na.rm = TRUE)
  }
}

Edit many moons later here is a data.table & dplyr approach:

data.table

library(data.table)
setDT(dat)

dat[, length := impute.mean(length), by = taxa][,
width := impute.mean(width), by = taxa]

dplyr

library(dplyr)

dat %>%
group_by(taxa) %>%
mutate(
length = impute.mean(length),
width = impute.mean(width)
)

Clean dataframe column with null values by substituting a factor based on summary conditions

The lapply function filters the average-height data frame by year and finds the position with the minimum absolute difference between the player's height and the average. If the position is missing, it is updated with the closest position from y.

library(dplyr)

df_avg <- mutate(df_avg, pos = as.character(pos))

# For each player (one row per split), look up the position whose average
# height in the player's year is closest to the player's height, and use it
# to fill a missing `pos`.
df_player <- df_player %>%
  as_tibble() %>%
  # seq_len() avoids the 1:0 pitfall of 1:nrow() on an empty table
  mutate(id = seq_len(nrow(df_player)),
         pos = as.character(pos)) %>%
  split(.$id) %>%
  lapply(function(x, ref) {

    # Reference rows for this player's year, reduced to the row(s) with the
    # smallest absolute height difference. After the filter, `avg_height`
    # already holds exactly the rows of that year.
    y <- ref %>%
      as_tibble() %>%
      filter(year == x$year) %>%
      mutate(diff = abs(avg_height - as.numeric(x$height))) %>%
      top_n(1, desc(diff))

    # x has a single row, so ifelse() returns a single value: on ties the
    # first candidate position is used.
    mutate(x, pos = ifelse(is.na(pos), y$pos, pos))

  }, ref = df_avg) %>%
  bind_rows() %>%
  select(-id)

Update

This calculates and applies the means within lapply.

library(dplyr)

df_player <- tibble(id = 1:100,
year = floor(runif(100,2000,2006)),
height = runif(100,70,85),
pos = sample(c("G","F","C",NA), size = 100, replace = TRUE))

# Same idea, but the per-position average heights are computed inside the
# loop, from the players of the same year that have a known position.
df_player %>%
  # seq_len() avoids the 1:0 pitfall of 1:nrow() on an empty table
  mutate(id = seq_len(nrow(df_player))) %>%
  split(.$id) %>%
  lapply(function(x, ref) {

    # Average height per known position in this player's year, reduced to
    # the position(s) closest to the player's height.
    y <- ref %>%
      filter(year == x$year,
             !is.na(pos)) %>%
      group_by(pos) %>%
      summarise(avg_height = mean(height, na.rm = TRUE)) %>%
      mutate(diff = abs(avg_height - as.numeric(x$height))) %>%
      top_n(1, desc(diff))

    # Only a missing position is replaced; known positions are kept.
    mutate(x, pos = ifelse(is.na(pos), y$pos, pos))

  }, ref = df_player) %>%
  bind_rows() %>%
  select(-id)


Related Topics



Leave a reply



Submit