Replace NA with mode based on ID attribute
We can use na.aggregate from library(zoo) and specify the FUN as Mode. If this is a group-by operation, we can do this using data.table: convert the 'data.frame' to a 'data.table' (setDT(df1)) and, grouped by 'id', apply na.aggregate:
library(data.table)
library(zoo)
# Convert df1 to a data.table by reference, then fill the NAs in att
# with the Mode of each id group.
setDT(df1)
df1[, att := na.aggregate(att, FUN = Mode), by = id]
df1
# id att
#1: 1 v
#2: 1 v
#3: 1 v
#4: 1 c
#5: 2 c
#6: 2 v
#7: 2 c
#8: 2 c
A similar option with dplyr
library(dplyr)
# Same imputation with dplyr: within each id group, fill the NAs in att
# with that group's Mode.
mutate(
  group_by(df1, id),
  att = na.aggregate(att, FUN = Mode)
)
NOTE: Mode is the function from the OP's post. Also, this assumes that 'att' is of class character.
Suggestion to replace NA with the mode
# Replace the missing entries of x with its most frequent non-missing value.
#
# x: an atomic vector, possibly containing NA.
# Returns x with every NA replaced by the mode of the observed values;
# ties go to the value that appears first in x.
na_replace_Mode <- function(x) {
  missing_idx <- is.na(x)
  candidates <- unique(x[!missing_idx])
  # match() yields NA for the missing entries, which tabulate() ignores.
  counts <- tabulate(match(x, candidates))
  x[missing_idx] <- candidates[which.max(counts)]
  x
}
# Apply na_replace_Mode to var_1 separately within each level of group;
# transform() returns a modified copy of df rather than changing df itself.
transform(df, var_1 = ave(var_1, group, FUN = na_replace_Mode))
group var_1
1 cake 1
2 cake 2
3 cake 1
4 cake 3
5 cake 1
6 cookie 8
7 cookie 9
8 cookie 8
9 cookie 7
10 cookie 8
You could also do:
# Most frequent non-missing value of x.
#
# x: an atomic vector, possibly containing NA.
# Returns a length-one vector holding the mode of the observed values;
# ties go to the value that appears first.
Mode <- function(x) {
  observed <- na.omit(x)
  distinct <- unique(observed)
  freq <- tabulate(match(observed, distinct))
  distinct[which.max(freq)]
}
# Fill only the missing var_1 values with the Mode of their group.
# replace_na() takes the vector to patch first and the replacement second;
# passing just Mode(var_1) would overwrite every row with the group mode
# instead of leaving the observed values untouched.
df %>%
  group_by(group) %>%
  mutate(var_1 = replace_na(var_1, Mode(var_1)))
# A tibble: 10 x 2
# Groups: group [2]
group var_1
<chr> <dbl>
1 cake 1
2 cake 2
3 cake 1
4 cake 3
5 cake 1
6 cookie 8
7 cookie 9
8 cookie 8
9 cookie 7
10 cookie 8
How can I fill NA-values in a data frame column based on the values from an other column?
Here is a base R solution. First define a function for Mode (taken from here) and then apply it to your data frame, i.e.
# Most frequent value of x (NA is counted like any other value).
#
# x: an atomic vector.
# Returns a length-one vector holding the mode; ties go to the value
# that appears first in x.
Mode <- function(x) {
  distinct <- unique(x)
  positions <- match(x, distinct)
  distinct[which.max(tabulate(positions))]
}
# Within each F1 group, replace the missing F2 values with the group's mode.
# The mode is computed over the non-missing values only: Mode() counts NA
# like any other value, so a group where NA is the most frequent entry would
# otherwise have its NAs "filled" with NA.
df$F2 <- with(df, ave(F2, F1, FUN = function(i) {
  replace(i, is.na(i), Mode(i[!is.na(i)]))
}))
df
# F1 F2
#1 A C
#2 B D
#3 A C
#4 A C
#5 B D
R: Randomly Replace Values with NA
You could use replace() on random elements inside lapply().
set.seed(42)
# For every column, blank out a random 10% of its entries.
r1 <- as.data.frame(
  lapply(dat, function(column) {
    replace(column, sample(length(column), .1 * length(column)), NA)
  })
)
r1
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 NA 7 NA 10 3 11 4 4 NA 7
# 2 6 6 8 8 4 11 NA 8 10 9
# 3 1 12 4 5 12 3 10 3 11 1
# 4 3 10 6 2 11 NA 3 11 2 11
# 5 8 NA 10 12 5 7 2 9 4 10
# 6 12 4 9 12 9 2 7 9 8 8
# 7 7 5 9 4 2 12 12 3 4 4
# 8 12 5 3 1 6 1 4 7 6 NA
# 9 4 6 12 NA 5 8 4 4 6 7
# 10 3 2 11 3 NA 5 4 NA 2 4
# Overall fraction of cells in r1 that are NA.
mean(is.na(r1))
# [1] 0.1
However, this replaces .1 of the values in each column with NA. If we want each cell to be replaced with NA with a probability of .1, we could use apply with MARGIN=1:2.
set.seed(42)
p <- .1
# Visit every cell (MARGIN = 1:2) and independently swap it for NA with
# probability p, keeping the original value with probability 1 - p.
r2 <- as.data.frame(
  apply(dat, 1:2, function(cell) {
    sample(c(cell, NA), 1, prob = c(1 - p, p))
  })
)
r2
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# 1 NA 7 NA 10 3 11 4 4 12 7
# 2 NA 6 8 8 4 11 NA 8 10 9
# 3 1 NA NA 5 12 3 10 3 11 1
# 4 3 10 NA 2 NA 9 3 11 2 NA
# 5 8 12 10 12 5 7 2 9 4 NA
# 6 12 NA 9 12 NA 2 7 9 8 8
# 7 7 NA 9 4 2 12 12 3 4 4
# 8 12 5 NA 1 6 1 4 7 6 12
# 9 4 6 12 NA NA 8 4 4 6 7
# 10 3 2 11 3 3 5 4 8 2 4
# Overall fraction of cells in r2 that are NA (random, so only close to p).
mean(is.na(r2))
# [1] 0.16
If it's possible to coerce the data with as.matrix, you could treat it like a vector:
set.seed(42)
m <- as.matrix(dat)
# A matrix can be indexed like one long vector, so draw 10% of all cell
# positions at once and blank them out.
m <- replace(m, sample(seq_along(m), .1 * length(m)), NA)
m
# X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# [1,] 6 7 1 10 3 11 4 NA 12 7
# [2,] 6 6 8 8 4 11 10 8 10 9
# [3,] 1 12 4 5 12 3 10 3 11 1
# [4,] 3 10 NA 2 11 9 3 NA 2 11
# [5,] 8 12 NA 12 5 7 NA 9 4 10
# [6,] 12 4 9 12 9 2 7 9 8 8
# [7,] 7 5 9 4 NA 12 12 3 4 4
# [8,] 12 NA 3 1 6 1 4 7 6 12
# [9,] 4 6 12 3 NA 8 4 4 NA 7
# [10,] 3 2 11 3 3 5 4 8 2 NA
# Overall fraction of cells in m that are NA.
mean(is.na(m))
# [1] 0.1
and coerce back to "data.frame".
# Back to a data frame; type.convert() re-derives each column's class.
dat_na <- type.convert(as.data.frame(m), as.is = TRUE)
The type.convert takes care of getting back classes like "numeric" and "character", since matrices can only have one mode. Note that you may lose attributes in the process.
Data:
# Example data: 10 rows by 10 integer columns (X1..X10), no missing values.
dat <- data.frame(
  X1 = c(6L, 6L, 1L, 3L, 8L, 12L, 7L, 12L, 4L, 3L),
  X2 = c(7L, 6L, 12L, 10L, 12L, 4L, 5L, 5L, 6L, 2L),
  X3 = c(1L, 8L, 4L, 6L, 10L, 9L, 9L, 3L, 12L, 11L),
  X4 = c(10L, 8L, 5L, 2L, 12L, 12L, 4L, 1L, 3L, 3L),
  X5 = c(3L, 4L, 12L, 11L, 5L, 9L, 2L, 6L, 5L, 3L),
  X6 = c(11L, 11L, 3L, 9L, 7L, 2L, 12L, 1L, 8L, 5L),
  X7 = c(4L, 10L, 10L, 3L, 2L, 7L, 12L, 4L, 4L, 4L),
  X8 = c(4L, 8L, 3L, 11L, 9L, 9L, 3L, 7L, 4L, 8L),
  X9 = c(12L, 10L, 11L, 2L, 4L, 8L, 4L, 6L, 6L, 2L),
  X10 = c(7L, 9L, 1L, 11L, 10L, 8L, 4L, 12L, 7L, 4L)
)
How to replace NaN values by Zeroes in a column of a Pandas Dataframe?
I believe DataFrame.fillna() will do this for you.
Link to Docs for a dataframe and for a Series.
Example:
In [7]: df
Out[7]:
0 1
0 NaN NaN
1 -0.494375 0.570994
2 NaN NaN
3 1.876360 -0.229738
4 NaN NaN
In [8]: df.fillna(0)
Out[8]:
0 1
0 0.000000 0.000000
1 -0.494375 0.570994
2 0.000000 0.000000
3 1.876360 -0.229738
4 0.000000 0.000000
To fill the NaNs in only one column, select just that column. In this case I'm using inplace=True to actually change the contents of df.
In [12]: df[1].fillna(0, inplace=True)
Out[12]:
0 0.000000
1 0.570994
2 0.000000
3 -0.229738
4 0.000000
Name: 1
In [13]: df
Out[13]:
0 1
0 NaN 0.000000
1 -0.494375 0.570994
2 NaN 0.000000
3 1.876360 -0.229738
4 NaN 0.000000
EDIT:
To avoid a SettingWithCopyWarning, use the built-in column-specific functionality:
# The dict maps column label -> fill value, so only column 1 is filled;
# inplace=True modifies df itself instead of returning a new frame.
df.fillna({1:0}, inplace=True)
R replace missing values if all are missing within a group
Does this work:
library(dplyr)
# Per ID group: if every val2 is missing, take val1 instead; otherwise keep
# val2 unchanged (partially missing groups keep their NAs).
df %>%
  group_by(ID) %>%
  mutate(
    val2 = case_when(
      all(is.na(val2)) ~ val1,
      TRUE ~ val2
    )
  )
# A tibble: 6 x 3
# Groups: ID [2]
ID val1 val2
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 2
3 A 3 3
4 B 4 NA
5 B 5 2
6 B 6 3
How to replace NA with mean by group / subset?
Not my own technique; I saw it on the boards a while back:
# Example data: six specimens from two taxa; ids 104 and 106 are missing
# both length and width.
dat <- read.table(text = "id taxa length width
101 collembola 2.1 0.9
102 mite 0.9 0.7
103 mite 1.1 0.8
104 collembola NA NA
105 collembola 1.5 0.5
106 mite NA NA", header=TRUE)
library(plyr)
# Replace the missing entries of x with the mean of its observed values.
#
# x: a numeric vector, possibly containing NA.
# Returns x with each NA replaced by mean(x, na.rm = TRUE).
impute.mean <- function(x) {
  missing_idx <- is.na(x)
  x[missing_idx] <- mean(x, na.rm = TRUE)
  x
}
# Split dat by taxa and, within each piece, fill the missing length and
# width with that taxon's mean.
dat2 <- ddply(
  .data = dat,
  .variables = ~ taxa,
  .fun = transform,
  length = impute.mean(length),
  width = impute.mean(width)
)
# plyr returns the rows ordered by group, so restore the original id order.
dat2[order(dat2$id), ]
Edit: A non-plyr approach with a for loop:
# Fill each missing numeric cell with the mean of its column taken over the
# rows that share the same taxa.
# vapply() is used rather than sapply() so the column test always yields a
# logical vector, even for a data frame with no columns.
numeric_cols <- which(vapply(dat, is.numeric, logical(1)))
for (i in numeric_cols) {
  for (j in which(is.na(dat[, i]))) {
    # Rows belonging to the same taxon as the row being filled.
    same_taxon <- dat[, "taxa"] == dat[j, "taxa"]
    dat[j, i] <- mean(dat[same_taxon, i], na.rm = TRUE)
  }
}
Edit many moons later here is a data.table & dplyr approach:
data.table
library(data.table)
# Convert dat to a data.table by reference, then impute both measurements
# per taxa in a single grouped := call.
setDT(dat)
dat[, `:=`(
  length = impute.mean(length),
  width = impute.mean(width)
), by = taxa]
dplyr
library(dplyr)
# Group by taxa and fill the missing length / width with the group means.
mutate(
  group_by(dat, taxa),
  length = impute.mean(length),
  width = impute.mean(width)
)
Clean dataframe column with null values by substituting a factor based on summary conditions
The lapply function filters the average height data frame by year and finds the position with the minimum absolute difference between the player's height and the average. If the position is missing, it is updated with the closest position from y.
library(dplyr)
# Work with pos as character in both tables (presumably it may be stored as
# a factor; ifelse() below would otherwise return integer codes).
df_avg <- mutate(df_avg, pos = as.character(pos))
df_player <- df_player %>%
  as_tibble() %>%
  # Temporary row id so every player can be processed on its own;
  # seq_len() also behaves for a zero-row input, unlike 1:nrow().
  mutate(id = seq_len(nrow(df_player)),
         pos = as.character(pos)) %>%
  split(.$id) %>%
  lapply(function(x, ref) {
    # Reference rows for this player's year, keeping the one whose average
    # height is closest to the player's height. avg_height is read from the
    # already-filtered rows, so it always lines up with them.
    y <- ref %>%
      as_tibble() %>%
      filter(year == x$year) %>%
      mutate(diff = abs(avg_height - as.numeric(x$height))) %>%
      top_n(1, desc(diff))
    # Only missing positions are filled; x has one row, so ifelse() uses the
    # first candidate if several are tied.
    mutate(x, pos = ifelse(is.na(pos), y$pos, pos))
  }, ref = df_avg) %>%
  bind_rows() %>%
  select(-id)
Update
This calculates and applies the means within lapply.
library(dplyr)
# Simulated roster: 100 players with a random year (2000-2005), a random
# height in [70, 85) and a position that may be missing.
df_player <- tibble(
  id = seq_len(100),
  year = floor(runif(n = 100, min = 2000, max = 2006)),
  height = runif(n = 100, min = 70, max = 85),
  pos = sample(x = c("G", "F", "C", NA), size = 100, replace = TRUE)
)
# Fill each missing position with the position whose average height (among
# players of the same year with a known position) is closest to the
# player's own height. The result is returned, not assigned.
df_player %>%
  # seq_len() instead of 1:nrow() so a zero-row input yields no ids.
  mutate(id = seq_len(nrow(df_player))) %>%
  split(.$id) %>%
  lapply(function(x, ref) {
    # Per-position average heights for this player's year, keeping the
    # position with the smallest distance to the player's height.
    y <- ref %>%
      filter(year == x$year,
             !is.na(pos)) %>%
      group_by(pos) %>%
      summarise(avg_height = mean(height, na.rm = TRUE)) %>%
      mutate(diff = abs(avg_height - as.numeric(x$height))) %>%
      top_n(1, desc(diff))
    # Known positions are kept; only NA is replaced.
    mutate(x, pos = ifelse(is.na(pos), y$pos, pos))
  }, ref = df_player) %>%
  bind_rows() %>%
  select(-id)
Related Topics
How to Convert All Column Data Type to Numeric and Character Dynamically
Dplyr Write a Function with Column Names as Inputs
R Split a Column into Multiple Column by Pattern
Stats on Every N Rows for Each Column
Sequentially Rename 100+ Columns Having Idiosyncratic Names
Filter Group of Rows Based on Sum of Values from Different Column
Dealing with Nan's in Matlab Functions
R: Miscellaneous Errors While Trying to Plot Graphs
Distance Calculation on Large Vectors [Performance]
R - Help in Converting Factor to Date (%M/%D/%Y %H:%M)
Predict.Lm in R Fails to Recognize Newdata
Replace Column Values with Column Name Using Dplyr's Transmute_All
Set Standard Legend Key Size with Long Label Names Ggplot
Downgrade R Version (No Issues with Bioconductor Installation)