Omit Inf from Row Sum in R

How to remove rows with inf from a dataframe in R

To remove the rows with +/-Inf I'd suggest the following:

df <- df[!is.infinite(rowSums(df)),]

or, equivalently,

df <- df[is.finite(rowSums(df)),]

The second option (the one with is.finite() and without the negation) also removes rows containing NA values, in case that has not already been done.
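For instance, on a small toy data frame (values made up just to illustrate the difference):

df <- data.frame(x = c(1, Inf, NA, 4), y = c(5, 6, 7, 8))

df[!is.infinite(rowSums(df)), ]   # keeps the NA row, since rowSums() gives NA there, not Inf
#    x y
# 1  1 5
# 3 NA 7
# 4  4 8

df[is.finite(rowSums(df)), ]      # drops both the Inf row and the NA row
#   x y
# 1 1 5
# 4 4 8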

How can I remove rows with inf from my dataframe in R?

To remove rows with Inf values you can use:

ICS_data[rowSums(sapply(ICS_data[-ncol(ICS_data)], is.infinite)) == 0, ]

Or using dplyr:

library(dplyr)
ICS_data %>% filter_at(-ncol(.), all_vars(is.finite(.)))
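Note that filter_at() is superseded in current dplyr; a rough equivalent with if_all() (still skipping the last column, as above) would be:

library(dplyr)
ICS_data %>% filter(if_all(-last_col(), is.finite))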

We can break the code into smaller steps to understand how it works.

Consider this data.

data <- data.frame(a = 1:4, b = 2:5, c = letters[1:4], stringsAsFactors = TRUE)
data$b[2] <- Inf
data
#  a   b c
#1 1   2 a
#2 2 Inf b
#3 3   4 c
#4 4   5 d

First we remove the last column from data. We drop it because it is a factor and we don't want to include it when looking for infinite values, which leaves only the numeric columns.

data[-ncol(data)]

#  a   b
#1 1   2
#2 2 Inf
#3 3   4
#4 4   5

Next, using sapply, we check each column for infinite values with is.infinite. This returns a matrix of TRUE/FALSE values.

sapply(data[-ncol(data)], is.infinite)

#         a     b
#[1,] FALSE FALSE
#[2,] FALSE  TRUE
#[3,] FALSE FALSE
#[4,] FALSE FALSE

We can sum these logical values using rowSums. Here TRUE is considered as 1 and FALSE as 0.

rowSums(sapply(data[-ncol(data)], is.infinite))
#[1] 0 1 0 0

From this we can see that the second row has 1 infinite value and needs to be dropped, so we select only the rows with 0 infinite values.

data[rowSums(sapply(data[-ncol(data)], is.infinite)) == 0, ]

#  a b c
#1 1 2 a
#3 3 4 c
#4 4 5 d

Remove infinite values from a matrix in R

Use is.finite. I presume this is how you wish to "remove" those -Inf values:

m[!is.finite(m)] <- NA
colMeans(m, na.rm=TRUE)
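A quick illustration, assuming a small numeric matrix m (toy values):

m <- matrix(c(1, -Inf, 3, 4, 5, Inf), nrow = 2)
m[!is.finite(m)] <- NA
colMeans(m, na.rm = TRUE)
# [1] 1.0 3.5 5.0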

R: Remove -Inf and Inf from a vector

Remember that is.na and is.infinite operate element-wise on vectors, returning logical vectors. So you can filter the vector like so:

> x <- c(1, 2, NA, Inf, -Inf)
> x[!is.na(x) & !is.infinite(x)]
[1] 1 2

If this needs to be done inline, consider putting the above in a function.
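For example, a small helper along those lines (drop_non_finite is just an illustrative name, not an existing function):

drop_non_finite <- function(x) x[!is.na(x) & !is.infinite(x)]

drop_non_finite(c(1, 2, NA, Inf, -Inf))
# [1] 1 2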

Cleaning `Inf` values from an R dataframe

Option 1

Use the fact that a data.frame is a list of columns, then use do.call to recreate a data.frame.

do.call(data.frame, lapply(DT, function(x) replace(x, is.infinite(x), NA)))
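For instance, on a toy data frame (names made up for the illustration):

DT <- data.frame(a = c(1, Inf, 3), b = c(-Inf, 2, 3))
do.call(data.frame, lapply(DT, function(x) replace(x, is.infinite(x), NA)))
#    a  b
# 1  1 NA
# 2 NA  2
# 3  3  3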

Option 2 -- data.table

You could use data.table and set. This avoids some internal copying.

DT <- data.table(dat)
invisible(lapply(names(DT), function(.name) set(DT, which(is.infinite(DT[[.name]])), j = .name, value = NA)))

Or using column numbers (possibly faster if there are a lot of columns):

for (j in 1:ncol(DT)) set(DT, which(is.infinite(DT[[j]])), j, NA)

Timings

# some `big(ish)` data
dat <- data.frame(a = rep(c(1, Inf), 1e6), b = rep(c(Inf, 2), 1e6),
                  c = rep(c('a', 'b'), 1e6), d = rep(c(1, Inf), 1e6),
                  e = rep(c(Inf, 2), 1e6))
# create data.table
library(data.table)
DT <- data.table(dat)

# replace (@mnel)
system.time(na_dat <- do.call(data.frame, lapply(dat, function(x) replace(x, is.infinite(x), NA))))
# user system elapsed
# 0.52 0.01 0.53

# is.na (@dwin)
system.time(is.na(dat) <- sapply(dat, is.infinite))
# user system elapsed
# 32.96 0.07 33.12

# modified is.na
system.time(is.na(dat) <- do.call(cbind,lapply(dat, is.infinite)))
# user system elapsed
# 1.22 0.38 1.60

# data.table (@mnel)
system.time(invisible(lapply(names(DT), function(.name) set(DT, which(is.infinite(DT[[.name]])), j = .name, value = NA))))
# user system elapsed
# 0.29 0.02 0.31

data.table is the quickest. Using sapply slows things down noticeably.

Replace infinite values with max/min value of a row

You could use raster::clamp

as.data.frame(t(apply(x, 1, function(i) {
  raster::clamp(i, min(i[is.finite(i)]), max(i[is.finite(i)]))
})))

#>   val1 val2 val3 val4
#> 1    1    4   13   13
#> 2    4    2    1   51
#> 3    5   46    4    3
#> 4    2    2    9    2
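For reference, the same clamping can be sketched in base R without raster (clamp_row is a hypothetical helper and assumes x is all numeric):

clamp_row <- function(i) {
  finite <- i[is.finite(i)]
  # cap Inf at the row maximum and -Inf at the row minimum of the finite values
  pmin(pmax(i, min(finite)), max(finite))
}
as.data.frame(t(apply(x, 1, clamp_row)))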

How to remove NaN and Inf values from data.table where all columns are character types in R

One way would be to find the index of the rows containing NaN:

unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])
[1]  1  2  7  8  9 10 11

And then use data.table's `!` prefix on those row indices to drop them:

data[!unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])]
         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1

Some benchmarks

Unit: milliseconds
    expr        min         lq       mean     median         uq       max neval cld
      me   4.513141   5.545293   7.068744   6.707279   8.356170  31.30188   100   a
langtang   3.535727   3.646819   8.718629   6.318445   6.983275  59.76049   100   a
   akrun  51.169168 195.102026 208.889413 204.564707 216.545022 274.02575   100   c
    paul  11.235627 145.195062 146.721146 146.670909 148.432261 200.56718   100   b
 Macosso 370.269687 448.143027 468.074160 457.499264 497.636319 553.70491   100   d

data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26", 
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
data = do.call("rbind", replicate(1000, data, simplify = FALSE))

library(data.table)
library(dplyr)
library(dtplyr)

res = microbenchmark::microbenchmark(
  me = data[!unique(which(data == NaN, arr.ind = T)[, 1])],

  langtang = na.omit(cbind(data[, .(date, volume)],
                           data[, lapply(.SD, as.numeric), .SDcols = 2:5])),

  akrun = {data <- type.convert(data, as.is = TRUE);
    data[data[, Reduce(`&`, lapply(.SD, function(x)
      !is.nan(x) & is.finite(x))), .SDcols = -1]]},

  paul = data %>%
    lazy_dt %>%
    filter(across(2:5, ~ .x != "NaN")) %>%
    as.data.table,

  Macosso = {data$Row <- row.names(data);
    rm_rw <- data[apply(data, 1,
                        function(X) any(X == "NaN" | X == "Inf")), ] %>% pull(Row);
    data[!row.names(data) %in% rm_rw, ] %>% select(-Row)
  }
)

Finding the max of a R dataframe column ignoring -Inf and NA

One solution would be the following:

data <- data.frame(column1 = c(-Inf, 4, NA, 7, 10), column2 = c(2, 8, 5, 4, 4))
column1b <- data$column1[which(!is.na(data$column1))]
column1c <- column1b[which(column1b < Inf)]
max(column1c)
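Equivalently, since is.finite() is FALSE for both NA and -Inf, the same result can be obtained in one step:

max(data$column1[is.finite(data$column1)])
# [1] 10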

