Using Grep in R to Delete Rows from a Data.Frame

Using grep in R to delete rows from a data.frame

You can use TRUE/FALSE subsetting instead of numeric.

grepl is like grep, but it returns a logical vector. Negation works with it.

 # Keep only the rows of d whose z value does not contain "K".
 d[!grepl("K",d$z),]
x y z
1 1 1 apple
2 1 2 pear
3 1 3 banana
4 1 4 A
5 1 5 B
6 1 6 C
7 1 7 D
8 1 8 E
9 1 9 F
10 1 10 G

Delete rows in text file with grep (?) in R

myfun takes the text file as input and returns a list of data frames. The what argument selects whether to extract the histogram data ('Histogram') or the basic statistics ('Basic Stats') from the text file.

# Parse a ROI report text file into a list of data frames, one per "ROI:"
# section.
#
# Arguments:
#   file  Path or connection handed to readLines().
#   what  "Basic Stats" to extract each section's basic-statistics table, or
#         "Histogram" to extract its histogram rows. Any other value raises
#         an error.
#
# Returns a list whose elements are named "1", "2", ... in section order.
# Every data frame ends with the columns ROI, color and points, taken from the
# section's "ROI:" line. An error is raised when a section contains no table
# of the requested kind.
myfun <- function( file, what )
{
  # Fail fast on an unsupported mode, before touching the file.
  if (length(what) != 1L || !what %in% c("Basic Stats", "Histogram")) {
    stop( 'what value is not supported')
  }

  x <- readLines(file)
  section_starts <- grep("ROI:", x)
  # The table header text searched for is the same string as the mode name.
  table_headers <- grep(what, x)

  # Append the three title fields as columns, name them, and strip spaces
  # from all column names.
  add_title_columns <- function(tbl, title) {
    tbl <- cbind(tbl, title)
    last3 <- (ncol(tbl) - 2):ncol(tbl)
    colnames(tbl)[last3] <- c("ROI", "color", "points")
    colnames(tbl) <- gsub(" ", "", colnames(tbl))
    tbl
  }

  df_list <- list()
  for (counter in seq_along(section_starts)) {
    low <- section_starts[counter]
    # A section runs up to the next "ROI:" line. The last section runs through
    # the final line of the file, so its exclusive upper bound is
    # length(x) + 1 (using length(x) would drop the file's last line).
    high <- if (counter < length(section_starts)) {
      section_starts[counter + 1]
    } else {
      length(x) + 1
    }

    in_section <- table_headers[table_headers > low & table_headers < high]
    if (length(in_section) == 0) {
      stop("no '", what, "' table found in the ROI section starting at line ",
           low)
    }
    min_ind <- min(in_section)

    # The first "ROI:" line carries its three fields at tokens 2-4, every
    # later one at tokens 4-6. Square brackets are stripped from the fields.
    tokens <- unlist(strsplit(x[low], " "))
    fields <- if (counter == 1) tokens[2:4] else tokens[4:6]
    title <- matrix(gsub("\\[|\\]", "", fields), nrow = 1)

    if (what == "Basic Stats") {
      # Header line plus the five lines after it; only the header and rows
      # 2-5 (four data rows) are kept.
      cells <- strsplit(x[min_ind:(min_ind + 5)], "\t")
      x1 <- data.frame(do.call("rbind", cells), stringsAsFactors = FALSE)
      colnames(x1) <- x1[1, ]
      x1 <- x1[2:5, ]
      x1 <- add_title_columns(x1, title)
      # convert from character to numeric data type
      x1[, 2:5] <- lapply(x1[, 2:5],
                          function(col) as.numeric(as.character(col)))
    } else {
      cells <- strsplit(x[min_ind:(high - 1)], "\t")
      x1 <- data.frame(do.call("rbind", cells), stringsAsFactors = FALSE)
      # column names and band and bin columns
      colnames(x1) <- x1[1, ]
      colnames(x1)[1] <- "Histogram"
      # Every "Histogram" header row starts a new block; these are the block
      # lengths used to spread each block's Band and Bin value over its rows.
      block_sizes <- diff(c(grep("Histogram", x1$Histogram), nrow(x1) + 1))
      x1$Band <- rep(gsub("[Band ]", "",
                          grep("Band", x1$Histogram, value = TRUE)),
                     block_sizes)
      x1$Bin <- rep(gsub("[Bin= ]", "",
                         grep("Bin", x1$Histogram, value = TRUE)),
                    block_sizes)
      # Drop the repeated header rows and the helper column.
      x1 <- x1[!grepl("Histogram", x1$Histogram), ]
      x1$Histogram <- NULL

      x1 <- add_title_columns(x1, title)
      # convert from character to numeric data type
      x1[, c(1:7, 10)] <- lapply(x1[, c(1:7, 10)],
                                 function(col) as.numeric(as.character(col)))
    }

    df_list[[as.character(counter)]] <- x1
  }

  df_list
}

1. Extract Basic Statistics:

# Parse the "Basic Stats" tables; myfun returns a list with one data frame per ROI section.
df_list <- myfun(file = "test2.txt", what = 'Basic Stats')
df_list[[1]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.013282 0.133982 0.061581 0.034069 red_1 Red 20
# 3 Band 2 0.009866 0.112935 0.042688 0.026618 red_1 Red 20
# 4 Band 3 0.008304 0.037059 0.018434 0.007515 red_1 Red 20
# 5 Band 4 0.004726 0.040089 0.018490 0.009605 red_1 Red 20
df_list[[2]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.032262 0.124425 0.078073 0.028031 red_2 Red 12
# 3 Band 2 0.021072 0.064156 0.037923 0.012178 red_2 Red 12
# 4 Band 3 0.013404 0.066043 0.036316 0.014787 red_2 Red 12
# 5 Band 4 0.005162 0.055781 0.015526 0.013255 red_2 Red 12
df_list[[3]]
# BasicStats Min Max Mean Stdev ROI color points
# 2 Band 1 0.037488 0.107830 0.057892 0.018964 red_3 Red 12
# 3 Band 2 0.028140 0.072370 0.045340 0.014507 red_3 Red 12
# 4 Band 3 0.014960 0.112973 0.032751 0.026575 red_3 Red 12
# 5 Band 4 0.006566 0.029133 0.018201 0.006897 red_3 Red 12

2. Extract Data:

# Parse the "Histogram" tables; myfun returns a list with one data frame per ROI section.
df_list <- myfun(file = "test2.txt", what = 'Histogram')

head

head(df_list[[1]])
# DN Npts Total Percent AccPct Band Bin ROI color points
# 2 0.013282 1 1 5 5 1 0.00047 red_1 Red 20
# 3 0.013755 0 1 0 5 1 0.00047 red_1 Red 20
# 4 0.014228 0 1 0 5 1 0.00047 red_1 Red 20
# 5 0.014702 0 1 0 5 1 0.00047 red_1 Red 20
# 6 0.015175 0 1 0 5 1 0.00047 red_1 Red 20
# 7 0.015648 0 1 0 5 1 0.00047 red_1 Red 20

summary statistics:

library('data.table')
# Convert df1 to a data.table (setDT works in place), then compute the
# min, max, mean and standard deviation of DN for each Band.
df1 <- df_list[[1]]
setDT(df1)[, .( Min = min( DN ),
Max = max( DN ),
Mean = mean( DN ),
Stdev = sd( DN ) ), by = 'Band']
# Band Min Max Mean Stdev
# 1: 1 0.013282 0.133982 0.07363182 0.035048124
# 2: 2 0.009866 0.112935 0.06140034 0.029928470
# 3: 3 0.008304 0.037059 0.02268180 0.008349628
# 4: 4 0.004726 0.040089 0.02240761 0.010268456

# Same per-Band summary of DN for the second data frame in df_list.
df2 <- df_list[[2]]
setDT(df2)[, .( Min = min( DN ),
Max = max( DN ),
Mean = mean( DN ),
Stdev = sd( DN ) ), by = 'Band']
# Band Min Max Mean Stdev
# 1: 1 0.032262 0.124425 0.07834352 0.02676153
# 2: 2 0.021072 0.064156 0.04261389 0.01251049
# 3: 3 0.013404 0.066043 0.03972310 0.01528497
# 4: 4 0.005162 0.055781 0.03047151 0.01469855

Removing rows surrounding a grepl pattern match in R

Using grep you can get the row number where you find a pattern. Increment the row number by 1 and remove both the rows.

# Rows that match the pattern, plus the row directly after each match.
inds <- grep("my_string", df$V1)
drop_rows <- unique(c(inds, inds + 1))
# Select the rows to keep by positive index: negative indexing with an empty
# index vector (no match) would return zero rows instead of all of them.
# drop = FALSE keeps the result a data frame even if df has a single column.
result <- df[setdiff(seq_len(nrow(df)), drop_rows), , drop = FALSE]

Using tidyverse -

library(dplyr)
library(stringr)

result <- df %>%
  filter({
    # str_detect() takes the strings first and the pattern second.
    inds <- str_detect(V1, "my_string")
    # Drop each matching row and the row that follows it.
    !(inds | lag(inds, default = FALSE))
  })

Using grep or dplyr to conditionally remove rows and replace others?

Try this (using the data you provided) and now updated:

library(tidyverse)

# Data
df_rep <- data.frame(IDD, Valve, Seconds, stringsAsFactors = FALSE)

# Replace every "NA" string with zero
df_rep[df_rep == "NA"] <- 0

# Within each IDD group, drop the second row when its Seconds value is zero
df_rep %>%
  group_by(IDD) %>%
  filter(!(row_number() == 2 & Seconds == 0)) %>%
  ungroup()

Which produces:

# A tibble: 4 x 3
IDD Valve Seconds
<chr> <chr> <chr>
1 999674642 1 0
2 999269097 1 0
3 998496846 0 12
4 998067840 0 5

Delete rows containing specific strings in R

This should do the trick:

# Caution: if nothing matches, grep() returns integer(0) and this selects zero rows.
df[- grep("REVERSE", df$Name),]

Or a safer version would be:

# grepl() returns a logical vector, so every row is kept when nothing matches.
df[!grepl("REVERSE", df$Name),]

Delete rows with grep() and lapply with data.table

# Row-wise match: keep the rows where the id1 value, used as the pattern, is found in id2.
DT[mapply( grepl, id1, id2), ]

# id id1 id2
# 1: 52 3505H6 3505H6856
# 2: 52 3505H6 3505H6856
# 3: 52 3505H6 3505H6856
# 4: 54 3505H6 3505H67158
# 5: 54 3505H6 3505H67158
# 6: 84 3505H6 3505H63188
# 7: 84 3505H6 3505H63188
# 8: 129 3505H6 3505H664133
# 9: 129 3505H6 3505H664133
# 10: 130 3505H6 3505H658134
# 11: 130 3505H6 3505H658134
# 12: 130 3505H6 3505H658134

Grep in R to remove entire row if city column cell is blank

Maybe

# Keep the rows whose city is not the empty string.
subset(mydata,city!="")

? This assumes that the city column is stored in such a way that the blanks are zero-length strings. If they might be whitespace, then something like

# Returns the indices (not the rows) of the city values that are neither empty nor whitespace-only.
grep("^[[:space:]]*$",mydata$city,invert=TRUE)

would find the elements you wanted. Since grepl doesn't have an invert argument you could use (edit: thanks to @JoshO'Brien)

# Keep the rows whose city is neither empty nor whitespace-only.
subset(mydata,!grepl("^[[:space:]]*$",city))

(No reproducible example given, so neither of these is tested.)

Remove rows containing specific strings

  • First use grep to find the indices of the rows that contain one of the words in remove_list, then exclude those rows from the data.frame
# Split remove_list on "|" and collect the indices of every row whose
# Aufzeichnungen value contains one of the words (literal, not regex, match).
remove_ind <- lapply(
  strsplit(remove_list, "\\|")[[1]],
  \(x) grep(x, PKV$Aufzeichnungen, fixed = TRUE)
) |>
  unlist() |>
  unique()

#> [1] 12 15 10 13

# Keep the remaining rows by positive index: PKV[-remove_ind, ] would return
# zero rows when no word matches and remove_ind is empty.
PKV[setdiff(seq_len(nrow(PKV)), remove_ind), ]

  • output
   ID                   Aufzeichnungen
1 1 Aufzeichnungen
2 1 07.03.22 A: stechender Schmerz
3 1 scharfkantig
4 1 D/B:
5 1 T:
6 1 pat aht an 36 üz distal
7 1 seit paartagen
8 1 36 vipr++
9 1 perk-
11 1 üz bilfuird
14 1 pat
16 1
17 1 pat knirscht
18 1 schiene empohlen
19 1 pat meldet sich..


Related Topics



Leave a reply



Submit