Using Grep in R to Delete Rows from a Data.Frame

Using grep in R to delete rows from a data.frame

You can subset with a logical (TRUE/FALSE) vector instead of numeric indices.

grepl is like grep, but it returns a logical vector. Negation works with it.

 # grepl() returns one TRUE/FALSE per row; ! flips it, so rows whose z contains "K" are dropped
 d[!grepl("K",d$z),]
   x  y      z
1  1  1  apple
2  1  2   pear
3  1  3 banana
4  1  4      A
5  1  5      B
6  1  6      C
7  1  7      D
8  1  8      E
9  1  9      F
10 1 10      G

Delete rows in text file with grep (?) in R

myfun takes the text file as input and returns a list of data frames. The what argument selects whether to extract the histogram data ('Histogram') or the basic statistics ('Basic Stats') from the text file.

# Parse an ROI statistics text export into one data frame per ROI section.
#
# Args:
#   file: passed unchanged to readLines().
#   what: 'Basic Stats' to extract the summary table of each ROI section, or
#         'Histogram' to extract the histogram rows of each ROI section.
#
# Returns:
#   A list with one data frame per line containing "ROI:", named "1", "2", ...
#   in file order. The last three columns of every data frame (ROI, color,
#   points) are taken from the ROI header line of that section.
#
# Errors:
#   Stops when `what` is not one of the two supported values, or when an ROI
#   section contains no line matching `what`.
myfun <- function( file, what )
{
  x <- readLines( file )
  g1 <- grep( "ROI:", x )            # header line of every ROI section

  # first line of every block of the requested kind
  if( what == 'Basic Stats' ){
    g2 <- grep( 'Basic Stats', x )
  } else if( what == "Histogram" ){
    g2 <- grep( "Histogram", x )
  } else {
    stop( 'what value is not supported')
  }

  to_numeric <- function( col ) as.numeric( as.character( col ) )

  df_list <- list()
  for( counter in seq_along( g1 ) ){
    low <- g1[ counter ]
    # `high` is an exclusive upper bound: the next ROI header, or one past the
    # end of the file for the last section so that the final line is not lost.
    high <- if( counter < length( g1 ) ) g1[ counter + 1 ] else length( x ) + 1

    in_section <- g2[ g2 > low & g2 < high ]
    if( length( in_section ) == 0 ){
      stop( "no '", what, "' line found in the ROI section starting at line ",
            low, call. = FALSE )
    }
    min_ind <- min( in_section )

    # ROI name, color and point count: tokens 2:4 of the first header line,
    # tokens 4:6 of every later one; square brackets are stripped.
    fields <- unlist( strsplit( x[ low ], " " ) )
    picked <- if( counter == 1 ) fields[ 2:4 ] else fields[ 4:6 ]
    title <- matrix( gsub( '\\[|\\]', '', picked ), nrow = 1,
                     dimnames = list( NULL, c( 'ROI', 'color', 'points' ) ) )

    if( what == 'Basic Stats' ){
      x1 <- data.frame( do.call( 'rbind', strsplit( x[ min_ind:( min_ind + 5 ) ], "\t" ) ),
                        stringsAsFactors = FALSE )
      colnames( x1 ) <- unlist( x1[ 1, ], use.names = FALSE )   # first row holds the header
      x1 <- x1[ 2:5, ]
      x1 <- cbind( x1, title )        # the single title row is recycled over all rows
      colnames( x1 ) <- gsub( " ", '', colnames( x1 ) )          # remove spaces
      # convert from character to numeric data type
      x1[ , 2:5 ] <- lapply( x1[ , 2:5 ], to_numeric )

    } else {
      x1 <- data.frame( do.call( 'rbind', strsplit( x[ min_ind:( high - 1 ) ], "\t" ) ),
                        stringsAsFactors = FALSE )
      # column names and band and bin columns
      colnames( x1 ) <- unlist( x1[ 1, ], use.names = FALSE )
      colnames( x1 )[ 1 ] <- 'Histogram'
      # number of rows belonging to each "Histogram" header, header row included
      block_len <- diff( c( grep( "Histogram", x1$Histogram ), nrow( x1 ) + 1 ) )
      x1$Band <- rep( gsub( "[Band ]", '', grep( "Band", x1$Histogram, value = TRUE ) ),
                      block_len )
      x1$Bin <- rep( gsub( "[Bin= ]", '', grep( "Bin", x1$Histogram, value = TRUE ) ),
                     block_len )
      x1 <- x1[ !grepl( 'Histogram', x1$Histogram ), ]           # drop the header rows
      x1$Histogram <- NULL

      x1 <- cbind( x1, title )        # the single title row is recycled over all rows
      colnames( x1 ) <- gsub( " ", '', colnames( x1 ) )          # remove spaces
      # convert from character to numeric data type
      x1[ , c( 1:7, 10 ) ] <- lapply( x1[ , c( 1:7, 10 ) ], to_numeric )
    }

    df_list[[ as.character( counter ) ]] <- x1
  }

  df_list
}

1. Extract Basic Statistics:

# Parse the 'Basic Stats' table of every ROI section; myfun returns a list with one data frame per ROI.
df_list <- myfun(file = "test2.txt", what = 'Basic Stats')
df_list[[1]]
#    BasicStats      Min      Max     Mean    Stdev   ROI color points
# 2      Band 1 0.013282 0.133982 0.061581 0.034069 red_1   Red     20
# 3      Band 2 0.009866 0.112935 0.042688 0.026618 red_1   Red     20
# 4      Band 3 0.008304 0.037059 0.018434 0.007515 red_1   Red     20
# 5      Band 4 0.004726 0.040089 0.018490 0.009605 red_1   Red     20
df_list[[2]]
#    BasicStats      Min      Max     Mean    Stdev   ROI color points
# 2      Band 1 0.032262 0.124425 0.078073 0.028031 red_2   Red     12
# 3      Band 2 0.021072 0.064156 0.037923 0.012178 red_2   Red     12
# 4      Band 3 0.013404 0.066043 0.036316 0.014787 red_2   Red     12
# 5      Band 4 0.005162 0.055781 0.015526 0.013255 red_2   Red     12
df_list[[3]]
#    BasicStats      Min      Max     Mean    Stdev   ROI color points
# 2      Band 1 0.037488 0.107830 0.057892 0.018964 red_3   Red     12
# 3      Band 2 0.028140 0.072370 0.045340 0.014507 red_3   Red     12
# 4      Band 3 0.014960 0.112973 0.032751 0.026575 red_3   Red     12
# 5      Band 4 0.006566 0.029133 0.018201 0.006897 red_3   Red     12

2. Extract Data:

# Same parser, this time extracting the histogram rows of each ROI section.
df_list <- myfun(file = "test2.txt", what = 'Histogram')

Head of the first data frame:

# First rows of the histogram data frame for the first ROI.
head(df_list[[1]])
#         DN Npts Total Percent AccPct Band     Bin   ROI color points
# 2 0.013282    1     1       5      5    1 0.00047 red_1   Red     20
# 3 0.013755    0     1       0      5    1 0.00047 red_1   Red     20
# 4 0.014228    0     1       0      5    1 0.00047 red_1   Red     20
# 5 0.014702    0     1       0      5    1 0.00047 red_1   Red     20
# 6 0.015175    0     1       0      5    1 0.00047 red_1   Red     20
# 7 0.015648    0     1       0      5    1 0.00047 red_1   Red     20

summary statistics:

library('data.table')

# Min / max / mean / standard deviation of DN per band, first ROI.
df1 <- df_list[[1]]
setDT(df1)  # turn df1 into a data.table by reference
df1[, list( Min = min( DN ),
            Max = max( DN ),
            Mean = mean( DN ),
            Stdev = sd( DN ) ), by = 'Band']
#    Band      Min      Max       Mean       Stdev
# 1:    1 0.013282 0.133982 0.07363182 0.035048124
# 2:    2 0.009866 0.112935 0.06140034 0.029928470
# 3:    3 0.008304 0.037059 0.02268180 0.008349628
# 4:    4 0.004726 0.040089 0.02240761 0.010268456

# Same summary for the second ROI.
df2 <- df_list[[2]]
setDT(df2)  # turn df2 into a data.table by reference
df2[, list( Min = min( DN ),
            Max = max( DN ),
            Mean = mean( DN ),
            Stdev = sd( DN ) ), by = 'Band']
#    Band      Min      Max       Mean      Stdev
# 1:    1 0.032262 0.124425 0.07834352 0.02676153
# 2:    2 0.021072 0.064156 0.04261389 0.01251049
# 3:    3 0.013404 0.066043 0.03972310 0.01528497
# 4:    4 0.005162 0.055781 0.03047151 0.01469855

Removing rows surrounding a grepl pattern match in R

Using grep you can get the row number where you find a pattern. Increment the row number by 1 and remove both the rows.

# Row positions whose V1 matches the pattern.
inds <- grep("my_string", df$V1)
# Drop each matching row and the row right after it. The rows to keep are
# selected positively because negating an empty index, df[-integer(0), ],
# returns zero rows instead of the whole data frame when nothing matches.
result <- df[setdiff(seq_len(nrow(df)), c(inds, inds + 1)), ]

Using tidyverse -

library(dplyr)
library(stringr)

# Drop every row whose V1 matches the pattern, plus the row directly after it.
result <- df %>%
  filter({
    # str_detect() takes the strings first and the pattern second
    inds <- str_detect(V1, "my_string")
    # lag() shifts the matches down one row so the following row is removed too
    !(inds | lag(inds, default = FALSE))
    })

Using grep or dplyr to conditionally remove rows and replace others?

Try this (using the data you provided) and now updated:

library(tidyverse)

#Data
df_rep <- data.frame(IDD, Valve, Seconds, stringsAsFactors = FALSE)

#Replace all NA with zero
# (the comparison is against the literal string 'NA')
df_rep[df_rep == 'NA'] <- 0

#Code
# Within each IDD, drop the second row when its Seconds value is 0
df_rep %>%
  group_by(IDD) %>%
  filter(!(row_number() == 2 & Seconds == 0)) %>%
  ungroup()

Which produces:

# A tibble: 4 x 3
  IDD       Valve Seconds
  <chr>     <chr> <chr>  
1 999674642 1     0      
2 999269097 1     0      
3 998496846 0     12     
4 998067840 0     5

Delete rows containing specific strings in R

This should do the trick:

# Caveat: when no Name matches, grep() returns integer(0) and df[-integer(0), ] selects zero rows
df[- grep("REVERSE", df$Name),]

Or a safer version, which still returns every row when nothing matches, would be:

# grepl() gives one TRUE/FALSE per row, so rows are kept wherever Name does not match "REVERSE"
df[!grepl("REVERSE", df$Name),]

Delete rows with grep() and lapply with data.table

# Row-wise match: keep row i when id1[i], used as the grepl() pattern, is found in id2[i]
DT[mapply( grepl, id1, id2), ]

#     id    id1         id2
# 1:  52 3505H6   3505H6856
# 2:  52 3505H6   3505H6856
# 3:  52 3505H6   3505H6856
# 4:  54 3505H6  3505H67158
# 5:  54 3505H6  3505H67158
# 6:  84 3505H6  3505H63188
# 7:  84 3505H6  3505H63188
# 8: 129 3505H6 3505H664133
# 9: 129 3505H6 3505H664133
# 10: 130 3505H6 3505H658134
# 11: 130 3505H6 3505H658134
# 12: 130 3505H6 3505H658134

Grep in R to remove entire row if city column cell is blank

Maybe

# Keep the rows whose city is not the empty string
subset(mydata,city!="")

? This assumes that the city column is stored in such a way that the blanks are zero-length strings. If they might be whitespace, then something like

# Positions (not rows) of the city values that are neither empty nor whitespace-only;
# invert=TRUE makes grep() return the non-matching elements
grep("^[[:space:]]*$",mydata$city,invert=TRUE)

would find the elements you wanted. Since grepl doesn't have an invert argument you could use (edit: thanks to @JoshO'Brien)

# Keep the rows whose city is neither empty nor whitespace-only
subset(mydata,!grepl("^[[:space:]]*$",city))

(No reproducible example given, so neither of these is tested.)

Remove rows containing specific strings

We can first use grep to find the indices of the rows that contain one of the remove_list words, then exclude those rows from the data.frame.

# Collect the row positions that contain any of the "|"-separated words in
# remove_list (literal rather than regex matching, because fixed = TRUE).
remove_ind <- lapply(strsplit(remove_list , "\\|")[[1]] ,
              \(x) grep(x , PKV$Aufzeichnungen , fixed = TRUE)) |>
              unlist() |> unique()

#> [1] 12 15 10 13

# Index the rows to keep instead of negating remove_ind: with no matches,
# PKV[-remove_ind, ] would fail or return zero rows rather than all of PKV.
PKV[setdiff(seq_len(nrow(PKV)), remove_ind), ]

output

   ID                   Aufzeichnungen
1   1                   Aufzeichnungen
2   1 07.03.22   A: stechender Schmerz
3   1                     scharfkantig
4   1                             D/B:
5   1                               T:
6   1          pat aht an 36 üz distal
7   1                   seit paartagen
8   1                        36 vipr++
9   1                            perk-
11  1                      üz bilfuird
14  1                              pat
16  1                                 
17  1                     pat knirscht
18  1                 schiene empohlen
19  1                pat meldet sich..

Using Grep in R to Delete Rows from a Data.Frame