How to Filter on Partial Match Using Sparklyr

How to filter on partial match using sparklyr

As in standard Spark, you can use either rlike (Java regular expressions):

df <- copy_to(sc, iris) 

df %>% filter(rlike(Species, "osa"))

# or anchored
df %>% filter(rlike(Species, "^.*osa.*$"))

or like (simple SQL LIKE patterns):

df %>% filter(like(Species, "%osa%"))

Both methods can also be used with infix notation:

df %>% filter(Species %rlike%  "^.*osa.*$")

and

df %>% filter(Species %like% "%osa%")

respectively.

For details see vignette("sql-translation").
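
To see what a given pipeline translates to, you can print the generated SQL with show_query() (a quick sketch; the exact quoting depends on your sparklyr/dbplyr versions):

df %>% filter(rlike(Species, "osa")) %>% show_query()
# roughly: SELECT * FROM `iris` WHERE (`Species` RLIKE 'osa')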

Creating a new variable by matching on strings in sparklyr

If you're looking for a well-defined prefix, as here, you can extract it and join on it.
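
The df_1 and df_2 inputs aren't reproduced here; a minimal sketch of local data frames consistent with the output shown below (column names and values are illustrative assumptions) could be:

df_1 <- data.frame(
  col1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  col2 = c("john.com/abcd", "ringo.com/defg", "paul.com/hijk",
           "george.com/lmno", "rob.com/pqrs", "sam.com/tuvw",
           "matt.com/xyza", "lenny.com/bcde", "bob.com/fghi",
           "tom.com/jklm"),
  stringsAsFactors = FALSE
)

df_2 <- data.frame(
  col1 = c(1, 2, 3),
  col2 = c("john.com", "rob.com", "paul.com"),
  stringsAsFactors = FALSE
)

Copy both to Spark and derive a join key from the prefix: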

sdf_1 <- copy_to(sc, df_1)
sdf_2 <- copy_to(sc, df_2)

sdf_1_keyed <- sdf_1 %>% mutate(key = regexp_extract(col2, "^(.*)/", 1))

then apply a left equi-join:

matched <- sdf_1_keyed %>%
  left_join(sdf_2 %>% transmute(key = col2, id = col1), by = "key")

and summarise

matched %>% group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(as.numeric(!is.na(id)), na.rm = TRUE) > 0))
# Source:   lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows

A similar thing can be done with an RLIKE condition:

candidates <- sdf_1 %>% spark_dataframe() %>%
  sparklyr::invoke("crossJoin",
    sdf_2 %>% transmute(target = col2) %>% spark_dataframe()) %>%
  sdf_register()

candidates %>%
  mutate(matched = as.numeric(rlike(col2, target))) %>%
  group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(matched, na.rm = TRUE) > 0))
# Source:   lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows

Finally, you could extract the unique values:

targets <- unique(as.character(df_2$col2))

and build a SQL expression:

library(glue)

expr <- glue_collapse(glue("col2 rlike '{targets}'"), " OR ")

sdf_1 %>%
  spark_dataframe() %>%
  sparklyr::invoke(
    "selectExpr",
    list("*", as.character(glue("{expr} as col3")))) %>%
  sdf_register() %>%
  mutate(col3 = as.numeric(col3))
 # Source:   lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows

or an R expression:

library(rlang)

rexpr <- glue_collapse(glue("rlike(col2, '{targets}')"), " | ")

sdf_1 %>% mutate(col3 = !!parse_quosure(glue("as.numeric({rexpr})")))
# Source:   lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows
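
Note that parse_quosure has since been deprecated in rlang; with current versions the same idea can be sketched with parse_expr() instead (assuming the same targets and rexpr as above):

sdf_1 %>% mutate(col3 = !!parse_expr(glue("as.numeric({rexpr})")))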

Filter according to partial match of string variable in R

If we want to match on a word boundary, add \\b at the start of the pattern. To match regardless of case, wrap the pattern in regex() with ignore_case = TRUE:

library(dplyr)
library(stringr)
out <- df %>%
  filter(str_detect(disease, regex("\\btrauma", ignore_case = TRUE)))

sum(str_detect(out$disease, regex("^Non", ignore_case = TRUE)))
#[1] 0

data

set.seed(24)
df <- data.frame(disease = sample(c("Nontraumatic", "Trauma",
  "Traumatic", "nontraumatic", "traumatic", "trauma"), 50,
  replace = TRUE), value = rnorm(50))
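
If the same filter has to run inside Spark rather than on a local data frame, a rough sparklyr equivalent (an assumption on my part, using rlike with Java regex flags instead of stringr) would be:

sdf <- copy_to(sc, df, "disease_df", overwrite = TRUE)

# (?i) makes the Java regex case-insensitive; \\b keeps the word boundary
sdf %>% filter(rlike(disease, "(?i)\\btrauma"))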

sparklyr can't filter missing value of `sd` on single value

This is a matter of incompatibility between sparklyr and Spark. In Spark there are both NULLs (roughly equivalent to R's NA) and NaNs, each with its own processing rules, but both values are fetched as NaN in sparklyr.

To filter out NaNs you have to use isnan (not to be confused with R's is.nan):

x_tbl %>% filter(!isnan(x_sd)) %>% collect()
# A tibble: 2 x 3
# Groups: grp [1]
grp x x_sd
<chr> <dbl> <dbl>
1 a 1 0.7071068
2 a 2 0.7071068

To better illustrate the problem:

df <- copy_to(sc,
data.frame(x = c("1", "NaN", "")), "df", overwrite = TRUE
) %>% mutate(x = as.double(x))

df %>% mutate_all(funs(isnull, isnan))
# Source:   lazy query [?? x 3]
# Database: spark_connection
x isnull isnan
<dbl> <lgl> <lgl>
1 1 FALSE FALSE
2 NaN FALSE TRUE
3 NaN TRUE FALSE
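
So if the goal is to drop both kinds of missingness, combine the two checks (using the same df as above):

df %>% filter(!isnull(x) & !isnan(x))
# keeps only the first row, where x is an actual number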

Looking to sort a Spark Data Frame by Index using SparklyR

This is the best solution I could come up with. Although correct, sdf_with_unique_id returns some very high values above roughly the 62,000th row. Regardless, it's one way to create a distributed index column with sparklyr.

library(sparklyr)
library(dplyr)
library(Lahman)

options(tibble.width = Inf)
options(dplyr.print_max = Inf)

spark_install(version = "2.0.0")
sc <- spark_connect(master = "local")

batting_tbl <- copy_to(sc, Lahman::Batting, "batting"); batting_tbl
tbl_uncache(sc, "batting")

y <- Lahman::Batting

batting_tbl <- batting_tbl %>% sdf_with_unique_id(., id = "id") # Note 62300 threshold for higher values
batting_tbl %>% arrange(-id)
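
The jump in values is expected if sdf_with_unique_id is backed by Spark's monotonically_increasing_id(), which packs the partition index into the upper bits of a 64-bit id, so ids are unique and increasing but not consecutive. Since sparklyr passes unknown functions through to Spark SQL, the same function can also be called directly (a sketch):

batting_tbl %>%
  mutate(id = monotonically_increasing_id()) %>%
  arrange(id)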

Gather in sparklyr

Here's a function to mimic gather in sparklyr. It gathers the given columns while keeping everything else intact, and it can easily be extended if required.

# Function
sdf_gather <- function(tbl, gather_cols){

  # columns that should be carried through unchanged
  other_cols <- colnames(tbl)[!colnames(tbl) %in% gather_cols]

  # build one narrow table per gathered column, then stack them
  lapply(gather_cols, function(col_nm){
    tbl %>%
      select(c(other_cols, col_nm)) %>%
      mutate(key = col_nm) %>%
      rename(value = col_nm)
  }) %>%
    sdf_bind_rows() %>%
    select(c(other_cols, 'key', 'value'))
}

# Example
spark_df %>%
  select(col_1, col_2, col_3, col_4) %>%
  sdf_gather(c('col_3', 'col_4'))
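
As a more concrete run, assuming the helper above and a live connection sc (note that copy_to sanitises the iris column names, replacing dots with underscores):

iris_tbl <- copy_to(sc, iris, "iris_gather", overwrite = TRUE)

iris_tbl %>%
  select(Species, Petal_Length, Petal_Width) %>%
  sdf_gather(c("Petal_Length", "Petal_Width"))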

RStudio/sparklyr on MapR/Spark - Replace , with . in a string

regexp_replace is the function you need here:

tbl_bun %>% mutate(value=as.numeric(regexp_replace(value, ",", "\\.")))

When in doubt, see the Hive Language Manual UDF documentation. Pretty much every function there either has a native Spark implementation or is exposed as a Hive UDF.
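
For example, Hive's translate() does the same single-character swap and can be called straight from mutate (a sketch of the same conversion as above):

tbl_bun %>% mutate(value = as.numeric(translate(value, ",", ".")))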


