How to Filter on Partial Match Using Sparklyr

How to filter on partial match using sparklyr

As in standard Spark, you can use either rlike (Java regular expressions):

df <- copy_to(sc, iris) 

df %>% filter(rlike(Species, "osa"))

# or anchored
df %>% filter(rlike(Species, "^.*osa.*$"))

or like (simple SQL LIKE patterns):

df %>% filter(like(Species, "%osa%"))

Both methods can also be used with infix notation:

df %>% filter(Species %rlike%  "^.*osa.*$")

and

df %>% filter(Species %like% "%osa%")

respectively.

For details see vignette("sql-translation").
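
To see what a given pipeline translates to, you can print the generated SQL with show_query() (a quick sketch; the exact quoting depends on your sparklyr/dbplyr versions):

df %>% filter(rlike(Species, "osa")) %>% show_query()
# roughly: SELECT * FROM `iris` WHERE (`Species` RLIKE 'osa')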

Creating a new variable by matching on strings in sparklyr

If you're looking for a well-defined prefix, as here, you can extract it and join on it.
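
The df_1 and df_2 inputs aren't reproduced here; a minimal sketch of local data frames consistent with the output shown below (column names and values are illustrative assumptions) could be:

df_1 <- data.frame(
  col1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  col2 = c("john.com/abcd", "ringo.com/defg", "paul.com/hijk",
           "george.com/lmno", "rob.com/pqrs", "sam.com/tuvw",
           "matt.com/xyza", "lenny.com/bcde", "bob.com/fghi",
           "tom.com/jklm"),
  stringsAsFactors = FALSE
)

df_2 <- data.frame(
  col1 = c(1, 2, 3),
  col2 = c("john.com", "rob.com", "paul.com"),
  stringsAsFactors = FALSE
)

Copy both to Spark and derive a join key from the prefix: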

sdf_1 <- copy_to(sc, df_1)
sdf_2 <- copy_to(sc, df_2)

sdf_1_keyed <- sdf_1 %>% mutate(key = regexp_extract(col2, "^(.*)/", 1))

then apply a left equi-join:

matched <- sdf_1_keyed %>%
  left_join(sdf_2 %>% transmute(key = col2, id = col1), by = "key")

and summarise

matched %>% group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(as.numeric(!is.na(id)), na.rm = TRUE) > 0))
# Source:   lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows

A similar thing can be done with an RLIKE condition:

candidates <- sdf_1 %>% spark_dataframe() %>%
  sparklyr::invoke("crossJoin",
    sdf_2 %>% transmute(target = col2) %>% spark_dataframe()) %>%
  sdf_register()

candidates %>%
  mutate(matched = as.numeric(rlike(col2, target))) %>%
  group_by(col1, col2) %>%
  summarise(col3 = as.numeric(sum(matched, na.rm = TRUE) > 0))
# Source:   lazy query [?? x 3]
# Database: spark_connection
# Groups: col1
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 5 rob.com/pqrs 1
3 6 sam.com/tuvw 0
4 9 bob.com/fghi 0
5 3 paul.com/hijk 1
6 4 george.com/lmno 0
7 8 lenny.com/bcde 0
8 10 tom.com/jklm 0
9 2 ringo.com/defg 0
10 7 matt.com/xyza 0
# ... with more rows

Finally, you could extract the unique values:

targets <- unique(as.character(df_2$col2))

and build a SQL expression:

library(glue)

expr <- glue_collapse(glue("col2 rlike '{targets}'"), " OR ")

sdf_1 %>%
  spark_dataframe() %>%
  sparklyr::invoke(
    "selectExpr",
    list("*", as.character(glue("{expr} as col3")))) %>%
  sdf_register() %>%
  mutate(col3 = as.numeric(col3))
 # Source:   lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows

or an R expression:

library(rlang)

rexpr <- glue_collapse(glue("rlike(col2, '{targets}')"), " | ")

sdf_1 %>% mutate(col3 = !!parse_quosure(glue("as.numeric({rexpr})")))
# Source:   lazy query [?? x 3]
# Database: spark_connection
col1 col2 col3
<dbl> <chr> <dbl>
1 1 john.com/abcd 1
2 2 ringo.com/defg 0
3 3 paul.com/hijk 1
4 4 george.com/lmno 0
5 5 rob.com/pqrs 1
6 6 sam.com/tuvw 0
7 7 matt.com/xyza 0
8 8 lenny.com/bcde 0
9 9 bob.com/fghi 0
10 10 tom.com/jklm 0
# ... with more rows
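
Note that parse_quosure has since been deprecated in rlang; with current versions the same idea can be sketched with parse_expr() instead (assuming the same targets and rexpr as above):

sdf_1 %>% mutate(col3 = !!parse_expr(glue("as.numeric({rexpr})")))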

Filter according to partial match of string variable in R

If we want to match on a word boundary, add \\b at the start of the pattern. To match regardless of case, wrap the pattern in regex() with ignore_case = TRUE:

library(dplyr)
library(stringr)
out <- df %>%
  filter(str_detect(disease, regex("\\btrauma", ignore_case = TRUE)))

sum(str_detect(out$disease, regex("^Non", ignore_case = TRUE)))
#[1] 0

data

set.seed(24)
df <- data.frame(disease = sample(c("Nontraumatic", "Trauma",
  "Traumatic", "nontraumatic", "traumatic", "trauma"), 50,
  replace = TRUE), value = rnorm(50))
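
If the same filter has to run inside Spark rather than on a local data frame, a rough sparklyr equivalent (an assumption on my part, using rlike with Java regex flags instead of stringr) would be:

sdf <- copy_to(sc, df, "disease_df", overwrite = TRUE)

# (?i) makes the Java regex case-insensitive; \\b keeps the word boundary
sdf %>% filter(rlike(disease, "(?i)\\btrauma"))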

sparklyr can't filter missing value of `sd` on single value

This is a matter of incompatibility between sparklyr and Spark. In Spark there are both NULLs (roughly equivalent to R's NA) and NaNs, each with its own processing rules, but both values are fetched as NaN in sparklyr.

To filter out NaNs you have to use isnan (not to be confused with R's is.nan):

x_tbl %>% filter(!isnan(x_sd)) %>% collect()
# A tibble: 2 x 3
# Groups: grp [1]
grp x x_sd
<chr> <dbl> <dbl>
1 a 1 0.7071068
2 a 2 0.7071068

To better illustrate the problem:

df <- copy_to(sc,
data.frame(x = c("1", "NaN", "")), "df", overwrite = TRUE
) %>% mutate(x = as.double(x))

df %>% mutate_all(funs(isnull, isnan))
# Source:   lazy query [?? x 3]
# Database: spark_connection
x isnull isnan
<dbl> <lgl> <lgl>
1 1 FALSE FALSE
2 NaN FALSE TRUE
3 NaN TRUE FALSE
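
So if the goal is to drop both kinds of missingness, combine the two checks (using the same df as above):

df %>% filter(!isnull(x) & !isnan(x))
# keeps only the first row, where x is an actual number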

Looking to sort a Spark Data Frame by Index using SparklyR

This is the best solution I could come up with. Although correct, sdf_with_unique_id returns some very high values above roughly the 62,000th row. Regardless, it's one way to create a distributed index column with sparklyr.

library(sparklyr)
library(dplyr)
library(Lahman)

options(tibble.width = Inf)
options(dplyr.print_max = Inf)

spark_install(version = "2.0.0")
sc <- spark_connect(master = "local")

batting_tbl <- copy_to(sc, Lahman::Batting, "batting"); batting_tbl
tbl_uncache(sc, "batting")

y <- Lahman::Batting

batting_tbl <- batting_tbl %>% sdf_with_unique_id(., id = "id") # Note 62300 threshold for higher values
batting_tbl %>% arrange(-id)
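
The jump in values is expected if sdf_with_unique_id is backed by Spark's monotonically_increasing_id(), which packs the partition index into the upper bits of a 64-bit id, so ids are unique and increasing but not consecutive. Since sparklyr passes unknown functions through to Spark SQL, the same function can also be called directly (a sketch):

batting_tbl %>%
  mutate(id = monotonically_increasing_id()) %>%
  arrange(id)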

Gather in sparklyr

Here's a function to mimic gather in sparklyr. It gathers the given columns while keeping everything else intact, and it can easily be extended if required.

# Function
sdf_gather <- function(tbl, gather_cols){

  # columns that should be carried through unchanged
  other_cols <- colnames(tbl)[!colnames(tbl) %in% gather_cols]

  # build one narrow table per gathered column, then stack them
  lapply(gather_cols, function(col_nm){
    tbl %>%
      select(c(other_cols, col_nm)) %>%
      mutate(key = col_nm) %>%
      rename(value = col_nm)
  }) %>%
    sdf_bind_rows() %>%
    select(c(other_cols, 'key', 'value'))
}

# Example
spark_df %>%
  select(col_1, col_2, col_3, col_4) %>%
  sdf_gather(c('col_3', 'col_4'))
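
As a more concrete run, assuming the helper above and a live connection sc (note that copy_to sanitises the iris column names, replacing dots with underscores):

iris_tbl <- copy_to(sc, iris, "iris_gather", overwrite = TRUE)

iris_tbl %>%
  select(Species, Petal_Length, Petal_Width) %>%
  sdf_gather(c("Petal_Length", "Petal_Width"))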

RStudio/sparklyr on MapR/Spark - Replace , with . in a string

regexp_replace is the function you need here:

tbl_bun %>% mutate(value=as.numeric(regexp_replace(value, ",", "\\.")))

When in doubt, see the Hive Language Manual UDF documentation. Pretty much every function there either has a native Spark implementation or is exposed as a Hive UDF.
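
For example, Hive's translate() does the same single-character swap and can be called straight from mutate (a sketch of the same conversion as above):

tbl_bun %>% mutate(value = as.numeric(translate(value, ",", ".")))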


