How to Make Join Operations in Dplyr Silent

How to make join operations in dplyr silent?

If you want to be heavy-handed, you can do

aa = suppressMessages(inner_join(a, b))

The better choice, as Jazzurro suggests, is to specify the by argument. dplyr only prints a message to let you know what its guess is for which columns to join by. If you don't make it guess, it doesn't confirm things with you. This is a safer choice as well, from defensive coding standpoint.

If this is in a knitr document, you can set the chunk option message=FALSE.

Incorporating Dplyr Join and Set Operations into a Custom Function

We can use anti_join

library(dplyr)
anti_join(Df2, Df, by = c("Task2"="Task")) %>%
group_by(CaseWorker,Client, Task2) %>%
summarise(Time = sum(Time))
# CaseWorker Client Task2 Time
# <fctr> <fctr> <fctr> <dbl>
#1 John Chris Iron shirt 30
#2 Kim Eric Buy groceries 8

If we need to convert to a function

DiffGoals <- function(dat1, dat2) {
anti_join(dat1, dat2, by = c("Task2" = "Task")) %>%
group_by(CaseWorker, Client, Task2) %>%
summarise(Time = sum(Time))
}

DiffGoals(Df2, Df)

Remove verbose from left_join

You can explicitely provide the column by which you want to join and you avoid the message:

df <- left_join(df1, df2, by = "id")

Adding missing grouping variables message in dplyr in R

For consistency sake the grouping variables should be always present when defined earlier and thus are added when select(value) is executed. ungroup should resolve it:

qu25 <- mydata %>% 
group_by(month, day, station_number) %>%
arrange(desc(value)) %>%
slice(2) %>%
ungroup() %>%
select(value)

The requested result is without warnings:

> mydata %>% 
+ group_by(month, day, station_number) %>%
+ arrange(desc(value)) %>%
+ slice(2) %>%
+ ungroup() %>%
+ select(value)
# A tibble: 1 x 1
value
<dbl>
1 113

Use dynamically generated column names in dplyr

In the OP's code, we need select

library(dplyr)
df %>%
rowwise() %>%
mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))

-output

# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1

Or just subset from cur_data() which would only return the data keeping the group attributes

df %>%
rowwise %>%
mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
# A tibble: 10 x 4
# Rowwise:
var1 var2 var3 consensus
<int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1

Or using pmap

library(purrr)
df %>%
mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
# A tibble: 10 x 4
var1 var2 var3 consensus
<int> <int> <int> <dbl>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1

As these are rowwise operations, can get some efficiency if we use collapse functions

library(collapse)
tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
FUN = function(x) fmax(tabulate(x))))
# A tibble: 10 x 4
var1 var2 var3 consensus
* <int> <int> <int> <int>
1 1 1 1 2
2 1 1 3 1
3 1 2 1 2
4 1 2 1 2
5 2 2 2 2
6 2 3 3 1
7 2 3 2 2
8 1 1 1 2
9 3 1 2 1
10 3 3 2 1

Benchmarks

As noted above, collapse is faster (run on a slightly bigger dataset)

df1 <- df[rep(seq_len(nrow(df)), 1e5), ]

system.time({
tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
FUN = function(x) fmax(tabulate(x))))

})
#user system elapsed
# 5.257 0.123 5.323

system.time({
df1 %>%
mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))

})
#user system elapsed
# 54.813 0.517 55.246

The rowwise operation is taking too much time, so stopped the execution

df1 %>% 
rowwise() %>%
mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
})
Timing stopped at: 575.5 3.342 581.3

Merge two columns into duplicate row in R

I. input --> output

You can simply split your dataframe and bind it again

library(dplyr)

df1 <- df %>% select(A, B, D, E)
df2 <- df %>% select(A, C, D, E) %>% rename(B = C)

bind_rows(df1, df2)
A B D E
1 1 a 10 20
2 2 b 15 30
3 3 c 20 40
4 1 d 10 20
5 2 e 15 30
6 3 f 20 40

If you want to use pivot_longer, this is how you could do it:

library(tidyr)

df %>%
pivot_longer( cols = B:C # the cols we want to combine
, names_to = "old_col_names" # the col where we store the old names
, values_to = "B" # and we rename the new col to B
) %>%
# ------------- reoder columns and arrange to make it look like output
select(A, B, D, E) %>% # this removes the 'old_col_names' you could also do select(-old_col_names)
arrange(B) # arrange in alphabetical order

II output --> input

For the reverse operation - assuming you do not want to split the df and recombine it - you can use {tidyr}'s pivot_wider() using a "new" name column that meets your requirements. For more complex data sets you may have to be creative here.

library(tidyr)

output %>%
# --------- introduce a "name" vector -------------
## -------- we use rep() to create set of 3s ... adapt as required!
mutate(group = c(rep("B",3), rep("C",3)) ) %>%
# --------- spread the data frame and rearrange the columns
pivot_wider( id_cols = c(A,D,E) # these columns are "constant"
, names_from = group # pull "new" column names from our group var
, values_from = B) %>% # spread the values we aggregated in B
select(A, B, C, D, E) # rearrange column order to input style

Getting different results when using the sparklyr and dplyr

The key difference is that in the non-sparklyr, we are not using na.rm = TRUE in mean, therefore, those elements having NA in 'distance' or 'arr_delay' will become NA when we take the mean but in sparklyr the NA values are already removed so the argument is not needed

We can check the NA elements in 'distance' and 'arr_delay'

nycflights13::flights %>% 
summarise_at(vars(distance, arr_delay), funs(sum(is.na(.))))
# A tibble: 1 x 2
# distance arr_delay
# <int> <int>
#1 0 9430 #### number of NAs

So, if we correct for that, then the output will be the same

res <- nycflights13::flights %>% 
group_by(tailnum) %>%
summarise(count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)) %>%
filter(count > 20, dist < 2000, !is.na(delay)) %>%
arrange(tailnum)
res
# A tibble: 2,961 x 4
# tailnum count dist delay
# <chr> <int> <dbl> <dbl>
# 1 N0EGMQ 371 676 9.98
# 2 N10156 153 758 12.7
# 3 N102UW 48 536 2.94
# 4 N103US 46 535 - 6.93
# 5 N104UW 47 535 1.80
# 6 N10575 289 520 20.7
# 7 N105UW 45 525 - 0.267
# 8 N107US 41 529 - 5.73
# 9 N108UW 60 534 - 1.25
#10 N109UW 48 536 - 2.52
# ... with 2,951 more rows

Using sparklyr

library(sparklyr)
library(dplyr)
library(nycflights13)
sc <- spark_connect(master = "local")

flights_tbl <- copy_to(sc, nycflights13::flights, "flights")

delay <- flights_tbl %>%
group_by(tailnum) %>%
summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
filter(count > 20, dist < 2000, !is.na(delay)) %>%
arrange(tailnum) %>%
collect
delay
# A tibble: 2,961 x 4
# tailnum count dist delay
# <chr> <dbl> <dbl> <dbl>
# 1 N0EGMQ 371 676 9.98
# 2 N10156 153 758 12.7
# 3 N102UW 48.0 536 2.94
# 4 N103US 46.0 535 - 6.93
# 5 N104UW 47.0 535 1.80
# 6 N10575 289 520 20.7
# 7 N105UW 45.0 525 - 0.267
# 8 N107US 41.0 529 - 5.73
# 9 N108UW 60.0 534 - 1.25
#10 N109UW 48.0 536 - 2.52
# ... with 2,951 more rows

Appropriate procedure for using group_by() %% summarize() %% mutate() with sf objects

I made a change to the following line of code.

mutate(stupid_var = rowSums(dplyr::select(.,'SID74':'NWBIR74'))/BIR74)

This line of code was probably causing an issue. Unless I am missing something, it would appear there is no reason for summing the entire columns for each row. So the code was changed to remove the rowSums() function. The mutate function was still used to perform the math from the data on each row of data, but without involving any rowSums() values.

p1 <- nc %>% 
separate(CNTY_ID,into = c("ID1","ID2"),sep = 2,remove = FALSE) %>%
group_by(ID1) %>%
dplyr::summarize(AREA = sum(AREA, na.rm = TRUE),
BIR74 = sum(BIR74,na.rm = TRUE),
SID74 = sum(SID74,na.rm = TRUE),
NWBIR74 = sum(NWBIR74,na.rm = TRUE)) %>%
mutate( stupid_var = ( (p2$SID74) + (p2$NWBIR74)) / (p2$BIR74) )
p1

The output can be viewed from this link.



Related Topics



Leave a reply



Submit