How to Make Join Operations in Dplyr Silent

How to make join operations in dplyr silent?

If you want to be heavy-handed, you can do

aa = suppressMessages(inner_join(a, b))

The better choice, as Jazzurro suggests, is to specify the by argument. dplyr only prints a message to let you know what its guess is for which columns to join by. If you don't make it guess, it doesn't confirm things with you. This is a safer choice as well, from defensive coding standpoint.

If this is in a knitr document, you can set the chunk option message=FALSE.

Incorporating Dplyr Join and Set Operations into a Custom Function

We can use anti_join

library(dplyr)
anti_join(Df2, Df, by = c("Task2"="Task")) %>%
         group_by(CaseWorker,Client, Task2) %>% 
         summarise(Time = sum(Time))
#    CaseWorker Client         Task2  Time
#        <fctr> <fctr>        <fctr> <dbl>
#1       John  Chris    Iron shirt    30
#2        Kim   Eric Buy groceries     8

If we need to convert to a function

DiffGoals <- function(dat1, dat2) {
            anti_join(dat1, dat2, by = c("Task2" = "Task")) %>%
                   group_by(CaseWorker, Client, Task2) %>%
                   summarise(Time = sum(Time))
 }

DiffGoals(Df2, Df)

Remove verbose from left_join

You can explicitely provide the column by which you want to join and you avoid the message:

df <- left_join(df1, df2, by = "id")

Adding missing grouping variables message in dplyr in R

For consistency sake the grouping variables should be always present when defined earlier and thus are added when select(value) is executed. ungroup should resolve it:

qu25 <- mydata %>% 
  group_by(month, day, station_number) %>%
  arrange(desc(value)) %>% 
  slice(2) %>% 
  ungroup() %>%
  select(value)

The requested result is without warnings:

> mydata %>% 
+   group_by(month, day, station_number) %>%
+   arrange(desc(value)) %>% 
+   slice(2) %>% 
+   ungroup() %>%
+   select(value)
# A tibble: 1 x 1
  value
  <dbl>
1   113

Use dynamically generated column names in dplyr

In the OP's code, we need select

library(dplyr)
df %>% 
  rowwise() %>% 
  mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))

-output

# A tibble: 10 x 4
# Rowwise: 
    var1  var2  var3 consensus
   <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

Or just subset from cur_data() which would only return the data keeping the group attributes

df %>%
     rowwise %>% 
     mutate(consensus = max(table(unlist(cur_data()[select_vars]))))
# A tibble: 10 x 4
# Rowwise: 
    var1  var2  var3 consensus
   <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

Or using pmap

library(purrr)
df %>%
     mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))
# A tibble: 10 x 4
    var1  var2  var3 consensus
   <int> <int> <int>     <dbl>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

As these are rowwise operations, can get some efficiency if we use collapse functions

library(collapse)
tfm(df, consensus = dapply(slt(df, select_vars), MARGIN = 1,
       FUN = function(x) fmax(tabulate(x))))
# A tibble: 10 x 4
    var1  var2  var3 consensus
 * <int> <int> <int>     <int>
 1     1     1     1         2
 2     1     1     3         1
 3     1     2     1         2
 4     1     2     1         2
 5     2     2     2         2
 6     2     3     3         1
 7     2     3     2         2
 8     1     1     1         2
 9     3     1     2         1
10     3     3     2         1

Benchmarks

As noted above, collapse is faster (run on a slightly bigger dataset)

df1 <- df[rep(seq_len(nrow(df)), 1e5), ]

system.time({
tfm(df1, consensus = dapply(slt(df1, select_vars), MARGIN = 1,
       FUN = function(x) fmax(tabulate(x))))

})
#user  system elapsed 
#  5.257   0.123   5.323 

system.time({
df1 %>%
     mutate(consensus = pmap_dbl(cur_data()[select_vars], ~ max(table(c(...)))))

})
#user  system elapsed 
# 54.813   0.517  55.246

The rowwise operation is taking too much time, so stopped the execution

df1 %>% 
   rowwise() %>% 
   mutate(consensus=max(table(unlist(select(cur_data(), select_vars))) ))
 })
Timing stopped at: 575.5 3.342 581.3

Merge two columns into duplicate row in R

I. input --> output

You can simply split your dataframe and bind it again

library(dplyr)

df1 <- df %>% select(A, B, D, E)
df2 <- df %>% select(A, C, D, E) %>% rename(B = C)

bind_rows(df1, df2)
  A B  D  E
1 1 a 10 20
2 2 b 15 30
3 3 c 20 40
4 1 d 10 20
5 2 e 15 30
6 3 f 20 40

If you want to use pivot_longer, this is how you could do it:

library(tidyr)

df %>% 
  pivot_longer( cols = B:C                  # the cols we want to combine
              , names_to = "old_col_names"  # the col where we store the old names
              , values_to = "B"             # and we rename the new col to B
              ) %>% 
# ------------- reoder columns and arrange to make it look like output
   select(A, B, D, E) %>%                   # this removes the 'old_col_names' you could also do select(-old_col_names)
   arrange(B)                               # arrange in alphabetical order

II output --> input

For the reverse operation - assuming you do not want to split the df and recombine it - you can use {tidyr}'s pivot_wider() using a "new" name column that meets your requirements. For more complex data sets you may have to be creative here.

library(tidyr)

output %>% 
# --------- introduce a "name" vector -------------
## -------- we use rep() to create set of 3s ... adapt as required!
  mutate(group = c(rep("B",3), rep("C",3)) ) %>% 
# --------- spread the data frame and rearrange the columns
  pivot_wider( id_cols = c(A,D,E)    # these columns are "constant"
             , names_from  = group   # pull "new" column names from our group var 
             , values_from = B) %>%  # spread the values we aggregated in B
  select(A, B, C, D, E)              # rearrange column order to input style

Getting different results when using the sparklyr and dplyr

The key difference is that in the non-sparklyr, we are not using na.rm = TRUE in mean, therefore, those elements having NA in 'distance' or 'arr_delay' will become NA when we take the mean but in sparklyr the NA values are already removed so the argument is not needed

We can check the NA elements in 'distance' and 'arr_delay'

nycflights13::flights %>% 
       summarise_at(vars(distance, arr_delay), funs(sum(is.na(.))))
# A tibble: 1 x 2
#  distance arr_delay
#     <int>     <int>
#1        0      9430  #### number of NAs

So, if we correct for that, then the output will be the same

res <- nycflights13::flights %>% 
    group_by(tailnum) %>%
    summarise(count = n(),
              dist = mean(distance, na.rm = TRUE),
              delay = mean(arr_delay, na.rm = TRUE)) %>%
     filter(count > 20, dist < 2000, !is.na(delay))  %>%
     arrange(tailnum)
res
# A tibble: 2,961 x 4
#   tailnum count  dist   delay
#   <chr>   <int> <dbl>   <dbl>
# 1 N0EGMQ    371   676   9.98 
# 2 N10156    153   758  12.7  
# 3 N102UW     48   536   2.94 
# 4 N103US     46   535 - 6.93 
# 5 N104UW     47   535   1.80 
# 6 N10575    289   520  20.7  
# 7 N105UW     45   525 - 0.267
# 8 N107US     41   529 - 5.73 
# 9 N108UW     60   534 - 1.25 
#10 N109UW     48   536 - 2.52 
# ... with 2,951 more rows

Using sparklyr

library(sparklyr)
library(dplyr)
library(nycflights13)
sc <- spark_connect(master = "local")

flights_tbl <- copy_to(sc, nycflights13::flights, "flights")

delay <- flights_tbl %>% 
  group_by(tailnum) %>%
  summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
  filter(count > 20, dist < 2000, !is.na(delay)) %>% 
  arrange(tailnum) %>%
  collect
delay
# A tibble: 2,961 x 4
#   tailnum count  dist   delay
#   <chr>   <dbl> <dbl>   <dbl>
# 1 N0EGMQ  371     676   9.98 
# 2 N10156  153     758  12.7  
# 3 N102UW   48.0   536   2.94 
# 4 N103US   46.0   535 - 6.93 
# 5 N104UW   47.0   535   1.80 
# 6 N10575  289     520  20.7  
# 7 N105UW   45.0   525 - 0.267
# 8 N107US   41.0   529 - 5.73 
# 9 N108UW   60.0   534 - 1.25 
#10 N109UW   48.0   536 - 2.52 
# ... with 2,951 more rows

Appropriate procedure for using group_by() %% summarize() %% mutate() with sf objects

I made a change to the following line of code.

mutate(stupid_var = rowSums(dplyr::select(.,'SID74':'NWBIR74'))/BIR74)

This line of code was probably causing an issue. Unless I am missing something, it would appear there is no reason for summing the entire columns for each row. So the code was changed to remove the rowSums() function. The mutate function was still used to perform the math from the data on each row of data, but without involving any rowSums() values.

p1 <- nc %>% 
  separate(CNTY_ID,into = c("ID1","ID2"),sep = 2,remove = FALSE) %>% 
               group_by(ID1)  %>% 
               dplyr::summarize(AREA = sum(AREA, na.rm = TRUE), 
               BIR74 = sum(BIR74,na.rm = TRUE), 
               SID74 = sum(SID74,na.rm = TRUE), 
               NWBIR74 = sum(NWBIR74,na.rm = TRUE)) %>%
               mutate( stupid_var = ( (p2$SID74) + (p2$NWBIR74)) / (p2$BIR74) )
p1

The output can be viewed from this link.

How to Make Join Operations in Dplyr Silent