Mutating Dummy Variables in Dplyr

Mutating dummy variables in dplyr

If you want to do this with the pipe, you can do something like:

library(dplyr)
library(sjmisc)

# Build the dummy columns for `day` with sjmisc::to_dummy(), bind the original
# columns of mydf back on, and move x and day to the front.
# NOTE(review): suffix = "label" presumably names the dummies after the values
# of day (day_Fri, day_Mon, ...), as in the output shown below -- confirm
# against the sjmisc documentation.
mydf %>% 
  to_dummy(day, suffix = "label") %>% 
  bind_cols(mydf) %>% 
  select(x, day, everything())

Returns:

# A tibble: 9 x 9
  x     day   day_Fri day_Mon day_Sat day_Sun day_Thurs day_Tues day_Wed
  <fct> <fct>   <dbl>   <dbl>   <dbl>   <dbl>     <dbl>    <dbl>   <dbl>
1 a     Mon        0.      1.      0.      0.        0.       0.      0.
2 b     Tues       0.      0.      0.      0.        0.       1.      0.
3 c     Wed        0.      0.      0.      0.        0.       0.      1.
4 d     Thurs      0.      0.      0.      0.        1.       0.      0.
5 e     Fri        1.      0.      0.      0.        0.       0.      0.
6 f     Sat        0.      0.      1.      0.        0.       0.      0.
7 g     Sun        0.      0.      0.      1.        0.       0.      0.
8 h     Fri        1.      0.      0.      0.        0.       0.      0.
9 i     Mon        0.      1.      0.      0.        0.       0.      0.

With dplyr and tidyr we could do:

library(dplyr)
library(tidyr)

# Flag every row with a 1, reshape those flags into one column per day
# (day_Fri, day_Mon, ...), fill the combinations that do not occur with 0,
# and join the original `day` column back on.
mydf %>%
  mutate(var = 1) %>%
  # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
  # the dummy columns in sorted order, as spread() produced them.
  pivot_wider(
    names_from = day,
    values_from = var,
    values_fill = 0,
    names_prefix = "day_",
    names_sort = TRUE
  ) %>%
  # `day` is consumed by the reshape; the natural join on the remaining
  # shared column(s) brings it back.
  left_join(mydf) %>%
  select(x, day, everything())

And with base R we could do something like:

# Cross-tabulate x against day and turn the contingency table into a data
# frame of counts. rep()/lengths()/unlist() keep the two vectors aligned;
# presumably this is so a `day` holding several values per row also works.
with(
  mydf,
  as.data.frame.matrix(table(rep(x, lengths(day)), unlist(day)))
)

Returns:

  Fri Mon Sat Sun Thurs Tues Wed
a   0   1   0   0     0    0   0
b   0   0   0   0     0    1   0
c   0   0   0   0     0    0   1
d   0   0   0   0     1    0   0
e   1   0   0   0     0    0   0
f   0   0   1   0     0    0   0
g   0   0   0   1     0    0   0
h   1   0   0   0     0    0   0
i   0   1   0   0     0    0   0

use dplyr to create dummy variables

Try mutate:

> y <- data.frame(var1 = (-2):2)
> y %>% mutate(var2 = as.numeric(var1 > 0.5))
  var1 var2
1   -2    0
2   -1    0
3    0    0
4    1    1
5    2    1

Update: dplyr now uses %>% in place of %.%

create a dummy variable (using mutate) based on a pattern in a character string

This works:

library(stringr)
# Label each row by whether its Species name contains the letter "v".
iris %>%
  mutate(anyV = ifelse(str_detect(Species, "v"), "withV", "noV"))

    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species  anyV
1            5.1         3.5          1.4         0.2     setosa   noV
2            4.9         3.0          1.4         0.2     setosa   noV
3            4.7         3.2          1.3         0.2     setosa   noV
4            4.6         3.1          1.5         0.2     setosa   noV
5            5.0         3.6          1.4         0.2     setosa   noV
...
52           6.4         3.2          4.5         1.5 versicolor withV
53           6.9         3.1          4.9         1.5 versicolor withV
54           5.5         2.3          4.0         1.3 versicolor withV
55           6.5         2.8          4.6         1.5 versicolor withV
56           5.7         2.8          4.5         1.3 versicolor withV
57           6.3         3.3          4.7         1.6 versicolor withV
58           4.9         2.4          3.3         1.0 versicolor withV
59           6.6         2.9          4.6         1.3 versicolor withV

An alternative to nested ifelse statements:

# Conditions are checked top to bottom and the first match wins; the final
# TRUE branch keeps the species name for rows that match no pattern.
# Species is referenced by bare name rather than `.$Species`, so each
# condition is evaluated on the rows mutate() is currently working on
# (this matters for grouped data, where `.$Species` would be the full column).
iris %>%
  mutate(
    newVar = case_when(
      str_detect(Species, "se") ~ "group1",
      str_detect(Species, "ve") ~ "group2",
      str_detect(Species, "vi") ~ "group3",
      TRUE ~ as.character(Species)
    )
  )

Mutate dummy variable with observation before and after

Here is an approach using dplyr:

library(dplyr)

# For each title, flag whether the summed volume is positive in every v1
# period. The result stays grouped by title.
df1 %>%
  group_by(title, v1) %>%
  # helper column: total volume per title and v1 value
  mutate(period_volume = sum(volume)) %>%
  group_by(title) %>%
  mutate(dummy_volume = all(period_volume > 0)) %>%
  # the helper column is no longer needed
  select(-period_volume)

# A tibble: 15 x 5
# Groups:   title [3]
   title   day volume    v1 dummy_volume
   <fct> <dbl>  <dbl> <dbl> <lgl>       
 1 x         0      0     0 FALSE       
 2 x         1      0     0 FALSE       
 3 x         2      1     1 FALSE       
 4 x         3      1     1 FALSE       
 5 x         4      2     1 FALSE       
 6 y         0      3     0 FALSE       
 7 y         1      0     1 FALSE       
 8 y         2      0     1 FALSE       
 9 y         3      0     1 FALSE       
10 y         4      0     1 FALSE       
11 z         0      3     0 TRUE        
12 z         1      3     0 TRUE        
13 z         2      4     0 TRUE        
14 z         3      2     1 TRUE        
15 z         4      1     1 TRUE

With the Dummy coded as 0/1 as in your desired output:

# Same computation, with the flag stored as integer 0/1 instead of
# FALSE/TRUE. The result stays grouped by title.
df1 %>%
  group_by(title, v1) %>%
  # helper column: total volume per title and v1 value
  mutate(period_volume = sum(volume)) %>%
  group_by(title) %>%
  mutate(dummy_volume = as.integer(all(period_volume > 0))) %>%
  # the helper column is no longer needed
  select(-period_volume)

# A tibble: 15 x 5
# Groups:   title [3]
   title   day volume    v1 dummy_volume
   <fct> <dbl>  <dbl> <dbl>        <int>
 1 x         0      0     0            0
 2 x         1      0     0            0
 3 x         2      1     1            0
 4 x         3      1     1            0
 5 x         4      2     1            0
 6 y         0      3     0            0
 7 y         1      0     1            0
 8 y         2      0     1            0
 9 y         3      0     1            0
10 y         4      0     1            0
11 z         0      3     0            1
12 z         1      3     0            1
13 z         2      4     0            1
14 z         3      2     1            1
15 z         4      1     1            1

creating a conditional dummy variable using dplyr and ifelse statements in R

You can use any() to check whether any value of manu_GDP within a country is greater than 20.

library(dplyr)

# One row per country: 1 if any manu_GDP value exceeds 20, otherwise 0.
# na.rm = TRUE makes any() ignore missing manu_GDP values.
df %>%
  group_by(country) %>%
  summarise(
    new_dummy = as.integer(any(manu_GDP > 20, na.rm = TRUE))
  )

If you want to maintain the number of rows in the data use mutate instead of summarise.

Creating dummy variables as counts using tidyverse/dplyr

This uses reshape2, but you could use pretty much any package that lets you reshape data from long to wide:

    library(reshape2)
    # Count how often each FRUIT occurs per ID (one row per ID, one column per fruit).
    df <- dcast(fruitData, ID ~ FRUIT, fun.aggregate = length)
   
    > df
    ID apple banana grape
  1  1     2      1     0
  2  2     1      0     1
  3  3     1      0     0

Transforming dummy variables to single column in R

We could first define a custom function that allows us to recode dummy variables based on their variable names, below called var_nm2value.

This function takes the values of the variables as its x argument. In dplyr::across this is the .x part. It also takes a list of name-value pairs as its values_ls argument. The function loops over the list of name-value pairs and checks whether a name in values_ls is found in the variable name. To do this it uses grepl on dplyr::cur_column(). If we have a match, we replace all 1s with the corresponding value from values_ls and return all other values, that is the zeros, as is.

Then we can define a list of recode values, below recode_ls.

Finally, we use purrr::map_dfc in a dplyr::summarise where we i) use the variable strings we want to create, "age" and "chol_test", then ii) select only the columns which contain this string, and in each iteration we iii) apply dplyr::across to recode the values, iv) pipe the result into a do.call to get the max, and finally v) recode 0s to NA:

# Custom function to recode 0/1 dummy variables based on their variable name.
#
# Must be called from inside dplyr::across(), because it reads the name of the
# column currently being processed via dplyr::cur_column().
#
# Arguments:
#   x          Values of the current column (the `.x` of across()).
#   values_ls  Named list of recode values. Each name is used as a grepl()
#              pattern against the current column name.
#
# Returns `x` with every 1 replaced by the value of the first entry of
# `values_ls` whose name matches the column name; all other values (the 0s)
# are returned as is. If no name matches, `x` is returned unchanged instead
# of the function silently falling through and returning NULL.
var_nm2value <- function(x, values_ls) {
  # the column name does not change while looping, so look it up once
  col_nm <- dplyr::cur_column()
  for (val in seq_along(values_ls)) {
    if (grepl(names(values_ls)[val], col_nm)) {
      return(ifelse(x == 1L, values_ls[[val]], x))
    }
  }
  x
}

# Recode values, keyed by the level name that appears in the dummy column names
recode_ls <- list(
  low = 1,
  medium = 2,
  high = 3
)

library(tidyverse)

# Apply the functions to the data.frame.
# race and gender are passed through as they are. For each of the strings
# "age" and "chol_test", map_dfc() builds one new column of that name:
#   i)   iterate over the (self-named) target variable names,
#   ii)  keep only the columns whose name contains that string,
#   iii) recode the 1s of every selected column with var_nm2value() and
#        recode_ls,
#   iv)  take the element-wise maximum across the recoded columns (pmax),
#   v)   turn remaining 0s into NA.
# NOTE(review): the `.` in select(., contains(x)) appears to refer to the data
# piped into the outer summarise() (df1) -- confirm.
# NOTE(review): relies on summarise() returning one row per input row and on
# across() forwarding extra arguments to the function; both may be deprecated
# in newer dplyr releases -- verify against the installed version.
df1 %>% 
  summarise(race = race,
            gender = gender,
            map_dfc(set_names(c("age", "chol_test")), # i)
                    function(x) { 
                      select(., contains(x)) %>% # ii)
                        summarise("{x}" := across(everything(), var_nm2value, recode_ls) %>% # iii)
                                    do.call("pmax", .) %>% # iv) 
                                    ifelse(. == 0, NA, .))} # v)
            )) 

#>    race gender age chol_test
#> 1 white      0   1        NA
#> 2 white      0   1        NA
#> 3 white      1   1        NA
#> 4 black      1   2        NA
#> 5 white      0   3         3
#> 6 black      0   2         1

^{Created on 2022-01-03 by the reprex package (v0.3.0)}

Using dplyr to gather dummy variables

This can be done using the 'tidyverse' library - specifically 'tidyr' and 'dplyr'. The following code produces the output you are after.

library(tidyverse)
# Stack all columns into (TypeOfCar, Count) pairs, keep the pairs with a count
# of at least 1, and return only the type names.
type %>%
  gather(TypeOfCar, Count) %>%
  filter(Count >= 1) %>%
  select(TypeOfCar)

Output:

   TypeOfCar
    <chr>
1 convertible
2 convertible
3 convertible
4 convertible
5       coupe
6       sedan

Hopefully this solves your problem, let me know if any changes are needed! Thanks.

How to generate a set of dummy variables dependent on values in several other columns with same prefix in R?

Does this help you?

library(tidyverse)

# Example data: each document carries its keywords in the K* columns
document <- data.frame(
  stringsAsFactors = FALSE,
  ID = c(1L, 2L, 3L, 4L),
  Name = c("Contract XYZ","Agreement ABC",
           "Document 123","Empty Space"),
  Year = c(2000L, 2003L, 2003L, 2004L),
  K1 = c("transport", "pens", "elephants", "music"),
  K2 = c("elephants", "music", NA, NA),
  K50 = c(NA, NA, NA, "transport")
)

# Stack the K* columns, drop the empty slots, mark every remaining
# document/keyword pair with 1 and spread the keywords into columns.
# values_fill = 0 gives keywords a document does not have a 0 instead of NA,
# so the new columns are proper 0/1 dummies.
document %>%
  pivot_longer(starts_with("K")) %>%
  select(-name) %>%
  filter(!is.na(value)) %>%
  mutate(has_property = 1) %>%
  pivot_wider(
    names_from = value,
    values_from = has_property,
    values_fill = 0
  )
#> # A tibble: 4 x 7
#>      ID Name           Year transport elephants  pens music
#>   <int> <chr>         <int>     <dbl>     <dbl> <dbl> <dbl>
#> 1     1 Contract XYZ   2000         1         1     0     0
#> 2     2 Agreement ABC  2003         0         0     1     1
#> 3     3 Document 123   2003         0         1     0     0
#> 4     4 Empty Space    2004         1         0     0     1

^{Created on 2021-09-21 by the reprex package (v2.0.1)}

Mutating Dummy Variables in Dplyr