Reshape from Long to Wide and Create Columns with Binary Value

Reshape from long to wide and create columns with binary value

Using reshape2 we can dcast from long to wide.

As you only want a binary outcome we can unique the data first

library(reshape2)

si <- unique(studentInfo)
dcast(si, formula = StudentID ~ Subject, fun.aggregate = length)

# StudentID English History Maths Science
#1 1 1 0 1 1
#2 2 0 0 1 0
#3 3 0 1 0 0

Another approach using tidyr and dplyr is

library(tidyr)
library(dplyr)

studentInfo %>%
mutate(yesno = 1) %>%
distinct %>%
spread(Subject, yesno, fill = 0)

# StudentID English History Maths Science
#1 1 1 0 1 1
#2 2 0 0 1 0
#3 3 0 1 0 0

Although I'm not a fan (yet) of tidyr syntax...

reordering my reshape: long to wide with pivot_wider, different column order

It may be easier with names_glue in pivot_wider

library(dplyr)
library(tidyr)
df %>%
pivot_wider(names_from = Set, values_from = c(Street, State),
names_glue = "{tools::toTitleCase(Set)}_{.value}") %>%
dplyr::select(ID, Name, order(readr::parse_number(names(.)[-(1:2)])) + 2)

-output

# A tibble: 3 × 8
ID Name Set1_Street Set1_State Set2_Street Set2_State Set3_Street Set3_State
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 A mary 123 St al 234 St nc <NA> <NA>
2 B berry 543 St fl 492 st ca 231 st md
3 C paul 492 st tx 231 st vt <NA> <NA>

Pandas long to wide reshape, by two variables

A simple pivot might be sufficient for your needs but this is what I did to reproduce your desired output:

df['idx'] = df.groupby('Salesman').cumcount()

Just adding a within group counter/index will get you most of the way there but the column labels will not be as you desired:

print df.pivot(index='Salesman',columns='idx')[['product','price']]

product price
idx 0 1 2 0 1 2
Salesman
Knut bat ball wand 5 1 3
Steve pen NaN NaN 2 NaN NaN

To get closer to your desired output I added the following:

df['prod_idx'] = 'product_' + df.idx.astype(str)
df['prc_idx'] = 'price_' + df.idx.astype(str)

product = df.pivot(index='Salesman',columns='prod_idx',values='product')
prc = df.pivot(index='Salesman',columns='prc_idx',values='price')

reshape = pd.concat([product,prc],axis=1)
reshape['Height'] = df.set_index('Salesman')['Height'].drop_duplicates()
print reshape

product_0 product_1 product_2 price_0 price_1 price_2 Height
Salesman
Knut bat ball wand 5 1 3 6
Steve pen NaN NaN 2 NaN NaN 5

Edit: if you want to generalize the procedure to more variables I think you could do something like the following (although it might not be efficient enough):

df['idx'] = df.groupby('Salesman').cumcount()

tmp = []
for var in ['product','price']:
df['tmp_idx'] = var + '_' + df.idx.astype(str)
tmp.append(df.pivot(index='Salesman',columns='tmp_idx',values=var))

reshape = pd.concat(tmp,axis=1)

@Luke said:

I think Stata can do something like this with the reshape command.

You can but I think you also need a within group counter to get the reshape in stata to get your desired output:

     +-------------------------------------------+
| salesman idx height product price |
|-------------------------------------------|
1. | Knut 0 6 bat 5 |
2. | Knut 1 6 ball 1 |
3. | Knut 2 6 wand 3 |
4. | Steve 0 5 pen 2 |
+-------------------------------------------+

If you add idx then you could do reshape in stata:

reshape wide product price, i(salesman) j(idx)

Reshape categorical data in columns to binary data in rows using R

There is probably a shorter solution for this, but as a quick fix:

library(stringr)
library(dplyr)
library(reshape2)

#initial melt
df2 <- melt(df[-6], id.vars='team')
#split the variable column to get the 'away and home' column
df2 <- cbind(df2, str_split_fixed(df2$variable, "\\.", 2))
#create the wins column
df2$win <- ifelse(df2$`1` == 'wins',1,0)
#replicate the number of rows according to value
df2 <- df2[rep(1:nrow(df2), df2$value),]
#remove unnecessary columns
df2$variable <- df2$value <- df2$`1` <- NULL
#final group by to add the game column
df2 %>% group_by(team) %>% mutate(game = 1:n()) %>% arrange(team)

Output:

Source: local data frame [18 x 4]
Groups: team [4]

team 2 win game
(fctr) (fctr) (dbl) (int)
1 a away 1 1
2 a away 0 2
3 a away 0 3
4 b home 1 1
5 b away 1 2
6 b home 0 3
7 b away 0 4
#...and so on

From long (with multiple categories) to wide format

We can do this by splitting the 'type' at the delimiter with separate_rows, create an column of 1s. and 'spread` it to 'wide' format

library(tidyverse)
dataset_long %>%
separate_rows(type) %>%
mutate(type = paste0('type_', type)) %>%
mutate(ind = 1) %>%
spread(type, ind, fill = 0)
# id type_A type_B type_C
#1 1 1 0 0
#2 2 1 1 0
#3 3 1 0 1
#4 4 0 0 1
#5 5 1 0 0

Or using strsplit and table from base R

table(stack(setNames(strsplit(as.character(dataset_long$type),
', '), dataset_long$id))[2:1])

Or in a single line with mtabulate

library(qdapTools)
mtabulate(setNames(strsplit(as.character(dataset_long$type), ', '), dataset_long$id))

Reshape multiple categorical variables to binary response variables

How much spice is too much? Here is a solution via tidyr:

library(dplyr)
library(tidyr)

mydata %>%
gather(actor,name,starts_with("actor")) %>%
mutate(present = 1) %>%
select(-actor) %>%
spread(name,present,fill = 0)

movie Jack Kate Leo
1 Departed 1 0 1
2 Titanic 0 1 1

reshape wide long with key specifying levels of grouping variable

We can use pivot_longer

library(dplyr)
library(tidyr)
library(stringr)
v1 <- c("part.1", "part.2", "part.3")
pat <- sprintf("^(outcome\\d*).*(%s).*$", str_c(v1, collapse="|"))
df1 %>%
pivot_longer(cols = starts_with('outcome'),
names_to = c(".value", "strata"),
names_pattern = pat)

-output

# A tibble: 21 × 5
group technique strata outcome1 outcome2
<chr> <chr> <chr> <int> <int>
1 A "attack" part.1 24 14
2 A "attack" part.2 52 234
3 A "attack" part.3 14 234
4 B "defenese" part.1 1234 234
5 B "defenese" part.2 321 234
6 B "defenese" part.3 23 234
7 A "attack " part.1 15 145
8 A "attack " part.2 23 234
9 A "attack " part.3 3 234
10 B "defense " part.1 234 4
# … with 11 more rows

NOTE: There was a typo in the column names i.e. third column should be

names(df1)[3] <- 'outcome1.part.1'

Convert Duplicated data from long to wide, while preserving key-value pairs in R

Two pivots do it:

library(dplyr)
library(tidyr) # pivot_*
df2 %>%
pivot_longer(-id, names_pattern = "(.*?)([0-9]+)", names_to = c(".value", "iter")) %>%
group_by(id) %>%
mutate(iter = row_number()) %>%
pivot_wider(id, names_from = "iter", values_from = c("key", "value"), names_sep = "") %>%
ungroup()
# # A tibble: 3 x 31
# id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12 key13 key14 key15 value1 value2 value3 value4 value5 value6 value7 value8 value9 value10 value11 value12 value13 value14 value15
# <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 23 45 89 786 10 NA NA NA NA NA NA NA NA NA NA
# 2 2 UJD02 JFF00 <NA> <NA> <NA> UJD05 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> NA NA NA 670 NA 11 25 89 12 51 NA NA NA NA NA
# 3 3 UJD05 TPX10 DV071 <NA> <NA> KFC10 DR036 UGC12 UEN05 XXA00 DR036 UJD05 JFF00 TPX10 <NA> 45 23 103 987 29 67 NA 6700 67 15 1 78 50 14 3


Related Topics



Leave a reply



Submit