Reshape from long to wide and create columns with binary value
Using reshape2
we can dcast
from long to wide.
As you only want a binary outcome we can unique
the data first
library(reshape2)
si <- unique(studentInfo)
dcast(si, formula = StudentID ~ Subject, fun.aggregate = length)
# StudentID English History Maths Science
#1 1 1 0 1 1
#2 2 0 0 1 0
#3 3 0 1 0 0
Another approach using tidyr
and dplyr
is
library(tidyr)
library(dplyr)
studentInfo %>%
mutate(yesno = 1) %>%
distinct %>%
spread(Subject, yesno, fill = 0)
# StudentID English History Maths Science
#1 1 1 0 1 1
#2 2 0 0 1 0
#3 3 0 1 0 0
Although I'm not a fan (yet) of tidyr
syntax...
reordering my reshape: long to wide with pivot_wider, different column order
It may be easier with names_glue
in pivot_wider
library(dplyr)
library(tidyr)
df %>%
pivot_wider(names_from = Set, values_from = c(Street, State),
names_glue = "{tools::toTitleCase(Set)}_{.value}") %>%
dplyr::select(ID, Name, order(readr::parse_number(names(.)[-(1:2)])) + 2)
-output
# A tibble: 3 × 8
ID Name Set1_Street Set1_State Set2_Street Set2_State Set3_Street Set3_State
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 A mary 123 St al 234 St nc <NA> <NA>
2 B berry 543 St fl 492 st ca 231 st md
3 C paul 492 st tx 231 st vt <NA> <NA>
Pandas long to wide reshape, by two variables
A simple pivot might be sufficient for your needs but this is what I did to reproduce your desired output:
df['idx'] = df.groupby('Salesman').cumcount()
Just adding a within group counter/index will get you most of the way there but the column labels will not be as you desired:
print df.pivot(index='Salesman',columns='idx')[['product','price']]
product price
idx 0 1 2 0 1 2
Salesman
Knut bat ball wand 5 1 3
Steve pen NaN NaN 2 NaN NaN
To get closer to your desired output I added the following:
df['prod_idx'] = 'product_' + df.idx.astype(str)
df['prc_idx'] = 'price_' + df.idx.astype(str)
product = df.pivot(index='Salesman',columns='prod_idx',values='product')
prc = df.pivot(index='Salesman',columns='prc_idx',values='price')
reshape = pd.concat([product,prc],axis=1)
reshape['Height'] = df.set_index('Salesman')['Height'].drop_duplicates()
print reshape
product_0 product_1 product_2 price_0 price_1 price_2 Height
Salesman
Knut bat ball wand 5 1 3 6
Steve pen NaN NaN 2 NaN NaN 5
Edit: if you want to generalize the procedure to more variables I think you could do something like the following (although it might not be efficient enough):
df['idx'] = df.groupby('Salesman').cumcount()
tmp = []
for var in ['product','price']:
df['tmp_idx'] = var + '_' + df.idx.astype(str)
tmp.append(df.pivot(index='Salesman',columns='tmp_idx',values=var))
reshape = pd.concat(tmp,axis=1)
@Luke said:
I think Stata can do something like this with the reshape command.
You can but I think you also need a within group counter to get the reshape in stata to get your desired output:
+-------------------------------------------+
| salesman idx height product price |
|-------------------------------------------|
1. | Knut 0 6 bat 5 |
2. | Knut 1 6 ball 1 |
3. | Knut 2 6 wand 3 |
4. | Steve 0 5 pen 2 |
+-------------------------------------------+
If you add idx
then you could do reshape in stata
:
reshape wide product price, i(salesman) j(idx)
Reshape categorical data in columns to binary data in rows using R
There is probably a shorter solution for this, but as a quick fix:
library(stringr)
library(dplyr)
library(reshape2)
#initial melt
df2 <- melt(df[-6], id.vars='team')
#split the variable column to get the 'away and home' column
df2 <- cbind(df2, str_split_fixed(df2$variable, "\\.", 2))
#create the wins column
df2$win <- ifelse(df2$`1` == 'wins',1,0)
#replicate the number of rows according to value
df2 <- df2[rep(1:nrow(df2), df2$value),]
#remove unnecessary columns
df2$variable <- df2$value <- df2$`1` <- NULL
#final group by to add the game column
df2 %>% group_by(team) %>% mutate(game = 1:n()) %>% arrange(team)
Output:
Source: local data frame [18 x 4]
Groups: team [4]
team 2 win game
(fctr) (fctr) (dbl) (int)
1 a away 1 1
2 a away 0 2
3 a away 0 3
4 b home 1 1
5 b away 1 2
6 b home 0 3
7 b away 0 4
#...and so on
From long (with multiple categories) to wide format
We can do this by splitting the 'type' at the delimiter with separate_rows
, create an column of 1s. and 'spread` it to 'wide' format
library(tidyverse)
dataset_long %>%
separate_rows(type) %>%
mutate(type = paste0('type_', type)) %>%
mutate(ind = 1) %>%
spread(type, ind, fill = 0)
# id type_A type_B type_C
#1 1 1 0 0
#2 2 1 1 0
#3 3 1 0 1
#4 4 0 0 1
#5 5 1 0 0
Or using strsplit
and table
from base R
table(stack(setNames(strsplit(as.character(dataset_long$type),
', '), dataset_long$id))[2:1])
Or in a single line with mtabulate
library(qdapTools)
mtabulate(setNames(strsplit(as.character(dataset_long$type), ', '), dataset_long$id))
Reshape multiple categorical variables to binary response variables
How much spice is too much? Here is a solution via tidyr
:
library(dplyr)
library(tidyr)
mydata %>%
gather(actor,name,starts_with("actor")) %>%
mutate(present = 1) %>%
select(-actor) %>%
spread(name,present,fill = 0)
movie Jack Kate Leo
1 Departed 1 0 1
2 Titanic 0 1 1
reshape wide long with key specifying levels of grouping variable
We can use pivot_longer
library(dplyr)
library(tidyr)
library(stringr)
v1 <- c("part.1", "part.2", "part.3")
pat <- sprintf("^(outcome\\d*).*(%s).*$", str_c(v1, collapse="|"))
df1 %>%
pivot_longer(cols = starts_with('outcome'),
names_to = c(".value", "strata"),
names_pattern = pat)
-output
# A tibble: 21 × 5
group technique strata outcome1 outcome2
<chr> <chr> <chr> <int> <int>
1 A "attack" part.1 24 14
2 A "attack" part.2 52 234
3 A "attack" part.3 14 234
4 B "defenese" part.1 1234 234
5 B "defenese" part.2 321 234
6 B "defenese" part.3 23 234
7 A "attack " part.1 15 145
8 A "attack " part.2 23 234
9 A "attack " part.3 3 234
10 B "defense " part.1 234 4
# … with 11 more rows
NOTE: There was a typo in the column names i.e. third column should be
names(df1)[3] <- 'outcome1.part.1'
Convert Duplicated data from long to wide, while preserving key-value pairs in R
Two pivots do it:
library(dplyr)
library(tidyr) # pivot_*
df2 %>%
pivot_longer(-id, names_pattern = "(.*?)([0-9]+)", names_to = c(".value", "iter")) %>%
group_by(id) %>%
mutate(iter = row_number()) %>%
pivot_wider(id, names_from = "iter", values_from = c("key", "value"), names_sep = "") %>%
ungroup()
# # A tibble: 3 x 31
# id key1 key2 key3 key4 key5 key6 key7 key8 key9 key10 key11 key12 key13 key14 key15 value1 value2 value3 value4 value5 value6 value7 value8 value9 value10 value11 value12 value13 value14 value15
# <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 23 45 89 786 10 NA NA NA NA NA NA NA NA NA NA
# 2 2 UJD02 JFF00 <NA> <NA> <NA> UJD05 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> NA NA NA 670 NA 11 25 89 12 51 NA NA NA NA NA
# 3 3 UJD05 TPX10 DV071 <NA> <NA> KFC10 DR036 UGC12 UEN05 XXA00 DR036 UJD05 JFF00 TPX10 <NA> 45 23 103 987 29 67 NA 6700 67 15 1 78 50 14 3
Related Topics
Why Does As.Factor Return a Character When Used Inside Apply
What Is Integer Overflow in R and How Can It Happen
Displaying a Greater Than or Equal Sign
Subset Based on Variable Column Name
Get All Diagonal Vectors from Matrix
Setting Function Defaults R on a Project Specific Basis
Assigning Dates to Fiscal Year
Create End of the Month Date from a Date Variable
Speed Up Plot() Function for Large Dataset
How to Plot a Hybrid Boxplot: Half Boxplot with Jitter Points on the Other Half
Print Unicode Character String in R