How to Tidy This Dataset

How to tidy this dataset?

With:

library(tidyr)
# Stack the month columns (may through october) into month/val pairs, then
# spread the topic values back out so each topic becomes its own column.
df %>%
  gather(key = month, value = val, may:october) %>%
  spread(key = topic, value = val)

you get:

  user_id     month bark harp talk walk
1 192775 august 0 0 2 146
2 192775 july 0 0 0 128
3 192775 june 0 0 0 123
4 192775 may 0 0 2 165
5 192775 october 0 1 1 105
6 192775 september 0 0 2 113

Another option is to use recast() from the reshape2 package:

library(reshape2)
# Melt with user_id and topic as id variables, then cast so that each topic
# becomes a column and each melted column name (held in `variable`) a row.
recast(
  data = df,
  formula = user_id + variable ~ topic,
  id.var = c("user_id", "topic")
)

How to tidy dataset where columns are dummy variables and cell values are names of observations?

library(tidyr)

# Reshape to one row per (column name, cell value) pair, discard the empty
# cells, then widen again: each distinct cell value becomes a row and each
# original column becomes a 0/1 indicator.
df %>%
  pivot_longer(cols = everything()) %>%
  drop_na(value) %>%
  pivot_wider(
    names_from = name,
    values_from = name,
    # Any column name that is present is replaced by the constant 1 ...
    values_fn = list(name = ~1),
    # ... and combinations that never occur are filled with 0.
    values_fill = list(name = 0)
  )

# # A tibble: 4 x 4
# value Var1 Var2 Var3
# <chr> <dbl> <dbl> <dbl>
# 1 Name1 1 1 0
# 2 Name2 1 0 1
# 3 Name3 0 1 0
# 4 Name4 1 1 0

Description

  • values_fn = list(name = ~ 1): convert strings to 1

  • values_fill = list(name = 0): specify 0 to be filled when missing

Data

# Example input: three character columns whose cells hold observation names,
# with NA where a column has no entry.
df <- data.frame(
  Var1 = c("Name1", "Name2", "Name4"),
  Var2 = c("Name1", "Name3", "Name4"),
  Var3 = c("Name2", NA, NA),
  stringsAsFactors = FALSE
)

How to tidy a data set with a column containing multiple pieces of information (sample data provided)?

Untidy data can be a challenge. Here is a tidyverse approach.

First, add a d_var column holding the proposed names d1, d2, and no. This assumes the rows repeat in that order.

Column Farmer.Name is separated into two columns, by :.

The Name itself is separated before the word Gender.

fill allows for common values to be filled in for the same individual (such as v1, adress, amount, and Name).

pivot_wider is done to spread the data wide, first, by d1, d2, and no, and then by the other columns including Gender, farmer_type, and farmer_category.

library(tidyverse)

# Label the rows, split the packed Farmer.Name column into name/value pairs,
# and fill the per-person columns down so every row carries its identifiers.
df1 <- mydata %>%
  # Rows are assumed to repeat in d1, d2, no order. The repeat count is derived
  # from the row count rather than hard-coded, so any multiple of three rows
  # works; other row counts still fail with a length mismatch.
  mutate(d_var = rep(c("d1", "d2", "no"), times = n() / 3)) %>%
  # "key: value" -> Var (key, possibly prefixed by the name) and Val (value).
  separate(Farmer.Name, into = c("Var", "Val"), sep = ":") %>%
  # Split in front of the word "Gender"; rows with no split point get
  # Name = NA (fill = "left") and keep their text in Var.
  separate(Var, into = c("Name", "Var"), sep = "(?=Gender)", fill = "left") %>%
  mutate(across(c(Name, Var), trimws)) %>%
  fill(v1, adress, amount, Name, .direction = "down") %>%
  mutate(Var = gsub(" ", "_", Var))

# Widen twice over the same identifier columns (once by d_var, once by Var)
# and join the two results on those identifiers.
df1 %>%
  pivot_wider(
    id_cols = c(v1, Name, adress, amount),
    names_from = d_var,
    values_from = d1..d2..no
  ) %>%
  left_join(
    pivot_wider(
      df1,
      id_cols = c(v1, Name, adress, amount),
      names_from = Var,
      values_from = Val
    ),
    # Explicit keys: the same columns the implicit natural join would pick.
    by = c("v1", "Name", "adress", "amount")
  )

Output

# A tibble: 3 x 10
v1 Name adress amount d1 d2 no Gender farmer_type farmer_category
<dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 S Jacob k11 25 27/01/2020 43832 KE004421 male "marginal" general
2 2 J Isac k12 25 43832 43832 KE003443 Female " large" general
3 3 P Kumar k13 32 31/12/2019 43832 KE0001512 Male "small" general

Tidying dataset by gathering multiple columns?

With melt from data.table (see ?patterns):

library(data.table)

# setDT() converts df to a data.table by reference (df itself is changed).
# Each patterns() regex selects one family of measure columns; the families
# are melted side by side into the columns named in value.name.
melt(
  setDT(df),
  # Full argument name instead of relying on partial matching of `measure`.
  measure.vars = patterns("^qID", "^time_taken"),
  value.name = c("qID", "time_taken")
)

Result:

   age gender     education previous_comp_exp tutorial_time variable  qID time_taken
1: 18 Male Undergraduate casual_gamer 62.17926 1 sor9 39.61206
2: 24 Male Undergraduate casual_gamer 85.01288 1 sor9 50.92343
3: 18 Male Undergraduate casual_gamer 62.17926 2 sor8 19.48920
4: 24 Male Undergraduate casual_gamer 85.01288 2 sor8 16.15616

or with tidyr:

library(dplyr)
library(tidyr)

df %>%
  # Stack the columns from qID.1 through time_taken.2 into variable/value pairs.
  gather(key = variable, value = value, qID.1:time_taken.2) %>%
  # Strip the trailing ".<digit>" so repeated measures share one variable name.
  mutate(variable = sub("\\.\\d$", "", variable)) %>%
  # Number the rows within each variable so spread() gets a unique row key.
  group_by(variable) %>%
  mutate(ID = row_number()) %>%
  # convert = TRUE re-infers the column types after spreading.
  spread(key = variable, value = value, convert = TRUE) %>%
  select(-ID)

Result:

# A tibble: 4 x 7
age gender education previous_comp_exp tutorial_time qID time_taken
<int> <fctr> <fctr> <fctr> <dbl> <chr> <dbl>
1 18 Male Undergraduate casual_gamer 62.17926 sor9 39.61206
2 18 Male Undergraduate casual_gamer 62.17926 sor8 19.48920
3 24 Male Undergraduate casual_gamer 85.01288 sor9 50.92343
4 24 Male Undergraduate casual_gamer 85.01288 sor8 16.15616

Note:

For the tidyr method, convert=TRUE is used to convert time_taken back to numeric, since it was coerced to character when gathered with the qID columns.

Data:

# Example input: two rows, each with two repeated (qID, time_taken) pairs
# stored in the suffixed columns *.1 and *.2.
df <- data.frame(
  age = c(18L, 24L),
  gender = factor(c("Male", "Male")),
  education = factor(c("Undergraduate", "Undergraduate")),
  previous_comp_exp = factor(c("casual_gamer", "casual_gamer")),
  tutorial_time = c(62.17926, 85.01288),
  qID.1 = factor(c("sor9", "sor9")),
  time_taken.1 = c(39.61206, 50.92343),
  qID.2 = factor(c("sor8", "sor8")),
  time_taken.2 = c(19.4892, 16.15616)
)

Make tidy tibble dataset with matrix variable

As perhaps half of an answer which would at least mean you don't have to repeat the typing of exp, exp2... up to exp50 etc., you could pivot_longer before joining and pivot_wider after nesting to give the right number of list columns. These can then be plucked out into matrix columns by pattern matching column names:

library(tidyverse)

# Simulated people: 50 IDs, each with a 0/1 outcome, a distinct date drawn
# from 5:145, and a min_date five units before that date.
person_data <-
  tibble(
    ID = paste0("ID", 1:50),
    outcome = sample(0:1, 50, replace = TRUE),
    date = sample(5:145, 50, replace = FALSE),
    min_date = date - 5
  )

# Simulated values: one row per date 1..150 with two value columns.
value_data <-
  tibble(
    date = seq_len(150),
    exp1 = sample(20:100, 150, replace = TRUE),
    exp2 = sample(20:100, 150, replace = TRUE)
  )

# Pair each person with every long-format value row whose date lies between
# that person's min_date and date (inclusive).
merged <-
fuzzyjoin::fuzzy_left_join(
person_data,
# One row per (date, exp, val) so the exp columns need not be listed by name.
value_data |> pivot_longer(-date, names_to = "exp", values_to = "val"),
# Conditions are applied pairwise: person date >= value date, and
# person min_date <= value date.
by = c("date" = "date", "min_date" = "date"),
match_fun = c(`>=`, `<=`)
)

# Drop the value-side date, nest the val entries for each ID/exp pair, spread
# exp into one list-column per series, then turn each nested entry into a
# one-row matrix.
merged |>
select(-date.y) |>
group_by(ID, exp) |>
nest(val = val) |>
pivot_wider(names_from = exp,
values_from = val) |>
rowwise() |>
# as.matrix() on the val vector gives a single column; t() makes it one row.
mutate(across(starts_with("exp"), ~ t(as.matrix(.x$val))))
#> # A tibble: 50 × 6
#> # Rowwise: ID
#> ID outcome date.x min_date exp1[,1] [,2] [,3] [,4] [,5] [,6] exp2[,1]
#> <chr> <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
#> 1 ID1 0 118 113 96 83 69 59 42 34 60
#> 2 ID2 1 9 4 38 91 83 28 87 73 96
#> 3 ID3 1 83 78 49 31 33 84 57 50 83
#> 4 ID4 1 97 92 91 92 21 53 59 39 23
#> 5 ID5 1 71 66 97 28 56 91 67 43 98
#> 6 ID6 1 27 22 81 88 41 22 24 84 36
#> 7 ID7 1 64 59 46 51 88 76 39 63 53
#> 8 ID8 0 72 67 28 56 91 67 43 55 23
#> 9 ID9 0 80 75 76 79 62 49 31 33 66
#> 10 ID10 0 87 82 57 50 31 72 95 31 66
#> # … with 40 more rows, and 1 more variable: exp2[2:6] <int>

Created on 2022-08-10 by the reprex package (v2.0.1)

As noted, it's not fully answering your question, but would at least get you to the half-way-acceptable stage you'd reached when producing analysis_data.

The things remaining would be:

  1. is this where you expect your exp1 etc. variables to go?
  2. it's not clear where your index values would come from for each row?

Is there a cleaner way to tidy a list of data frames?

You can use dplyr::bind_rows with .id parameter:

.id Data frame identifier.

When .id is supplied, a new column of
identifiers is created to link each row to its original data frame.
The labels are taken from the named arguments to bind_rows(). When a
list of data frames is supplied, the labels are taken from the names
of the list.
If no names are found a numeric sequence is used instead.

# Stack the data frames in obs and record each row's source in a new "item"
# column. Namespace-qualified because this snippet does not attach dplyr.
dplyr::bind_rows(obs, .id = "item")

# A tibble: 6 x 5
# item date X Y Z
# <chr> <date> <dbl> <dbl> <dbl>
#1 obsA 2009-01-01 -1.73508885 -0.4402811 7.342978
#2 obsA 2009-01-02 1.17149983 -0.5429690 8.167079
#3 obsA 2009-01-03 0.08631895 -0.1430551 5.925108
#4 obsB 2009-01-01 9.66203430 7.1094147 15.577023
#5 obsB 2009-01-02 10.43062660 9.6160614 15.077929
#6 obsB 2009-01-03 8.80792988 8.9604396 7.413831


Related Topics



Leave a reply



Submit