Merge Panel data to get balanced panel data
There's a function for that. Combine the data frames with rbind, then use complete(). It will look through the groups in variable and fill in any missing Month/variable combinations with NA values:
library(tidyr)
library(zoo)  # as.yearmon() comes from zoo; it was used below without being loaded

# Stack the two data frames, then use complete() to add every missing
# Month/variable combination (new rows are filled with NA).
df3 <- do.call(rbind.data.frame, list(df1, df2))
df3$Month <- as.character(df3$Month)  # complete() needs a plain vector, not yearmon
df4 <- complete(df3, Month, variable)
df4$Month <- as.yearmon(df4$Month, "%b %Y")  # restore the yearmon class
df5 <- df4[order(df4$variable, df4$Month), ]
df5
# Source: local data frame [72 x 8]
#
# Month variable Beta1 Beta2 Beta3 Beta4 Beta5 Beta6
# (yrmn) (fctr) (int) (int) (int) (int) (int) (int)
# 1 Jan 2005 A 1 2 3 4 5 6
# 2 Feb 2005 A 2 3 4 5 6 7
# 3 Mar 2005 A 3 4 5 6 7 8
# 4 Apr 2005 A 4 5 6 7 8 9
# 5 May 2005 A 5 6 7 8 9 10
# 6 Jun 2005 A 6 7 8 9 10 11
# 7 Jul 2005 A 7 8 9 10 11 12
# 8 Aug 2005 A 8 9 10 11 12 13
# 9 Sep 2005 A 9 10 11 12 13 14
# 10 Oct 2005 A 10 11 12 13 14 15
# .. ... ... ... ... ... ... ... ...
An alternative implementation with dplyr & tidyr:
library(dplyr)
library(tidyr)

# Stack the rows, then fill in every Month/variable combination.
stacked <- bind_rows(df1, df2)
df3 <- complete(stacked, Month, variable)
Merging uneven Panel Data frames in R
It would help if you posted your data (or a working subset of it) and a little more detail on how you are trying to merge, but if I understand what you are trying to do, you want each final data record to have individual stats for each player on a particular date followed by the player's team's stats for that date. In this case, you should have a Team column in the Player table that identifies the player's team, and then join the two tables on the composite key of Date and Team by setting the by= attribute in merge:
merge(PData, TData, by=c("Date", "Team"))
The fact that the data frames are of different lengths doesn't matter--this is exactly what join/merge operations are for.
For an alternative to merge(), you might check out the dplyr package join functions at https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
Transform into balanced panel data
We do a cross join (CJ) of the unique 'ID' and 'date' values of the dataset, after setting the key columns to 'ID' and 'date', and then join the result with the original dataset.
setDT(test, key = c("ID", "date"))[CJ(ID, date, unique=TRUE)]
# ID date nr namecol
# 1: A 2009-01-01 NA NA
# 2: A 2010-01-01 1 rdm
# 3: A 2010-01-10 2 dfg
# 4: A 2010-01-14 3 fdgfd
# 5: A 2010-02-15 4 fdgfd
# 6: A 2010-08-17 5 dg
# 7: A 2010-12-19 6 dfg
# 8: B 2009-01-01 1 dfg
# 9: B 2010-01-01 2 ydg
#10: B 2010-01-10 3 fdgfd
#11: B 2010-01-14 4 dfg
#12: B 2010-02-15 5 dfg
#13: B 2010-08-17 NA NA
#14: B 2010-12-19 NA NA
data
# Reconstruct the example data: an unbalanced panel with two IDs whose
# observation dates only partially overlap.
test <- data.frame(
  ID = rep(c("A", "B"), times = c(6L, 5L)),
  date = as.Date(c(
    "2010-01-01", "2010-01-10", "2010-01-14", "2010-02-15",
    "2010-08-17", "2010-12-19",
    "2009-01-01", "2010-01-01", "2010-01-10", "2010-01-14", "2010-02-15"
  )),
  nr = c(1:6, 1:5),
  namecol = c(
    "rdm", "dfg", "fdgfd", "fdgfd", "dg", "dfg",
    "dfg", "ydg", "fdgfd", "dfg", "dfg"
  )
)
How to Merge two Panel data sets on Date and a combination of columns?
It sounds like you're looking for the how keyword argument of pd.DataFrame.merge and pd.DataFrame.join.
Here is a sample:
import pandas as pd

# Individual stats per (Date, City, State); city names are mixed case.
df1 = pd.read_json(
    '{"Date":{"0":1583020800000,"1":1583020800000,"2":1583020800000,"3":1625097600000,"4":1625097600000,"5":1625097600000},"City":{"0":"Los Angeles","1":"Sacramento","2":"Houston","3":"Los Angeles","4":"Sacramento","5":"Houston"},"State":{"0":"CA","1":"CA","2":"TX","3":"CA","4":"CA","5":"TX"},"Population":{"0":5000000,"1":5400000,"2":3500000,"3":5000002,"4":5444000,"5":4443300},"Cases":{"0":122,"1":120,"2":23,"3":12220,"4":211,"5":2111},"Deaths":{"0":12,"1":2,"2":11,"3":2200,"4":22,"5":330}}'
)
# Quantities per (Date, City, State); city names are upper case and some
# quantities are null.
df2 = pd.read_json(
    '{"Date":{"0":1546300800000,"1":1546300800000,"2":1546300800000,"3":1546300800000,"4":1625097600000,"5":1625097600000,"6":1625097600000},"City":{"0":"LOS ANGELES","1":"LOS ANGELES","2":"SACRAMENTO","3":"HOUSTON","4":"LOS ANGELES","5":"SACRAMENTO","6":"HOUSTON"},"State":{"0":"CA","1":"CA","2":"CA","3":"TX","4":"CA","5":"CA","6":"TX"},"Quantity x":{"0":null,"1":330.0,"2":4450.0,"3":440.0,"4":31113.0,"5":3220.0,"6":null},"Quantity y":{"0":445.0,"1":null,"2":566.0,"3":null,"4":3455.0,"5":null,"6":3200.0}}'
)
print("\ndf1 = \n", df1)
print("\ndf2 = \n", df2)

# Normalize casing so the (Date, City, State) keys line up across frames.
# .str.upper() is NaN-safe, unlike .apply(str.upper) which raises on NaN.
df1["City"] = df1["City"].str.upper()

# Aggregate to one row per (Date, City, State) key. This also moves the keys
# into the index, which is exactly what .join() merges on below.
df1 = df1.groupby(["Date", "City", "State"])[
    ["Cases", "Deaths"]
].sum()  # aggregate cases + deaths just in case of duplicate keys
df2 = df2.groupby(["Date", "City", "State"])[
    ["Quantity x", "Quantity y"]
].sum()  # implicit skipna=True: NaN quantities are treated as 0 in the sum

print("\ndf1' = \n", df1)
print("\ndf2' = \n", df2)

# MERGE: an outer join on the shared index keeps keys present in either frame.
df3 = df1.join(df2, how="outer")  # key: "how"

# Cases/Deaths are assumed 0 where absent; fill before downcasting to int.
df3[["Cases", "Deaths"]] = df3[["Cases", "Deaths"]].fillna(0).astype(int)

# Turn the ["Date", "City", "State"] index levels back into ordinary columns.
df3.reset_index(inplace=True)
print("\ndf3 = \n", df3)
...the output is:
df1 =
Date City State Population Cases Deaths
0 2020-03-01 Los Angeles CA 5000000 122 12
1 2020-03-01 Sacramento CA 5400000 120 2
2 2020-03-01 Houston TX 3500000 23 11
3 2021-07-01 Los Angeles CA 5000002 12220 2200
4 2021-07-01 Sacramento CA 5444000 211 22
5 2021-07-01 Houston TX 4443300 2111 330
df2 =
Date City State Quantity x Quantity y
0 2019-01-01 LOS ANGELES CA NaN 445.0
1 2019-01-01 LOS ANGELES CA 330.0 NaN
2 2019-01-01 SACRAMENTO CA 4450.0 566.0
3 2019-01-01 HOUSTON TX 440.0 NaN
4 2021-07-01 LOS ANGELES CA 31113.0 3455.0
5 2021-07-01 SACRAMENTO CA 3220.0 NaN
6 2021-07-01 HOUSTON TX NaN 3200.0
df1' =
Cases Deaths
Date City State
2020-03-01 HOUSTON TX 23 11
LOS ANGELES CA 122 12
SACRAMENTO CA 120 2
2021-07-01 HOUSTON TX 2111 330
LOS ANGELES CA 12220 2200
SACRAMENTO CA 211 22
df2' =
Quantity x Quantity y
Date City State
2019-01-01 HOUSTON TX 440.0 0.0
LOS ANGELES CA 330.0 445.0
SACRAMENTO CA 4450.0 566.0
2021-07-01 HOUSTON TX 0.0 3200.0
LOS ANGELES CA 31113.0 3455.0
SACRAMENTO CA 3220.0 0.0
df3 =
Date City State Cases Deaths Quantity x Quantity y
0 2019-01-01 HOUSTON TX 0 0 440.0 0.0
1 2019-01-01 LOS ANGELES CA 0 0 330.0 445.0
2 2019-01-01 SACRAMENTO CA 0 0 4450.0 566.0
3 2020-03-01 HOUSTON TX 23 11 NaN NaN
4 2020-03-01 LOS ANGELES CA 122 12 NaN NaN
5 2020-03-01 SACRAMENTO CA 120 2 NaN NaN
6 2021-07-01 HOUSTON TX 2111 330 0.0 3200.0
7 2021-07-01 LOS ANGELES CA 12220 2200 31113.0 3455.0
8 2021-07-01 SACRAMENTO CA 211 22 3220.0 0.0
A few other points:
- City casing needs to be consistent at join/merge time.
- You could also do df1.merge(df2, ..., left_index=True, right_index=True) instead of df1.join. You could also reset the indices via df1.reset_index(inplace=True), etc. after the groupby-sum line(s) and then use .merge(..., on=...) (but the indices are convenient).
- The final values of Quantity {x,y} are floats because NaNs are present. (See next point.)
- Be deliberate about your treatment of NaNs vs. auto-filled 0s. In the case of Cases/Deaths you had no data, BUT you were making the assumption that - in the absence of Cases/Deaths data - the values are 0. For the Quantity {x,y} variables, no such assumption seemed to be warranted.
Data cleaning, from cross-sectional (multiple files) to panel in RStudio: merge/gather?
I believe you could do something along these lines:
# Stack each list of yearly tibbles into one long tibble per source.
fam <- bind_rows(fam_list)
inc <- bind_rows(inc_list)
ws <- bind_rows(ws_list)

# Fold the three frames together, left-joining each onto the accumulated
# result on the (HouseholdMember, Year) key.
panel_keys <- c("HouseholdMember", "Year")
result <- Reduce(
  function(left, right) left_join(left, right, by = panel_keys),
  list(fam, inc, ws)
)
Output:
HouseholdMember Year fam_v1 fam_v2 fam_v3 inc_v1 inc_v2 inc_v3 ws_v1 ws_v2 ws_v3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 8001 2008 0.609 -0.253 -1.30 0.0147 0.719 -0.765 0.120 0.974 -0.764
2 8002 2008 0.395 1.73 -0.503 0.119 -3.33 -0.798 0.325 0.664 1.65
3 8003 2008 0.562 0.157 0.243 -1.18 -0.260 0.105 1.09 0.855 1.19
4 8004 2008 1.32 0.737 -1.18 0.725 -1.82 0.356 0.362 2.04 1.76
5 8005 2008 -0.497 -0.444 -0.632 -0.534 1.63 0.984 1.29 0.614 0.576
6 8006 2008 -1.70 -0.989 -1.32 0.868 0.0979 0.468 -0.0146 1.11 0.957
7 8007 2008 -2.19 -0.419 1.69 1.34 -0.404 -1.43 -0.156 0.648 -0.186
8 8008 2008 1.48 0.350 -0.595 0.785 -0.609 1.28 -1.01 1.04 0.845
9 8009 2008 -0.315 -0.530 0.419 0.390 -0.0951 -0.755 0.135 0.696 -1.97
10 8010 2008 -0.882 1.38 2.06 -0.0757 1.53 -0.494 -1.03 1.14 1.87
Note:
I manufactured the data for this example by creating lists of tibbles; I believe fam_list, inc_list, and ws_list are similar to the list objects in your image. These are lists of data frames / tibbles. I then use bind_rows to bind these similarly structured tibbles together so that I have three large tibbles.
I then use left_join twice to join inc and ws to fam.
Input Data:
library(tidyverse)

# Build one named list of yearly tibbles (2008-2020) whose value columns are
# <prefix>_v1 .. <prefix>_v3. The three original blocks were identical except
# for the prefix, so they are factored into a single helper. Also replaces the
# c(8000 + seq(1:100)) anti-pattern (seq(1:100) only works via the seq_along
# fallback; seq_len(100) says what is meant) and drops the redundant c().
make_panel_list <- function(prefix) {
  year_offsets <- 8:20
  lst <- lapply(year_offsets, function(x) {
    # Three N(0, 1) value columns named <prefix>_v1 .. <prefix>_v3.
    vals <- setNames(
      replicate(3, rnorm(100), simplify = FALSE),
      paste0(prefix, "_v", 1:3)
    )
    tibble(
      HouseholdMember = 8000 + seq_len(100),
      Year = 2000 + x,
      !!!vals  # splice the named value columns into the tibble
    )
  })
  names(lst) <- paste0(prefix, "_20", year_offsets)
  lst
}

fam_list <- make_panel_list("fam")
inc_list <- make_panel_list("inc")
ws_list <- make_panel_list("ws")
Input
How to balance panel data by adding missing rows with no information?
One option that offers an efficient abstraction is complete from pyjanitor, which adds the missing rows for the combinations of Date against the group of ('ID', 'City', 'State'):
# pip install pyjanitor
import pandas as pd
import janitor  # registers the DataFrame.complete() accessor

# Treat (ID, City, State) as one unit and expand it against every Date.
group_cols = ("ID", "City", "State")
df.complete(group_cols, "Date")
Date ID City State Quantity
0 2019-01 10001 Los Angeles CA 500.0
1 2019-02 10001 Los Angeles CA 995.0
2 2019-03 10001 Los Angeles CA 943.0
3 2019-01 10002 Houston TX 4330.0
4 2019-02 10002 Houston TX NaN
5 2019-03 10002 Houston TX 2340.0
6 2019-01 10003 Sacramento CA 235.0
7 2019-02 10003 Sacramento CA 239.0
8 2019-03 10003 Sacramento CA 233.0
How to balance an unbalanced panel data?
We can use complete
from the tidyr
package. The key is to set nesting
properly.
library(dplyr)
library(tidyr)

# nesting() keeps firm/ind/charac1 together as one observed unit while
# year is expanded over the full observed sequence (step of 1).
balanced.panel <- complete(
  unbalanced.panel,
  nesting(firm, ind, charac1),
  year = full_seq(year, period = 1)
)
balanced.panel
# # A tibble: 20 x 5
# firm ind charac1 year var1
# <chr> <dbl> <chr> <dbl> <dbl>
# 1 A 1 x 2010 11
# 2 A 1 x 2011 12
# 3 A 1 x 2012 13
# 4 A 1 x 2013 14
# 5 A 2 z 2010 NA
# 6 A 2 z 2011 15
# 7 A 2 z 2012 29
# 8 A 2 z 2013 NA
# 9 B 1 g 2010 31
# 10 B 1 g 2011 NA
# 11 B 1 g 2012 NA
# 12 B 1 g 2013 NA
# 13 B 2 y 2010 NA
# 14 B 2 y 2011 15
# 15 B 2 y 2012 NA
# 16 B 2 y 2013 18
# 17 C 1 h 2010 NA
# 18 C 1 h 2011 NA
# 19 C 1 h 2012 13
# 20 C 1 h 2013 2
Creating a balanced panel dataset
# Build the full cartesian product of ids and years, then conform the
# frame to that index so missing (id, year) rows appear with NaN sales.
full_index = pd.MultiIndex.from_product(
    [df["id"].unique(), df["year"].unique()], names=["id", "year"]
)
df = df.set_index(["id", "year"]).reindex(full_index).reset_index()
print(df)
Prints:
id year sales
0 1 2000 10.0
1 1 2001 NaN
2 1 2002 NaN
3 2 2000 10.0
4 2 2001 20.0
5 2 2002 30.0
Related Topics
Randomly Insert Nas into Dataframe Proportionaly
Add Percentage Labels to a Stacked Barplot
How to Stack Error Bars in a Stacked Bar Plot Using Geom_Errorbar
How to Add a Index by Set of Data When Using Rbindlist
R: Replace Multiple Values in Multiple Columns of Dataframes with Na
Why Use As.Factor() Instead of Just Factor()
Accept Http Request in R Shiny Application
How to Test If List Element Exists
Plot a Function with Ggplot, Equivalent of Curve()
Non-Redundant Version of Expand.Grid
Ggplot2 Does Not Appear to Work When Inside a Function R
Using R Statistics Add a Group Sum to Each Row
How to Clear Only a Few Specific Objects from the Workspace
Stepwise Regression Using P-Values to Drop Variables with Nonsignificant P-Values