Merge Panel data to get balanced panel data
There's a function for that. Combine the data frames with rbind, then use complete(). It will look through the groups in variable and fill in any missing Month/variable combinations with NA values:
library(tidyr)
library(zoo)  # as.yearmon() comes from zoo; it was used below without being loaded

# Stack the two data frames, then use complete() to add every missing
# Month/variable combination (new rows are filled with NA).
df3 <- do.call(rbind.data.frame, list(df1, df2))
df3$Month <- as.character(df3$Month)  # complete() needs a plain vector, not yearmon
df4 <- complete(df3, Month, variable)
df4$Month <- as.yearmon(df4$Month, "%b %Y")  # restore the yearmon class
df5 <- df4[order(df4$variable, df4$Month), ]
df5
# Source: local data frame [72 x 8]
#
# Month variable Beta1 Beta2 Beta3 Beta4 Beta5 Beta6
# (yrmn) (fctr) (int) (int) (int) (int) (int) (int)
# 1 Jan 2005 A 1 2 3 4 5 6
# 2 Feb 2005 A 2 3 4 5 6 7
# 3 Mar 2005 A 3 4 5 6 7 8
# 4 Apr 2005 A 4 5 6 7 8 9
# 5 May 2005 A 5 6 7 8 9 10
# 6 Jun 2005 A 6 7 8 9 10 11
# 7 Jul 2005 A 7 8 9 10 11 12
# 8 Aug 2005 A 8 9 10 11 12 13
# 9 Sep 2005 A 9 10 11 12 13 14
# 10 Oct 2005 A 10 11 12 13 14 15
# .. ... ... ... ... ... ... ... ...
An alternative implementation with dplyr & tidyr:
library(dplyr)
library(tidyr)

# Stack the rows, then fill in every Month/variable combination.
stacked <- bind_rows(df1, df2)
df3 <- complete(stacked, Month, variable)
Merging uneven Panel Data frames in R
It would help if you posted your data (or a working subset of it) and a little more detail on how you are trying to merge, but if I understand what you are trying to do, you want each final data record to have individual stats for each player on a particular date followed by the player's team's stats for that date. In this case, you should have a Team column in the Player table that identifies the player's team, and then join the two tables on the composite key of Date and Team by setting the by= attribute in merge:
merge(PData, TData, by=c("Date", "Team"))
The fact that the data frames are of different lengths doesn't matter--this is exactly what join/merge operations are for.
For an alternative to merge(), you might check out the dplyr package join functions at https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
Transform into balanced panel data
We do a cross join (CJ) of the unique 'ID' and 'date' values of the dataset, after setting the key columns to 'ID' and 'date', and then join the result with the original dataset.
setDT(test, key = c("ID", "date"))[CJ(ID, date, unique=TRUE)]
# ID date nr namecol
# 1: A 2009-01-01 NA NA
# 2: A 2010-01-01 1 rdm
# 3: A 2010-01-10 2 dfg
# 4: A 2010-01-14 3 fdgfd
# 5: A 2010-02-15 4 fdgfd
# 6: A 2010-08-17 5 dg
# 7: A 2010-12-19 6 dfg
# 8: B 2009-01-01 1 dfg
# 9: B 2010-01-01 2 ydg
#10: B 2010-01-10 3 fdgfd
#11: B 2010-01-14 4 dfg
#12: B 2010-02-15 5 dfg
#13: B 2010-08-17 NA NA
#14: B 2010-12-19 NA NA
data
# Reconstruct the example data: an unbalanced panel with two IDs whose
# observation dates only partially overlap.
test <- data.frame(
  ID = rep(c("A", "B"), times = c(6L, 5L)),
  date = as.Date(c(
    "2010-01-01", "2010-01-10", "2010-01-14", "2010-02-15",
    "2010-08-17", "2010-12-19",
    "2009-01-01", "2010-01-01", "2010-01-10", "2010-01-14", "2010-02-15"
  )),
  nr = c(1:6, 1:5),
  namecol = c(
    "rdm", "dfg", "fdgfd", "fdgfd", "dg", "dfg",
    "dfg", "ydg", "fdgfd", "dfg", "dfg"
  )
)
How to Merge two Panel data sets on Date and a combination of columns?
It sounds like you're looking for the how keyword argument of pd.DataFrame.merge and pd.DataFrame.join.
Here is a sample:
import pandas as pd

# Individual stats per (Date, City, State); city names are mixed case.
df1 = pd.read_json(
    '{"Date":{"0":1583020800000,"1":1583020800000,"2":1583020800000,"3":1625097600000,"4":1625097600000,"5":1625097600000},"City":{"0":"Los Angeles","1":"Sacramento","2":"Houston","3":"Los Angeles","4":"Sacramento","5":"Houston"},"State":{"0":"CA","1":"CA","2":"TX","3":"CA","4":"CA","5":"TX"},"Population":{"0":5000000,"1":5400000,"2":3500000,"3":5000002,"4":5444000,"5":4443300},"Cases":{"0":122,"1":120,"2":23,"3":12220,"4":211,"5":2111},"Deaths":{"0":12,"1":2,"2":11,"3":2200,"4":22,"5":330}}'
)
# Quantities per (Date, City, State); city names are upper case and some
# quantities are null.
df2 = pd.read_json(
    '{"Date":{"0":1546300800000,"1":1546300800000,"2":1546300800000,"3":1546300800000,"4":1625097600000,"5":1625097600000,"6":1625097600000},"City":{"0":"LOS ANGELES","1":"LOS ANGELES","2":"SACRAMENTO","3":"HOUSTON","4":"LOS ANGELES","5":"SACRAMENTO","6":"HOUSTON"},"State":{"0":"CA","1":"CA","2":"CA","3":"TX","4":"CA","5":"CA","6":"TX"},"Quantity x":{"0":null,"1":330.0,"2":4450.0,"3":440.0,"4":31113.0,"5":3220.0,"6":null},"Quantity y":{"0":445.0,"1":null,"2":566.0,"3":null,"4":3455.0,"5":null,"6":3200.0}}'
)
print("\ndf1 = \n", df1)
print("\ndf2 = \n", df2)

# Normalize casing so the (Date, City, State) keys line up across frames.
# .str.upper() is NaN-safe, unlike .apply(str.upper) which raises on NaN.
df1["City"] = df1["City"].str.upper()

# Aggregate to one row per (Date, City, State) key. This also moves the keys
# into the index, which is exactly what .join() merges on below.
df1 = df1.groupby(["Date", "City", "State"])[
    ["Cases", "Deaths"]
].sum()  # aggregate cases + deaths just in case of duplicate keys
df2 = df2.groupby(["Date", "City", "State"])[
    ["Quantity x", "Quantity y"]
].sum()  # implicit skipna=True: NaN quantities are treated as 0 in the sum

print("\ndf1' = \n", df1)
print("\ndf2' = \n", df2)

# MERGE: an outer join on the shared index keeps keys present in either frame.
df3 = df1.join(df2, how="outer")  # key: "how"

# Cases/Deaths are assumed 0 where absent; fill before downcasting to int.
df3[["Cases", "Deaths"]] = df3[["Cases", "Deaths"]].fillna(0).astype(int)

# Turn the ["Date", "City", "State"] index levels back into ordinary columns.
df3.reset_index(inplace=True)
print("\ndf3 = \n", df3)
...the output is:
df1 =
Date City State Population Cases Deaths
0 2020-03-01 Los Angeles CA 5000000 122 12
1 2020-03-01 Sacramento CA 5400000 120 2
2 2020-03-01 Houston TX 3500000 23 11
3 2021-07-01 Los Angeles CA 5000002 12220 2200
4 2021-07-01 Sacramento CA 5444000 211 22
5 2021-07-01 Houston TX 4443300 2111 330
df2 =
Date City State Quantity x Quantity y
0 2019-01-01 LOS ANGELES CA NaN 445.0
1 2019-01-01 LOS ANGELES CA 330.0 NaN
2 2019-01-01 SACRAMENTO CA 4450.0 566.0
3 2019-01-01 HOUSTON TX 440.0 NaN
4 2021-07-01 LOS ANGELES CA 31113.0 3455.0
5 2021-07-01 SACRAMENTO CA 3220.0 NaN
6 2021-07-01 HOUSTON TX NaN 3200.0
df1' =
Cases Deaths
Date City State
2020-03-01 HOUSTON TX 23 11
LOS ANGELES CA 122 12
SACRAMENTO CA 120 2
2021-07-01 HOUSTON TX 2111 330
LOS ANGELES CA 12220 2200
SACRAMENTO CA 211 22
df2' =
Quantity x Quantity y
Date City State
2019-01-01 HOUSTON TX 440.0 0.0
LOS ANGELES CA 330.0 445.0
SACRAMENTO CA 4450.0 566.0
2021-07-01 HOUSTON TX 0.0 3200.0
LOS ANGELES CA 31113.0 3455.0
SACRAMENTO CA 3220.0 0.0
df3 =
Date City State Cases Deaths Quantity x Quantity y
0 2019-01-01 HOUSTON TX 0 0 440.0 0.0
1 2019-01-01 LOS ANGELES CA 0 0 330.0 445.0
2 2019-01-01 SACRAMENTO CA 0 0 4450.0 566.0
3 2020-03-01 HOUSTON TX 23 11 NaN NaN
4 2020-03-01 LOS ANGELES CA 122 12 NaN NaN
5 2020-03-01 SACRAMENTO CA 120 2 NaN NaN
6 2021-07-01 HOUSTON TX 2111 330 0.0 3200.0
7 2021-07-01 LOS ANGELES CA 12220 2200 31113.0 3455.0
8 2021-07-01 SACRAMENTO CA 211 22 3220.0 0.0
A few other points:
- City casing needs to be consistent at join/merge time.
- You could also do df1.merge(df2, ..., left_index=True, right_index=True) instead of df1.join. You could also reset the indices via df1.reset_index(inplace=True), etc. after the groupby-sum line(s) and then use .merge(..., on=...) (but the indices are convenient).
- The final values of Quantity {x,y} are floats because NaNs are present. (See next point.)
- Be deliberate about your treatment of NaNs vs. auto-filled 0s. In the case of Cases/Deaths you had no data, BUT you were making the assumption that - in the absence of Cases/Deaths data - the values are 0. For the Quantity {x,y} variables, no such assumption seemed to be warranted.
Data cleaning, from cross-sectional (multiple files) to panel in RStudio: merge/gather?
I believe you could do something along these lines:
# Stack each list of yearly tibbles into one long tibble per source.
fam <- bind_rows(fam_list)
inc <- bind_rows(inc_list)
ws <- bind_rows(ws_list)

# Fold the three frames together, left-joining each onto the accumulated
# result on the (HouseholdMember, Year) key.
panel_keys <- c("HouseholdMember", "Year")
result <- Reduce(
  function(left, right) left_join(left, right, by = panel_keys),
  list(fam, inc, ws)
)
Output:
HouseholdMember Year fam_v1 fam_v2 fam_v3 inc_v1 inc_v2 inc_v3 ws_v1 ws_v2 ws_v3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 8001 2008 0.609 -0.253 -1.30 0.0147 0.719 -0.765 0.120 0.974 -0.764
2 8002 2008 0.395 1.73 -0.503 0.119 -3.33 -0.798 0.325 0.664 1.65
3 8003 2008 0.562 0.157 0.243 -1.18 -0.260 0.105 1.09 0.855 1.19
4 8004 2008 1.32 0.737 -1.18 0.725 -1.82 0.356 0.362 2.04 1.76
5 8005 2008 -0.497 -0.444 -0.632 -0.534 1.63 0.984 1.29 0.614 0.576
6 8006 2008 -1.70 -0.989 -1.32 0.868 0.0979 0.468 -0.0146 1.11 0.957
7 8007 2008 -2.19 -0.419 1.69 1.34 -0.404 -1.43 -0.156 0.648 -0.186
8 8008 2008 1.48 0.350 -0.595 0.785 -0.609 1.28 -1.01 1.04 0.845
9 8009 2008 -0.315 -0.530 0.419 0.390 -0.0951 -0.755 0.135 0.696 -1.97
10 8010 2008 -0.882 1.38 2.06 -0.0757 1.53 -0.494 -1.03 1.14 1.87
Note:
I manufactured the data for this example by creating lists of tibbles; I believe fam_list, inc_list, and ws_list are similar to the list objects in your image. These are lists of data frames / tibbles. I then use bind_rows to bind these similarly structured tibbles together so that I have three large tibbles.
I then use left_join twice to join inc and ws to fam.
Input Data:
library(tidyverse)

# Build one named list of yearly tibbles (2008-2020) whose value columns are
# <prefix>_v1 .. <prefix>_v3. The three original blocks were identical except
# for the prefix, so they are factored into a single helper. Also replaces the
# c(8000 + seq(1:100)) anti-pattern (seq(1:100) only works via the seq_along
# fallback; seq_len(100) says what is meant) and drops the redundant c().
make_panel_list <- function(prefix) {
  year_offsets <- 8:20
  lst <- lapply(year_offsets, function(x) {
    # Three N(0, 1) value columns named <prefix>_v1 .. <prefix>_v3.
    vals <- setNames(
      replicate(3, rnorm(100), simplify = FALSE),
      paste0(prefix, "_v", 1:3)
    )
    tibble(
      HouseholdMember = 8000 + seq_len(100),
      Year = 2000 + x,
      !!!vals  # splice the named value columns into the tibble
    )
  })
  names(lst) <- paste0(prefix, "_20", year_offsets)
  lst
}

fam_list <- make_panel_list("fam")
inc_list <- make_panel_list("inc")
ws_list <- make_panel_list("ws")
Input
How to balance panel data by adding missing rows with no information?
One option that offers an efficient abstraction is complete from pyjanitor, which adds the missing rows for the combinations of Date against the group of ('ID', 'City', 'State'):
# pip install pyjanitor
import pandas as pd
import janitor  # registers the DataFrame.complete() accessor

# Treat (ID, City, State) as one unit and expand it against every Date.
group_cols = ("ID", "City", "State")
df.complete(group_cols, "Date")
Date ID City State Quantity
0 2019-01 10001 Los Angeles CA 500.0
1 2019-02 10001 Los Angeles CA 995.0
2 2019-03 10001 Los Angeles CA 943.0
3 2019-01 10002 Houston TX 4330.0
4 2019-02 10002 Houston TX NaN
5 2019-03 10002 Houston TX 2340.0
6 2019-01 10003 Sacramento CA 235.0
7 2019-02 10003 Sacramento CA 239.0
8 2019-03 10003 Sacramento CA 233.0
How to balance an unbalanced panel data?
We can use complete
from the tidyr
package. The key is to set nesting
properly.
library(dplyr)
library(tidyr)

# nesting() keeps firm/ind/charac1 together as one observed unit while
# year is expanded over the full observed sequence (step of 1).
balanced.panel <- complete(
  unbalanced.panel,
  nesting(firm, ind, charac1),
  year = full_seq(year, period = 1)
)
balanced.panel
# # A tibble: 20 x 5
# firm ind charac1 year var1
# <chr> <dbl> <chr> <dbl> <dbl>
# 1 A 1 x 2010 11
# 2 A 1 x 2011 12
# 3 A 1 x 2012 13
# 4 A 1 x 2013 14
# 5 A 2 z 2010 NA
# 6 A 2 z 2011 15
# 7 A 2 z 2012 29
# 8 A 2 z 2013 NA
# 9 B 1 g 2010 31
# 10 B 1 g 2011 NA
# 11 B 1 g 2012 NA
# 12 B 1 g 2013 NA
# 13 B 2 y 2010 NA
# 14 B 2 y 2011 15
# 15 B 2 y 2012 NA
# 16 B 2 y 2013 18
# 17 C 1 h 2010 NA
# 18 C 1 h 2011 NA
# 19 C 1 h 2012 13
# 20 C 1 h 2013 2
Creating a balanced panel dataset
# Build the full cartesian product of ids and years, then conform the
# frame to that index so missing (id, year) rows appear with NaN sales.
full_index = pd.MultiIndex.from_product(
    [df["id"].unique(), df["year"].unique()], names=["id", "year"]
)
df = df.set_index(["id", "year"]).reindex(full_index).reset_index()
print(df)
Prints:
id year sales
0 1 2000 10.0
1 1 2001 NaN
2 1 2002 NaN
3 2 2000 10.0
4 2 2001 20.0
5 2 2002 30.0
Related Topics
Randomly Insert Nas into Dataframe Proportionaly
Add Percentage Labels to a Stacked Barplot
How to Stack Error Bars in a Stacked Bar Plot Using Geom_Errorbar
How to Add a Index by Set of Data When Using Rbindlist
R: Replace Multiple Values in Multiple Columns of Dataframes with Na
Why Use As.Factor() Instead of Just Factor()
Accept Http Request in R Shiny Application
How to Test If List Element Exists
Plot a Function with Ggplot, Equivalent of Curve()
Non-Redundant Version of Expand.Grid
Ggplot2 Does Not Appear to Work When Inside a Function R
Using R Statistics Add a Group Sum to Each Row
How to Clear Only a Few Specific Objects from the Workspace
Stepwise Regression Using P-Values to Drop Variables with Nonsignificant P-Values