Merge Plm Fitted Values to Dataset

R, add back fitted values plm(), the fitted values are fewer than the observations in the regression

Apparently, the return of fitted() carries an index attribute which is a data frame of the panel groups for fitted values. Therefore, consider cbind on this index attribute to fitted values and then run left_join or merge (with all.x=TRUE) on original data frame:

fitted_values_vec <- fitted(MP_regression)
fitted_values_df <- cbind(attr(fitted_values_vec, "index"), 
                          fitted_values = fitted_values_vec)

Produc <- base::merge(Produc, fit_values, by=c("firm", "date"), all.x=TRUE)    
# Produc <- dplyr::left_join(Produc, fit_values, by=c("firm", "date"))

To demonstrate with built-in plm data frame, Produc:

data("Produc", package = "plm")

# ASSIGN RANDOM NAs ACROSS NON-PANEL COLUMNS
set.seed(41120)
for(col in names(Produc)[!names(Produc) %in% c("state", "year")]) {
  Produc[sample(nrow(Produc), 50), col] <- NA
}

results <- plm(log(gsp) ~ log(pcap) + log(pc) + log(emp) + unemp,
               data = Produc, index = c("state","year"))

fitted_values_vec <- fitted(results)
str(fitted_values_vec)
# 'pseries' Named num [1:588] -0.2459 -0.2274 -0.0927 -0.0981 -0.0184 ...
# - attr(*, "names")= chr [1:588] "ALABAMA" "ALABAMA" "ALABAMA" "ALABAMA" ...
# - attr(*, "index")=Classes ‘pindex’ and 'data.frame': 588 obs. of  2 variables:
#   ..$ state: Factor w/ 48 levels "ALABAMA","ARIZONA",..: 1 1 1 1 1 1 1 1 1 1 ...
#   ..$ year : Factor w/ 17 levels "1970","1971",..: 1 2 5 6 7 8 9 10 12 13 ...

fitted_values_df <- cbind(attr(fitted_values_vec, "index"), 
                          fitted_values = fitted_values_vec)

Produc <- merge(Produc, fitted_values_df, by= c("state","year"), all.x=TRUE)

Output

head(Produc,10)

#      state year region     pcap     hwy   water    util       pc   gsp    emp unemp fitted_values
# 1  ALABAMA 1970      6 15032.67 7325.80 1655.68 6051.20 35793.80 28418 1010.5   4.7   -0.24591969
# 2  ALABAMA 1971      6 15501.94 7525.94 1721.02 6254.98 37299.91 29375 1021.9   5.2   -0.22735513
# 3  ALABAMA 1972      6 15972.41 7765.42 1764.75 6442.23       NA 31303 1072.3    NA            NA
# 4  ALABAMA 1973   <NA>       NA 7907.66 1742.41 6756.19 40084.01 33430 1135.5   3.9            NA
# 5  ALABAMA 1974      6 16762.67 8025.52      NA 7002.29 42057.31 33749 1169.8   5.5   -0.09272471
# 6  ALABAMA 1975      6 17316.26 8158.23      NA 7405.76 43971.71 33604 1155.4   7.7   -0.09806212
# 7  ALABAMA 1976      6 17732.86      NA 1799.74 7704.93 50221.57 35764 1207.0   6.8   -0.01841929
# 8  ALABAMA 1977      6 18111.93 8365.67 1845.11 7901.15 51084.99 37463 1269.2   7.4    0.02047675
# 9  ALABAMA 1978      6 18479.74 8510.64 1960.51 8008.59 52604.05 39964 1336.5   6.3    0.07225304
# 10 ALABAMA 1979      6 18881.49 8640.61 2081.91 8158.97 54525.86 40979 1362.0   7.1    0.09364171

tail(Produc,10)

#       state year region    pcap     hwy  water    util       pc   gsp   emp unemp fitted_values
# 807 WYOMING 1977      8 4037.03 2898.34 291.64  847.04 19977.67  9779 170.5   3.6     0.0871588
# 808 WYOMING 1978      8 4115.61 2920.85 294.73  900.04 20760.24 11038 187.4    NA            NA
# 809 WYOMING 1979      8 4268.71 2950.53 313.47 1004.71 21643.50 11988 200.7   2.8     0.2346269
# 810 WYOMING 1980      8      NA 2979.23 338.06 1082.40 22628.22 13027 210.2   4.0            NA
# 811 WYOMING 1981      8 4572.67 3005.62 379.19 1187.86 26330.20 13717 223.5   4.1     0.3704301
# 812 WYOMING 1982      8 4731.98 3060.64 408.43 1262.90 27724.96 13056 217.7   5.8     0.3595080
# 813 WYOMING 1983      8 4950.82 3119.98 445.59      NA 28586.46 11922    NA   8.4            NA
# 814 WYOMING 1984      8 5184.73 3195.68 476.57      NA 28794.80 12073 204.3   6.3     0.3199823
# 815 WYOMING 1985      8 5448.38 3295.92 523.01 1629.45 29326.94 12022    NA   7.1            NA
# 816 WYOMING 1986      8 5700.41 3400.96 565.58 1733.88 27110.51    NA 196.3   9.0            NA

plm: using fixef() to manually calculate fitted values for a fixed effects twoways model

Edit: adapted to two-ways unbalanced model, needs plm version >= 2.4-0

Is this what you wanted? Extract the fixed effects by fixef. Here is an example for the Grunfeld data on an unbalanced two-way model (works the same for the balanced two-way model):

gtw_u <- plm(inv ~ value + capital, data = Grunfeld[-200, ], effect = "twoways")
yhat <- as.numeric(gtw_u$model[ , 1] - gtw_u$residuals) # reference
pred_beta <- as.numeric(tcrossprod(coef(gtw_u), as.matrix(gtw_u$model[ , -1])))
pred_effs <- as.numeric(fixef(gtw_u, "twoways")) # sum of ind and time effects

all.equal(pred_effs + pred_beta, yhat) # TRUE -> matches fitted values (yhat)

If you want to split the sum of individual and time effects (given by effect = "twoways") in its components, you will need to choose a reference and two come naturally to mind which are both given below:

# Splits of summed up individual and time effects:
# use one "level" and one "dfirst"
ii <- index(gtw_u)[[1L]]; it <- index(gtw_u)[[2L]]
eff_id_dfirst <- c(0, as.numeric(fixef(gtw_u, "individual", "dfirst")))[ii]
eff_ti_dfirst <- c(0, as.numeric(fixef(gtw_u, "time",       "dfirst")))[it]
eff_id_level <- as.numeric(fixef(gtw_u, "individual"))[ii]
eff_ti_level <- as.numeric(fixef(gtw_u, "time"))[it]

all.equal(pred_effs, eff_id_level  + eff_ti_dfirst) # TRUE
all.equal(pred_effs, eff_id_dfirst + eff_ti_level)  # TRUE

(This is based on the man page of fixef, ?fixef. There it is also shown how the (balanced and unbalanced) one-way model is to be handled).

Pad fitted values of PLM model with NAs

I found the following solution using complete.cases. It works, but there are probably better ways:

fited.values <- rep(NA,nrow(dt))
fited.values[complete.cases(dt)] <- fit.plm$model[[1]]-fit.plm$residuals

fited.values
 [1]           NA  0.044116999 -0.001511951  0.182792055 -0.136758888
 [6] -0.009162091  0.220851814  0.221807764  0.228046083  0.297558446
[11]           NA           NA  0.130133821  0.211737223  0.339328498
[16]  0.379826505  0.102156480  0.024129950  0.213088736  0.235454141
[21]  0.321319682  0.420673101  0.474030175  0.497573470  0.205056353
[26]  0.168080225  0.309537308  0.010202845  0.082264514  0.260143856

In R, how to add the fitted value column to the original dataframe?

Suppose:

fm <- lm(demand ~ Time, BOD)

Then try this:

cbind(BOD, resid = resid(fm), fitted = fitted(fm))

or this:

BOD$resid <- resid(fm)
BOD$fitted <- fitted(fm)

ADDED:

If you have NA values in demand then your fitted values and residuals will be of a different length than the number of rows of your data, meaning the above will not work. In such a case use: na.exclude like this:

BOD$demand[3] <- NA # set up test data
fm <- lm(demand ~ Time, BOD, na.action = na.exclude)

na.exclude will automatically pad the predictions and residuals with NA values so that they are of the same length as the original data. Now the previous lines should work.

Get fitted values of a log dependent variable to the original panel data set in R

The problem you're facing is twofold. First, you have NA's that your model is dropping when calculating the residuals so naturally your result will have fewer elements. The second problem is that you're lagging one of your covariates, which produces an NA for your first observation (first temporally) -after all that's what a lag does. To overcome that problem you need additional data from the previous time period or you have to forgo that observation. Assuming you do not have access to data from the previous time period, this is how I would go about resolving the issue.

 #First I would create a new variable for CAP and just lag and log that separately, rather than applying the function in the formula of the model itself
 pdat$Cap.lag.ln<-lag(log(pdat$Cap), 1)
 pdat$Cap<-NULL #deleting the old variable to clear up the mess

 #Dont necessarily need the na.omit but it couldn't hurt...
 Model1<- plm(log(Out) ~ Cap.lag.ln + log(Lab + 1),
    model = "within", data=pdat, na.omit=TRUE)
 FV_Log <- data.table(Model1$model[[1]] - Model1$residuals)

 #Now this is where you reduce your original dataset (pdat) by getting rid of the NAs
 pdat2<-na.omit(pdat)
 #You will notice that they're the same dimensions now and you can cbind
 pdat3 <-cbind(pdat2,FV_Log)
      Time Firm    Out Lab    time1 Cap.lag.ln       V1
 1: Feb-00    A 142452 334 Feb 2000          2 12.41211
 2: Mar-00    A 365697 156 Mar 2000          2 12.54861
 3: Apr-00    A 355789 134 Apr 2000          2 12.57580
 4: May-00    A 376843 159 May 2000          2 12.54520
 5: Jun-00    A 258762 119 Jun 2000          2 12.59702
 6: Jul-00    A 255447  41 Jul 2000          2 12.78611
 7: Aug-00    A 188545 247 Aug 2000          3 12.28887
 8: Sep-00    A 213663 251 Sep 2000          4 12.10858
 9: Nov-00    A 317468 525 Nov 2000          2 12.33084
 10: Dec-00   A 238668 217 Dec 2000          2 12.48949
 11: Feb-01   B 135288 109 Feb 2001          3 12.20776
 12: Mar-01   B 363609   7 Mar 2001          3 12.67984
 13: May-01   B 446279   0 May 2001          4 12.87698
 14: Jun-01   B 390230  50 Jun 2001          2 12.52360
 15: Jul-01   B 118945 143 Jul 2001          2 12.33665
 16: Aug-01   B 174887  85 Aug 2001          3 12.25209
 17: Oct-01   B 197832 214 Oct 2001          2 12.26445
 18: Nov-01   B 317468 525 Nov 2001          2 12.10331
 19: Dec-01   B 238668 217 Dec 2001          2 12.26195

And if you want to retrieve those NA's back, you can do the following:

 pdat3 <-as.data.frame(pdat3)
 pdat4<-merge(pdat3, pdat, 
    by=c("Time","Firm","Out", "Lab","time1"), 
    all.x=TRUE,all.y=TRUE)

   Time Firm    Out Lab    time1 Cap.lag.ln       V1 Cap
 1  Apr-00    A 355789 134 Apr 2000          2 12.57580  12
 2  Apr-01    B 318472  NA Apr 2001         NA       NA  56
 3  Aug-00    A 188545 247 Aug 2000          3 12.28887  75
 4  Aug-01    B 174887  85 Aug 2001          3 12.25209  NA
 5  Dec-00    A 238668 217 Dec 2000          2 12.48949  16
 6  Dec-01    B 238668 217 Dec 2001          2 12.26195  16
 7  Feb-00    A 142452 334 Feb 2000          2 12.41211  15
 8  Feb-01    B 135288 109 Feb 2001          3 12.20776  45
 9  Jan-00    A 161521 261 Jan 2000         NA       NA  13
 10 Jan-01    B 241286 298 Jan 2001         NA       NA  42
 11 Jul-00    A 255447  41 Jul 2000          2 12.78611  45
 12 Jul-01    B 118945 143 Jul 2001          2 12.33665  45
 13 Jun-00    A 258762 119 Jun 2000          2 12.59702  12
 14 Jun-01    B 390230  50 Jun 2001          2 12.52360  12
 15 Mar-00    A 365697 156 Mar 2000          2 12.54861  14
 16 Mar-01    B 363609   7 Mar 2001          3 12.67984  24
 17 May-00    A 376843 159 May 2000          2 12.54520  15
 18 May-01    B 446279   0 May 2001          4 12.87698  12
 19 Nov-00    A 317468 525 Nov 2000          2 12.33084  15
 20 Nov-01    B 317468 525 Nov 2001          2 12.10331  15
 21 Oct-00    A 273209  62 Oct 2000         NA       NA  12
 22 Oct-01    B 197832 214 Oct 2001          2 12.26445  12
 23 Sep-00    A 213663 251 Sep 2000          4 12.10858  NA
 24 Sep-01    B 183770  80 Sep 2001         NA       NA  15

Predict on test data, using plm package in R, and calculate RMSE for test data

Regarding out-of-sample prediction with fixed effects models, it is not clear how data relating to fixed effects not in the original model are to be treated, e.g., data for an individual not contained in the orignal data set the model was estimated on. (This is rather a methodological question than a programming question).

The version 2.6-2 of plm allows predict for fixed effect models with the original data and with out-of-sample data (see ?predict.plm).

Find below an example with 10 firms for model estimation and the data to be used for prediction contains a firm not contained in the original data set (besides that firm, there are also years not contained in the original model object but these are irrelevant here as it is a one-way individual model). It is unclear what the fixed effect of that out-of-sample firm would be. Hence, by default, no predicted value is given (NA value). If argument na.fill is set to TRUE, the (weighted) mean of the fixed effects contained in the original model object is used as a best guess.

library(plm)
data("Grunfeld", package = "plm")

# fit a fixed effect model
fit.fe <- plm(inv ~ value + capital, data = Grunfeld, model = "within")

# generate 55 new observations of three firms used for prediction:
#  * firm 1 with years 1935:1964 (has out-of-sample years 1955:1964), 
#  * firm 2 with years 1935:1949 (all in sample),
#  * firm 11 with years 1935:1944 (firm 11 is out-of-sample)
set.seed(42L)

new.value2   <- runif(55, min = min(Grunfeld$value),   max = max(Grunfeld$value))
new.capital2 <- runif(55, min = min(Grunfeld$capital), max = max(Grunfeld$capital))

newdata <- data.frame(firm = c(rep(1, 30), rep(2, 15), rep(11, 10)),
                      year = c(1935:(1935+29), 1935:(1935+14), 1935:(1935+9)),
                      value = new.value2, capital = new.capital2)
# make pdata.frame
newdata.p <- pdata.frame(newdata, index = c("firm", "year"))

## predict from fixed effect model with new data as pdata.frame
predict(fit.fe, newdata = newdata.p) # has NA values for the 11'th firm

## set na.fill = TRUE to have the weighted mean used to for fixed effects -> no NA values
predict(fit.fe, newdata = newdata.p, na.fill = TRUE)

NB: When you input a plain data.frame as newdata, it is not clear how the data related to the individuals and time periods, which is why the weighted mean of fixed effects from the original model object is used for all observations in newdata and a warning is printed. For fixed effect model prediction, it is reasonable to assume the user can provide information (via a pdata.frame) how the data the user wants to use for prediction relates to the individual and time dimension of panel data.