Merge Plm Fitted Values to Dataset

R, add back fitted values plm(), the fitted values are fewer than the observations in the regression

Apparently, the return of fitted() carries an index attribute which is a data frame of the panel groups for fitted values. Therefore, consider cbind on this index attribute to fitted values and then run left_join or merge (with all.x=TRUE) on original data frame:

fitted_values_vec <- fitted(MP_regression)
fitted_values_df <- cbind(attr(fitted_values_vec, "index"),
fitted_values = fitted_values_vec)

Produc <- base::merge(Produc, fit_values, by=c("firm", "date"), all.x=TRUE)
# Produc <- dplyr::left_join(Produc, fit_values, by=c("firm", "date"))

To demonstrate with built-in plm data frame, Produc:

data("Produc", package = "plm")

# ASSIGN RANDOM NAs ACROSS NON-PANEL COLUMNS
set.seed(41120)
for(col in names(Produc)[!names(Produc) %in% c("state", "year")]) {
Produc[sample(nrow(Produc), 50), col] <- NA
}

results <- plm(log(gsp) ~ log(pcap) + log(pc) + log(emp) + unemp,
data = Produc, index = c("state","year"))

fitted_values_vec <- fitted(results)
str(fitted_values_vec)
# 'pseries' Named num [1:588] -0.2459 -0.2274 -0.0927 -0.0981 -0.0184 ...
# - attr(*, "names")= chr [1:588] "ALABAMA" "ALABAMA" "ALABAMA" "ALABAMA" ...
# - attr(*, "index")=Classes ‘pindex’ and 'data.frame': 588 obs. of 2 variables:
# ..$ state: Factor w/ 48 levels "ALABAMA","ARIZONA",..: 1 1 1 1 1 1 1 1 1 1 ...
# ..$ year : Factor w/ 17 levels "1970","1971",..: 1 2 5 6 7 8 9 10 12 13 ...

fitted_values_df <- cbind(attr(fitted_values_vec, "index"),
fitted_values = fitted_values_vec)

Produc <- merge(Produc, fitted_values_df, by= c("state","year"), all.x=TRUE)

Output

head(Produc,10)

# state year region pcap hwy water util pc gsp emp unemp fitted_values
# 1 ALABAMA 1970 6 15032.67 7325.80 1655.68 6051.20 35793.80 28418 1010.5 4.7 -0.24591969
# 2 ALABAMA 1971 6 15501.94 7525.94 1721.02 6254.98 37299.91 29375 1021.9 5.2 -0.22735513
# 3 ALABAMA 1972 6 15972.41 7765.42 1764.75 6442.23 NA 31303 1072.3 NA NA
# 4 ALABAMA 1973 <NA> NA 7907.66 1742.41 6756.19 40084.01 33430 1135.5 3.9 NA
# 5 ALABAMA 1974 6 16762.67 8025.52 NA 7002.29 42057.31 33749 1169.8 5.5 -0.09272471
# 6 ALABAMA 1975 6 17316.26 8158.23 NA 7405.76 43971.71 33604 1155.4 7.7 -0.09806212
# 7 ALABAMA 1976 6 17732.86 NA 1799.74 7704.93 50221.57 35764 1207.0 6.8 -0.01841929
# 8 ALABAMA 1977 6 18111.93 8365.67 1845.11 7901.15 51084.99 37463 1269.2 7.4 0.02047675
# 9 ALABAMA 1978 6 18479.74 8510.64 1960.51 8008.59 52604.05 39964 1336.5 6.3 0.07225304
# 10 ALABAMA 1979 6 18881.49 8640.61 2081.91 8158.97 54525.86 40979 1362.0 7.1 0.09364171

tail(Produc,10)

# state year region pcap hwy water util pc gsp emp unemp fitted_values
# 807 WYOMING 1977 8 4037.03 2898.34 291.64 847.04 19977.67 9779 170.5 3.6 0.0871588
# 808 WYOMING 1978 8 4115.61 2920.85 294.73 900.04 20760.24 11038 187.4 NA NA
# 809 WYOMING 1979 8 4268.71 2950.53 313.47 1004.71 21643.50 11988 200.7 2.8 0.2346269
# 810 WYOMING 1980 8 NA 2979.23 338.06 1082.40 22628.22 13027 210.2 4.0 NA
# 811 WYOMING 1981 8 4572.67 3005.62 379.19 1187.86 26330.20 13717 223.5 4.1 0.3704301
# 812 WYOMING 1982 8 4731.98 3060.64 408.43 1262.90 27724.96 13056 217.7 5.8 0.3595080
# 813 WYOMING 1983 8 4950.82 3119.98 445.59 NA 28586.46 11922 NA 8.4 NA
# 814 WYOMING 1984 8 5184.73 3195.68 476.57 NA 28794.80 12073 204.3 6.3 0.3199823
# 815 WYOMING 1985 8 5448.38 3295.92 523.01 1629.45 29326.94 12022 NA 7.1 NA
# 816 WYOMING 1986 8 5700.41 3400.96 565.58 1733.88 27110.51 NA 196.3 9.0 NA

plm: using fixef() to manually calculate fitted values for a fixed effects twoways model

Edit: adapted to two-ways unbalanced model, needs plm version >= 2.4-0

Is this what you wanted? Extract the fixed effects by fixef. Here is an example for the Grunfeld data on an unbalanced two-way model (works the same for the balanced two-way model):

gtw_u <- plm(inv ~ value + capital, data = Grunfeld[-200, ], effect = "twoways")
yhat <- as.numeric(gtw_u$model[ , 1] - gtw_u$residuals) # reference
pred_beta <- as.numeric(tcrossprod(coef(gtw_u), as.matrix(gtw_u$model[ , -1])))
pred_effs <- as.numeric(fixef(gtw_u, "twoways")) # sum of ind and time effects

all.equal(pred_effs + pred_beta, yhat) # TRUE -> matches fitted values (yhat)

If you want to split the sum of individual and time effects (given by effect = "twoways") in its components, you will need to choose a reference and two come naturally to mind which are both given below:

# Splits of summed up individual and time effects:
# use one "level" and one "dfirst"
ii <- index(gtw_u)[[1L]]; it <- index(gtw_u)[[2L]]
eff_id_dfirst <- c(0, as.numeric(fixef(gtw_u, "individual", "dfirst")))[ii]
eff_ti_dfirst <- c(0, as.numeric(fixef(gtw_u, "time", "dfirst")))[it]
eff_id_level <- as.numeric(fixef(gtw_u, "individual"))[ii]
eff_ti_level <- as.numeric(fixef(gtw_u, "time"))[it]

all.equal(pred_effs, eff_id_level + eff_ti_dfirst) # TRUE
all.equal(pred_effs, eff_id_dfirst + eff_ti_level) # TRUE

(This is based on the man page of fixef, ?fixef. There it is also shown how the (balanced and unbalanced) one-way model is to be handled).

Pad fitted values of PLM model with NAs

I found the following solution using complete.cases. It works, but there are probably better ways:

fited.values <- rep(NA,nrow(dt))
fited.values[complete.cases(dt)] <- fit.plm$model[[1]]-fit.plm$residuals

fited.values
[1] NA 0.044116999 -0.001511951 0.182792055 -0.136758888
[6] -0.009162091 0.220851814 0.221807764 0.228046083 0.297558446
[11] NA NA 0.130133821 0.211737223 0.339328498
[16] 0.379826505 0.102156480 0.024129950 0.213088736 0.235454141
[21] 0.321319682 0.420673101 0.474030175 0.497573470 0.205056353
[26] 0.168080225 0.309537308 0.010202845 0.082264514 0.260143856

In R, how to add the fitted value column to the original dataframe?

Suppose:

fm <- lm(demand ~ Time, BOD)

Then try this:

cbind(BOD, resid = resid(fm), fitted = fitted(fm))

or this:

BOD$resid <- resid(fm)
BOD$fitted <- fitted(fm)

ADDED:

If you have NA values in demand then your fitted values and residuals will be of a different length than the number of rows of your data, meaning the above will not work. In such a case use: na.exclude like this:

BOD$demand[3] <- NA # set up test data
fm <- lm(demand ~ Time, BOD, na.action = na.exclude)

na.exclude will automatically pad the predictions and residuals with NA values so that they are of the same length as the original data. Now the previous lines should work.

Get fitted values of a log dependent variable to the original panel data set in R

The problem you're facing is twofold. First, you have NA's that your model is dropping when calculating the residuals so naturally your result will have fewer elements. The second problem is that you're lagging one of your covariates, which produces an NA for your first observation (first temporally) -after all that's what a lag does. To overcome that problem you need additional data from the previous time period or you have to forgo that observation. Assuming you do not have access to data from the previous time period, this is how I would go about resolving the issue.

 #First I would create a new variable for CAP and just lag and log that separately, rather than applying the function in the formula of the model itself
pdat$Cap.lag.ln<-lag(log(pdat$Cap), 1)
pdat$Cap<-NULL #deleting the old variable to clear up the mess

#Dont necessarily need the na.omit but it couldn't hurt...
Model1<- plm(log(Out) ~ Cap.lag.ln + log(Lab + 1),
model = "within", data=pdat, na.omit=TRUE)
FV_Log <- data.table(Model1$model[[1]] - Model1$residuals)

#Now this is where you reduce your original dataset (pdat) by getting rid of the NAs
pdat2<-na.omit(pdat)
#You will notice that they're the same dimensions now and you can cbind
pdat3 <-cbind(pdat2,FV_Log)
Time Firm Out Lab time1 Cap.lag.ln V1
1: Feb-00 A 142452 334 Feb 2000 2 12.41211
2: Mar-00 A 365697 156 Mar 2000 2 12.54861
3: Apr-00 A 355789 134 Apr 2000 2 12.57580
4: May-00 A 376843 159 May 2000 2 12.54520
5: Jun-00 A 258762 119 Jun 2000 2 12.59702
6: Jul-00 A 255447 41 Jul 2000 2 12.78611
7: Aug-00 A 188545 247 Aug 2000 3 12.28887
8: Sep-00 A 213663 251 Sep 2000 4 12.10858
9: Nov-00 A 317468 525 Nov 2000 2 12.33084
10: Dec-00 A 238668 217 Dec 2000 2 12.48949
11: Feb-01 B 135288 109 Feb 2001 3 12.20776
12: Mar-01 B 363609 7 Mar 2001 3 12.67984
13: May-01 B 446279 0 May 2001 4 12.87698
14: Jun-01 B 390230 50 Jun 2001 2 12.52360
15: Jul-01 B 118945 143 Jul 2001 2 12.33665
16: Aug-01 B 174887 85 Aug 2001 3 12.25209
17: Oct-01 B 197832 214 Oct 2001 2 12.26445
18: Nov-01 B 317468 525 Nov 2001 2 12.10331
19: Dec-01 B 238668 217 Dec 2001 2 12.26195

And if you want to retrieve those NA's back, you can do the following:

 pdat3 <-as.data.frame(pdat3)
pdat4<-merge(pdat3, pdat,
by=c("Time","Firm","Out", "Lab","time1"),
all.x=TRUE,all.y=TRUE)

Time Firm Out Lab time1 Cap.lag.ln V1 Cap
1 Apr-00 A 355789 134 Apr 2000 2 12.57580 12
2 Apr-01 B 318472 NA Apr 2001 NA NA 56
3 Aug-00 A 188545 247 Aug 2000 3 12.28887 75
4 Aug-01 B 174887 85 Aug 2001 3 12.25209 NA
5 Dec-00 A 238668 217 Dec 2000 2 12.48949 16
6 Dec-01 B 238668 217 Dec 2001 2 12.26195 16
7 Feb-00 A 142452 334 Feb 2000 2 12.41211 15
8 Feb-01 B 135288 109 Feb 2001 3 12.20776 45
9 Jan-00 A 161521 261 Jan 2000 NA NA 13
10 Jan-01 B 241286 298 Jan 2001 NA NA 42
11 Jul-00 A 255447 41 Jul 2000 2 12.78611 45
12 Jul-01 B 118945 143 Jul 2001 2 12.33665 45
13 Jun-00 A 258762 119 Jun 2000 2 12.59702 12
14 Jun-01 B 390230 50 Jun 2001 2 12.52360 12
15 Mar-00 A 365697 156 Mar 2000 2 12.54861 14
16 Mar-01 B 363609 7 Mar 2001 3 12.67984 24
17 May-00 A 376843 159 May 2000 2 12.54520 15
18 May-01 B 446279 0 May 2001 4 12.87698 12
19 Nov-00 A 317468 525 Nov 2000 2 12.33084 15
20 Nov-01 B 317468 525 Nov 2001 2 12.10331 15
21 Oct-00 A 273209 62 Oct 2000 NA NA 12
22 Oct-01 B 197832 214 Oct 2001 2 12.26445 12
23 Sep-00 A 213663 251 Sep 2000 4 12.10858 NA
24 Sep-01 B 183770 80 Sep 2001 NA NA 15

Predict on test data, using plm package in R, and calculate RMSE for test data

Regarding out-of-sample prediction with fixed effects models, it is not clear how data relating to fixed effects not in the original model are to be treated, e.g., data for an individual not contained in the orignal data set the model was estimated on. (This is rather a methodological question than a programming question).

The version 2.6-2 of plm allows predict for fixed effect models with the original data and with out-of-sample data (see ?predict.plm).

Find below an example with 10 firms for model estimation and the data to be used for prediction contains a firm not contained in the original data set (besides that firm, there are also years not contained in the original model object but these are irrelevant here as it is a one-way individual model). It is unclear what the fixed effect of that out-of-sample firm would be. Hence, by default, no predicted value is given (NA value). If argument na.fill is set to TRUE, the (weighted) mean of the fixed effects contained in the original model object is used as a best guess.

library(plm)
data("Grunfeld", package = "plm")

# fit a fixed effect model
fit.fe <- plm(inv ~ value + capital, data = Grunfeld, model = "within")

# generate 55 new observations of three firms used for prediction:
# * firm 1 with years 1935:1964 (has out-of-sample years 1955:1964),
# * firm 2 with years 1935:1949 (all in sample),
# * firm 11 with years 1935:1944 (firm 11 is out-of-sample)
set.seed(42L)

new.value2 <- runif(55, min = min(Grunfeld$value), max = max(Grunfeld$value))
new.capital2 <- runif(55, min = min(Grunfeld$capital), max = max(Grunfeld$capital))

newdata <- data.frame(firm = c(rep(1, 30), rep(2, 15), rep(11, 10)),
year = c(1935:(1935+29), 1935:(1935+14), 1935:(1935+9)),
value = new.value2, capital = new.capital2)
# make pdata.frame
newdata.p <- pdata.frame(newdata, index = c("firm", "year"))

## predict from fixed effect model with new data as pdata.frame
predict(fit.fe, newdata = newdata.p) # has NA values for the 11'th firm

## set na.fill = TRUE to have the weighted mean used to for fixed effects -> no NA values
predict(fit.fe, newdata = newdata.p, na.fill = TRUE)

NB: When you input a plain data.frame as newdata, it is not clear how the data related to the individuals and time periods, which is why the weighted mean of fixed effects from the original model object is used for all observations in newdata and a warning is printed. For fixed effect model prediction, it is reasonable to assume the user can provide information (via a pdata.frame) how the data the user wants to use for prediction relates to the individual and time dimension of panel data.



Related Topics



Leave a reply



Submit