duplicate couples (id-time) error in plm with only two IDs
Consider the following appropriate data.
set.seed(42)
(d1 <- transform(expand.grid(id=1:2, time=1:2), X=rnorm(4), y=rnorm(4)))
# id time X y
# 1 1 1 1.3709584 0.40426832
# 2 2 1 -0.5646982 -0.10612452
# 3 1 2 0.3631284 1.51152200
# 4 2 2 0.6328626 -0.09465904
library(plm)
plm(y ~ X, index=c("id", "time"), d1)
# works
Now let's duplicate the last row to simulate a flaw in the data,
(d1 <- rbind(d1, d1[nrow(d1), ]))
# id time X y
# 1 1 1 1.3709584 0.40426832
# 2 2 1 -0.5646982 -0.10612452
# 3 1 2 0.3631284 1.51152200
# 4 2 2 0.6328626 -0.09465904
# 41 2 2 0.6328626 -0.09465904 ## duplicated (X and y may be different though)
where we get an error:
plm(y ~ X, index=c("id", "time"), d1)
# Error in pdim.default(index[[1]], index[[2]]) :
# duplicate couples (id-time)
Similarly we get an error if we have data with id
, time
and some cond
ition:
(d2 <- transform(expand.grid(id=1:2, time=1:2, cond=0:1), X=rnorm(4), y=rnorm(4)))
# id time cond X y
# 1 1 1 0 2.0184237 -1.3888607
# 2 2 1 0 -0.0627141 -0.2787888
# 3 1 2 0 1.3048697 -0.1333213
# 4 2 2 0 2.2866454 0.6359504
# 5 1 1 1 2.0184237 -1.3888607
# 6 2 1 1 -0.0627141 -0.2787888
# 7 1 2 1 1.3048697 -0.1333213
# 8 2 2 1 2.2866454 0.6359504
plm(y ~ X, index=c("id", "time"), d2)
# Error in pdim.default(index[[1]], index[[2]]) :
# duplicate couples (id-time)
To overcome this, we can technically merge the two indices, whatever that means statistically:
(d2 <- transform(d2, id2=apply(d2[c("id", "cond")], 1, paste, collapse=".")))
# id time cond X y id2
# 1 1 1 0 2.0184237 -1.3888607 1.0
# 2 2 1 0 -0.0627141 -0.2787888 2.0
# 3 1 2 0 1.3048697 -0.1333213 1.0
# 4 2 2 0 2.2866454 0.6359504 2.0
# 5 1 1 1 2.0184237 -1.3888607 1.1
# 6 2 1 1 -0.0627141 -0.2787888 2.1
# 7 1 2 1 1.3048697 -0.1333213 1.1
# 8 2 2 1 2.2866454 0.6359504 2.1
plm(y ~ X, index=c("id2", "time"), d2)
# works
At the end, this stopifnot
should not yield an error, where c("id", "time")
corresponds to what you have defined in plm(..., index=c("id", "time"))
:
stopifnot(!any(duplicated(d1[c("id", "time")])))
# Error: !any(duplicated(d1[c("id", "time")])) is not TRUE
Is it ok to run a plm fixed effect model and add a factor dummy variable (tree way fixed effects)?
It is okay to add additional factors. We can prove this by calculating an LSDV model. As a preliminary note, you will of course need robust standard errors, usually clustered at the highest aggregate level, i.e. country in this case.
Note: R >= 4.1 is used in the following.
LSDV
fit1 <-
lm(y ~ d + x1 + x2 + x3 + x4 + factor(id) + factor(time) + factor(country),
dat)
lmtest::coeftest(
fit1, vcov.=sandwich::vcovCL(fit1, cluster=dat$country, type='HC0')) |>
{\(.) .[!grepl('\\(|factor', rownames(.)), ]}()
# Estimate Std. Error t value Pr(>|t|)
# d 10.1398727 0.3181993 31.8664223 4.518874e-191
# x1 1.1217514 1.6509390 0.6794627 4.968995e-01
# x2 3.4913273 2.7782157 1.2566797 2.089718e-01
# x3 0.6257981 3.3162148 0.1887085 8.503346e-01
# x4 0.1942742 0.8998307 0.2159008 8.290804e-01
After adding factor(country)
, the estimators we get with plm::plm
are identical to LSDV:
plm::plm
fit2 <- plm::plm(y ~ d + x1 + x2 + x3 + x4 + factor(country),
index=c('id', 'time'), model='within', effect='twoways', dat)
summary(fit2, vcov=plm::vcovHC(fit2, cluster='group', type='HC1'))$coe
# Estimate Std. Error t-value Pr(>|t|)
# d 10.1398727 0.3232850 31.3651179 5.836597e-186
# x1 1.1217514 1.9440165 0.5770277 5.639660e-01
# x2 3.4913273 3.2646905 1.0694206 2.849701e-01
# x3 0.6257981 3.1189939 0.2006410 8.409935e-01
# x4 0.1942742 0.9250759 0.2100089 8.336756e-01
However, cluster='group'
will refer to "id"
and not to "country"
, so the standard errors are wrong. It seems that clustering by the additional factor with plm
is currently not possible, at least I am not aware of anything.
Alternatively you may use lfe::felm
to not have to do without the immensely reduced computing times relative to LSDV:
lfe::felm
summary(lfe::felm(y ~ d + x1 + x2 + x3 + x4 | id + time + country | 0 | country,
dat))$coe
# Estimate Cluster s.e. t value Pr(>|t|)
# d 10.1398727 0.3184067 31.8456637 1.826374e-33
# x1 1.1217514 1.6520151 0.6790201 5.004554e-01
# x2 3.4913273 2.7800267 1.2558611 2.153737e-01
# x3 0.6257981 3.3183765 0.1885856 8.512296e-01
# x4 0.1942742 0.9004173 0.2157602 8.301083e-01
For comparison, here is what Stata computes, the standard errors closely resemble those of LSDV and lfe::felm
:
Stata
. reghdfe y d x1 x2 x3 x4, absorb (country time id) vce(cluster country)
y | Coef. Std. Err. t P>|t| [95% Conf. Interval]
-------------+----------------------------------------------------------------
d | 10.13987 .3185313 31.83 0.000 9.49907 10.78068
x1 | 1.121751 1.652662 0.68 0.501 -2.202975 4.446478
x2 | 3.491327 2.781115 1.26 0.216 -2.103554 9.086209
x3 | .6257981 3.319675 0.19 0.851 -6.052528 7.304124
x4 | .1942742 .9007698 0.22 0.830 -1.617841 2.006389
_cons | 14.26801 23.65769 0.60 0.549 -33.32511 61.86114
Simulated Panel Data:
n1 <- 20; t1 <- 4; n2 <- 48
dat <- expand.grid(id=1:n1, time=1:t1, country=1:n2)
set.seed(42)
dat <- within(dat, {
id <- as.vector(apply(matrix(1:(n1*n2), n1), 2, rep, t1))
d <- runif(nrow(dat), 70, 80)
x1 <- sample(0:1, nrow(dat), replace=TRUE)
x2 <- runif(nrow(dat))
x3 <- runif(nrow(dat))
x4 <- rnorm(nrow(dat))
y <-
10*d + ## treatment effect
as.vector(replicate(n2, rep(runif(n1, 2, 5), t1))) + ## id FE
rep(runif(n1, 10, 12), each=t1) + ## time FE
rep(runif(n2, 10, 12), each=n1*t1) + ## country FE
- .7*x1 + 1.3*x2 + 2.4*x3 +
.5 * x4 + rnorm(nrow(dat), 0, 50)
})
readstata13::save.dta13(dat, 'panel.dta') ## for Stata
Fixed Effects plm package R - multiple observations per year/id
The plm
function needs just one pair of id/time. For each id you supplied you have more than one year.
If each st_name
and race
pairs form an "individual" (or whatever the name you give to this dimension of the panel), then you could do:
library(dplyr)
my.data$id <- group_indices(my.data, st_name, race)
#which would be the same as my.data <- my.data %>% mutate(id = group_indices(st_name, race)), if this function supported mutate.
plm.reg <- plm(drugcrime_ar ~ decrim_dummy + median_income + factor(race),
data = my.data, index=c("id","year"), model = "within",
effect = "twoways")
See, however, that in this situation you are not using a kind of nested panel structure as @Helix123 suggested. You are only redefining the first dimension of the panel.
Linear regression with object oriented programming R
Your slot definitions shouldn't all be "character" class - you need to indicate the type of data you will be passing in to each:
setClass("definition",
slots = list(dv = "character",
vars = "character",
method = "character",
data = "data.frame",
model = "function",
plm = "list")
)
Secondly, it sounds like you want the model to run when your new object is created. To do this, you will need to define an initialize
method for your class:
setMethod("initialize", "definition",
function(.Object, dv, vars, method, data, model) {
fo <- as.formula(paste0(dv, "~", vars, "+ factor(year)"))
model_ols <- model(fo, index = "NAME",
model = method,
data = data)
.Object@plm <- list(model_ols)
.Object
}
)
If you only want your show
method to show the regression model, then it should do just that:
setMethod("show", "definition",
function(object) {
print(object@plm[[1]])
})
Before we create an object from your new class, we need to define some random data with the same names as your own data frame. You can skip this step since you already have the data.
set.seed(1)
mydata <- data.frame(year = rep(2010:2020, 3),
qe_ann = sample(100, 33, TRUE),
NAME = rep(c("A", "B", "C"), each = 11))
mydata$npl_lnratio <- mydata$year - 2000 + mydata$qe_ann / 100 + rnorm(33)
Now we can create an instance of our new class:
ols <- new("definition",
dv = "npl_lnratio",
vars = "qe_ann",
method = "within",
data = mydata,
model = plm)
Remember that show
will be invoked if we simply type the name of our object into the console:
ols
#>
#> Model Formula: npl_lnratio ~ qe_ann + factor(year)
#> <environment: 0x00000176689c3ec0>
#>
#> Coefficients:
#> qe_ann factor(year)2011 factor(year)2012 factor(year)2013 factor(year)2014
#> 0.011662 0.222901 0.115303 2.614314 3.563795
#> factor(year)2015 factor(year)2016 factor(year)2017 factor(year)2018 factor(year)2019
#> 4.308154 5.627329 6.624514 7.314393 8.909426
#> factor(year)2020
#> 9.254426
Related Topics
Ggplot2: Adding Lines in a Loop and Retaining Colour Mappings
How to Pass Multiple Group_By Arguments and a Dynamic Variable Argument to a Dplyr Function
R Shiny - Checkboxes and Action Button Combination Issue
How to Create a Dropdown List in a Shiny Table Using Datatable When Editing the Table
Changing Line Color in Ggplot Based on Slope
In R Data.Frame, Promote Rownames to Actual Column
Convert Byte Encoding to Unicode
How to Embed Plots into a Tab in Rmarkdown in a Procedural Fashion
Calculating the Distance Between Points in Different Data Frames
Convert Data with One Column and Multiple Rows into Multi Column Multi Row Data
How to Force the X-Axis Tick Marks to Appear at the End of Bar in Heatmap Graph
Importing Multiple .CSV Files with Variable Column Types into R
How to Unlock Environment in R
Bar Plot for Count Data by Group in R
Selecting Unique Rows in Matrix Using R