ecstasy 2023. 5. 20. 12:38

Statistical Learning Theory

$${\displaystyle \text{Statistical Learning Theory} }$$ $${\displaystyle \begin{aligned} \text{Expected Risk:} &&\quad I[f] &=\displaystyle \int _{X\times Y}V(f({\vec {x}}),y)\,p({\vec {x}},y)\,d{\vec {x}}\,dy \\ \text{Empirical Risk:} &&\quad I_{S}[f]&={\frac {1}{n}}\displaystyle \sum _{{i=1}}^{n} \underbrace{V(f({\vec {x}}_{i}), y_{i})}_{\text{Convex/Concave Function}} \\ \end{aligned} }$$
$${\displaystyle \text{Least Squares Estimation} \quad V(f({\vec {x}}),y)=(y-f({\vec {x}}))^{2} }$$ $${\displaystyle \mathbf {X} {\boldsymbol {\beta }}=\mathbf {y}, \quad {\hat {\boldsymbol {\beta }}}={\underset {\boldsymbol {\beta }}{\operatorname {arg\,min} }}\,S({\boldsymbol {\beta }}), }$$ $${\displaystyle \begin{aligned} \; & \text{(1) OLS: Ordinary Least Squares} & \; \\ \; & \qquad S({\boldsymbol {\beta }})=\sum _{i=1}^{n}\left|y_{i}-\sum _{j=1}^{p}X_{ij}\beta _{j}\right|^{2}=\left\|\mathbf {y} -\mathbf {X} {\boldsymbol {\beta }}\right\|^{2} & \; \\ \; & \qquad {\hat {\boldsymbol {\beta }}}_{OLS}=\left(\mathbf {X} ^{\mathsf {T}}\mathbf {X} \right)^{-1}\mathbf {X} ^{\mathsf {T}}\mathbf {y} & \; \\ \; & \; & \; \\ \; & \text{(2) WLS: Weighted Least Squares} & \; \\ \; & \qquad S({\boldsymbol {\beta }}) = \sum _{i=1}^{n}w_{i}\left|y_{i}-\sum _{j=1}^{p}X_{ij}\beta _{j}\right|^{2}=\,\left\| \mathbf{W}^{\frac {1}{2}}\left(\mathbf {y} -\mathbf {X} {\boldsymbol {\beta }}\right)\right\|^{2} & \; \\ \; & \qquad {\hat {\boldsymbol {\beta }}}_{WLS}=(\mathbf{X}^{\textsf {T}} \mathbf{W}\mathbf{X})^{-1} \mathbf{X}^{\textsf {T}} \mathbf{W}\mathbf{y} & \; \\ \; & \; & \; \\ \; & \text{(3) GLS: Generalized Least Squares} & \; \\ \; & \qquad S({\boldsymbol {\beta }})= \sum _{i=1}^{n}\sigma_{i}^{-1}\left|y_{i}-\sum _{j=1}^{p}X_{ij}\beta _{j}\right|^{2} =\,\left\| \boldsymbol{\Omega}^{ - \frac {1}{2}}\left(\mathbf {y} -\mathbf {X} {\boldsymbol {\beta }}\right)\right\|^{2} & \; \\ \; & \qquad \boldsymbol {\hat {\beta }}_{GLS} =\left(\mathbf {X} ^{\mathsf {T}}\mathbf {\Omega } ^{-1}\mathbf {X} \right)^{-1}\mathbf {X} ^{\mathsf {T}}\mathbf {\Omega } ^{-1}\mathbf {y} & \; \\ \; & \qquad \boldsymbol{\Omega}= \boldsymbol{\Sigma} \otimes \mathbf{I}_{N} & \; \\ \; & \; & \; \\ \; & \text{(4) FGLS: Feasible GLS} & \; \\ \; & \qquad S({\boldsymbol {\beta }})= \sum _{i=1}^{n}\hat{\sigma}_{i}^{-1}\left|y_{i}-\sum _{j=1}^{p}X_{ij}\beta _{j}\right|^{2} =\,\left\| 
\boldsymbol{\hat{\Omega}}^{-\frac {1}{2}}\left(\mathbf {y} -\mathbf {X} {\boldsymbol {\beta }}\right)\right\|^{2} & \; \\ \; & \qquad \boldsymbol {\hat {\beta }}_{FGLS} =\left(\mathbf {X} ^{\mathsf {T}}\mathbf {\hat{\Omega} } ^{-1}\mathbf {X} \right)^{-1}\mathbf {X} ^{\mathsf {T}}\mathbf {\hat{\Omega} } ^{-1}\mathbf {y} & \; \\ \; & \qquad \hat{\boldsymbol{\Omega}}=\hat{\boldsymbol{\Sigma}}\otimes \mathbf{I}_{N}, \quad \hat{\boldsymbol{\Sigma}}=N^{-1}\left[\begin{array}{cccc} \hat{\epsilon}_{1} & \hat{\epsilon}_{2} & \ldots & \hat{\epsilon}_{N}\end{array}\right]^{\prime}\left[\begin{array}{cccc} \hat{\epsilon}_{1} & \hat{\epsilon}_{2} & \ldots & \hat{\epsilon}_{N}\end{array}\right] \; \\ \; & \text{(5) IV: Instrumental Variables} & \; \\ \; & \qquad S({\boldsymbol {\beta }})= \sum _{i=1}^{n}z_{i}\left|y_{i}-\sum _{j=1}^{p}X_{ij}\beta _{j}\right|^{2} =\,\left\| \mathbf{Z}^{ - \frac {1}{2}}\left(\mathbf {y} -\mathbf {X} {\boldsymbol {\beta }}\right)\right\|^{2} & \; \\ \; & \qquad \boldsymbol {\hat {\beta }}_{IV} =\left(\mathbf {Z} ^{\mathsf {T}}\mathbf {X} \right)^{-1}\mathbf {Z} ^{\mathsf {T}}\mathbf {y} & \; \\ \; & \; & \; \\ \; & \text{(6) GMM: Generalized Method of Moments} & \; \\ \; & \qquad \boldsymbol {\hat {\beta }}_{GMM} =\left(\mathbf {X} ^{\mathsf {T}}\mathbf{P}_{\mathbf{Z}}\mathbf {X} \right)^{-1}\mathbf {X} ^{\mathsf {T}}\mathbf{P}_{\mathbf{Z}} \mathbf {y} & \; \\ \; & \qquad \mathbf{P}_{\mathbf{Z}}=\mathbf{Z}(\mathbf{Z}^{\mathrm {T} }\mathbf{Z})^{-1}\mathbf{Z}^{\mathrm {T} } & \; \\ \; & \; & \; \\ \; & \text{(7) IV2SLS: Instrumental Variables: 2-Stage Least Squares} & \; \\ \; & \qquad \boldsymbol {\hat {\beta }}_{IV2SLS} =\left(\mathbf {\widehat{X}} ^{\mathsf {T}}\mathbf {\widehat{X}} \right)^{-1}\mathbf {\widehat{X}} ^{\mathsf {T}}\mathbf {y} & \; \\ \; & \qquad \mathbf {\widehat{X}} = \mathbf {P}_{\mathbf {Z}}\mathbf {X} = \mathbf{Z}(\mathbf{Z}^{\mathrm {T} }\mathbf{Z})^{-1}\mathbf{Z}^{\mathrm {T} } \mathbf {X} & \; \\ \; & \; & \; \\ 
\end{aligned} }$$

Linear Model Design

# API cheat sheet: linear-model constructors grouped by package.  Each line is
# a bare attribute lookup whose trailing `#(...)` comment records the
# constructor's signature for quick reference (the lines have no side effects).
import statsmodels.formula.api as smf
smf.ols#(formula, data, subset=None, drop_cols=None)
smf.wls#(formula, data, subset=None, drop_cols=None)
smf.gls#(formula, data, subset=None, drop_cols=None)
smf.glm#(formula, data, subset=None, drop_cols=None)
smf.mixedlm#(formula, data, re_formula=None, vc_formula=None, subset=None, use_sparse=False, missing='none')

# Generalized Method of Moments
from statsmodels.sandbox.regression import gmm
gmm.GMM#(endog, exog, instrument, k_moms=None, k_params=None, missing='none')
gmm.IV2SLS#(endog, exog, instrument=None)
gmm.IVGMM#(endog, exog, instrument, k_moms=None, k_params=None, missing='none')
gmm.LinearIVGMM#(endog, exog, instrument, k_moms=None, k_params=None, missing='none')
gmm.NonlinearIVGMM#(endog, exog, instrument, func)

# Instrumental Variable Estimation
# https://bashtage.github.io/linearmodels/iv/mathematical-formula.html
from linearmodels import iv
iv.IV2SLS#(dependent, exog=None, endog=None, instruments=None, weights=None)
iv.IVGMM#(dependent, exog=None, endog=None, instruments=None, weights=None, weight_type='robust', weight_config=dict())
iv.IVGMMCUE#(dependent, exog=None, endog=None, instruments=None, weights=None, weight_type='robust', weight_config=dict())
iv.IVLIML#(dependent, exog=None, endog=None, instruments=None, weights=None, weight_type='robust', fuller=0, kappa=None)
iv.AbsorbingLS#(dependent, exog=None, absorb=None, interactions=None, weights=None, drop_absorbed=False)
iv.Interaction#(cat=None, cont=None, nobs=None)

# Panel Data Model Estimation
# https://bashtage.github.io/linearmodels/panel/mathematical-formula.html
from linearmodels import panel
panel.PanelOLS#(dependent, exog, weights=None, entity_effects=False, time_effects=False, other_effects=None, singletons=True, drop_absorbed=False, check_rank=True)      # fixed effect model
panel.RandomEffects#(dependent, exog, weights=None, check_rank=True) # random effect model
panel.BetweenOLS#(dependent, exog, weights=None, check_rank=True)
panel.FirstDifferenceOLS#(dependent, exog, weights=None, check_rank=True)
panel.FamaMacBeth#(dependent, exog, weights=None, check_rank=True)
panel.PooledOLS#(dependent, exog, weights=None, check_rank=True)

# Linear Factor Models for Asset Pricing
# https://bashtage.github.io/linearmodels/asset-pricing/mathematical-formula.html
from linearmodels.asset_pricing import model
model.LinearFactorModel#(portfolios, factors, risk_free=False, sigma=None)
model.LinearFactorModelGMM#(portfolios, factors, risk_free=False)
model.TradedFactorModel#(portfolios, factors)

# System Regression Models
# https://bashtage.github.io/linearmodels/system/mathematical-formula.html
from linearmodels import system
system.SUR#(equations, sigma=None)
system.IV3SLS#(equations, sigma=None)
system.IVSystemGMM#(equations, sigma=None, weight_type='robust', weight_config=dict())

# Multi-Level Regression Model References
# https://www.pymc.io/projects/docs/en/v3/pymc-examples/examples/generalized_linear_models/GLM.html
# https://eshinjolly.com/pymer4/

 

 

Instrumental variables estimation

#

 

 

Generalized Method of Moments

#

 

 

Covariance Estimation

$${\displaystyle \begin{cases} \text{Diagonal Covariance Structure} \\ \text{Compound Symmetry Covariance Structure} \\ \text{Unstructured Covariance Structure} \\ \text{Autoregressive Covariance Structure} \\ \text{Toeplitz Covariance Structure} \\ \text{Spatial Covariance Structure} \\ \text{Heteroscedastic Covariance Structure} \\ \text{Custom Covariance Structure} \\ \end{cases} }$$

 

# https://www.ibm.com/docs/en/spss-statistics/beta?topic=statistics-covariance-structures

 

 

 


Panel Analysis

Panel Analysis Overview

$${\displaystyle \text{Linear Model Assumptions} }$$ $${\displaystyle \begin{aligned} p(\mathbf{y}_{i}, \mathbf{x}_{i}) &= p(\mathbf{y}_{i} \mid \mathbf{x}_{i}) \, p(\mathbf{x}_{i}) \\ \; &= \left[ \int p(\mathbf{y}_{i} \mid \mathbf{x}_{i}, \alpha_{i}, \boldsymbol{\lambda}) \, \underbrace{p(\alpha_{i}, \boldsymbol{\lambda} \mid \mathbf{x}_{i})}_{= p(\alpha_{i}, \boldsymbol{\lambda}) = p(\alpha_{i})p(\boldsymbol{\lambda})} \; d \alpha_{i} d \boldsymbol{\lambda} \right] \, p(\mathbf{x}_{i}) \\ \end{aligned} }$$ $${\displaystyle \begin{aligned} \operatorname{E}[\alpha_{i}] &= 0, &\quad \operatorname{E}[\lambda_{t}] &= 0, &\quad \operatorname{E}[u_{it}] &= 0\\ \operatorname{E}[\alpha_{i}\lambda_{t}] &= 0, &\quad \operatorname{E}[\alpha_{i}u_{it}] &= 0, &\quad \operatorname{E}[\lambda_{t}u_{it}] &= 0\\ \operatorname{E}[\alpha_{i}\alpha_{i^{\prime}}] &= \sigma_{\alpha}\mathbf{I}_{N}, &\quad \operatorname{E}[\lambda_{t}\lambda_{t^{\prime}}] &= \sigma_{\lambda}\mathbf{I}_{T}, &\quad \operatorname{E}[u_{it}u_{i^{\prime}t^{\prime}}] &= \sigma_{u} (\mathbf{I}_{N} \otimes \mathbf{I}_{T}) \\ \operatorname{E}[\alpha_{i}\mathbf{x}_{it}^{T}] &= \mathbf{0}^{T}, &\quad \operatorname{E}[\lambda_{t}\mathbf{x}_{it}^{T}] &= \mathbf{0}^{T}, &\quad \operatorname{E}[u_{it}\mathbf{x}_{it}^{T}] &= \mathbf{0}^{T} \\ \end{aligned} }$$ $${\displaystyle \begin{aligned} \therefore \; \sigma_{y} &= \sigma_{\alpha} + \sigma_{\lambda} + \sigma_{u} \\ \end{aligned} }$$
$${\displaystyle \text{Linear Model} }$$ $${\displaystyle \begin{aligned} \; & \quad \text{for}\; i \in \{1, \dots, N\}, \; t \in \{1, \dots, T\} \\ y_{it} &= \underbrace{\sum_{k=1}^{p}\gamma_{ki}z_{ki}}_{\text{time invariant}} + \underbrace{\sum_{k=1}^{l}\rho_{kt}r_{kt}}_{\text{cross-sectional invariant}} + \sum_{k=1}^{K}\beta_{kit}x_{kit} + v_{it} \\ \; &= \boldsymbol{\gamma}_{i}^{T}\mathbf{z}_{i} + \boldsymbol{\rho}_{t}^{T}\mathbf{r}_{t} + \boldsymbol{\beta}_{it}^{T}\mathbf{x}_{it} + v_{it} \\ \; &= \underbrace{\mathbf{z}_{i}^{T}\boldsymbol{\gamma}_{i}}_{(1 \times p) \times (p \times 1)} + \underbrace{\mathbf{r}_{t}^{T}\boldsymbol{\rho}_{t}}_{(1 \times l) \times (l \times 1)} + \underbrace{\mathbf{x}_{it}^{T}\boldsymbol{\beta}_{it}}_{(1 \times K) \times (K \times 1)} + v_{it} \\ \; & \\ y_{it} &= \mathbf{z}_{i}^{T}\boldsymbol{\gamma}_{i} + \mathbf{r}_{t}^{T}\boldsymbol{\rho}_{t} + \mathbf{x}_{it}^{T}\boldsymbol{\beta}_{it} + v_{it}, \qquad \qquad v_{it} = \alpha_{it}^{*} + u_{it} \\ \; &= \mathbf{z}_{i}^{T}\boldsymbol{\gamma}_{i} + \mathbf{r}_{t}^{T}\boldsymbol{\rho}_{t} + \mathbf{x}_{it}^{T}\boldsymbol{\beta}_{it} + \alpha_{it}^{*} + u_{it}, \quad \; \alpha_{it}^{*} = \mu + \alpha_{i} + \lambda_{t} \\ \; &= \mathbf{z}_{i}^{T}\boldsymbol{\gamma}_{i} + \mathbf{r}_{t}^{T}\boldsymbol{\rho}_{t} + \mathbf{x}_{it}^{T}\boldsymbol{\beta}_{it} + \mu + \alpha_{i} + \lambda_{t} + u_{it} \\ \; &= \underbrace{\mu + \mathbf{z}_{i}^{T}\boldsymbol{\gamma}_{i} + \mathbf{r}_{t}^{T}\boldsymbol{\rho}_{t} + \mathbf{x}_{it}^{T}\boldsymbol{\beta}_{it}}_{\text{Instrumental Variable Estimation}} + \underbrace{\alpha_{i}}_{\text{Fixed/Random Effect}} + \underbrace{\lambda_{t}}_{\text{Fixed/Random Effect}} + u_{it} \\ \end{aligned} }$$ $${\displaystyle \begin{aligned} \text{Fixed Effect Model} \qquad & \sum_{i=1}^{N} \alpha_{i}=0 \; \And \; \sum_{t=1}^{T} \lambda_{t}=0 \\ \text{Random Effect Model} \qquad & \sum_{i=1}^{N} \alpha_{i} \ne 0 \; \And \; \sum_{t=1}^{T} \lambda_{t} \ne 0 \\ 
\text{Mixed Effect Model} \qquad & \text{otherwise} \\ \end{aligned} }$$
$${\displaystyle \underbrace{\begin{bmatrix} \mathbf{y}_{1} \\ \mathbf{y}_{2} \\ \vdots \\ \mathbf{y}_{N} \\ \end{bmatrix}}_{NT \times 1} = \underbrace{\begin{bmatrix} \mathbf{e}_{T} & \mathbf{Z}_{1} & \mathbf{R} & \mathbf{X}_{1} \\ \mathbf{e}_{T} & \mathbf{Z}_{2} & \mathbf{R} & \mathbf{X}_{2} \\ \vdots & \vdots & \vdots & \vdots \\ \mathbf{e}_{T} & \mathbf{Z}_{N} & \mathbf{R} & \mathbf{X}_{N} \\ \end{bmatrix} \begin{bmatrix} \mu \\ \boldsymbol{\gamma} \\ \boldsymbol{\rho} \\ \boldsymbol{\beta} \\ \end{bmatrix}}_{\text{Instrumental Variable Estimation}} + (\mathbf{I}_{N} \otimes \mathbf{e}_{T} ) \boldsymbol{\alpha} + ( \mathbf{e}_{N} \otimes \mathbf{I}_{T} ) \boldsymbol{\lambda} + \begin{bmatrix} \mathbf{u}_{1} \\ \mathbf{u}_{2} \\ \vdots \\ \mathbf{u}_{N} \\ \end{bmatrix} }$$ $${\displaystyle \underbrace{\boldsymbol{\alpha}}_{N \times 1} = \begin{bmatrix} \alpha_{1} \\ \alpha_{2} \\ \vdots \\ \alpha_{N} \\ \end{bmatrix}, \quad \underbrace{\boldsymbol{\lambda}}_{T \times 1} = \begin{bmatrix} \lambda_{1} \\ \lambda_{2} \\ \vdots \\ \lambda_{T} \\ \end{bmatrix}, \quad \underbrace{\mathbf{R}}_{T \times l} = \begin{bmatrix} \mathbf{r}_{1}^{T} \\ \mathbf{r}_{2}^{T} \\ \vdots \\ \mathbf{r}_{T}^{T} \\ \end{bmatrix}, \quad \underbrace{\mathbf{e}_{N}}_{N \times 1} = \begin{bmatrix} 1 \\ 1 \\ \vdots \\ 1 \\ \end{bmatrix}, \quad \underbrace{\mathbf{e}_{T}}_{T \times 1} = \begin{bmatrix} 1 \\ 1 \\ \vdots \\ 1 \\ \end{bmatrix} }$$
$${\displaystyle \begin{aligned} \text{No Constraint} \quad \text{varing over time and individuals} && \quad y_{it} &= \alpha_{it}^{*} + \boldsymbol{\beta}_{it}^{T}\mathbf{x}_{it} +u_{it} \\ \; && \; &= \alpha_{it}^{*} + \boldsymbol{\beta}_{it}^{T}\mathbf{x}_{it} + 0 \\ \end{aligned} }$$ $${\displaystyle \text{Intercept Constraint}\quad \begin{cases} \text{varing over time} && \quad y_{it} &= \alpha_{t}^{*} + \boldsymbol{\beta}_{it}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over individuals} && \quad y_{it} &= \alpha_{i}^{*} + \boldsymbol{\beta}_{it}^{T}\mathbf{x}_{it} +u_{it} \\ \text{being constant} && \quad y_{it} &= \alpha^{*} + \boldsymbol{\beta}_{it}^{T}\mathbf{x}_{it} +u_{it} \\ \end{cases} }$$ $${\displaystyle \text{Slope Constraint}\quad \begin{cases} \text{varing over time} && \quad y_{it} &= \alpha_{it}^{*} + \boldsymbol{\beta}_{t}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over individuals} && \quad y_{it} &= \alpha_{it}^{*} + \boldsymbol{\beta}_{i}^{T}\mathbf{x}_{it} +u_{it} \\ \text{being constant} && \quad y_{it} &= \alpha_{it}^{*} + \boldsymbol{\beta}^{T}\mathbf{x}_{it} +u_{it} \\ \end{cases} }$$ $${\displaystyle \text{Intercept-Slope Constraint}\quad \begin{cases} \text{varing over time (1)} && \quad y_{it} &= \alpha_{t}^{*} + \boldsymbol{\beta}_{t}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over time (2)} && \quad y_{it} &= \alpha_{t}^{*} + \boldsymbol{\beta}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over time (3)} && \quad y_{it} &= \alpha^{*} + \boldsymbol{\beta}_{t}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over individuals (1)} && \quad y_{it} &= \alpha_{i}^{*} + \boldsymbol{\beta}_{i}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over individuals (2)} && \quad y_{it} &= \alpha_{i}^{*} + \boldsymbol{\beta}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing over individuals (3)} && \quad y_{it} &= \alpha^{*} + \boldsymbol{\beta}_{i}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing crossover time and individuals (1)} && \quad y_{it} &= \alpha_{i}^{*} + 
\boldsymbol{\beta}_{t}^{T}\mathbf{x}_{it} +u_{it} \\ \text{varing crossover time and individuals (2)} && \quad y_{it} &= \alpha_{t}^{*} + \boldsymbol{\beta}_{i}^{T}\mathbf{x}_{it} +u_{it} \\ \text{being constant} && \quad y_{it} &= \alpha^{*} + \boldsymbol{\beta}^{T}\mathbf{x}_{it} +u_{it} \\ \end{cases} }$$

 

 

Panel Data Design: Data Formats for Panel Data Analysis

1-Level Panel Design

import numpy as np
import pandas as pd

# Simulate an unbalanced 1-level panel: 5 individuals, each observed over a
# Poisson(200)-distributed number of hourly timestamps ending at '00:00:00'.
# NOTE: the hourly offset alias is lowercase 'h'; uppercase 'H' is deprecated
# since pandas 2.2 and removed in pandas 3.0.  Using 'h' here also makes this
# block consistent with the "Summary" section below, which already uses it.
panels = map(lambda group: list(map(lambda timestamp: [group, timestamp], pd.date_range(end='00:00:00', periods=np.random.poisson(200), freq='h'))), range(5))
for idx, a_panel in enumerate(panels):
    # Stack each individual's [group, timestamp] rows into one object array.
    data = np.r_[data, np.array(a_panel)] if idx else np.array(a_panel)
# 'Dependent' starts at zero and is filled with the data-generating process below.
paneldata = pd.DataFrame(np.c_[data, np.zeros(data.shape[0])], columns=['Individual', 'DateTime', 'Dependent']).set_index('Individual')
# Re-index to a (Individual, within-group position) MultiIndex: the groupby/apply
# produces one Series per individual whose inner index is the time-order 0..T_i-1.
paneldata.index = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h'))).index
paneldata.index = paneldata.index.set_levels([paneldata.index.levels[0].astype(int), paneldata.index.levels[1].astype(int)])
paneldata.index.names = ['Individual', 'TimeIndexOrder']

# Panel Index: Individual * TimeIndexOrder
# Derive calendar components per individual; each helper Series carries the same
# (Individual, TimeIndexOrder) MultiIndex so assignment below aligns by index.
hour = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').hour)); hour.index.names = ['Individual', 'TimeIndexOrder']
day = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').day)); day.index.names = ['Individual', 'TimeIndexOrder']
month = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').month)); month.index.names = ['Individual', 'TimeIndexOrder']
year = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').year)); year.index.names = ['Individual', 'TimeIndexOrder']
paneldata['Hour'] = hour
paneldata['Day'] = day
paneldata['Month'] = month
paneldata['Year'] = year
# Approximate fractional-year timestamp (coarse: ignores month lengths).
paneldata['TimeStamp'] = year + month/12 + day/365 + hour/(365*24)

# Panel Value: Dependent ~ f(Predictors)
paneldata['X0'] = np.random.normal(3, 5, size=paneldata.shape[0])
paneldata['X1'] = np.random.normal(5, 3, size=paneldata.shape[0])
paneldata['X2'] = np.random.normal(4, 1, size=paneldata.shape[0])
# True data-generating process: Dependent = 2*X0 + 3*X1 + 4*X2 + N(0, 1) noise,
# drawn per individual so each panel gets its own noise vector.
paneldata['Dependent'] = paneldata.groupby('Individual', group_keys=False).apply(lambda panel: panel['Dependent'] + 2*panel['X0'] + 3*panel['X1'] + 4*panel['X2'] + np.random.normal(size=panel.shape[0]))

 

2-Level Nested or Crossed Group Design

  • Nested Group: number of higher-level groups < number of lower-level groups

G1 G1 G1 G2 G2 G2 G3 G3 G3
g1 g2 g3 g4 g5 g6 g7 g8 g9
  • Crossed Group

G1 G1 G1 G2 G2 G2 G3 G3 G3
g1 g2 g3 g1 g2 g3 g1 g2 g3
import numpy as np
import pandas as pd

# Three equivalent ways to build repeated/blocked integer group labels:
np.kron(np.arange(5), np.ones(3))  # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
np.repeat(np.arange(5), repeats=3) # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
np.tile(np.arange(3), reps=5)      # [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]

# nested group
# Each lower-level unit (G2) belongs to exactly one higher-level group (G1):
# G2 ids 0..11 are unique across the four G1 blocks.
individuals = np.arange(12)
group = np.kron(np.arange(4), np.ones(3)) # 4: number of groups, 3: number of instances in the group
nested_group = pd.DataFrame(index=pd.MultiIndex.from_arrays(np.c_[group, individuals].T, names=['G1', 'G2']))
nested_group['e'] = np.random.normal(size=12) #nested_group.unstack(-1)
# One N(0, 1) draw per G1 group, broadcast to its rows (`e*0 +` keeps the index/shape).
G1_effect = nested_group.groupby(['G1'], group_keys=False)[['e']].apply(lambda e: e*0 + np.random.normal(0, 1)).rename(columns={'e':'G1_Effect'})
# One draw per (G1, G2) cell: the G2 effect is nested within G1.
G1G2_effect = nested_group.groupby(['G1', 'G2'], group_keys=False)[['e']].apply(lambda e: e*0 + np.random.normal(0, 1)).rename(columns={'e':'G1G2_Effect'})
nested_group = pd.concat([nested_group, G1_effect, G1G2_effect], axis=1)
# Response = group effect + nested cell effect + idiosyncratic error.
nested_group['y'] = nested_group['G1_Effect'] + nested_group['G1G2_Effect'] + nested_group['e']
nested_group

# crossed group
# The same G2 ids (0, 1, 2) recur inside every G1 block, so G1 and G2 cross.
individuals = np.tile(np.arange(3), reps=4)
group = np.kron(np.arange(4), np.ones(3))
crossed_group = pd.DataFrame(index=pd.MultiIndex.from_arrays(np.c_[group, individuals].T, names=['G1', 'G2']))
crossed_group['e'] = np.random.normal(size=12) #crossed_group.unstack(-1)
G1_effect = crossed_group.groupby(['G1'], group_keys=False)[['e']].apply(lambda e: e*0 + np.random.normal(0, 1)).rename(columns={'e':'G1_Effect'})
# Grouping by G2 alone pools rows across all G1 blocks — the crossed (non-nested) effect.
G2_effect = crossed_group.groupby(['G2'], group_keys=False)[['e']].apply(lambda e: e*0 + np.random.normal(0, 1)).rename(columns={'e':'G2_Effect'})
crossed_group = pd.concat([crossed_group, G1_effect, G2_effect], axis=1)
crossed_group['y'] = crossed_group['G1_Effect'] + crossed_group['G2_Effect'] + crossed_group['e']
crossed_group
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

# Simulate a 2-level nested design (20 G1 groups x 30 G2 cells, replicated 10x)
# and compare a fixed-effects OLS/ANOVA fit with a REML mixed-model fit.
n_group1 = 20; n_group2 = 30
data = pd.DataFrame(data=0, columns=['e'], index=pd.MultiIndex.from_product([np.arange(n_group1), np.arange(n_group2)], names=['G1', 'G2']))
# One N(0, 2) draw per G1 group, broadcast over its rows (`e +` keeps index/shape).
G1_effect = data.groupby(['G1'], group_keys=False)[['e']].apply(lambda e: e + np.random.normal(0, 2)).rename(columns={'e':'G1_Effect'})
# One N(0, 3) draw per (G1, G2) cell — the nested within-group effect.
G1G2_effect = data.groupby(['G1', 'G2'], group_keys=False)[['e']].apply(lambda e: e + np.random.normal(0, 3)).rename(columns={'e':'G1G2_Effect'})
data = pd.concat([data, G1_effect, G1G2_effect], axis=1)
# 10 replicate observations per cell (same group effects, fresh X and noise below).
data = pd.concat([data]*10, axis=0)
data['X'] = np.random.normal(size=(data.shape[0],))
# True model: y = 3*X + G1 effect + nested (G1, G2) effect + N(0, 1) noise.
data['y'] = 3*data['X'] + data['G1_Effect'] + data['G1G2_Effect'] + np.random.normal(0, 1, size=data.shape[0])

# Fixed-effect view: sum-coded G1 plus G2 nested within G1 as an interaction.
ols_result = smf.ols("y ~ C(G1, Sum) + C(G1, Sum):C(G2) + X", data=data.reset_index()).fit()
# NOTE(review): `display` is IPython/Jupyter-only; use print(...) in plain scripts.
display(sms.anova_lm(ols_result))

# Mixed-effect view: random intercept per G1 group (re_formula="1") plus a
# variance component for G2 within each G1 group (vc_formula), fit by REML.
reml_result = smf.mixedlm('y ~ 1 + X', re_formula="1", vc_formula={'G2':'~ 0 + C(G2)'}, groups='G1', data=data.reset_index()).fit()
display(reml_result.summary())

 

 

 

 

 

General Linear Model on Panel

 

 

 


Panel Analysis (1): Structure-approach

Static Model

$${\displaystyle \text{Simple Regression with Variable Intercepts} }$$ $${\displaystyle \begin{aligned} y_{it} &= \alpha_{i}^{*} + \mathbf{x}_{it}^{T}\boldsymbol{\beta} +u_{it} \\ \boldsymbol{\alpha} &= \underbrace{\begin{bmatrix} D_{11} & \cdots & D_{1n} \\ \vdots & \ddots & \vdots \\ D_{n1} & \cdots & D_{nn} \end{bmatrix}}_{\text{Design Matrix: Contrast}} \underbrace{\begin{bmatrix} \alpha_{1} & \alpha_{2} & \dots & \alpha_{n} \\ \end{bmatrix}}_{\text{Coefficients of } \boldsymbol{\alpha}} \\ \; &= \begin{bmatrix} \mathbf{D}_{1} \\ \mathbf{D}_{2} \\ \vdots \\ \mathbf{D}_{n} \\ \end{bmatrix} \begin{bmatrix} \alpha_{1} & \alpha_{2} & \dots & \alpha_{n} \\ \end{bmatrix} \\ \; &= \alpha_{1}\mathbf{D}_{1} + \alpha_{2}\mathbf{D}_{2} + \dots + \alpha_{n}\mathbf{D}_{n} \\ \end{aligned} }$$ $${\displaystyle \begin{aligned} \mathbf{D}_{full} &= \begin{bmatrix} \mathbf{D}_{1} \\ \mathbf{D}_{2} \\ \vdots \\ \mathbf{D}_{n} \\ \end{bmatrix} = \underbrace{\begin{bmatrix} D_{11} & \cdots & D_{1n} \\ \vdots & \ddots & \vdots \\ D_{n1} & \cdots & D_{nn} \end{bmatrix}}_{\text{Design Matrix: Contrast}} = \begin{bmatrix} \color{Red}{1} & 0 & 0 & \cdots & 0 & 0 & 0 \\ 0 & \color{Red}{1} & 0 & \cdots & 0 & 0 & 0 \\ 0 & 0 & \color{Red}{1} & \cdots & 0 & 0 & 0\\ \vdots & \vdots & \vdots & \color{Red}{\ddots} & \vdots & \vdots & \vdots \\ 0 & 0 & 0 & \cdots & \color{Red}{1} & 0 & 0\\ 0 & 0 & 0 & \cdots & 0 & \color{Red}{1} & 0\\ 0 & 0 & 0 & \cdots & 0 & 0 & \color{Red}{1} \\ \end{bmatrix} \\ \end{aligned} }$$ $${\displaystyle \begin{aligned} \mathbf{D}_{reduce} &= \begin{bmatrix} \mathbf{D}_{1} \\ \mathbf{D}_{2} \\ \vdots \\ \mathbf{D}_{n} \\ \end{bmatrix} = \underbrace{\begin{bmatrix} D_{11} & \cdots & D_{1n} \\ \vdots & \ddots & \vdots \\ D_{n1} & \cdots & D_{nn} \end{bmatrix}}_{\text{Design Matrix: Contrast}} = \begin{bmatrix} \color{Red}{1} & 0 & \color{Red}{1} & \cdots & 0 & 0 & 0 \\ 0 & \color{Red}{1} & \color{Red}{1} & \cdots & 0 & 0 & 0 \\ 0 & 0 & \color{Red}{1} & \cdots & 0 & 0 & 
0\\ \vdots & \vdots & \color{Red}{\vdots} & \color{Red}{\ddots} & \vdots & \vdots & \vdots \\ 0 & 0 & \color{Red}{1} & \cdots & \color{Red}{1} & 0 & 0\\ 0 & 0 & \color{Red}{1} & \cdots & 0 & \color{Red}{1} & 0\\ 0 & 0 & \underbrace{\color{Red}{1}}_{ref} & \cdots & 0 & 0 & \color{Red}{1} \\ \end{bmatrix} \\ \end{aligned} }$$
Patsy Formula Rank
' y ~ 0 + C(X) + ... ' Full Rank
' y ~ 1 + C(X) + ... ' Reduced Rank
' y ~ C(X) + ... ' Reduced Rank
# https://patsy.readthedocs.io/en/latest/builtins-reference.html
# https://www.statsmodels.org/stable/example_formulas.html
# https://www.statsmodels.org/devel/contrasts.html
# https://en.wikipedia.org/wiki/Conditional_expectation

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from patsy.contrasts import Treatment, Sum, Diff, Helmert, Poly, ContrastMatrix

# Inspect the contrast (design) matrices patsy generates for a 5-level factor.
# Each call rebinds `contrast`; only the last assignment survives — these lines
# are meant to be run one at a time to inspect each coding scheme.
contrast = Treatment(reference=0).code_without_intercept(list(range(5)))
contrast = Treatment(reference=0).code_with_intercept(list(range(5)))
contrast = Sum().code_without_intercept(list(range(5)))
contrast = Sum().code_with_intercept(list(range(5)))
contrast = Diff().code_without_intercept(list(range(5)))
contrast = Diff().code_with_intercept(list(range(5)))
contrast = Helmert().code_without_intercept(list(range(5)))
contrast = Helmert().code_with_intercept(list(range(5)))
contrast = Poly().code_without_intercept(list(range(5)))
contrast = Poly().code_with_intercept(list(range(5)))

# Columns: X0 = constant 1, X1 = integer factor levels 0..4, X2 = continuous.
X = pd.DataFrame(
    np.c_[
        np.ones((500,)),
        np.random.randint(0, 5, size=(500,)),
        np.random.normal(size=500),
    ]
)
# True model: y = 1*X1 + 3*X2 + noise (X1 enters linearly before recoding below).
y = 1*X[1] + 3*X[2] + np.random.normal(size=500)

data = pd.DataFrame(np.c_[X,y], columns=['X' + str(i) for i in range(X.shape[1])] + ['y'])
# Map the 5 numeric levels onto 4 categories.  NOTE(review): levels 1 and 2 both
# map to 'B' — presumably deliberate (collapsing two levels), but confirm; a
# {0:'A', 1:'B', 2:'C', 3:'D', 4:'E'} mapping may have been intended.
data['X1'] = data['X1'].apply(lambda x: {0:'A', 1:'B', 2:'B', 3:'C', 4:'D'}[x]).astype('category')

# Each fit rebinds `result`; run individually to compare contrast codings.
# '0 +' = full-rank (cell means) coding; '1 +' = reduced-rank with a reference.
result = smf.ols('y ~ 0 + C(X1) + X2', data=data).fit()
result = smf.ols('y ~ 1 + C(X1) + X2', data=data).fit()
result = smf.ols('y ~ C(X1, Treatment(reference="D")) + X2', data=data).fit()
result = smf.ols('y ~ 0 + C(X1, Treatment) + X2', data=data).fit() # data.groupby('X1')[['y']].mean()
result = smf.ols('y ~ 1 + C(X1, Treatment(0)) + X2', data=data).fit()
result = smf.ols('y ~ 1 + C(X1, Treatment(1)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Treatment(2)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Treatment(3)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Treatment("A")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Treatment("B")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Treatment("C")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Treatment("D")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum) + X2', data=data).fit() # data.groupby('X1')[['y']].mean() - data.groupby('X1')[['y']].mean().mean()
result = smf.ols('y ~ 1 + C(X1, Sum(0)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum(1)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum(2)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum(3)) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum("A")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum("B")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum("C")) + X2', data=data).fit() 
result = smf.ols('y ~ 1 + C(X1, Sum("D")) + X2', data=data).fit() 
result = smf.ols('y ~ 0 + C(X1, Diff) + X2', data=data).fit() # data.groupby('X1')[['y']].mean().diff()
result = smf.ols('y ~ 0 + C(X1, Helmert) + X2', data=data).fit() # data.groupby('X1')[['y']].mean().expanding(min_periods=2).apply(lambda m: (1/m.shape[0])*(m[-1] - m[:-1].mean()))
result = smf.ols('y ~ 0 + C(X1, Poly) + X2', data=data).fit() 

# NOTE(review): `display` is IPython/Jupyter-only; use print(...) in plain scripts.
display(sms.anova_lm(result, typ=2))
display(result.summary())

 

 

Dynamic Model

# Ex-Post
# Ex-Ante

 

 

 

Variable-Coefficient Model

#

 

 

 

 


Panel Analysis (2): Deviation-approach

import pandas as pd
from linearmodels import panel
from linearmodels.panel.utility import generate_panel_data

# Simulate a panel (300 entities x 20 periods, 5 regressors, no constant) with
# the linearmodels test utility, then fit a two-way fixed-effect model using the
# formula API's EntityEffects/TimeEffects keywords.
paneldata, weights, other_effects, clusters = generate_panel_data(nentity=300, ntime=20, nexog=5, const=False, missing=0, other_effects=2, ncats=4, rng=None)
fixed = panel.PanelOLS.from_formula('y ~ 1 + x0 + x1 + x2 + x3 + x4 + EntityEffects + TimeEffects', paneldata).fit()
fixed.summary
$${\displaystyle \text{Within Transformation}}$$ $${\displaystyle \begin{aligned} \text{Fixed Effect} && y_{it}-\bar{y}_{i} &=(x_{it}-\bar{x}_{i})\beta+\left(u_{it}-\bar{u}_{i}\right) \\ \text{Random Effect} && y_{it}-\hat{\theta}_{i}\bar{y}_{i} &=\left(1-\hat{\theta}_{i}\right)\alpha_{i}+(x_{it}-\hat{\theta}_{i}\bar{x}_{i})\beta+\left(u_{it}-\hat{\theta}_{i}\bar{u}_{i}\right) \\ \text{Between} && \bar{y}_{i}&=\bar{x}_{i}\beta+\bar{u}_{i} \\ \text{First Difference} && \Delta y_{it} &=\Delta x_{it}\beta+\Delta u_{it} \\ \text{Pooled} && y_{it}&=x_{it}\beta+u_{it} \\ \end{aligned} }$$

Summary

import numpy as np
import pandas as pd

# Simulate the same unbalanced 1-level panel as the "1-Level Panel Design"
# section (5 individuals, ~Poisson(200) hourly observations each), persist it
# to panel.csv (consumed by the later fixed/time-effect snippets), then fit and
# compare the standard panel estimators.
panels = map(lambda group: list(map(lambda timestamp: [group, timestamp], pd.date_range(end='00:00:00', periods=np.random.poisson(200), freq='h'))), range(5))
for idx, a_panel in enumerate(panels):
    data = np.r_[data, np.array(a_panel)] if idx else np.array(a_panel)
paneldata = pd.DataFrame(np.c_[data, np.zeros(data.shape[0])], columns=['Individual', 'DateTime', 'Dependent']).set_index('Individual')
# Rebuild the index as (Individual, within-group time order) via groupby/apply.
paneldata.index = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h'))).index
paneldata.index = paneldata.index.set_levels([paneldata.index.levels[0].astype(int), paneldata.index.levels[1].astype(int)])
paneldata.index.names = ['Individual', 'TimeIndexOrder']

# Panel Index: Individual * TimeIndexOrder
hour = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').hour)); hour.index.names = ['Individual', 'TimeIndexOrder']
day = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').day)); day.index.names = ['Individual', 'TimeIndexOrder']
month = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').month)); month.index.names = ['Individual', 'TimeIndexOrder']
year = paneldata.groupby('Individual', group_keys=True)['DateTime'].apply(lambda datetime: pd.Series(pd.DatetimeIndex(datetime, freq='h').year)); year.index.names = ['Individual', 'TimeIndexOrder']
paneldata['Hour'] = hour
paneldata['Day'] = day
paneldata['Month'] = month
paneldata['Year'] = year
# Approximate fractional-year timestamp (coarse: ignores month lengths).
paneldata['TimeStamp'] = year + month/12 + day/365 + hour/(365*24)

# Panel Value: Dependent ~ f(Predictors)
paneldata['X0'] = np.random.normal(3, 5, size=paneldata.shape[0])
paneldata['X1'] = np.random.normal(5, 3, size=paneldata.shape[0])
paneldata['X2'] = np.random.normal(4, 1, size=paneldata.shape[0])
# True data-generating process: Dependent = 2*X0 + 3*X1 + 4*X2 + N(0, 1) noise.
paneldata['Dependent'] = paneldata.groupby('Individual', group_keys=False).apply(lambda panel: panel['Dependent'] + 2*panel['X0'] + 3*panel['X1'] + 4*panel['X2'] + np.random.normal(size=panel.shape[0]))
# Round-trip through CSV so subsequent sections can reload the same dataset.
paneldata.to_csv('panel.csv')
paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])


# Panel Model
from linearmodels import panel
import statsmodels.formula.api as smf
pooled = panel.PooledOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
between = panel.BetweenOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
# First-difference models cannot include a constant, hence no '1 +' here.
firstdiff = panel.FirstDifferenceOLS.from_formula('Dependent ~ X0 + X1 + X2', paneldata).fit()
fixed = panel.PanelOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2 + EntityEffects', paneldata).fit()
random = panel.RandomEffects.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
# OLS with sum-coded individual dummies — the LSDV counterpart of the fixed model.
ols = smf.ols('Dependent ~ C(Individual, Sum) + X0 + X1 + X2', data=paneldata.reset_index('Individual')).fit()

from linearmodels.panel.results import compare
compare({'Pooled':pooled,'Between':between,'FirstDiff':firstdiff, 'Fixed':fixed, 'Random':random})

 

 

Fixed Effect Model

Fixed Entity Effect

import pandas as pd
from linearmodels import panel

# Reload the panel written by the Summary section and fit an entity fixed-effect
# model, then decompose fitted values / estimated effects / residuals by hand.
paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])

# Entity Effect Model
fixed = panel.PanelOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2 + EntityEffects', paneldata).fit()
fixed.summary

fixed.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
fixed.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()
fixed.other_info # None

fixed.included_effects
fixed.estimated_effects # (paneldata['Dependent']  - fixed.fitted_values['fitted_values']).groupby('Individual').mean() 
# Entity effects are demeaned, so they sum to (approximately) zero.
fixed.estimated_effects.sum() == 0 # approx 
fixed.idiosyncratic

fixed.variance_decomposition

# Assemble observed value, fit, effect and residual side by side for validation.
# NOTE(review): assumes fixed.resids is named 'residual' so 'residual' exists as
# a column after to_frame() — confirm against the installed linearmodels version.
summary_frame = pd.concat([paneldata[['Dependent']], fixed.fitted_values, fixed.estimated_effects, fixed.resids.to_frame()], axis=1)
summary_frame['residual_deviation'] = fixed.resids - fixed.resids.groupby('Individual').mean()
summary_frame['weighted_residual'] = fixed.wresids
summary_frame['idiosyncratic'] = fixed.idiosyncratic
# validation1: fitted + estimated effect + residual should reproduce Dependent.
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['estimated_effects'] + summary_frame['residual'])
# validation2: reconstruct fitted values directly from the estimated parameters.
summary_frame.insert(3, 'validation2', fixed.params['Intercept'] + fixed.params['X0'] * paneldata['X0'] + fixed.params['X1'] * paneldata['X1'] + fixed.params['X2'] * paneldata['X2'])
# validation3: per-individual mean of (Dependent - fitted) should equal the
# estimated entity effect; merge it back onto the panel index.
summary_frame.insert(5, 'validation3', summary_frame.index.to_frame().reset_index(drop=True).merge((summary_frame['Dependent'] - summary_frame['fitted_values']).groupby('Individual').mean().rename('validation3').to_frame(), on='Individual', how='left').set_index(['Individual', 'TimeIndexOrder']))
summary_frame
# Demonstrates three equivalent routes to the entity fixed-effect slopes:
# (1) PanelOLS with EntityEffects, (2) OLS with sum-coded entity dummies,
# (3) OLS on entity-demeaned (within-transformed) variables.
import pandas as pd
import statsmodels.formula.api as smf
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
# Within transformation: subtract each entity's mean from every variable.
paneldata['dDependent'] = paneldata['Dependent'] - paneldata['Dependent'].groupby('Individual').mean()
paneldata['dX0'] = paneldata['X0'] - paneldata['X0'].groupby('Individual').mean()
paneldata['dX1'] = paneldata['X1'] - paneldata['X1'].groupby('Individual').mean()
paneldata['dX2'] = paneldata['X2'] - paneldata['X2'].groupby('Individual').mean()

# fixed model with entity effects v.s. ols with intercepts for a categorical variable
# NOTE(review): `display` is an IPython/Jupyter built-in; this snippet assumes
# a notebook environment.
fixed = panel.PanelOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2 + EntityEffects', data=paneldata).fit(); display(fixed.summary.tables[1])
ols = smf.ols('Dependent ~ C(Individual, Sum) + X0 + X1 + X2', data=paneldata.reset_index('Individual')).fit(); display(ols.summary().tables[1])
# Same slope estimates again from OLS on the demeaned variables (no dummies).
ols = smf.ols('dDependent ~ 1 + dX0 + dX1 + dX2', data=paneldata.reset_index('Individual')).fit(); display(ols.summary().tables[1])

Fixed Time Effect

# Time fixed-effects model: 'TimeEffects' absorbs a dummy for each period.
import pandas as pd
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
# NOTE(review): these entity-demeaned columns are never used below; for a time
# effects model the demeaning would be over 'TimeIndexOrder' anyway. Dead code?
paneldata['dy'] = paneldata['Dependent'] - paneldata['Dependent'].groupby('Individual').mean()
paneldata['dX0'] = paneldata['X0'] - paneldata['X0'].groupby('Individual').mean()
paneldata['dX1'] = paneldata['X1'] - paneldata['X1'].groupby('Individual').mean()
paneldata['dX2'] = paneldata['X2'] - paneldata['X2'].groupby('Individual').mean()

# Time Effect Model
fixed = panel.PanelOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2 + TimeEffects', paneldata).fit()
fixed.summary

fixed.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
fixed.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()
fixed.other_info # None

fixed.included_effects
# swaplevel/sort_index puts 'TimeIndexOrder' first for easier inspection.
fixed.estimated_effects.swaplevel(0, 1).sort_index() # (paneldata['Dependent']  - fixed.fitted_values['fitted_values']).groupby('TimeIndexOrder').mean() 
fixed.estimated_effects.sum() == 0 # approx 
fixed.idiosyncratic

fixed.variance_decomposition

summary_frame = pd.concat([paneldata[['Dependent']], fixed.fitted_values, fixed.estimated_effects, fixed.resids.to_frame()], axis=1)
# NOTE(review): residuals are demeaned by entity even though the model has time
# effects — confirm this mirrors the entity-effect snippet intentionally.
summary_frame['residual_deviation'] = fixed.resids - fixed.resids.groupby('Individual').mean()
summary_frame['weighted_residual'] = fixed.wresids
summary_frame['idiosyncratic'] = fixed.idiosyncratic
# validation1: checks Dependent == fitted + time effect + residual.
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['estimated_effects'] + summary_frame['residual'])
# validation2: fitted values recomputed directly from the estimated parameters.
summary_frame.insert(3, 'validation2', fixed.params['Intercept'] + fixed.params['X0'] * paneldata['X0'] + fixed.params['X1'] * paneldata['X1'] + fixed.params['X2'] * paneldata['X2'])
summary_frame

 

Random Effect Model

# Random-effects (quasi-demeaned GLS) estimation with linearmodels.
import pandas as pd
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
# NOTE(review): these demeaned columns are computed but never used in this
# snippet — likely copied over from the fixed-effect example.
paneldata['dy'] = paneldata['Dependent'] - paneldata['Dependent'].groupby('Individual').mean()
paneldata['dX0'] = paneldata['X0'] - paneldata['X0'].groupby('Individual').mean()
paneldata['dX1'] = paneldata['X1'] - paneldata['X1'].groupby('Individual').mean()
paneldata['dX2'] = paneldata['X2'] - paneldata['X2'].groupby('Individual').mean()

random = panel.RandomEffects.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
random.summary

random.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
random.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()

random.estimated_effects 
# Unlike the fixed-effect case above, the effects here do not sum to zero.
random.estimated_effects.sum() != 0 # True
random.idiosyncratic

random.variance_decomposition

# Reassemble the decomposition: observed = fitted + estimated effect + residual.
summary_frame = pd.concat([paneldata[['Dependent']], random.fitted_values, random.resids.to_frame()], axis=1)
summary_frame['weighted_residual'] = random.wresids
summary_frame['idiosyncratic'] = random.idiosyncratic
summary_frame['estimated_effects'] = random.estimated_effects
# validation1: checks Dependent == fitted + estimated effect + residual.
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['estimated_effects'] + summary_frame['residual'])
# validation2: fitted values recomputed directly from the estimated parameters.
summary_frame.insert(3, 'validation2', random.params['Intercept'] + random.params['X0'] * paneldata['X0'] + random.params['X1'] * paneldata['X1'] + random.params['X2'] * paneldata['X2'])
summary_frame

 

Pooled Model

PooledOLS

# Pooled OLS: ignores the panel structure and fits one common intercept and
# slope set to the stacked observations.
import pandas as pd
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
pooled = panel.PooledOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
pooled.summary

pooled.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
pooled.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()

pooled.estimated_effects # None
pooled.idiosyncratic


# Validation: no effects term here, so Dependent == fitted + residual.
summary_frame = pd.concat([paneldata[['Dependent']], pooled.fitted_values, pooled.resids.to_frame()], axis=1)
summary_frame['weighted_residual'] = pooled.wresids
summary_frame['idiosyncratic'] = pooled.idiosyncratic
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['residual'])
# validation2: fitted values recomputed directly from the estimated parameters.
summary_frame.insert(3, 'validation2', pooled.params['Intercept'] + pooled.params['X0'] * paneldata['X0'] + pooled.params['X1'] * paneldata['X1'] + pooled.params['X2'] * paneldata['X2'])
summary_frame
# Equivalence check: PooledOLS, PanelOLS without effects, and plain OLS all
# yield the same coefficient table.
import pandas as pd
import statsmodels.formula.api as smf
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])

# fixed model with no entity effects = pooled model = ols with one intercept
# NOTE(review): `display` is an IPython/Jupyter built-in; assumes a notebook.
pooled = panel.PooledOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', data=paneldata).fit(); display(pooled.summary.tables[1])
fixed = panel.PanelOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', data=paneldata).fit(); display(fixed.summary.tables[1])
ols = smf.ols('Dependent ~ 1 + X0 + X1 + X2', data=paneldata.reset_index('Individual')).fit(); display(ols.summary().tables[1])

No Effect: PanelOLS

# PanelOLS with neither entity nor time effects — numerically equivalent to
# pooled OLS; included to show the PanelOLS interface without effect terms.
import pandas as pd
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
fixed = panel.PanelOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
fixed.summary

fixed.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
fixed.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()

# NOTE(review): the '# None' comment conflicts with the pd.concat below, which
# would fail if estimated_effects were actually None — confirm it returns a
# zero-valued frame when no effects are specified.
fixed.estimated_effects # None
fixed.idiosyncratic
fixed.variance_decomposition

summary_frame = pd.concat([paneldata[['Dependent']], fixed.fitted_values, fixed.estimated_effects, fixed.resids.to_frame()], axis=1)
summary_frame['weighted_residual'] = fixed.wresids
summary_frame['idiosyncratic'] = fixed.idiosyncratic
# validation1: checks Dependent == fitted + estimated effect + residual.
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['estimated_effects'] + summary_frame['residual'])
# validation2: fitted values recomputed directly from the estimated parameters.
summary_frame.insert(3, 'validation2', fixed.params['Intercept'] + fixed.params['X0'] * paneldata['X0'] + fixed.params['X1'] * paneldata['X1'] + fixed.params['X2'] * paneldata['X2'])
summary_frame

OLS

# Baseline: plain statsmodels OLS on the stacked panel, with manual validation
# that fitted + residual reconstructs the observed values.
import pandas as pd
import statsmodels.api as sm

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
result = sm.OLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
result.summary()

summary_frame = pd.concat([paneldata[['Dependent']], result.fittedvalues.rename('fittedvalues'), result.resid.rename('residual').to_frame()], axis=1)
# validation1: checks Dependent == fitted + residual.
summary_frame.insert(1, 'validation1', summary_frame['fittedvalues'] + summary_frame['residual'])
# validation2: fitted values recomputed directly from the estimated parameters.
summary_frame.insert(3, 'validation2', result.params['Intercept'] + result.params['X0'] * paneldata['X0'] + result.params['X1'] * paneldata['X1'] + result.params['X2'] * paneldata['X2'])
summary_frame

 

Between Model

# Between estimator: regression on entity means, so results are entity-level
# and must be broadcast back to the individual (entity, time) observations.
import pandas as pd
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
between = panel.BetweenOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit()
between.summary

between.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
between.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()

between.estimated_effects 
between.idiosyncratic

# Broadcast the entity-level residuals to every observation of each entity via
# a many-to-one merge on 'Individual', then restore the two-level panel index.
summary_frame = between.fitted_values.reset_index().merge(pd.concat([between.resids, between.wresids.rename('weighted_residual')], axis=1).reset_index().rename(columns={'index':'Individual'}), on='Individual', how='left', validate='m:1').set_index(['Individual', 'TimeIndexOrder'])
summary_frame = pd.concat([paneldata[['Dependent']], summary_frame], axis=1)
summary_frame['idiosyncratic'] = between.idiosyncratic
summary_frame['estimated_effects'] = between.estimated_effects
# validation1: checks Dependent == fitted + idiosyncratic + entity residual.
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['idiosyncratic'] + summary_frame['residual'])
# validation2: fitted values recomputed directly from the estimated parameters.
summary_frame.insert(3, 'validation2', between.params['Intercept'] + between.params['X0'] * paneldata['X0'] + between.params['X1'] * paneldata['X1'] + between.params['X2'] * paneldata['X2'])
summary_frame
# Equivalence check: BetweenOLS matches plain OLS fitted on the entity means.
import pandas as pd
from linearmodels import panel
import statsmodels.formula.api as smf

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])

# NOTE(review): `display` is an IPython/Jupyter built-in; assumes a notebook.
between = panel.BetweenOLS.from_formula('Dependent ~ 1 + X0 + X1 + X2', paneldata).fit(); display(between.summary.tables[1])
ols = smf.ols('Dependent ~ 1 + X0 + X1 + X2', data=paneldata.groupby('Individual')[['Dependent', 'X0', 'X1', 'X2']].mean()).fit(); display(ols.summary().tables[1])

 

First Difference Model

# First-difference estimator: differencing successive observations within each
# entity removes time-invariant entity effects. The formula has no intercept —
# a constant would be eliminated by the differencing.
import pandas as pd
from linearmodels import panel

paneldata = pd.read_csv('panel.csv').set_index(['Individual', 'TimeIndexOrder'])
firstdiff = panel.FirstDifferenceOLS.from_formula('Dependent ~ X0 + X1 + X2', paneldata).fit()
firstdiff.summary

firstdiff.entity_info # paneldata.groupby('Individual')['Dependent'].count().describe()
firstdiff.time_info # paneldata.groupby('TimeIndexOrder')['Dependent'].count().describe()

firstdiff.estimated_effects 
firstdiff.idiosyncratic

summary_frame = pd.concat([paneldata[['Dependent']], firstdiff.fitted_values, firstdiff.resids.to_frame()], axis=1)
summary_frame['weighted_residual'] = firstdiff.wresids
summary_frame['idiosyncratic'] = firstdiff.idiosyncratic
# validation1: fitted + idiosyncratic. NOTE(review): unlike the other snippets
# this omits a residual term — confirm the intended identity for this estimator.
summary_frame.insert(1, 'validation1', summary_frame['fitted_values'] + summary_frame['idiosyncratic'] )
# validation2: linear predictor rebuilt from the slope estimates (no intercept).
summary_frame.insert(3, 'validation2', firstdiff.params['X0'] * paneldata['X0'] + firstdiff.params['X1'] * paneldata['X1'] + firstdiff.params['X2'] * paneldata['X2'])
summary_frame

 

 

 

 

 

 


Incomplete Panel Analysis

 

 

 

 

 

 


Linear Relationship

Categorical Explanatory Variables and Numerical Response Variables

ANOVA: One Numerical Response Variable

# Reference listing of statsmodels entry points for ANOVA-style analysis
# (categorical explanatory variables, one numerical response). The bare
# attribute expressions are name references only — nothing is fitted here.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

# Class-based and formula-based regression/GEE interfaces.
sm.OLS, sm.WLS, sm.GLS, sm.GLM, sm.GEE, sm.OrdinalGEE, sm.NominalGEE
smf.ols, smf.wls, smf.gls, smf.glm, smf.gee, smf.ordinal_gee, smf.nominal_gee
# ANOVA table for fitted linear models; repeated-measures ANOVA.
sms.anova_lm
sms.AnovaRM

MANOVA: More than One Numerical Response Variable

# Reference listing: multivariate ANOVA (several numerical responses at once).
import numpy as np
import pandas as pd
import statsmodels.api as sm

sm.MANOVA

 

 

Mixed Explanatory Variables and Numerical Response Variables

ANCOVA: One Numerical Response Variable

# Reference listing of statsmodels entry points for ANCOVA-style analysis
# (mixed categorical/numerical explanatory variables); the mixed-effects and
# Bayesian mixed GLM classes cover hierarchical designs.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from statsmodels.regression.mixed_linear_model import MixedLMParams

sms.anova_lm
sms.AnovaRM

sm.OLS, sm.MixedLM, sm.PoissonBayesMixedGLM, sm.BinomialBayesMixedGLM
smf.ols, smf.mixedlm

MANCOVA: More than One Numerical Response Variable

#

 

 

Numerical Explanatory Variables and Categorical Response Variable

LDA(Linear Discriminant Analysis)

# Reference snippet: scikit-learn linear discriminant analysis classifier
# (default hyperparameters; the instance is created but not fitted).
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

LinearDiscriminantAnalysis()

QDA(Quadratic Discriminant Analysis)

# Reference snippet: scikit-learn quadratic discriminant analysis classifier
# (default hyperparameters; the instance is created but not fitted).
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

QuadraticDiscriminantAnalysis()

 

 

 

 

 

 


Hierarchical Linear Model

Multi-Level Model

 

 


Reference