
Analysis of Variance: Variance Components Model

Assumptions of ANOVA

  • Homogeneity of variance
  • Normality
  • Independence of error components
$${\displaystyle \begin{aligned} X_{ij} &= \mu + \underbrace{\tau_{i}}_{\tau_{i} = \mu_{i} - \mu } + \underbrace{\varepsilon_{ij}}_{\varepsilon_{ij} = X_{ij} - \mu_{i}}, \quad \varepsilon_{ij} \sim \mathcal{N}(0, \sigma^{2}_{\varepsilon}) \\ \; & \quad \operatorname{E}[X_{ij}\mid i] = \mu_{i}, \quad \operatorname{E}[X_{ij}] = \mu \\ \end{aligned} }$$

$${\displaystyle \begin{align} \text{Variance Partition} &:\quad SS_{total} = SS_{\text{between-group}} + SS_{\text{within-group}} \\ \text{Treatment Effect} &:\quad \operatorname E \left[ \frac{MS_{treatment}}{MS_{error}} \right] = \frac{\sigma_{\varepsilon}^{2} + \tilde{n}\sigma_{\tau}^{2}}{\sigma_{\varepsilon}^{2}} \sim F \\ \end{align} }$$ $${\displaystyle \text{by variables} \; \begin{cases} \; \text{Fixed: } \beta, \qquad & Y &&= X \boldsymbol{\beta} + \boldsymbol{\varepsilon}, \quad\quad\quad\;\;\, \boldsymbol{\varepsilon} \sim \mathcal{L}(\mathbf{0}, \sigma^{2}_{\varepsilon}\mathbf{I}) \\ \; \text{Random: } u, \qquad & Y &&= Z \boldsymbol{u} + \boldsymbol{\varepsilon}, \quad\quad\quad\;\;\, \mathbf{u} \sim \mathcal{L}(\mathbf{0}, \sigma^{2}_{u}\mathbf{I}), \quad \boldsymbol{\varepsilon} \sim \mathcal{L}(\mathbf{0}, \sigma^{2}_{\varepsilon}\mathbf{I}), \quad \operatorname{Cov}[\mathbf{u}, \boldsymbol{\varepsilon}] = \mathbf{0} \\ \; \text{Mixed: } \beta \And u, \qquad & Y &&= X \boldsymbol{\beta} + Z \boldsymbol{u} + \boldsymbol{\varepsilon}, \quad \mathbf{u} \sim \mathcal{L}(\mathbf{0}, \sigma^{2}_{u}\mathbf{I}), \quad \boldsymbol{\varepsilon} \sim \mathcal{L}(\mathbf{0}, \sigma^{2}_{\varepsilon}\mathbf{I}), \quad \operatorname{Cov}[\mathbf{u}, \boldsymbol{\varepsilon}] = \mathbf{0} \\ \end{cases} }$$ $${\displaystyle \text{by factors} \; \begin{cases} \; Y_{ij} &= \mu +\alpha _{i}+\epsilon _{ij} \\ \; Y_{ijk} &= \mu +\alpha _{i}+\beta_{j}+ (\alpha \beta)_{ij}+ \epsilon _{ijk} \\ \; Y_{ijkl} &= \mu +\alpha _{i}+\beta_{j}+\gamma_{k} + (\alpha \beta)_{ij} + (\beta \gamma)_{jk} + (\gamma \alpha)_{ki} + (\alpha \beta \gamma)_{ijk} + \epsilon _{ijkl} \\ \; &\cdots \\ \end{cases} }$$ $${\displaystyle \text{by repeated measure} \; \begin{cases} \; SS_{\text{Total}} &= SS_{\text{Treatments}} + SS_{\text{Error}} \\ \; SS_{\text{Total}} &= SS_{\text{Treatments(excluding individual difference)}} + \left( SS_{\text{Subjects}} + SS_{\text{Error}} \right) \\ \end{cases} }$$
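The fixed-, random-, and mixed-effects cases above can also be estimated directly. Below is a minimal random-intercept sketch using statsmodels' MixedLM; the design (10 groups × 30 observations, σu = 2, σε = 1) is a hypothetical choice for illustration only.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Minimal random-intercept sketch: Y = X·beta + Z·u + eps (assumed design)
rng = np.random.default_rng(0)
n_groups, n_obs = 10, 30
u = rng.normal(0, 2, size=n_groups)                          # u ~ N(0, sigma_u^2), sigma_u = 2
x = rng.normal(size=n_groups*n_obs)                          # fixed-effect regressor
y = 1.0 + 3.0*x + np.repeat(u, n_obs) + rng.normal(size=n_groups*n_obs)  # eps ~ N(0, 1)
data = pd.DataFrame({'Y': y, 'X': x, 'G': np.repeat([f'G{i}' for i in range(n_groups)], n_obs)})

result = smf.mixedlm('Y ~ X', data=data, groups=data['G']).fit()
print(result.summary())  # 'Group Var' estimates sigma_u^2 (~4); 'Scale' estimates sigma_eps^2 (~1)
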
import numpy as np
import pandas as pd

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product([['G1', 'G2', 'G3'], list('AB')], names=['Group', 'Factor'])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(0) == 'G1')&(frame.index.get_level_values(1) == 'A')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=10, scale=1, size=(tfi.sum(), 1))
        
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

data = anova_data(5).stack().reset_index(level=[0,2]).reset_index(drop=True)

# Sum-of-squares partition by Factor
SST1_division = data.groupby(['Factor']).apply(lambda datum: datum.shape[0]*(datum[['Value']].var(ddof=0)) + datum.shape[0]*(datum[['Value']].mean() - data[['Value']].mean())**2)
SSB1_division = data.groupby(['Factor']).apply(lambda datum: datum.shape[0]*(datum[['Value']].mean() - data[['Value']].mean())**2)
SSW1_division = data.groupby(['Factor']).apply(lambda datum: datum.shape[0]*(datum[['Value']].var(ddof=0)))
SST1 = SST1_division.sum()
SSB1 = SSB1_division.sum()
SSW1 = SSW1_division.sum()

# Sum-of-squares partition by Group
SST2_division = data.groupby(['Group']).apply(lambda datum: datum.shape[0]*(datum[['Value']].var(ddof=0)) + datum.shape[0]*(datum[['Value']].mean() - data[['Value']].mean())**2)
SSB2_division = data.groupby(['Group']).apply(lambda datum: datum.shape[0]*(datum[['Value']].mean() - data[['Value']].mean())**2)
SSW2_division = data.groupby(['Group']).apply(lambda datum: datum.shape[0]*(datum[['Value']].var(ddof=0)))
SST2 = SST2_division.sum()
SSB2 = SSB2_division.sum()
SSW2 = SSW2_division.sum()

# Sum-of-squares partition by Factor × Group
SST3_division = data.groupby(['Factor', 'Group']).apply(lambda datum: datum.shape[0]*(datum[['Value']].var(ddof=0)) + datum.shape[0]*(datum[['Value']].mean() - data[['Value']].mean())**2)
SSB3_division = data.groupby(['Factor', 'Group']).apply(lambda datum: datum.shape[0]*(datum[['Value']].mean() - data[['Value']].mean())**2)
SSW3_division = data.groupby(['Factor', 'Group']).apply(lambda datum: datum.shape[0]*(datum[['Value']].var(ddof=0)))
SST3 = SST3_division.sum()
SSB3 = SSB3_division.sum()
SSW3 = SSW3_division.sum()

SST1, SST2, SST3 # identical: the total sum of squares does not depend on the grouping


TABLE OF CONTENTS

Group Properties

A/B Test

 

Design of Experiments, DOE

Moment Decomposition

Variance Structure


Group Properties

Macro, Micro, Weighted Sample Averaging

Macro Averaging: unweighted mean of the per-group scores

Micro Averaging: global mean computed from the pooled totals

Weighted Averaging: mean of the per-group scores, weighted by group size

Sample Averaging: averaging over structured (multi-level) groups

 

$$ \begin{aligned} N_{\text{total}} &= \sum^{n}_{i=1} N_{G_{i}} = N_{G_{1}} + N_{G_{2}} + \cdots + N_{G_{n}}, \qquad && n\text{ is the number of groups} \\ f_{\text{micro-average}} &= \frac{S_{G}}{N_{\text{total}}}, \qquad && S_{G}\text{ is the pooled metric score over all groups} \\ f_{\text{macro-average}} &= \sum^{n}_{i=1} \frac{S_{G_{i}}}{n} = \frac{S_{G_{1}} + S_{G_{2}} + \cdots + S_{G_{n}}}{n}, \qquad && S_{G_{i}}\text{ is the metric score of each group} \\ f_{\text{weighted-average}} &= \sum^{n}_{i=1} \frac{N_{G_{i}}}{N_{\text{total}}} S_{G_{i}} = \frac{N_{G_{1}} S_{G_{1}} + N_{G_{2}}S_{G_{2}} + \cdots + N_{G_{n}} S_{G_{n}}}{N_{\text{total}}} \\ \end{aligned} $$
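A minimal sketch of the three averages, assuming a count-ratio metric (e.g. accuracy) with hypothetical group sizes N and per-group hit counts S:

import pandas as pd

# hypothetical per-group tallies: N = group size, S = hits within the group
groups = pd.DataFrame({'N': [50, 30, 20], 'S': [45, 15, 4]}, index=['G1', 'G2', 'G3'])
groups['score'] = groups['S'] / groups['N']                      # per-group metric S_{G_i}

micro    = groups['S'].sum() / groups['N'].sum()                 # pool the counts first, then score
macro    = groups['score'].mean()                                # unweighted mean of group scores
weighted = (groups['N'] / groups['N'].sum() * groups['score']).sum()

print(micro, macro, weighted)  # 0.64, 0.533..., 0.64

For count-ratio metrics the weighted average coincides with the micro average; the two diverge for metrics such as precision that are not simple pooled ratios.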

 

 

Arithmetic and Geometric Average

$${\displaystyle \begin{aligned} \text{Log Transformation}&: \quad y \equiv \ln Y \quad \leftrightarrow \quad \mu_{y} = \ln \mu_{Y}, \quad \sigma_{y} = \ln \sigma_{Y} \\ \end{aligned} }$$ $${\displaystyle \begin{aligned} \text{Sample Mean} \quad & \begin{cases} \begin{aligned} \; \mu_{y} &= \frac{y_{1} + y_{2} + \cdots + y_{n}}{n} = \frac{1}{n} \sum_{i=1}^{n} y_{i} \\ \; \ln \mu_{Y} &= \frac{\ln Y_{1} + \ln Y_{2} + \cdots + \ln Y_{n}}{n} = \frac{1}{n} \sum_{i=1}^{n} \ln Y_{i} \\ \; &= \frac{\ln Y_{1}Y_{2} \cdots Y_{n}}{n} = \frac{1}{n} \ln \prod_{i=1}^{n} Y_{i} \\ \; \mu_{Y} &= \sqrt[n] {Y_{1}Y_{2} \cdots Y_{n}} = \sqrt[n] {\prod_{i=1}^{n} Y_{i}} \\ \end{aligned} \end{cases} \\ \text{Sample Standard Deviation} \quad & \begin{cases} \begin{aligned} \; \sigma_{y} &= \sqrt{ \frac{( y_{1} - \mu_{y} )^{2} + ( y_{2} - \mu_{y} )^{2} + \cdots + ( y_{n} - \mu_{y} )^{2} }{n}} = \sqrt{ \sum_{i=1}^{n} \frac{( y_{i} - \mu_{y} )^{2} }{n}} \\ \; \ln \sigma_{Y} &= \sqrt{ \frac{( \ln Y_{1} - \ln \mu_{Y} )^{2} + ( \ln Y_{2} - \ln \mu_{Y} )^{2} + \cdots + ( \ln Y_{n} - \ln \mu_{Y} )^{2} }{n}} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} ( \ln Y_{i} - \ln \mu_{Y} )^{2}} \\ \; &= \sqrt{ \frac{( \ln \frac{Y_{1}}{\mu_{Y}} )^{2} + ( \ln \frac{Y_{2}}{\mu_{Y}} )^{2} + \cdots + ( \ln \frac{Y_{n}}{\mu_{Y}} )^{2} }{n}} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} ( \ln \frac{Y_{i}}{\mu_{Y}} )^{2}} \\ \; \sigma_{Y} &= \exp \sqrt{\frac{1}{n} \sum_{i=1}^{n} ( \ln Y_{i} - \ln \mu_{Y} )^{2}} = \exp \sqrt{\frac{1}{n} \sum_{i=1}^{n} ( \ln \frac{Y_{i}}{\mu_{Y}} )^{2}} \\ \end{aligned} \end{cases} \end{aligned} }$$

 

import numpy as np
import pandas as pd
from scipy import stats

y = pd.Series(np.random.normal(1, 1, size=100))
Y = y.apply(np.exp) # Y = np.random.lognormal(1,1,size=100)

y_mean = y.mean()
y_std = y.std(ddof=0)
print(y_mean, y_std)

Y_mean = np.power(Y.prod(), 1/Y.size) # geometric mean
Y_std = np.exp(np.sqrt((np.log(Y/Y_mean)**2).sum()/Y.size)) # geometric standard deviation (ddof=0); note: sum of squared logs, not squared sum
print(Y_mean, Y_std, Y.mean(), Y.std(ddof=1), stats.gstd(Y)) # stats.gstd uses ddof=1


A/B Test

Independent T-Test

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data1 = 5 * np.random.normal(size=100) + 50
data2 = 5 * np.random.normal(size=100) + 51

# Normality Test
stat, p = stats.shapiro(data1) # Normality if p > 0.05
stat, p = stats.shapiro(data2) # Normality if p > 0.05

# Correlation Test
corr, p = stats.pearsonr(data1, data2) # Correlation if p < 0.05

# t-Test
stat, p = stats.ttest_ind(data1, data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

_, axes = plt.subplots(2,1)
axes[0].hist(data1, bins=30)
axes[1].hist(data2, bins=30)
axes[0].grid(True)
axes[1].grid(True)
plt.show()

 

Paired T-Test

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data1 = 5 * np.random.normal(size=100) + 50
data2 = data1 + np.random.normal(1, 1, size=100)

# Normality Test
stat, p = stats.shapiro(data1) # Normality if p > 0.05
stat, p = stats.shapiro(data2) # Normality if p > 0.05

# Correlation Test
corr, p = stats.pearsonr(data1, data2) # Correlation if p < 0.05

# t-Test
stat, p = stats.ttest_rel(data1, data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

_, axes = plt.subplots(2,1)
axes[0].hist(data1, bins=30)
axes[1].hist(data2, bins=30)
axes[0].grid(True)
axes[1].grid(True)
plt.show()


Design of Experiments, DOE

$${\displaystyle \text{One-way subject factor summary} \; }$$ $${\displaystyle \begin{align} \; U \times S &: \quad SSB_{U \times S} &&= SSW_{S} - SSB_{U} \\ \end{align} }$$ $${\displaystyle \text{Two-way subject factor summary} \; }$$ $${\displaystyle \begin{align} \; U \times V \times S &: \quad SSB_{U \times V \times S} &&= SSW_{S} {\color{Gray}- SSB_{U \times S} - SSB_{V \times S}} - SSB_{U} - SSB_{V} - SSB_{U \times V} \\ \; U \times S &: \quad SSB_{U \times S} &&= SSW_{S} {\color{Gray}- SSW_{U \cup S}} - SSB_{U} \\ \; V \times S &: \quad SSB_{V \times S} &&= SSW_{S} {\color{Gray}- SSW_{V \cup S}} - SSB_{V} \\ \end{align} }$$ $${\displaystyle \text{Three-way subject factor summary} \; }$$ $${\displaystyle \begin{align} \; U \times V \times W \times S &: \quad SSB_{U \times V \times W \times S} &&= SSW_{S} \\ \; & \; &&\quad {\color{Gray}- SSB_{U \times V \times S} - SSB_{V \times W \times S} - SSB_{W \times U \times S} - SSB_{U \times S} - SSB_{V \times S} - SSB_{W \times S}} \\ \; & \; &&\quad - SSB_{U} - SSB_{V} - SSB_{W} - SSB_{U \times V} - SSB_{V \times W} - SSB_{W \times U} - SSB_{U \times V \times W} \\ \; U \times V \times S &: \quad SSB_{U \times V \times S} &&= SSW_{S} {\color{Gray} - SSW_{U \cup V \cup S} - SSB_{U \times S} - SSB_{V \times S}} \\ \; & \; &&\quad - SSB_{U} - SSB_{V} - SSB_{U \times V} \\ \; V \times W \times S &: \quad SSB_{V \times W \times S} &&= SSW_{S} {\color{Gray} - SSW_{V \cup W \cup S} - SSB_{V \times S} - SSB_{W \times S}} \\ \; & \; &&\quad - SSB_{V} - SSB_{W} - SSB_{V \times W} \\ \; W \times U \times S &: \quad SSB_{W \times U \times S} &&= SSW_{S} {\color{Gray} - SSW_{W \cup U \cup S} - SSB_{W \times S} - SSB_{U \times S}} \\ \; & \; &&\quad - SSB_{W} - SSB_{U} - SSB_{W \times U} \\ \; U \times S &: \quad SSB_{U \times S} &&= SSW_{S} {\color{Gray}- SSW_{U \cup S}} - SSB_{U} \\ \; V \times S &: \quad SSB_{V \times S} &&= SSW_{S} {\color{Gray}- SSW_{V \cup S}} - SSB_{V} \\ \; W \times S &: \quad SSB_{W \times S} &&= SSW_{S} {\color{Gray}- SSW_{W \cup S}} - SSB_{W} \\ \end{align} }$$
$${\displaystyle \text{For example, one-way ANOVA} }$$ $${\displaystyle \begin{align} \; SS_{\text{Total}} &= SSB_{U} + SSW_{U} \\ &= SSB_{U \cup S} + SSW_{U \cup S} \\ &= SSB_{U \cup S} \\ &= SSB_{U} + SSB_{S} + SSB_{U \times S} \\ \; & \; \\ \; \therefore \; SSW_{U} &= SSB_{U \times S} + SSB_{S} \\ SSW_{S} &= SSB_{U \times S} + SSB_{U} \\ SSB_{U \times S} &= SSW_{U} - SSB_{S} \\ &= SSW_{S} - SSB_{U} \\ \; & \; \\ \; & \; \\ \end{align} }$$
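A quick numeric check of these r = 1 identities on a hypothetical 6-subject × 3-level table (one observation per cell):

import numpy as np
import pandas as pd

# rows = subjects S, columns = treatment levels U, one observation per cell (r = 1)
rng = np.random.default_rng(0)
table = pd.DataFrame(rng.normal(0, 1, size=(6, 3)), columns=['U1', 'U2', 'U3'])

grand = table.values.mean()
SSB_U = ((table.mean(axis=0) - grand)**2).sum() * table.shape[0]  # between treatment levels
SSB_S = ((table.mean(axis=1) - grand)**2).sum() * table.shape[1]  # between subjects
SSW_S = (table.sub(table.mean(axis=1), axis=0)**2).values.sum()   # within subjects
SST   = ((table - grand)**2).values.sum()

print(np.isclose(SSW_S - SSB_U, SST - SSB_U - SSB_S))  # SSB_{U×S} = SSW_S - SSB_U
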
DOE | Model Name | Description | Sources of variance
One between-subjects factor | One-way ANOVA | A × S | SS_total = SS_A + SS_error
Two between-subjects factors | Two-way ANOVA | A × B × S | SS_total = SS_A + SS_B + SS_A×B + SS_error
Three between-subjects factors | Three-way ANOVA | A × B × C × S | SS_total = SS_A + SS_B + SS_C + SS_A×B + SS_A×C + SS_B×C + SS_A×B×C + SS_error
One within-subjects factor | Repeated-measures ANOVA with one factor | (U × S) | SS_total = SS_subjects + SS_U + SS_error
Two within-subjects factors | Repeated-measures ANOVA with two factors | (U × V × S) | SS_total = SS_subjects + SS_U + SS_V + SS_U×V + SS_error
Three within-subjects factors | Repeated-measures ANOVA with three factors | (U × V × W × S) | SS_total = SS_subjects + SS_U + SS_V + SS_W + SS_U×V + SS_U×W + SS_V×W + SS_U×V×W + SS_error
One between- and one within-subjects factor | Mixed ANOVA | A × (U × S) | SS_total = (SS_A + SS_S/A) + (SS_U + SS_U×A + SS_U×S/A)
Two between-subjects factors and one within-subjects factor | Mixed ANOVA | A × B × (U × S) | SS_total = (SS_A + SS_B + SS_A×B + SS_error-between) + (SS_U + SS_U×A + SS_U×B + SS_U×A×B + SS_error-within)
One between-subjects factor and two within-subjects factors | Mixed ANOVA | A × (U × V × S) | SS_total = (SS_A + SS_error-between) + (SS_U + SS_U×A + SS_U×S/A + SS_V + SS_V×A + SS_V×S/A + SS_U×V + SS_U×V×A + SS_U×V×S/A)

Library Comparisons

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pingouin as pg

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product([['G1', 'G2', 'G3'], list('AB')], names=['Group', 'Factor'])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(0) == 'G1')&(frame.index.get_level_values(1) == 'A')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=10, scale=1, size=(tfi.sum(), 1))
        
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

data = anova_data(5)

# [scipy] compare samples
f, p = stats.f_oneway(data['Value']['G1'], data['Value']['G2'], data['Value']['G3'])
print(f, p)

# [statsmodels] compare samples
model = ols('Value ~ C(Group)', data=data.stack(1).reset_index()).fit()
# model.summary(), model.params, data.mean()

anova_table = sm.stats.anova_lm(model, typ=2)
f = anova_table.loc['C(Group)', 'F']
p = anova_table.loc['C(Group)', 'PR(>F)']
print(f, p)


# [pingouin] compare samples
anova_table = pg.anova(dv='Value', between='Group', data=data.stack(1).reset_index(), detailed=True)
f = anova_table.loc[lambda x: x['Source'] == 'Group', 'F'].item()
p = anova_table.loc[lambda x: x['Source'] == 'Group', 'p-unc'].item()
print(f, p)

# [direct] compare samples
tot_mean = data.mean().mean()
tot = data.applymap(lambda x: (x - tot_mean)**2).sum().sum()
ssb = ((data.mean(axis=0) - tot_mean)**2).sum() * data.shape[0]
ssw = tot-ssb

f = (ssb/(data.shape[1] - 1)) / (ssw/(data.size - data.shape[1]))
p = 1 - stats.f.cdf(f, dfn=data.shape[1]-1 , dfd=data.size-data.shape[1])
print(f, p)

Pingouin

import pingouin as pg

# one-way anova
data = pg.read_dataset('anova')
pg.normality(data=data, dv='Pain threshold', group='Hair color') # Shapiro-Wilk test
pg.homoscedasticity(data=data, dv='Pain threshold', group='Hair color') # Levene’s test
pg.kruskal(data=data, dv='Pain threshold', between='Hair color') # Kruskal-Wallis: non-parametric alternative

pg.anova(data=data, dv='Pain threshold', between='Hair color', ss_type=1, detailed=True)
pg.pairwise_tukey(data=data, dv='Pain threshold', between='Hair color')



# two-way anova
data = pg.read_dataset('anova2')
pg.homoscedasticity(data=data, dv='Yield', group='Blend')
pg.homoscedasticity(data=data, dv='Yield', group='Crop')

pg.anova(data=data, dv="Yield", between=["Blend", "Crop"], ss_type=2, detailed=True)
pg.pairwise_tukey(data=data, dv='Yield', between="Blend")
pg.pairwise_tukey(data=data, dv='Yield', between="Crop")



# three-way anova
data = pg.read_dataset('anova3')
pg.homoscedasticity(data=data, dv='Cholesterol', group='Sex')
pg.homoscedasticity(data=data, dv='Cholesterol', group='Risk')
pg.homoscedasticity(data=data, dv='Cholesterol', group='Drug')

pg.anova(data=data, dv='Cholesterol', between=['Sex', 'Risk', 'Drug'], ss_type=3, detailed=True)
pg.pairwise_tukey(data=data, dv='Cholesterol', between="Sex")
pg.pairwise_tukey(data=data, dv='Cholesterol', between="Risk")
pg.pairwise_tukey(data=data, dv='Cholesterol', between="Drug")



# repeated measure anova
data = pg.read_dataset('rm_anova')
pg.homoscedasticity(data=data, dv='DesireToKill', group='Disgustingness')
pg.homoscedasticity(data=data, dv='DesireToKill', group='Frighteningness')

pg.rm_anova(data=data, dv='DesireToKill', within='Disgustingness', subject='Subject', detailed=True, effsize="np2")
pg.sphericity(data=data, dv='DesireToKill', subject='Subject', within='Disgustingness') # Mauchly’s test of sphericity

pg.rm_anova(data=data, dv='DesireToKill', within=['Disgustingness', 'Frighteningness'], subject='Subject', detailed=True)
pg.pairwise_tests(data=data, dv='DesireToKill', within='Disgustingness', subject='Subject', padjust='fdr_bh')
pg.pairwise_tests(data=data, dv='DesireToKill', within='Frighteningness', subject='Subject', padjust='fdr_bh')



# mixed anova
data = pg.read_dataset('mixed_anova')
pg.homoscedasticity(data=data, dv='Scores', group='Group')
pg.mixed_anova(data=data, dv='Scores', between='Group', within='Time', subject='Subject', effsize="ng2")
pg.pairwise_tests(data=data, dv='Scores', within='Time', subject='Subject', padjust='fdr_bh')



# welch anova
data = pg.read_dataset('anova') # reload: 'data' still holds the mixed_anova dataset at this point
pg.welch_anova(data=data, dv='Pain threshold', between='Hair color')
pg.pairwise_gameshowell(data=data, dv='Pain threshold', between='Hair color')

Statsmodels

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import MultiComparison
from statsmodels.graphics.factorplots import interaction_plot

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product([['G1', 'G2', 'G3'], list('AB')], names=['Group', 'Factor'])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(0) == 'G1')&(frame.index.get_level_values(1) == 'A')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=10, scale=1, size=(tfi.sum(), 1))
        
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

# Anova Result
data = anova_data(5).stack(1).reset_index()

model = ols('Value ~ C(Group) + C(Factor) + C(Group):C(Factor)', data=data).fit()
display(sm.stats.anova_lm(model, typ=2))
display(AnovaRM(data=data, depvar='Value', within=['Group', 'Factor'], subject='Index').fit().anova_table)

# Assumption
display(model.summary()) # Normality: Omnibus / Jarque-Bera statistics in the summary
display(stats.shapiro(model.resid)) # Normality
display(sm.qqplot(model.resid_pearson, line='45')) # Normality visualization
display(stats.levene(
    data.loc[lambda x: x['Factor'] == 'A', 'Value'],
    data.loc[lambda x: x['Factor'] == 'B', 'Value'],
)) # Homoscedasticity
display(stats.levene(
    data.loc[lambda x: x['Group'] == 'G1', 'Value'],
    data.loc[lambda x: x['Group'] == 'G2', 'Value'],
    data.loc[lambda x: x['Group'] == 'G3', 'Value'],
)) # Homoscedasticity

# post-hoc test
mc = MultiComparison(data['Value'], data['Group']+data['Factor'])
posthoc_results = mc.tukeyhsd() # TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD)
display(posthoc_results.summary())
display(posthoc_results.plot_simultaneous())
table, _, _ = mc.allpairtest(stats.ttest_ind, method= "bonf"); display(table) # BONFERRONI CORRECTION
table, _, _ = mc.allpairtest(stats.ttest_ind, method= "sidak"); display(table) # ŠIDÁK CORRECTION (A.K.A. DUNN-ŠIDÁK CORRECTION)

# treatment effect visualization
display(interaction_plot(x=data['Factor'], trace=data['Group']+data['Factor'], response=data['Value']))

 

Moment Decomposition

Continuous Random Variable

import numpy as np
import pandas as pd

X = np.random.normal(3, 10, size=1000)
X_bins = pd.cut(X, bins=10, precision=4, retbins=False)
X = pd.Series(data=X_bins, index=X)

X_prob = X.value_counts().sort_index().to_frame()
X_prob['probability'] = X_prob['count'].apply(lambda cnt: cnt/X_prob['count'].sum()) # probability mass function
X_prob['cumulance'] = X_prob['probability'].cumsum() # cumulative distribution function
X_prob['rank'] = X_prob['count'].rank(ascending=False)
display(X_prob)

X.index.to_series(name='PPF').quantile(q=[0, .025, .5, .975, 1]) # inverse cumulative: percentile point function
X.index.to_series(name='PPF').describe(percentiles=[.5])

Categorical Random Variable

import numpy as np
import pandas as pd

ds = pd.Series(data=list(map(lambda x: dict(enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))[x], np.random.binomial(n=25, p=1/3, size=1000))), name='PRV') # 26 letters cover outcomes 0..25
dist = ds.value_counts().sample(frac=1).to_frame().sort_values(by='count', ascending=False)
dist['probability'] = dist['count'].apply(lambda x: x/dist['count'].sum())
dist['cumulance'] = dist['probability'].cumsum()
dist['rank'] = dist['count'].rank(ascending=False)
dist

 

 

Law of total probability

$${\displaystyle \begin{aligned} P(A)&=\sum _{n}P(A\cap B_{n}) \\ &=\sum _{n}P(A\mid B_{n})P(B_{n}) \\ \end{aligned} }$$
import numpy as np
import pandas as pd

data = pd.DataFrame(index=range(1000))
n_groups_A = 10
n_groups_B = 5
for g in range(n_groups_A-1):
    data.loc[np.random.randint(0, 1000, 150), 'A'] = f'A{g}'
data['A'] = data['A'].fillna('A0')
for g in range(n_groups_B-1):
    data.loc[np.random.randint(0, 1000, 150), 'B'] = f'B{g}'
data['B'] = data['B'].fillna('B0')

freq = pd.crosstab(index=[data['A']], columns=[data['B']], dropna=False, margins=False)
joint_prob_table = freq.map(lambda x: x/freq.sum().sum())
joint_prob_table.loc['A0', 'B3']
joint_prob_table.loc['A2', 'B3']

conditional_prob_A_given_B = freq.div(freq.sum(axis=0), axis=1) # columns sum to 1: conditional_prob_A_given_B.sum(axis=0)
conditional_prob_B_given_A = freq.div(freq.sum(axis=1), axis=0).T # columns sum to 1: conditional_prob_B_given_A.sum(axis=0)
conditional_prob_B_given_A['A0']
conditional_prob_A_given_B['B3']
import numpy as np
import pandas as pd

data = pd.DataFrame(index=range(1000))
n_groups_A = 10
n_groups_B = 5
for g in range(n_groups_A-1):
    data.loc[np.random.randint(0, 1000, 150), 'A'] = f'A{g}'
data['A'] = data['A'].fillna('A0')
for g in range(n_groups_B-1):
    data.loc[np.random.randint(0, 1000, 150), 'B'] = f'B{g}'
data['B'] = data['B'].fillna('B0')

prob_table = data.groupby('B')['B'].count().rename('count').to_frame()
prob_table['P(B)'] = prob_table['count'].apply(lambda x: x/prob_table['count'].sum())

conditional_prob_table = data.groupby(['B', 'A'])['A'].count().rename('count').to_frame()
conditional_prob_table['P(A|B)'] = conditional_prob_table.groupby(['B'], group_keys=False)['count'].apply(lambda x: x/x.sum()).rename('P(A|B)')
conditional_prob_table = conditional_prob_table.reset_index('A').merge(prob_table[['P(B)']], how='left', on='B', validate='m:1').reset_index().set_index(['B', 'A'])
conditional_prob_table['P(A|B)P(B)'] = conditional_prob_table.pipe(lambda cpt: cpt['P(A|B)']*cpt['P(B)'])
conditional_prob_table = conditional_prob_table.reset_index().set_index(['B', 'P(B)', 'A'])

#conditional_prob_table.groupby('B').apply(lambda x: x['P(A|B)P(B)'].sum()).sum() # 1
prob_B = conditional_prob_table.groupby('B').apply(lambda x: x['P(A|B)P(B)'].sum())
prob_B - prob_table['P(B)'] # 0

 

 

Law of total expectation

$${\displaystyle \begin{aligned} \operatorname{E}[X] &= \operatorname{E} [\operatorname{E} [X\mid A]] \\ \; &= \sum_{i}{\operatorname {E} [X\mid A_{i}]\operatorname {P} (A_{i})} \\ \end{aligned} }$$
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.normal(10, 1, size=1000), columns=['X'])
n_groups = 10
for g in range(n_groups-1):
    data.loc[np.random.randint(0, 1000, 150), 'G'] = f'G{g}'
data['G'] = data['G'].fillna('G0')
data = data.set_index('G').sort_index()
index_func = lambda key: data.index.get_level_values(0) == key
for i in range(n_groups-1):
    data.loc[index_func(f'G{i}'), 'X'] = data.loc[index_func(f'G{i}'), 'X'] + np.random.normal(i, 1, size=index_func(f'G{i}').sum())
data = data.reset_index()

# 1st-Moment Partition
prob_table = data.groupby('G')['G'].count().rename('count').to_frame()
prob_table['P(G)'] = prob_table['count'].apply(lambda x: x/prob_table['count'].sum())
prob_table = prob_table.merge(data.groupby('G')['X'].mean().rename('E[X|G]'), how='left', on='G', validate='1:1')
prob_table['1M-P'] = prob_table.pipe(lambda pt: pt['E[X|G]']*pt['P(G)'])
prob_table['1M-P:Ratio'] = prob_table['1M-P'].apply(lambda x: x/prob_table['1M-P'].sum())
prob_table['1M-P:Rank'] = prob_table['1M-P'].rank(ascending=False)

prob_table['1M-P'].sum(), data['X'].mean()

 

Law of total variance

$${\displaystyle \text{Static System}}$$ $${\displaystyle {\begin{aligned} \operatorname {Var} (X)&= \operatorname {E} [\operatorname {Var} (X\mid A)]+\operatorname {Var} (\operatorname {E} [X\mid A]) \\ &= \operatorname {E} [\operatorname {Var} (X\mid A)]+\operatorname {E} (\operatorname {E} [X\mid A]^{2}) - \operatorname {E} (\operatorname {E} [X\mid A])^{2} \\ &= \sum _{i=1}^{n}\operatorname {Var} (X\mid A_{i})\Pr(A_{i})+\sum _{i=1}^{n}\operatorname {E} [X\mid A_{i}]^{2}(1-\Pr(A_{i}))\Pr(A_{i})\\[4pt]&{}-2\sum _{i=2}^{n}\sum _{j=1}^{i-1}\operatorname {E} [X\mid A_{i}]\Pr(A_{i})\operatorname {E} [X\mid A_{j}]\Pr(A_{j}) \\ \end{aligned}} }$$ $${\displaystyle \text{Dynamic System}}$$ $${\displaystyle {\begin{aligned}\operatorname {Var} [Y(t)]={}&\operatorname {E} (\operatorname {Var} [Y(t)\mid H_{1t},H_{2t},\ldots ,H_{c-1,t}])\\[4pt]&{}+\sum _{j=2}^{c-1}\operatorname {E} (\operatorname {Var} [\operatorname {E} [Y(t)\mid H_{1t},H_{2t},\ldots ,H_{jt}]\mid H_{1t},H_{2t},\ldots ,H_{j-1,t}])\\[4pt]&{}+\operatorname {Var} (\operatorname {E} [Y(t)\mid H_{1t}]).\end{aligned}} }$$
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.normal(10, 1, size=1000), columns=['X'])
n_groups = 10
for g in range(n_groups-1):
    data.loc[np.random.randint(0, 1000, 200), 'G'] = f'G{g}'
data['G'] = data['G'].fillna('G0')
data = data.set_index('G').sort_index()
index_func = lambda key: data.index.get_level_values(0) == key
for i in range(n_groups-1):
    data.loc[index_func(f'G{i}'), 'X'] = data.loc[index_func(f'G{i}'), 'X'] + np.random.normal(i, 1, size=index_func(f'G{i}').sum())
data = data.reset_index()

# 2nd-Moment Partition
prob_table = data.groupby('G')['G'].count().rename('count').to_frame()
prob_table['P(G)'] = prob_table['count'].apply(lambda x: x/prob_table['count'].sum())
prob_table = prob_table.merge(data.groupby('G')['X'].mean().rename('E[X|G]'), how='left', on='G', validate='1:1')
prob_table = prob_table.merge(data.groupby('G')['X'].var(ddof=0).rename('Var[X|G]'), how='left', on='G', validate='1:1')
prob_table['2M-P1'] = prob_table.pipe(lambda pt: pt['Var[X|G]']*pt['P(G)'])
prob_table['2M-P2'] = prob_table.pipe(lambda pt: (pt['E[X|G]']**2)*pt['P(G)'])
prob_table['2M-P3'] = prob_table['P(G)'] * prob_table.pipe(lambda pt: pt['E[X|G]']*pt['P(G)']).sum()**2
prob_table['2M-P'] = prob_table['2M-P1'] + prob_table['2M-P2'] - prob_table['2M-P3']
prob_table['2M-P:Ratio'] = prob_table['2M-P'].apply(lambda x: x/prob_table['2M-P'].sum())
prob_table['2M-P:Rank'] = prob_table['2M-P'].apply(lambda x: abs(x)).rank(ascending=False)

prob_table['2M-P'].sum(), data['X'].var(ddof=0)

 

Law of total covariance

$${\displaystyle \operatorname {Cov} (X,Y)=\operatorname {E} (\operatorname {Cov} (X,Y\mid Z))+\operatorname {Cov} (\operatorname {E} (X\mid Z),\operatorname {E} (Y\mid Z)) }$$
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.normal(10, 1, size=(1000, 2)), columns=['X', 'Y'])
n_groups = 10
for g in range(n_groups-1):
    data.loc[np.random.randint(0, 1000, 150), 'G'] = f'G{g}'
data['G'] = data['G'].fillna('G0')
data = data.set_index('G').sort_index()
index_func = lambda key: data.index.get_level_values(0) == key
for i in range(n_groups-1):
    data.loc[index_func(f'G{i}'), 'Y'] = 0.1*i*data.loc[index_func(f'G{i}'), 'X'] + np.random.normal(0, 1, size=index_func(f'G{i}').sum())
data = data.reset_index()

prob_table = data.groupby('G')['G'].count().rename('count').to_frame()
prob_table['P(G)'] = prob_table['count'].apply(lambda x: x/prob_table['count'].sum())
prob_table = prob_table.merge(data.groupby('G').cov(ddof=0, numeric_only=True).xs(key='X', level=1, axis=0)['Y'].rename('Cov[X,Y|Z]'), how='left', on='G', validate='1:1')
prob_table = prob_table.merge(data.groupby('G')[['X', 'Y']].mean().rename(columns={'X':'E[X|Z]', 'Y':'E[Y|Z]'}), how='left', on='G', validate='1:1')
prob_table['2.5M-P1'] = prob_table.pipe(lambda pt: pt['Cov[X,Y|Z]']*pt['P(G)'])
prob_table['2.5M-P2'] = prob_table['P(G)'] * prob_table.pipe(lambda pt: prob_table[['E[X|Z]', 'E[Y|Z]']].cov(ddof=0).loc['E[X|Z]', 'E[Y|Z]'])
prob_table['2.5M-P'] = prob_table['2.5M-P1'] + prob_table['2.5M-P2']
prob_table['2.5M-P:Ratio'] = prob_table['2.5M-P'].apply(lambda x: x/prob_table['2.5M-P'].sum())
prob_table['2.5M-P:Rank'] = prob_table['2.5M-P'].apply(lambda x: abs(x)).rank(ascending=False)

prob_table['2.5M-P'].sum(), data[['X', 'Y']].cov(ddof=0).loc['X', 'Y']

Law of total cumulance

#

 

 

Variance Structure

Variance structure for cross-sectional data

import numpy as np
import pandas as pd
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

data = pd.DataFrame(np.random.normal(10, 1, size=1000), columns=['Y'])
n_groups = 10
for g in range(n_groups):
    data.loc[data.index[g::n_groups], 'G'] = f'G{g}'
data = data.set_index('G').sort_index()

# ANOVA Data
index_func = lambda key: data.index.get_level_values(0) == key
for i in range(n_groups):
    data.loc[index_func(f'G{i}'), 'Y'] = data.loc[index_func(f'G{i}'), 'Y'] + np.random.normal(i, 1, size=index_func(f'G{i}').sum())

# Simple Regression Data
np.random.seed(0)
data['X1'] = np.random.normal(0, 1, size=1000)
data['X2'] = smt.ArmaProcess(ar=[1, -.3, -.2], ma=[1]).generate_sample(nsample=1000, burnin=50).cumsum()
data['Y'] = data['Y'] + 3*data['X1'] + 2*data['X2'] 
data = data.reset_index()

# Analysis
result = smf.ols('Y ~ 1 + C(G) + X1 + X2', data=data).fit()
data['Y'].plot()
result.fittedvalues.plot()
result.resid.plot()

display(result.summary().tables[1])
display(sms.anova_lm(result, typ=2))

# Variance Components
sms.anova_lm(result, typ=1)['sum_sq'].sum(), result.centered_tss, result.ess, result.ssr

 

Variance structure for time series data

Stationary Time Series

import numpy as np
import pandas as pd
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

y = smt.ArmaProcess(ar=[1, -.7, -.2], ma=[1, .3, .5]).generate_sample(nsample=1000, burnin=50)
y = pd.Series(y, index=pd.date_range('00:00:00', periods=y.shape[0], freq='D'))

# SARIMAX
result = smt.SARIMAX(y, exog=np.c_[np.ones_like(y)], order=(2, 0, 2), trend="n").fit(disp=False)
sarimax_params = result.params.copy()[:'ma.L2'].values
display(result.summary().tables[1])
y.plot(figsize=(30,5))
result.fittedvalues.plot()
result.resid.plot()


# Variance Propagation
long_run_variance = result.impulse_responses(500, orthogonalized=True, cumulative=False)[0:].apply(lambda x :x**2).sum() # asymptotic variance of time series
lagged_run_variance = result.impulse_responses(500, orthogonalized=True, cumulative=False)[1:].apply(lambda x :x**2).sum() # approximately lagged variance of time series
short_run_variance = result.impulse_responses(500, orthogonalized=True, cumulative=False)[0:1].apply(lambda x :x**2).sum() # unit noise variance on arma process
variance_propagation = pd.DataFrame(
    data=[
        
        [long_run_variance, lagged_run_variance, short_run_variance], 
        [result.model.endog.var(ddof=1), result.fittedvalues.var(ddof=1), result.resid.var(ddof=1)]
    ], columns=['Long-Run', 'Lagged-Run', 'Short-Run'], index=['IRF', 'Time Series'])
display(variance_propagation)

# ANCOVA
data = y.to_frame(name='y') # name the column so the formula below resolves 'y' from the frame
data['arL1'] = y.shift(1).bfill()
data['arL2'] = y.shift(2).bfill()
data['maL1'] = result.resid.shift(1).bfill()
data['maL2'] = result.resid.shift(2).bfill()
result = smf.ols('y ~ 1 + arL1 + arL2 + maL1 + maL2', data=data).fit()
result.params[:] = sarimax_params
display(result.summary().tables[1])
display(sms.anova_lm(result, typ=1))

# Variance Components
sms.anova_lm(result, typ=1)['sum_sq'].sum(), result.centered_tss, result.ess, result.ssr

Non-Stationary Time Series

#


TABLE OF CONTENTS

 

One-way ANOVA
  Fixed Model
  Repeated-measures ANOVA (r = 1)

Two-way ANOVA
  Fixed Model
  Repeated-measures ANOVA (r = 1)
  Mixed ANOVA

Three-way ANOVA
  Fixed Model
  Repeated-measures ANOVA (r = 1)
  Mixed ANOVA


One-way ANOVA

Fixed Model

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total}} &= SS_{\text{A}} + SS_{\text{S/A}} \\ \textit{model[A × S]} \\ \qquad : Y_{ij} &= \mu + \alpha_{i} + \varepsilon _{ij}, \quad \varepsilon_{ij} \sim \mathcal{N}(0, \sigma^{2}_{\varepsilon}) \\ \end{cases} }$$ $${\displaystyle \begin{aligned} \operatorname{E}[Y_{ij}] &= \mu \\ \operatorname{E}[Y_{ij} \mid A = A_{i}] & = \mu + \alpha_{i} = \mu_{A_{i}}\\ \end{aligned} }$$
$${\displaystyle \begin{aligned} \varepsilon_{ij} &= Y_{ij} - \left( \mu + \alpha_{i} \right) \\ \alpha_{i} &= \mu_{A_{i}} - \mu, \quad \sum_{i} \alpha_{i} = 0 \\ \end{aligned} }$$
Source of variation | Degrees of freedom | Sum of squares | Mean square | Expected MS | F value
A | a-1 | SS_A | MS_A | σ_ε² + nσ_A² | MS_A / MS_S/A
Error S/A | a(n-1) | SS_S/A | MS_S/A | σ_ε² |
Total | an-1 | SS_total | MS_total | |

 

import pandas as pd
import pingouin as pg
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.factorplots import interaction_plot

data = pg.read_dataset('anova').rename(columns={'Pain threshold':'Pain_threshold', 'Hair color':'Hair_color'})

# Assumption
display(pg.normality(data=data, dv='Pain_threshold', group='Hair_color')) # Shapiro-Wilk test
display(pg.homoscedasticity(data=data, dv='Pain_threshold', group='Hair_color')) # Levene’s test

# Anova result
display(pg.anova(data=data, dv='Pain_threshold', between='Hair_color', ss_type=1, detailed=True))
display(sm.stats.anova_lm(ols('Pain_threshold ~ C(Hair_color)', data=data).fit(), typ=2))

# Mean-grouping
data['M'] = data['Pain_threshold'].mean()
data = data.set_index('Hair_color')
data['m'] = data.groupby('Hair_color')['Pain_threshold'].apply(lambda x: x.mean())

# Sum-squares
data = data.reset_index()
data['SST'] = ((data['Pain_threshold'] - data['M'])**2).sum() 
data['SSB'] = ((data['m']              - data['M'])**2).sum() 
data['SSW'] = ((data['Pain_threshold'] - data['m'])**2).sum() # data['SST'] - data['SSB']

# Error definition(1): variance within group
data['SSE'] = data['SSW'].copy()

# Degree of freedom arguments
N = data['Pain_threshold'].size
a = data['Hair_color'].unique().size
n = N/a

# Mean-squares
data['MST'] = data['SST'] / (N-1)
data['MSB'] = data['SSB'] / (a-1)

# Error definition(2)
data['MSE'] = data['SSE'] / (a*(n-1)) # A ~ No Random Effect
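
# F value (a small addition mirroring the later sections): treatment MS over error MS
data['F_A'] = data['MSB'] / data['MSE']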

# post-hoc test
display(pg.pairwise_tukey(data=data, dv='Pain_threshold', between="Hair_color"))
display(interaction_plot(x=data['Subject'], trace=data['Hair_color'], response=data['Pain_threshold']))
import pandas as pd
import pingouin as pg
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

data = pg.read_dataset('anova').rename(columns={'Pain threshold':'Pain_threshold', 'Hair color':'Hair_color'})

result = smf.ols('Pain_threshold ~ 0 + C(Hair_color)', data=data).fit() # data.groupby('Hair_color')['Pain_threshold'].mean()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(0))', data=data).fit() # Intercept: mean of Dark Blond > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean()['Dark Blond']
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(1))', data=data).fit() # Intercept: mean of Dark Brunette > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean()['Dark Brunette']
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(2))', data=data).fit() # Intercept: mean of Light Blond > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean()['Light Blond']
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(3))', data=data).fit() # Intercept: mean of Light Brunette > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean()['Light Brunette']
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Dark Blond"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Dark Brunette"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Light Blond"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Light Brunette"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(0))', data=data).fit() # Intercept: mean of group means > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean().mean()  
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(1))', data=data).fit() # Intercept: mean of group means > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean().mean() 
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(2))', data=data).fit() # Intercept: mean of group means > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean().mean() 
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(3))', data=data).fit() # Intercept: mean of group means > data.groupby('Hair_color')['Pain_threshold'].mean() - data.groupby('Hair_color')['Pain_threshold'].mean().mean() 
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Dark Blond"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Dark Brunette"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Light Blond"))', data=data).fit()
result = smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Light Brunette"))', data=data).fit()
result.summary() 
# Design Matrix(1): np.unique(result.model.exog, axis=0)
# Design Matrix(2-1): result.model.data.orig_exog
# Design Matrix(2-2): result.model.data.frame

sms.anova_lm(
    smf.ols('Pain_threshold ~ 0 + C(Hair_color)', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(0))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(1))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(2))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment(3))', data=data).fit(), 
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Dark Blond"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Dark Brunette"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Light Blond"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Treatment("Light Brunette"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(0))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(1))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(2))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum(3))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Dark Blond"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Dark Brunette"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Light Blond"))', data=data).fit(),
    smf.ols('Pain_threshold ~ 1 + C(Hair_color, Sum("Light Brunette"))', data=data).fit(),
)

 

 

Repeated-measures ANOVA (r = 1)

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total(full)}} &= SS_{\text{subjects}} + SS_{\text{U}} + SS_{\text{U×S}}\\ \qquad : SS_{\text{total(reduced)}} &= SS_{\text{subjects}} + SS_{\text{U}} + SS_{\text{error}} \\ \textit{model[(U × S)]} \\ \qquad :\text{full}\; Y_{ij} &= \mu + \pi_{i} +\alpha_{j} + (\pi\alpha)_{ij} + \varepsilon _{ij} \\ \qquad :\text{reduced}\; Y_{ij} &= \mu + \pi_{i} +\alpha_{j} + \varepsilon _{ij} \\ \end{cases} }$$

 

Full Model

Source of variation | Degrees of freedom | Sum of squares | Mean square | Expected MS | F value
Between subjects | n-1 | SS_subjects | MS_subjects | σ_ε² + uσ_S² | MS_subjects / MS_U×S
U | u-1 | SS_U | MS_U | σ_ε² + σ_U×S² + nσ_U² | MS_U / MS_U×S
Error U×S | (u-1)(n-1) | SS_U×S | MS_U×S | σ_ε² + σ_U×S² |
Total | un-1 | SS_total | MS_total | |

Reduced Model

Source of variation | Degrees of freedom | Sum of squares | Mean square | Expected MS | F value
Between subjects | n-1 | SS_subjects | MS_subjects | σ_ε² + uσ_S² | MS_subjects / MS_S/U
U | u-1 | SS_U | MS_U | σ_ε² + nσ_U² | MS_U / MS_S/U
Error S/U | u(n-1) | SS_S/U | MS_S/U | σ_ε² |
Total | un-1 | SS_total | MS_total | |

 

import numpy as np
import pandas as pd
import pingouin as pg
from statsmodels.stats.anova import AnovaRM
from statsmodels.graphics.factorplots import interaction_plot

data = pd.DataFrame({'Cars': np.repeat([1, 2, 3, 4, 5], 4),
                     'Oil': np.tile([1, 2, 3, 4], 5),
                     'Mileage': [36, 38, 30, 29,
                                 34, 38, 30, 29,
                                 34, 28, 38, 32,
                                 38, 34, 20, 44,
                                 26, 28, 34, 50]})

# Assumption
display(pg.normality(data=data, dv='Mileage', group='Oil'))
display(pg.sphericity(data=data, dv='Mileage', within='Oil', subject='Cars'))

# Anova result
display(pg.rm_anova(data=data, dv='Mileage', within='Oil', subject='Cars', detailed=True, effsize="np2"))
display(AnovaRM(data=data, depvar='Mileage', within=['Oil'], subject='Cars').fit().anova_table)

# Mean-grouping
data['M'] = data['Mileage'].mean()
data = data.set_index(['Oil'])
data['m_O'] = data.groupby('Oil')['Mileage'].mean()
data = data.reset_index().set_index(['Cars'])
data['m_S'] = data.groupby('Cars')['Mileage'].mean()
data = data.reset_index().set_index(['Oil', 'Cars'])
data['m_OS'] = data.groupby(['Oil', 'Cars'])['Mileage'].mean()


# Sum-squares
data = data.reset_index()
data['SST'] = ((data['Mileage'] - data['M'])**2).sum() 
data['SSB_S'] = ((data['m_S'] - data['M'])**2).sum() 
data['SSW_S'] = ((data['Mileage'] - data['m_S'])**2).sum() # data['SSW_S'] = data['SST'] - data['SSB_S'] 
data['SSB_O'] = ((data['m_O'] - data['M'])**2).sum()
data['SSW_O'] = ((data['Mileage'] - data['m_O'])**2).sum() # data['SSW_O'] = data['SST'] - data['SSB_O'] 
data['SSB_O+S'] = ((data['m_OS'] - data['M'])**2).sum()
data['SSB_O*S'] = ((data['m_OS'] - data['M'])**2).sum() - data['SSB_S'] - data['SSB_O']  
data['SSW_O+S'] = ((data['Mileage'] - data['m_OS'])**2).sum() # data['SSW_O+S'] = data['SST'] - data['SSB_O'] - data['SSB_S'] - data['SSB_O*S']

# Subject statistical properties (identities that hold when r = 1, up to floating-point error)
print(np.allclose(data['SST'], ((data['m_OS'] - data['M'])**2).sum()))
print(np.allclose(data['SSB_O*S'], data['SST'] - data['SSB_O'] - data['SSB_S']))
print(np.allclose(data['SSB_O*S'], data['SSW_S'] - data['SSB_O']))
print(np.allclose(data['SSW_O+S'], 0))

# Error definition(1): variance within subjects after removing repeated measure effects 
data['SSE'] = data['SSB_O*S'].copy() # data['SSE'] = data['SSW_S'] - data['SSB_O'] 

# Degree of freedom arguments
N = data.shape[0]
u = data['Oil'].unique().size
s = data['Cars'].unique().size
n = s

# Mean-squares
data['MST'] = data['SST'] / (N-1)
data['MSB_S'] = data['SSB_S'] / (n-1)
data['MSB_O'] = data['SSB_O'] / (u-1)

# Error definition(2)
data['MSE'] = data['SSE'] / ((n-1)*(u-1)) # U ~ Random Effect

# F values
data['F_O'] = data['MSB_O']/data['MSE']

# post-hoc test
display(pg.pairwise_tests(data=data, dv='Mileage', within='Oil', subject='Cars', padjust='fdr_bh'))
display(interaction_plot(x=data['Cars'], trace=data['Oil'], response=data['Mileage']))
import numpy as np
import pandas as pd
import statsmodels.stats.api as sms

data = pd.DataFrame({'Cars': np.repeat([1, 2, 3, 4, 5], 4),
                     'Oil': np.tile([1, 2, 3, 4], 5),
                     'Mileage': [36, 38, 30, 29,
                                 34, 38, 30, 29,
                                 34, 28, 38, 32,
                                 38, 34, 20, 44,
                                 26, 28, 34, 50]})
sms.AnovaRM(data=data, depvar='Mileage', within=['Oil'], subject='Cars').fit().anova_table


Two-way ANOVA

Fixed Model

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total}} &= SS_{\text{A}} + SS_{\text{B}} + SS_{\text{A×B}} + SS_{\text{error}} \\ \textit{model[A × B × S]} \\ \qquad : Y_{ijk} &= \mu +\alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} + \varepsilon_{ijk}, \quad \varepsilon_{ijk} \sim \mathcal{N}(0, \sigma^{2}_{\varepsilon}) \\ \end{cases} }$$ $${\displaystyle \begin{aligned} \operatorname{E}[ Y_{ijk} ] &= \mu \\ \operatorname{E}[Y_{ijk} \mid A=A_{i}] &= \mu +\alpha_{i} = \mu_{A_{i}} \\ \operatorname{E}[Y_{ijk} \mid B=B_{j}] &= \mu + \beta_{j} = \mu_{B_{j}} \\ \operatorname{E}[ Y_{ijk} \mid A=A_{i} \; \And \; B=B_{j}] &= \mu +\alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} = \mu_{A_{i}B_{j}} \\ \end{aligned} }$$
$${\displaystyle \begin{aligned} \varepsilon_{ijk} &= Y_{ijk} - \left( \mu +\alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} \right) \\ \alpha_{i} &= \mu_{A_{i}} - \mu, \quad \sum_{i} \alpha_{i} = 0 \\ \beta_{j} &= \mu_{B_{j}} - \mu, \quad \sum_{j} \beta_{j} = 0 \\ (\alpha\beta)_{ij} &= \mu_{A_{i}B_{j}} - (\mu + \alpha_{i} + \beta_{j}), \quad \sum_{i} (\alpha\beta)_{ij} = \sum_{j} (\alpha\beta)_{ij} = 0 \\ \end{aligned} }$$
Source of variation | Degrees of freedom | Sum of squares | Mean square | Expected MS | F value
A | a-1 | SS_A | MS_A | σ_ε² + nbσ_A² | MS_A / MS_S/AB
B | b-1 | SS_B | MS_B | σ_ε² + naσ_B² | MS_B / MS_S/AB
A×B | (a-1)(b-1) | SS_A×B | MS_A×B | σ_ε² + nσ_A×B² | MS_A×B / MS_S/AB
Error S/AB | ab(n-1) | SS_S/AB | MS_S/AB | σ_ε² |
Total | abn-1 | SS_total | MS_total | |
import numpy as np
import pandas as pd
import pingouin as pg
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.graphics.factorplots import interaction_plot

data = pg.read_dataset('anova2')

# Assumption
display(pg.normality(data=data, dv='Yield', group=["Blend", "Crop"][0])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Yield', group=["Blend", "Crop"][1])) # Shapiro-Wilk test
display(pg.homoscedasticity(data=data, dv='Yield', group=["Blend", "Crop"][0])) # Levene’s test
display(pg.homoscedasticity(data=data, dv='Yield', group=["Blend", "Crop"][1])) # Levene’s test

# Anova result
display(pg.anova(data=data, dv="Yield", between=["Blend", "Crop"], ss_type=2, detailed=True))
display(sm.stats.anova_lm(ols('Yield ~ C(Blend) + C(Crop) + C(Blend):C(Crop)', data=data).fit(), typ=2))

# Mean-grouping
data["M"] = data["Yield"].mean()
data = data.set_index("Blend")
data["m_A"] = data.groupby("Blend")['Yield'].mean()
data = data.reset_index().set_index("Crop")
data["m_B"] = data.groupby("Crop")['Yield'].mean()
data = data.reset_index().set_index(["Blend", "Crop"])
data["m_AB"] = data.groupby(["Blend", "Crop"])['Yield'].mean()

# Sum-squares
data = data.reset_index()
data["SST"] = ((data["Yield"] - data["M"])**2).sum()
data["SSB_A"] = ((data["m_A"] - data["M"])**2).sum()
data["SSB_B"] = ((data["m_B"] - data["M"])**2).sum()
data["SSB_AB"] = ((data["m_AB"] - data["M"])**2).sum() - data["SSB_A"] - data["SSB_B"]
data["SSW_AB"] = ((data["Yield"] - data["m_AB"])**2).sum() # data["SST"] - data["SSB_A"] - data["SSB_B"] - data["SSB_AB"]

# Error definition(1)
data["SSE"] = data["SSW_AB"].copy() # error term S/AB: within-cell variance (A, B fixed effects)

# Degree of freedom arguments
N = data.shape[0]
a = data['Blend'].unique().size
b = data['Crop'].unique().size
n = N/(a*b)

# Mean squares
data["MST"] = data["SST"] / (N-1)
data["MSB_A"] = data["SSB_A"] / (a-1)
data["MSB_B"] = data["SSB_B"] / (b-1)
data["MSB_AB"] = data["SSB_AB"] / ((a-1)*(b-1))

# Error definition(2)
data["MSE"] = data["SSE"] / (a*b*(n-1)) # A, B ~ No Random Effect

# F-values
data["F_A"] = data["MSB_A"] / data["MSE"]
data["F_B"] = data["MSB_B"] / data["MSE"]
data["F_AB"] = data["MSB_AB"] / data["MSE"]

# post-hoc test
display(pg.pairwise_tukey(data=data, dv='Yield', between=["Blend", "Crop"][0]))
display(pg.pairwise_tukey(data=data, dv='Yield', between=["Blend", "Crop"][1]))
display(interaction_plot(x=data['Blend'], trace=data['Crop'], response=data['Yield']))
import pingouin as pg
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

data = pg.read_dataset('anova2')
result = smf.ols('Yield ~ C(Blend) + C(Crop) + C(Blend):C(Crop)', data=data).fit()
sms.anova_lm(result, typ=2)

 

 

 

Repeated-measures ANOVA (r = 1)

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total(full)}} &= SS_{\text{subjects}} + SS_{\text{U}} + SS_{\text{V}} + SS_{\text{U×V}} + SS_{\text{U×S}} + SS_{\text{V×S}} + SS_{\text{U×V×S}} \\ \qquad : SS_{\text{total(reduced)}} &= SS_{\text{subjects}} + SS_{\text{U}} + SS_{\text{V}} + SS_{\text{U×V}} + SS_{\text{error}} \\ \textit{model[(U × V × S)]} \\ \qquad :\text{full}\; Y_{ijk} &= \mu +\alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} + \pi_{k} + (\alpha\pi)_{ik} + (\beta\pi)_{jk} + (\alpha\beta\pi)_{ijk} + \varepsilon_{ijk} \\ \qquad :\text{reduced}\; Y_{ijk} &= \mu +\alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} + \pi_{k} + \varepsilon_{ijk} \\ \end{cases} }$$

Full Model ANOVA Table

Source of variation | Degrees of freedom | Sum of squares | Mean square | Expected MS | F value
Between subjects | n-1 | SS_subjects | MS_subjects | |
U | u-1 | SS_U | MS_U | σ_ε² + vσ_U×S² + nvσ_U² | MS_U / MS_U×S
Error U×S | (u-1)(n-1) | SS_U×S | MS_U×S | σ_ε² + vσ_U×S² |
V | v-1 | SS_V | MS_V | σ_ε² + uσ_V×S² + nuσ_V² | MS_V / MS_V×S
Error V×S | (v-1)(n-1) | SS_V×S | MS_V×S | σ_ε² + uσ_V×S² |
U×V | (u-1)(v-1) | SS_U×V | MS_U×V | σ_ε² + σ_U×V×S² + nσ_U×V² | MS_U×V / MS_U×V×S
Error U×V×S | (u-1)(v-1)(n-1) | SS_U×V×S | MS_U×V×S | σ_ε² + σ_U×V×S² |
Total | uvn-1 | SS_total | MS_total | |

Reduced Model ANOVA Table

Source of variation | Degrees of freedom | Sum of squares | Mean square | Expected MS | F value
Between subjects | n-1 | SS_subjects | MS_subjects | σ_ε² + uvσ_S² |
U | u-1 | SS_U | MS_U | σ_ε² + nvσ_U² | MS_U / MS_S/UV
V | v-1 | SS_V | MS_V | σ_ε² + nuσ_V² | MS_V / MS_S/UV
U×V | (u-1)(v-1) | SS_U×V | MS_U×V | σ_ε² + nσ_U×V² | MS_U×V / MS_S/UV
Error S/UV | uv(n-1) | SS_S/UV | MS_S/UV | σ_ε² |
Total | uvn-1 | SS_total | MS_total | |
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM
from statsmodels.graphics.factorplots import interaction_plot
import pingouin as pg

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product([['G1', 'G2', 'G3'], list('AB')], names=['Group', 'Factor'])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(1) == 'A')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=10, scale=1, size=(tfi.sum(), 1))
        
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

data = anova_data(10)
data = data.stack(1).reset_index()

# Assumption
display(pg.normality(data=data, dv='Value', group='Group'))
display(pg.normality(data=data, dv='Value', group='Factor'))
display(pg.sphericity(data=data, dv='Value', within=['Group', 'Factor'], subject='Index'))

display(pg.rm_anova(data=data, dv='Value', within=['Group', 'Factor'], subject='Index', detailed=True))
display(AnovaRM(data=data, depvar='Value', within=['Group', 'Factor'], subject='Index').fit().anova_table)

# Mean-Grouping
data["M"] = data["Value"].mean()
data = data.set_index("Group")
data["m_G"] = data.groupby("Group")['Value'].mean()
data = data.reset_index().set_index("Factor")
data["m_F"] = data.groupby("Factor")['Value'].mean()
data = data.reset_index().set_index("Index")
data["m_S"] = data.groupby("Index")['Value'].mean()
data = data.reset_index().set_index(["Group", "Factor"])
data["m_GF"] = data.groupby(["Group", "Factor"])['Value'].mean()
data = data.reset_index().set_index(["Group", "Index"])
data["m_GS"] = data.groupby(["Group", "Index"])['Value'].mean()
data = data.reset_index().set_index(["Factor", "Index"])
data["m_FS"] = data.groupby(["Factor", "Index"])['Value'].mean()
data = data.reset_index().set_index(["Group", "Factor", "Index"])
data["m_GFS"] = data.groupby(["Group", "Factor", "Index"])['Value'].mean()

# Sum-Squares
data = data.reset_index()
data["SST"] = ((data["Value"] - data["M"])**2).sum()
data["SSB_S"] = ((data["m_S"] - data["M"])**2).sum()
data["SSB_G"] = ((data["m_G"] - data["M"])**2).sum()
data["SSB_F"] = ((data["m_F"] - data["M"])**2).sum()
data["SSB_GF"] = ((data["m_GF"] - data["M"])**2).sum() - data["SSB_G"] - data["SSB_F"]

# Error Definition(1)
data["SSB_GS"] = ((data["m_GS"] - data["M"])**2).sum() - data["SSB_G"] - data["SSB_S"]
data["SSB_FS"] = ((data["m_FS"] - data["M"])**2).sum() - data["SSB_F"] - data["SSB_S"]
data["SSB_GFS"] = ((data["m_GFS"] - data["M"])**2).sum() - data["SSB_G"] - data["SSB_F"] - data["SSB_S"] - data["SSB_GF"] - data["SSB_GS"] - data["SSB_FS"]

# Degree of Freedom Arguments
N = data.shape[0]
u = data['Group'].unique().size
v = data['Factor'].unique().size
s = data['Index'].unique().size
n = s


# Mean-Squares
data["MSB_G"] = data["SSB_G"]/(u-1)
data["MSB_F"] = data["SSB_F"]/(v-1)
data["MSB_GF"] = data["SSB_GF"]/((u-1)*(v-1))

# Error Definition(2)
data["MSB_GS"] = data["SSB_GS"]/((u-1)*(n-1)) # U ~ Random Effect
data["MSB_FS"] = data["SSB_FS"]/((v-1)*(n-1)) # V ~ Random Effect
data["MSB_GFS"] = data["SSB_GFS"]/((u-1)*(v-1)*(n-1)) # U, V ~ Random Effect

# F-values
data['F_G'] = data["MSB_G"] / data["MSB_GS"]
data['F_F'] = data["MSB_F"] / data["MSB_FS"]
data['F_GF'] = data["MSB_GF"] / data["MSB_GFS"]
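# p-values for the hand-computed F ratios (a quick check with scipy's F
# distribution; numerator/denominator dfs follow the ANOVA table above)
data['p_G'] = stats.f.sf(data['F_G'], u-1, (u-1)*(n-1))
data['p_F'] = stats.f.sf(data['F_F'], v-1, (v-1)*(n-1))
data['p_GF'] = stats.f.sf(data['F_GF'], (u-1)*(v-1), (u-1)*(v-1)*(n-1))
display(data.loc[:, 'F_G':].iloc[:1].T)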

# post-hoc test
display(pg.pairwise_tests(data=data, dv='Value', within=['Group', 'Factor'][0], subject='Index', padjust='fdr_bh'))
display(pg.pairwise_tests(data=data, dv='Value', within=['Group', 'Factor'][1], subject='Index', padjust='fdr_bh'))
display(interaction_plot(x=data['Group'], trace=data['Factor'], response=data['Value']))

Mixed ANOVA

: One between- and one within-subjects factor

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total}} &= (SS_{\text{A}} + SS_{\text{S/A}}) + (SS_{\text{U}} + SS_{\text{U×A}} + SS_{\text{U×S/A}} ) \\ &= (SS_{\text{A}} + SS_{\text{error-between}}) + (SS_{\text{U}} + SS_{\text{U×A}} + SS_{\text{error-within}} ) \\ \textit{model[A × (U × S)]} \\ \qquad : Y_{ijk} &= \mu +\alpha_{i} + \pi_{j/i} + \beta_{k} + (\alpha\beta)_{ik} + (\pi\beta)_{jk/i} + \varepsilon_{ijk} \\ \end{cases} }$$
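A quick consistency check on the table below: the between- and within-subjects degrees of freedom each partition exactly, and together they account for the total.

$${\displaystyle \begin{aligned} an-1 &= (a-1) + a(n-1) \\ an(u-1) &= (u-1) + (u-1)(a-1) + a(u-1)(n-1) \\ aun-1 &= (an-1) + an(u-1) \\ \end{aligned} }$$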
| Source of variation | Degree of freedom | Sum of squares | Mean square | Expected MS | F value |
|---|---|---|---|---|---|
| Between subjects | an-1 | | | | |
| A | a-1 | SSA | MSA | σε² + uσS/A² + nuσA² | MSA/MSS/A |
| Error S/A | a(n-1) | SSS/A | MSS/A | σε² + uσS/A² | |
| Within subjects | an(u-1) | | | | |
| U | u-1 | SSU | MSU | σε² + σU×S/A² + naσU² | MSU/MSU×S/A |
| U×A | (u-1)(a-1) | SSU×A | MSU×A | σε² + σU×S/A² + nσU×A² | MSU×A/MSU×S/A |
| Error U×S/A | a(u-1)(n-1) | SSU×S/A | MSU×S/A | σε² + σU×S/A² | |
| Total | aun-1 | SStotal | MStotal | | |
import pingouin as pg
from statsmodels.graphics.factorplots import interaction_plot

data = pg.read_dataset('mixed_anova')

# Assumption
display(pg.normality(data=data, dv='Scores', group='Group'))
display(pg.normality(data=data, dv='Scores', group='Time'))
display(pg.homoscedasticity(data=data, dv='Scores', group='Group'))
display(pg.sphericity(data=data, dv='Scores', within='Time', subject='Subject'))

# Anova result
display(pg.mixed_anova(data=data, dv='Scores', between='Group', within='Time', subject='Subject', effsize='ng2'))

# Mean-grouping
data['M'] = data['Scores'].mean()
data = data.set_index('Group')
data['m_G'] = data.groupby('Group')['Scores'].mean()
data = data.reset_index().set_index('Time')
data['m_T'] = data.groupby('Time')['Scores'].mean()
data = data.reset_index().set_index('Subject')
data['m_S'] = data.groupby('Subject')['Scores'].mean()
data = data.reset_index().set_index(['Group',  'Time'])
data['m_GT'] = data.groupby(['Group', 'Time'])['Scores'].mean()
data = data.reset_index().set_index(['Time',  'Subject'])
data['m_TS'] = data.groupby(['Time', 'Subject'])['Scores'].mean()
data = data.reset_index().set_index(['Group', 'Subject'])
data['m_GS'] = data.groupby(['Group', 'Subject'])['Scores'].mean()
data = data.reset_index().set_index(['Group', 'Time',  'Subject'])
data['m_GTS'] = data.groupby(['Group', 'Time', 'Subject'])['Scores'].mean()

# Sum-squares
data = data.reset_index()
data['SST'] = ((data['Scores']  - data['M'])**2).sum()
data['SSB_S'] = ((data['m_S']  - data['M'])**2).sum()
data['SSW_S'] = ((data['Scores'] - data['m_S'])**2).sum()
data['SSB_G'] = ((data['m_G']  - data['M'])**2).sum()
data['SSB_T'] = ((data['m_T']  - data['M'])**2).sum()
data['SSB_GT'] = ((data['m_GT']  - data['M'])**2).sum() - data['SSB_G'] - data['SSB_T']
data['SSB_TS'] = ((data['m_TS']  - data['M'])**2).sum() - data['SSB_T'] - data['SSB_S']
data['SSB_GS'] = ((data['m_GS']  - data['M'])**2).sum() - data['SSB_G'] - data['SSB_S']
data['SSB_GTS'] = ((data['m_GTS']  - data['M'])**2).sum() - data['SSB_T'] - data['SSB_S'] - data['SSB_GT'] - data['SSB_TS'] - data['SSB_GS']

# Error definition(1): <Rule> removing sources of variance from subject variance 
data['SSE_B'] = data['SSB_S'] - data['SSB_G']
data['SSE_W'] = data['SSW_S'] - data['SSB_T'] - data['SSB_GT'] # data['SSE_W'] = data['SSB_TS'] + data['SSB_GTS'] + data['SSB_GS']

# Degree of freedom arguments
N = data.shape[0]
a = data['Group'].unique().size
u = data['Time'].unique().size
s = data['Subject'].unique().size
n = N / (a*u)
r = s/n

# Mean-squares
data['MSB_G'] = data['SSB_G'] / (a-1)
data['MSB_T'] = data['SSB_T'] / (u-1)
data['MSB_GT'] = data['SSB_GT'] / ((a-1)*(u-1)) 

# Error definition(2)
data['MSE_B'] = data['SSE_B'] / (a*(n-1))  # A ~ No Random Effect
data['MSE_W'] = data['SSE_W'] / (a*(u-1)*(n-1)) # A ~ No Random Effect / U ~ Random Effect


# F-values
data['F_G'] = data['MSB_G'] / data['MSE_B']
data['F_T'] = data['MSB_T'] / data['MSE_W']
data['F_GT'] = data['MSB_GT'] / data['MSE_W']
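# p-values for the hand-computed F ratios (a sketch; the between effect is tested
# against MSE_B with a(n-1) df, the within effects against MSE_W with a(u-1)(n-1) df)
from scipy import stats
data['p_G'] = stats.f.sf(data['F_G'], a-1, a*(n-1))
data['p_T'] = stats.f.sf(data['F_T'], u-1, a*(u-1)*(n-1))
data['p_GT'] = stats.f.sf(data['F_GT'], (a-1)*(u-1), a*(u-1)*(n-1))
display(data.loc[:, 'F_G':].iloc[:1].T)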

display(pg.pairwise_tests(data=data, dv='Scores', within='Time', subject='Subject', padjust='fdr_bh'))
display(pg.pairwise_tukey(data=data, dv='Scores', between='Group'))
display(interaction_plot(x=data['Time'], trace=data['Group'], response=data['Scores']))

Three-way ANOVA

Fixed Model

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total}} &= SS_{\text{A}} + SS_{\text{B}} + SS_{\text{C}} + SS_{\text{A×B}} + SS_{\text{B×C}} + SS_{\text{C×A}} + SS_{\text{A×B×C}} + SS_{\text{error}} \\ \textit{model[A × B × C × S]} \\ \qquad : Y_{ijkl} &= \mu +\alpha_{i} + \beta_{j} + \gamma_{k} + (\alpha\beta)_{ij} + (\beta\gamma)_{jk} + (\gamma\alpha)_{ki} + (\alpha\beta\gamma)_{ijk}+ \varepsilon_{ijkl}, \quad \varepsilon_{ijkl} \sim \mathcal{N}(0, \sigma^{2}_{\varepsilon}) \\ \end{cases} }$$ $${\displaystyle \begin{aligned} \operatorname{E}[Y_{ijkl} \mid A=A_{i}] &= \mu + \alpha_{i} = \mu_{A_{i}} \\ \operatorname{E}[Y_{ijkl} \mid B=B_{j}] &= \mu + \beta_{j} = \mu_{B_{j}} \\ \operatorname{E}[Y_{ijkl} \mid C=C_{k}] &= \mu + \gamma_{k} = \mu_{C_{k}} \\ \operatorname{E}[Y_{ijkl} \mid A=A_{i} \; \And \; B=B_{j}] &= \mu + \alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} = \mu_{A_{i}B_{j}} \\ \operatorname{E}[Y_{ijkl} \mid B=B_{j} \; \And \; C=C_{k}] &= \mu + \beta_{j} + \gamma_{k} + (\beta\gamma)_{jk} = \mu_{B_{j}C_{k}} \\ \operatorname{E}[Y_{ijkl} \mid C=C_{k} \; \And \; A=A_{i}] &= \mu + \gamma_{k} + \alpha_{i} + (\gamma\alpha)_{ki} = \mu_{C_{k}A_{i}} \\ \operatorname{E}[Y_{ijkl} \mid A=A_{i} \; \And \; B=B_{j} \; \And \; C=C_{k}] &= \mu +\alpha_{i} + \beta_{j} + \gamma_{k} + (\alpha\beta)_{ij} + (\beta\gamma)_{jk} + (\gamma\alpha)_{ki} + (\alpha\beta\gamma)_{ijk} = \mu_{A_{i}B_{j}C_{k}} \\ \end{aligned} }$$
$${\displaystyle \begin{aligned} \varepsilon_{ijkl} &= Y_{ijkl} - \left( \mu +\alpha_{i} + \beta_{j} + \gamma_{k} + (\alpha\beta)_{ij} + (\beta\gamma)_{jk} + (\gamma\alpha)_{ki} + (\alpha\beta\gamma)_{ijk} \right) \\ \alpha_{i} &= \mu_{A_{i}} - \mu, \quad \sum_{i} \alpha_{i} = 0 \\ \beta_{j} &= \mu_{B_{j}} - \mu, \quad \sum_{j} \beta_{j} = 0 \\ \gamma_{k} &= \mu_{C_{k}} - \mu, \quad \sum_{k} \gamma_{k} = 0 \\ (\alpha\beta)_{ij} &= \mu_{A_{i}B_{j}} - (\mu + \alpha_{i} + \beta_{j}), \quad \sum_{i} (\alpha\beta)_{ij} = \sum_{j} (\alpha\beta)_{ij} = 0 \\ (\beta\gamma)_{jk} &= \mu_{B_{j}C_{k}} - (\mu + \beta_{j} + \gamma_{k}), \quad \sum_{j} (\beta\gamma)_{jk} = \sum_{k} (\beta\gamma)_{jk} = 0 \\ (\gamma\alpha)_{ki} &= \mu_{C_{k}A_{i}} - (\mu + \gamma_{k} + \alpha_{i}), \quad \sum_{k} (\gamma\alpha)_{ki} = \sum_{i} (\gamma\alpha)_{ki} = 0 \\ (\alpha\beta\gamma)_{ijk} &= \mu_{A_{i}B_{j}C_{k}} - (\mu + \alpha_{i} + \beta_{j} + \gamma_{k} + (\alpha\beta)_{ij} + (\beta\gamma)_{jk} + (\gamma\alpha)_{ki}), \\ \; & \qquad \sum_{i} (\alpha\beta\gamma)_{ijk} = \sum_{j} (\alpha\beta\gamma)_{ijk} = \sum_{k} (\alpha\beta\gamma)_{ijk} = \sum_{ij} (\alpha\beta\gamma)_{ijk} = \sum_{jk} (\alpha\beta\gamma)_{ijk} = \sum_{ki} (\alpha\beta\gamma)_{ijk} = 0 \\ \end{aligned} }$$
| Source of variation | Degree of freedom | Sum of squares | Mean square | Expected MS | F value |
|---|---|---|---|---|---|
| A | a-1 | SSA | MSA | σε² + nbcσA² | MSA/MSS/ABC |
| B | b-1 | SSB | MSB | σε² + nacσB² | MSB/MSS/ABC |
| C | c-1 | SSC | MSC | σε² + nabσC² | MSC/MSS/ABC |
| A × B | (a-1)(b-1) | SSA×B | MSA×B | σε² + ncσA×B² | MSA×B/MSS/ABC |
| B × C | (b-1)(c-1) | SSB×C | MSB×C | σε² + naσB×C² | MSB×C/MSS/ABC |
| C × A | (c-1)(a-1) | SSC×A | MSC×A | σε² + nbσC×A² | MSC×A/MSS/ABC |
| A × B × C | (a-1)(b-1)(c-1) | SSA×B×C | MSA×B×C | σε² + nσA×B×C² | MSA×B×C/MSS/ABC |
| Error S/ABC | abc(n-1) | SSS/ABC | MSS/ABC | σε² | |
| Total | abcn-1 | SStotal | MStotal | | |
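A sanity check on the error degrees of freedom: the seven effect dfs sum to abc-1, so the residual df is the total df minus that.

$${\displaystyle abc(n-1) = (abcn-1) - \left[ (a-1)+(b-1)+(c-1)+(a-1)(b-1)+(b-1)(c-1)+(c-1)(a-1)+(a-1)(b-1)(c-1) \right] }$$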
import pingouin as pg
from statsmodels.graphics.factorplots import interaction_plot

data = pg.read_dataset('anova3')

# Assumption
display(pg.normality(data=data, dv='Cholesterol', group=['Sex', 'Risk', 'Drug'][0])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Cholesterol', group=['Sex', 'Risk', 'Drug'][1])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Cholesterol', group=['Sex', 'Risk', 'Drug'][2])) # Shapiro-Wilk test
display(pg.homoscedasticity(data=data, dv='Cholesterol', group=['Sex', 'Risk', 'Drug'][0])) # Levene’s test
display(pg.homoscedasticity(data=data, dv='Cholesterol', group=['Sex', 'Risk', 'Drug'][1])) # Levene’s test
display(pg.homoscedasticity(data=data, dv='Cholesterol', group=['Sex', 'Risk', 'Drug'][2])) # Levene’s test

# Anova result
display(pg.anova(data=data, dv='Cholesterol', between=['Sex', 'Risk', 'Drug'], ss_type=3, detailed=True))

# Mean-grouping
data['M'] = data['Cholesterol'].mean()
data = data.set_index('Sex')
data['m_S'] = data.groupby(['Sex'])['Cholesterol'].mean()
data = data.reset_index().set_index('Risk')
data['m_R'] = data.groupby(['Risk'])['Cholesterol'].mean()
data = data.reset_index().set_index('Drug')
data['m_D'] = data.groupby(['Drug'])['Cholesterol'].mean()
data = data.reset_index().set_index(['Sex', 'Risk'])
data['m_SR'] = data.groupby(['Sex', 'Risk'])['Cholesterol'].mean()
data = data.reset_index().set_index(['Risk', 'Drug'])
data['m_RD'] = data.groupby(['Risk', 'Drug'])['Cholesterol'].mean()
data = data.reset_index().set_index(['Drug', 'Sex'])
data['m_DS'] = data.groupby(['Drug', 'Sex'])['Cholesterol'].mean()
data = data.reset_index().set_index(['Sex', 'Risk', 'Drug'])
data['m_SRD'] = data.groupby(['Sex', 'Risk', 'Drug'])['Cholesterol'].mean()

# Sum-squares
data = data.reset_index()
data['SST'] = ((data['Cholesterol'] - data['M'])**2).sum()
data['SSB_S'] = ((data['m_S'] - data['M'])**2).sum()
data['SSB_R'] = ((data['m_R'] - data['M'])**2).sum()
data['SSB_D'] = ((data['m_D'] - data['M'])**2).sum()
data['SSB_SR'] = ((data['m_SR'] - data['M'])**2).sum() - data['SSB_S'] - data['SSB_R']
data['SSB_RD'] = ((data['m_RD'] - data['M'])**2).sum() - data['SSB_R'] - data['SSB_D']
data['SSB_DS'] = ((data['m_DS'] - data['M'])**2).sum() - data['SSB_D'] - data['SSB_S']
data['SSB_SRD'] = ((data['m_SRD'] - data['M'])**2).sum() - data['SSB_S'] - data['SSB_R'] - data['SSB_D'] - data['SSB_SR'] - data['SSB_RD'] - data['SSB_DS']
data['SSW_SRD'] = data['SST'] - data['SSB_S'] - data['SSB_R'] - data['SSB_D'] - data['SSB_SR'] - data['SSB_RD'] - data['SSB_DS'] - data['SSB_SRD']  # ((data['Cholesterol'] - data['m_SRD'])**2).sum()

# Error definition(1)
data['SSE'] = data['SSW_SRD'].copy()


# Degree of freedom arguments
N = data.shape[0]
a = data['Sex'].unique().size
b = data['Risk'].unique().size
c = data['Drug'].unique().size
n = N/(a*b*c)

# Mean-squares
data['MSB_S'] = data['SSB_S'] / (a-1)
data['MSB_R'] = data['SSB_R'] / (b-1)
data['MSB_D'] = data['SSB_D'] / (c-1)
data['MSB_SR'] = data['SSB_SR'] / ((a-1)*(b-1))
data['MSB_RD'] = data['SSB_RD'] / ((b-1)*(c-1))
data['MSB_DS'] = data['SSB_DS'] / ((c-1)*(a-1))
data['MSB_SRD'] = data['SSB_SRD'] / ((a-1)*(b-1)*(c-1))

# Error Definition(2)
data['MSE'] = data['SSE'] / (a*b*c*(n-1)) # A,B,C ~ No Random Effect

# F-values
data['F_S'] = data['MSB_S'] / data['MSE']
data['F_R'] = data['MSB_R'] / data['MSE']
data['F_D'] = data['MSB_D'] / data['MSE']
data['F_SR'] = data['MSB_SR'] / data['MSE']
data['F_RD'] = data['MSB_RD'] / data['MSE']
data['F_DS'] = data['MSB_DS'] / data['MSE']
data['F_SRD'] = data['MSB_SRD'] / data['MSE']
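# p-values for the hand-computed F ratios (a sketch; in the fixed model every
# effect is tested against the common error term with abc(n-1) df)
from scipy import stats
dfe = a*b*c*(n-1)
data['p_S'] = stats.f.sf(data['F_S'], a-1, dfe)
data['p_R'] = stats.f.sf(data['F_R'], b-1, dfe)
data['p_D'] = stats.f.sf(data['F_D'], c-1, dfe)
data['p_SR'] = stats.f.sf(data['F_SR'], (a-1)*(b-1), dfe)
data['p_RD'] = stats.f.sf(data['F_RD'], (b-1)*(c-1), dfe)
data['p_DS'] = stats.f.sf(data['F_DS'], (c-1)*(a-1), dfe)
data['p_SRD'] = stats.f.sf(data['F_SRD'], (a-1)*(b-1)*(c-1), dfe)
display(data.loc[:, 'F_S':].iloc[:1].T)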

# post-hoc test
display(pg.pairwise_tukey(data=data, dv='Cholesterol', between="Sex"))
display(pg.pairwise_tukey(data=data, dv='Cholesterol', between="Risk"))
display(pg.pairwise_tukey(data=data, dv='Cholesterol', between="Drug"))
display(interaction_plot(x=data['Sex'], trace=data['Risk'], response=data['Cholesterol']))
display(interaction_plot(x=data['Risk'], trace=data['Drug'], response=data['Cholesterol']))
display(interaction_plot(x=data['Drug'], trace=data['Sex'], response=data['Cholesterol']))

Repeated measures ANOVA (r = 1)

$${\displaystyle \begin{cases} \textit{source of variance} \\ \qquad : SS_{\text{total(full)}} &= SS_{\text{subjects}} + SS_{\text{U}} + SS_{\text{V}} + SS_{\text{W}} + SS_{\text{U×V}} + SS_{\text{V×W}} + SS_{\text{W×U}} + SS_{\text{U×V×W}} \\ & \; + SS_{\text{U×S}} + SS_{\text{V×S}} + SS_{\text{W×S}} + SS_{\text{U×V×S}} + SS_{\text{V×W×S}} + SS_{\text{W×U×S}} + SS_{\text{U×V×W×S}} \\ \qquad : SS_{\text{total(reduced)}} &= SS_{\text{subjects}} + SS_{\text{U}} + SS_{\text{V}} + SS_{\text{W}} + SS_{\text{U×V}} + SS_{\text{V×W}} + SS_{\text{W×U}} + SS_{\text{U×V×W}} + SS_{\text{error}} \\ \textit{model[(U × V × W × S)]} \\ \qquad : \text{full}\; Y_{ijkl} &= \mu +\alpha_{i} + \beta_{j} + \gamma_{k} + (\alpha\beta)_{ij} + (\beta\gamma)_{jk} + (\gamma\alpha)_{ki} + (\alpha\beta\gamma)_{ijk} \\ &\; + \pi_{l} + (\alpha\pi)_{il} + (\beta\pi)_{jl} + (\gamma\pi)_{kl} + (\alpha\beta\pi)_{ijl} + (\beta\gamma\pi)_{jkl} + (\gamma\alpha\pi)_{kil} + (\alpha\beta\gamma\pi)_{ijkl} + \varepsilon_{ijkl} \\ \qquad : \text{reduced}\; Y_{ijkl} &= \mu +\alpha_{i} + \beta_{j} + \gamma_{k} + (\alpha\beta)_{ij} + (\beta\gamma)_{jk} + (\gamma\alpha)_{ki} + (\alpha\beta\gamma)_{ijk} + \pi_{l} + \varepsilon_{ijkl} \\ \end{cases} }$$
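Because subjects are fully crossed with U, V, and W, the effect degrees of freedom in the full model multiply out to the cell count, which gives a quick check on both tables below:

$${\displaystyle \left(1+(u-1)\right)\left(1+(v-1)\right)\left(1+(w-1)\right)\left(1+(n-1)\right) = uvwn \quad\Rightarrow\quad \textstyle\sum_{\text{all effects}} \mathrm{df} = uvwn - 1 }$$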

Full Model ANOVA Table

| Source of variation | Degree of freedom | Sum of squares | Mean square | Expected MS | F value |
|---|---|---|---|---|---|
| Between subjects | n-1 | SSsubjects | MSsubjects | | |
| U | u-1 | SSU | MSU | | MSU/MSU×S |
| error U × S | (u-1)(n-1) | SSU×S | MSU×S | | |
| V | v-1 | SSV | MSV | | MSV/MSV×S |
| error V × S | (v-1)(n-1) | SSV×S | MSV×S | | |
| W | w-1 | SSW | MSW | | MSW/MSW×S |
| error W × S | (w-1)(n-1) | SSW×S | MSW×S | | |
| U × V | (u-1)(v-1) | SSU×V | MSU×V | | MSU×V/MSU×V×S |
| error U × V × S | (u-1)(v-1)(n-1) | SSU×V×S | MSU×V×S | | |
| V × W | (v-1)(w-1) | SSV×W | MSV×W | | MSV×W/MSV×W×S |
| error V × W × S | (v-1)(w-1)(n-1) | SSV×W×S | MSV×W×S | | |
| W × U | (w-1)(u-1) | SSW×U | MSW×U | | MSW×U/MSW×U×S |
| error W × U × S | (w-1)(u-1)(n-1) | SSW×U×S | MSW×U×S | | |
| U × V × W | (u-1)(v-1)(w-1) | SSU×V×W | MSU×V×W | | MSU×V×W/MSU×V×W×S |
| error U × V × W × S | (u-1)(v-1)(w-1)(n-1) | SSU×V×W×S | MSU×V×W×S | | |
| Total | uvwn-1 | SStotal | MStotal | | |

Reduced Model ANOVA Table

| Source of variation | Degree of freedom | Sum of squares | Mean square | Expected MS | F value |
|---|---|---|---|---|---|
| Between subjects | n-1 | SSsubjects | MSsubjects | | |
| U | u-1 | SSU | MSU | | MSU/MSS/UVW |
| V | v-1 | SSV | MSV | | MSV/MSS/UVW |
| W | w-1 | SSW | MSW | | MSW/MSS/UVW |
| U × V | (u-1)(v-1) | SSU×V | MSU×V | | MSU×V/MSS/UVW |
| V × W | (v-1)(w-1) | SSV×W | MSV×W | | MSV×W/MSS/UVW |
| W × U | (w-1)(u-1) | SSW×U | MSW×U | | MSW×U/MSS/UVW |
| U × V × W | (u-1)(v-1)(w-1) | SSU×V×W | MSU×V×W | | MSU×V×W/MSS/UVW |
| Error S/UVW | uvw(n-1) | SSS/UVW | MSS/UVW | | |
| Total | uvwn-1 | SStotal | MStotal | | |
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM
from statsmodels.graphics.factorplots import interaction_plot
import pingouin as pg

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product(
            names = ['Factor1', 'Factor2', 'Factor3'],
            iterables = [
                ['P1', 'P2', 'P3'], 
                ['Q1', 'Q2'], 
                ['R1', 'R2', 'R3']])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(0) == 'P1')&(frame.index.get_level_values(1) == 'Q2')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=30, scale=3, size=(tfi.sum(), 1))
        
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Factor1'] + sorted(data.columns.difference(['Factor1', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Factor1'] + sorted(data.columns.difference(['Factor1', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

data = anova_data(30).stack(1).reset_index()[['Factor1', 'Factor2', 'Factor3', 'Index', 'Value']].sort_values(by=['Factor1', 'Factor2', 'Factor3', 'Index']).reset_index(drop=True)

# Assumption
display(pg.normality(data=data, dv='Value', group='Factor1')) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Value', group='Factor2')) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Value', group='Factor3')) # Shapiro-Wilk test
display(pg.sphericity(data=data, dv='Value', subject='Index', within=['Factor1', 'Factor2'])) # Mauchly’s test of sphericity
display(pg.sphericity(data=data, dv='Value', subject='Index', within=['Factor2', 'Factor3'])) # Mauchly’s test of sphericity
#display(pg.sphericity(data=data, dv='Value', subject='Index', within=['Factor1', 'Factor3'])) # Mauchly’s test of sphericity

# Anova result
#pg.rm_anova(data=data, dv='Value', within=['Factor1', 'Factor2', 'Factor3'], subject='Index')
display(AnovaRM(data=data, depvar='Value', within=['Factor1', 'Factor2', 'Factor3'], subject='Index').fit().anova_table)

# mean-grouping
data['M'] = data['Value'].mean()
data = data.set_index('Factor1')
data['m_F1'] = data.groupby('Factor1')['Value'].mean()
data = data.reset_index().set_index('Factor2')
data['m_F2'] = data.groupby('Factor2')['Value'].mean()
data = data.reset_index().set_index('Factor3')
data['m_F3'] = data.groupby('Factor3')['Value'].mean()
data = data.reset_index().set_index(['Index'])
data['m_S'] = data.groupby(['Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor1', 'Index'])
data['m_F1S'] = data.groupby(['Factor1', 'Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor2', 'Index'])
data['m_F2S'] = data.groupby(['Factor2', 'Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor3', 'Index'])
data['m_F3S'] = data.groupby(['Factor3', 'Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor1', 'Factor2'])
data['m_F12'] = data.groupby(['Factor1', 'Factor2'])['Value'].mean()
data = data.reset_index().set_index(['Factor2', 'Factor3'])
data['m_F23'] = data.groupby(['Factor2', 'Factor3'])['Value'].mean()
data = data.reset_index().set_index(['Factor3', 'Factor1'])
data['m_F31'] = data.groupby(['Factor3', 'Factor1'])['Value'].mean()
data = data.reset_index().set_index(['Factor1', 'Factor2', 'Index'])
data['m_F12S'] = data.groupby(['Factor1', 'Factor2', 'Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor2', 'Factor3', 'Index'])
data['m_F23S'] = data.groupby(['Factor2', 'Factor3', 'Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor3', 'Factor1', 'Index'])
data['m_F31S'] = data.groupby(['Factor3', 'Factor1', 'Index'])['Value'].mean()
data = data.reset_index().set_index(['Factor1', 'Factor2', 'Factor3'])
data['m_F123'] = data.groupby(['Factor1', 'Factor2', 'Factor3'])['Value'].mean()
data = data.reset_index().set_index(['Factor1', 'Factor2', 'Factor3', 'Index'])
data['m_F123S'] = data.groupby(['Factor1', 'Factor2', 'Factor3', 'Index'])['Value'].mean()


# Sum-squares
data = data.reset_index()
data['SST'] = ((data['Value'] - data['M'])**2).sum()
data['SSB_S'] = ((data['m_S'] - data['M'])**2).sum()
data['SSW_S'] = ((data['Value'] - data['m_S'])**2).sum()

data['SSB_F1'] = ((data['m_F1'] - data['M'])**2).sum()
data['SSB_F2'] = ((data['m_F2'] - data['M'])**2).sum()
data['SSB_F3'] = ((data['m_F3'] - data['M'])**2).sum()
data['SSB_F12'] = ((data['m_F12'] - data['M'])**2).sum() - data['SSB_F1'] - data['SSB_F2']
data['SSB_F23'] = ((data['m_F23'] - data['M'])**2).sum() - data['SSB_F2'] - data['SSB_F3']
data['SSB_F31'] = ((data['m_F31'] - data['M'])**2).sum() - data['SSB_F3'] - data['SSB_F1']
data['SSB_F123'] = ((data['m_F123'] - data['M'])**2).sum() - data['SSB_F1'] - data['SSB_F2'] - data['SSB_F3'] - data['SSB_F12'] - data['SSB_F23'] - data['SSB_F31']

# Error Definition(1)
data['SSB_F1S'] = ((data['m_F1S'] - data['M'])**2).sum() - data['SSB_F1'] - data['SSB_S']
data['SSB_F2S'] = ((data['m_F2S'] - data['M'])**2).sum() - data['SSB_F2'] - data['SSB_S']
data['SSB_F3S'] = ((data['m_F3S'] - data['M'])**2).sum() - data['SSB_F3'] - data['SSB_S']
data['SSB_F12S'] = ((data['m_F12S'] - data['M'])**2).sum() - data['SSB_F1'] - data['SSB_F2'] - data['SSB_S'] - data['SSB_F12'] - data['SSB_F1S'] - data['SSB_F2S']
data['SSB_F23S'] = ((data['m_F23S'] - data['M'])**2).sum() - data['SSB_F2'] - data['SSB_F3'] - data['SSB_S'] - data['SSB_F23'] - data['SSB_F2S'] - data['SSB_F3S']
data['SSB_F31S'] = ((data['m_F31S'] - data['M'])**2).sum() - data['SSB_F3'] - data['SSB_F1'] - data['SSB_S'] - data['SSB_F31'] - data['SSB_F3S'] - data['SSB_F1S']
data['SSB_F123S'] = ((data['m_F123S'] - data['M'])**2).sum() \
- data['SSB_F1'] - data['SSB_F2'] - data['SSB_F3'] - data['SSB_S'] \
- data['SSB_F12'] - data['SSB_F23'] - data['SSB_F31'] - data['SSB_F1S'] - data['SSB_F2S'] - data['SSB_F3S'] \
- data['SSB_F123'] - data['SSB_F12S'] - data['SSB_F23S'] - data['SSB_F31S']


# Degree of Freedom Arguments
N = data.shape[0]
u = data['Factor1'].unique().size
v = data['Factor2'].unique().size
w = data['Factor3'].unique().size
s = data['Index'].unique().size
n = s

# Mean-squares
data['MST'] = data['SST'] / (N-1)
data['MSB_F1'] = data['SSB_F1'] / (u-1)
data['MSB_F2'] = data['SSB_F2'] / (v-1)
data['MSB_F3'] = data['SSB_F3'] / (w-1)
data['MSB_F12'] = data['SSB_F12'] / ((u-1)*(v-1))
data['MSB_F23'] = data['SSB_F23'] / ((v-1)*(w-1))
data['MSB_F31'] = data['SSB_F31'] / ((w-1)*(u-1))
data['MSB_F123'] = data['SSB_F123'] / ((u-1)*(v-1)*(w-1))

# Error Definition(2)
data['MSB_F1S'] = data['SSB_F1S'] / ((u-1)*(n-1)) # U ~ Random Effect
data['MSB_F2S'] = data['SSB_F2S'] / ((v-1)*(n-1)) # V ~ Random Effect
data['MSB_F3S'] = data['SSB_F3S'] / ((w-1)*(n-1)) # W ~ Random Effect
data['MSB_F12S'] = data['SSB_F12S'] / ((u-1)*(v-1)*(n-1)) # U,V ~ Random Effect
data['MSB_F23S'] = data['SSB_F23S'] / ((v-1)*(w-1)*(n-1)) # V,W ~ Random Effect
data['MSB_F31S'] = data['SSB_F31S'] / ((w-1)*(u-1)*(n-1)) # W,U ~ Random Effect
data['MSB_F123S'] = data['SSB_F123S'] / ((u-1)*(v-1)*(w-1)*(n-1)) # U,V,W ~ Random Effect


# F-values
# Full Model
data['F_F1'] = data['MSB_F1'] / data['MSB_F1S']
data['F_F2'] = data['MSB_F2'] / data['MSB_F2S']
data['F_F3'] = data['MSB_F3'] / data['MSB_F3S']
data['F_F12'] = data['MSB_F12'] / data['MSB_F12S']
data['F_F23'] = data['MSB_F23'] / data['MSB_F23S']
data['F_F31'] = data['MSB_F31'] / data['MSB_F31S']
data['F_F123'] = data['MSB_F123'] / data['MSB_F123S']
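# p-values for the full-model F ratios (a sketch; each effect is tested against
# its own interaction-with-subjects error term, as in the full-model table)
data['p_F1'] = stats.f.sf(data['F_F1'], u-1, (u-1)*(n-1))
data['p_F2'] = stats.f.sf(data['F_F2'], v-1, (v-1)*(n-1))
data['p_F3'] = stats.f.sf(data['F_F3'], w-1, (w-1)*(n-1))
data['p_F12'] = stats.f.sf(data['F_F12'], (u-1)*(v-1), (u-1)*(v-1)*(n-1))
data['p_F23'] = stats.f.sf(data['F_F23'], (v-1)*(w-1), (v-1)*(w-1)*(n-1))
data['p_F31'] = stats.f.sf(data['F_F31'], (w-1)*(u-1), (w-1)*(u-1)*(n-1))
data['p_F123'] = stats.f.sf(data['F_F123'], (u-1)*(v-1)*(w-1), (u-1)*(v-1)*(w-1)*(n-1))
display(data.loc[:, 'F_F1':].iloc[:1].T)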

# post-hoc test
display(pg.pairwise_tests(data=data, dv='Value', within='Factor1', subject='Index', padjust='fdr_bh'))
display(pg.pairwise_tests(data=data, dv='Value', within='Factor2', subject='Index', padjust='fdr_bh'))
display(pg.pairwise_tests(data=data, dv='Value', within='Factor3', subject='Index', padjust='fdr_bh'))
display(interaction_plot(x=data['Factor1'], trace=data['Factor2'], response=data['Value']))
display(interaction_plot(x=data['Factor2'], trace=data['Factor3'], response=data['Value']))
display(interaction_plot(x=data['Factor3'], trace=data['Factor1'], response=data['Value']))

Mixed ANOVA

: Two between-subjects factors and one within-subjects factor

$${\displaystyle \begin{cases} \textit{source of variance} \; &: SS_{\text{total}} &&= (SS_{\text{A}} + SS_{\text{B}} + SS_{\text{A×B}} + SS_{\text{error-between}}) \\ \; & \; &&\quad + (SS_{\text{U}} + SS_{\text{U×A}} + SS_{\text{U×B}} + SS_{\text{U×A×B}} + SS_{\text{error-within}}) \\ \textit{model[A × B × (U × S)]} \; &: Y_{ijkl} &&= \mu +\alpha_{i} + \beta_{j} + (\alpha\beta)_{ij} + \pi_{k/ij}\\ \; & \; &&\quad + \gamma_{l} + (\alpha\gamma)_{il} + (\beta\gamma)_{jl} + (\alpha\beta\gamma)_{ijl} + (\pi\gamma)_{kl/ij} + \varepsilon_{ijkl} \\ \end{cases} }$$
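The same degrees-of-freedom bookkeeping applies here: the between-subjects rows partition abn-1 and the within-subjects rows partition abn(u-1).

$${\displaystyle \begin{aligned} abn-1 &= (a-1)+(b-1)+(a-1)(b-1)+ab(n-1) \\ abn(u-1) &= (u-1)\left[ 1+(a-1)+(b-1)+(a-1)(b-1)+ab(n-1) \right] \\ \end{aligned} }$$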
| Source of variation | Degree of freedom | Sum of squares | Mean square | Expected MS | F value |
|---|---|---|---|---|---|
| Between subjects | abn-1 | SSsubjects | MSsubjects | | |
| A | a-1 | SSA | MSA | | MSA/MSS/AB |
| B | b-1 | SSB | MSB | | MSB/MSS/AB |
| A × B | (a-1)(b-1) | SSA×B | MSA×B | | MSA×B/MSS/AB |
| Error S/AB | ab(n-1) | SSS/AB | MSS/AB | | |
| Within subjects | abn(u-1) | | | | |
| U | u-1 | SSU | MSU | | MSU/MSU×S/AB |
| U × A | (u-1)(a-1) | SSU×A | MSU×A | | MSU×A/MSU×S/AB |
| U × B | (u-1)(b-1) | SSU×B | MSU×B | | MSU×B/MSU×S/AB |
| U × A × B | (u-1)(a-1)(b-1) | SSU×A×B | MSU×A×B | | MSU×A×B/MSU×S/AB |
| Error U × S/AB | ab(u-1)(n-1) | SSU×S/AB | MSU×S/AB | | |
| Total | abun-1 | SStotal | MStotal | | |
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM
from statsmodels.graphics.factorplots import interaction_plot
import pingouin as pg

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product(
            names = ['Group', 'SubGroup', 'Factor'],
            iterables = [
                ['MG1', 'MG2', 'MG3'], 
                ['SG_1', 'SG_2'], 
                ['Q1', 'Q2', 'Q3']])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(0) == 'MG1')&(frame.index.get_level_values(1) == 'SG_2')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=10, scale=3, size=(tfi.sum(), 1))
  
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

# mixed anova data: Two between-subjects factors and one within-subjects factor
data = anova_data(60).stack(1).unstack(0).swaplevel(0,2, axis=0).swaplevel(1,2, axis=0).sort_index(ascending=True)
data.index = pd.MultiIndex.from_tuples(map(lambda x: (x[0][0], x[0][1], x[1]), zip(data.index, range(data.index.shape[0])))).rename(['Group', 'SubGroup', 'Index'])
data = data.stack(1).reset_index() 

# Assumption
display(pg.normality(data=data, dv='Value', group=['Group', 'SubGroup', 'Factor'][0])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Value', group=['Group', 'SubGroup', 'Factor'][1])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Value', group=['Group', 'SubGroup', 'Factor'][2])) # Shapiro-Wilk test
display(pg.homoscedasticity(data=data, dv='Value', group=['Group', 'SubGroup'][0])) # Levene’s test
display(pg.homoscedasticity(data=data, dv='Value', group=['Group', 'SubGroup'][1])) # Levene’s test
display(pg.sphericity(data=data, dv='Value', within='Factor', subject='Index'))


# Mean-grouping
data['M'] = data['Value'].mean()
data = data.set_index('Index')
data['m_S'] = data.groupby('Index')['Value'].mean()
data = data.reset_index().set_index('Group')
data['m_G1'] = data.groupby('Group')['Value'].mean()
data = data.reset_index().set_index('SubGroup')
data['m_G2'] = data.groupby('SubGroup')['Value'].mean()
data = data.reset_index().set_index('Factor')
data['m_F'] = data.groupby('Factor')['Value'].mean()
data = data.reset_index().set_index(['Group', 'SubGroup'])
data['m_G12'] = data.groupby(['Group', 'SubGroup'])['Value'].mean()
data = data.reset_index().set_index(['Group', 'Factor'])
data['m_G1F'] = data.groupby(['Group', 'Factor'])['Value'].mean()
data = data.reset_index().set_index(['SubGroup', 'Factor'])
data['m_G2F'] = data.groupby(['SubGroup', 'Factor'])['Value'].mean()
data = data.reset_index().set_index(['Group', 'SubGroup', 'Factor'])
data['m_G12F'] = data.groupby(['Group', 'SubGroup', 'Factor'])['Value'].mean()
data = data.reset_index()

# Sum-squares
data['SST'] = ((data['Value'] - data['M'])**2).sum()
data['SSB_S'] = ((data['m_S'] - data['M'])**2).sum()
data['SSW_S'] = ((data['Value'] - data['m_S'])**2).sum()
data['SSB_G1'] = ((data['m_G1'] - data['M'])**2).sum()
data['SSB_G2'] = ((data['m_G2'] - data['M'])**2).sum()
data['SSB_F'] = ((data['m_F'] - data['M'])**2).sum()
data['SSB_G12'] = ((data['m_G12'] - data['M'])**2).sum() - data['SSB_G1'] - data['SSB_G2']
data['SSB_G1F'] = ((data['m_G1F'] - data['M'])**2).sum() - data['SSB_G1'] - data['SSB_F']
data['SSB_G2F'] = ((data['m_G2F'] - data['M'])**2).sum() - data['SSB_G2'] - data['SSB_F']
data['SSB_G12F'] = ((data['m_G12F'] - data['M'])**2).sum() - data['SSB_G1'] - data['SSB_G2'] - data['SSB_F'] - data['SSB_G1F'] - data['SSB_G2F'] - data['SSB_G12']


# Error definition(1)
data['SSE_B'] = data['SSB_S'] - data['SSB_G1'] - data['SSB_G2'] - data['SSB_G12']
data['SSE_W'] = data['SSW_S'] - data['SSB_F'] - data['SSB_G1F'] - data['SSB_G2F'] - data['SSB_G12F']

# Degree of freedom arguments
N = data.shape[0]
a = data['Group'].unique().size
b = data['SubGroup'].unique().size
u = data['Factor'].unique().size
n = N/(a*b*u)

# Mean-squares
data['MSB_G1'] = data['SSB_G1'] / (a-1)
data['MSB_G2'] = data['SSB_G2'] / (b-1)
data['MSB_G12'] = data['SSB_G12'] / ((a-1)*(b-1))
data['MSB_F'] = data['SSB_F'] / (u-1)
data['MSB_G1F'] = data['SSB_G1F'] / ((a-1)*(u-1))
data['MSB_G2F'] = data['SSB_G2F'] / ((b-1)*(u-1))
data['MSB_G12F'] = data['SSB_G12F'] / ((a-1)*(b-1)*(u-1))

# Error definition(2)
data['MSE_B'] = data['SSE_B'] / (a*b*(n-1)) # A,B ~ No Random Effect
data['MSE_W'] = data['SSE_W'] / (a*b*(u-1)*(n-1)) # A,B ~ No Random Effect / U ~ Random Effect

# F-values
data['F_G1'] = data['MSB_G1'] / data['MSE_B']
data['F_G2'] = data['MSB_G2'] / data['MSE_B']
data['F_G12'] = data['MSB_G12'] / data['MSE_B']
data['F_F'] = data['MSB_F'] / data['MSE_W']
data['F_G1F'] = data['MSB_G1F'] / data['MSE_W']
data['F_G2F'] = data['MSB_G2F'] / data['MSE_W']
data['F_G12F'] = data['MSB_G12F'] / data['MSE_W']
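# p-values for the hand-computed F ratios (a sketch; between effects are tested
# against MSE_B with ab(n-1) df, within effects against MSE_W with ab(u-1)(n-1) df)
dfe_b = a*b*(n-1)
dfe_w = a*b*(u-1)*(n-1)
data['p_G1'] = stats.f.sf(data['F_G1'], a-1, dfe_b)
data['p_G2'] = stats.f.sf(data['F_G2'], b-1, dfe_b)
data['p_G12'] = stats.f.sf(data['F_G12'], (a-1)*(b-1), dfe_b)
data['p_F'] = stats.f.sf(data['F_F'], u-1, dfe_w)
data['p_G1F'] = stats.f.sf(data['F_G1F'], (a-1)*(u-1), dfe_w)
data['p_G2F'] = stats.f.sf(data['F_G2F'], (b-1)*(u-1), dfe_w)
data['p_G12F'] = stats.f.sf(data['F_G12F'], (a-1)*(b-1)*(u-1), dfe_w)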
display(data.loc[:, 'F_G1':].iloc[:1].T)

# post-hoc test
display(pg.pairwise_tukey(data=data, dv='Value', between="Group"))
display(pg.pairwise_tukey(data=data, dv='Value', between="SubGroup"))
display(pg.pairwise_tests(data=data, dv='Value', within='Factor', subject='Index', padjust='fdr_bh'))
display(interaction_plot(x=data['Group'], trace=data['SubGroup'], response=data['Value']))
display(interaction_plot(x=data['SubGroup'], trace=data['Factor'], response=data['Value']))
display(interaction_plot(x=data['Factor'], trace=data['Group'], response=data['Value']))

: One between-subjects factor and two within-subjects factors

$${\displaystyle \begin{cases} \textit{source of variance} \; &: SS_{\text{total}} &&= (SS_{\text{A}} + SS_{\text{error-between}}) \\ \; & \; && \quad + (SS_{\text{U}} + SS_{\text{U×A}} + SS_{\text{U×S/A}} \\ \; & \; && \; \quad + SS_{\text{V}} + SS_{\text{V×A}} + SS_{\text{V×S/A}} \\ \; & \; && \; \quad + SS_{\text{U×V}} + SS_{\text{U×V×A}} + SS_{\text{U×V×S/A}} + SS_{\text{error}}) \\ \textit{model[A × (U × V × S)]} \; &: Y_{ijkl} &&= \mu +\alpha_{i} + \pi_{j/i} \\ \; & \; && \quad + \beta_{k} + \gamma_{l} + (\alpha\beta)_{ik} + (\alpha\gamma)_{il} + (\beta\gamma)_{kl} + (\alpha\beta\gamma)_{ikl} \\ \; & \; && \quad + (\pi\beta)_{jk/i} + (\pi\gamma)_{jl/i} + (\pi\beta\gamma)_{jkl/i} + \varepsilon_{ijkl} \\ \end{cases} }$$
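Degrees-of-freedom check for this design: the between-subjects rows partition an-1 and the within-subjects rows partition an(uv-1).

$${\displaystyle \begin{aligned} an-1 &= (a-1) + a(n-1) \\ an(uv-1) &= an(u-1) + an(v-1) + an(u-1)(v-1), \quad an(u-1) = (u-1)+(u-1)(a-1)+a(u-1)(n-1) \\ \end{aligned} }$$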
| Source of variation | Degree of freedom | Sum of squares | Mean square | Expected MS | F value |
|---|---|---|---|---|---|
| Between subjects | an-1 | SSsubjects | MSsubjects | | |
| A | a-1 | SSA | MSA | | MSA/MSS/A |
| Error S/A | a(n-1) | SSS/A | MSS/A | | |
| Within subjects | an(uv-1) | | | | |
| U | u-1 | SSU | MSU | | MSU/MSU×S/A |
| U × A | (u-1)(a-1) | SSU×A | MSU×A | | MSU×A/MSU×S/A |
| Error U × S/A | a(u-1)(n-1) | SSU×S/A | MSU×S/A | | |
| V | v-1 | SSV | MSV | | MSV/MSV×S/A |
| V × A | (v-1)(a-1) | SSV×A | MSV×A | | MSV×A/MSV×S/A |
| Error V × S/A | a(v-1)(n-1) | SSV×S/A | MSV×S/A | | |
| U × V | (u-1)(v-1) | SSU×V | MSU×V | | MSU×V/MSU×V×S/A |
| U × V × A | (u-1)(v-1)(a-1) | SSU×V×A | MSU×V×A | | MSU×V×A/MSU×V×S/A |
| Error U × V × S/A | a(u-1)(v-1)(n-1) | SSU×V×S/A | MSU×V×S/A | | |
| Total | auvn-1 | SStotal | MStotal | | |
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM
from statsmodels.graphics.factorplots import interaction_plot
import pingouin as pg

class DataGenerator:
    def __init__(self, repeated_num:int):
        self.limit = repeated_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.limit != 0:
            self.limit -= 1
            return DataGenerator.generate()
        else:                           
            raise StopIteration

    @classmethod
    def generate(cls):
        # Design of Experiments, DOE
        index = pd.MultiIndex.from_product(
            names = ['Group', 'Factor1', 'Factor2'],
            iterables = [
                ['G1', 'G2', 'G3'], 
                ['P1', 'P2'], 
                ['Q1', 'Q2', 'Q3']])
        
        frame = pd.DataFrame(
            index = index,
            data = [None]*len(index),
            columns = ['Value'],
        ).applymap(lambda x: np.random.normal(0,1))
        
        # tfi: target feature index
        # Variable Feature Control
        tfi = (frame.index.get_level_values(0) == 'G2')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=30, scale=3, size=(tfi.sum(), 1))
        
        tfi = (frame.index.get_level_values(0) == 'G1')&(frame.index.get_level_values(1) == 'P2')
        frame.loc[lambda x: tfi] = frame.loc[lambda x: tfi] + np.random.normal(loc=10, scale=3, size=(tfi.sum(), 1))
  
        frame['Value'] = frame['Value'].astype(float)
        return frame

def anova_data(repeated_num=1):
    data = pd.concat(DataGenerator(repeated_num=repeated_num), axis=0).sort_index().reset_index()
    data = data.reset_index().rename(columns={'index':'Index'})
    data['Index'] = data.groupby(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()))['Index'].rank()
    data = data.set_index(['Group'] + sorted(data.columns.difference(['Group', 'Index', 'Value']).tolist()) + ['Index']).unstack(0)
    return data

# mixed anova data: One between-subjects factor and two within-subjects factors
data = anova_data(60).stack(1).unstack(0).unstack(0).swaplevel(0,1, axis=0).sort_index(ascending=True)
data.index = pd.MultiIndex.from_tuples(map(lambda x: (x[0][0], x[1]), zip(data.index, range(data.index.shape[0])))).rename(['Group', 'Index'])
data = data.stack(1).stack(1).reset_index()

# Assumption
display(pg.normality(data=data, dv='Value', group=['Group', 'Factor1', 'Factor2'][0])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Value', group=['Group', 'Factor1', 'Factor2'][1])) # Shapiro-Wilk test
display(pg.normality(data=data, dv='Value', group=['Group', 'Factor1', 'Factor2'][2])) # Shapiro-Wilk test
display(pg.homoscedasticity(data=data, dv='Value', group=['Group'][0])) # Levene’s test
display(pg.sphericity(data=data, dv='Value', within=['Factor1', 'Factor2'], subject='Index'))

# Mean-grouping
data['M'] = data['Value'].mean()
data = data.set_index('Index')
data['m_S'] = data.groupby('Index')['Value'].mean()
data = data.reset_index().set_index('Group')
data['m_G'] = data.groupby('Group')['Value'].mean()
data = data.reset_index().set_index('Factor1')
data['m_F1'] = data.groupby('Factor1')['Value'].mean()
data = data.reset_index().set_index('Factor2')
data['m_F2'] = data.groupby('Factor2')['Value'].mean()
data = data.reset_index().set_index(['Group', 'Factor1'])
data['m_GF1'] = data.groupby(['Group', 'Factor1'])['Value'].mean()
data = data.reset_index().set_index(['Group', 'Factor2'])
data['m_GF2'] = data.groupby(['Group', 'Factor2'])['Value'].mean()
data = data.reset_index().set_index(['Factor1', 'Factor2'])
data['m_F12'] = data.groupby(['Factor1', 'Factor2'])['Value'].mean()
data = data.reset_index().set_index(['Group', 'Factor1', 'Factor2'])
data['m_GF12'] = data.groupby(['Group', 'Factor1', 'Factor2'])['Value'].mean()
data = data.reset_index()

# Sum-squares
data['SST'] = ((data['Value'] - data['M'])**2).sum()
data['SSB_S'] = ((data['m_S'] - data['M'])**2).sum()
data['SSW_S'] = ((data['Value'] - data['m_S'])**2).sum()
data['SSB_G'] = ((data['m_G'] - data['M'])**2).sum()
data['SSB_F1'] = ((data['m_F1'] - data['M'])**2).sum()
data['SSB_F2'] = ((data['m_F2'] - data['M'])**2).sum()
data['SSB_GF1'] = ((data['m_GF1'] - data['M'])**2).sum() - data['SSB_G'] - data['SSB_F1']
data['SSB_GF2'] = ((data['m_GF2'] - data['M'])**2).sum() - data['SSB_G'] - data['SSB_F2']
data['SSB_F12'] = ((data['m_F12'] - data['M'])**2).sum() - data['SSB_F1'] - data['SSB_F2']
data['SSB_GF12'] = ((data['m_GF12'] - data['M'])**2).sum() - data['SSB_G'] - data['SSB_F1'] - data['SSB_F2'] - data['SSB_GF1'] - data['SSB_GF2'] - data['SSB_F12']

# Error definition(1)
data['SSE_B'] = data['SSB_S'] - data['SSB_G']
data['SSE_W_F1'] = data['SSW_S'] - data['SSB_F1'] - data['SSB_GF1']
data['SSE_W_F2'] = data['SSW_S'] - data['SSB_F2'] - data['SSB_GF2']
data['SSE_W_F12'] = data['SSW_S'] - data['SSB_F12'] - data['SSB_GF12']

# Degree of freedom arguments
N = data.shape[0]
a = data['Group'].unique().size
u = data['Factor1'].unique().size
v = data['Factor2'].unique().size
n = N/(a*u*v)

# Mean-squares
data['MSB_G'] = data['SSB_G'] / (a-1)
data['MSB_F1'] = data['SSB_F1'] / (u-1)
data['MSB_F2'] = data['SSB_F2'] / (v-1)
data['MSB_GF1'] = data['SSB_GF1'] / ((a-1)*(u-1))
data['MSB_GF2'] = data['SSB_GF2'] / ((a-1)*(v-1))
data['MSB_GF12'] = data['SSB_GF12'] / ((a-1)*(u-1)*(v-1))
data['MSB_F12'] = data['SSB_F12'] / ((u-1)*(v-1))

# Error definition(2)
data['MSE_B'] = data['SSE_B'] / (a*(n-1)) # A ~ No Random Effect
data['MSE_W_F1'] = data['SSE_W_F1'] / (a*(u-1)*(n-1)) # A ~ No Random Effect / U ~ Random Effect
data['MSE_W_F2'] = data['SSE_W_F2'] / (a*(v-1)*(n-1)) # A ~ No Random Effect / V ~ Random Effect
data['MSE_W_F12'] = data['SSE_W_F12'] / (a*(u-1)*(v-1)*(n-1)) # A ~ No Random Effect / U,V ~ Random Effect

# f-values
data['F_G'] = data['MSB_G']/data['MSE_B']
data['F_F1'] = data['MSB_F1']/data['MSE_W_F1']
data['F_GF1'] = data['MSB_GF1']/data['MSE_W_F1']
data['F_F2'] = data['MSB_F2']/data['MSE_W_F2']
data['F_GF2'] = data['MSB_GF2']/data['MSE_W_F2']
data['F_F12'] = data['MSB_F12']/data['MSE_W_F12']
data['F_GF12'] = data['MSB_GF12']/data['MSE_W_F12']
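# p-values for the hand-computed F ratios (a sketch; each within-subjects effect
# is tested against its own interaction-with-subjects error term from the table)
data['p_G'] = stats.f.sf(data['F_G'], a-1, a*(n-1))
data['p_F1'] = stats.f.sf(data['F_F1'], u-1, a*(u-1)*(n-1))
data['p_GF1'] = stats.f.sf(data['F_GF1'], (a-1)*(u-1), a*(u-1)*(n-1))
data['p_F2'] = stats.f.sf(data['F_F2'], v-1, a*(v-1)*(n-1))
data['p_GF2'] = stats.f.sf(data['F_GF2'], (a-1)*(v-1), a*(v-1)*(n-1))
data['p_F12'] = stats.f.sf(data['F_F12'], (u-1)*(v-1), a*(u-1)*(v-1)*(n-1))
data['p_GF12'] = stats.f.sf(data['F_GF12'], (a-1)*(u-1)*(v-1), a*(u-1)*(v-1)*(n-1))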
display(data.loc[:, 'F_G':].iloc[:1].T)

# post-hoc test
display(pg.pairwise_tukey(data=data, dv='Value', between="Group"))
display(pg.pairwise_tests(data=data, dv='Value', within='Factor1', subject='Index', padjust='fdr_bh'))
display(pg.pairwise_tests(data=data, dv='Value', within='Factor2', subject='Index', padjust='fdr_bh'))
display(interaction_plot(x=data['Group'], trace=data['Factor1'], response=data['Value']))
display(interaction_plot(x=data['Factor1'], trace=data['Factor2'], response=data['Value']))
display(interaction_plot(x=data['Factor2'], trace=data['Group'], response=data['Value']))
