Дисперсионный анализ

https://www.marsja.se/four-ways-to-conduct-one-way-anovas-using-python/

Пример кода в коммнтариях к курсу по статистике А.Карпова на Степике https://stepik.org/lesson/8083/step/11?unit=1362

import pandas as pd
import scipy.stats as stats
URL = 'https://stepik.org/media/attachments/lesson/8083/genetherapy.csv'
data = pd.read_csv(URL)
A = data[data["Therapy"] == "A"]["expr"]
B = data[data["Therapy"] == "B"]["expr"]
C = data[data["Therapy"] == "C"]["expr"]
D = data[data["Therapy"] == "D"]["expr"]
stats.f_oneway(A, B, C, D)
Получается результат:
F_onewayResult(statistic=8.0373024811439908, pvalue=0.00015249722895229536)

 

from scipy.stats import stats import pandas as pd import numpy as np def ssw(main_data: pd.DataFrame, group_col: str, value_col: str): group_names = data[group_col].unique() return np.sum([ ( main_data.loc[main_data[group_col] == group, value_col] - main_data.loc[main_data[group_col] == group, value_col].mean() ) ** 2 for group in group_names ]) def ssb(main_data: pd.DataFrame, group_col: str, value_col: str): group_names = data[group_col].unique() overall_mean = main_data[value_col].mean() return np.sum([ len(main_data.loc[main_data[group_col] == group, value_col]) * ( main_data.loc[main_data[group_col] == group, value_col].mean() - overall_mean ) ** 2 for group in group_names ]) # Файл данных должен лежать в папке с файлом .py (или .ipynb) data = pd.read_csv('genetherapy.csv') groups = data['Therapy'].unique() subsets = [data.query('Therapy == @group')['expr'] for group in groups] ssb_df = len(groups) - 1 ssw_df = data.shape[0] - len(groups) SSW = ssw(data, 'Therapy', 'expr') SSB = ssb(data, 'Therapy', 'expr') f_val, p_val = stats.f_oneway(*subsets) anova_results = pd.DataFrame([ { 'Df': ssb_df, 'Sum Sq': '{0:.2f}'.format(SSB), 'Mean Sq': '{0:.2f}'.format(SSB / ssb_df), 'F-value': '{0:.2f}'.format(f_val), 'p-value': '{0:.4f}'.format(p_val), }, { 'Df': ssw_df, 'Sum Sq': '{0:.2f}'.format(SSW), 'Mean Sq': '{0:.2f}'.format(SSW / ssw_df), 'F-value': '', 'p-value': '', }, ], index=['Therapy', 'Residuals']) print(anova_results)