>>> import statsmodels.api as sm
>>> import statsmodels.formula.api as smf
>>> import pandas
>>> df = sm.datasets.get_rdataset("Guerry", "HistData").data
>>> df = df.dropna()
# step 1 Describe model, return model class
>>> mod = smf.ols(formula='Lottery ~ Literacy + Wealth + Region', data=df)
# step 2 Fit model, return result class
>>> res = mod.fit()
# step 3 Summarize model
>>> print(res.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                Lottery   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     6.636
Date:                Sun, 23 Dec 2018   Prob (F-statistic):           1.07e-05
Time:                        18:41:19   Log-Likelihood:                -375.30
No. Observations:                  85   AIC:                             764.6
Df Residuals:                      78   BIC:                             781.7
Df Model:                           6
Covariance Type:            nonrobust
===============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     38.6517      9.456      4.087      0.000      19.826      57.478
Region[T.E]  -15.4278      9.727     -1.586      0.117     -34.793       3.938
Region[T.N]  -10.0170      9.260     -1.082      0.283     -28.453       8.419
Region[T.S]   -4.5483      7.279     -0.625      0.534     -19.039       9.943
Region[T.W]  -10.0913      7.196     -1.402      0.165     -24.418       4.235
Literacy      -0.1858      0.210     -0.886      0.378      -0.603       0.232
Wealth         0.4515      0.103      4.390      0.000       0.247       0.656
==============================================================================
Omnibus:                        3.049   Durbin-Watson:                   1.785
Prob(Omnibus):                  0.218   Jarque-Bera (JB):                2.694
Skew:                          -0.340   Prob(JB):                        0.260
Kurtosis:                       2.454   Cond. No.                         371.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
>>> res.params  # 获取模型参数
Intercept      38.651655
Region[T.E]   -15.427785
Region[T.N]   -10.016961
Region[T.S]    -4.548257
Region[T.W]   -10.091276
Literacy       -0.185819
Wealth          0.451475
dtype: float64
>>> dir(res)  # 查看完整的属性列表
输入输出模型
from statsmodels.iolib import smpickle smpickle.save_pickle(obj, fname) #Save the object to file via pickling. smpickle.load_pickle(fname) #Load a previously saved object from file
# Using formulas with models that do not (yet) support them
In [22]: import patsy
In [23]: f = 'Lottery ~ Literacy * Wealth'
In [24]: y, X = patsy.dmatrices(f, df, return_type='matrix')
# y被转化为patsy.DesignMatrix,X被转化为numpy.ndarray
In [26]: print(X[:5])
[[   1.   37.   73. 2701.]
 [   1.   51.   22. 1122.]
 [   1.   13.   61.  793.]
 [   1.   46.   76. 3496.]
 [   1.   69.   83. 5727.]]
In [27]: f = 'Lottery ~ Literacy * Wealth'
In [28]: y, X = patsy.dmatrices(f, df, return_type='dataframe')
# 转化为pandas.DataFrame
In [30]: print(X[:5])
   Intercept  Literacy  Wealth  Literacy:Wealth
0        1.0      37.0    73.0           2701.0
1        1.0      51.0    22.0           1122.0
2        1.0      13.0    61.0            793.0
3        1.0      46.0    76.0           3496.0
4        1.0      69.0    83.0           5727.0
In [31]: res = sm.OLS(y, X).fit()
Return linear predicted values from a design matrix.
score(params)
Evaluate the score function at a given point
Attributes
desc
df_model
模型自由度,定义为回归矩阵的秩,如果包括常数则减1
df_resid
剩余自由度,定义为观察数减去回归矩阵的秩
Results Classes
Results Classes
结果类
RegressionResults
总结了线性回归模型的拟合
OLSResults
OLS模型的结果类
PredictionResults
QuantRegResults
QuantReg模型的结果实例
Methods
desc
aic(), bic(), bse()…
cov_params()
返回方差/协方差矩阵
eigenvals()
返回按降序排序的特征值
fvalue(), pvalues(), f_pvalue(), tvalues()
f_test(r_matrix), t_test()
F检验,t检验
get_prediction()
计算预测结果
save(fname), load(fname)
保存pickle,加载(类方法)
Examples
# Load modules and data In [1]: import numpy as np In [2]: import statsmodels.api as sm In [3]: spector_data = sm.datasets.spector.load() In [4]: spector_data.exog = sm.add_constant(spector_data.exog, prepend=False) # Fit and summarize OLS model In [5]: mod = sm.OLS(spector_data.endog, spector_data.exog) In [6]: res = mod.fit() In [7]: res.summary()
The statistical model for each observation $i$ is assumed to be $Y_i \sim F_{EDM}(\cdot \mid \theta, \phi, w_i)$ and $\mu_i = E[Y_i \mid x_i] = g^{-1}(x_i' \beta)$
Model Classes GLM(endog, exog, family=None)
Parameters:
endog (array-like)
exog (array-like)
family (family class instance) – The default is Gaussian
Methods
desc
fit()
Full fit of the model
from_formula(formula, data)
Create a Model from a formula and dataframe
predict(params)
Return linear predicted values from a design matrix.
score(params)
Evaluate the score function at a given point
Results Classes
Results Classes
结果类
GLMResults
包含GLM结果的类。
PredictionResults
Families
Families
desc
Family(link,variances)
单参数指数族的父类。
Binomial(link=None)
二项式指数族分布。
Gamma(link=None)
Gamma指数族分布。
Gaussian(link=None)
高斯指数族分布。
InverseGaussian(link=None)
InverseGaussian指数族。
NegativeBinomial(link=None,alpha=None)
负二项指数族。
Poisson(link=None)
泊松指数族。
Tweedie(link=None,var_power=None)
Tweedie。
Link Functions
>>> sm.families.family.<familyname>.links
Link Functions
desc
Link
单参数指数族的通用链接函数。
CDFLink([DBN])
使用scipy.stats分布的CDF
CLogLog
互补的log-log变换
Log
对数转换
Logit
logit变换
NegativeBinomial([α])
负二项式link函数
Power([power])
幂变换
cauchy()
Cauchy(标准Cauchy CDF)变换
cloglog
CLogLog转换link函数。
identity()
identity转换
inverse_power()
逆变换
inverse_squared()
逆平方变换
nbinom([α])
负二项式link函数。
probit([DBN])
probit(标准正态CDF)变换
Examples
In [1]: import statsmodels.api as sm In [2]: data = sm.datasets.scotland.load() In [3]: data.exog = sm.add_constant(data.exog) # Instantiate a gamma family model with the default link function. In [4]: gamma_model = sm.GLM(data.endog, data.exog, family=sm.families.Gamma()) In [5]: gamma_results = gamma_model.fit() In [6]: print(gamma_results.summary()) In [7]: gamma_results = gamma_model.fit() In [8]: gamma_results.params
In [1]: import statsmodels.api as sm In [2]: import statsmodels.formula.api as smf In [3]: data = sm.datasets.get_rdataset('epil', package='MASS').data In [4]: fam = sm.families.Poisson() In [5]: ind = sm.cov_struct.Exchangeable() In [6]: mod = smf.gee("y ~ age + trt + base", "subject", data, ...: cov_struct=ind, family=fam) In [7]: res = mod.fit() In [8]: print(res.summary())
# Load modules and data
In [1]: import statsmodels.api as sm
In [2]: data = sm.datasets.stackloss.load()
In [3]: data.exog = sm.add_constant(data.exog)
# Fit model and print summary
In [4]: rlm_model = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
In [5]: rlm_results = rlm_model.fit()
In [6]: print(rlm_results.params)
[-41.0265   0.8294   0.9261  -0.1278]
In [1]: import statsmodels.api as sm In [2]: import statsmodels.formula.api as smf In [3]: data = sm.datasets.get_rdataset("dietox", "geepack").data In [4]: md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"]) In [5]: mdf = md.fit() In [6]: print(mdf.summary())
# Load the data from Spector and Mazzeo (1980) In [1]: spector_data = sm.datasets.spector.load() In [2]: spector_data.exog = sm.add_constant(spector_data.exog) # Logit Model In [3]: logit_mod = sm.Logit(spector_data.endog, spector_data.exog) In [4]: logit_res = logit_mod.fit() Optimization terminated successfully. Current function value: 0.402801 Iterations 7 In [5]: print(logit_res.summary())
方差分析(Analysis of Variance,简称ANOVA),又称“变异数分析”,为数据分析中常见的统计模型,主要为探讨连续型(Continuous)因变量(Dependent variable)与类别型自变量(Independent variable)的关系。当自变量的因子等于或超过三个类别时,检验各类别平均值是否相等,采用方差分析。
广义t检验中,方差相等(Equality of variance)的合并t检验(Pooled T-test)视为是方差分析的一种。t检验分析两组平均数是否相等,方差分析也采用相同的计算概念,实际上,当方差分析套用在合并t检验的分析上时,产生的F值则会等于t检验的平方项。
Repeated measures Anova using least squares regression
In [1]: import statsmodels.api as sm
In [2]: from statsmodels.formula.api import ols
In [3]: moore = sm.datasets.get_rdataset("Moore", "car",
   ...:                                  cache=True)  # load data
In [4]: data = moore.data
In [5]: data = data.rename(columns={"partner.status":
   ...:                             "partner_status"})  # make name pythonic
In [6]: moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
   ...:                data=data).fit()
In [7]: table = sm.stats.anova_lm(moore_lm, typ=2)  # Type 2 ANOVA DataFrame
In [8]: print(table)
                                              sum_sq    df          F  \
C(fcategory, Sum)                          11.614700   2.0   0.276958
C(partner_status, Sum)                    212.213778   1.0  10.120692
C(fcategory, Sum):C(partner_status, Sum)  175.488928   2.0   4.184623
Residual                                  817.763961  39.0        NaN

                                            PR(>F)
C(fcategory, Sum)                         0.759564
C(partner_status, Sum)                    0.002874
C(fcategory, Sum):C(partner_status, Sum)  0.022572
Residual                                       NaN
from __future__ import print_function import numpy as np from scipy import stats import pandas as pd import matplotlib.pyplot as plt import statsmodels.api as sm from statsmodels.graphics.api import qqplot dta = sm.datasets.sunspots.load_pandas().data dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008')) del dta["YEAR"] dta.plot(figsize=(12,8)) plt.show()
>>> arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit(disp=False)
>>> print(arma_mod20.params)
const                49.659542
ar.L1.SUNACTIVITY     1.390656
ar.L2.SUNACTIVITY    -0.688571
dtype: float64
/Users/taugspurger/sandbox/statsmodels/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency A-DEC will be used.
  % freq, ValueWarning)
>>> arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit(disp=False)
/Users/taugspurger/sandbox/statsmodels/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency A-DEC will be used.
  % freq, ValueWarning)
>>> print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)
2622.636338065809 2637.5697031734 2628.606725911055
>>> print(arma_mod30.params)
const                49.749936
ar.L1.SUNACTIVITY     1.300810
ar.L2.SUNACTIVITY    -0.508093
ar.L3.SUNACTIVITY    -0.129650
dtype: float64
>>> print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)
2619.4036286964474 2638.0703350809363 2626.866613503005
# some example data In [1]: import numpy as np In [2]: import pandas In [3]: import statsmodels.api as sm In [4]: from statsmodels.tsa.api import VAR, DynamicVAR In [5]: mdata = sm.datasets.macrodata.load_pandas().data # prepare the dates index In [6]: dates = mdata[['year', 'quarter']].astype(int).astype(str) In [7]: quarterly = dates["year"] + "Q" + dates["quarter"] In [8]: from statsmodels.tsa.base.datetools import dates_from_str In [9]: quarterly = dates_from_str(quarterly) In [10]: mdata = mdata[['realgdp','realcons','realinv']] In [11]: mdata.index = pandas.DatetimeIndex(quarterly) In [12]: data = np.log(mdata).diff().dropna() # make a VAR model In [13]: model = VAR(data) In [14]: results = model.fit(2) In [15]: results.summary()
In [18]: model.select_order(15) Out[18]: <statsmodels.tsa.vector_ar.var_model.LagOrderResults at 0x10c89fef0> # 调用fit函数时,可以传递最大滞后数和order标准以用于order选择 In [19]: results = model.fit(maxlags=15, ic='aic')
预测(Forecasting)
The linear predictor is the optimal h-step ahead forecast in terms of mean-squared error: $y_t(h) = \nu + A_1 y_t(h-1) + \cdots + A_p y_t(h-p)$
我们可以使用预测函数来生成此预测。请注意,我们必须为预测指定“初始值”:
In [30]: results.test_causality('realgdp', ['realinv', 'realcons'], kind='f') Out[30]: <statsmodels.tsa.vector_ar.hypothesis_test_results.CausalityTestResults at 0x10ca15978>
In [31]: np.random.seed(1)
In [32]: import pandas.util.testing as ptest
In [33]: ptest.N = 500
In [34]: data = ptest.makeTimeDataFrame().cumsum(0)
In [35]: data
Out[35]:
                    A          B          C          D
2000-01-03   1.624345  -1.719394  -0.153236   1.301225
2000-01-04   1.012589  -1.662273  -2.585745   0.988833
2000-01-05   0.484417  -2.461821  -2.077760   0.717604
2000-01-06  -0.588551  -2.753416  -2.401793   2.580517
2000-01-07   0.276856  -3.012398  -3.912869   1.937644
...               ...        ...        ...        ...
2001-11-26  29.552085  14.274036  39.222558 -13.243907
2001-11-27  30.080964  11.996738  38.589968 -12.682989
2001-11-28  27.843878  11.927114  38.380121 -13.604648
2001-11-29  26.736165  12.280984  40.277282 -12.957273
2001-11-30  26.718447  12.094029  38.895890 -11.570447

[500 rows x 4 columns]
In [36]: var = DynamicVAR(data, lag_order=2, window_type='expanding')
stat, pv = sm.duration.survdiff(data.futime, data.death, data.sex) #以下是survdiff实施的一些其他测试程序 # Fleming-Harrington with p=1, i.e. weight by pooled survival time stat, pv = sm.duration.survdiff(data.futime, data.death, data.sex, weight_type='fh', fh_p=1) # Gehan-Breslow, weight by number at risk stat, pv = sm.duration.survdiff(data.futime, data.death, data.sex, weight_type='gb') # Tarone-Ware, weight by the square root of the number at risk stat, pv = sm.duration.survdiff(data.futime, data.death, data.sex, weight_type='tw')
In [1]: import numpy as np In [2]: import pandas as pd In [3]: import statsmodels.api as sm In [4]: df = sm.datasets.get_rdataset("Arthritis", "vcd").data In [5]: tab = pd.crosstab(df['Treatment'], df['Improved']) In [6]: tab = tab.loc[:, ["None", "Some", "Marked"]] In [7]: table = sm.stats.Table(tab)
或者,我们可以传递原始数据,让Table类为我们构建单元格数组:
In [8]: table = sm.stats.Table.from_data(df[["Treatment", "Improved"]])
In [17]: print(table.local_oddsratios)
Improved     Marked      None  Some
Treatment
Placebo    0.149425  2.230769   NaN
Treated         NaN       NaN   NaN
In [18]: taloc = sm.stats.Table2x2(np.asarray([[7, 29], [21, 13]]))
In [19]: print(taloc.oddsratio)
0.14942528735632185
In [20]: taloc = sm.stats.Table2x2(np.asarray([[29, 7], [13, 7]]))
In [21]: print(taloc.oddsratio)
2.230769230769231
In [22]: print(table.cumulative_oddsratios)
Improved     Marked      None  Some
Treatment
Placebo    0.185185  1.058824   NaN
Treated         NaN       NaN   NaN
In [23]: tab1 = np.asarray([[7, 29 + 7], [21, 13 + 7]])
In [24]: tacum = sm.stats.Table2x2(tab1)
In [25]: print(tacum.oddsratio)
0.18518518518518517
In [26]: tab1 = np.asarray([[7 + 29, 7], [21 + 13, 7]])
In [27]: tacum = sm.stats.Table2x2(tab1)
In [28]: print(tacum.oddsratio)
1.0588235294117647
马赛克图(mosaic plot)是一种非正式评估双向表中依赖性的图形方法。
from statsmodels.graphics.mosaicplot import mosaic mosaic(data)
Symmetry and homogeneity(对称性和同质性)
Symmetry(对称性) is the property
that $P_{i,j} = P_{j,i}$ for every $i$ and $j$。
Homogeneity(同质性)是行因子和列因子的边际分布相同的特性
meaning that $\sum_j P_{ij} = \sum_j P_{ji}$ for all $i$
注意,P (and T)必须是正方形,行和列类别必须相同,并且必须以相同的顺序出现。
为了说明,我们加载数据集,创建列联表,并计算行和列边距(the row and column margins)。本Table类包含分析方法r×c列联表。下面加载的数据集包含人们左眼和右眼视敏度的评估。我们首先加载数据并创建一个列联表。
In [29]: df = sm.datasets.get_rdataset("VisualAcuity", "vcd").data
In [30]: df = df.loc[df.gender == "female", :]
In [31]: tab = df.set_index(['left', 'right'])
In [32]: del tab["gender"]
In [33]: tab = tab.unstack()
In [34]: tab.columns = tab.columns.get_level_values(1)
In [35]: print(tab)
right     1     2     3    4
left
1      1520   234   117   36
2       266  1512   362   82
3       124   432  1772  179
4        66    78   205  492
# 从列联表创建一个SquareTable对象
In [36]: sqtab = sm.stats.SquareTable(tab)
In [37]: row, col = sqtab.marginal_probabilities
In [38]: print(row)
right
1    0.255049
2    0.297178
3    0.335295
4    0.112478
dtype: float64
In [39]: print(col)
right
1    0.264277
2    0.301725
3    0.328474
4    0.105524
dtype: float64
# 该summary方法打印对称性和均匀性检测结果
In [40]: print(sqtab.summary())
            Statistic P-value DF
--------------------------------
Symmetry       19.107  0.0046  6
Homogeneity    11.957  0.0083  3
--------------------------------
In [41]: table = np.asarray([[35, 21], [25, 58]])
In [42]: t22 = sm.stats.Table2x2(table)
In [43]: print(t22.summary())
                   Estimate    SE    LCB    UCB   p-value
---------------------------------------------------------
Odds ratio            3.867         1.890  7.912    0.000
Log odds ratio        1.352  0.365  0.636  2.068    0.000
Risk ratio            2.075         1.411  3.051    0.000
Log risk ratio        0.730  0.197  0.345  1.115    0.000
---------------------------------------------------------
# 请注意,风险比不是对称的,因此如果分析转置表,将获得不同的结果。
In [44]: table = np.asarray([[35, 21], [25, 58]])
In [45]: t22 = sm.stats.Table2x2(table.T)
In [46]: print(t22.summary())
                   Estimate    SE    LCB    UCB   p-value
---------------------------------------------------------
Odds ratio            3.867         1.890  7.912    0.000
Log odds ratio        1.352  0.365  0.636  2.068    0.000
Risk ratio            2.194         1.436  3.354    0.000
Log risk ratio        0.786  0.216  0.362  1.210    0.000
---------------------------------------------------------
In [47]: data = sm.datasets.china_smoking.load()
In [48]: mat = np.asarray(data.data)
In [49]: tables = [np.reshape(x.tolist()[1:], (2, 2)) for x in mat]
In [50]: st = sm.stats.StratifiedTable(tables)
In [51]: print(st.summary())
                   Estimate    LCB    UCB
-----------------------------------------
Pooled odds           2.174  1.984  2.383
Pooled log odds       0.777  0.685  0.868
Pooled risk ratio     1.519
                 Statistic P-value
-----------------------------------
Test of OR=1       280.138   0.000
Test constant OR     5.200   0.636
-----------------------
Number of tables    8
Min n             213
Max n            2900
Avg n            1052
Total n          8419
-----------------------
>>> import numpy as np >>> from statsmodels.multivariate.pca import PCA >>> x = np.random.randn(100)[:, None] >>> x = x + np.random.randn(100, 100) >>> pc = PCA(x)
注意,主成分是使用SVD计算的,因此从不构造相关矩阵,除非method =‘eig’。
PCA使用数据的协方差矩阵
>>> pc = PCA(x, standardize=False)
使用NIPALS将返回的因子数限制为1
>>> pc = PCA(x, ncomp=1, method='nipals') >>> pc.factors.shape (100, 1)