scikit-learn Study Notes (5): Generalized Linear Models

Ordinary Least Squares

from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
reg.coef_
array([ 0.5,  0.5])

If the columns of the design matrix are approximately linearly dependent, multicollinearity arises and the least-squares estimates become very sensitive to random noise in the observed targets, producing a large variance.

# Code source: Jaques Grobler
# License: BSD 3 clause


import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset
diabetes = datasets.load_diabetes()


# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
Coefficients: 
 [ 938.23786125]
Mean squared error: 2548.07
Variance score: 0.47

[Figure: linear regression fit on the diabetes test set]

Time complexity: the least-squares solution is computed via the singular value decomposition of X; for a matrix of shape (n, p) with n >= p, the cost is O(n p^2).

Ridge Regression

Ridge regression adds an L2 penalty on the coefficients; the larger alpha is, the stronger the shrinkage and the more robust the coefficients are to multicollinearity.

from sklearn import linear_model
reg = linear_model.Ridge(alpha=0.5)
reg.fit([[0, 0], [0, 0], [1, 1]], [0, 0.1, 1])
Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
reg.coef_
array([ 0.34545455,  0.34545455])
reg.intercept_
0.13636363636363641
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# X is the 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)

# #############################################################################
# Compute paths

n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# #############################################################################
# Display results

ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
Automatically created module for IPython interactive environment

[Figure: Ridge coefficients as a function of the regularization (alpha)]

Setting the regularization parameter: generalized cross-validation

RidgeCV implements ridge regression with built-in cross-validation over alpha. By default it uses generalized cross-validation (GCV), an efficient form of leave-one-out cross-validation.

from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
reg.fit([[0, 0], [0, 0], [1, 1]], [0, 0.1, 1])
RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)
reg.alpha_
0.10000000000000001

Lasso

Lasso is a linear model that estimates sparse coefficients. Lasso and its variants are fundamental to the field of compressed sensing.

from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit([[0, 0], [1, 1]], [0, 1])
Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
reg.predict([[1, 1]])
array([ 0.8])

Lasso and Elastic Net for Sparse Signals

print(__doc__)

# Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort
# License: BSD 3 clause

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn import datasets

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

rng = np.random.RandomState(42)
X = np.c_[X, rng.randn(X.shape[0], 14)] # add some bad features

# normalize data as done by Lars to allow for comparison
X /= np.sqrt(np.sum(X ** 2, axis=0))

# #############################################################################
# LassoLarsIC: least angle regression with BIC/AIC criterion

model_bic = LassoLarsIC(criterion='bic')
t1 = time.time()
model_bic.fit(X, y)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_

model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X, y)
alpha_aic_ = model_aic.alpha_


def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')

plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information-criterion for model selection (training time %.3fs)'
          % t_bic)

# #############################################################################
# LassoCV: coordinate descent

# Compute paths
print("Computing regularization path using the coordinate descent lasso...")
t1 = time.time()
model = LassoCV(cv=20).fit(X, y)
t_lasso_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.alphas_)

plt.figure()
ymin, ymax = 2300, 3800
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')

plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

# #############################################################################
# LassoLarsCV: least angle regression

# Compute paths
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1

# Display results
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()

plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: Lars (train time: %.2fs)'
          % t_lasso_lars_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

plt.show()
Automatically created module for IPython interactive environment


Computing regularization path using the coordinate descent lasso...
Computing regularization path using the Lars lasso...


[Figure: AIC and BIC information criteria for model selection (LassoLarsIC)]

[Figure: mean square error on each fold, coordinate descent (LassoCV)]

[Figure: mean square error on each fold, Lars (LassoLarsCV)]

Multi-task Lasso

MultiTaskLasso is a linear model that estimates sparse coefficients for multiple regression problems jointly:

y is a 2D array of shape (n_samples, n_tasks). The constraint is that the selected features are the same across the different regression problems (tasks).

Examples

print(__doc__)

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import MultiTaskLasso, Lasso

rng = np.random.RandomState(42)

# Generate some 2D coefficients with sine waves with random frequency and phase
n_samples, n_features, n_tasks = 100, 30, 40
n_relevant_features = 5
coef = np.zeros((n_tasks, n_features))
times = np.linspace(0, 2 * np.pi, n_tasks)
for k in range(n_relevant_features):
    coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1))

X = rng.randn(n_samples, n_features)
Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks)

coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_

# #############################################################################
# Plot support and time series
fig = plt.figure(figsize=(8, 5))
plt.subplot(1, 2, 1)
plt.spy(coef_lasso_)
plt.xlabel('Feature')
plt.ylabel('Time (or Task)')
plt.text(10, 5, 'Lasso')
plt.subplot(1, 2, 2)
plt.spy(coef_multi_task_lasso_)
plt.xlabel('Feature')
plt.ylabel('Time (or Task)')
plt.text(10, 5, 'MultiTaskLasso')
fig.suptitle('Coefficient non-zero location')

feature_to_plot = 0
plt.figure()
lw = 2
plt.plot(coef[:, feature_to_plot], color='seagreen', linewidth=lw,
         label='Ground truth')
plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue', linewidth=lw,
         label='Lasso')
plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold', linewidth=lw,
         label='MultiTaskLasso')
plt.legend(loc='upper center')
plt.axis('tight')
plt.ylim([-1.1, 1.1])
plt.show()
Automatically created module for IPython interactive environment

[Figure: coefficient non-zero locations, Lasso vs. MultiTaskLasso]

[Figure: ground truth and estimated time series for feature 0]

Elastic Net

Elastic Net combines the strengths of ridge regression and Lasso: its objective penalizes the coefficients with both an L1 and an L2 norm. The resulting coefficients are therefore sparse while still enjoying the regularization properties and robustness inherited from ridge regression.

ElasticNetCV can tune alpha and l1_ratio by cross-validation, as in the sketch below.
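
Not from the original notes, just a minimal sketch with toy data of my own: ElasticNet mixes the two penalties via l1_ratio, and ElasticNetCV picks alpha from a small candidate grid by cross-validation.

from sklearn.linear_model import ElasticNet, ElasticNetCV
X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 1, 2, 3]
# l1_ratio=0.7 means 70% L1 penalty, 30% L2 penalty
enet = ElasticNet(alpha=0.1, l1_ratio=0.7)
enet.fit(X, y)
enet.coef_
# let cross-validation choose alpha among a few candidates
enet_cv = ElasticNetCV(alphas=[0.01, 0.1, 1.0], l1_ratio=0.7, cv=2)
enet_cv.fit(X, y)
enet_cv.alpha_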

Examples:

Multi-task Elastic Net
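
The original notes leave this section empty; as a hedged sketch, MultiTaskElasticNet applies the same joint-sparsity constraint as MultiTaskLasso but with a mixed L1/L2 penalty, and expects Y of shape (n_samples, n_tasks):

from sklearn.linear_model import MultiTaskElasticNet
X = [[0, 0], [1, 1], [2, 2]]
Y = [[0, 0], [1, 1], [2, 2]]   # two tasks that share the same relevant features
reg = MultiTaskElasticNet(alpha=0.1, l1_ratio=0.5)
reg.fit(X, Y)
reg.coef_   # shape (n_tasks, n_features), same non-zero pattern in every row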

Least Angle Regression

Least Angle Regression (LARS) is a regression algorithm for high-dimensional data. LARS is similar to forward stepwise regression: at each step it finds the predictor most correlated with the response, and when two predictors are equally correlated with the response it proceeds along the direction equiangular between them (the angle bisector). A minimal usage sketch follows the lists below.

Advantages:

  • Well suited to problems where the number of features p is much larger than the number of samples n
  • Its worst-case computational cost is similar to that of ordinary least squares, yet it runs almost as fast as forward selection
  • It produces the full piecewise-linear solution path, which is very useful when cross-validating the model

Disadvantages:

  • Sensitive to noise
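
As referenced above, a minimal sketch of the plain Lars estimator (toy data of my own, in the style of the other snippets); n_nonzero_coefs caps how many predictors are allowed to enter the model:

from sklearn import linear_model
reg = linear_model.Lars(n_nonzero_coefs=1)
reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])
reg.coef_   # only one predictor receives a non-zero coefficient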

LARS Lasso

Lasso path using LARS

LassoLars is a lasso model implemented with the LARS algorithm. Unlike the coordinate-descent-based implementations, it yields the exact solution, which is piecewise linear as a function of the norm of its coefficients.

from sklearn import linear_model
reg = linear_model.LassoLars(alpha=0.1)
reg.fit([[0, 0], [1, 1]], [0, 1])
LassoLars(alpha=0.1, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)
reg.coef_
array([ 0.71715729,  0.        ])

Orthogonal Matching Pursuit (OMP)

Orthogonal Matching Pursuit
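
The original only links to the gallery example here. As a hedged sketch: OMP approximates the least-squares fit under a constraint on the number of non-zero coefficients, greedily picking at each step the feature most correlated with the current residual (toy data of my own):

import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
w = np.array([0., 0., 3., 0., 0.])   # only the third feature matters
y = X.dot(w) + 0.01 * rng.randn(20)

omp = OrthogonalMatchingPursuit(n_nonzero_coefs=1)
omp.fit(X, y)
omp.coef_   # a single non-zero coefficient is kept, close to 3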

Bayesian Regression

The L2-norm regularization used in ridge regression is equivalent to finding a maximum a posteriori (MAP) estimate of the parameters w under a Gaussian prior with precision lambda^-1.

The parameter lambda is itself treated as a random variable and estimated from the data.

See PRML for details.

Bayesian Ridge Regression

The priors over alpha and lambda are chosen to be gamma distributions, the conjugate prior for the precision of a Gaussian.

from sklearn import linear_model
X = [[0, 0], [1, 1], [2, 2], [3, 3]]
Y = [0, 1, 2, 3]
reg = linear_model.BayesianRidge()
reg.fit(X, Y)
BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
reg.predict([[1, 0]])
array([ 0.50000013])
reg.coef_
array([ 0.49999993,  0.49999993])

Due to the Bayesian framework, the weights found differ slightly from those obtained by ordinary least squares; Bayesian ridge regression is, however, more robust to ill-conditioned (ill-posed) problems.

Examples:

Automatic Relevance Determination - ARD

ARD regression is similar to Bayesian ridge regression but leads to sparser weights w. ARD poses a different prior over w: it drops the assumption of a spherical Gaussian and instead assumes an axis-aligned elliptical Gaussian distribution.

This means each weight is drawn from a Gaussian distribution centered at zero with precision $\lambda_i$.

Unlike Bayesian ridge regression, each coordinate $w_i$ has its own standard deviation $\lambda_i$, and the prior over all $\lambda_i$ is the same gamma distribution.

ARD is representative of sparse Bayesian learning and the relevance vector machine.
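
For comparison with the BayesianRidge snippet above, a minimal ARDRegression sketch on the same toy data (exact coefficient values will vary slightly between versions):

from sklearn import linear_model
X = [[0, 0], [1, 1], [2, 2], [3, 3]]
Y = [0, 1, 2, 3]
reg = linear_model.ARDRegression(compute_score=True)
reg.fit(X, Y)
reg.predict([[1, 0]])
reg.coef_   # each weight w_i has its own precision lambda_i, so weights can be pruned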

Examples:

Logistic Regression

Logistic regression, despite its name, is a linear model for classification rather than regression. It is also known in the literature as logit regression, maximum-entropy classification (MaxEnt), or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using the logistic function.

The class implementing logistic regression in scikit-learn is LogisticRegression. It can fit binary, one-vs-rest, and multinomial logistic regression with L2 or L1 regularization, as in the sketch below.
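
A minimal sketch with toy data of my own: an L2-penalized LogisticRegression, where C is the inverse regularization strength and predict_proba exposes the logistic probabilities.

from sklearn.linear_model import LogisticRegression
X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
clf = LogisticRegression(penalty='l2', C=1.0)
clf.fit(X, y)
clf.predict([[1.5, 1.5]])
clf.predict_proba([[1.5, 1.5]])   # class probabilities from the logistic function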

Examples:

Stochastic Gradient Descent (SGD)

Stochastic gradient descent is a simple yet efficient approach to fitting linear models. It is particularly useful when the number of samples is very large.

SGDClassifier and SGDRegressor fit linear models for classification and regression respectively; they support different (convex) loss functions and different penalties. For example, with loss="log" SGDClassifier fits a logistic regression model, while with loss="hinge" it fits a linear support vector machine (SVM); see the sketch below.
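
A minimal sketch of that switch (toy data and parameter values of my own; in the scikit-learn generation used in these notes the logistic loss is spelled loss='log'):

from sklearn.linear_model import SGDClassifier
X = [[0., 0.], [1., 1.], [2., 2.], [3., 3.]]
y = [0, 0, 1, 1]
# logistic regression fitted with stochastic gradient descent
log_clf = SGDClassifier(loss='log', penalty='l2', max_iter=1000, tol=1e-3)
log_clf.fit(X, y)
# linear SVM fitted with stochastic gradient descent
svm_clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000, tol=1e-3)
svm_clf.fit(X, y)
svm_clf.predict([[2.5, 2.5]])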

Perceptron

Passive Aggressive Algorithms

Robust regression: outliers and modeling errors

Robust regression is used to fit a regression model in the presence of corrupted data, such as outliers or errors in the model; a sketch of the idea follows.
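
Not in the original notes; as an illustration of the idea, RANSACRegressor repeatedly fits a base estimator on random subsets and keeps the consensus set of inliers, so a handful of corrupted samples does not drag the fit (toy data of my own):

import numpy as np
from sklearn.linear_model import LinearRegression, RANSACRegressor

rng = np.random.RandomState(0)
X = np.arange(50, dtype=float)[:, np.newaxis]
y = 2.0 * X.ravel() + 1.0
y[:5] += 100.0   # corrupt a few samples with large outliers

ransac = RANSACRegressor(LinearRegression(), random_state=0)
ransac.fit(X, y)
ransac.estimator_.coef_   # close to the true slope of 2 despite the outliers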

Polynomial Regression

from sklearn.preprocessing import PolynomialFeatures
import numpy as np
X = np.arange(6).reshape(3, 2)
X
array([[0, 1],
       [2, 3],
       [4, 5]])
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)
array([[  1.,   0.,   1.,   0.,   0.,   1.],
       [  1.,   2.,   3.,   4.,   6.,   9.],
       [  1.,   4.,   5.,  16.,  20.,  25.]])
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
model = Pipeline([('poly', PolynomialFeatures(degree=3)),
                  ('linear', LinearRegression(fit_intercept=False))])
# fit to an order-3 polynomial data
x = np.arange(5)
y = 3 - 2 * x + x ** 2 - x ** 3
model = model.fit(x[:, np.newaxis], y)
model.named_steps['linear'].coef_
array([ 3., -2.,  1., -1.])
x[:, np.newaxis]
array([[0],
       [1],
       [2],
       [3],
       [4]])
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = X[:, 0] ^ X[:, 1]
y
array([0, 1, 1, 0])
X = PolynomialFeatures(interaction_only=True).fit_transform(X).astype(int)
X
array([[1, 0, 0, 0],
       [1, 0, 1, 0],
       [1, 1, 0, 0],
       [1, 1, 1, 1]])
clf = Perceptron(fit_intercept=False, max_iter=10, tol=None,
                 shuffle=False).fit(X, y)
clf.predict(X)
array([0, 1, 1, 0])
clf.score(X, y)
1.0