@spiritnotes
2016-03-14T10:05:09.000000Z
字数 1746
阅读 3044
机器学习实践
from sklearn.datasets import load_boston
# Load the Boston housing dataset bundled with scikit-learn; the features and
# target are passed to the regression helpers defined further down.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this line
# requires an older scikit-learn release - confirm the pinned version.
boston = load_boston()
该数据集是回归算法所用,数据集的总体描述如下
Data Set Characteristics:
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive
:Median Value (attribute 14) is usually the target
数据集中未有缺失数据。
通过作图可以看到房价与各个单变量之间的关系,可以看到有几个变量有可见的相关关系
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
import numpy as np
def get_rmse_of_regression(data, target, cv=10, fit_intercept=True,
                           random_state=None):
    """Estimate train and test RMSE of linear regression via shuffled K-fold.

    A fresh ``LinearRegression`` is fitted on the training part of every fold.
    Squared errors are pooled over all folds before the square root is taken,
    so the result is a single RMSE rather than an average of per-fold RMSEs.

    Args:
        data: Feature matrix. It is indexed with the index arrays yielded by
            ``KFold`` (``data[train]``), so it must support that kind of
            indexing, e.g. a NumPy array.
        target: Target values, indexed the same way as ``data``.
        cv: Number of folds.
        fit_intercept: Forwarded to ``LinearRegression``.
        random_state: Seed (or RandomState) forwarded to ``KFold`` to make the
            shuffled split reproducible. The default ``None`` keeps the
            previous behaviour of a different split on every call.

    Returns:
        A ``(rmse_test, rmse_train)`` tuple.
    """
    kf = KFold(len(data), n_folds=cv, shuffle=True, random_state=random_state)
    err_test_all, err_train_all = 0.0, 0.0
    for train, test in kf:
        lr = LinearRegression(fit_intercept=fit_intercept)
        lr.fit(data[train], target[train])

        err_train = lr.predict(data[train]) - target[train]
        err_train_all += np.sum(err_train * err_train)

        err_test = lr.predict(data[test]) - target[test]
        err_test_all += np.sum(err_test * err_test)

    # Every sample lands in exactly one test fold, so the pooled test error
    # covers len(data) predictions.
    rmse_test = np.sqrt(err_test_all / len(data))
    # Every sample is part of the training set in the other cv - 1 folds, so
    # the pooled training error covers (cv - 1) * len(data) predictions.
    rmse_train = np.sqrt(err_train_all / (cv - 1.0) / len(data))
    return rmse_test, rmse_train
采用线性回归并通过10折交叉验证进行计算,其训练集的RMSE约在4.8~5之间。
通过随机测试40次,其RMSE结果如下所示。测试集的RMSE相比训练集的RMSE略高,这是过拟合现象,很正常。
我们采用弹性网再进行测试,其结果如下,可见其导致RMSE有所上升。
from sklearn.cross_validation import KFold
import numpy as np
from sklearn.linear_model import ElasticNetCV
def get_rmse_of_ElasticNetCV(data, target, cv=10, fit_intercept=True,
                             random_state=None):
    """Estimate train and test RMSE of ``ElasticNetCV`` via shuffled K-fold.

    A fresh ``ElasticNetCV`` is fitted on the training part of every fold.
    Squared errors are pooled over all folds before the square root is taken,
    so the result is a single RMSE rather than an average of per-fold RMSEs.

    Args:
        data: Feature matrix. It is indexed with the index arrays yielded by
            ``KFold`` (``data[train]``), so it must support that kind of
            indexing, e.g. a NumPy array.
        target: Target values, indexed the same way as ``data``.
        cv: Number of outer folds used for the error estimate. It is not
            passed to ``ElasticNetCV``, which keeps its own default for its
            internal cross-validation.
        fit_intercept: Forwarded to ``ElasticNetCV``.
        random_state: Seed (or RandomState) forwarded to ``KFold`` to make the
            shuffled split reproducible. The default ``None`` keeps the
            previous behaviour of a different split on every call.

    Returns:
        A ``(rmse_test, rmse_train)`` tuple.
    """
    kf = KFold(len(data), n_folds=cv, shuffle=True, random_state=random_state)
    err_test_all, err_train_all = 0.0, 0.0
    for train, test in kf:
        en = ElasticNetCV(fit_intercept=fit_intercept)
        en.fit(data[train], target[train])

        err_train = en.predict(data[train]) - target[train]
        err_train_all += np.sum(err_train * err_train)

        err_test = en.predict(data[test]) - target[test]
        err_test_all += np.sum(err_test * err_test)

    # Every sample lands in exactly one test fold, so the pooled test error
    # covers len(data) predictions.
    rmse_test = np.sqrt(err_test_all / len(data))
    # Every sample is part of the training set in the other cv - 1 folds, so
    # the pooled training error covers (cv - 1) * len(data) predictions.
    rmse_train = np.sqrt(err_train_all / (cv - 1.0) / len(data))
    return rmse_test, rmse_train