@spiritnotes
2016-03-14T10:05:09.000000Z
字数 1746
阅读 3204
机器学习实践
# Load the Boston house-price regression dataset bundled with scikit-learn.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this snippet needs an older scikit-learn release to run.
from sklearn.datasets import load_boston

boston = load_boston()
该数据集是回归算法所用,数据集的总体描述如下
Data Set Characteristics:
    :Number of Instances: 506
    :Number of Attributes: 13 numeric/categorical predictive
    :Median Value (attribute 14) is usually the target
数据集中未有缺失数据。
通过作图可以看到房价与各个单变量之间的关系,可以看到有几个变量有可见的相关关系

import numpy as np
from sklearn.linear_model import LinearRegression

try:  # scikit-learn >= 0.18
    from sklearn.model_selection import KFold
except ImportError:  # older releases only provide the cross_validation module
    from sklearn.cross_validation import KFold


def _kfold_splits(data, cv):
    """Return an iterator of shuffled (train_idx, test_idx) pairs.

    Works with both KFold APIs: the model_selection class exposes
    ``split`` and takes ``n_splits``; the legacy cross_validation class is
    itself iterable and takes the sample count plus ``n_folds``.
    """
    if hasattr(KFold, "split"):
        return KFold(n_splits=cv, shuffle=True).split(data)
    return iter(KFold(len(data), n_folds=cv, shuffle=True))


def get_rmse_of_regression(data, target, cv=10, fit_intercept=True):
    """Estimate test and training RMSE of linear regression via K-fold CV.

    Args:
        data: Feature matrix; must support ``len`` and indexing by an
            integer index array (presumably a 2-D NumPy array).
        target: Target values, indexable the same way as ``data``.
        cv: Number of folds.
        fit_intercept: Passed through to ``LinearRegression``.

    Returns:
        Tuple ``(rmse_test, rmse_train)`` pooled over all folds. Folds are
        shuffled without a fixed seed, so repeated calls give slightly
        different values.
    """
    n_samples = len(data)
    sse_test = 0.0
    sse_train = 0.0

    for train_idx, test_idx in _kfold_splits(data, cv):
        model = LinearRegression(fit_intercept=fit_intercept)
        model.fit(data[train_idx], target[train_idx])

        train_resid = model.predict(data[train_idx]) - target[train_idx]
        sse_train += np.sum(train_resid * train_resid)

        test_resid = model.predict(data[test_idx]) - target[test_idx]
        sse_test += np.sum(test_resid * test_resid)

    # Every sample lands in exactly one test fold, so the pooled test error
    # covers n_samples predictions.
    rmse_test = np.sqrt(sse_test / n_samples)
    # Every sample is used for training in the other (cv - 1) folds, so the
    # pooled training error covers (cv - 1) * n_samples predictions.
    rmse_train = np.sqrt(sse_train / (cv - 1.0) / n_samples)
    return rmse_test, rmse_train
采用线性回归通过10折法进行计算其训练集的RMSE约为4.8~5之间。
通过随机测试40次,其RMSE结果如下所示,测试的RMSE相比训练集的RMSE略高,这个是过拟合现象,很正常。

我们采用弹性网再进行测试,其结果如下,可见其导致RMSE有所上升。
import numpy as np
from sklearn.linear_model import ElasticNetCV

try:  # scikit-learn >= 0.18
    from sklearn.model_selection import KFold
except ImportError:  # older releases only provide the cross_validation module
    from sklearn.cross_validation import KFold


def _kfold_splits(data, cv):
    """Return an iterator of shuffled (train_idx, test_idx) pairs.

    Works with both KFold APIs: the model_selection class exposes
    ``split`` and takes ``n_splits``; the legacy cross_validation class is
    itself iterable and takes the sample count plus ``n_folds``.
    """
    if hasattr(KFold, "split"):
        return KFold(n_splits=cv, shuffle=True).split(data)
    return iter(KFold(len(data), n_folds=cv, shuffle=True))


def get_rmse_of_ElasticNetCV(data, target, cv=10, fit_intercept=True):
    """Estimate test and training RMSE of ElasticNetCV via K-fold CV.

    Args:
        data: Feature matrix; must support ``len`` and indexing by an
            integer index array (presumably a 2-D NumPy array).
        target: Target values, indexable the same way as ``data``.
        cv: Number of outer folds used for the RMSE estimate. This is not
            forwarded to ``ElasticNetCV``, which keeps its own default
            internal cross-validation.
        fit_intercept: Passed through to ``ElasticNetCV``.

    Returns:
        Tuple ``(rmse_test, rmse_train)`` pooled over all folds. Folds are
        shuffled without a fixed seed, so repeated calls give slightly
        different values.
    """
    n_samples = len(data)
    sse_test = 0.0
    sse_train = 0.0

    for train_idx, test_idx in _kfold_splits(data, cv):
        model = ElasticNetCV(fit_intercept=fit_intercept)
        model.fit(data[train_idx], target[train_idx])

        train_resid = model.predict(data[train_idx]) - target[train_idx]
        sse_train += np.sum(train_resid * train_resid)

        test_resid = model.predict(data[test_idx]) - target[test_idx]
        sse_test += np.sum(test_resid * test_resid)

    # Every sample lands in exactly one test fold, so the pooled test error
    # covers n_samples predictions.
    rmse_test = np.sqrt(sse_test / n_samples)
    # Every sample is used for training in the other (cv - 1) folds, so the
    # pooled training error covers (cv - 1) * n_samples predictions.
    rmse_train = np.sqrt(sse_train / (cv - 1.0) / n_samples)
    return rmse_test, rmse_train

