# 红酒和白酒检测 (red wine vs. white wine detection)
# -*- coding: utf-8 -*-
"""Fit a logistic-regression wine-type model and write test-set probabilities.

Reads ``train.csv`` and ``test.csv`` from the working directory, fits
``LogisticRegression`` on the columns listed in ``predictors`` with the
``type`` column as target, and writes ``test_prediction.csv`` containing the
test ``id`` column and the second column of ``predict_proba``.

Created on Tue May 22 11:39:44 2018
@author: DELL
"""
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Load the data.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Prediction target: the `type` column of the training set.
y = train.type

# Feature columns used for both fitting and prediction.
predictors = [
    'fixedAcidity',
    'volatileAcidity',
    'citricAcid',
    'residualSugar',
    'chlorides',
    'freeSulfurDioxide',
    'totalSulfurDioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]

# Feature matrix for training.
X = train[predictors]

# Define the model, then train it.
wine_model = LogisticRegression()
wine_model.fit(X, y)

# Quick sanity check, left disabled: compare X.head() against
# wine_model.predict_proba(X.head()) for the first 5 wines.

# Generate predictions for the test set. predict_proba returns one column per
# class; column 1 (the second entry of wine_model.classes_) is what gets
# written out under the `type` header.
test_features = test[predictors]
prediction = wine_model.predict_proba(test_features)
second_class_probability = prediction[:, 1]

submission = pd.DataFrame({'id': test['id'], 'type': second_class_probability})
submission.to_csv('test_prediction.csv', index=False)
# Logistic regression on two exam scores read from ex2data1.txt:
#   1. repeated random hold-out evaluation,
#   2. a plot of the decision regions with the labelled points on top,
#   3. a classification report for the last hold-out split.
#
# No `from __future__ import division` is needed: every division below has a
# float operand, so the result is the same with or without it.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Very old scikit-learn releases only ship the function here; the module
    # was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, roc_curve, auc

FEATURES = ['score1', 'score2']
N_ROUNDS = 10  # number of random train/test splits to average over

# NOTE(review): skiprows=[2] drops the third line of the file -- confirm that
# this is intentional.
data = pd.read_csv('ex2data1.txt', sep=',',
                   skiprows=[2], names=['score1', 'score2', 'result'])
score_data = data.loc[:, FEATURES]
result_data = data.result

# Accumulate the hold-out accuracy of each round in `p`.
p = 0
for _ in range(N_ROUNDS):
    x_train, x_test, y_train, y_test = train_test_split(
        score_data, result_data, test_size=0.2)
    # C is the inverse regularization strength, so 1e9 makes the penalty
    # negligible.
    model = LogisticRegression(C=1e9)
    model.fit(x_train, y_train)
    predict_y = model.predict(x_test)
    p += np.mean(predict_y == y_test)

# Plotting: everything below uses the model and test split left over from the
# final loop iteration.
pos_data = data[data.result == 1].loc[:, FEATURES]
neg_data = data[data.result == 0].loc[:, FEATURES]

h = 0.02  # grid step
# Use scalar column min/max; np.arange expects scalar bounds, not a
# one-element Series.
x_min = score_data['score1'].min() - .5
x_max = score_data['score1'].max() + .5
y_min = score_data['score2'].min() - .5
y_max = score_data['score2'].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Give the grid the same column names the model was fitted with.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=FEATURES)
Z = model.predict(grid)

# Draw the decision regions and the scatter points.
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
plt.scatter(x=pos_data.score1, y=pos_data.score2, color='black', marker='o')
plt.scatter(x=neg_data.score1, y=neg_data.score2, color='red', marker='*')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()

# Model performance on the last hold-out split.
answer = model.predict_proba(x_test)[:, 1]
# Computed but not used further in this script.
precision, recall, thresholds = precision_recall_curve(y_test, answer)
# Threshold the probabilities and cast to int so predictions have the same
# label type as y_test.
report = (answer > 0.5).astype(int)
print(classification_report(y_test, report, target_names=['neg', 'pos']))
# `p` is a sum of N_ROUNDS accuracies, so divide by N_ROUNDS to get the mean.
# NOTE: the label says "precision" but the value is mean accuracy.
print("average precision:", p / N_ROUNDS)