[关闭]
@ZSCDumin 2018-06-05T11:38:25.000000Z 字数 2480 阅读 454

红葡萄酒和白葡萄酒分类检测

  # -*- coding: utf-8 -*-
  """Fit a logistic regression on wine measurements and write predictions.

  Reads ``train.csv`` and ``test.csv`` from the working directory, trains a
  default ``LogisticRegression`` on the feature columns listed below with the
  ``type`` column as the target, and writes ``test_prediction.csv`` holding
  each test row's ``id`` alongside column 1 of ``predict_proba``.

  Created on Tue May 22 11:39:44 2018
  @author: DELL
  """
  import pandas as pd
  import numpy as np
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.linear_model import LogisticRegression

  # Feature columns fed to the classifier; both CSV files must contain them.
  FEATURE_COLUMNS = [
      'fixedAcidity', 'volatileAcidity', 'citricAcid', 'residualSugar',
      'chlorides', 'freeSulfurDioxide', 'totalSulfurDioxide',
      'density', 'pH', 'sulphates', 'alcohol', 'quality',
  ]

  # Load the labelled training rows and the unlabelled rows to score.
  train_frame = pd.read_csv("train.csv")
  test_frame = pd.read_csv("test.csv")

  # Train on the structured feature matrix against the `type` target column.
  classifier = LogisticRegression()
  classifier.fit(train_frame[FEATURE_COLUMNS], train_frame.type)

  # Column 1 of predict_proba is the probability of the classifier's second
  # class (in `classes_` order); that value is what gets submitted.
  second_class_proba = classifier.predict_proba(test_frame[FEATURE_COLUMNS])[:, 1]

  # Write one row per test sample, without the DataFrame index.
  pd.DataFrame(
      {'id': test_frame['id'], 'type': second_class_proba}
  ).to_csv('test_prediction.csv', index=False)
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. from __future__ import division
  4. import numpy as np
  5. import matplotlib.pyplot as plt
  6. import pandas as pd
  7. from sklearn.cross_validation import train_test_split
  8. from sklearn.linear_model import LogisticRegression
  9. from sklearn.metrics import classification_report
  10. from sklearn.metrics import precision_recall_curve, roc_curve, auc
  11. data = pd.read_csv('ex2data1.txt', sep=',', \
  12. skiprows=[2], names=['score1','score2','result'])
  13. score_data = data.loc[:,['score1','score2']]
  14. result_data = data.result
  15. p = 0
  16. for i in xrange(10):
  17. x_train, x_test, y_train, y_test = \
  18. train_test_split(score_data, result_data, test_size = 0.2)
  19. model = LogisticRegression(C=1e9)
  20. model.fit(x_train, y_train)
  21. predict_y = model.predict(x_test)
  22. p += np.mean(predict_y == y_test)
  23. # 绘制图像
  24. pos_data = data[data.result == 1].loc[:,['score1','score2']]
  25. neg_data = data[data.result == 0].loc[:,['score1','score2']]
  26. h = 0.02
  27. x_min, x_max = score_data.loc[:, ['score1']].min() - .5, score_data.loc[:, ['score1']].max() + .5
  28. y_min, y_max = score_data.loc[:, ['score2']].min() - .5, score_data.loc[:, ['score2']].max() + .5
  29. xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
  30. Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
  31. # 绘制边界和散点
  32. Z = Z.reshape(xx.shape)
  33. plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
  34. plt.scatter(x=pos_data.score1, y=pos_data.score2, color='black', marker='o')
  35. plt.scatter(x=neg_data.score1, y=neg_data.score2, color='red', marker='*')
  36. plt.xlim(xx.min(), xx.max())
  37. plt.ylim(yy.min(), yy.max())
  38. plt.show()
  39. # 模型表现
  40. answer = model.predict_proba(x_test)[:,1]
  41. precision, recall, thresholds = precision_recall_curve(y_test, answer)
  42. report = answer > 0.5
  43. print(classification_report(y_test, report, target_names = ['neg', 'pos']))
  44. print("average precision:", p/100)
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注