[关闭]
@rianusr 2019-08-13T02:15:30.000000Z 字数 6086 阅读 1969

机器学习02:监督学习01:构建第一个分类模型

06-机器学习


0 本章概况

0.1 本章学习目标

  • 快速构建你的第一个简单分类模型
  • 简单了解分类模型的优化方向

0.2 课程安排

01本章课程安排.png-135kB

1 构建第一个分类模型

1.1 分类问题

02分类问题.png-134.7kB

1.2 kaggle竞赛经典案例--预测Titanic号上的乘客生存概率

1.2.1 业务和数据理解

03业务和数据理解.png-330kB

1.2.2 数据准备

04数据准备.png-208.2kB

(1):挑选特征变量
(2):特征变量变换
(3):缺失值处理

1.2.3 构建模型-逻辑回归模型

05模型构建.png-184.6kB

1.2.4 模型评估

06模型评估.png-105.2kB

1.2.5 模型发布

07模型发布.png-112.6kB

2 代码实现 -- 基于Numpy和pandas

  1. import numpy as np
  2. import pandas as pd
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn import metrics
  5. import matplotlib.pyplot as plt
  6. %matplotlib inline
  7. from IPython.core.interactiveshell import InteractiveShell
  8. InteractiveShell.ast_node_interactivity = "all"
  9. import warnings
  10. warnings.filterwarnings("ignore")
  11. train_src=pd.read_csv("./../data/titanic_data/train.csv")
  12. train_src.info()
  13. train_src.head()
  14. # Age分布
  15. train_src.hist(column="Age",bins=50)
  16. # 性别分布
  17. train_src["Sex"].value_counts().plot(kind="bar")
  18. # Sex与目标(是否生存)的相关性
  19. pd.crosstab(train_src["Sex"],train_src["Survived"]).plot(kind="bar")
  20. # Pclass与目标(是否生存)的相关性
  21. pd.crosstab(train_src["Pclass"],train_src["Survived"]).plot(kind="bar")
  22. # Age与目标(是否生存)的相关性
  23. train_src.age=pd.cut(train_src.Age,[0,5,15,20,35,50,60,100])
  24. pd.crosstab(train_src.age,train_src.Survived).plot(kind="bar")
  25. # 筛选Plcass,Sex,Age,SibSp,Parch,Fare六个变量作为预测变量(特征)
  26. train=train_src[["Survived","Pclass","Sex","Age","SibSp","Parch","Fare"]]
  27. # 把Sex变量的取值male替换为1,female替换为0
  28. train["Sex"]=train["Sex"].replace({"male":1,"female":0})
  29. #有117和乘客Age有缺失,用平均年龄替换
  30. age_mean=train["Age"].mean()
  31. train["Age"]=train["Age"].fillna(age_mean)
  32. # 查看一下准备好的数据集
  33. train.head(10)
  34. train.describe()
  35. # 拆分出自变量X,目标变量y
  36. train_X = train.ix[:,1:] # 训练集自变量
  37. train_y = train["Survived"] #训练集因变量
  38. # 使用逻辑回归算法训练模型
  39. lr = LogisticRegression() #使用默认参数
  40. lr.fit(train_X,train_y) #训练
  41. # 查看lr模型的系数
  42. print (lr.coef_)
  43. print(train_X.columns)
  44. pd.DataFrame(list(zip(np.transpose(lr.coef_),train_X.columns)),columns=["coef","columns"])
  45. train_y_pred=lr.predict(train_X) #对训练集进行预测,输出标签
  46. train_y_pred_prob=lr.predict_proba(train_X) # 对训练集进行预测,输出概率
  47. print(train_y_pred)
  48. print(train_y_pred_prob)
  49. # 误分类矩阵
  50. cnf_matrix=metrics.confusion_matrix(train_y,train_y_pred)
  51. print(cnf_matrix)
  52. # 准确率
  53. precision = metrics.accuracy_score(train_y,train_y_pred)
  54. print(precision)
  55. # 更直观一点的展现误分类矩阵
  56. def show_confusion_matrix(cnf_matrix,class_labels):
  57. plt.matshow(cnf_matrix,cmap=plt.cm.YlGn,alpha=0.7)
  58. ax=plt.gca()
  59. ax.set_xlabel("Predicted Label",fontsize=16)
  60. ax.set_xticks(range(0,len(class_labels)))
  61. ax.set_xticklabels(class_labels,rotation=45)
  62. ax.set_ylabel("Actual Label",fontsize=16,rotation=90)
  63. ax.set_yticks(range(0,len(class_labels)))
  64. ax.set_yticklabels(class_labels)
  65. ax.xaxis.set_label_position("top")
  66. ax.xaxis.tick_top()
  67. for row in range(len(cnf_matrix)):
  68. for col in range(len(cnf_matrix[row])):
  69. ax.text(col,row,cnf_matrix[row][col],va="center",ha="center",fontsize=16)
  70. class_labels=[0,1]
  71. show_confusion_matrix(cnf_matrix,class_labels)
  72. # 测试数据准备,与训练集的准备完全一致
  73. test_src=pd.read_csv("./../data/titanic_data/test.csv")
  74. test=test_src[["PassengerId","Pclass","Sex","Age","SibSp","Parch","Fare"]]
  75. test["Sex"].replace({"male":1,"female":0},inplace=True)
  76. test["Age"].fillna(age_mean,inplace=True)
  77. # Fare船票价格在测试集中出现了控制,用训练集的平均值替换
  78. test["Fare"].fillna(round(train["Fare"].mean()),inplace=True)
  79. # 对测试数据预测
  80. test_X=test.ix[:,1:]
  81. test_y_pred = lr.predict(test_X) #对测试集进行预测
  82. test_pred = pd.DataFrame({"PassengerId":test["PassengerId"],"Survived":test_y_pred.astype(int)})
  83. test_pred.to_csv("./../data/titanic_data/test_pred_0601.csv",index=False)
  84. # 查看预测结果
  85. test_pred.head()

3 用测试集对模型进行交叉验证

3.1 课程目标

08课程目标.png-69kB

3.2 为什么要用测试集验证模型?

09为什么要用测试集验证模型.png-280.3kB

3.3 数据拆分

10数据拆分.png-132.3kB

3.4 代码层面

11代码层面.png-657.4kB

3.5 代码演示:拆分数据集,训练集进行训练,测试集进行预测

  1. import numpy as np
  2. import pandas as pd
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn import metrics
  5. from sklearn import model_selectiom
  6. import matplotlib.pyplot as plt
  7. %matplotlib inline
  8. from IPython.core.interactiveshell import InteractiveShell
  9. InteractiveShell.ast_node_interactivity = "all"
  10. import warnings
  11. warnings.filterwarnings("ignore")
  12. train_src=pd.read_csv("./../data/titanic_data/train.csv")
  13. train_src.info()
  14. train_src.head()
  15. # Age分布
  16. train_src.hist(column="Age",bins=50)
  17. # 性别分布
  18. train_src["Sex"].value_counts().plot(kind="bar")
  19. # Sex与目标(是否生存)的相关性
  20. pd.crosstab(train_src["Sex"],train_src["Survived"]).plot(kind="bar")
  21. # Pclass与目标(是否生存)的相关性
  22. pd.crosstab(train_src["Pclass"],train_src["Survived"]).plot(kind="bar")
  23. # Age与目标(是否生存)的相关性
  24. train_src.age=pd.cut(train_src.Age,[0,5,15,20,35,50,60,100])
  25. pd.crosstab(train_src.age,train_src.Survived).plot(kind="bar")
  26. # 筛选Plcass,Sex,Age,SibSp,Parch,Fare六个变量作为预测变量(特征)
  27. train=train_src[["Survived","Pclass","Sex","Age","SibSp","Parch","Fare"]]
  28. # 把Sex变量的取值male替换为1,female替换为0
  29. train["Sex"]=train["Sex"].replace({"male":1,"female":0})
  30. #有117和乘客Age有缺失,用平均年龄替换
  31. age_mean=train["Age"].mean()
  32. train["Age"]=train["Age"].fillna(age_mean)
  33. # 查看一下准备好的数据集
  34. train.head(10)
  35. train.describe()
  36. # 拆分出自变量X,目标变量y
  37. train_X = train.ix[:,1:] # 训练集自变量
  38. train_y = train["Survived"] #训练集因变量
  39. X_train,X_test,y_train,y_test=model_selection.train_test_split(train_X,train_y,test_size=0.3,random_state=42)
  40. # 使用逻辑回归算法训练模型
  41. lr = LogisticRegression() #使用默认参数
  42. lr.fit(X_train,y_train) #训练
  43. y_train_pre=lr.predict(X_train) # 对训练集进行预测
  44. metrics.accuracy_score(y_train,y_train_pre) # 训练集准确率
  45. y_test_pre=lr.predict(X_test) # 对测试集进行预测
  46. metrics.accuracy_score(y_test,y_test_pre)

4 尝试其他的分类算法

12尝试其他分类算法.png-205.8kB

  1. from sklearn.linear_model import LogisticRegression
  2. from sklearn.svm import SVC,LinearSVC
  3. from sklearn.neighbors import KNeighborsClassifier
  4. from sklearn.tree import DecisionTreeClassifier
  5. from sklearn.ensemble import RandomForestClassifier

4.1 SVM

  1. svc=SVC()
  2. svc.fit(X_train,y_train)
  3. print("train accurary:",svc.score(X_train,y_train))
  4. print("test accurary:",svc.score(X_test,y_test))

4.2 决策树

  1. dtree=DecisionTreeClassifier()
  2. dtree.fit(X_train,y_train)
  3. print("train accurary:",dtree.score(X_train,y_train))
  4. print("test accurary:",dtree.score(X_test,y_test))

4.3 随机森林

  1. random_forest=RandomForestClassifier(n_estimators=10)
  2. random_forest.fit(X_train,y_train)
  3. print("train accurary:",random_forest.score(X_train,y_train))
  4. print("test accurary:",random_forest.score(X_test,y_test))

4.4 K近邻

  1. knn=KNeighborsClassifier(n_neighbors=3)
  2. knn.fit(X_train,y_train)
  3. print("train accurary:",knn.score(X_train,y_train))
  4. print("test accurary:",knn.score(X_test,y_test))

4.5 各个算法预测结果对比

image_1cetlq9emn2fjdmgco6ovcqu5m.png-100.5kB

5 准备一个更好的训练集

5.1 准备更好的训练集

13更好的训练集1.png-199.1kB

5.2 提取有价值的特征

13更好的训练集2.png-156kB

5.3 缺失值处理

15缺失值处理.png-91.4kB

5.4 特征变量转化处理

16特征变量转化处理.png-108.1kB

5.5 特征变量汇总

17变量汇总.png-662kB

5.6 新模型的变量权重

18新模型的变量权重.png-668.1kB

5.7 新模型评估

19模型评估.png-235.8kB

5.8 新模型发布

20新模型发部.png-141.9kB

5.9 代码演示

#E:\Jupyter_workspace\Scikit-learn video learning\监督学习\准备一个更好的训练集.ipynb

6 将多个模型的结果融合起来

6.1 多个模型的融合方法1:投票法(Voting)

21多个模型的融合方法-投票法.png-153.6kB

6.2 模型融合后的结果对比

22模型融合后的结果对比.png-149.9kB
模型融合存在过拟合的现象

7 模型优化的三个要素

23模型优化-1.png-251kB
25模型优化-2.png-159.7kB

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注