@zhengyuhong 2015-04-08

scikit-learn Notes

Python · Machine Learning · Data Mining


Model Selection

Choosing an appropriate model

SVM

sklearn.svm.SVC

  from sklearn import svm
  from sklearn import datasets

  clf = svm.SVC()
  iris = datasets.load_iris()
  # train on all but the last sample, then predict the held-out sample
  X, Y = iris.data[:-1], iris.target[:-1]
  clf.fit(X, Y)
  test_x = iris.data[-1]
  print(clf.predict([test_x]))
  print(iris.target[-1])

  class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)

sklearn.svm.LinearSVC

  class sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)
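
A minimal usage sketch (the iris data and the three-sample prediction are illustrative choices, not from the original notes):

  from sklearn import datasets
  from sklearn.svm import LinearSVC

  iris = datasets.load_iris()
  clf = LinearSVC(C=1.0)              # linear kernel, liblinear-based solver
  clf.fit(iris.data, iris.target)
  print(clf.predict(iris.data[:3]))   # classes for the first three samples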

sklearn.svm.NuSVC

  class sklearn.svm.NuSVC(nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)

sklearn.svm.SVR

  from sklearn.svm import SVR
  import numpy as np

  # random regression data: 10 samples, 5 features
  n_samples, n_features = 10, 5
  np.random.seed(0)
  y = np.random.randn(n_samples)
  X = np.random.randn(n_samples, n_features)
  clf = SVR(C=1.0, epsilon=0.2)
  clf.fit(X, y)

  class sklearn.svm.SVR(kernel='rbf', degree=3, gamma=0.0, coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)

sklearn.svm.LinearSVR

  class sklearn.svm.LinearSVR(epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000)
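
A minimal usage sketch on synthetic data (the random data and the linear target below are illustrative assumptions):

  import numpy as np
  from sklearn.svm import LinearSVR

  np.random.seed(0)
  X = np.random.randn(20, 3)            # 20 samples, 3 features (made-up data)
  y = 2.0 * X[:, 0] + 0.5               # simple linear target
  reg = LinearSVR(C=1.0, epsilon=0.0)
  reg.fit(X, y)
  print(reg.coef_)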

sklearn.svm.NuSVR

  from sklearn.svm import NuSVR
  import numpy as np

  n_samples, n_features = 10, 5
  np.random.seed(0)
  y = np.random.randn(n_samples)
  X = np.random.randn(n_samples, n_features)
  clf = NuSVR(C=1.0, nu=0.1)
  clf.fit(X, y)

sklearn.linear_model.LinearRegression

  from sklearn import linear_model

  clf = linear_model.LinearRegression()
  clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
  print(clf.coef_)

  class sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)

linear_model

sklearn.linear_model.Ridge

  from sklearn import linear_model

  clf = linear_model.Ridge(alpha=.5)
  clf.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
  print(clf.coef_)
  print(clf.intercept_)

  class sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')

sklearn.linear_model.RidgeCV

  from sklearn import linear_model

  clf = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
  clf.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
  print(clf.alpha_)   # the alpha selected by cross-validation

  class sklearn.linear_model.RidgeCV(alphas=array([ 0.1, 1., 10. ]), fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False)

Bayesian Ridge Regression

  from sklearn import linear_model

  X = [[0., 0.], [1., 1.], [2., 2.], [3., 3.]]
  Y = [0., 1., 2., 3.]
  clf = linear_model.BayesianRidge()
  clf.fit(X, Y)
  print(clf.predict([[1, 0.]]))
  print(clf.coef_)

  class sklearn.linear_model.BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False)

Logistic regression

  class sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0)
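
A minimal usage sketch (the two-class toy data is an illustrative assumption):

  from sklearn.linear_model import LogisticRegression

  X = [[0., 0.], [1., 1.], [2., 2.], [3., 3.]]   # made-up two-class data
  y = [0, 0, 1, 1]
  clf = LogisticRegression(C=1.0, penalty='l2')
  clf.fit(X, y)
  print(clf.predict([[1.5, 1.5]]))
  print(clf.predict_proba([[1.5, 1.5]]))          # per-class probability estimates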

sklearn.preprocessing.PolynomialFeatures

  from sklearn.preprocessing import PolynomialFeatures
  import numpy as np

  X = np.arange(6).reshape(3, 2)
  poly = PolynomialFeatures(degree=2)
  print(poly.fit_transform(X))

  # learning XOR with interaction-only features and a Perceptron
  from sklearn.linear_model import Perceptron

  X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
  y = X[:, 0] ^ X[:, 1]
  X = PolynomialFeatures(interaction_only=True).fit_transform(X)
  clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X, y)
  print(clf.score(X, y))

The features of X have been transformed from [x1, x2] to [1, x1, x2, x1², x1·x2, x2²], and can now be used within any linear model.
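
For the X = np.arange(6).reshape(3, 2) array used above, the degree-2 expansion can be written out by hand from the column pattern [1, x1, x2, x1², x1·x2, x2²]:

  # X = [[0, 1],
  #      [2, 3],
  #      [4, 5]]
  # poly.fit_transform(X) gives:
  # [[ 1.  0.  1.  0.  0.  1.]
  #  [ 1.  2.  3.  4.  6.  9.]
  #  [ 1.  4.  5. 16. 20. 25.]]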

Pipeline and FeatureUnion: combining estimators

  import numpy as np
  from sklearn.preprocessing import PolynomialFeatures
  from sklearn.linear_model import LinearRegression
  from sklearn.pipeline import Pipeline

  model = Pipeline([('poly', PolynomialFeatures(degree=3)),
                    ('linear', LinearRegression(fit_intercept=False))])
  # fit to an order-3 polynomial data
  x = np.arange(5)
  y = 3 - 2 * x + x ** 2 - x ** 3
  model = model.fit(x[:, np.newaxis], y)
  print(model.named_steps['linear'].coef_)
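
The heading also mentions FeatureUnion, which concatenates the outputs of several transformers side by side. A minimal sketch (the PCA/SelectKBest combination on iris is an illustrative choice):

  from sklearn import datasets
  from sklearn.pipeline import FeatureUnion
  from sklearn.decomposition import PCA
  from sklearn.feature_selection import SelectKBest

  iris = datasets.load_iris()
  combined = FeatureUnion([('pca', PCA(n_components=2)),
                           ('kbest', SelectKBest(k=1))])
  # columns of X_new = 2 PCA components followed by the 1 selected feature
  X_new = combined.fit_transform(iris.data, iris.target)
  print(X_new.shape)   # (150, 3)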

sklearn.linear_model.SGDClassifier

  from sklearn.linear_model import SGDClassifier

  X = [[0., 0.], [1., 1.]]
  y = [0, 1]
  clf = SGDClassifier(loss="hinge", penalty="l2")
  clf.fit(X, y)

  class sklearn.linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, average=False)

To get the signed distance to the hyperplane use SGDClassifier.decision_function:

  print(clf.decision_function([[2., 2.]]))
  # array([ 29.6...])

Using loss="log" or loss="modified_huber" enables the predict_proba method, which gives a vector of probability estimates P(y|x) per sample x:

  clf = SGDClassifier(loss="log").fit(X, y)
  print(clf.predict_proba([[1., 1.]]))

sklearn.linear_model.SGDRegressor

  import numpy as np
  from sklearn import linear_model

  n_samples, n_features = 10, 5
  np.random.seed(0)
  y = np.random.randn(n_samples)
  X = np.random.randn(n_samples, n_features)
  clf = linear_model.SGDRegressor()
  clf.fit(X, y)

Naive Bayes

sklearn.naive_bayes.GaussianNB

  import numpy as np
  from sklearn.naive_bayes import GaussianNB

  X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
  Y = np.array([1, 1, 1, 2, 2, 2])
  clf = GaussianNB()
  clf.fit(X, Y)
  print(clf.predict([[-0.8, -1]]))
  # partial_fit supports incremental training; pass the full set of classes up front
  clf_pf = GaussianNB()
  clf_pf.partial_fit(X, Y, np.unique(Y))
  print(clf_pf.predict([[-0.8, -1]]))

sklearn.naive_bayes.MultinomialNB

  import numpy as np
  from sklearn.naive_bayes import MultinomialNB

  X = np.random.randint(5, size=(6, 100))
  y = np.array([1, 2, 3, 4, 5, 6])
  clf = MultinomialNB()
  clf.fit(X, y)
  print(clf.predict(X[2:3]))

  class sklearn.naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

sklearn.naive_bayes.BernoulliNB

  import numpy as np
  from sklearn.naive_bayes import BernoulliNB

  X = np.random.randint(2, size=(6, 100))
  Y = np.array([1, 2, 3, 4, 4, 5])
  clf = BernoulliNB()
  clf.fit(X, Y)
  print(clf.predict(X[2:3]))

Decision Trees

sklearn.tree.DecisionTreeClassifier

  from sklearn import tree

  X = [[0, 0], [1, 1]]
  Y = [0, 1]
  clf = tree.DecisionTreeClassifier()
  clf = clf.fit(X, Y)

  class sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None)

sklearn.tree.DecisionTreeRegressor

  from sklearn import tree

  X = [[0, 0], [2, 2]]
  y = [0.5, 2.5]
  clf = tree.DecisionTreeRegressor()
  clf = clf.fit(X, y)
  print(clf.predict([[1, 1]]))

  class sklearn.tree.DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None)

Ensemble Methods

sklearn.ensemble.BaggingClassifier

  from sklearn.ensemble import BaggingClassifier
  from sklearn.neighbors import KNeighborsClassifier

  bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

  class sklearn.ensemble.BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1, random_state=None, verbose=0)

sklearn.ensemble.BaggingRegressor

  class sklearn.ensemble.BaggingRegressor(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1, random_state=None, verbose=0)
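
A minimal usage sketch, mirroring the BaggingClassifier example above (the random regression data and the tree base estimator are illustrative assumptions):

  import numpy as np
  from sklearn.ensemble import BaggingRegressor
  from sklearn.tree import DecisionTreeRegressor

  np.random.seed(0)
  X = np.random.randn(50, 4)                       # made-up regression data
  y = X[:, 0] - 2 * X[:, 1]
  reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=10,
                         max_samples=0.5, max_features=0.5)
  reg.fit(X, y)
  print(reg.predict(X[:2]))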

sklearn.ensemble.AdaBoostClassifier

  class sklearn.ensemble.AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
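
A minimal usage sketch on iris (the dataset choice is illustrative; the default base estimator is a depth-1 decision tree):

  from sklearn import datasets
  from sklearn.ensemble import AdaBoostClassifier

  iris = datasets.load_iris()
  clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
  clf.fit(iris.data, iris.target)
  print(clf.score(iris.data, iris.target))   # training accuracy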

sklearn.ensemble.AdaBoostRegressor

  class sklearn.ensemble.AdaBoostRegressor(base_estimator=None, n_estimators=50, learning_rate=1.0, loss='linear', random_state=None)

sklearn.ensemble.GradientBoostingClassifier

  from sklearn.datasets import make_hastie_10_2
  from sklearn.ensemble import GradientBoostingClassifier

  X, y = make_hastie_10_2(random_state=0)
  X_train, X_test = X[:2000], X[2000:]
  y_train, y_test = y[:2000], y[2000:]
  clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                   max_depth=1, random_state=0).fit(X_train, y_train)
  print(clf.score(X_test, y_test))

  class sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)

sklearn.ensemble.GradientBoostingRegressor

  class sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
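
A minimal usage sketch on synthetic data (the random data and target are illustrative assumptions):

  import numpy as np
  from sklearn.ensemble import GradientBoostingRegressor

  np.random.seed(0)
  X = np.random.randn(100, 4)                      # made-up regression data
  y = X[:, 0] ** 2 + X[:, 1]
  est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
  est.fit(X, y)
  print(est.predict(X[:2]))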

sklearn.ensemble.RandomForestClassifier

  class sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
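
A minimal usage sketch on iris (the dataset choice is illustrative):

  from sklearn import datasets
  from sklearn.ensemble import RandomForestClassifier

  iris = datasets.load_iris()
  clf = RandomForestClassifier(n_estimators=10, random_state=0)
  clf.fit(iris.data, iris.target)
  print(clf.feature_importances_)      # per-feature importance scores
  print(clf.predict(iris.data[:3]))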

sklearn.ensemble.RandomForestRegressor

  class sklearn.ensemble.RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)

Unsupervised Learning

sklearn.mixture.GMM

  import numpy as np
  from sklearn import mixture

  np.random.seed(1)
  g = mixture.GMM(n_components=2)
  # Generate random observations with two modes centered on 0
  # and 10 to use for training.
  obs = np.concatenate((np.random.randn(100, 1),
                        10 + np.random.randn(300, 1)))
  g.fit(obs)
  print(np.round(g.weights_, 2))
  print(np.round(g.means_, 2))
  print(np.round(g.covars_, 2))
  print(g.predict([[0], [2], [9], [10]]))
  print(np.round(g.score([[0], [2], [9], [10]]), 2))

  class sklearn.mixture.GMM(n_components=1, covariance_type='diag', random_state=None, thresh=None, tol=0.001, min_covar=0.001, n_iter=100, n_init=1, params='wmc', init_params='wmc')

cluster

sklearn.cluster.KMeans

  class sklearn.cluster.KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)
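
A minimal usage sketch (the toy data with two obvious groups is an illustrative assumption):

  import numpy as np
  from sklearn.cluster import KMeans

  X = np.array([[1, 2], [1, 4], [1, 0],
                [4, 2], [4, 4], [4, 0]])   # two obvious groups
  km = KMeans(n_clusters=2, random_state=0)
  km.fit(X)
  print(km.labels_)
  print(km.cluster_centers_)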

sklearn.cluster.Ward

  class sklearn.cluster.Ward(n_clusters=2, memory=Memory(cachedir=None), connectivity=None, n_components=None, compute_full_tree='auto', pooling_func=np.mean)
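
Ward was deprecated around this scikit-learn version; in later releases the same Ward-linkage agglomerative clustering is exposed through AgglomerativeClustering. A minimal sketch using that class (the toy data is an illustrative assumption):

  import numpy as np
  from sklearn.cluster import AgglomerativeClustering

  X = np.array([[1, 2], [1, 4], [1, 0],
                [4, 2], [4, 4], [4, 0]])   # made-up toy data
  model = AgglomerativeClustering(n_clusters=2, linkage='ward')
  model.fit(X)
  print(model.labels_)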