@zhengyuhong
2015-04-08T11:57:25.000000Z
Python · Machine Learning · Data Mining

from sklearn import svm
from sklearn import datasets

# Train an SVC on all iris samples except the last one, then predict the held-out sample
clf = svm.SVC()
iris = datasets.load_iris()
X, Y = iris.data[:-1], iris.target[:-1]
clf.fit(X, Y)
test_x = iris.data[-1:]
print(clf.predict(test_x))
print(iris.target[-1])

class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
class sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)
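LinearSVC exposes the same fit/predict interface as SVC but is restricted to a linear kernel; a minimal usage sketch on toy data (the data here is illustrative, not from the original notes):

from sklearn.svm import LinearSVC

X = [[0., 0.], [1., 1.], [2., 2.], [3., 3.]]
y = [0, 0, 1, 1]
clf = LinearSVC(C=1.0)           # linear kernel only; scales better than SVC(kernel='linear')
clf.fit(X, y)
print(clf.predict([[2.5, 2.5]]))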
class sklearn.svm.NuSVC(nu=0.5, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
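NuSVC replaces the C penalty with a nu parameter that bounds the fraction of margin errors; a minimal sketch (toy data chosen for illustration):

import numpy as np
from sklearn.svm import NuSVC

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
clf = NuSVC(nu=0.5)
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))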
from sklearn.svm import SVR
import numpy as np

n_samples, n_features = 10, 5
np.random.seed(0)
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X, y)

class sklearn.svm.SVR(kernel='rbf', degree=3, gamma=0.0, coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)
class sklearn.svm.LinearSVR(epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000)
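LinearSVR is the linear-kernel counterpart of SVR; a minimal sketch mirroring the SVR example above (random data, not from the original notes):

import numpy as np
from sklearn.svm import LinearSVR

n_samples, n_features = 10, 5
np.random.seed(0)
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)
clf = LinearSVR(C=1.0, epsilon=0.2)
clf.fit(X, y)
print(clf.predict(X[:2]))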
from sklearn.svm import NuSVR
import numpy as np

n_samples, n_features = 10, 5
np.random.seed(0)
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)
clf = NuSVR(C=1.0, nu=0.1)
clf.fit(X, y)
from sklearn import linear_model

clf = linear_model.LinearRegression()
clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
print(clf.coef_)

class sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
from sklearn import linear_model

clf = linear_model.Ridge(alpha=.5)
clf.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
clf.coef_
clf.intercept_

class sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')
from sklearn import linear_model

clf = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
clf.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
clf.alpha_

class sklearn.linear_model.RidgeCV(alphas=array([ 0.1, 1., 10. ]), fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False)
from sklearn import linear_model

X = [[0., 0.], [1., 1.], [2., 2.], [3., 3.]]
Y = [0., 1., 2., 3.]
clf = linear_model.BayesianRidge()
clf.fit(X, Y)
clf.predict([[1, 0.]])
print(clf.coef_)

class sklearn.linear_model.BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False)
class sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0)
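A minimal LogisticRegression sketch (illustrative toy data, not part of the original notes):

from sklearn.linear_model import LogisticRegression

X = [[0., 0.], [1., 1.], [2., 2.], [3., 3.]]
y = [0, 0, 1, 1]
clf = LogisticRegression(C=1.0, penalty='l2')
clf.fit(X, y)
print(clf.predict([[2.5, 2.5]]))
print(clf.predict_proba([[2.5, 2.5]]))   # per-class probability estimates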
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)

from sklearn.linear_model import Perceptron
from sklearn.preprocessing import PolynomialFeatures

# Learn XOR with a Perceptron by adding interaction features
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = X[:, 0] ^ X[:, 1]
X = PolynomialFeatures(interaction_only=True).fit_transform(X)
X
clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X, y)
clf.score(X, y)
The features of X have been transformed from [x1, x2] to [1, x1, x2, x1^2, x1*x2, x2^2], and can now be used within any linear model.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

model = Pipeline([('poly', PolynomialFeatures(degree=3)),
                  ('linear', LinearRegression(fit_intercept=False))])
# fit to an order-3 polynomial data
x = np.arange(5)
y = 3 - 2 * x + x ** 2 - x ** 3
model = model.fit(x[:, np.newaxis], y)
model.named_steps['linear'].coef_
from sklearn.linear_model import SGDClassifier

X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, y)

class sklearn.linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, average=False)
To get the signed distance to the hyperplane use SGDClassifier.decision_function:
print(clf.decision_function([[2., 2.]]))
# array([ 29.6...])
Using loss="log" or loss="modified_huber" enables the predict_proba method, which gives a vector of probability estimates
clf = SGDClassifier(loss="log").fit(X, y)
clf.predict_proba([[1., 1.]])
import numpy as np
from sklearn import linear_model

n_samples, n_features = 10, 5
np.random.seed(0)
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)
clf = linear_model.SGDRegressor()
clf.fit(X, y)
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))
clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))
import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict(X[2:3]))

class sklearn.naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
import numpy as np
from sklearn.naive_bayes import BernoulliNB

X = np.random.randint(2, size=(6, 100))
Y = np.array([1, 2, 3, 4, 4, 5])
clf = BernoulliNB()
clf.fit(X, Y)
print(clf.predict(X[2:3]))
from sklearn import tree

X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

class sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None)
from sklearn import tree

X = [[0, 0], [2, 2]]
y = [0.5, 2.5]
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X, y)
clf.predict([[1, 1]])

class sklearn.tree.DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None)
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

class sklearn.ensemble.BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1, random_state=None, verbose=0)
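The snippet above only constructs the ensemble; a minimal sketch of fitting and scoring it (iris data chosen here for illustration, not part of the original notes):

from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
# Bag of KNN classifiers, each trained on half the samples and half the features
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
bagging.fit(iris.data, iris.target)
print(bagging.score(iris.data, iris.target))   # training accuracy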
class sklearn.ensemble.BaggingRegressor(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, n_jobs=1, random_state=None, verbose=0)
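A matching BaggingRegressor sketch on random data (illustrative only):

import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

np.random.seed(0)
X = np.random.randn(50, 4)
y = np.random.randn(50)
reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=10, max_samples=0.5)
reg.fit(X, y)
print(reg.predict(X[:2]))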
class sklearn.ensemble.AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
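A minimal AdaBoostClassifier sketch on the iris data (illustrative, not from the original notes):

from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

iris = load_iris()
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(iris.data, iris.target)
print(clf.score(iris.data, iris.target))   # training accuracy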
class sklearn.ensemble.AdaBoostRegressor(base_estimator=None, n_estimators=50, learning_rate=1.0, loss='linear', random_state=None)
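And the regression counterpart, boosting shallow trees on a noisy 1-D target (illustrative data):

import numpy as np
from sklearn.ensemble import AdaBoostRegressor

np.random.seed(0)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
reg = AdaBoostRegressor(n_estimators=50, loss='linear')
reg.fit(X, y)
print(reg.predict([[3.0]]))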
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

class sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)
class sklearn.ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
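A GradientBoostingRegressor sketch mirroring the classifier example above (make_friedman1 data chosen for illustration, not from the original notes):

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0, loss='ls')
est.fit(X_train, y_train)
print(est.score(X_test, y_test))   # R^2 on the held-out part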
class sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
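A minimal RandomForestClassifier sketch (toy data for illustration):

from sklearn.ensemble import RandomForestClassifier

X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 0, 1, 1]
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)
print(clf.predict([[2.5, 2.5]]))
print(clf.feature_importances_)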
class sklearn.ensemble.RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)
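And its regression counterpart on random data (illustrative only):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

np.random.seed(0)
X = np.random.randn(50, 4)
y = np.random.randn(50)
reg = RandomForestRegressor(n_estimators=10, random_state=0)
reg.fit(X, y)
print(reg.predict(X[:2]))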
import numpy as np
from sklearn import mixture

np.random.seed(1)
g = mixture.GMM(n_components=2)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
np.round(g.weights_, 2)
np.round(g.means_, 2)
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)

class sklearn.mixture.GMM(n_components=1, covariance_type='diag', random_state=None, thresh=None, tol=0.001, min_covar=0.001, n_iter=100, n_init=1, params='wmc', init_params='wmc')
class sklearn.cluster.KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)
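A minimal KMeans sketch (toy data chosen for illustration, not from the original notes):

import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
print(kmeans.labels_)
print(kmeans.cluster_centers_)
print(kmeans.predict([[0, 0], [4, 4]]))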
class sklearn.cluster.Ward(n_clusters=2, memory=Memory(cachedir=None), connectivity=None, n_components=None, compute_full_tree='auto', pooling_func=<function mean>)
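A minimal Ward hierarchical-clustering sketch, reusing the toy data from the KMeans example (illustrative; newer scikit-learn versions replace Ward with AgglomerativeClustering(linkage='ward')):

import numpy as np
from sklearn.cluster import Ward

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
ward = Ward(n_clusters=2)
ward.fit(X)
print(ward.labels_)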