# 1. 数据集 (Datasets)
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_20newsgroups

# load_boston was removed in scikit-learn 1.2; importing it on newer releases
# raises ImportError. Guard the import so the rest of this script still runs.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# Boston housing data when the installed scikit-learn still ships the loader,
# otherwise None. sklearn.datasets.fetch_california_housing is the regression
# dataset suggested by scikit-learn as a replacement.
boston = load_boston() if load_boston is not None else None
iris = load_iris()
digits = load_digits()
# subset='all' requests the training and test portions together.
news = fetch_20newsgroups(subset='all')

# Notebook-style inspection of the digits dataset: these bare expressions only
# display a value in an interactive session and have no effect in a script.
digits.data.shape
digits.target
digits.DESCR
digits.target_names
# 2. 训练/测试数据集划分 (Train/test split)
from sklearn.model_selection import train_test_split
# Hold out a quarter of the digits samples for testing; the fixed random_state
# makes the split repeatable across runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    digits.data,
    digits.target,
    test_size=0.25,
    random_state=42,
)
# 3. 数据预处理 (Data preprocessing)
# 标准缩放预处理, 一位有效编码(独热编码)预处理 (standard scaling, one-hot encoding)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data is scaled with training statistics.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
# 特征提取: 字典向量, 文本计数向量, 文本Tfidf向量 (feature extraction: dict, count and TF-IDF vectorizers)
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
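# 4. 有监督学习: 分类器与回归器 (Supervised learning: classifiers and regressors)
# 线性模型: 逻辑斯蒂回归, 随机梯度下降 (linear models: logistic regression, stochastic gradient descent)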
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier, SGDRegressor
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
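# 先有决策树, 再有随机森林, 在森林的基础上作梯度提升
# (注: sklearn 的梯度提升与随机森林一样, 都是以决策树为基学习器的集成方法)
# (decision trees first, then random forests, then gradient boosting; both ensembles are built from decision trees)
# DecisionTree < RandomForest < GradientBoosting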
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
# 5. 无监督学习 (Unsupervised learning)
# 聚类: K均值 (clustering: k-means)
from sklearn.cluster import KMeans
# 6. 度量/评价 (Metrics / evaluation)
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
# 均方误差MSE, 平均绝对误差MAE (mean squared error, mean absolute error)
from sklearn.metrics import mean_squared_error, mean_absolute_error