1. Why do we perform feature extraction?
To convert raw data into the form that algorithms require.
2. What are the common feature extraction methods?
- Count vectorization (bag-of-words; often loosely called one-hot encoding, though it records counts rather than 0/1 indicators)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Learn the vocabulary and count word occurrences per document
data = cv.fit_transform(['life is is short, i like python', 'life is too long, i dislike python'])
print(cv.get_feature_names())  # use get_feature_names_out() on scikit-learn >= 1.0
print(data.toarray())          # dense document-term count matrix
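For reference, with the default tokenizer (which lowercases and drops single-character tokens such as 'i'), the two print calls should produce something close to:
['dislike', 'is', 'life', 'like', 'long', 'python', 'short', 'too']
[[0 2 1 1 0 1 1 0]
 [1 1 1 0 1 1 0 1]]
Note how the doubled 'is' in the first sentence shows up as a count of 2, which is what distinguishes count encoding from strict one-hot.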
- The TF-IDF algorithm
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

def cutword():
    # Segment Chinese text with jieba, then join the tokens with spaces
    # so that TfidfVectorizer can split on whitespace
    c1 = ' '.join(jieba.cut('今天的天气不错,我很高兴'))
    c2 = ' '.join(jieba.cut('今天的天气很差,我不高兴'))
    return c1, c2

def tfidfvec():
    c1, c2 = cutword()
    tf = TfidfVectorizer()
    # Weight each term by tf-idf instead of a raw count
    data = tf.fit_transform([c1, c2])
    print(tf.get_feature_names())  # use get_feature_names_out() on scikit-learn >= 1.0
    print(data.toarray())

if __name__ == '__main__':
    tfidfvec()
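Under the hood, TfidfVectorizer uses a smoothed idf and then L2-normalizes each row. A minimal numpy sketch of that computation, assuming the defaults smooth_idf=True, norm='l2', sublinear_tf=False:
import numpy as np

def tfidf(counts):
    # Smoothed idf, as TfidfVectorizer computes it:
    # idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
    n_docs = counts.shape[0]
    df = (counts > 0).sum(axis=0)   # document frequency of each term
    idf = np.log((1 + n_docs) / (1 + df)) + 1
    weighted = counts * idf         # tf * idf, with raw counts as tf
    # L2-normalize each document vector
    return weighted / np.linalg.norm(weighted, axis=1, keepdims=True)
Applied to a raw count matrix built with the same default tokenizer, this should reproduce TfidfVectorizer's output up to floating-point error.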
3. What are the common data preprocessing methods?
- Normalization (min-max scaling)
Principle: computed from the per-feature maximum and minimum, X' = (x - min) / (max - min)
Drawback: strongly affected by outliers, since they determine min and max
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()  # default range is (0, 1); pass feature_range=(2, 3) to scale into [2, 3]
data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
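To make the formula concrete, a hand-rolled numpy equivalent of the transform above:
import numpy as np

X = np.array([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]], dtype=float)
X_min, X_max = X.min(axis=0), X.max(axis=0)
X_scaled = (X - X_min) / (X_max - X_min)  # matches MinMaxScaler with feature_range=(0, 1)
print(X_scaled)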
- Standardization (z-score scaling)
Principle: computed from the per-feature mean and standard deviation, X' = (x - mean) / std
Advantage: a small number of outliers has little effect
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = ss.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
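Likewise, a hand-rolled equivalent of StandardScaler's transform, using the population standard deviation (ddof=0) as the scaler does:
import numpy as np

X = np.array([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]], dtype=float)
X_std = (X - X.mean(axis=0)) / X.std(axis=0)  # z-score per column
print(X_std)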
4. How do we perform feature selection?
- Filter method: variance threshold
from sklearn.feature_selection import VarianceThreshold
# Drop features whose variance is at or below the threshold
var = VarianceThreshold(threshold=0.0)
data = var.fit_transform([[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]])
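With threshold=0.0 only constant columns are dropped: in the sample matrix above, column 0 (all 0) and column 3 (all 3) have zero variance and are removed, leaving the middle two columns, i.e. [[2, 0], [1, 4], [1, 1]].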
- PCA dimensionality reduction
from sklearn.decomposition import PCA
# Principal component analysis for feature dimensionality reduction
pca = PCA(n_components=0.9)  # keep 90%-95% of the variance; a float ratio is usually preferred over a fixed integer count
data = pca.fit_transform([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
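To see how much variance the retained components carry, inspect explained_variance_ratio_ after fitting; a quick check on the snippet above:
# Each entry is the fraction of total variance captured by one component;
# they sum to >= 0.9 here because n_components=0.9 keeps just enough
# components to reach that fraction
print(pca.explained_variance_ratio_)
print(data.shape)  # (3, number_of_kept_components)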
5. What are the common supervised learning algorithms?
- k-nearest neighbors (k-NN)
Principle: similar samples have similar feature values, so a sample is labeled by the majority class among its k nearest neighbors (see the distance sketch below)
Advantages: simple; no parameters to estimate; no training phase
Drawbacks: heavy computation and memory use; very sensitive to the choice of k
Data: the Facebook check-in location prediction competition
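"Nearest" is measured with a distance metric, Euclidean distance by default. A minimal sketch (the sample vectors are made up for illustration):
import numpy as np

def euclidean(a, b):
    # Euclidean distance, the default metric of KNeighborsClassifier
    return np.sqrt(np.sum((np.asarray(a) - np.asarray(b)) ** 2))

print(euclidean([1, 0, 2], [2, 2, 0]))  # sqrt(1 + 4 + 4) = 3.0
Because raw feature scales add directly into this distance, features should be standardized first, as the full example below does.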
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
# Predict users' check-in locations with k-nearest neighbors
def knncls():
    # Load the data
    data = pd.read_csv('./data/train.csv')
    # print(data.head(10))
    # Restrict to a small spatial window to keep the computation manageable
    data = data.query('x>1.0&x<1.25&y>2.5&y<2.75')
    time_value = pd.to_datetime(data['time'], unit='s')
    # print(time_value)
    # Convert to a DatetimeIndex so date components are easy to extract
    time_value = pd.DatetimeIndex(time_value)
    # Engineer some time-based features
    data['day'] = time_value.day
    data['hour'] = time_value.hour
    data['weekday'] = time_value.weekday
    # Drop the raw timestamp feature
    data = data.drop(['time'], axis=1)
    # print(data)
    # Remove target places with too few check-ins (fewer than 4 here)
    place_count = data.groupby('place_id').count()
    tf = place_count[place_count.row_id > 3].reset_index()
    data = data[data['place_id'].isin(tf.place_id)]
    # Separate the features from the target
    x = data.drop(['place_id'], axis=1)
    y = data['place_id']
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    # print(data)
    # Feature engineering: standardization
    std = StandardScaler()
    # Fit the scaler on the training features, then apply the same transform to the test features
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # Run the algorithm
    knn = KNeighborsClassifier(n_neighbors=5)  # too small a k is noise-sensitive; too large a k is swamped by majority classes
    # fit, predict, score
    knn.fit(x_train, y_train)
    # Make predictions
    y_predict = knn.predict(x_test)
    print('Predicted check-in locations:', y_predict)
    print('Accuracy:', knn.score(x_test, y_test))
    return None

if __name__ == '__main__':
    knncls()
Tuning the model with cross-validation and grid search:
knn = KNeighborsClassifier()
param = {'n_neighbors': [3, 5, 10]}
# 2-fold cross-validation over the candidate k values
gc = GridSearchCV(knn, param_grid=param, cv=2)
gc.fit(x_train, y_train)
print('Accuracy on the test set:', gc.score(x_test, y_test))
print('Best cross-validation score:', gc.best_score_)
print('Best model:', gc.best_estimator_)
print('Results for every hyperparameter value in every fold:', gc.cv_results_)
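Note: gc.best_params_ returns the winning hyperparameter setting directly, and cv=2 is the smallest value scikit-learn allows; 5-fold cross-validation is the more common choice when the dataset is large enough.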
- Decision trees
Principle: information entropy; each split chooses the feature that reduces entropy the most (information gain)
Advantages: easy to understand; works well when the dataset is not large
Drawbacks: cannot capture very complex relationships; results are unstable, since small changes in the data can produce a very different tree
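Entropy H = Σ p_i · log2(1/p_i) measures a node's impurity in bits; the tree greedily picks the split that lowers it most. A minimal sketch (the labels are illustrative):
from collections import Counter
from math import log2

def entropy(labels):
    # H = sum over classes of p * log2(1/p)
    n = len(labels)
    return sum((c / n) * log2(n / c) for c in Counter(labels).values())

print(entropy(['survived'] * 5 + ['died'] * 5))  # 1.0 bit: maximally impure
print(entropy(['survived'] * 10))                # 0.0: a pure node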
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd

def decision():
    titan = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']
    # Fill missing ages with the mean age
    x['age'] = x['age'].fillna(x['age'].mean())
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    # Feature engineering: turn the categorical features into one-hot columns
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(x_train.to_dict(orient='records'))
    print(dv.get_feature_names())  # use get_feature_names_out() on scikit-learn >= 1.0
    x_test = dv.transform(x_test.to_dict(orient='records'))
    # Fit and evaluate the decision tree
    dec = DecisionTreeClassifier(max_depth=8)  # limit depth to curb overfitting; try 5 as well
    dec.fit(x_train, y_train)
    print('Accuracy:', dec.score(x_test, y_test))
    # Export the tree structure, using the vectorizer's actual feature names
    export_graphviz(dec, out_file='./tree.dot', feature_names=dv.get_feature_names())
    return None

if __name__ == '__main__':
    decision()
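To turn the exported tree.dot into an image, render it with Graphviz from the command line, e.g. dot -Tpng tree.dot -o tree.png.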