一.机器学习概述
二.特征工程
1.数据集
import pandas as pd
from scipy.stats import pearsonr
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
def datasets_demo():
    """Demonstrate loading the iris dataset and splitting it into train/test sets."""
    # Load the bundled iris dataset shipped with scikit-learn.
    iris = load_iris()
    # print("鸢尾花数据集:\n", iris)
    # Hold out 20% of the samples for testing; a fixed random_state makes the
    # split reproducible across runs.
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22
    )
    # print("训练集的特征值:\n", x_train, x_train.shape)
    return None
test_size:测试集所占的比例,此处为0.2,即测试数据占20%,训练数据占80%
random_state:随机种子,取相同的值每次都得到同样的划分结果,取不同的值则得到不同的随机划分结果。
2.特征抽取
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
def dict_demo():
    """Demonstrate feature extraction from a list of dicts with DictVectorizer."""
    records = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]
    # sparse=False yields a dense ndarray instead of a scipy sparse matrix;
    # categorical fields (here 'city') are one-hot encoded.
    vectorizer = DictVectorizer(sparse=False)
    encoded = vectorizer.fit_transform(records)
    print(encoded)
    return None
def cut_word(text):
    """Segment Chinese *text* with jieba and join the tokens with spaces."""
    # NOTE(review): relies on `jieba` being imported elsewhere — no import for
    # it is visible in this chunk; confirm the file imports jieba.
    tokens = jieba.cut(text)
    return " ".join(tokens)
def count_chinese_demo():
    """Extract bag-of-words features from Chinese text after jieba segmentation.

    :return: None
    """
    # 1. Segment each sentence into space-separated tokens so that
    #    CountVectorizer (which tokenizes on whitespace) can handle Chinese.
    data = ["今天很残酷,明天更残酷,后天很美好,但是绝大部分人死在明天晚上.",
            "运气就是自己好的时候多想想别人,自己不好的时候多检查检查自己",
            "永远相信你的对手不在你边上,在你边上的都是你的榜样,哪怕这个人你特讨厌"]
    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # print(data_new)
    # 2. Instantiate the transformer; stop_words drops the listed tokens.
    transfer = CountVectorizer(stop_words=["人死"])
    # 3. Fit and transform; the result is a sparse document-term matrix.
    data_final = transfer.fit_transform(data_new)
    print("data_new:\n", data_final.toarray())
    # BUG FIX: get_feature_names() was deprecated in scikit-learn 1.0 and
    # removed in 1.2; get_feature_names_out() is its replacement.
    print("特征名字:\n", transfer.get_feature_names_out())
    return None
from sklearn.preprocessing import MinMaxScaler,StandardScaler
def minmax_demo():
    """Min-max normalization: map each feature into [0, 1] (the default range).

    :return: None
    """
    # 1. Load the dating data and keep only the first three feature columns.
    data = pd.read_csv("dating.txt", sep='\t')
    data = data.iloc[:, 0:3]
    # print("data:\n\t", data)
    # 2. Instantiate the transformer.
    #    BUG FIX: this function documents min-max normalization, but the
    #    original instantiated StandardScaler (standardization) — the scaler
    #    had been swapped with the one in stand_demo. MinMaxScaler matches
    #    both the docstring and the function name.
    transfer = MinMaxScaler()
    # 3. Fit and transform.
    data_new = transfer.fit_transform(data)
    print("data_new\n", data_new)
    return None
def stand_demo():
    """Standardization: rescale each feature to zero mean and unit variance.

    :return: None
    """
    # 1. Load the dating data; keep all rows of the first three columns.
    data = pd.read_csv("dating.txt", sep='\t')
    data = data.iloc[:, 0:3]
    # print("data:\n\t", data)
    # 2. Instantiate the transformer.
    #    BUG FIX: this function is the standardization demo, but the original
    #    instantiated MinMaxScaler — the scaler had been swapped with the one
    #    in minmax_demo. StandardScaler matches the function name.
    transfer = StandardScaler()
    # 3. Fit and transform.
    data_new = transfer.fit_transform(data)
    print("data_new\n", data_new)
    return None
from sklearn.feature_selection import VarianceThreshold
def variance_demo():
    """Filter out low-variance features, then report one correlation coefficient.

    :return: None
    """
    # 1. Load factor data; drop the leading index column and last two columns.
    data = pd.read_csv("factor_returns.csv")
    data = data.iloc[:, 1:-2]
    # print("data\n", data)
    # 2. Instantiate the transformer (the default threshold=0.0 removes only
    #    constant features).
    transfer = VarianceThreshold()
    # 3. Fit and transform.
    data_new = transfer.fit_transform(data)
    print(data_new, '\n', type(data_new), data_new.shape)
    # Pearson correlation between two columns; pearsonr returns a
    # (correlation, p-value) pair.
    # BUG FIX: pearsonr was called without ever being imported (NameError at
    # runtime); the import is added to the top-of-file dependency block.
    r = pearsonr(data["pe_ratio"], data["pb_ratio"])
    print("相关系数:\n", r)
    # Return None explicitly, matching the other demo functions in this file.
    return None
from sklearn.decomposition import PCA
def pca_demo():
    """Reduce a small 3x4 matrix to two principal components with PCA."""
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # n_components: an int keeps that many components; a float in (0, 1)
    # keeps enough components to explain that fraction of the variance.
    reducer = PCA(n_components=2)
    reduced = reducer.fit_transform(samples)
    print("data_new:\n", reduced)
    return None