# 了解机器学习常用包 — overview of commonly used machine-learning packages
import sklearn.feature_extraction.text
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr
import jieba
import pandas as pd
import matplotlib.pyplot as plt
def datasets_demo():
    """
    Demonstrate loading and splitting the sklearn iris dataset.

    Loads the iris bunch, prints the raw data and its description, then
    splits it 80/20 into train/test sets and prints the training feature
    shape.
    :return: None
    """
    iris = load_iris()
    print("鸢尾花数据集:\n", iris)
    print("查看数据集描述:\n", iris["DESCR"])
    # 数据集划分: 20% held out for testing; fixed random_state for reproducibility
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22)
    # Fixed typo in the original message: "训练寄" -> "训练集"
    print("训练集的特征值:\n", x_train.shape)
    return None
#字典特征抽取
def dict_demo():
    """
    Feature extraction from a list of dicts with DictVectorizer.

    One-hot encodes the categorical 'city' field; the numeric
    'temperature' field passes through unchanged.
    :return: None
    """
    samples = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]
    # sparse=False -> return a dense ndarray instead of a scipy sparse matrix
    vectorizer = DictVectorizer(sparse=False)
    matrix = vectorizer.fit_transform(samples)
    print("datanew 为: \n", matrix)
    print("特征名字:\n", vectorizer.get_feature_names_out())
    return None
def count_demo():
    """
    Bag-of-words text feature extraction with CountVectorizer.
    :return: None
    """
    corpus = ["oh crazy i like like python", "omg i dislike python"]
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    print("特征名字:\n", vectorizer.get_feature_names_out())
    # toarray() converts the default sparse matrix into a dense 2-D array
    print("datanew 为:\n", counts.toarray())
    return None
def cut_word(text):
    """
    Segment a Chinese string into space-separated words.

    "我爱北京天安门" -> "我 爱 北京 天安门"
    :param text: raw Chinese string
    :return: space-joined segmented string
    """
    # jieba.cut yields tokens lazily; str.join consumes the generator
    # directly, so the intermediate list() is unnecessary.
    return " ".join(jieba.cut(text))
def text_chinese_tfidf_demo():
    """
    TF-IDF feature extraction on Chinese text.

    Segments each sentence with jieba (via cut_word), then vectorizes
    the segmented text with TfidfVectorizer, excluding a few stop words.
    :return: None
    """
    data = ["在我过去四十余年的生涯中,冬民情味尝得最深刻的要算十年前初移居的时侯了,",
            "十年以来,白马湖已成了一个小村落,当我移居的时侯,还是一片荒野,",
            "此外两三里内没有人烟。一家人于阴历十一月下甸从热闹的杭州移居于这荒凉的山野,宛如投身于极带中。"]
    # 将原始数据转换成分好词的形式 (segment each raw sentence)
    text_list = []
    for sent in data:
        text_list.append(cut_word(sent))
    print(text_list)
    # 1、实例化一个转换器类
    transfer = TfidfVectorizer(stop_words=['一种', '不会', '不要'])
    # 2、调用fit_transform
    data = transfer.fit_transform(text_list)
    print("文本特征抽取的结果:\n", data.toarray())
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; use get_feature_names_out() like the other demos in this file.
    print("返回特征名字:\n", transfer.get_feature_names_out())
    return None
#归一化演示
def minmax_demo():
    """
    Min-max normalization demo.

    Reads dating.txt and rescales three numeric columns into [0, 1].
    :return: None
    """
    frame = pd.read_csv("dating.txt")
    print(frame)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(frame[['milage', 'Liters', 'Consumtime']])
    print("最小值最大值归一化处理的结果:\n", scaled)
    return None
def standard_demo():
    """
    Standardization demo (zero mean, unit variance).

    Reads dating.txt, standardizes three numeric columns, and prints the
    transformed data plus the fitted per-column mean and variance.
    :return: None
    """
    frame = pd.read_csv("dating.txt")
    print(frame)
    scaler = StandardScaler()
    standardized = scaler.fit_transform(frame[['milage', 'Liters', 'Consumtime']])
    print("标准化的结果:\n", standardized, "\n")
    print("每一列特征的平均值:", scaler.mean_, "\n")
    print("每一列特征的方差:", scaler.var_, "\n")
    return None
#过滤方差特征
def filter_demo():
    """
    Low-variance feature filtering and Pearson-correlation inspection.

    Reads factor_returns.csv, drops features with variance <= 20
    (excluding the first column and the last two), prints the Pearson
    correlation between revenue and total_expense, and shows a scatter
    plot of the two columns.
    :return: None
    """
    frame = pd.read_csv("factor_returns.csv")
    # iloc[:, 1:-2]: keep every row; take columns from index 1 up to
    # (but not including) the second-to-last column
    selector = VarianceThreshold(threshold=20)
    filtered = selector.fit_transform(frame.iloc[:, 1:-2])
    # 相关系数 between the two columns of interest
    r = pearsonr(frame["revenue"], frame["total_expense"])
    print("相关系数:\n", r, "\n")
    # 通过画图表示: visualize the relationship as a scatter plot
    plt.figure(figsize=(20, 8), dpi=100)
    plt.scatter(frame['revenue'], frame['total_expense'])
    plt.show()
    return None
if __name__ == '__main__':
    # Script entry point: only the variance-filter / correlation demo is
    # run; the other demo functions above must be invoked manually.
    filter_demo()