# Code section:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
import jieba
import pandas as pd
def datasets_demo():
    """
    Demonstrate loading and splitting the built-in iris dataset.
    :return:
    """
    # Fetch the bundled iris dataset
    iris = load_iris()
    # print(f'iris dataset:\n{iris}')
    # print(f'dataset description:\n{iris["DESCR"]}')
    # print(f'feature names:\n{iris.feature_names}')
    # print(f'feature values:\n{iris.data, iris.data.shape}')
    # Hold out 20% for testing; fixed random_state keeps the split reproducible
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22
    )
    # print(f'training-set features:\n{x_train, x_train.shape}')
def dict_demo():
    """
    Feature extraction from a list of dicts (one-hot for strings).
    :return:
    """
    samples = [
        {'city': '北京', 'templature': 100},
        {'city': '上海', 'templature': 60},
        {'city': '深圳', 'templature': 30},
    ]
    # sparse=False makes fit_transform return a dense ndarray instead of a sparse matrix
    vectorizer = DictVectorizer(sparse=False)
    matrix = vectorizer.fit_transform(samples)
    # print(f'data_new:\n{matrix}')
    # print(f'feature names:\n{vectorizer.get_feature_names()}')
    # Map the numeric matrix back to a list of dicts
    restored = vectorizer.inverse_transform(matrix)
    # print(f'data_reverse:\n{restored}')
def count_demo():
    """
    Bag-of-words feature extraction from English sentences.
    :return:
    """
    corpus = ['Life is short, I like python.', 'Life is too long to like python']
    # Build a term-count vectorizer with default tokenization
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    # fit_transform returns a sparse matrix; .toarray() yields a dense ndarray
    # print(f'data_new:\n{counts.toarray()}')
    # print(f'feature names:\n{vectorizer.get_feature_names()}')
def count_chinese_demo():
    """
    Text feature extraction with CountVectorizer on pre-segmented
    Chinese text (tokens already separated by spaces).
    :return:
    """
    corpus = ['我 爱 北京 天安门', '天安门上 太阳升']
    # Default whitespace tokenization works because the text is pre-segmented
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    # print(f'data_new:\n{counts}')
    # print(f'feature names:\n{vectorizer.get_feature_names()}')
def cut_word(text):
    """
    Segment a Chinese sentence into space-separated words,
    e.g. '我是一位计算机领域的大佬' -> '我 是 一位 计算机领域 的 大佬'.
    :param text: raw (unsegmented) Chinese string
    :return: segmented string with words joined by single spaces
    """
    # jieba.cut returns a generator; str.join consumes it directly,
    # so the intermediate list() of the original was unnecessary.
    return ' '.join(jieba.cut(text))
def count_chinese_demo2():
    """
    Chinese text feature extraction with automatic word segmentation.
    :return:
    """
    raw_texts = ['一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天.', '我们看到的从很远星来的光是在几百万年前发出的,这样当我们看到宇宙时,我们是在看他的过去.', '如果只用一种方式了解某样事物,你就不会真正了解它了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系.']
    # Segment each sentence into space-separated tokens first
    segmented = [cut_word(sentence) for sentence in raw_texts]
    # print(segmented)
    # Exclude the listed stop words from the vocabulary
    vectorizer = CountVectorizer(stop_words=['一种', '所以', '之前'])
    counts = vectorizer.fit_transform(segmented)
    # print(f'final result:\n{counts.toarray()}')
    # print(f'feature names:\n{vectorizer.get_feature_names()}')
def tfidf_demo():
    """
    Text feature extraction using TF-IDF weighting on segmented
    Chinese sentences.
    :return:
    """
    data = ['一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天.', '我们看到的从很远星来的光是在几百万年前发出的,这样当我们看到宇宙时,我们是在看他的过去.', '如果只用一种方式了解某样事物,你就不会真正了解它了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系.']
    # Segment each sentence into space-separated tokens before vectorizing
    data_new = [cut_word(sentence) for sentence in data]
    transfer = TfidfVectorizer()
    data_final = transfer.fit_transform(data_new)
    print(f'data_final:\n{data_final.toarray()}')
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; get_feature_names_out() is the supported replacement.
    print(f'特征名字:\n{transfer.get_feature_names_out()}')
def minmax():
    """
    Min-max normalization of the first three feature columns.
    :return:
    """
    # NOTE(review): expects a local file 'dating.txt' next to this script —
    # not present here, which is why the call is disabled in __main__.
    dataset = pd.read_csv('dating.txt')
    # Keep only the first three columns as features
    dataset = dataset.iloc[:, :3]
    # print(f'data:\n{dataset}')
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(dataset)
    # print(f'data_new:\n{scaled}')
def stand_demo():
    """
    Standardization (zero mean, unit variance) of the first three
    feature columns.
    :return:
    """
    # NOTE(review): expects a local file 'dating.csv' — not present here,
    # which is why the call is disabled in __main__.
    dataset = pd.read_csv('dating.csv')
    # Keep only the first three columns as features
    dataset = dataset.iloc[:, :3]
    # print(f'data:\n{dataset}')
    scaler = StandardScaler()
    standardized = scaler.fit_transform(dataset)
    # print(f'data_new:\n{standardized}')
def variance_demo():
    """
    Drop low-variance features, then inspect pairwise Pearson
    correlations between selected columns.
    :return:
    """
    # NOTE(review): expects a local file 'factor_returns.csv' — not present
    # here, which is why the call is disabled in __main__.
    frame = pd.read_csv('factor_returns.csv')
    # Drop the leading index column and the last two columns
    frame = frame.iloc[:, 1:-2]
    # print(f'data:\n{frame}')
    # Remove every feature whose variance is below 10
    selector = VarianceThreshold(threshold=10)
    filtered = selector.fit_transform(frame)
    # print(f'data_new:\n{filtered, filtered.shape}')
    # pearsonr() measures the linear correlation between two columns
    corr_pe_pb = pearsonr(frame['pe_ratio'], frame['pb_ratio'])
    # print(f'correlation coefficient:\n{corr_pe_pb}')
    corr_rev_exp = pearsonr(frame['revenue'], frame['total_expense'])
    # print(f'correlation:\n{corr_rev_exp}')
def pca_demo():
    """
    Dimensionality reduction with PCA on a tiny fixed sample matrix.
    :return:
    """
    samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # A float n_components keeps enough components to retain 95% of the variance
    reducer = PCA(n_components=0.95)
    reduced = reducer.fit_transform(samples)
    print(f'data_new:\n{reduced}')
if __name__ == '__main__':
    datasets_demo()
    dict_demo()
    count_demo()
    count_chinese_demo()
    count_chinese_demo2()
    tfidf_demo()
    # Disabled: the data files are missing, so running these would fail
    # minmax()
    # stand_demo()
    # variance_demo()
    # pca_demo()