Mind map:
Code:
from scipy.stats import pearsonr
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import jieba
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def datasets_demo():
    # Load the iris dataset
    iris = load_iris()
    print("Iris dataset:\n", iris)
    print("Dataset description:\n", iris["DESCR"])
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)
    # Split the dataset into train and test sets.
    # random_state=22 fixes the random seed, so every run produces the same
    # train/test split; this is very useful for reproducibility and debugging.
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("Training set features:\n", x_train, x_train.shape)
    return None
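# Note: if test_size is omitted, train_test_split defaults to a 0.25 test
# fraction; passing stratify=iris.target would additionally preserve the
# class proportions in both splits.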
def dict_demo():
    # Dictionary feature extraction with DictVectorizer
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 200}]
    # 1. Instantiate a transformer
    transfer = DictVectorizer(sparse=False)
    # 2. Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None
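# A minimal sketch of the default sparse=True behaviour (dict_sparse_demo is
# an illustrative name; the data matches dict_demo above). fit_transform then
# returns a scipy sparse matrix; .toarray() recovers the dense output.
def dict_sparse_demo():
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 200}]
    transfer = DictVectorizer()  # sparse=True is the default
    data_new = transfer.fit_transform(data)
    print("sparse matrix:\n", data_new)          # printed as (row, col) value triples
    print("dense view:\n", data_new.toarray())   # same matrix as sparse=False
    print("Feature names:\n", transfer.get_feature_names_out())
    return None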
def count_demo():
    # Text feature extraction with CountVectorizer
    data = ["life is short,i like python", "life is too long,i dislike python"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer()
    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
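# A minimal sketch of CountVectorizer's stop_words parameter (the word list
# here is an illustrative choice): listed tokens are removed from the
# vocabulary before counting.
def count_stop_words_demo():
    data = ["life is short,i like python", "life is too long,i dislike python"]
    transfer = CountVectorizer(stop_words=["is", "too"])  # drop these tokens
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None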
def count_chinese_demo():
    # Chinese text feature extraction (the text is already segmented with spaces)
    data = ["呜呜 呜呜 等等 是 否 分析", "等等 实时 还"]
    # 1. Instantiate a transformer
    transfer = CountVectorizer()
    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
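# Note: CountVectorizer's default token_pattern, r"(?u)\b\w\w+\b", keeps only
# tokens of two or more characters, so single-character words such as 是, 否
# and 还 above are silently dropped from the vocabulary.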
def cut_word(text):
    # Segment Chinese text with jieba and join the tokens with spaces
    return " ".join(jieba.cut(text))
def count_chinese_demo2():
    # Chinese text feature extraction with automatic word segmentation
    data = ["少时诵诗书所所所所丰富创新型",
            "伺服电机寻或错过过过过过过过过过",
            "爱何辜大V阿深V次氯酸钠计算出吧看啥迟迟不你"]
    # Segment each sentence into space-separated words
    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # 1. Instantiate a transformer
    transfer = CountVectorizer()
    # 2. Call fit_transform
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
def tfidf_demo2():
    # Chinese text feature extraction with automatic segmentation, using TfidfVectorizer
    data = ["少时诵诗书所所所所丰富创新型",
            "伺服电机寻或错过过过过过过过过过",
            "爱何辜大V阿深V次氯酸钠计算出吧看啥迟迟不你"]
    # Segment each sentence into space-separated words
    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    # 1. Instantiate a transformer
    transfer = TfidfVectorizer()
    # 2. Call fit_transform
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
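# How TfidfVectorizer scores term t in document d (sklearn defaults:
# smooth_idf=True, sublinear_tf=False, norm='l2'):
#   tf(t, d) = raw count of t in d
#   idf(t)   = ln((1 + n) / (1 + df(t))) + 1, where n is the number of
#              documents and df(t) the number of documents containing t
#   tfidf    = tf * idf, after which each row is L2-normalised
# Words that occur in every document get the minimum idf, so corpus-wide
# common words contribute little to any single document's vector.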
def minmax_demo():
    # Min-max normalization
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    # Keep the first three columns
    data = data.iloc[:, :3]
    print(data)
    # 2. Instantiate a transformer
    transfer = MinMaxScaler()
    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
    return None
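# MinMaxScaler rescales each column to feature_range, (0, 1) by default:
#   X_std = (x - min) / (max - min)
#   X'    = X_std * (range_max - range_min) + range_min
# Because min and max come straight from the data, one outlier shifts the
# whole column, so min-max scaling suits small, clean datasets. A non-default
# range (the (2, 3) choice is purely illustrative) would be:
#   transfer = MinMaxScaler(feature_range=(2, 3))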
def stand_demo():
    # Standardization
    # 1. Load the data
    data = pd.read_csv("dating.txt")
    # Keep the first three columns
    data = data.iloc[:, :3]
    print(data)
    # 2. Instantiate a transformer
    transfer = StandardScaler()
    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
    return None
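# StandardScaler transforms each column to zero mean and unit variance:
#   x' = (x - mean) / std
# With enough samples, a few outliers barely move the mean and standard
# deviation, which makes standardization more robust than min-max scaling
# on noisy data.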
def variance_demo():
    # Filter out low-variance features
    # 1. Load the data
    data = pd.read_csv("ffff.csv")
    data = data.iloc[:, 1:-2]
    # 2. Instantiate a transformer
    transfer = VarianceThreshold(threshold=5)
    # 3. Call fit_transform
    data_new = transfer.fit_transform(data)
    # Pearson correlation coefficient between two of the columns
    r1 = pearsonr(data["pre1"], data["pre2"])
    print("Correlation coefficient:", r1)
    print(data_new, data_new.shape)
    return None
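# pearsonr returns a (statistic, pvalue) pair. The statistic r lies in
# [-1, 1]: |r| near 1 indicates a strong linear relationship, r near 0
# little linear correlation. When two features are strongly correlated,
# keeping one of them (or compressing both, e.g. with PCA below) is
# usually enough.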
def pca_demo():
    # Dimensionality reduction with PCA
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # 1. Instantiate a transformer
    transfer = PCA(n_components=2)
    # 2. Call fit_transform
    data_new = transfer.fit_transform(data)
    print(data_new)
    return None
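# n_components also accepts a float in (0, 1): PCA then keeps however many
# components are needed to explain at least that fraction of the variance.
# A minimal sketch on the same toy data (pca_ratio_demo is an illustrative name):
def pca_ratio_demo():
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    transfer = PCA(n_components=0.95)  # keep >= 95% of the explained variance
    data_new = transfer.fit_transform(data)
    print(data_new, data_new.shape)
    return None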
if __name__ == '__main__':
    # 1. Using sklearn datasets
    # datasets_demo()
    # 2. Dictionary feature extraction
    # dict_demo()
    # 3. Text feature extraction
    # count_demo()
    # 4. Chinese text feature extraction
    # count_chinese_demo()
    # 5. Chinese text feature extraction with automatic segmentation
    # count_chinese_demo2()
    # 6. Chinese word segmentation
    # cut_word("我爱北京天安门")
    # 7. Text feature extraction with TF-IDF
    # tfidf_demo2()
    # 8. Normalization
    # minmax_demo()
    # 9. Standardization
    # stand_demo()
    # 10. Filtering low-variance features
    # variance_demo()
    # 11. PCA dimensionality reduction
    pca_demo()