数据集构成
特征值+目标值
机器学习算法分类
目标值:类别-分类问题
目标值:连续型的数据-回归问题
目标值:无 - 无监督学习(聚类)
归一化处理
缺点:如果存在异常值(最大值、最小值),容易受到影响
标准化
(x-mean)/std
标准差:集中程度
特征降维
降维
特征选择
Filter 过滤式
方差选择法:低方差特征过滤
相关系数-特征与特征之间的相关程度
Embedded 嵌入式
主成分分析
案例探究用户对物品类别的喜好细分降维
# 好好学习!天天向上!
# 今天不学习!明天变废物!
import jieba
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
def count_demo():
    """English text feature extraction: bag-of-words counts via CountVectorizer.

    Prints the dense count matrix and the learned vocabulary.
    """
    corpus = [
        "life is short,i like like python",
        "life is too long,i dislike python",
    ]
    # Instantiate the transformer; the listed stop words are dropped
    # from the vocabulary before counting.
    vectorizer = CountVectorizer(stop_words=["is", "too"])
    # fit_transform learns the vocabulary and returns a sparse count matrix.
    counts = vectorizer.fit_transform(corpus)
    print("data_new:\n", counts.toarray())
    print("特征名字:\n", vectorizer.get_feature_names_out())
    return None
def count_chinese_demo():
    """Chinese text fed to CountVectorizer *without* word segmentation.

    Demonstrates that unsegmented Chinese is treated as whole-string
    tokens, motivating the jieba-based demos below.
    """
    corpus = ["我爱北京天安门", "天安门上太阳升"]
    # Default transformer — no stop words, no segmentation.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)
    print("data_new:\n", counts.toarray())
    return None
def cut_word(text):
    """Segment Chinese *text* with jieba and join the tokens with spaces."""
    # jieba.cut yields tokens lazily; str.join consumes the generator directly.
    return " ".join(jieba.cut(text))
def count_chinese_demo2():
    """Chinese text feature extraction: segment with jieba, then count.

    Prints the dense count matrix and the learned vocabulary.
    """
    data = ["3、当你接手一项新工作时,不要犹豫和退缩,不要担心自己能力不足而无法胜任。坚持奋斗下去,当你踏上一个台阶,相应的才识就会把你武装起来。4、最大的遗憾是,连离开都不能当面说清,或许一个拥抱就能解决的事情,最后却是没有任何解释的形同陌路。早安!"]
    # Pre-segment every sentence so CountVectorizer sees space-separated words.
    segmented = [cut_word(sent) for sent in data]
    transfer = CountVectorizer()
    counts = transfer.fit_transform(segmented)
    print("data_new:\n", counts.toarray())
    print("特征名字:\n", transfer.get_feature_names_out())
    return None
def tfidf_demo():
    """Chinese text feature extraction with TF-IDF weighting.

    Same pipeline as count_chinese_demo2, but TfidfVectorizer scores
    each term by importance instead of raw frequency.
    """
    data = ["3、当你接手一项新工作时,不要犹豫和退缩,不要担心自己能力不足而无法胜任。坚持奋斗下去,当你踏上一个台阶,相应的才识就会把你武装起来。4、最大的遗憾是,连离开都不能当面说清,或许一个拥抱就能解决的事情,最后却是没有任何解释的形同陌路。早安!"]
    # Segment first; the vectorizer expects space-separated tokens.
    segmented = [cut_word(sent) for sent in data]
    transfer = TfidfVectorizer(stop_words=["一种", "所以"])
    weights = transfer.fit_transform(segmented)
    print("data_new:\n", weights.toarray())
    print("特征名字:\n", transfer.get_feature_names_out())
    return None
def minmax_demo():
    """
    归一化 (min-max normalization) of the first three columns of dating.txt.

    Each column is linearly rescaled into the range [2, 3].
    """
    # 1. Load the data and keep only the first three feature columns.
    features = pd.read_csv("dating.txt").iloc[:, :3]
    # 2. Instantiate the scaler with a non-default target range
    #    (instead of the usual [0, 1]) to show the parameter's effect.
    scaler = MinMaxScaler(feature_range=(2, 3))
    # 3. Fit and transform in one step.
    scaled = scaler.fit_transform(features)
    print("data_new:\n", scaled)
    return None
def stand_demo():
    """
    标准化 (standardization): rescale features to zero mean and unit
    variance, (x - mean) / std, on the first three columns of dating.txt.

    NOTE(review): the original docstring said 归一化 (normalization),
    but StandardScaler performs standardization — corrected here.
    Unlike min-max scaling, this is robust to outliers.
    """
    # 1. Load the data.
    data = pd.read_csv("dating.txt")
    # print("data:\n",data)
    # Keep only the first three feature columns.
    data = data.iloc[:, :3]
    # print("data:\n", data)
    # 2. Instantiate the transformer (no range to configure here).
    transfer = StandardScaler()
    # 3. Fit and transform; result print left disabled as in the original.
    data_new = transfer.fit_transform(data)
    # print("data_new:\n", data_new)
    return None
def variance_demo():
    """
    Filter out low-variance features from the stock-factor data set,
    then inspect the correlation between two factor pairs.
    """
    # Load the factor data; drop the leading index column and the last two.
    frame = pd.read_csv("factor_returns.csv")
    frame = frame.iloc[:, 1:-2]
    # Keep only features whose variance exceeds the threshold of 10.
    selector = VarianceThreshold(threshold=10)
    filtered = selector.fit_transform(frame)
    print("data_new\n", filtered, filtered.shape)
    # Pearson correlation coefficients between chosen feature pairs.
    r = pearsonr(frame["pe_ratio"], frame["pb_ratio"])
    r2 = pearsonr(frame["revenue"], frame["total_expense"])
    print("revenue与total_expense之间的相关性:\n", r2)
    print("相关系数:\n", r)
    # Visualize the strongly correlated pair with a scatter plot.
    plt.figure(figsize=(20, 8), dpi=100)
    plt.scatter(frame["revenue"], frame["total_expense"])
    plt.show()
    return None
def pca_demo():
    """
    PCA降维: reduce a tiny 3x4 toy matrix with PCA.

    n_components is a float here, so sklearn keeps the smallest number
    of components that explains at least 90% of the variance.

    :return: None
    """
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # Instantiate the transformer; 0.90 = target explained-variance ratio.
    transfer = PCA(n_components=0.90)
    # Fit and transform in one call.
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    # Explicit return for consistency with the other demo functions
    # (the original fell off the end without one).
    return None
if __name__ == '__main__':
    # Each demo is toggled by (un)commenting its call.
    # count_demo()
    # count_chinese_demo()
    # print(cut_word("我爱北京天安门"))
    # count_chinese_demo2()
    # Text feature extraction with TF-IDF weighting.
    # tfidf_demo()
    # Min-max normalization.
    # minmax_demo()
    # Standardization (zero mean, unit variance).
    # stand_demo()
    # variance_demo()
    # PCA dimensionality reduction — currently the active demo.
    pca_demo()
# ---------------------------------------------------------------------------
# Case study: segment Instacart users by aisle preference, reduced with PCA.
# Steps: load data -> merge tables -> user_id x aisle crosstab -> PCA.
#
# NOTE(review): this notebook-paste sits *below* the __main__ guard at module
# level, so it also runs on import. Consider moving it into a function and
# calling it from the guard.
# ---------------------------------------------------------------------------
import pandas as pd

# 1. Load the four raw tables.
aisles = pd.read_csv("./aisles.csv")
order_products = pd.read_csv("./order_products__prior.csv")
orders = pd.read_csv("./orders.csv")
products = pd.read_csv("./products.csv")

# 2. Merge into one table. Each merge key is given once — the original
#    passed the same column twice (e.g. on=["aisle_id", "aisle_id"]).
tab1 = pd.merge(aisles, products, on="aisle_id")
tab2 = pd.merge(tab1, order_products, on="product_id")
tab3 = pd.merge(tab2, orders, on="order_id")

# 3. Cross-tabulate user_id against aisle: one row per user, one column per
#    aisle, cell = number of orders touching that aisle.
table = pd.crosstab(tab3["user_id"], tab3["aisle"])
# Limit to the first 10,000 users to keep the PCA fast.
data = table[:10000]

# 4. PCA: keep enough components to explain 95% of the variance.
from sklearn.decomposition import PCA

transfer = PCA(n_components=0.95)
data_new = transfer.fit_transform(data)
# The original ended with a bare `data_new.shape` expression — a notebook
# leftover that does nothing in a script; print it instead.
print("data_new.shape:", data_new.shape)