Common Machine Learning Packages

Get to know the packages commonly used in machine learning.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import pearsonr


import jieba
import pandas as pd
import matplotlib.pyplot as plt

def datasets_demo():
    """
    Basic usage of sklearn's built-in datasets.
    :return:
    """
    iris = load_iris()
    print("Iris dataset:\n", iris)
    print("Dataset description:\n", iris["DESCR"])

    # Split into training and test sets (80% / 20%)
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("Shape of the training-set features:\n", x_train.shape)
    return None
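
If you want the class proportions preserved across the two splits, train_test_split also accepts an optional stratify argument; a minimal sketch (same iris data, stratified on the labels):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
# Keep the 50/50/50 class balance in both splits
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=22, stratify=iris.target)
print(y_train.shape, y_test.shape)  # (120,) and (30,)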

# Dictionary feature extraction
def dict_demo():
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}, {'city': '深圳', 'temperature': 30}]
    # 1. Instantiate a transformer
    transfer = DictVectorizer(sparse=False)
    # 2. Call fit_transform
    datanew = transfer.fit_transform(data)
    print("datanew:\n", datanew)
    print("Feature names:\n", transfer.get_feature_names_out())
    return None
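
DictVectorizer one-hot encodes the string-valued 'city' key into one binary column per city and passes the numeric 'temperature' through unchanged. With sparse=False it returns a dense NumPy array; the default sparse=True yields a scipy CSR matrix, which is usually preferable for wide one-hot vocabularies. A minimal sketch of the sparse path (same kind of data, default settings):

from sklearn.feature_extraction import DictVectorizer

data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60}]
transfer = DictVectorizer()              # sparse=True by default
sparse_matrix = transfer.fit_transform(data)
print(sparse_matrix)                     # (row, col) value entries of a CSR matrix
print(sparse_matrix.toarray())           # densify to see the one-hot columns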

def count_demo():
    """
    Text feature extraction with CountVectorizer.
    :return:
    """
    data = ["oh crazy i like like python", "omg i dislike python"]
    transfer = CountVectorizer()
    datanew = transfer.fit_transform(data)
    print("Feature names:\n", transfer.get_feature_names_out())
    print("datanew:\n", datanew.toarray())  # toarray() densifies the sparse matrix returned by default
    return None
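
Note that CountVectorizer's default token_pattern only keeps tokens of two or more word characters, so the single letter "i" above never enters the vocabulary. A short sketch showing how to drop further tokens with an explicit stop-word list:

from sklearn.feature_extraction.text import CountVectorizer

data = ["oh crazy i like like python", "omg i dislike python"]
transfer = CountVectorizer(stop_words=["oh", "omg"])   # drop the interjections explicitly
counts = transfer.fit_transform(data)
print(transfer.get_feature_names_out())   # e.g. ['crazy', 'dislike', 'like', 'python']
print(counts.toarray())                   # raw term counts per document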

def cut_word(text):
    """
    Segment a Chinese string into space-separated words,
    e.g. "我爱北京天安门" -> "我 爱 北京 天安门"
    :param text:
    :return: text
    """
    # Segment the Chinese string with jieba
    text = " ".join(list(jieba.cut(text)))

    return text
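
jieba.cut returns a generator of tokens, so joining them with spaces produces the whitespace-delimited form that CountVectorizer and TfidfVectorizer expect from their default tokenizer. A quick usage check:

print(cut_word("我爱北京天安门"))  # expected: 我 爱 北京 天安门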

def text_chinese_tfidf_demo():
    """
    Chinese text feature extraction with TF-IDF.
    :return: None
    """
    data = ["在我过去四十余年的生涯中,冬的情味尝得最深刻的要算十年前初移居的时候了,",
            "十年以来,白马湖已成了一个小村落,当我移居的时候,还是一片荒野,",
            "此外两三里内没有人烟。一家人于阴历十一月下旬从热闹的杭州移居于这荒凉的山野,宛如投身于极带中。"]
    # Convert the raw sentences into space-separated word lists first
    text_list = []
    for sent in data:
        text_list.append(cut_word(sent))
    print(text_list)


    # 1. Instantiate a transformer
    transfer = TfidfVectorizer(stop_words=['一种', '不会', '不要'])
    # 2. Call fit_transform
    data = transfer.fit_transform(text_list)
    print("TF-IDF feature extraction result:\n", data.toarray())
    print("Feature names:\n", transfer.get_feature_names_out())

    return None
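
TF-IDF weights each term by its frequency within a document, scaled down by how common the term is across the corpus. With sklearn's documented defaults (smooth_idf=True, l2 norm) the inverse document frequency is idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each row is then L2-normalized. A hand-rolled NumPy sketch of that computation, for intuition only:

import numpy as np

# Tiny two-document, two-term corpus as raw counts
counts = np.array([[2.0, 1.0],     # doc0: term A x2, term B x1
                   [0.0, 3.0]])    # doc1: term B x3
n_docs = counts.shape[0]
df = (counts > 0).sum(axis=0)                  # document frequency per term
idf = np.log((1 + n_docs) / (1 + df)) + 1      # sklearn's smoothed idf
tfidf = counts * idf
tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # row-wise L2 normalization
print(tfidf)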


# Min-max normalization demo
def minmax_demo():
    data = pd.read_csv("dating.txt")
    print(data)
    transfer = MinMaxScaler(feature_range=(0, 1))
    data = transfer.fit_transform(data[['milage', 'Liters', 'Consumtime']])
    print("Min-max normalization result:\n", data)
    return None
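
MinMaxScaler rescales every column independently to the requested range via x' = (x - min) / (max - min), which means a single extreme outlier can squash the rest of the column. A tiny self-contained check against the manual formula:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
# Manual equivalent: (X - col_min) / (col_max - col_min)
manual = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
print(np.allclose(scaled, manual))   # True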

def standard_demo():
    data = pd.read_csv("dating.txt")
    print(data)
    transfer = StandardScaler()
    data = transfer.fit_transform(data[['milage', 'Liters', 'Consumtime']])
    print("Standardization result:\n", data, "\n")
    print("Per-column mean:", transfer.mean_, "\n")
    print("Per-column variance:", transfer.var_, "\n")
    return None
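
StandardScaler instead maps each column to zero mean and unit variance via x' = (x - mean) / std (population standard deviation), which is unbounded but far less sensitive to outliers than min-max scaling. A quick check against the manual formula:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
scaled = StandardScaler().fit_transform(X)
manual = (X - X.mean(axis=0)) / X.std(axis=0)   # population std (ddof=0), as sklearn uses
print(np.allclose(scaled, manual))              # True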

# Filter out low-variance features
def filter_demo():
    data = pd.read_csv("factor_returns.csv")
    # print("Data:\n", data, "\n")
    transfer = VarianceThreshold(threshold=20)
    datanew = transfer.fit_transform(data.iloc[:, 1:-2])  # [:, 1:-2]: all rows; columns 1 up to (but not including) the second-to-last
    # print("Result:\n", datanew, datanew.shape, "\n")


    # Pearson correlation coefficient
    r = pearsonr(data["revenue"], data["total_expense"])
    print("Correlation coefficient:\n", r, "\n")
    # Visualize the relationship with a scatter plot
    plt.figure(figsize=(20, 8), dpi=100)
    plt.scatter(data['revenue'], data['total_expense'])
    plt.show()
    return None
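
pearsonr returns two values: the correlation coefficient in [-1, 1] and a two-sided p-value. A coefficient near ±1 indicates a strong linear relationship, which is why one of a highly correlated feature pair is a candidate for removal or merging. A small sketch on synthetic data:

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
x = rng.normal(size=200)
y = 2 * x + rng.normal(scale=0.1, size=200)   # y is nearly linear in x
r, p_value = pearsonr(x, y)
print(r)        # close to 1.0: strong positive linear correlation
print(p_value)  # tiny: the correlation is statistically significant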

if __name__ == '__main__':
    filter_demo()