2021-08-14

最新推荐文章于 2021-11-17 11:36:58 发布

过眼云烟

最新推荐文章于 2021-11-17 11:36:58 发布

阅读量157

点赞数

本文链接：https://blog.csdn.net/weixin_52443264/article/details/119707653

版权

文章目录

pandas之生成时间序列
pandas之groupby单个分组
pandas之groupby多个分组
统计电影分类的genre的情况
pandas、matplotlib出店铺排名前10的国家
pandas、matplotlib呈现出每个中国城市的店铺数量横着的条形统计图
pandas、matplotlib绘制不同年份的书的数量
pandas、matplotlib统计不同年份书的平均评分的情况
911之不同月份不同类型的电话的次数的变化情况
911之不同月份电话次数的变化情况
911之统计不同月份的紧急情况的次数
911数据分析之统计不同类型的紧急情况的次数
机器学习

pandas之生成时间序列

import pandas as pd
import numpy as np

# Build date ranges with pd.date_range(start=..., end=..., freq=...).
# freq="D" steps one calendar day at a time.
daily_range = pd.date_range(start="20171230", end="20180110", freq="D")
print(daily_range)
# DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01', '2018-01-02',
#                '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06',
#                '2018-01-07', '2018-01-08', '2018-01-09', '2018-01-10'],
#               dtype='datetime64[ns]', freq='D')

# freq="10D" keeps one date every ten days.
ten_day_range = pd.date_range(start="2017/12/30", end="20180131", freq="10D")
print(ten_day_range)
# DatetimeIndex(['2017-12-30', '2018-01-09', '2018-01-19', '2018-01-29'], dtype='datetime64[ns]', freq='10D')

# periods=10 asks for exactly ten entries instead of supplying an end date.
ten_days = pd.date_range(start="2017-12-30", periods=10, freq="D")
print(ten_days)  # ten consecutive days starting 2017-12-30
# With freq="M" the ten entries are month-end dates (see the output below), not days.
# NOTE(review): newer pandas releases spell this alias "ME" - confirm against the installed version.
ten_month_ends = pd.date_range(start="20171230", periods=10, freq="M")
print(ten_month_ends)
# DatetimeIndex(['2017-12-31', '2018-01-31', '2018-02-28', '2018-03-31',
#                '2018-04-30', '2018-05-31', '2018-06-30', '2018-07-31',
#                '2018-08-31', '2018-09-30'],
#               dtype='datetime64[ns]', freq='M')


# Use four consecutive dates starting 2017-01-01 as the row index of a
# single-column DataFrame filled with random numbers.
index = pd.date_range("2017-01-01", periods=4)
random_column = np.random.rand(4)
df = pd.DataFrame(random_column, index=index)
print(df)

pandas之groupby单个分组

import pandas as pd
import numpy as np

# Load the store directory; every later step works off this DataFrame.
file_path = "./directory.csv"
df = pd.read_csv(file_path)
print(df.head(1))
print(df.info())
print(type(df))  # DataFrame

# Group the rows by country; the result is a DataFrameGroupBy object.
grouped = df.groupby(by="Country")
print(grouped)  # <pandas.core.groupby.generic.DataFrameGroupBy object at 0x...>

# Iterating a DataFrameGroupBy yields (group key, sub-DataFrame) tuples.
# Write `for key, frame in grouped:` to handle the two parts separately.
for group_pair in grouped:
    print(group_pair)

# Aggregate: count() on the selected "Brand" column gives the number of
# non-null Brand values per country, as a Series indexed by country.
brand_groups = grouped["Brand"]
country_count = brand_groups.count()
print(type(country_count))  # Series
print(country_count["US"])  # 13608
print(country_count["CN"])  # 2734
print(type(country_count["CN"]))  # int64
print(type(grouped.count()))  # DataFrame (count() applied to every column)
print(type(brand_groups))  # SeriesGroupBy
print(brand_groups)  # <pandas.core.groupby.generic.SeriesGroupBy object at 0x...>
print(country_count)
# Country
# AD        1
# AE      144
# AR      108
# AT       18
# AU       22
#       ...
# TT        3
# TW      394
# US    13608
# VN       25
# ZA        3
# Name: Brand, Length: 73, dtype: int64

# Count stores per Chinese province: filter to CN rows, group by province,
# then read the "Brand" column of the per-group counts.
is_china = df["Country"] == "CN"
china_data = df[is_china]
province_counts = china_data.groupby(by="State/Province").count()
grouped = province_counts["Brand"]
print(grouped)

pandas之groupby多个分组

import pandas as pd

file_path = "./directory.csv"
df = pd.read_csv(file_path)
# Selecting with a list of column names returns a DataFrame, even for one column.
print(type(df[["Brand"]]))  # DataFrame
# Selecting with a bare column name returns a Series.
print(type(df["Brand"]))

# Group by several keys at once.
# groupby() is called on the Series df["Country"], which has no named columns
# of its own, so the keys are passed as Series objects rather than as labels.
country_key = df["Country"]
province_key = df["State/Province"]
grouped = country_key.groupby(by=[country_key, province_key]).count()
print(grouped)  # a Series

# Three equivalent ways to group by several keys and get a DataFrame back:
# 1) select the column as a DataFrame first, then group
grouped1 = df[["Brand"]].groupby(by=[country_key, province_key]).count()
# 2) group the full frame, select the column as a DataFrame, then aggregate
grouped2 = df.groupby(by=[country_key, province_key])[["Brand"]].count()
# 3) group and aggregate the full frame, then select the column as a DataFrame
grouped3 = df.groupby(by=[country_key, province_key]).count()[["Brand"]]

separator = "*" * 100
print(grouped1, type(grouped1))  # DataFrame
print(separator)
print(grouped2, type(grouped2))  # DataFrame
print(separator)
print(grouped3, type(grouped3))  # DataFrame

统计电影分类的genre的情况

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

file_path = "./IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

print(df["Genre"])

# Split every Genre cell on "," so each movie gets a list of labels.
temp_list = df["Genre"].str.split(",").tolist()  # [[...], [...], ...]
# Flatten the nested lists and de-duplicate to get the distinct labels.
genre_list = list({label for labels in temp_list for label in labels})

# One row per movie, one column per genre, all zeros to start with.
row_count = df.shape[0]
zeros_df = pd.DataFrame(np.zeros((row_count, len(genre_list))), columns=genre_list)
print(zeros_df)

# Set the cells for the genres each movie belongs to.
for row, labels in enumerate(temp_list):
    zeros_df.loc[row, labels] = 1

print(zeros_df.head(3))

# Summing down each column (axis=0) gives the number of movies per genre.
genre_count = zeros_df.sum(axis=0)
print(genre_count)
# Thriller     195.0
# Family        51.0
# Drama        513.0
# History       29.0
# Sport         18.0
# Musical        5.0
# Action       303.0
# Romance      141.0
# War           13.0
# Sci-Fi       120.0
# Adventure    259.0
# Fantasy      101.0
# Crime        150.0
# Animation     49.0
# Mystery      106.0
# Biography     81.0
# Horror       119.0
# Music         16.0
# Western        7.0
# Comedy       279.0
# dtype: float64

# Sort so the bars appear in ascending order.
genre_count = genre_count.sort_values()

plt.figure(figsize=(20, 8), dpi=80)

# Bar chart: the Series index supplies the labels, its values the heights.
_x = genre_count.index
_y = genre_count.values
positions = range(len(_x))
plt.bar(positions, _y)
plt.xticks(positions, _x)  # label each bar position with its genre name
plt.show()

在这里插入图片描述

pandas、matplotlib出店铺排名前10的国家

import pandas as pd
from matplotlib import pyplot as plt
file_path = "./directory.csv"
df = pd.read_csv(file_path)

# Count non-null Brand entries per country, sort descending
# (ascending=False) and keep the ten largest.
data1 = df.groupby(by="Country").count()["Brand"].sort_values(ascending=False)[:10]

_x = data1.index   # country codes, used as tick labels
_y = data1.values  # store counts, used as bar heights

plt.figure(figsize=(20, 8), dpi=80)

# Bar heights are the store counts; the country codes only label the ticks.
plt.bar(range(len(_x)), _y)

plt.xticks(range(len(_x)), _x)

plt.show()

在这里插入图片描述

pandas、matplotlib呈现出每个中国城市的店铺数量横着的条形统计图

import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font used to render the tick labels.
# Raw string: the backslashes in the Windows path are literal, not escapes.
my_font = font_manager.FontProperties(fname=r"c:\Windows\Fonts\simsun.ttc")
file_path = "./directory.csv"
df = pd.read_csv(file_path)
# Keep only the rows whose Country is CN.
df = df[df["Country"] == 'CN']
# Count non-null Brand entries per city, sort descending (ascending=False),
# keep the first 25.
data1 = df.groupby(by="City").count()["Brand"].sort_values(ascending=False)[:25]

_x = data1.index   # city names, used as tick labels
_y = data1.values  # store counts, used as bar lengths

plt.figure(figsize=(20, 10), dpi=80)

# Horizontal bars: for barh, `height` is the thickness of each bar.
plt.barh(range(len(_x)), _y, height=0.3, color="orange")

plt.yticks(range(len(_x)), _x, fontproperties=my_font)

plt.show()

在这里插入图片描述

pandas、matplotlib绘制不同年份的书的数量

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

file_path = "./books.csv"

df = pd.read_csv(file_path)
# np.count_nonzero(df.isna()) reports how many NaN cells the frame holds
# (2975 for the author's copy of the file).

# Keep only the rows that have a publication year.
has_year = df["original_publication_year"].notna()
data1 = df[has_year]

# Number of non-null titles per publication year.
per_year = data1.groupby(by="original_publication_year").count()
grouped = per_year["title"]

x = grouped.index   # publication years
y = grouped.values  # book counts

plt.figure(figsize=(20, 8), dpi=80)

plt.scatter(x, y)

plt.show()

在这里插入图片描述

pandas、matplotlib统计不同年份书的平均评分的情况

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

file_path = "./books.csv"

df = pd.read_csv(file_path)

# Drop the rows without a publication year.
data1 = df[df["original_publication_year"].notna()]

# Mean of average_rating for every publication year.
ratings = data1["average_rating"]
year_key = data1["original_publication_year"]
grouped = ratings.groupby(by=year_key).mean()

_x = grouped.index
_y = grouped.values

# Line chart, plotted against positions 0..n-1 rather than the raw years.
plt.figure(figsize=(20, 8), dpi=80)
positions = range(len(_x))
plt.plot(positions, _y)

# Label every tenth position with its year, cast to int for display.
tick_positions = list(range(0, len(_x), 10))
tick_labels = _x[::10].astype(int)
plt.xticks(tick_positions, tick_labels, rotation=90)
plt.show()

# Iterating the un-aggregated groupby yields two-element tuples:
# the year, and the Series of ratings (with their row labels) for that year.
grouped1 = ratings.groupby(by=year_key)
for year_and_ratings in grouped1:
    print(year_and_ratings)
print(type(grouped1))

在这里插入图片描述

911之不同月份不同类型的电话的次数的变化情况

#911数据中不同月份不同类型的电话的次数的变化情况
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Parse the timestamp strings into datetimes; the column becomes the index
# further down.
df = pd.read_csv("./911.csv")
df["timeStamp"] = pd.to_datetime(df["timeStamp"])

# Keep the text before the first ": " of each title as its category.
temp_list = df["title"].str.split(": ").tolist()
cate_list = [pieces[0] for pieces in temp_list]
# Add the categories as a new column.
cate_column = np.array(cate_list).reshape((df.shape[0], 1))
df["cate"] = pd.DataFrame(cate_column)

# The index is switched only now: the frame built above is labelled 0, 1, 2, ...
# and would no longer line up with df once df is indexed by timestamp.
df.set_index("timeStamp", inplace=True)

print(df.head(1))

# One shared figure for all categories.
plt.figure(figsize=(20, 8), dpi=80)

# One line per category.
for group_name, group_data in df.groupby(by="cate"):
    # Rows are months, values are the number of non-null titles in that month.
    monthly_counts = group_data.resample("M").count()
    count_by_month = monthly_counts["title"]

    month_index = count_by_month.index
    print(month_index)
    _y = count_by_month.values
    # Render each month stamp as YYYYMMDD text.
    _x = [stamp.strftime("%Y%m%d") for stamp in month_index]
    # label feeds the legend entry for this line.
    plt.plot(range(len(_x)), _y, label=group_name)

# NOTE(review): the tick labels come from the last category plotted; this
# presumably relies on every category covering the same months - confirm.
plt.xticks(range(len(_x)), _x, rotation=45)
# loc picks where the legend is placed.
plt.legend(loc="best")
plt.show()

在这里插入图片描述

911之不同月份电话次数的变化情况

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

df = pd.read_csv("./911.csv")

# Convert the timeStamp strings to datetimes with pd.to_datetime().
# NOTE(review): the format string has no time-of-day fields; confirm it
# matches the strings stored in the file.
parsed_stamps = pd.to_datetime(df["timeStamp"], format="%Y-%m-%d")
df["timeStamp"] = parsed_stamps

# Make timeStamp the index; inplace=True modifies df itself.
df.set_index("timeStamp", inplace=True)
print(df.head())

# Calls per month: resample("M") buckets the rows by month and count()
# counts the non-null cells; the "title" column is read as the call count.
monthly_counts = df.resample("M").count()
count_by_month = monthly_counts["title"]

_y = count_by_month.values
# strftime() renders each month stamp as YYYYMMDD, dropping the time part.
_x = [stamp.strftime("%Y%m%d") for stamp in count_by_month.index]

plt.figure(figsize=(20, 8), dpi=80)

positions = range(len(_x))
plt.plot(positions, _y)

plt.xticks(positions, _x, rotation=45)

plt.show()

在这里插入图片描述

911之统计不同月份的紧急情况的次数

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np


df = pd.read_csv("./911.csv")
# Split each title at ":" and keep the leading piece as the category.
temp_list = df["title"].str.split(":").tolist()  # [[...], [...]]
cate_list = [pieces[0] for pieces in temp_list]

# Wrap the categories in a one-column DataFrame named "cate".
cate_values = np.array(cate_list).reshape((df.shape[0], 1))
cate_df = pd.DataFrame(cate_values, columns=["cate"])

# Merge the two frames side by side (axis=1 concatenates columns).
df = pd.concat([df, cate_df], axis=1)

# Group by category and count; the "title" column gives calls per category.
counts_per_cate = df.groupby(by="cate").count()
print(counts_per_cate["title"])
# cate
# EMS        332692
# Fire       100622
# Traffic    230208
# Name: title, dtype: int64

# Alternative: assign the column directly instead of concatenating, e.g.
# df["cate"] = pd.DataFrame(np.array(cate_list).reshape((df.shape[0], 1)))
#
# reshape((rows, 1)) turns the flat array into a single column, the same way
# np.arange(12).reshape((3, 4)) turns twelve numbers into a 3x4 grid.

911数据分析之统计不同类型的紧急情况的次数

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

df = pd.read_csv("./911.csv")

print(df.info())

# Split each title at ":"; the leading piece is the category.
temp_list = df["title"].str.split(":").tolist()  # [[...], [...]]

# De-duplicate the categories and turn them back into a list.
cate_list = list(set([i[0] for i in temp_list]))
print(cate_list)  # ['Fire', 'Traffic', 'EMS']

# One row per call, one column per category, all zeros to start with.
# `columns` has to be a list, hence the list() conversion above.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(cate_list))), columns=cate_list)
print(zeros_df)

# str.contains() returns a boolean Series marking the titles that contain
# the category text. Assign through a single .loc call: the chained form
# zeros_df[cate][mask] = 1 may write to a temporary copy and leave
# zeros_df unchanged.
for cate in cate_list:
    zeros_df.loc[df["title"].str.contains(cate), cate] = 1

# Alternative: set one cell per row from the already-split titles.
# for i in range(df.shape[0]):
#     zeros_df.loc[i, temp_list[i][0]] = 1

# Summing down each column (axis=0) gives the number of calls per category.
sum_ret = zeros_df.sum(axis=0)

plt.figure(figsize=(20, 8), dpi=80)
_x = sum_ret.index
_y = sum_ret.values

plt.bar(_x, _y, width=0.3)

plt.show()
# axis=1 aggregates across the columns of each row (left to right)
# axis=0 aggregates down the rows of each column (top to bottom)
# shape[0] is the number of rows
# shape[1] is the number of columns

在这里插入图片描述

机器学习

定义

机器学习是从数据中自动分析获得模型，并利用模型对未知数据进行预测。

我们人从大量的日常经验中归纳规律，当面临新的问题的时候，就可以利用以往总结的规律去分析现实状况，采取最佳策略。

从数据（大量的猫和狗的图片）中自动分析获得模型（辨别猫和狗的规律），从而使机器拥有识别猫和狗的能力。

从数据（房屋的各种信息）中自动分析获得模型（判断房屋价格的规律），从而使机器拥有预测房屋价格的能力。

数据集的构成

结构：特征值+目标值

对于每一行数据我们可以称之为样本。
有些数据集可以没有目标值：

可用数据集

在这里插入图片描述
Kaggle网址：https://www.kaggle.com/datasets

UCI数据集网址： http://archive.ics.uci.edu/ml/

scikit-learn网址：http://scikit-learn.org/stable/datasets/index.html#datasets

sklearn数据集

1 scikit-learn数据集API介绍
sklearn.datasets
加载获取流行数据集
datasets.load_*()
获取小规模数据集，数据包含在datasets里
datasets.fetch_*(data_home=None)
获取大规模数据集，需要从网络上下载，函数的第一个参数是data_home，表示数据集下载的目录,默认是 ~/scikit_learn_data/

2 sklearn小数据集
sklearn.datasets.load_iris()

加载并返回鸢尾花数据集

sklearn.datasets.load_boston()

加载并返回波士顿房价数据集（注意：load_boston 已在 scikit-learn 1.2 中被移除，新版本中需要改用其他数据集，例如 fetch_california_housing）

3 sklearn大数据集
sklearn.datasets.fetch_20newsgroups(data_home=None,subset='train')
subset：'train'、'test' 或 'all'，可选，选择要加载的数据集。
'train' 表示训练集，'test' 表示测试集，'all' 表示两者全部

数据集划分api

sklearn.model_selection.train_test_split(*arrays, **options)
x 数据集的特征值
y 数据集的标签值
test_size 测试集的大小，一般为float
random_state 随机数种子,不同的种子会造成不同的随机采样结果。相同的种子采样结果相同。
return 训练集特征值，测试集特征值，训练集目标值，测试集目标值（默认随机划分）

实例：鸢尾花

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


def datasets_demo():
    """
    Print the parts of the iris dataset and show how the random seed
    affects train_test_split.

    :return: None
    """
    # Step 1: load the dataset. The returned object is a Bunch, a dict
    # subclass, so fields are reachable by key or by attribute.
    iris = load_iris()
    features = iris.data
    targets = iris.target
    print("鸢尾花数据集的返回值：\n", iris)
    print("鸢尾花的特征值:\n", features)
    print("鸢尾花的目标值：\n", targets)
    print("鸢尾花特征的名字：\n", iris.feature_names)
    print("鸢尾花目标值的名字：\n", iris.target_names)
    print("鸢尾花的描述：\n", iris.DESCR)

    # Step 2: split the dataset. train_test_split returns, in order:
    # training features, test features, training targets, test targets.
    x_train = train_test_split(features, targets, random_state=22)[0]
    print("x_train:\n", x_train.shape)

    # Two more splits that share the seed 6, for comparison with the seed-22 split.
    x_train1 = train_test_split(features, targets, random_state=6)[0]
    x_train2 = train_test_split(features, targets, random_state=6)[0]
    print("如果随机数种子不一致：\n", x_train == x_train1)
    print("如果随机数种子一致：\n", x_train1 == x_train2)

特征工程

特征工程包含内容

特征抽取
特征预处理
特征降维

特征抽取

字典特征提取

作用：对字典数据进行特征值化

from sklearn.feature_extraction import DictVectorizer


def dict_demo():
    """Vectorize a small list of dicts with DictVectorizer and print the result.

    Prints the transformed array followed by the generated feature names.

    :return: None
    """
    data = [{'city': '北京', 'temperature': 100}, {'city': '上海', 'temperature': 60},
            {'city': '深圳', 'temperature': 30}]

    # sparse=False makes fit_transform return a dense array rather than a
    # scipy sparse matrix.
    transfer = DictVectorizer(sparse=False)
    # Learn the feature layout from the dicts and convert them in one call.
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    # get_feature_names() was removed from scikit-learn; get_feature_names_out()
    # is its replacement. It returns an ndarray, so convert to a plain list
    # to keep the printed form a list.
    print("特征名字:\n", transfer.get_feature_names_out().tolist())


if __name__ == "__main__":
    dict_demo()


# sklearn.feature_extraction.DictVectorizer(sparse = True)
# 将映射列表转换为Numpy数组或scipy.sparse矩阵
#如果返回的是sparse矩阵，那就是下面这种类型，将为0值用位置表示出来，节省内存，提高加载效率
# data_new:
#    (0, 1)	1.0
#   (0, 3)	100.0
#   (1, 0)	1.0
#   (1, 3)	60.0
#   (2, 2)	1.0
#   (2, 3)	30.0

#sparse为False时是这种形式，把city这个特征值转换为了3个数字表示(one-hot编码)
# data_new:
#sparse默认为True
#  [[  0.   1.   0. 100.]
#  [  1.   0.   0.  60.]
#  [  0.   0.   1.  30.]]
# 特征名字:
#  ['city=上海', 'city=北京', 'city=深圳', 'temperature']
#第一列表示上海，样本数据是上海的话就是1，不是就为0
#第二列表示北京，样本数据是北京的话就是1，不是就为0
#第三列表示深圳，样本数据是深圳的话就是1，不是就为0
#第四列表示temperature

#对于特征中存在类别信息的，我们都会做one-hot编码处理

文本特征抽取

from sklearn.feature_extraction.text import CountVectorizer



data = ["life is short,i like python",
        "life is too long,i dislike python"]
# CountVectorizer has no `sparse` parameter; call toarray() on the result to
# turn the sparse matrix into a 2-D array.
# stop_words lists the words to leave out of the vocabulary: "is" and "too" here.
transfer = CountVectorizer(stop_words=["is", "too"])

# fit_transform counts how often each vocabulary word occurs in each text.
data_new = transfer.fit_transform(data)

print("data_new:\n", data_new.toarray())
# data_new:
#  [[0 1 1 0 1 1]
#  [1 1 0 1 1 0]]

# For English text, single letters (such as "i") are not treated as feature words.
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. It returns an ndarray, hence the tolist().
print("特征名字:\n", transfer.get_feature_names_out().tolist())
# ['dislike', 'life', 'like', 'long', 'python', 'short']

# For Chinese text, single characters are likewise not treated as feature
# words, and the words must already be separated by spaces.
data1 = ["我 爱 北京 天安门", "天安门 上 太阳 升"]

transfer1 = CountVectorizer()

data_new1 = transfer1.fit_transform(data1)

print("data_new1:\n", data_new1.toarray())

print("特征名字:\n", transfer1.get_feature_names_out().tolist())


#中文特征提取，自动提取

jieba分词之中文文本提取

from sklearn.feature_extraction.text import CountVectorizer
import jieba

text = "我爱北京天安门,天安门上太阳升"
# jieba.cut() yields the segmented words; list() collects them for printing.
segmented = list(jieba.cut(text))
print(segmented)
# ['我', '爱', '北京', '天安门', ',', '天安门', '上', '太阳升']

# str.join() glues the elements of a sequence together with the given
# separator, producing one new string.
a = " ".join(jieba.cut(text))
print(a)

# Joining a plain string treats every character as a separate element.
b = "-".join("我是一个粉刷匠，粉刷本领强")
print(b)

def cut_word(text):
    """Segment `text` with jieba and return the words joined by single spaces."""
    words = jieba.cut(text)
    return " ".join(words)


def count_chinese():
    """Segment three Chinese sentences, then count words with CountVectorizer.

    Prints the segmented sentences, the count matrix and the feature names.

    :return: None
    """
    data = ["一种还是一种今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的，这样当我们看到宇宙时，我们是在看它的过去。",
            "如果只用一种方式了解某样事物，你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    # Segment every sentence into a space-separated string first.
    data_new = [cut_word(sent) for sent in data]
    print(data_new)
    # 1. Instantiate the transformer.
    transfer = CountVectorizer()
    # 2. Fit on the segmented sentences and build the count matrix.
    data_final = transfer.fit_transform(data_new)
    print("data_new:\n", data_final.toarray())
    # get_feature_names() was removed from scikit-learn; get_feature_names_out()
    # is its replacement. It returns an ndarray, hence the tolist().
    print("特征名字:\n", transfer.get_feature_names_out().tolist())


if __name__ == "__main__":
    count_chinese()

在这里插入图片描述

Tf-idf之文本特征抽取

from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
#TF-IDF的主要思想是∶如果某个词或短语在一篇文章中
# 出现的概率高，并且在其他文章中很少出现，则认为此
# 词或者短语具有很好的类别区分能力，适合用来分类。
# TF-IDF作用:用以评估一字词对于一个文件集或一个语
# 料库中的其中一份文件的重要程度。

# 词频(term frequency，tf)指的是某一个给定的词语在该
# 文件中出现的频率·
# 逆向文档频率(inverse document frequency，idf)是一
# 个词语普遍重要性的度量。某一特定词语的idf，可以由
# 总文件数目除以包含该词语之文件的数目，再将得到的商
# 取以10为底的对数得到

# 关键词：在某一个类别的文章中，出现的次数很多，但是在其他类别的文章当中出现很少
# 方法2：TfidfVectorizer
# TF-IDF - 重要程度
# 两个词 “经济”，“非常”
#1000篇文章-语料库
#100篇文章 - "非常"
# 10篇文章 - “经济”
#两篇文章
#文章A(100词) : 10次“经济” TF-IDF:0.2
#tf:10/100 = 0.1
# idf:lg 1000/10 = 2
# 文章B(100词) : 10次“非常” TF-IDF:0.1
# tf:10/100 = 0.1
# idf: log 10 1000/100 = 1
#对数？
# 2 ^ 3 = 8
# log 2 8 = 3
# log 10 10 = 1
#TF - 词频（term frequency，tf)
#IDF - 逆向文档频率
#tfidf=tf*idf:最终得出的结果可以理解为重要程度
def cut_word(text):
    """Segment `text` with jieba and return the words joined by single spaces."""
    words = jieba.cut(text)
    return " ".join(words)


def count_chinese():
    """Segment three Chinese sentences, then weight words with TfidfVectorizer.

    Prints the segmented sentences, the TF-IDF matrix and the feature names.

    :return: None
    """
    data = ["一种还是一种今天很残酷，明天更残酷，后天很美好，但绝对大部分是死在明天晚上，所以每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的，这样当我们看到宇宙时，我们是在看它的过去。",
            "如果只用一种方式了解某样事物，你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    # Segment every sentence into a space-separated string first.
    data_new = [cut_word(sent) for sent in data]
    print(data_new)
    # 1. Instantiate the transformer that computes the TF-IDF values.
    transfer = TfidfVectorizer()
    # 2. Fit on the segmented sentences and build the TF-IDF matrix.
    data_final = transfer.fit_transform(data_new)
    # toarray() turns the sparse matrix into a 2-D array.
    print("data_new:\n", data_final.toarray())
    # get_feature_names() was removed from scikit-learn; get_feature_names_out()
    # is its replacement. It returns an ndarray, hence the tolist().
    print("特征名字:\n", transfer.get_feature_names_out().tolist())


if __name__ == "__main__":
    count_chinese()

在这里插入图片描述

特征处理

归一化

from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

#翻译过来:通过一些转换函数将特征数据转换成更加适合
#  算法模型的特征数据过程

# 为什么我们要进行 归一化/标准化?
# 特征的单位或者大小相差较大，或者某特征的方差相比
# 其他的特征要大出几个数量级，容易影响(支配)目标结
# 果，使得一些算法无法学习到其它的特征

# 归一化:定义
# 通过对原始数据进行变换把数据映射到(默认为[0,1])
# 之间

#归一化缺点
#最大值和最小值有可能是异常值
# 注意最大值最小值是变化的，另外，最大值与最小值非常容
# 易受异常点影响,所以这种方法鲁棒性较差,只适合传统精确
# 小数据场景。


def minmax_demo():
    """Rescale the first three columns of dating.csv with MinMaxScaler and print them.

    :return: None
    """
    # 1. Load the data and keep only the first three columns.
    data = pd.read_csv("dating.csv")
    data = data.iloc[:, :3]
    # 2. Instantiate the transformer; feature_range sets the target interval,
    #    here 2 to 3. It is passed as a tuple because recent scikit-learn
    #    releases validate this parameter and reject a list.
    transfer = MinMaxScaler(feature_range=(2, 3))
    # 3. Fit on the data and rescale it in one call.
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)


if __name__ == "__main__":
    minmax_demo()

标准化

# 对于归一化来说:如果出现异常点。影响了最大值和最小值，
# 那么结果显然会发生改变
# 对于标准化来说:如果出现异常点，由于具有一定数据量，
# 少量的异常点对于平均值的影响并不大，从而方差改变较小。
# 对于标准化来说:如果出现异常点，由于具有一定数据量，
# 少量的异常点对于平均值的影响并不大，从而方差改变较小。

# sklearn.preprocessing.StandardScaler( )
# 处理之后，每一列的数据都聚集在均值0附近，标准差为1
# StandardScaler.fit_transform(X)
# X:numpy array格式的数据[n_samples,n_features]
# 返回值：转换后的形状相同的array