【python机器学习手册】第三章 数据整理

本章节深入探讨Python在机器学习中用于数据整理的实践,涉及不同规模的数据集,包括239行×2列、792行×17列及100000行×1列的数据集,讲解如何有效地清洗、转换和准备数据以供模型训练。
摘要由CSDN通过智能技术生成
#3.1
import pandas as pd
# Start from an empty DataFrame (the class name is case-sensitive) and attach
# the three columns with assign(). String values are quoted; numbers and the
# booleans True/False (capitalised first letter) are written without quotes.
dataframe = pd.DataFrame().assign(
    name=["JACK", "steven"],
    age=[38, 25],
    driver=[True, False],
)
dataframe
nameagedriver
0JACK38True
1steven25False
# Describe the new row as a Series; its index labels must match the column
# names of the frame one-to-one.
new = pd.Series(["MOLLY", 40, True], index=["name", "age", "driver"])
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# the row is wrapped in a one-row DataFrame and concatenated instead.
# ignore_index=True renumbers the result 0..n-1 so no index label is repeated.
# The result is only displayed; `dataframe` itself is left unchanged.
pd.concat([dataframe, pd.DataFrame([new])], ignore_index=True)
nameagedriver
0JACK38True
1steven25False
2MOLLY40True
#3.2
import pandas as pd
# Windows path to the Excel file. Backslashes are doubled because a single
# backslash starts an escape sequence inside a normal Python string literal.
url='C:\\Users\\杨胜国\\Desktop\\研究生文件\\数据挖掘作业\\cpi.xls'
# Load the workbook into a DataFrame and display it
dataframe1=pd.read_excel(url)
dataframe1
指标居民消费价格指数(上月=100)_当期
0地区全国
1频度
2单位-
32001-01100.9
42001-02100.1
.........
2342020-0499.14
2352020-0599.21
2362020-0699.93
2372020-07100.62
2382020-08100.4

239 rows × 2 columns

# Dimensions of the frame as a (rows, columns) tuple
dataframe1.shape
(239, 2)
dataframe1.describe()  # show descriptive statistics for the frame
指标居民消费价格指数(上月=100)_当期
count239239.0
unique239107.0
top2004-12100.1
freq114.0
# Descriptive statistics for the small hand-built frame; in the output below
# only the numeric `age` column is summarised.
dataframe.describe()
age
count2.000000
mean31.500000
std9.192388
min25.000000
25%28.250000
50%31.500000
75%34.750000
max38.000000
#3.3
dataframe.iloc[0]  # select a single row by its integer position
name      JACK
age         38
driver    True
Name: 0, dtype: object
# Display the frame (notebook cell output)
dataframe

# To look a row up by one of its values, first make that column the index
dataframe=dataframe.set_index(dataframe["age"])
# Label-based lookup: the row whose index value is 38
dataframe.loc[38]
name      JACK
age         38
driver    True
Name: 38, dtype: object
# Discard the age-based index (drop=True) and return to the default 0..n-1 index
dataframe=dataframe.reset_index(drop=True)
dataframe
nameagedriver
0JACK38True
1steven25False
#3.5
# Replace the value "JACK" with "rose"; the result is a new frame that is only
# displayed here, `dataframe` itself is not changed.
dataframe.replace("JACK","rose")
nameagedriver
0rose38True
1steven25False
#3.6
# Rename columns by passing an {old_name: new_name} dict; the result is a new
# frame that is only displayed here, `dataframe` itself is not changed.
dataframe.rename(columns={"name":"Name"})
Nameagedriver
0JACK38True
1steven25False
import collections
# Collect every column name as a key of a dict. This only builds the mapping
# skeleton; the new names still have to be filled in as values before the dict
# can be passed to rename(columns=...).
column_names=collections.defaultdict(str)
for Name in dataframe.columns:
    column_names[Name]  # reading a missing key makes defaultdict(str) insert it with ''
column_names
defaultdict(str, {'name': '', 'age': '', 'driver': ''})
#3.8
# Check the real file format before choosing a reader; this file is a CSV
url1='D:\\研究生数据集\\train_data.csv'
# Load the CSV into a DataFrame and display it
dataframe2=pd.read_csv(url1)
dataframe2
Unnamed: 0PassengerIdSurvivedSexAgeFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
001010.27500.0141510010.11000001
112100.47500.1391361000.11000100
223100.32500.0154690010.00001001
334100.43750.1036441000.11000001
445010.43750.0157130010.01000001
......................................................
787787788010.10000.0568480010.50010010
788788789110.01250.0401600010.30010001
789789790010.57500.1545881000.01000100
790790791010.35000.0151270010.01000010
791791792010.20000.0507490100.01000001

792 rows × 17 columns

# Descriptive statistics (count, mean, std, min, quartiles, max) per column
dataframe2.describe()
Unnamed: 0PassengerIdSurvivedSexAgeFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
count792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000792.000000
mean395.500000396.5000000.3863640.6477270.3682440.0646770.2436870.2083330.5479800.0886360.7449490.0050510.0404040.2095960.1856060.0921720.720960
std228.774999228.7749990.4872230.4779800.1629940.1009870.4295770.4063730.4980070.1544850.4361650.0709320.1970290.4072770.3890340.2894510.448811
min0.0000001.0000000.0000000.0000000.0083750.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%197.750000198.7500000.0000000.0000000.2750000.0154690.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
50%395.500000396.5000000.0000001.0000000.3500000.0283020.0000000.0000001.0000000.0000001.0000000.0000000.0000000.0000000.0000000.0000001.000000
75%593.250000594.2500001.0000001.0000000.4375000.0610450.0000000.0000001.0000000.1000001.0000000.0000000.0000000.0000000.0000000.0000001.000000
max791.000000792.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
# Array of the distinct values in one column. dataframe2["Sex"] selects a
# single column to operate on, whereas dataframe2.<method>() operates on the
# whole frame.
dataframe2["Sex"].unique()
array([1, 0], dtype=int64)
dataframe2["Sex"].value_counts()  # count how many times each distinct value occurs
1    513
0    279
Name: Sex, dtype: int64
#3.9
# Show the first rows whose Age is missing. The empty result below means Age
# has no missing values; rows with a missing Age would be listed otherwise.
dataframe2[dataframe2["Age"].isnull()].head(2)
Unnamed: 0PassengerIdSurvivedSexAgeFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
#3.10
# Drop the Age column (axis=1 means columns). drop() returns a new frame with
# the column removed; dataframe2 itself is not modified and still contains Age.
dataframe2.drop("Age",axis=1).head(2)
Unnamed: 0PassengerIdSurvivedSexFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
001010.0141510010.11000001
112100.1391361000.11000100
#3.11
# Delete rows by filtering: keep only the rows where Sex is not 1
# (i.e. the rows with Sex == 1 are removed from the result).
dataframe2[dataframe2["Sex"]!=1].head(2)
Unnamed: 0PassengerIdSurvivedSexAgeFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
112100.4750.1391361000.11000100
223100.3250.0154690010.00001001
#3.12
# Without a subset, drop_duplicates only removes rows that are identical in
# every column. The row count below is unchanged (792), so this data contains
# no such fully identical rows.
dataframe3=dataframe2.drop_duplicates()
dataframe3
Unnamed: 0PassengerIdSurvivedSexAgeFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
001010.27500.0141510010.11000001
112100.47500.1391361000.11000100
223100.32500.0154690010.00001001
334100.43750.1036441000.11000001
445010.43750.0157130010.01000001
......................................................
787787788010.10000.0568480010.50010010
788788789110.01250.0401600010.30010001
789789790010.57500.1545881000.01000100
790790791010.35000.0151270010.01000010
791791792010.20000.0507490100.01000001

792 rows × 17 columns

# Judge duplicates by the listed column(s) only: one row is kept per distinct
# Sex value, which is why the resulting shape is (2, 17).
dataframe4=dataframe2.drop_duplicates(subset=["Sex"])
dataframe4.shape
(2, 17)
#3.13
# Group the rows by the value of Sex and compute the mean of every other column per group
dataframe2.groupby("Sex").mean()
Unnamed: 0PassengerIdSurvivedAgeFarePclass_1Pclass_2Pclass_3Family_sizeTitle_1Title_2Title_3Title_4Emb_1Emb_2Emb_3
Sex
0378.014337379.0143370.7491040.3444440.0883300.2903230.2508960.4587810.1290320.3906810.0143370.0000000.5949820.2258060.1254480.645161
1405.009747406.0097470.1890840.3811870.0518130.2183240.1851850.5964910.0666670.9376220.0000000.0623780.0000000.1637430.0740740.762183
dataframe5=dataframe2.groupby("Sex")
# A GroupBy object only records how the rows are grouped. Displaying it shows
# just the object repr, not the groups; it has to be combined with a
# computation such as mean() to produce a result.
dataframe5
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002754E1C1F48>
#3.14
import numpy as np
# 100,000 timestamps spaced 30 seconds apart, starting on 2017-06-06
time_index = pd.date_range("06/06/2017", periods=100000, freq="30s")
# One random sales figure per timestamp. The class name DataFrame is
# case-sensitive. randint's upper bound is exclusive, so the values are 1..9.
dataframe = pd.DataFrame(
    {"Sale_Amount": np.random.randint(1, 10, 100000)},
    index=time_index,
)
# Bucket the rows by week and total the sales in each bucket
dataframe.resample("W").sum()
Sale_Amount
2017-06-1186203
2017-06-18100508
2017-06-25100897
2017-07-02100522
2017-07-09100964
2017-07-1610459
# Dimensions of the time-series frame as a (rows, columns) tuple
dataframe.shape
(100000, 1)
# Display the time-series frame (notebook cell output)
dataframe
Sale_Amount
2017-06-06 00:00:001
2017-06-06 00:00:305
2017-06-06 00:01:006
2017-06-06 00:01:302
2017-06-06 00:02:004
......
2017-07-10 17:17:308
2017-07-10 17:18:004
2017-07-10 17:18:305
2017-07-10 17:19:004
2017-07-10 17:19:305

100000 rows × 1 columns


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值