作业要求:
1、成功读取“商铺数据.csv”文件
2、解析数据,存成列表字典格式:[{'var1':value1,'var2':value2,'var3':value3,…},…,{}]
3、数据清洗:
① comment,price两个字段清洗成数字
② 清除字段缺失的数据
③ commentlist拆分成三个字段,并且清洗成数字
数据源格式如下:
import re

import numpy as np
import pandas as pd
# 1. Load the shop data from the CSV file.
df = pd.read_csv(
    'D:/Python数据分析与挖掘实战/python-data-analysis-master/'
    '练习1:商铺数据清洗/商铺数据.csv'
)
# Show the row whose index label is 1 (with the default RangeIndex this is
# the second record of the file, not the first).
print(df.loc[1])
out:
classify 美食
name 泰国街边料理
comment 74 条点评
star 准四星商户
price 人均 ¥48
address 黄兴路合生汇B2美食集市内
commentlist 口味7.4 环境7.6 ...
Name: 1, dtype: object
# Summarize the DataFrame: index range, columns, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1265 entries, 0 to 1264
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 classify 1265 non-null object
1 name 1265 non-null object
2 comment 1265 non-null object
3 star 1265 non-null object
4 price 1265 non-null object
5 address 1265 non-null object
6 commentlist 1235 non-null object
dtypes: object(7)
memory usage: 69.3+ KB
#2 清洗数据,
# ① comment,price两个字段清洗成数字,② 清除字段缺失的数据,
# ③ commentlist拆分成三个字段,并且清洗成数字
def comment_co(s):
    """Extract the review count from a raw ``comment`` cell.

    Args:
        s: Raw cell value, e.g. ``'74 条点评'``.

    Returns:
        The number in front of ``'条'`` as an ``int``. ``np.nan`` is returned
        when the cell is not a string, does not contain ``'条'``, or the text
        in front of it is not an integer, so that such rows can later be
        dropped or filled as missing values.
    """
    # Non-string cells (e.g. float NaN from an empty CSV field) cannot be
    # searched with ``in``; treat them as missing instead of raising.
    if not isinstance(s, str) or '条' not in s:
        # A real NaN (not a placeholder string) keeps the column numeric.
        return np.nan
    try:
        # Split on the marker itself so both '74 条点评' and '74条点评' work.
        return int(s.split('条')[0].strip())
    except ValueError:
        return np.nan
# Clean the comment column: apply() calls comment_co once per cell, so the
# function can be passed directly without a wrapping lambda.
df['comment'] = df['comment'].apply(comment_co)
print(df['comment'])
out:
0 NaN
1 74.0
2 265.0
3 2748.0
4 5.0
...
1260 1.0
1261 4.0
1262 1.0
1263 1.0
1264 1.0
Name: comment, Length: 1265, dtype: float64
def price_co(s):
    """Extract the per-person price from a raw ``price`` cell.

    Args:
        s: Raw cell value, e.g. ``'人均 ¥48'``.

    Returns:
        The amount after the last ``'¥'`` as an ``int``. ``np.nan`` is
        returned when the cell is not a string, contains no ``'¥'``, or the
        text after it is not an integer.
    """
    # Non-string cells (e.g. float NaN) cannot be searched with ``in``.
    if not isinstance(s, str) or '¥' not in s:
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    try:
        # int() tolerates surrounding whitespace in the extracted text.
        return int(s.split('¥')[-1])
    except ValueError:
        return np.nan
# Clean the price column by running price_co on every cell.
df['price'] = df['price'].apply(price_co)
print(df['price'])
out:
0 125.0
1 48.0
2 21.0
3 142.0
4 NaN
...
1260 NaN
1261 NaN
1262 NaN
1263 NaN
1264 NaN
Name: price, Length: 1265, dtype: float64
# Split the commentlist field into three numeric score columns.
# Missing or incomplete commentlist values are handled explicitly instead of
# through a catch-all try/except: every score that cannot be read becomes NaN.
def commentlist_co(arrList):
    """Add numeric ``taste``, ``eviorment`` and ``service`` scores to a row.

    Args:
        arrList: One row (a mapping such as a ``pandas.Series``) holding a
            ``'commentlist'`` entry like ``'口味7.4 环境7.6 服务7.4'``.

    Returns:
        The same row object, with the three score keys always set. Each
        value is the ``float`` found at the end of the corresponding
        whitespace-separated part, or ``np.nan`` when the part is missing or
        does not end in a number.
    """
    raw = arrList['commentlist']
    # Empty CSV cells arrive as float NaN, so only real strings are split.
    # split() without an argument tolerates any run of whitespace.
    parts = raw.split() if isinstance(raw, str) else []
    # NOTE: 'eviorment' is misspelled, but it is the column name the rest of
    # the script sees, so it is kept unchanged.
    for position, column in enumerate(('taste', 'eviorment', 'service')):
        match = None
        if position < len(parts):
            # Take the trailing number of the part regardless of its width
            # (a fixed three-character slice would mis-read e.g. '10.0').
            match = re.search(r'\d+(?:\.\d+)?$', parts[position])
        arrList[column] = float(match.group()) if match else np.nan
    return arrList
# Run commentlist_co on every row; axis='columns' hands each row to the
# function as a Series.
df = df.apply(commentlist_co, axis='columns')
print(df.columns)
out:
Index(['address', 'classify', 'comment', 'commentlist', 'eviorment', 'name',
'price', 'service', 'star', 'taste'],
dtype='object')
# Remove the raw commentlist column from the DataFrame.
df = df.drop('commentlist', axis='columns')