【项目01】 数据加载及存储
要求:
1、成功读取“store_data.csv”文件
2、解析数据,存成列表字典格式:[{'var1':value1,'var2':value2,'var3':value3,…},…,{}]
3、数据清洗:
① comment,price两个字段清洗成数字
② 清除字段缺失的数据
③ commentlist拆分成三个字段,并且清洗成数字
4、结果存为.pkl文件
import pandas as pd
import numpy as np
import re
# Load the raw store records from the CSV file.
data = pd.read_csv('store_data.csv')

# Convert the table into a list of dicts, one dict per row keyed by column
# name.  to_dict(orient='records') builds the whole list in a single call
# and keeps each column's own value type, whereas pulling rows out one at a
# time with iloc is slow and coerces every row to one common dtype.
# NOTE: this list is a snapshot of the values exactly as loaded from the CSV.
datalist = data.to_dict(orient='records')
# 数据清洗, 将文本中的数字提取出来
import re
def str_to_num(strs):
    """Extract the digits embedded in a value and return them as one integer.

    Args:
        strs: Any value; it is converted with ``str()`` before processing.

    Returns:
        The integer formed by concatenating every digit character found in
        the text, or ``np.nan`` when the text contains no digit at all.

    Note:
        Every non-digit character is discarded, including signs and decimal
        points, so ``"12.5"`` yields ``125``.
    """
    # Raw string: "\D" inside a plain literal is an invalid escape sequence.
    digits = re.sub(r"\D", "", str(strs))
    if not digits:
        return np.nan
    return int(digits)
# Clean both numeric text columns with the same helper so that `price` ends
# up as a number (int, or NaN when the text has no digits) instead of being
# left as a string of digits.
data['price'] = data['price'].apply(str_to_num)
data['comment'] = data['comment'].apply(str_to_num)

# Drop every row that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)

# Split `commentlist` into three numeric score columns.
# Assumes each value looks like "<label><score> <label><score> <label><score>"
# with the fields separated by whitespace -- TODO confirm against the CSV.
_SCORE_RE = re.compile(r'\d+(?:\.\d+)?')


def _score_at(text, position):
    """Return the number found in the whitespace-separated field `position`.

    Args:
        text: The raw `commentlist` value; converted with ``str()``.
        position: Zero-based index of the field to read.

    Returns:
        The first integer or decimal number in that field, as a float.

    Raises:
        ValueError: If the field does not exist or contains no number.
    """
    # split() with no argument tolerates runs of several spaces between
    # fields, and the regex reads the whole number regardless of its width.
    fields = str(text).split()
    found = _SCORE_RE.search(fields[position]) if position < len(fields) else None
    if found is None:
        raise ValueError(
            'no score at field %d of commentlist value %r' % (position, text)
        )
    return float(found.group())


# Field order follows the original column mapping: 0 -> flavor,
# 1 -> environment, 2 -> service.  All three use the same parsing rule.
for _position, _column in enumerate(('flavor', 'environment', 'service')):
    data[_column] = data['commentlist'].apply(_score_at, args=(_position,))
# Save the cleaned DataFrame with the pickle module.  The context manager
# closes the file even if pickle.dump raises.
import pickle

with open('store_data1.pkl', 'wb') as pklfile:
    pickle.dump(data, pklfile)

# Alternative one-step form using pandas; this writes a second file.
data.to_pickle('store_data.pkl')