前言
最近在文档看到以前帮别人写的淘宝购物记录爬取和分析,也想简单分析一下自己的,由于本人经常用拼夕夕,所以就爬一下自己的购物记录并简单分析一下
一、购物记录爬取
import csv
import json
import re
import time

import requests
# HTTP headers sent with every order-list request.
# "AccessToken" and "Cookie" are intentionally blank here and must be filled
# in with the values of a logged-in session before the crawler is run.
header = {
    "AccessToken": "",
    "Content-Type": "application/json;charset=UTF-8",
    "Cookie": "",
    "Host": "omsproduction.yangkeduo.com",
    "Origin": "https://omsproduction.yangkeduo.com",
    "Referer": "https://omsproduction.yangkeduo.com/orders.html?type=0&comment_tab=1&combine_orders=1&main_orders=1&refer_page_name=personal&refer_page_id=10001_1681214651841_ptc6qkb8vb&refer_page_sn=10001",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36",
}
def get_api(offset):
    """Request one page (10 orders) of the order list.

    Args:
        offset: Offset token sent as the ``offset`` field of the request
            body; it selects where the returned page starts.

    Returns:
        The ``requests.Response`` of the POST request, not yet parsed.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    url = 'https://omsproduction.yangkeduo.com/proxy/api/api/aristotle/order_list_v4?pdduid=7955997481811'
    # NOTE: the original request body also had `page_from` and `anti_content`
    # entries, both commented out; they are omitted here.
    data = {
        "type": "all",
        "page": 1,
        "origin_host_name": "omsproduction.yangkeduo.com",
        "size": 10,
        "offset": offset,
    }
    # A timeout keeps a stalled connection from hanging the crawl forever.
    resp = requests.post(url, json=data, headers=header, timeout=10)
    # Pause after every request so successive pages are at least 1 s apart.
    time.sleep(1)
    return resp
def deel_tetx(resp):
    """Parse one order-list response, save the completed orders, return the next offset.

    For every order the first entry of ``order_goods`` supplies the name and
    price; only orders whose ``order_status_prompt`` equals '交易成功' are kept.
    The kept rows are handed to ``get_save``.

    Args:
        resp: Response object whose ``text`` attribute holds the JSON body.

    Returns:
        The ``offset`` value of the second-to-last order on the page.

    Raises:
        KeyError: If an expected field is missing from the JSON.
        IndexError: If an order has no goods or the page holds fewer than
            two orders.
    """
    orders = json.loads(resp.text)['orders']
    rows = []
    for order in orders:
        first_item = order['order_goods'][0]
        name = first_item['goods_name']
        # presumably the API reports the price in cents -- TODO confirm
        amount = first_item['goods_price'] / 100
        placed_at = order['order_time']
        status = order['order_status_prompt']
        if status != '交易成功':
            continue
        rows.append([name, amount, placed_at])
    get_save(rows)
    return orders[-2]['offset']
def get_save(datas):
    """Append rows to the purchase-record CSV, adding a header row if the file is empty.

    Args:
        datas: Iterable of ``[good, price, date]`` rows to append.
    """
    with open('购物记录.csv', 'a', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is new or empty and still lacks the
        # column names (good / price / date) that the analysis code reads.
        if csv_file.tell() == 0:
            writer.writerow(['good', 'price', 'date'])
        writer.writerows(datas)
def main(offset='MO-01-230410-202674083412899'):
    """Crawl the order list page by page until a page can no longer be processed.

    Args:
        offset: Offset token of the first page to request; defaults to the
            value that used to be hard-coded.
    """
    while True:
        try:
            resp = get_api(offset)
            print(f'{offset}完成')
            offset = deel_tetx(resp)
        except Exception as exc:
            # Any failure still ends the crawl, but the reason is reported
            # instead of being swallowed, and Ctrl-C is no longer caught.
            print(f'爬取结束:{exc!r}')
            break


if __name__ == '__main__':
    main()
cookie和token的话,登录一下网页版本抓包获取一下就行。具体的分析过程大家看代码就行。offset 在我看来就是每一个日期的购物记录,每一次只能请求10条,然后获取倒数第二条购物记录的offset作为下一次请求的offset。大家可以自行研究,爬取没有太大难度。
二、数据分析
- 数据读取
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sn
#%%
# Load the scraped purchase records; the cells below read its good, price and date columns.
df=pd.read_csv('购物记录.csv')
1.数据清洗
- 由于返回日期是时间戳,所以我们将时间戳转化为日期格式
import time
def to_date(t):
    """Convert a Unix timestamp into a pandas Timestamp for that local calendar day.

    The time-of-day part is dropped: the timestamp is formatted as
    ``YYYY-MM-DD`` in the machine's local time zone and parsed back.
    """
    day_text = time.strftime('%Y-%m-%d', time.localtime(t))
    return pd.to_datetime(day_text)
# Replace every raw timestamp in the date column with the value to_date returns for it.
df['date']=df.date.apply(to_date)
# Bare expression -- presumably there to display the frame in a notebook cell.
df
2.去重
- 处理重复的数据:因为购物习惯问题,一部分数据重复了,这里按商品名称和价格去重,相同的记录只保留第一条
# Drop rows that repeat an earlier (good, price) pair, keeping the first occurrence; modifies df in place.
df.drop_duplicates(subset=['good','price'],keep='first',inplace=True)
# Bare expression -- presumably there to display the frame in a notebook cell.
df
3.数据情况
# Summary statistics of df (count, mean, std, quartiles, ...).
df.describe()
想不到还有0.01元的,不愧是我。75%的订单都在40元以下;平均值虽然有90多元,但std比较大,是受到了极值的影响
4.可视化展示
- 购物区间饼图
# `matplotlib` itself has to be imported here: up to this point only
# `matplotlib.pyplot as plt` is in scope, so `matplotlib.rcParams` would
# raise NameError.
import matplotlib

# Use a font that contains Chinese glyphs for the labels and title, and keep
# the minus sign renderable with that font.
matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']
matplotlib.rcParams['axes.unicode_minus'] = False
# Bin edges for the price ranges; pd.cut builds the intervals between them.
expence = [0, 30, 100, 500, 1000, 5000, 10000]
# Number of purchases that fall into each price range.
expence_count = pd.cut(df['price'], expence).value_counts()
expence_count
plt.pie(expence_count, labels=expence_count.keys())
plt.legend()
plt.title('购物区间')
plt.show()
根据消费习惯,分为这几个区间,计算购物次数
大额商品比较少,最主要的还是小玩意和普通生活用品;结合下面的词云图来看,食品买得比较多一点
- 词云图
分词:
import jieba
from collections import Counter
from wordcloud import WordCloud
import numpy as np
import matplotlib
# Use a font that contains Chinese glyphs, and keep the minus sign renderable.
matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']
matplotlib.rcParams['axes.unicode_minus'] = False
# Strip everything that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 the pattern is otherwise taken
# as a literal string and nothing would be removed.
df['good'] = df['good'].str.replace(r'[^\w\s]+', '', regex=True)
# Tokenise every product name with jieba; rows without a name are skipped
# because a missing value has no .strip().
words_list = [jieba.lcut(i.strip()) for i in df['good'].dropna().tolist()]
# Flatten into one token list, leaving out whitespace-only tokens that jieba
# emits for the spaces inside a name.
words = [w for tokens in words_list for w in tokens if w.strip()]
# Keep the 100 most frequent tokens as a {token: frequency} mapping.
count = dict(Counter(words).most_common(n=100))
剔除掉里面的特殊字符,再用jieba进行分词,利用collections模块进行计数
然后可视化展示
# Build the word cloud from the {token: frequency} mapping in `count` and show it without axes.
word=WordCloud(font_path='simhei.ttf')# font file used to draw the words
word.generate_from_frequencies(count)
plt.imshow(word,interpolation='bilinear')
plt.axis('off')
plt.title('购物消费类别词云图')
plt.show()
“米饭”“学生”“男”出现的次数比较多,看来我还是一个爱干饭的学生
- 柱状折线图
# Group the purchases by calendar month (YYYY-MM) and sum the amount spent.
monthly_expense = df.groupby(df['date'].dt.strftime('%Y-%m'))['price'].sum()
monthly_expense = pd.DataFrame(monthly_expense)
# Truncate the totals to whole numbers so the bar labels stay short.
monthly_expense['price'] = monthly_expense['price'].astype('int')
plt.figure(figsize=(10, 6))
plt.xticks(fontsize=6)
# Same data twice: a dashed red line on top of the bars.
plt.plot(monthly_expense.index, monthly_expense.price, 'r--')
plt.bar(monthly_expense.index, monthly_expense.price)
# Write each month's total above its bar. Iterating (label, value) pairs
# avoids indexing the string-indexed Series with an integer position.
for month, amount in monthly_expense['price'].items():
    plt.text(
        month,
        amount,
        '{}'.format(amount),
        ha='center',
        va='bottom',
    )
plt.show()
4月份没有任何消费,这一年的购物消费额度都比较低。整体上不怎么购物,偶尔会有一些大额消费
总结
由于返回的数据只有价格、日期、名称这几个字段,这里没有做过多的分析,大家可以自行决定要分析哪些内容。
有时间再分享一下淘宝的购物记录可视化分析
希望大家多多支持,后续分享更多有趣的东西