采用 python + selenium + browsermobproxy 抓包提取拼多多订单数据:通过 selenium 模拟登录拼多多后,到订单页提取订单数据。browsermob-proxy 是一个代理工具,能抓取网页所有的访问细节。browsermob-proxy 需要先下载(下载地址),使用前还需要配置 java 环境,并在 python 中安装 browsermobproxy:
pip install browsermob-proxy
打开拼多多订单页,查看 network,可以发现存储订单数据的 json 接口。
注意,这个接口只返回从第十一个订单开始的数据,页面每下滑十个订单刷新(请求)一次;前十个订单数据在源代码的 window.rawData 中,在控制台输入 window.rawData 也可以看到。那么我们要做的就是用 selenium 登录并跳转到订单页面,从源代码中获取前十个订单数据,再从 order_list_v3 中获取剩余订单数据。
启动browsermob-proxy
from browsermobproxy import Server

# Path to the browsermob-proxy launcher script.
server = Server(r'D:\browsermob-proxy-2.1.4-bin\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
server.start()  # start the proxy server
proxy = server.create_proxy()  # create the proxy the browser will be routed through
启动selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_driver = r'C:\Users\dell\AppData\Local\Google\Chrome\Application\chromedriver.exe'

chrome_options = Options()
# Works around the "your connection is not private" certificate page.
chrome_options.add_argument('--ignore-certificate-errors')
# Route all browser traffic through the browsermob proxy so it can be captured.
chrome_options.add_argument('--proxy-server={0}'.format(proxy.proxy))
driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)

# Injected into every new document before page scripts run. The getter must
# return `undefined`: the previous `Chrome` is not a defined JS identifier and
# would throw a ReferenceError whenever a page reads navigator.webdriver.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.implicitly_wait(10)
driver.maximize_window()
driver.get("http://yangkeduo.com/")
selenium 登录拼多多并跳转到全部订单页
# Clicking the orders entry redirects to the login page.
driver.find_element_by_xpath('//*[@id="main"]/div/div[2]/div[4]/div/div[1]/div[5]').click()
driver.find_element_by_class_name('phone-login').click()
driver.find_element_by_id('user-mobile').send_keys('手机号')
driver.find_element_by_id('code-button').click()
# The verification code is typed in by hand on the console.
code = input('清输入验证码')
driver.find_element_by_id('input-code').send_keys(code)
driver.find_element_by_id('submit-button').click()  # log in
启动监听,获取源代码数据
import json

# Start recording traffic; captureContent keeps the response bodies in the HAR.
proxy.new_har('order', options={'captureContent': True})

# window.rawData holds the first ten orders. Its JSON layout differs from the
# order_list_v3 responses, so it is stored and parsed separately.
# `data` is a JSON *string* (JSON.stringify), so json.dump writes it as a JSON
# string literal and a reader has to decode twice (json.load, then json.loads).
data = driver.execute_script('return JSON.stringify(window.rawData)')

# 'w' rather than 'a': the file must contain exactly one JSON document,
# otherwise json.load fails on it after a second run.
with open('top_ten.json', 'w') as f:
    json.dump(data, f)
循环滚动到底
from selenium.common.exceptions import NoSuchElementException
def isElementPresent():
    """Report whether an element with class ``loading-text`` is on the page.

    The scroll loop uses this as its stop condition: per the original note,
    the element is non-empty once the bottom of the list has been reached.

    Returns:
        True if the element is found, False if the lookup raises
        NoSuchElementException.
    """
    try:
        driver.find_element_by_xpath('//*[@class = "loading-text"]')
    except NoSuchElementException:
        return False
    return True
import time

# Keep scrolling until the end-of-list marker is found; each scroll to the
# bottom is followed by a short pause before checking again.
while not isElementPresent():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(2)
从代理抓到的包中提取订单数据并保存
# HAR captured by the proxy since proxy.new_har(...) was called.
result = proxy.har

# Append one response body per line so the file can be read as JSON Lines.
with open('data.json', 'a') as f:
    for entry in result['log']['entries']:
        url = entry['request']['url']
        # Only the order-list data interface is of interest.
        if "order_list_v3" not in url:
            continue
        content = entry['response']['content'].get('text')
        if content is None:
            # No body was captured for this entry; nothing to save.
            continue
        f.write(content + '\n')
整体思路就是这样的,最后就是处理数据提取输出Excel表,欢迎指出错误。
# Orders in any of these states are left out of the export.
_SKIPPED_STATUSES = ('交易已取消', '未发货,退款成功', '已发货,退款成功')

# Key names used by window.rawData (camelCase) ...
_RAW_DATA_KEYS = {
    'order_sn': 'orderSn',
    'order_amount': 'orderAmount',
    'goods': 'orderGoods',
    'goods_name': 'goodsName',
    'goods_price': 'goodsPrice',
    'goods_number': 'goodsNumber',
    'order_status_prompt': 'orderStatusPrompt',
}

# ... and by the order_list_v3 responses (snake_case).
_ORDER_LIST_KEYS = {
    'order_sn': 'order_sn',
    'order_amount': 'order_amount',
    'goods': 'order_goods',
    'goods_name': 'goods_name',
    'goods_price': 'goods_price',
    'goods_number': 'goods_number',
    'order_status_prompt': 'order_status_prompt',
}


def _order_detail(order, keys):
    """Build one export row from a raw order dict, reading fields via *keys*."""
    # Only the first goods entry of an order is exported.
    first_goods = order[keys['goods']][0]
    return {
        'order_sn': order[keys['order_sn']],
        'order_amount': order[keys['order_amount']],
        'goods_name': first_goods[keys['goods_name']],
        'goods_price': first_goods[keys['goods_price']],
        'goods_number': first_goods[keys['goods_number']],
        'order_status_prompt': order[keys['order_status_prompt']],
    }


def get_data():
    """Collect order rows from the two files saved for the current phone number.

    Reads ``<phone_number>top_ten.json`` (the window.rawData dump, stored as a
    JSON-encoded string and therefore decoded twice) and
    ``<phone_number>data.json`` (one order_list_v3 response per line).
    Orders whose status is in ``_SKIPPED_STATUSES`` are dropped.

    Returns:
        A list of dicts with the keys order_sn, order_amount, goods_name,
        goods_price, goods_number and order_status_prompt.
    """
    detail_list = []

    with open(phone_number.get() + 'top_ten.json', 'r+', encoding='utf-8') as f1:
        data1 = json.load(f1)
        d = json.loads(data1)
        for order in d['ordersStore']['orders']:
            detail = _order_detail(order, _RAW_DATA_KEYS)
            if detail['order_status_prompt'] not in _SKIPPED_STATUSES:
                detail_list.append(detail)

    with open(phone_number.get() + 'data.json', 'r+') as f:
        try:
            for item in jsonlines.Reader(f):
                for order in item['orders']:
                    detail = _order_detail(order, _ORDER_LIST_KEYS)
                    if detail['order_status_prompt'] not in _SKIPPED_STATUSES:
                        detail_list.append(detail)
        except Exception:
            # Best effort, as before: the first unreadable line or response
            # without the expected keys ends the scan. `Exception` instead of
            # a bare except so Ctrl-C / SystemExit are no longer swallowed.
            print('没有更多订单了')

    return detail_list
def export_excel(export):
    """Write order rows to ``<phone_number>.xlsx`` with Chinese column headers.

    Args:
        export: iterable of dicts keyed by order_sn, order_amount, goods_name,
            goods_price, goods_number and order_status_prompt.
    """
    # Fixed column order. Passing it to the constructor (instead of indexing
    # afterwards) also gives an empty input the right columns rather than
    # raising KeyError.
    order = ['order_sn', 'order_amount', 'goods_name', 'goods_price', 'goods_number', 'order_status_prompt']
    pf = pd.DataFrame(list(export), columns=order)

    # Replace the column names with their Chinese labels.
    columns_map = {
        'order_sn': '订单编号',
        'order_amount': '订单数额',
        'goods_name': '商品名称',
        'goods_price': '商品价格',
        'goods_number': '商品数量',
        'order_status_prompt': '订单状态',
    }
    pf = pf.rename(columns=columns_map).fillna(' ')

    # The context manager saves and closes the workbook. ExcelWriter.save()
    # and to_excel(encoding=...) no longer exist in pandas 2.x.
    with pd.ExcelWriter(phone_number.get() + '.xlsx') as writer:
        pf.to_excel(writer, index=False)
欢迎指正错误。