多线程京东抓包爬取
考虑到上次利用Splash动态渲染爬取京东商品信息的效率有限,此次改为对京东网站进行抓包逆向分析,利用requests模拟浏览器请求获取商品信息,并加入多线程,爬取效率得到了大幅提高。
爬取到的商品数据包含:商品名,商品ID,作者,价格(折后价、折前价和电子版价格(若有)),书籍排名,评论统计(评论总数、好评数、差评数、好评率、默认好评数),评论内容。
数据以JSON形式保存到非关系型数据库Elasticsearch中。
代码实现:
import requests
from bs4 import BeautifulSoup
import time
import re
from scrapy.selector import Selector
import json
from e_commerce.module.es_mapping import ProductInfoType
from threading import Thread
from queue import Queue
# Module-level state shared by the crawler code below.

# One HTTP session object, created once at import time; parse_Product issues
# its search requests through it (sessions.get(...)).
sessions = requests.session() # create a session
# JD search endpoint; parse_Product requests it with keyword/page query params.
page_url = "https://search.jd.com/Search"
# The two comment endpoints are not used in the visible part of this file.
# NOTE(review): names suggest "comment summary counts" and "paged comment
# content" respectively — confirm against the code that requests them.
comment_count_url = 'https://sclub.jd.com/comment/productCommentSummaries.action'
comment_url = 'https://sclub.jd.com/comment/productPageComments.action'
# Wall-clock timestamp (seconds since the epoch) captured when the module loads.
# NOTE(review): presumably used to report total crawl time — confirm.
start_time = time.time()
# Boolean flags, initially False.
# NOTE(review): presumably flipped to True to tell the parse/save worker
# threads to stop — verify where they are read and written.
exit_ParseThread = False
exit_SaveThread = False
def parse_Product(page_queue , info_queue):
#解析列表商品信息
#获取第一页商品数据,可遍历page获取更多页面商品数据
while True:
if page_queue.empty():
break
page = page_queue.get() #1,3,5,...
params_1 = {
"keyword": "python" ,
"enc" : "utf-8" ,
"qrst" : "1",
"rt" : '1' ,
"stop" : '1' ,
"vt" : '2',
"wq" : 'python',
"page" : page ,
# "s" : '1' ,
# "click" : '0'
}
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
#请求京东前30个商品信息
pro_res = sessions.get(page_url , headers= headers , params= params_1)
# print("parse front 30 product info")
parseIdAndPrice(pro_res.content , info_queue)
params_2 = params_1.copy()
params_2.update({
"page" : page + 1 ,
# "s" : '31' ,
# "click" : '',
'scrolling': 'y',
'log_id': str(int(time.time()*100000)/100000) ,
'tpl': '2_M' ,
})
headers = {
'referer': 'https://s