爬遍电商之京东篇:
目标是爬取指定商品的商品列表信息,包括商品名,价格,评论数,店铺名
打开京东页面,随便搜一个笔记本,F12打开Network开始抓包,翻个3页,遇到断点就按F8执行,然后看到第一个返回内容的ajax请求,是返回了第1页的后30个商品,下面开头名一样的依次返回第2页前30个,第2页后30个,第3页前30个,第3页后30个…别问是怎么知道的,对比一下就行了
看看第一个ajax是有哪些请求参数,通过跟下面几个对比发现,请求前30个商品和请求后30个商品,请求的参数有一点点不同,并且page、s、log_id都是会变化的
假设当前爬的是第N页
对于请求后30个商品的:
page 都是偶数依次是2、4、6、8…
s=(N*2-1)*30+1
log_id就是15位的时间戳/100000
对于请求前30个商品的:
page 都是奇数依次是1、3、5、7…
s=(N-1)*30+1
都是比较简单的找规律,齐活,开始撸代码
首先是获取前30个商品的代码,传入参数(查询的字段,当前爬取第几页),构造请求头和请求参数,后30个商品的代码类似
def get_first(shuru,i):
    """Fetch the first 30 products of JD search-result page *i* for keyword *shuru*.

    Odd ajax `page` values (1, 3, 5, ...) return the first half of a result
    page.  Prints the title text of every product found.
    """
    # Page i maps to odd ajax page number 2*i-1 for the "first 30" request.
    page=2*i-1
    # `s` is the 1-based offset of the first item on result page i.
    s = (i - 1) * 30 + 1
    # Request headers: a captured browser cookie plus a referer that mirrors
    # the real search-page URL for this page/offset.
    he = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__jdu=783078358; areaId=15; ipLoc-djd=15-1213-3038-0; shshshfpa=25d2efdd-812b-00bf-0465-bc601a32664e-1572142042; xtest=7667.cf6b6759; shshshfpb=y%2F%201kbkJW0rrCZxHU6os3WA%3D%3D; user-key=56e092e2-0b10-4615-9bc1-dd211435cb26; cn=0; qrsc=3; unpl=V2_ZzNtbUIDRhRzCBIEexhdUmIBFAhKUBNGJQ1DVikcVFY3CxVcclRCFX0URlVnGlQUZwcZXUJcRhxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRJcRV5CE3cPRVB7Gmw1ZAMiXUNnRRx3CUBdeR1VNVcEIm1yUUATcAtCZHopXTUlV05eRV5LFXFFQF15GFoMZQcbbUNnQA%3d%3d; __jdv=76161171|baidu-search|t_262767352_baidusearch|cpc|106807362512_0_1e4071ea100f437d96aba443c49ba960|1572333108335; __jda=122270672.783078358.1564988642.1572328729.1572333108.5; __jdc=122270672; __jdb=122270672.3.783078358|5.1572333108; shshshfp=f71d3f04ca730a97469ed0ded5889260; shshshsID=3394e8dd5a61826ebe3e3b39c51a7b35_2_1572333115068; rkv=V0000; 3AB9D23F7A4B3C9B=JDBODRIZ2EQH56E43CTTKRIEK74SLK6SDCZN2MDTPNMDAHNJBPOF7RIORAQB4F75VW3UNR635OBECG4L3P24AWIE2U',
        'referer': 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page='+str(page)+'&s='+str(s)+'&click=0',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'x-requested-with': 'XMLHttpRequest',
        # `ka` is a module-level fake_useragent.UserAgent; `.random` picks a UA string.
        'user-agent': ka.random
    }
    # Query-string parameters for the s_new.php ajax endpoint.
    data = {
        'keyword': shuru,
        'enc': 'utf-8',
        'qrst': 1,
        'rt': 1,
        'stop': 1,
        'vt': 2,
        'wq': shuru,
        'page': page,
        's': s,
        'click':0
    }
    url = 'https://search.jd.com/s_new.php?'
    res = requests.get(url + urlencode(data), headers=he,timeout=5)
    res.encoding='utf-8'
    # Parse the returned HTML fragment and pull out each product's title text.
    source = etree.HTML(res.text)
    title_list=source.xpath('//li[@class="gl-item"]')
    for title in title_list:
        tt=title.xpath('./div[@class="gl-i-wrap"]//div[@class="p-name p-name-type-2"]/a/em/text()')
        print(tt)
完整代码如下,2个方法就行,具体想获取的信息自行补充哦
# -*- coding: utf-8 -*-
import os
import re
import time
from urllib.parse import urlencode
import fake_useragent
import requests
from lxml import etree
# Method 2: load the user-agent list from a local file.
# BUG FIX: os.getcwd() does not end with a path separator, so the original
# `os.getcwd() + 'headers.csv'` produced e.g. '/home/userheaders.csv';
# os.path.join builds the path correctly on every platform.
location = os.path.join(os.getcwd(), 'headers.csv')
ka = fake_useragent.UserAgent(path=location, verify_ssl=False, use_cache_server=False)
def get_first(shuru, i):
    """Fetch and print the first 30 products of JD search-result page *i*.

    Odd ajax `page` values (1, 3, 5, ...) return the first half of a result
    page.  Prints each product's title fragments and also returns them so
    callers can use the data (callers that ignore the return are unaffected).

    :param shuru: search keyword
    :param i: 1-based result-page number
    :return: list of title-fragment lists, empty if the response could not be parsed
    :raises requests.HTTPError: on a non-2xx HTTP response
    """
    page = 2 * i - 1          # page i -> odd ajax page number for the "first 30" request
    s = (i - 1) * 30 + 1      # 1-based offset of the first item on page i
    # Captured browser cookie plus a referer mirroring the real search URL.
    he = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__jdu=783078358; areaId=15; ipLoc-djd=15-1213-3038-0; shshshfpa=25d2efdd-812b-00bf-0465-bc601a32664e-1572142042; xtest=7667.cf6b6759; shshshfpb=y%2F%201kbkJW0rrCZxHU6os3WA%3D%3D; user-key=56e092e2-0b10-4615-9bc1-dd211435cb26; cn=0; qrsc=3; unpl=V2_ZzNtbUIDRhRzCBIEexhdUmIBFAhKUBNGJQ1DVikcVFY3CxVcclRCFX0URlVnGlQUZwcZXUJcRhxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRJcRV5CE3cPRVB7Gmw1ZAMiXUNnRRx3CUBdeR1VNVcEIm1yUUATcAtCZHopXTUlV05eRV5LFXFFQF15GFoMZQcbbUNnQA%3d%3d; __jdv=76161171|baidu-search|t_262767352_baidusearch|cpc|106807362512_0_1e4071ea100f437d96aba443c49ba960|1572333108335; __jda=122270672.783078358.1564988642.1572328729.1572333108.5; __jdc=122270672; __jdb=122270672.3.783078358|5.1572333108; shshshfp=f71d3f04ca730a97469ed0ded5889260; shshshsID=3394e8dd5a61826ebe3e3b39c51a7b35_2_1572333115068; rkv=V0000; 3AB9D23F7A4B3C9B=JDBODRIZ2EQH56E43CTTKRIEK74SLK6SDCZN2MDTPNMDAHNJBPOF7RIORAQB4F75VW3UNR635OBECG4L3P24AWIE2U',
        'referer': 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page='+str(page)+'&s='+str(s)+'&click=0',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': ka.random  # ka is the module-level fake_useragent.UserAgent
    }
    data = {
        'keyword': shuru,
        'enc': 'utf-8',
        'qrst': 1,
        'rt': 1,
        'stop': 1,
        'vt': 2,
        'wq': shuru,
        'page': page,
        's': s,
        'click': 0
    }
    url = 'https://search.jd.com/s_new.php?'
    res = requests.get(url + urlencode(data), headers=he, timeout=5)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    res.raise_for_status()
    res.encoding = 'utf-8'
    source = etree.HTML(res.text)
    if source is None:  # guard against a parse failure on an empty/garbage body
        return []
    titles = []
    for item in source.xpath('//li[@class="gl-item"]'):
        tt = item.xpath('./div[@class="gl-i-wrap"]//div[@class="p-name p-name-type-2"]/a/em/text()')
        print(tt)
        titles.append(tt)
    return titles
def get_laterpage(shuru, i):
    """Fetch and print the last 30 products of JD search-result page *i*.

    Even ajax `page` values (2, 4, 6, ...) return the second half of a result
    page; this request additionally needs `scrolling`, `log_id` and `tpl`
    parameters.  Prints each product's title fragments and also returns them
    (callers that ignore the return are unaffected).

    :param shuru: search keyword
    :param i: 1-based result-page number
    :return: list of title-fragment lists, empty if the response could not be parsed
    :raises requests.HTTPError: on a non-2xx HTTP response
    """
    s = (i * 2 - 1) * 30 + 1  # 1-based offset of item 31 on page i
    page = i * 2              # page i -> even ajax page number for the "last 30" request
    # Captured browser cookie plus a referer mirroring the real search URL.
    he = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__jdu=783078358; areaId=15; ipLoc-djd=15-1213-3038-0; shshshfpa=25d2efdd-812b-00bf-0465-bc601a32664e-1572142042; xtest=7667.cf6b6759; shshshfpb=y%2F%201kbkJW0rrCZxHU6os3WA%3D%3D; user-key=56e092e2-0b10-4615-9bc1-dd211435cb26; cn=0; qrsc=3; unpl=V2_ZzNtbUIDRhRzCBIEexhdUmIBFAhKUBNGJQ1DVikcVFY3CxVcclRCFX0URlVnGlQUZwcZXUJcRhxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRJcRV5CE3cPRVB7Gmw1ZAMiXUNnRRx3CUBdeR1VNVcEIm1yUUATcAtCZHopXTUlV05eRV5LFXFFQF15GFoMZQcbbUNnQA%3d%3d; __jdv=76161171|baidu-search|t_262767352_baidusearch|cpc|106807362512_0_1e4071ea100f437d96aba443c49ba960|1572333108335; __jda=122270672.783078358.1564988642.1572328729.1572333108.5; __jdc=122270672; __jdb=122270672.3.783078358|5.1572333108; shshshfp=f71d3f04ca730a97469ed0ded5889260; shshshsID=3394e8dd5a61826ebe3e3b39c51a7b35_2_1572333115068; rkv=V0000; 3AB9D23F7A4B3C9B=JDBODRIZ2EQH56E43CTTKRIEK74SLK6SDCZN2MDTPNMDAHNJBPOF7RIORAQB4F75VW3UNR635OBECG4L3P24AWIE2U',
        'referer': 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&page='+str(page)+'&s='+str(s)+'&click=0',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': ka.random  # ka is the module-level fake_useragent.UserAgent
    }
    data = {
        'keyword': shuru,
        'enc': 'utf-8',
        'qrst': 1,
        'rt': 1,
        'stop': 1,
        'vt': 2,
        'wq': shuru,
        'page': page,
        's': s,
        'scrolling': 'y',
        # BUG FIX: int(time.time()*100000)/100000 stringified via urlencode can
        # drop trailing zeros (e.g. '1572333108.3'); the captured requests always
        # carry exactly five decimals, so format explicitly.
        'log_id': '%.5f' % time.time(),
        'tpl': '1_M'
    }
    url = 'https://search.jd.com/s_new.php?'
    res = requests.get(url + urlencode(data), headers=he, timeout=5)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    res.raise_for_status()
    res.encoding = 'utf-8'
    source = etree.HTML(res.text)
    if source is None:  # guard against a parse failure on an empty/garbage body
        return []
    titles = []
    for item in source.xpath('//li[@class="gl-item"]'):
        tt = item.xpath('./div[@class="gl-i-wrap"]//div[@class="p-name p-name-type-2"]/a/em/text()')
        print(tt)
        titles.append(tt)
    return titles
if __name__ == '__main__':
    shuru = '笔记本'  # search keyword ("laptop")
    # Crawl result pages 1-4: each page needs two ajax calls,
    # one for the first 30 items and one for the last 30.
    # (Removed the unused `s = 1` assignment.)
    for i in range(1, 5):
        get_first(shuru, i)
        print('---------------------------')
        time.sleep(2)  # throttle between requests to avoid being blocked
        get_laterpage(shuru, i)
        print('第%s页结束' % i)
        time.sleep(2)
我只获取了商品标题,最后是这样的