python爬取京东商品信息_Python简单爬取京东商品列表

最新推荐文章于 2021-02-04 06:20:05 发布

weixin_39801158

最新推荐文章于 2021-02-04 06:20:05 发布

阅读量553

点赞数

文章标签： python爬取京东商品信息

#!/usr/bin/python3

# -*- coding: UTF-8 -*-

import urllib.request

import urllib.error

import re

import csv

import time

# Running row number written to the first CSV column; parse_content
# increments it once for every item it writes.
global_row = 0

'''

需求：

爬取京东商品数据，以‘java’关键字为例。要求使用最基础的urllib和re库。

需要保存书名，价格，评论数，出版社等信息。

实现：

找出页面规律如下

一、每页显示60个商品，但分为两部分。

1. 每页前30个商品，通过search.jd.com/Search?keyword=java接口获取

每页后30个商品，通过search.jd.com/s_new.php?keyword=java接口获取，这个接口是个XHR请求，通过Chrome的开发者选项可以看出。模拟人向下滚动页面

2. 每个接口有两个关键的参数'page'和's'

参数'page'好理解，就是页面数。对第一个接口变化规律为1,3,5...，对第二个接口变化规律为2,4,6...；

参数's'我猜应该是start，也就是起始商品的索引。实际测试时，这个参数变化不是很规律，这里强制设置每页30个，

这样对第一个接口变化规律1,61,121...，对第二个接口变化规律为31,91,151...

3. 对于获取不到商店名称的情况，再构造chat1.jd.com/api/checkChat请求，获取商店名称。(但还是存在获取不到的情况，页面也不能显示)

'''

def crawl_page(n, csv_writer):
    """Fetch and parse both halves of search-result page ``n`` for 'java'.

    Two requests are made per page: one to ``/Search`` and one to
    ``/s_new.php``. Each response is handed to ``parse_content`` right after
    it is downloaded.

    Args:
        n: 1-based page number as seen by a user.
        csv_writer: writer object forwarded unchanged to ``parse_content``.
    """
    # The ``s`` parameter is forced to a fixed 30-items-per-request layout:
    # 1, 61, 121, ... for the top half and 31, 91, 151, ... for the bottom.
    first_item = 1 + 60 * (n - 1)

    # Top half of the page: ``page`` takes the odd values 1, 3, 5, ...
    top_url = 'http://search.jd.com/Search?keyword=java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page=%d&s=%d&click=0' % (2 * n - 1, first_item)
    content = get_page_content(top_url, True)
    parse_content(content, n, csv_writer)

    # Bottom half of the page: ``page`` takes the even values 2, 4, 6, ...
    # and the current timestamp is sent as ``log_id``.
    current_time = '%.5f' % time.time()
    url = 'http://search.jd.com/s_new.php?keyword=java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page=%d&s=%d&scrolling=y&log_id=%s' % (2 * n, first_item + 30, current_time)
    content = get_page_content(url, False)
    parse_content(content, n, csv_writer)

def get_page_content(url, is_top):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: address to request.
        is_top: True for the first-half request. When False, ``Referer`` and
            ``X-Requested-With`` headers are added as well.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on request failure.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36')
    if not is_top:
        # NOTE(review): the original indentation was lost; both headers are
        # assumed to belong to the XHR (bottom-half) request only.
        req.add_header('Referer', 'http://search.jd.com/Search?keyword=java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&click=0')
        req.add_header('X-Requested-With', 'XMLHttpRequest')

    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf8')

def get_seller(shop_url):
    """Request ``shop_url`` and return the value of its ``"seller"`` field.

    The response is searched for ``"seller":"..."``. The captured value may
    contain ``\\uXXXX`` escapes, which are decoded to real characters.

    Args:
        shop_url: address of the seller-lookup request.

    Returns:
        The seller name, or ``''`` when the response has no ``"seller"`` field.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on request failure.
    """
    import json  # local import: json is not among the file's top-level imports

    req = urllib.request.Request(shop_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36')
    req.add_header('Referer', 'http://search.jd.com/Search?keyword=java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&click=0')

    with urllib.request.urlopen(req) as response:
        content = response.read().decode('utf8')

    seller = re.search(r'"seller":"(.*?)"', content)
    if not seller:
        return ''

    raw = seller.group(1)
    try:
        # Decode as a JSON string: handles \uXXXX escapes (including
        # surrogate pairs) and also names that already contain literal
        # non-Latin-1 characters, which a latin-1 round trip cannot encode.
        return json.loads('"%s"' % raw)
    except ValueError:
        # Not a valid JSON string body (e.g. a dangling backslash); return
        # the captured text as-is rather than failing the lookup.
        return raw

# NOTE(review): the HTML-tag portions of the original patterns were lost when
# this file was extracted (only fragments such as ``]*>\s* ]*>]*>(.*?).*?``
# survive). The patterns below are reconstructed from those fragments and
# from the assumed markup of a JD search-result item (``li.gl-item`` holding
# ``div.p-name``, ``div.p-price``, ``div.p-commit`` and ``div.p-shop`` /
# ``div.p-shopnum``). TODO: confirm each one against a live response.
_ITEM_RE = re.compile(r'<li[^>]*class="gl-item"[^>]*>.*?</li>', re.DOTALL)
_SKU_RE = re.compile(r'<li[^>]*data-sku="(\d+)"')
_NAME_RE = re.compile(r'<div class="p-name[^"]*">.*?<em>(.*?)</em>.*?</div>', re.DOTALL)
_PRICE_RE = re.compile(r'<div class="p-price">.*?<i>(.*?)</i>.*?</div>', re.DOTALL)
_COMMIT_RE = re.compile(r'<div class="p-commit">.*?<a[^>]*>(.*?)</a>.*?</div>', re.DOTALL)
_SHOP_RE = re.compile(r'<div class="p-shop(?:num)?"[^>]*>\s*<span[^>]*><a[^>]*>(.*?)</a>.*?</div>', re.DOTALL)
# Matches any tag, not just one-character ones, so highlight markup inside a
# title is removed completely.
_TAG_RE = re.compile(r'<[^>]+>')


def _extract_field(pattern, li, n, label):
    """Return group 1 of ``pattern`` in ``li``, or '' (with a notice) if absent."""
    match = pattern.search(li)
    if match:
        return match.group(1)
    print('page %d, %s is empty' % (n, label))
    return ''


def _lookup_shop(li):
    """Ask the seller-lookup endpoint for the shop of item ``li``; '' on failure."""
    sku_match = _SKU_RE.match(li)
    if not sku_match:
        # No item id to query with; previously this raised AttributeError.
        return ''
    seller_url = 'https://chat1.jd.com/api/checkChat?pid=' + sku_match.group(1) + '&returnCharset=utf-8'
    try:
        return get_seller(seller_url)
    except urllib.error.URLError:
        # The shop name is optional; a failed lookup must not abort the crawl.
        return ''


def parse_content(content, n, csv_writer):
    """Extract every item from ``content`` and write one CSV row per item.

    Each row is ``[row number, name, price, comment count, shop]``. A field
    that cannot be found is written as ``''`` and reported on stdout. When
    the shop is not present in the markup it is looked up via ``get_seller``.

    Args:
        content: HTML text of one search-result response.
        n: page number, used only in the "is empty" notices.
        csv_writer: object whose ``writerow`` receives each row.
    """
    global global_row

    for li in _ITEM_RE.findall(content):
        # Book name, with any markup inside the title removed.
        name = _TAG_RE.sub('', _extract_field(_NAME_RE, li, n, 'name'))

        # Price.
        price = _extract_field(_PRICE_RE, li, n, 'price')

        # Comment count (the notice used to say "price" by mistake).
        commit = _extract_field(_COMMIT_RE, li, n, 'commit')

        # Publisher (shop): take it from the item markup, else look it up.
        shop_match = _SHOP_RE.search(li)
        if shop_match:
            shop = shop_match.group(1)
        else:
            shop = _lookup_shop(li)
        if not shop:
            print('page %d, shop is empty' % n)

        global_row = global_row + 1
        csv_writer.writerow([str(global_row), name, price, commit, shop])

def main():
    """Crawl pages 1 through 5 and write the results to ``output.csv``.

    The file is written in GBK with a header row followed by the rows
    produced by ``crawl_page``.
    """
    # newline='' is what the csv module requires of its file object; without
    # it extra blank rows appear on platforms that translate '\n'.
    # errors='replace' keeps a single character outside GBK from aborting
    # the whole crawl with UnicodeEncodeError.
    with open('output.csv', 'w', encoding='gbk', errors='replace', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['序号', '书名', '价格', '评论数', '出版社'])
        for i in range(1, 6):
            crawl_page(i, csv_writer)


if __name__ == '__main__':
    main()

weixin_39801158

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬取京东商品信息_Python简单爬取京东商品列表

#!/usr/bin/python3# -*- coding: UTF-8 -*-import urllib.requestimport urllib.errorimport reimport csvimport timeglobal_row = 0'''需求：爬取京东商品数据，以‘java’关键字为例。要求使用最基础的urllib和re库。需要保存书名，价格，评论数，出版社等信息。实现：找出页面...
复制链接

扫一扫