# 京东手机爬虫,仅供交流学习使用,不得用作商业用途。
# (JD.com phone crawler — for learning and exchange only; commercial use prohibited.)
# 如有违规侵权,请联系删除。
# (If this infringes any rights, please contact for removal.)
# 效果如下: (sample output shown in the original post)
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from urllib.parse import quote
from queue import Queue
import random
import time
data_queue = Queue()
f = open('jingdong.json', 'a')
class CrawlPool(object):
def __init__(self, session, ):
self.thread_pool = ThreadPoolExecutor(max_workers=5)
self.url = 'https://search.jd.com/Search?keyword={0}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={1}'
self.next_half_url = 'https://search.jd.com/s_new.php?keyword={0}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={1}&show_items='
self.session = session
def crawlpage(self, url, next_half_url):
try:
res = self.session.get(url)
html = etree.HTML(res.text.encode(res.encoding).decode('utf8'))
next_half_item = ','.join(html.xpath('//ul[@class="gl-warp clearfix"]/li/@data-sku'))
next_half_res = self.session.get(next_half_url + next_half_item)
next_html = etree.HTML(next_half_res.text.encode(next_half_res.encoding).decode('utf8'))
except Exception as e:
html = None
next_html = None
print(str(e))
return html, next_html
def crawl(self, keyword, page):
key = quote(keyword)
for i in range(1, page):
url = self.url.format(key, str(2 * i - 1))
next_half_url = self.next_half_url.format(key, str(2 * i))
future = self.thread_pool.submit(self.crawlpage, url, next_half_url)
data_queue.put(future.result())
class OutputPool(object):
def __init__(self):
self.thread_pool = ThreadPoolExecutor(max_workers=5)
self.data = {}
def save(self):
while True:
if data_queue.empty():
break
else:
crawl_result = data_queue.get(False)
self.thread_pool.submit(self.save_to_json, crawl_result)
time.sleep(0.5)
data_queue.task_done()
def save_to_json(self, crawl_result):
try:
html = crawl_result[0]
next_html = crawl_result[1]
root_element = html.xpath('//ul[@class="gl-warp clearfix"]/li')
for item in root_element:
try:
self.data['title'] = item.xpath('.//div[@class="p-name p-name-type-2"]//em')[0].xpath('string(.)')
price = item.xpath('.//div[@class="p-price"]/strong/i/text()')
if price:
self.data['price'] = price[0]
else:
price_ex = item.xpath('.//div[@class="p-price"]/strong/@data-price')
self.data['price'] = price_ex[0]
self.data['item_link'] = item.xpath('.//div[@class="p-img"]/a/@href')[0]
self.data['item_comment'] = item.xpath('.//div[@class="p-commit"]/strong/a/text()')[0]
shop_name = item.xpath('.//div[@class="p-shop"]//a/text()')
self.data['shop_name'] = (shop_name[0] if shop_name else '京东自营')
img = item.xpath('.//div[@class="p-img"]/a/img/@src')
if img:
self.data['img'] = img[0]
else:
img_ex = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img')
self.data['img'] = img_ex[0]
f.write(json.dumps(self.data, ensure_ascii=False) + '\n')
except Exception as e:
print('parse data error' + str(e))
root_element = next_html.xpath('//li[@class="gl-item"]')
for item in root_element:
try:
self.data['title'] = item.xpath('.//div[@class="p-name p-name-type-2"]//em')[0].xpath('string(.)')
price = item.xpath('.//div[@class="p-price"]//i/text()')
if price:
self.data['price'] = price[0]
else:
price_ex = item.xpath('.//div[@class="p-price"]/strong/@data-price')
self.data['price'] = price_ex[0]
self.data['item_link'] = item.xpath('.//div[@class="p-img"]/a/@href')[0]
self.data['item_comment'] = item.xpath('.//div[@class="p-commit"]/strong/a/text()')[0]
shop_name = item.xpath('.//div[@class="p-shop"]//a/text()')
self.data['shop_name'] = (shop_name[0] if shop_name else '京东自营')
img = item.xpath('.//div[@class="p-img"]/a/img/@src')
if img:
self.data['img'] = img[0]
else:
img_ex = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img')
self.data['img'] = img_ex[0]
f.write(json.dumps(self.data, ensure_ascii=False) + '\n')
except Exception as e:
print('parse next_half data error' + str(e))
except Exception as e:
print(str(e))
def main():
ua_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
]
user_agent = random.choice(ua_list)
session = requests.Session()
session.headers['User-Agent'] = user_agent
session.headers['Referer'] = 'https://search.jd.com/Search'
page = 101
keyword = '手机'
crawl_pool = CrawlPool(session)
output_pool = OutputPool()
crawl_pool.crawl(keyword, page)
output_pool.save()
if __name__ == '__main__':
main()