# 京东手机爬虫,仅供交流学习使用,不得用作商业用途。
# (JD.com phone crawler — for learning and exchange only; commercial use prohibited.)
# 如有违规侵权,请联系删除。
# (If this infringes any rights, please contact for removal.)
# 效果如下: (sample output shown in the original post)
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from urllib.parse import quote
from queue import Queue
import random
import time
data_queue = Queue()
f = open('jingdong.json', 'a')
class CrawlPool(object):
def __init__(self, session, ):
self.thread_pool = ThreadPoolExecutor(max_workers=5)
self.url = 'https://search.jd.com/Search?keyword={0}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={1}'
self.next_half_url = 'https://search.jd.com/s_new.php?keyword={0}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={1}&show_items='
self.session = session
def crawlpage(self, url, next_half_url):
try:
res = self.session.get(url)
html = etree.HTML(res.text.encode(res.encoding).decode('utf8'))
next_half_item = ','.join(html.xpath('//ul[@class="gl-warp clearfix"]/li/@data-sku'))
next_half_res = self.session.get(next_half_url + next_half_item)
next_html = etree.HTML(next_half_res.text.encode(next_half_res.encoding).decode('utf8'))
except Exception as e:
html = None
next_html = None
print(str(e))
return html, next_html
def crawl(self, keyword, page):
key = quote(keyword)
for i in range(1, page):
url = self.url.format(key, str(2 * i - 1))
next_half_url = self.next_half_url.format(key, str(2 * i))
future = self.thread_pool.submit(self.crawlpage, url, next_half_url)
data_queue.put(future.result())
class OutputPool(object):
def __init__(self):
self.thread_pool = ThreadPoolExecutor(max_workers=5)
self.data = {}
def save(self):
while True:
if data_queue.empty():
break
else:
crawl_result = data_queue.get(False)
self.thread_pool.submit(self.save_to_json, crawl_result)
time.sleep(0.5)
data_queue.task_done()
def save_to_json(self, crawl_result):
try:
html = crawl_result[0]
next_html = crawl_result[1]
root_element = html.xpath('//ul[@class="gl-warp clearfix"]/li')
for item in root_element:
try:
self.data['title'] = item.xpath('.//div[@class="p-name p-name-type-2"]//em')[0].xpath('string(.)')
price = item.xpath('.//div[@class="p-price"]/strong/i/text()')
if price:
self.data['price'] = price[0]
else:
price_ex = item.xpath('.//div[@class="p-price"]/strong/@data-price')
self.data['price'] = price_ex[0]
self.data['item_link'] = item.xpath('.//div[@class="p-img"]/a/@href')[0]
self.data['item_comment'] = item.xpath('.//div[@class="p-commit"]/strong/a/text()')[0]
shop_name = item.xpath('.//div[@class="p-shop"]//a/text()')
self.data['shop_name'] = (shop_name[0] if shop_name else '京东自营')
img = item.xpath('.//div[@class="p-img"]/a/img/@src')
if img:
self.data['img'] = img[0]
else:
img_ex = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img')
self.data['img'] = img_ex[0]
f.write(json.dumps(self.data, ensure_ascii=False) + '\n')
except Exception as e:
print('parse data error' + str(e))
root_element = next_html.xpath('//li[@class="gl-item"]')
for item in root_element:
try:
self.data['title'] = item.xpath('.//div[@class="p-name p-name-type-2"]//em')[0].xpath('string(.)')
price = item.xpath('.//div[@class="p-price"]//i/text()')
if price:
self.data['price'] = price[0]
else:
price_ex = item.xpath('.//div[@class="p-price"]/strong/@data-price')
self.data['price'] = price_ex[0]
self.data['item_link'] = item.xpath('.//div[@class="p-img"]/a/@href')[0]
self.data['item_comment'] = item.xpath('.//div[@class="p-commit"]/strong/a/text()')[0]
shop_name = item.xpath('.//div[@class="p-shop"]//a/text()')
self.data['shop_name'] = (shop_name[0] if shop_name else '京东自营')
img = item.xpath('.//div[@class="p-img"]/a/img/@src')
if img:
self.data['img'] = img[0]
else:
img_ex = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img')
self.data['img'] = img_ex[0]
f.write(json.dumps(self.data, ensure_ascii=False) + '\n')
except Exception as e:
print('parse next_half data error' + str(e))
except Exception as e:
print(str(e))
def main():
ua_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
]
user_agent = random.choice(ua_list)
session = requests.Session()
session.headers['User-Agent'] = user_agent
session.headers['Referer'] = 'https://search.jd.com/Search'
page = 101
keyword = '手机'
crawl_pool = CrawlPool(session)
output_pool = OutputPool()
crawl_pool.crawl(keyword, page)
output_pool.save()
if __name__ == '__main__':
main()