# 京东商品抓取 — JD.com product scraper

import csv
import random
import re
import time
import logging
from datetime import datetime
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


class Spider(object):
    """Scrape JD.com search results for a keyword and append them to a CSV file.

    NOTE(review): relies on the module-level globals ``ky`` (search keyword),
    ``start_page`` and ``end_page`` being assigned in the ``__main__`` block
    before the instance methods run — confirm before reusing as a library.
    """

    def __init__(self):
        # Order matters: the Chrome options must exist before the browser
        # is created, and the wait wraps the live browser handle.
        self.chromeOptions = self.get_profile()
        self.browser = self.get_browser()
        self.wait = self.get_wait()
        self.log = self.get_log()

    # Logging setup: one dated file handler plus console output.
    def get_log(self):
        """Return an INFO-level logger writing to both a log file and stderr.

        The log file name keeps the original pattern
        ``./-<ky>-<year>-<month>-<day>.log`` (including the leading dash).
        """
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.INFO)
        # Record layout: timestamp - logger name - level - message.
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        today = datetime.now()
        log_file_path = "./-{}-{}-{}-{}.log".format(
            ky, today.year, today.month, today.day)
        # Guard against duplicate handlers (and thus duplicated log lines)
        # if get_log() is ever called more than once for this logger name.
        if not logger.handlers:
            sh = logging.StreamHandler()
            sh.setFormatter(formatter)  # console output format
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)
            logger.addHandler(sh)
        return logger

    def main(self):
        """Entry point: write the CSV header, crawl all pages, close Chrome."""
        self.csv_head()
        try:
            self.parse_website(ky, start_page, end_page)
        finally:
            self.browser.quit()  # always release the browser, even on error

    def get_profile(self):
        """Build ChromeOptions: random user agent, no images, fixed window."""
        ua = [  # Opera
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
            "Opera/8.0 (Windows NT 5.1; U; en)",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",  # Firefox
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10 Gecko / 20100922Ubuntu / 10.10(maverick)Firefox / 3.6.10",
            # Safari
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
            # chrome
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
            # 360 browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",  # Taobao browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            # Liebao (Cheetah) browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            # QQ browser
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",  # Sogou browser
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            # Maxthon browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            # UC browser
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        ]

        # Chrome settings
        chromeOptions = webdriver.ChromeOptions()
        # chromeOptions.add_argument('--headless')       # headless mode
        chromeOptions.add_argument('--disable-gpu')  # disable GPU rendering
        chromeOptions.add_argument('window-size=1280,800')  # fixed resolution
        chromeOptions.add_argument("--no-sandbox")
        # Block image loading to speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chromeOptions.add_experimental_option("prefs", prefs)
        chromeOptions.add_argument('user-agent=' + random.choice(ua))
        return chromeOptions

    def get_browser(self):
        """Launch Chrome with the prepared options.

        Uses the ``options=`` keyword: ``chrome_options=`` was deprecated in
        Selenium 3 and removed in Selenium 4.
        """
        return webdriver.Chrome(options=self.chromeOptions)

    def get_wait(self):
        """Return a 3-second explicit wait bound to the browser."""
        return WebDriverWait(self.browser, 3)

    def parse_website(self, ky, start_page, end_page):
        """Search JD for *ky* and scrape result pages start_page..end_page
        (inclusive), saving one CSV row per product."""
        self.browser.get("https://www.jd.com")  # open the JD home page
        # Wait for the search box, type the keyword, then press ENTER on
        # the search button to submit the query.
        search_box = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#key")))
        search_box.send_keys(ky)
        search_btn = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.button')))
        search_btn.send_keys(Keys.ENTER)

        # Page through the results via the "go to page N" box at the bottom.
        for page in range(start_page, end_page + 1):
            time.sleep(1)
            page_input = self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")))
            page_input.clear()  # clear whatever page number is shown
            page_input.send_keys(page)
            confirm_btn = self.wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
            confirm_btn.send_keys(Keys.ENTER)  # jump to the requested page

            time.sleep(3)
            self.browser.refresh()
            # Scroll to the bottom so lazily-loaded items get fetched.
            self.browser.execute_script(
                "window.scrollTo(0,document.body.scrollHeight)")

            # A full result page holds 60 items; wait (best effort) for the
            # last one. A short page simply times out — that is fine.
            try:
                self.wait.until(EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))
            except Exception:  # TimeoutException when fewer than 60 loaded
                pass
            html = self.browser.page_source  # snapshot of the rendered page

            time.sleep(1)
            print('正在解析第%d页数据' % page)

            parsed = etree.HTML(html)
            for count, item in enumerate(parsed.xpath('//*[@class="gl-item"]'), 1):
                print('正在解析第%d个资源' % count)
                self.save(self._parse_item(item))

    def _parse_item(self, item):
        """Extract one product <li> element into the CSV row layout.

        Returns ``[title, sku, price, verify, comment_num, shop_name,
        shop_url, img_url, detail_url]``; fields missing from the markup
        become None.
        """
        def first(xpath_expr):
            # First xpath match, or None when the node is absent.
            nodes = item.xpath(xpath_expr)
            return nodes[0] if nodes else None

        sku = first('./@data-sku')

        img_url = first('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/img/@data-lazy-img')
        if img_url is not None:
            img_url = f'https:{img_url}'

        detail_url = first('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href')
        # Relative links come back protocol-less ("//item.jd.com/...").
        if detail_url is not None and not detail_url.startswith('http'):
            detail_url = f'https:{detail_url}'

        price = first('.//div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')

        # ''.join over an xpath result never raises, so title is always a
        # str (possibly empty) — no try/except needed here.
        title = ''.join(item.xpath(
            './/div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em//text()'))
        # True when the listing is tagged "京东超市" (JD supermarket).
        verify = title.startswith('京东超市')

        comment_num = first('.//div[@class="gl-i-wrap"]/div[@class="p-commit"]/strong/a/text()')

        shop_url = first('.//div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/@href')
        if shop_url is not None:
            shop_url = 'https:' + shop_url

        shop_name = first('.//div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/text()')

        return [title, sku, price, verify, comment_num, shop_name,
                shop_url, img_url, detail_url]

    # CSV header row
    def csv_head(self):
        """Append the Chinese column-header row to ``<ky>.csv``."""
        head = ['标题','商品ID','价格','是否为京东超市','评论数','店铺名称','店铺url','商品图片url','商品url']
        # newline='' stops csv from inserting blank lines between rows.
        with open(fr'{ky}.csv', 'a+', newline='', encoding='utf-8-sig') as csv_file:
            csv.writer(csv_file).writerow(head)

    # Persist one row to the local CSV
    def save(self, info):
        """Append one product row (see _parse_item) to ``<ky>.csv``."""
        with open(fr'{ky}.csv', 'a+', newline='', encoding='utf-8-sig') as csv_file:
            csv.writer(csv_file).writerow(info)

if __name__ == "__main__":
    # Interactive setup: keyword plus inclusive page range. These module
    # globals are read by Spider's methods.
    ky = input('请输入爬取商品信息:')
    start = time.time()
    start_page = int(input('请输入要爬取的起始页(1-80):'))
    end_page = int(input('请输入要爬取的结束页(1-80):'))
    testspider = Spider()
    testspider.main()
    end = time.time()
    # BUG FIX: the original referenced `self.log` at module level, where
    # `self` is undefined (NameError) — use the spider instance's logger.
    testspider.log.info(end - start)  # elapsed crawl time in seconds
    testspider.log.info('商品爬取完毕,谢谢使用!')
# (removed: CSDN blog-page widget text — likes/favorites/red-packet payment
#  prompts — that was accidentally pasted below the code and is not part of
#  this program)