Scraping Huawei P20 Review Data from JD.com

Use Selenium with chromedriver to scrape review data for the Huawei P20 from JD.com.
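
Before running the spider it is worth confirming that Selenium, lxml, and chromedriver are wired up correctly. A minimal check along these lines (just a sketch: it assumes chromedriver is on your PATH, opens the JD home page headlessly, and prints its title, nothing more):

from selenium import webdriver
from lxml import etree  # imported only to confirm lxml is installed

opt = webdriver.ChromeOptions()
opt.add_argument('--headless')          # run Chrome without a visible window
driver = webdriver.Chrome(options=opt)  # assumes chromedriver is on the PATH
try:
    driver.get('https://www.jd.com')
    print(driver.title)                 # the JD home page title, if everything is set up
finally:
    driver.quit()

With that working, the full spider is below.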

import csv
import time

from lxml import etree
from selenium import webdriver


class JDSpider:
    def __init__(self):
        self.base_url = 'https://www.jd.com'
        self.opt = webdriver.ChromeOptions()
        self.opt.set_headless()  # run Chrome headless (Selenium 3 API; see the note after the script)
        self.driver = webdriver.Chrome(options=self.opt)  # driver for the search result / list pages
        self.driver_comment = webdriver.Chrome(options=self.opt)  # second driver for product detail and comment pages
        self.key = '华为p20'  # search keyword typed into JD's search box

    # Go to the next page of the search result list
    def click_button(self):
        if self.driver.page_source.find('pn-next disabled') == -1:  # stop once the next-page button is disabled
            self.driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[9]/em').click()
            time.sleep(1)
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(1)
            res = self.driver.page_source
            self.parse_base_html(res)
        else:
            self.driver.quit()

    # Go to the next page of comments
    def comment_next_click(self):
        if self.driver_comment.page_source.find('ui-pager-next') != -1:
            # This click sometimes fails here, possibly because of a slow network on my end
            self.driver_comment.find_element_by_class_name('ui-pager-next').click()
            time.sleep(1)
            res = self.driver_comment.page_source  # read from the comment driver, not the list driver
            self.parse_child_html(res)
        else:
            self.driver_comment.quit()

    # Parse the product list page
    def parse_base_html(self, res):
        par_res = etree.HTML(res)
        base_list = par_res.xpath('//*[@id="J_goodsList"]/ul/li/div')
        for base in base_list:
            goods_url = base.xpath('./div[@class="p-img"]/a/@href')[0]
            url = goods_url if "https:" in goods_url else 'https:' + goods_url  # some hrefs are protocol-relative
            self.driver_comment.get(url)
            time.sleep(1)
            self.driver_comment.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # open the product review tab
            self.driver_comment.execute_script("window.scrollTo(0,document.body.scrollHeight)")  # scroll down so the comments load
            time.sleep(2)
            goods_res = self.driver_comment.page_source
            self.parse_child_html(goods_res)
            print('Start parsing product comments')
        time.sleep(1)
        self.click_button()

    # Parse the comments on a product detail page
    def parse_child_html(self, goods_res):
        parse_goods_res = etree.HTML(goods_res)
        goods_base_list = parse_goods_res.xpath('//*[@id="comment-0"]/div[@class="comment-item"]')
        for info in goods_base_list:
            username = info.xpath('./div[1]/div[1]/img/@alt')[0]  # reviewer nickname from the avatar's alt text
            star = info.xpath('./div[2]/div[1]/@class')[0][-5:]  # rating from the last 5 chars of the class attribute, e.g. 'star5'
            comment = info.xpath('./div[2]/p/text()')[0]  # review text
            print("Parsed successfully")
            self.to_csv(username, star, comment)
        time.sleep(1)
        self.comment_next_click()

    def to_csv(self, username, star, comment):
        # append one row per comment; utf-8 so the Chinese review text is written correctly
        with open('aaa.csv', 'a', newline="", encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow((username, star, comment))
            print("Written to CSV")

    def work_on(self):
        self.driver.get(self.base_url)
        self.driver.find_element_by_id("key").send_keys(self.key)  # type the keyword into JD's search box
        self.driver.find_element_by_class_name("button").click()  # click the search button
        time.sleep(2)
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        res = self.driver.page_source
        self.parse_base_html(res)


if __name__ == '__main__':
    spider = JDSpider()
    spider.work_on()
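
Note: the script targets the Selenium 3 API; the find_element_by_* helpers and ChromeOptions.set_headless() it uses were removed in the Selenium 4 series. With a current Selenium release the same locators go through find_element(By..., ...) instead. A minimal sketch of the equivalent calls, shown only as an illustration (locators copied from the script above):

from selenium import webdriver
from selenium.webdriver.common.by import By

opt = webdriver.ChromeOptions()
opt.add_argument('--headless')                          # replaces opt.set_headless()
driver = webdriver.Chrome(options=opt)

driver.get('https://www.jd.com')
driver.find_element(By.ID, 'key').send_keys('华为p20')   # was find_element_by_id('key')
driver.find_element(By.CLASS_NAME, 'button').click()     # was find_element_by_class_name('button')
# XPath-based clicks change the same way, e.g.:
# driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]/em').click()
driver.quit()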