工欲善其事，必先利其器 —— selenium 模块（5）

最新推荐文章于 2024-04-26 23:58:12 发布

世界的隐喻

最新推荐文章于 2024-04-26 23:58:12 发布

阅读量128

点赞数 2

分类专栏： python爬虫文章标签： selenium xpath python

本文链接：https://blog.csdn.net/ShiJieDeYinYu/article/details/118312353

版权

python爬虫专栏收录该内容

35 篇文章 0 订阅

订阅专栏

selenium 综合应用（1）

需求

模拟浏览器，从京东首页开始输入关键词，爬取两页商品的名称、价格和一页评价（只包括文字，不包括视频）。评价包括用户的名称和星数。

网页分析

首先定位搜索框的位置
前面说过商品页的商品是动态加载。不然我们只能得到 30 个商品的数据
定位商品的链接

在这里插入图片描述
4. 商品详情页的需要爬取的数据的定位

在这里插入图片描述

5. 翻页

在这里插入图片描述

代码

from selenium import webdriver
from time import sleep
from lxml import etree
import re

key = input("请输入商品名称：")
page_index = int(input("请输入要爬取的页数："))

bor = webdriver.Chrome("chromedriver.exe")
bor.get("https://www.jd.com/")

# 定位搜索框
input_key = bor.find_element_by_id("key")
input_key.send_keys(key)
sleep(2)

# 定位按钮
btn = bor.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
bor.execute_script("arguments[0].click()", btn)
sleep(2)

index = 0
while index < page_index:
    # 滚轮到底，破解动态加载
    bor.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    page = bor.page_source
    tree = etree.HTML(page)
    # 商品 li 标签的列表
    good_lists = tree.xpath('//*[@id="J_goodsList"]/ul/li')
    # print(len(good_lists))
    for i in range(1, len(good_lists)+1):
        # 商品的链接
        chain = bor.find_element_by_xpath('//*[@id="J_goodsList"]/ul/li[' + str(i) + ']//a[@target="_blank"]')
        # 点击，来到商品详情页
        bor.execute_script("arguments[0].click()", chain)
        sleep(2)

        # 跳转到新打开的窗口
        handles = bor.window_handles
        bor.switch_to.window(handles[-1])

        # 点击商品评价
        good_btn = bor.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]')
        bor.execute_script("arguments[0].click()", good_btn)
        # 滚轮到底，破解动态加载
        bor.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(2)

        good_tree = etree.HTML(bor.page_source)

        good_name = good_tree.xpath('//div[@class="sku-name"]//text()')[0]
        good_name = re.sub(r"\s", "", good_name)
        good_price = good_tree.xpath('//div[@class="dd"]/span/span[2]//text()')[0] + " 元"
        # print(good_price, "\n", good_name)
        # print("正在爬取", good_name + "……")

        with open("商品.txt", "a", encoding = "utf-8") as fp:
            fp.write("商品名称：" + good_name + "\n")
            fp.write("商品价格：" + good_price + "\n")

        user_name_list = good_tree.xpath('//*[@id="comment-0"]/div/div[1]/div[1]/img/@alt')
        star_list = good_tree.xpath('//*[@id="comment-0"]/div/div[2]/div[1]/@class')
        conten_listt = good_tree.xpath('//*[@id="comment-0"]/div/div[2]/p//text()')

        for i in range(len(user_name_list)):
            user_name = user_name_list[i]
            star = star_list[i]
            content =conten_listt[i]
            with open("商品.txt", "a", encoding = "utf-8") as fp:
                fp.write("用户评价：" + "\n")
                fp.write("\t" + "用户名：" + user_name + "\n")
                fp.write("\t" + "星数：" + star + "\n")
                fp.write("\t" + content + "\n" + "\n")

        # print(good_name, "爬取成功！！！")
        sleep(2)
        # 关闭当前窗口，并跳转到一开始的窗口
        bor.close()
        bor.switch_to.window(handles[0])
        break

    index += 1
    # 输入页数并翻页
    input_text = bor.find_element_by_xpath('//*[@id="J_bottomPage"]/span[2]/input')
    key = index + 1
    input_text.clear()
    input_text.send_keys(str(key))
    sleep(5)
    btn = bor.find_element_by_xpath('//*[@id="J_bottomPage"]/span[2]/a')
    bor.execute_script("arguments[0].click()", btn)
    sleep(2)

bor.quit()