Python运用Selenium+BeautifulSoup爬取亚马逊电子通讯商品列表 2022/4/3

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from bs4 import BeautifulSoup
import csv
import time

s = Service(r'D:\geckodriver\geckodriver.exe')
# Change the path above to wherever geckodriver.exe lives on your machine.
fp = webdriver.FirefoxOptions()
fp.set_preference('permissions.default.image', 2)        # do not load images (speeds up scraping)
fp.set_preference('permissions.default.stylesheet', 2)   # do not load CSS stylesheets
driver = webdriver.Firefox(options=fp, service=s)


def outputOneResult(soup, output_list, category):
    """Parse one Amazon best-seller page and append one row per product.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page source of a best-seller list page.
    output_list : list
        Accumulator; rows are appended to it in place.
    category : str
        Category label stored with every row.

    Returns
    -------
    list
        The same ``output_list``, extended with
        ``[rank, title, category, link, star, comment, commentlink, price]``
        for every product whose title could be extracted.
    """
    grid = soup.find('div', class_='p13n-gridRow _p13n-zg-list-grid-desktop_style_grid-row__3Cywl')
    if grid is None:
        # Page layout changed or the page did not load; original code would
        # crash here with AttributeError on grid.find_all.
        return output_list
    # Each except clause names only the exceptions a missing element actually
    # raises (find() -> None gives AttributeError; a short find_all() gives
    # IndexError; a missing attribute gives KeyError). A bare `except:` would
    # also swallow KeyboardInterrupt and real bugs.
    for item in grid.find_all("div", id='gridItemRoot'):
        try:  # rank badge, e.g. "#3" -> "3"
            rank = item.find("span", class_='zg-bdg-text').text.strip().replace("#", "")
        except AttributeError:
            rank = ""
        # Title and product link live on the same anchor; look it up once.
        anchors = item.find_all("a", class_='a-link-normal')
        try:
            title = anchors[1].text.strip()
        except IndexError:
            title = ""
        try:
            link = "https://www.amazon.cn" + anchors[1]["href"]
        except (IndexError, KeyError):
            link = ""
        try:  # star rating, e.g. "4.5 颗星" -> "4.5"
            star = item.find("span", class_='a-icon-alt').text.split(',')[0].strip()
            star = star.replace("颗星", "").strip()
        except AttributeError:
            star = ""
        try:  # review count
            comment = item.find("span", class_='a-size-small').text.strip()
        except AttributeError:
            comment = ""
        try:  # link to the review section (`.a` may be None -> TypeError on [])
            commentlink = "https://www.amazon.cn" + item.find("div", class_='a-icon-row').a["href"]
        except (AttributeError, TypeError, KeyError):
            commentlink = ""
        try:  # price text exactly as shown on the page
            price = item.find("span", class_='a-size-base a-color-price').text.strip()
        except AttributeError:
            price = ""
        if title != "":
            output_list.append([rank, title, category, link, star, comment, commentlink, price])
    return output_list


# Crawl the best-seller pages, collect all rows, then write them out once.
# BUG FIX: output_list used to be re-created inside the page loop while the
# CSV write happened after the loop, so with more than one page every page
# except the last was silently discarded. It is now initialised once, before
# the loop. The browser is also closed in a finally block (it used to leak).
output_list = []
try:
    for page in range(1, 2):
        url = ("https://www.amazon.cn/gp/bestsellers/wireless/ref=zg_bs_pg_"
               + str(page) + "?ie=UTF8&pg=" + str(page))
        driver.get(url)
        driver.implicitly_wait(30)
        # Scroll to the bottom several times so lazily-loaded items render.
        for _ in range(5):
            time.sleep(2)
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            driver.implicitly_wait(10)
        soup = BeautifulSoup(driver.page_source, "lxml")
        output_list = outputOneResult(soup, output_list, "总体")
finally:
    driver.quit()  # always release the browser, even if the scrape fails

with open(r'C:\Users\86159\Desktop\book_list.csv', 'a+', newline='', encoding='utf-8') as csvfile:
    csv.writer(csvfile, dialect='excel').writerows(output_list)

该代码需要预先下载geckodriver,下载地址为 https://github.com/mozilla/geckodriver/releases

运行环境为pycharm

奋斗!!!

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值