from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from bs4 import BeautifulSoup
import csv
import time
# Firefox preferences applied to the browser session started below.
fp = webdriver.FirefoxOptions()
fp.set_preference('permissions.default.stylesheet', 2)  # do not load CSS stylesheets
fp.set_preference('permissions.default.image', 2)  # do not load images

# Change this path to the location of geckodriver.exe on your machine.
s = Service(r'D:\geckodriver\geckodriver.exe')

# Browser session used by the rest of the script.
driver = webdriver.Firefox(service=s, options=fp)
def outputOneResult(soup, output_list, category):
    """Extract the ranked products from a parsed best-sellers page.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find``/``find_all`` API.
        output_list: List the extracted rows are appended to (mutated in
            place).
        category: Value stored unchanged in the third column of every row.

    Returns:
        ``output_list`` itself. Every appended row has the form
        ``[rank, title, category, link, star, comment, commentlink, price]``.
        A field that cannot be found is stored as ``""``; items whose title
        is empty are skipped. If the product grid is not present in ``soup``
        nothing is appended.
    """
    base_url = "https://www.amazon.cn"
    # Errors that mean "this tag or attribute is not there". Anything else
    # (notably KeyboardInterrupt) is allowed to propagate instead of being
    # silently turned into an empty field.
    missing = (AttributeError, IndexError, KeyError, TypeError)

    div = soup.find('div', class_='p13n-gridRow _p13n-zg-list-grid-desktop_style_grid-row__3Cywl')
    if div is None:
        # No product grid in this page: nothing to extract.
        return output_list

    for item in div.find_all("div", id='gridItemRoot'):
        # Title and product link both come from the item's second
        # 'a-link-normal' anchor, so look the anchors up only once.
        anchors = item.find_all("a", class_='a-link-normal')

        try:  # title
            title = anchors[1].text.strip()
        except missing:
            title = ""
        if title == "":
            continue  # items without a title are never written out

        try:  # rank, without the "#" marker
            rank = item.find("span", class_='zg-bdg-text').text.strip().replace("#", "")
        except missing:
            rank = ""

        try:  # product link
            link = base_url + anchors[1]["href"]
        except missing:
            link = ""

        try:  # star rating: text before the first ',' with the "颗星" unit removed
            star = item.find("span", class_='a-icon-alt').text.split(',')[0].strip()
            star = star.replace("颗星", "").strip()
        except missing:
            star = ""

        try:  # number of reviews
            comment = item.find("span", class_='a-size-small').text.strip()
        except missing:
            comment = ""

        try:  # link to the reviews
            commentlink = base_url + item.find("div", class_='a-icon-row').a["href"]
        except missing:
            commentlink = ""

        try:  # price
            price = item.find("span", class_='a-size-base a-color-price').text.strip()
        except missing:
            price = ""

        output_list.append([rank, title, category, link, star, comment, commentlink, price])

    return output_list
# Fetch each best-sellers page, extract its products and append them to the
# CSV file. The browser session is always closed afterwards, even when a page
# fails, so no Firefox/geckodriver process is left running.
try:
    for i in range(1, 2):  # page numbers to fetch; currently only page 1
        link = f"https://www.amazon.cn/gp/bestsellers/wireless/ref=zg_bs_pg_{i}?ie=UTF8&pg={i}"
        driver.get(link)

        # Scroll to the bottom of the page five times, pausing before each
        # scroll (presumably so that items loaded on scroll can appear).
        # NOTE: the time.sleep() calls are what give the page time to load;
        # WebDriver's implicit wait only applies to element lookups, which
        # this script does not perform.
        for _ in range(5):
            time.sleep(2)
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

        soup = BeautifulSoup(driver.page_source, "lxml")
        output_list = outputOneResult(soup, [], "总体")

        # Append mode: rows from earlier runs/pages are kept.
        with open(r'C:\Users\86159\Desktop\book_list.csv', 'a+', newline='', encoding='utf-8') as csvfile:
            spamwriter = csv.writer(csvfile, dialect='excel')
            spamwriter.writerows(output_list)

        time.sleep(2)  # short pause before requesting the next page
finally:
    driver.quit()
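# This script requires geckodriver to be downloaded beforehand; it is available
# from the mozilla/geckodriver Releases page on GitHub
# (https://github.com/mozilla/geckodriver/releases).
# Run environment: PyCharm.
# Keep striving!!!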