import time
import random
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
# 设置为开发者模式,防止被各大网站识别出来使用了Selenium console中输入window.navigator.webdriver 测试
def get_browser():
browser = webdriver.Chrome()
# 2.使用js关闭检测机制
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
# 3.返回浏览器对象
return browser
def page_turning(page_num):
print(f'正在跳转至第{page_num}页')
try:
# 使用JavaScript滚动到页面底部
#browser.execute_script("arguments[0].scrollIntoView(false);",item)
#browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(8)
# 找到下一页的按钮
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pageContent > div.LeftLay--leftWrap--xBQipVc > div.LeftLay--leftContent--AMmPNfB > div.Pagination--pgWrap--kfPsaVv > div > div > button.next-btn.next-medium.next-btn-normal.next-pagination-item.next-next')))
submit.click()
# 判断页码是否和当前页相等
wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#pageContent > div.LeftLay--leftWrap--xBQipVc > div.LeftLay--leftContent--AMmPNfB > div.Pagination--pgWrap--kfPsaVv > div > div > div > button.next-btn.next-medium.next-btn-normal.next-pagination-item.next-current > span'), str(page_num)))
print("跳转页面成功")
get_products(page_num) # 获取该页商品数据
except TimeoutException:
page_turning(page_num)
def simulate_scroll():
"""
模拟鼠标慢慢滚动以加载商品图片
:return:
"""
# 获取页面高度
page_height = browser.execute_script("return document.body.scrollHeight")
# 定义滚动步长和间隔时间
scroll_step = 3 # 每次滚动的距离
scroll_delay = 8 # 每次滚动的间隔时间(秒)
# 模拟慢慢滚动
current_position = 0
while current_position < page_height-1500:
# 计算下一个滚动位置
next_position = current_position + scroll_step
# 执行滚动动作
browser.execute_script(f"window.scrollTo(0, {next_position});")
# 等待一段时间
browser.implicitly_wait(scroll_delay)
# 更新当前滚动位置
current_position = next_position
#duizhao=driver.find_element(By.CSS_SELECTOR, '#pageContent > div.LeftLay--leftWrap--xBQipVc > div.LeftLay--leftContent--AMmPNfB > div.Pagination--pgWrap--kfPsaVv > div > div > button.next-btn.next-medium.next-btn-normal.next-pagination-item.next-next')
#dz=driver.execute_script("arguments[0].scrollIntoView();",duizhao)
def get_products(page_num):
"""
获取对应页码下的所有商品
:param page_num:页码
:return:
"""
print(f"正在提取第{page_num}页的商品信息...")
time.sleep(random.randint(5, 8))
simulate_scroll() # 模拟鼠标慢慢滚动以加载图片
time.sleep(random.randint(7, 12))
html = browser.page_source
doc = pq(html)
# 提取所有商品的共同父元素的类选择器
items = doc('div.PageContent--contentWrap--mep7AEm > div.LeftLay--leftWrap--xBQipVc > div.LeftLay--leftContent--AMmPNfB > div.Content--content--sgSCZ12 > div > div').items()
for item in items:
title = item.find('.Title--title--jCOPvpf > span').text() # 定位商品标题
image = item.find('.MainPic--mainPicWrapper--iv9Yv90 > img').attr('src') # 定位商品图片地址
price = item.find('span.Price--priceInt--ZlsSi_M').text() + item.find('span.Price--priceFloat--h2RR0RK').text() + '元' # 定位价格
deal = item.find('.Price--realSales--FhTZc7U').text() # 定位交易量
shop = item.find('.ShopInfo--shopName--rg6mGmy').text() # 定位店名
# 定位所在地信息
location = item.find('div.Price--priceWrapper--Q0Dn7pN > div:nth-child(4) > span').text() + ' ' + item.find('div.Price--priceWrapper--Q0Dn7pN > div:nth-child(5) > span').text()
product = {
'商品图片': image,
'价格': price,
'商品简介': title,
'交易数量': deal,
'店铺名称': shop,
'店铺所在地': location
}
ws.append([product['商品简介'], product['价格'], product['交易数量'], product['店铺名称'], product['店铺所在地'], product['商品图片']])
print(product)
def search_goods(start_page, total_pages):
"""
抓取索引页
:param page: 页码
"""
print(f'正在爬取第{start_page}页')
try:
url = 'https://s.taobao.com'
browser.get(url)
time.sleep(10) # 强制停止5秒,手动扫码登录
# 找到搜索输入框
input_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
# 找到搜索按钮
search_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_SearchForm > div > div.search-button > button')))
input_box.send_keys(KEYWORD)
search_button.click()
# 搜索商品后会再强制等待10秒,如有滑块请手动操作
time.sleep(10)
if start_page != 1:
# 使用JavaScript滚动到页面底部
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(random.randint(5, 8)) # 滑倒底部后停留 1 - 3 秒
# 定位到页面底部的页码输入框
page_box = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="pageContent"]/div[1]/div[3]/div[4]/div/div/span[3]/input')))
#print('定位到页面底部的页码输入框成功')
page_box.clear() # 清空输入框
page_box.send_keys(start_page) # 调用 send_keys()方法将页码填充到输入框中
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pageContent > div.LeftLay--leftWrap--xBQipVc > div.LeftLay--leftContent--AMmPNfB > div.Pagination--pgWrap--kfPsaVv > div > div > button.next-btn.next-medium.next-btn-normal.next-pagination-item.next-next > span')))
submit.click()
# 获取每一页的信息
get_products(start_page)
for i in range(start_page + 1, start_page + total_pages):
page_turning(i)
wb.save(r'淘宝商品数据.xlsx')
except TimeoutException as e:
print("搜索商品超时", e)
search_goods(start_page, total_pages)
def main():
try:
page_start = int(input("输入您想开始爬取的页面数:"))
page_all = int(input("输入您想爬取的总页面数: "))
search_goods(page_start, page_all)
except Exception as e:
print("main函数报错:", e)
if __name__ == '__main__':
KEYWORD = '女童连衣裙'#在这里修改商品名称
browser = get_browser()
wait = WebDriverWait(browser,20)
wb = Workbook() # 新建工作簿
ws = wb.active # 获取工作表
ws.append(['商品简介', '价格', '交易数量', '店铺名称', '店铺所在地', '商品图片'])
main()
运行后在下方输入框输入开始页码和结束页面,在主函数keyword修改商品名称。
运行后存到excel,效果如图