概述
- 结果展示
- 系统环境配置
- 安装依赖
- 代码示例
1.爬取结果展示

2.系统环境配置
- mac系统
需要打开Safari浏览器的“允许远程自动化”开关(菜单栏:开发 → 允许远程自动化),如下:

- windows系统
安装chrome浏览器
3.创建虚拟环境及安装相关依赖
当前使用的python版本为3.12.5,python3以上的应该都没问题
python -m venv py3_env # 在当前路径下创建虚拟环境
source py3_env/bin/activate # 进入虚拟环境中(windows系统下请执行 py3_env\Scripts\activate)
pip install selenium # 安装依赖包
4.代码示例
* 修改keyword和page_start、page_end来爬取你需要的指定商品信息(注意:代码使用range(page_start, page_end),不包含page_end本身,默认设置实际爬取第1~4页)
mac代码示例
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.safari.options import Options
# Safari offers no headless mode, so no '--headless' argument is set here;
# the browser window is always visible while the script runs.
options = Options()
browser = webdriver.Safari(options=options)

# Page range to crawl: page_start is the first page requested and page_end is
# exclusive, because get_page iterates over range(page_start, page_end).
page_start = 1
page_end = 5

# Search phrase sent to the Taobao search page.
keyword = "3d打印"

# Accumulates one dict per scraped product.
output_list = []
def get_page(keyword, output_list):
    """Scrape Taobao search-result pages for ``keyword`` into ``output_list``.

    Uses the module-level ``browser`` to visit result pages ``page_start`` up
    to, but not including, ``page_end`` (both module-level settings). One dict
    per product card is appended to ``output_list`` in place. When a page
    cannot be scraped, a notice is printed and the crawl moves on; cards
    collected from that page before the failure are kept.

    Args:
        keyword: Search phrase; percent-encoded before being placed in the
            query string so characters such as ``&``, ``#`` or spaces cannot
            corrupt the URL.
        output_list: List that receives the scraped product dicts.
    """
    # Imported locally so this function carries its own extra dependencies.
    from urllib.parse import quote

    from selenium.common.exceptions import WebDriverException

    # Window size and implicit wait are session-wide; set them once, not per page.
    browser.maximize_window()
    browser.implicitly_wait(6)
    encoded_keyword = quote(keyword)

    for page in range(page_start, page_end):
        browser.get(
            'https://uland.taobao.com/sem/tbsearch?localImgKey=&page='
            f'{page}&q={encoded_keyword}&tab=all'
        )
        try:
            print('using classic version selector')
            goods_arr = browser.find_elements(By.CLASS_NAME, 'Card--doubleCardWrapper--L2XFE73')
            for goods in goods_arr:
                item_name = goods.find_element(By.CSS_SELECTOR, '.Title--title--jCOPvpf > span').text
                item_price = goods.find_element(By.CLASS_NAME, 'Price--priceInt--ZlsSi_M').text
                item_shop = goods.find_element(By.CLASS_NAME, 'ShopInfo--shopName--rg6mGmy').text
                output_list.append({
                    "商品名称": item_name,
                    "商品价格": item_price,
                    "商品店铺名称": item_shop,
                })
        except WebDriverException:
            # Only driver/lookup failures skip the page; Ctrl-C and programming
            # errors are no longer swallowed.
            print(f'注意:第【{page}】页将跳过如需获取请重新运行程序!')
        # Pause between pages to avoid hammering the site.
        time.sleep(2)

    print(output_list)
    time.sleep(5)
def save_result(result, path="3d_info.txt"):
    """Write scraped items to a UTF-8 text file, one ``str(item)`` per line.

    Args:
        result: Iterable of items to persist (the script passes a list of
            product dicts).
        path: Destination file. Defaults to ``3d_info.txt`` in the current
            directory, matching the previously hard-coded name. Any existing
            content is overwritten.
    """
    with open(path, "w", encoding='utf-8') as f:
        f.writelines(str(item) + "\n" for item in result)
try:
    get_page(keyword, output_list)
    # Persist the scraped items to a text file.
    save_result(result=output_list)
finally:
    # quit() closes every window and ends the driver session; running it in
    # finally keeps a failed crawl from leaving the browser/driver behind.
    browser.quit()
sys.exit()
windows代码示例
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Reduce the signals Chrome exposes about being driven by WebDriver.
options = webdriver.ChromeOptions()
# Each add_experimental_option call replaces the value stored under its key,
# so both switches have to be excluded in a single call.
options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
# To run without a visible window, additionally call:
#   options.add_argument('--headless')
options.add_argument("--disable-blink-features=AutomationControlled")
browser = webdriver.Chrome(options=options)
# Registered via CDP so the script is evaluated on each new document and pages
# read navigator.webdriver as undefined.
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""
})

# Page range to crawl: page_start is the first page requested and page_end is
# exclusive, because get_page iterates over range(page_start, page_end).
page_start = 1
page_end = 5

# Search phrase sent to the Taobao search page.
keyword = "3d打印"

# Accumulates one dict per scraped product.
output_list = []
def get_page(keyword, output_list):
    """Scrape Taobao search-result pages for ``keyword`` into ``output_list``.

    Uses the module-level ``browser`` to visit result pages ``page_start`` up
    to, but not including, ``page_end`` (both module-level settings). One dict
    per product card is appended to ``output_list`` in place. When a page
    cannot be scraped, a notice is printed and the crawl moves on; cards
    collected from that page before the failure are kept.

    Args:
        keyword: Search phrase; percent-encoded before being placed in the
            query string so characters such as ``&``, ``#`` or spaces cannot
            corrupt the URL.
        output_list: List that receives the scraped product dicts.
    """
    # Imported locally so this function carries its own extra dependencies.
    from urllib.parse import quote

    from selenium.common.exceptions import WebDriverException

    # Window size and implicit wait are session-wide; set them once, not per page.
    browser.maximize_window()
    browser.implicitly_wait(6)
    encoded_keyword = quote(keyword)

    for page in range(page_start, page_end):
        browser.get(
            'https://uland.taobao.com/sem/tbsearch?localImgKey=&page='
            f'{page}&q={encoded_keyword}&tab=all'
        )
        try:
            print('using classic version selector')
            goods_arr = browser.find_elements(By.CLASS_NAME, 'Card--doubleCardWrapper--L2XFE73')
            for goods in goods_arr:
                item_name = goods.find_element(By.CSS_SELECTOR, '.Title--title--jCOPvpf > span').text
                item_price = goods.find_element(By.CLASS_NAME, 'Price--priceInt--ZlsSi_M').text
                item_shop = goods.find_element(By.CLASS_NAME, 'ShopInfo--shopName--rg6mGmy').text
                output_list.append({
                    "商品名称": item_name,
                    "商品价格": item_price,
                    "商品店铺名称": item_shop,
                })
        except WebDriverException:
            # Only driver/lookup failures skip the page; Ctrl-C and programming
            # errors are no longer swallowed.
            print(f'注意:第【{page}】页将跳过如需获取请重新运行程序!')
        # Pause between pages to avoid hammering the site.
        time.sleep(2)

    print(output_list)
    time.sleep(5)
def save_result(result, path="3d_info.txt"):
    """Write scraped items to a UTF-8 text file, one ``str(item)`` per line.

    Args:
        result: Iterable of items to persist (the script passes a list of
            product dicts).
        path: Destination file. Defaults to ``3d_info.txt`` in the current
            directory, matching the previously hard-coded name. Any existing
            content is overwritten.
    """
    with open(path, "w", encoding='utf-8') as f:
        f.writelines(str(item) + "\n" for item in result)
try:
    get_page(keyword, output_list)
    # Persist the scraped items to a text file.
    save_result(result=output_list)
finally:
    # quit() closes every window and ends the driver session; running it in
    # finally keeps a failed crawl from leaving Chrome/chromedriver behind.
    browser.quit()
sys.exit()
保存代码为:taobao_shangpin_list.py
在上面的虚拟环境中执行:
python taobao_shangpin_list.py
执行过程中,程序会打开浏览器进行页面爬取信息操作,如果是chrome浏览器,可以使用headless参数让进程在后台运行
2502

被折叠的 条评论
为什么被折叠?



