第四节:电商信息爬取项目实战
课程目标
- 学习如何通过playwright完成某宝商品信息爬取
课程内容
编码实现
import json
from playwright.sync_api import sync_playwright
import time
from tqdm import tqdm
import pandas as pd
# Scrape the product cards shown on a Taobao search results page and export
# them to an Excel workbook, one row per product card.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    try:
        # NOTE(review): cookies.json is assumed to hold a Playwright storage
        # state (as the variable name suggests) -- confirm how it was saved.
        with open("cookies.json", "r", encoding="utf-8") as f:
            storage_state = json.load(f)

        # Apply the saved state to the page's context; previously the file
        # was loaded but never handed to the browser.
        page = browser.new_page(storage_state=storage_state)
        page.goto("https://uland.taobao.com/sem/tbsearch?localImgKey=&page=1&q=%E5%B7%A5%E8%A3%85%E8%A3%A4&tab=all")
        page.wait_for_selector(".Card--doubleCardWrapper--L2XFE73")

        # Scroll down in small steps before collecting the cards
        # (presumably so lazily loaded cards get rendered).
        for _ in range(10):
            page.mouse.wheel(0, 1000)
            time.sleep(0.3)

        boxes = page.locator(".Card--doubleCardWrapper--L2XFE73").all()
        goods_infos = []
        for box in tqdm(boxes):
            title = box.locator(".Title--title--jCOPvpf").inner_text()
            img = box.locator(".MainPic--mainPic--rcLNaCv").get_attribute("src")

            # The price is rendered as two elements (integer part and
            # fractional part); join their texts before converting.
            price_int = box.locator(".Price--priceInt--ZlsSi_M").inner_text()
            price_float = box.locator(".Price--priceFloat--h2RR0RK").inner_text()
            price = float(price_int + price_float)

            # Two location elements are read as province + city; otherwise
            # only the first one is used as the city. A card without any
            # location element yields empty strings instead of IndexError.
            city_items = box.locator(".Price--procity--_7Vt3mX").all()
            if len(city_items) == 2:
                father_city = city_items[0].inner_text()
                son_city = city_items[1].inner_text()
            elif city_items:
                father_city = ""
                son_city = city_items[0].inner_text()
            else:
                father_city = ""
                son_city = ""

            shop_name = box.locator(".ShopInfo--shopName--rg6mGmy").inner_text()

            goods_info = {
                "商店名称": shop_name,
                "发货省": father_city,
                "发货市": son_city,
                "价格": price,
                "商品图片": img,
                "商品标题": title,
            }
            goods_infos.append(goods_info)
    finally:
        # Always release the browser process, even if scraping fails midway.
        browser.close()

# The collected rows are plain Python values, so the export can run after
# the browser has been closed.
df = pd.DataFrame(goods_infos)
df.to_excel("淘宝-工装裤.xlsx", index=False)
测试与调试