虾皮获取商品信息
用于从Shopee网站获取商品信息。它接受一个header参数(通常包含用户代理和cookie等信息)和一个可选的URL参数(默认为"https://shopee.co.id/")。如果提供了有效的URL,它将尝试获取该URL的商品信息,包括标题、运费、属性、图片等。如果没有提供有效的URL或登录信息,它将返回错误信息。
import requests
from bs4 import BeautifulSoup
import re
import json
from urllib.parse import urlparse
import uuid
# Shop/item ids appear in the URL path either as "...-i.<shop_id>.<item_id>"
# or as ".../<shop_id>/<item_id>".
_SHOPEE_ID_PATTERN = re.compile(r"-i\.(\d+)\.(\d+)|/(\d+)/(\d+)")

# Seconds to wait on each HTTP request when the caller gives no ``timeout``.
_DEFAULT_TIMEOUT = 30


def _parse_shop_item_ids(url):
    """Return ``(shop_id, item_id)`` strings found in the URL path, or None."""
    match = _SHOPEE_ID_PATTERN.search(urlparse(url).path)
    if match is None:
        return None
    # Exactly one alternative matched, so exactly two groups are non-None.
    shop_id, item_id = (part for part in match.groups() if part is not None)
    return shop_id, item_id


def _collect_variations(attrs_box):
    """Map each variation heading (``h3`` text) to its list of button options."""
    variations = {}
    for section in attrs_box.find_all("section"):
        options_box = section.find(
            "div", attrs={"class": "flex items-center j7HL5Q"}
        )
        heading = section.find("h3")
        # Sections without an options row or a heading carry no variation data.
        if options_box is None or heading is None:
            continue
        options = []
        for button in options_box.find_all("button"):
            thumbnail = button.find("img")
            options.append({
                "src": thumbnail.get("src") if thumbnail is not None else None,
                "attr_name": button.text,
            })
        variations[heading.text] = options
    return variations


def _collect_image_urls(soup):
    """Return the ``src`` of every gallery image; empty list if none found."""
    gallery = soup.find("div", attrs={"class": "airUhU"})
    if gallery is None:
        return []
    sources = []
    for block in gallery.find_all("div", attrs={"class": "UBG7wZ"}):
        image = block.find("img")
        if image is not None:
            sources.append(image.get("src"))
    return sources


def get_goods_info(header, Url="https://shopee.co.id/", *args, **kwargs):
    """Fetch product information for a Shopee product page.

    The product page at ``Url`` is downloaded and scraped for the title,
    freight text, variation options and gallery images. The shop and item
    ids are then parsed from the URL path and used to query the
    ``/api/v4/pdp/get_pc`` endpoint for attributes, models and description.

    :param header: HTTP headers sent with both requests.
    :param Url: Product page URL. An empty value or the bare default site
        URL is treated as "no URL supplied".
    :param args: Ignored; accepted for call compatibility.
    :param kwargs: Only ``timeout`` is read (seconds, passed to
        ``requests.get``; defaults to 30). Other keys are ignored.
    :return: One of:
        ``{"code": "0001", "error": "not url"}`` when no product URL is given;
        ``{"code": "0002", "error": "not login"}`` when the page response has
        no ``X-Request-ID`` header (treated here as "not logged in");
        ``{}`` when a required page element, the gallery images, the ids in
        the URL, or the API payload cannot be obtained;
        otherwise a dict with the keys ``url``, ``title``, ``freight``,
        ``attrs_info``, ``images``, ``info``, ``models`` and ``description``.
    :raises requests.RequestException: Network failures and timeouts from
        ``requests.get`` are not caught and propagate to the caller.
    """
    # Reject a missing URL before touching the network.
    if not Url or Url == "https://shopee.co.id/":
        return {"code": "0001", "error": "not url"}

    # Without a timeout a stalled connection would block forever.
    timeout = kwargs.get("timeout", _DEFAULT_TIMEOUT)

    response = requests.get(Url, headers=header, timeout=timeout)
    if not response.headers.get("X-Request-ID"):
        return {"code": "0002", "error": "not login"}
    soup = BeautifulSoup(response.text, 'html.parser')

    title_box = soup.find("div", attrs={"class": "WBVL_7"})
    freight_box = soup.find("div", attrs={"class": "flex items-center PZGOkt"})
    attrs_box = soup.find("div", attrs={"class": "flex KIoPj6 W5LiQM"})
    if title_box is None or freight_box is None or attrs_box is None:
        return {}

    variations = _collect_variations(attrs_box)

    images = _collect_image_urls(soup)
    if not images:
        return {}

    # Extract the shop/item ids needed by the detail API from the URL.
    ids = _parse_shop_item_ids(Url)
    if ids is None:
        return {}
    shop_id, item_id = ids

    # NOTE(review): the API host is fixed to shopee.co.id regardless of the
    # host in ``Url`` — confirm whether other regional sites must be supported.
    api_response = requests.get(
        "https://shopee.co.id/api/v4/pdp/get_pc",
        headers=header,
        params={"shop_id": shop_id, "item_id": item_id, "detail_level": 0},
        timeout=timeout,
    )
    # Decode the body as JSON directly: running it through an HTML parser
    # first strips tag-like text and decodes entities inside string values,
    # which corrupts the payload.
    try:
        payload = json.loads(api_response.text)
    except ValueError:
        return {}

    data = payload.get("data") if isinstance(payload, dict) else None
    if not data:
        return {}

    item = data.get("item") or {}
    raw_attrs = (data.get("product_attributes") or {}).get("attrs") or []
    info = [
        {"name": attr.get("name"), "value": attr.get("value")}
        for attr in raw_attrs
    ]

    return {
        "url": Url,
        "title": title_box.text,
        "freight": freight_box.text,
        "attrs_info": variations,
        "images": images,
        "info": info,
        "models": item.get("models"),
        "description": item.get("description"),
    }