在数据爬取中,我们经常需要从电商平台上提取产品信息,比如在 1688 上获取某个店铺的所有商品链接。1688 平台对部分 API 的调用做了加密和鉴权处理,其核心是 sign 参数的生成。因此,在本文中,我们将使用 Python 对 1688 的店铺页面进行爬取,提取店铺的商品链接,并通过模拟请求、生成签名等技术手段实现这一功能。
注:若需要获取商品全部信息,只需改进一下fetch_company_data()即可
项目目标
通过输入店铺的名称和链接,爬取并获取该店铺所有商品的链接,最终将链接保存到 Excel 文件中。代码分为以下几个部分:
- 生成签名 sign 参数
- 获取用于请求的 cookies
- 提取店铺 memberId 以识别店铺信息
- 分页请求店铺商品数据
- 将商品链接保存到 Excel 文件中
实现步骤
实现方案包括以下几个步骤:
- 初始化请求参数:包括 cookies 和 headers。
- 获取签名参数:使用 sign.js 来生成 sign 参数,以通过 1688的鉴权机制。
- 爬取数据并分页:请求店铺商品的分页数据,直到没有新数据为止。
- 存储到 Excel:将商品的链接保存到 Excel文件中,方便后续数据处理。
代码解析
python代码
该代码通过 Python 实现从 1688 店铺中获取商品链接。它使用 `requests` 库发送 HTTP 请求,使用 `execjs` 执行 JavaScript 文件 `sign.js` 来生成请求所需的签名参数 `sign`,而 `httpx` 负责异步请求和处理 Cookies。在生成签名参数时,代码首先获取当前毫秒时间戳 `current_timestamp`,然后将 cookie 中的 `_m_h5_tk`(取第一个下划线之前的部分)、时间戳、`APP_KEY` 和请求数据 `data` 用 `&` 拼接成字符串,通过 `sign.js` 生成最终的 `sign`,以便通过 1688 平台的鉴权。
import json
import random
import re
import time
import pandas as pd
from typing import Dict, Any, Optional
import re
import requests
import execjs
import httpx
"""
使用说明:
1.1688加密的接口几乎都是用的sign参数配合cookie进行鉴权
2.sign参数生成需要4个参数,分别是_m_h5_tk、毫秒时间戳、app_key、请求参数data数据
3.请求时带上生成的sign值和用来生成sign的时间戳即可
4.有部分接口没有用sign进行鉴权,使用的是_tb_token_参数,这个值可以通过接口获取并且在初始化的cookie中
"""
# Sent as the `jsv` query parameter of every signed API call
# (presumably the version of the mtop JS SDK being imitated -- TODO confirm).
JS_VERSION = '2.7.0'
# Application key: sent as the `appKey` query parameter and also mixed into
# the string that get_sign_params() signs.
APP_KEY = '12574478'
# Request headers shared by every HTTP call in this module. update_cookie()
# rewrites the 'cookie' entry in place as new cookies are collected.
headers = {
# Initial cookie string; note that it is a single space, not an empty string.
'cookie': " ",
'referer': 'https://sycm.1688.com/ms/home/home?dateRange=2024-04-01%7C2024-04-30&dateType=month',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
}
def get_milliseconds_timestamp() -> int:
    """Return the current Unix time as a whole number of milliseconds."""
    now_seconds = time.time()
    return int(now_seconds * 1000)
def jsonp_to_json(jsonp_data: str) -> Dict[str, Any] | None:
try:
match = re.match(r'.*?({.*}).*?', jsonp_data, re.S)
if match:
json_str = match.group(1)
return json.loads(json_str)
else:
raise ValueError('Invalid JSONP format')
except (json.JSONDecodeError, re.error):
raise ValueError('Invalid Input')
def cookies_str_to_dict(cookies_str: str) -> Dict[str, str]:
    """Parse a ``Cookie`` header string into a name -> value mapping.

    Segments are separated by ``;`` with optional surrounding whitespace.
    Segments that are blank or have no ``name=value`` shape are skipped, so
    placeholder inputs such as ``" "`` yield an empty dict instead of raising.

    Args:
        cookies_str: Header text such as ``"a=1; b=2"``; may be empty.

    Returns:
        Dict of cookie names to values. Only the first ``=`` in a segment
        separates name from value, so values may themselves contain ``=``.
        When a name repeats, the last occurrence wins.
    """
    cookies_dict: Dict[str, str] = {}
    if not cookies_str:
        return cookies_dict
    for segment in cookies_str.split(';'):
        key, separator, value = segment.strip().partition('=')
        # No "=" (blank or flag-only segment) or no name: nothing to store.
        if not separator or not key:
            continue
        cookies_dict[key] = value
    return cookies_dict
def dict_to_cookies_str(cookies_dict: Dict[str, str]) -> str:
    """Serialize a cookie mapping as ``name=value`` pairs joined by ``'; '``."""
    return '; '.join(f"{name}={val}" for name, val in cookies_dict.items())
def update_cookie(cookie_list) -> Dict[str, str]:
    """Merge freshly received cookies into the shared ``headers['cookie']``.

    Args:
        cookie_list: Iterable of dicts, each with a ``'name'`` and a
            ``'value'`` key. Later entries override earlier ones and any
            cookie of the same name already present in ``headers``.

    Returns:
        The module-level ``headers`` dict, after its ``'cookie'`` entry has
        been rewritten in place.
    """
    merged = cookies_str_to_dict(headers.get('cookie', {}))
    merged.update({entry['name']: entry['value'] for entry in cookie_list})
    # Write the merged jar back into the shared headers.
    headers['cookie'] = dict_to_cookies_str(merged)
    return headers
def get_sign_params(_m_h5_tk: str, data: str) -> Dict[str, Any]:
    """Build the ``sign`` / ``t`` pair required by signed API requests.

    The signed string is ``<token>&<timestamp>&<APP_KEY>&<data>``, where
    ``<token>`` is the part of ``_m_h5_tk`` before its first underscore. The
    digest itself is computed by the ``sign`` function in ``./sign.js``.

    Args:
        _m_h5_tk: Value of the ``_m_h5_tk`` cookie (or ``'undefined'`` before
            one has been obtained).
        data: The exact ``data`` string that will be sent with the request.

    Returns:
        ``{"sign": <digest>, "t": <millisecond timestamp>}``. The request
        must send this same ``t`` value.

    Raises:
        OSError: If ``./sign.js`` cannot be read (the path is relative to the
            current working directory).
    """
    current_timestamp = get_milliseconds_timestamp()
    token = _m_h5_tk.split("_")[0]
    pre_sign_str = f'{token}&{current_timestamp}&{APP_KEY}&' + data
    sign_js_path = './sign.js'
    # Read through a context manager so the handle is closed deterministically,
    # and pin the encoding instead of relying on the platform default.
    with open(sign_js_path, encoding='utf-8') as js_file:
        js_source = js_file.read()
    sign = execjs.compile(js_source).call('sign', pre_sign_str)
    return {"sign": sign, "t": current_timestamp}
async def get_tb_token() -> None:
    """Request the login.taobao.com jump URL and store the cookies it sets.

    Redirects are not followed; whatever cookies come back on that single
    response are merged into the shared ``headers`` via ``update_cookie``.
    """
    jump_params = {
        'group': 'tao',
        'target': 'https://work.1688.com/home/unReadMsgCount.htm?tbpm=1&callback=jQuery0',
    }
    request_timeout = httpx.Timeout(60.0, connect=60.0)
    async with httpx.AsyncClient() as client:
        response = await client.get(
            'https://login.taobao.com/jump',
            params=jump_params,
            headers=headers,
            timeout=request_timeout,
            follow_redirects=False,
        )
        received = [{"name": name, "value": value} for name, value in response.cookies.items()]
        update_cookie(received)
async def get_cna() -> None:
    """Request log.mmstat.com/eg.js and store the cookies it sets.

    The current millisecond timestamp is passed as the ``t`` query value;
    the response cookies are merged into the shared ``headers`` via
    ``update_cookie``.
    """
    request_timeout = httpx.Timeout(60.0, connect=60.0)
    async with httpx.AsyncClient() as client:
        timestamp = str(int(time.time() * 1000))
        response = await client.get(
            f"https://log.mmstat.com/eg.js?t={timestamp}",
            headers=headers,
            timeout=request_timeout,
        )
        received = [{"name": name, "value": value} for name, value in response.cookies.items()]
        update_cookie(received)
async def api_request(
        req_type: str,
        api: str,
        params: Optional[Dict[str, Any]] = None,
        data: Optional[str] = None,
        req_body: Optional[Dict[str, Any]] = None,
        _m_h5_tk: Optional[str] = 'undefined'
) -> Dict[str, Any] | str | None:
    """Send one signed API request.

    The function has two modes, selected by ``_m_h5_tk``:

    * ``'undefined'`` (the default, also used when ``None`` is passed): a
      bootstrap request whose only purpose is to collect the response
      cookies. They are merged into the shared ``headers`` and the value of
      the ``_m_h5_tk`` cookie is returned (``''`` if the server set none).
    * any other value: a normal request; the decoded response is returned.

    Args:
        req_type: ``'POST'`` sends a form POST; anything else sends a GET.
        api: Endpoint URL.
        params: Query parameters. ``t`` and ``sign`` are written into this
            dict in place, so the caller's dict is modified.
        data: The ``data`` string; it is part of the signed string and must
            match what is sent.
        req_body: Form body for POST requests. ``data`` is written into it in
            place under the ``'data'`` key.
        _m_h5_tk: Token used for signing, taken from the response cookie of a
            bootstrap request.

    Returns:
        The ``_m_h5_tk`` cookie value in bootstrap mode; otherwise the parsed
        JSON (plain JSON or JSONP-wrapped) response.

    Raises:
        TypeError: If ``data`` is ``None``.
        ValueError: If a JSONP response cannot be parsed.
    """
    if data is None:
        # Fail here with a clear message rather than deep inside the signer.
        raise TypeError('data is required: it is part of the signed string')
    if params is None:
        params = {}
    if _m_h5_tk is None:
        _m_h5_tk = 'undefined'
    is_bootstrap = _m_h5_tk == 'undefined'

    sign_dict = get_sign_params(_m_h5_tk, data)
    if is_bootstrap:
        print(f"初始化sign参数: {sign_dict}")
    else:
        print(f"获取sign参数: {sign_dict}")
    # The timestamp sent must be exactly the one that was signed.
    params['t'] = sign_dict['t']
    params['sign'] = sign_dict['sign']

    timeout = httpx.Timeout(60.0, connect=60.0)
    async with httpx.AsyncClient() as client:
        if req_type == 'POST':
            if req_body is None:
                req_body = {}
            req_body['data'] = data
            response = await client.post(api, data=req_body, params=params, headers=headers, timeout=timeout)
        else:
            response = await client.get(api, params=params, headers=headers, timeout=timeout)

    if is_bootstrap:
        # First contact: harvest the token cookies for subsequent signed calls.
        cookies = response.cookies
        cookies_list = [{"name": k, "value": v} for k, v in cookies.items()]
        _m_h5_tk = cookies.get('_m_h5_tk', '')
        update_cookie(cookies_list)
        print(f"获取token参数: {dict(cookies)}")
        return _m_h5_tk

    content = response.text
    if content.startswith('{'):
        return response.json()
    return jsonp_to_json(content)
def _build_offer_list_data(store_id: str, page_num: int) -> str:
    """Serialize the ``data`` payload requesting one page of a store's offers."""
    component_params = {
        "memberId": store_id,
        "appdata": {
            "sortType": "wangpu_score",
            "sellerRecommendFilter": False,
            "mixFilter": False,
            "tradenumFilter": False,
            "quantityBegin": None,
            "pageNum": page_num,
            "count": 30,
        },
    }
    # Compact separators and raw non-ASCII keep the output identical to the
    # hand-written template this replaces, while escaping store_id properly.
    compact = {"separators": (',', ':'), "ensure_ascii": False}
    # "params" is itself a JSON document carried as a string, hence two dumps.
    return json.dumps(
        {
            "componentKey": "Wp_pc_common_offerlist",
            "params": json.dumps(component_params, **compact),
        },
        **compact,
    )


async def fetch_company_data(member_name: str, store_id: str) -> Dict[str, Any] | None:
    """Collect every product link of a 1688 store and save them to Excel.

    Pages of 30 offers are requested one after another until a page comes
    back without offers. Each offer id is turned into a
    ``https://detail.1688.com/offer/<id>.html`` link, and the full list is
    written to ``data/<member_name>.xlsx`` (one ``URL`` column).

    Args:
        member_name: Store name; used only for the output file name.
        store_id: The store's ``memberId``.

    Returns:
        The parsed response of the last request made, i.e. the page that
        contained no offers.
    """
    # Local imports: at module level asyncio is only imported under the
    # __main__ guard, so this keeps the function usable when imported.
    import asyncio
    import os

    page_num = 1
    all_urls = []  # every product link collected so far
    api = 'https://h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.moduleasyncservice/1.0/'

    while True:
        data = _build_offer_list_data(store_id, page_num)
        params = {
            'jsv': JS_VERSION,
            'appKey': APP_KEY,
            't': '',
            'sign': '',
            'api': 'mtop.alibaba.alisite.cbu.server.ModuleAsyncService',
            'v': '1.0',
            'type': 'jsonp',
            'valueType': 'string',
            'dataType': 'jsonp',
            'timeout': '60000',  # 10000
            'callback': 'mtopjsonp1',
            'data': data
        }
        # Bootstrap request: returns the _m_h5_tk token and stores the cookies.
        # NOTE(review): a fresh token is requested for every page; it may be
        # reusable across pages -- confirm before hoisting this out of the loop.
        _m_h5_tk = await api_request('GET', api, params, data)
        # Signed request; the same dict is deliberately passed as both the
        # query parameters and the form body.
        response_data = await api_request('POST', api, params, data, params, _m_h5_tk=_m_h5_tk)

        # Any level may be missing or null; treat all of those as "no offers".
        content = ((response_data or {}).get('data') or {}).get('content') or {}
        offer_list = content.get('offerList') or []
        if not offer_list:
            # print(response_data)
            print(f"没有更多数据,爬取结束。")
            break

        # Offers without an id cannot be linked to, so they are skipped.
        urls = [
            f"https://detail.1688.com/offer/{offer.get('id')}.html"
            for offer in offer_list
            if offer.get('id') is not None
        ]
        all_urls.extend(urls)
        print(f"第 {page_num} 页抓取成功,当前共 {len(all_urls)} 个 URL")

        page_num += 1
        # Pause a random 5-10 seconds between pages.
        wait_time = random.uniform(5, 10)
        print(f"暂停 {wait_time:.2f} 秒...")
        await asyncio.sleep(wait_time)

    print(f"总共爬取了 {len(all_urls)} 个 URL,正在保存到 Excel 文件...")
    df = pd.DataFrame(all_urls, columns=["URL"])
    # Make sure the output directory exists so a finished crawl is not lost.
    os.makedirs('data', exist_ok=True)
    file_name = 'data/' + member_name + '.xlsx'
    df.to_excel(file_name, index=False, engine='openpyxl')
    print(f"所有 URL 已保存到 {file_name}")
    return response_data
async def init() -> None:
    """Populate the shared cookie header by running both cookie fetchers in order."""
    for fetch_cookies in (get_cna, get_tb_token):
        await fetch_cookies()
def get_memberId(url):
    """Look up a store's ``memberId`` from its offer-list page.

    Args:
        url: Store home URL, e.g. ``https://xxxxx.1688.com/``; a trailing
            slash is tolerated.

    Returns:
        The ``memberId`` string found in the page HTML, or ``None`` when the
        request does not return HTTP 200 or the page contains no
        ``"memberId":"..."`` entry.

    Raises:
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Strip the trailing slash so the joined path does not start with "//".
    pageurl = url.rstrip('/') + '/page/offerlist.htm?spm=a2615.2177701.wp_pc_common_topnav.0'
    # Without a timeout requests can wait forever on a stalled connection;
    # 60 s matches the limit used by the httpx calls in this module.
    response = requests.get(pageurl, headers=headers, timeout=60)
    if response.status_code != 200:
        print('请求失败,状态码:', response.status_code)
        return None

    match = re.search(r'"memberId":"([^"]+)"', response.text)
    if match is None:
        print('未找到 memberId')
        return None

    member_id = match.group(1)
    print('Member ID:', member_id)
    return member_id
if __name__ == '__main__':
    # Binding asyncio at module scope here also makes it visible to the
    # coroutines defined above when the file is run as a script.
    import asyncio

    # Example: resolve a store's memberId, then crawl all of its product links.
    member_name = 'xxxx有限公司'
    member_url = 'https://xxxxx.1688.com/'
    member_id = get_memberId(member_url)  # store ID
    if member_id is None:
        print("无该店铺")
    else:
        # Cookies must be initialised before any signed request is made.
        asyncio.run(init())
        asyncio.run(fetch_company_data(member_name, member_id))
sign.js
// Compute a 32-character lowercase hexadecimal digest of the string `e`.
// The four initial state words and the per-step constants below are those of
// MD5, so this appears to be a plain MD5 of the UTF-8 encoded input
// (NOTE(review): confirm against a known MD5 test vector).
function sign(e) {
// t: rotate the 32-bit value e left by t bits.
function t(e, t) {
return e << t | e >>> 32 - t
}
// o: add two 32-bit words with wrap-around. The low 30 bits are summed
// directly; bits 30 and 31 are folded in with XOR to handle the carries.
function o(e, t) {
var o, n, r, i, a;
return r = 2147483648 & e,
i = 2147483648 & t,
a = (1073741823 & e) + (1073741823 & t),
(o = 1073741824 & e) & (n = 1073741824 & t) ? 2147483648 ^ a ^ r ^ i : o | n ? 1073741824 & a ? 3221225472 ^ a ^ r ^ i : 1073741824 ^ a ^ r ^ i : a ^ r ^ i
}
// n, r, i, a: one step of rounds 1, 2, 3 and 4 respectively. Each adds a
// round-specific boolean function of three state words, the message word
// (5th argument) and the step constant (7th argument) to the first word,
// rotates the sum left by the 6th argument, then adds the second word.
function n(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return e & t | ~e & o
}(n, r, i), a), u)), s), n)
}
function r(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return e & o | t & ~o
}(n, r, i), a), u)), s), n)
}
function i(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return e ^ t ^ o
}(n, r, i), a), u)), s), n)
}
function a(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return t ^ (e | ~o)
}(n, r, i), a), u)), s), n)
}
// s: render a 32-bit word as 8 hex digits, least-significant byte first.
function s(e) {
var t, o = "", n = "";
for (t = 0; 3 >= t; t++)
o += (n = "0" + (e >>> 8 * t & 255).toString(16)).substr(n.length - 2, 2);
return o
}
// f, h, m, y hold the running state; l, d, c, p hold a copy of it taken
// before each block; g is the padded message as an array of 32-bit words.
var u, l, d, c, p, f, h, m, y, g;
for (g = function(e) {
// Padding: pack the characters of e (one byte each) into 32-bit words,
// lowest byte first, append a 0x80 marker byte, and store the message
// length in bits in the last two words.
for (var t = e.length, o = t + 8, n = 16 * ((o - o % 64) / 64 + 1), r = Array(n - 1), i = 0, a = 0; t > a; )
i = a % 4 * 8,
r[(a - a % 4) / 4] |= e.charCodeAt(a) << i,
a++;
return i = a % 4 * 8,
r[(a - a % 4) / 4] |= 128 << i,
r[n - 2] = t << 3,
r[n - 1] = t >>> 29,
r
}(e = function(e) {
// Runs before the padding above: normalise CRLF to LF, then expand every
// UTF-16 code unit into one, two or three UTF-8 byte values.
var t = String.fromCharCode;
e = e.replace(/\r\n/g, "\n");
for (var o, n = "", r = 0; r < e.length; r++)
128 > (o = e.charCodeAt(r)) ? n += t(o) : o > 127 && 2048 > o ? (n += t(o >> 6 | 192),
n += t(63 & o | 128)) : (n += t(o >> 12 | 224),
n += t(o >> 6 & 63 | 128),
n += t(63 & o | 128));
return n
}(e)),
// Initial state words.
f = 1732584193,
h = 4023233417,
m = 2562383102,
y = 271733878,
u = 0; u < g.length; u += 16)
// One iteration per block of 16 words: remember the incoming state, run the
// 64 unrolled steps (16 per round), then add the remembered state back in.
l = f,
d = h,
c = m,
p = y,
h = a(h = a(h = a(h = a(h = i(h = i(h = i(h = i(h = r(h = r(h = r(h = r(h = n(h = n(h = n(h = n(h, m = n(m, y = n(y, f = n(f, h, m, y, g[u + 0], 7, 3614090360), h, m, g[u + 1], 12, 3905402710), f, h, g[u + 2], 17, 606105819), y, f, g[u + 3], 22, 3250441966), m = n(m, y = n(y, f = n(f, h, m, y, g[u + 4], 7, 4118548399), h, m, g[u + 5], 12, 1200080426), f, h, g[u + 6], 17, 2821735955), y, f, g[u + 7], 22, 4249261313), m = n(m, y = n(y, f = n(f, h, m, y, g[u + 8], 7, 1770035416), h, m, g[u + 9], 12, 2336552879), f, h, g[u + 10], 17, 4294925233), y, f, g[u + 11], 22, 2304563134), m = n(m, y = n(y, f = n(f, h, m, y, g[u + 12], 7, 1804603682), h, m, g[u + 13], 12, 4254626195), f, h, g[u + 14], 17, 2792965006), y, f, g[u + 15], 22, 1236535329), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 1], 5, 4129170786), h, m, g[u + 6], 9, 3225465664), f, h, g[u + 11], 14, 643717713), y, f, g[u + 0], 20, 3921069994), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 5], 5, 3593408605), h, m, g[u + 10], 9, 38016083), f, h, g[u + 15], 14, 3634488961), y, f, g[u + 4], 20, 3889429448), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 9], 5, 568446438), h, m, g[u + 14], 9, 3275163606), f, h, g[u + 3], 14, 4107603335), y, f, g[u + 8], 20, 1163531501), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 13], 5, 2850285829), h, m, g[u + 2], 9, 4243563512), f, h, g[u + 7], 14, 1735328473), y, f, g[u + 12], 20, 2368359562), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 5], 4, 4294588738), h, m, g[u + 8], 11, 2272392833), f, h, g[u + 11], 16, 1839030562), y, f, g[u + 14], 23, 4259657740), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 1], 4, 2763975236), h, m, g[u + 4], 11, 1272893353), f, h, g[u + 7], 16, 4139469664), y, f, g[u + 10], 23, 3200236656), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 13], 4, 681279174), h, m, g[u + 0], 11, 3936430074), f, h, g[u + 3], 16, 3572445317), y, f, g[u + 6], 23, 76029189), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 9], 4, 3654602809), h, m, g[u + 12], 11, 3873151461), f, h, g[u + 15], 16, 
530742520), y, f, g[u + 2], 23, 3299628645), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 0], 6, 4096336452), h, m, g[u + 7], 10, 1126891415), f, h, g[u + 14], 15, 2878612391), y, f, g[u + 5], 21, 4237533241), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 12], 6, 1700485571), h, m, g[u + 3], 10, 2399980690), f, h, g[u + 10], 15, 4293915773), y, f, g[u + 1], 21, 2240044497), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 8], 6, 1873313359), h, m, g[u + 15], 10, 4264355552), f, h, g[u + 6], 15, 2734768916), y, f, g[u + 13], 21, 1309151649), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 4], 6, 4149444226), h, m, g[u + 11], 10, 3174756917), f, h, g[u + 2], 15, 718787259), y, f, g[u + 9], 21, 3951481745),
f = o(f, l),
h = o(h, d),
m = o(m, c),
y = o(y, p);
// Concatenate the four state words as hex to form the digest.
return (s(f) + s(h) + s(m) + s(y)).toLowerCase()
}
声明
本项目仅供学习和研究使用,任何因其导致的版权或服务条款侵犯等行为均与维护者无关,请确保合规使用。