目录
1.明确需求和抓包思路
目标: 爬取唯品会中泳衣的商品信息
思路:按F12打开开发者工具(抓包工具) --> 刷新页面 --> 搜索关键字找到我们想要的数据包,并进行分析
2.发送请求,获取数据
请求链接:获得商品ID数据
当url很长时,我们可以把它拆成两部分来写:
问号前面 -> 请求链接
问号后面 -> 请求参数(查询参数),可在开发者工具的 Network -> Payload 中查看
import requests

# Browser-like request headers sent along with the request.
headers = {
    "Referer": "https://category.vip.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
}
# Request URL: the part of the captured link before the "?".
url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v2?'
# Query parameters: the part of the captured link after the "?".
data = {
    "app_name": "shop_pc",
    "app_version": "4.0",
    "warehouse": "VIP_HZ",
    'fdc_area_id': "104103105",
    "client": "pc",
    'mobile_platform': "1",
    "province_id": "104103",
    "api_key": "70f71280d5d547b2a7bb370a529aeea1",
    "user_id": "",
    "mars_cid": "1689329487922_7feadb1a833bb7d1380e40f6e7572fff",
    "wap_consumer": "a",
    "productIds": "6920269641265660623,6919269290708597831,6918915874224063836,6919798151514506573,2736390477346888,2891474976578247,6919798151514518861,6919675341290738836,6919259179854452052,6920250903815424655,2766703467616269,2169994865,6920279689597107167,6919960987713959836,6920382449658235796,2876504189630669,2910440252357662,2736390477355080,2895685386241758,2765190986319752,6920447656244110279,2804371112156959,6920442282866906503,6919827210032574940,6920194289836579348,6918576590623773716,6919768033477487444,6919938807569897364,2765190986295176,6919827210015789532,2876504189626573,6919764953763153552,6919165075042048660,6920431008557578196,6920342169219109916,6919930400923369948,2902922323390983,6920121891363422797,6919798151531308365,6920416110144640532,6920315893087596252,6920147527227647124,2884110521883335,6920434934424803860,6919772356565115220,6920071029232646484,6919945966029039508,6919675341324334228,6920230370920125213,2924666982808199",
    "scene": "search",
    "standby_id": "nature",
    "context": "",
    "_": "1689389675519",
}
# Send the HTTP GET request. Without a timeout, requests waits indefinitely
# on an unresponsive server, so one is set explicitly.
requeston = requests.get(url=url, params=data, headers=headers, timeout=10)
# Fail with a clear HTTPError on a 4xx/5xx answer instead of a confusing
# JSON decoding error further down.
requeston.raise_for_status()
# Print the decoded JSON body of the response.
print(requeston.json())
3.解析数据
requeston.json()会把响应体中的JSON文本解析成Python对象(这里是一个字典)。
requeston.json()['data']['products']表示先从该字典中取出键"data"对应的值,再从这个值中取出键"products"对应的值(商品列表)。
接下来通过列表推导式[i['productId'] for i in ...]遍历"products"中的每个元素,取出每个元素中"productId"键对应的值,并存入新的products列表中。
# Collect the product IDs from the decoded response.
#
# Option 1 (explicit loop), kept here as a reference only:
#     products = []
#     for item in requeston.json()['data']['products']:
#         products.append(item['productId'])
#
# Option 2 (list comprehension) is the one actually used below.
product_entries = requeston.json()['data']['products']
products = [entry['productId'] for entry in product_entries]
# Report how many IDs were collected.
print(len(products))
将商品ID进行分组(下面的代码以50个为一组进行切片)。在进行大规模数据获取时,可能会遇到网络异常、请求超时等问题。如果一次性请求所有数据,一旦出错就很难进行错误处理和重试;而将商品ID分组后,可以针对每个分组单独进行错误处理和重试,提高数据获取的可靠性。
# Group the product IDs by slicing, then join each group into one
# comma-separated string (the format of the "productIds" query parameter).
#
# The groups are generated for any number of IDs: no empty strings when
# there are few IDs, and no oversized last group when there are many.
batch_size = 50  # group size used by the original fixed slices
product_id_list = [
    ",".join(products[start:start + batch_size])
    for start in range(0, len(products), batch_size)
]
通过for循环获得每个商品的详细信息
# Fetch the details of every ID group and merge them into `a`.
#
# `a` keeps the {'data': {'products': [...]}} shape of a single response so
# the saving step can keep reading a['data']['products'], but it now holds
# the products of ALL groups instead of only the last response.
a = {'data': {'products': []}}
# Extra query parameter that was part of the captured detail request.
# NOTE(review): copied verbatim from the captured URL - confirm whether the
# endpoint actually needs it.
ext_params = '{"stdSizeVids":"","preheatTipsVer":"3","couponVer":"v2","exclusivePrice":"1","iconSpec":"2x","ic2label":1,"superHot":1,"bigBrand":"1"}'
for product_id in product_id_list:
    if not product_id:
        # The slicing step can produce empty groups; nothing to request.
        continue
    # Reuse the base query parameters, but ask for THIS group's IDs rather
    # than a hard-coded ID list.
    params = {**data, "productIds": product_id, "extParams": ext_params}
    # Send the request (with a timeout so a stalled server cannot hang us).
    json_data = requests.get(url=url, params=params, headers=headers, timeout=10)
    json_data.raise_for_status()
    a['data']['products'].extend(json_data.json()['data']['products'])
4.保存数据
导入csv模块,将数据写入csv文件中。
import csv

# Column names of the CSV file (these are also the keys of every row dict).
fieldnames = ['标题', '品牌', '原价', '售价', '折扣', '商品信息', '详情页链接']
# 'a' appends to an existing file; 'utf-8-sig' writes a BOM on a new file.
with open('data.csv', 'a', newline='', encoding='utf-8-sig') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    # In append mode the stream starts at end-of-file, so position 0 means
    # the file is new or empty. Writing the header only then avoids a
    # duplicated header row each time the script is run again.
    if file.tell() == 0:
        writer.writeheader()
    for index in a['data']['products']:
        # Flatten the attribute list into one comma-separated cell.
        attr = ','.join(j['value'] for j in index['attrs'])
        row = {
            '标题': index['title'],
            '品牌': index['brandShowName'],
            '原价': index['price']['marketPrice'],
            '售价': index['price']['salePrice'],
            '折扣': index['price']['mixPriceLabel'],
            '商品信息': attr,
            '详情页链接': f'https://detail.vip.com/detail-{index["brandId"]}-{index["productId"]}.html',
        }
        writer.writerow(row)
5.最终效果
import csv

import requests

# Headers that make the request look like it comes from a browser.
headers = {
    # Referer: tells the server which page the request was navigated from.
    "Referer": "https://category.vip.com/",
    # User-Agent: basic identity information of the browser.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
}
# Request URL: the part of the captured link before the "?".
url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v2?'
# Query parameters: the part of the captured link after the "?".
# The "callback" parameter ("getMerchandiseDroplets1") is deliberately left
# out: the original author notes that .json() fails when it is sent.
data = {
    "app_name": "shop_pc",
    "app_version": "4.0",
    "warehouse": "VIP_HZ",
    'fdc_area_id': "104103105",
    "client": "pc",
    'mobile_platform': "1",
    "province_id": "104103",
    "api_key": "70f71280d5d547b2a7bb370a529aeea1",
    "user_id": "",
    "mars_cid": "1689329487922_7feadb1a833bb7d1380e40f6e7572fff",
    "wap_consumer": "a",
    "productIds": "6920269641265660623,6919269290708597831,6918915874224063836,6919798151514506573,2736390477346888,2891474976578247,6919798151514518861,6919675341290738836,6919259179854452052,6920250903815424655,2766703467616269,2169994865,6920279689597107167,6919960987713959836,6920382449658235796,2876504189630669,2910440252357662,2736390477355080,2895685386241758,2765190986319752,6920447656244110279,2804371112156959,6920442282866906503,6919827210032574940,6920194289836579348,6918576590623773716,6919768033477487444,6919938807569897364,2765190986295176,6919827210015789532,2876504189626573,6919764953763153552,6919165075042048660,6920431008557578196,6920342169219109916,6919930400923369948,2902922323390983,6920121891363422797,6919798151531308365,6920416110144640532,6920315893087596252,6920147527227647124,2884110521883335,6920434934424803860,6919772356565115220,6920071029232646484,6919945966029039508,6919675341324334228,6920230370920125213,2924666982808199",
    "scene": "search",
    "standby_id": "nature",
    "context": "",
    "_": "1689389675519",
}
# Extra query parameter that was part of the captured detail request.
# NOTE(review): copied verbatim from the captured URL - confirm whether the
# endpoint actually needs it.
EXT_PARAMS = '{"stdSizeVids":"","preheatTipsVer":"3","couponVer":"v2","exclusivePrice":"1","iconSpec":"2x","ic2label":1,"superHot":1,"bigBrand":"1"}'
# Number of product IDs sent per detail request (the original slice width).
BATCH_SIZE = 50
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Column names of the CSV file (these are also the keys of every row dict).
fieldnames = ['标题', '品牌', '原价', '售价', '折扣', '商品信息', '详情页链接']


def _fetch_products(params):
    """Send one GET request with *params* and return its product list.

    Raises requests.HTTPError on a 4xx/5xx answer and KeyError when the
    decoded body has no ['data']['products'] entry.
    """
    response = requests.get(
        url=url, params=params, headers=headers, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response.json()['data']['products']


def _chunked(items, size):
    """Split *items* into consecutive lists of at most *size* elements."""
    return [items[start:start + size] for start in range(0, len(items), size)]


def _product_to_row(product):
    """Map one product dict from the response to a CSV row dict."""
    # Flatten the attribute list into one comma-separated cell.
    attr = ','.join(j['value'] for j in product['attrs'])
    return {
        '标题': product['title'],
        '品牌': product['brandShowName'],
        '原价': product['price']['marketPrice'],
        '售价': product['price']['salePrice'],
        '折扣': product['price']['mixPriceLabel'],
        '商品信息': attr,
        '详情页链接': f'https://detail.vip.com/detail-{product["brandId"]}-{product["productId"]}.html',
    }


def main():
    """Collect the product IDs, fetch their details in batches, save to CSV."""
    # Step 1: get the product IDs.
    products = [i['productId'] for i in _fetch_products(data)]
    print(len(products))

    # 'a' appends to an existing file; 'utf-8-sig' writes a BOM on a new file.
    with open('data.csv', 'a', newline='', encoding='utf-8-sig') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        # In append mode the stream starts at end-of-file, so position 0
        # means the file is new or empty: write the header exactly once
        # instead of once per batch / per run.
        if file.tell() == 0:
            writer.writeheader()

        # Step 2: request the details batch by batch, each request carrying
        # the IDs of ITS batch rather than a hard-coded ID list.
        for batch in _chunked(products, BATCH_SIZE):
            params = {
                **data,
                "productIds": ",".join(batch),
                "extParams": EXT_PARAMS,
            }
            batch_products = _fetch_products(params)
            # Number of products in this batch (len() of the raw response
            # would only count its top-level keys).
            print(len(batch_products))
            # Step 3: save the rows of this batch.
            for product in batch_products:
                writer.writerow(_product_to_row(product))


if __name__ == "__main__":
    main()