目标链接:演出
难度半颗星
爬取思路:
1.谷歌浏览器抓包分析。
2.抓到包后看是否有参数加密,在载荷中发现uuid参数看似加密,其实是不变的,小伙伴们别被虚晃一枪了哈。
3.好好好,开始码代码(最最最标准写法,没有之一)。
import requests
# Cookies copied from the browser request captured in Chrome's network panel.
# NOTE(review): these look session-specific -- re-capture them if the server
# starts rejecting the request.
cookies = {
    '_lxsdk_cuid': '18b6123c42bc8-0bbb517c92e5b1-26031c51-144000-18b6123c42bc8',
    '_lxsdk': '18b6123c42bc8-0bbb517c92e5b1-26031c51-144000-18b6123c42bc8',
    '_lx_utm': '',
    '_hc.v': '46ac3e9f-476a-fe69-7bbe-4a729d947072.1698141816',
    'WEBDFPID': '04w29yxww15250u107vw43vuvw01x98v81y3876x00497958z0vy6742-2013501816250-1698141815504IMIUQIIfd79fef3d01d5e9aadc18ccd4d0c95072977',
    '_lxsdk_s': '18b6123c07d-b3-f5f-a75%7C%7C18',
}

# Request headers replayed from the same captured request.
# The 'Cookie' header is deliberately not set here: the same values are sent
# through the `cookies=` argument of requests.get below.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://h5.dianping.com',
    'Pragma': 'no-cache',
    'Referer': 'https://h5.dianping.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    # NOTE(review): `mtgsig` looks like a request signature whose "a2" field is
    # a millisecond timestamp -- presumably it has to be refreshed; verify.
    'mtgsig': '{"a1":"1.1","a2":1698141976232,"a3":"04w29yxww15250u107vw43vuvw01x98v81y3876x00497958z0vy6742","a5":"MQZZxZrWYoBKATXufXfWSIjtPgYAEwZy","a6":"h1.5pcWMHp7MF3eoE3Ql4ncvF01HxHFNl4pBlnNuvCS6jmwAYuorg1yPgAk4egdGxplzsVgmHpRTkGfW7i3DQzdt4sxAs6LE5+LPJmYCr424jFK5YYrigp9SlKqXZqfvjc7GMGU9bH6qIuXgcGdzGT3SE/LXIJYJJLTkqN1iYI7nCVIxw6pHFXGzMxuMDHyPSOcwO2BvJAFVKglA6Jl9Ar1uy6ebd6PRWGPMl+Jt46SBlMf3vXmGeET3CUg9H4qiTM/yqil4r3RuQNJdFUJJHWrJj49ZLgBw+e2cU0zxX/xlgrtZVrXUZdezOScyPw5VdKEZ9ibET4zTpr9De8Zct5Qlp6hYou3mKH9BoqK2H1TZLG091hiv87TNqFw6F4nYUSOGwgAE2Hipk3N0kyLE3LHkzg==","x0":4,"d1":"40eb8b3923001a5dc723d14d63eb9161"}',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query-string parameters of the captured request.
# `uuid` looks encrypted but did not change between captures, so it is
# hard-coded like every other value.
params = {
    'needRank': 'true',
    'labelId': '0',
    'optimus_risk_level': '71',
    'optimus_code': '10',
    'uuid': '62r94ya7oiui39oxuq92pql00wtb96jd2e0gab5z1lk9yqtxa6g5wx02a5k7ykuu',
    'sellChannel': '1',
    'cityId': '299',
    'yodaReady': 'h5',
    'csecplatform': '4',
    'csecversion': '2.2.1',
}

# The ';st=0;p=1;...' segments belong to the URL path itself, so they stay in
# the URL string rather than in `params`.
resp = requests.get(
    'https://m.dianping.com/myshow/ajax/performances/1;st=0;p=1;s=20;tft=0;marketLevel=0',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,  # requests never times out by default; do not hang forever
)
# Fail loudly on a 4xx/5xx answer instead of trying to parse an error page.
resp.raise_for_status()

# `response` holds the decoded JSON body; the extraction step reads it.
response = resp.json()
print(response)
打印结果看一下呗:
从打印结果来看,跟网页上的一模一样,这时候我们就确定了,这就是我们想要的数据。接下来开始数据提取,这对于大家来说不是简单的一批么~
提取数据,提取的字段有:演唱会的名称、时间、地址、打折情况、价格区间。
# `response` is the decoded JSON from the request step; its 'data' entry is
# iterated as a list with one dict per performance.
result_lis = []  # collected rows; the save step builds its DataFrame from this
info_lis = response['data']
for i in info_lis:
    # Keep only the fields we want, under the column names used in the sheet.
    dic = {
        '演唱会名': i['shortName'],
        '演唱时间': i['showTimeRange'],
        '演唱地址': i['address'],
        '打折': i['minDiscount'],
        '价格': i['priceRange'],
    }
    print(dic)
    result_lis.append(dic)
结果:
最后保存数据:
# Turn the collected row dicts into a table and write it to an Excel workbook.
df = pd.DataFrame(result_lis)
# The context manager closes (and thereby saves) the writer even when
# to_excel raises, so no half-open file handle is left behind.
with pd.ExcelWriter('演唱会.xlsx') as writer:
    df.to_excel(writer, index=False)  # index=False: no row-number column
保存结果:
爬取其他栏目只需要把url中 performances/1 里的 1 变为其他数字即可。
整体代码:
# -*- coding: UTF-8 -*-
"""
@Time : 2023/10/24 18:08
@Author : 蝌蚪啊@
@File : 音乐会_spider.py
"""
import pandas as pd
import requests
# Rows extracted from the API answer; one dict per performance.
result_lis = []

# Cookies copied from the browser request captured in Chrome's network panel.
# NOTE(review): these look session-specific -- re-capture them if the server
# starts rejecting the request.
cookies = {
    '_lxsdk_cuid': '18b6123c42bc8-0bbb517c92e5b1-26031c51-144000-18b6123c42bc8',
    '_lxsdk': '18b6123c42bc8-0bbb517c92e5b1-26031c51-144000-18b6123c42bc8',
    '_lx_utm': '',
    '_hc.v': '46ac3e9f-476a-fe69-7bbe-4a729d947072.1698141816',
    'WEBDFPID': '04w29yxww15250u107vw43vuvw01x98v81y3876x00497958z0vy6742-2013501816250-1698141815504IMIUQIIfd79fef3d01d5e9aadc18ccd4d0c95072977',
    '_lxsdk_s': '18b6123c07d-b3-f5f-a75%7C%7C18',
}

# Request headers replayed from the same captured request.
# The 'Cookie' header is deliberately not set here: the same values are sent
# through the `cookies=` argument of requests.get below.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://h5.dianping.com',
    'Pragma': 'no-cache',
    'Referer': 'https://h5.dianping.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    # NOTE(review): `mtgsig` looks like a request signature whose "a2" field is
    # a millisecond timestamp -- presumably it has to be refreshed; verify.
    'mtgsig': '{"a1":"1.1","a2":1698141976232,"a3":"04w29yxww15250u107vw43vuvw01x98v81y3876x00497958z0vy6742","a5":"MQZZxZrWYoBKATXufXfWSIjtPgYAEwZy","a6":"h1.5pcWMHp7MF3eoE3Ql4ncvF01HxHFNl4pBlnNuvCS6jmwAYuorg1yPgAk4egdGxplzsVgmHpRTkGfW7i3DQzdt4sxAs6LE5+LPJmYCr424jFK5YYrigp9SlKqXZqfvjc7GMGU9bH6qIuXgcGdzGT3SE/LXIJYJJLTkqN1iYI7nCVIxw6pHFXGzMxuMDHyPSOcwO2BvJAFVKglA6Jl9Ar1uy6ebd6PRWGPMl+Jt46SBlMf3vXmGeET3CUg9H4qiTM/yqil4r3RuQNJdFUJJHWrJj49ZLgBw+e2cU0zxX/xlgrtZVrXUZdezOScyPw5VdKEZ9ibET4zTpr9De8Zct5Qlp6hYou3mKH9BoqK2H1TZLG091hiv87TNqFw6F4nYUSOGwgAE2Hipk3N0kyLE3LHkzg==","x0":4,"d1":"40eb8b3923001a5dc723d14d63eb9161"}',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query-string parameters of the captured request.
# `uuid` looks encrypted but did not change between captures, so it is
# hard-coded like every other value.
params = {
    'needRank': 'true',
    'labelId': '0',
    'optimus_risk_level': '71',
    'optimus_code': '10',
    'uuid': '62r94ya7oiui39oxuq92pql00wtb96jd2e0gab5z1lk9yqtxa6g5wx02a5k7ykuu',
    'sellChannel': '1',
    'cityId': '299',
    'yodaReady': 'h5',
    'csecplatform': '4',
    'csecversion': '2.2.1',
}

# The ';st=0;p=1;...' segments belong to the URL path itself, so they stay in
# the URL string rather than in `params`.
resp = requests.get(
    'https://m.dianping.com/myshow/ajax/performances/1;st=0;p=1;s=20;tft=0;marketLevel=0',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,  # requests never times out by default; do not hang forever
)
# Fail loudly on a 4xx/5xx answer instead of trying to parse an error page.
resp.raise_for_status()

response = resp.json()
print(response)

# The 'data' entry of the answer is iterated as a list of performance dicts.
info_lis = response['data']
for i in info_lis:
    # Keep only the fields we want, under the column names used in the sheet.
    dic = {
        '演唱会名': i['shortName'],
        '演唱时间': i['showTimeRange'],
        '演唱地址': i['address'],
        '打折': i['minDiscount'],
        '价格': i['priceRange'],
    }
    print(dic)
    result_lis.append(dic)

# Write every collected row to an Excel workbook. The context manager closes
# (and thereby saves) the writer even when to_excel raises.
df = pd.DataFrame(result_lis)
with pd.ExcelWriter('演唱会.xlsx') as writer:
    df.to_excel(writer, index=False)  # index=False: no row-number column
今天就分享到这,祝在座的各位程序员大佬,1024节日快乐,薪水加多多~~