一、fiddler抓包工具了解
- Tools>>actions(安装证书)
- Rules>>Require Proxy Authentication(若勾选,每次登陆都要有提示)
- Process(进程)
二、爬移动端数据前的配置
fiddler(见以前的blog)
模拟器的配置(以下)
查找ip ipconfig (a.b)
1. 点击系统设置
-- 预设型号(默认) -- 网络桥接模式(开启 然后需要安装驱动点击确认即可 保存设置 重启模拟器)
2. 手机设置
WLAN -- WXXXX(鼠标左键长按直到出现弹框为止) -- 修改网络 -- 主机名(a.b) 端口(fiddler设置的端口c) -- 保存
3.下载fiddler证书
打开模拟器中的浏览器,输入 a.b:c
以后爬移动端数据要做的设置:
1、
选择 from remote clients only
2、下载app:上豌豆荚
3、一般为post请求
三、案例(豆果美食、全民小视频)
3.1 豆果美食
1 豆果美食
需求: 菜谱分类里面所有分类的数据
第一步 页面结构分析
1》目标url ##注意要找对,根据response
http://api.douguo.net/recipe/flatcatalogs
2》headers (字典) ##request headers里面要进行测试,有的要删
3》post请求的方式 (要携带data(字典)) data 需要处理 ##data里面的字段也要进行测试,有的要删
第二步 实现步骤
正常的写法
关键是URL要找对
raw里是字符串
源代码:
#爬豆果美食的各类菜
import requests
import json
def get_html(url, data, timeout=10):
    """POST form data to the Douguo API and return the raw response.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without
            it, requests waits indefinitely on a stalled connection.

    Returns:
        The ``requests.Response`` object, unmodified (no status check).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # Header set sent with every request. The commented-out entries are
    # disabled and are not sent.
    headers = {
        "client": "4",
        "version": "6922.2",
        "device": "MI 6",
        "sdk": "19,4.4.2",
        "imei": "863254010448503",
        "channel": "qqkp",
        # "mac": "44:85:00:5E:5B:28",
        "resolution": "720*1280",
        "dpi": "1.5",
        # "android-id": "4485005e5b281516",
        # "pseudo-id": "05e5b28151644850",
        "brand": "Xiaomi",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CMCC",
        # "imsi": "460074485009491",
        "user-agent": "Mozilla/5.0 (Linux; Android 4.4.2; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "1",
        "lon": "116.568176",
        "lat": "26.997867",
        "cid": "361000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=57158696",
        "Host": "api.douguo.net",
        # "Content-Length": "68",
    }
    return requests.post(url=url, headers=headers, data=data, timeout=timeout)
def handle_html():
    """Fetch the Douguo recipe catalog and print each nested category name.

    Posts a small form to the flat-catalog endpoint, parses the JSON body,
    and prints the ``name`` of every entry found two levels down
    (``result -> cs -> cs``). Returns nothing.
    """
    endpoint = 'http://api.douguo.net/recipe/flatcatalogs'
    # Form fields sent with the request; the commented-out ones are disabled.
    payload = {
        "client": "4",
        # "_session": "1537295931652863254010448503",
        # "v": "1503650468",
        "_vs": "2305",
    }
    resp = get_html(url=endpoint, data=payload)
    catalog = json.loads(resp.text)

    # Lazy generator: names are printed as they are reached, in response order.
    names = (
        entry['name']
        for group in catalog['result']['cs']
        for entry in group['cs']
    )
    for name in names:
        print(name)
if __name__ == "__main__":
    # Run the catalog scrape only when this file is executed as a script.
    handle_html()
3.2 全民小视频
源代码:
#爬取全民小视频的部分视频
import requests
import json
def get_html(url, data, timeout=10):
    """POST form data to the Quanmin video API and return the raw response.

    Args:
        url: Full endpoint URL, including its query string.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without
            it, requests waits indefinitely on a stalled connection.

    Returns:
        The ``requests.Response`` object, unmodified (no status check).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # Headers sent with every request; the Cookie entry is disabled.
    headers = {
        "Referer": "https://quanmin.baidu.com/",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; MI 6 Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 bdminivideo/3.2.0.10 arsdk/5030 (Baidu; P1 5.1.1)",
        "Host": "quanmin.baidu.com",
        # 'Cookie': 'BAIDUID=35116BA5B916871829C37B1C4CFDF85D:FG=1; BAIDUZID=HIBz0cAVtiQTSJXkx3W19OACU9gPP9ZM2dHuCmGVx-BFhHUBTr2PLvEzV-aACilkuQ1g2LpuJnHD4PZmiCO9pFcO1vfYMO35P1ZsHbl8hZLw; BAIDUCUID=luvI80u-2i0ki2af_aHa8_uU28gqOS8K_u-1u08rva8ykS8xlNWpRyxPA'
    }
    return requests.post(url=url, headers=headers, data=data, timeout=timeout)
def handle_html():
    """Request the Quanmin video feed and collect the playback URLs.

    Returns:
        list: every ``videoPlayUrl`` found in each feed item's
        ``content -> videoInfo -> multiClarity`` list, in response order.

    Raises:
        KeyError: if the response JSON lacks one of the expected keys.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    url = 'https://quanmin.baidu.com/appui/video/feed?log=vhk&tn=1021212e&ctn=1021212e&imei=0&od=&cuid=F551E670DB1D64B2047009C5569DFF2E|VC3EYOTK5&bdboxcuid=null&os=android&osbranch=a0&ua=900_1440_320&ut=MI%206%20_5.1.1_22_Xiaomi&uh=Xiaomi%20,qcom,msm8998,1&apiv=1.0.0.10&appv=3020010&version=3.2.0.10&life=1624977343&clife=1624977343&hid=664A787BC87F85BC5D09FB2E535CF8C5&network=1&network_state=20&sids=10012_3-10033_3-10034_4-15014_2-15094_1-3258_2&teenager=0&oaid=&activity_ext=&c3_aid=A00-5KIB3QK746NCOXYCHF6GPEXXJF6MNJDE-VQWTG7CI&push_source=no_push&yyuser=1&api_name=feed&sign=f22d45e5b28873d3431b610e3c4fe85c'
    # Form fields; several string values are already percent-encoded.
    captured = {
        'subTab': 'faxian',
        'subTag': 'immersion',
        'conf': '%7B%22autoplay%22%3A%220%22%7D',
        'location': '%7B%22prov%22%3A%22%E5%AE%89%E5%BE%BD%E7%9C%81%22%2C%22city%22%3A%22%E5%AE%89%E5%BA%86%E5%B8%82%22%2C%22county%22%3A%22%E5%AE%BF%E6%9D%BE%E5%8E%BF%22%2C%22city-code%22%3A%22130%22%2C%22street%22%3A%22%22%2C%22latitude%22%3A30.004295%2C%22longitude%22%3A116.574989%7D',
        'refresh_type': 'init',
        'is_close_individual': 0,
        'param_ext': '%7B%22entrance_vid%22%3A%220%22%2C%22tabfrom%22%3A%22detail%22%7D',
        'visit_id': '1625033654',
        'refresh_state': 6,
        'refresh_index': 1,
        'confirmed_interes': '%7B%22confirmed_list%22%3A%5B%5D%2C%22threshold%22%3A0%2C%22is_triggered_refresh%22%3A0%7D',
    }
    # requests form-encodes a dict passed as ``data``. Sending the values
    # still percent-encoded would encode them twice ('%7B' -> '%257B'), so
    # decode the strings first and let requests encode them once.
    data = {
        key: unquote(value) if isinstance(value, str) else value
        for key, value in captured.items()
    }
    response = get_html(url=url, data=data)
    result_dict = json.loads(response.text)

    # Flatten: one URL per multiClarity entry of every feed item.
    # NOTE(review): multiClarity presumably holds one entry per video
    # quality, so a single video may yield several URLs -- confirm.
    return [
        clarity['videoPlayUrl']
        for item in result_dict['feed']['data']['list']
        for clarity in item['content']['videoInfo']['multiClarity']
    ]
def save(list_urls, timeout=10):
    """Download each URL into ``video/<n>.mp4``, numbering files from 1.

    Args:
        list_urls: Iterable of video URLs to fetch with GET.
        timeout: Seconds to wait for the server on each download before
            giving up, so one stalled download cannot hang the whole run.

    Returns:
        None. Files are written to the ``video`` directory under the
        current working directory, and a progress line is printed per file.

    Raises:
        requests.exceptions.Timeout: if a download does not respond in time.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # open() does not create missing directories; without this the first
    # write fails with FileNotFoundError when "video" does not exist yet.
    os.makedirs("video", exist_ok=True)

    for num, video_url in enumerate(list_urls, start=1):
        response = requests.get(video_url, timeout=timeout)
        with open("video/%s.mp4" % num, 'wb') as video_obj:
            video_obj.write(response.content)
        print(str(num) + 'has done!')
if __name__ == "__main__":
    # Collect the feed's playback URLs, then download each one.
    save(handle_html())
技巧:
1.
# Turn escaped text into a readable string.
# NOTE(review): Python already decodes the \u escapes in this literal, so
# unquote() returns the text unchanged here; unquote() itself only decodes
# %XX percent-escapes.
import urllib.parse

str_1 = "\u600e\u4e48\u505a\u51fa\u597d\u5403\u7684\u3010"
re = urllib.parse.unquote(string=str_1)
print(re)  # 怎么做出好吃的【
正式写代码前,先把url,headers,data等写好