Ajax 异步加载 网页爬取
分析网页
在 Network 面板的条目里找 XHR 请求
Ajax 加载
import os
import re
from urllib.parse import urlencode

import requests
# Request headers sent with every search-API call: a desktop Chrome
# User-Agent string, plus X-Requested-With set to XMLHttpRequest.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(url):
    """Fetch *url* and return its body decoded as JSON.

    The request is sent with the module-level ``headers``.

    Args:
        url: Full URL of the JSON endpoint to request.

    Returns:
        The decoded JSON payload when the server answers with HTTP 200 and
        the body is valid JSON; otherwise ``None``, after the reason has been
        printed.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('错误:', exc)
        return None

    response.encoding = 'utf-8'
    if response.status_code != 200:
        print('错误:', response.status_code)
        return None

    print(response)
    try:
        return response.json()
    except ValueError as exc:
        # The body was not valid JSON, so there is nothing to hand back.
        print('错误:', exc)
        return None
def parse_page(json):
    """Download every image referenced by one page of search results.

    For each entry in ``json['data']`` the title (``fromPageTitleEnc``) and
    the image URL (``hoverURL``) are printed, then the image is saved as
    ``./images/<sanitized title>.jpg``. Entries without an image URL are
    reported by printing ``'None'`` and skipped. A failed download is
    reported and skipped so the remaining entries are still processed.

    Args:
        json: Decoded response payload (a dict with a ``data`` list), or
            ``None``/empty when the fetch failed, in which case nothing is
            done.

    Returns:
        None.
    """
    if not json:
        return
    data = json.get('data')
    if not data:
        return

    # Strip everything except digits, ASCII letters and CJK ideographs so the
    # title can be used as a file name (drops punctuation such as ?!,。).
    reg = "[^0-9A-Za-z\u4e00-\u9fa5]"

    for item in data:
        image = item.get('hoverURL')
        name = item.get('fromPageTitleEnc')
        print(name)
        print(image)

        # A missing or empty URL cannot be requested.
        if not image:
            print('None')
            continue

        # The title may be absent, or empty after stripping; fall back to a
        # fixed name rather than crashing or writing a bare ".jpg".
        names = re.sub(reg, '', name or '') or 'untitled'

        try:
            response = requests.get(image, timeout=10)
        except requests.RequestException as exc:
            print('错误:', exc)
            continue
        if response.status_code != 200:
            # Do not store an error page under a .jpg name.
            print('错误:', response.status_code)
            continue

        # Make sure the target directory exists before the first write.
        os.makedirs('./images', exist_ok=True)
        with open('./images/%s.jpg' % names, 'wb') as fp:
            fp.write(response.content)
def main(pn, keyword="彭于晏图片"):
    """Fetch one page of image-search results and download its images.

    Builds the ``image.baidu.com/search/acjson`` query URL, fetches it with
    ``get_page`` and passes the decoded payload to ``parse_page``.

    Args:
        pn: Value sent as the ``pn`` query parameter (the caller passes
            multiples of 30).
        keyword: Search phrase sent as both ``queryWord`` and ``word``.
            Defaults to the phrase the script originally hard-coded.

    Returns:
        None.
    """
    data = {
        # Must not contain spaces, otherwise a different page's source is
        # returned. The same applies to the other values, so the stray
        # leading spaces on "ipn" and "word" were removed.
        "tn": "resultjson_com",
        "ipn": "rj",
        "ct": "201326592",
        "queryWord": keyword,
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "copyright": "",
        "word": keyword,
        "pn": pn,
    }
    url = "https://image.baidu.com/search/acjson?" + urlencode(data)
    payload = get_page(url)
    # get_page returns None when the request failed; nothing to parse then.
    if payload is not None:
        parse_page(payload)
if __name__ == '__main__':
    # Request four consecutive result pages: offsets 0, 30, 60 and 90.
    pn = 30
    for i in range(4):
        main(pn * i)