马蜂窝平台的景点图片爬取
这次完成的是对马蜂窝景点图片爬取的改进。之前是通过地图点来爬取的,但是地图点的数据不全,所以研究之后打算改从各目的地的景点列表页(形如 http://www.mafengwo.cn/jd/10065/gonglve.html)重新爬取。
但是这一页面在翻页时,地址栏中的网址不会发生变化。用开发者工具抓包后发现,每次翻页时浏览器都会向 http://www.mafengwo.cn/ajax/router.php 发送一个 POST 请求,每一页的数据就是通过这个请求获取的,它携带的表单参数如下:
其中 sAct、iMddid(目的地 id)、iTagId、iPage(页码)以及 _ts、_sn 就是需要的参数。
以下为本次实现的主要函数:
def getAllImg(self, max_pages=231):
    """Crawl the scenic-spot (POI) detail pages of every hot destination.

    Steps:
      1. Load http://www.mafengwo.cn/mdd/ and collect the destination id of
         every link inside the ``hot-list clearfix`` block.
      2. For each destination, page through the POI list by POSTing to
         ``/ajax/router.php`` (the list page itself does not change its URL
         when paging; the data comes from this AJAX call).
      3. For each ``<li>`` in the returned HTML fragment, fetch the linked
         detail page and print its HTML.
      4. Print ``self.list`` and dump it to ``allScenic.json``.

    NOTE(review): nothing in this method appends to ``self.list``; it is
    presumably filled elsewhere in the class -- confirm.

    :param max_pages: upper bound on the number of list pages requested per
        destination. Paging also stops early at the first page that
        contains no POI items.
    :return: None
    """
    mdd_html = self.getHtml('http://www.mafengwo.cn/mdd/')
    mdd_soup = BeautifulSoup(mdd_html, 'html.parser')
    hot_list = mdd_soup.find('div', {'class': 'hot-list clearfix'})

    # Map destination name -> destination id. The id is the last path
    # segment of the link with its ".html" suffix removed; it is the same
    # id used in URLs such as http://www.mafengwo.cn/jd/11065/gonglve.html.
    city_ids = {}
    for dd in hot_list.find_all('dd'):
        for link in dd.find_all('a'):
            href = link.get('href') or ''
            mdd_id = href.rsplit('/', 1)[-1].partition('.html')[0]
            if mdd_id.isdigit():
                city_ids[link.string] = mdd_id

    post_url = 'http://www.mafengwo.cn/ajax/router.php'
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')

    # HACK: cookies captured from a real browser session. They contain
    # time-stamped values (e.g. __jsl_clearance) and will presumably stop
    # working once they expire -- refresh them when requests start failing.
    cookies = {
        '__jsluid_h': 'c19e9d181bb271865e1be7d61d799c67',
        'mfw_uuid': '60b8420e-95c4-eda2-bfc9-5e02d4fd4333',
        'oad_n': 'a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222021-06-03+10%3A44%3A30%22%3B%7D',
        'uva': 's%3A91%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1622688271%3Bs%3A10%3A%22last_refer%22%3Bs%3A23%3A%22http%3A%2F%2Fwww.mafengwo.cn%2F%22%3Bs%3A5%3A%22rhost%22%3BN%3B%7D%22%3B',
        '__mfwurd': 'a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1622688271%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D',
        '__mfwuuid': '60b8420e-95c4-eda2-bfc9-5e02d4fd4333',
        'UM_distinctid': '179cfc20b6734d-00221eee7872ec-2363163-144000-179cfc20b68465',
        '__omc_chl': '',
        '_r': 'csdn',
        '_rp': 'a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A49%3A%22blog.csdn.net%2Fu011291072%2Farticle%2Fdetails%2F81266372%22%3Bs%3A1%3A%22t%22%3Bi%3A1622824665%3B%7D',
        '__mfwothchid': 'referrer%7Cblog.csdn.net',
        '__omc_r': 'blog.csdn.net',
        '__mfwc': 'referrer%7Cblog.csdn.net',
        'bottom_ad_status': '0',
        'PHPSESSID': 'tt8tvp1bg1287j9t4fpd66rpn5',
        '__mfwa': '1622688269985.12455.3.1622824668000.1622879218167',
        '__mfwlv': '1622879218',
        '__mfwvn': '3',
        'Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0': '1622688271,1622824670,1622879218',
        'CNZZDATA30065558': 'cnzz_eid%3D1705560339-1622684260-http%253A%252F%252Fwww.mafengwo.cn%252F%26ntime%3D1622878977',
        '__jsl_clearance': '1622879888.257|0|MruEnfhwo70geWcDqzscOHYZ%2FGM%3D',
        'Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0': '1622879993',
        '__mfwb': '0155e7e63b27.4.direct',
        '__mfwlt': '1622880000',
    }
    # Headers shared by every AJAX list request; Referer is added per city.
    ajax_base_headers = {
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': user_agent,
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': 'http://www.mafengwo.cn',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # Raw Cookie header used for the detail-page GET requests (same browser
    # session as ``cookies`` above, captured slightly later).
    detail_cookie = '__jsluid_h=c19e9d181bb271865e1be7d61d799c67; mfw_uuid=60b8420e-95c4-eda2-bfc9-5e02d4fd4333; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2021-06-03+10:44:30";}; uva=s:91:"a:3:{s:2:"lt";i:1622688271;s:10:"last_refer";s:23:"http://www.mafengwo.cn/";s:5:"rhost";N;}";; __mfwurd=a:3:{s:6:"f_time";i:1622688271;s:9:"f_rdomain";s:15:"www.mafengwo.cn";s:6:"f_host";s:3:"www";}; __mfwuuid=60b8420e-95c4-eda2-bfc9-5e02d4fd4333; UM_distinctid=179cfc20b6734d-00221eee7872ec-2363163-144000-179cfc20b68465; __omc_chl=; _r=csdn; _rp=a:2:{s:1:"p";s:49:"blog.csdn.net/u011291072/article/details/81266372";s:1:"t";i:1622824665;}; __mfwothchid=referrer|blog.csdn.net; __omc_r=blog.csdn.net; __mfwc=referrer|blog.csdn.net; bottom_ad_status=0; PHPSESSID=tt8tvp1bg1287j9t4fpd66rpn5; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1622688271,1622824670,1622879218; __jsl_clearance=1622886950.346|0|LB6Pa55KGMvSoQxKRMTXJD57sgU=; __mfwa=1622688269985.12455.4.1622879218167.1622886952813; __mfwlv=1622886952; __mfwvn=4; CNZZDATA30065558=cnzz_eid=1705560339-1622684260-http%3A%2F%2Fwww.mafengwo.cn%2F&ntime=1622884377; __mfwb=eb105c7e9121.2.direct; __mfwlt=1622886975; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1622886976'

    for mdd_id in city_ids.values():
        # Use the destination currently being crawled; previously both the
        # form and the Referer were pinned to 10065, so every iteration
        # fetched the same destination again.
        ajax_headers = dict(
            ajax_base_headers,
            Referer='http://www.mafengwo.cn/jd/' + mdd_id + '/gonglve.html',
        )
        for page in range(1, max_pages + 1):
            form = {
                'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
                'iMddid': mdd_id,
                'iTagId': '0',
                'iPage': page,
                # NOTE(review): _ts/_sn were copied from one captured
                # request; they look like a timestamp and a signature.
                # Whether the server validates them against the other
                # fields is not visible here -- confirm.
                '_ts': '1622880000595',
                '_sn': 'f10aea0391',
            }
            response = requests.post(post_url, headers=ajax_headers,
                                     cookies=cookies, data=form)
            payload = json.loads(response.text)
            # data.list is an HTML fragment containing one <li> per POI.
            list_html = (payload.get('data') or {}).get('list')
            if not list_html:
                break
            poi_items = BeautifulSoup(list_html, 'html.parser').find_all('li')
            if not poi_items:
                # No items on this page: stop paging this destination.
                break

            for li in poi_items:
                link = li.find('a')
                if link is None or not link.get('href'):
                    continue
                detail_url = 'http://www.mafengwo.cn' + link['href']
                try:
                    self.headers['User-Agent'] = user_agent
                    self.headers['Cookie'] = detail_cookie
                    self.headers['Referer'] = 'http://www.mafengwo.cn/mdd/'
                    resp = requests.get(detail_url, headers=self.headers)
                    # A 521 response carries a <script> that builds the
                    # __jsl_clearance cookie via document.cookie and then
                    # reloads the page; the helper is expected to derive
                    # that cookie so the request can be retried.
                    if resp.status_code == 521:
                        solved_cookie = self.error_http_521res(resp)
                        # Assumes the helper returns either a ready Cookie
                        # header string or a name->value mapping -- confirm.
                        if isinstance(solved_cookie, dict):
                            solved_cookie = '; '.join(
                                '%s=%s' % pair for pair in solved_cookie.items()
                            )
                        self.headers['Cookie'] = solved_cookie
                        print(solved_cookie)
                        resp = requests.get(detail_url, headers=self.headers)
                    resp.raise_for_status()
                    resp.encoding = 'utf-8'
                    html_context = resp.text
                except Exception as exc:
                    # Best effort: one failing detail page must not abort
                    # the crawl, but the failure is reported, not hidden.
                    print('failed to fetch %s: %r' % (detail_url, exc))
                    html_context = ""
                print(html_context)

    print(self.list)
    filename = 'allScenic.json'
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(self.list, file)