分析 Ajax 请求抓取街拍图片，将 URL 等数据存储为 JSON 格式，并把图片保存到本地
#coding:utf-8
import os
from _md5 import md5
from urllib.parse import urlencode
import re
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import requests
import json
from multiprocessing import Pool
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of the Toutiao search Ajax index as raw text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
        timeout: Seconds to wait on the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request failure).
    """
    # Query parameters of the Ajax search endpoint.
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # RequestException is the base class of every requests error
        # (connection failures, timeouts, invalid URLs, ...).
        print('请求索引值出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs listed in an index-page JSON document.

    Args:
        html: JSON text of the index response, or ``None``/empty when the
            request failed.

    Yields:
        Each non-empty ``article_url`` found in the entries of the
        top-level ``data`` list. Nothing is yielded when the input is
        missing, is not valid JSON, or does not have the expected shape.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # Not a string/bytes value, or not valid JSON.
        return
    if not isinstance(data, dict):
        return
    entries = data.get('data')
    if not isinstance(entries, list):
        # Covers both a missing key and an explicit null.
        return
    for item in entries:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be fetched; skip them.
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Fetch an article detail page and return its HTML text.

    Args:
        url: Address of the detail page. A missing/empty value is
            tolerated and yields ``None``.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The response body text on HTTP 200, otherwise ``None`` (empty
        URL, non-200 status, or any request failure).
    """
    if not url:
        return None
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Also covers malformed URLs and timeouts, not just dropped
        # connections.
        print('请求详情页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` (a list of the
        ``url`` values of the gallery's ``sub_images`` entries), or
        ``None`` when the page has no parsable gallery data.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> get an empty title instead of an IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    # The gallery JSON is embedded in the page script as a quoted argument
    # of a call following "gallery:"; capture the text between the quotes.
    images_pattern = re.compile(r'gallery:.*?\("(.*?)"\)', re.S)
    # Strip every backslash so the escaped JSON string becomes plain JSON.
    html = re.sub(r'\\', '', html)
    result = images_pattern.search(html)
    if not result:
        return None
    try:
        # json.loads accepts only the document positionally on Python 3.
        data = json.loads(result.group(1))
    except ValueError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def write_to_file(content):
    """Append ``content`` as one JSON line to ``result_jiepai.txt``.

    The file is opened in append mode with UTF-8 encoding in the current
    working directory, and non-ASCII characters are written unescaped.

    Args:
        content: Any JSON-serialisable value.
    """
    with open('result_jiepai.txt', 'a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')
    # Leaving the with-block closes the file; no explicit close() needed.
def get_parse_image(url, timeout=10):
    """Download one image and save it through ``image_save``.

    Args:
        url: Address of the image. A missing/empty value is ignored.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        Always ``None``; the image bytes are saved as a side effect when
        the server answers with HTTP 200.
    """
    if not url:
        return None
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers timeouts and malformed URLs as well as connection errors.
        print('图片解析出错')
        return None
    if response.status_code == 200:
        image_save(response.content)
    return None
def image_save(content):
    """Write image bytes to ``<cwd>/images/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of the bytes, so saving identical
    content twice overwrites the same file instead of creating a
    duplicate.

    Args:
        content: Raw image bytes.
    """
    # hashlib is the public home of md5; imported locally so this function
    # does not rely on the private ``_md5`` module being available.
    from hashlib import md5

    images_dir = os.path.join(os.getcwd(), 'images')
    # Create the target directory on first use; exist_ok keeps concurrent
    # callers from failing if another one created it first.
    os.makedirs(images_dir, exist_ok=True)
    file_path = os.path.join(
        images_dir, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg'))
    with open(file_path, 'wb') as f:
        f.write(content)
def main(offset):
    """Crawl one index page: record each gallery and download its images.

    For every article URL on the index page at ``offset`` the detail page
    is fetched and parsed, the parsed record is appended to the result
    file, and each image of the gallery is downloaded.

    Args:
        offset: Index-page offset passed through to ``get_page_index``.
    """
    index_html = get_page_index(offset, '街拍')
    if not index_html:
        # Index request failed; nothing to parse.
        return
    for url in parse_page_index(index_html):
        if not url:
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if not result:
            # No gallery data on this page; skip it rather than writing a
            # "null" line to the result file.
            continue
        write_to_file(result)
        for item in result.get('images') or []:
            # Best effort: a failure on one image must not stop the
            # remaining images of the gallery from being downloaded.
            try:
                get_parse_image(item)
            except Exception as e:
                print('下载出错:', e)
            else:
                print('正在下载:', item)
if __name__ == "__main__":
    # Fan the index offsets 20, 40, ..., 400 out over a process pool.
    # NOTE(review): offset 0 (the first result page) is not requested —
    # confirm this is intended.
    # The context manager releases the worker processes once map() has
    # returned; map() blocks until every offset has been processed.
    with Pool() as pool:
        pool.map(main, [x * 20 for x in range(1, 21)])
查看一下下载的图片