爬取今日头条美女图片需要分析Ajax请求
首先打开网址:https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D
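在开发者工具的 Network → XHR 中可以看到,页面通过 Ajax 请求 https://www.toutiao.com/search_content/ 获取 JSON 数据(参数包括 offset、format、keyword、autoload、count、cur_tab);返回的数据以及其中每个 data 项展开后如下图所示: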
图中标出的 article_url 字段就是详情页的 URL;然后进入详情页,图片地址位于页面源码中 gallery 对象的 sub_images 列表里:
详细代码:
import json
import os
import re
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import requests
def get_page_index(offest, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offest: Result offset, sent as the ``offset`` query parameter. The
            misspelled parameter name is kept so existing callers still work.
        keyword: Search keyword.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    params = {
        'offset': offest,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求索引页出错")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the detail-page URL of every entry in a search-result payload.

    Args:
        html: JSON text of a search-result response. May be ``None`` or
            empty when the upstream request failed.

    Yields:
        The ``article_url`` value of each item in the payload's ``data``
        list. Items without a usable ``article_url`` are skipped, and a
        missing, null or malformed payload yields nothing.
    """
    if not html:
        # json.loads(None) would raise TypeError; there is nothing to parse.
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: body was not JSON.
        return
    if not isinstance(payload, dict):
        return
    items = payload.get('data')
    # 'data' can be present but null (or not a list at all).
    if not isinstance(items, list):
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Fetch the HTML of one article detail page.

    Args:
        url: Detail-page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求详情页出错", url)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the page title and gallery image URLs from a detail page.

    Args:
        html: Detail-page HTML text.
        url: URL the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` (a list of image
        URLs) when the page embeds a ``gallery: {...}`` JSON object with at
        least one usable ``sub_images`` entry; otherwise ``None``.
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> element would make [0] raise IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    # Lazily capture everything after "gallery: " up to the first comma that
    # is immediately followed by a newline.
    images_pattern = re.compile('gallery: (.*?),\n', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None

    try:
        data = json.loads(result.group(1))
    except ValueError:
        # The captured text was not valid JSON; treat as "no gallery".
        return None
    if not isinstance(data, dict):
        return None

    sub_images = data.get('sub_images')
    if not isinstance(sub_images, list):
        return None

    # Drop entries that carry no URL so callers never receive None.
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    if not images:
        return None
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def main():
    """Download every gallery image found on the first search-result page.

    Fetches the index page for the keyword, visits each article it lists,
    and saves each gallery image as a ``.jpg`` file in a fixed Windows
    directory. Pages and images that cannot be fetched are skipped.
    """
    index_html = get_page_index(0, '街拍')
    if not index_html:
        # The index request failed; there is nothing to crawl.
        return

    # Same runtime value as before, but with every backslash escaped: the
    # unescaped "\i" is an invalid escape sequence that newer Python versions
    # warn about.
    save_dir = 'F:\\images\\'

    for article_url in parse_page_index(index_html):
        if not article_url:
            continue
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, article_url)
        if result is None:
            continue
        for image_url in result.get('images'):
            print(image_url)
            try:
                pic = requests.get(image_url, timeout=10)
            except RequestException:
                print("请求图片出错", image_url)
                continue
            if pic.status_code != 200:
                # Do not store an error page under a .jpg name.
                continue
            # Create the target directory on first use so open() cannot fail
            # merely because it does not exist yet.
            os.makedirs(save_dir, exist_ok=True)
            # File name is the 7 characters preceding the URL's final
            # character; kept as-is so existing output names do not change.
            # NOTE(review): different URLs can map to the same name, and the
            # slice may contain characters invalid in a file name - confirm
            # whether a content hash would be preferable.
            pic_cun = save_dir + str(image_url)[-8:-1] + '.jpg'
            with open(pic_cun, 'wb') as fp:
                fp.write(pic.content)
# Start the crawl only when this file is executed directly as a script.
if __name__ == "__main__":
    main()