import json
import os
from _md5 import md5
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import re
import time
from requests import RequestException
def get_index(offset=0, keyword='街拍', timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter (defaults to
            0, the value that used to be hard-coded).
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (non-200 status or a requests-level failure).
    """
    # Query string parameters of the search XHR request.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '5',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # requests applies no timeout by default, so a stalled server
        # would otherwise block the crawler indefinitely.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    # Only a 200 response carries a usable payload.
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(get_html):
    """Yield the article URLs found in a search-index JSON payload.

    Args:
        get_html: JSON text of the index response, or None / empty when
            the index request produced nothing.

    Yields:
        The ``article_url`` value of every entry in the payload's ``data``
        list that has a non-empty one.

    Raises:
        json.JSONDecodeError: If ``get_html`` is non-empty but not valid
            JSON.
    """
    if not get_html:
        # A failed fetch is reported as None; json.loads(None) would raise
        # TypeError, so treat it as "no results".
        return
    payload = json.loads(get_html)
    if not isinstance(payload, dict):
        return
    # ``data`` may be missing or null; both mean there is nothing to walk.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a detail-page link cannot be fetched, so skip
        # them rather than yielding None.
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Download the HTML of an article detail page.

    Args:
        url: Address of the detail page. A falsy value (None or an empty
            string) is rejected without issuing a request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text on HTTP 200, otherwise None (missing
        URL, non-200 status or a requests-level failure).
    """
    if not url:
        return None
    # A browser-like user agent is sent with the request.
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
    try:
        # requests applies no timeout by default; bound the wait so one
        # stalled page cannot hang the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    # Only a 200 response carries a usable page.
    if response.status_code == 200:
        return response.text
    return None
# Captures the string literal passed to JSON.parse("...") in the page source.
# Raw strings keep the regex escapes from being read as (invalid) Python
# string escapes.
_GALLERY_JSON_RE = re.compile(r'JSON.parse\("(.*?)"\),', re.S)
# Captures each escaped image URL inside that literal.
_IMAGE_URL_RE = re.compile(r'url\\":\\"(.*?)\\"', re.S)


def pares_page_detail(html, url):
    """Extract the title and image URLs of a detail page and download them.

    Args:
        html: Page source of the detail page, or None if it could not be
            fetched.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        contains a ``JSON.parse("...")`` payload, otherwise None. The
        image URLs are returned exactly as captured from the page, i.e.
        still carrying the backslashes of the escaped literal.

    Side effects:
        Calls ``download_image`` once for every captured image URL.
    """
    if html is None:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # Pages without a <title> element get an empty title instead of
    # raising IndexError.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    match = _GALLERY_JSON_RE.search(html)
    if not match:
        return None
    images = _IMAGE_URL_RE.findall(match.group(1))
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Image address; any backslashes in it are removed before the
            request is made.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always None. A non-200 response is ignored; a requests-level
        failure is reported on stdout.
    """
    # The parser passes URLs captured from an escaped literal, so strip
    # the backslashes; a plain replace does this without a regex.
    url = url.replace('\\', '')
    print('正在下载', url)
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    headers = {"User-Agent": user_agent}
    try:
        # requests applies no timeout by default; bound the wait so one
        # stalled image cannot hang the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求图片出错', url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None
def save_image(content, directory='D://街拍'):
    """Write image bytes to disk under a content-derived file name.

    The file is named ``<md5 of content>.jpg``, so saving the same bytes
    twice produces a single file.

    Args:
        content: Raw image bytes.
        directory: Folder to store the image in; created if it does not
            exist. Defaults to the previously hard-coded location.
    """
    # hashlib is the public API for MD5; importing it here avoids relying
    # on the private _md5 implementation module.
    import hashlib

    # open() does not create missing parent folders, so make sure the
    # target directory exists first.
    os.makedirs(directory, exist_ok=True)
    file_name = '{0}.{1}'.format(hashlib.md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(directory, file_name)
    # Identical content maps to the same name; skip files already on disk.
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def main():
    """Crawl the search index, then fetch and process every detail page."""
    get_html = get_index()
    if get_html is None:
        # The index request failed; there is nothing to parse, and handing
        # None to the JSON parser would raise.
        return
    for url in parse_page_index(get_html):
        html = get_page_detail(url)
        pares_page_detail(html, url)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
python--头条街拍抓取
最新推荐文章于 2018-12-04 13:31:20 发布