python--头条街拍抓取

最新推荐文章于 2018-12-04 13:31:20 发布

狠卟乖

最新推荐文章于 2018-12-04 13:31:20 发布

阅读量177

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/qq_43595047/article/details/84526888

版权

python 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

import json
import os
from _md5 import md5
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import re
import time
from requests import RequestException


def get_index(offset=0, keyword='街拍'):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter. Defaults to 0.
        keyword: Search keyword sent as the ``keyword`` query parameter.
            Defaults to '街拍'.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or any other
        ``RequestException``).
    """
    # Query-string parameters of the XHR request issued by the search page.
    params = {
        'offset': str(offset),
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '5',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(get_html):
    """Yield the article URLs found in a search-result JSON payload.

    Args:
        get_html: JSON text of the search response, or ``None`` when the
            request that should have produced it failed.

    Yields:
        The ``article_url`` of each entry in the payload's ``data`` list.
        Entries without an ``article_url`` are skipped. Nothing is yielded
        when the input is empty, is not valid JSON, or has no ``data`` list.
    """
    if not get_html:
        return
    try:
        payload = json.loads(get_html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    entries = payload.get('data')
    if not isinstance(entries, list):
        return
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        article_url = entry.get('article_url')
        # Some entries carry no article link; yielding None would only
        # make the caller issue a doomed request.
        if article_url:
            yield article_url

def get_page_detail(url):
    """Download the HTML of an article detail page.

    Args:
        url: Address of the detail page.

    Returns:
        The page body as text on HTTP 200. ``None`` when ``url`` is empty,
        the status is not 200, or the request raises ``RequestException``
        (timeouts included).
    """
    if not url:
        return None
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        ),
    }
    try:
        # Bound the wait so one unresponsive page cannot stall the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

def pares_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL found is also passed to ``download_image``.

    Args:
        html: HTML text of the detail page, or ``None`` if fetching failed.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``'title'``, ``'url'`` and ``'images'`` when the
        page embeds a ``JSON.parse("...")`` payload, otherwise ``None``.
        ``'images'`` holds the URLs exactly as captured from the page
        source; ``download_image`` removes their escaping backslashes
        itself. ``'title'`` is an empty string when the page has no
        ``<title>`` element.
    """
    if not html:
        return None
    # Look for the embedded payload first so that pages without a gallery
    # are rejected before paying for a full HTML parse.
    payload_match = re.search(r'JSON\.parse\("(.*?)"\),', html, re.S)
    if payload_match is None:
        return None
    images = re.findall(r'url\\":\\"(.*?)\\"', payload_match.group(1), re.S)

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Tolerate a missing <title> instead of raising IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

def download_image(url):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Image address as captured from the page source; it may still
            contain the backslashes that escaped it inside embedded JSON.

    Returns:
        Always ``None``. A request error is printed rather than raised,
        and a non-200 response is ignored.
    """
    # Drop every backslash left over from the JSON escaping.
    url = url.replace('\\', '')
    print('正在下载', url)
    headers = {
        "User-Agent": (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        ),
    }
    try:
        # Bound the wait so a single slow image cannot stall the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print('请求图片出错', url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None

def save_image(content, path='D://街拍'):
    """Write image bytes into ``path``, named by the MD5 digest of the bytes.

    Because the file name is derived from the content, saving the same
    bytes twice leaves a single file; an existing file is not rewritten.

    Args:
        content: Raw image bytes.
        path: Destination directory, created if it does not exist yet.
            Defaults to 'D://街拍'.
    """
    # hashlib is the public API for md5; imported locally so this function
    # does not depend on the private _md5 module.
    import hashlib

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(path, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(path, hashlib.md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def main():
    """Fetch the search index, then fetch and process each linked page."""
    index_json = get_index()
    if index_json is None:
        # The index request failed, so there is nothing to parse.
        return
    for url in parse_page_index(index_json):
        if not url:
            # Index entries without an article link cannot be fetched.
            continue
        html = get_page_detail(url)
        pares_page_detail(html, url)


if __name__ == '__main__':
    main()