Python: scraping Toutiao street-snap (街拍) images

import json
import os
import re
import time
from hashlib import md5
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


def get_index():  # request the first page of search results as JSON
	data = {  # query string parameters copied from the XHR request
		'offset': '0',
		'format': 'json',
		'keyword': '街拍',
		'autoload': 'true',
		'count': '20',
		'cur_tab': '5'
	}
	url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
	try:
		response = requests.get(url)
		if response.status_code == 200:  # check the response status
			return response.text
		return None
	except RequestException:
		return None

def parse_page_index(html):  # parse the JSON returned by the search API
	data = json.loads(html)
	if data and 'data' in data:  # make sure the response actually has a 'data' field
		for item in data.get('data'):  # yield each result's detail-page link (article_url)
			yield item.get('article_url')

def get_page_detail(url):  # fetch the HTML of a detail page
	try:
		header = {'user-agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
		response = requests.get(url,headers=header)
		if response.status_code == 200:  # check the response status
			return response.text
		return None
	except RequestException:
		return None

def parse_page_detail(html, url):  # parse a detail page and pull out the image URLs
	if html is None:
		return None
	soup = BeautifulSoup(html, 'lxml')
	title = soup.select('title')[0].get_text()  # page title
	image_pattern = re.compile(r'JSON\.parse\("(.*?)"\),', re.S)  # the gallery is embedded as an escaped JSON string
	result = re.search(image_pattern, html)
	if result:
		gallery_data = result.group(1)
		images = re.findall(r'url\\":\\"(.*?)\\"', gallery_data, re.S)  # image URLs inside the escaped JSON
		for image in images:
			download_image(image)
		return {
			'title': title,
			'url': url,
			'images': images
		}

def download_image(url):  # download a single image
	url = re.sub(r'\\', '', url)  # strip the escaping backslashes left over from the embedded JSON
	print('Downloading', url)
	try:
		user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
		headers = {'User-Agent': user_agent}
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			save_image(response.content)
		return None
	except RequestException:
		print('Failed to download image', url)
		return None

def save_image(content):  # write the image bytes to disk
	path = 'D:/街拍'
	os.makedirs(path, exist_ok=True)  # make sure the target directory exists
	file_path = '{0}/{1}.{2}'.format(path, md5(content).hexdigest(), 'jpg')  # name the file by its MD5 hash to avoid duplicates
	if not os.path.exists(file_path):
		with open(file_path, 'wb') as f:
			f.write(content)

def main():
	index_html = get_index()
	for url in parse_page_index(index_html):
		html = get_page_detail(url)
		parse_page_detail(html, url)

if __name__ == '__main__':
    main()
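
The script above only pulls the first batch of results (offset=0). Below is a minimal sketch of how it could be extended to walk several result pages, assuming the search_content endpoint pages in steps of 20 (the value of the count parameter); get_index_page() and main_paged() are illustrative names added here, not part of the original script. It reuses parse_page_index(), get_page_detail(), and parse_page_detail() unchanged.

def get_index_page(offset):  # hypothetical variant of get_index() that accepts a page offset
	data = {
		'offset': str(offset),
		'format': 'json',
		'keyword': '街拍',
		'autoload': 'true',
		'count': '20',
		'cur_tab': '5'
	}
	url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
	try:
		response = requests.get(url)
		if response.status_code == 200:
			return response.text
		return None
	except RequestException:
		return None

def main_paged(pages=3):  # crawl the first few result pages instead of only offset 0
	for page in range(pages):
		index_html = get_index_page(page * 20)  # assumed step of 20, matching count=20
		if index_html is None:
			continue
		for url in parse_page_index(index_html):
			html = get_page_detail(url)
			parse_page_detail(html, url)
		time.sleep(1)  # small pause between pages to avoid hammering the API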