# 2017年11月30日 运行通过
# -*-coding:utf-8-*-
import json
import os
from json import JSONDecodeError
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from hashlib import md5
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results.

    Builds the search_content query for the given offset/keyword and
    returns the raw JSON response text on HTTP 200, or None on any
    non-200 status or connection failure.
    """
    query = urlencode({
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    })
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + query
    try:
        response = requests.get(url)
    except ConnectionError:
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None
def download_image(url):
    """Download one image URL and pass its bytes to save_image.

    Connection errors are swallowed; always returns None.
    """
    print('Downloading', url)
    try:
        resp = requests.get(url)
    except ConnectionError:
        return None
    if resp.status_code == 200:
        save_image(resp.content)
    return None
def save_image(content):
    """Write image bytes to ./images/<md5-of-content>.jpg.

    The md5 digest of the content is the filename, so identical images
    are written only once.

    Fixes over the original:
    - create the images directory if it does not exist (the original
      raised FileNotFoundError on first run);
    - drop the redundant f.close() — the `with` block already closes.
    """
    directory = os.path.join(os.getcwd(), 'images')
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def parse_page_index(text):
    """Yield the article_url of each entry in a search-results payload.

    `text` is the JSON body returned by the search_content endpoint.
    Malformed JSON, a falsy document, or a missing 'data' key all yield
    nothing.
    """
    try:
        payload = json.loads(text)
    except JSONDecodeError:
        return
    if not payload or 'data' not in payload:
        return
    for entry in payload['data']:
        yield entry.get('article_url')
def get_page_detail(url):
    """Fetch an article detail page; return its HTML text or None.

    None is returned on non-200 status; connection failures are logged
    and also yield None.
    """
    try:
        resp = requests.get(url)
        return resp.text if resp.status_code == 200 else None
    except ConnectionError:
        print('Error occurred')
        return None
def parse_page_detail(html, url):
    # Parse an article detail page: read its <title>, then extract the
    # gallery JSON embedded in an inline script and download every image.
    # NOTE(review): `title` and `url` are computed/accepted but never used,
    # and the function returns None — TODO confirm that is intentional.
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # The gallery data lives inline as: BASE_DATA.galleryInfo = {...}</script>
    images_pattern = re.compile('BASE_DATA.galleryInfo = (.*?)</script>', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = result.group(1)
        # Slice out the string literal handed to JSON.parse("..."):
        # +12 skips the 'JSON.parse("' prefix; the end anchor is 8 chars
        # before 'siblingList'. These magic offsets are tied to the page
        # layout as of 2017 — presumably fragile against site changes.
        gallery = data[data.index('JSON.parse("')+12:data.index('siblingList')-8]
        # Double-decode: the extracted text is an escaped JSON string, so
        # the first loads un-escapes it and the second parses the object.
        images = json.loads(json.loads('"' + gallery + '"'))
        sub_images = images["sub_images"]
        images = [item['url'] for item in sub_images]
        for image in images:
            download_image(image)
if __name__ == '__main__':
    KEYWORD = '小黄人'
    # Bug fix: the original passed the whole offset *list* to
    # get_page_index instead of the current value `off`, so every request
    # carried offset=[0, 20, ...] and the paging never worked.
    for off in range(0, 140, 20):
        text = get_page_index(off, KEYWORD)
        if not text:
            # Fetch failed; parse_page_index would crash on None input.
            continue
        for url in parse_page_index(text):
            html = get_page_detail(url)
            if html:
                parse_page_detail(html, url)
# 此代码基本为 https://github.com/Germey/TouTiao 中代码,
# 修改了原正则表达式提取所有组图 url 的部分
# (因为网站变更,原来的已经匹配不到了)。
# 本来想直接匹配 gallery: JSON.parse 后的内容,使用如下表达式匹配不到:
#     re.compile('JSON.parse (.*?),', re.S)
# 所以改成取 BASE_DATA.galleryInfo = {} 整个内容,再拿 JSON.parse 的部分。
# 取来的内容直接 json.loads 会报错,可能是有转义字符 \ 导致,使用如下解决:
#     json.loads(json.loads('"' + gallery + '"'))
# 还是使用 scrapy 下载图片方便。