之前写的很菜,后来无意中找到了崔庆才的视频,发现对不上,网页已经改版,所以就特地改写了一下。其中有一个坑,坑了我一天吧:js 的 JSON.parse() 方法和 python 中的 json.loads() 不一样,js 的可以直接解析带 \\ 转义的字符串,解析后反斜杠全都被去掉了,python 的不行,这点不得不吐槽。后来用 demjson 转义也不行,最后用正则方法把反斜杠替换掉之后才成功反序列化(用 replace 也行),为自己鼓掌。下面就是代码,不多bb了。
import json,demjson
import re
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from requests import RequestException
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword; URL-encoded into the query string.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` may block indefinitely on a stalled connection.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or on a ``RequestException``
        (which includes timeouts).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    # Example of the endpoint's URL shape (this function itself sends
    # cur_tab=3 and from=gallery):
    # https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    print(url)
    # Keep the try body limited to the call that can raise RequestException.
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求索引页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs contained in a search-index JSON response.

    Args:
        html: JSON text of the index response, or ``None``/empty string when
            the index request failed.

    Yields:
        The non-empty ``article_url`` value of every entry in the payload's
        ``data`` list. Entries without an ``article_url`` are skipped so that
        callers never receive ``None`` as a URL.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    # A failed index request is reported as None; there is nothing to parse.
    if not html:
        return
    payload = json.loads(html)
    if not isinstance(payload, dict):
        return
    # "data" may be absent or null in the response; treat both as empty.
    for item in payload.get('data') or ():
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Download the HTML of a single article/gallery page.

    Args:
        url: Address of the detail page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` may block indefinitely on a stalled connection.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or on a ``RequestException``
        (which includes timeouts).
    """
    # The request is sent with a desktop browser User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36 '
    }
    # Keep the try body limited to the call that can raise RequestException.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求详情页出错', url)
        return None
    if response.status_code == 200:
        return response.text
    return None
# The unescaped "(.*?)" is the capture group, so the captured text still
# contains the literal parentheses of the JavaScript call, for example:
#   ("{\"count\": 3, ...}"),
_GALLERY_PATTERN = re.compile('gallery: JSON.parse(.*?)siblingList:', re.S)


def _decode_gallery(argument):
    """Decode the captured ``JSON.parse(...)`` argument into a Python object.

    The argument is a double-quoted JavaScript string literal whose content is
    JSON text. Decoding it twice with ``json.loads`` (literal -> JSON text ->
    object) honours the escape sequences instead of destroying them. If that
    fails, fall back to the previous approach of removing every backslash.

    Returns:
        The decoded object, or ``None`` when the text cannot be decoded.
    """
    literal = argument.strip().rstrip(',').strip()
    # Drop the parentheses that wrapped the JavaScript call argument.
    if literal.startswith('(') and literal.endswith(')'):
        literal = literal[1:-1].strip()
    try:
        decoded = json.loads(literal)
        if isinstance(decoded, str):
            decoded = json.loads(decoded)
        return decoded
    except ValueError:
        pass

    # Legacy fallback: strip all backslashes and the surrounding ("...") ,
    # characters, then parse what is left.
    legacy = argument.replace('\\', '').strip().lstrip('("').rstrip('"),')
    if not legacy:
        return None
    try:
        return json.loads(legacy)
    except ValueError:
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images`` (a list of
        image URLs built from each ``sub_images`` entry's ``uri``), or
        ``None`` when the page has no gallery block, the gallery cannot be
        decoded, or it has no ``sub_images`` key.

    Side effects:
        Appends the raw captured gallery text to ``a.txt`` in the current
        directory.
    """
    pic_url = 'http://p3.pstatp.com/'
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> element gets an empty title instead of
    # raising IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    match = _GALLERY_PATTERN.search(html)
    if match is None:
        # Not a gallery page (or the page layout changed).
        return None
    raw_argument = match.group(1)

    # NOTE(review): this looks like a leftover debugging dump; it is kept so
    # the function's observable side effects stay the same.
    with open('a.txt', 'a', encoding='utf8') as f:
        f.write(raw_argument)

    data = _decode_gallery(raw_argument)
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    images = [
        pic_url + item['uri']
        for item in data.get('sub_images') or []
        if isinstance(item, dict) and item.get('uri')
    ]
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def main():
    """Crawl the first search page for '街拍' and save every parsed gallery.

    Each successfully parsed detail page is appended to ``toutiao.txt`` as
    one JSON document per line and echoed to stdout. Pages that could not be
    downloaded or that yielded no result are skipped, so the output file
    never receives ``null`` lines.
    """
    index_html = get_page_index(0, '街拍')
    if not index_html:
        # The index request failed; there is nothing to crawl.
        return
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if not result:
            continue
        # Append per record so already-crawled results survive a later crash.
        with open('toutiao.txt', 'a', encoding='utf8') as f:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print(result)
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()