#!/usr/bin/env python3
# -*-coding: utf-8 -*-
__author__ = 'xxx'
import requests
import re
import json
# Download the question list page, pair each number found in the list table
# with the article link found on the same page, fetch every article page and
# write one JSON file per article into the current working directory.

# requests.get() applies no timeout by default, so a stalled connection would
# block the script forever; every request below is bounded instead.
REQUEST_TIMEOUT = 10  # seconds

r = requests.get('http://wz.sun0769.com/index.php/question/questionType?type=4',
                 timeout=REQUEST_TIMEOUT)
# Setting the encoding makes r.text decode the body as GBK, so the page is
# decoded once here instead of calling r.content.decode('gbk') per regex.
r.encoding = 'gbk'
list_html = r.text

# Digit strings from the 53px-wide table cells; used below as the 'number'
# field of each record and as the output file name.
number = re.findall(
    r'<td width="53" height="30" align="center" bgcolor="#FFFFFF">(\d+)</td>',
    list_html, re.S)

# (article id, title) tuples taken from the article links.
# NOTE(review): the pattern only matches links under /question/201910/, so the
# pairing with `number` relies on every listed row pointing at that folder --
# confirm against the live page.
articles = re.findall(
    r'<a href="http://wz.sun0769.com/html/question/201910/(\d+).shtml" title="(.*?)" target="_blank" class="news14">',
    list_html, re.S)

# The two lists are paired by position. Report a mismatch explicitly instead
# of failing with an IndexError halfway through the run.
if len(number) != len(articles):
    print('warning: matched {} numbers but {} articles; only the first {} pairs are saved'.format(
        len(number), len(articles), min(len(number), len(articles))))

# Compiled once because it is applied to every article page in the loop.
description_re = re.compile(r'<meta name="description" content="(.*?)" />', re.S)

for question_number, (article_id, title) in zip(number, articles):
    new_request = 'http://wz.sun0769.com/html/question/201910/{}.shtml'.format(article_id)
    content_r = requests.get(new_request, timeout=REQUEST_TIMEOUT)
    content_r.encoding = 'gbk'
    # findall returns a list; it is stored as-is so the JSON layout is unchanged.
    article_content = description_re.findall(content_r.text)
    test_dict = {'number': question_number, 'url': new_request, 'article': title, 'content': article_content}
    # One file per record, named after the number, e.g. "12345.json".
    with open(question_number + '.json', 'w', encoding='utf-8') as f:
        json.dump(test_dict, f, ensure_ascii=False, indent=4)
# requests code
# Latest recommended article published on 2024-04-22 12:04:16