# Environment: Python 3.6
import os

import requests
from lxml import etree
def wb_url(page_num):
    """Yield the listing-page API URL for each page from 1 through page_num.

    Nothing is yielded when page_num is less than 1.
    """
    prefix = 'https://36kr.com/api/search-column/mainsite?per_page=20&page='
    yield from (f'{prefix}{page}' for page in range(1, page_num + 1))
def rejson(url):
    """Fetch url with a browser-like User-Agent and return the decoded JSON body.

    Args:
        url: address to request with HTTP GET.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.exceptions.RequestException: on a connection failure, a
            timeout, or a 4xx/5xx response status.
        ValueError: if the response body is not valid JSON.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would hang the whole crawl.
    wb_data = requests.get(url, headers=header, timeout=10)
    # Report HTTP errors here instead of as a confusing failure further down.
    wb_data.raise_for_status()
    return wb_data.json()
def article_urls(rejson):
    """Yield the article API URL for every item of a decoded listing response.

    Args:
        rejson: decoded JSON of one listing page. Items are read from
            rejson['data']['items'] and each item's 'id' is used.

    Yields:
        'https://36kr.com/api/post/<id>/next' for each item that has an id.
    """
    # A missing or null 'data' / 'items' is treated as an empty page rather
    # than crashing on None.
    data = (rejson or {}).get('data') or {}
    for item_dict in data.get('items') or []:
        article_id = item_dict.get('id')
        if article_id is None:
            # Without an id the URL would end in '/post/None/next'.
            continue
        yield f'https://36kr.com/api/post/{article_id}/next'
def article_data(url):
    """Download one article and save its plain text through file_write.

    The response's 'data' object supplies 'title' (used in the file name) and
    'content' (HTML). Paragraph boundaries ('</p><p>') become blank lines
    before the markup is stripped. Articles with no 'data' or no content are
    skipped without writing a file.

    Args:
        url: article API URL, fetched with rejson.
    """
    data = rejson(url).get('data') or {}
    content_old = data.get('content')
    if not content_old:
        # Nothing to save; parsing str(None) would write the text 'None'.
        return

    # Drop path separators, characters that are not allowed in file names on
    # common file systems, and line breaks/tabs, so open() does not fail or
    # write into an unintended directory.
    unsafe_chars = str.maketrans('', '', '\\/:*?"<>|\r\n\t')
    title = str(data.get('title')).translate(unsafe_chars)

    content_n = str(content_old).replace('</p><p>', '\n\n')
    content_html = etree.HTML(content_n)
    if content_html is None:
        # Defensive: presumably the parser can return None when the markup
        # contains nothing parseable -- verify against the lxml version used.
        return
    # string(//*) is the concatenated text of the first element, i.e. the root.
    clear_content = content_html.xpath('string(//*)')
    file_write(title, clear_content)
# Running sequence number prefixed to every saved file name; file_write
# increments it after each successful write.
i = 3000000


def file_write(title, content):
    """Write content to './wenzhang/<i>_<title>.txt' as UTF-8, then advance i.

    Args:
        title: text used in the file name; the caller must already have
            removed characters that are invalid in file names.
        content: text to store in the file.
    """
    global i
    # open() does not create missing directories, so make sure the output
    # folder exists instead of failing with FileNotFoundError.
    os.makedirs('./wenzhang/', exist_ok=True)
    with open(f'./wenzhang/{i}_{title}.txt', 'w', encoding='utf-8') as f:
        f.write(content)
    i += 1
def main():
    """Ask how many listing pages to crawl, then save every article on them.

    Entering 0 ends the program by raising SystemExit. Input that is not an
    integer is rejected and the prompt is shown again.
    """
    while True:
        try:
            n = int(input('您需要几页新闻数据? 请输入整数(0:退出): '))
        except ValueError:
            # Not an integer: ask again instead of crashing with a traceback.
            continue
        break
    if n == 0:
        # exit() is the interactive helper installed by the site module;
        # raising SystemExit directly has the same effect without relying on it.
        raise SystemExit
    for url in wb_url(n):
        for article_url in article_urls(rejson(url)):
            article_data(article_url)
# Run the crawler only when executed as a script, so importing this module
# does not prompt for input or start network requests.
if __name__ == '__main__':
    main()