源代码为:
import json
import re
import time
from urllib.request import Request, urlopen

import requests
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: the page address to download.

    Returns:
        The HTML of the page as a ``str``.
    """
    # Spoof a desktop-browser User-Agent so the site serves the normal page
    # instead of blocking the default urllib client.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    request = Request(url, headers=headers)
    # Fixed: close the HTTP response deterministically (the original leaked it).
    with urlopen(request) as response:
        return response.read().decode('utf-8')
def write_to_file(content):
    """Append *content* to ``duanzi.txt`` as one JSON-encoded line.

    Args:
        content: any JSON-serializable object; written with
            ``ensure_ascii=False`` so Chinese text stays readable.
    """
    # Fixed: the original passed encoding='utf=8', which is not a valid
    # codec name and raises LookupError on every call.
    with open('duanzi.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def getText(pageNum=1):
    """Scrape the first *pageNum* "hot" pages and print each joke.

    Args:
        pageNum: number of pages to fetch, starting at page 1 (default 1).
    """
    # Hoisted out of the page loop: the pattern is loop-invariant, so
    # compiling it once avoids redundant work per page.
    # Groups captured: (author <h2>), (joke body <span>), (vote count <i>).
    pattern = re.compile(
        '<div class="article block untagged mb15.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<span class="stats-vote"><i class="number">(.*?)</i>',
        re.S)
    text_list = []
    for page in range(1, pageNum + 1):
        url = 'https://www.qiushibaike.com/hot/page/' + str(page)
        html = getHtml(url)
        time.sleep(1)  # be polite to the server: one request per second
        items = re.findall(pattern, html)
        text_list.append(items)  # collect matches from every page
    for each_items in text_list:  # iterate over each page's jokes
        for item in each_items:  # item is a (author, body, votes) tuple
            count = 0
            for i in item:  # clean up the text for readability
                i = i.strip('\n')  # drop stray newlines so they don't stack up
                # <br/> is HTML's paragraph break; restore it as a real
                # newline to keep the original joke formatting.
                i = i.replace('<br/>', '\n')
                print(i)
                count += 1
                if count % 3 == 0:  # separator after each full joke
                    print('----' * 20)
if __name__ == '__main__':
    try:
        # Ask the user how many pages to crawl; int() raises ValueError on
        # non-numeric input, which the handler below reports.
        num = int(input('请输入你想要爬取的页面数量:'))
        getText(num)
    except Exception as e:
        # Fixed: the original discarded `e`, hiding every diagnostic behind
        # a generic message; include the actual error so failures are debuggable.
        print("对不住,出错了!", e)