python3 练手:爬糗事百科
参考原文地址:
https://blog.csdn.net/pleasecallmewhy/article/details/8932310
https://cuiqingcai.com/990.html
糗事百科地址:https://www.qiushibaike.com/hot/page/1/
每页显示25则糗事,糗事内容在 <div class="content">…</div> 内
目标:爬取糗事内容并在每按一次回车时显示一则糗事,当按下"q"键时结束
分析:设置标志开关flag,根据输入判断是否显示。
flag为真,判断存储列表stories是否为空:
为空:加载页面,解析页面,添加内容至stories
非空:显示一则内容
flag为假:退出
注意:只解析一个结果item与解析多个结果不同
多个结果可用item[0],item[1]…
单个结果引用item[0]无内容
代码:
import urllib
import urllib.request
import re
class QSBK:
    """Interactive console reader for the qiushibaike.com "hot" listing.

    Listing pages are downloaded one at a time, the story texts are
    buffered in ``self.stories``, and one story is printed for every line
    the user enters until the user types ``q``.
    """

    # Compiled once at class level: both patterns are invariant across calls.
    _STORY_RE = re.compile('<div.*?"content">.*?<span>(.*?)</span>', re.S)
    _BR_RE = re.compile('<br/>')

    def __init__(self):
        # Header dict sent with every request (see get_html).
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        # Number of the next listing page to download.
        self.pageindex = 1
        # Loop switch for start(); set to False to leave the loop.
        self.flag = True
        # Buffer of parsed story texts that have not been shown yet.
        self.stories = []

    def get_html(self, url):
        """Download *url* and return its body decoded as UTF-8.

        Returns ``None`` (after printing the reason) when the request
        fails with ``URLError``.
        """
        request = urllib.request.Request(url, headers=self.headers)
        try:
            # The context manager closes the response even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except urllib.request.URLError as e:
            if hasattr(e, 'reason'):
                # e.reason is often an exception object rather than a str,
                # so it must be converted before concatenation.
                print('未连接,原因:' + str(e.reason) + '\n')
            return None

    def parse_html(self, html):
        """Append every story found in *html* to ``self.stories``.

        ``<br/>`` tags become newlines and surrounding whitespace is
        stripped. A missing or empty document (for example after a failed
        download) is ignored.
        """
        if not html:
            return
        self.stories.extend(
            self._BR_RE.sub('\n', raw).strip()
            for raw in self._STORY_RE.findall(html)
        )

    def load_page(self):
        """Download listing page ``self.pageindex`` and buffer its stories."""
        download_url = 'https://www.qiushibaike.com/hot/page/' + str(self.pageindex)
        self.parse_html(self.get_html(download_url))

    def getonestory(self):
        """Print the oldest buffered story and remove it from the buffer.

        Raises ``IndexError`` when the buffer is empty; ``start`` checks
        the buffer before calling this.
        """
        print(self.stories.pop(0))

    def start(self):
        """Run the interactive loop: any input line shows a story, ``q`` quits."""
        self.load_page()
        print(len(self.stories))
        self.pageindex += 1
        print('糗事百科:\n')
        print('按回车键加载内容,按‘q’键结束:\n')
        shown = 1
        while self.flag:
            if input() == 'q':
                self.flag = False
                continue
            if not self.stories:
                # Buffer exhausted: fetch the next listing page.
                self.load_page()
                self.pageindex += 1
                if not self.stories:
                    # Download failed or the page had no stories; stop
                    # cleanly instead of raising IndexError.
                    print('没有更多内容,程序结束。')
                    self.flag = False
                    continue
            print("第" + str(shown) + "则糗事:")
            self.getonestory()
            shown += 1
# Script entry point: build the spider and run its interactive loop.
# NOTE: there is no __main__ guard, so this also runs when the file is imported.
spider = QSBK()
spider.start()