1.抓取糗事百科热门段子
2.过滤带有图片的段子
3.实现每按一次回车,显示一个段子的发布人和内容
——————————————————————————————————————————————————
原作者还抓取了发布时间和点赞数,我没有实现。
from urllib import request,parse,error
import re
class QSBK:
    """Scraper for the "hot" pages of qiushibaike.com.

    Supports two modes: interactive reading (``startLook``), which prints one
    story per key press, and batch download to a text file
    (``downloadtexts``).
    """

    # Captures (author, raw story HTML) for each post on a page.
    _ARTICLE_RE = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?content">.*?<span>(.*?)</span>',
        re.S,
    )
    # HTML line breaks inside a story body; converted to real newlines.
    _BR_RE = re.compile(r'<br/>')

    def __init__(self):
        """Initialise the scraper state.

        Attributes:
            pageIndex: number of the next page to fetch.
            user_agent: browser User-Agent string sent with every request.
            headers: HTTP headers attached to every request.
            stories: queue of already-fetched pages; each entry is a list of
                ``(author, text)`` tuples.
            enable: ``True`` while the interactive reader is running.
        """
        self.pageIndex = 1
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []
        self.enable = False

    def getpage(self, pageIndex):
        """Fetch the raw HTML of one "hot" page.

        :param pageIndex: page number to fetch
        :return: the page decoded as UTF-8, or ``None`` if the request failed
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        req = request.Request(url, headers=self.headers)
        try:
            # The context manager guarantees the connection is closed.
            with request.urlopen(req) as response:
                return response.read().decode('utf-8')
        except error.URLError as e:
            if hasattr(e, 'reason'):
                print('连接失败,错误原因:', e.reason)
            return None

    def getItems(self, pageIndex):
        """Extract every story from one page.

        :param pageIndex: page number to fetch and parse
        :return: list of ``(author, text)`` tuples (possibly empty), or
            ``None`` if the page could not be loaded
        """
        qbcontent = self.getpage(pageIndex)
        if not qbcontent:
            print('页面加载失败。。。。')
            return None
        return [
            (author, self._BR_RE.sub('\n', body).strip())
            for author, body in self._ARTICLE_RE.findall(qbcontent)
        ]

    def loadpage(self):
        """Prefetch the next page while fewer than two pages are queued.

        Does nothing unless the reader is running. ``pageIndex`` only
        advances when a page actually yielded stories.
        """
        if not self.enable or len(self.stories) >= 2:
            return
        pagestories = self.getItems(self.pageIndex)
        if pagestories:
            self.stories.append(pagestories)
            self.pageIndex += 1

    def getOneStory(self, pagestories, page):
        """Print the stories of one page, one per line of user input.

        Entering ``q`` stops the reader (clears ``enable``) and returns
        immediately; any other input shows the next story.

        :param pagestories: list of ``(author, text)`` tuples for the page
        :param page: page number shown alongside each story
        """
        for story in pagestories:
            input_status = input()
            if input_status == 'q':
                self.enable = False
                return
            # Keep the queue topped up while the user is reading.
            self.loadpage()
            print('第{0}页\t发布人:{1}\n{2}'.format(page, story[0], story[1]))

    def downloadtexts(self, filename, pages):
        """Download the first ``pages`` pages and save them to a text file.

        Pages that fail to load are skipped instead of aborting the download.

        :param filename: path of the output file (written as UTF-8)
        :param pages: number of pages to download, starting from page 1
        """
        allStories = []
        for page in range(1, pages + 1):
            onePageStories = self.getItems(page)
            # getItems returns None on failure; iterating it would raise.
            if onePageStories is not None:
                allStories.append(onePageStories)
        # Explicit encoding: the platform default may not be able to encode
        # Chinese text.
        with open(filename, 'w', encoding='utf-8') as f:
            for onepage in allStories:
                f.write('*' * 78)
                f.write('\n')
                for oneStory in onepage:
                    f.write('作者: ' + oneStory[0])
                    f.write('\n')
                    f.write('段子: ' + oneStory[1])
                    f.write('\n\n')

    def startLook(self):
        """Run the interactive reader until the user quits or no stories remain."""
        print('正在读取糗事百科,回车查看新段子,q退出:')
        self.enable = True
        nowpage = 0
        while self.enable:
            if not self.stories:
                self.loadpage()
            if not self.stories:
                # Nothing could be fetched or parsed: stop instead of
                # spinning forever on an empty queue.
                print('没有可显示的段子,程序结束。')
                self.enable = False
                break
            pagestories = self.stories.pop(0)
            nowpage += 1
            self.getOneStory(pagestories, nowpage)
测试一下:
if __name__ == '__main__':
    # Smoke test: download the first two pages of stories into qb.txt.
    qb = QSBK()
    qb.downloadtexts('qb.txt', 2)
不错,可以运行。
主要的难度应该就是正则表达式的编写了
我一开始写的正则非常糟糕,后来参考了别人修改过的版本;不过还有更简洁的写法,我暂时没看明白。