最近在学习获取网络数据,在网上看到一个获取糗事百科段子内容的实例(点击打开链接)
因为糗事百科的页面版本发生了变化,原代码已不能直接使用;评论区中有可直接运行的代码。下面的代码是我根据博主的源码改编而成的,思路和方法可参看博主原文。
# encoding: utf-8
import urllib2
import re
class Qsbk():
def __init__(self):
self.url = 'http://www.qiushibaike.com/hot/page/'
self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# 初始化headers
self.headers = {'User-Agent': self.user_agent}
def html_data(self, pageIndex):
try:
url = self.url + str(pageIndex)
# 构建请求的request
request = urllib2.Request(url, headers=self.headers)
# 利用urlopen获取页面代码
response = urllib2.urlopen(request)
# 将页面转化为UTF-8编码
pageCode = response.read().decode('utf-8')
# print pageCode
return pageCode
except urllib2.URLError, e:
if hasattr(e, "reason"):
print u"连接糗事百科失败,错误原因", e.reason
return None
def data_split(self, pageIndex):
pageCode = self.html_data(pageIndex)
if not pageCode:
print "页面加载失败...."
return None
pattern = re.compile('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?</div>.*?'
'<div class="content">(.*?)</div>.*?'
'<i class="number">(.*?)</i>.*?<i class="number">(.*?)</i>', re.S)
items = pattern.findall(pageCode)
return list(items)
def out_put(self):
i = 1
j = 0
output = ''
datats = self.data_split(i)
while (j < len(datats)):
print "正在读取糗事百科,按回车查看新段子,q退出"
output = raw_input()
if output == '':
print '第' + str(i) + '页'
print ''.join(datats[j])
j += 1
if(j == len(datats)):
i += 1
j = 0
datats = self.data_split(i)
elif output == 'q':
break
else:
print "正在读取糗事百科,按回车查看新段子,Q退出"
output = raw_input()
if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive loop.
    spider = Qsbk()
    spider.out_put()