糗百网址:http://www.qiushibaike.com/hot
打开页面后,查看元素,
找到段子对应的 HTML 代码片段:
<div class="content">
<span>小时候家里穷从来没穿过名牌,当时我们特别羡慕班上的一个同学,因为他姑姑送给他一件两面都可以穿的夹克。一面是阿di的,一面是nai克的。</span>
</div>
// 上面 <span> 标签内的文字即为段子正文
# -*- coding:utf-8 -*-
import re
import urllib
import urllib2
class LC_QiuBai(object):
    """Console reader that prints the qiushibaike.com "hot" list page by page.

    Attributes:
        page: number of the page the next ``getData`` call requests; starts
            at 1 and grows by one per call.
        enable: boolean flag, initially True; the script's input loop keeps
            running while it is true.
    """

    # Base of the paginated listing; the page number is appended to it.
    _PAGE_URL = 'http://www.qiushibaike.com/hot/page/'
    # Value sent in the User-Agent header of every request.
    _USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    # Captures the text of the first <span> inside each <div ... content">
    # element. re.S lets "." cross newlines, because the markup spans lines.
    _JOKE_RE = re.compile(
        r'<div .*?content">.*?<span>(.*?)</span>.*?</div>', re.S)

    def __init__(self):
        super(LC_QiuBai, self).__init__()
        self.page = 1
        self.enable = True

    def hello(self):
        """Print a fixed marker string."""
        print('6666')

    @staticmethod
    def _extractJokes(html):
        """Return the joke texts found in *html*, in document order."""
        return LC_QiuBai._JOKE_RE.findall(html)

    def getData(self):
        """Fetch the current page, print every joke on it, then advance.

        A ``urllib2.URLError`` is not propagated: the error's ``code``
        and/or ``reason`` (whichever attributes exist) are printed instead.
        ``page`` is incremented whether or not the request succeeded.
        """
        url = self._PAGE_URL + str(self.page)
        headers = {'User-Agent': self._USER_AGENT}
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                content = response.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                response.close()
            for joke in self._extractJokes(content):
                # The extra newline leaves a blank line between jokes.
                print(joke + '\n')
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        self.page += 1
# Interactive driver: show the usage banner, print the first page, then print
# one more page for every line the user enters until they type "stop".
print(u'''
-----------------------------
操作:输入stop退出
功能:按下回车依次浏览今日的糗百热点
-----------------------------
''')
lcQiuBai = LC_QiuBai()
lcQiuBai.getData()
while lcQiuBai.enable:
    try:
        myInput = raw_input()
    except EOFError:
        # Input stream closed (Ctrl-D or exhausted pipe): treat it as "stop"
        # rather than ending the session with a traceback.
        myInput = 'stop'
    if myInput == 'stop':
        lcQiuBai.enable = False
        break
    lcQiuBai.getData()