# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import thread
import time
class qiushibaike:
    """Console reader for the qiushibaike.com "hot" listing.

    A background thread (``LoadPage``) downloads listing pages into
    ``self.pages`` while the calling thread (``Start``) takes them out
    again, parses them and prints every post.
    """

    # Upper bound on downloaded-but-not-yet-printed pages kept in memory.
    MAX_BUFFERED_PAGES = 5
    # Seconds the loader waits when the buffer is full or a download failed.
    LOADER_DELAY = 1
    # Seconds the printing loop waits when no page is buffered yet.
    POLL_DELAY = 0.1

    def __init__(self):
        self.page = 1        # number of the next listing page to download
        self.pages = []      # raw HTML of downloaded pages awaiting display
        self.enable = False  # run flag checked by both loops
        self.url = 'http://m.qiushibaike.com/hot/page/'

    def LoadPage(self):
        """Download listing pages into ``self.pages`` while ``self.enable`` is set.

        At most ``MAX_BUFFERED_PAGES`` pages are buffered.  ``self.page`` only
        advances after a successful download, so a failed page is retried.
        """
        while self.enable:
            if len(self.pages) >= self.MAX_BUFFERED_PAGES:
                time.sleep(self.LOADER_DELAY)
                continue
            try:
                newPage = self.GetHtml(self.url + str(self.page))
            except Exception:
                print('无法链接糗事百科!')
                # Back off before retrying; without this a dead connection
                # turns the loop into a tight retry/print spin.
                time.sleep(self.LOADER_DELAY)
                continue
            self.page += 1
            self.pages.append(newPage)

    def ParseHtml(self, html):
        """Print author, time and text of every post found in ``html``."""
        for item in self.GetContenBlock(html):
            content = self.ParseContent(item)
            try:
                # Single-argument print(...) calls emit the same text as the
                # former print statements and parse on Python 2 and 3 alike.
                print(u"%s %s %s %s" % (u"作者", content['author'],
                                        u"时间:", content["time"]))
                print(content["content"])
                print('------------------------------------------')
            except Exception:
                # Best effort: a post that cannot be printed (e.g. an
                # encoding error on the console) must not stop the others.
                print(u'尼玛这样都有错啊!')

    def GetHtml(self, url):
        """Fetch ``url`` with a browser-like User-Agent and return the body.

        Errors raised by ``urllib2`` propagate to the caller.
        """
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        req = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def GetContenBlock(self, html):
        """Return the ``<div>`` elements that each wrap one post."""
        # NOTE(review): no parser is named, so bs4 picks the best one that is
        # installed; naming one would make results identical across machines.
        soup = BeautifulSoup(str(html))
        return soup.findAll('div', {'class': 'article block untagged mb15'})

    def _StripOrNone(self, value):
        """Return ``value.strip()``, or None when ``value`` has no ``strip``."""
        try:
            return value.strip()
        except AttributeError:
            return None

    def ParseContent(self, item):
        """Extract one post from its wrapping element.

        Returns a dict with the keys ``"content"``, ``"time"`` and
        ``"author"``; each value is None when the page does not provide it.
        """
        result = {"content": None, "time": None}
        content = item.find('div', {'class': 'content'})
        if content is not None:
            # The two fields are resolved independently so that a missing
            # ``title`` attribute no longer discards the post text as well.
            result["content"] = self._StripOrNone(content.text)
            result["time"] = self._StripOrNone(content.get("title"))
        result['author'] = self.ParseAuthor(item)
        return result

    def ParseAuthor(self, item):
        """Return the text of the second link in the author block, or None."""
        try:
            block = item.find('div', {'class': 'author clearfix'})
            if block is None:
                return None
            return block.findAll('a')[1].text
        except (AttributeError, IndexError):
            # Unexpected markup or fewer than two links: no author.
            return None

    def Start(self):
        """Start the background loader and print pages as they arrive.

        Loops for as long as ``self.enable`` is true; nothing in this class
        clears the flag again.
        """
        self.enable = True
        print(u'正在加载中请稍候......')
        # Download pages on a separate thread while this one prints them.
        thread.start_new_thread(self.LoadPage, ())
        while self.enable:
            if self.pages:
                # pop(0) takes the oldest buffered page in a single call.
                self.ParseHtml(self.pages.pop(0))
            else:
                # Wait for the loader instead of spinning at full CPU.
                time.sleep(self.POLL_DELAY)
# Startup banner, assembled line by line; the empty first and last entries
# reproduce the leading and trailing newline of the banner text.
print(u"\n".join((
    u"",
    u"---------------------------------------",
    u"程序:糗百爬虫",
    u"版本:0.1",
    u"作者:zz",
    u"日期:2013-05-15",
    u"语言:Python 2.7",
    u"功能:按下回车依次浏览今日的糗百热点",
    u"---------------------------------------",
    u"",
)))
print(u'请按下回车浏览今日的糗百内容:')
# Block until the user presses Enter, then start downloading and printing.
raw_input(' ')
myModel = qiushibaike()
myModel.Start()
# Reposted from: https://my.oschina.net/zhujunxxxxx/blog/311990