糗百新鲜事——爬虫python

最新推荐文章于 2024-11-13 17:25:30 发布

baijie5865

最新推荐文章于 2024-11-13 17:25:30 发布

阅读量122

点赞数

文章标签：爬虫 python 操作系统

原文链接：http://www.cnblogs.com/SilenceCity/p/3639493.html

版权

这个是学爬虫时的练习。

原来的例子用起来不太顺手，就自己稍微改了一下，权当练手。

  1 # -*- coding:utf-8 -*-
  2 # --------------------------------------------
  3 #     程序：【糗百最热】爬虫
  4 #     版本：0.1
  5 #     作者：Silence
  6 #     日期：2014-03-25
  7 #     操作：输入quit退出
  8 #     功能：运行后按Enter键可以浏览今天的糗百热点
  9 # ---------------------------------------------
 10 
 11 import urllib
 12 import urllib2
 13 import re
 14 import thread
 15 import time
 16 
 17 class HTML_Tool:
 18     """定义一个工具类来处理页面上的各种标签"""
 19 
 20     # 用 非贪婪模式匹配 \t 或者 \n 或者 空格 或者 超链接 或者 图片
 21     bgnCharToNoneRex = re.compile("(\t|\n| |<a.*?>|<img.*?>)")
 22 
 23     # 用 非贪婪模式 匹配任意的 <>标签
 24     endCharToNoneRex = re.compile("<.*?>")
 25 
 26     # 用 非贪婪模式 匹配任意的<p>标签
 27     bgnPartRex = re.compile("<p.*?>")
 28     charToNewLineRex = re.compile("(<br/>|</p>|<tr>|<div>|</div>)")
 29     charToNewTabRex = re.compile("<td>")
 30 
 31     # 将一些html的符号转变为原始符号
 32     replaceTab = [("<","<"),(">",">"),("&","&"),("&","\""),(" "," ")]
 33 
 34     def replace_Char(self,content):
 35         content = self.bgnCharToNoneRex.sub("",content)
 36         content = self.bgnPartRex.sub("\n     ",content)
 37         content = self.charToNewLineRex.sub("\n",content)
 38         content = self.charToNewTabRex.sub("\t",content)
 39         content = self.endCharToNoneRex.sub("",content)
 40 
 41         for rt in self.replaceTab:
 42             content = content.replace(rt[0],rt[1])
 43         return content
 44 
 45 class QiuBai_Model:
 46     """docstring for QiuBai_Model"""
 47     def __init__(self):
 48         self.page = 1
 49         self.pages = []
 50         self.myTool = HTML_Tool()
 51         self.enable = False
 52         
 53     def getPage(self,page):
 54         myurl = "http://m.qiushibaike.com/hot/page/" + page
 55         
 56         #糗百现在加了过滤规则，不允许直接抓包了，所以这里伪造为一个浏览器请求
 57         headers = {
 58             'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
 59         }
 60         req = urllib2.Request(
 61             url = myurl,
 62             headers = headers
 63         )
 64         myResponse = urllib2.urlopen(req)
 65         myPage = myResponse.read()
 66         # python中默认的编码是unicode编码
 67         # String的encode是把unicode编码转换为其他编码的字符
 68         # decode是把其他编码字符转换为unicode编码
 69         unicodePage = myPage.decode("utf-8")
 70 
 71         # 先找出所有包含新鲜事的结构
 72         # 糗百页面中，所有新鲜事都写在形如<div class="content" title="">
 73         # 这里使用正则
 74         myItems = re.findall('<div.*?class="content".*?title="(.*?)">(.*?)</div>',unicodePage,re.S)
 75         items = []
 76         for item in myItems:
 77             # 糗百中一个<div>标题时间;内容</div>
 78             items.append([item[0].replace("\n",""),item[1].replace("\n","")])
 79         return items
 80 
 81     # 加载新的段子
 82     def loadPage(self):
 83         # 如果用户没有输入quit就一直运行
 84         while self.enable:
 85             # 如果pages数组中的内容小于两个就获取新页面中的段子
 86             if len(self.pages) < 2:
 87                 try:
 88                     myPage = self.getPage(str(self.page))
 89                     self.page += 1
 90                     self.pages.append(myPage)
 91                 except:
 92                     print '无法链接糗事百科！'
 93                 else:
 94                     time.sleep(1)
 95 
 96     def showPage(self,pageNum,page):
 97         for items in pageNum:
 98             print u"第%d页"%page, items[0]
 99             print self.myTool.replace_Char(items[1])
100 
101 
102     def start(self):
103         self.enable = True
104         page = self.page
105 
106         print u'正在加载中，请稍后…………'
107 
108         # 新启动一个线程，在后台进行加载，并存储
109         thread.start_new_thread(self.loadPage,())
110 
111         while self.enable:
112             if self.pages:
113                 currentPage = self.pages[0]
114                 del self.pages[0]
115 
116                 # 原来每显示一个就得按一次enter，太繁琐了，现在一次显示一页，瞬间高大上有木有
117                 self.showPage(currentPage,page)
118                 page += 1
119                 
120                 myinput = raw_input('是否继续下一页？请按Enter键\n')
121                 if myinput == "quit":
122                     print '您本次观看到第 %d 页，欢迎下次再来！'%page
123                     self.enable = False
124                     break
125 
if __name__ == '__main__':
    # Startup banner: program name, version, author, date and usage hints.
    banner = u"""
--------------------------------------------
     程序：【糗百最热】爬虫
     版本：0.1
     作者：Silence
     日期：2014-03-25
     操作：运行后输入enter，一次可以显示一页，输入quit退出
     功能：就是看糗百最热栏的内容，由于现在还是爬虫爬的文字，所以，没有图片看了
---------------------------------------------
    """
    print(banner)

    # Wait for the user to press Enter before any page is shown.
    raw_input('请输入Enter键，精彩的糗百内容即将展示\n')

    # Build the reader and hand control to its interactive loop.
    QiuBai_Model().start()