糗事百科爬虫改进

无事,抓糗事!

看到一位朋友写的爬虫代码,闲来无事拿来改了改,用于抓取糗事百科的纯文字内容(代码基于 Python 2)。

#!/usr/bin/env python
'''
for qiushibaike.com
'''

import  urllib2
# import  urllib
import  re
import  thread
import  time


class Spider_Model(object):
    """Console reader for the text posts on qiushibaike.com (Python 2).

    A background thread (``LoadPage``) downloads listing pages and keeps up
    to two of them buffered in ``self.pages``; the main thread (``start``)
    takes pages off the front of that buffer and shows them one post at a
    time through ``ShowPage``.
    """

    # Seconds to wait before retrying after a failed download, so a broken
    # connection does not turn the loader into a tight request loop.
    RETRY_DELAY = 1
    # Seconds the display loop sleeps while the buffer is empty, instead of
    # spinning and competing with the loader thread for the interpreter.
    POLL_INTERVAL = 0.1

    def __init__(self):
        self.page = 1        # number of the next page to download
        self.pages = []      # downloaded pages not shown yet (lists of posts)
        self.enable = False  # True while the loader/display loops should run

    def GetPage(self, page):
        """Download one listing page and return the posts found on it.

        Args:
            page: page number to request, as a string or an int.

        Returns:
            A list of unicode strings, one per post, with ``<br/>`` tags
            turned into newlines.

        Raises:
            Any error raised by ``urllib2`` while requesting the page, or
            ``UnicodeDecodeError`` if the response is not valid UTF-8.
        """
        myurl = r'http://www.qiushibaike.com/textnew/page/' + str(page)
        user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'
        headers = {'User-Agent': user_agent}

        req = urllib2.Request(myurl, headers=headers)
        myres = urllib2.urlopen(req)
        try:
            mypage = myres.read()
        finally:
            # Release the connection even if reading fails.
            myres.close()

        return self._parse_items(mypage.decode('utf-8'))

    def _parse_items(self, unicodepage):
        """Extract the post texts from an already decoded listing page."""
        myItems = re.findall(
            '<div.*?class="content">(.*?)<!--.*?-->.*?</div>',
            unicodepage, re.S)
        # Drop the newlines of the HTML source first, then let <br/> tags
        # become the only line breaks inside a post.
        return [item.replace('\n', '').replace(r'<br/>', '\n')
                for item in myItems]

    def LoadPage(self):
        """Keep ``self.pages`` topped up until ``self.enable`` turns False.

        Intended to run in the background thread started by ``start``.
        A failed download is reported and retried after ``RETRY_DELAY``
        seconds; the page counter only advances on success.
        """
        while self.enable:
            if len(self.pages) >= 2:
                # Enough pages buffered; check again later.
                time.sleep(1)
                continue
            try:
                mypage = self.GetPage(str(self.page))
            except Exception:
                # Best effort: report and retry, but do not swallow
                # SystemExit/KeyboardInterrupt as a bare ``except`` would.
                print('can not connected to the url.')
                time.sleep(self.RETRY_DELAY)
            else:
                self.page += 1
                self.pages.append(mypage)

    def ShowPage(self, nowPage, page):
        """Print the posts of one page, waiting for Enter after each post.

        Args:
            nowPage: list of post texts to display.
            page: page number shown in the header line.

        Typing ``quit`` at the prompt clears ``self.enable`` and stops
        showing the rest of the page.
        """
        print('\n\n############################ Page %d #################################\n\n' % page)

        for item in nowPage:
            print(item)

            myinput = raw_input()
            if myinput == 'quit':
                self.enable = False
                break

    def start(self):
        """Start the loader thread and show buffered pages until the user quits."""
        page = self.page
        self.enable = True

        print(u'waiting..............')

        thread.start_new_thread(self.LoadPage, ())

        while self.enable:
            if self.pages:
                nowpage = self.pages.pop(0)
                self.ShowPage(nowpage, page)
                page += 1
            else:
                # Nothing buffered yet: yield instead of busy-waiting.
                time.sleep(self.POLL_INTERVAL)


if __name__ == '__main__':
    # Script entry point: show the banner, wait for the user, then run the
    # spider until the user types 'quit'.
    banner = u'''
    -------------------------------------------------
    xxxx
    x
    xxx
    xxx
    -------------------------------------------------
    '''
    print(banner)

    # raw_input() returns only after Enter is pressed; the typed text is
    # discarded.
    print('Press any key,to continue......')
    raw_input()

    Spider_Model().start()

一切从简。不解释不说明,随便拍!

详细内容请参考:http://blog.csdn.net/pleasecallmewhy/article/details/8932310

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值