web测试常用python代码——爬虫程序

#coding=utf-8
#爬虫程序——起点
'''
Created on 2012-4-18

@author: xxx
'''

import urllib2
import Queue
import threading
import time
import socket
import sgmllib

# Number of worker threads handed to BookSpider.
threadsNum = 100
# One candidate page URL per book id in the range [0, 1000000).
urls = ['http://www.qidian.com/Book/%d.aspx/' % i for i in range(1000000)]
# Default timeout, in seconds, for sockets created from here on.
socket.setdefaulttimeout(10)
# Stack size in bytes (512 KiB) for threads created after this call.
threading.stack_size(16 * 32768)

class BookSpider(sgmllib.SGMLParser):
    def __init__(self, threadsNum):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = threading.Lock()
        #请求队列
        self.requestQueue = Queue.Queue()
        #完成队列
        self.completeQueue = Queue.Queue()
        self.runThreadsNum = 0
        for i in range(threadsNum):
            tmpThread = threading.Thread(target = self.threadRun)
            tmpThread.daemon = True
            tmpThread.start()

    def __del__(self):
        time.sleep(2)
        #等待两个队列结束
        self.requestQueue.join()
        self.completeQueue.join()

    def taskLeft(self):
        return self.requestQueue.qsize() + self.completeQueue.qsize() + self.runThreadsNum

    def push(self, request):
        self.requestQueue.put(request)

    def pop(self):
        return self.completeQueue.get()

    def threadRun(self):
        while True:
            request = self.requestQueue.get()
            with self.lock:
                self.runThreadsNum += 1
            try:
                result = self.opener.open(request).read()
                self.completeQueue.put((request, result))
            except Exception:
                time.sleep(0.1)
            with self.lock:
                self.runThreadsNum -= 1
            self.requestQueue.task_done()
            time.sleep(0.1)

class MyParser(sgmllib.SGMLParser):
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.text_meta = []
        self.text_br = []
        self.is_b = 0

    def start_meta(self, attrs):
        for attr in attrs:
            if attr[0] == 'title':
                self.text_meta.append(attr[1])

    def start_b(self, attrs):
        for attr in attrs:
            if (attr[0] == 'style') and (attr[1] == 'color:Red; display:none'):
                self.is_b = 1

    def end_b(self):
        if self.is_b == 1:
            self.is_b = 2

    def unknown_starttag(self, tag, attrs):
        if (self.is_b == 2) and (tag != 'br'):
            self.is_b = 0

    def handle_data(self, text):
        if (self.is_b == 2):
            self.text_br.append(text.strip())

if __name__ == '__main__':
    # Start the worker pool, then queue every candidate URL.
    spider = BookSpider(threadsNum)
    for url in urls:
        spider.push(url)
    # Consume results for as long as the spider reports outstanding work.
    while spider.taskLeft():
        url, contents = spider.pop()
        myParser = MyParser()
        myParser.feed(contents)
        # Build this page's output: the URL and title once per meta title,
        # followed by the captured text fragments.
        lines = []
        for title in myParser.text_meta:
            lines.append(url + '\n')
            lines.append(title + '\n')
        lines.extend(text + '\n' for text in myParser.text_br)
        # Append page by page so finished results are on disk while the
        # crawl is still running; 'with' closes the handle even if a
        # write raises.
        with open('qidian.txt', 'a') as writeFile:
            writeFile.writelines(lines)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值