Implementing a custom crawler framework in Python
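
The script below targets Python 2 and follows a simple producer/consumer design: the main block scrapes a novel's chapter index into a urlQueue, several CrawlThread workers fetch the chapter pages and push the extracted text onto a dataQueue, and several WriteThread workers then drain that queue and save one .txt file per chapter under ./novel/.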

import urllib2
from lxml import etree
import Queue
import os
import ssl
import re
import threading


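# Producer: pull one chapter URL at a time from urlQueue, fetch the page and
# push the extracted chapter body onto dataQueue.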
class CrawlThread(threading.Thread):

    def __init__(self, urlQueue, dataQueue, threadName):
        super(CrawlThread, self).__init__()
        self.urlQueue = urlQueue
        self.dataQueue = dataQueue
        self.name = threadName

    def run(self):
        while not self.urlQueue.empty():
            try:
                # grab the next chapter without blocking; an empty queue raises Queue.Empty
                item = self.urlQueue.get(block=False)

                suffix_url = item.get('url')
                filename = item.get('filename')

                print threading.current_thread().name + '\t' + filename

                # opener and url are module-level globals created in the __main__ block below
                response = opener.open(url + suffix_url)
                html = response.read().decode('gbk')

                # lxml alternative:
                # content = etree.HTML(html)
                # text = content.xpath('//*[@id="content"]')[0].text

                # pull the chapter body out of the <div id="content"> element
                pattern = re.compile(r'<div id="content">(.*?)</div>')
                text = pattern.search(html).group(1)

                self.dataQueue.put({'filename': filename, 'content': text})
            except Queue.Empty:
                break
            except Exception as e:
                print e


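# Consumer: drain dataQueue and write each chapter to its own .txt file under ./novel/.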
class WriteThread(threading.Thread):

    def __init__(self, dataQueue, threadName):
        super(WriteThread, self).__init__()
        self.name = threadName
        self.dataQueue = dataQueue

    def run(self):
        while not self.dataQueue.empty():
            try:
                item = self.dataQueue.get(block=False)

                filename = item.get('filename')
                content = item.get('content')

                # normalise the HTML line breaks, then write the chapter as UTF-8
                with open('./novel/' + filename.strip() + '.txt', 'w') as f:
                    r = doContent(content)
                    f.write(r.encode('utf-8'))
            except Queue.Empty:
                break
            except Exception as e:
                print e


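# Replace the site's '<br/><br/>&nbsp;...' paragraph separators with plain newlines.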
def doContent(content):
    pattern = re.compile(r'<br/><br/>&nbsp;&nbsp;&nbsp;&nbsp;')

    r = pattern.sub('\n', content)

    return r


if __name__ == "__main__":

    # placeholder proxy addresses; fill in a real proxy or drop the ProxyHandler
    proxy = {
        'http': '***',
        'https': '***'
    }

    # skip certificate verification so the HTTPS opener works without a CA bundle
    ssl_context = ssl._create_unverified_context()
    https_handler = urllib2.HTTPSHandler(context=ssl_context)

    # chapter index of the novel; url and opener are also used inside CrawlThread
    url = "https://www.i7wx.com/book/0/636/"
    proxy_handler = urllib2.ProxyHandler(proxy)
    opener = urllib2.build_opener(proxy_handler, https_handler)

    response = opener.open(url)
    # print response.read().decode('gbk')

    # scrape the chapter list: each link looks like <a href="123456.html">chapter title</a>
    pattern = re.compile(r'<a href="(\d*\.html)">(.*?)</a>', re.I)
    result = pattern.findall(response.read().decode('gbk'))

    urlQueue = Queue.Queue()
    dataQueue = Queue.Queue()

    # make sure the output directory exists before the write threads run
    if not os.path.exists('./novel'):
        os.makedirs('./novel')

    for k, v in result:
        # print k, v

        urlQueue.put({
            'url': k,
            'filename': v
        })
    crawlThreads = []
    thread = CrawlThread(urlQueue, dataQueue, "crawl thread 1")
    thread2 = CrawlThread(urlQueue, dataQueue, "crawl thread 2")
    thread3 = CrawlThread(urlQueue, dataQueue, "crawl thread 3")
    crawlThreads.append(thread)
    crawlThreads.append(thread2)
    crawlThreads.append(thread3)
    thread.start()
    thread2.start()
    thread3.start()
    for t in crawlThreads:
        t.join()

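    # all crawl threads have joined above, so dataQueue already holds every chapter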
    writeThreads = []
    thread4 = WriteThread(dataQueue, 't4')
    thread5 = WriteThread(dataQueue, 't5')
    thread6 = WriteThread(dataQueue, 't6')

    writeThreads.append(thread4)
    writeThreads.append(thread5)
    writeThreads.append(thread6)

    for t in writeThreads:
        t.start()
    for t in writeThreads:
        t.join()
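
As a side note, urllib2 and the capital-Q Queue module only exist on Python 2. A minimal sketch of the same two-queue producer/consumer layout on Python 3 could look like the following; the site URL, the gbk decoding, the content regex and the unverified-SSL setup are carried over from the script above, while the worker function names and the simplified error handling are illustrative assumptions rather than part of the original:

import os
import re
import ssl
import queue
import threading
import urllib.request

BASE_URL = "https://www.i7wx.com/book/0/636/"
CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>', re.S)
CTX = ssl._create_unverified_context()  # same unverified-SSL shortcut as the Python 2 script


def crawl_worker(url_queue, data_queue):
    # producer: fetch one chapter per queue item and push the extracted body text
    while True:
        try:
            item = url_queue.get(block=False)
        except queue.Empty:
            break
        html = urllib.request.urlopen(BASE_URL + item['url'], context=CTX).read().decode('gbk')
        match = CONTENT_RE.search(html)
        if match:
            data_queue.put({'filename': item['filename'], 'content': match.group(1)})


def write_worker(data_queue):
    # consumer: write each extracted chapter into ./novel/<title>.txt
    while True:
        try:
            item = data_queue.get(block=False)
        except queue.Empty:
            break
        path = './novel/' + item['filename'].strip() + '.txt'
        with open(path, 'w', encoding='utf-8') as f:
            f.write(item['content'])


if __name__ == '__main__':
    os.makedirs('./novel', exist_ok=True)
    url_queue, data_queue = queue.Queue(), queue.Queue()
    # fill url_queue from the chapter index exactly as in the Python 2 version above

    crawlers = [threading.Thread(target=crawl_worker, args=(url_queue, data_queue))
                for _ in range(3)]
    for t in crawlers:
        t.start()
    for t in crawlers:
        t.join()

    writers = [threading.Thread(target=write_worker, args=(data_queue,)) for _ in range(3)]
    for t in writers:
        t.start()
    for t in writers:
        t.join()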