Sharing a Python crawler for checking website availability

Talk is cheap, show you my code. Comments and criticism are welcome.


Code 1. queue server.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# queue server. 

import socket
from base64 import decodestring as b64decode
from Queue import Queue
from ast import literal_eval

visited = set()
prequeue= Queue()

def rqs_process(ss):
    rqs = ss.recv(1024).split()
    if rqs[0] == 'rqaddr': # a worker asks for the next address to crawl.
        try:
            url = prequeue.get(timeout=1)
            ss.send(url)
        except:
            ss.send('wait') # queue is empty for now
    elif rqs[0] == 'response': # a worker submits the addresses it found.
        dsize = int(rqs[1])
        ss.send('start')
        data = ''
        while len(data) < dsize:
            data += ss.recv(1024)
        addrs = literal_eval(b64decode(data)) # parse the submitted list; safer than eval()
        for addr in addrs:
            if addr not in visited:
                visited.add(addr)
                prequeue.put(addr)
    print 'Queue size:', prequeue.qsize()

if __name__ == '__main__':
    #init
    root = 'http://m.sohu.com/'
    prequeue.put(root)
    visited.add(root)

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('127.0.0.1', 52000))
    s.listen(5)

    while True:
        ss, addr = s.accept()
        rqs_process(ss)
        ss.close()
    s.close()
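
To check that the queue server is up before launching any workers, a short test client like the one below can be used. This is a minimal sketch of my own and not part of the original code; it simply sends the same 'rqaddr' request the workers send and should get back the root URL, or 'wait' if the queue is empty.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# smoke test for the queue server (illustrative snippet, not part of the crawler).

import socket

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('127.0.0.1', 52000))        # the address the queue server binds to
s.send('rqaddr')                       # the same keyword the workers use to ask for work
print 'server replied:', s.recv(1024)  # expect a URL, or 'wait' when the queue is empty
s.close()

Note that a successful reply consumes that URL from the queue, so restart the queue server before starting the real crawl.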

Code 2. worker. Several worker machines can run in parallel, and you can plug in your own header-setup function and page-parsing function (see the setup notes after the code).

#!/usr/bin/python
# -*- coding: utf-8 -*-
# worker.

from base64 import encodestring as b64encode
from time import sleep
import re
import socket
from threading import Thread, Lock
import requests
from datetime import datetime as dt
from BeautifulSoup import BeautifulSoup, SoupStrainer
import MySQLdb
import urlparse as urlp

running, queue_addr = None, None
mutex = Lock()

class PseudoRequest():
    # stands in for a requests.Response when every attempt to fetch a URL fails.
    status_code = None
    reason = None
    url = None

class ErrorRecoder():
    host = None
    user = None
    passwd = None
    db = None
    def __init__(self):
        # host/user/passwd/db are filled in as class attributes by start_work();
        # self.db is rebound here from the database name to the live connection.
        self.db = MySQLdb.connect(
                host=self.host, user=self.user,
                passwd=self.passwd, db=self.db, charset="utf8")
        self.cr = self.db.cursor()

    def save(self, url, status_code, reason):
        self.cr.execute(
           "insert into "
           "errors (url, create_time, status_code, reason) "
           "values (%s,%s,%s,%s)",
           (url, dt.now().isoformat(), status_code, reason)
        )
        self.db.commit()
        print 'RECORD:', url, status_code, reason

    def __del__(self):
        self.cr.close()
        self.db.close()

def get_url():
    # ask the queue server for the next URL; 'wait' means the queue is empty
    # or the server could not be reached.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect(queue_addr)
        s.send('rqaddr')
        data = s.recv(1024)
        s.close()
    except:
        data = 'wait'
    return data

def get_req(url, timeout=5, header=None):
    if not header:
        header = {
                'Accept-Encoding': 'gzip, deflate, compress',
                'Accept': '*/*',
                'User-Agent': 'WebChk0.0001'
        }
    for attempt in xrange(3): # retry up to three times
        try:
            r = requests.get(url, timeout=timeout, headers=header)
            return r
        except:
            pass
    # all three attempts failed: return a PseudoRequest so the caller records an error
    r = PseudoRequest()
    r.url = url
    r.status_code = 999
    r.reason = 'Server Unreachable'
    return r

def submit_data(addrs):
    # send the list of discovered URLs back to the queue server,
    # base64-encoded and prefixed with its length.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(queue_addr)

    data = b64encode(str(addrs))
    s.send('response ' + '%06d ' % (len(data)))
    verify = s.recv(1024)
    if verify == 'start':
        s.send(data)
    s.close()


def worker_t(href_get, head_mker, timeout):
    rcd = ErrorRecoder()
    while True:
        if not running:
            break

        mutex.acquire(1)
        proc_url = get_url()
        mutex.release()
        if proc_url == 'wait':
            sleep(2)
            print 'queue empty or server unreachable, retrying...'
            continue

        r = get_req(proc_url, timeout, head_mker(proc_url))
        if r.status_code != 200:
            rcd.save(proc_url, r.status_code, r.reason)
        elif r.text.find(u'<header class="ns">403:页面没有找到。</header>') != -1:
            # the site answers HTTP 200 with this marker for missing pages (a soft 404)
            rcd.save(proc_url, '403', 'Normal Bad Request')
        else:
            hrefs = href_get(r)
            mutex.acquire(1)
            submit_data(hrefs)
            mutex.release()


def href_get(r):
    def parse_href(href):
        x = urlp.urljoin(r.url, href)
        x = urlp.urlparse(x)
        x = list(x[:3]) + ['']*3 # keep scheme, host and path; drop params, query and fragment
        x = urlp.urlunparse(x)
        return x

    if not re.search(r'm\.sohu\.com', r.url): # only follow links that stay on the target site
        return []
    try:
        soup = BeautifulSoup(r.text, parseOnlyThese=SoupStrainer('a'))
    except:
        return []
    hrefs = [parse_href(l['href']) for l in soup if l.has_key('href')]
    #print 'links:',len(hrefs)
    return hrefs

def start_work(workers=2,
               qaddr=('127.0.0.1', 52000),
               dbargs=('localhost', 'root', '', 'webchk'),
               href_filter_func=href_get,
               header_set_func=lambda x: None,
               request_time_out=5):
    # globals
    global running, queue_addr
    running = True
    queue_addr = qaddr
    ER = ErrorRecoder
    ER.host, ER.user, ER.passwd, ER.db = dbargs

    # mk threads
    args = (href_filter_func, header_set_func, request_time_out)
    t = [ Thread(target=worker_t, args=args) for i in range(workers)]
    for i in t:
        i.start()

    # wait for the user to stop the crawl
    raw_input('Press ENTER to stop threads...\n')
    running = False
    print 'Stopping workers; they will exit after finishing their current request.'

if __name__ == '__main__':
    start_work(50)
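
The worker assumes that an errors table already exists in the webchk database; the original post does not show its schema. Below is a one-off setup sketch whose column names are taken from the INSERT in ErrorRecoder.save; the column types and the id column are my own assumptions.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# one-off setup: create the errors table that ErrorRecoder writes to.
# Only the column names come from the INSERT above; the types are assumed.

import MySQLdb

db = MySQLdb.connect(host='localhost', user='root', passwd='', db='webchk', charset='utf8')
cr = db.cursor()
cr.execute(
    "create table if not exists errors ("
    " id int auto_increment primary key,"
    " url varchar(1024),"
    " create_time varchar(32),"   # the worker stores an ISO-8601 string here
    " status_code varchar(8),"    # mixes ints (e.g. 999) and strings (e.g. '403')
    " reason varchar(255)"
    ")"
)
db.commit()
cr.close()
db.close()

To give the worker a custom header (User-Agent, cookies, ...), pass a header-building function, e.g. start_work(workers=10, header_set_func=lambda url: {'User-Agent': 'WebChk0.0001'}); returning None falls back to the default header in get_req.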
