分享一個python爬蟲,用來檢測網站可用性

Talk is cheap, show you my code. 歡迎吐槽。


Code 1. Queue server.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# queue server. 

import ast
import socket
from base64 import decodestring as b64decode
from Queue import Queue, Empty

visited = set()
prequeue= Queue()

def rqs_process(ss):
    """Handle one worker request on the connected socket `ss`.

    Plain-text protocol:
      'rqaddr'          -> reply with the next queued URL, or 'wait' if the
                           queue is empty.
      'response <size>' -> reply 'start', then read <size> bytes of
                           base64-encoded data (repr of a URL list) and
                           enqueue the URLs not yet visited.
    """
    rqs = ss.recv(1024).split()
    if rqs[0] == 'rqaddr': # worker asks for an address to spider
        try:
            url = prequeue.get(timeout=1)
            ss.send(url)
        except Empty:
            # Queue drained: tell the worker to back off and retry.
            ss.send('wait')
    elif rqs[0] == 'response': # worker submits discovered links
        dsize = int(rqs[1])
        ss.send('start')
        # Collect chunks in a list and join once (the old `data += ...`
        # loop was quadratic); also stop if the peer closes early, which
        # previously spun forever.
        chunks = []
        received = 0
        while received < dsize:
            chunk = ss.recv(1024)
            if not chunk:
                break
            chunks.append(chunk)
            received += len(chunk)
        data = ''.join(chunks)
        # SECURITY FIX: the payload arrives over the network; eval() would
        # execute arbitrary code. literal_eval only parses Python literals.
        addrs = ast.literal_eval(b64decode(data))
        for addr in addrs:
            if addr not in visited:
                visited.add(addr)
                prequeue.put(addr)
    print('Queue size: %d' % prequeue.qsize())

if __name__ == '__main__':
    # init: seed the crawl frontier with the root URL (marked visited so
    # workers never re-submit it).
    root = 'http://m.sohu.com/'
    prequeue.put(root)
    visited.add(root)

    # Single-threaded TCP server on localhost: one request per connection.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('127.0.0.1', 52000))
    s.listen(5)

    while True:
        ss, addr = s.accept()
        rqs_process(ss)
        ss.close()
    s.close()  # NOTE(review): unreachable -- the accept loop never exits.

Code 2. worker. 可多機並行工作,可自己寫header設置函數和網頁分析函數。

#!/usr/bin/python
# -*- coding: utf-8 -*-
# worker.

from base64 import encodestring as b64encode
from time import sleep
import re
import socket
from threading import Thread, Lock
import requests
from datetime import datetime as dt
from BeautifulSoup import BeautifulSoup,SoupStrainer
import MySQLdb
import urlparse as urlp

running, queue_addr = None, None
mutex = Lock()

class PseudoRequest(object):
    """Minimal stand-in for a requests.Response when the real request never
    completed (see get_req()).  Only the attributes the workers read are
    provided; callers fill them in after construction.

    Inherits object so this is a new-style class under Python 2 (the
    original was an old-style class).
    """
    status_code = None  # HTTP-like status code; 999 marks "unreachable"
    reason = None       # human-readable failure description
    url = None          # the URL that was attempted

class ErrorRecoder():
    """Records failed URLs into a MySQL `errors` table.

    Connection parameters are injected by assigning the class attributes
    (host/user/passwd/db) before instantiation -- see start_work().
    NOTE(review): the name is a misspelling of "ErrorRecorder"; kept for
    backward compatibility with callers.
    """
    host = None    # MySQL server host
    user = None    # MySQL user name
    passwd = None  # MySQL password
    db = None      # database name; rebound to the live connection in __init__
    def __init__(self):
        # NOTE: self.db is deliberately shadowed: it starts as the database
        # *name* (class attribute) and becomes the connection object here.
        self.db = MySQLdb.connect(
                host=self.host, user=self.user,
                passwd=self.passwd, db=self.db, charset="utf8")
        self.cr = self.db.cursor()

    def save(self, url, status_code, reason):
        """Insert one error row (parameterized query) and commit at once."""
        self.cr.execute(
           "insert into "
           "errors (url, create_time, status_code, reason) "
           "values (%s,%s,%s,%s)",
           (url, dt.now().isoformat(), status_code, reason)
        )
        self.db.commit()
        print 'RECORD:', url, status_code, reason

    def __del__(self):
        # Best-effort cleanup; __del__ timing is interpreter-dependent.
        self.cr.close()
        self.db.close()

def get_url():
    """Ask the queue server for the next URL to crawl.

    Returns the URL string, or 'wait' when the server is unreachable or has
    nothing to hand out (callers sleep and retry on 'wait').
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect(queue_addr)
        s.send('rqaddr')
        data = s.recv(1024)
    except socket.error:
        # Connection refused/reset/timed out: treat as "nothing to do yet".
        # (Was a bare except, which also swallowed KeyboardInterrupt etc.)
        data = 'wait'
    finally:
        # BUG FIX: the socket was leaked whenever connect/send/recv raised.
        s.close()
    return data

def get_req(url, timeout=5, header=None):
    """GET `url`, retrying up to 3 times.

    url     -- absolute URL to fetch
    timeout -- per-attempt timeout in seconds
    header  -- optional headers dict; a minimal default is used when falsy

    Returns the requests.Response on success, or a PseudoRequest with
    status_code 999 when every attempt fails, so callers can treat both
    results uniformly.
    """
    if not header:
        header = {
                'Accept-Encoding': 'gzip, deflate, compress',
                'Accept': '*/*',
                'User-Agent': 'WebChk0.0001'
        }
    for attempt in xrange(3):
        try:
            return requests.get(url, timeout=timeout, headers=header)
        except requests.RequestException:
            # Timeout / connection error: retry.  (Was a bare except that
            # also hid programming errors such as NameError.)
            pass
    # All attempts failed: synthesize an "unreachable server" response.
    r = PseudoRequest()
    r.url = url
    r.status_code = 999
    r.reason = 'Server Unreachable'
    return r

def submit_data(addrs):
    """Send the list of discovered URLs back to the queue server.

    Wire format: 'response <6-digit payload size> ', wait for the server's
    'start' acknowledgement, then send the base64-encoded repr of `addrs`.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect(queue_addr)
        data = b64encode(str(addrs))
        # NOTE(review): '%06d' silently widens past six digits for payloads
        # >= 1 MB; the server splits on whitespace, so this still parses.
        s.send('response ' + '%06d ' % (len(data)))
        verify = s.recv(1024)
        if verify == 'start':
            s.send(data)
    finally:
        # BUG FIX: the socket was leaked if connect/send/recv raised.
        s.close()


def worker_t(href_get, head_mker, timeout):
    """Worker thread main loop: fetch a URL from the queue server, check it,
    record failures to MySQL, and submit newly found links.

    href_get  -- callable(response) -> list of absolute URLs to enqueue
                 (NOTE(review): this parameter shadows the module-level
                 href_get() function by design -- it is the injected filter)
    head_mker -- callable(url) -> headers dict, or None for defaults
    timeout   -- per-request timeout in seconds

    Runs until the module-level `running` flag is set to False.
    """
    rcd = ErrorRecoder()
    while True:
        if running == False:
            break

        # Only one thread talks to the queue server at a time.
        mutex.acquire(1)
        proc_url = get_url()
        mutex.release()
        if proc_url == 'wait':
            sleep(2)
            print 'server problem. keep trying...'
            continue

        r = get_req(proc_url, timeout, head_mker(proc_url))
        if r.status_code != 200:
            rcd.save(proc_url, r.status_code, r.reason)
        elif r.text.find(u'<header class="ns">403:页面没有找到。</header>') > 0:
            # The site returns HTTP 200 with an in-page error banner;
            # treat it as a soft 403 and record it.
            rcd.save(proc_url, '403', 'Normal Bad Request')
        else:
            hrefs = href_get(r)
            mutex.acquire(1)
            submit_data(hrefs)
            mutex.release()


def href_get(r):
    """Extract and normalize all <a href> links from response `r`.

    Only pages on m.sohu.com are parsed.  Each link is resolved against
    r.url and stripped of params/query/fragment so the server's visited-set
    de-duplicates cleanly.  Returns a list of URL strings (possibly empty).
    """
    def parse_href(href):
        # Resolve relative links, then keep only scheme/netloc/path.
        x = urlp.urljoin(r.url, href)
        x = urlp.urlparse(x)
        x = list(x[:3]) + ['']*3
        x = urlp.urlunparse(x)
        return x

    # Raw string: the pattern previously relied on '\.' surviving as a
    # regular string escape, which is fragile.
    if not re.search(r'm\.sohu\.com', r.url):
        return []
    try:
        # SoupStrainer('a'): parse only anchor tags, skipping the rest.
        soup = BeautifulSoup(r.text, parseOnlyThese=SoupStrainer('a'))
    except Exception:
        # Malformed HTML: skip link extraction.  (Was a bare except.)
        return []
    hrefs = [ parse_href(l['href']) for l in soup if l.has_key('href')]
    return hrefs

def start_work(workers = 2,
               qaddr = ('127.0.0.1', 52000),
               dbargs = ('localhost','root','','webchk'),
               href_fliter_func = href_get,
               header_set_func = lambda x:None,
               request_time_out = 5):
    # globals
    global running, queue_addr
    running = True
    queue_addr = qaddr
    ER = ErrorRecoder
    ER.host, ER.user, ER.passwd, ER.db = dbargs

    # mk threads
    args = (href_fliter_func, header_set_func, request_time_out)
    t = [ Thread(target=worker_t, args=args) for i in range(workers)]
    for i in t:
        i.start()

    # waitting for halt
    raw_input('Press ENTER to stop threads...\n')
    running = False
    print 'Workers dead.'

if __name__ == '__main__':
    # 50 worker threads against a single queue server; tune per target site.
    start_work(50)

发布了57 篇原创文章 · 获赞 13 · 访问量 7万+
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览