Talk is cheap, so here is my code. Critiques welcome.
Code 1. Queue server.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# queue server.
import socket
import ast
from base64 import decodestring as b64decode
from Queue import Queue, Empty

visited = set()        # every URL ever queued, to avoid duplicates
prequeue = Queue()     # URLs waiting to be handed to a worker
def rqs_process(ss):
    rqs = ss.recv(1024).split()
    if rqs[0] == 'rqaddr':           # worker requests an address to crawl
        try:
            url = prequeue.get(timeout=1)
            ss.send(url)
        except Empty:
            ss.send('wait')          # queue empty: tell the worker to retry
    elif rqs[0] == 'response':       # worker submits discovered URLs
        dsize = int(rqs[1])
        ss.send('start')
        data = ''
        while len(data) < dsize:     # read until the announced payload size
            chunk = ss.recv(1024)
            if not chunk:            # connection closed early
                break
            data += chunk
        # payload is base64(repr(list-of-urls)); literal_eval is safer than eval
        addrs = ast.literal_eval(b64decode(data))
        for addr in addrs:
            if addr not in visited:
                visited.add(addr)
                prequeue.put(addr)
    print 'Queue size:', prequeue.qsize()
if __name__ == '__main__':
    # seed the queue with the crawl root
    root = 'http://m.sohu.com/'
    prequeue.put(root)
    visited.add(root)
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # allow fast restart
    s.bind(('127.0.0.1', 52000))
    s.listen(5)
    try:
        while True:                  # one request per connection
            ss, addr = s.accept()
            rqs_process(ss)
            ss.close()
    finally:
        s.close()
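The wire protocol between server and workers is deliberately tiny: a worker sends 'rqaddr' and receives either a URL or the literal string 'wait'; to hand results back it sends 'response <size>', waits for 'start', then streams a base64-encoded repr of its URL list. A minimal sketch for poking the server from an interactive session, assuming the server above is already listening on 127.0.0.1:52000:

# protocol smoke test, assuming the queue server above is running
import socket

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('127.0.0.1', 52000))
s.send('rqaddr')
print s.recv(1024)   # 'http://m.sohu.com/' on the first call, 'wait' when empty
s.close()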
Code 2. Worker. Workers can run in parallel across several machines, and you can plug in your own header-setup function and page-parsing function (see the usage sketch after the code).
#!/usr/bin/python
# -*- coding: utf-8 -*-
# worker.
from base64 import encodestring as b64encode
from time import sleep
import re
import socket
from threading import Thread, Lock
from datetime import datetime as dt
import urlparse as urlp

import requests
from BeautifulSoup import BeautifulSoup, SoupStrainer
import MySQLdb

running, queue_addr = None, None   # set by start_work()
mutex = Lock()                     # serializes traffic to the queue server
class PseudoRequest:
    # stand-in for a requests.Response when the server is unreachable
    status_code = None
    reason = None
    url = None
class ErrorRecorder:
    # logs failed requests to MySQL; connection settings are class
    # attributes, filled in by start_work()
    host = None
    user = None
    passwd = None
    db = None

    def __init__(self):
        # self.db holds the database *name* until the connection replaces it
        self.db = MySQLdb.connect(
            host=self.host, user=self.user,
            passwd=self.passwd, db=self.db, charset="utf8")
        self.cr = self.db.cursor()

    def save(self, url, status_code, reason):
        self.cr.execute(
            "insert into "
            "errors (url, create_time, status_code, reason) "
            "values (%s,%s,%s,%s)",
            (url, dt.now().isoformat(), status_code, reason)
        )
        self.db.commit()
        print 'RECORD:', url, status_code, reason

    def __del__(self):
        self.cr.close()
        self.db.close()
def get_url():
    # ask the queue server for the next URL; 'wait' means retry later
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect(queue_addr)
        s.send('rqaddr')
        data = s.recv(1024)
        s.close()
    except socket.error:
        data = 'wait'
    return data
def get_req(url, timeout=5, header=None):
    # GET with up to three retries; on total failure return a PseudoRequest
    if not header:
        header = {
            'Accept-Encoding': 'gzip, deflate, compress',
            'Accept': '*/*',
            'User-Agent': 'WebChk0.0001'
        }
    for attempt in xrange(3):
        try:
            return requests.get(url, timeout=timeout, headers=header)
        except requests.RequestException:
            pass
    # all attempts failed: fake a response object
    r = PseudoRequest()
    r.url = url
    r.status_code = 999
    r.reason = 'Server Unreachable'
    return r
def submit_data(addrs):
    # send discovered URLs back to the server as base64(repr(list))
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(queue_addr)
    data = b64encode(str(addrs))
    s.send('response %06d ' % len(data))   # announce payload size first
    verify = s.recv(1024)
    if verify == 'start':
        s.sendall(data)                    # payload may exceed one send()
    s.close()
def worker_t(href_get, head_mker, timeout):
    rcd = ErrorRecorder()
    while running:
        mutex.acquire(1)
        proc_url = get_url()
        mutex.release()
        if proc_url == 'wait':
            print 'queue empty or server busy. keep trying...'
            sleep(2)
            continue
        r = get_req(proc_url, timeout, head_mker(proc_url))
        if r.status_code != 200:
            rcd.save(proc_url, r.status_code, r.reason)
        elif r.text.find(u'<header class="ns">403:页面没有找到。</header>') > 0:
            # m.sohu.com answers 200 with this "page not found" marker,
            # so treat it as a soft 404
            rcd.save(proc_url, '403', 'Normal Bad Request')
        else:
            hrefs = href_get(r)
            mutex.acquire(1)
            submit_data(hrefs)
            mutex.release()
def href_get(r):
    # extract same-site links, normalized to scheme://netloc/path
    def parse_href(href):
        x = urlp.urljoin(r.url, href)      # resolve relative links
        x = urlp.urlparse(x)
        x = list(x[:3]) + [''] * 3         # drop params, query, fragment
        return urlp.urlunparse(x)
    if not re.search(r'm\.sohu\.com', r.url):
        return []                          # stay on the target site
    try:
        soup = BeautifulSoup(r.text, parseOnlyThese=SoupStrainer('a'))
    except Exception:
        return []
    return [parse_href(l['href']) for l in soup if l.has_key('href')]
def start_work(workers=2,
               qaddr=('127.0.0.1', 52000),
               dbargs=('localhost', 'root', '', 'webchk'),
               href_filter_func=href_get,
               header_set_func=lambda x: None,
               request_time_out=5):
    # publish settings through the module globals the threads read
    global running, queue_addr
    running = True
    queue_addr = qaddr
    ER = ErrorRecorder
    ER.host, ER.user, ER.passwd, ER.db = dbargs
    # spawn worker threads
    args = (href_filter_func, header_set_func, request_time_out)
    t = [Thread(target=worker_t, args=args) for i in range(workers)]
    for i in t:
        i.start()
    # block until the operator asks for a shutdown
    raw_input('Press ENTER to stop threads...\n')
    running = False
    for i in t:
        i.join()              # let each thread finish its current URL
    print 'Workers dead.'
if __name__ == '__main__':
start_work(50)
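Since header_set_func and href_filter_func are injected, each machine can customize them without touching worker_t. A minimal sketch of a custom header function; my_header and the 192.168.1.5 address are made up for illustration:

# hypothetical customization: 10 workers on a second machine, custom User-Agent
def my_header(url):
    # made-up helper; return None to fall back to get_req's defaults
    return {
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.4) WebChk/0.1',
    }

start_work(workers=10,
           qaddr=('192.168.1.5', 52000),   # queue server on another box
           header_set_func=my_header)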