I rewrote my earlier crawler in Python, fetching with multiple threads and a queue in the classic producer-consumer pattern. The code is split into MVC-style layers: spider.py drives the producer and consumer threads, HttpClient.py fetches pages, pager.py parses them, and Dao.py persists the results to MySQL.
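The core of the pattern, independent of the crawler specifics, is a shared Queue.Queue: the producer put()s work items, consumers get() them and call task_done(), and join() blocks the main thread until every item has been processed. A minimal sketch of that skeleton:

# -*- coding: utf-8 -*-
import Queue
import threading

q = Queue.Queue(10)              # bounded: put() blocks once 10 items are waiting

def producer():
    for i in xrange(100):
        q.put(i)

def consumer():
    while 1:
        item = q.get()
        try:
            print "processing %d" % item
        finally:
            q.task_done()        # pair every get() with a task_done()

threading.Thread(target=producer).start()
for _ in xrange(4):
    t = threading.Thread(target=consumer)
    t.setDaemon(True)            # daemon threads die when the main thread exits
    t.start()
q.join()                         # returns once all 100 items are marked done

The full crawler below is this skeleton plus real work in the consumer.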
spider.py
# -*- coding: utf-8 -*-
'''
spider.py
Created on 2013-12-21
http://www.cn360cn.com/news.aspx?pageno=2
@author: Administrator
'''
import Pager
import Queue
import time
import threading

# Bounded queue: the producer blocks once 10 URLs are waiting,
# so it never runs far ahead of the consumers.
task_queue = Queue.Queue(10)

class ProducerThread(threading.Thread):
    def __init__(self, threadname):
        threading.Thread.__init__(self)
        self.name = threadname

    def run(self):
        # Enqueue one listing-page URL at a time.
        for page in xrange(1, 50000):
            url = 'http://www.cn360cn.com/news.aspx?pageno=%d' % page
            task_queue.put(url)
            print "put %s" % url

class ConsumerThread(threading.Thread):
    def __init__(self, threadname):
        threading.Thread.__init__(self)
        self.name = threadname

    def run(self):
        # One Pager (and therefore one DB connection) per consumer thread,
        # instead of a new one for every URL.
        pager = Pager.Pager()
        while 1:
            url = task_queue.get()
            print "%s get %s" % (self.name, url)
            try:
                pager.get_html(url)
            except Exception as e:
                print "exception occurred :", e
            finally:
                # Always mark the task done so task_queue.join() can return.
                task_queue.task_done()
            # throttle the crawl speed
            time.sleep(0.5)

print "start crawling ...."
producer = ProducerThread("crawler producer")
producer.setDaemon(True)
producer.start()

thread_num = 60
threads = []
for num in xrange(thread_num):
    print "crawler consumer : %d" % num
    threads.append(ConsumerThread("crawler consumer : %d" % num))
for num in xrange(thread_num):
    threads[num].setDaemon(True)
    threads[num].start()

# Exit the main program only after every queued task has been processed.
task_queue.join()
print "finished ...."
HttpClient.py
# -*- coding: utf-8 -*-
import urllib
import urllib2

class HttpClient:
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    proxy = ''

    def __init__(self):
        # Global socket timeout so a stuck request cannot hang a worker forever.
        urllib2.socket.setdefaulttimeout(8)

    def set_ua(self, user_agent):
        self.user_agent = user_agent

    def set_proxy(self, proxy):
        self.proxy = proxy

    def get(self, url, data=None):
        # Referer and Host are hardcoded for the target site.
        headers = {
            'User-Agent': self.user_agent,
            'Referer': 'http://www.cn360cn.com/',
            'Host': 'www.cn360cn.com'
        }
        # Pass None when there is no form data; a non-None body
        # (even an empty string) would turn the request into a POST.
        body = urllib.urlencode(data) if data else None
        req = urllib2.Request(url, body, headers)
        response = urllib2.urlopen(req)
        return response.read()

if __name__ == '__main__':
    c = HttpClient()
    print c.get("http://blog.csdn.net/pleasecallmewhy/article/details/8923067")
pager.py
# -*- coding: utf-8 -*-
'''
Created on 2013-12-21
@author: Administrator
'''
import HttpClient
import re
import Dao

class Pager:
    def __init__(self):
        self.client = HttpClient.HttpClient()
        self.dao = Dao.Mysql()

    def get_html(self, url):
        html = self.client.get(url)
        # The site serves gb2312; re-encode to utf-8 before parsing.
        html = html.decode('gb2312', 'ignore').encode('utf-8')
        self.parse_html(html)

    def parse_html(self, html):
        # Capture name, phone number, and address from each listing entry.
        page_pattern = re.compile(r'<li>\s*<a.*?>(.*?)<\/a>\s*<div\s+class=tel\s*>\s*电话:(.*?)地址:(.*?)<\/div>\s*<\/li>', re.I | re.M)
        items = page_pattern.findall(html)
        for tag, number, address in items:
            tag = tag.strip()
            number = number.strip().replace("-", "")
            address = address.strip()
            self.dao.insert({'tag': tag, 'number': number, 'address': address})

if __name__ == '__main__':
    page = Pager()
    url = 'http://www.cn360cn.com/news.aspx?pageno=2'
    page.get_html(url)
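For reference, the pattern in parse_html expects listing entries shaped roughly like the fragment below. The HTML here is hypothetical, reconstructed from the regex itself rather than copied from cn360cn.com:

# -*- coding: utf-8 -*-
# Hypothetical listing fragment, reconstructed from the regex in parse_html.
import re

sample = '''<li>
<a href="/detail/1.html">某某公司</a>
<div class=tel>电话:0571-12345678 地址:杭州市某某路1号</div>
</li>'''

pattern = re.compile(r'<li>\s*<a.*?>(.*?)<\/a>\s*<div\s+class=tel\s*>\s*电话:(.*?)地址:(.*?)<\/div>\s*<\/li>', re.I | re.M)
for tag, number, address in pattern.findall(sample):
    print tag.strip(), number.strip().replace("-", ""), address.strip()

The output is the company name, the phone number with hyphens stripped, and the address, which is exactly what gets handed to Dao.insert.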
Dao.py
'''
Created on 2013-7-20
@author: Administrator
'''
import MySQLdb
import config

class Mysql:
    table = 'cn360'

    def __init__(self):
        self.conn = MySQLdb.connect(host=config.conf['host'],
                                    user=config.conf['user'],
                                    passwd=config.conf['pwd'],
                                    db=config.conf['db'],
                                    charset=config.conf['charset'])
        self.cursor = self.conn.cursor()

    def insert(self, data):
        if not data:
            return False
        sql = "insert ignore into " + self.table + " set "
        pairs = [k + "='" + MySQLdb.escape_string(v) + "'" for k, v in data.items()]
        sql += ','.join(pairs)
        return self.execute(sql)

    def execute(self, sql):
        if not sql:
            return False
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            return False
        return True

    def get_rows(self, sql):
        if not sql:
            return False
        result = self.execute(sql)
        if result:
            return self.cursor.fetchall()
        return False

    def update(self, source, filter_array=None):
        if not source:
            return False
        sql = "update " + self.table + " set "
        pairs = [k + "='" + MySQLdb.escape_string(v) + "'" for k, v in source.items()]
        sql += ','.join(pairs)
        if filter_array:
            # WHERE conditions must be joined with "and", not commas.
            conds = [k + "='" + MySQLdb.escape_string(v) + "'" for k, v in filter_array.items()]
            sql += " where " + ' and '.join(conds)
        return self.execute(sql)

    def delete(self, filter_array=None):
        if not filter_array:
            return False
        sql = "delete from " + self.table
        conds = [k + "='" + MySQLdb.escape_string(v) + "'" for k, v in filter_array.items()]
        sql += " where " + ' and '.join(conds)
        return self.execute(sql)

    def destroy(self):
        # Close the cursor and connection instead of just dropping the references.
        self.cursor.close()
        self.conn.close()
        self.cursor = None
        self.conn = None
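Dao.py imports a config module that is not shown above. From the keys read in MySQLdb.connect it must look roughly like the sketch below; every value is a placeholder, not a real credential:

config.py
# Minimal sketch inferred from Dao.py's use of config.conf; all values are placeholders.
conf = {
    'host': '127.0.0.1',
    'user': 'root',
    'pwd': 'your_password',
    'db': 'spider',
    'charset': 'utf8'
}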