Python学习笔记-简易抓取网页-1

主要是通过urllib2获取预先指定好的地址页面,通过BeautifulSoup来解析页面元素,找到a标签的href属性,并将相关的数据存入数据库,以方便后面取出继续抓取。整个抓取和解析也是基于多线程与队列来控制的。做得比较简单与粗糙,后续深入可以改进。

import CodeHelper
import urllib2
from bs4 import BeautifulSoup
import threading as thread
import Queue
import time
from DBCUtils import DBOperation

class Resource:
    """A record of the ``resource`` table (columns url, text, content, status).

    Each method builds a parameterized SQL statement from the instance's
    fields and runs it through ``DBOperation``.
    """

    def __init__(self, url, text, content, status):
        """Store the field values used by the query methods.

        Args:
            url: value for the ``url`` column; also the lookup key for updates.
            text: value for the ``text`` column.
            content: value for the ``content`` column.
            status: value for the ``status`` column.
        """
        self._url = url
        self._text = text
        self._content = content
        self._status = status

    def _describe(self):
        """Return the one-line log message printed before an insert."""
        # The original format string had three placeholders for four values,
        # which raised TypeError before the INSERT could run.
        return 'url: %s text: %s content: %s status: %s' % (
            self._url, self._text, self._content, self._status)

    def insert(self):
        """Insert this record unless a row with the same url already exists."""
        sql = 'select * from resource where url=%s'
        existing = DBOperation.readOne(sql, [self._url])
        if existing is not None:
            return
        sql = 'insert into resource(url,text,content,status) values(%s,%s,%s,%s)'
        print(self._describe())
        DBOperation.execute(sql, [self._url, self._text, self._content, self._status])

    def updateStatus(self):
        """Set the status column of the row(s) matching this url."""
        sql = 'update resource set status=%s where url=%s'
        DBOperation.execute(sql, [self._status, self._url])

    def updateContentAndStatus(self):
        """Set the content and status columns of the row(s) matching this url."""
        sql = 'update resource set content=%s,status=%s where url=%s'
        DBOperation.execute(sql, [self._content, self._status, self._url])

    def readListByStatus(self):
        """Return whatever DBOperation.readList yields for rows with this status."""
        sql = 'select * from resource where status=%s'
        return DBOperation.readList(sql, [self._status])

    def readList(self):
        """Return whatever DBOperation.readList yields for the whole table."""
        return DBOperation.readList('select * from resource')
        
class ResourceThread(thread.Thread):
    """Daemon worker that runs tasks from a shared queue until it is empty.

    The thread starts itself in the constructor. Each queue item is a
    ``(func, args)`` pair and is executed as ``func(args)``, i.e. the
    callable receives the args tuple as one positional argument.
    """

    def __init__(self, task_queue):
        """Bind the worker to ``task_queue`` and start it immediately.

        Args:
            task_queue: queue of ``(func, args)`` pairs to execute.
        """
        thread.Thread.__init__(self)
        self._task_queue = task_queue
        self.daemon = True
        self.start()

    def run(self):
        """Execute queued tasks until a non-blocking get finds the queue empty."""
        print('current thread name %s' % thread.current_thread().name)
        while True:
            try:
                task = self._task_queue.get(block = False)
            except Queue.Empty:
                # Nothing left to do: this is the only normal exit.
                break
            try:
                func, args = task
                func(args)
            except Exception as e:
                # A failing task must not kill the worker; report it and
                # keep draining the queue.
                print(str(e))
            finally:
                # Always balance the get() so the queue's unfinished-task
                # counter stays accurate even when the task raised.
                self._task_queue.task_done()
            
class ResourceManager:
    """Owns a task queue and the worker threads that consume it."""

    def __init__(self, taskNum = 10, threadNum = 2) :
        """Queue ``taskNum`` tasks, then start ``threadNum`` workers.

        Args:
            taskNum: number of ``do_task`` entries to enqueue.
            threadNum: number of ResourceThread workers to create.
        """
        self._task_queue = Queue.Queue()
        self._threads = []
        # Fill the queue before any worker exists: the workers are created
        # already running and read the queue without blocking.
        self.__init__task_queue__(taskNum)
        self.__init__thread_pool(threadNum)

    def __init__task_queue__(self, taskNum) :
        """Enqueue ``do_task`` once per index in ``range(taskNum)``."""
        for index in range(taskNum) :
            print('this is %s task' % index)
            self.add_task(do_task, index)

    def __init__thread_pool(self, threadNum) :
        """Create ``threadNum`` workers bound to the shared queue."""
        for index in range(threadNum) :
            print('threadNum %s' % index)
            self._threads.append(ResourceThread(self._task_queue))

    def add_task(self, func, *args) :
        """Put ``(func, args)`` on the queue; ``args`` is the tuple of extras."""
        task = (func, args)
        self._task_queue.put(task)

    def check_queue(self):
        """Return the queue's current ``qsize()``."""
        pending = self._task_queue.qsize()
        return pending

    def wait_for_complete(self) :
        """Join every worker thread that is still alive."""
        for worker in self._threads :
            if not worker.is_alive() :
                continue
            worker.join()
    
def do_task(args):
    """Fetch every status-0 resource and store the links found on its page.

    For each row returned for status 0, the page is downloaded, the first
    9999 characters of its ``<body>`` are saved with status 1, and every
    ``<a href=...>`` on the page is inserted as a new status-0 resource.

    Args:
        args: task argument supplied by the queue; only printed.
    """
    print('this task args %s' % args)
    resource = Resource(None, None, None, 0)
    data = resource.readListByStatus()
    # Wrap in a 1-tuple: formatting a multi-row tuple directly with '%s'
    # raises "not all arguments converted".
    print('read status 0 data is %s' % (data,))
    if data is None :
        return
    for item in data :
        # NOTE(review): assumes column index 1 of the row is the url - confirm
        # against the table definition.
        url = item[1]
        if url is None or url.find('http://') == -1 :
            continue
        response = urllib2.urlopen(url)
        try :
            content = response.read()
        finally :
            # Release the socket even if the read fails.
            response.close()
        html = BeautifulSoup(content)
        fetch_resource = Resource(url, None, str(html.find('body'))[0:9999], 1)
        fetch_resource.updateContentAndStatus()
        aLinks = html.find_all('a')
        print('aLinks %s' % aLinks)
        for aLink in aLinks :
            href = aLink.get('href')
            if href is None :
                # Anchors without an href have nothing to crawl; inserting
                # them would add a NULL-url row on every pass.
                continue
            a_text = CodeHelper.encodeContent(aLink.get_text())
            print('href %s text %s' % (href, a_text))
            subResource = Resource(href, a_text, '', 0)
            subResource.insert()
            
def execute():
    """Seed the resource table, run the crawl, and print the elapsed time.

    The seed urls are inserted with status 0, then a ResourceManager with
    20 tasks and 4 threads is created and waited on.
    """
    urls = ['http://www.kuwo.cn', 'http://www.1ting.com/', 'http://www.kugou.com/', 'http://y.qq.com/']
    for url in urls :
        # Resource takes (url, text, content, status); the original call
        # passed only three values and raised TypeError.
        resource = Resource(url, None, None, 0)
        resource.insert()

    start = time.time()
    resource_manager = ResourceManager(20, 4)
    resource_manager.wait_for_complete()
    end = time.time()
    print("cost all time: %s" % (end - start))

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    execute()

import MySQLdb

from DBUtils.PooledDB import PooledDB

class DBOperation(object):
    """Static helpers that run SQL on connections from one shared PooledDB pool."""

    # Created on the first getConnection() call and reused afterwards.
    __pool = None

    @staticmethod
    def getConnection():
        """Return a connection from the shared pool, creating the pool if needed."""
        if DBOperation.__pool is None :
            # Assign to the class attribute: a bare ``__pool = ...`` here would
            # only bind a local, so the pool would be rebuilt on every call.
            DBOperation.__pool = PooledDB(creator = MySQLdb,
                                          mincached = 1,
                                          maxcached = 20,
                                          host = '127.0.0.1',
                                          port = 3306,
                                          user = 'root',
                                          passwd = '123456',
                                          db = 'test',
                                          charset = 'utf8')

        return DBOperation.__pool.connection()

    @staticmethod
    def getCursor():
        """Return a cursor on a pooled connection.

        The connection itself is not returned, so the caller cannot commit
        or close it explicitly.
        """
        connection = DBOperation.getConnection()
        return connection.cursor()

    @staticmethod
    def _run(cursor, sql, parameter):
        """Execute ``sql`` on ``cursor``, passing ``parameter`` only when given."""
        if parameter is None :
            return cursor.execute(sql)
        return cursor.execute(sql, parameter)

    @staticmethod
    def execute(sql, parameter=None):
        """Run a statement and commit it.

        Args:
            sql: statement text, with driver placeholders if ``parameter`` is given.
            parameter: optional sequence of values for the placeholders.
        """
        dbConnection = DBOperation.getConnection()
        try :
            cursor = dbConnection.cursor()
            try :
                DBOperation._run(cursor, sql, parameter)
            finally :
                cursor.close()
            dbConnection.commit()
        finally :
            # Always hand the connection back, even when the statement fails.
            dbConnection.close()

    @staticmethod
    def readOne(sql, parameter=None):
        """Run a query and return its first row, or None when there is none.

        Args:
            sql: query text, with driver placeholders if ``parameter`` is given.
            parameter: optional sequence of values for the placeholders.
        """
        dbConnection = DBOperation.getConnection()
        try :
            # Fetch from the same cursor that executed the query; a fresh
            # cursor has no result set.
            cursor = dbConnection.cursor()
            try :
                DBOperation._run(cursor, sql, parameter)
                return cursor.fetchone()
            finally :
                cursor.close()
        finally :
            dbConnection.close()

    @staticmethod
    def readList(sql, parameter=None):
        """Run a query and return all rows, or None when there are none.

        Args:
            sql: query text, with driver placeholders if ``parameter`` is given.
            parameter: optional sequence of values for the placeholders.
        """
        dbConnection = DBOperation.getConnection()
        try :
            cursor = dbConnection.cursor()
            try :
                DBOperation._run(cursor, sql, parameter)
                rows = cursor.fetchall()
            finally :
                cursor.close()
        finally :
            dbConnection.close()
        # Callers test the result with ``is None``, so keep that contract
        # for an empty result set.
        return rows if rows else None

    @staticmethod
    def commitConnection(connection):
        """Commit ``connection`` if it is truthy."""
        if connection :
            connection.commit()

    @staticmethod
    def closeConnection(connection):
        """Close ``connection`` if it is truthy."""
        if connection :
            connection.close()



 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值