Today I combined a BFS crawler with the HTML text extraction code into one program. The functionality is still fairly limited. For the body-text extraction itself, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http:// URLs, and it has only been tested on the intranet, because the connection to the external network is not very fast.
- There is one global URL queue and one global URL set: the queue makes it easy to implement BFS, and the set keeps pages from being crawled more than once. Both the flow and the underlying idea are quite simple (a minimal sketch of this loop appears right after this list).
- It is still single-threaded, so it should be fairly slow; later I will consider multithreading, so that fetching pages, extracting URLs, and extracting body text can run concurrently.
- The diagram below is taken from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/ . Besides extracting the URLs from each page, I also extract its body text, which will make Chinese word segmentation easier when I build an index later.
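Before the full script, here is a minimal sketch of that queue-plus-set BFS loop, assuming Python 2 with urllib2 and the standard Queue module; the function name `bfs_crawl`, the page cap, and the simplified regex are only illustrative, not part of the actual script below:

```python
# Minimal BFS crawl sketch (Python 2). Illustrative only -- the real script
# below uses BeautifulSoup, worker threads, and files on disk instead.
import re
import urllib2
import Queue

def bfs_crawl(seed, max_pages=100):
    url_queue = Queue.Queue()   # frontier: URLs waiting to be fetched, in BFS order
    url_seen = set([seed])      # every URL ever enqueued, so nothing is crawled twice
    url_queue.put(seed)
    link_re = re.compile(r'http://(?:\w+\.)+\w+[^\s"\'<>]*')
    fetched = 0
    while not url_queue.empty() and fetched < max_pages:
        url = url_queue.get()
        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            continue            # skip pages that time out or cannot be reached
        fetched += 1
        for link in link_re.findall(html):
            if link not in url_seen:
                url_seen.add(link)
                url_queue.put(link)
    return fetched
```

The queue gives breadth-first order (pages are fetched in the order their links were discovered), while the set guarantees each URL is enqueued at most once.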
```python
# encoding: utf-8
# use BeautifulSoup to extract the text of a|p|font tags
# Feel free to use this code, but please keep the following line:
# author : fuxiang, mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup  # for processing HTML
import urllib2
import re
import Queue
import socket
import time
import threading

queue_lock = threading.RLock()   # protects g_url_queue
file_lock = threading.RLock()    # protects g_file_queue
socket.setdefaulttimeout(8)      # give up on slow connections after 8 seconds

g_url_queue = Queue.Queue()      # BFS frontier of URLs to fetch
g_url_queue.put('http://www.bupt.edu.cn/')
g_file_queue = Queue.Queue()     # names of saved pages waiting to be parsed
g_url_set = set(['http://www.bupt.edu.cn/'])  # every URL already enqueued, to avoid duplicates
max_deep = 1                     # intended crawl depth limit (not used yet)


#######################################################
def strip_tags(html):
    """Strip HTML tags from a string.

    >>> str_text = strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)


def get_context(soup, filename):
    """Extract the text inside <a>, <p> and <font> tags and save it to a file."""
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(filename), 'w')
    for i in allfonttext:
        context = strip_tags(i.renderContents())
        fwrite.write(context)
    fwrite.close()


class get_page_thread(threading.Thread):
    """Take URLs from the global queue, fetch them and save the HTML to disk."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue
        count = 0
        while not g_url_queue.empty():
            print self.t_name
            # guard the shared queue with a lock
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # catch timeouts -- some pages cannot be reached
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()
            # prefix the file name with the thread name so that several
            # fetcher threads do not overwrite each other's files
            filename = '%s_%d' % (self.t_name, count + 1)
            fwrite = open(filename, 'w')
            fwrite.write(html)
            fwrite.close()
            file_lock.acquire()
            g_file_queue.put(filename)
            file_lock.release()
            count += 1
            if count >= 100:
                return


class get_url_list_thread(threading.Thread):
    """Take saved pages from the file queue, extract their text and enqueue new URLs."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        while not g_file_queue.empty():
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()
            fd = open(str(filename), 'r')
            html = fd.read()
            fd.close()
            soup = BeautifulSoup(html)
            get_context(soup, filename)
            # note: this pattern only matches the scheme and host part of a URL
            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')  # find all <a> tags
            for x in res:
                t = unicode(x)       # x is a BeautifulSoup tag object
                m = re.findall(re_html, t)
                if not m:
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)


# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)


if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')
    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
```