Today I merged a BFS crawler and the HTML text extraction into one program. The functionality is still limited. For the text-extraction part, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http:// URLs, and I have only tested it on the campus intranet, because the connection to the outside network is slow.
- There is one global URL queue and one URL set: the queue makes the BFS straightforward to implement, and the set keeps pages from being crawled twice. The flow is quite simple, and so is the idea behind it (see the sketch after this list).
- It is single-threaded for now, so it is fairly slow; later I plan to make it multithreaded, so that fetching pages, extracting URLs, and extracting the main text can run concurrently.
- The figure below comes from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/. Besides extracting the URLs in each page, I also extract the main text, so that Chinese word segmentation will be easier when I build an index later.
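To make the queue-plus-set idea concrete, here is a minimal sketch of the crawl loop it boils down to (fetch_page and extract_links are placeholders, not functions from the code below):

# Minimal sketch of the BFS crawl loop: a FIFO queue drives the traversal,
# a set remembers URLs that have already been seen.
# fetch_page() and extract_links() stand in for the real code further down.
import Queue

def bfs_crawl(seed_url, max_pages, fetch_page, extract_links):
    url_queue = Queue.Queue()
    url_queue.put(seed_url)
    seen = set([seed_url])
    fetched = 0
    while not url_queue.empty() and fetched < max_pages:
        url = url_queue.get()
        html = fetch_page(url)            # download the page
        for link in extract_links(html):  # pull out the URLs it contains
            if link not in seen:          # skip anything already visited
                seen.add(link)
                url_queue.put(link)
        fetched += 1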
Pasting the code here directly causes problems, probably because of the HTML tags inside it; please see http://www.fuxiang90.me/?p=728 instead.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Single-threaded version: crawl HTML pages, traverse the links up to a depth limit, then extract the main text; a single thread is inevitably a bit slow.
# Feel free to use this code, but please keep the following line.
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
socket.setdefaulttimeout(8)
g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1
# Takes a BeautifulSoup object and extracts the URLs contained in it.
def get_url_list(html):
    global g_url_set
    re_html = r'(http://(\w+\.)+\w+)'
    res = html.findAll('a')  # find every <a> tag
    for x in res:
        t = unicode(x)  # x is a BeautifulSoup tag object
        m = re.findall(re_html, t)
        if not m:  # findall returns an empty list when nothing matches
            continue
        for xx in m:
            str_url = xx[0]
            if str_url not in g_url_set:
                g_url_queue.put(str_url)
                g_url_set.add(str_url)  # add the whole URL, not its individual characters
#######################################################
def strip_tags(html):
    """
    Strip the HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
#######################################################
# Accepts either a URL or a local file name and extracts the main text from it.
def get_context(url):
    re_html = r'http[s]?://[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+'
    m = re.match(re_html, str(url))
    if m is None:
        # url is a local file name
        fp = open(unicode(url), 'r')
    else:
        fp = urllib2.urlopen(url)
    html = fp.read()
    soup = BeautifulSoup(html)
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()

#######################################################
def main_fun(deep):
    global g_url_set
    global g_url_queue
    if deep > max_deep:
        return
    count = 0
    while g_url_queue.empty() is not True:
        l_url = g_url_queue.get()
        print l_url
        # Catch timeouts: some pages cannot be reached.
        try:
            fp = urllib2.urlopen(l_url)
        except:
            continue
        html = fp.read()
        fwrite = open(str(count + 1), 'w')
        fwrite.write(html)
        fwrite.close()
        soup = BeautifulSoup(html)
        get_url_list(soup)
        get_context(count + 1)
        count += 1
        if count >= 100:
            return

# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)

if __name__ == "__main__":
    main_fun(1)
    time.sleep(10)
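To try the single-threaded version, just run the script: each fetched page is written to a file named 1, 2, ..., the text extracted from it goes to a matching file prefixed with 'u', and the crawl stops after 100 pages.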
Now I want a multithreaded version, so that downloading pages and parsing the HTML (extracting the text and the URLs) can run concurrently. After some simple changes, the code below more or less runs: the main additions are threading and lock-protected access to the global queues. Since I had never written multithreaded code before, I would appreciate suggestions from anyone passing by.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Feel free to use this code, but please keep the following line.
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
import threading
queue_lock = threading.RLock()
file_lock = threading.RLock()
socket.setdefaulttimeout(8)
g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')
g_file_queue = Queue.Queue()
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1
#######################################################
def strip_tags(html):
    """
    Strip the HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def get_context(soup, url):
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()
class get_page_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue
        count = 0
        while g_url_queue.empty() is not True:
            print self.t_name
            # Lock the shared URL queue while taking an item.
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # Catch timeouts: some pages cannot be reached.
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()
            # Prefix the file name with the thread name so downloader threads do not overwrite each other.
            fname = self.t_name + '_' + str(count + 1)
            fwrite = open(fname, 'w')
            fwrite.write(html)
            fwrite.close()
            file_lock.acquire()
            g_file_queue.put(fname)
            file_lock.release()
            count += 1
            if count >= 100:
                return  # a bare 'exit' would not actually stop the thread
class get_url_list_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        global queue_lock
        global file_lock
        while g_file_queue.empty() is not True:
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()
            fd = open(str(filename), 'r')
            html = fd.read()
            soup = BeautifulSoup(html)
            get_context(soup, filename)
            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')  # find every <a> tag
            for x in res:
                t = unicode(x)  # x is a BeautifulSoup tag object
                m = re.findall(re_html, t)
                if not m:  # findall returns an empty list when nothing matches
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)  # add the whole URL, not its individual characters
# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)
if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')
    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
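One refinement I am considering (just a sketch, not yet wired into the code above): Queue.Queue is already thread-safe, so the explicit locks around put()/get() are not strictly necessary; workers can simply block on get(), and the main thread can wait on join() instead of sleeping. The worker and handle names below are only illustrative.

# Sketch of a producer/consumer layout built on Queue's own locking
# (Queue.Queue.put()/get() are already thread-safe, so no extra RLock is needed).
# 'handle' stands for whatever per-item work a thread does; it is a placeholder.
import Queue
import threading

def worker(work_queue, handle):
    while True:
        item = work_queue.get()      # blocks until an item is available
        if item is None:             # a None sentinel tells the worker to stop
            work_queue.task_done()
            break
        handle(item)
        work_queue.task_done()       # mark the item as processed

def run_workers(items, handle, num_threads=4):
    work_queue = Queue.Queue()
    threads = []
    for _ in range(num_threads):
        t = threading.Thread(target=worker, args=(work_queue, handle))
        t.start()
        threads.append(t)
    for item in items:
        work_queue.put(item)
    work_queue.join()                # wait until every item is marked done
    for _ in threads:
        work_queue.put(None)         # one sentinel per worker
    for t in threads:
        t.join()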