Today I merged a BFS crawler and the HTML text extraction into one program. The functionality is still limited. For the text-extraction part, see http://www.fuxiang90.me/2012/02/%E6%8A%BD%E5%8F%96html-%E6%AD%A3%E6%96%87/
- For now it only crawls http:// URLs, and I have only tested it on the campus intranet, because the connection to the outside network is slow.
- There is one global URL queue and one URL set: the queue makes the BFS straightforward to implement, and the set keeps pages from being crawled twice. The flow is quite simple, and so is the idea behind it (see the sketch after this list).
- It is single-threaded for now, so it is fairly slow; later I plan to make it multithreaded, so that fetching pages, extracting URLs, and extracting the main text can run concurrently.
- The figure below comes from https://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/. Besides extracting the URLs in each page, I also extract the main text, so that Chinese word segmentation will be easier when I build an index later.
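To make the queue-plus-set idea concrete, here is a minimal sketch of the crawl loop it boils down to (fetch_page and extract_links are placeholders, not functions from the code below):

# Minimal sketch of the BFS crawl loop: a FIFO queue drives the traversal,
# a set remembers URLs that have already been seen.
# fetch_page() and extract_links() stand in for the real code further down.
import Queue

def bfs_crawl(seed_url, max_pages, fetch_page, extract_links):
    url_queue = Queue.Queue()
    url_queue.put(seed_url)
    seen = set([seed_url])
    fetched = 0
    while not url_queue.empty() and fetched < max_pages:
        url = url_queue.get()
        html = fetch_page(url)            # download the page
        for link in extract_links(html):  # pull out the URLs it contains
            if link not in seen:          # skip anything already visited
                seen.add(link)
                url_queue.put(link)
        fetched += 1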
Pasting the code here directly causes problems, probably because of the HTML tags inside it; please see http://www.fuxiang90.me/?p=728 instead.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Single-threaded version: crawl HTML pages, traverse the links up to a depth limit, then extract the main text; a single thread is inevitably a bit slow.
# Feel free to use this code, but please keep the following line.
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
socket.setdefaulttimeout(8)
g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1
# Takes a BeautifulSoup object and extracts the URLs contained in it.
def get_url_list(html):
    global g_url_set
    re_html = r'(http://(\w+\.)+\w+)'
    res = html.findAll('a')  # find every <a> tag
    for x in res:
        t = unicode(x)  # x is a BeautifulSoup tag object
        m = re.findall(re_html, t)
        if not m:  # findall returns an empty list when nothing matches
            continue
        for xx in m:
            str_url = xx[0]
            if str_url not in g_url_set:
                g_url_queue.put(str_url)
                g_url_set.add(str_url)  # add the whole URL, not its individual characters
#######################################################
def strip_tags(html):
    """
    Strip the HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
#######################################################
# Accepts either a URL or a local file name and extracts the main text from it.
def get_context(url):
    re_html = r'http[s]?://[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+'
    m = re.match(re_html, str(url))
    if m is None:
        # url is a local file name
        fp = open(unicode(url), 'r')
    else:
        fp = urllib2.urlopen(url)
    html = fp.read()
    soup = BeautifulSoup(html)
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()

#######################################################
def main_fun(deep):
    global g_url_set
    global g_url_queue
    if deep > max_deep:
        return
    count = 0
    while g_url_queue.empty() is not True:
        l_url = g_url_queue.get()
        print l_url
        # Catch timeouts: some pages cannot be reached.
        try:
            fp = urllib2.urlopen(l_url)
        except:
            continue
        html = fp.read()
        fwrite = open(str(count + 1), 'w')
        fwrite.write(html)
        fwrite.close()
        soup = BeautifulSoup(html)
        get_url_list(soup)
        get_context(count + 1)
        count += 1
        if count >= 100:
            return

# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)

if __name__ == "__main__":
    main_fun(1)
    time.sleep(10)
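To try the single-threaded version, just run the script: each fetched page is written to a file named 1, 2, ..., the text extracted from it goes to a matching file prefixed with 'u', and the crawl stops after 100 pages.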
Now I want a multithreaded version, so that downloading pages and parsing the HTML (extracting the text and the URLs) can run concurrently. After some simple changes, the code below more or less runs: the main additions are threading and lock-protected access to the global queues. Since I had never written multithreaded code before, I would appreciate suggestions from anyone passing by.
# encoding:utf-8
# use BeautifulSoup to get font|p context
# Feel free to use this code, but please keep the following line.
# author : fuxiang ,mail: fuxiang90@gmail.com
from BeautifulSoup import BeautifulSoup # For processing HTML
import urllib2
import os
import sys
import re
import Queue
import socket
import time
import threading
queue_lock = threading.RLock()
file_lock = threading.RLock()
socket.setdefaulttimeout(8)
g_url_queue = Queue.Queue()
g_url_queue.put('http://www.bupt.edu.cn/')
g_file_queue = Queue.Queue()
tt = ['http://www.bupt.edu.cn/']
g_url_set = set(tt)
max_deep = 1
#######################################################
def strip_tags(html):
    """
    Strip the HTML tags from a string.
    >>> str_text=strip_tags("<font color=red>hello</font>")
    >>> print str_text
    hello
    """
    from HTMLParser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def get_context(soup, url):
    allfonttext = soup.findAll(['a', 'p', 'font'])
    if len(allfonttext) <= 0:
        print 'not found text'
    fwrite = open('u' + str(url), 'w')
    for i in allfonttext:
        t = i.renderContents()
        context = strip_tags(t)
        fwrite.write(context)
    fwrite.close()
class get_page_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_url_queue
        global g_file_queue
        count = 0
        while g_url_queue.empty() is not True:
            print self.t_name
            # Lock the shared URL queue while taking an item.
            queue_lock.acquire()
            l_url = g_url_queue.get()
            queue_lock.release()
            print l_url
            # Catch timeouts: some pages cannot be reached.
            try:
                fp = urllib2.urlopen(l_url)
            except:
                continue
            html = fp.read()
            # Prefix the file name with the thread name so downloader threads do not overwrite each other.
            fname = self.t_name + '_' + str(count + 1)
            fwrite = open(fname, 'w')
            fwrite.write(html)
            fwrite.close()
            file_lock.acquire()
            g_file_queue.put(fname)
            file_lock.release()
            count += 1
            if count >= 100:
                return  # a bare 'exit' would not actually stop the thread
class get_url_list_thread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.t_name = name

    def run(self):
        global g_url_set
        global g_file_queue
        global queue_lock
        global file_lock
        while g_file_queue.empty() is not True:
            file_lock.acquire()
            filename = g_file_queue.get()
            file_lock.release()
            fd = open(str(filename), 'r')
            html = fd.read()
            soup = BeautifulSoup(html)
            get_context(soup, filename)
            re_html = r'(http://(\w+\.)+\w+)'
            res = soup.findAll('a')  # find every <a> tag
            for x in res:
                t = unicode(x)  # x is a BeautifulSoup tag object
                m = re.findall(re_html, t)
                if not m:  # findall returns an empty list when nothing matches
                    continue
                for xx in m:
                    str_url = xx[0]
                    if str_url not in g_url_set:
                        queue_lock.acquire()
                        g_url_queue.put(str_url)
                        queue_lock.release()
                        g_url_set.add(str_url)  # add the whole URL, not its individual characters
# incomplete
def get_html_page(url):
    furl = urllib2.urlopen(url)
    html = furl.read()
    soup = BeautifulSoup(html)
if __name__ == "__main__":
    thread1 = get_page_thread('a')
    thread2 = get_url_list_thread('b')
    thread3 = get_page_thread('c')
    thread4 = get_page_thread('d')
    thread1.start()
    time.sleep(20)
    thread2.start()
    time.sleep(20)
    thread3.start()
    thread4.start()
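One refinement I am considering (just a sketch, not yet wired into the code above): Queue.Queue is already thread-safe, so the explicit locks around put()/get() are not strictly necessary; workers can simply block on get(), and the main thread can wait on join() instead of sleeping. The worker and handle names below are only illustrative.

# Sketch of a producer/consumer layout built on Queue's own locking
# (Queue.Queue.put()/get() are already thread-safe, so no extra RLock is needed).
# 'handle' stands for whatever per-item work a thread does; it is a placeholder.
import Queue
import threading

def worker(work_queue, handle):
    while True:
        item = work_queue.get()      # blocks until an item is available
        if item is None:             # a None sentinel tells the worker to stop
            work_queue.task_done()
            break
        handle(item)
        work_queue.task_done()       # mark the item as processed

def run_workers(items, handle, num_threads=4):
    work_queue = Queue.Queue()
    threads = []
    for _ in range(num_threads):
        t = threading.Thread(target=worker, args=(work_queue, handle))
        t.start()
        threads.append(t)
    for item in items:
        work_queue.put(item)
    work_queue.join()                # wait until every item is marked done
    for _ in threads:
        work_queue.put(None)         # one sentinel per worker
    for t in threads:
        t.join()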