一、引言
在web爬取领域,hub自动识别是一个比较重要的研究内容,通过自动识别的hub,可以增加爬虫的种子源列表,进而提高数据的抓取量和覆盖度。业界已有一些基于传统机器学习方法的hub识别算法,但传统识别算法需要有丰富的特征工程经验才能建立一个比较稳定而可靠的模型。此外,web页面的结构千变万化,提取稳定而可靠的特征是相当困难的。
相反，基于深度学习的方法可以摆脱对特征工程的束缚，能够自动提取样本中的特征，外加其强大的学习和表示能力，在Hub识别方面一定会比传统方法有效得多。
二、训练样本生成
web页面是一个半结构化的数据，为了生成训练数据，关键的一步便是如何将半结构化的web页面转换为一个向量。如果将一个web页面的body体中的节点递归的组织为一棵树，它会有如下的结构。
如今的网页源码基本都是div+css的搭配模式。同时,如果你仔细观察会发现,div的组织一般是从上到下或者从左到右,非常遵循我们人眼的视觉感知顺序。下面要引入的样本生成算法将按照这种视觉感知顺序处理网页中的每个节点,并最终将其转换为一个一维向量。
PageToVec算法
输入: url, html, domain
输出: one-dim vector
- 利用lxml等开源类库构建节点树.
- 采用深度优先遍历，递归的对每个节点做处理: 针对不同节点，在向量尾部填充不同的标识.
- 向量映射,将向量映射为仅包含-1,0,1三个数字的向量.
- 视觉增强,对p、h1、video等有视觉冲击感的节点,在对应的向量位置上按照指定规则进行扩充.
- 向量正规化,对于超出长度的向量做采样截断,对于长度不足的向量采用首部或尾部填充.
下面是算法PageToVec中的page_to_vec.py文件。
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import time, datetime
from time import sleep
import sys
import os
import math
import urllib
import random
import logging
import copy
from lxml import etree
from facility import *
from ac_automation import *
reload(sys)
sys.setdefaultencoding('utf-8')
class PageToVec(object):
def __init__(self):
    """Initialize default limits, tag lists and the keyword AC automation."""
    super(PageToVec, self).__init__()
    # per-page identity
    self.url = ""
    self.host = ""
    self.domain = ""
    # output vector and its maximum length
    self.vector = []
    self.vsize = 512
    # NOTE(review): original comment calls this the url length *limit*
    # although the name says "min" -- confirm intended semantics
    self.min_url_len = 256
    # headline tags that are scored
    self.h_tags = ['h1', 'h2']
    # tags that are never recursed into
    self.skip_tags = ["footer","noscript","nav","input","select","option"]
    self.kw_dict = {}
    # Aho-Corasick automation preloaded with the keyword file
    self.ac_automation = ac_automation()
    self.ac_automation.parse("./key_word.txt")
    # paragraph threshold: minimum length to count as a real paragraph
    self.min_para_len = 16
    # average characters rendered per paragraph line
    self.ave_char_num_per_line = 32
    # minimum headline length
    self.min_h_len = 5
    # minimum anchor text length
    self.min_anchor_len = 5
    # running counters for <p> and <a> tags
    self.p_num = 0
    self.a_num = 0
def erase(self):
self.vector = []
self.p_num = 0
self.a_num = 0
self.domain = ""
self.host = ""
def parse(self, url, content, domain):
    """
    Parse one web page into the (un-normalized) tag vector.
    param: url     page url; the host part is extracted from it
    param: content iterable of html source lines
    param: domain  site domain
    return: self.vector on success, [] on any failure
    """
    try:
        # Reset per-page state first: without this, calling parse()
        # repeatedly on the same instance keeps appending tokens from
        # previous pages into self.vector.
        self.erase()
        self.url = url
        self.host = url.split('/')[2]
        self.domain = domain
        self.clearNotes(content)
        self.clearStyle(content)
        # drop the marker lines left behind by the cleaning steps above
        lst = [line for line in content
               if line not in ("*\n", "#\n", "~\n")]
        if not lst:
            return []
        html = etree.HTML("".join(lst))
        if html is None:
            return []
        tree = etree.ElementTree(html)
        node_list = tree.xpath('/html/body')
        if node_list:
            self.fill(node_list[0])
            self.normalize()
            return self.vector
        self.vector = []
        return self.vector
    except Exception as e:
        # NOTE(review): `logger` is presumably provided by
        # `from facility import *` -- verify it is defined there
        logger.info("error: %s" % str(e))
        self.erase()
        return self.vector
"""
valid paragraph length > 16
"""
def is_valid_para(self, para):
if para == None or para == "":
return False
ln = len(para)
if ln >= self.min_para_len:
return True
else:
return False
"""
fill vector by iterate node tree
param: tree root
param: recuresive depth
"""
def fill(self, root):
for node in root.getchildren():
if self.skip(node):
continue
childs = node.getchildren()
# a tag process
if node.tag == "a":
self.tag_a(node)
continue
# h tag process
if node.tag in self.h_tags:
h = self.tag_h(node)
if h and len(h) >= self.min_h_len:
self.vector.append("h-" + str(len(h)))
else:
self.vector.append("0")
continue
# img tag process
if node is not None and node.tag == "img" and ".jpg" in node.get("src"):
self.tag_img(node)
continue
if node.tag == "script":
self.tag_script(node)
continue
if node.tag == "iframe":
self.tag_iframe(node)
continue
if node.tag == "video":
self.tag_video(node)
continue
if node.tag == "embed":
self.tag_embed(node)
continue
if node.tag == "audio":
self.tag_audio(node)
continue
# br tag
if node.tag == "br":
self.vector.append("p-0")
continue
# paragragh process
if node.tag == "p" or (not childs or len(childs) == 0):
level = 0
para = self.tag_p(node, level)
if self.is_valid_para(para):
self.vector.append("p-" + str(len(para)))
else:
self.vector.append("0")
continue
self.fill(node)
"""
normalize vector so that all of the element bounds in [0,1,-1]
h: 1
p: 1
a: -1
others: 0
"""
def normalize(self):
ln = len(self.vector)
self.p_num = 0
self.a_num = 0
if ln <= 0:
return
self.vector[ln - 1] = 0
# phase one: map
for i in range(ln-2, -1, -1):
c = self.vector[i][0]
if c == '0':
self.vector[i] = 0
continue
if c == '-':
self.a_num += 1
self.vector[i] = -1
continue
if c == 'h':
self.vector[i] = 'h'
continue
if c == 'i':
self.vector[i] = 'i'
continue
if c == 'm':
self.vector[i] = 'm'
continue
ac = 0
if i < ln-1:
ac = self.vector[i+1]
bc = ""
line_num = int(self.vector[i].split('-')[1]) / self.ave_char_num_per_line
if i > 0:
bc = self.vector[i-1][0]
if c == 'p' and (ac >= 1 or bc in ['p','i','h']):
self.p_num += 1
self.vector[i] = line_num
elif c == 'p':