html自动采集seo,SEO之泛采集——HTML正文抽取算法

很多SEO软件有泛采集功能,只需要指定关键字,就自动抓取相关文章。这种抓取技术,需要用到HTML正文抽取算法,这里分享根据cx-extractor线性算法php版编写的VB HTML正文抽取类模块。

感谢cx,感谢xwf_like。

'========================================

'模块名称:clsHtmlExtractor

'模块作用:从HTML中抽取正文,根据http header或者html自动获取编码

'模块编写:楚吟风 QQ:112704422  http://www.chuyinfeng.com/

'模块更新:2011-03-15

'模块说明:感谢cx的基于行块分布函数的通用网页正文抽取算法

'========================================

Option Explicit

'========================================

'函数名称:ReplaceX

'函数作用:正则替换

'========================================

Public Function ReplaceX(ByVal sSource As String, ByVal sPattern As String, ByVal sTarget As String) As String

On Error GoTo ErrHandle

Dim RegEx, ReplaceTest As String, sRet As String

Set RegEx = CreateObject("VBSCRIPT.REGEXP")

RegEx.IgnoreCase = True

RegEx.Global = True

RegEx.Pattern = sPattern

sRet = RegEx.Replace(sSource, sTarget)

Set RegEx = Nothing

ReplaceX = sRet

Exit Function

ErrHandle:

Set RegEx = Nothing

End Function

'========================================

'函数名称:InstrX

'函数作用:正则查找

'========================================

Public Function InstrX(ByVal Source As String, ByVal sPattern As String, Optional ByRef strs As Variant) As Integer

On Error GoTo ErrHandle

Dim i As Integer

ReDim strs(i)

Dim RegEx, Matches, Match, sCSet As String

Set RegEx = CreateObject("VBSCRIPT.REGEXP")

RegEx.IgnoreCase = True

RegEx.Global = True

RegEx.Pattern = sPattern

If RegEx.Test(Source) Then

Set Matches = RegEx.execute(Source)

For Each Match In Matches

i = i + 1

ReDim Preserve strs(i)

strs(i) = Match.Value

Next

End If

Set Match = Nothing '

Set Matches = Nothing

Set RegEx = Nothing

InstrX = i

Exit Function

ErrHandle:

Set RegEx = Nothing

End Function

'========================================

'函数名称:GetCset

'函数作用:根据给定的字符串获取html编码方式

'========================================

Public Function GetCset(ByVal Source As String) As String

Dim i As Integer, strs() As String, sCSet As String

i = InstrX(Source, "content-type.*?charset.*?=.*", strs)

If i > 0 Then sCSet = strs(1)

sCSet = ReplaceX(sCSet, ".*charset.*?=", "")

sCSet = ReplaceX(sCSet, """|\s|/|>", "")

GetCset = sCSet

End Function

'========================================

'函数名称:LenX

'函数作用:把全角字符做为2字节计算长度,忽略空格长度

'========================================

Private Function LenX(ByVal s_str As String) As Integer

Dim i_num As Integer, i_index As Integer, i_len As Integer

s_str = Replace(s_str, " ", "")

i_len = Len(s_str)

For i_index = 1 To i_len

If Asc(Mid(s_str, i_index, 1)) < 0 Then

i_num = i_num + 1

End If

Next

LenX = i_len + i_num

End Function

'========================================

'函数名称:Extract

'函数作用:根据cx-extractor算法抽取正文

'========================================

Public Function Extract(ByVal Source As String, Optional ByVal BlockLine As Integer = 3, Optional ByVal OneLine As Boolean = True)

Dim sLine() As String, iLine() As Long, i As Integer, iBlockLen() As Long, sBlock() As String

Dim iStart As Long, iEnd As Long, iMaxLen As Long, iTemp As Long

Dim sPortion As String, iCurTextLen As Long, sTemp As String, sOneLine As String

sOneLine = IIf(OneLine, "", vbCrLf)

'初步去噪

'去除DTD信息

Source = ReplaceX(Source, "*?>", "")

'去除注释

Source = ReplaceX(Source, "", "")

'去除script标签

Source = ReplaceX(Source, "(.|\n)*?", "")

'去除style标签

Source = ReplaceX(Source, "(.|\n)*?", "")

'去除html tag标签

Source = ReplaceX(Source, "", "")

'去除特殊字符

Source = ReplaceX(Source, "&.{1,5};|.{1,5};", "")

'规范换行

Source = Replace(Source, vbCrLf, vbLf)

Source = Replace(Source, vbCr, vbLf)

Source = Replace(Source, vbLf, vbCrLf)

'分割到行

sLine = Split(Source, vbCrLf)

ReDim iBlockLen(0)

For i = 0 To UBound(sLine)

'将多个空白字符替换为一个空格

sLine(i) = ReplaceX(sLine(i), "\s+", " ")

Next

'计算第一块大小

For i = 0 To (BlockLine - 1)

iBlockLen(0) = iBlockLen(0) + LenX(sLine(i))

Next

'计算其他块大小

For i = 1 To UBound(sLine) - BlockLine - 1

ReDim Preserve iBlockLen(i)

iBlockLen(i) = iBlockLen(i - 1) + LenX(sLine(i - 1 + BlockLine)) - LenX(sLine(i - 1))

Next

'根据各个块大小变化的峰值峰谷提取正文

iStart = -1: iEnd = -1: i = 0

Do While i < UBound(iBlockLen)

Do While (i < UBound(iBlockLen) And iBlockLen(i) = 0)

i = i + 1

Loop

iTemp = i

iCurTextLen = 0

sPortion = ""

Do While (i < UBound(iBlockLen) And iBlockLen(i) <> 0)

sPortion = sPortion & sLine(i) & sOneLine

iCurTextLen = iCurTextLen + iBlockLen(i)

i = i + 1

Loop

If iCurTextLen > iMaxLen Then

sTemp = sPortion

iMaxLen = iCurTextLen

iStart = iTemp

iEnd = i - 1

End If

Loop

'MsgBox sLine(iStart - 1), , iStart

'MsgBox sLine(iEnd + 1), , iEnd

Extract = sTemp

End Function

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值