以前的一个项目为抓内容写的小偷类,感觉是一个不错的类,便拿出来分享一下。
<
%
' ******************************************************************
' ClassName: Pub_Thief
' Name: 通用小偷类
' Version: 2.0
' Author: lg970044
' Date: 2006-2-16
' Note: 1.出错标记为"$False$"
' 2.分隔标记为"$|||$"
' ******************************************************************
Class Pub_Thief
Private PT_URL ' 目标网址
Public PT_RegExp ' 正则表达式对象
Public CharSet ' 设置字符集(可设置,默认GB2312)
Public Str_Html ' 获取的HTML代码,是加工操作的字符串
Public IgnoreCase ' 设置是否区分大小写,True忽略大小写,False区分大小写(可设置,默认True)
Public RequestMethod ' 设置网页请求方式(可设置,默认GET)
Public RequestForm ' 设置Post请求方式的表单内容(格式为:表单项1=值1&表单项2=值2,当传递的表单值有特殊字符时,应用Server.URLEncode来转换)
Public Property Get Version
Version = " 通用小偷类 V2.0 "
End Property
' 类初始化
Private Sub Class_Initialize()
PT_URL = " $False$ "
Str_Html = " $False$ "
Set PT_RegExp = New RegExp
PT_RegExp.IgnoreCase = True ' 忽略大小写
PT_RegExp.Global = True ' 全程匹配。
IgnoreCase = True
CharSet = " GB2312 "
RequestMethod = " GET "
RequestForm = ""
End Sub
' 注销类
Private Sub Class_Terminate()
Set PT_RegExp = Nothing
End Sub
' 连接网址,获取网页HTML源码(Get_Url为目标网址)
Public Sub Open(Get_Url)
PT_URL = Get_Url
getHttpPage()
End Sub
' 根据目标网页的HTML代码写入到Str_Html属性中
Private Sub getHttpPage()
If PT_URL = "" Or PT_URL = " $False$ " Then
Str_Html = " $False$ "
Exit Sub
End If
Dim Http
On Error Resume Next
Set Http = Server.CreateObject( " MSXML2.XMLHTTP " )
Http.Open RequestMethod,PT_URL, False
If UCase (RequestMethod) = " POST " Then ' 当RequestMethod属性为POST时传送表单
Http.setRequestHeader " Content-Type " , " application/x-www-form-urlencoded "
Http.Send(RequestForm)
Else
Http.Send()
End If
' 过滤传输失败的情况
If Http.ReadyState <> 4 Then
Set Http = Nothing
Str_Html = " $False$ "
Exit Sub
End If
' 过滤响应错误的情况
If Http.Status = 200 Then
Str_Html = BytesToBstr(Http.ResponseBody)
Else
Str_Html = " $False$ "
End If
Set Http = Nothing
If Err.Number <> 0 Then
Err.Clear
End if
End Sub
' 编码转换
Private Function BytesToBstr(Body)
Dim ObjStream
Set ObjStream = Server.CreateObject( " AdoDB.Stream " )
ObjStream.Type = 1
ObjStream.Mode = 3
ObjStream.Open
ObjStream.Write Body
ObjStream.Position = 0
ObjStream.Type = 2
ObjStream.Charset = CharSet
BytesToBstr = ObjStream.ReadText
ObjStream.Close()
Set ObjStream = Nothing
End Function
' 保存HTML源码到文件中
Public Sub saveFile(Out_Path)
Dim ObjStream,Stream
Set ObjStream = Server.CreateObject( " AdoDB.Stream " )
ObjStream.Type = 2
ObjStream.Charset = CharSet
ObjStream.Open
ObjStream.WriteText Str_Html
ObjStream.SaveToFile Server.Mappath(Out_Path), 2
ObjStream.Close()
Set ObjStream = Nothing
End Sub
' 删除HTML中里面的换行、回车、换页
Public Sub delNRF()
PT_RegExp.Global = True ' 设置全程匹配
PT_RegExp.Pattern = " | | "
Str_Html = PT_RegExp.Replace(Str_Html, "" )
End Sub
' 替换HTML中里面字符串(Get_StrOld为被替换的字符串,Get_StrNew为替换字符串)
Public Sub replaceText(Get_StrOld,Get_StrNew)
Str_Html = Replace (Str_Html,Get_StrOld,Get_StrNew)
End Sub
' 查找指定首尾字符串的第一个字符串,返回查找到的值(Get_StrS为指定首字符串,Get_StrNew为指定尾字符串,Get_Start为开始查找位置,Get_Include为是否包含首尾字符串)
Public Function SelectTextFirst(Get_StrS,Get_StrE,Get_Start,Get_Include)
If Get_StrS = "" Or Get_StrE = "" Then
SelectTextFirst = ""
Exit Function
End If
Dim SN,EN,SL,EL,Total,Start,i,str,str_arr ' 首字符串位置,尾字符串位置,首字符串长度,尾字符串长度,总长度,开始位置
Start = Abs (Get_Start)
If Start = 0 Then Start = 1
Total = Len (Str_Html)
SL = Len (Get_StrS)
EL = Len (Get_StrE)
SN = Instr (Start,Str_Html,Get_StrS)
EN = Instr (Start,Str_Html,Get_StrE)
If SN > 0 and EN > 0 Then
If Get_Include Then
i = Total - SN + 1
str = Right (Str_Html,i)
i = EN - SN + EL
str = Left (str,i)
Else
i = Total - SN - SL + 1
str = Right (Str_Html,i)
i = EN - SN - SL
str = Left (str,i)
End If
SelectTextFirst = str
Else
SelectTextFirst = ""
End If
End Function
' 查找指定首尾字符串的一组字符串,返回查找到的值的数组(Get_StrS为指定首字符串,Get_StrNew为指定尾字符串,Get_Start为开始查找位置,Get_Include为是否包含首尾字符串)
Public Function SelectText(Get_StrS,Get_StrE,Get_Start,Get_Include)
If Get_StrS = "" Or Get_StrE = "" Then
SelectText = Null
Exit Function
End If
Dim SN,EN,SL,EL,Total,Start,i,str,str_arr ' 首字符串位置,尾字符串位置,首字符串长度,尾字符串长度,总长度,开始位置
Start = Abs (Get_Start)
If Start = 0 Then Start = 1
Total = Len (Str_Html)
SL = Len (Get_StrS)
EL = Len (Get_StrE)
str_arr = ""
Do While Start < Total
SN = Instr (Start,Str_Html,Get_StrS)
EN = Instr (Start,Str_Html,Get_StrE)
If SN < 1 and EN < 1 Then Exit Do ' 当找不到时退出循环
If Get_Include Then
i = Total - SN + 1
str = Right (Str_Html,i)
i = EN - SN + EL
str = Left (str,i)
Else
i = Total - SN - SL + 1
str = Right (Str_Html,i)
i = EN - SN - SL
str = Left (str,i)
End If
Start = EN + Len (Get_StrE)
str_arr = str_arr & str & " $|||$ "
Loop
If str_arr = "" Then
SelectText = Null
Else
str_arr = Left (str_arr, Len (str_arr) - Len ( " $|||$ " )) ' 去掉最后一个分隔符
SelectText = Split (str_arr, " $|||$ " ) ' 输出数组
End If
End Function
' 正则表达式替换字符串(Get_StrOld为要被替换的匹配模式,Get_StrNew为替换匹配模式)
Public Sub RegExpRepl(Get_PatternOld,Get_PatternNew)
PT_RegExp.Global = True ' 设置全程匹配
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = Get_PatternOld
Str_Html = PT_RegExp.Replace(Str_Html,Get_PatternNew)
End Sub
' 正则表达式匹配第一个字符串(Get_Pattern为匹配模式,Get_SubMatches为子匹配[0表示返回匹配,不为0返回相应子匹配])
Public Function RegExpExecFirst(Get_Pattern,Get_SubMatches)
Dim Matches
PT_RegExp.Global = False ' 设置匹配第一个
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = Get_Pattern
Set Matches = PT_RegExp.Execute(Str_Html)
If Matches.Count = 0 Then
RegExpExecFirst = ""
Else
If Get_SubMatches = 0 Then
RegExpExecFirst = Matches.Item( 0 ).Value
Else
RegExpExecFirst = Matches.Item( 0 ).SubMatches(Get_SubMatches - 1 )
End If
End If
PT_RegExp.Global = True ' 设置全程匹配
End Function
' 正则表达式匹配字符串,返回数组(Get_Pattern为匹配模式,Get_SubMatches为子匹配[0表示返回匹配,不为0返回相应子匹配])
Public Function RegExpExec(Get_Pattern,Get_SubMatches)
Dim Matches,Match,str_arr
PT_RegExp.Global = True ' 设置全程匹配
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = Get_Pattern
Set Matches = PT_RegExp.Execute(Str_Html)
str_arr = ""
For Each Match in Matches ' 遍历 Matches 集合。
If Get_SubMatches = 0 Then
str_arr = str_arr & Match.Value & " $|||$ "
Else
str_arr = str_arr & Match.SubMatches(Get_SubMatches - 1 ) & " $|||$ "
End If
Next
If str_arr = "" Then
RegExpExec = Null
Else
str_arr = Left (str_arr, Len (str_arr) - Len ( " $|||$ " )) ' 去掉最后一个分隔符
RegExpExec = Split (str_arr, " $|||$ " ) ' 输出数组
End If
End Function
' 正则表达式匹配检测,返回Boolean值指示是否找到(Get_Pattern为匹配模式)
Public Function RegExpTest(Get_Pattern)
PT_RegExp.Global = False
PT_RegExp.Pattern = Get_Pattern
RegExpTest = PT_RegExp.Test(Str_Html)
PT_RegExp.Global = True ' 设置全程匹配
End Function
' 取得目标网址的二进制流
Private Function getBinary (Get_Url)
If Get_Url = "" Or Get_Url = " $False$ " Then
getBinary = Null
Exit Function
End If
Dim Binary
On Error Resume Next
Set Binary = Server.CreateObject( " MSXML2.XMLHTTP " )
Binary.Open " GET " ,Get_Url, False
Binary.Send()
If Binary.ReadyState <> 4 Then
Set Binary = Nothing
getBinary = Null
Exit Function
End If
' 过滤响应错误的情况
If Binary.Status = 200 Then
getBinary = Binary.ResponseBody
Else
getBinary = Null
End If
Set Binary = Nothing
If Err.Number <> 0 Then
Err.Clear
End if
End Function
' 根据目标网址保存二进制文件,返回保存的文件路径(Get_Url为要保存的远程文件[要求完整URL],Out_Path为保存的路径前缀[使用虚路径])
Public Function saveBinaryFile(Get_Url,Out_Path)
Dim ObjStream,Stream,Matches
Stream = getBinary(Get_Url)
If IsNull (Stream) Then ' 当无法获取二进制文件时退出函数,并返回远程文件URL
saveBinaryFile = Get_Url
Exit Function
End If
' 从URL中取得文件名
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = " ^http://.+?/(.+)$ "
Set Matches = PT_RegExp.Execute(Get_Url)
If Matches.Count = 0 Then ' 当取没有相匹配文件名时退出函数,并返回远程文件URL
saveBinaryFile = Get_Url
Exit Function
Else
Out_Path = Out_Path & Replace (Matches.Item( 0 ).SubMatches( 0 ), " / " , " - " )
End If
Set ObjStream = Server.CreateObject( " AdoDB.Stream " )
ObjStream.Type = 1
ObjStream.Open
ObjStream.Write Stream
ObjStream.SaveToFile Server.Mappath(Out_Path), 2
ObjStream.Close()
Set ObjStream = Nothing
saveBinaryFile = Out_Path ' 返回本地保存的文件路径
End Function
End Class
% >
' ******************************************************************
' ClassName: Pub_Thief
' Name: 通用小偷类
' Version: 2.0
' Author: lg970044
' Date: 2006-2-16
' Note: 1.出错标记为"$False$"
' 2.分隔标记为"$|||$"
' ******************************************************************
Class Pub_Thief
Private PT_URL ' 目标网址
Public PT_RegExp ' 正则表达式对象
Public CharSet ' 设置字符集(可设置,默认GB2312)
Public Str_Html ' 获取的HTML代码,是加工操作的字符串
Public IgnoreCase ' 设置是否区分大小写,True忽略大小写,False区分大小写(可设置,默认True)
Public RequestMethod ' 设置网页请求方式(可设置,默认GET)
Public RequestForm ' 设置Post请求方式的表单内容(格式为:表单项1=值1&表单项2=值2,当传递的表单值有特殊字符时,应用Server.URLEncode来转换)
Public Property Get Version
Version = " 通用小偷类 V2.0 "
End Property
' 类初始化
Private Sub Class_Initialize()
PT_URL = " $False$ "
Str_Html = " $False$ "
Set PT_RegExp = New RegExp
PT_RegExp.IgnoreCase = True ' 忽略大小写
PT_RegExp.Global = True ' 全程匹配。
IgnoreCase = True
CharSet = " GB2312 "
RequestMethod = " GET "
RequestForm = ""
End Sub
' 注销类
Private Sub Class_Terminate()
Set PT_RegExp = Nothing
End Sub
' 连接网址,获取网页HTML源码(Get_Url为目标网址)
Public Sub Open(Get_Url)
PT_URL = Get_Url
getHttpPage()
End Sub
' 根据目标网页的HTML代码写入到Str_Html属性中
Private Sub getHttpPage()
If PT_URL = "" Or PT_URL = " $False$ " Then
Str_Html = " $False$ "
Exit Sub
End If
Dim Http
On Error Resume Next
Set Http = Server.CreateObject( " MSXML2.XMLHTTP " )
Http.Open RequestMethod,PT_URL, False
If UCase (RequestMethod) = " POST " Then ' 当RequestMethod属性为POST时传送表单
Http.setRequestHeader " Content-Type " , " application/x-www-form-urlencoded "
Http.Send(RequestForm)
Else
Http.Send()
End If
' 过滤传输失败的情况
If Http.ReadyState <> 4 Then
Set Http = Nothing
Str_Html = " $False$ "
Exit Sub
End If
' 过滤响应错误的情况
If Http.Status = 200 Then
Str_Html = BytesToBstr(Http.ResponseBody)
Else
Str_Html = " $False$ "
End If
Set Http = Nothing
If Err.Number <> 0 Then
Err.Clear
End if
End Sub
' 编码转换
Private Function BytesToBstr(Body)
Dim ObjStream
Set ObjStream = Server.CreateObject( " AdoDB.Stream " )
ObjStream.Type = 1
ObjStream.Mode = 3
ObjStream.Open
ObjStream.Write Body
ObjStream.Position = 0
ObjStream.Type = 2
ObjStream.Charset = CharSet
BytesToBstr = ObjStream.ReadText
ObjStream.Close()
Set ObjStream = Nothing
End Function
' 保存HTML源码到文件中
Public Sub saveFile(Out_Path)
Dim ObjStream,Stream
Set ObjStream = Server.CreateObject( " AdoDB.Stream " )
ObjStream.Type = 2
ObjStream.Charset = CharSet
ObjStream.Open
ObjStream.WriteText Str_Html
ObjStream.SaveToFile Server.Mappath(Out_Path), 2
ObjStream.Close()
Set ObjStream = Nothing
End Sub
' 删除HTML中里面的换行、回车、换页
Public Sub delNRF()
PT_RegExp.Global = True ' 设置全程匹配
PT_RegExp.Pattern = " | | "
Str_Html = PT_RegExp.Replace(Str_Html, "" )
End Sub
' 替换HTML中里面字符串(Get_StrOld为被替换的字符串,Get_StrNew为替换字符串)
Public Sub replaceText(Get_StrOld,Get_StrNew)
Str_Html = Replace (Str_Html,Get_StrOld,Get_StrNew)
End Sub
' 查找指定首尾字符串的第一个字符串,返回查找到的值(Get_StrS为指定首字符串,Get_StrNew为指定尾字符串,Get_Start为开始查找位置,Get_Include为是否包含首尾字符串)
Public Function SelectTextFirst(Get_StrS,Get_StrE,Get_Start,Get_Include)
If Get_StrS = "" Or Get_StrE = "" Then
SelectTextFirst = ""
Exit Function
End If
Dim SN,EN,SL,EL,Total,Start,i,str,str_arr ' 首字符串位置,尾字符串位置,首字符串长度,尾字符串长度,总长度,开始位置
Start = Abs (Get_Start)
If Start = 0 Then Start = 1
Total = Len (Str_Html)
SL = Len (Get_StrS)
EL = Len (Get_StrE)
SN = Instr (Start,Str_Html,Get_StrS)
EN = Instr (Start,Str_Html,Get_StrE)
If SN > 0 and EN > 0 Then
If Get_Include Then
i = Total - SN + 1
str = Right (Str_Html,i)
i = EN - SN + EL
str = Left (str,i)
Else
i = Total - SN - SL + 1
str = Right (Str_Html,i)
i = EN - SN - SL
str = Left (str,i)
End If
SelectTextFirst = str
Else
SelectTextFirst = ""
End If
End Function
' 查找指定首尾字符串的一组字符串,返回查找到的值的数组(Get_StrS为指定首字符串,Get_StrNew为指定尾字符串,Get_Start为开始查找位置,Get_Include为是否包含首尾字符串)
Public Function SelectText(Get_StrS,Get_StrE,Get_Start,Get_Include)
If Get_StrS = "" Or Get_StrE = "" Then
SelectText = Null
Exit Function
End If
Dim SN,EN,SL,EL,Total,Start,i,str,str_arr ' 首字符串位置,尾字符串位置,首字符串长度,尾字符串长度,总长度,开始位置
Start = Abs (Get_Start)
If Start = 0 Then Start = 1
Total = Len (Str_Html)
SL = Len (Get_StrS)
EL = Len (Get_StrE)
str_arr = ""
Do While Start < Total
SN = Instr (Start,Str_Html,Get_StrS)
EN = Instr (Start,Str_Html,Get_StrE)
If SN < 1 and EN < 1 Then Exit Do ' 当找不到时退出循环
If Get_Include Then
i = Total - SN + 1
str = Right (Str_Html,i)
i = EN - SN + EL
str = Left (str,i)
Else
i = Total - SN - SL + 1
str = Right (Str_Html,i)
i = EN - SN - SL
str = Left (str,i)
End If
Start = EN + Len (Get_StrE)
str_arr = str_arr & str & " $|||$ "
Loop
If str_arr = "" Then
SelectText = Null
Else
str_arr = Left (str_arr, Len (str_arr) - Len ( " $|||$ " )) ' 去掉最后一个分隔符
SelectText = Split (str_arr, " $|||$ " ) ' 输出数组
End If
End Function
' 正则表达式替换字符串(Get_StrOld为要被替换的匹配模式,Get_StrNew为替换匹配模式)
Public Sub RegExpRepl(Get_PatternOld,Get_PatternNew)
PT_RegExp.Global = True ' 设置全程匹配
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = Get_PatternOld
Str_Html = PT_RegExp.Replace(Str_Html,Get_PatternNew)
End Sub
' 正则表达式匹配第一个字符串(Get_Pattern为匹配模式,Get_SubMatches为子匹配[0表示返回匹配,不为0返回相应子匹配])
Public Function RegExpExecFirst(Get_Pattern,Get_SubMatches)
Dim Matches
PT_RegExp.Global = False ' 设置匹配第一个
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = Get_Pattern
Set Matches = PT_RegExp.Execute(Str_Html)
If Matches.Count = 0 Then
RegExpExecFirst = ""
Else
If Get_SubMatches = 0 Then
RegExpExecFirst = Matches.Item( 0 ).Value
Else
RegExpExecFirst = Matches.Item( 0 ).SubMatches(Get_SubMatches - 1 )
End If
End If
PT_RegExp.Global = True ' 设置全程匹配
End Function
' 正则表达式匹配字符串,返回数组(Get_Pattern为匹配模式,Get_SubMatches为子匹配[0表示返回匹配,不为0返回相应子匹配])
Public Function RegExpExec(Get_Pattern,Get_SubMatches)
Dim Matches,Match,str_arr
PT_RegExp.Global = True ' 设置全程匹配
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = Get_Pattern
Set Matches = PT_RegExp.Execute(Str_Html)
str_arr = ""
For Each Match in Matches ' 遍历 Matches 集合。
If Get_SubMatches = 0 Then
str_arr = str_arr & Match.Value & " $|||$ "
Else
str_arr = str_arr & Match.SubMatches(Get_SubMatches - 1 ) & " $|||$ "
End If
Next
If str_arr = "" Then
RegExpExec = Null
Else
str_arr = Left (str_arr, Len (str_arr) - Len ( " $|||$ " )) ' 去掉最后一个分隔符
RegExpExec = Split (str_arr, " $|||$ " ) ' 输出数组
End If
End Function
' 正则表达式匹配检测,返回Boolean值指示是否找到(Get_Pattern为匹配模式)
Public Function RegExpTest(Get_Pattern)
PT_RegExp.Global = False
PT_RegExp.Pattern = Get_Pattern
RegExpTest = PT_RegExp.Test(Str_Html)
PT_RegExp.Global = True ' 设置全程匹配
End Function
' 取得目标网址的二进制流
Private Function getBinary (Get_Url)
If Get_Url = "" Or Get_Url = " $False$ " Then
getBinary = Null
Exit Function
End If
Dim Binary
On Error Resume Next
Set Binary = Server.CreateObject( " MSXML2.XMLHTTP " )
Binary.Open " GET " ,Get_Url, False
Binary.Send()
If Binary.ReadyState <> 4 Then
Set Binary = Nothing
getBinary = Null
Exit Function
End If
' 过滤响应错误的情况
If Binary.Status = 200 Then
getBinary = Binary.ResponseBody
Else
getBinary = Null
End If
Set Binary = Nothing
If Err.Number <> 0 Then
Err.Clear
End if
End Function
' 根据目标网址保存二进制文件,返回保存的文件路径(Get_Url为要保存的远程文件[要求完整URL],Out_Path为保存的路径前缀[使用虚路径])
Public Function saveBinaryFile(Get_Url,Out_Path)
Dim ObjStream,Stream,Matches
Stream = getBinary(Get_Url)
If IsNull (Stream) Then ' 当无法获取二进制文件时退出函数,并返回远程文件URL
saveBinaryFile = Get_Url
Exit Function
End If
' 从URL中取得文件名
PT_RegExp.IgnoreCase = IgnoreCase
PT_RegExp.Pattern = " ^http://.+?/(.+)$ "
Set Matches = PT_RegExp.Execute(Get_Url)
If Matches.Count = 0 Then ' 当取没有相匹配文件名时退出函数,并返回远程文件URL
saveBinaryFile = Get_Url
Exit Function
Else
Out_Path = Out_Path & Replace (Matches.Item( 0 ).SubMatches( 0 ), " / " , " - " )
End If
Set ObjStream = Server.CreateObject( " AdoDB.Stream " )
ObjStream.Type = 1
ObjStream.Open
ObjStream.Write Stream
ObjStream.SaveToFile Server.Mappath(Out_Path), 2
ObjStream.Close()
Set ObjStream = Nothing
saveBinaryFile = Out_Path ' 返回本地保存的文件路径
End Function
End Class
% >
详细内容见: http://www.qlolo.com/?m=pc&a=page_fh_diary&target_c_diary_id=879