<
!
--
#include file
=
"
fget.asp
"
-->
<
!
--
#include file
=
"
conn.asp
"
-->
<
html
>
<
head
>
<
meta http
-
equiv
=
"
Content-Type
"
content
=
"
text/html; charset=gb2312
"
>
<
title
>
信息采集
</
title
>
</
head
>
<
body
>
<
% Server.ScriptTimeOut
=
9999999
PageStart
=
""
'
抓取开始页
PageEnd
=
30
'
抓取结束页
lburl
=
"
http://www.tignet.cn/zhaoshang/index.asp?CurPageNum=
"
'
列表第一页开始url
pg
=
cint
(request.querystring(
"
pg
"
))
'
取得页数
'
=========列表分页处理开始=========================
if
PageStart
=
""
and
pg
=
0
then
'
判断是否为第一页
pg
=
1
'
第一页直接抓取
list_url
=
"
http://www.tignet.cn/zhaoshang/
"
elseif
PageStart
=
""
and
pg
<>
0
then
'
设置下一页抓取url
list_url
=
lburl
&
pg
elseif
PageStart
<>
""
and
pg
=
0
then
pg
=
PageStart
'
设置采集开始页数
list_url
=
lburl
&
pg
elseif
PageStart
<>
""
and
pg
<>
0
then
list_url
=
lburl
&
pg
end
if
'
response.Write list_url
'
response.End()
'
=========截取数据开始=============================
'
第一步设置数据
lists
=
"
发布信息
"
'
列表截取
listo
=
"
【中国虎网】 为医药界
"
listxs
=
"
留言咨询
"
'
循环链接截取
links
=
"
<a href='
"
'
标题链接
linko
=
"
' target='_blank' >
"
'
=================内容加字段=======================
companys
=
"
<span style='font-size:12px;'>
"
'
公司名称
companyo
=
"
</span>
"
names
=
"
padding-bottom:3px;'>
"
'
药品名称
nameo
=
"
</a>
"
kinds
=
"
>类别:
"
'
药品类型
kindo
=
"
</span>
"
times
=
"
更新时间:
"
'
代理商介绍
timeo
=
"
</span>
"
Response.Write
"
</br>
"
Response.Write
"
<center><font size=3pt>=============抓取
"
&
list_url
&
"
信息开始=============</font></center>
"
'
调用主题函数NewsList
Call
NewsList()
'
调用转向下一页函数
Call
EndPage()
Function
NewsList()
'
获取某类列表代码
strHtml
=
GetHTTPPage(list_url)
'
获得html代码
strHtml
=
strCut(strHtml,lists,listo,
1
)
'
获取列表代码
'
response.Write strHtml
'
response.End()
strHtml
=
split
(strHtml,listxs)
'
拆分代码
'
response.Write strHtml(1)
'
response.End()
for
i
=
0
to
(
ubound
(strHtml)
-
1
)
'
拆分标题,链接地址
newsurl
=
"
http://www.tignet.cn
"
&
strCut(strHtml(i),links,linko,
2
)
'
response.Write newsurl
'
response.End()
'
Get_time=FormatStr(Trim(strCut(strHtml(i),times,timeo,2)))'发布时间
'
if FormatStr(strCut(strHtml(i),links,linko,2))<>"" then
'
NewsHtml=GetHTTPPage(newsurl)'获取下一步详细内容页面html代码
'
' response.Write NewsHtml
'
' response.End()
'
else
'
response.Write "抓取第"&i&"条链接地址失败,不能抓取此项详细内容,程序将跳过此项目!"
'
end if
'
leibie=FormatStr(Trim(strCut(NewsHtml,kinds,kindo,2)))'采集产品类别
leibie
=
FormatStr(
Trim
(strCut(strHtml(i),kinds,kindo,
2
)))
if
leibie
<>
""
then
company
=
FormatStr(
Trim
(strCut(strHtml(i),companys,companyo,
2
)))
'
采集公司名称
'
ming=replace(FormatStr(Trim(strCut(strHtml(i),names,nameo,2))),"★","")'采集产品名称
ming
=
FormatStr(
Trim
(strCut(strHtml(i),names,nameo,
2
)))
'
采集产品名称
shijian
=
replace
(FormatStr(
Trim
(strCut(strHtml(i),times,timeo,
2
))),
"
/
"
,
"
-
"
)
'
发布时间
s1
=
instr
(leibie,
"
品
"
) s2
=
len
(leibie)
if
s1
>
0
then
bigkind
=
mid
(leibie,
1
,s1) kind
=
mid
(leibie,(s1
+
1
),(s2
-
s1))
else
bigkind
=
leibie kind
=
""
end
if
if
newsurl
<>
""
then
set
rs
=
server.CreateObject(
"
adodb.recordset
"
) sql
=
"
select url from Get_zhaoshang where url='
"
&
newsurl
&
"
'
"
rs.open sql,conn,
1
,
1
if
rs.eof
then
'
插入数据
SQL
=
"
insert into Get_zhaoshang(company,names,bigkind,kind,url,times) values('
"
&
company
&
"
','
"
&
ming
&
"
','
"
&
bigkind
&
"
','
"
&
kind
&
"
','
"
&
newsurl
&
"
','
"
&
shijian
&
"
')
"
Conn.execute(SQL) response.write
"
<font color=Green size=3pt>+</font>
"
&
newsurl
&
"
<br>
"
else
response.write
"
<font color=red size=3pt>此条信息已经存在,程序将跳过!</font><br>
"
end
if
end
if
end
if
Next
set
strHtml
=
nothing
Response.Write
"
<center><font size=3pt>第
"
&
pg
&
"
页信息抓取结束!!!</font></center>
"
End Function
Function
GetHTTPPage(Url)
'
获取Html代码函数
err.clear
On
Error
Resume
Next
dim
http
set
http
=
Server.createobject(
"
Microsoft.XMLHTTP
"
) Http.open
"
GET
"
,url,
false
'
HTTP的通信方式,比如GET或是POST '接收XML数据的服务器的URL地址。通常在URL中要指明ASP或CGI程序
'
如果是异步通信方式(true)如果是同步方式(false)
Http.send()
'
Send方法的参数类型是Variant,可以是字符串、DOM树或任意数据流。
'
发送数据的方式分为同步和异步两种。在异步方式下,数据包一旦发送完毕,就结束Send进程,
'
客户机执行其他的操作;而在同步方式下,客户机要等到服务器返回确认消息后才结束Send进程
if
Http.readystate
<>
4
then
'
0 Response对象已经创建,但XML文档上载过程尚未结束
'
1 XML文档已经装载完毕
'
2 XML文档已经装载完毕,正在处理中
'
3 部分XML文档已经解析
'
4 文档已经解析完毕,客户端可以接受返回消息
exit
function
end
if
GetHTTPPage
=
bytesToBSTR(Http.responseBody,
"
GB2312
"
)
'
bytesToBSTR 编码转化函数
'
=======对Http.responseBody的解释=========
'
responseText:将返回消息作为文本字符串;
'
responseBody:将返回消息作为HTML文档内容;
'
responseXML:将返回消息视为XML文档,在服务器响应消息中含有XML数据时使用;
'
responseStream:将返回消息视为Stream对象
'
response.write GetHTTPPage
set
http
=
Nothing
If
Err
Then
response.write err.description Response.Write
"
<br><br><p align='center'><font color='red'><b>无法抓取本页面列表信息!!!</b></font></p>
"
End
If
End function
Function
EndPage()
'
抓取下一页,跳转函数.PageNo--->抓取的页数
if
pg
<
PageEnd
Then
'
抓取下一页
response.write
"
<script>window.location='tignetcn.asp?pg=
"
&
pg
+
1
&
"
';</script>
"
else
Response.Write
"
<hr size=1 color=#00FF00 width=500>
"
response.write
"
<center><font size=2pt><b>===============================信息抓取完毕!!!================================</b></font></center>
"
response.end
end
if
End Function
%
>
</
body
>
</
html
>
下面是fget.asp里两个函数,一个是截取,一个事过滤html: 1:截取函数:
Function
strCut(strContent,StartStr,EndStr,CutType) 'strContent 要截取的内容 'StartStr 开始标志字符 'EndStr 结束标志字符 'CutType 截取类型 1--包括开始,结尾标记 2----不包括开始,结尾标记
Dim
strHtml,S1,S2 strHtml
=
strContent
On
Error
Resume
Next
If
CutType
=
2
Then
'
不包括开始,结尾标记
S1
=
InStr
(strHtml,StartStr)
+
Len
(StartStr) S2
=
InStr
(S1,strHtml,EndStr)
If
Err
Then
response.write
"
Unknow Wrong:
"
&
err.description
&
"
---BG:
"
&
S1
&
"
End:
"
&
S2
&
"
<br>
"
Err.Clear strCut
=
""
Exit
Function
Else
If
S1
>
Len
(StartStr)
and
S2
>
0
then
strCut
=
Mid
(strHtml,S1,S2
-
S1)
Else
strCut
=
""
End
If
End
if
'
response.Write strCut
'
response.End()
Else
'
包括开始,结尾标记
S1
=
InStr
(strHtml,StartStr) S2
=
InStr
(S1,strHtml,EndStr)
+
Len
(EndStr)
If
Err
Then
response.write
"
Unknow Wrong:
"
&
err.description
&
"
---BG:
"
&
S1
&
"
End:
"
&
S2
&
"
<br>
"
Err.Clear strCut
=
""
Exit
Function
Else
If
S1
>
0
and
S2
>
Len
(EndStr)
then
strCut
=
Mid
(strHtml,S1,S2
-
S1)
Else
strCut
=
""
End
If
End
if
End
If
End Function
2.html过滤函数,也过滤一些 回车,空格之类的
Function
FormatStr(str)
Dim
s1,s2
If
str
<>
""
then
str
=
replace
(
replace
(
Trim
(str),
chr
(
32
)
&
chr
(
32
),
""
),
chr
(
9
),
""
)
DO
While
(
instr
(str,
"
>
"
)
>
0
and
instr
(str,
"
<
"
)
>
0
) s1
=
InStr
(str,
"
<
"
) s2
=
Instr
(s1,str,
"
>
"
)
If
s1
>
0
and
s2
>
0
then
str
=
replace
(str,
mid
(str,s1,s2
-
s1
+
1
),
""
)
End
if
Loop
str
=
replace
(
replace
(str,
"
<
"
,
"
<
"
),
"
>
"
,
"
>
"
) str
=
Replace
(
Replace
(
Replace
(
replace
(
replace
(str,
chr
(
13
),
""
),
chr
(
10
),
""
),
"
""
"
,
"
”
"
),
"
'
"
,
"
’
"
),
"
"
,
""
) FormatStr
=
str
Else
FormatStr
=
""
End
if
End Function