用XMLHTTP对象抓取网页源代码,拆分数据写入数据库

< ! -- #include file = " fget.asp " -->
< ! -- #include file = " conn.asp " -->
< html >
< head >
< meta http - equiv = " Content-Type "  content = " text/html; charset=gb2312 " >
< title > 信息采集 </ title >
</ head >
< body  >
< %
    Server.ScriptTimeOut
= 9999999  
    PageStart
= "" ' 抓取开始页
    PageEnd = 30 ' 抓取结束页
    lburl = " http://www.tignet.cn/zhaoshang/index.asp?CurPageNum= " ' 列表第一页开始url
    pg = cint (request.querystring( " pg " )) ' 取得页数
'
=========列表分页处理开始=========================
     if  PageStart = ""   and  pg = 0   then ' 判断是否为第一页
               pg = 1 ' 第一页直接抓取
               list_url = " http://www.tignet.cn/zhaoshang/ "
             
elseif  PageStart = ""   and  pg <> 0   then ' 设置下一页抓取url
               list_url = lburl & pg
             
elseif  PageStart <> ""   and  pg = 0   then
               pg
= PageStart ' 设置采集开始页数
               list_url = lburl & pg
             
elseif  PageStart <> ""   and  pg <> 0   then
               list_url
= lburl & pg
     
end   if       
'      response.Write list_url
'
     response.End()
'
=========截取数据开始=============================
     ' 第一步设置数据
    lists = " 发布信息 " ' 列表截取
    listo = " 【中国虎网】 为医药界 "
    listxs
= " 留言咨询 " ' 循环链接截取
    links = " <a href=' " ' 标题链接
    linko = " ' target='_blank' > "
' =================内容加字段=======================
    companys = " <span style='font-size:12px;'> " ' 公司名称
    companyo = " </span> "
    names
= " padding-bottom:3px;'> " ' 药品名称
    nameo = " </a> "
    kinds
= " >类别: " ' 药品类型
    kindo = " </span> "
    times = " 更新时间: " ' 代理商介绍
    timeo = " </span> "
    Response.Write  " </br> "
    Response.Write 
" <center><font size=3pt>=============抓取 " & list_url & " 信息开始=============</font></center> "
' 调用主题函数NewsList
Call  NewsList()
' 调用转向下一页函数
Call  EndPage()
Function  NewsList() ' 获取某类列表代码
    strHtml = GetHTTPPage(list_url) ' 获得html代码
    strHtml = strCut(strHtml,lists,listo, 1 ) ' 获取列表代码
'
    response.Write strHtml
'
    response.End()
    strHtml = split (strHtml,listxs) ' 拆分代码
'
    response.Write strHtml(1)
'
    response.End()
     for  i = 0   to  ( ubound (strHtml) - 1 ) ' 拆分标题,链接地址
        newsurl = " http://www.tignet.cn " & strCut(strHtml(i),links,linko, 2 )
'         response.Write newsurl
'
        response.End()
         ' Get_time=FormatStr(Trim(strCut(strHtml(i),times,timeo,2)))'发布时间
'
        if FormatStr(strCut(strHtml(i),links,linko,2))<>"" then
'
           NewsHtml=GetHTTPPage(newsurl)'获取下一步详细内容页面html代码
'
'           response.Write NewsHtml
'
'           response.End()
'
        else
'
           response.Write "抓取第"&i&"条链接地址失败,不能抓取此项详细内容,程序将跳过此项目!"
'
        end if
         ' leibie=FormatStr(Trim(strCut(NewsHtml,kinds,kindo,2)))'采集产品类别
        leibie = FormatStr( Trim (strCut(strHtml(i),kinds,kindo, 2 )))
        
if  leibie <> ""   then
            company
= FormatStr( Trim (strCut(strHtml(i),companys,companyo, 2 ))) ' 采集公司名称
             ' ming=replace(FormatStr(Trim(strCut(strHtml(i),names,nameo,2))),"★","")'采集产品名称
            ming = FormatStr( Trim (strCut(strHtml(i),names,nameo, 2 ))) ' 采集产品名称
            shijian = replace (FormatStr( Trim (strCut(strHtml(i),times,timeo, 2 ))), " / " , " - " ) ' 发布时间
                s1 = instr (leibie, " 品  " )
                s2
= len (leibie)
                
if  s1 > 0   then
                    bigkind
= mid (leibie, 1 ,s1)
                    kind
= mid (leibie,(s1 + 1 ),(s2 - s1))
                
else
                    bigkind
= leibie
                    kind
= ""
                
end   if    
 
         if  newsurl <> ""   then
            
set  rs = server.CreateObject( " adodb.recordset " )
            sql
= " select url from Get_zhaoshang where url=' " & newsurl & " ' "
            rs.open sql,conn,
1 , 1
            
if  rs.eof  then
               
' 插入数据
               SQL = " insert into Get_zhaoshang(company,names,bigkind,kind,url,times) values(' " & company & " ',' " & ming & " ',' " & bigkind & " ',' " & kind & " ',' " & newsurl & " ',' " & shijian & " ') "
               Conn.execute(SQL)
               response.write 
" &nbsp;&nbsp;&nbsp;<font color=Green size=3pt>+</font> " & newsurl & " <br> "
               
else
               response.write 
" &nbsp;&nbsp;&nbsp;<font color=red size=3pt>此条信息已经存在,程序将跳过!</font><br> "
             
end   if   
        
end   if
        
end   if
    
Next
    
set  strHtml = nothing
    Response.Write 
" <center><font size=3pt>第 " & pg & " 页信息抓取结束!!!</font></center> "
End Function

Function  GetHTTPPage(Url) ' 获取Html代码函数
    err.clear
    
On   Error   Resume   Next
    
dim  http 
    
set  http = Server.createobject( " Microsoft.XMLHTTP "
    Http.open 
" GET " ,url, false  
    
' HTTP的通信方式,比如GET或是POST '接收XML数据的服务器的URL地址。通常在URL中要指明ASP或CGI程序 
     ' 如果是异步通信方式(true)如果是同步方式(false)
    Http.send()
    
' Send方法的参数类型是Variant,可以是字符串、DOM树或任意数据流。
     ' 发送数据的方式分为同步和异步两种。在异步方式下,数据包一旦发送完毕,就结束Send进程,
     ' 客户机执行其他的操作;而在同步方式下,客户机要等到服务器返回确认消息后才结束Send进程 
     if  Http.readystate <> 4   then
    
' 0   Response对象已经创建,但XML文档上载过程尚未结束 
     ' 1   XML文档已经装载完毕 
     ' 2   XML文档已经装载完毕,正在处理中 
     ' 3   部分XML文档已经解析 
     ' 4   文档已经解析完毕,客户端可以接受返回消息

        
exit   function  
    
end   if  
    GetHTTPPage 
=  bytesToBSTR(Http.responseBody, " GB2312 " ) ' bytesToBSTR 编码转化函数
     ' =======对Http.responseBody的解释=========
     ' responseText:将返回消息作为文本字符串; 
     ' responseBody:将返回消息作为HTML文档内容;
     ' responseXML:将返回消息视为XML文档,在服务器响应消息中含有XML数据时使用; 
     ' responseStream:将返回消息视为Stream对象 
     ' response.write GetHTTPPage
     set  http  =   Nothing
    
If  Err  Then
        response.write err.description
        Response.Write 
" <br><br><p align='center'><font color='red'><b>无法抓取本页面列表信息!!!</b></font></p> "
    
End   If
End function

Function  EndPage() ' 抓取下一页,跳转函数.PageNo--->抓取的页数
         if  pg < PageEnd  Then ' 抓取下一页
            response.write  " <script>window.location='tignetcn.asp?pg= " & pg + 1 & " ';</script> "
        
else
            Response.Write 
" <hr size=1 color=#00FF00 width=500> "
            response.write 
" <center><font size=2pt><b>===============================信息抓取完毕!!!================================</b></font></center> "
            response.end
        
end   if
End Function
%
>
</ body >
</ html >

 下面是fget.asp里两个函数,一个是截取,一个事过滤html:
1:截取函数:

Function  strCut(strContent,StartStr,EndStr,CutType)
       'strContent  要截取的内容
       'StartStr 开始标志字符
       'EndStr  结束标志字符
       'CutType 截取类型 1--包括开始,结尾标记  2----不包括开始,结尾标记

    
Dim  strHtml,S1,S2
    strHtml 
=  strContent
    
On   Error   Resume   Next
    
If  CutType = 2   Then ' 不包括开始,结尾标记
        S1  =   InStr (strHtml,StartStr) + Len (StartStr)
        S2 
=   InStr (S1,strHtml,EndStr)

        
If  Err  Then
            response.write 
" Unknow Wrong: " & err.description & " ---BG: "   &  S1  &   " &nbsp;End: " & S2 & " <br> "
            Err.Clear
            strCut
= ""
            
Exit   Function
        
Else
            
If  S1 > Len (StartStr)  and  S2 > 0   then
               strCut
= Mid (strHtml,S1,S2 - S1)
            
Else
               strCut
= ""
            
End   If
        
End   if  
'         response.Write strCut
'
        response.End()
     Else ' 包括开始,结尾标记
        S1  =   InStr (strHtml,StartStr)
        S2 
=   InStr (S1,strHtml,EndStr) + Len (EndStr)
        
If  Err  Then
            response.write 
" Unknow Wrong: " & err.description & " ---BG: "   &  S1  &   " &nbsp;End: " & S2 & " <br> "
            Err.Clear
            strCut
= ""
            
Exit   Function
        
Else
            
If  S1 > 0   and  S2 > Len (EndStr)  then
               strCut
= Mid (strHtml,S1,S2 - S1)
            
Else
               strCut
= ""
            
End   If
        
End   if    
    
End   If
End Function

2.html过滤函数,也过滤一些 回车,空格之类的

Function  FormatStr(str)
    
Dim  s1,s2
    
If  str <> ""   then
        str
= replace ( replace ( Trim (str), chr ( 32 ) & chr ( 32 ), "" ), chr ( 9 ), "" )
        
DO   While  ( instr (str, " > " ) > 0   and   instr (str, " < " ) > 0 )
            s1
= InStr (str, " < " )
            s2
= Instr (s1,str, " > " )
            
If  s1 > 0   and  s2 > 0   then
                str
= replace (str, mid (str,s1,s2 - s1 + 1 ), "" )
            
End   if         
        
Loop
        str
= replace ( replace (str, " < " , " &lt; " ), " > " , " &gt; " )
        str
= Replace ( Replace ( Replace ( replace ( replace (str, chr ( 13 ), "" ), chr ( 10 ), "" ), " "" " , " " ), " ' " , " " ), " &nbsp; " , "" )
        FormatStr
= str
     
Else
        FormatStr
= ""
     
End   if         
End Function


  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值