At a friend's request, I scraped a small-volume dataset off a website; here's the process, shared as-is (I'm a rookie, so please forgive anything I got wrong (*  ̄3)(ε ̄ *)).
So, here's the target:
Standard routine first: fetch the page with requests. The records we want are nowhere in the response, so it's a dynamic site for sure. Time to analyze.
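If you want to confirm that, a quick check like this works (a minimal sketch; the listing URL here is a stand-in for whatever is in your address bar, and the keyword is just one record title you can see in the browser):

# -*- coding: utf-8 -*-
# Sanity check (sketch): is the record text present in the static HTML?
import requests

# page_url is illustrative -- use the listing page's actual address
page_url = 'http://app1.sfda.gov.cn/datasearch/face3/dir.html'
html = requests.get(page_url, timeout=20).text
print(u'互联网药品交易服务' in html)   # False => rendered dynamically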
Open the page in Chrome and inspect it (DevTools).
Filter out the non-JS requests.
That pretty much pins things down to this one JS request (don't ask how I knew, hehe). Copy it down.
Next up is the page source: right-click and View Source.
Let's start from "互联网药品交易服务" (internet drug trading services) as the example.
Search for that keyword.
We find that onclick=javascript:sform(7,0,this,'28')
is its click handler, so search the JS file we just copied for that function, sform.
There it is:
// cur2id       the Id of the current select2's Div
// next2Index   the selectedIndex of the next select2
// nextText     the table name to display next
// nextTableId  the next tableId
function sform(cur2Id, next2Index, nextTD, nextTableId, nextBcId)
{
    var selectone = document.getElementById("select1");
    var curIndex = selectone.selectedIndex;
    selectone.selectedIndex = cur2Id;
    try
    {
        if (cur2Id == curIndex) {
            var curSelectTwo = document.getElementById("s2" + cur2Id);
            var curSelectTwoIndex = curSelectTwo.selectedIndex;
            curSelectTwo.selectedIndex = next2Index;
            var curDivThree = document.getElementById("s3d" + cur2Id + 'p' + curSelectTwoIndex);
            var curFormDiv = document.getElementById("formDiv" + cur2Id + 'p' + curSelectTwoIndex);
            if (curFormDiv.style.display == "none")
            {
                curFormDiv = s3Form;
            }
            curFormDiv.style.display = "none";
            curDivThree.style.display = "none";
            var trs = document.getElementById("ta" + cur2Id).rows;
            for (var i = 0; i < trs.length; i++)
            {
                trs[i].bgColor = "";
            }
        } else {
            var curSelectTwo = document.getElementById("s2" + curIndex);
            var curSelectTwoIndex = curSelectTwo.selectedIndex;
            curSelectTwoNew = document.getElementById("s2" + cur2Id);
            curSelectTwoNew.selectedIndex = next2Index;
            var curDivTwo = document.getElementById("s2d" + curIndex);
            var curDivThree = document.getElementById("s3d" + curIndex + 'p' + curSelectTwoIndex);
            var curFormDiv = document.getElementById("formDiv" + curIndex + 'p' + curSelectTwoIndex);
            if (curFormDiv.style.display == "none")
            {
                curFormDiv = s3Form;
            }
            curFormDiv.style.display = "none";
            curDivThree.style.display = "none";
            curDivTwo.style.display = "none";
            var trs = document.getElementById("ta" + cur2Id).rows;
            for (var i = 0; i < trs.length; i++)
            {
                trs[i].bgColor = "";
            }
        }
        // ... (commented-out debug alert()s and a dead duplicate of the branch above omitted) ...
    }
    catch (e)
    {}
    try
    {
        var nextDivTwo = document.getElementById("s2d" + cur2Id);
        nextDivTwo.selectedIndex = next2Index;
        var nextDivThree = document.getElementById("s3d" + cur2Id + 'p' + next2Index);
        var nextFormDiv = document.getElementById("formDiv" + cur2Id + 'p' + next2Index);
        nextDivTwo.style.display = "block";
        nextDivThree.style.display = "block";
        nextFormDiv.style.display = "block";
        var tn = document.getElementById("tn");
        var keyTableId = document.getElementById("keyTableId");
        var keyBcId = document.getElementById("keyBcId");
        tn.innerHTML = nextTD.innerHTML;
        keyTableId.value = nextTableId;
        keyBcId.value = nextBcId;
        document.getElementById("tr" + cur2Id + 'p' + next2Index).bgColor = sColor;
        try
        {
            nextDivThree.firstChild.selectedIndex = 0;
        } catch (eee)
        {}
    }
    catch (e)
    {
        tn.innerHTML = "";
        keyTableId.value = "";
    }
    document.getElementById("keyword").value = "";
    document.getElementById("form" + cur2Id + 'p' + next2Index).submit();
}
OK, I admit that after reading through that code with your head spinning, you haven't actually gotten anything useful out of it.
So what now? We know this kind of JS usually fetches data via AJAX, so let's try searching for XMLHttpRequest.
That turns up a createXMLHttp() function. A quick look shows it just initializes the request object, so whatever actually requests the data should call it. Search for that function name.
There it is; the code is as follows:
function commitForECMA(callback, url, curForm)
{
    request = createXMLHttp();
    request.onreadystatechange = callback;
    if (curForm == null)
    {
        request.open("GET", url);
        request.setRequestHeader("Content-Type", "text/html;encoding=gbk");
    }
    else
    {
        var fromEle = "";
        var myElements = curForm.elements;
        var myLength = myElements.length;
        for (var i = 0; i < myLength; i++)
        {
            var myEle = myElements[i];
            if (myEle.type != "submit" && myEle.value != "")
            {
                if (fromEle.length > 0)
                {
                    fromEle += "&" + myEle.name + "=" + myEle.value;
                }
                else
                {
                    fromEle += myEle.name + "=" + myEle.value;
                }
                fromEle += "&State=1";
            }
        }
        request.open("POST", url);
        fromEle = encodeURI(fromEle);
        fromEle = encodeURI(fromEle);
        request.setRequestHeader("cache-control", "no-cache");
        request.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
    }
    request.send(fromEle);
    if (curForm != null) {
        curForm.reset();
    }
}
Let's read it: the if (curForm == null) branch issues a GET request, while the else branch builds a POST body out of the form's fields.
We now strongly suspect those setRequestHeader calls will come in handy later, so note them down. (Spoiler: the POST is what dynamically requests a table's data, and the GET requests the next-level page behind each table row.)
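Also note that the POST body goes through encodeURI twice. With an empty keyword that's invisible, but if you ever POST a Chinese keyword you'll likely need to reproduce it. A rough Python 2 sketch (approximating encodeURI with urllib.quote plus encodeURI's unescaped character set; not guaranteed byte-for-byte identical):

# -*- coding: utf-8 -*-
# Approximate JS encodeURI() with urllib.quote by passing the characters
# encodeURI leaves alone as `safe`; applying it twice re-encodes the %
# signs from the first pass, e.g. 互 -> %E4%BA%92 -> %25E4%25BA%2592.
import urllib

ENCODE_URI_SAFE = ";,/?:@&=+$-_.!~*'()#"

def double_encode_uri(s):
    once = urllib.quote(s, safe=ENCODE_URI_SAFE)
    return urllib.quote(once, safe=ENCODE_URI_SAFE)

print(double_encode_uri('keyword=互联网药品交易服务&State=1'))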
Now search the page source for that function name, commitForECMA.
One call site looks special.
devPage: could it be related to page navigation? Reading it carefully, we see it grabs the element with id 'pageForm'. Search for that:
<form method=post id=pageForm name=pageForm>
<input type=hidden name=tableId value=33>
<input type=hidden name=bcId value=118715801943244717582221630944>
<input type=hidden name=tableName value=TABLE33>
<input type=hidden name=viewtitleName value=COLUMN312>
<input type=hidden name=viewsubTitleName value=COLUMN310>
<input type=hidden name=keyword value=>
<input type=hidden name=curstart value=>
<input type=hidden name=tableView value=互联网药品交易服务>
</form>
Hey, notice anything? There are your POST parameters! And curstart is exactly the jump-to-page parameter! We're very close to the truth now.
Next question: what URL does the POST go to? Don't forget the devPage function from before:
function devPage(jj)
{
    var curForm = document.getElementById('pageForm');
    curForm.curstart.value = jj;
    commitForECMA(callback, 'search.jsp', curForm);
}
'search.jsp', see it? What, you say that's not a URL?
It's relative: prepend the site path and "http://app1.sfda.gov.cn/datasearch/face3/search.jsp" is your URL~
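A quick smoke test of the endpoint (a sketch; the headers mirror what commitForECMA sets, the field values are the ones from pageForm above, and the curstart value is picked arbitrarily):

# -*- coding: utf-8 -*-
# Smoke-test the discovered endpoint (sketch): POST the pageForm fields
# to search.jsp and see whether a chunk of table HTML comes back.
import requests

url = 'http://app1.sfda.gov.cn/datasearch/face3/search.jsp'
data = {
    'tableId': '33',
    'bcId': '118715801943244717582221630944',
    'tableName': 'TABLE33',
    'viewtitleName': 'COLUMN312',
    'viewsubTitleName': 'COLUMN310',
    'curstart': '2',   # omit for the first page
}
headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded',
}
r = requests.post(url, data=data, headers=headers, timeout=20)
print(r.text[:500])   # expect rows of the results table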
And how do we jump into a record's next-level page? Forgot the GET from earlier? Search for commitForECMA once more.
You got it: now you have the GET address too. Let's give it a try.
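Something like this (a sketch: the actual path and query string are extracted from the <a> tags on the table page, e.g. via href.split("'")[1] as in the full script below, so treat this literal URL as purely illustrative):

# -*- coding: utf-8 -*-
# Fetch one detail page over GET (sketch). detail_url is illustrative;
# in practice it comes out of the <a> tags in the table HTML.
import requests

detail_url = ('http://app1.sfda.gov.cn/datasearch/face3/'
              + 'content.jsp?tableId=33&tableName=TABLE33&Id=1')
print(requests.get(detail_url, timeout=20).text[:300])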
Yes~ the clouds have parted: you have the POST headers and data, and you have the addresses of the next-level pages, so go happily write the code~~~
Here's the ugly Python (2) code:
# -*- coding: utf-8 -*-
from multiprocessing import Pool
import os, threading, random
from bs4 import BeautifulSoup
from requests import Request, Session

# list of next-level URLs
URLS = []
# proxy list (one ip:port per line)
IPANDPORT = []
# target host
HOST = 'http://app1.sfda.gov.cn/datasearch/face3/'
# shared session
session = Session()

# load the proxy list
def getTheRemoteAgent():
    with open("proxy_list.txt", "r") as f:
        for line in f:
            IPANDPORT.append(line.strip())

# POST search.jsp for one page of the table, then parse the row links
def getNextUrl(target_url, index):
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)',
        'X-Requested-With': 'XMLHttpRequest',
        'cache-control': 'no-cache',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    # the hidden fields of pageForm; curstart is omitted for the first page
    data = {
        'tableId': '33',
        'bcId': '118715801943244717582221630944',
        'tableName': 'TABLE33',
        'viewtitleName': 'COLUMN312',
        'viewsubTitleName': 'COLUMN310',
    }
    if index != 0:
        data['curstart'] = '%s' % index
    prepare = Request('POST', target_url, headers=headers, data=data).prepare()
    # retry up to 3 times, through a random proxy each time
    attempts = 0
    success = False
    while attempts < 3 and not success:
        try:
            i = random.randint(0, len(IPANDPORT) - 1)
            proxy = {'http': 'http://%s' % IPANDPORT[i]}
            print proxy
            result = session.send(prepare, timeout=20, proxies=proxy)
            success = True
        except Exception, e:
            print 'request failed, retrying...'
            print e
            attempts += 1
            if attempts == 3:
                print 'failed three times, skipping'
                return  # bail out; otherwise result below is unbound
    parse(result.text)

# GET one detail page, with the same retry-through-proxies scheme
def getNextPage(url):
    attempts = 0
    success = False
    while attempts < 3 and not success:
        try:
            i = random.randint(0, len(IPANDPORT) - 1)
            proxy = {'http': 'http://%s' % IPANDPORT[i]}
            print proxy
            result = session.get(url, timeout=20, proxies=proxy,
                                 headers={'User-Agent': 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'})
            success = True
        except Exception, e:
            print 'request failed, retrying...'
            print e
            attempts += 1
            if attempts == 3:
                print 'failed three times, skipping'
                return
    parseSecPage(result.text)

# pull the value column out of a detail page
def parseSecPage(text):
    soup = BeautifulSoup(text, 'lxml')
    trs = soup.find_all('tr')
    args = []
    for tr in trs:
        td = tr.find('td', {'width': '83%'})
        if td:
            print td.text
            args.append(td.text)
    writeTofile(*args)

# parse a table page: each <a> carries a detail-page URL in its href
def parse(text):
    soup = BeautifulSoup(text, 'lxml')
    As = soup.find_all('a')
    threads = []
    # fetch the detail pages on threads; join after starting them all,
    # so the fetches actually overlap
    for a in As:
        urlSuff = a['href'].split('\'')[1]
        print HOST + urlSuff
        t = threading.Thread(target=getNextPage, args=(HOST + urlSuff,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

# append one record to the output file
# note: called from several threads; fine at this volume, but add a lock
# if you scale this up
def writeTofile(*arr):
    print arr
    path = os.path.expanduser(r'~/Desktop/data/test.txt')
    print path
    f = open(path, "a")
    if arr:
        for rag in arr:
            if isinstance(rag, int):
                f.write(str(rag) + ',')
            else:
                f.write(rag.encode('utf8') + ',')
        f.write('\n')
    f.close()

# load proxies at import time so worker processes get the list too
getTheRemoteAgent()

if __name__ == '__main__':
    pool = Pool()
    host = 'http://app1.sfda.gov.cn/datasearch/face3/search.jsp'
    for index in range(37):  # one task per page index
        pool.apply_async(getNextUrl, args=(host, index))
    pool.close()
    pool.join()
    print 'All subprocesses done.'