基于ES6的爬网脚本

最新推荐文章于 2022-04-14 22:46:17 发布

懒人Ethan

最新推荐文章于 2022-04-14 22:46:17 发布

阅读量149

点赞数

分类专栏：前端 ES6

本文链接：https://blog.csdn.net/weixin_43263355/article/details/107867851

版权

前端同时被 2 个专栏收录

32 篇文章 0 订阅

订阅专栏

ES6

20 篇文章 0 订阅

订阅专栏

爬网脚本

需求介绍
关键代码分析
附录
- 完整爬网代码
- 格式化后的顶部按钮html

本文通过使用ES6的新特性，如async函数，扩展运算符，正则表达式的具名组匹配等，实现了一个简单的爬网脚本。
通过获取每个页面的html代码，进行指定区域的正则表达式匹配，最后完成所有需求。

需求介绍

获取指定页面title元素的内容。
获取指定页面顶端链接的名称和地址。

关键代码分析

我们希望通过异步请求获取每个页面的html代码，再对每个页面逐一进行分析。传统的回调式异步请求很难在循环中按顺序完成“请求—处理—输出”的过程，所以这里使用async函数，让异步处理的写法和同步处理几乎没有差别：循环体每次获取一个页面的html代码，处理并输出一次，代码如下：

(async function($){
	// Fetch the pages strictly one after another: each request settles
	// before the next iteration starts.
	for(let url of Urls){
		// The loop variable is the address to request (no `obj` exists in this scope).
		await GetResponse(url);
	}
})(jQuery)

按照async/await的要求，封装一个返回Promise对象的Http Get请求。

 /**
  * Wraps a `$.ajax` GET request in a Promise so it can be awaited.
  *
  * @param {string} url - Address to request.
  * @returns {Promise<*>} Resolves with the data handed to the success
  *   callback; rejects with the value handed to the error callback.
  */
 function GetResponse(url){
    const request = { url, type: 'get' };
    return new Promise(function(resolve, reject){
      request.success = function(data){
        resolve(data);
      };
      request.error = function(err){
        // Log the failure before propagating it to the awaiting caller.
        console.log(err);
        reject(err);
      };
      $.ajax(request);
    });
 }

精确匹配html中title元素的内容，使用.*?避免贪婪匹配。请注意，对于html元素内容的匹配，我非常推荐非贪婪的匹配模式。在这个例子中，如果我们去掉问号，则变为贪婪匹配模式，这样会把大量无关的内容也匹配进来。
另外，考虑到在一些页面压缩的情况下，title元素中有可能包含换行符或结束符，因此采用dotAll模式，让点(.)可以匹配任意单个字符，具体代码如下：

 /**
  * Extracts the text of the first <title> element from an HTML string.
  *
  * @param {string} html - HTML source of a page.
  * @returns {string} The title text, or NA when there is no <title>
  *   element or its content is empty.
  */
 function getTitle(html){
    // Named capture group keeps the access readable; the s (dotAll) flag lets
    // "." match line breaks inside the title.
    // match() returns null when nothing matches, so the optional chaining has
    // to start at the match result itself, otherwise `.groups` throws.
    const title = html.match(/<title>(?<title>.*?)<\/title>/s)?.groups.title;
    return title || NA; // fall back to N/A when nothing usable was matched
 }

匹配顶部按钮的html如下，可以看到html代码并不规整，格式化后的html代码请参看附录。

<div class="m-hero is-left-align" id="sc-lb-module-product-masthead" data-post-status="publish-status" data-post-status-label="Published">
								<div class="background">
						<img class="img-bg lazyloaded" data-src="/cn/content/images/cn-wm-app-1600x490px.jpg" alt="Cn wm app xpx" src="/cn/content/images/cn-wm-app-1600x490px.jpg" loading="lazy">
					</div>
					
		
		<div class="content">
			<div class="outer-wrapper">
				<div class="hero-header">
					<div class="wrapper">
						<ul class="breadcrumb">
							
<li class="post post-page"><a property="item" typeof="WebPage"  href="/cn/promotions/" class="post post-page"><span property="name">APP</span></a></li>
<li class="post post-page current-item"><h1><span property="name" class="post post-page current">欢迎下载APP</span></h1></li>
						</ul>
													<h2 class="title">
								欢迎下载财富管理APP							</h2>
												<ul class="buttons">
																								<li>
					<a href="#sc-lb-module-product-action-3" title="立刻下载APP" data-context="sc-lb-module-product-masthead-Masthead CTA title" class="c-button is-theme-solid-green-hollow-white">立刻下载APP</a>
	</li>

<li>
        <a href="#sc-lb-module-product-action-4" title="获取红包" data-context="sc-lb-module-product-masthead-Masthead CTA title" class="c-button is-theme-solid-green-hollow-white">获取红包</a>
    </li>
																													</ul>

						
																	</div>
				</div>
			</div>
		</div>

					</div>

我们需要从上面的html中获取类名为buttons的ul（即ul.buttons）中的内容。当前页面可能包括多个ul.buttons，根据需求，我们只需要div.hero-header中的链接列表。
匹配思路：
考虑到最后匹配的结果可能包括多个顶部按钮，本例中包括“立刻下载APP”和“获取红包”，其他页面顶部按钮也是不定数目。因此，我们要采用全局匹配模式。考虑到全局匹配，数据量比较大，所以笔者采用先大后小的策略。即先把div.hero-header中的链接列表内容全部匹配到，再具体处理每个按钮。

关键代码如下：

  // match() returns null when the page has no matching block; chain optionally
  // so ulHtml becomes undefined instead of the statement throwing a TypeError.
  let ulHtml = html.match(/<div\s*class="m-hero.*?".*?>.*?<ul class="buttons">(?<buttons>.*?)<\/ul>/s)?.groups.buttons;

因为采用了dotAll模式，所以尽管div和ul中包括多个空格和换行，但是并不影响最后的匹配。得到的匹配结果如下：


                                                                            <li>
<a href="#sc-lb-module-product-action-3" title="立刻下载APP" data-context="sc-lb-module-product-masthead-Masthead CTA title" class="c-button is-theme-solid-green-hollow-white">立刻下载APP</a>
</li>

<li>
        <a href="#sc-lb-module-product-action-4" title="获取红包" data-context="sc-lb-module-product-masthead-Masthead CTA title" class="c-button is-theme-solid-green-hollow-white">获取红包</a>
    </li>

对于剩下的按钮列表内容匹配，就变得很简单了，我们采用全局匹配matchAll函数，因为要匹配的内容已经被过滤，所以即使全局匹配，也不会产生性能问题。关键代码如下：

// Global + dotAll match over the already-extracted <ul> content: every match is
// one <li>; its named groups `url` and `text` are kept as the button record.
let buttons = [...ulHtml.matchAll(/\s*<li\s*>.*?<a\s+href="(?<url>.*?)".*?>(?<text>.*?)<\/a>\s*<\/li>/gs)].map( li => li.groups);

matchAll返回的结果是一个迭代器，我们将扩展运算符和[]一同使用，直接将其转为普通数组，进行迭代获取顶部链接内容。因为在匹配中，采用具名组匹配，链接的名称为text，链接的URL为url，因此我们直接访问groups属性，即可获取链接的内容。

附录

完整爬网代码

(async function($){
  const NA = "N/A" ;
  const Urls = Array.of();
  let index = 0;
  /**
   * Wraps a `$.ajax` GET request in a Promise so it can be awaited.
   *
   * @param {string} url - Address to request.
   * @returns {Promise<*>} Resolves with the data handed to the success
   *   callback; rejects with the value handed to the error callback.
   */
  function GetResponse(url){
    const request = { url, type: 'get' };
    return new Promise(function(resolve, reject){
      request.success = function(data){
        resolve(data);
      };
      request.error = function(err){
        // Log the failure before propagating it to the awaiting caller.
        console.log(err);
        reject(err);
      };
      $.ajax(request);
    });
  }
  /**
   * Extracts the text of the first <title> element from an HTML string.
   *
   * @param {string} html - HTML source of a page.
   * @returns {string} The title text, or NA when there is no <title>
   *   element or its content is empty.
   */
  function getTitle(html){
    // The s (dotAll) flag lets "." match line breaks inside the title.
    // match() returns null when nothing matches, so the optional chaining has
    // to start at the match result itself, otherwise `.groups` throws.
    const title = html.match(/<title>(?<title>.*?)<\/title>/s)?.groups.title;
    return title || NA;
  }
  /**
   * Collects the links of the first `ul.buttons` list that follows the
   * opening `div.m-hero…` tag.
   *
   * @param {string} html - HTML source of a page.
   * @returns {Array<{url: string, text: string}>} One record per <li> link
   *   (the regex named groups); an empty array when the page has no such
   *   block or the list is empty.
   */
  function getButtons(html){
    // Step 1: narrow the input down to the content of the button list.
    // match() returns null for pages without the block, so chain optionally
    // to reach the empty-result guard below instead of throwing.
    const ulHtml = html.match(/<div\s*class="m-hero.*?".*?>.*?<ul class="buttons">(?<buttons>.*?)<\/ul>/s)?.groups.buttons;
    if (!ulHtml) return [];
    // Step 2: global match over the small fragment; each match is one <li>.
    return [...ulHtml.matchAll(/\s*<li\s*>.*?<a\s+href="(?<url>.*?)".*?>(?<text>.*?)<\/a>\s*<\/li>/gs)].map(li => li.groups);
  }
  /**
   * Fills the shared `Urls` list from the newline-separated `pages` text.
   * Blank lines are dropped, so an empty `pages` value or a trailing newline
   * does not add an empty URL to the crawl list.
   */
  function setDataSource(){
    let pages = ``; // Add the URLs to crawl here, one address per line
    const addresses = pages
      .split("\n")
      .map(line => line.trim())
      .filter(line => line !== "");
    Urls.push(...addresses);
  }

  // Crawl every configured page in order and print the findings to the console.
  const logBlue = (value) => console.log("%c%s", "color:blue", value);

  console.log("%c%s", "color:green", "Begin");
  setDataSource();
  for(const rawUrl of Urls){
    const page = { href: rawUrl.trim() };
    try{
      const html = await GetResponse(page.href);
      page.title = getTitle(html);
      page.buttons = getButtons(html);
      logBlue(index + 1);
      logBlue(page.title);
      logBlue(page.href);
      for(const button of page.buttons){
        logBlue(`Button Text: ${button.text}`);
        logBlue(`Button Url: ${button.url}`);
      }
    }catch(err){
      // A failing page is reported in red and the crawl continues with the next one.
      console.log("%c%s", "color:red", err);
    }finally{
      // Count the page whether or not it succeeded.
      ++index;
    }
  }
  console.log("End");
})(jQuery)

格式化后的顶部按钮html

<div class="m-hero is-left-align" id="sc-lb-module-product-masthead" data-post-status="publish-status" data-post-status-label="Published">
        <div class="background">
            <img class="img-bg lazyloaded" data-src="/cn/content/images/cn-wm-app-1600x490px.jpg" alt="Cn wm app xpx" src="/cn/content/images/cn-wm-app-1600x490px.jpg" loading="lazy">
        </div>
        <div class="content">
            <div class="outer-wrapper">
                <div class="hero-header">
                    <div class="wrapper">
                        <ul class="breadcrumb">     
                            <li class="post post-page"><a property="item" typeof="WebPage"  href="/cn/promotions/" class="post post-page"><span property="name">APP</span></a></li>
                            <li class="post post-page current-item"><h1><span property="name" class="post post-page current">欢迎下载APP</span></h1></li>
                        </ul>
                        <h2 class="title">
                            欢迎下载财富管理APP
                        </h2>
                        <ul class="buttons">
                            <li>
                                <a href="#sc-lb-module-product-action-3" title="立刻下载APP" data-context="sc-lb-module-product-masthead-Masthead CTA title" class="c-button is-theme-solid-green-hollow-white">立刻下载APP</a>
                            </li>
                            <li>
                                <a href="#sc-lb-module-product-action-4" title="获取红包" data-context="sc-lb-module-product-masthead-Masthead CTA title" class="c-button is-theme-solid-green-hollow-white">获取红包</a>
                            </li>
                        </ul>
                    </div>
                </div>
            </div>
        </div>
    </div>