网页内容、图片及链接抓取通用类

weixin_30252155

于 2011-09-29 18:03:00 发布

阅读量48

点赞数

原文链接：http://www.cnblogs.com/wlly216/archive/2011/09/29/2195778.html

版权

网页抓取类

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO.Compression;

/// <summary>
/// Web page scraping helper: downloads a page and extracts its title,
/// description, body text, hyperlinks and images.
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
/// <remarks>
/// All methods that take a URL perform blocking network I/O. Extraction is
/// regex based and therefore only a best-effort approximation of real HTML parsing.
/// </remarks>
public class webCrawl
{
    // Responses whose declared Content-Length is this large (in bytes) or larger are ignored.
    private const int MaxContentLength = 1024 * 1024;

    // Timeout applied to every HTTP request, in milliseconds.
    private const int RequestTimeoutMilliseconds = 30000;

    // Accepts charset=x, charset="x" and charset='x' (meta http-equiv and HTML5 meta charset forms).
    private static readonly Regex CharsetRegex =
        new Regex(@"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)", RegexOptions.IgnoreCase);

    private static readonly Regex TitleRegex =
        new Regex("<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    private static readonly Regex DescriptionRegex =
        new Regex("<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Strips every non-word character (whitespace AND punctuation).
    private static readonly Regex NonWordRegex = new Regex(@"\W");

    // The blank inside the path character class is intentional: spaces are allowed in the path part.
    private static readonly Regex LinkRegex =
        new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    private static readonly Regex BodyRegex =
        new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");

    private static readonly Regex ImgTagRegex = new Regex(@"<(IMG|img)[^>]+>");

    private static readonly Regex ImgSrcRegex =
        new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
                  RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Keeps only sources that end in a known image extension (filters out src values pointing at pages).
    private static readonly Regex ImageFileRegex =
        new Regex(@"\w+\.(gif|jpg|bmp|png)$", RegexOptions.IgnoreCase);

    private static readonly Regex TagRegex = new Regex("<[^>]*>");

    private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its markup as text.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The decoded response body, or an empty string when the request fails, the
    /// status is not 200 OK, or the declared content length is 1 MiB or more.
    /// </returns>
    /// <remarks>
    /// The page is requested twice: once by <see cref="getEncoding"/> to sniff the
    /// charset and once to read the body with that charset.
    /// </remarks>
    public static string getHtml(string url)
    {
        try
        {
            Encoding encoding = resolveEncoding(getEncoding(url));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMilliseconds;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return string.Empty;
                }

                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: every extractor in this class treats an
            // empty string as "page unavailable" instead of propagating errors.
            return string.Empty;
        }
    }

    /// <summary>
    /// Determines the character set name of the page at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The charset declared in the markup if one is found, otherwise the charset of
    /// the response's Content-Type header, otherwise the body name of the system
    /// default encoding. The default is also returned when the status is not 200 OK
    /// (redirects are not followed) or the declared content length is 1 MiB or more.
    /// </returns>
    /// <remarks>
    /// Request failures (for example <see cref="WebException"/>) propagate to the
    /// caller with their original type and stack trace.
    /// </remarks>
    public static string getEncoding(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMilliseconds;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
            {
                return Encoding.Default.BodyName;
            }

            string html;
            using (StreamReader reader = openSniffingReader(response))
            {
                html = reader.ReadToEnd();
            }

            Match declared = CharsetRegex.Match(html);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }

            // CharacterSet may be null or empty when the header carries no charset.
            if (!string.IsNullOrEmpty(response.CharacterSet))
            {
                return response.CharacterSet;
            }

            return Encoding.Default.BodyName;
        }
    }

    /// <summary>
    /// Returns the text of the page's &lt;title&gt; element with every non-word
    /// character (whitespace and punctuation) removed.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The condensed title, or an empty string when no title is found.</returns>
    public static string getTitle(string url)
    {
        string htmlStr = getHtml(url);
        Match titleMatch = TitleRegex.Match(htmlStr);
        return NonWordRegex.Replace(titleMatch.Groups[1].Value, string.Empty);
    }

    /// <summary>
    /// Returns the content of the page's Description meta tag with every non-word
    /// character (whitespace and punctuation) removed.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The condensed description, or an empty string when the tag is not found.</returns>
    public static string getDescription(string url)
    {
        string htmlStr = getHtml(url);
        Match description = DescriptionRegex.Match(htmlStr);
        return NonWordRegex.Replace(description.Groups[1].Value, string.Empty);
    }

    /// <summary>
    /// Extracts the distinct http:// links contained in a piece of markup.
    /// </summary>
    /// <param name="htmlStr">Markup to scan; null or empty yields an empty list.</param>
    /// <returns>The links in order of first appearance, without duplicates.</returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> links = new List<string>();
        if (string.IsNullOrEmpty(htmlStr))
        {
            return links;
        }

        // The set gives O(1) duplicate detection; the list preserves first-seen order.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in LinkRegex.Matches(htmlStr))
        {
            if (seen.Add(match.Value))
            {
                links.Add(match.Value);
            }
        }
        return links;
    }

    /// <summary>
    /// Returns the text inside the page's &lt;body&gt; element with all tags and all
    /// whitespace removed (see <see cref="parseHtml"/>).
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The condensed body text, or an empty string when no body is found.</returns>
    public static string getBody(string url)
    {
        string htmlStr = getHtml(url);
        Match body = BodyRegex.Match(htmlStr);
        return body.Success ? parseHtml(body.Value) : string.Empty;
    }

    /// <summary>
    /// Returns every &lt;img ...&gt; tag found in the page, as raw markup.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The tags in document order; empty when the page has none or is unavailable.</returns>
    public static List<string> getImg(string url)
    {
        List<string> tags = new List<string>();
        string htmlStr = getHtml(url);
        foreach (Match match in ImgTagRegex.Matches(htmlStr))
        {
            tags.Add(match.Value);
        }
        return tags;
    }

    /// <summary>
    /// Returns the absolute URL of every gif/jpg/bmp/png image referenced by an
    /// &lt;img&gt; tag in the page. Relative sources are resolved against
    /// <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The image URLs in document order.</returns>
    public static List<string> getImgPath(string url)
    {
        List<string> paths = new List<string>();
        string htmlStr = getHtml(url);
        foreach (Match match in ImgSrcRegex.Matches(htmlStr))
        {
            string imgPath = match.Groups["imgUrl"].Value.Trim();

            // Second filter: drop sources that do not end in an image file name.
            if (!ImageFileRegex.IsMatch(imgPath))
            {
                continue;
            }
            paths.Add(toAbsoluteUrl(url, imgPath));
        }
        return paths;
    }

    /// <summary>
    /// Downloads the file at <paramref name="fileurl"/> into the physical directory
    /// of the current ASP.NET request, naming it with the current timestamp plus
    /// the extension taken from the URL path.
    /// </summary>
    /// <param name="fileurl">
    /// Absolute URL such as http://xxx.com/img/logo.jpg. URLs that are empty, not
    /// absolute, or whose path has no file extension are ignored.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="fileurl"/> is null.</exception>
    /// <exception cref="InvalidOperationException">There is no current HTTP request.</exception>
    public void DownloadImg(string fileurl)
    {
        if (fileurl == null)
        {
            throw new ArgumentNullException("fileurl");
        }

        Uri fileUri;
        if (!Uri.TryCreate(fileurl, UriKind.Absolute, out fileUri))
        {
            return;
        }

        // Take the extension from the path only, so a query string or a dot in
        // the host name never leaks into the generated file name.
        string extension = Path.GetExtension(fileUri.AbsolutePath);
        if (string.IsNullOrEmpty(extension))
        {
            return;
        }

        System.Web.HttpContext context = System.Web.HttpContext.Current;
        if (context == null)
        {
            throw new InvalidOperationException("DownloadImg requires an active HTTP request (HttpContext.Current is null).");
        }

        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff", System.Globalization.CultureInfo.InvariantCulture) + extension;
        string filepath = Path.Combine(context.Server.MapPath(""), imgName);

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    /// <summary>
    /// Strips markup from <paramref name="html"/>: removes every tag, any stray
    /// angle bracket, and ALL whitespace (words are therefore joined together).
    /// </summary>
    /// <param name="html">Markup to strip; null or empty yields an empty string.</param>
    /// <returns>The remaining text without tags or whitespace.</returns>
    public static string parseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string value = TagRegex.Replace(html, string.Empty);
        value = value.Replace("<", string.Empty);
        value = value.Replace(">", string.Empty);
        return WhitespaceRegex.Replace(value, string.Empty);
    }

    /// <summary>
    /// Returns the directory part of a URL, always ending in '/'.
    /// </summary>
    /// <example>
    /// http://www.xxx.com           -> http://www.xxx.com/
    /// http://www.xxx.com/art.aspx  -> http://www.xxx.com/
    /// http://www.xxx.com/a/b.aspx?x=1 -> http://www.xxx.com/a/
    /// </example>
    /// <param name="url">The URL to trim.</param>
    /// <returns>The URL up to and including the last '/' of its path.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string getUrl(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        Uri absolute;
        if (Uri.TryCreate(url, UriKind.Absolute, out absolute) && !string.IsNullOrEmpty(absolute.Host))
        {
            // GetLeftPart(Path) drops query and fragment and always contains at
            // least the root "/", so the scheme's "//" is never mistaken for the
            // last path separator.
            string withoutQuery = absolute.GetLeftPart(UriPartial.Path);
            return withoutQuery.Substring(0, withoutQuery.LastIndexOf('/') + 1);
        }

        // Not an absolute web URL: fall back to plain string trimming.
        int lastSlash = url.LastIndexOf('/');
        return lastSlash >= 0 ? url.Substring(0, lastSlash + 1) : url + "/";
    }

    // Maps a charset name to an Encoding, falling back to the system default for unknown or empty names.
    private static Encoding resolveEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return Encoding.Default;
        }

        try
        {
            return Encoding.GetEncoding(name.Trim());
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
    }

    // Opens a reader over the response body for charset sniffing, unwrapping gzip when the server used it.
    private static StreamReader openSniffingReader(HttpWebResponse response)
    {
        string contentEncoding = response.ContentEncoding;
        if (contentEncoding != null && contentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
        {
            return new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
        }
        return new StreamReader(response.GetResponseStream(), Encoding.ASCII);
    }

    // Resolves an image source against the page URL; sources that are already http(s) URLs are returned unchanged.
    private static string toAbsoluteUrl(string pageUrl, string path)
    {
        if (path.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            path.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return path;
        }

        Uri baseUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri) && Uri.TryCreate(baseUri, path, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        // Page URL is not a usable base: fall back to directory + path concatenation.
        return getUrl(pageUrl) + path;
    }
}

转载于:https://www.cnblogs.com/wlly216/archive/2011/09/29/2195778.html