网页内容、图片及链接抓取通用类(转)

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO.Compression;

/// <summary>
/// Web page scraping helper: downloads a page and extracts its title,
/// description, links, body text and images.
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
public class webCrawl
{
    // Timeout applied to every HTTP request, in milliseconds.
    private const int RequestTimeoutMs = 30000;

    // Responses whose declared Content-Length reaches this size (1 MiB) are skipped.
    private const long MaxContentLength = 1024 * 1024;

    // Finds a charset declaration such as charset=utf-8 or charset="utf-8".
    // The optional quote lets the HTML5 form <meta charset="utf-8"> be recognised.
    private static readonly Regex CharsetRegex = new Regex(
        @"charset\b\s*=\s*[""']?(?<charset>[\w.:+-]+)", RegexOptions.IgnoreCase);

    // Absolute http/https links. The space inside the final character class is
    // part of the original pattern and is kept on purpose.
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // Captures the src attribute value of an <img> tag in the group "imgUrl".
    private static readonly Regex ImgSrcRegex = new Regex(
        @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Accepts only paths that end in a known image extension.
    private static readonly Regex ImageFileRegex = new Regex(
        @"\w+\.(gif|jpe?g|bmp|png)$", RegexOptions.IgnoreCase);

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The page text decoded with the charset reported by <see cref="getEncoding"/>,
    /// or an empty string when the status is not 200 OK, the declared length is
    /// 1 MiB or more, or any error occurs.
    /// </returns>
    /// <remarks>
    /// Two requests are sent: one inside <see cref="getEncoding"/> to sniff the
    /// charset and one here to read the content.
    /// </remarks>
    public static string getHtml(string url)
    {
        try
        {
            Encoding encoding = Encoding.GetEncoding(getEncoding(url));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMs;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return string.Empty;
                }

                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: every caller in this class treats an empty
            // string as "page not available", so failures are not propagated.
            return string.Empty;
        }
    }

    /// <summary>
    /// Determines the character set name of the page at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The charset declared inside the page content if one is found, otherwise the
    /// charset of the HTTP response, otherwise <c>Encoding.Default.BodyName</c>.
    /// The default is also returned when the status is not 200 OK (redirects are
    /// not followed) or the declared length is 1 MiB or more.
    /// </returns>
    /// <remarks>
    /// Exceptions raised while creating or sending the request propagate to the
    /// caller with their original type and stack trace.
    /// </remarks>
    public static string getEncoding(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMs;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
            {
                return Encoding.Default.BodyName;
            }

            Stream body = response.GetResponseStream();
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            string html;
            // ASCII is sufficient here: only the charset declaration is inspected.
            using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
            {
                html = reader.ReadToEnd();
            }

            Match declared = CharsetRegex.Match(html);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }

            if (!string.IsNullOrEmpty(response.CharacterSet))
            {
                return response.CharacterSet;
            }

            return Encoding.Default.BodyName;
        }
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its title.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The text of the first <c>&lt;title&gt;</c> element with every non-word
    /// character (whitespace and punctuation) removed; empty when there is none.
    /// </returns>
    public static string getTitle(string url)
    {
        string html = getHtml(url);
        Match titleMatch = Regex.Match(html, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its meta description.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The content of the <c>&lt;meta name="Description"&gt;</c> tag with every
    /// non-word character removed; empty when the tag is not found.
    /// </returns>
    public static string getDescription(string url)
    {
        string html = getHtml(url);
        Match descMatch = Regex.Match(html, "<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(descMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Extracts the distinct absolute http/https links from a piece of HTML.
    /// </summary>
    /// <param name="htmlStr">HTML text to scan; null or empty yields an empty list.</param>
    /// <returns>The links in order of first appearance, without duplicates.</returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> links = new List<string>();
        if (string.IsNullOrEmpty(htmlStr))
        {
            return links;
        }

        // The set gives O(1) duplicate detection; the list keeps first-seen order.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in LinkRegex.Matches(htmlStr))
        {
            if (seen.Add(match.Value))
            {
                links.Add(match.Value);
            }
        }
        return links;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the text of its body.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The <c>&lt;body&gt;</c> element passed through <see cref="parseHtml"/>,
    /// or an empty string when no body element is found.
    /// </returns>
    public static string getBody(string url)
    {
        string html = getHtml(url);
        Match bodyMatch = Regex.Match(html, @"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");
        return bodyMatch.Success ? parseHtml(bodyMatch.Value) : string.Empty;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns every image tag in it.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The raw <c>&lt;img ...&gt;</c> tags in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> tags = new List<string>();
        string html = getHtml(url);
        foreach (Match match in Regex.Matches(html, @"<(IMG|img)[^>]+>"))
        {
            tags.Add(match.Value);
        }
        return tags;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the URLs of its images.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The <c>src</c> values that end in gif, jpg, jpeg, bmp or png; relative
    /// values are resolved against <paramref name="url"/> so they can be downloaded.
    /// </returns>
    public static List<string> getImgPath(string url)
    {
        List<string> paths = new List<string>();
        string html = getHtml(url);

        foreach (Match match in ImgSrcRegex.Matches(html))
        {
            string imgPath = match.Groups["imgUrl"].Value.Trim();

            // Second filter: drop src values that do not point at an image file.
            if (!ImageFileRegex.IsMatch(imgPath))
            {
                continue;
            }

            paths.Add(ToAbsoluteUrl(url, imgPath));
        }
        return paths;
    }

    // Resolves an image path against the page URL; absolute http/https paths are returned untouched.
    private static string ToAbsoluteUrl(string pageUrl, string path)
    {
        if (path.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            path.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return path;
        }

        // Uri resolution handles "img/a.png", "/img/a.png", "../a.png" and "//host/a.png".
        Uri baseUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri) &&
            Uri.TryCreate(baseUri, path, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        // Fall back to plain concatenation when the page URL cannot be parsed.
        return getUrl(pageUrl) + path;
    }

    /// <summary>
    /// Downloads an image into the directory returned by <c>Server.MapPath("")</c>,
    /// naming it with the current timestamp plus the extension found in the URL.
    /// </summary>
    /// <param name="fileurl">
    /// Absolute URL of the image, e.g. http://xxx.com/img/logo.jpg. Nothing is
    /// downloaded when the URL is null, empty, or its path has no file extension.
    /// </param>
    /// <remarks>Uses <c>System.Web.HttpContext.Current</c> to locate the target directory.</remarks>
    public void DownloadImg(string fileurl)
    {
        string extension = GetUrlExtension(fileurl);
        if (extension.Length == 0)
        {
            return;
        }

        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
        string filepath = System.Web.HttpContext.Current.Server.MapPath("") + "/" + imgName;

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    // Returns the extension (with leading dot) of the last path segment of a URL, or "" if there is none.
    private static string GetUrlExtension(string fileurl)
    {
        if (string.IsNullOrEmpty(fileurl))
        {
            return string.Empty;
        }

        // The query string and fragment are not part of the file name.
        string path = fileurl;
        int cut = path.IndexOfAny(new char[] { '?', '#' });
        if (cut >= 0)
        {
            path = path.Substring(0, cut);
        }

        int schemeEnd = path.IndexOf("://", StringComparison.Ordinal);
        int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
        int lastSlash = path.LastIndexOf('/');
        if (lastSlash < hostStart)
        {
            // Host only (e.g. http://xxx.com): the dots belong to the host name.
            return string.Empty;
        }

        int lastDot = path.LastIndexOf('.');
        if (lastDot <= lastSlash || lastDot == path.Length - 1)
        {
            return string.Empty;
        }
        return path.Substring(lastDot);
    }

    /// <summary>
    /// Strips markup from a piece of HTML.
    /// </summary>
    /// <param name="html">HTML text; null or empty yields an empty string.</param>
    /// <returns>
    /// The text with all tags, stray angle brackets and all whitespace removed.
    /// HTML entities such as &amp;nbsp; are left as they are.
    /// </returns>
    public static string parseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string text = Regex.Replace(html, "<[^>]*>", string.Empty);
        text = text.Replace("<", string.Empty).Replace(">", string.Empty);
        return Regex.Replace(text, @"\s+", "");
    }

    /// <summary>
    /// Returns the directory part of a URL, always ending in a slash.
    /// http://www.xxx.com gives http://www.xxx.com/ and
    /// http://www.xxx.com/art.aspx gives http://www.xxx.com/.
    /// </summary>
    /// <param name="url">The URL to trim; any query string or fragment is ignored.</param>
    /// <returns>The URL up to and including the last slash of its path.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string getUrl(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        int cut = url.IndexOfAny(new char[] { '?', '#' });
        string path = cut >= 0 ? url.Substring(0, cut) : url;

        // Slashes belonging to the "://" separator must not count as path slashes.
        int schemeEnd = path.IndexOf("://", StringComparison.Ordinal);
        int pathStart = schemeEnd < 0 ? 0 : schemeEnd + 3;

        int lastSlash = path.LastIndexOf('/');
        if (lastSlash < pathStart)
        {
            return path + "/";
        }
        return path.Substring(0, lastSlash + 1);
    }
}

转载于:https://www.cnblogs.com/zhang9418hn/archive/2011/09/13/2175173.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值