// Web page scraping class
using
System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO.Compression;
/// <summary>
/// Web page scraping helpers: downloads a page and extracts its title, description,
/// links, body text and images with regular expressions.
/// </summary>
/// <remarks>
/// Author: loafinweb. Date: 2011-09-12.
/// Every method that takes a page URL downloads the page again through
/// <see cref="getHtml"/>; nothing is cached between calls.
/// </remarks>
public class webCrawl
{
    /// <summary>Pages whose reported Content-Length is this large (1 MiB) or larger are not read.</summary>
    private const int MaxContentLength = 1024 * 1024;

    /// <summary>Timeout applied to every HTTP request issued by this class, in milliseconds.</summary>
    private const int RequestTimeoutMilliseconds = 30000;

    // Finds "charset=NAME", with or without a quote in front of NAME, so that both
    // <meta http-equiv ... content="text/html; charset=gb2312"> and <meta charset="utf-8"> are recognised.
    private static readonly Regex CharsetPattern =
        new Regex(@"charset\b\s*=\s*[""']?\s*(?<charset>[\w.:-]+)", RegexOptions.IgnoreCase);

    // Absolute http:// links. The space inside the path character class is part of the pattern,
    // so an unquoted link followed by plain words can absorb those words.
    private static readonly Regex LinkPattern =
        new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // A <body> element that contains no nested <body> or </body> tag.
    private static readonly Regex BodyPattern =
        new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes it with the encoding
    /// reported by <see cref="getEncoding"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The page text, or an empty string when the request fails, the status is not 200 OK,
    /// the reported Content-Length is 1 MiB or more, or the encoding cannot be resolved.
    /// </returns>
    /// <remarks>
    /// This is a best-effort call: every failure is reported as an empty string.
    /// Two requests are sent per call, one by <see cref="getEncoding"/> and one here.
    /// </remarks>
    public static string getHtml(string url)
    {
        try
        {
            Encoding encoding = Encoding.GetEncoding(getEncoding(url));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMilliseconds;

            // The using blocks release the connection on every path, including the
            // early return below and any exception thrown while reading.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return string.Empty;
                }

                using (StreamReader reader = new StreamReader(OpenBodyStream(response), encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch
        {
            // Deliberate best-effort: callers treat an empty string as "page unavailable".
            return string.Empty;
        }
    }

    /// <summary>
    /// Determines the character set of the page at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Address of the page to inspect.</param>
    /// <returns>
    /// The first <c>charset=</c> value found in the page text; otherwise the character set from
    /// the response headers; otherwise the body name of <see cref="Encoding.Default"/>.
    /// The default is also returned when the status is not 200 OK or the reported
    /// Content-Length is 1 MiB or more. Redirects are not followed.
    /// </returns>
    /// <exception cref="Exception">
    /// Any failure while requesting or reading the page. The original exception is attached
    /// as <see cref="Exception.InnerException"/>.
    /// </exception>
    public static string getEncoding(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMilliseconds;
            request.AllowAutoRedirect = false;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < MaxContentLength)
                {
                    // Read the header value before the body stream is consumed and closed.
                    string headerCharset = response.CharacterSet;

                    string html;
                    // ASCII is enough here: only the charset declaration is of interest.
                    using (StreamReader reader = new StreamReader(OpenBodyStream(response), Encoding.ASCII))
                    {
                        html = reader.ReadToEnd();
                    }

                    Match declared = CharsetPattern.Match(html);
                    if (declared.Success)
                    {
                        return declared.Groups["charset"].Value;
                    }

                    if (!string.IsNullOrEmpty(headerCharset))
                    {
                        return headerCharset;
                    }
                }
            }

            return Encoding.Default.BodyName;
        }
        catch (Exception ex)
        {
            // Same exception type as before, but the cause is no longer discarded.
            throw new Exception(ex.Message, ex);
        }
    }

    /// <summary>
    /// Downloads the page and returns the text of its <c>&lt;title&gt;</c> element.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <returns>
    /// The title with every non-word character removed (whitespace and punctuation alike),
    /// or an empty string when no attribute-free <c>&lt;title&gt;</c> element is found.
    /// </returns>
    public static string getTitle(string url)
    {
        string html = getHtml(url);
        Match titleMatch = Regex.Match(html, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Downloads the page and returns the content of its description meta tag.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <returns>
    /// The description with every non-word character removed, or an empty string when the page
    /// has no tag of the exact form <c>&lt;meta name="Description" content="..."&gt;</c>
    /// (matched case-insensitively).
    /// </returns>
    public static string getDescription(string url)
    {
        string html = getHtml(url);
        Match description = Regex.Match(html, "<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        return Regex.Replace(description.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Extracts the distinct <c>http://</c> links contained in a piece of HTML.
    /// </summary>
    /// <param name="htmlStr">HTML text to scan. Note that this is page content, not a URL.</param>
    /// <returns>
    /// The links in order of first appearance, without duplicates. The list is empty
    /// when <paramref name="htmlStr"/> is null or empty.
    /// </returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> links = new List<string>();
        if (string.IsNullOrEmpty(htmlStr))
        {
            return links;
        }

        // The set gives constant-time duplicate detection; the list keeps first-seen order.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match link in LinkPattern.Matches(htmlStr))
        {
            if (seen.Add(link.Value))
            {
                links.Add(link.Value);
            }
        }

        return links;
    }

    /// <summary>
    /// Downloads the page and returns the text inside its <c>&lt;body&gt;</c> element.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <returns>
    /// The body passed through <see cref="parseHtml"/> (tags and all whitespace removed),
    /// or an empty string when no body element is found.
    /// </returns>
    public static string getBody(string url)
    {
        string html = getHtml(url);
        Match body = BodyPattern.Match(html);
        return body.Success ? parseHtml(body.Value) : string.Empty;
    }

    /// <summary>
    /// Downloads the page and returns every <c>&lt;img ...&gt;</c> tag found in it.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <returns>The complete tag text of each image, in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> tags = new List<string>();
        string html = getHtml(url);
        foreach (Match tag in Regex.Matches(html, @"<(IMG|img)[^>]+>"))
        {
            tags.Add(tag.Value);
        }

        return tags;
    }

    /// <summary>
    /// Downloads the page and returns the address of every image whose <c>src</c> ends in
    /// <c>.gif</c>, <c>.jpg</c>, <c>.bmp</c> or <c>.png</c>.
    /// </summary>
    /// <param name="url">Address of the page. It is also the base for relative image paths.</param>
    /// <returns>
    /// Image addresses in document order. Paths that are not already absolute are resolved
    /// against <paramref name="url"/>.
    /// </returns>
    public static List<string> getImgPath(string url)
    {
        List<string> paths = new List<string>();
        string html = getHtml(url);
        string pattern = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";

        foreach (Match image in Regex.Matches(html, pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline))
        {
            string imgPath = image.Groups["imgUrl"].Value.Trim();

            // Second filter: keep only sources that end in a known image extension.
            if (Regex.IsMatch(imgPath, @"\w+\.(gif|jpg|bmp|png)$"))
            {
                paths.Add(ToAbsoluteUrl(url, imgPath));
            }
        }

        return paths;
    }

    /// <summary>
    /// Downloads a file into the directory returned by <c>Server.MapPath("")</c> for the
    /// current HTTP context, under a name made of the current timestamp plus the extension
    /// of the source file.
    /// </summary>
    /// <param name="fileurl">
    /// Absolute address of the file, for example <c>http://xxx.com/img/logo.jpg</c>.
    /// Nothing is downloaded when it is null, empty, or its last path segment has no extension.
    /// </param>
    /// <remarks>
    /// Reads <c>System.Web.HttpContext.Current</c>, so it presumably has to run inside an
    /// ASP.NET request. Confirm before calling it from a background thread.
    /// </remarks>
    public void DownloadImg(string fileurl)
    {
        if (string.IsNullOrEmpty(fileurl))
        {
            return;
        }

        // Take the extension from the path only, so that dots in the host name or in the
        // query string ("logo.jpg?v=1.2") do not end up in the local file name.
        Uri source;
        string path = Uri.TryCreate(fileurl, UriKind.Absolute, out source) ? source.AbsolutePath : fileurl;
        int lastDot = path.LastIndexOf('.');
        if (lastDot < 0 || lastDot < path.LastIndexOf('/'))
        {
            return;
        }

        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + path.Substring(lastDot);
        string filepath = Path.Combine(System.Web.HttpContext.Current.Server.MapPath(""), imgName);

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    /// <summary>
    /// Reduces HTML to bare text: removes tags, any remaining angle brackets, and all whitespace.
    /// </summary>
    /// <param name="html">HTML text to filter.</param>
    /// <returns>The filtered text, or an empty string when <paramref name="html"/> is null or empty.</returns>
    public static string parseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string text = Regex.Replace(html, "<[^>]*>", string.Empty);
        text = text.Replace("<", string.Empty).Replace(">", string.Empty);
        return Regex.Replace(text, @"\s+", "");
    }

    /// <summary>
    /// Returns the directory part of a URL, always ending in <c>/</c>.
    /// </summary>
    /// <param name="url">Address to shorten.</param>
    /// <returns>
    /// <c>http://www.xxx.com</c> gives <c>http://www.xxx.com/</c>;
    /// <c>http://www.xxx.com/art.aspx</c> gives <c>http://www.xxx.com/</c>;
    /// <c>http://www.xxx.com/img/a.aspx?x=/y</c> gives <c>http://www.xxx.com/img/</c>.
    /// </returns>
    public static string getUrl(string url)
    {
        // Slashes inside the query string or fragment are not path separators.
        int suffixStart = url.IndexOfAny(new[] { '?', '#' });
        string address = suffixStart >= 0 ? url.Substring(0, suffixStart) : url;

        // The two slashes of "://" must not be mistaken for the end of a directory.
        int schemeEnd = address.IndexOf("://", StringComparison.Ordinal);
        int authorityStart = schemeEnd >= 0 ? schemeEnd + 3 : 0;

        int lastSlash = address.LastIndexOf('/');
        if (lastSlash < authorityStart)
        {
            // Bare host without any path.
            return address + "/";
        }

        return address.Substring(0, lastSlash + 1);
    }

    // Returns the response body, transparently decompressed when the server sent it gzip-encoded.
    private static Stream OpenBodyStream(HttpWebResponse response)
    {
        Stream body = response.GetResponseStream();
        bool gzipped = string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase);
        return gzipped ? new GZipStream(body, CompressionMode.Decompress) : body;
    }

    // Resolves an image path found on pageUrl into an absolute address.
    private static string ToAbsoluteUrl(string pageUrl, string path)
    {
        if (path.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            path.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return path;
        }

        // Uri resolution handles "img/a.png", "/img/a.png", "../a.png" and "//host/a.png" alike.
        Uri baseUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri) &&
            Uri.TryCreate(baseUri, path, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        // The page address is not a valid absolute URI: fall back to plain concatenation.
        return getUrl(pageUrl) + path;
    }
}