// 这个类前些日子就写好了,现在共享出来和大家一起交流,希望自己能够坚持每天至少更新一篇BLOG!
/* ################# 说 明 ####################
* 本类为对网站的新闻信息进行自动抓取的类/蜘蛛程序/网络机器人
* 主要是运用.NET FRAMEWORK内库的强大函数功能
* Author:longjun
* Date:2007.10
*/
//引入.NET FRAMEWORK内库资源
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using System.Collections;
using System.Drawing;
namespace ljun.BaseFunction //定义命名空间名为:ljun.BaseFunction
{
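/// <summary>
/// NewsCollection: a news collector (web spider / crawler) class.
/// It inherits from the BasicPage class.
/// [BasicPage is the database-connection helper class; its code is not included here.]
/// </summary>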
public class NewsCollection : BasicPage
{
/// <summary>
/// Creates a new collector. The constructor body is intentionally empty.
/// </summary>
public NewsCollection() { }
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its text.
/// </summary>
/// <param name="url">Address to fetch. Null, or shorter than 10 characters, is rejected.</param>
/// <param name="timeout">
/// Currently unused: WebClient exposes no timeout setting, and the unit callers
/// pass here is not visible from this file, so the value is not applied.
/// </param>
/// <param name="EnCodeType">Encoding used to decode the downloaded bytes.</param>
/// <returns>
/// The page text; "$UrlIsFalse$" when the URL is null or too short;
/// "$GetFalse$" when the download fails for any reason.
/// </returns>
public string GetHttpPage(string url,int timeout,Encoding EnCodeType)
{
    // A null URL used to raise NullReferenceException; treat it like any other bad URL.
    if (url == null || url.Length < 10)
    {
        return "$UrlIsFalse$";
    }

    try
    {
        // WebClient is IDisposable; dispose it so its resources are released promptly.
        using (WebClient client = new WebClient())
        {
            client.Credentials = CredentialCache.DefaultCredentials;
            client.Encoding = EnCodeType;
            return client.DownloadString(url);
        }
    }
    catch (Exception)
    {
        // Best-effort fetch: every failure is reported through the sentinel value.
        return "$GetFalse$";
    }
}
/// <summary>
/// Extracts the text between the first occurrence of <paramref name="strStart"/>
/// and the first following occurrence of <paramref name="strEnd"/>.
/// </summary>
/// <param name="pageStr">Text to search; it is trimmed before searching.</param>
/// <param name="strStart">Opening marker.</param>
/// <param name="strEnd">Closing marker, searched for after the opening marker.</param>
/// <param name="inStart">When true the opening marker is kept in the result.</param>
/// <param name="inEnd">When true the closing marker is kept in the result.</param>
/// <returns>
/// The trimmed extract; "$StartFalse$" when the opening marker is missing or empty
/// (or the page is null/empty); "$EndFalse$" when the closing marker is missing or empty.
/// </returns>
public string GetBody(string pageStr, string strStart, string strEnd, bool inStart, bool inEnd)
{
    // Null arguments used to throw; report them through the existing sentinels.
    if (string.IsNullOrEmpty(pageStr) || string.IsNullOrEmpty(strStart))
    {
        return "$StartFalse$";
    }

    pageStr = pageStr.Trim();

    // Ordinal search: markers are markup, not linguistic text, and culture-sensitive
    // IndexOf can miss markers such as "\n" inside "\r\n" on ICU-based runtimes.
    int start = pageStr.IndexOf(strStart, StringComparison.Ordinal);
    if (start < 0)
    {
        return "$StartFalse$";
    }

    string remainder = pageStr.Substring(start + strStart.Length);

    if (string.IsNullOrEmpty(strEnd))
    {
        return "$EndFalse$";
    }
    int end = remainder.IndexOf(strEnd, StringComparison.Ordinal);
    if (end < 0)
    {
        return "$EndFalse$";
    }

    string strResult = remainder.Substring(0, end);
    if (inStart)
    {
        strResult = strStart + strResult;
    }
    if (inEnd)
    {
        strResult += strEnd;
    }
    return strResult.Trim();
}
/// <summary>
/// Extracts the text between the first occurrence of <paramref name="strEnd"/> and the
/// closest <paramref name="strStart"/> that precedes it, then strips double quotes,
/// single quotes and spaces from the extract.
/// </summary>
/// <param name="pageStr">Text to search.</param>
/// <param name="strStart">Opening marker; the last occurrence before the closing marker is used.</param>
/// <param name="strEnd">Closing marker; the first occurrence in the page is used.</param>
/// <returns>
/// The cleaned extract; "$EndFalse$" when the closing marker is missing or empty
/// (or the page is null/empty); "$StartFalse$" when the opening marker is missing or empty.
/// </returns>
public string GetPaing(string pageStr, string strStart, string strEnd)
{
    // Null arguments used to throw; report them through the existing sentinels.
    if (string.IsNullOrEmpty(pageStr) || string.IsNullOrEmpty(strEnd))
    {
        return "$EndFalse$";
    }
    // Ordinal search: markers are markup, not linguistic text.
    int end = pageStr.IndexOf(strEnd, StringComparison.Ordinal);
    if (end < 0)
    {
        return "$EndFalse$";
    }

    string head = pageStr.Substring(0, end);

    if (string.IsNullOrEmpty(strStart))
    {
        return "$StartFalse$";
    }
    // The search runs on the text before the closing marker only, so a hit can
    // never lie beyond it (the former "start > end" test was unreachable).
    int start = head.LastIndexOf(strStart, StringComparison.Ordinal);
    if (start < 0)
    {
        return "$StartFalse$";
    }

    string result = head.Substring(start + strStart.Length);
    // The double-quote literal was corrupted to "/"" in the source, which does not compile.
    result = result.Replace("\"", "");
    result = result.Replace("'", "");
    result = result.Replace(" ", "");
    return result.Trim();
}
/// <summary>
/// Collects every piece of text that sits between <paramref name="strStart"/> and
/// <paramref name="strEnd"/> on a single line of <paramref name="pageStr"/>.
/// </summary>
/// <param name="pageStr">Text to search.</param>
/// <param name="strStart">Literal opening marker.</param>
/// <param name="strEnd">Literal closing marker.</param>
/// <returns>
/// An ArrayList of strings holding the text found between the markers (markers excluded).
/// When nothing can be collected the list holds exactly one sentinel:
/// "$StartFalse$" (opening marker absent/empty or page null/empty),
/// "$EndFalse$" (closing marker absent/empty) or "$NoneLink$" (no marker pair matched).
/// </returns>
public ArrayList GetArray(string pageStr, string strStart, string strEnd)
{
    ArrayList linkArray = new ArrayList();

    // Presence checks are case-sensitive, as before; null input no longer throws.
    if (string.IsNullOrEmpty(pageStr) || string.IsNullOrEmpty(strStart)
        || pageStr.IndexOf(strStart, StringComparison.Ordinal) < 0)
    {
        linkArray.Add("$StartFalse$");
        return linkArray;
    }
    if (string.IsNullOrEmpty(strEnd)
        || pageStr.IndexOf(strEnd, StringComparison.Ordinal) < 0)
    {
        linkArray.Add("$EndFalse$");
        return linkArray;
    }

    // The markers are literal text, so escape them: characters such as ( ) ? . would
    // otherwise be read as regex syntax and match the wrong text or throw.
    // The middle group captures the payload directly. The previous approach removed the
    // markers with a case-sensitive string Replace, which left them in place whenever the
    // case-insensitive regex matched a differently-cased marker and also deleted marker
    // text occurring inside the payload.
    Regex linkRegex = new Regex(
        Regex.Escape(strStart) + "(.+?)" + Regex.Escape(strEnd),
        RegexOptions.IgnoreCase);

    foreach (Match match in linkRegex.Matches(pageStr))
    {
        linkArray.Add(match.Groups[1].Value);
    }

    if (linkArray.Count == 0)
    {
        linkArray.Add("$NoneLink$");
    }
    return linkArray;
}
/// <summary>
/// Finds the image URLs used by img tags in <paramref name="pageStr"/>, resolves each
/// one against <paramref name="webUrl"/>, optionally downloads the image, and rewrites
/// every occurrence of the original URL in the page to the resolved (or local) URL.
/// </summary>
/// <param name="pageStr">HTML to process; null is treated as an empty string.</param>
/// <param name="SavePath">Local root folder; images go to SavePath/UserFiles/yyyyMM.</param>
/// <param name="CDir">URL prefix under which the saved files are published.</param>
/// <param name="webUrl">Address of the page, used to resolve relative image URLs.</param>
/// <param name="isSave">"1" to download the images; any other value only rewrites URLs.</param>
/// <returns>
/// When no image is found: a list holding only the unchanged page.
/// Otherwise three strings: the rewritten page, the first image URL, and all image
/// URLs joined with "|||".
/// </returns>
public ArrayList ReplaceSaveRemoteFile(string pageStr, string SavePath, string CDir, string webUrl, string isSave)
{
    ArrayList replaceArray = new ArrayList();
    if (pageStr == null)
    {
        pageStr = string.Empty;
    }

    ArrayList imageUrls = CollectImageUrls(pageStr);
    if (imageUrls.Count == 0)
    {
        replaceArray.Add(pageStr);
        return replaceArray;
    }

    // Read the clock once so the folder on disk and the published URL cannot
    // disagree when the call straddles a month boundary.
    DateTime now = DateTime.Now;
    string monthFolder = now.ToString("yyyyMM");
    SavePath = SavePath + "/UserFiles/" + monthFolder;
    if (!Directory.Exists(SavePath))
    {
        Directory.CreateDirectory(SavePath);
    }

    Hashtable replacements = new Hashtable();
    string TitleImg = string.Empty;
    StringBuilder imageList = new StringBuilder();

    for (int i = 0; i < imageUrls.Count; i++)
    {
        string originalUrl = (string)imageUrls[i];
        string RemoteFileUrl = DefiniteUrl(originalUrl, webUrl);

        if (isSave == "1")
        {
            // The collected URL always ends in ".ext" (the pattern requires it),
            // so take the extension from it rather than from the resolved URL.
            string fileType = originalUrl.Substring(originalUrl.LastIndexOf('.'));

            // A GUID keeps names unique. The former "new Random()" per iteration could
            // repeat values in a tight loop and offered only 899 names per day, so
            // images could silently overwrite each other.
            string filename = now.ToString("yyyyMMdd") + Guid.NewGuid().ToString("N") + fileType;

            // Point at the local copy only when the download actually succeeded.
            if (SaveRemotePhoto(SavePath + "/" + filename, RemoteFileUrl))
            {
                RemoteFileUrl = CDir + "/UserFiles/" + monthFolder + "/" + filename;
            }
        }

        replacements[originalUrl] = RemoteFileUrl;
        if (i == 0)
        {
            TitleImg = RemoteFileUrl;
        }
        else
        {
            imageList.Append("|||");
        }
        imageList.Append(RemoteFileUrl);
    }

    replaceArray.Add(SubstituteImageUrls(pageStr, imageUrls, replacements));
    replaceArray.Add(TitleImg);
    replaceArray.Add(imageList.ToString());
    return replaceArray;
}

// Returns the distinct image URLs referenced by img tags, in order of first appearance.
private static ArrayList CollectImageUrls(string pageStr)
{
    // Backslashes were lost from these patterns in the source ("/s*", "/."),
    // which made them unable to match real HTML; they are restored here.
    Regex imgTag = new Regex(@"<img.+?[^\>]>", RegexOptions.IgnoreCase);
    Regex srcAttribute = new Regex(
        @"src\s*=\s*(.+?\.(?:gif|jpg|bmp|jpeg|psd|png|svg|dxf|wmf|tiff))",
        RegexOptions.IgnoreCase);

    ArrayList urls = new ArrayList();
    foreach (Match tag in imgTag.Matches(pageStr))
    {
        foreach (Match src in srcAttribute.Matches(tag.Value))
        {
            string url = src.Groups[1].Value;
            url = url.Replace("\"", "");
            url = url.Replace("'", "");
            url = url.Replace(" ", "");

            // Exact-match de-duplication; the former substring test dropped
            // "a.jpg" whenever "/img/a.jpg" had already been seen.
            if (url.Length > 0 && !urls.Contains(url))
            {
                urls.Add(url);
            }
        }
    }
    return urls;
}

// Replaces every occurrence of each collected URL with its mapped value in one pass.
private static string SubstituteImageUrls(string pageStr, ArrayList urls, Hashtable replacements)
{
    // Longest URL first, so that at any position the most specific alternative wins.
    ArrayList ordered = new ArrayList();
    foreach (string url in urls)
    {
        int position = 0;
        while (position < ordered.Count && ((string)ordered[position]).Length >= url.Length)
        {
            position++;
        }
        ordered.Insert(position, url);
    }

    // URLs are literal text: escape them so "." and "?" are not regex operators.
    StringBuilder pattern = new StringBuilder();
    foreach (string url in ordered)
    {
        if (pattern.Length > 0)
        {
            pattern.Append('|');
        }
        pattern.Append(Regex.Escape(url));
    }

    // A single pass never re-scans text it has already substituted, so a URL that is
    // contained in another URL (or in a replacement) is not rewritten twice.
    Regex anyUrl = new Regex(pattern.ToString());
    return anyUrl.Replace(pageStr, new MatchEvaluator(delegate(Match match)
    {
        return (string)replacements[match.Value];
    }));
}
/// <summary>
/// Turns a URL found in a page into an absolute URL, using the address of the page
/// it was found on as the base.
/// </summary>
/// <param name="PrimitiveUrl">
/// URL as written in the page: absolute, protocol-relative ("//host/x"), root-relative
/// ("/x"), or relative ("x", "./x", "../x"). A relative URL whose first path segment
/// contains a dot ("www.site.com/x") is treated as a host name without a scheme.
/// </param>
/// <param name="ConsultUrl">
/// Address of the containing page. "http://" is prepended when no http/https scheme is
/// present; a final path segment without a dot is treated as a directory.
/// </param>
/// <returns>
/// The absolute URL with backslashes converted to "/" and repeated slashes collapsed,
/// or "$False$" when either argument is null or empty.
/// </returns>
public string DefiniteUrl(string PrimitiveUrl, string ConsultUrl)
{
    // Short or empty inputs used to throw from Substring(0, n).
    if (string.IsNullOrEmpty(PrimitiveUrl) || string.IsNullOrEmpty(ConsultUrl))
    {
        return "$False$";
    }

    // Normalise Windows-style separators. (In the source these calls had lost their
    // backslashes and replaced "/" with "/", i.e. did nothing.)
    PrimitiveUrl = PrimitiveUrl.Replace('\\', '/');
    ConsultUrl = ConsultUrl.Replace('\\', '/');

    if (!HasHttpScheme(ConsultUrl))
    {
        ConsultUrl = "http://" + ConsultUrl;
    }

    // The base's query string and fragment never take part in resolution.
    int extras = ConsultUrl.IndexOfAny(new char[] { '?', '#' });
    if (extras >= 0)
    {
        ConsultUrl = ConsultUrl.Substring(0, extras);
    }

    int schemeEnd = ConsultUrl.IndexOf("://", StringComparison.Ordinal);
    string scheme = ConsultUrl.Substring(0, schemeEnd);
    int firstSlash = ConsultUrl.IndexOf('/', schemeEnd + 3);

    string root;   // scheme + host, no trailing slash
    string dir;    // directory of the base page, always ending in "/"
    if (firstSlash < 0)
    {
        root = ConsultUrl;
        dir = ConsultUrl + "/";
    }
    else
    {
        root = ConsultUrl.Substring(0, firstSlash);
        int lastSlash = ConsultUrl.LastIndexOf('/');
        string lastSegment = ConsultUrl.Substring(lastSlash + 1);
        if (lastSegment.Length > 0 && lastSegment.IndexOf('.') == -1)
        {
            // No dot in the last segment: it names a directory, not a file.
            dir = ConsultUrl + "/";
        }
        else
        {
            dir = ConsultUrl.Substring(0, lastSlash + 1);
        }
    }

    string returnStr;
    if (HasHttpScheme(PrimitiveUrl))
    {
        returnStr = PrimitiveUrl;
    }
    else if (PrimitiveUrl.StartsWith("//", StringComparison.Ordinal))
    {
        // Protocol-relative: reuse the scheme of the base page.
        returnStr = scheme + ":" + PrimitiveUrl;
    }
    else if (PrimitiveUrl[0] == '/')
    {
        returnStr = root + PrimitiveUrl;
    }
    else if (PrimitiveUrl.StartsWith("./", StringComparison.Ordinal))
    {
        // Drop the leading "./" (the former code kept only the last two characters).
        returnStr = dir + PrimitiveUrl.Substring(2);
    }
    else if (PrimitiveUrl.StartsWith("../", StringComparison.Ordinal))
    {
        string parent = dir;
        while (PrimitiveUrl.StartsWith("../", StringComparison.Ordinal))
        {
            PrimitiveUrl = PrimitiveUrl.Substring(3);
            parent = ParentDirectory(parent, root);
        }
        returnStr = parent + PrimitiveUrl;
    }
    else if (PrimitiveUrl.IndexOf('/') > -1)
    {
        string firstSegment = PrimitiveUrl.Substring(0, PrimitiveUrl.IndexOf('/'));
        if (firstSegment.IndexOf('.') > -1)
        {
            // Looks like "www.site.com/path": a host written without a scheme.
            string lastSegment = PrimitiveUrl.Substring(PrimitiveUrl.LastIndexOf('/') + 1);
            returnStr = "http://" + PrimitiveUrl;
            if (lastSegment.Length > 0 && lastSegment.IndexOf('.') == -1)
            {
                returnStr += "/";
            }
        }
        else
        {
            returnStr = dir + PrimitiveUrl;
        }
    }
    else if (PrimitiveUrl.IndexOf('.') > -1)
    {
        // Bare file name. The former top-level-domain test compared a suffix that
        // included the dot against "com"/"cn"/"net"/"org" and so never matched;
        // the effective behaviour (resolve against the base directory) is kept.
        returnStr = dir + PrimitiveUrl;
    }
    else
    {
        // Bare name without a dot: treated as a directory.
        returnStr = dir + PrimitiveUrl + "/";
    }

    // Collapse repeated slashes while shielding every "://". No backslash can be
    // present at this point, so ":\\" is a safe temporary stand-in.
    returnStr = returnStr.Replace("://", @":\\");
    while (returnStr.IndexOf("//", StringComparison.Ordinal) >= 0)
    {
        returnStr = returnStr.Replace("//", "/");
    }
    returnStr = returnStr.Replace(@":\\", "://");

    return returnStr;
}

// True when the URL begins with "http://" or "https://" (case-insensitive).
private static bool HasHttpScheme(string url)
{
    return url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
}

// Returns the parent of a "/"-terminated directory URL, never climbing above root + "/".
private static string ParentDirectory(string dir, string root)
{
    if (dir.Length <= root.Length + 1)
    {
        return dir;
    }
    int previousSlash = dir.LastIndexOf('/', dir.Length - 2);
    return dir.Substring(0, previousSlash + 1);
}
/// <summary>
/// Downloads the image at <paramref name="RemoteFileUrl"/> and saves it to
/// <paramref name="fileName"/>.
/// </summary>
/// <param name="fileName">Full path of the file to write.</param>
/// <param name="RemoteFileUrl">Absolute address of the image to download.</param>
/// <returns>
/// true when the image was downloaded, decoded and saved; false on any failure
/// (bad URL, network error, 20-second timeout, undecodable data, or write error).
/// </returns>
public bool SaveRemotePhoto(string fileName, string RemoteFileUrl)
{
    try
    {
        WebRequest request = WebRequest.Create(RemoteFileUrl);
        request.Timeout = 20000; // milliseconds

        // Dispose the response, stream and image: leaving them open holds the
        // connection and GDI+ handles until finalization. The usings unwind in
        // reverse order, so the image is released before the stream it reads from.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (Image getImage = Image.FromStream(stream))
        {
            getImage.Save(fileName);
        }
        return true;
    }
    catch (Exception)
    {
        // Best-effort download: the caller only needs to know whether it worked.
        return false;
    }
}
/// <summary>
/// Removes occurrences of one HTML tag from <paramref name="ConStr"/>.
/// </summary>
/// <param name="ConStr">HTML text to filter; null or empty is returned unchanged.</param>
/// <param name="TagName">Tag name to remove, matched case-insensitively as literal text.</param>
/// <param name="FType">
/// 1 = remove opening tags only; 2 = remove each element together with its content;
/// 3 = remove opening and closing tags but keep the content. Any other value leaves
/// the text unchanged.
/// </param>
/// <returns>The filtered text.</returns>
public string ScriptHtml(string ConStr, string TagName, int FType)
{
    if (string.IsNullOrEmpty(ConStr))
    {
        return ConStr;
    }

    // The tag name is literal text, so escape it before building a pattern.
    string tag = Regex.Escape(TagName ?? string.Empty);

    // Require the name to end at a word boundary so that "a" does not also match
    // "abbr" or "area". Names ending in punctuation (e.g. "!--") get no boundary,
    // because \b could not match between two non-word characters.
    if (!string.IsNullOrEmpty(TagName) && char.IsLetterOrDigit(TagName[TagName.Length - 1]))
    {
        tag += @"\b";
    }

    string openTag = "<" + tag + "[^>]*>";
    string closeTag = "</" + tag + "[^>]*>";

    switch (FType)
    {
        case 1:
            return Regex.Replace(ConStr, openTag, "", RegexOptions.IgnoreCase);
        case 2:
            // Singleline lets ".*?" cross line breaks; without it multi-line
            // elements such as script or style blocks were never removed.
            return Regex.Replace(ConStr, openTag + ".*?" + closeTag, "",
                RegexOptions.IgnoreCase | RegexOptions.Singleline);
        case 3:
            ConStr = Regex.Replace(ConStr, openTag, "", RegexOptions.IgnoreCase);
            return Regex.Replace(ConStr, closeTag, "", RegexOptions.IgnoreCase);
        default:
            return ConStr;
    }
}
/// <summary>
/// Strips every HTML tag from <paramref name="ConStr"/>, keeping the text between tags.
/// </summary>
/// <param name="ConStr">HTML text to filter; null or empty is returned unchanged.</param>
/// <returns>The text with all opening and closing tags removed.</returns>
public string NoHtml(string ConStr)
{
    if (string.IsNullOrEmpty(ConStr))
    {
        return ConStr;
    }

    // The source patterns had lost their backslashes and demanded a literal "/"
    // before each bracket, so they matched no real tag. One pattern covers opening
    // and closing tags alike; excluding both brackets inside the tag keeps a stray
    // ">" in running text from being swallowed together with the preceding tag.
    return Regex.Replace(ConStr, @"<[^<>]+>", "");
}
}
}
// 网站信息自动抓取类/蜘蛛程序/网络机器人
// 最新推荐文章于 2023-02-17 14:11:39 发布