以前看到很多关于 spider、crawler、robot 的一些介绍,闲着无聊自己就动手试了试;完美实现可能无法完成;
spider 的实现最基本的几个方法;
1:根据url地址取得网页内容;
2:根据网页内容取得它所包含的所有url地址;
以下为两个方法;
-----------------------------------------------------
//取得网页内容;
// Downloads the HTML content of the given URL and returns it as a string.
// url: absolute URL of the page to fetch.
// Returns the decoded page body; throws WebException on network/HTTP errors.
public static string getHtmlContent(string url)
{
    HttpWebRequest hreq = (HttpWebRequest)WebRequest.Create(url);

    // Original version closed reader/stream/response in a finally block without
    // null checks, so any failure in GetResponse() threw NullReferenceException
    // and hid the real error. "using" disposes safely in every case.
    using (HttpWebResponse hrep = (HttpWebResponse)hreq.GetResponse())
    using (Stream stream = hrep.GetResponseStream())
    {
        // Prefer the charset declared in the HTTP response header; fall back to
        // the system default (e.g. gb2312 on a zh-CN machine) when it is absent
        // or unrecognized. This addresses garbling on big5/utf-8 pages.
        Encoding enc = Encoding.Default;
        string charset = hrep.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try { enc = Encoding.GetEncoding(charset); }
            catch (ArgumentException) { /* unknown charset name: keep default */ }
        }

        using (StreamReader sReader = new StreamReader(stream, enc))
        {
            return sReader.ReadToEnd();
        }
    }
}
2://取得page中的超链接地址;
// Extracts all hyperlink (href) targets from an HTML page.
// page: raw HTML text; curUrl: URL of the page itself, used to resolve relative links;
// index_s: unused, kept for interface compatibility with existing callers.
// Returns a de-duplicated ArrayList of normalized URLs, or null if matching fails.
public static ArrayList getHttpUrlList(string page, string curUrl, int index_s)
{
    ArrayList urlList = new ArrayList(25);
    try
    {
        // Match the value of an href attribute: either a quoted string or a bare
        // token up to the next '>' or whitespace. The original text had the
        // backslashes of \s and \w (and the quote escapes) mangled into forward
        // slashes, which is not even valid C#; this is the intended pattern.
        Regex r = new Regex(@"(?<=\s+href\s*=)\s*(?:(?<url>""[^""]*"")|(?<url>[^>\s]*))",
                            RegexOptions.IgnoreCase);
        foreach (Match m1 in r.Matches(page))
        {
            string urlStr = CompleteUrl(m1.Value, curUrl);
            // Skip duplicates so each target appears once.
            if (!urlList.Contains(urlStr)) urlList.Add(urlStr);
        }
    }
    catch (Exception e)
    {
        // Preserve original behavior: report the error and signal failure with null.
        MessageBox.Show("进行正则匹配时出错" + e.Message);
        return null;
    }
    return urlList;
}
//标准url地址;
// Normalizes a raw href value into a standard URL form:
// strips surrounding quotes, lower-cases it, prefixes relative links with the
// current page URL, and removes the "http://" scheme.
// oldUrl: raw matched href value (may be quoted); curUrl: URL of the current page.
private static string CompleteUrl(string oldUrl, string curUrl)
{
    // 1: strip quote characters and normalize case. The original had the quote
    // escape corrupted to Replace("/"","") — the intent is to remove '"'.
    oldUrl = oldUrl.Replace("\"", "").ToLower();
    oldUrl = oldUrl.Replace("'", "");

    // 2: resolve relative links against the current page URL.
    // (Already lower-cased above, so the original's second ToLower() was redundant.)
    if (!oldUrl.StartsWith("http:"))
        oldUrl = curUrl + "/" + oldUrl;

    // 3: drop the scheme prefix. The original repeated this identical Replace
    // twice; once is enough since Replace removes every occurrence.
    oldUrl = oldUrl.Replace("http://", "");
    oldUrl = oldUrl.Trim();
    return oldUrl;
}
----------------------------------------------------------
存在疑问的地方:
1: StreamReader(stream,System.Text.Encoding.Default); 在取得网页流数据后,用了Encoding.Default来解码(相当于本地的gb2312),但碰到big5等其它字符集时就会乱码;现在还没找到好的解决方法;
2:url的正则表达式,经测试.net帮助中给出的不太理想,因此, 我进行了修改;还需要测试;