using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Spider.VO;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace CAIJI
{
/// <summary>
/// Downloads a listing page and extracts the anchor links found inside the
/// region of the page delimited by <c>Page.BeginTag</c> and <c>Page.EndTag</c>.
/// </summary>
public class SiteGet
{
    // Request timeout in milliseconds.
    private const int RequestTimeoutMs = 3000;

    // Reference example of the balanced-<div> pattern that GetTitle builds
    // around BeginTag/EndTag (noted by the original author as the working form):
    // <div class="fixList">.*?(((?'Open'<div[^>]*>).*?)+((?'-Open'</div>).*?)+)*(?(Open)(?!))</div>

    // Matches a complete, non-nested <a ...>...</a> element.
    private static readonly Regex asynchroRegex = new Regex(@"<a[^>]*>((?!<a)(?!</a>).)+</a>", RegexOptions.IgnoreCase);

    // Matches anchors that carry a "javascript:" pseudo-URL; those are not collected.
    private static readonly Regex javascriptRegex = new Regex(@"\s*javascript\s*:.[^)]*.[^>]*>.*/a>");

    // Captures the href value whether it is double-quoted, single-quoted or unquoted.
    // All three alternatives write into the same named group "url".
    private static readonly Regex hrefRegex = new Regex(
        @"href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
        RegexOptions.IgnoreCase);

    // Captures everything between the first '>' and the last '<' of the anchor.
    private static readonly Regex innerHtmlRegex = new Regex("(?<=>).*(?=<)");

    /// <summary>
    /// Fetches <paramref name="page"/>'s URL, cuts out the region between its
    /// begin and end tags, and returns one <see cref="ArticleLink"/> per usable
    /// anchor in that region.
    /// </summary>
    /// <param name="page">Describes the URL to fetch, its text encoding and the region delimiters.</param>
    /// <returns>
    /// The links found, in document order. Anchors with an empty href, an href of
    /// "#", a javascript: href, or empty inner content are skipped. The list is
    /// empty when the region is not found.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="page"/> is null.</exception>
    /// <remarks>Exceptions raised while requesting or decoding the page propagate to the caller unchanged.</remarks>
    public List<ArticleLink> GetTitle(Page page)
    {
        if (page == null)
        {
            throw new ArgumentNullException("page");
        }

        string htmlBody = DownloadHtml(page);

        // BeginTag/EndTag are inserted into the pattern as-is, so they act as
        // regex fragments; the middle part keeps nested <div> elements balanced.
        Regex re = new Regex(page.BeginTag + ".*?(((?'Open'<div[^>]*>).*?)+((?'-Open'</div>).*?)+)*(?(Open)(?!))" + page.EndTag, RegexOptions.Singleline);
        string region = re.Match(htmlBody).Value.Trim();

        return ParseLinks(region);
    }

    // Downloads the page body and decodes it with the encoding named by the page.
    private static string DownloadHtml(Page page)
    {
        string url = page.Url;

        // Use the WebRequest base type: casting with "as HttpWebRequest" yields
        // null for non-HTTP schemes and would fail with a NullReferenceException.
        WebRequest request = WebRequest.Create(url);
        request.Timeout = RequestTimeoutMs;

        // using-blocks release the connection even when reading or decoding throws.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding(page.Encod)))
        {
            // Read the whole page.
            return reader.ReadToEnd();
        }
    }

    // Turns every usable <a> element in the given HTML fragment into an ArticleLink.
    private static List<ArticleLink> ParseLinks(string html)
    {
        List<ArticleLink> links = new List<ArticleLink>();

        foreach (Match anchor in asynchroRegex.Matches(html))
        {
            string aLink = anchor.Value;

            // Skip anchors whose target is JavaScript rather than a URL.
            if (javascriptRegex.IsMatch(aLink))
            {
                continue;
            }

            string href = ExtractHref(aLink);
            string innerHtml = innerHtmlRegex.Match(aLink).Value.Trim();

            if (href.Length == 0 || href.Equals("#") || innerHtml.Length == 0)
            {
                continue;
            }

            // The pattern above is case-sensitive; also catch "JavaScript:" etc.
            if (href.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            ArticleLink link = new ArticleLink();
            link.LinkUrl = href;
            link.LinkTitle = innerHtml;
            links.Add(link);
        }

        return links;
    }

    // Returns the href value of an anchor tag without surrounding quotes,
    // or an empty string when the tag has no href attribute.
    private static string ExtractHref(string anchorTag)
    {
        Match match = hrefRegex.Match(anchorTag);
        return match.Success ? match.Groups["url"].Value.Trim() : string.Empty;
    }
}
}
// Result: