1、从指定网址下载网页
/// <summary>
/// Downloads the resource at <paramref name="a_strUrl"/> and returns its body as text.
/// </summary>
/// <param name="a_strUrl">URL of the page to download.</param>
/// <returns>
/// The response body decoded with <see cref="Encoding.Default"/>. If anything fails
/// (bad URL, timeout, network error, ...) no exception escapes; instead the returned
/// string is "错误:" followed by the exception message.
/// </returns>
public string Get_SourceHtml(string a_strUrl)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
        myReq.Timeout = 60000; // HttpWebRequest.Timeout is expressed in milliseconds

        // Dispose the response, its stream and the reader even when reading fails,
        // so the underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)myReq.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            strResult = reader.ReadToEnd();
        }
    }
    catch (Exception exp)
    {
        // Callers receive the failure as text rather than as an exception.
        strResult = "错误:" + exp.Message;
    }
    return strResult;
}
2、分离出帖子正文
// NOTE(review): this brace block has no method header of its own. Its statements repeat
// the body of Get_SourceHtml shown before the preceding heading, so it looks like a
// duplication artifact rather than intended code — confirm before relying on it.
{
string strResult;
try
{
// Build the request for the URL held in a_strUrl (not declared inside this block).
HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
myReq.Timeout = 60000;
HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();
Stream myStream = HttpWResp.GetResponseStream();
// The body is decoded with Encoding.Default.
StreamReader sr = new StreamReader(myStream, Encoding.Default);
strResult = sr.ReadToEnd();
// Only the stream is closed explicitly here; HttpWResp and sr are not.
myStream.Close();
}
catch (Exception exp)
{
// Any failure is turned into a result string instead of being rethrown.
strResult = "错误:" + exp.Message;
}
return strResult;
}
/// <summary>
/// Trims a downloaded page down to the region between two fixed markers and
/// removes embedded ad iframes.
/// </summary>
/// <param name="htmlContent">Full HTML text of the page.</param>
/// <returns>
/// The text starting at the first <c>&lt;span id='AddMyDigest'&gt;&lt;/span&gt;</c> marker
/// (if present) and ending just before the first <c>&lt;/div&gt;&lt;center&gt;</c> marker
/// (if present), with every <c>&lt;center&gt;&lt;IFRAME ...&lt;/IFRAME&gt;&lt;/center&gt;</c>
/// block removed. Input without any marker is returned unchanged.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is null.</exception>
public string SplitHtml(string htmlContent)
{
    if (htmlContent == null)
    {
        throw new ArgumentNullException("htmlContent");
    }

    const string endMarker = "</div><center>";
    const string startMarker = "<span id='AddMyDigest'></span>";

    string result = htmlContent;

    // Cut off everything that follows the post content. Both markers are plain
    // literals, so an ordinal search replaces the regex plus culture-sensitive
    // re-search of the matched text.
    int endAt = result.IndexOf(endMarker, StringComparison.Ordinal);
    if (endAt >= 0)
    {
        result = result.Substring(0, endAt);
    }

    // Strip the ads. [\s\S] matches any character including newlines, exactly like
    // the alternation (.|\s), but without a capture and a branch per character.
    result = Regex.Replace(result, @"<center><IFRAME[\s\S]+?</IFRAME></center>", "");

    // Cut off everything that precedes the post content.
    int startAt = result.IndexOf(startMarker, StringComparison.Ordinal);
    if (startAt >= 0)
    {
        result = result.Substring(startAt);
    }

    return result;
}
3、获取指定作者的发帖在本地生成新的Html文件以便在程序内嵌浏览器中查看
// NOTE(review): this brace block has no method header of its own. Its statements repeat
// the body of SplitHtml shown before the preceding heading, so it looks like a
// duplication artifact rather than intended code — confirm before relying on it.
{
string result =htmlContent;
// Cut off everything after the post body: keep only the text before the first "</div><center>".
Regex regexObj = new Regex("</div><center>");
Match m = regexObj.Match(result);
if (m.Value != string.Empty)
{
result = result.Substring(0, result.IndexOf(m.Value));
}
// Strip out the ads: every <center><IFRAME ... </IFRAME></center> block is replaced with nothing.
regexObj =new Regex(@"<center><IFRAME(.|\s)+?</IFRAME></center>");
result=regexObj.Replace(result,"");
// Cut off everything before the post body: keep the text from the marker span onwards.
//regexObj = new Regex("<TABLE align=center border=0 cellSpacing=0 width='100%'>");
regexObj = new Regex(@"<span id='AddMyDigest'></span>");
m = regexObj.Match(result);
if (m.Value != string.Empty)
{
result = result.Substring(result.IndexOf(m.Value));
}
return result;
}
/// <summary>
/// Builds a minimal HTML document containing only the sections of a page whose
/// header table names the given author.
/// </summary>
/// <param name="SourceHtml">Full HTML of the page; it is first reduced with SplitHtml.</param>
/// <param name="AuthorName">Author to keep; compared for exact equality.</param>
/// <returns>
/// A string of the form <c>&lt;Html&gt;&lt;body&gt;...&lt;/body&gt;&lt;/html&gt;</c>. For each
/// matching <c>&lt;TABLE ... &lt;/table&gt;</c> block it contains that block, the text between
/// it and the next such block, and a <c>&lt;br&gt;</c>.
/// </returns>
public string GetAuthorsContent(string SourceHtml, string AuthorName)
{
    StringBuilder html = new StringBuilder();
    html.Append("<Html><body>");
    SourceHtml = SplitHtml(SourceHtml);

    // [\s\S] matches any character including newlines, same as (.|\s) but cheaper.
    Regex authorPattern = new Regex(@"k>[\s\S]+?<");
    MatchCollection tables = Regex.Matches(SourceHtml, @"<TABLE[\s\S]+?</table>");

    // As in the original loop bound, the last two tables are never treated as headers.
    // NOTE(review): presumably they are not post headers — confirm against the page layout.
    for (int i = 0; i < tables.Count - 2; i++)
    {
        Match header = tables[i];

        // The author is whatever sits between "k>" and the first '<' of the pattern match.
        string author = authorPattern.Match(header.Value).Value;
        if (author.Length > 0)
        {
            author = author.Substring(2, author.IndexOf('<') - 2);
        }
        if (author != AuthorName)
        {
            continue;
        }

        // Use the positions recorded by the regex engine. Searching for the matched
        // text again would find its FIRST occurrence, which is wrong (or yields a
        // negative length) whenever two tables have identical markup.
        int contentStart = header.Index + header.Length;
        int contentEnd = tables[i + 1].Index; // i + 1 is always valid because of the loop bound

        html.Append(header.Value);
        html.Append(SourceHtml, contentStart, contentEnd - contentStart);
        html.Append("<br>");
    }

    html.Append("</body></html>");
    return html.ToString();
}
4、获取分页数据
// NOTE(review): this brace block has no method header of its own. Its statements repeat
// the body of GetAuthorsContent shown before the preceding heading, so it looks like a
// duplication artifact rather than intended code — confirm before relying on it.
{
StringBuilder html = new StringBuilder();
html.Append("<Html><body>");
SourceHtml = SplitHtml(SourceHtml);
// Collect every <TABLE ... </table> block of the trimmed page.
Regex r = new Regex(@"<TABLE(.|\s)+?</table>");
MatchCollection ms = r.Matches(SourceHtml);
// The loop stops two short of the match count, so the last two tables are never examined here.
for (int i = 0; i < ms.Count - 2; i++)
{
Match m = ms[i];
// Offset just past this table, located by searching for the table's text from the start of the page.
int start = SourceHtml.IndexOf(m.Value) + m.Value.Length;
string title = m.Value;
// The author is taken as the text between "k>" and the following "<" inside the table.
Regex re = new Regex(@"k>(.|\s)+?<");
Match m1 = re.Match(title);
string author = m1.Value;
if (author.Length > 0)
author = author.Substring(2, author.IndexOf("<") - 2);
if (author == AuthorName)
{
html.Append(title);
Match NextMatch = ms[i + 1];
string content;
// The content runs from the end of this table up to where the next table's text is found.
if (NextMatch != null)
content = SourceHtml.Substring(start, SourceHtml.IndexOf(NextMatch.Value) - start);
else
content = SourceHtml.Substring(start);
html.Append(content);
html.Append("<br>");
}
}
html.Append("</body></html>");
return html.ToString();
}
/// <summary>
/// Extracts the paging links found in a page.
/// </summary>
/// <param name="SourceHtml">Full HTML of the page; it is first reduced with SplitHtml.</param>
/// <returns>
/// One <c>PageData</c> per piece of text that starts with <c>http:</c> and ends with
/// either a <c>[number]</c> label or the text "首页". Each item is constructed from the
/// text before the label and the label itself.
/// NOTE(review): the first constructor argument is presumably the page URL — confirm
/// against the PageData definition.
/// </returns>
public List<PageData> GetPages(string SourceHtml)
{
    const string firstPageLabel = "首页";

    SourceHtml = SplitHtml(SourceHtml);

    // Both patterns are loop-invariant, so they are built once up front instead of
    // re-creating the tag pattern (and overwriting the link pattern variable) per match.
    Regex linkPattern = new Regex(@"http:.+?((\[\d+\])|首页)");
    Regex tagPattern = new Regex("><.+?>");

    List<PageData> pages = new List<PageData>();
    foreach (Match link in linkPattern.Matches(SourceHtml))
    {
        // Drop the markup between the address and the label, then any leftover '>'.
        string text = tagPattern.Replace(link.Value, "").Replace(">", "");

        // Ordinal searches: these are markup tokens, not linguistic text.
        int labelAt = text.IndexOf(firstPageLabel, StringComparison.Ordinal);
        if (labelAt >= 0)
        {
            pages.Add(new PageData(text.Substring(0, labelAt), firstPageLabel));
        }
        else
        {
            // Every match that does not end in the first-page label ends in "[number]".
            labelAt = text.IndexOf('[');
            pages.Add(new PageData(text.Substring(0, labelAt), text.Substring(labelAt)));
        }
    }
    return pages;
}
// NOTE(review): this brace block has no method header of its own. Its statements repeat
// the body of GetPages shown directly above, so it looks like a duplication artifact
// rather than intended code — confirm before relying on it.
{
SourceHtml=SplitHtml(SourceHtml);
// Find each run of text that starts with "http:" and ends with "[number]" or "首页".
Regex r = new Regex(@"http:.+?((\[\d+\])|首页)");
MatchCollection ms = r.Matches(SourceHtml);
List<PageData> pages = new List<PageData>();
foreach (Match m in ms)
{
// r is reassigned on every iteration to a pattern that removes "><...>" runs from the match.
r = new Regex("><.+?>");
string s = r.Replace(m.Value, "");
s = s.Replace(">", "");
// Split the cleaned text into the part before the label and the label itself.
if (s.IndexOf("首页") >= 0)
pages.Add(new PageData(s.Substring(0, s.IndexOf("首页")),"首页"));
else
pages.Add(new PageData(s.Substring(0,s.IndexOf("[")),s.Substring(s.IndexOf("["))));
}
return pages;
}
完整源代码点击下载