本文简单介绍如何抓取单个网页的数据。
首先根据传入的 URL 获取网页源码(注:这里存在编码问题,目前尚不完善,还不能自动判断网页的编码)。
/// <summary>
/// Downloads the resource at <paramref name="a_strUrl"/> and returns its body as text.
/// </summary>
/// <param name="a_strUrl">URL to request.</param>
/// <param name="timeout">Value assigned to <c>HttpWebRequest.Timeout</c> (milliseconds).</param>
/// <returns>
/// The response body with every line terminated by CRLF. If anything throws
/// (bad URL, timeout, HTTP error, ...), the exception is not propagated; the
/// string "错误:" followed by the exception message is returned instead.
/// </returns>
/// <remarks>
/// The body is decoded with <c>Encoding.Default</c>; the page's charset is not detected.
/// </remarks>
public static string Get_Http(string a_strUrl, int timeout)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
        myReq.Timeout = timeout;

        // Dispose response, stream and reader so the underlying connection is released.
        using (HttpWebResponse httpResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = httpResp.GetResponseStream())
        using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
        {
            StringBuilder strBuilder = new StringBuilder();
            string line;
            // Loop on ReadLine() rather than Peek(): on a network stream Peek() can
            // report -1 while more data is still on its way, truncating the page.
            while ((line = sr.ReadLine()) != null)
            {
                strBuilder.Append(line).Append("\r\n");
            }
            strResult = strBuilder.ToString();
        }
    }
    catch (Exception exp)
    {
        // Callers receive the failure as text; keep this contract.
        strResult = "错误:" + exp.Message;
    }
    return strResult;
}
然后从源码中挑出标题、时间和内容,转换成实体类
(这里使用正则表达式进行匹配)
/// <summary>
/// Extracts title, publication date and content from <paramref name="strhtml"/>
/// and returns them in a <c>MsgInfo</c>.
/// </summary>
/// <param name="strhtml">Page source to search.</param>
/// <param name="strbtstart">Pattern that precedes the title.</param>
/// <param name="strbtend">Pattern that follows the title.</param>
/// <param name="strsjstart">Pattern that precedes the date.</param>
/// <param name="strsjend">Pattern that follows the date.</param>
/// <param name="strnrstart">Pattern that precedes the content.</param>
/// <param name="strnrend">Pattern that follows the content.</param>
/// <returns>
/// A <c>MsgInfo</c> whose <c>title</c> and <c>content</c> are the captured text
/// (empty string when the delimiters do not match) and whose <c>pubdate</c> is the
/// parsed date, or <c>DateTime.MinValue</c> when no parsable date was captured.
/// </returns>
/// <remarks>
/// The delimiter arguments are inserted into the regular expression unescaped, so
/// they are interpreted as regex syntax.
/// </remarks>
public static MsgInfo getinfomation(string strhtml, string strbtstart, string strbtend,string strsjstart,string strsjend,string strnrstart,string strnrend) {
    string title = MatchBetween(strhtml, strbtstart, strbtend);
    string date = MatchBetween(strhtml, strsjstart, strsjend);
    string contents = MatchBetween(strhtml, strnrstart, strnrend);

    // A missing or malformed date must not discard the title and content that
    // were extracted successfully, so parse defensively instead of throwing.
    DateTime pubdate;
    if (!DateTime.TryParse(date, out pubdate))
    {
        pubdate = DateTime.MinValue;
    }

    MsgInfo msg = new MsgInfo();
    msg.title = title;
    msg.pubdate = pubdate;
    msg.content = contents;
    return msg;
}

// Returns the shortest non-empty text (line breaks included) found between the
// start and end patterns, or an empty string when there is no match.
private static string MatchBetween(string html, string start, string end)
{
    string pattern = string.Format("{0}(?<g>(.|[\r\n])+?){1}", start, end);
    return Regex.Match(html, pattern).Groups["g"].Value;
}