关于网页采集,我是从网上搜索资料的。确实已经有很多人在这方面做了大量工作,感谢他们共享的资源,让我们的工作变得更快。
采集网页源码:这部分暂时还没出现意外,就先不改了。
/// <summary>
/// Downloads the resource at the given address and returns its content as text.
/// When the payload starts with a Unicode byte order mark, the matching encoding
/// is used and the mark is not part of the returned text; otherwise the bytes are
/// decoded with <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded page source.</returns>
public static string GetHtmlSource(string url)
{
    byte[] payload;

    // WebClient is IDisposable; release it as soon as the bytes are in memory.
    using (WebClient client = new WebClient())
    {
        payload = client.DownloadData(url);
    }

    // detectEncodingFromByteOrderMarks = true lets the reader switch away from
    // the fallback encoding when the payload carries a BOM.
    using (System.IO.MemoryStream buffer = new System.IO.MemoryStream(payload))
    using (System.IO.StreamReader reader = new System.IO.StreamReader(buffer, Encoding.Default, true))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Simple filter for locating a region of interest inside page source: returns the
/// text found between a start delimiter and an end delimiter.
/// Both delimiters are inserted into the regular expression as-is (they are not
/// escaped), so regex metacharacters in them are interpreted as pattern syntax.
/// Matching is case-insensitive and non-greedy, and spans line breaks.
/// </summary>
/// <param name="code">Page source to search.</param>
/// <param name="wordsBegin">Pattern that marks the start of the region.</param>
/// <param name="wordsEnd">Pattern that marks the end of the region.</param>
/// <returns>
/// The text captured between the delimiters in the last match found, or an empty
/// string when nothing matches.
/// </returns>
public static string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    // The pattern is rebuilt on every call, so RegexOptions.Compiled would only add
    // compilation cost for a regex that is used once; the interpreted engine
    // produces the same matches.
    Regex locator = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    string lastCapture = "";
    foreach (Match hit in locator.Matches(code))
    {
        // Each later match overwrites the previous one, so the last match wins.
        lastCapture = hit.Groups["title"].Value;
    }

    return lastCapture;
}