闲来无事,写了一个简单的抓取网页信息的例子:
抓取网易新闻首页头条区域中各条新闻的标题(title)和链接(url)。
分为以下2个步骤:
1.通过url获取html页面内容
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The page content as a string, or an empty string if any step
/// (creating the request, fetching, decoding) throws.
/// </returns>
public string GetHtml(string url)
{
    try
    {
        var request = WebRequest.Create(url);

        // Nested using statements release the response, its stream and the reader
        // even when reading fails part-way through.
        using (var response = request.GetResponse())
        using (var stream = response.GetResponseStream())
        using (var reader = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: callers treat an empty string as "page unavailable".
        return "";
    }
}
2.过滤html字符串获得想要的东西
// Matches the <div class="mod_top_news2" id="js_top_news"> element including any nested
// <div> elements; the named group "o" is used as a balancing group to pair opening and
// closing div tags.
private static readonly Regex TopNewsDivRegex = new Regex(
    @"(?is)<div class=""mod_top_news2"" id=""js_top_news""[^>]*>(?><div[^>]*>(?<o>)|</div>(?<-o>)|(?:(?!</?div\b).)*)*(?(o)(?!))</div>");

// Matches <a ...>text</a> elements whose content contains no nested tags.
private static readonly Regex AnchorRegex = new Regex(@"<a[^>]*>([^<]*)</a>");

// Extracts the quoted href/src value and the inner text of a single anchor element.
// The value is limited to non-quote, non-whitespace characters so that it cannot run
// past its closing quote into a following attribute.
private static readonly Regex AnchorPartsRegex = new Regex(
    @"<a.*?(href|src)=""(?<href>[^""\s]+)"".*?>(?<content>.*?)</a>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Fetches http://news.163.com/ via <c>GetHtml</c> and extracts the links found inside
/// the div with id "js_top_news".
/// </summary>
/// <returns>
/// One dictionary per extracted anchor, with the keys "href" (link target) and
/// "content" (anchor text). The list is empty when the div is not found in the page.
/// </returns>
public List<Dictionary<string, string>> GetUrl()
{
    var list = new List<Dictionary<string, string>>();
    var strHtml = GetHtml("http://news.163.com/");

    // Locate the div with id = "js_top_news".
    var divMatch = TopNewsDivRegex.Match(strHtml);
    if (!divMatch.Success)
    {
        return list;
    }

    // Walk every anchor element inside that div.
    foreach (Match anchor in AnchorRegex.Matches(divMatch.Value))
    {
        // Pull the URL and the title out of the anchor; skip anchors without a quoted href/src.
        var parts = AnchorPartsRegex.Match(anchor.Value);
        if (!parts.Success)
        {
            continue;
        }

        list.Add(new Dictionary<string, string>
        {
            { "href", parts.Groups["href"].Value },
            { "content", parts.Groups["content"].Value }
        });
    }

    return list;
}
以上就是全部代码,欢迎大家一起交流技术。