.NET 简单爬虫：爬取网易新闻头条的 title 和 URL

最新推荐文章于 2022-04-19 17:03:17 发布

weixin_30951231

最新推荐文章于 2022-04-19 17:03:17 发布

阅读量203

点赞数

文章标签：爬虫

原文链接：http://www.cnblogs.com/lc0915/p/5843530.html

版权

闲来无事写了一个简单的抓取网页信息的例子

抓取网易新闻的title和url

分为以下2个步骤：

1.通过url获取html页面内容

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The full response body as a string, or an empty string if creating the request,
/// sending it, or reading the response fails for any reason.
/// </returns>
public string GetHtml(string url)
{
    try
    {
        var request = WebRequest.Create(url);

        // The using statements release the response and the reader even when
        // ReadToEnd throws part-way through, which the manual Close calls did not.
        // NOTE(review): on .NET Core / .NET 5+ the "gb2312" code page presumably needs
        // CodePagesEncodingProvider registered; otherwise GetEncoding throws and this
        // method returns "" - confirm against the target runtime.
        using (var response = request.GetResponse())
        using (var reader = new StreamReader(stream: response.GetResponseStream(), encoding: Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: every failure is reported to the caller as an empty page.
        return "";
    }
}

2.过滤html字符串获得想要的东西

/// <summary>
/// Fetches http://news.163.com/ via GetHtml and collects the links found inside the
/// div carrying class="mod_top_news2" id="js_top_news".
/// </summary>
/// <returns>
/// One dictionary per extracted anchor, with the keys "href" and "content";
/// an empty list when the div is not found in the fetched HTML.
/// </returns>
public List<Dictionary<string, string>> GetUrl()
{
    var results = new List<Dictionary<string, string>>();

    // Balancing-group pattern: matches the target div together with any nested divs
    // up to its own closing tag.
    var topNewsPattern = new Regex(string.Format(@"(?is)<div {0}[^>]*>(?><div[^>]*>(?<o>)|</div>(?<-o>)|(?:(?!</?div\b).)*)*(?(o)(?!))</div>","class=\"mod_top_news2\" id=\"js_top_news\""));
    var topNews = topNewsPattern.Match(GetHtml("http://news.163.com/"));
    if (!topNews.Success)
    {
        return results;
    }

    // Walk every anchor inside the div whose body contains no nested tags.
    foreach (Match anchor in Regex.Matches(topNews.Value, @"<a[^>]*>([^<]*)</a>"))
    {
        // Split the anchor into its link target and its visible text.
        var parts = Regex.Match(anchor.ToString(), @"<a.*?(href|src)=""(?<href>\S+)"".*?>(?<content>.*?)</a>", RegexOptions.IgnoreCase);
        if (!parts.Success)
        {
            continue;
        }

        results.Add(new Dictionary<string, string>
        {
            { "href", parts.Groups["href"].Value },
            { "content", parts.Groups["content"].Value },
        });
    }

    return results;
}