C#抓取网页上的所有链接

最新推荐文章于 2016-11-23 22:02:00 发布

caoguangguang

最新推荐文章于 2016-11-23 22:02:00 发布

阅读量1.2k

点赞数

分类专栏： winForm 文章标签： c# string regex textbox null url

本文链接：https://blog.csdn.net/caoguangguang/article/details/5873298

版权

winForm 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System.Xml;

namespace text
{
    /// <summary>
    /// Form that downloads a web page, extracts the absolute http/https links
    /// found in its HTML and writes them to <c>HyperLinks.xml</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // Matches absolute http/https URLs: scheme, dotted host, optional path/query.
        private static readonly Regex LinkRegex = new Regex(
            @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
            RegexOptions.IgnoreCase);

        // Matches one of the recognized domain suffixes when it is followed by
        // '/' or by the end of the URL; group 1 is the suffix without the dot.
        private static readonly Regex DomainSuffixRegex = new Regex(
            @"\.(com|net|cn|org|gov)(?=/|$)",
            RegexOptions.IgnoreCase);

        string strCode;
        ArrayList alLinks;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: reads the URL from <c>textBox1</c>, downloads the page,
        /// extracts its links and writes them to the XML file.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Trim before the emptiness check so whitespace-only input is rejected too.
            string strURL = textBox1.Text.Trim();
            if (strURL == "")
            {
                MessageBox.Show("请输入网址");
                return;
            }

            // StartsWith is safe for inputs shorter than the prefix (Substring(0, 7)
            // was not), and an existing https:// scheme must not get http:// prepended.
            bool hasScheme =
                strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                strURL = @"http://" + strURL;
            }

            try
            {
                MessageBox.Show("正在获取页面代码，请稍后...");
                strCode = GetPageSource(strURL);

                MessageBox.Show("正在提取超链接，请稍侯...");
                alLinks = GetHyperLinks(strCode);

                MessageBox.Show("正在写入文件，请稍侯...");
                WriteToXml(strURL, alLinks);
            }
            catch (UriFormatException ex)
            {
                // The text typed by the user could not be parsed as a URI.
                MessageBox.Show("网址格式不正确：" + ex.Message);
            }
            catch (WebException ex)
            {
                // DNS failure, refused connection, HTTP error status, timeout...
                MessageBox.Show("获取页面失败：" + ex.Message);
            }
            catch (IOException ex)
            {
                // Failure while reading the response stream or writing the XML file.
                MessageBox.Show("写入文件失败：" + ex.Message);
            }
        }

        /// <summary>
        /// Downloads the HTML source of the given page.
        /// </summary>
        /// <param name="URL">Absolute http/https URL of the page.</param>
        /// <returns>The response body decoded as GB2312 text.</returns>
        /// <exception cref="UriFormatException"><paramref name="URL"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);
            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

            // Request options only take effect if set before GetResponse() sends the request.
            hwReq.Method = "GET";
            hwReq.KeepAlive = false;

            // Dispose the response and reader so the connection is released even on failure.
            using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
            using (StreamReader reader = new StreamReader(
                hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the distinct absolute http/https URLs that appear in the HTML text.
        /// </summary>
        /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
        /// <returns>A sorted <see cref="ArrayList"/> of unique URL strings.</returns>
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();
            if (string.IsNullOrEmpty(htmlCode))
            {
                return al;
            }

            // HashSet gives O(1) duplicate detection; its default string comparer is
            // ordinal, the same equality the previous linear scan with == used.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match m in LinkRegex.Matches(htmlCode))
            {
                if (seen.Add(m.Value))
                {
                    al.Add(m.Value);
                }
            }

            al.Sort();
            return al;
        }

        /// <summary>
        /// Writes the links to <c>HyperLinks.xml</c> in the current directory, one
        /// element per link, named after the link's domain suffix.
        /// </summary>
        /// <param name="strURL">URL of the page the links came from; recorded in an XML comment.</param>
        /// <param name="alHyperLinks">URL strings to write.</param>
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            // using closes the file even if one of the Write* calls throws.
            using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
                writer.WriteComment("提取自" + strURL + "的超链接");

                // Two nested <HyperLinks> elements: the outer one is the document
                // root, the inner one carries the timestamp and the link elements.
                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                foreach (string str in alHyperLinks)
                {
                    writer.WriteElementString(GetDomain(str), null, str);
                }

                writer.WriteEndElement();
                writer.WriteEndElement();
                writer.Flush();
            }
        }

        /// <summary>
        /// Returns the domain suffix of a URL (com, net, cn, org or gov, in the
        /// casing used by the URL), or "other" when none of them is present.
        /// </summary>
        /// <param name="strURL">URL to inspect.</param>
        static string GetDomain(string strURL)
        {
            Match m = DomainSuffixRegex.Match(strURL);
            return m.Success ? m.Groups[1].Value : "other";
        }
    }
}