- using System;
- using System.Collections.Generic;
- using System.ComponentModel;
- using System.Data;
- using System.Drawing;
- using System.Text;
- using System.Windows.Forms;
- using System.Net;
- using System.IO;
- using System.Text.RegularExpressions;
- using System.Collections;
namespace CopyHtml
{
    /// <summary>
    /// Form that downloads the page whose address is typed into <c>textBox1</c> and shows
    /// its raw source (<c>textBox2</c>), the source with HTML tags stripped (<c>textBox3</c>)
    /// and the page title (<c>textBox4</c>).
    /// </summary>
    public partial class Form1 : Form
    {
        // One tag: '<', at least one character (line breaks included), up to the nearest '>'.
        // Singleline makes '.' match '\n'; the previous pattern spelled the newline as "/n",
        // so tags that spanned a line break were never matched.
        private static readonly Regex TagPattern =
            new Regex("<.+?>", RegexOptions.Singleline);

        // Captures the text between <title> and </title>.
        private static readonly Regex TitlePattern =
            new Regex("<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // A whole quoted href attribute, e.g. href="http://example.com/".
        private static readonly Regex HrefPattern =
            new Regex(@"(href)[ ]*=[ ]*[""'][^""'#>]+[""']", RegexOptions.IgnoreCase);

        // The charset parameter of a Content-Type header value.
        private static readonly Regex CharsetPattern =
            new Regex(@"charset\s*=\s*[""']?([^\s;""']+)", RegexOptions.IgnoreCase);

        /// <summary>
        /// Creates the form. <c>InitializeComponent</c> is declared outside this file
        /// (presumably in the designer-generated part of this partial class).
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: fetches the page at the trimmed address in <c>textBox1</c> and
        /// fills the three output text boxes. Expected failures (malformed address,
        /// unsupported scheme, network or stream errors) are reported in a message box
        /// instead of escaping the handler.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string html;
            try
            {
                html = DownloadPage(textBox1.Text.Trim());
            }
            catch (UriFormatException ex)
            {
                MessageBox.Show(ex.Message);
                return;
            }
            catch (NotSupportedException ex)
            {
                MessageBox.Show(ex.Message);
                return;
            }
            catch (WebException ex)
            {
                MessageBox.Show(ex.Message);
                return;
            }
            catch (IOException ex)
            {
                MessageBox.Show(ex.Message);
                return;
            }

            // Raw page source.
            textBox2.Text = html;

            // Page source with the HTML tags removed. The downloaded string is used
            // directly rather than being read back from the text box.
            textBox3.Text = stripHtml(html);

            // Page title; group 1 is empty when no <title> element matched.
            Match titleMatch = TitlePattern.Match(html);
            textBox4.Text = "网页的标题是:" + titleMatch.Groups[1].Value;
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its text.
        /// The response, its stream and the reader are disposed before returning.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>The response text with its lines joined without separators.</returns>
        private static string DownloadPage(string url)
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, ResolveEncoding(response)))
            {
                StringBuilder page = new StringBuilder();
                string line;
                // ReadLine drops the line terminators, so the page is flattened to one line.
                // NOTE(review): this fuses text across source line breaks; kept as-is
                // because the text boxes' Multiline settings are not visible here.
                while ((line = reader.ReadLine()) != null)
                {
                    page.Append(line);
                }
                return page.ToString();
            }
        }

        /// <summary>
        /// Picks the text encoding for a response: the charset explicitly declared in an
        /// HTTP Content-Type header when it names a known encoding, otherwise
        /// <see cref="Encoding.Default"/>.
        /// </summary>
        /// <param name="response">Response whose headers are inspected.</param>
        /// <returns>The encoding to decode the response body with.</returns>
        private static Encoding ResolveEncoding(WebResponse response)
        {
            HttpWebResponse http = response as HttpWebResponse;
            if (http != null && !string.IsNullOrEmpty(http.ContentType))
            {
                Match declared = CharsetPattern.Match(http.ContentType);
                if (declared.Success)
                {
                    try
                    {
                        return Encoding.GetEncoding(declared.Groups[1].Value);
                    }
                    catch (ArgumentException)
                    {
                        // Unrecognised charset name: fall back to the default below.
                    }
                }
            }
            return Encoding.Default;
        }

        /// <summary>
        /// Removes the HTML tags from a string, then decodes the <c>&amp;lt;</c> and
        /// <c>&amp;gt;</c> entities.
        /// </summary>
        /// <param name="strHtml">The string to convert.</param>
        /// <returns>The text with tags removed and the two entities decoded.</returns>
        private string stripHtml(string strHtml)
        {
            // Tags are removed first so that brackets produced by the entity decoding
            // below are not themselves treated as tags.
            string text = TagPattern.Replace(strHtml, "");
            text = text.Replace("&lt;", "<");
            text = text.Replace("&gt;", ">");
            return text;
        }

        /// <summary>
        /// Extracts the distinct quoted <c>href</c> attributes found in HTML code.
        /// </summary>
        /// <param name="htmlCode">HTML text to scan.</param>
        /// <returns>
        /// A sorted <see cref="ArrayList"/> of strings. Each entry is the whole matched
        /// attribute (for example <c>href="page.html"</c>), not just the address.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlCode"/> is null.</exception>
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            if (htmlCode == null)
            {
                throw new ArgumentNullException("htmlCode");
            }

            ArrayList links = new ArrayList();
            // Attributes already collected; the default string comparer is ordinal,
            // the same equality the duplicate filter has always used.
            Dictionary<string, bool> seen = new Dictionary<string, bool>();
            foreach (Match hit in HrefPattern.Matches(htmlCode))
            {
                string attribute = hit.Value;
                if (!seen.ContainsKey(attribute))
                {
                    seen.Add(attribute, true);
                    links.Add(attribute);
                }
            }
            links.Sort();
            return links;
        }
    }
}
C#程序抓取网页实例
最新推荐文章于 2024-04-12 17:50:44 发布