C#程序抓取网页实例

最新推荐文章于 2024-04-12 17:50:44 发布

哈哈军团

最新推荐文章于 2024-04-12 17:50:44 发布

阅读量333

点赞数

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
namespace CopyHtml
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form and runs its component initialization.
/// </summary>
/// <remarks>
/// InitializeComponent is declared in another part of this partial class
/// (presumably the designer-generated file - confirm).
/// </remarks>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Click handler for button1: downloads the page whose address is typed in textBox1,
/// then shows the raw source in textBox2, the source with tags stripped in textBox3
/// and the page title in textBox4.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    string html;
    try
    {
        WebRequest request = WebRequest.Create(textBox1.Text.Trim());

        // Dispose the response, its stream and the reader deterministically: a
        // response that is never closed keeps its underlying connection reserved,
        // so repeated clicks could eventually stall waiting for a free connection.
        using (WebResponse response = request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        // NOTE(review): Encoding.Default is the machine's ANSI code page; pages served
        // in a different charset (e.g. UTF-8) will decode incorrectly. Kept as-is.
        using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
        {
            StringBuilder source = new StringBuilder();
            string line;
            // ReadLine returns each line without its terminator, so the page is
            // deliberately collapsed onto a single line before the regexes run.
            while ((line = reader.ReadLine()) != null)
            {
                source.Append(line);
            }
            html = source.ToString();
        }
    }
    catch (UriFormatException ex)
    {
        // Empty or malformed address typed by the user.
        MessageBox.Show(ex.Message);
        return;
    }
    catch (NotSupportedException ex)
    {
        // URI scheme that WebRequest has no handler for.
        MessageBox.Show(ex.Message);
        return;
    }
    catch (WebException ex)
    {
        // DNS failure, timeout, HTTP error status, ...
        MessageBox.Show(ex.Message);
        return;
    }
    catch (IOException ex)
    {
        // Connection dropped while the body was being read.
        MessageBox.Show(ex.Message);
        return;
    }

    // Raw page source.
    textBox2.Text = html;

    // Page source with HTML tags removed.
    textBox3.Text = stripHtml(html);

    // Page title; Groups[1] is the empty string when no <title> element matched.
    Match titleMatch = Regex.Match(html, "<title>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
    textBox4.Text = "网页的标题是：" + titleMatch.Groups[1].Value;
}
/// <summary>
/// Removes HTML tags from a string.
/// </summary>
/// <param name="strHtml">The text to strip; must not be null.</param>
/// <returns>
/// The input with every lazily matched "&lt;" ... "&gt;" span removed. All other
/// characters, including unpaired angle brackets and character entities such as
/// "&amp;lt;", are returned unchanged.
/// </returns>
private string stripHtml(string strHtml)
{
    // Singleline makes '.' match line breaks as well, so a tag that is split across
    // several lines is removed too. On single-line input this matches exactly what
    // the previous pattern "<(.|/n)+?>" matched.
    return Regex.Replace(strHtml, "<.+?>", string.Empty, RegexOptions.Singleline);
}
/// <summary>
/// Collects the distinct href attributes found in a piece of HTML.
/// </summary>
/// <param name="htmlCode">HTML text to scan; must not be null.</param>
/// <returns>
/// A sorted ArrayList of strings without duplicates. Each entry is the complete
/// matched attribute text (attribute name, '=' and quotes included, e.g.
/// href="page.html"), not just the URL. Duplicates are detected by exact,
/// case-sensitive comparison.
/// </returns>
public static ArrayList GetHyperLinks(string htmlCode)
{
    // The quoted value may not contain a quote, '#' or '>', so an href that carries
    // a fragment (href="page.html#top", href="#top") is not matched at all.
    string strRegex = @"(href)[ ]*=[ ]*[""'][^""'#>]+[""']";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

    ArrayList al = new ArrayList();
    // Tracks what has already been added so the duplicate check is a hash lookup
    // instead of a linear scan of the result list for every match.
    Dictionary<string, bool> seen = new Dictionary<string, bool>();

    foreach (Match m in r.Matches(htmlCode))
    {
        string link = m.Value;
        if (!seen.ContainsKey(link))
        {
            seen.Add(link, true);
            al.Add(link);
        }
    }

    al.Sort();
    return al;
}
}
}