这段时间对数据采集忽感兴趣,就在网上查了相关资料,并动手实践。
一、添加com引用:Microsoft Xml 3.0;
二、添加using:using MSXML2;
三、代码:(一个TextBox、两个RichTextBox、两个Button)
//以下代码实现对“百度”首页html的获取以及以<div id=m>.+?</div>为例,实现对所采集html的处理。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using MSXML2;
using System.Text.RegularExpressions;
namespace caiji
{
    /// <summary>
    /// Demo form that downloads the HTML source of a page through MSXML2.XMLHTTP
    /// and extracts a fragment of it with a regular expression.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page whose address is typed into textBox1 (the Baidu home
        /// page when the box is blank) and shows its source in richTextBox1.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string url = "http://www.baidu.com";

            // Trim before testing for emptiness so that whitespace-only input
            // falls back to the default address instead of producing an empty URL.
            string typed = this.textBox1.Text.Trim();
            if (typed.Length > 0)
            {
                url = typed;
            }

            this.richTextBox1.Text = this.GetRemoteHtmlCode(url);
        }

        /// <summary>
        /// Fetches the source of a remote document with a synchronous HTTP GET.
        /// </summary>
        /// <param name="Url">Address of the remote document.</param>
        /// <returns>
        /// The decoded response body, or an empty string when the request did not
        /// complete or the response carried no body.
        /// </returns>
        public string GetRemoteHtmlCode(string Url)
        {
            string s = "";
            MSXML2.XMLHTTP _xmlhttp = new MSXML2.XMLHTTPClass();
            try
            {
                // Third argument "false" makes the request synchronous.
                _xmlhttp.open("GET", Url, false, null, null);
                _xmlhttp.send("");

                // readyState 4 means the response has been received completely.
                if (_xmlhttp.readyState == 4)
                {
                    // "as" yields null for an absent body instead of letting
                    // GetString throw on a null argument.
                    byte[] body = _xmlhttp.responseBody as byte[];
                    if (body != null && body.Length > 0)
                    {
                        Encoding encoding = DetectEncoding(ReadContentType(_xmlhttp), body);
                        s = encoding.GetString(body);
                    }
                }
            }
            finally
            {
                // Release the underlying COM object now rather than waiting for
                // the runtime callable wrapper to be finalized.
                System.Runtime.InteropServices.Marshal.ReleaseComObject(_xmlhttp);
            }
            return s;
        }

        /// <summary>
        /// Reads the Content-Type response header; returns null when it cannot be read.
        /// </summary>
        private static string ReadContentType(MSXML2.XMLHTTP request)
        {
            try
            {
                return request.getResponseHeader("Content-Type");
            }
            catch (System.Runtime.InteropServices.COMException)
            {
                // The header is only a hint for decoding; fall back to sniffing the body.
                return null;
            }
        }

        /// <summary>
        /// Chooses the encoding used to decode a response body: the charset of the
        /// Content-Type header first, then a charset declaration near the start of
        /// the document, and finally the system default code page.
        /// </summary>
        private static Encoding DetectEncoding(string contentType, byte[] body)
        {
            Encoding fromHeader = EncodingFromCharset(contentType);
            if (fromHeader != null)
            {
                return fromHeader;
            }

            // A charset declaration is plain ASCII, so decoding the first few
            // kilobytes as ASCII is enough to find it whatever the real encoding is.
            int sniffLength = Math.Min(body.Length, 4096);
            string head = Encoding.ASCII.GetString(body, 0, sniffLength);
            Encoding fromMeta = EncodingFromCharset(head);
            if (fromMeta != null)
            {
                return fromMeta;
            }

            return Encoding.Default;
        }

        /// <summary>
        /// Finds the first "charset=name" declaration in <paramref name="text"/> and
        /// maps it to an encoding; returns null when there is none or the name is unknown.
        /// </summary>
        private static Encoding EncodingFromCharset(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            Match declaration = Regex.Match(
                text,
                @"charset\s*=\s*[""']?\s*([A-Za-z0-9_\-]+)",
                RegexOptions.IgnoreCase);
            if (!declaration.Success)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(declaration.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: let the caller fall back.
                return null;
            }
        }

        /// <summary>
        /// Extracts the first &lt;div id=m&gt;...&lt;/div&gt; fragment from the source shown in
        /// richTextBox1 and displays it in richTextBox2.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            string Reg = "<div id=m>.+?</div>";
            string str = this.richTextBox1.Text.Trim();

            // Singleline lets "." cross line breaks, so a div that spans several
            // lines of HTML is still matched.
            this.richTextBox2.Text = this.GetRegValue(Reg, str, RegexOptions.Singleline);
        }

        /// <summary>
        /// Returns the first match of a regular expression in a string, using the
        /// default regular expression options.
        /// </summary>
        /// <param name="RegexString">Regular expression pattern.</param>
        /// <param name="RemoteStr">Text to search.</param>
        /// <returns>The matched text, or an empty string when nothing matches.</returns>
        public string GetRegValue(string RegexString, string RemoteStr)
        {
            return GetRegValue(RegexString, RemoteStr, RegexOptions.None);
        }

        /// <summary>
        /// Returns the first match of a regular expression in a string.
        /// </summary>
        /// <param name="RegexString">Regular expression pattern.</param>
        /// <param name="RemoteStr">Text to search; null is treated as "no match".</param>
        /// <param name="options">Options applied to the pattern.</param>
        /// <returns>The matched text, or an empty string when nothing matches.</returns>
        public string GetRegValue(string RegexString, string RemoteStr, RegexOptions options)
        {
            if (RemoteStr == null)
            {
                return "";
            }

            Match m = Regex.Match(RemoteStr, RegexString, options);
            return m.Success ? m.Value : "";
        }
    }
}