简单的信息采集程序示例(小偷程序)
最近正准备做一个信息采集的程序,下面是一个简单的采集程序示例,提供给初学者入门参考。

aspx 页面代码:
<asp:TextBox ID="Txt_Url" runat="server" Width="441px"></asp:TextBox><br />
<asp:Button id="Btn_GetUrlSource" runat="server" Text="取得网页代码" OnClick="Btn_GetUrlSource_Click"></asp:Button>
<br />
<asp:TextBox id="Txt_UrlSource" runat="server" Width="100%" Height="195px" TextMode="MultiLine"></asp:TextBox><br />
<br />
采集开始代码
<asp:TextBox ID="Txt_First" runat="server" Height="90px" TextMode="MultiLine" Width="280px"></asp:TextBox><br />
<asp:Button ID="Btn_ListCheck" runat="server" OnClick="Btn_ListCheck_Click" Text="测试唯一性" /><br />
采集结束代码
<asp:TextBox ID="Txt_Last" runat="server" Height="90px" TextMode="MultiLine"
Width="280px"></asp:TextBox><br />
<br />
<asp:Button ID="Btn_Result" runat="server" Text="取得采集结果" OnClick="Btn_Result_Click" /><br />
<asp:TextBox ID="Txt_Result" runat="server" Height="134px" TextMode="MultiLine" Width="579px"></asp:TextBox>
<asp:Button id="Btn_GetUrlSource" runat="server" Text="取得网页代码" OnClick="Btn_GetUrlSource_Click"></asp:Button>
<br />
<asp:TextBox id="Txt_UrlSource" runat="server" Width="100%" Height="195px" TextMode="MultiLine"></asp:TextBox><br />
<br />
采集开始代码
<asp:TextBox ID="Txt_First" runat="server" Height="90px" TextMode="MultiLine" Width="280px"></asp:TextBox><br />
<asp:Button ID="Btn_ListCheck" runat="server" OnClick="Btn_ListCheck_Click" Text="测试唯一性" /><br />
采集结束代码
<asp:TextBox ID="Txt_Last" runat="server" Height="90px" TextMode="MultiLine"
Width="280px"></asp:TextBox><br />
<br />
<asp:Button ID="Btn_Result" runat="server" Text="取得采集结果" OnClick="Btn_Result_Click" /><br />
<asp:TextBox ID="Txt_Result" runat="server" Height="134px" TextMode="MultiLine" Width="579px"></asp:TextBox>
.cs页面代码
using
System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using NetShuai.Database;
// Address of the page to fetch; Btn_GetUrlSource_Click assigns it from Txt_Url.Text.
private string PageUrl = "" ;
/// <summary>
/// Page load handler. Intentionally empty: no per-request initialization is performed here.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Page_Load( object sender, System.EventArgs e)
{
// Put user code to initialize the page here
}
/// <summary>
/// Downloads the page at the URL typed into Txt_Url and displays its text in Txt_UrlSource.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// Exceptions raised by WebRequest.Create / GetResponse (bad URL, network failure) are not
/// caught here and propagate to the page, exactly as before.
/// NOTE(review): the URL comes straight from user input; if this page is ever exposed to
/// untrusted users, restrict the allowed schemes/hosts before issuing the request.
/// </remarks>
protected void Btn_GetUrlSource_Click(object sender, EventArgs e)
{
    PageUrl = Txt_Url.Text;
    WebRequest request = WebRequest.Create(PageUrl);

    // using blocks release the response, its stream and the reader even when reading throws;
    // previously the WebResponse was never closed and a failed read leaked the connection.
    using (WebResponse response = request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    // Encoding.Default is kept from the original code.
    // NOTE(review): pages served in a different charset will presumably decode incorrectly - confirm.
    using (StreamReader sr = new StreamReader(resStream, System.Text.Encoding.Default))
    {
        Txt_UrlSource.Text = sr.ReadToEnd();
    }
}
/// <summary>
/// Extracts every piece of text in Txt_UrlSource that lies between the start marker
/// (Txt_First) and the end marker (Txt_Last) and appends the pieces to Txt_Result.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Btn_Result_Click(object sender, EventArgs e)
{
    // Markers and source are all HTML-encoded so they are compared in the same representation.
    string startMarker = Server.HtmlEncode(Txt_First.Text);
    string endMarker = Server.HtmlEncode(Txt_Last.Text);

    // An empty marker turns the look-around into a match-everywhere assertion and yields
    // only meaningless fragments, so there is nothing useful to extract.
    if (string.IsNullOrEmpty(startMarker) || string.IsNullOrEmpty(endMarker))
    {
        return;
    }

    // Both literals must be verbatim (@): in a regular string "\w" is an invalid escape and
    // does not compile. Regex.Escape makes the markers match literally even when the pasted
    // HTML contains regex metacharacters such as '?', '(', '[' or '.'.
    string strExp = @"(?<=" + Regex.Escape(startMarker) + @")[\w\W]*?(?=" + Regex.Escape(endMarker) + @")";

    MatchCollection mc = Regex.Matches(Server.HtmlEncode(Txt_UrlSource.Text), strExp);

    // Collect all fragments first, then touch the control once.
    System.Text.StringBuilder collected = new System.Text.StringBuilder();
    foreach (Match m in mc)
    {
        collected.Append(Server.HtmlDecode(m.Value));
    }

    // Appends to whatever Txt_Result already holds, as the original loop did.
    Txt_Result.Text += collected.ToString();
}
/// <summary>
/// Checks that the start marker (Txt_First) and the end marker (Txt_Last) each occur at most
/// once in the fetched page source (Txt_UrlSource); emits a JavaScript alert for the first
/// marker found more than once.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Btn_ListCheck_Click(object sender, EventArgs e)
{
    // Encode the page source once; both checks search the same text.
    string encodedSource = Server.HtmlEncode(Txt_UrlSource.Text);

    // Regex.Escape: the markers are pasted HTML, not patterns, so metacharacters must match
    // literally. Empty markers are skipped because an empty pattern matches at every
    // position and would always be reported as a duplicate.
    string startMarker = Server.HtmlEncode(Txt_First.Text);
    if (!string.IsNullOrEmpty(startMarker)
        && Regex.Matches(encodedSource, Regex.Escape(startMarker)).Count > 1)
    {
        Response.Write("<script>alert('列表开始代码有重复!')</script>");
        return;
    }

    string endMarker = Server.HtmlEncode(Txt_Last.Text);
    if (!string.IsNullOrEmpty(endMarker)
        && Regex.Matches(encodedSource, Regex.Escape(endMarker)).Count > 1)
    {
        Response.Write("<script>alert('列表结束代码有重复!')</script>");
        return;
    }
}
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using NetShuai.Database;
// Address of the page to fetch; Btn_GetUrlSource_Click assigns it from Txt_Url.Text.
private string PageUrl = "" ;
/// <summary>
/// Page load handler. Intentionally empty: no per-request initialization is performed here.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Page_Load( object sender, System.EventArgs e)
{
// Put user code to initialize the page here
}
/// <summary>
/// Downloads the page at the URL typed into Txt_Url and displays its text in Txt_UrlSource.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// Exceptions raised by WebRequest.Create / GetResponse (bad URL, network failure) are not
/// caught here and propagate to the page, exactly as before.
/// NOTE(review): the URL comes straight from user input; if this page is ever exposed to
/// untrusted users, restrict the allowed schemes/hosts before issuing the request.
/// </remarks>
protected void Btn_GetUrlSource_Click(object sender, EventArgs e)
{
    PageUrl = Txt_Url.Text;
    WebRequest request = WebRequest.Create(PageUrl);

    // using blocks release the response, its stream and the reader even when reading throws;
    // previously the WebResponse was never closed and a failed read leaked the connection.
    using (WebResponse response = request.GetResponse())
    using (Stream resStream = response.GetResponseStream())
    // Encoding.Default is kept from the original code.
    // NOTE(review): pages served in a different charset will presumably decode incorrectly - confirm.
    using (StreamReader sr = new StreamReader(resStream, System.Text.Encoding.Default))
    {
        Txt_UrlSource.Text = sr.ReadToEnd();
    }
}
/// <summary>
/// Extracts every piece of text in Txt_UrlSource that lies between the start marker
/// (Txt_First) and the end marker (Txt_Last) and appends the pieces to Txt_Result.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Btn_Result_Click(object sender, EventArgs e)
{
    // Markers and source are all HTML-encoded so they are compared in the same representation.
    string startMarker = Server.HtmlEncode(Txt_First.Text);
    string endMarker = Server.HtmlEncode(Txt_Last.Text);

    // An empty marker turns the look-around into a match-everywhere assertion and yields
    // only meaningless fragments, so there is nothing useful to extract.
    if (string.IsNullOrEmpty(startMarker) || string.IsNullOrEmpty(endMarker))
    {
        return;
    }

    // Both literals must be verbatim (@): in a regular string "\w" is an invalid escape and
    // does not compile. Regex.Escape makes the markers match literally even when the pasted
    // HTML contains regex metacharacters such as '?', '(', '[' or '.'.
    string strExp = @"(?<=" + Regex.Escape(startMarker) + @")[\w\W]*?(?=" + Regex.Escape(endMarker) + @")";

    MatchCollection mc = Regex.Matches(Server.HtmlEncode(Txt_UrlSource.Text), strExp);

    // Collect all fragments first, then touch the control once.
    System.Text.StringBuilder collected = new System.Text.StringBuilder();
    foreach (Match m in mc)
    {
        collected.Append(Server.HtmlDecode(m.Value));
    }

    // Appends to whatever Txt_Result already holds, as the original loop did.
    Txt_Result.Text += collected.ToString();
}
/// <summary>
/// Checks that the start marker (Txt_First) and the end marker (Txt_Last) each occur at most
/// once in the fetched page source (Txt_UrlSource); emits a JavaScript alert for the first
/// marker found more than once.
/// </summary>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Btn_ListCheck_Click(object sender, EventArgs e)
{
    // Encode the page source once; both checks search the same text.
    string encodedSource = Server.HtmlEncode(Txt_UrlSource.Text);

    // Regex.Escape: the markers are pasted HTML, not patterns, so metacharacters must match
    // literally. Empty markers are skipped because an empty pattern matches at every
    // position and would always be reported as a duplicate.
    string startMarker = Server.HtmlEncode(Txt_First.Text);
    if (!string.IsNullOrEmpty(startMarker)
        && Regex.Matches(encodedSource, Regex.Escape(startMarker)).Count > 1)
    {
        Response.Write("<script>alert('列表开始代码有重复!')</script>");
        return;
    }

    string endMarker = Server.HtmlEncode(Txt_Last.Text);
    if (!string.IsNullOrEmpty(endMarker)
        && Regex.Matches(encodedSource, Regex.Escape(endMarker)).Count > 1)
    {
        Response.Write("<script>alert('列表结束代码有重复!')</script>");
        return;
    }
}