应用部分:
using
System;
using
System.Collections.Generic;
using
System.Text;
using
System.IO;
using
System.Data;
using
System.Text.RegularExpressions;
using
System.Net;
using
System.Threading;
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)
![](https://i-blog.csdnimg.cn/blog_migrate/a41954a27d6ad96fa2c2cf816e677448.gif)
namespace
GetBrand
...
{
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
class Program ...{
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Program entry point: runs the whole scrape, prints a completion
/// message, then blocks until input arrives on standard input.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    GetUrlAndDoWork();

    const string finishedMessage = "数据获取完毕,完任意键退出。。。";
    Console.WriteLine(finishedMessage);

    // Wait for input before returning so the output can be read.
    Console.Read();
}
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
//处理第一次获取失败的url
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Retries the URLs that were logged as failed during the first pass.
/// </summary>
/// <remarks>
/// The original implementation recursed for as long as
/// <c>GetErrorDate</c> returned rows. Nothing in this file marks a log
/// row as resolved after a successful retry, so such rows stay eligible
/// and the recursion never ends. The retry is therefore run as a
/// bounded loop.
/// </remarks>
private static void DoError()
{
    // GetErrorDate only returns rows with DoTimes <= 3 and AddLog raises
    // DoTimes by one on every repeated failure, so four passes are enough
    // to use up the retry budget of a URL that keeps failing.
    const int maxPasses = 4;

    DBHelp db = new DBHelp();
    for (int pass = 0; pass < maxPasses; pass++)
    {
        DataSet pending = db.GetErrorDate();
        if (pending == null || pending.Tables.Count == 0 || pending.Tables[0].Rows.Count == 0)
        {
            // Nothing left to retry.
            return;
        }

        DataRowCollection rows = pending.Tables[0].Rows;
        for (int i = 0; i < rows.Count; i++)
        {
            string url = rows[i]["url"].ToString();

            // The second argument is only used by SaveDate for its console
            // banner; here it is the row index, not the site page number.
            SaveDate(url, i);
        }
    }
}
//获取url并处理数据
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Walks index pages 1 through 813 of brand.chinasspp.com, hands each
/// page URL to <c>SaveDate</c>, and finally calls <c>DoError</c> to
/// retry the URLs that were logged as failed.
/// </summary>
private static void GetUrlAndDoWork()
{
    int page = 1;
    while (page <= 813)
    {
        // {0} is replaced by the page number.
        string pageUrl = string.Format("http://brand.chinasspp.com/Index-0-{0}.htm", page.ToString());
        SaveDate(pageUrl, page);
        page++;
    }

    DoError();
}
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
//获取数据保存到本地
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Downloads one index page, extracts (brand, company) pairs from the
/// bold tags found between the "&lt;/SELECT&gt;" marker and the last
/// "转到" marker, and stores every pair whose brand name passes
/// <c>IsValidate</c>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="i">Number shown in the console banner for this page.</param>
private static void SaveDate(string url, int i)
{
    string content = ReturnByUrl(url);
    if (string.IsNullOrEmpty(content))
    {
        // Download failed; ReturnByUrl has already logged the URL.
        return;
    }

    int start = content.IndexOf("</SELECT>");
    int end = content.LastIndexOf("转到");

    // start < 0 means the opening marker is missing. The original only
    // tested end > start, which let start == -1 reach Substring and throw.
    if (start < 0 || end <= start)
    {
        return;
    }

    string workContent = content.Substring(start, end - start);
    MatchCollection openTags = Regex.Matches(workContent, "<b>");
    MatchCollection closeTags = Regex.Matches(workContent, "</b>");

    Console.WriteLine("================开始获取第" + i.ToString() + "页数据==================");

    DBHelp db = new DBHelp();

    // Tags are consumed two at a time: brand name first, company second.
    // Bounding by the smaller count and requiring a complete pair keeps
    // odd or mismatched tag counts from indexing past the end.
    int usable = Math.Min(openTags.Count, closeTags.Count);
    for (int k = 0; k + 1 < usable; k += 2)
    {
        string brandName = TextBetween(workContent, openTags[k], closeTags[k]);
        string company = TextBetween(workContent, openTags[k + 1], closeTags[k + 1]);
        if (brandName == null || company == null)
        {
            // Tags are out of order for this pair; skip it.
            continue;
        }

        if (IsValidate(brandName))
        {
            Console.WriteLine(brandName + "----" + company);

            Brand model = new Brand();
            model.BrandName = brandName;
            model.ComPan = company;
            model.Pic = "";
            model.Create = DateTime.Now;
            db.Add(model);
        }
    }

    Console.WriteLine("================本页获取数据结束=====================");
}

// Returns the text between an opening and a closing tag match, or null
// when the closing tag does not come after the opening tag.
private static string TextBetween(string text, Match open, Match close)
{
    int from = open.Index + open.Length;
    int length = close.Index - from;
    return length < 0 ? null : text.Substring(from, length);
}
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
//验证数据的有效性
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Tells whether a scraped string is usable as a brand name, i.e. it is
/// not just an optionally negative whole number.
/// </summary>
/// <param name="_str">Text to check.</param>
/// <returns>
/// <c>false</c> when <paramref name="_str"/> consists only of digits
/// with an optional leading minus sign; <c>true</c> otherwise.
/// </returns>
private static bool IsValidate(string _str)
{
    // The pattern previously read "^-?d+$" (backslash lost), which matched
    // runs of the letter 'd' and never matched actual numbers.
    return !Regex.IsMatch(_str, @"^-?\d+$");
}
![](https://i-blog.csdnimg.cn/blog_migrate/6a9c071a08f1dae2d3e1c512000eef41.gif)
//根据url返回请求的内容
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Downloads the body of <paramref name="url"/> as text.
/// </summary>
/// <param name="url">Address to request.</param>
/// <returns>
/// The response body, or an empty string when the request or the read
/// fails. On failure the URL is recorded through <c>DBHelp.AddLog</c>.
/// </returns>
private static string ReturnByUrl(string url)
{
    string responseFromServer = string.Empty;
    try
    {
        WebRequest request = WebRequest.Create(url);
        request.Credentials = CredentialCache.DefaultCredentials;

        // using-blocks release the response, stream and reader even when
        // the read throws; the original closed them only on success.
        using (WebResponse response = request.GetResponse())
        using (Stream dataStream = response.GetResponseStream())
        // NOTE(review): Encoding.Default is the machine's ANSI code page, so
        // decoding depends on where this runs - confirm the site's charset.
        using (StreamReader reader = new StreamReader(dataStream, Encoding.Default))
        {
            responseFromServer = reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best effort: any failure is logged so that the URL can
        // be retried later, and the caller receives an empty string.
        new DBHelp().AddLog(url);
    }

    return responseFromServer;
}
}
}
数据交互部分:
using
System;
using
System.Collections.Generic;
using
System.Text;
using
System.Data;
using
System.Data.SqlClient;
![](https://i-blog.csdnimg.cn/blog_migrate/a41954a27d6ad96fa2c2cf816e677448.gif)
namespace
GetBrand
...
{
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Data-access helper for the Brand and Log tables. Every statement is
/// executed through <c>Ucar.BaseClass.DbHelperSQL</c>.
/// </summary>
class DBHelp
{
    // NOTE(review): server credentials are hard-coded in source; consider
    // moving them to configuration.
    private readonly string _connectionString = "server=.;database=pubs;uid=sa;pwd=123123;";

    /// <summary>
    /// Inserts a brand row with the current time as its creation time.
    /// </summary>
    /// <param name="BrandName">Brand name; null is stored as an empty string.</param>
    /// <param name="Conmpany">Company name; null is stored as an empty string.</param>
    public void Add(string BrandName, string Conmpany)
    {
        // Parameterized so that quotes in scraped text cannot break or
        // alter the statement, and so that the date is not sent as
        // culture-dependent text.
        const string sql = "insert into Brand (BrandName,ComPan,[Create]) values (@BrandName,@ComPan,@Create)";

        SqlParameter[] parameters = {
            new SqlParameter("@BrandName", SqlDbType.VarChar, 50),
            new SqlParameter("@ComPan", SqlDbType.VarChar, 100),
            new SqlParameter("@Create", SqlDbType.DateTime)
        };
        parameters[0].Value = BrandName ?? string.Empty;
        parameters[1].Value = Conmpany ?? string.Empty;
        parameters[2].Value = DateTime.Now;

        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, _connectionString, parameters);
    }

    /// <summary>
    /// Inserts a brand row from <paramref name="model"/>
    /// (BrandName, ComPan, Pic and Create are written).
    /// </summary>
    /// <param name="model">Values to insert.</param>
    public void Add(Brand model)
    {
        const string sql = "insert into Brand(BrandName,ComPan,Pic,[Create]) values (@BrandName,@ComPan,@Pic,@Create)";

        SqlParameter[] parameters = {
            new SqlParameter("@BrandName", SqlDbType.VarChar, 50),
            new SqlParameter("@ComPan", SqlDbType.VarChar, 100),
            new SqlParameter("@Pic", SqlDbType.VarChar, 100),
            new SqlParameter("@Create", SqlDbType.DateTime)
        };
        parameters[0].Value = model.BrandName;
        parameters[1].Value = model.ComPan;
        parameters[2].Value = model.Pic;
        parameters[3].Value = model.Create;

        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, _connectionString, parameters);
    }

    /// <summary>
    /// Returns the Log rows that are still flagged IsValidata = 0 and
    /// whose DoTimes is at most 3.
    /// </summary>
    /// <returns>The query result as returned by <c>DbHelperSQL.Query</c>.</returns>
    public DataSet GetErrorDate()
    {
        const string sql = "select * from Log where IsValidata=0 and DoTimes<=3";
        return Ucar.BaseClass.DbHelperSQL.Query(sql, _connectionString);
    }

    /// <summary>
    /// Checks whether the Log table already has a row for <paramref name="url"/>.
    /// </summary>
    /// <param name="url">URL to look up.</param>
    /// <returns>The result of <c>DbHelperSQL.Exists</c> for the lookup.</returns>
    public bool Exists(string url)
    {
        const string sql = "select count(1) from Log where url= @url";
        SqlParameter[] parameters = { CreateUrlParameter(url) };
        return Ucar.BaseClass.DbHelperSQL.Exists(sql, _connectionString, parameters);
    }

    /// <summary>
    /// Records a failed download: increments DoTimes when the URL is
    /// already logged, otherwise inserts a new row with DoTimes = 0.
    /// </summary>
    /// <param name="url">URL whose download failed.</param>
    public void AddLog(string url)
    {
        // Parameterized like Exists; the URL was previously concatenated
        // straight into the statement text.
        string sql = Exists(url)
            ? "update Log set DoTimes=DoTimes+1 where url=@url"
            : "insert into Log (url,IsValidata,DoTimes) values (@url,'0',0)";

        SqlParameter[] parameters = { CreateUrlParameter(url) };
        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, _connectionString, parameters);
    }

    // Builds the @url parameter shared by Exists and AddLog. No explicit
    // size is given: the width of Log.url is not visible here, so the
    // value is sent untruncated.
    private static SqlParameter CreateUrlParameter(string url)
    {
        SqlParameter parameter = new SqlParameter("@url", SqlDbType.VarChar);
        parameter.Value = url;
        return parameter;
    }
}
![](https://i-blog.csdnimg.cn/blog_migrate/37c8bf68cdc3cc81759c34160776bc53.gif)
/// <summary>
/// Plain data holder for one brand record; its properties correspond to
/// the columns written by <c>DBHelp.Add(Brand)</c> plus an identifier.
/// </summary>
public class Brand
{
    /// <summary>
    /// Creates an empty brand record.
    /// </summary>
    public Brand() { }

    #region Model
    private int _id;
    private string _brandname;
    private string _compan;
    private string _pic;
    private DateTime _create;

    /// <summary>
    /// Record identifier.
    /// </summary>
    public int ID
    {
        get { return _id; }
        set { _id = value; }
    }

    /// <summary>
    /// Brand name.
    /// </summary>
    public string BrandName
    {
        get { return _brandname; }
        set { _brandname = value; }
    }

    /// <summary>
    /// Name of the company associated with the brand.
    /// </summary>
    public string ComPan
    {
        get { return _compan; }
        set { _compan = value; }
    }

    /// <summary>
    /// Picture value for the brand (the scraper stores an empty string).
    /// </summary>
    public string Pic
    {
        get { return _pic; }
        set { _pic = value; }
    }

    /// <summary>
    /// Time at which the record was created.
    /// </summary>
    public DateTime Create
    {
        get { return _create; }
        set { _create = value; }
    }
    #endregion Model
}
}
SQL脚本:
-- Drop the [dbo].[Brand] user table if it already exists so that the
-- script can be re-run. Any rows already in the table are lost.
if exists (select * from dbo.sysobjects where id = object_id(N'[dbo].[Brand]') and OBJECTPROPERTY(id, N'IsUserTable') = 1)
drop table [dbo].[Brand]
GO
-- Brand table filled by the scraper: an auto-incrementing ID plus the
-- brand name, company name, picture value and creation time.
-- All columns except ID are nullable; no primary key is declared here.
CREATE TABLE [dbo].[Brand] (
[ID] [int] IDENTITY (1, 1) NOT NULL ,
[BrandName] [varchar] (50) COLLATE Chinese_PRC_CI_AS NULL ,
[ComPan] [varchar] (100) COLLATE Chinese_PRC_CI_AS NULL ,
[Pic] [varchar] (100) COLLATE Chinese_PRC_CI_AS NULL ,
[Create] [datetime] NULL
) ON [PRIMARY]
GO
运行截图:
![](https://p-blog.csdn.net/images/p_blog_csdn_net/wangjiafeng2008/4.jpg)
总结:本程序只能针对某一个特定网站进行简单的数据抓取,通用的数据抓取方案还在研究中。由于各个网站的页面结构和风格可能不一致,本程序的局限性很大,扩展性也很差。