一个简单的爬行器

应用部分:

using  System;
using  System.Collections.Generic;
using  System.Text;
using  System.IO;
using  System.Data;
using  System.Text.RegularExpressions;
using  System.Net;
using  System.Threading;

namespace  GetBrand  {
    
/// <summary>
/// Console crawler: downloads every page of the paged brand index, extracts the
/// brand/company pairs from the HTML and stores them through DBHelp. URLs that
/// could not be downloaded are logged by ReturnByUrl and retried by DoError.
/// </summary>
class Program {
    // Index pages are numbered 1..PageCount and substituted into PageUrlFormat.
    private const int PageCount = 813;
    private const string PageUrlFormat = "http://brand.chinasspp.com/Index-0-{0}.htm";

    // Upper bound on retry passes in DoError. It only guarantees termination;
    // NOTE(review): the value assumes the retry query stops returning a URL
    // after a handful of failures - confirm against DBHelp.GetErrorDate.
    private const int MaxRetryPasses = 5;

    // Markers that delimit the interesting part of a page.
    private const string SectionStartMarker = "</SELECT>";
    private const string SectionEndMarker = "转到";

    private const string OpenTag = "<b>";

    // Compiled once instead of on every page.
    private static readonly Regex OpenTagPattern = new Regex(OpenTag);
    private static readonly Regex CloseTagPattern = new Regex("</b>");
    private static readonly Regex IntegerPattern = new Regex(@"^-?\d+$");

    /// <summary>
    /// Entry point: crawls all pages, retries the failed ones, then waits for
    /// console input before exiting.
    /// </summary>
    static void Main(string[] args) {
        GetUrlAndDoWork();
        // Prompt typo fixed: the original said "完任意键" instead of "按任意键".
        Console.WriteLine("数据获取完毕,按任意键退出。。。");
        Console.Read();
    }

    /// <summary>
    /// Retries the URLs returned by DBHelp.GetErrorDate.
    /// </summary>
    /// <remarks>
    /// The original recursed for as long as the query returned rows. Nothing in
    /// this file marks a row as handled after a successful retry, so a single
    /// success made the recursion endless and re-inserted the same brands on
    /// every round. This version loops instead, remembers the URLs it has
    /// already downloaded during this run, and stops after MaxRetryPasses.
    /// </remarks>
    private static void DoError() {
        DBHelp db = new DBHelp();
        Dictionary<string, bool> resolved = new Dictionary<string, bool>();

        for (int pass = 0; pass < MaxRetryPasses; pass++) {
            DataSet ds = db.GetErrorDate();
            if (ds == null || ds.Tables.Count == 0 || ds.Tables[0].Rows.Count == 0) {
                return;
            }

            DataRowCollection rows = ds.Tables[0].Rows;
            bool attempted = false;
            for (int i = 0; i < rows.Count; i++) {
                string url = rows[i]["url"].ToString();
                if (resolved.ContainsKey(url)) {
                    continue;
                }

                attempted = true;
                // As in the original, the row index is passed as the "page"
                // number; it is only used for console output.
                if (SaveDate(url, i)) {
                    resolved[url] = true;
                }
            }

            // Every remaining row was already downloaded in this run.
            if (!attempted) {
                return;
            }
        }
    }

    /// <summary>
    /// Builds the URL of every index page, processes it, then retries failures.
    /// </summary>
    private static void GetUrlAndDoWork() {
        for (int i = 1; i <= PageCount; i++) {
            string url = string.Format(PageUrlFormat, i.ToString());
            SaveDate(url, i);
        }

        DoError();
    }

    /// <summary>
    /// Downloads one page and stores every brand/company pair found in it.
    /// </summary>
    /// <param name="url">Page to download.</param>
    /// <param name="i">Number shown in the progress message.</param>
    /// <returns>
    /// true when the page content was downloaded (even if it held no usable
    /// data); false when the download produced no content.
    /// </returns>
    private static bool SaveDate(string url, int i) {
        string content = ReturnByUrl(url);
        if (string.IsNullOrEmpty(content)) {
            return false;
        }

        int start = content.IndexOf(SectionStartMarker, StringComparison.Ordinal);
        int end = content.LastIndexOf(SectionEndMarker, StringComparison.Ordinal);

        // start < 0 used to reach Substring(-1, ...) and throw.
        if (start < 0 || end <= start) {
            return true;
        }

        string workContent = content.Substring(start, end - start);
        MatchCollection opens = OpenTagPattern.Matches(workContent);
        MatchCollection closes = CloseTagPattern.Matches(workContent);

        Console.WriteLine("================开始获取第" + i.ToString() + "页数据==================");

        DBHelp db = new DBHelp();

        // Bold elements come in pairs: brand name first, company second.
        // Only complete pairs with both an opening and a closing tag are used,
        // so an odd or mismatched tag count no longer throws.
        int usable = Math.Min(opens.Count, closes.Count);
        for (int k = 0; k + 1 < usable; k += 2) {
            string brandName = ExtractBoldText(workContent, opens[k], closes[k]);
            string company = ExtractBoldText(workContent, opens[k + 1], closes[k + 1]);
            if (brandName == null || company == null) {
                continue;
            }

            if (!IsValidate(brandName)) {
                continue;
            }

            Console.WriteLine(brandName + "----" + company);

            Brand model = new Brand();
            model.BrandName = brandName;
            model.ComPan = company;
            model.Pic = "";
            model.Create = DateTime.Now;
            db.Add(model);
        }

        Console.WriteLine("================本页获取数据结束=====================");
        return true;
    }

    /// <summary>
    /// Returns the text between an opening and a closing bold tag, or null when
    /// the closing tag does not come after the end of the opening tag.
    /// </summary>
    private static string ExtractBoldText(string html, Match open, Match close) {
        int textStart = open.Index + OpenTag.Length;
        int length = close.Index - textStart;
        if (length < 0) {
            return null;
        }

        return html.Substring(textStart, length);
    }

    /// <summary>
    /// Checks whether a captured value is usable as a brand name.
    /// </summary>
    /// <param name="_str">Captured text.</param>
    /// <returns>false when the text is just an optionally signed integer; true otherwise.</returns>
    private static bool IsValidate(string _str) {
        // The published pattern was "^-?d+$": the backslash was lost, so it
        // matched runs of the letter 'd' instead of digits.
        return !IntegerPattern.IsMatch(_str);
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>
    /// The response body, or an empty string when the request failed. A failed
    /// URL is recorded through DBHelp.AddLog.
    /// </returns>
    private static string ReturnByUrl(string url) {
        string responseFromServer = string.Empty;

        try {
            WebRequest request = WebRequest.Create(url);
            request.Credentials = CredentialCache.DefaultCredentials;

            // using blocks release the response, stream and reader even when
            // reading fails part-way through.
            // NOTE(review): Encoding.Default depends on the machine/runtime;
            // confirm it matches the encoding the site actually serves.
            using (WebResponse response = request.GetResponse())
            using (Stream dataStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(dataStream, Encoding.Default)) {
                responseFromServer = reader.ReadToEnd();
            }
        }
        catch (Exception ex) {
            // Best effort: report the failure and record the URL for a retry.
            Console.Error.WriteLine("Failed to download " + url + ": " + ex.Message);
            new DBHelp().AddLog(url);
        }

        return responseFromServer;
    }
}

}

数据交互部分:

using  System;
using  System.Collections.Generic;
using  System.Text;
using  System.Data;
using  System.Data.SqlClient;
namespace  GetBrand  {
    
/// <summary>
/// Data-access helper for the crawler: inserts rows into the Brand table and
/// maintains the Log table of URLs that failed to download. Every statement is
/// executed through Ucar.BaseClass.DbHelperSQL.
/// </summary>
class DBHelp {
    // NOTE(review): the credentials are hard-coded in source; they should come
    // from configuration. The field was renamed from "SqlConnection" so it no
    // longer shadows the System.Data.SqlClient.SqlConnection type.
    private readonly string _connectionString = "server=.;database=pubs;uid=sa;pwd=123123;";

    /// <summary>
    /// Inserts a brand with the current time as its creation time.
    /// </summary>
    /// <param name="BrandName">Brand name.</param>
    /// <param name="Conmpany">Company name.</param>
    public void Add(string BrandName, string Conmpany) {
        // Parameterized: the original concatenated the values into the SQL
        // text, so a quote in a name broke the statement (SQL injection), and
        // the timestamp was sent as culture-dependent text.
        const string sql = "insert into Brand (BrandName,ComPan,[Create]) values (@BrandName,@ComPan,@Create)";
        SqlParameter[] parameters = {
            new SqlParameter("@BrandName", SqlDbType.VarChar, 50),
            new SqlParameter("@ComPan", SqlDbType.VarChar, 100),
            new SqlParameter("@Create", SqlDbType.DateTime)
        };
        parameters[0].Value = DbValue(BrandName);
        parameters[1].Value = DbValue(Conmpany);
        parameters[2].Value = DateTime.Now;

        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, _connectionString, parameters);
    }

    /// <summary>
    /// Inserts the given brand entity.
    /// </summary>
    /// <param name="model">Entity to store; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
    public void Add(Brand model) {
        if (model == null) {
            throw new ArgumentNullException("model");
        }

        StringBuilder strSql = new StringBuilder();
        strSql.Append("insert into Brand(");
        strSql.Append("BrandName,ComPan,Pic,[Create])");
        strSql.Append(" values (");
        strSql.Append("@BrandName,@ComPan,@Pic,@Create)");

        SqlParameter[] parameters = {
            new SqlParameter("@BrandName", SqlDbType.VarChar, 50),
            new SqlParameter("@ComPan", SqlDbType.VarChar, 100),
            new SqlParameter("@Pic", SqlDbType.VarChar, 100),
            new SqlParameter("@Create", SqlDbType.DateTime)
        };
        // A null Value makes ADO.NET treat the parameter as missing, so nulls
        // are mapped to DBNull.
        parameters[0].Value = DbValue(model.BrandName);
        parameters[1].Value = DbValue(model.ComPan);
        parameters[2].Value = DbValue(model.Pic);
        parameters[3].Value = DbValue(model.Create);

        Ucar.BaseClass.DbHelperSQL.ExecuteSql(strSql.ToString(), _connectionString, parameters);
    }

    /// <summary>
    /// Returns the Log rows that are still flagged IsValidata=0 and have
    /// DoTimes of at most 3.
    /// </summary>
    public DataSet GetErrorDate() {
        string sql = "select * from Log where IsValidata=0 and DoTimes<=3";
        return Ucar.BaseClass.DbHelperSQL.Query(sql, _connectionString);
    }

    /// <summary>
    /// Tells whether the Log table already has a row for <paramref name="url"/>.
    /// </summary>
    public bool Exists(string url) {
        const string sql = "select count(1) from Log where url= @url";
        return Ucar.BaseClass.DbHelperSQL.Exists(sql, _connectionString, CreateUrlParameters(url));
    }

    /// <summary>
    /// Records a failed URL: increments DoTimes when the URL is already logged,
    /// otherwise inserts a new row with IsValidata '0' and DoTimes 0.
    /// </summary>
    public void AddLog(string url) {
        // Parameterized: the original concatenated the URL into the SQL text.
        string sql;
        if (Exists(url)) {
            sql = "update Log set DoTimes=DoTimes+1 where url = @url";
        }
        else {
            sql = "insert into Log (url,IsValidata,DoTimes) values (@url,'0',0)";
        }

        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, _connectionString, CreateUrlParameters(url));
    }

    // Builds the single @url parameter shared by Exists and AddLog. No explicit
    // size is given, so the value is not silently cut to 50 characters as the
    // original Exists parameter did.
    private static SqlParameter[] CreateUrlParameters(string url) {
        SqlParameter parameter = new SqlParameter("@url", SqlDbType.VarChar);
        parameter.Value = DbValue(url);
        return new SqlParameter[] { parameter };
    }

    // Maps a null reference to DBNull.Value; any other value is returned as is.
    private static object DbValue(object value) {
        return value == null ? (object)DBNull.Value : value;
    }
}

    
/// <summary>
/// Entity for one row of the Brand table.
/// </summary>
/// <remarks>
/// The published listing contained only the residue of a collapsed "Model"
/// region, which is not valid C#. The members below are reconstructed from how
/// the rest of the file uses this type (BrandName, ComPan, Pic, Create) and
/// from the ID column of the table script.
/// NOTE(review): confirm the member types against the original source.
/// </remarks>
public class Brand {
    private int _id;
    private string _brandName;
    private string _comPan;
    private string _pic;
    private DateTime _create;

    public Brand() { }

    /// <summary>Identity column; the code in this file never sets it.</summary>
    public int ID {
        get { return _id; }
        set { _id = value; }
    }

    /// <summary>Brand name.</summary>
    public string BrandName {
        get { return _brandName; }
        set { _brandName = value; }
    }

    /// <summary>Company name (stored in the ComPan column).</summary>
    public string ComPan {
        get { return _comPan; }
        set { _comPan = value; }
    }

    /// <summary>Value of the Pic column; the crawler stores an empty string.</summary>
    public string Pic {
        get { return _pic; }
        set { _pic = value; }
    }

    /// <summary>Creation time of the row.</summary>
    public DateTime Create {
        get { return _create; }
        set { _create = value; }
    }
}

}

SQL脚本:

-- Drops the user table dbo.Brand if it already exists, then recreates it.
-- Only the Brand table is created here; this script does not create the Log
-- table that the crawler code also queries.
IF OBJECT_ID(N'[dbo].[Brand]', N'U') IS NOT NULL
    DROP TABLE [dbo].[Brand]
GO

CREATE TABLE [dbo].[Brand]
(
    [ID]        [int] IDENTITY (1, 1) NOT NULL,               -- surrogate key
    [BrandName] [varchar] (50)  COLLATE Chinese_PRC_CI_AS NULL,
    [ComPan]    [varchar] (100) COLLATE Chinese_PRC_CI_AS NULL,
    [Pic]       [varchar] (100) COLLATE Chinese_PRC_CI_AS NULL,
    [Create]    [datetime] NULL
) ON [PRIMARY]
GO

运行截图:

总结:本程序可以针对某个特定网站进行简单的数据抓取,但通用的数据抓取方案还在研究中。由于各个网站的页面结构可能不一致,本程序的局限性很大,扩展性也很差...

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值