C#开发的一个定向搜索引擎的思路和源代码

目前,互联网上的开源搜索引擎技术已经十分成熟,无论是可靠性、灵活度、扩展能力,还是数据抓取分析、索引优化等方面,都已与商业搜索引擎相差不远。比较常见的搭配是 Lucene 与 Heritrix。本文介绍的搜索引擎程序是我一年前写的,虽然它很小、还只是个雏形,但已具备搜索引擎的基本功能,如链接分析、多线程、文本摘取、数据库访问以及可配置参数等。

    此程序的主要目的是对给定的站点进行迭代式文本搜索,并将搜索到的信息存入数据库以备查询。由于使用了多线程,此程序在配置不当的情况下可能会对被搜索站点的运行效率造成负面影响,严重时甚至可能导致被搜索站点瘫痪。凡因使用本程序造成的不良后果,均由使用者自行负责,请读者自行斟酌。

    开发环境:windows server 2003, .NET2.0 C#, notepad++, Sql Server 2005 Ent

    运行环境:windows 2000/xp/2003, .NET 2.0 Framework, Sql Server 2005 Ent

    MainEntry Class:程序的入口类,根据配置文件创建配置对象,数据库初始化,插入根节点,创建抓取器,并运行全部线程。

    SearchConfig Class:配置文件在运行时的体现。

    UrlData Class:超链接与标题对应关系的体现。

    Crawler Class:网页分析器,根据Url抓取网页内容并过滤次级Url。

    config.xml:配置文件。

-----------------------------------------------------------config.xml-------------------------------------------------------------------------------------------

<?xml version="1.0" encoding="GB2312"?>
<configuration>
<property>
   <name>DBServerAddress</name>
   <value>(local)</value>
   <!--数据库服务器的IP地址-->
</property>
<property>
   <name>DBUserName</name>
   <value>sa</value>
   <!--数据库的用户名-->
</property>
<property>
   <name>DBPassword</name>
   <value>td.sdf,2968eds</value>
   <!--数据库用户密码-->
</property>
<property>
   <name>DBName</name>
   <value>SearchEngine</value>
   <!--数据库名称-->
</property>
<property>
   <name>TablePrefix</name>
   <value>site1</value>
   <!--数据表前缀-->
</property>
<property>
   <name>ConnectionTimeout</name>
   <value>15</value>
   <!--尝试连接数据库存储区的等待时间,单位为秒,默认为15秒-->
</property>
<property>
   <name>MinPoolSize</name>
   <value>50</value>
   <!--连接池最小存活数,默认为0-->
</property>
<property>
   <name>MaxPoolSize</name>
   <value>200</value>
   <!--连接池的最大存活数,默认为100-->
</property>
<property>
   <name>ConnectionReset</name>
   <value>true</value>
   <!--是否重置池化连接的状态-->
</property>
<property>
   <name>ConnectionLifeTime</name>
   <value>0</value>
   <!--池化连接的最大生命周期-->
</property>
<property>
   <name>Urls</name>
   <value>http://www.sohu.com/</value>
   <!--要抓取内容的网站url-->
</property>
<property>
   <name>ThreadCount</name>
   <value>100</value>
   <!--同时执行的线程数-->
</property>
<property>
   <name>ContentType</name>
   <value>text/html</value>
   <!--可供接收的内容类型-->
</property>
<property>
   <name>CachePolicy</name>
   <value>NoCacheNoStore</value>
   <!--缓存策略-->
</property>
<property>
   <name>Encoding</name>
   <value>GB2312</value>
   <!--编码类型-->
</property>
<property>
   <name>Depth</name>
   <value>10</value>
   <!--网站内容攫取深度-->
</property>
<property>
   <name>LogFile</name>
   <value>./Logs.txt</value>
   <!--日志文件位置-->
</property>
<property>
   <name>SleepTicks</name>
   <value>0</value>
   <!--惰性值,单位毫秒,默认为500ms-->
</property>
</configuration>

--------------------------------------------------------end config.xml--------------------------------------------------------------------------------------

--------------------------------------------------------UrlData.cs-------------------------------------------------------------------------------------------

using System;

namespace zhaol {

/// <summary>
/// Pairs a hyperlink address with the title text that belongs to it.
/// </summary>
public class UrlData {

    // Backing storage for the two public properties; both start out empty.
    private string address = "";
    private string caption = "";

    /// <summary>Creates a pair from the given link address and title.</summary>
    /// <param name="url">The hyperlink address.</param>
    /// <param name="title">The title associated with the address.</param>
    public UrlData (string url, string title) {
        address = url;
        caption = title;
    }

    /// <summary>The hyperlink address.</summary>
    public string Url {
        get { return address; }
        set { address = value; }
    }

    /// <summary>The title associated with <see cref="Url"/>.</summary>
    public string Title {
        get { return caption; }
        set { caption = value; }
    }

}

}

---------------------------------------------------------------end UrlData.cs------------------------------------------------------------------------------

---------------------------------------------------------------SearchConfig.cs----------------------------------------------------------------------------

namespace zhaol {

/// <summary>
/// Run-time representation of the settings in config.xml. Every setting is kept as a
/// plain string and starts out as the empty string.
/// </summary>
public class SearchConfig {

    // Database connection settings.
    private string dbServerAddress = "";
    private string dbUserName = "";
    private string dbPassword = "";
    private string dbName = "";
    private string tablePrefix = "";
    private string connectionTimeout = "";
    private string minPoolSize = "";
    private string maxPoolSize = "";
    private string connectionReset = "";
    private string connectionLifeTime = "";

    // Crawling settings.
    private string urls = "";
    private string threadCount = "";
    private string contentType = "";
    private string cachePolicy = "";
    private string encoding = "";
    private string depth = "";
    private string logFile = "";
    private string sleepTicks = "";

    /// <summary>Address of the database server.</summary>
    public string DBServerAddress { get { return dbServerAddress; } set { dbServerAddress = value; } }

    /// <summary>Database login name.</summary>
    public string DBUserName { get { return dbUserName; } set { dbUserName = value; } }

    /// <summary>Database login password.</summary>
    public string DBPassword { get { return dbPassword; } set { dbPassword = value; } }

    /// <summary>Name of the database.</summary>
    public string DBName { get { return dbName; } set { dbName = value; } }

    /// <summary>Prefix placed in front of the table names.</summary>
    public string TablePrefix { get { return tablePrefix; } set { tablePrefix = value; } }

    /// <summary>Connection timeout setting.</summary>
    public string ConnectionTimeout { get { return connectionTimeout; } set { connectionTimeout = value; } }

    /// <summary>Minimum connection-pool size setting.</summary>
    public string MinPoolSize { get { return minPoolSize; } set { minPoolSize = value; } }

    /// <summary>Maximum connection-pool size setting.</summary>
    public string MaxPoolSize { get { return maxPoolSize; } set { maxPoolSize = value; } }

    /// <summary>Connection-reset setting for pooled connections.</summary>
    public string ConnectionReset { get { return connectionReset; } set { connectionReset = value; } }

    /// <summary>Lifetime setting for pooled connections.</summary>
    public string ConnectionLifeTime { get { return connectionLifeTime; } set { connectionLifeTime = value; } }

    /// <summary>URL of the site to crawl.</summary>
    public string Urls { get { return urls; } set { urls = value; } }

    /// <summary>Number of crawler threads.</summary>
    public string ThreadCount { get { return threadCount; } set { threadCount = value; } }

    /// <summary>Acceptable content type.</summary>
    public string ContentType { get { return contentType; } set { contentType = value; } }

    /// <summary>Cache policy setting.</summary>
    public string CachePolicy { get { return cachePolicy; } set { cachePolicy = value; } }

    /// <summary>Name of the text encoding.</summary>
    public string Encoding { get { return encoding; } set { encoding = value; } }

    /// <summary>Crawl depth setting.</summary>
    public string Depth { get { return depth; } set { depth = value; } }

    /// <summary>Location of the log file.</summary>
    public string LogFile { get { return logFile; } set { logFile = value; } }

    /// <summary>Pause setting, in milliseconds.</summary>
    public string SleepTicks { get { return sleepTicks; } set { sleepTicks = value; } }

}

}

----------------------------------------------------------------end SearchConfig.cs--------------------------------------------------------------------

 

----------------------------------------------------------------MainEntry.cs-------------------------------------------------------------------------------

using System;
using System.Text;
using System.Xml;
using System.Xml.XPath;
using System.Data;
using System.Data.SqlClient;
using System.Threading;
using zhaol;

//Begin Of Namespace zhaol
namespace zhaol {

//Begin Of Class MainEntry
public class MainEntry {
  
   public static void Main (string [] args) {
   
    XmlReaderSettings settings = new XmlReaderSettings();
    settings.CheckCharacters = true;
    settings.CloseInput = true;
    settings.ConformanceLevel = ConformanceLevel.Document;
    settings.ProhibitDtd = true;
    settings.IgnoreComments = true;   
    settings.IgnoreProcessingInstructions = true;   
    settings.IgnoreWhitespace = true;
    settings.ValidationType = ValidationType.None;
    settings.LineNumberOffset = 0;
    settings.LinePositionOffset = 0;
    SearchConfig config = null;
   
    try {
    
     XmlReader xReader = XmlReader.Create(@"./config.xml", settings);
     Console.WriteLine("配置读取完毕.");
     config = new SearchConfig();
     while (xReader.Read()) {
     
      if (xReader.HasValue) {
      
       if (xReader.Value == "version=/"1.0/" encoding=/"GB2312/"") continue;
       if (xReader.Value == "DBServerAddress") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.DBServerAddress = xReader.Value;
        Console.WriteLine("数据库服务器地址: " + config.DBServerAddress);
        continue;
       
       }else if (xReader.Value == "DBUserName") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.DBUserName = xReader.Value;
        Console.WriteLine("数据库用户: " + config.DBUserName);
        continue;
       
       }else if (xReader.Value == "DBPassword") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.DBPassword = xReader.Value;
        Console.WriteLine("数据库密码: ************");
        continue;
       
       }else if (xReader.Value == "DBName") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.DBName = xReader.Value;
        Console.WriteLine("数据库名称: " + config.DBName);
        continue;
       
       }else if (xReader.Value == "TablePrefix") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.TablePrefix = xReader.Value;
        Console.WriteLine("表前缀: " + config.TablePrefix);
        continue;
       
       }else if (xReader.Value == "ConnectionTimeout") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.ConnectionTimeout = xReader.Value;
        Console.WriteLine("DB连接超时时间: " + config.ConnectionTimeout + "秒");
        continue;
       
       }else if (xReader.Value == "MinPoolSize") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.MinPoolSize = xReader.Value;
        Console.WriteLine("连接池最小存活数: " + config.MinPoolSize);
        continue;
       
       }else if (xReader.Value == "MaxPoolSize") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.MaxPoolSize = xReader.Value;
        Console.WriteLine("连接池最大存活数: " + config.MaxPoolSize);
        continue;
       
       }else if (xReader.Value == "ConnectionReset") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.ConnectionReset = xReader.Value;
        Console.WriteLine("是否重置DB连接状态: " + config.ConnectionReset);
        continue;
       
       }else if (xReader.Value == "ConnectionLifeTime") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.ConnectionLifeTime = xReader.Value;
        Console.WriteLine("池化连接的最大生存周期: " + config.ConnectionLifeTime + "秒");
        continue;
       
       }else if (xReader.Value == "Urls") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.Urls = xReader.Value;
        Console.WriteLine("目标网站: " + config.Urls);
        continue;
       
       }else if (xReader.Value == "ThreadCount") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.ThreadCount = xReader.Value;
        Console.WriteLine("并发线程数: " + config.ThreadCount);
        continue;
       
       }else if (xReader.Value == "ContentType") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.ContentType = xReader.Value;
        Console.WriteLine("内容类型: " + config.ContentType);
        continue;
       
       }else if (xReader.Value == "CachePolicy") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.CachePolicy = xReader.Value;
        Console.WriteLine("缓存策略: " + config.CachePolicy);
        continue;
       
       }else if (xReader.Value == "Encoding") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.Encoding = xReader.Value;
        Console.WriteLine("编码类型: " + config.Encoding);
        continue;
       
       }else if (xReader.Value == "Depth") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.Depth = xReader.Value;
        Console.WriteLine("攫取深度: " + config.Depth + "层");
        continue;
       
       }else if (xReader.Value == "LogFile") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.LogFile = xReader.Value;
        Console.WriteLine("日志文件: " + config.LogFile);
        continue;
       
       }else if (xReader.Value == "SleepTicks") {
       
        for (int i = 0; i < 3; i++) xReader.Read();
        config.SleepTicks = xReader.Value;
        Console.WriteLine("惰性值: " + config.SleepTicks + "毫秒");
        continue;
       
       }
      
      }
     
     }
    
    }catch (Exception e) {
    
     Console.WriteLine(e.ToString());
    
    }finally {}
   
    CreateTables(config);
    InsertRoot(config);
    for (int i = 0; i < Int32.Parse(config.ThreadCount); i++) {
    
     new Thread(new ThreadStart(new Crawler(config).run)).Start();
    
    }
    Console.WriteLine("线程已全部启动.");
    Console.WriteLine("任务开始时间: " + DateTime.Now.ToString());
   
   }
  
   public static void CreateTables (SearchConfig config) {   
   
    string createScript = "create table " + config.TablePrefix + "_Contents (Id bigint identity(0,1) primary key, RequestUrl varchar(8000) not null, Encoding varchar(8000) not null, ResponseHeaders varchar(8000) not null, Depth int not null, TimeStm bigint not null, Title varchar(max) not null, Contents varchar(max) not null); create table " + config.TablePrefix + "_Urls (Id bigint identity(0,1) primary key, Url varchar(8000) not null, Title varchar(max) not null, Status varchar(1000) not null, TimeStm bigint not null)";
    SqlConnection connection = null;
    try {
   
     connection = new SqlConnection(MainEntry.CreateDBUrl(config));
     connection.Open();
     SqlCommand command = new SqlCommand(createScript, connection);
     command.ExecuteNonQuery();
     Console.WriteLine("数据表已创建.");
    
    }catch (Exception e) {
    
     Console.WriteLine(e.ToString());
    
    }finally {
   
     if (connection != null && connection.State != ConnectionState.Closed) connection.Close();
   
    }   
   
   }
  
   public static void InsertRoot (SearchConfig config) {
   
    SqlConnection connection = null;
    try {
    
     connection = new SqlConnection(MainEntry.CreateDBUrl(config));
     connection.Open();
     SqlCommand command = new SqlCommand("insert into " + config.TablePrefix + "_Urls (Url, Title, Status, TimeStm) values (@Url, @Title, @Status, @TimeStm)", connection);
     command.Parameters.AddWithValue("@Url", config.Urls.Trim());
     command.Parameters.AddWithValue("@Title", "NA");
     command.Parameters.AddWithValue("@Status", "Pending");
     command.Parameters.AddWithValue("@TimeStm", DateTime.Now.Ticks);
     command.ExecuteNonQuery();
     Console.WriteLine("根节点已插入.");
    
    }catch (Exception e) {
    
     Console.WriteLine(e.ToString());
    
    }finally {
    
     if (connection != null && connection.State != ConnectionState.Closed) connection.Close();
    
    }
   
   }
  
   public static string CreateDBUrl (SearchConfig config) {
   
    StringBuilder DBUrlBuilder = new StringBuilder("");
    DBUrlBuilder.Append("Persist Security Info=false;");
    DBUrlBuilder.Append("Integrated Security=false;");
    DBUrlBuilder.Append("MultipleActiveResultSets=true;");
    DBUrlBuilder.Append("Packet Size=8192;");
    DBUrlBuilder.Append("Type System Version=SQL Server 2005;");
    DBUrlBuilder.Append("Data Source=");
    DBUrlBuilder.Append(config.DBServerAddress);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("User Id=");
    DBUrlBuilder.Append(config.DBUserName);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Password=");
    DBUrlBuilder.Append(config.DBPassword);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Database=");
    DBUrlBuilder.Append(config.DBName);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Connection Timeout=");
    DBUrlBuilder.Append(config.ConnectionTimeout);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Pooling=true;");
    DBUrlBuilder.Append("Min Pool Size=");
    DBUrlBuilder.Append(config.MinPoolSize);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Max Pool Size=");
    DBUrlBuilder.Append(config.MaxPoolSize);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Connection Reset=");
    DBUrlBuilder.Append(config.ConnectionReset);
    DBUrlBuilder.Append(";");
    DBUrlBuilder.Append("Connection Lifetime=");
    DBUrlBuilder.Append(config.ConnectionLifeTime);
    String DBUrl = DBUrlBuilder.ToString();
   
    //Console.WriteLine("数据库连接字符串: " + DBUrl);
    return DBUrl;
   
   }
  
}
//End Of Class MainEntry

}
//End Of Namespace zhaol

-------------------------------------------------------------------end MainEntry.cs----------------------------------------------------------------------

-------------------------------------------------------------------Crawler.cs--------------------------------------------------------------------------------

using System;
using System.Text;
using System.Net;
using System.Net.Cache;
using System.Text.RegularExpressions;
using System.Collections;
using System.Data;
using System.Data.SqlClient;
using System.Threading;
using zhaol;

namespace zhaol {

public class Crawler {
  
   private WebClient pClient = null;
   private SearchConfig pConfig = null;
   private string pDBUrl = null;
  
   public WebClient Client {
   
    get { return this.pClient; }
    set { this.pClient = value; }
   
   }
  
   public SearchConfig Config {
   
    get { return this.pConfig; }
    set { this.pConfig = value; }
   
   }
  
   public string DBUrl {
   
    get { return this.pDBUrl; }
    set { this.pDBUrl = value; }
   
   }
  
   public Crawler (SearchConfig config) {
   
    this.Config = config;
    this.DBUrl = MainEntry.CreateDBUrl(this.Config);
    this.Client = new WebClient();
    this.Client.Encoding = Encoding.GetEncoding(this.Config.Encoding);
    this.Client.CachePolicy = new RequestCachePolicy(RequestCacheLevel.NoCacheNoStore);
   
   }
  
   public void run () {
   
    while (true) {
   
     UrlData uData = null;
     lock (this) {
          
      uData = this.getPendingUrl();
     
     }
     if (uData == null) continue;
     if (String.ReferenceEquals(uData.Url, null) || uData.Url.Trim() == "") continue;
     try {
     
      string contents = this.Client.DownloadString(uData.Url);
      if (String.ReferenceEquals(contents, null) || contents.Trim() == "") continue;
      this.insertContents(uData.Url.Trim(), uData.Title.Trim(), this.Config.Encoding.Trim(), this.Client.ResponseHeaders, this.countDepth(uData.Url.Trim()), contents.Trim());
      lock (this) {
     
       this.insertUrls(uData.Url.Trim(), contents.Trim());      
       this.changeUrlState(uData.Url);
      
      }
     
     
     
     }catch (Exception e) {
     
      Console.WriteLine(e.ToString());
      Console.WriteLine("空连接: " + "[" + uData.Url + "]");
     
     }finally {}
    
     int sleepTicks = 500;
     Int32.TryParse(this.Config.SleepTicks, out sleepTicks);   
     if (sleepTicks <= 0) continue;
     else Thread.Sleep(sleepTicks);    
    
    }
   
   }
  
   public UrlData getPendingUrl () {
   
    SqlConnection connection = null;
    SqlDataReader reader = null;
    UrlData uData = null;
    try {
    
     connection = new SqlConnection(this.pDBUrl);
     connection.Open();
     SqlCommand command = new SqlCommand("select top 1 * from " + this.Config.TablePrefix + "_Urls where Status = 'Pending' order by TimeStm asc", connection);
     reader = command.ExecuteReader();
     if (reader.HasRows) {     
     
      reader.Read();
      uData = new UrlData(((string)reader["Url"]).Trim(), ((string)reader["Title"]).Trim());
      long Id = (long)reader["Id"];
      reader.Close();
      command.CommandText = "update " + this.Config.TablePrefix + "_Urls set Status = 'Running' where Id = @Id";
      command.Parameters.AddWithValue("@Id", Id);
      command.ExecuteNonQuery();
     
     
     }    
    
    }catch (Exception e) {
    
     Console.WriteLine(e.ToString());
    
    }finally {
    
     if (reader != null && !reader.IsClosed) reader.Close();
     if (connection != null && connection.State != ConnectionState.Closed) connection.Close();
    
    }
    return uData;
   
   }
  
   public int countDepth (string Url) {
   
    string tempUrl = Url.Replace(this.Config.Urls, "");
    Regex regex = new Regex(@"/", RegexOptions.Compiled);
    MatchCollection mCollection = regex.Matches(tempUrl);
    return mCollection.Count;
   
   }
  
   public void insertContents (string url, string title, string encoding, WebHeaderCollection headers, int depth, string contents) {   
  
    SqlConnection connection = null;
    try {
    
     StringBuilder bHeaders = new StringBuilder("");
     for (int i = 0; i < headers.Keys.Count; i++) {
     
      bHeaders.Append("[" + headers.Keys[i] + ">>>" + headers[headers.Keys[i]] + "]");
     
     }
     connection = new SqlConnection(this.DBUrl);
     connection.Open();
     SqlCommand command = new SqlCommand("insert into " + this.Config.TablePrefix + "_Contents (RequestUrl, Title, Encoding, ResponseHeaders, Depth, TimeStm, Contents) values (@Url, @Title, @Encoding, @Headers, @Depth, @TimeStm, @Contents)", connection);
     command.Parameters.AddWithValue("@Url", url);
     command.Parameters.AddWithValue("@Title", title);
     command.Parameters.AddWithValue("@encoding", encoding);
     command.Parameters.AddWithValue("@Headers", bHeaders.ToString());
     command.Parameters.AddWithValue("@Depth", depth);
     command.Parameters.AddWithValue("@TimeStm", DateTime.Now.Ticks);
     command.Parameters.AddWithValue("@Contents", contents);
     command.ExecuteNonQuery();
    
    }finally {
    
     if (connection != null && connection.State != ConnectionState.Closed) connection.Close();
    
    }
   
   }
  
   public void insertUrls (string baseUrl, string contents) {
   
    if (new Regex("^http://([^/<>/"]*/)*[^/<>/"]+$", RegexOptions.IgnoreCase).IsMatch(this.Config.Urls)) {
     this.Config.Urls = new Regex("^http://([^/<>/"]*/)*", RegexOptions.IgnoreCase).Match(this.Config.Urls).Value;
    }
    contents = contents.Replace("/n", "");
    Regex hyperLinkReg = new Regex(@"<a[^>]+href=/s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>/s]+))/s*[^>]*>(?<text>.*?)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);
    MatchCollection matchCollection = hyperLinkReg.Matches(contents);
    ArrayList urlDataList = new ArrayList();
    Regex hrefReg = new Regex("href//s*=//s*(?:/"(?<1>[^/"]*)/"|(?<1>//S+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    Regex hReg = new Regex(@"href=""", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    Regex hReg2 = new Regex(@"""", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    Regex titleReg = new Regex(@"<a.{1,}>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    Regex titleReg2 = new Regex(@"</a>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    if (matchCollection.Count > 0) {
    
     for (int i = 0; i < matchCollection.Count; i++) {
     
      string value = matchCollection[i].Value;
      if (value.Contains("javascript")) continue;
      if (value.Contains("&amp;")) value = value.Replace("&amp;", "&");
      string tempHref = hReg2.Replace(hReg.Replace(hrefReg.Match(value).Value, ""), "");
      string href = "";
      if (new Regex("^http://.*", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
      
       if (new Regex("^" + Config.Urls + "[^<>/"]*", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
       
        if (new Regex("^" + this.Config.Urls + "([^/<>/"]*/)*([0-9a-zA-Z//-]*)$", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
       
         href = tempHref + "/";
         urlDataList.Add(new UrlData(href.Trim(), titleReg.Replace(titleReg2.Replace(value, ""), "").Trim()));
        
        }else {
        
         href = tempHref;
         urlDataList.Add(new UrlData(href.Trim(), titleReg.Replace(titleReg2.Replace(value, ""), "").Trim()));
        
        }
       
       }else {
       
        continue;

       }
      
      }else if (new Regex(@"^/[^<>""]*", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
      
       if (new Regex("^/([^/<>/"]*/)*([0-9a-zA-Z//-]*)$", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
      
        href = Config.Urls + tempHref.Remove(0, 1) + "/";
        urlDataList.Add(new UrlData(href.Trim(), titleReg.Replace(titleReg2.Replace(value, ""), "").Trim()));
       
       }else {
       
        href = Config.Urls + tempHref.Remove(0, 1);
        urlDataList.Add(new UrlData(href.Trim(), titleReg.Replace(titleReg2.Replace(value, ""), "").Trim()));
       
       }
      
      }else if (new Regex(@"^[^/][^<>""]*", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
      
       if (new Regex("^[^/]([^/<>/"]*/)*([0-9a-zA-Z//-]*)$", RegexOptions.IgnoreCase).IsMatch(tempHref)) {
       
        tempHref = tempHref + "/";
       
       }
      
       if (new Regex("^" + this.Config.Urls + "([^/<>/"]*/)*[^/<>/"]+$", RegexOptions.IgnoreCase).IsMatch(baseUrl)) {
        
        Match baseMatch = new Regex("^" + this.Config.Urls + "([^/<>/"]*/)*", RegexOptions.IgnoreCase).Match(baseUrl);
        href = baseMatch.Value + tempHref;
        urlDataList.Add(new UrlData(href.Trim(), titleReg.Replace(titleReg2.Replace(value, ""), "").Trim()));
       
       
       }else if (new Regex("^" + this.Config.Urls + "([^/<>/"]*/)*$", RegexOptions.IgnoreCase).IsMatch(baseUrl)) {
       
        href = baseUrl + tempHref;
        urlDataList.Add(new UrlData(href.Trim(), titleReg.Replace(titleReg2.Replace(value, ""), "").Trim()));
       
       }
      
      }else {
      
       continue;
      
      }           
     
     }
    
    }
   
    SqlConnection connection = null;
    SqlDataReader reader = null;
    try {
    
     connection = new SqlConnection(this.DBUrl);
     connection.Open();
     SqlCommand command1 = null;   
     SqlCommand command2 = null;
     for (int i = 0; i < urlDataList.Count; i++) {
     
      command1 = new SqlCommand("select Id from " + this.Config.TablePrefix + "_Urls where Url = @Urll", connection);
      command2 = new SqlCommand("insert into " + this.Config.TablePrefix + "_Urls (Url, Title, Status, TimeStm) values (@Url, @Title, @Status, @TimeStm)", connection);
      command1.Parameters.AddWithValue("@Urll", ((UrlData)urlDataList[i]).Url);
      reader = command1.ExecuteReader();
      if (reader.HasRows) {     
      
       if (reader != null && !reader.IsClosed) reader.Close();
       continue;
      
      }
      if (reader != null && !reader.IsClosed) reader.Close();
      command2.Parameters.AddWithValue("@Url", ((UrlData)urlDataList[i]).Url);
      command2.Parameters.AddWithValue("@Title", ((UrlData)urlDataList[i]).Title);
      command2.Parameters.AddWithValue("@Status", "Pending");
      command2.Parameters.AddWithValue("@TimeStm", DateTime.Now.Ticks);
      command2.ExecuteNonQuery();
     
     }
    
    }finally {
    
     if (reader != null && !reader.IsClosed) reader.Close();
     if (connection != null && connection.State != ConnectionState.Closed) connection.Close();
    
    }
   
   
   }
  
   public void changeUrlState (string url) {
   
    SqlConnection connection = null;
    try {
    
     connection = new SqlConnection(this.DBUrl);
     connection.Open();
     SqlCommand command = new SqlCommand("update " + this.Config.TablePrefix + "_Urls set Status = @Status where Url = @Url", connection);
     command.Parameters.AddWithValue("@Status", "Complete");
     command.Parameters.AddWithValue("@Url", url);
     command.ExecuteNonQuery();
    
    }finally {
    
     if (connection != null && connection.State != ConnectionState.Closed) connection.Close();
    
    }
   
   }
  
}

}

-----------------------------------------------------------------end Crawler.cs-----------------------------------------------------------------------------

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值