【原】获取网页所有链接——简易搜索引擎雏形（C#,已测试通过,有CODE）

最新推荐文章于 2016-11-23 22:02:00 发布

weixin_34315485

最新推荐文章于 2016-11-23 22:02:00 发布

阅读量87

点赞数

文章标签： c# ui

最近一直在看搜索引擎方面的资料，在Google上找了很多，可是没有找到一个下载下来可以直接使用的！

没有办法只能看别人的代码，然后一点一点的调试！功夫不负有心人，终于算是修得正果！贴代码：

Code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;

namespace SearchEngineConsoleApp
{
    class Program
    {
        /// <summary>
        /// Console entry point: asks for a page address, downloads that page, extracts its
        /// distinct http:// links and writes them to hyperlinks.xml.
        /// </summary>
        public static void Main()
        {
            // Used when the user just presses Enter or when standard input is closed.
            const string defaultUrl = "http://www.cnblogs.com/OceanChen/";

            Console.Write("请输入一个网页地址：");

            // The prompt asks for an address, so actually read one instead of ignoring the user.
            string input = Console.ReadLine();
            string strurl = (input == null || input.Trim().Length == 0) ? defaultUrl : input.Trim();

            // StartsWith cannot throw on input shorter than the prefix (Substring(0, 7) could),
            // and an explicit https:// address must not get a second scheme glued in front of it.
            if (!strurl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !strurl.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                strurl = "http://" + strurl;
            }

            // NOTE(review): in the published listing each message below was split across three
            // lines, which does not compile. The trailing "..." is a reconstruction - confirm.
            Console.WriteLine("正在获取页面代码，请稍侯...");
            string strcode = getpagesource(strurl);

            Console.WriteLine("正在提取超链接，请稍侯...");
            ArrayList allinks = gethyperlinks(strcode);

            Console.WriteLine("正在写入文件，请稍侯...");
            Writetoxml(strurl, allinks);
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Absolute address of the page to fetch.</param>
        /// <returns>The whole response body, decoded as gb2312.</returns>
        static string getpagesource(string url)
        {
            Uri uri = new Uri(url);

            HttpWebRequest hwreq = (HttpWebRequest)WebRequest.Create(uri);

            // Request settings must be applied before GetResponse() sends the request;
            // assigning them afterwards has no effect on what goes over the wire.
            // HTTP method tokens are case-sensitive, hence the canonical upper-case "GET".
            hwreq.Method = "GET";
            hwreq.KeepAlive = false;

            // Dispose the response and the reader so the connection is released even if reading throws.
            using (HttpWebResponse hwres = (HttpWebResponse)hwreq.GetResponse())
            using (StreamReader reader = new StreamReader(hwres.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
            {
                // NOTE(review): the body is always decoded as gb2312, whatever charset the server declares.
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts every distinct http:// address found in <paramref name="htmlcode"/>.
        /// </summary>
        /// <param name="htmlcode">Page source to scan; null or empty yields an empty list.</param>
        /// <returns>The distinct matches as strings, ordered by <see cref="ArrayList.Sort()"/>.</returns>
        static ArrayList gethyperlinks(string htmlcode)
        {
            ArrayList al = new ArrayList();

            if (string.IsNullOrEmpty(htmlcode))
            {
                return al;
            }

            string strregex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
            Regex r = new Regex(strregex, RegexOptions.IgnoreCase);

            // Filter duplicate URLs. A hash set answers "seen before?" in constant time, where
            // rescanning the result list for every match was quadratic. Ordinal comparison
            // keeps the same notion of equality as the == operator on strings.
            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);

            foreach (Match m in r.Matches(htmlcode))
            {
                if (seen.Add(m.Value))
                {
                    al.Add(m.Value);
                }
            }

            al.Sort();

            return al;
        }

        /// <summary>
        /// Writes the links to the relative path hyperlinks.xml as indented UTF-8 XML.
        /// Each link becomes an element named after its domain suffix (see getdomain)
        /// with the link itself as the element text.
        /// </summary>
        /// <param name="strurl">Address the links were extracted from; recorded in a leading XML comment.</param>
        /// <param name="alhyperlinks">The link strings to write.</param>
        static void Writetoxml(string strurl, ArrayList alhyperlinks)
        {
            // using closes (and thereby flushes) the writer even when a write throws,
            // so the file handle is never left open.
            using (XmlTextWriter Writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8))
            {
                Writer.Formatting = Formatting.Indented;
                Writer.WriteStartDocument(false);
                //Writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
                Writer.WriteComment("提取自" + strurl + "的超链接");

                // Two nested <hyperlinks> elements; the inner one carries the timestamp attribute.
                Writer.WriteStartElement("hyperlinks");
                Writer.WriteStartElement("hyperlinks", null);
                Writer.WriteAttributeString("datetime", DateTime.Now.ToString());

                foreach (string str in alhyperlinks)
                {
                    string title = getdomain(str);
                    string body = str;
                    Writer.WriteElementString(title, null, body);
                }

                Writer.WriteEndElement();
                Writer.WriteEndElement();
            }
        }

        // Matches a known suffix only when it is the last label of the host: an optional scheme,
        // then the authority (no '/', '?' or '#'), then ".suffix", an optional ":port", and
        // finally either a path/query/fragment delimiter or the end of the string.
        private static readonly Regex DomainSuffixRegex = new Regex(
            @"^(?:[a-z][a-z0-9+.\-]*://)?[^/?#]*\.(com|net|cn|org|gov)(?::\d*)?(?=[/?#]|$)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Returns the domain suffix of a URL when it is one of com, net, cn, org or gov.
        /// </summary>
        /// <param name="strurl">The URL to classify; null or empty is treated as unknown.</param>
        /// <returns>
        /// The suffix exactly as written in the URL (without the dot), or "other" when the
        /// host does not end in one of the known suffixes.
        /// </returns>
        /// <remarks>
        /// The suffix is recognised whether or not a '/' follows the host, so
        /// "http://www.cnblogs.com" yields "com". Text in the path such as "/file.com/"
        /// is no longer mistaken for the domain suffix.
        /// </remarks>
        static string getdomain(string strurl)
        {
            if (string.IsNullOrEmpty(strurl))
            {
                return "other";
            }

            Match m = DomainSuffixRegex.Match(strurl);

            return m.Success ? m.Groups[1].Value : "other";
        }

    }
}

这段代码只实现了部分功能，即从单个页面抓取不重复的链接。后续还有一部分工作要做：对抓取到的站内链接继续循环抓取，利用正则提取单一站点的全部链接，目的是收集自己最感兴趣的内容。

为了让读者更有信心，下面继续贴一段重量级的代码：这段代码用来统计关键字在指定 URL 页面中出现的次数，已测试通过！见代码：

Search.aspx 页面代码如下：

Code
<%@ Page Language="C#" AutoEventWireup="false" Inherits="SearchEngine" Src="search.aspx.cs" %>

<script language="c#" runat="server">
    /// <summary>
    /// Click handler for the "search!" button: runs the search over the entered URLs,
    /// shows a summary line in the info label, hides the form and binds the result list.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void search(Object sender, EventArgs e)
    {
        if (SearchWebSites(keyword.Text, urls.Text))
        {
            // The keyword is user input echoed back into markup; HTML-encode it so that
            // characters such as '<' or '"' are displayed instead of being interpreted.
            string safeKeyword = Server.HtmlEncode(keyword.Text);

            info.Text = "Searched <font color=\"red\">" + SearchResults.Count + "</font> web page(s) ";
            info.Text += "on the keyword <font color=\"red\">\"" + safeKeyword + "</font>\". ";
            info.Text += "Total search time was <font color=\"red\">" + timeSpent + "</font>";
            SearchForm.Visible = false;
            ResultList.DataSource = SearchResults;
            ResultList.DataBind();
        }
    }
</script>

<html>
<head>
    <title>Multi-threaded Search Engine</title>
    <style>
        /* Shared text style applied to the label, tables and form controls on this page. */
        .BodyText
        {
            font-family: verdana;
            font-size: 12px;
            /* Hex colours need the leading '#'; without it the declaration is invalid CSS. */
            color: #333333;
        }
    </style>
</head>
<body>
    <asp:Label ID="info" class="BodyText" Text="URL of the web sites to search, one url per line."
        runat="server" /><br />
    <asp:Repeater ID="ResultList" runat="server">
        <HeaderTemplate>
            <table class="BodyText" border="0" cellpadding="3" cellspacing="3">
                <tr>
                    <td>
                        <b>Found</b>
                    </td>
                    <td>
                        <b>Web Page Title</b>
                    </td>
                    <td>
                        <b>Web Page URL</b>
                    </td>
                    <td>
                        <b>Searched Time</b>
                    </td>
                </tr>
        </HeaderTemplate>
        <ItemTemplate>
            <tr>
                <td>
                    <%# DataBinder.Eval(Container.DataItem, "instanceCount") %>
                </td>
                <td>
                    <%# DataBinder.Eval(Container.DataItem, "pageTitle") %>
                </td>
                <td>
                    <%# DataBinder.Eval(Container.DataItem, "pageURL") %>
                </td>
                <td>
                    <%# DataBinder.Eval(Container.DataItem, "timeSpent") %>
                </td>
            </tr>
        </ItemTemplate>
        <FooterTemplate>
            </table>
        </FooterTemplate>
    </asp:Repeater>
    <form id="SearchForm" runat="server">
    <table class="BodyText">
        <tr>
            <td>
                keyword:
            </td>
            <td>
                <asp:TextBox class="BodyText" Text="news" ID="keyword" runat="server" />
            </td>
        </tr>
        <tr>
            <td valign="top">
                urls:
            </td>
            <td>
                <asp:TextBox class="BodyText" Text="" ID="urls" Rows="10" Columns="30" TextMode="MultiLine"
                    runat="server" />
            </td>
        </tr>
        <tr>
            <td align="right" colspan="2">
                <asp:Button class="BodyText" Text="search!" type="submit" OnClick="search" runat="server"
                    ID="Button1" />
            </td>
        </tr>
    </table>
    </form>
</body>
</html>

Search.aspx.cs 后台代码如下：

Code
using System;
using System.IO;
using System.Net;
using System.Web;
using System.Web.UI;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;

/// <summary>
/// Page base class for Search.aspx: searches each requested URL on its own thread and
/// exposes the per-page results together with the total elapsed time.
/// </summary>
public class SearchEngine : Page
{
    // One WebPage per searched URL; replaced on every SearchWebSites call.
    private ArrayList _pages;
    // Duration of the most recent SearchWebSites call.
    private TimeSpan _timeSpent;

    /// <summary>
    /// Returns an ArrayList of WebPage objects,
    /// which contains the search results information.
    /// </summary>
    public ArrayList SearchResults
    {
        get { return _pages; }
    }

    /// <summary>
    /// A TimeSpan object. It lets us know how long the entire search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// Searches every listed web page for the keyword, one thread per page,
    /// and waits until all of them have finished.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pURLs">
    /// List of URLs, one per line. Lines may end in "\n" or "\r\n"; blank lines and
    /// surrounding whitespace are ignored. Null is treated as an empty list.
    /// </param>
    /// <returns>Always true once all pages have been processed.</returns>
    public bool SearchWebSites(string keyword, string pURLs)
    {
        // Stopwatch measures elapsed time and is unaffected by system clock adjustments.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        _pages = new ArrayList();

        // Split on both line-break characters and drop empty entries: splitting on '\n' alone
        // leaves a trailing '\r' on each URL and turns an empty box into one empty "URL",
        // which WebRequest.Create rejects.
        string[] lines = (pURLs ?? string.Empty).Split(
            new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        ArrayList workers = new ArrayList();
        foreach (string rawLine in lines)
        {
            string url = rawLine.Trim();
            if (url.Length == 0)
            {
                continue;
            }

            // create a WebPage object for each url and remember it as a result row
            WebPage wp = new WebPage(keyword, url);
            _pages.Add(wp);

            // run the page's search() on its own thread
            Thread worker = new Thread(new ParameterizedThreadStart(RunSearch));
            workers.Add(worker);
            worker.Start(wp);
        }

        // wait for all the threads to finish
        foreach (Thread worker in workers)
        {
            worker.Join();
        }

        _timeSpent = timer.Elapsed;
        return true;
    }

    // Thread entry point. An exception that escapes a thread's entry method is unhandled and
    // terminates the process, so a single unreachable or malformed URL must be contained here.
    // The failed page stays in the result list with its default (zero / empty) values.
    private static void RunSearch(object state)
    {
        WebPage page = (WebPage)state;
        try
        {
            page.search();
        }
        catch (Exception ex)
        {
            System.Diagnostics.Trace.TraceWarning(
                "Search of '{0}' failed: {1}", page.pageURL, ex.Message);
        }
    }
}
/// <summary>
/// The class that contains information for each searched web page.
/// </summary>
public class WebPage
{
    // Fixed patterns are built once and shared by all instances instead of being
    // recompiled on every call. Regex instances are immutable, so sharing them between
    // the per-page search threads is safe.

    // Captures (group "c") each run of text that follows a '>' up to the next '<'.
    private static readonly Regex TextBetweenTagsRegex =
        new Regex(">(?:(?<c>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Captures (group "t") the text that follows an opening <title> tag.
    private static readonly Regex TitleTagRegex =
        new Regex("(?:<\\s*title\\s*>(?<t>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // private member fields.
    private int _instanceCount;
    private string _pageURL;
    private string _pageTitle;
    private string _keyword;
    private TimeSpan _timeSpent;

    /// <summary>
    /// A TimeSpan object. It lets us know how long the page search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// How many times the search keyword appears on the page.
    /// </summary>
    public int instanceCount
    {
        get { return _instanceCount; }
    }

    /// <summary>
    /// The URL of the searched page.
    /// </summary>
    public string pageURL
    {
        get { return _pageURL; }
    }

    /// <summary>
    /// The title of the searched page.
    /// </summary>
    public string pageTitle
    {
        get { return _pageTitle; }
    }

    public WebPage() { }

    /// <summary>
    /// A parameterized constructor of the WebPage class.
    /// </summary>
    /// <param name="keyword">The keyword to search for (matched literally, ignoring case).</param>
    /// <param name="pageURL">The URL to connect to.</param>
    public WebPage(string keyword, string pageURL)
    {
        _keyword = keyword;
        _pageURL = pageURL;
    }

    /// <summary>
    /// Connects to the page, downloads its content, then fills in the page title,
    /// the keyword count and the time spent.
    /// </summary>
    /// <remarks>
    /// Exceptions raised while creating the request or reading the response are not
    /// caught here; they propagate to the caller.
    /// </remarks>
    public void search()
    {
        // Stopwatch measures elapsed time and is unaffected by system clock adjustments.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

        WebRequest webreq = WebRequest.Create(_pageURL);

        string pageCode;
        // Dispose the response and the reader so the connection is released even if reading throws.
        using (WebResponse webresp = webreq.GetResponse())
        // UTF-8 decodes plain ASCII pages identically, while ASCII decoding replaces every
        // non-ASCII character and so could never match or display non-English text.
        // NOTE(review): the charset declared by the server is still not consulted.
        using (StreamReader sr = new StreamReader(webresp.GetResponseStream(), Encoding.UTF8))
        {
            // Read the body as-is: appending ReadLine() results without a separator glues
            // the last word of one line to the first word of the next.
            pageCode = sr.ReadToEnd();
        }

        _pageTitle = getPageTitle(pageCode);
        _instanceCount = countInstance(getPureContent(pageCode));

        _timeSpent = timer.Elapsed;
    }

    // Counts the case-insensitive occurrences of the keyword in str.
    // The keyword is escaped so it is matched literally: unescaped, characters such as
    // '(' or '*' make the pattern invalid or change its meaning. An empty keyword would
    // match at every position, so it is reported as zero occurrences.
    private int countInstance(string str)
    {
        if (string.IsNullOrEmpty(_keyword) || string.IsNullOrEmpty(str))
        {
            return 0;
        }

        Regex rx = new Regex(Regex.Escape(_keyword), RegexOptions.IgnoreCase);
        return rx.Matches(str).Count;
    }

    // Collects every run of text found between a '>' and the next '<', HTML-decodes it,
    // and returns the runs joined with (and followed by) a single space.
    private string getPureContent(string str)
    {
        StringBuilder sb = new StringBuilder();
        for (Match mt = TextBetweenTagsRegex.Match(str); mt.Success; mt = mt.NextMatch())
        {
            sb.Append(HttpUtility.HtmlDecode(mt.Groups["c"].Value));
            sb.Append(" ");
        }
        return sb.ToString();
    }

    // Returns the text of the first <title> tag, trimmed of surrounding whitespace
    // (titles are often written on their own line), or "" when the page has none.
    private string getPageTitle(string str)
    {
        Match mt = TitleTagRegex.Match(str);
        return mt.Success ? mt.Groups["t"].Value.Trim() : "";
    }
}

再推荐两篇文章：

http://www.codeproject.com/KB/applications/SearchDotnet.aspx （Internal Site Search Engine 测试通过）

http://www.codeproject.com/KB/IP/Searcharoo_4.aspx （C# search engine: refactored to search Word, PDF and more）

测试通过，不过有点小麻烦，得学会序列化和反序列化（这里采用二进制）。解决问题方法是：

1. 随便找4个文件，分别命名为：plaintext.txt，Kilimanjaro.pdf，Decorator.ppt，Marathoning.doc 放在目录 content 中；

2. 根据错误提示（提示是英文的；注意提示上方的链接，该链接说明了如何序列化和反序列化二进制文件以及 XML 文件），创建二进制文件 z_searcharoo.dat 并放在特定的目录下，然后运行即可！

weixin_34315485

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【原】获取网页所有链接——简易搜索引擎雏形（C#,已测试通过,有CODE）

最近一直在看搜索引擎方面的资料，在Google上找了很多，可是没有找到一个下载下来可以直接使用的！没有办法只能看别人的代码，然后一点一点的调试！功夫不负有心人，终于算是修得正果！贴代码：Codeusing System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Coll...
复制链接

扫一扫