c#爬取整个互联网链接，百度搜索引擎原理

jianxue01

已于 2023-05-23 12:40:27 修改

阅读量222

点赞数

文章标签： c# 开发语言

于 2023-05-15 19:31:32 首次发布

本文链接：https://blog.csdn.net/jianxue01/article/details/130690871

版权

爬取整站链接我采用的是递归算法。递归算法有个缺点，就是可能出现栈溢出（可以在创建线程时指定栈的大小，例如 Thread myThread = new Thread(new ThreadStart(MyMethod), 1024 * 1024);），所以也可以改用循环来写。

由于我是把所有链接直接保存在内存中再输出，所以有内存溢出的问题。可以把程序编译为64位，物理内存有多大就用多大；但是单台计算机不可能有无限的内存，所以可以改用数据库来存储。

下面是代码，需要安装包 HtmlAgilityPack。代码已改进；乱码是 HtmlAgilityPack 的问题，编码问题自己解决吧，我很懒。

/// <summary>
/// Best-effort link crawler built on HtmlAgilityPack. Starting from one page it
/// follows every http/https anchor it finds, printing a visit marker and the
/// page title for each page it manages to load.
/// </summary>
/// <remarks>
/// All state is static and unsynchronized, so the class is not safe to use from
/// several threads at once. The set of seen URLs lives for the lifetime of the
/// process and grows without bound while crawling.
/// </remarks>
public class WebReptile // Usage: WebReptile.GetHtmlLinks("https://www.zzxx.org");
    {
        /// <summary>Request timeout shared by both HTTP clients, in seconds.</summary>
        private const int TimeoutSeconds = 10;

        /// <summary>Absolute URLs (fragment stripped) that have already been queued or visited.</summary>
        private static readonly HashSet<string> visited = new HashSet<string>(StringComparer.Ordinal);

        /// <summary>Number of pages loaded successfully so far; printed in front of each title.</summary>
        private static int pageCount = 0;

        private static readonly HtmlWeb webClient = new HtmlWeb();
        private static readonly HttpClient client = new HttpClient();

        static WebReptile()
        {
            webClient.PreRequest = r => { r.Timeout = TimeoutSeconds * 1000; return true; };
            client.Timeout = TimeSpan.FromSeconds(TimeoutSeconds);
        }

        /// <summary>A link waiting to be crawled, together with how it was written in the page.</summary>
        private sealed class PendingLink
        {
            /// <summary>The link text exactly as supplied (used only for console output).</summary>
            public readonly string RawLink;

            /// <summary>The fully resolved http/https address to load.</summary>
            public readonly Uri Target;

            /// <summary>True when <see cref="RawLink"/> was already an absolute http/https URL.</summary>
            public readonly bool WasAbsolute;

            public PendingLink(string rawLink, Uri target, bool wasAbsolute)
            {
                RawLink = rawLink;
                Target = target;
                WasAbsolute = wasAbsolute;
            }
        }

        /// <summary>
        /// Crawls the page at <paramref name="url"/> and then, depth-first, every
        /// not-yet-seen http/https link reachable from it.
        /// </summary>
        /// <param name="url">
        /// Absolute http/https URL, or a relative reference that is resolved
        /// against <paramref name="s"/>.
        /// </param>
        /// <param name="s">
        /// Optional absolute base URL used only when <paramref name="url"/> is relative.
        /// </param>
        /// <remarks>
        /// Pages that fail to load are reported on standard error and skipped; the
        /// method itself does not throw for network or parsing failures.
        /// </remarks>
        public static void GetHtmlLinks(string url, string s = null)
        {
            PendingLink start = Resolve(url, ParseBase(s));
            if (start == null)
            {
                Console.Error.WriteLine("Not a crawlable http/https URL: " + url);
                return;
            }

            // The entry page is always loaded, even when an earlier call has already seen it.
            visited.Add(KeyOf(start.Target));

            // An explicit stack replaces recursion so that deep sites cannot overflow the call stack.
            Stack<PendingLink> pending = new Stack<PendingLink>();
            pending.Push(start);

            while (pending.Count > 0)
            {
                PendingLink current = pending.Pop();
                try
                {
                    List<PendingLink> found = VisitPage(current);

                    // Push in reverse so the first link on the page is the next one popped.
                    for (int k = found.Count - 1; k >= 0; k--)
                    {
                        pending.Push(found[k]);
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort crawl: one broken page must not stop the others.
                    Console.Error.WriteLine("Failed to crawl " + current.Target + ": " + ex.Message);
                }
            }
        }

        /// <summary>
        /// Loads one page, prints its visit marker and title, and returns the links
        /// on it that have not been seen before (marking them as seen).
        /// </summary>
        private static List<PendingLink> VisitPage(PendingLink page)
        {
            HtmlAgilityPack.HtmlDocument doc = webClient.Load(KeyOf(page.Target));
            pageCount++;

            Console.WriteLine(page.WasAbsolute ? "第一访问" : "第二访问");
            HtmlNode title = doc.DocumentNode.SelectSingleNode("//title");
            if (title != null)
            {
                Console.WriteLine(pageCount + title.InnerText + " " + page.RawLink);
            }
            Console.WriteLine();

            List<PendingLink> discovered = new List<PendingLink>();
            HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes(".//a[@href]");
            if (anchors == null)
            {
                return discovered;
            }

            foreach (HtmlNode anchor in anchors)
            {
                // Relative hrefs are resolved against the page they were found on.
                PendingLink next = Resolve(anchor.Attributes["href"].Value, page.Target);
                if (next != null && visited.Add(KeyOf(next.Target)))
                {
                    discovered.Add(next);
                }
            }
            return discovered;
        }

        /// <summary>
        /// Turns a link into a crawlable target, or returns null when it is empty
        /// or does not end up as an http/https address (javascript:, mailto:, ...).
        /// </summary>
        private static PendingLink Resolve(string link, Uri baseUri)
        {
            if (string.IsNullOrWhiteSpace(link))
            {
                return null;
            }

            string trimmed = link.Trim();
            Uri parsed;
            if (Uri.TryCreate(trimmed, UriKind.Absolute, out parsed) && IsHttp(parsed))
            {
                return new PendingLink(link, parsed, true);
            }
            if (baseUri != null && Uri.TryCreate(baseUri, trimmed, out parsed) && IsHttp(parsed))
            {
                return new PendingLink(link, parsed, false);
            }
            return null;
        }

        /// <summary>Parses the optional base URL; returns null when it is absent or not http/https.</summary>
        private static Uri ParseBase(string s)
        {
            Uri parsed;
            if (!string.IsNullOrWhiteSpace(s)
                && Uri.TryCreate(s.Trim(), UriKind.Absolute, out parsed)
                && IsHttp(parsed))
            {
                return parsed;
            }
            return null;
        }

        /// <summary>True when the URI uses the http or https scheme.</summary>
        private static bool IsHttp(Uri uri)
        {
            return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
        }

        /// <summary>
        /// Identity of a page for de-duplication and loading: the absolute URL
        /// without its fragment, so "page" and "page#top" count as one page.
        /// </summary>
        private static string KeyOf(Uri uri)
        {
            return uri.GetLeftPart(UriPartial.Query);
        }

        /// <summary>
        /// Requests <paramref name="url"/> and reports whether it is something other
        /// than an HTML page, printing the detected media type category.
        /// </summary>
        /// <param name="url">Address to probe.</param>
        /// <returns>
        /// false when the response declares a media type starting with "text/html";
        /// true for every other media type, when no Content-Type is sent, and when
        /// the request fails.
        /// </returns>
        public static bool IsResourceUrl(string url)
        {
            try
            {
                // Only the headers are needed, so the body is not downloaded; the
                // response is disposed to release the connection.
                using (HttpResponseMessage rs = client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).GetAwaiter().GetResult())
                {
                    var contentType = rs.Content == null ? null : rs.Content.Headers.ContentType;
                    string s = contentType == null ? null : contentType.MediaType;
                    if (string.IsNullOrEmpty(s))
                    {
                        // No declared media type: cannot be confirmed as HTML.
                        return true;
                    }

                    Console.WriteLine(s);
                    if (s.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine(1);
                        return false;
                    }
                    if (s.StartsWith("application/", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("是应用软件");
                    }
                    else if (s.StartsWith("audio/", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("是音乐文件");
                    }
                    else if (s.StartsWith("image/", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("是图片文件");
                    }
                    else if (s.StartsWith("video/", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("是视频文件");
                    }
                    else
                    {
                        Console.WriteLine("是资源文件");
                    }
                    return true;
                }
            }
            catch (Exception ex)
            {
                // An unreachable address is treated as a non-HTML resource.
                Console.Error.WriteLine("Failed to probe " + url + ": " + ex.Message);
                return true;
            }
        }
    }

调用

WebReptile.GetHtmlLinks("https://www.zzxx.org");或
WebReptile.GetHtmlLinks("https://www.zzxx.org/xs/24183/");或
WebReptile.GetHtmlLinks("https://www.baidu.com");或
WebReptile.GetHtmlLinks("https://www.kugou.com")