Implementing a Crawler in C# for the National Bureau of Statistics Administrative Division Data

The logic is straightforward: the crawler walks the province, city, county, town and village pages level by level, so let's go straight to the code:

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

public class Program
    {
        // Every record collected during the crawl (province, city, county, town, village)
        public static List<Html_a> html_As = new List<Html_a>();

        /// <summary>
        /// HTTP GET request; the stats.gov.cn pages are served as GB2312-encoded HTML
        /// </summary>
        /// <param name="Url">Request URL</param>
        /// <returns>Response body as a string</returns>
        public static string HttpGet(string Url)
        {
            try
            {
                // Register the code-pages provider so the GB2312 encoding is available on .NET Core / .NET 5+
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Method = "GET";
                request.ContentType = "text/html;charset=gb2312";
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream myResponseStream = response.GetResponseStream())
                using (StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("gb2312")))
                {
                    return myStreamReader.ReadToEnd();
                }
            }
            catch
            {
                // The site throttles aggressive crawlers; wait briefly and retry on any failure
                Thread.Sleep(100);
                return HttpGet(Url);
            }
        }
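
        // --- Optional alternative (not part of the original post): on .NET 5 and later HttpWebRequest
        // is marked obsolete and HttpClient is the recommended replacement. This is only a hedged
        // sketch of the same GB2312 GET; fully qualified type names are used so no extra using
        // directives are needed, and the method name HttpGetAsync is my own choice.
        private static readonly System.Net.Http.HttpClient client = new System.Net.Http.HttpClient();

        public static async System.Threading.Tasks.Task<string> HttpGetAsync(string Url)
        {
            // GB2312 only resolves after registering the code-pages provider on .NET Core / .NET 5+
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            var bytes = await client.GetByteArrayAsync(Url);
            return Encoding.GetEncoding("gb2312").GetString(bytes);
        }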
        static void Main(string[] args)
        {
            // Entry point: crawl the 2019 edition of the division-code pages
            GetMsg("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/");
        }

        public static void GetMsg(string Url)
        {
            // Level 1: provinces, listed on index.html
            var shenarr = GetShen(HttpGet(Url + "index.html"));

            // Start at 0 to cover every province; raise the start index only to resume an interrupted run
            for (var shen_i = 0; shen_i < shenarr.Count; shen_i++)
            {
                Html_a sen = shenarr[shen_i];
                Wreiterl(sen);
                if (sen.href == null)
                {
                    continue;
                }
                // Level 2: prefecture-level cities under this province
                var shiarr = GetShi(HttpGet(Url + sen.href));
                for (var shi_i = 0; shi_i < shiarr.Count; shi_i++)
                {
                    Html_a shi = shiarr[shi_i];
                    shi.sjcode = sen.code;
                    Wreiterl(shi);
                    if (shi.href == null)
                    {
                        continue;
                    }
                    // Level 3: counties/districts under this city
                    var quarr = GetQu(HttpGet(Url + shi.href));
                    for (var qu_i = 0; qu_i < quarr.Count; qu_i++)
                    {
                        Html_a qu = quarr[qu_i];
                        qu.sjcode = shi.code;
                        Wreiterl(qu);

                        // The city href contains the province directory (e.g. "13/1301.html"); keep that prefix for deeper levels
                        var strarr = shi.href.Split("/");
                        if (qu.href == null)
                        {
                            continue;
                        }
                        var qurl = strarr[0];
                        // Level 4: towns/townships, one directory deeper again
                        var xianarr = GetXian(HttpGet(Url + qurl + "/" + qu.href));
                        for (var xian_i = 0; xian_i < xianarr.Count; xian_i++)
                        {
                            Html_a xian = xianarr[xian_i];
                            xian.sjcode = qu.code;
                            Wreiterl(xian);
                            if (xian.href == null)
                            {
                                continue;
                            }
                            // Level 5: villages / neighborhood committees, nested under the county directory
                            strarr = qu.href.Split("/");
                            var jdrl = strarr[0];
                            var jiandao = Getjiedao(HttpGet(Url + qurl + "/" + jdrl + "/" + xian.href));

                            foreach (var jd in jiandao)
                            {

                                jd.sjcode = xian.code;
                                Wreiterl(jd);
                            }
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Record one row: cache it in memory, echo it to the console and append it to .\data\Msg.txt
        /// as a tab-separated line (code, urban/rural type, parent code, name)
        /// </summary>
        public static void Wreiterl(Html_a html_A)
        {
            html_As.Add(html_A);
            var Msg = html_A.code + "\t\t" + (html_A.cxtype ?? "Null") + "\t\t" + (html_A.sjcode ?? "Null") + "\t\t" + html_A.name;
            Console.WriteLine(Msg);

            string Folder = ".\\data\\";
            if (!Directory.Exists(Folder))
                Directory.CreateDirectory(Folder);

            // Open in append mode; the using block disposes (and closes) the writer
            string FilePath = $"{Folder}Msg.txt";
            using (TextWriter fs = new StreamWriter(FilePath, true))
            {
                fs.WriteLine(Msg);
            }
        }


        /// <summary>
        /// Parse a single &lt;a&gt; tag: take the first href, derive the code from the file name
        /// (e.g. "11.html" -> "11"), and read the link text with HtmlAgilityPack
        /// </summary>
        public static Html_a GetA(string html)
        {
            Html_a a = new Html_a();
            string regex = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
            Regex re = new Regex(regex);
            MatchCollection matches = re.Matches(html);
            var href = matches[0].ToString().Replace("href=\'", "").Replace("\'", "");
            a.code = href.Replace(".html", "");
            a.href = href;
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);
            var ass = htmlDoc.DocumentNode.SelectSingleNode("//a");
            a.name = ass.InnerText;
            return a;
        }


        /// <summary>
        /// Level 1: provinces (rows with class 'provincetr')
        /// </summary>
        /// <param name="str">HTML of the year's index page</param>
        /// <returns>One Html_a per province link</returns>
        public static List<Html_a> GetShen(string str)
        {
            List<Html_a> aArr = new List<Html_a>();
            string regex = "<tr class='provincetr'>(.*?)</tr>";
            Regex re = new Regex(regex);
            MatchCollection matches = re.Matches(str);

            foreach (var a in matches)
            {
                string agx = "<a href='(.*?)'>(.*?)</a>";
                Regex are = new Regex(agx);
                MatchCollection mc_a = are.Matches(a.ToString());
                foreach (var aitem in mc_a)
                {
                    aArr.Add(GetA(aitem.ToString()));
                }
            }
            return aArr;
        }
        /// <summary>
        /// Match every &lt;a href='...'&gt;...&lt;/a&gt; tag in an HTML fragment
        /// </summary>
        /// <param name="html">HTML fragment</param>
        /// <returns>The raw anchor matches</returns>
        public static MatchCollection Get_A(string html)
        {
            string agx = "<a href='(.*?)'>(.*?)</a>";
            Regex are = new Regex(agx);
            MatchCollection mc_a = are.Matches(html);
            return mc_a;
        }

        /// <summary>
        /// Level 2: prefecture-level cities (rows with class 'citytr')
        /// </summary>
        /// <param name="str">HTML of a province page</param>
        /// <returns>One Html_a per city</returns>
        public static List<Html_a> GetShi(string str)
        {
            string regex = "<tr class='citytr'>(.*?)</tr>";
            Regex re = new Regex(regex);
            MatchCollection matches = re.Matches(str);
            return GetHtmlaArr(re, matches);
        }
        /// <summary>
        /// Level 3: counties/districts (rows with class 'countytr')
        /// </summary>
        /// <param name="str">HTML of a city page</param>
        /// <returns>One Html_a per county</returns>
        public static List<Html_a> GetQu(string str)
        {
            string regex = "<tr class='countytr'>(.*?)</tr>";
            Regex re = new Regex(regex);
            MatchCollection matches = re.Matches(str);

            return GetHtmlaArr(re, matches);
        }

        private static List<Html_a> GetHtmlaArr(Regex re, MatchCollection matches)
        {
            List<Html_a> aArr = new List<Html_a>();
            foreach (var ma in matches)
            {
                string rema = "<td>(.*?)</td>";
                Regex ma2 = new Regex(rema);
                // Re-running the row regex on the row just yields the row itself,
                // so "td" in the loop below is really the whole <tr>...</tr> string
                MatchCollection matches2 = re.Matches(ma.ToString());
                foreach (var td in matches2)
                {
                    var a = Get_A(td.ToString());
                    // Rows with two <a> tags: the first link holds the code, the second the name
                    if (a.Count == 2)
                    {
                        var ca0 = GetA(a[0].ToString());
                        var ca1 = GetA(a[1].ToString());
                        Html_a html_A = new Html_a();
                        html_A.code = ca0.name;
                        html_A.href = ca0.href;
                        html_A.name = ca1.name;
                        aArr.Add(html_A);
                    }
                    else
                    {
                        // Rows without links: plain <td> cells only
                        var msc = ma2.Matches(td.ToString());
                        // Two cells: code + name
                        if (msc.Count == 2)
                        {
                            Html_a html_A = new Html_a();
                            var htmlDoc = new HtmlDocument();
                            htmlDoc.LoadHtml(msc[0].ToString());
                            var ass = htmlDoc.DocumentNode.SelectSingleNode("//td");
                            html_A.code = ass.InnerText;
                            htmlDoc.LoadHtml(msc[1].ToString());
                            var ass2 = htmlDoc.DocumentNode.SelectSingleNode("//td");
                            html_A.name = ass2.InnerText;
                            aArr.Add(html_A);
                        }

                        // Three cells (village rows): code + urban/rural classification + name
                        if (msc.Count == 3)
                        {
                            Html_a html_A = new Html_a();
                            var htmlDoc = new HtmlDocument();
                            htmlDoc.LoadHtml(msc[0].ToString());
                            var ass = htmlDoc.DocumentNode.SelectSingleNode("//td");
                            html_A.code = ass.InnerText;
                            htmlDoc.LoadHtml(msc[1].ToString());
                            var ass2 = htmlDoc.DocumentNode.SelectSingleNode("//td");
                            html_A.cxtype = ass2.InnerText;

                            htmlDoc.LoadHtml(msc[2].ToString());
                            var ass3 = htmlDoc.DocumentNode.SelectSingleNode("//td");
                            html_A.name = ass3.InnerText;
                            aArr.Add(html_A);
                        }
                    }

                }
            }
            return aArr;
        }

        /// <summary>
        /// Level 4: towns/townships/sub-districts (rows with class 'towntr')
        /// </summary>
        /// <param name="str">HTML of a county page</param>
        /// <returns>One Html_a per town</returns>
        public static List<Html_a> GetXian(string str)
        {
            string regex = "<tr class='towntr'>(.*?)</tr>";
            Regex re = new Regex(regex);
            MatchCollection matches = re.Matches(str);

            return GetHtmlaArr(re, matches);
        }

        /// <summary>
        /// Level 5: villages / neighborhood committees (rows with class 'villagetr'); these rows also carry the urban/rural classification code
        /// </summary>
        /// <param name="str">HTML of a town page</param>
        /// <returns>One Html_a per village</returns>
        public static List<Html_a> Getjiedao(string str)
        {
            string regex = "<tr class='villagetr'>(.*?)</tr>";
            Regex re = new Regex(regex);
            MatchCollection matches = re.Matches(str);

            return GetHtmlaArr(re, matches);
        }
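
        // --- Optional alternative (not wired into the code above): since HtmlAgilityPack is already
        // referenced, the row regexes used by GetShen/GetShi/GetQu/GetXian/Getjiedao could instead be
        // XPath queries. This is only a hedged sketch; the helper name GetRows is my own and is not
        // called anywhere else in this listing.
        public static List<string> GetRows(string html, string trClass)
        {
            var doc = new HtmlDocument();
            doc.LoadHtml(html);
            // SelectNodes returns null when nothing matches, so guard before iterating
            var rows = doc.DocumentNode.SelectNodes($"//tr[@class='{trClass}']");
            var result = new List<string>();
            if (rows != null)
            {
                foreach (var row in rows)
                {
                    result.Add(row.OuterHtml);
                }
            }
            return result;
        }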
    }


    /// <summary>
    /// One administrative-division record
    /// </summary>
    class Html_a
    {
        public string code { get; set; }    // division code (page cell or link file name)
        public string href { get; set; }    // relative link to the next level; null for leaf rows
        public string cxtype { get; set; }  // urban/rural classification code (village level only)
        public string name { get; set; }    // division name

        public string sjcode { get; set; }  // parent division code
    }
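
For reference, each line that Wreiterl appends to .\data\Msg.txt has the form code, urban/rural classification (cxtype), parent code (sjcode) and name, separated by double tabs. Assuming the 2019 pages still list 北京市 (code 11) first, the opening line of the file would look roughly like:

11		Null		Null		北京市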

A sample project is available for beginners to reference: ToMoveTheBick.rar (CSDN download).

