自己实现爬取地区划分数据-CSDN博客

本文链接：https://blog.csdn.net/sqlite_me/article/details/142744415

做地址录入时，全手动输入不是很友好，而通过级联选择就需要地区数据。在网上找了一通，都是很旧的数据，索性自己动手搞一份。最新的应该是官方的《2023年统计用区划代码和城乡划分代码》(stats.gov.cn)。看到那么多数据，要是手动去复制不知要弄到什么时候，于是想到自己尝试写一个获取工具，下面是我实现的主要代码：

1、首先定义数据结构

/// <summary>
/// One entry of the crawled region tree: a code, a display name and the
/// entries found on its sub-page (if it has one).
/// </summary>
public class Node
{
    /// <summary>Code of the entry as extracted from the page (the "id" element of a parsed row).</summary>
    public string Id { get; set; }
    /// <summary>Name of the entry as extracted from the page.</summary>
    public string Name { get; set; }
    /// <summary>
    /// Entries parsed from this entry's sub-page. The crawler leaves this
    /// <c>null</c> when the row carries no link to a sub-page.
    /// </summary>
    public List<Node> Children { get; set; }
}

2、获取页面数据

/** Shared HTTP client; relative request paths are resolved against the 2023 listing root below. */
static HttpClient client1 = new()
{
    BaseAddress = new Uri("https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"),
};
/** The fetch method is async, so a single-permit semaphore ensures only one task issues a request at a time. */
static SemaphoreSlim _semaphoreSlim = new(initialCount: 1);
/**
 * Downloads the page at <paramref name="path"/> (resolved against client1's BaseAddress)
 * and returns its text once the response contains a "<table" marker.
 *
 * Requests are serialized through _semaphoreSlim. A response without a table is
 * treated as a failed fetch: the method waits 180 seconds and requests the same
 * path again, still holding the semaphore so no other request is sent meanwhile.
 *
 * Exceptions thrown by the HTTP request propagate to the caller; the semaphore is
 * released in that case too, so other pending callers are not blocked forever.
 */
static async Task<string> httpGet(string path)
{
    await _semaphoreSlim.WaitAsync();
    try
    {
        while (true)
        {
            var rlt = await client1.GetStringAsync(path);
            if (rlt.Contains("<table"))
            {
                return rlt;
            }

            // No table in the response: wait a while before retrying the same path.
            await Task.Delay(180000);
        }
    }
    finally
    {
        // Always hand the permit back, including when GetStringAsync throws.
        _semaphoreSlim.Release();
    }
}

可以采取多个客户端并行发起请求，但是请求过快会导致IP被锁，花费的时间反而更长，还不如老老实实一个一个地请求

3、比较复杂的页面解析

/**
 * Parses the listing table of one page and returns one tuple per entry.
 *
 * Tuple elements:
 *   path - relative link to the entry's sub-page, or null when the row has no link
 *   id   - the entry's code
 *   name - the entry's name
 *
 * The table is located by its class attribute ("...table"); the prefix in front of
 * "table" selects the parsing mode. A "province" table has no header row and every
 * link in it is one entry. Any other table must have a header row containing the
 * columns "统计用区划代码" (code) and "名称" (name); their positions decide which
 * cells of each data row are read.
 *
 * Throws FormatException when the table, its header, its rows or the required
 * header columns cannot be found, or when no entry at all is extracted.
 */
static (string path, string id, string name)[] getMatches(string txt)
{
    Regex regexTb = new(
        @"<table[^>]+?class=(""|')(\w+)table\1.*?>\s*(<tr[^>]+?class=\1\w+head\1[^>]*?>.*?</tr>)?\s*(<tr[^>]+?class=\1\w+tr\1[^>]*?>.*?</tr>)\s*</table>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var matcheTd = regexTb.Match(txt);
    if (!matcheTd.Success)
    {
        throw new FormatException("no match table");
    }

    // Group 1 is the quote character the page uses for attributes; it is reused
    // in the patterns below. Group 2 is the class prefix ("province", ...).
    var quat = matcheTd.Groups[1].Value;
    var tableKind = matcheTd.Groups[2].Value;

    // -1 means "column not found"; a default of 0 would be indistinguishable
    // from a real first column.
    int indexCode = -1, indexName = -1;
    if ("province" != tableKind)
    {
        if (!matcheTd.Groups[3].Success)
        {
            throw new FormatException("no match head");
        }

        Regex regexHead = new(@"<td[^>]*?>\s*([^<]+)\s*</td>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        var index = 0;
        foreach (Match head in regexHead.Matches(matcheTd.Groups[3].Value))
        {
            // The greedy capture may include trailing whitespace, so trim before comparing.
            switch (head.Groups[1].Value.Trim())
            {
                case "统计用区划代码":
                    indexCode = index;
                    break;
                case "名称":
                    indexName = index;
                    break;
            }
            index++;
        }

        if (indexCode < 0 || indexName < 0)
        {
            throw new FormatException("table head error");
        }
    }

    if (!matcheTd.Groups[4].Success)
    {
        throw new FormatException("no match tr");
    }
    var trs = matcheTd.Groups[4].Value;
    Regex regexTr = new(
        $@"<tr[^>]+?class=({quat})\w+tr\1[^>]*?>\s*<td[^>]*>.*?</td>\s*</tr>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var matcheTrs = regexTr.Matches(trs);

    (string path, string id, string name)[] result = tableKind switch
    {
        "province" => getProvince(matcheTrs).ToArray(),
        _ => getOther(matcheTrs, indexCode, indexName).ToArray(),
    };
    if (result.Length == 0)
    {
        throw new FormatException("not found in " + tableKind);
    }
    return result;

    // Province table: each <a href="NN.html"> in a row is one entry; the digits
    // of the file name are used as the id.
    IEnumerable<(string path, string id, string name)> getProvince(MatchCollection rows)
    {
        Regex regex = new(
            $@"<a[^>]+?href=({quat})((\d+)\.html)\1[^>]*?>.*?([^<]+).*?</a>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        foreach (Match row in rows)
        {
            foreach (Match link in regex.Matches(row.Value))
            {
                yield return (link.Groups[2].Value, link.Groups[3].Value, link.Groups[4].Value);
            }
        }
    }

    // Other tables: read the code and name cells by header position. Cells are
    // either wrapped in a link to the sub-page or plain text (no sub-page).
    IEnumerable<(string path, string id, string name)> getOther(MatchCollection rows, int codeColumn, int nameColumn)
    {
        Regex regexTd = new(@"<td[^>]*?>\s*(.+?)\s*</td>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Uses the page's detected quote character, like the other patterns,
        // instead of assuming double quotes. Group numbering: 1 = href attribute,
        // 3 = link target, 4 = link text.
        Regex regex = new($@"<a ((href={quat}(\S+?){quat})?)>\s*([^<]+?)\s*</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        foreach (Match row in rows)
        {
            var cells = regexTd.Matches(row.Value);
            if (cells.Count <= Math.Max(codeColumn, nameColumn))
            {
                throw new FormatException("row has fewer cells than the table head");
            }

            var codeVal = cells[codeColumn].Groups[1].Value;
            var nameVal = cells[nameColumn].Groups[1].Value;
            var matchCode = regex.Match(codeVal);
            if (matchCode.Success)
            {
                // Both cells of a linked row must point at the same sub-page.
                var matchName = regex.Match(nameVal);
                if (matchCode.Groups[1].Value != matchName.Groups[1].Value)
                {
                    throw new FormatException("error name and value");
                }
                yield return (matchCode.Groups[3].Value, matchCode.Groups[4].Value, matchName.Groups[4].Value);
            }
            else
            {
                yield return (null, codeVal, nameVal);
            }
        }
    }
}

上面就是最关键的内容

4、这一步是添加一个获取数据的入口

/**
 * Recursively crawls the listing starting at <paramref name="url"/> (the index page by
 * default) and returns the entries of that page as nodes, sorted by Id, each with the
 * nodes of its sub-page attached as Children (null when the entry has no sub-page link).
 *
 * url      - page to fetch, relative to the HTTP client's base address
 * parentId - code of the entry whose sub-page this is; null for the index page
 * leve     - depth counter: -1 for the index page, increased by one per level
 *
 * Links found on the deeper pages are incomplete (relative to the page they appear on),
 * so for leve > 0 the missing directory prefix is rebuilt from the first 2 * leve digits
 * of parentId, two digits per directory.
 */
private static async Task<List<Node>> getDatas(string url="index.html", string parentId=null,int leve=-1)
{
    // Fetch through httpGet, the page-download helper that serializes requests
    // and retries when the response has no table.
    var txt = await httpGet(url);
    var items = getMatches(txt);

    // Stays null for leve <= 0; null + path is just the path.
    string preUrl = null;
    for (var i = 0; i < leve; i++)
    {
        preUrl += string.Concat(parentId.AsSpan(2 * i, 2), "/");
    }

    List<Node> result = new();
    await Parallel.ForEachAsync(items, async (match2, cancel) =>
    {
        List<Node> Children = null;
        if (!string.IsNullOrEmpty(match2.path))
        {
            Children = await getDatas(preUrl + match2.path, match2.id, leve + 1);
        }

        var data = new Node
        {
            Id = match2.id,
            Name = match2.name,
            Children = Children
        };

        // The loop bodies run concurrently; List<T> is not safe for concurrent Add.
        lock (result)
        {
            result.Add(data);
        }
    });

    // Completion order is arbitrary, so restore a stable order by code.
    return result.OrderBy(t => t.Id).ToList();
}

// Crawl the whole tree from the index page (blocking until it completes),
// then write it out as indented JSON next to the executable's working directory.
var json = JsonConvert.SerializeObject(getDatas().Result, Newtonsoft.Json.Formatting.Indented);
File.WriteAllText(@".\datas.json", json);