做地址录入时,全手动输入不是很友好,通过级联选择就需要地区数据,在网上找了一通,都是很旧的数据,索性自己动手搞一份,最新的应该是官方的2023年统计用区划代码和城乡划分代码 (stats.gov.cn),看到那么多数据,要是手动去复制不知要弄到什么时候,就想到自己尝试写一个获取的工具,下面是我实现的主要的代码:
1、首先定义数据结构
/// <summary>One node in the administrative-division tree.</summary>
public class Node
{
/// <summary>Statistical division code (unique identifier).</summary>
public string Id { get; set; }
/// <summary>Display name of the division.</summary>
public string Name { get; set; }
/// <summary>Child divisions; left null for leaf nodes (null is serialized as-is to JSON).</summary>
public List<Node> Children { get; set; }
}
2、获取页面数据
/** Single shared HTTP client for all requests (HttpClient is meant to be reused, not created per request). */
static HttpClient client1 = new HttpClient()
{
BaseAddress = new Uri("https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/")
};
/** Async mutual exclusion: since the fetch method is async, a semaphore (count 1) guarantees only one request is in flight at a time. */
static SemaphoreSlim _semaphoreSlim = new SemaphoreSlim(1);
/// <summary>
/// Fetches one page of the site. Requests are serialized through the
/// semaphore so only one is in flight at a time (the site blocks IPs that
/// request too fast). If the response contains no data table (rate-limit /
/// error page), waits 3 minutes and retries until a table appears.
/// </summary>
/// <param name="path">Relative page path, resolved against client1.BaseAddress.</param>
/// <returns>The raw HTML of the page, guaranteed to contain a "&lt;table".</returns>
static async Task<string> httpGet(string path)
{
    await _semaphoreSlim.WaitAsync();
    try
    {
        while (true)
        {
            var rlt = await client1.GetStringAsync(path);
            if (rlt.Contains("<table"))
            {
                return rlt;
            }
            // Blocked or error page: back off, then retry.
            await Task.Delay(180000);
        }
    }
    finally
    {
        // BUG FIX: the original released the semaphore only on the success
        // path; any exception from GetStringAsync left it permanently
        // acquired, deadlocking every later request.
        _semaphoreSlim.Release();
    }
}
可以采取多个客户端并行发起请求,但是请求过快,导致IP被锁,花费的时间更长,还不如老老实实一个一个地请求
3、比较复杂的页面解析
/**
 * Parses one page of HTML into (path, id, name) tuples.
 * path: relative link to the child page, or null when the row is a leaf
 * id:   unique statistical division code
 * name: division name
 * Throws Exception when the expected table / header / rows cannot be found.
 */
static (string path, string id, string name)[] getMatches(string txt)
{
    // Group 1 = quote character the page uses, group 2 = table kind
    // ("province", "city", ...), group 3 = optional header row, group 4 = data rows.
    Regex regexTb = new(
        @"<table[^>]+?class=(""|')(\w+)table\1.*?>\s*(<tr[^>]+?class=\1\w+head\1[^>]*?>.*?</tr>)?\s*(<tr[^>]+?class=\1\w+tr\1[^>]*?>.*?</tr>)\s*</table>"
        , RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var matcheTd = regexTb.Match(txt);
    if (!matcheTd.Success)
    {
        throw new Exception("no match table");
    }
    var quat = matcheTd.Groups[1].Value;
    var name = matcheTd.Groups[2].Value;
    // Column indices of the code / name cells. Resolved from the header row
    // for every level except "province", whose table has no header.
    int indexCode = 0, indexName = 0;
    if ("province" != name)
    {
        if (!matcheTd.Groups[3].Success)
        {
            throw new Exception("no match head");
        }
        else
        {
            Regex regexHead = new(@"<td[^>]*?>\s*([^<]+)\s*</td>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            var matchHeads = regexHead.Matches(matcheTd.Groups[3].Value);
            var index = 0;
            foreach (Match match in matchHeads)
            {
                switch (match.Groups[1].Value)
                {
                    case "统计用区划代码":
                        indexCode = index;
                        break;
                    case "名称":
                        indexName = index;
                        break;
                }
                index++;
            }
            // Still equal (both 0) means at least one expected column is missing.
            if (indexCode == indexName)
            {
                throw new Exception("table head error");
            }
        }
    }
    if (!matcheTd.Groups[4].Success)
    {
        throw new Exception("no match tr");
    }
    var trs = matcheTd.Groups[4].Value;
    Regex regexTr = new(
        $@"<tr[^>]+?class=({quat})\w+tr\1[^>]*?>\s*<td[^>]*>.*?</td>\s*</tr>"
        , RegexOptions.IgnoreCase | RegexOptions.Singleline);
    var matcheTrs = regexTr.Matches(trs);
    // BUG FIX: the declaration was missing its opening parenthesis
    // ("string path, ...)[] result" did not compile).
    (string path, string id, string name)[] result = name switch
    {
        "province" => getProvince(matcheTrs).ToArray(),
        _ => getOther(matcheTrs, indexCode, indexName).ToArray(),
    };
    if (result.Length == 0)
        throw new Exception("not found in " + name);
    return result;

    // Province rows are plain <a> lists: the href carries both path and code.
    IEnumerable<(string path, string id, string name)> getProvince(MatchCollection matcheTrs)
    {
        Regex regex = new(
            $@"<a[^>]+?href=({quat})((\d+)\.html)\1[^>]*?>.*?([^<]+).*?</a>"
            , RegexOptions.IgnoreCase | RegexOptions.Singleline);
        foreach (Match match in matcheTrs)
        {
            var matchAs = regex.Matches(match.Value);
            foreach (Match matchA in matchAs)
            {
                yield return (matchA.Groups[2].Value, matchA.Groups[3].Value, matchA.Groups[4].Value);
            }
        }
    }

    // Other levels use <td> cells; a cell may or may not wrap its text in a link.
    IEnumerable<(string path, string id, string name)> getOther(MatchCollection matcheTrs, int indexCode, int indexName)
    {
        Regex regexTd = new(@"<td[^>]*?>\s*(.+?)\s*</td>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        Regex regex = new(@"<a ((href=""(\S+?)"")?)>\s*([^<]+?)\s*</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        foreach (Match match in matcheTrs)
        {
            var matchTds = regexTd.Matches(match.Value);
            var codeVal = matchTds[indexCode].Groups[1].Value;
            var nameVal = matchTds[indexName].Groups[1].Value;
            var matchCode = regex.Match(codeVal);
            if (matchCode.Success)
            {
                // The code cell and the name cell must link to the same child page.
                var matchName = regex.Match(nameVal);
                if (matchCode.Groups[1].Value != matchName.Groups[1].Value)
                {
                    throw new Exception("error name and value");
                }
                yield return (matchCode.Groups[3].Value, matchCode.Groups[4].Value, matchName.Groups[4].Value);
            }
            else
            {
                // Leaf row: no link, the cell text is the value itself.
                yield return (null, codeVal, nameVal);
            }
        }
    }
}
上面就是最关键的内容
4、这一步是添加一个获取数据的入口
/**
 * Crawls recursively starting from index.html. Links on level-2/3 pages are
 * relative and incomplete, so the missing directory prefix is rebuilt from
 * the parent's code, 2 characters per level (see preUrl below).
 * NOTE(review): this calls getPage(), not the httpGet defined above --
 * presumably a wrapper (e.g. with caching) defined elsewhere in the file; confirm.
 */
private static async Task<List<Node>> getDatas(string url="index.html", string parentId=null,int leve=-1)
{
var txt = await getPage(url);
var items = getMatches(txt);
// Rebuild the directory prefix: one 2-char segment of the parent code per level.
// leve starts at -1, so the root and first level get no prefix.
string preUrl = null;
for (var i = 0; i < leve; i++)
{
preUrl += string.Concat(parentId.AsSpan(2 * i, 2), "/");
}
List<Node> result = new();
// Parallelism here mostly overlaps parsing; actual HTTP stays serialized by
// the semaphore inside the fetch method.
await Parallel.ForEachAsync(items, async (match2, cancel) =>
{
List<Node> Children = null;
if (!string.IsNullOrEmpty(match2.path))
{
// A row with a link has children: recurse one level deeper.
Children = await getDatas(preUrl + match2.path, match2.id,leve+1);
}
var data = new Node
{
Id = match2.id,
Name = match2.name,
Children = Children
};
// result is shared across parallel iterations; guard the Add.
lock (result)
result.Add(data);
});
// Completion order is nondeterministic; sort by code for stable output.
return result.OrderBy(t => t.Id).ToList();
}
// Entry point: crawl the full tree, then dump it as indented JSON.
// FIX: await the task instead of blocking on .Result -- blocking on async
// code risks deadlocks and wraps failures in AggregateException; top-level
// statements support await directly.
var rlt = await getDatas();
var json = JsonConvert.SerializeObject(rlt, Newtonsoft.Json.Formatting.Indented);
File.WriteAllText(@".\datas.json", json);
一切准备好,启动程序,剩下的就交给时间。经过漫长的等待,最终获取665552条数据