一、省市区三级围栏数据获取
我们先来看下最终效果,我是在 Ubuntu 22 中安装的 ES 8.13
1. 免费获取数据
我们从 https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov 免费下载 ok_geo.csv.7z,该仓库更新维护非常及时。下文脚本读取的是 ../data/region_fence.csv,因此解压后需要把得到的 CSV 文件放到该路径(或相应修改脚本中的文件路径)
2. ES索引创建
# Index-creation request for the province/city/district fence data.
# One document per land block: keyword codes and name, a geo_point centre ("location")
# and the block boundary as a geo_shape ("fence").
PUT /region_fence
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"properties": {
"code": { "type": "keyword"},
"p_code": { "type": "keyword"},
"deep": { "type": "integer"},
"name": { "type": "keyword"},
"ext_path": { "type": "text"},
"location": {
"type": "geo_point"
},
"fence": {
"type": "geo_shape"
}
}
}
}
3. python处理CSV并写入ES
# Parse the province/city/district data from the CSV file and load it into ES.
import pandas as pd
from elasticsearch import helpers, Elasticsearch
# Read once at start-up; the loader below reads the columns
# id, pid, deep, name, ext_path, geo and polygon from every row.
region_fence_df = pd.read_csv("../data/region_fence.csv")
def init_es_client(es_host):
    """Create an Elasticsearch client for a single host.

    Args:
        es_host: Host URL; it is passed as the only entry of ``hosts``.

    Returns:
        The ``Elasticsearch`` client, built with ``verify_certs=False``.
    """
    return Elasticsearch(hosts=[es_host], verify_certs=False)
REGION_INDEX = "region_fence"


def parse_region_location(geo):
    """Parse the ``geo`` cell ("lng lat") into ``[lng, lat]`` rounded to 6 decimals.

    Returns ``None`` when the cell is not a string (e.g. NaN for an empty CSV
    cell), does not consist of exactly two space-separated parts (e.g. the
    placeholder "EMPTY"), or is not numeric.
    """
    if not isinstance(geo, str):
        return None
    parts = geo.split(" ")
    if len(parts) != 2:
        return None
    try:
        return [round(float(parts[0]), 6), round(float(parts[1]), 6)]
    except ValueError:
        return None


def parse_region_ring(ring_text):
    """Parse "lng lat,lng lat,..." into a closed ring of ``[lng, lat]`` points.

    Raises:
        ValueError, IndexError: if a point is not two numeric tokens.
    """
    ring = []
    for pair in ring_text.split(","):
        parts = pair.split()
        ring.append([round(float(parts[0]), 6), round(float(parts[1]), 6)])
    # Close the ring only when needed, so an already-closed ring does not get
    # a duplicated last vertex.
    if ring[0] != ring[-1]:
        ring.append(list(ring[0]))
    return ring


def parse_region_polygons(polygon_text):
    """Split the ``polygon`` cell into one GeoJSON ``coordinates`` value per land block.

    Land blocks are separated by ";". Inside a block, "~" is treated as the
    separator between the outer ring and hole rings.
    NOTE(review): the "~" hole separator follows the upstream data set's
    format description; confirm against the CSV version in use. A block
    without "~" is parsed as a single outer ring.

    Returns an empty list for a missing cell (NaN) or the "EMPTY" placeholder.
    """
    if not isinstance(polygon_text, str):
        return []
    polygons = []
    for block in polygon_text.split(";"):
        block = block.strip()
        if not block or block.upper() == "EMPTY":
            continue
        rings = [parse_region_ring(part) for part in block.split("~") if part.strip()]
        if rings:
            polygons.append(rings)
    return polygons


def build_region_actions(item):
    """Build the bulk ``index`` actions for one CSV row.

    Some regions consist of several land blocks (e.g. Tianjin). Each block is
    written as its own document, otherwise ES reports a self-intersecting
    polygon. The first block uses the region code as ``_id``; later blocks use
    ``<code>_<n>``.

    Args:
        item: Mapping-like row with the keys id, pid, deep, name, ext_path,
            geo and polygon.

    Returns:
        A list of action dicts; empty when the row has no polygon.
    """
    info = dict()
    info["code"] = item["id"]
    info["p_code"] = item["pid"]
    info["deep"] = item["deep"]
    info["name"] = item["name"]
    info["ext_path"] = item["ext_path"]
    location = parse_region_location(item["geo"])
    if location is not None:
        info["location"] = location

    actions = []
    for id_index, rings in enumerate(parse_region_polygons(item["polygon"])):
        # Copy per block so that documents do not share (and overwrite) one dict.
        source = info.copy()
        source["fence"] = {"type": "Polygon", "coordinates": rings}
        code = str(info["code"])
        my_id = code + "_" + str(id_index) if id_index > 0 else code
        actions.append({
            "_op_type": "index",
            "_index": REGION_INDEX,
            "_id": my_id,
            "_source": source,
        })
    return actions


def load_region_fences(df, es_client, batch_size=100):
    """Index every row of *df* into ES in bulk batches.

    Rows whose geometry cannot be parsed, or that have no polygon at all, are
    reported and skipped instead of aborting the whole load.

    Args:
        df: DataFrame with the columns read by ``build_region_actions``.
        es_client: Elasticsearch client handed to ``helpers.bulk``.
        batch_size: Number of actions sent per bulk request.

    Returns:
        The number of actions sent.
    """
    actions = list()
    count = 0
    for index, item in df.iterrows():
        try:
            row_actions = build_region_actions(item)
        except (ValueError, IndexError) as exc:
            print("skip row", index, "unparsable polygon:", exc)
            continue
        if not row_actions:
            print("skip row", index, "no polygon")
            continue
        for action in row_actions:
            actions.append(action)
            # ">=" rather than "==": the batch is flushed even if the size is overshot.
            if len(actions) >= batch_size:
                helpers.bulk(es_client, actions)
                count += len(actions)
                print(count)
                actions.clear()
    if len(actions) > 0:
        helpers.bulk(es_client, actions)
        count += len(actions)
        print(count)
        actions.clear()
    return count


if __name__ == "__main__":
    es_client = init_es_client("http://127.0.0.1:9200")
    try:
        load_region_fences(region_fence_df, es_client)
    finally:
        # Release the connection even when a bulk request fails.
        es_client.close()
二、乡镇围栏解析存储
空白的是ES渲染的问题,放大后看到几乎没有丢失数据
1. ES索引建立
- 分片数设置为1个
- 各个字段设置为keyword类型,不分词
- 围栏设置为geo_shape类型,存储多边形信息
# Index-creation request for the town fence data: one shard, keyword (non-analysed)
# fields for province/city/district/town, and the boundary polygon as a geo_shape.
PUT /town_fence
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"properties": {
"province": {
"type": "keyword"
},
"city": {
"type": "keyword"
},
"district": {
"type": "keyword"
},
"town": {
"type": "keyword"
},
"fence" : {
"type": "geo_shape"
}
}
}
}
2. Python文件解析入ES
geo_shape类型存储时设置多边形存储: {"type": "Polygon", "coordinates": coordinates_parent},其中多边形需要闭合,不可有空洞;copy的使用是为了解决对象覆盖的问题
# Parse the town fence data from the '@'-separated CSV file and load it into ES.
import pandas as pd
from elasticsearch import helpers, Elasticsearch
# Read once at start-up; the loader below reads the columns
# province, city, region, town and polyline from every row.
town_fence_df = pd.read_csv("../data/town.csv", sep='@', encoding='UTF-8')
def init_es_client(es_host):
    """Create an Elasticsearch client for a single host.

    Args:
        es_host: Host URL; it is passed as the only entry of ``hosts``.

    Returns:
        The ``Elasticsearch`` client, built with ``verify_certs=False``.
    """
    return Elasticsearch(hosts=[es_host], verify_certs=False)
TOWN_INDEX = "town_fence"


def town_doc_id(province, city, district, town):
    """Return a document id that is stable across runs for one town.

    The built-in ``hash()`` of a string is randomized per interpreter process,
    so it yields a different id on every run and re-running the load would
    duplicate documents instead of overwriting them. A SHA-1 digest of the
    joined names is deterministic. The parts are joined with a separator so
    that ("ab", "c") and ("a", "bc") do not collide.
    """
    import hashlib

    key = "\x1f".join(str(part) for part in (province, city, district, town))
    return hashlib.sha1(key.encode("utf-8")).hexdigest()


def parse_town_rings(polyline):
    """Parse the ``polyline`` cell into one closed ring per land block.

    Format read here: blocks separated by "|", points by ";", and longitude
    and latitude by ",". Coordinates are rounded to 6 decimals.

    Raises:
        ValueError: if the cell is missing or a coordinate is not numeric.
        IndexError: if a point has fewer than two components.
    """
    if not isinstance(polyline, str):
        raise ValueError("missing polyline")
    rings = []
    for polygon in polyline.split("|"):
        ring = []
        for lng_lat in polygon.split(";"):
            parts = lng_lat.split(",")
            ring.append([round(float(parts[0]), 6), round(float(parts[1]), 6)])
        # Compare the whole point: checking the longitude alone leaves the ring
        # open when the first and last points share a longitude only.
        if ring[0] != ring[-1]:
            ring.append(list(ring[0]))
        rings.append(ring)
    return rings


def build_town_actions(item):
    """Build the bulk ``index`` actions for one CSV row.

    Some towns consist of several land blocks. Each block is written as its
    own document, otherwise ES reports a self-intersecting polygon. The first
    block uses the town id as ``_id``; later blocks use ``<id>_<n>``.

    Args:
        item: Mapping-like row with the keys province, city, region, town
            and polyline.

    Returns:
        A list of action dicts, one per land block.
    """
    info = dict()
    info["province"] = item["province"]
    info["city"] = item["city"]
    info["district"] = item["region"]
    info["town"] = item["town"]
    unique_id = town_doc_id(info["province"], info["city"], info["district"], info["town"])

    actions = []
    for id_index, ring in enumerate(parse_town_rings(item["polyline"])):
        # Copy per block so that documents do not share (and overwrite) one dict.
        source = info.copy()
        source["fence"] = {"type": "Polygon", "coordinates": [ring]}
        my_id = unique_id + "_" + str(id_index) if id_index > 0 else unique_id
        actions.append({
            "_op_type": "index",
            "_index": TOWN_INDEX,
            "_id": my_id,
            "_source": source,
        })
    return actions


def load_town_fences(df, es_client):
    """Index every row of *df* into ES and mark failed rows with ``flag == False``.

    Documents are sent one per bulk call so that a rejected polygon only fails
    its own row. The ``flag`` column is created up front, so filtering on it
    afterwards works even when no row failed.

    Args:
        df: DataFrame with the columns read by ``build_town_actions``; a
            boolean ``flag`` column is added in place.
        es_client: Elasticsearch client handed to ``helpers.bulk``.

    Returns:
        The number of documents indexed successfully.
    """
    df["flag"] = True
    count = 0
    for index, item in df.iterrows():
        try:
            row_actions = build_town_actions(item)
        except Exception as exc:  # best effort: record the row and carry on
            print("parse failed for row", index, exc)
            df.loc[index, "flag"] = False
            continue
        for action in row_actions:
            try:
                helpers.bulk(es_client, [action])
            except Exception as exc:  # best effort: record the row and carry on
                print("index failed for row", index, exc)
                df.loc[index, "flag"] = False
            else:
                count += 1
                print(count)
    return count


if __name__ == "__main__":
    es_client = init_es_client("http://127.0.0.1:9200")
    try:
        load_town_fences(town_fence_df, es_client)
    finally:
        # Release the connection even when the load is interrupted.
        es_client.close()
    # Export the rows that could not be parsed or indexed for later inspection.
    town_fence_df[~town_fence_df['flag']].to_csv('../data/town_errors.csv', sep='@', encoding='UTF-8', index=False)
3. 数据查询验证
三、国家统计局行政区划获取及入库ES实践
我们先看下最终效果:
1. ES索引新建
# Index-creation request for the administrative divisions: every level
# (province, city, district, town, committee) stores a keyword name and a keyword code,
# plus the committee's type_code. One shard, no replicas.
PUT administrative_division
{
"mappings": {
"properties": {
"province": {
"type": "keyword"
},
"province_code": {
"type": "keyword"
},
"city": {
"type": "keyword"
},
"city_code": {
"type": "keyword"
},
"district": {
"type": "keyword"
},
"district_code": {
"type": "keyword"
},
"town": {
"type": "keyword"
},
"town_code": {
"type": "keyword"
},
"committee": {
"type": "keyword"
},
"committee_code": {
"type": "keyword"
},
"type_code": {
"type": "keyword"
}
}
},
"settings": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
2. 代码编写
此处代码参考了网上一位大神的实现,个人认为较为简洁,这里直接拿来使用,只修改了存储部分
from lxml import etree
import requests
import time
import random
"""
国家统计局行政区划获取
"""
from elasticsearch import helpers, Elasticsearch
def init_es_client(es_host):
    """Create an Elasticsearch client for a single host.

    Args:
        es_host: Host URL; it is passed as the only entry of ``hosts``.

    Returns:
        The ``Elasticsearch`` client, built with ``verify_certs=False``.
    """
    return Elasticsearch(hosts=[es_host], verify_certs=False)
# Module-level state for the crawl: the ES connection, the pending bulk
# actions, and the running total of indexed documents.
es_client = init_es_client('http://127.0.0.1:9200')
actions = []
count = 0
def get_html(url, timeout=10):
    """Download *url* and parse the body into an lxml HTML tree.

    Args:
        url: Page address to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed tree, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the body is empty. Callers
        already check for ``None`` and skip the page in that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # A single unreachable page must not abort a crawl that runs for hours.
        print("request failed", url, exc)
        return None
    response.encoding = "utf8"
    res = response.text
    if not res.strip():
        return None
    return etree.HTML(res)
base_url = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"


def page_code(href):
    """Return the code in a page link, e.g. "01/110101001.html" -> "110101001".

    The file name is cut at the first "." instead of using ``rstrip(".html")``,
    which strips a set of characters rather than a suffix.
    """
    return href.split("/")[-1].split(".")[0]


def town_page_url(city_href, county_href):
    """Build the URL of a county's town list.

    ``city_href`` looks like "11/1101.html" and ``county_href`` like
    "01/110101.html"; the county link is relative to the province directory.
    ``base_url`` already ends with "/", so no extra slash is inserted.
    """
    return base_url + city_href.split('/')[0] + "/" + county_href


def village_page_url(town_href):
    """Build the URL of a town's committee list from a link like "01/110101001.html".

    The first two digit pairs of the town code are the province and city directories.
    """
    code_ = page_code(town_href)
    return base_url + code_[0:2] + "/" + code_[2:4] + "/" + town_href


def build_division_doc(p_key, p_name, c_code, c_name, d_code, d_name,
                       t_code, t_name, v_code, v_type_code, v_name):
    """Assemble the ES document for one committee.

    ``province`` holds the province name and ``province_code`` the province
    key right-padded with "0" to 12 characters, matching the name/code split
    used for every other level.
    """
    info = dict()
    info['province'] = p_name
    info['province_code'] = str(p_key).ljust(12, '0')
    info['city_code'] = c_code
    info['city'] = c_name
    info['district_code'] = d_code
    info['district'] = d_name
    info['town_code'] = t_code
    info['town'] = t_name
    info['type_code'] = v_type_code
    info['committee_code'] = v_code
    info['committee'] = v_name
    return info


def fetch_link_rows(url, row_class, label):
    """Fetch a list page and return its ``(code, name, href)`` rows.

    Only rows whose first cell is a link are returned. A random pause of 0-3
    seconds precedes each request to keep the crawl polite.

    Args:
        url: Page to fetch.
        row_class: ``class`` attribute of the ``<tr>`` rows to read.
        label: Name used in the message printed when the page is unavailable.

    Returns:
        A list of tuples; empty when the page could not be fetched.
    """
    time.sleep(random.randint(0, 3))
    html = get_html(url)
    if html is None:
        print(label + " is None", url)
        return []
    row = '//tr[@class="' + row_class + '"]'
    codes = html.xpath(row + '/td[1]/a/text()')
    names = html.xpath(row + '/td[2]/a/text()')
    hrefs = html.xpath(row + '/td[1]/a/@href')
    return list(zip(codes, names, hrefs))


def fetch_village_rows(url):
    """Fetch a committee page and return ``(code, type_code, name)`` rows."""
    time.sleep(random.randint(0, 3))
    village_html = get_html(url)
    if village_html is None:
        print("village_html is None", url)
        return []
    # Committee code
    village_code = village_html.xpath('//tr[@class="villagetr"]/td[1]/text()')
    # Urban/rural classification code of the committee
    village_type_code = village_html.xpath('//tr[@class="villagetr"]/td[2]/text()')
    # Committee name
    village_name = village_html.xpath('//tr[@class="villagetr"]/td[3]/text()')
    return list(zip(village_code, village_type_code, village_name))


def iter_division_docs():
    """Walk province -> city -> county -> town -> committee and yield one document per committee.

    Pages that cannot be fetched are reported and skipped.
    """
    url = base_url + "index.html"
    province_html = get_html(url)
    if province_html is None:
        print("province_html is None", url)
        return
    province_code = province_html.xpath('//tr[@class="provincetr"]/td/a/@href')
    province_name = province_html.xpath('//tr[@class="provincetr"]/td/a/text()')
    province = dict(zip([page_code(p) for p in province_code], province_name))

    for p_key, p_name in province.items():
        cities = fetch_link_rows(base_url + p_key + ".html", "citytr", "city_html")
        for c_code, c_name, c_href in cities:
            counties = fetch_link_rows(base_url + c_href, "countytr", "county_html")
            for d_code, d_name, d_href in counties:
                towns = fetch_link_rows(town_page_url(c_href, d_href), "towntr", "town_html")
                for t_code, t_name, t_href in towns:
                    for v_code, v_type_code, v_name in fetch_village_rows(village_page_url(t_href)):
                        yield build_division_doc(
                            p_key, p_name, c_code, c_name, d_code, d_name,
                            t_code, t_name, v_code, v_type_code, v_name,
                        )


def index_division_docs(es_client, docs, batch_size=10):
    """Send *docs* to the ``administrative_division`` index in bulk batches.

    The committee code is used as ``_id``.

    Returns:
        The number of documents sent.
    """
    actions = list()
    count = 0
    for info in docs:
        actions.append({
            "_op_type": "index",
            "_index": "administrative_division",
            "_id": info['committee_code'],
            "_source": info,
        })
        if len(actions) >= batch_size:
            helpers.bulk(es_client, actions)
            count += len(actions)
            print(count)
            actions.clear()
    if len(actions) > 0:
        helpers.bulk(es_client, actions)
        count += len(actions)
        print(count)
        actions.clear()
    return count


if __name__ == "__main__":
    try:
        index_division_docs(es_client, iter_division_docs())
    finally:
        # Release the connection even when the crawl is interrupted.
        es_client.close()
好了,每年更新一次,慢慢跑着吧。当然,我们没有考虑历史变更情况。欢迎关注公众号 算法小生,获取第一手资讯
欢迎关注公众号 算法小生