省市区乡镇围栏数据获取及存储ES实践

算法小生Đ

已于 2024-06-19 20:05:05 修改

阅读量318

点赞数 3

分类专栏：干货　文章标签：elasticsearch 大数据 搜索引擎

于 2024-06-02 12:10:37 首次发布

本文链接：https://blog.csdn.net/sjshenjian/article/details/139389485

版权

干货专栏收录该内容

22 篇文章 0 订阅

订阅专栏

一、省市区三级围栏数据获取

我们先来看下最终效果，我是在 Ubuntu 22 中安装的 ES 8.13

1. 免费获取数据

我们从 https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov 免费下载 ok_geo.csv.7z，该项目更新维护频率非常高
在这里插入图片描述

2. ES索引创建

# Create the index that stores province / city / district fence data.
# One primary shard; "location" holds a geo_point and "fence" holds the
# boundary polygon as a geo_shape. "ext_path" is the only analysed (text)
# field; the other string fields are keywords.
PUT /region_fence
{
  "settings": {
    "number_of_shards": 1
  },
  "mappings": {
    "properties": {
      "code": { "type": "keyword"},
      "p_code": { "type": "keyword"},
      "deep": { "type": "integer"},
      "name": { "type": "keyword"},
      "ext_path": { "type": "text"},
      "location": {
        "type": "geo_point"
      },
      "fence": {
        "type": "geo_shape"
      }
    }
  }
}

3. python处理CSV并写入ES

# 从csv文件解析省市区数据至ES中
import pandas as pd
from elasticsearch import helpers, Elasticsearch

# Load the region rows; the import script below reads the columns
# id, pid, deep, name, ext_path, geo and polygon from this frame.
region_fence_df = pd.read_csv(
    filepath_or_buffer="../data/region_fence.csv",
)


def init_es_client(es_host):
    """Build an Elasticsearch client that talks to a single host.

    :param es_host: URL of the node; passed as the only entry of ``hosts``.
    :return: the ``Elasticsearch`` client, created with ``verify_certs=False``.
    """
    # NOTE(review): verify_certs=False turns certificate checks off; confirm
    # this is intended before pointing the script at an https endpoint.
    return Elasticsearch(verify_certs=False, hosts=[es_host])


es_client = init_es_client("http://127.0.0.1:9200")

actions = list()
count = 0

# Number of documents sent to Elasticsearch per bulk request.
BULK_SIZE = 100


def _parse_ring(ring_text):
    """Parse "lng lat,lng lat,..." into a closed ring of [lng, lat] pairs.

    Every coordinate is rounded to 6 decimals. The first point is appended
    again only when the ring is not closed yet, so a ring that is already
    closed in the source data does not end with a duplicated vertex.

    :raises ValueError: a coordinate is not a number.
    :raises IndexError: a pair does not contain two space-separated values.
    """
    ring = []
    for pair in ring_text.split(","):
        parts = pair.split(" ")
        ring.append([round(float(parts[0]), 6), round(float(parts[1]), 6)])
    if ring[0] != ring[-1]:
        ring.append(list(ring[0]))
    return ring


def _flush_actions():
    """Bulk-index the pending actions, print the running total, empty the buffer."""
    global count
    helpers.bulk(es_client, actions)
    count += len(actions)
    print(count)
    actions.clear()


try:
    for index, item in region_fence_df.iterrows():
        geo_text = item["geo"]
        polygon_text = item["polygon"]
        # pandas reads an empty cell as NaN (a float), which has no .split().
        if not isinstance(geo_text, str) or not isinstance(polygon_text, str):
            print("skip row without geo/polygon:", item["id"])
            continue

        geo = geo_text.split(" ")
        if len(geo) != 2:
            # No usable centre point: the row is not indexed at all.
            continue

        try:
            location = [round(float(geo[0]), 6), round(float(geo[1]), 6)]
            # Some regions consist of several separate pieces (e.g. Tianjin),
            # separated by ";". Each piece becomes its own document, otherwise
            # Elasticsearch reports a self-intersecting polygon.
            rings = [_parse_ring(piece) for piece in polygon_text.split(";")]
        except (ValueError, IndexError) as exc:
            print("skip row with malformed coordinates:", item["id"], exc)
            continue

        info = {
            "code": item["id"],
            "p_code": item["pid"],
            "deep": item["deep"],
            "name": item["name"],
            "ext_path": item["ext_path"],
            "location": location,
        }
        for id_index, ring in enumerate(rings):
            info["fence"] = {"type": "Polygon", "coordinates": [ring]}
            # The first piece keeps the plain region code as its id; later
            # pieces get a "_<n>" suffix.
            my_id = str(info["code"]) + "_" + str(id_index) if id_index > 0 else str(info["code"])
            actions.append({
                "_op_type": "index",
                "_index": "region_fence",
                "_id": my_id,
                # Copy so the next piece's "fence" does not overwrite this one.
                "_source": info.copy(),
            })
            if len(actions) >= BULK_SIZE:
                _flush_actions()

    if actions:
        _flush_actions()
finally:
    # Release the connection even when parsing or indexing fails.
    es_client.close()

二、乡镇围栏解析存储

在这里插入图片描述

空白处是ES渲染的问题，放大后可以看到数据几乎没有丢失
在这里插入图片描述

1. ES索引建立

分片数设置为1个
各个字段设置为keyword类型，不分词
围栏设置为geo_shape类型，存储多边形信息

# Create the index that stores township fence data.
# One primary shard; the four administrative names are keywords (not
# analysed) and "fence" holds the boundary polygon as a geo_shape.
PUT /town_fence
{
  "settings": {
    "number_of_shards": 1
  },
  "mappings": {
    "properties": {
      "province": {
        "type": "keyword"
      },
      "city": {
        "type": "keyword"
      },
      "district": {
        "type": "keyword"
      },
      "town": {
        "type": "keyword"
      },
      "fence" : {
        "type": "geo_shape"
      }
    }
  }
}

2. Python文件解析入ES

geo_shape类型存储时设置多边形存储：{"type": "Polygon", "coordinates": coordinates_parent}，其中多边形需要闭合，不可有空洞；使用copy是为了避免同一个对象被后续循环修改、导致已加入列表的数据被覆盖

# Parse township fence data from a CSV file ('@'-separated) and load it into ES
import pandas as pd
from elasticsearch import helpers, Elasticsearch

# Load the township rows; the file uses '@' as the field separator. The
# import script below reads the columns province, city, region, town and
# polyline from this frame.
town_fence_df = pd.read_csv(
    filepath_or_buffer="../data/town.csv",
    encoding='UTF-8',
    sep='@',
)


def init_es_client(es_host):
    """Return an Elasticsearch client connected to ``es_host``.

    :param es_host: URL of the node; used as the single element of ``hosts``.
    :return: an ``Elasticsearch`` instance created with ``verify_certs=False``.
    """
    client_options = {"hosts": [es_host], "verify_certs": False}
    return Elasticsearch(**client_options)


es_client = init_es_client("http://127.0.0.1:9200")

actions = list()
count = 0

# One document per bulk request, so a geometry rejected by Elasticsearch
# fails on its own instead of taking a whole batch down with it.
BULK_SIZE = 1

# Create the column up front: without it, selecting the failed rows at the
# end raises KeyError when no row failed.
town_fence_df['flag'] = True


def _stable_id(*parts):
    """Return a document id that is identical on every run for the same names.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    ids derived from it change between runs and a re-import creates duplicates
    instead of overwriting. A SHA-256 digest of the joined names is stable.

    :raises TypeError: one of the parts is not a string (e.g. NaN for an
        empty CSV cell).
    """
    import hashlib

    # Join with a separator that cannot appear in the names, so that
    # ("ab", "c") and ("a", "bc") do not collide.
    key = "\x1f".join(parts)
    return hashlib.sha256(key.encode("utf-8")).hexdigest()


def _parse_town_ring(ring_text):
    """Parse "lng,lat;lng,lat;..." into a closed ring of [lng, lat] pairs.

    Coordinates are rounded to 6 decimals. The ring is closed by appending the
    first point when first and last differ in longitude OR latitude.

    :raises ValueError: a coordinate is not a number.
    :raises IndexError: a pair does not contain two comma-separated values.
    """
    ring = []
    for pair in ring_text.split(";"):
        parts = pair.split(",")
        ring.append([round(float(parts[0]), 6), round(float(parts[1]), 6)])
    if ring[0] != ring[-1]:
        ring.append(list(ring[0]))
    return ring


def _flush_actions():
    """Bulk-index the pending actions, print the running total, empty the buffer."""
    global count
    helpers.bulk(es_client, actions)
    count += len(actions)
    print(count)
    actions.clear()


try:
    for index, item in town_fence_df.iterrows():
        info = {
            "province": item["province"],
            "city": item["city"],
            "district": item["region"],
            "town": item["town"],
        }
        try:
            unique_id = _stable_id(info["province"], info["city"], info["district"], info["town"])
            # A township may consist of several separate pieces, separated by
            # "|". Each piece becomes its own document, otherwise Elasticsearch
            # reports a self-intersecting polygon.
            for id_index, polygon in enumerate(item["polyline"].split("|")):
                info["fence"] = {"type": "Polygon", "coordinates": [_parse_town_ring(polygon)]}
                my_id = unique_id + "_" + str(id_index) if id_index > 0 else unique_id
                actions.append({
                    "_op_type": "index",
                    "_index": "town_fence",
                    "_id": my_id,
                    # Copy so the next piece's "fence" does not overwrite this one.
                    "_source": info.copy(),
                })
                if len(actions) >= BULK_SIZE:
                    try:
                        _flush_actions()
                    except Exception as exc:
                        # Best effort: remember the row, report why, carry on
                        # with the next piece.
                        print("index failed:", my_id, exc)
                        town_fence_df.loc[index, 'flag'] = False
                        actions.clear()
        except (AttributeError, IndexError, TypeError, ValueError) as exc:
            # Missing (NaN) cells or malformed coordinates in this row.
            print("parse failed for row", index, exc)
            town_fence_df.loc[index, 'flag'] = False
            actions.clear()

    if actions:
        _flush_actions()

    # Export every row that could not be parsed or indexed for later inspection.
    town_fence_df[~town_fence_df['flag']].to_csv('../data/town_errors.csv', sep='@', encoding='UTF-8', index=False)
finally:
    # Release the connection even when the import aborts.
    es_client.close()

3. 数据查询验证

在这里插入图片描述

三、国家统计局行政区划获取及入库ES实践

我们先看下最终效果：
在这里插入图片描述

1. ES索引新建

# Create the index for the administrative divisions published by the
# National Bureau of Statistics: one name/code keyword pair per level
# (province, city, district, town, committee) plus "type_code".
# One primary shard and no replicas.
PUT administrative_division
{
  "mappings": {
    "properties": {
      "province": {
        "type": "keyword"
      },
      "province_code": {
        "type": "keyword"
      },
      "city": {
        "type": "keyword"
      },
      "city_code": {
        "type": "keyword"
      },
      "district": {
        "type": "keyword"
      },
      "district_code": {
        "type": "keyword"
      },
      "town": {
        "type": "keyword"
      },
      "town_code": {
        "type": "keyword"
      },
      "committee": {
        "type": "keyword"
      },
      "committee_code": {
        "type": "keyword"
      },
      "type_code": {
        "type": "keyword"
      }
    }
  },
  "settings": {
    "number_of_replicas": 0,
    "number_of_shards": 1
  }
}

2. 代码编写

此处代码参考了网上大神的实现（个人认为较为简洁），直接拿来使用，只修改了存储部分

from lxml import etree
import requests
import time
import random

"""
国家统计局行政区划获取
"""
from elasticsearch import helpers, Elasticsearch


def init_es_client(es_host):
    """Open an Elasticsearch client for one node.

    :param es_host: URL of the node; wrapped in a one-element ``hosts`` list.
    :return: the ``Elasticsearch`` client (created with ``verify_certs=False``).
    """
    hosts = [es_host]
    return Elasticsearch(hosts=hosts, verify_certs=False)


# Client for the local node, the buffer of pending bulk actions and the
# running total of indexed documents.
es_client = init_es_client('http://127.0.0.1:9200')

actions, count = [], 0


def get_html(url, timeout=10):
    """Download ``url`` and return it as a parsed lxml HTML tree.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up; without
        a timeout ``requests.get`` can block indefinitely.
    :return: the root element produced by ``etree.HTML``, or ``None`` when the
        request fails, the server answers with an error status, or the body
        cannot be parsed. Callers in this script already test for ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Do not parse an error page as if it were a listing.
        response.raise_for_status()
    except requests.RequestException as exc:
        print("request failed", url, exc)
        return None

    # The pages are decoded as UTF-8 regardless of what the headers claim.
    response.encoding = "utf8"
    try:
        return etree.HTML(response.text)
    except etree.LxmlError as exc:
        print("parse failed", url, exc)
        return None


# Crawl the administrative-division pages level by level
# (province -> city -> county -> town -> village committee) and index one
# document per committee into "administrative_division".
base_url = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"
url = base_url + "index.html"
province_html = get_html(url)
if province_html is None:
    raise RuntimeError("failed to load province index: " + url)
province_list = province_html.xpath('//tr[@class="provincetr"]/td')
province_code = province_list[0].xpath('//td/a/@href')
province_name = province_list[0].xpath('//td/a/text()')
# Maps the two-digit province prefix (href without ".html") to the province name.
province = dict(zip([p.split(".")[0] for p in province_code], province_name))

# Number of documents sent to Elasticsearch per bulk request.
BULK_SIZE = 10

try:
    for p_key, p_name in province.items():
        url_city = base_url + p_key + ".html"
        # Random pause between requests.
        time.sleep(random.randint(0, 3))
        city_html = get_html(url_city)
        if city_html is None:
            print("city_html is None", url_city)
            continue
        city_code = city_html.xpath('//tr[@class="citytr"]/td[1]/a/text()')
        city_name = city_html.xpath('//tr[@class="citytr"]/td[2]/a/text()')
        city_url = city_html.xpath('//tr[@class="citytr"]/td[1]/a/@href')
        for c_num, city_href in enumerate(city_url):
            county_page_url = base_url + city_href
            time.sleep(random.randint(0, 3))
            county_html = get_html(county_page_url)
            if county_html is None:
                print("county_html is None", county_page_url)
                continue
            county_code = county_html.xpath('//tr[@class="countytr"]/td[1]/a/text()')
            county_name = county_html.xpath('//tr[@class="countytr"]/td[2]/a/text()')
            county_url = county_html.xpath('//tr[@class="countytr"]/td[1]/a/@href')
            for t_num, county_href in enumerate(county_url):
                # County links are joined onto the city's directory (the first
                # path segment of the city link). base_url already ends with
                # "/", so no extra slash is inserted.
                town_page_url = base_url + city_href.split('/')[0] + "/" + county_href
                time.sleep(random.randint(0, 3))
                town_html = get_html(town_page_url)
                if town_html is None:
                    print("town_html is None", town_page_url)
                    continue
                town_code = town_html.xpath('//tr[@class="towntr"]/td[1]/a/text()')
                town_name = town_html.xpath('//tr[@class="towntr"]/td[2]/a/text()')
                town_url = town_html.xpath('//tr[@class="towntr"]/td[1]/a/@href')
                for v_num, town_href in enumerate(town_url):
                    # File name without extension. str.rstrip(".html") would
                    # strip a set of characters rather than the suffix.
                    code_ = town_href.split("/")[1].split(".")[0]
                    village_url = base_url + code_[0:2] + "/" + code_[2:4] + "/" + town_href
                    time.sleep(random.randint(0, 3))
                    village_html = get_html(village_url)
                    if village_html is None:
                        print("village_html is None", village_url)
                        continue
                    # Committee code
                    village_code = village_html.xpath('//tr[@class="villagetr"]/td[1]/text()')
                    # Committee urban/rural classification code
                    village_type_code = village_html.xpath('//tr[@class="villagetr"]/td[2]/text()')
                    # Committee name
                    village_name = village_html.xpath('//tr[@class="villagetr"]/td[3]/text()')
                    for v_code, type_code, v_name in zip(village_code, village_type_code, village_name):
                        info = {
                            # Name goes into "province", the prefix padded to
                            # 12 digits goes into "province_code".
                            'province': p_name,
                            'province_code': str(p_key).ljust(12, '0'),
                            'city_code': city_code[c_num],
                            'city': city_name[c_num],
                            'district_code': county_code[t_num],
                            'district': county_name[t_num],
                            'town_code': town_code[v_num],
                            'town': town_name[v_num],
                            'type_code': type_code,
                            'committee_code': v_code,
                            'committee': v_name,
                        }
                        actions.append({
                            "_op_type": "index",
                            "_index": "administrative_division",
                            # The committee code doubles as the document id.
                            "_id": v_code,
                            "_source": info,
                        })
                        if len(actions) >= BULK_SIZE:
                            helpers.bulk(es_client, actions)
                            count += len(actions)
                            print(count)
                            actions.clear()

    # Send whatever is left in the buffer.
    if actions:
        helpers.bulk(es_client, actions)
        count += len(actions)
        print(count)
        actions.clear()
finally:
    # Release the connection even when the crawl aborts.
    es_client.close()