Python构造树结构应用到城市层级编码
应用背景
在实际的场景应用中,经常需要对一些数据做层级编码,这时候需要使用树形结构来支持这一实现。这里给出一个典型的应用例子:对全国城市做层级编码。例如全国下面有34个省级行政区,以湖北省为例,湖北省下面有15个省辖市或者州,每个市下面又有若干个区或者县。这种非常常见的层级结构可以用一棵树来表示,这里使用Python来实现树的结构。
数据样本
定义城市样本文件数据(这里只是部分)如下,每行一条记录,每条记录中不同级别用逗号分隔
上海市,市辖区,徐汇区
上海市,市辖区,杨浦区
上海市,市辖区,松江区
上海市,市辖区,长宁区
上海市,市辖区,闵行区
云南省,临沧市,临翔区
云南省,临沧市,云县
云南省,临沧市,凤庆县
云南省,临沧市,双江拉祜族佤族布朗族傣族自治县
云南省,临沧市,永德县
云南省,临沧市,沧源佤族自治县
云南省,临沧市,耿马傣族佤族自治县
云南省,临沧市,镇康县
云南省,丽江市,华坪县
云南省,丽江市,古城区
云南省,丽江市,宁蒗彝族自治县
云南省,丽江市,永胜县
云南省,丽江市,玉龙纳西族自治县
云南省,保山市,施甸县
云南省,保山市,昌宁县
云南省,保山市,腾冲县
云南省,保山市,隆阳区
云南省,保山市,龙陵县
树的结构
我们定义树的节点,包含以下几个元素
名称 | 类型 | 描述 | 示例 |
---|---|---|---|
name | String | 名称,省名称、城市名称、区名称 | “云南省” |
index | String | 节点编号值的路径,从根节点到此节点的路径 | “0|11|1101|110101” |
value | String | 节点编号的值 | “110101” |
children | array | 节点的孩子节点 | [“110101”,“110102”,“110103”] |
编码规则
我们定义一个编码规则:首先,对待编码的文件按照文本进行排序,这样每个省的数据都集中在一起而且都是排好序的;然后设置“全国”的value为“0”,一级目录(省)的value从“10”开始依次递增;二级目录(市)的value等于所属一级目录的value * 100 再加上该节点在同级中的序号(从0开始),例如value为“11”的省下面的市依次为“1100”、“1101”、“1102”;三级、四级依次类推。
树节点代码
class Node(object):
    """A node in a tree of named, hierarchically coded entries.

    Attributes:
        name: Display name of the node.
        index: Codes on the path from the root down to this node,
            joined with "|".
        value: Code of this node, stored as a string.
        children: Child nodes, in the order they were added.
    """

    def __init__(self, name, value="0", index="0"):
        self.name = name
        self.index = index
        self.value = value
        self.children = []

    def search(self, node):
        """Find a node in this subtree by name.

        The subtree is walked depth-first, parents before children, and
        only ``name`` is compared.

        Args:
            node: Any object with a ``name`` attribute to look for.

        Returns:
            The first node whose name equals ``node.name``, or None when
            no node in this subtree matches.
        """
        if self.name == node.name:
            return self
        for child in self.children:
            found = child.search(node)
            if found is not None:
                return found
        return None

    def add_child(self, node):
        """Append ``node`` to this node's children."""
        self.children.append(node)

    def set_value(self, value):
        """Set this node's code."""
        self.value = value

    def set_index(self, index):
        """Set this node's root-to-node code path."""
        self.index = index

    def to_json(self):
        """Convert this subtree into nested JSON-serializable dicts.

        Returns:
            A dict with the keys "value", "name", "index" and "children",
            where "children" is a list of dicts of the same shape.
        """
        res = {}
        res['value'] = self.value
        res['name'] = self.name
        res['index'] = self.index
        res['children'] = [child.to_json() for child in self.children]
        return res

    def index_display(self, index):
        """Print the name of every node in this subtree whose index equals ``index``.

        The walk continues into the children even after a match, so every
        matching node is printed.
        """
        if self.index == index:
            print(self.name)
        for child in self.children:
            child.index_display(index)
树的创建过程
def create_tree_from_local_file(path="your file path"):
    """Build the coded hierarchy tree from a comma-separated text file.

    Each line of the file is one record whose fields are the names on the
    path from the top level downwards (for example province, city,
    district). Top-level nodes are numbered from 10 in order of first
    appearance. Every deeper node gets ``parent value * 100`` plus its
    position among its siblings (starting at 0).

    The resulting tree is printed via ``to_json()`` and also returned.

    Args:
        path: Location of the UTF-8 encoded input file. Replace the
            placeholder default with a real path.

    Returns:
        The root node of the tree that was built.
    """
    first_no = 10  # Next code to hand out to a top-level node.
    root_node = Node("全国", index="0")  # Root of the whole hierarchy.

    with open(path, encoding="utf-8") as source:
        for line in source:
            record = line.strip("\r\n")
            if not record:
                # A blank line would otherwise create a node with an empty name.
                continue

            cur_node = root_node
            for depth, name in enumerate(record.split(",")):
                # Look only among the direct children of the current node.
                # A tree-wide search by name would return the wrong node
                # when the same name occurs under different parents.
                child = next(
                    (c for c in cur_node.children if c.name == name), None
                )
                if child is None:
                    child = Node(name)
                    if depth == 0:
                        child.set_value(str(first_no))
                        first_no = first_no + 1
                    else:
                        # NOTE: only two digits are reserved per level, so
                        # codes collide once a node has more than 100 children.
                        child.set_value(
                            str(len(cur_node.children) + 100 * int(cur_node.value))
                        )
                    child.set_index(cur_node.index + "|" + child.value)
                    cur_node.add_child(child)
                cur_node = child

    print(root_node.to_json())
    return root_node
运行结果
{
"value": "0",
"name": "全国",
"index": "0",
"children": [
{
"value": "10",
"name": "上海市",
"index": "0|10",
"children": [
{
"value": "1000",
"name": "市辖区",
"index": "0|10|1000",
"children": [
{
"value": "100000",
"name": "徐汇区",
"index": "0|10|1000|100000",
"children": [ ]
},
{
"value": "100001",
"name": "杨浦区",
"index": "0|10|1000|100001",
"children": [ ]
},
{
"value": "100002",
"name": "松江区",
"index": "0|10|1000|100002",
"children": [ ]
},
{
"value": "100003",
"name": "长宁区",
"index": "0|10|1000|100003",
"children": [ ]
},
{
"value": "100004",
"name": "闵行区",
"index": "0|10|1000|100004",
"children": [ ]
}
]
}
]
},
{
"value": "11",
"name": "云南省",
"index": "0|11",
"children": [
{
"value": "1100",
"name": "临沧市",
"index": "0|11|1100",
"children": [
{
"value": "110000",
"name": "临翔区",
"index": "0|11|1100|110000",
"children": [ ]
},
{
"value": "110001",
"name": "云县",
"index": "0|11|1100|110001",
"children": [ ]
},
{
"value": "110002",
"name": "凤庆县",
"index": "0|11|1100|110002",
"children": [ ]
},
{
"value": "110003",
"name": "双江拉祜族佤族布朗族傣族自治县",
"index": "0|11|1100|110003",
"children": [ ]
},
{
"value": "110004",
"name": "永德县",
"index": "0|11|1100|110004",
"children": [ ]
},
{
"value": "110005",
"name": "沧源佤族自治县",
"index": "0|11|1100|110005",
"children": [ ]
},
{
"value": "110006",
"name": "耿马傣族佤族自治县",
"index": "0|11|1100|110006",
"children": [ ]
},
{
"value": "110007",
"name": "镇康县",
"index": "0|11|1100|110007",
"children": [ ]
}
]
},
{
"value": "1101",
"name": "丽江市",
"index": "0|11|1101",
"children": [
{
"value": "110100",
"name": "华坪县",
"index": "0|11|1101|110100",
"children": [ ]
},
{
"value": "110101",
"name": "古城区",
"index": "0|11|1101|110101",
"children": [ ]
},
{
"value": "110102",
"name": "宁蒗彝族自治县",
"index": "0|11|1101|110102",
"children": [ ]
},
{
"value": "110103",
"name": "永胜县",
"index": "0|11|1101|110103",
"children": [ ]
},
{
"value": "110104",
"name": "玉龙纳西族自治县",
"index": "0|11|1101|110104",
"children": [ ]
}
]
},
{
"value": "1102",
"name": "保山市",
"index": "0|11|1102",
"children": [
{
"value": "110200",
"name": "施甸县",
"index": "0|11|1102|110200",
"children": [ ]
},
{
"value": "110201",
"name": "昌宁县",
"index": "0|11|1102|110201",
"children": [ ]
},
{
"value": "110202",
"name": "腾冲县",
"index": "0|11|1102|110202",
"children": [ ]
},
{
"value": "110203",
"name": "隆阳区",
"index": "0|11|1102|110203",
"children": [ ]
},
{
"value": "110204",
"name": "龙陵县",
"index": "0|11|1102|110204",
"children": [ ]
}
]
}
]
}
]
}