数据挖掘常用码表数据清洗整合积累,如全新全国各地机动车牌号归属地数据集合
1. 碎片代码
import re
import pandas as pd
from pypinyin import lazy_pinyin, Style
pd.set_option('display.width', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 1000)
# 1 JSON 格式数据处理
jsons_ss = [
{
"code": "冀A",
"city": "石家庄",
"province": "河北",
"Pcode": "HB"
}
...
]
jsons_ss_df = pd.DataFrame( jsons_ss )
jsons_ss_df2 = jsons_ss_df[['code', 'city']]
# 2 网页数据处理
from bs4 import BeautifulSoup
# HTML代码片段
html_fragment = '''
'''
soup = BeautifulSoup(html_fragment, 'html.parser')
td_tags = soup.find_all('td')
cont_lst = []
for td in td_tags:
txt_cont = td.get_text(strip=True)
if '牌号' == txt_cont or '地区'== txt_cont:
continue
cont_lst.append( txt_cont )
new_lst = [cont_lst[i:i+2] for i in range(0, len(cont_lst), 2)]
cols = ['code', 'city']
new_lst_df = pd.DataFrame( new_lst, columns=cols, )
new_lst_df['city'] = new_lst_df.city.apply( lambda city : city.replace('市', ''))
# 3 数据合并去重
df3 = pd.read_excel(r'..\车辆归属地(全国).xlsx')
df31 = df3[['车牌开头', '车辆归属地']]
df31.columns = cols
df = pd.concat( [new_lst_df, jsons_ss_df2, df31] )
df.duplicated()
df.drop_duplicates(inplace=True)
# 4 省份回填
dics = {
"京": "北京市",
"津": "天津市",
"沪": "上海市",
"渝": "重庆市",
"冀": "河北省",
"吉": "吉林省",
"辽": "辽宁省",
"黑": "黑龙江省",
"湘": "湖南省",
"鄂": "湖北省",
"甘": "甘肃省",
"晋": "山西省",
"陕": "陕西省",
"豫": "河南省",
"川": "四川省",
"云": "云南省",
"桂": "广西壮族自治区",
"蒙": "内蒙古自治区",
"贵": "贵州省",
"青": "青海省",
"藏": "西藏",
"新": "新疆维吾尔自治区",
"宁": "宁夏回族自治区",
"粤": "广东省",
"琼": "海南省",
"闽": "福建省",
"苏": "江苏省",
"浙": "浙江省",
"赣": "江西省",
"鲁": "山东省",
"皖": "安徽省"
}
df['province'] = df.code.apply( lambda code: dics[code[0]] if code[0] in dics.keys() else '')
df.to_excel(r'..\全国各地机动车牌号归属地.xlsx', index=None)
2.参考网站
- 车牌归属查询
- 数据获取、问题咨询等可添加 xsimah 微备注