import json
import pandas as pd
import numpy as np
import requests
from lxml import etree
def resolveJson(path):
    """Flatten a three-level area JSON file into a DataFrame of leaf entries.

    The file at ``path`` must contain a JSON array. Every top-level item
    needs a ``name`` and a ``children`` list, every child needs its own
    ``children`` list, and every grandchild needs ``name`` and ``code``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A DataFrame with object-dtype columns ``first_name`` (name of the
        top-level item), ``name`` and ``pcode1`` (the grandchild's name and
        code), one row per grandchild in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item lacks one of the keys listed above.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version left the file open for the life of the process.
    with open(path, "rb") as file:
        fileJson = json.load(file)

    data_list = []
    for top_item in fileJson:
        first_name = top_item['name']
        for middle_item in top_item['children']:
            # Diagnostic output kept from the original script.
            print(middle_item)
            for leaf in middle_item['children']:
                data_list.append((first_name, leaf['name'], leaf['code']))

    end_data = pd.DataFrame(
        data_list,
        columns=["first_name", "name", "pcode1"],
        dtype=object,
    )
    print(end_data)
    return end_data
def mca_data():
    """Fetch the 2020 MCA division table and tag each row with its group's leading name.

    Downloads the page, reads every ``<tr height="19">`` row, takes the code
    from the second cell (with ``000000`` appended) and the name from the
    third cell (falling back to a nested ``<span>``). Rows are then grouped
    by the first two characters of the code and ordered by code; the name of
    the first row in each group is stored as ``first_name`` on every row of
    that group (presumably the province name -- confirm against the page).

    Returns:
        A DataFrame with object-dtype columns ``first_name``, ``name`` and
        ``pcode2``, ordered by code prefix and then by code. Rows whose code
        cell has no text keep the placeholder code ``"0"``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = 'http://www.mca.gov.cn/article/sj/xzqh/2020/20201201.html'
    # The cache validators (If-Modified-Since / If-None-Match) that the
    # original copied from a browser session are deliberately not sent: they
    # let the server answer "304 Not Modified" with an empty body, and there
    # is no local cache here to fall back on.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mca.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    }
    # A timeout keeps the script from hanging forever on a stalled server,
    # and the status check stops an error page from being parsed as data.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'

    html = etree.HTML(response.text)
    # NOTE(review): guard in case the parser yields no tree for an empty
    # document -- confirm the behaviour of the installed lxml version.
    rows = html.xpath('//tr[@height="19"]') if html is not None else []

    records = []
    for row in rows:
        code_nodes = row.xpath('./td[2]/text()')
        # "0" is the same placeholder the original produced (0 cast to str).
        pcode = code_nodes[0].strip() + '000000' if code_nodes else '0'

        name_nodes = row.xpath('./td[3]/text()') or row.xpath('./td[3]/span/text()')
        if not name_nodes:
            # Without a name the row cannot be matched on anything; the
            # original raised an uncaught IndexError here.
            continue
        records.append((name_nodes[0].strip(), pcode))

    columns = ["first_name", "name", "pcode2"]
    if not records:
        df1 = pd.DataFrame([], columns=columns, dtype=object)
        print(df1)
        return df1

    table = pd.DataFrame(records, columns=["name", "pcode2"], dtype=object)
    table['pcode3'] = table['pcode2'].str[:2]
    # Sorting by prefix and then by code gives the same row order as walking
    # the groups in key order and sorting each one by code.
    table = table.sort_values(by=['pcode3', 'pcode2']).reset_index(drop=True)
    table['first_name'] = table.groupby('pcode3')['name'].transform('first')

    df1 = table[columns].astype(object)
    print(df1)
    return df1
def function(a, b):
    """Return 1 when ``a == b`` holds, otherwise 0."""
    return 1 if a == b else 0
if __name__ == '__main__':
    # Cross-check the codes in the local JSON file against the codes scraped
    # by mca_data(), then print the full comparison and the rows that differ.
    path = r"C:\Users\PC\Desktop\area.json"
    local_frame = resolveJson(path)
    remote_frame = mca_data()

    # how="left" keeps every local row; a row with no counterpart in the
    # scraped table ends up with a missing pcode2.
    merged = pd.merge(
        local_frame,
        remote_frame,
        how="left",
        on=["name", "first_name"],
    )

    # Both code columns are cast to float before they are compared.
    for code_column in ('pcode1', 'pcode2'):
        merged[code_column] = merged[code_column].astype(float)

    def _codes_agree(row):
        # 1 when the two codes of this row are equal, else 0.
        return function(row['pcode1'], row['pcode2'])

    merged['check'] = merged.apply(_codes_agree, axis=1)
    print(merged)

    mismatches = merged[merged['check'] == 0]
    print(mismatches)
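# Article title: comparing data from different sources with pandas and generating the result.
# Source-page footer: latest recommended article published 2024-03-24 11:40:38.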