init.py
password = ""
database1 = "old"
database2 = "new"
out_file = "./数据库差异.xlsx"
main.py
import pandas as pd
from data_privode import *
from data_process import *
import time
s = time.time()
data_diff = pd.DataFrame(columns=["id","message","add", "sub","accuracy","recall","flag"])
total_lens = get_resume_num()
current_message_id = 1
for i in range(total_lens):
current_message = resume_old_data[current_message_id]
if current_message[2]==0:
current_message_id += 1
continue
add_data = add_process(current_message_id)
sub_data = sub_process(current_message_id)
one_ar = one_accuracy_recall(current_message_id)
data_diff = data_diff.append(
{"id":current_message_id,"message":current_message[1],"add":str(add_data), "sub":str(sub_data),"accuracy":one_ar[0],"recall":one_ar[1],"flag":current_message[2]},
ignore_index=True)
current_message_id+=1
all_ar = all_accuracy_recall()
data_diff = data_diff.append(
{"id":"全部","message":"准确率为:{}".format(all_ar[0]) +" "+ "召回率为:{}".format(all_ar[1])},ignore_index=True)
writer = pd.ExcelWriter(out_file)
data_diff.to_excel(writer,'Sheet1',index=False)
writer.save()
f = time.time()
print(f-s)
data_provider.py
import json
import pymysql
from init import *
def change_database(database_name):
db = pymysql.connect(
host="localhost",
user="root",
password = "".format(password),
db="{}".format(database_name),
port=3306)
return db
"""获取数据库数据"""
def get_resume_message():
db = change_database(database2)
cur = db.cursor()
sql = "select id, content,flag from resume "
try:
cur.execute(sql )
results = cur.fetchall()
for index, data in enumerate(results):
resume_old_data[index+1] = data
except Exception as e:
raise e
def get_total_tag():
db = change_database(database1)
cur = db.cursor()
sql = "select tag from tag"
try:
cur.execute(sql)
results = cur.fetchall()
total_tag_dict = {}
for num,i in enumerate(results):
total_tag_dict[num+1] = i[0]
return total_tag_dict
except Exception as e:
raise e
def get_old_tag_span():
db = change_database(database1)
tag_data_dict = tag_old_data
cur = db.cursor()
have_data = {}
for i in range(len(resume_old_data)):
sql = "select resume_id, tag_id, details from relation where id=%d"
try:
cur.execute(sql % (i+1))
one_have_data = cur.fetchall()
have_data[i] = one_have_data
except Exception as e:
have_data[i] = -1
raise e
for i in range(len(resume_old_data)):
res1_tag_span_str_dict = {}
res2_tag_list = []
results = have_data[i]
if results == -1:
relation_old_data[i] = [{},()]
continue
for index,i1 in enumerate(results):
message_data = resume_old_data[i + 1][1]
span_list_str = []
span_list = i1[2].split("/")
for num, i2 in enumerate(span_list):
if i2 != "":
span_list[num] = i2.split("-")
else:
span_list.pop(num)
for i3 in span_list:
start = int(i3[0])
finish = int(i3[1])
span_list_str.append(message_data[start:finish])
res2_tag_list.append(tag_data_dict[i1[1]])
res1_tag_span_str_dict[tag_data_dict[i1[1]]] = span_list_str
relation_old_data[i+1] = [res1_tag_span_str_dict,res2_tag_list]
def get_new_tag_span():
db = change_database(database2)
tag_data_dict = tag_old_data
cur = db.cursor()
have_data = {}
for i in range(len(resume_old_data)):
sql = "select resume_id, tag_id, details from relation where id=%d"
try:
cur.execute(sql % (i + 1))
one_have_data = cur.fetchall()
have_data[i] = one_have_data
except Exception as e:
have_data[i] = -1
raise e
for i in range(len(resume_old_data)):
res1_tag_span_str_dict = {}
res2_tag_list = []
results = have_data[i]
if results == -1:
relation_old_data[i] = [{}, ()]
continue
for index, i1 in enumerate(results):
message_data = resume_old_data[index + 1][1]
span_list_str = []
span_list = i1[2].split("/")
for num, i2 in enumerate(span_list):
if i2 != "":
span_list[num] = i2.split("-")
else:
span_list.pop(num)
for i3 in span_list:
start = int(i3[0])
finish = int(i3[1])
span_list_str.append(message_data[start:finish])
res2_tag_list.append(tag_data_dict[i1[1]])
res1_tag_span_str_dict[tag_data_dict[i1[1]]] = span_list_str
relation_new_data[i + 1] = [res1_tag_span_str_dict, res2_tag_list]
old_cursor = change_database(database1)
resume_old_data = {}
get_resume_message()
tag_old_data = get_total_tag()
relation_old_data = {}
get_old_tag_span()
relation_new_data = {}
get_new_tag_span()
"""获取各表的数量--因为保持俩数据库数据数量一致故,只取旧数据库数量即可"""
def get_resume_num():
return len(resume_old_data)
def get_tag_num():
return len(tag_old_data)
def get_relation_num_old():
return len(relation_old_data)
def get_relation_num_new():
return len(relation_new_data)
if __name__ == '__main__':
get_old_tag_span()
data_process.py
from data_privode import *
from init import *
accuracy_list = []
recall_list = []
total_lens = get_resume_num()
def sub_process(message_id):
old_data = relation_old_data[message_id]
new_data = relation_new_data[message_id]
sub_dict = {}
for old_tag_ in list(old_data[0].keys()):
if old_tag_ not in list(new_data[0].keys()):
sub_dict[old_tag_] = old_data[0][old_tag_]
return sub_dict
def add_process(message_id):
old_data = relation_old_data[message_id]
new_data = relation_new_data[message_id]
add_dict = {}
for new_tag_ in list(new_data[0].keys()):
if new_tag_ not in list(old_data[0].keys()):
add_dict[new_tag_] = new_data[0][new_tag_]
return add_dict
def one_accuracy_recall(message_id):
old_tag_list = relation_old_data[message_id][1]
new_tag_list = relation_new_data[message_id][1]
same_count = 0
if len(old_tag_list) == 0 or len(new_tag_list) == 0:
return 0,0
else:
for i in new_tag_list:
if i in old_tag_list:
same_count += 1
one_accuracy = same_count/len(old_tag_list)
one_recall = same_count/len(new_tag_list)
return one_accuracy,one_recall
def all_accuracy_recall():
sum_accuracy_list = []
sum_recall_list = []
for i in range(total_lens):
accuracy,recall = one_accuracy_recall(i+1)
sum_accuracy_list.append(accuracy)
sum_recall_list.append(recall)
return sum(sum_accuracy_list)/len(sum_accuracy_list),sum(sum_recall_list)/len(sum_recall_list)
if __name__ == '__main__':
print(one_accuracy_recall(1))