init.py
password = ""
database1 = "old"
database2 = "new"
out_file = "./数据库差异.xlsx"
main.py
import pandas as pd
from data_privode import *
from data_process import *
import time
s = time.time()
data_diff = pd.DataFrame(columns=["id","message","add", "sub","accuracy","recall","flag"])
total_lens = get_resume_num()
current_message_id = 1
for i in range(total_lens):
current_message = get_cv_message(current_message_id)
if current_message[2]==0:
current_message_id += 1
continue
add_data = add_process(current_message_id)
sub_data = sub_process(current_message_id)
one_ar = one_accuracy_recall(current_message_id)
data_diff = data_diff.append(
{"id":current_message_id,"message":current_message[1],"add":str(add_data), "sub":str(sub_data),"accuracy":one_ar[0],"recall":one_ar[1],"flag":current_message[2]},
ignore_index=True)
current_message_id+=1
all_ar = all_accuracy_recall()
data_diff = data_diff.append(
{"id":"全部","message":"准确率为:{}".format(all_ar[0]) +" "+ "召回率为:{}".format(all_ar[1])},ignore_index=True)
writer = pd.ExcelWriter(out_file)
data_diff.to_excel(writer,'Sheet1',index=False)
writer.save()
f = time.time()
print(f-s)
data_provider.py
import json
import pymysql
from init import *
def change_database(database_name):
db = pymysql.connect(
host="localhost",
user="root",
password = "".format(password),
db="{}".format(database_name),
port=3306)
return db
"""获取各表的数量--因为保持俩数据库数据数量一致故,只取旧数据库数量即可"""
def get_resume_num():
db = change_database(database1)
cur = db.cursor()
sql = "select id, content from resume"
try:
cur.execute(sql )
results = cur.fetchall()
return len(results)
except Exception as e:
raise e
def get_tag_num():
db = change_database(database1)
cur = db.cursor()
sql = "select tag from tag"
try:
cur.execute(sql )
results = cur.fetchall()
return len(results)
except Exception as e:
raise e
def get_relation_num():
db = change_database(database1)
cur = db.cursor()
sql = "select resume_id from relation"
try:
cur.execute(sql )
results = cur.fetchall()
return len(results)
except Exception as e:
raise e
def get_relation_num2():
db = change_database(database2)
cur = db.cursor()
sql = "select resume_id from relation"
try:
cur.execute(sql )
results = cur.fetchall()
return len(results)
except Exception as e:
raise e
"""获取数据库数据"""
def get_cv_message(message_id):
db = change_database(database2)
cur = db.cursor()
sql = "select id, content,flag from resume where id=%d"
try:
cur.execute(sql % message_id)
results = cur.fetchall()
return results[0]
except Exception as e:
raise e
def get_total_tag():
db = change_database(database1)
cur = db.cursor()
sql = "select tag from tag"
try:
cur.execute(sql)
results = cur.fetchall()
total_tag_dict = {}
for num,i in enumerate(results):
total_tag_dict[num+1] = i[0]
return total_tag_dict
except Exception as e:
raise e
def get_old_tag_span(message_id):
db = change_database(database1)
tag_data_dict = get_total_tag()
message_data = get_cv_message(message_id)[1]
res1_tag_span_str_dict = {}
res2_tag_list = []
cur = db.cursor()
sql = "select resume_id, tag_id, details from relation where id=%d"
try:
cur.execute(sql % message_id)
results = cur.fetchall()
except Exception as e:
raise e
return {},[]
for i1 in results:
span_list_str = []
span_list = i1[2].split("/")
for num, i2 in enumerate(span_list):
if i2 != "":
span_list[num] = i2.split("-")
else:
span_list.pop(num)
for i3 in span_list:
start = int(i3[0])
finish = int(i3[1])
span_list_str.append(message_data[start:finish])
res2_tag_list.append(tag_data_dict[i1[1]])
res1_tag_span_str_dict[tag_data_dict[i1[1]]] = span_list_str
return res1_tag_span_str_dict,res2_tag_list
def get_new_tag_span(message_id):
db = change_database(database2)
tag_data_dict = get_total_tag()
message_data = get_cv_message(message_id)[1]
res1_tag_span_str_dict = {}
res2_tag_list = []
cur = db.cursor()
sql = "select resume_id, tag_id, details from relation where id=%d"
try:
cur.execute(sql % message_id)
results = cur.fetchall()
except Exception as e:
raise e
return {}, []
for i1 in results:
span_list_str = []
span_list = i1[2].split("/")
for num, i2 in enumerate(span_list):
if i2 != "":
span_list[num] = i2.split("-")
else:
span_list.pop(num)
for i3 in span_list:
start = int(i3[0])
finish = int(i3[1])
span_list_str.append(message_data[start:finish])
res2_tag_list.append(tag_data_dict[i1[1]])
res1_tag_span_str_dict[tag_data_dict[i1[1]]] = span_list_str
return res1_tag_span_str_dict, res2_tag_list
if __name__ == '__main__':
print(get_relation_num(),get_relation_num2())
data_process.py
from data_privode import *
from init import *
accuracy_list = []
recall_list = []
total_lens = get_resume_num()
def sub_process(message_id):
old_data = get_old_tag_span(message_id)
new_data = get_new_tag_span(message_id)
sub_dict = {}
for old_tag_ in list(old_data[0].keys()):
if old_tag_ not in list(new_data[0].keys()):
sub_dict[old_tag_] = old_data[0][old_tag_]
return sub_dict
def add_process(message_id):
old_data = get_old_tag_span(message_id)
new_data = get_new_tag_span(message_id)
add_dict = {}
for new_tag_ in list(new_data[0].keys()):
if new_tag_ not in list(old_data[0].keys()):
add_dict[new_tag_] = new_data[0][new_tag_]
return add_dict
def one_accuracy_recall(message_id):
old_tag_list = get_old_tag_span(message_id)[1]
new_tag_list = get_new_tag_span(message_id)[1]
same_count = 0
if len(old_tag_list) == 0 or len(new_tag_list) == 0:
return 0,0
else:
for i in new_tag_list:
if i in old_tag_list:
same_count += 1
one_accuracy = same_count/len(old_tag_list)
one_recall = same_count/len(new_tag_list)
return one_accuracy,one_recall
def all_accuracy_recall():
sum_accuracy_list = []
sum_recall_list = []
for i in range(total_lens):
accuracy,recall = one_accuracy_recall(i+1)
sum_accuracy_list.append(accuracy)
sum_recall_list.append(recall)
return sum(sum_accuracy_list)/len(sum_accuracy_list),sum(sum_recall_list)/len(sum_recall_list)
if __name__ == '__main__':
print(one_accuracy_recall(1))