# 1. data_handler.py
from load_data import From_file
import pandas as pd
def Data_disperse(data, disperse_col, money_split_rule):
    """Count how many rows of ``data`` fall into each amount bucket.

    Args:
        data: DataFrame that holds the column to bucket.
        disperse_col: Name of the numeric column to bucket.
        money_split_rule: Sequence of ``[lower, separator, upper]`` triples.
            A row belongs to a bucket when ``lower < value <= upper``.

    Returns:
        A ``(money_dist_dict, money_split_map)`` pair. Both dicts are keyed
        by the bucket label, which is the three rule parts converted with
        ``str`` and concatenated (e.g. ``"0-5000.0"``). The first dict maps
        a label to the number of rows in that bucket, the second maps it to
        the bucket's 1-based position in ``money_split_rule``.
    """
    bucket_counts = {}
    bucket_ordinals = {}
    for ordinal, rule in enumerate(money_split_rule, start=1):
        lower, upper = rule[0], rule[2]
        # Half-open interval: the lower bound is excluded, the upper included.
        in_bucket = (data[disperse_col] > lower) & (data[disperse_col] <= upper)
        label = "".join(map(str, rule))
        bucket_counts[label] = len(data[in_bucket])
        bucket_ordinals[label] = ordinal
    return bucket_counts, bucket_ordinals
def Locate_money(money, money_dist_dict, money_split_map):
    """Look up the bucket id and bucket size for a single amount.

    Args:
        money: Amount to classify.
        money_dist_dict: Maps a ``"lower-upper"`` label to the number of rows
            in that bucket.
        money_split_map: Maps a ``"lower-upper"`` label to the bucket id.

    Returns:
        ``(money_map, map_num)``: the id and the row count of the bucket
        whose range satisfies ``lower < money <= upper``. When no bucket
        matches, the defaults ``1`` and ``0`` are returned. When several
        labels match, the last one in iteration order wins.
    """

    def _covers(label):
        # Labels look like "5000.0-10000.0"; the two parts are the bounds.
        bounds = label.split('-')
        return float(bounds[0]) < money <= float(bounds[1])

    money_map = 1
    map_num = 0
    for label, bucket_id in money_split_map.items():
        if _covers(label):
            money_map = bucket_id
    for label, bucket_size in money_dist_dict.items():
        if _covers(label):
            map_num = bucket_size
    return money_map, map_num
def Sort_data_col(data, col, loc, pre):
    """Move column ``col`` so that it sits directly next to column ``loc``.

    The frame is modified in place and also returned, so callers may use
    either the return value or the object they passed in.

    Args:
        data: DataFrame whose columns are reordered.
        col: Name of the column to move.
        loc: Name of the anchor column.
        pre: If truthy, ``col`` is placed immediately before ``loc``;
            otherwise immediately after it.

    Returns:
        The same ``data`` object with ``col`` repositioned.

    Raises:
        KeyError: If ``col`` is not a column of ``data``.
        ValueError: If ``loc`` is not a column of ``data``.
    """
    moved = data[col]
    if col == loc:
        # A column cannot be placed relative to itself; leave the order as is.
        return data
    # Resolve the anchor position in the column list as it will look once
    # ``col`` is removed. Using the position from the full list is off by
    # one whenever ``col`` currently sits to the left of ``loc``. Doing the
    # lookup before dropping also keeps ``data`` intact if ``loc`` is missing.
    remaining = [name for name in data.columns.tolist() if name != col]
    target = remaining.index(loc)
    if not pre:
        target += 1
    data.drop(labels=[col], axis=1, inplace=True)
    data.insert(target, col, moved)
    return data
def Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map):
    """Attach the bucket id and bucket size of every row's amount to ``data``.

    Args:
        data: DataFrame containing the ``disperse_rule`` column, an
            ``'index'`` column used as the join key, and a ``'内容'`` column
            used as the anchor for the new columns.
        disperse_rule: Name of the amount column to classify.
        money_dist_dict: Bucket label -> row count, as built by
            ``Data_disperse``.
        money_split_map: Bucket label -> bucket id, as built by
            ``Data_disperse``.

    Returns:
        A new DataFrame with the columns ``'金额映射'`` (bucket id) and
        ``'映射数量'`` (bucket size) placed before ``'内容'``.
    """
    located = [
        list(Locate_money(float(amount), money_dist_dict, money_split_map))
        for amount in data[disperse_rule]
    ]
    bucket_frame = pd.DataFrame(located, columns=['金额映射', '映射数量'])
    # The values of data['index'] are matched against the default positional
    # index of ``bucket_frame``.
    merged = data.join(bucket_frame, on='index')
    for new_col in ('金额映射', '映射数量'):
        merged = Sort_data_col(merged, new_col, '内容', True)
    return merged
def Date_handler(data, start_col, end_col):
    """Placeholder for date processing; returns ``data`` unchanged.

    ``start_col`` and ``end_col`` are accepted but not used yet.
    """
    return data
def Data_pre_handle(data):
    """Clean the raw contract table and add the derived mapping columns.

    Steps, in order:
      1. Drop rows that are entirely empty and exact duplicate rows.
      2. Keep only the rows whose ``'分管领导'`` column equals ``'李博洋'``.
      3. Add a 0-based ``'index'`` column placed before ``'我方主体'``.
      4. Bucket the ``'金额'`` column and add ``'金额映射'`` / ``'映射数量'``.
      5. Run the (currently no-op) date handler on the contract dates.
      6. Add ``'合作方映射'`` (1-based id per distinct partner, in order of
         first appearance) after ``'合作方'``.
      7. Add ``'结果映射'`` (1 when ``'结果'`` is ``'分管领导'``, else 0)
         after ``'结果'``.

    Args:
        data: Raw DataFrame containing the columns referenced above.

    Returns:
        The processed DataFrame.
    """
    # Remove rows that are completely empty, then duplicated rows.
    data = data.dropna(axis=0, how='all').drop_duplicates()

    # Filter on the supervising leader. The explicit copy detaches the
    # result from the source frame so the column assignment and the in-place
    # column moves below do not act on a slice (pandas chained-assignment
    # warning).
    select_col = '分管领导'
    col_value = '李博洋'
    data = data[data[select_col] == col_value].copy()
    data['index'] = list(range(len(data)))
    data = Sort_data_col(data, 'index', '我方主体', True)

    # Discretize the amount column into fixed buckets.
    disperse_rule = '金额'
    money_split_rule = [
        [0, '-', 5000.0],
        [5000.0, '-', 10000.0],
        [10000.0, '-', 15000.0],
        [15000.0, '-', 20000.0],
        [20000.0, '-', 25000.0],
        [25000.0, '-', 35000.0],
        [35000.0, '-', 50000.0],
        [50000.0, '-', 80000.0],
        [80000.0, '-', 120000.0],
        [120000, '-', 999999999999],
    ]
    money_dist_dict, money_split_map = Data_disperse(data, disperse_rule, money_split_rule)
    data = Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map)

    # Date processing.
    start_col = '合同开始日期'
    end_col = '合同结束日期'
    data = Date_handler(data, start_col, end_col)

    # Encode each distinct partner as a 1-based integer id.
    partner_map_dict = {
        partner: partner_id
        for partner_id, partner in enumerate(data['合作方'].drop_duplicates(), start=1)
    }
    data['合作方映射'] = [partner_map_dict[partner] for partner in data['合作方']]
    data = Sort_data_col(data, '合作方映射', '合作方', False)

    # Encode the result as a binary label.
    data['结果映射'] = [1 if result == '分管领导' else 0 for result in data['结果']]
    data = Sort_data_col(data, '结果映射', '结果', False)
    return data
if __name__ == '__main__':
    # Read the source sheet via load_data, keeping only the listed columns.
    pick_list = [
        '我方主体', '合同号', '合同开始日期', '合同结束日期', '金额', '内容',
        '合作方', '经办部门', '经办人员', '经办人上级', '分管领导', '结果',
    ]
    raw_data = From_file('data/data.xlsx', pick_list)
    # Pre-process and store the result next to the input file.
    handled_data = Data_pre_handle(raw_data)
    handled_data.to_csv('data/handled_data.csv', encoding='gbk', index=False)
# 2. get_properties.py
from data_handler import Data_pre_handle
from load_data import From_file
import pandas as pd
def Get_properties(data, property_dict):
    """Extract and rename the model feature columns, and save them as CSV.

    Args:
        data: DataFrame containing every column named by a key of
            ``property_dict``.
        property_dict: Maps a source column name in ``data`` to the column
            name used in the output.

    Returns:
        ``(properties_data_file, properties_data)``: the path of the CSV that
        was written (``'data/properties.csv'``, GBK-encoded, without the
        index) and the DataFrame holding the renamed columns.
    """
    properties_data = pd.DataFrame(
        {target_col: data[source_col] for source_col, target_col in property_dict.items()}
    )
    properties_data_file = 'data/properties.csv'
    properties_data.to_csv(properties_data_file, encoding='gbk', index=False)
    return properties_data_file, properties_data
if __name__ == '__main__':
    # Load and pre-process the contract sheet.
    pick_list = [
        '我方主体', '合同号', '合同开始日期', '合同结束日期', '金额', '内容',
        '合作方', '经办部门', '经办人员', '经办人上级', '分管领导', '结果',
    ]
    handled_data = Data_pre_handle(From_file('data/data.xlsx', pick_list))
    # Export the three mapped columns under their feature names.
    property_dict = {'金额映射':'money_map', '合作方映射':'partner_map', '结果映射':'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)
    print(properties_data_file)
# 3. solver.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from data_handler import Data_pre_handle
from load_data import From_file
from get_properties import Get_properties
from domain import Domain
from data_handler import Data_disperse, Data_disp_construct
def Training(data):
    """Fit a Gaussian naive Bayes classifier on ``data``.

    Args:
        data: Table handed to ``Domain`` together with the label column name
            ``'result_map'``. ``Domain`` is defined in another module; it is
            expected to expose the features as ``.x`` and the labels as ``.y``.

    Returns:
        The fitted ``GaussianNB`` estimator.
    """
    model = Domain(data, 'result_map')
    # 20% of the samples are held out; only the training part is used here.
    # TODO: evaluate the classifier on the held-out part (e.g. with
    # metrics.classification_report / metrics.confusion_matrix).
    x_train, _x_test, y_train, _y_test = train_test_split(model.x, model.y, test_size=0.2)
    return GaussianNB().fit(x_train, y_train)
def Handle_input(data):
    """Bucket the ``'合同金额'`` column of ``data`` and attach the bucket columns.

    Args:
        data: DataFrame with a ``'合同金额'`` column plus the columns that
            ``Data_disp_construct`` relies on.
            NOTE(review): data_handler buckets the column ``'金额'`` while this
            function uses ``'合同金额'`` — confirm which name the input carries.

    Returns:
        The DataFrame produced by ``Data_disp_construct``. Previously the
        result was computed and then dropped, so the function returned None.
    """
    disperse_rule = '合同金额'
    money_split_rule = [
        [0, '-', 5000.0],
        [5000.0, '-', 10000.0],
        [10000.0, '-', 15000.0],
        [15000.0, '-', 20000.0],
        [20000.0, '-', 25000.0],
        [25000.0, '-', 35000.0],
        [35000.0, '-', 50000.0],
        [50000.0, '-', 80000.0],
        [80000.0, '-', 120000.0],
        [120000, '-', 999999999999],
    ]
    money_dist_dict, money_split_map = Data_disperse(data, disperse_rule, money_split_rule)
    return Data_disp_construct(data, disperse_rule, money_dist_dict, money_split_map)
if __name__ == '__main__':
    # Load, pre-process and reduce the sheet to the model's feature columns.
    pick_list = [
        '我方主体', '合同号', '合同开始日期', '合同结束日期', '金额', '内容',
        '合作方', '经办部门', '经办人员', '经办人上级', '分管领导', '结果',
    ]
    handled_data = Data_pre_handle(From_file('data/data.xlsx', pick_list))
    property_dict = {'金额映射': 'money_map', '合作方映射': 'partner_map', '结果映射': 'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)
    print(properties_data)

    clf = Training(properties_data)

    # Hand-written samples to classify; presumably [money_map, partner_map]
    # pairs — confirm against Domain.
    # Further samples kept for reference: [9,40], [8,30], [7,40], [7,33], [10,33]
    input_data = [[8,28], [4,40], [3,31], [2,40], [10,41]]
    # input_handled_data = Handle_input(input_data)
    solver_result = clf.predict(input_data)
    print(solver_result)
'''
model = Domain(properties_data, 'result_map')
#print (model.x, model.y)
x_train, x_test, y_train, y_test = train_test_split(model.x, model.y, test_size=0.2)
clf = GaussianNB().fit(x_train, y_train)
doc_class_predicted = clf.predict(x_test)
expected = y_test
print(y_test) # 输出实际结果
print(doc_class_predicted) # 输出测试结果
print(metrics.classification_report(expected, doc_class_predicted)) # 输出结果,精确度、召回率、f-1分数
print(metrics.confusion_matrix(expected, doc_class_predicted)) # 混淆矩阵
'''
# 4. score.py
import pandas as pd
from data_handler import Data_pre_handle
from load_data import From_file
from get_properties import Get_properties
from solver import Solver
from utils.send_mail import Mail
from utils.all_data_statis import All_data_statis
import time
def Score(data, input_data, solver_result, constraints):
    """Check whether the solver's decisions keep the coverage constraints.

    The predicted rows are appended to the historical data, the combined
    table is written to ``'data/all_data.csv'`` (GBK, without the index) and
    its coverage figures are compared with ``constraints``.

    Args:
        data: Historical, already pre-processed DataFrame.
        input_data: DataFrame with the rows that were classified. Its
            ``'index'`` column is set (or overwritten) in place with
            consecutive values starting at ``len(data)``.
        solver_result: One prediction per input row; ``0`` is recorded as
            ``'财务部'`` / 0, anything else as ``'分管领导'`` / 1.
        constraints: Nested dict providing
            ``['hard_constraint']['money_coverage']``,
            ``['soft_constraint']['amount_coverage']`` and
            ``['soft_constraint']['pardon_range']``.

    Returns:
        ``False`` when the money coverage is below the hard constraint, or
        when the amount coverage lies outside
        ``amount_coverage ± pardon_range``; ``True`` otherwise.
    """
    first_new_index = len(data)
    # One index value per input row; the previous two-element literal only
    # worked when exactly two rows were scored.
    input_data['index'] = list(range(first_new_index, first_new_index + len(input_data)))

    result_data_dict = {
        'index': [],
        '结果': [],
        '结果映射': [],
    }
    for offset, result in enumerate(solver_result):
        to_finance = result == 0
        result_data_dict['index'].append(first_new_index + offset)
        result_data_dict['结果'].append('财务部' if to_finance else '分管领导')
        result_data_dict['结果映射'].append(0 if to_finance else 1)
    result_data = pd.DataFrame(result_data_dict)

    # Re-attach the decisions to their input rows and append them to history.
    recovery_data = pd.merge(input_data, result_data, on='index')
    all_data = pd.concat([data, recovery_data])
    all_data.to_csv('data/all_data.csv', encoding='gbk', index=False)

    money_coverage, amount_coverage = All_data_statis(all_data)
    print(money_coverage, amount_coverage)

    money_constraint = constraints.get('hard_constraint').get('money_coverage')
    amount_constraint = constraints.get('soft_constraint').get('amount_coverage')
    amount_pardon_range = constraints.get('soft_constraint').get('pardon_range')

    if money_coverage < money_constraint:
        return False
    lower_bound = amount_constraint - amount_pardon_range
    upper_bound = amount_constraint + amount_pardon_range
    return not (amount_coverage < lower_bound or amount_coverage > upper_bound)
if __name__ == '__main__':
    filename = 'data/data.xlsx'
    pick_list = [
        '我方主体', '合同号', '合同开始日期', '合同结束日期', '金额', '内容',
        '合作方', '经办部门', '经办人员', '经办人上级', '分管领导', '结果',
    ]
    data = From_file(filename, pick_list)
    handled_data = Data_pre_handle(data)

    # Coverage of the historical data alone, for comparison.
    handled_money_coverage, handled_amount_coverage = All_data_statis(handled_data)
    print(handled_money_coverage, handled_amount_coverage)

    property_dict = {'金额映射': 'money_map', '合作方映射': 'partner_map', '结果映射': 'result_map'}
    properties_data_file, properties_data = Get_properties(handled_data, property_dict)

    # Use the first two handled rows, without their known result, as the
    # rows to classify. This section used to be wrapped in a string literal,
    # which left ``input_data`` and ``x_input`` undefined for the code below.
    input_data = handled_data.iloc[0:2, ].copy()
    input_data.drop(labels=['结果', '结果映射'], axis=1, inplace=True)
    x_input = [
        [row['金额映射'], row['合作方映射']]
        for _, row in input_data.iterrows()
    ]

    # NOTE(review): ``Solver`` is imported from solver, but the solver.py
    # shown with this script defines only Training and Handle_input —
    # confirm that the name exists.
    solver_result = Solver(properties_data, x_input)
    constraints = {
        'hard_constraint': {
            'money_coverage': 0.70
        },
        'soft_constraint': {
            'amount_coverage': 0.30,
            'pardon_range': 0.05
        }
    }
    standarded = Score(handled_data, input_data, solver_result, constraints)
    if standarded:
        # Pair every input row's contract number with its own decision. The
        # number is read from the row itself rather than from ``data``.
        mail_data = []
        for position in range(len(input_data)):
            left_data = input_data.iloc[position, ]
            right_data = solver_result[position]
            print(right_data)
            mail_data.append([left_data['合同号'], right_data])
        print(mail_data)

        # One "contract<TAB>approver" line per decision. ``format`` also
        # copes with contract numbers that are not strings.
        mail_result = ''.join(
            '{}\t{}\n'.format(contract_no, '财务部' if decision == 0 else '分管领导')
            for contract_no, decision in mail_data
        )
        print(mail_result)
        ret = Mail(mail_result, 'Notify!')
        if ret:
            print('Mail Done!')
        else:
            print('Mail Failed, try again!')
    else:
        print('penalty properties, and try solver again, plz!')