def calculate_B(A):
    """Sum, for every word of every class, its values in all other classes.

    Args:
        A: Mapping ``class name -> {word: number}``.

    Returns:
        A new mapping with the same classes and the same words per class as
        ``A``.  ``B[cls][word]`` is the sum of ``A[other][word]`` over every
        class ``other != cls`` that contains ``word``; it is ``0`` when no
        other class contains the word.  ``A`` itself is not modified.
    """
    B = {}
    for key, words in A.items():
        B[key] = {
            word: sum(
                other_words[word]
                for other, other_words in A.items()
                if other != key and word in other_words
            )
            for word in words
        }
    return B
# A is one element of the CHI computation; B is another element of it.
# count stores the number for each class.
# N is the overall total number.
# word_set is a dictionary of all the words.
def feature_select_use_new_CHI(A, B, count, N, word_set, top_k=200,
                               classes=None):
    """Keep the ``top_k`` best-scoring words of each class.

    For each word of a class, with ``a = A[cls][word]``,
    ``b = B[cls][word]``, ``n`` the class's entry in ``count`` and
    ``M = N - n``, the score is::

        log10(N / (a + b))
            * (a * (M - b) - (n - a) * b) ** 2
            / ((a + b) * (N - a - b))

    Args:
        A: Mapping ``class name -> {word: number}``; its words are the
            candidates that get scored.
        B: Mapping with the same classes and words as ``A``.
        count: Sequence indexed in the same order as ``classes``.
        N: Total used in the formula above.
        word_set: Mapping ``class name -> {word: value}``; the values of the
            selected words are copied from here into the result.
        top_k: How many of the highest-scoring words to keep per class.
        classes: Class names to process, in the order matching ``count``.
            Defaults to the ten built-in category names.

    Returns:
        Mapping ``class name -> {word: word_set[class][word]}`` holding the
        selected words.  A selected word that is missing from
        ``word_set[class]`` is reported with ``print("ERROR")`` and left out.
    """
    if classes is None:
        classes = ('car', 'finance', 'science', 'health', 'sports',
                   'education', 'culture', 'military', 'joy', 'fashion')

    word_features = {}
    for num, name in enumerate(classes):
        class_size = count[num]
        M = N - class_size

        CHI = {}
        for word, a in A[name].items():
            b = B[name][word]
            denominator = (a + b) * (N - a - b)
            if denominator == 0:
                # a + b is 0 or equals N: the formula would divide by zero.
                # Score the word 0 so it ranks with the least useful ones
                # instead of aborting the whole selection.
                CHI[word] = 0.0
                continue
            numerator = math.pow(a * (M - b) - (class_size - a) * b, 2)
            CHI[word] = math.log10(N / (a + b)) * (numerator / denominator)

        # sorted() is stable, so equally scored words keep the order of A.
        ranked = sorted(CHI.items(), key=lambda item: item[1], reverse=True)

        selected = {}
        for word, _score in ranked[:top_k]:
            if word in word_set[name]:
                selected[word] = word_set[name][word]
            else:
                print("ERROR")
        word_features[name] = selected

    print("The value of CHI has been counted.")
    return word_features
def count_tf_idf(word_features, count, classes=None):
    """Turn the per-class feature values into TF-IDF based weights.

    For each word of each class::

        tf     = word_features[cls][word] / count[index of cls]
        idf    = log10(number of classes / number of classes with the word)
        weight = tf * idf * 1000000 + 1

    Both numbers in the idf ratio come from the same ``classes`` sequence,
    so the ratio is never below 1 and idf is never negative.

    Args:
        word_features: Mapping ``class name -> {word: number}``; it must
            have an entry for every name in ``classes``.
        count: Sequence indexed in the same order as ``classes``; used as
            the tf denominator.
        classes: Class names to process, in the order matching ``count``.
            Defaults to the ten built-in category names.

    Returns:
        Mapping ``class name -> {word: weight}``.
    """
    if classes is None:
        classes = ('car', 'finance', 'science', 'health', 'sports',
                   'education', 'culture', 'military', 'joy', 'fashion')
    n_classes = len(classes)

    out = {}
    for num, class_name in enumerate(classes):
        weights = {}
        for word, value in word_features[class_name].items():
            # The word belongs to class_name itself, so this is at least 1.
            class_hits = sum(
                1 for other in classes if word in word_features[other])
            tf = value / count[num]
            idf = math.log10(n_classes / class_hits)
            weights[word] = tf * idf * 1000000 + 1
        out[class_name] = weights

    print("The value of tf-idf has been counted.")
    return out
def naiveBayes():
    """Run the training pipeline and return the per-class word weights.

    The work is done in stages, each completed for all classes before the
    next one starts:

    1. ``load_train_file`` for every class.
    2. ``count_word_train`` for every class, printing its result + " Done".
    3. ``process_train_word`` for every class, which yields three values
       per class (used below as a dict, a list and a number).
    4. ``bulid_dict_for_one_class`` for every class, then ``calculate_B``,
       ``feature_select_use_new_CHI`` and ``count_tf_idf``.

    Returns:
        Whatever ``count_tf_idf`` returns for the selected features.
    """
    string_of_class = ['car',
                       'finance',
                       'science',
                       'health',
                       'sports',
                       'education',
                       'culture',
                       'military',
                       'joy',
                       'fashion']

    # Stage 1: load everything before any processing starts.
    train_files = [load_train_file(name) for name in string_of_class]

    # Stage 2: per-class word counting, one progress line per class.
    for train_file, name in zip(train_files, string_of_class):
        print(count_word_train(train_file, name) + " Done")

    # Stage 3: collect the three per-class results in class order.
    class_dicts = []
    class_lists = []
    count = []
    for train_file, name in zip(train_files, string_of_class):
        class_dict, class_list, class_number = process_train_word(
            train_file, name)
        class_dicts.append(class_dict)
        class_lists.append(class_list)
        count.append(class_number)

    # Left-to-right "+" chain over the per-class lists.
    all_list = class_lists[0]
    for class_list in class_lists[1:]:
        all_list = all_list + class_list
    all_dict = Change_list_to_dict(all_list)

    # For count CHI
    N = len(all_dict)
    word_set = dict(zip(string_of_class, class_dicts))

    # Stage 4: every class is built from its own list plus all ten dicts.
    all_class_dict = {}
    for name, class_list in zip(string_of_class, class_lists):
        all_class_dict[name] = bulid_dict_for_one_class(
            class_list, *class_dicts)

    B = calculate_B(all_class_dict)
    word_features = feature_select_use_new_CHI(
        all_class_dict, B, count, N, word_set)
    return count_tf_idf(word_features, count)
def test(model):
    """Classify the test files of every class and return the mean accuracy.

    For each class the path from ``load_test_file`` is opened and every
    line is treated as one document, segmented with ``jieba.cut``.  A
    document's score for a class starts at 1 and is multiplied by
    ``model[class][word]`` for every word of the document found in that
    class's model.  The class with the highest score is the prediction;
    on a tie the class listed first wins.

    Args:
        model: Mapping ``class name -> {word: weight}``.

    Returns:
        The average of the per-class accuracies, where a class's accuracy
        is the share of its documents predicted as that class (``0.0`` for
        an empty test file).
    """
    string_of_class = ['car',
                       'finance',
                       'science',
                       'health',
                       'sports',
                       'education',
                       'culture',
                       'military',
                       'joy',
                       'fashion']

    # Resolve all test files first, before any of them is opened.
    test_paths = [load_test_file(name) for name in string_of_class]

    accuracy = []
    for expected, path in zip(string_of_class, test_paths):
        # The context manager closes the file even if segmentation fails.
        with open(path, "r") as f:
            documents = [list(jieba.cut(line, cut_all=False)) for line in f]

        correct = 0
        for words in documents:
            scores = {}
            for name in string_of_class:
                weights = model[name]
                score = 1
                # NOTE(review): if the weights are floats, a long document
                # can push this product to inf and turn the comparison into
                # a tie; summing logarithms would avoid that -- confirm the
                # weights are always > 0 before switching.
                for word in words:
                    if word in weights:
                        score *= weights[word]
                scores[name] = score
            # max() returns the first maximal item, i.e. the same winner as
            # taking the head of a stable descending sort.
            predicted = max(string_of_class, key=scores.__getitem__)
            if predicted == expected:
                correct += 1

        # Divide by the number of documents actually read rather than
        # assuming every test file has exactly 100 lines.
        class_accuracy = correct / len(documents) if documents else 0.0
        accuracy.append(class_accuracy)
        print("%shas been classified, and the accuracy is%.2f" %
              (expected, class_accuracy))

    return sum(accuracy) / len(string_of_class)