java贝叶斯文本分类算法_用朴素贝叶斯算法做文本分类

最新推荐文章于 2021-02-28 03:44:33 发布

网件NETGEAR

最新推荐文章于 2021-02-28 03:44:33 发布

阅读量158

点赞数

文章标签： java贝叶斯文本分类算法

本文链接：https://blog.csdn.net/weixin_30126739/article/details/114307133

版权

def calculate_B(A):
    """Return, for every class and word, the word's total in all OTHER classes.

    Args:
        A: mapping ``class name -> {word: count}``.

    Returns:
        A new dict with the same class keys as ``A`` and, for each class, the
        same word keys as ``A[class]``. ``B[cls][word]`` is the sum of
        ``A[other][word]`` over every class ``other != cls`` whose dict
        contains ``word``; it is 0 when no other class contains the word.
        ``A`` itself is not modified.
    """
    B = {}
    for cls, words in A.items():
        B[cls] = {
            word: sum(
                counts[word]
                for other, counts in A.items()
                if other != cls and word in counts
            )
            for word in words
        }
    return B

# A and B are the "A" and "B" terms of the CHI (chi-square) statistic

# count stores one number per class (indexed in class order)

# N is the overall total

# word_set is a dictionary holding the words of every class

def feature_select_use_new_CHI(A, B, count, N, word_set, top_k=200):
    """Keep the best-scoring words of every class using a log-weighted CHI score.

    With ``a = A[cls][word]``, ``b = B[cls][word]``, ``n = count[i]`` (``i`` is
    the position of ``cls`` in the class list below) and ``M = N - n``, the
    score of a word is::

        log10(N / (a + b)) * (a * (M - b) - (n - a) * b) ** 2
                           / ((a + b) * (N - a - b))

    Args:
        A: ``class name -> {word: number}``; provides the words to score and
            the ``a`` term. Must contain every class in the class list.
        B: same keys as ``A``; provides the ``b`` term.
        count: sequence with one number per class, in class-list order.
        N: overall total used in the formula above.
        word_set: ``class name -> {word: value}``; the values copied into the
            result for the selected words.
        top_k: how many of the highest-scoring words to keep per class
            (default 200, the value that used to be hard-coded). ``None``
            keeps every word.

    Returns:
        ``class name -> {word: word_set[class][word]}`` for the selected words,
        in descending score order. A selected word that is missing from
        ``word_set[class]`` is skipped and ``"ERROR"`` is printed.
    """
    string_of_class = ['car',
                       'finance',
                       'science',
                       'health',
                       'sports',
                       'education',
                       'culture',
                       'military',
                       'joy',
                       'fashion']

    word_features = {}
    for num, name in enumerate(string_of_class):
        class_count = count[num]
        M = N - class_count

        scores = {}
        for word, a in A[name].items():
            b = B[name][word]
            denominator = (a + b) * (N - a - b)
            if denominator == 0:
                # a + b == 0 or a + b == N: the statistic is undefined here.
                # Give the word a neutral score instead of aborting the whole
                # run with ZeroDivisionError.
                scores[word] = 0.0
                continue
            numerator = math.pow(a * (M - b) - (class_count - a) * b, 2)
            scores[word] = math.log10(N / (a + b)) * (numerator / denominator)

        # sorted() is stable, so words with equal scores keep the order of A.
        ranked = sorted(
            scores.items(), key=lambda item: item[1], reverse=True)[:top_k]

        selected = {}
        class_words = word_set[name]
        for word, _score in ranked:
            if word in class_words:
                selected[word] = class_words[word]
            else:
                print("ERROR")
        word_features[name] = selected

    print("The value of CHI has been counted.")
    return word_features

def count_tf_idf(word_features, count):
    """Turn the selected per-class word values into scaled tf-idf weights.

    For every class in the class list below and every word in
    ``word_features[class]``::

        tf     = word_features[class][word] / count[i]
        idf    = log10(number_of_classes / classes_containing_word)
        weight = tf * idf * 1000000 + 1

    where ``i`` is the position of the class in the class list,
    ``number_of_classes`` is the length of that list (10) and
    ``classes_containing_word`` counts the entries of ``word_features`` whose
    dict contains the word.

    Args:
        word_features: ``class name -> {word: number}``; must contain every
            class in the class list.
        count: sequence with one number per class, in class-list order.

    Returns:
        ``class name -> {word: weight}`` with one entry per class in the list.
    """
    string_of_class = ['car',
                       'finance',
                       'science',
                       'health',
                       'sports',
                       'education',
                       'culture',
                       'military',
                       'joy',
                       'fashion']
    class_total = len(string_of_class)

    out = {}
    for num, class_name in enumerate(string_of_class):
        weights = {}
        for word, value in word_features[class_name].items():
            # Number of classes whose feature dict contains this word; at
            # least 1 because the word comes from word_features[class_name].
            word_file = sum(
                1 for features in word_features.values() if word in features)
            tf = value / count[num]
            idf = math.log10(class_total / word_file)
            # Scaled up and shifted by 1, so a word found in every class
            # (idf == 0) gets the weight 1.
            weights[word] = tf * idf * 1000000 + 1
        out[class_name] = weights

    print("The value of tf-idf has been counted.")
    return out

def naiveBayes():
    """Build the per-class word-weight model from the training files.

    Steps, in the order the helpers are called:

    1. ``load_train_file(name)`` for every class.
    2. ``count_word_train(file, name)`` for every class; its result is
       printed followed by ``" Done"``.
    3. ``process_train_word(file, name)`` for every class, yielding a
       ``(dict, list, number)`` triple per class.
    4. The per-class lists are concatenated with ``+`` and passed to
       ``Change_list_to_dict``.
    5. ``bulid_dict_for_one_class(class_list, d1, ..., d10)`` for every class.
    6. ``calculate_B``, ``feature_select_use_new_CHI`` and ``count_tf_idf``.

    Returns:
        The value returned by ``count_tf_idf``.
    """
    string_of_class = ['car',
                       'finance',
                       'science',
                       'health',
                       'sports',
                       'education',
                       'culture',
                       'military',
                       'joy',
                       'fashion']

    # Three separate passes, so the helpers run in the same order as before:
    # all loads first, then all counts, then all processing.
    train_files = [load_train_file(name) for name in string_of_class]

    for train_file, name in zip(train_files, string_of_class):
        print(count_word_train(train_file, name) + " Done")

    word_dicts = []
    word_lists = []
    count = []
    for train_file, name in zip(train_files, string_of_class):
        class_dict, class_list, class_number = process_train_word(
            train_file, name)
        word_dicts.append(class_dict)
        word_lists.append(class_list)
        count.append(class_number)

    # Concatenate with "+", as before, so no input list is mutated.
    all_list = word_lists[0]
    for class_list in word_lists[1:]:
        all_list = all_list + class_list
    all_dict = Change_list_to_dict(all_list)

    # For count CHI
    # NOTE(review): N is the size of the merged dict, while `count` holds the
    # third value returned by process_train_word. Confirm that both are
    # measured in the same unit, since the CHI formula subtracts one from the
    # other.
    N = len(all_dict)

    word_set = dict(zip(string_of_class, word_dicts))

    all_class_dict = {}
    for name, class_list in zip(string_of_class, word_lists):
        all_class_dict[name] = bulid_dict_for_one_class(
            class_list, *word_dicts)

    B = calculate_B(all_class_dict)
    word_features = feature_select_use_new_CHI(
        all_class_dict, B, count, N, word_set)
    out = count_tf_idf(word_features, count)
    return out

def test(model):
    """Classify the test documents of every class and return the mean accuracy.

    For each class, ``load_test_file(name)`` gives something ``open`` accepts.
    Every line of that file is one document, tokenised with
    ``jieba.cut(line, cut_all=False)``. A document's score for a class starts
    at 1 and is multiplied by ``model[class][word]`` for each token found in
    ``model[class]``. The document is assigned to the highest-scoring class,
    and ties go to the class that comes first in the class list.

    Args:
        model: ``class name -> {word: weight}``, e.g. the value returned by
            ``naiveBayes``.

    Returns:
        The average of the per-class accuracies. Each per-class accuracy is
        also printed.
    """
    string_of_class = ['car',
                       'finance',
                       'science',
                       'health',
                       'sports',
                       'education',
                       'culture',
                       'military',
                       'joy',
                       'fashion']

    # Resolve every test file up front, as before.
    test_files = [load_test_file(name) for name in string_of_class]

    accuracy = []
    for expected, path in zip(string_of_class, test_files):
        # NOTE(review): opened with the platform default encoding, as before.
        # If the corpus is UTF-8, pass encoding="utf-8" explicitly.
        with open(path, "r") as handle:
            documents = [list(jieba.cut(line, cut_all=False))
                         for line in handle]

        correct = 0
        for words in documents:
            scores = dict.fromkeys(string_of_class, 1)
            for name in string_of_class:
                weights = model[name]
                for word in words:
                    if word in weights:
                        # NOTE(review): the weights are floats, so a long
                        # document can overflow this product to inf. Summing
                        # logarithms would avoid that if all weights are > 0.
                        scores[name] *= weights[word]
            # max() returns the first maximal key, which matches taking the
            # head of a stable descending sort.
            predicted = max(scores, key=scores.get)
            if predicted == expected:
                correct += 1

        # Divide by the number of documents actually read rather than a
        # hard-coded 100. An empty file counts as accuracy 0.
        class_accuracy = correct / len(documents) if documents else 0.0
        accuracy.append(class_accuracy)
        print("%shas been classified, and the accuracy is%.2f" %
              (expected, class_accuracy))

    return sum(accuracy) / len(string_of_class)

网件NETGEAR

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java贝叶斯文本分类算法_用朴素贝叶斯算法做文本分类

def calculate_B(A):# d_len = len(dict_all)# l_len = len(list_class)# times = d_len / l_len# for cnt in range(len(list_class)):# if list_class[cnt][0] in dict_all:# word = list_class[cnt][0...
复制链接

扫一扫