import numpy
import random
def get_vac_list(data):
    """Build the vocabulary from a collection of tokenized documents.

    Args:
        data: iterable of documents, each an iterable of tokens.

    Returns:
        A sorted list of the distinct tokens longer than two characters.
        Sorting makes the ordering (and therefore the feature columns
        derived from it) reproducible; a bare ``list(set(...))`` yields an
        arbitrary order.
    """
    vocabulary = {token for doc in data for token in doc if len(token) > 2}
    return sorted(vocabulary)
def get_word_vec(sentence, vac_list):
    """Return the bag-of-words count vector of ``sentence`` over ``vac_list``.

    Args:
        sentence: iterable of (hashable) tokens.
        vac_list: list of vocabulary tokens; position ``i`` of the result
            counts occurrences of ``vac_list[i]``.

    Returns:
        A list of ints with ``len(vac_list)`` entries. Tokens that are not
        in the vocabulary are ignored.
    """
    # Build the token -> column lookup once instead of scanning the list
    # twice per token. setdefault keeps the first index for a repeated
    # vocabulary entry, matching list.index().
    position = {}
    for idx, word in enumerate(vac_list):
        position.setdefault(word, idx)

    word_vec = [0] * len(vac_list)
    for token in sentence:
        idx = position.get(token)
        if idx is not None:
            word_vec[idx] += 1
    return word_vec
def train(word_vec_list, class_list):
    """Estimate naive-Bayes parameters from labelled count vectors.

    Args:
        word_vec_list: non-empty sequence of equal-length count vectors.
        class_list: one label per vector; a label equal to 0 selects
            class 0, any other label is treated as class 1.

    Returns:
        ``(p0class, p1class, pham)`` where ``p0class`` / ``p1class`` are
        numpy arrays of smoothed log word probabilities for class 0 and
        class 1, and ``pham`` is ``sum(class_list) / len(class_list)``
        (the class-1 prior when labels are 0/1).

    Raises:
        ValueError: if ``word_vec_list`` is empty or its length differs
            from that of ``class_list``.
    """
    n = len(word_vec_list)
    if n == 0:
        raise ValueError("word_vec_list must contain at least one vector")
    if len(class_list) != n:
        raise ValueError(
            "word_vec_list and class_list must have the same length "
            "(got %d and %d)" % (n, len(class_list))
        )

    counts = numpy.asarray(word_vec_list, dtype=float)
    is_class0 = numpy.asarray(class_list) == 0

    class0_counts = counts[is_class0]
    class1_counts = counts[~is_class0]

    # Smoothing: every word count starts at 1 and each class total starts
    # at 2.0, so an unseen word never produces log(0).
    p0vec = 1.0 + class0_counts.sum(axis=0)
    p1vec = 1.0 + class1_counts.sum(axis=0)
    p0sum = 2.0 + class0_counts.sum()
    p1sum = 2.0 + class1_counts.sum()

    p0class = numpy.log(p0vec / p0sum)
    p1class = numpy.log(p1vec / p1sum)
    pham = sum(class_list) / float(n)
    return p0class, p1class, pham
def classify(word_vec, p0class, p1class, pham):
    """Pick the more likely class for a count vector.

    Args:
        word_vec: count vector (list or numpy array).
        p0class: per-word log probabilities for class 0.
        p1class: per-word log probabilities for class 1.
        pham: prior probability of class 1.

    Returns:
        0 if the class-0 log score is strictly greater than the class-1
        log score, otherwise 1 (ties go to class 1).
    """
    # A prior of exactly 0 or 1 makes one log term -inf, which is the
    # intended score; silence only the divide-by-zero warning it triggers.
    with numpy.errstate(divide='ignore'):
        log_prior0 = numpy.log(1.0 - pham)
        log_prior1 = numpy.log(pham)

    # numpy.dot sums the weighted log probabilities in native code and
    # accepts plain sequences as well as arrays.
    p0 = numpy.dot(p0class, word_vec) + log_prior0
    p1 = numpy.dot(p1class, word_vec) + log_prior1
    return 0 if p0 > p1 else 1
def parse(str):
    """Tokenize text into lower-cased words longer than two characters.

    Args:
        str: the text to tokenize. (The name shadows the builtin; it is
            kept so existing keyword callers continue to work.)

    Returns:
        A list of lower-cased tokens, in order of appearance.
    """
    import re

    # Split on runs of one or more non-word characters. The pattern must
    # not be able to match the empty string: since Python 3.7 re.split
    # also splits on empty matches, so r'\W*' would cut the text into
    # single characters and the length filter would discard all of them.
    pieces = re.split(r'\W+', str)
    return [piece.lower() for piece in pieces if len(piece) > 2]
def test():
    """Run one randomized hold-out evaluation of the classifier.

    Reads ``email/ham/1.txt`` .. ``25.txt`` (labelled 1) and
    ``email/spam/1.txt`` .. ``25.txt`` (labelled 0), holds out 10 randomly
    chosen documents, trains on the remaining ones and classifies the
    held-out documents.

    Returns:
        The number of held-out documents that were classified correctly.
    """
    data = []
    class_list = []
    for i in range(1, 26):
        # 'with' closes each file promptly instead of leaking the handle.
        with open('email/ham/%d.txt' % i) as ham_file:
            data.append(parse(ham_file.read()))
        class_list.append(1)
        with open('email/spam/%d.txt' % i) as spam_file:
            data.append(parse(spam_file.read()))
        class_list.append(0)

    train_index_list = list(range(len(data)))
    test_index_list = []
    for _ in range(10):
        # randrange never returns the upper bound, whereas
        # int(random.uniform(0, n)) is documented as possibly reaching n.
        x = random.randrange(len(train_index_list))
        test_index_list.append(train_index_list.pop(x))

    train_data = [data[i] for i in train_index_list]
    train_class_list = [class_list[i] for i in train_index_list]

    # The vocabulary is built from the training documents only.
    vac_list = get_vac_list(train_data)
    word_vec_list = [get_word_vec(doc, vac_list) for doc in train_data]
    p0class, p1class, pham = train(word_vec_list, train_class_list)

    return sum(
        1
        for i in test_index_list
        if classify(get_word_vec(data[i], vac_list), p0class, p1class, pham)
        == class_list[i]
    )
if __name__ == '__main__':
    # Repeat the randomized hold-out evaluation 1000 times and print the
    # total number of correct predictions across all runs.
    total_correct = sum(test() for _ in range(1000))
    print(total_correct)