def read_data(file):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Leading
        and trailing whitespace (including the line terminator) is removed
        from every entry; blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file object directly rather than materialising the raw
        # lines with readlines() and copying them into a second list.
        return [line.strip() for line in f]
# Absolute locations of the stop-word list (file1) and of the pos.txt /
# neg.txt corpora (file2, file3).
file1 = r'C:\Users\Administrator\PycharmProjects\untitled\data\stop_words.txt'
file2 = r'C:\Users\Administrator\PycharmProjects\untitled\data\pos.txt'
file3 = r'C:\Users\Administrator\PycharmProjects\untitled\data\neg.txt'

# Load the three files in order (stop words, pos, neg) through read_data.
stop_word, pos, neg = [read_data(path) for path in (file1, file2, file3)]
# Number of leading lines of each corpus used for training; the lines after
# this index are held out for testing.
a = 5300
pos_train = pos[:a]
neg_train = neg[:a]
pos_test = pos[a:]
neg_test = neg[a:]

pos_all_words = []
neg_all_words = []
print('数据处理')

# Every token is checked against the stop words, so use a set for O(1)
# membership instead of scanning the stop_word list each time.
stop_word_set = set(stop_word)

# Collect every non-stop-word token of the training sentences, keeping
# duplicates so the lists can later be used for frequency counts.
for sentence in pos_train:
    pos_all_words.extend(
        word for word in sentence.split() if word not in stop_word_set
    )
for sentence in neg_train:
    neg_all_words.extend(
        word for word in sentence.split() if word not in stop_word_set
    )

# Unique tokens seen in either training set (list order is arbitrary).
vocab = list(set(pos_all_words) | set(neg_all_words))
# Number of occurrences of each token in pos_all_words / neg_all_words.
# dict.get avoids the separate membership test and second lookup.
pos_count = {}
for word in pos_all_words:
    pos_count[word] = pos_count.get(word, 0) + 1

neg_count = {}
for word in neg_all_words:
    neg_count[word] = neg_count.get(word, 0) + 1
pro_pos = {}
pro_neg = {}
print('开始训练')

# Add-one smoothed estimate for every vocabulary word:
#     P(word | class) = (count in class + 1) / (len(vocab) + tokens in class)
# The denominators do not depend on the word, so compute them once.
pos_denominator = len(vocab) + len(pos_all_words)
neg_denominator = len(vocab) + len(neg_all_words)

for word in vocab:
    # A word missing from a class's count table occurred zero times in that
    # class; .get() replaces a linear scan of the full token list per word.
    pro_pos[word] = (pos_count.get(word, 0) + 1) / pos_denominator
    pro_neg[word] = (neg_count.get(word, 0) + 1) / neg_denominator
import math

print('开始测试')

# Set copy of the stop words for O(1) membership tests while scoring.
stop_word_set = set(stop_word)


def _classify(sentence):
    """Return the predicted label of `sentence`: 1 (positive) or 0 (negative).

    Only tokens that have an entry in the probability tables and are not
    stop words contribute. Scores are accumulated as sums of
    log-probabilities rather than products of probabilities, so long
    sentences cannot underflow to 0.0. A tie is resolved as label 0.
    """
    log_p0 = log_p1 = 0.0  # fresh scores for every sentence
    for word in sentence.split():
        # pro_pos and pro_neg are filled for the same vocabulary words, so a
        # dict lookup stands in for scanning the vocab list.
        if word in pro_pos and word not in stop_word_set:
            log_p0 += math.log(pro_neg[word])
            log_p1 += math.log(pro_pos[word])
    return 0 if log_p0 >= log_p1 else 1


# Held-out sentences with their true labels: 1 for pos_test, 0 for neg_test.
test = pos_test + neg_test
label = [1] * len(pos_test) + [0] * len(neg_test)
predict = [_classify(sentence) for sentence in test]

count = sum(1 for expected, guessed in zip(label, predict) if expected == guessed)
# An empty held-out set would otherwise raise ZeroDivisionError.
acc = count / len(label) if label else 0.0
print('准确率为:{}'.format(acc))

# Interactive mode: classify sentences typed by the user until input ends.
while True:
    try:
        s = input('请输入一句话:')
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the loop instead of a traceback.
        break
    if _classify(s) == 0:
        print('标签为0,是负面的')
    else:
        print('标签为1,是正面的')
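# Page residue, not part of the program: "统计文章中的匹配率" (match-rate statistics for an article)
# Page residue, not part of the program: "最新推荐文章于 2024-07-08 00:01:13 发布" (latest recommended article published 2024-07-08 00:01:13)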