统计文章中的匹配率

最新推荐文章于 2024-07-08 00:01:13 发布

Python小老六

最新推荐文章于 2024-07-08 00:01:13 发布

阅读量27

点赞数

文章标签： python 开发语言 数据库

本文链接：https://blog.csdn.net/2301_79958007/article/details/134072980

版权

def read_data(file):
    """Load a UTF-8 text file as a list of whitespace-trimmed lines.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order. Each
        entry has its leading and trailing whitespace (including the
        line terminator) removed; blank lines are kept as empty strings.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

# Input files. Each line of the stop-word file is treated as one stop word;
# each line of the pos/neg files is treated as one sentence whose tokens
# are separated by whitespace.
file1=r'C:\Users\Administrator\PycharmProjects\untitled\data\stop_words.txt'
file2=r'C:\Users\Administrator\PycharmProjects\untitled\data\pos.txt'
file3=r'C:\Users\Administrator\PycharmProjects\untitled\data\neg.txt'
stop_word=read_data(file1)
pos=read_data(file2)
neg=read_data(file3)

# The first `a` sentences of each corpus are used for training, the
# remainder is held out for testing.
a=5300
pos_train=pos[:a]
neg_train=neg[:a]
pos_test=pos[a:]
neg_test=neg[a:]

print('数据处理')
# Stop-word membership is tested once per training token, so use a set
# here instead of scanning the `stop_word` list every time. The list
# itself is left untouched for any later code that uses it.
stop_word_set=set(stop_word)

# All non-stop-word tokens of each training corpus, duplicates included
# (their multiplicity is what the frequency counts are built from).
pos_all_words=[
    word
    for sentence in pos_train
    for word in sentence.split()
    if word not in stop_word_set
]
neg_all_words=[
    word
    for sentence in neg_train
    for word in sentence.split()
    if word not in stop_word_set
]

# Vocabulary: every distinct token kept from either training corpus.
vocab=list(set(pos_all_words+neg_all_words))
# Per-class term frequencies: how often each kept token occurs in the
# positive and in the negative training sentences respectively.
pos_count={}
for token in pos_all_words:
    pos_count[token]=pos_count.get(token,0)+1

neg_count={}
for token in neg_all_words:
    neg_count[token]=neg_count.get(token,0)+1
# Add-one smoothed word likelihoods for each class:
#     P(word | class) = (count(word, class) + 1) / (len(vocab) + tokens in class)
# Every vocabulary word gets an entry in both tables.
pro_pos={}
pro_neg={}
print('开始训练')
# The denominators do not depend on the word, so compute them once.
pos_total=len(vocab)+len(pos_all_words)
neg_total=len(vocab)+len(neg_all_words)
for word in vocab:
    # A word missing from a frequency table never occurred in that class,
    # so its count is 0. Looking it up in the dict avoids scanning the
    # whole token list for every vocabulary word.
    pro_pos[word]=(pos_count.get(word,0)+1)/pos_total
    pro_neg[word]=(neg_count.get(word,0)+1)/neg_total
import math


def _classify(sentence):
    """Return the predicted label for one whitespace-tokenised sentence.

    The score of each class is the sum of the log-likelihoods of the
    sentence's known words. Working in log space keeps long sentences
    from underflowing to 0.0, which a product of many small
    probabilities would eventually do.

    Args:
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        1 if the positive class scores strictly higher, otherwise 0
        (ties, including sentences with no known word, are negative).
    """
    # Scores start from zero on every call so that no state leaks from
    # one sentence into the next.
    log_neg=0.0
    log_pos=0.0
    for word in sentence.split():
        # pro_pos and pro_neg have one entry per vocabulary word, and the
        # vocabulary was built without stop words, so a single dict
        # lookup decides whether the word contributes.
        if word in pro_pos:
            log_neg+=math.log(pro_neg[word])
            log_pos+=math.log(pro_pos[word])
    return 0 if log_neg>=log_pos else 1


print('开始测试')
test=pos_test+neg_test
# Expected labels, aligned with `test`: 1 for positive, 0 for negative.
label=[1]*len(pos_test)+[0]*len(neg_test)
predict=[_classify(sentence) for sentence in test]
count=sum(1 for expected,got in zip(label,predict) if expected==got)
# With no held-out sentences (corpora of at most `a` lines) there is
# nothing to divide by; report 0.0 rather than raising ZeroDivisionError.
acc=count/len(label) if label else 0.0
print('准确率为:{}'.format(acc))

# Interactive mode: classify sentences typed by the user until the input
# stream ends or the user interrupts.
while True:
    try:
        s=input('请输入一句话：')
    except (EOFError,KeyboardInterrupt):
        break
    if _classify(s)==0:
        print('标签为0，是负面的')
    else:
        print('标签为1，是正面的')