统计文章中的匹配率

def read_data(file):
    """Read a UTF-8 text file and return its lines, whitespace-trimmed.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Leading
        and trailing whitespace (including the line terminator) is removed
        from every entry; blank lines are kept as empty strings.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

# Input files: a stop-word list, the positive corpus and the negative corpus.
# Every line of a corpus file is treated as one sentence whose tokens are
# separated by whitespace.
file1=r'C:\Users\Administrator\PycharmProjects\untitled\data\stop_words.txt'
file2=r'C:\Users\Administrator\PycharmProjects\untitled\data\pos.txt'
file3=r'C:\Users\Administrator\PycharmProjects\untitled\data\neg.txt'
# Stop words are only ever used for membership tests, once per token of the
# corpus, so keep them in a set (O(1) lookup) instead of a list (O(n) scan).
stop_word=set(read_data(file1))
pos=read_data(file2)
neg=read_data(file3)
# The first `a` sentences of each class form the training set; whatever
# remains after that is held out for testing.
a=5300
pos_train=pos[:a]
neg_train=neg[:a]
pos_test=pos[a:]
neg_test=neg[a:]
pos_all_words=[]
neg_all_words=[]
print('数据处理')
# Collect every non-stop-word token of the training sentences, per class.
# Duplicates are kept on purpose: these lists are the raw material for the
# per-class word frequencies.
for sentence in pos_train:
    pos_all_words.extend(
        word for word in sentence.split() if word not in stop_word
    )
for sentence in neg_train:
    neg_all_words.extend(
        word for word in sentence.split() if word not in stop_word
    )
# The vocabulary is the list of distinct words seen in either class.
vocab=list(set(pos_all_words).union(neg_all_words))
# Frequency of every word in the training text of each class.
pos_count={}
neg_count={}
for word in pos_all_words:
    pos_count[word]=pos_count.get(word,0)+1
for word in neg_all_words:
    neg_count[word]=neg_count.get(word,0)+1
pro_pos={}
pro_neg={}
print('开始训练')
# Laplace (add-one) smoothed estimate of P(word | class):
#     (n_k + 1) / (len(vocab) + number of training tokens in the class)
# where n_k is how often the word occurs in that class. The denominators do
# not depend on the word, so they are computed once.
pos_total=len(vocab)+len(pos_all_words)
neg_total=len(vocab)+len(neg_all_words)
for word in vocab:
    # A word missing from a class's count table has n_k = 0. Looking it up
    # in the dict avoids scanning the whole token list for every word.
    pro_pos[word]=(pos_count.get(word,0)+1)/pos_total
    pro_neg[word]=(neg_count.get(word,0)+1)/neg_total
import math


def _predict_label(sentence):
    """Classify one whitespace-tokenised sentence as negative (0) or positive (1).

    Every token that has an entry in the trained probability tables
    contributes its per-class word probability; tokens without an entry are
    ignored. The probability tables are keyed by the vocabulary, which was
    built from non-stop-words only, so stop words are skipped as well.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities, so that long sentences do not underflow to 0.0.

    Args:
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        0 when the negative score is greater than or equal to the positive
        score (ties go to the negative class), otherwise 1.
    """
    log_neg=log_pos=0.0
    for word in sentence.split():
        if word in pro_pos:
            log_neg+=math.log(pro_neg[word])
            log_pos+=math.log(pro_pos[word])
    return 0 if log_neg>=log_pos else 1


print('开始测试')
# Held-out sentences: positives first (label 1), then negatives (label 0).
test=pos_test+neg_test
label=[1]*len(pos_test)+[0]*len(neg_test)
predict=[_predict_label(sentence) for sentence in test]
count=sum(1 for expected,guess in zip(label,predict) if expected==guess)
# With no held-out sentences there is nothing to measure; report 0.0 instead
# of failing with a division by zero.
acc=count/len(label) if label else 0.0
print('准确率为:{}'.format(acc))
while True:
    try:
        s=input('请输入一句话:')
    except EOFError:
        # Input stream closed: leave the interactive loop cleanly.
        break
    # Each input is scored from scratch; nothing carries over from the
    # previous sentence.
    if _predict_label(s)==0:
        print('标签为0,是负面的')
    else:
        print('标签为1,是正面的')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值