Basic Naive Bayes classifier code && n-fold cross-validation tuning (2)

This code is based on the code from the previous post.

The differences: it reads the data from txt files, and the min_df and max_df parameters of the vectorizer are changed.
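As a reminder of what those two parameters mean when given as integers: min_df keeps only terms that appear in at least that many documents, and max_df drops terms that appear in more than that many documents. A small standalone sketch on a made-up three-document corpus (separate from the script below):

from sklearn.feature_extraction.text import TfidfVectorizer

# "movie" appears in all 3 documents, so max_df=2 removes it from the vocabulary
docs = ["good movie good plot", "bad movie", "movie plot"]
vec = TfidfVectorizer(min_df=1, max_df=2)
vec.fit(docs)
print(sorted(vec.vocabulary_))  # ['bad', 'good', 'plot']

The full script: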

import re
import pandas as pd
import warnings
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB as MNB  # multinomial naive Bayes
from sklearn.naive_bayes import BernoulliNB as BNB    # Bernoulli naive Bayes (alternative)
from sklearn.model_selection import cross_val_score
warnings.filterwarnings("ignore")

def proces(col2):
    # keep letters only, lowercase, and split into a list of words
    col2_text = re.sub("[^a-zA-Z]", " ", col2)
    words = col2_text.lower().split()
    return words
# training set: tab-separated file, column 0 = sentiment label, column 1 = text
train = pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
print(train.head(5))
train_labers = train[0]
train_texts = train[1]
class_mapping = {'Negative': 0, 'Positive': 1}
train_labers = train_labers.map(class_mapping)

# test set in the same format
test = pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
test_labers = test[0]
test_texts = test[1]
test_labers = test_labers.map(class_mapping)

# clean every document and join the tokens back into one string per document
train_data = [' '.join(proces(t)) for t in train_texts]
test_data = [' '.join(proces(t)) for t in test_texts]

# the vectorizer is fitted on train + test together so both share one vocabulary
data_all = train_data + test_data
# min_df=1: keep terms that appear in at least one document
# max_df=60: drop terms that appear in more than 60 documents
count_vec = TfidfVectorizer(min_df=1,
                            max_df=60,
                            analyzer='word',
                            ngram_range=(1, 2),
                            use_idf=True,
                            smooth_idf=True,
                            sublinear_tf=True,
                            stop_words='english')
length = len(train_data)
count_vec.fit(data_all)
data_all = count_vec.transform(data_all)
# split the combined tf-idf matrix back into the train and test parts
train_data = data_all[:length]
test_data = data_all[length:]


# multinomial naive Bayes with Laplace smoothing and priors learned from the data
model = MNB(alpha=1.0, fit_prior=True, class_prior=None)
#model = BNB()
model.fit(train_data, train_labers)
#pred = model.predict(test_data)
#print("roc_auc", roc_auc_score(test_labers, pred))
'''
# sweep the number of folds and keep the best mean ROC AUC (kept commented out)
MX = 0.7996632996632996
MX_idx = 5
for i in range(400, 500):
    score = np.mean(cross_val_score(model, train_data, train_labers, cv=i, scoring='roc_auc'))
    if MX < score:
        MX = score
        MX_idx = i
print("roc_auc", MX, MX_idx)
'''
print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers, cv=297, scoring='roc_auc')))
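The commented-out lines above would score the held-out test set with hard predict() labels; ROC AUC is better computed from class probabilities. A minimal sketch of that evaluation, assuming the script above has already been run:

# evaluate on the held-out test set (reuses model, test_data, test_labers from above)
proba = model.predict_proba(test_data)[:, 1]   # probability of the Positive class (label 1)
print("test roc_auc", roc_auc_score(test_labers, proba))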

 

Reposted from: https://www.cnblogs.com/mpeter/p/11172284.html
