A Two-Class Classifier in Practice (1)

import pandas as pd
import nltk


# Define the feature extractor: one boolean feature per word in word_features
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features["contains(%s)" % word] = (word in document_words)
    return features
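# A quick illustration (hypothetical input, not from the data set): for a tokenized
# title and a two-word feature list, the extractor returns boolean "contains(...)" features:
#   document_features(["power", "window", "motor"], ["window", "regulator"])
#   -> {"contains(window)": True, "contains(regulator)": False}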


# Read the Excel file: column "title" holds the product title, "categories" the 0/1 label
documents0 = []
documents1 = []
df = pd.read_excel("window regulator01.xlsx")
# print(df.head())
Nodf = df[df.categories == 0]
# print(len(Nodf))
for rows in Nodf.values:
    documents0.append((nltk.word_tokenize(rows[0]), "0"))  # rows[0] is assumed to be the title column
Yesdf = df[df.categories == 1]
# print(len(Yesdf))
for rows in Yesdf.values:
    documents1.append((nltk.word_tokenize(rows[0]), "1"))
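# For reference, the assumed layout of window regulator01.xlsx (illustrative rows, not real data):
#   title                                            categories
#   "Power Window Regulator with Motor Front Left"   1
#   "Ceramic Brake Pad Set Front"                    0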

# Analyze the whole corpus and take frequent words as features
sentences = ""  # concatenate every title into one string
titles = df["title"]
for title in titles:
    sentences = sentences + " " + title
words = nltk.word_tokenize(sentences)
print(len(words) / len(titles))  # average number of words per title
all_words = nltk.FreqDist(w.lower() for w in words)  # lowercase, then count word frequencies
word_features = list(all_words.keys())[:2000]  # NOTE: first 2000 distinct words in insertion order, not the 2000 most frequent; version 2 fixes this with most_common()
print(word_features)

# Feature extraction

# Train and test the classifier: label "0" = not a window regulator sample, "1" = window regulator
featuresets0 = [(document_features(d, word_features), c) for (d, c) in documents0]
featuresets1 = [(document_features(d, word_features), c) for (d, c) in documents1]
train_set = featuresets0[:2000]
train_set.extend(featuresets1[:2000])
print(len(train_set))
test_set = featuresets0[2000:]
test_set.extend(featuresets1[2000:])
print(len(test_set))
classifier = nltk.NaiveBayesClassifier.train(train_set)  # train the classifier
print(nltk.classify.accuracy(classifier, test_set))  # classification accuracy
classifier.show_most_informative_features(20)  # most informative features (prints itself, so no outer print needed)
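Once trained, the classifier can label a new title through the same feature extractor. A minimal sketch (the example title string is made up for illustration):

new_title = "Front Left Power Window Regulator with Motor"  # hypothetical input, not from the data set
tokens = nltk.word_tokenize(new_title)
label = classifier.classify(document_features(tokens, word_features))
print(label)  # "1" means window regulator, "0" means not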



Version 2

import pandas as pd
import nltk
# Define the feature extractor: one boolean feature per word in word_features
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features["contains(%s)" % word] = (word in document_words)
    return features

# Read the Excel file: column "title" holds the product title, "categories" the 0/1 label
documents0 = []
documents1 = []
df = pd.read_excel("window regulator01.xlsx")
Nodf = df[df.categories == 0]
# print(len(Nodf))
for rows in Nodf.values:
    documents0.append((nltk.word_tokenize(rows[0]), "0"))  # rows[0] is assumed to be the title column
Yesdf = df[df.categories == 1]
# print(len(Yesdf))
for rows in Yesdf.values:
    documents1.append((nltk.word_tokenize(rows[0]), "1"))

# Analyze the whole corpus and use the most frequent words as features
sentences = ""  # concatenate every title into one string
titles = df["title"]
for title in titles:
    sentences = sentences + " " + title
words = nltk.word_tokenize(sentences)
print(len(words) / len(titles))  # average number of words per title
all_words = nltk.FreqDist(w.lower() for w in words)  # lowercase, then count word frequencies
print("Vocabulary size = %d" % len(all_words))
common = all_words.most_common(500)  # the n most common (word, count) pairs
word_features = []  # use the top-n frequent words of the corpus as features
for item in common:
    word_features.append(item[0])  # take the first element of each tuple, i.e. the word
print("Number of features = %d" % len(word_features))
print(word_features)
# all_words.plot(50)

# Feature extraction

# Train and test the classifier: label "0" = not a window regulator sample, "1" = window regulator
featuresets0 = [(document_features(d, word_features), c) for (d, c) in documents0]
featuresets1 = [(document_features(d, word_features), c) for (d, c) in documents1]
selectsample = 2000
train_set = featuresets0[:selectsample]
train_set.extend(featuresets1[:selectsample])
print("Number of training samples = %d" % len(train_set))
test_set = featuresets0[selectsample:]
test_set.extend(featuresets1[selectsample:])
print("Number of test samples = %d" % len(test_set))
print("Training...")
classifier = nltk.NaiveBayesClassifier.train(train_set)  # train the classifier
print("Testing...")
print("Accuracy = %f" % nltk.classify.accuracy(classifier, test_set))  # classification accuracy
print("Most informative features:")
classifier.show_most_informative_features(20)  # prints the most informative features itself, so no outer print needed
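Accuracy alone can hide which class the errors come from. A hedged sketch of a per-class breakdown using nltk.ConfusionMatrix (the gold/predicted variable names are new here, not part of the original script):

gold = [label for (feats, label) in test_set]  # reference labels from the test set
predicted = [classifier.classify(feats) for (feats, label) in test_set]  # classifier's predictions
cm = nltk.ConfusionMatrix(gold, predicted)
print(cm.pretty_format(sort_by_count=True, show_percents=True))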



