# Sentiment analysis (情感分析)
import nltk.corpus as nc
import nltk.classify as cf
import nltk.classify.util as cu
# Sentiment classification of movie reviews with a Naive Bayes model.
#
# Every review file becomes one training sample: a dict that maps each token
# occurring in the file to True, paired with its class label.
pdata = []
fileids = nc.movie_reviews.fileids("pos")
for fileid in fileids:
    words = nc.movie_reviews.words(fileid)
    feature = {word: True for word in words}
    pdata.append((feature, "POSITIVE"))

# Same construction for the negative reviews.
ndata = []
nfileids = nc.movie_reviews.fileids("neg")
for fileid in nfileids:
    words = nc.movie_reviews.words(fileid)
    feature = {word: True for word in words}
    # Label spelling corrected (was "NEGETIVE"); this string is what the
    # classifier returns and what is printed for each review below.
    ndata.append((feature, "NEGATIVE"))

# The first 80% of each class is used for training, the remainder for testing.
pnum, nnum = int(0.8 * len(pdata)), int(0.8 * len(ndata))
train_data = pdata[:pnum] + ndata[:nnum]
test_data = pdata[pnum:] + ndata[nnum:]

model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model, test_data)
print("%.2f%%" % round(ac * 100, 2))

# Keywords: print the first element of each most-informative-feature entry.
tops = model.most_informative_features()
for top in tops:
    print(top[0])

reviews = [
    "It is a amazing movie.",
    "This is a dull movie. I would never recommend it to anyone",
    "The cinematography is pretty great in this movie.",
    "This direction was terrible and the story was all over the place."
]

# Classify each free-text review and remember the winning label together with
# the probability the model assigns to it.
# NOTE(review): whitespace splitting leaves case and punctuation attached
# ("It", "movie."); such tokens may not match the corpus vocabulary -- confirm
# whether a tokenizer/lower-casing step is wanted here.
sents, probs = [], []
for review in reviews:
    words = review.split()
    feature = {word: True for word in words}
    pcls = model.prob_classify(feature)
    sent = pcls.max()
    prob = pcls.prob(sent)
    sents.append(sent)
    probs.append(prob)

# The loop variable must be `prob`: binding it as `probs` (as before) printed
# the stale `prob` left over from the loop above for every review and also
# overwrote the `probs` list.
for review, sent, prob in zip(reviews, sents, probs):
    print(review, "->", sent, "%.2f%%" % round(prob * 100, 2))
# Gender identification (性别识别)
import random
import numpy as np
import nltk.corpus as nc
import nltk.classify as cf
# Gender identification from the trailing letters of a first name.
#
# One Naive Bayes classifier is trained per suffix length (1 to 5 letters);
# the one with the highest accuracy on the held-out half is used afterwards.
male_names = nc.names.words("male.txt")
female_names = nc.names.words("female.txt")

models, acs = [], []
for n_letter in range(1, 6):
    # Single feature per sample: the lower-cased last `n_letter` characters.
    # Male samples come first, then female ones, before shuffling.
    data = [
        ({"feature": male_name[-n_letter:].lower()}, "male")
        for male_name in male_names
    ]
    data.extend(
        ({"feature": female_name[-n_letter:].lower()}, "female")
        for female_name in female_names
    )

    # Re-seed on every pass so each suffix length gets the same shuffle.
    random.seed(7)
    random.shuffle(data)

    # First half trains the model, second half measures its accuracy.
    half = len(data) // 2
    train_data, test_data = data[:half], data[half:]

    model = cf.NaiveBayesClassifier.train(train_data)
    ac = cf.accuracy(model, test_data)
    acs.append(ac)
    models.append(model)

# Index 0 corresponds to a suffix length of 1, hence the +1.
best_index = np.array(acs).argmax()
best_letter = best_index + 1

names = ["Leonardo", "Amy", "Sam", "Tom", "Katherine", "Tayior",
         "Susanne", "Watermelon", "Alpaca", "Paris", "Python", "JAVA"]
print(names)

# Classify every sample name with the best-scoring model.
best_model = models[best_index]
genders = [
    best_model.classify({"feature": name[-best_letter:].lower()})
    for name in names
]
print(genders)
# Text classification (文本分类)
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb
# Newsgroup name -> short upper-case label. The keys select which newsgroups
# are loaded below; how the labels are consumed is outside this view.
# Label typos corrected: "MOTORCYLES" -> "MOTORCYCLES", "SPAVE" -> "SPACE".
cld = {
    "misc.forsale": "SALES",
    "rec.motorcycles": "MOTORCYCLES",
    "rec.sport.baseball": "BASEBALL",
    "sci.crypt": "CRYPTOGRAPHY",
    "sci.space": "SPACE",
}

# Load the training subset restricted to the newsgroups above, shuffled with a
# fixed seed. The category names are passed as an explicit list rather than a
# dict view.
train = sd.fetch_20newsgroups(
    subset="train",
    categories=list(cld),
    random_state=7,
    shuffle=True,
)
train_data = train.data
train_y = train.target
cat