# 2. Sentiment analysis with word2vec

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
def load_dataset(name, nrows=None):
    datasets = {
        "unlabeled_train": "unlabeledTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv",
    }
    if name not in datasets:
        raise ValueError("unknown dataset name: {}".format(name))
    data_file = os.path.join("..", "data", datasets[name])
    df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
    print("number of reviews: {}".format(len(df)))
    return df
# Load the unlabeled data (50,000 reviews)
df = load_dataset("unlabeled_train")
print(df.head())
# Text preprocessing
stopword_set = set(stopwords.words("english"))

def clean_text(text, remove_stopwords=False):
    # Strip HTML tags, keep only letters, lowercase, and split into words
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in stopword_set]
    return words
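# Quick sanity check (illustrative input, not from the dataset):
# clean_text("<br />This movie is GREAT!", remove_stopwords=True) -> ['movie', 'great']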
def split_sentences(review):
    # Split a review into sentences, then each sentence into a cleaned word list.
    # word_tokenize would split into individual words, which is wrong here;
    # sent_tokenize (requires the nltk "punkt" data) gives whole sentences.
    raw_sentences = sent_tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences
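# Example: split_sentences("Good film. Bad ending.")
# -> [['good', 'film'], ['bad', 'ending']]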
# Preprocess and tokenize the review text in the dataframe
df["clean_review"] = df.review.apply(clean_text)
# Concatenate the per-review sentence lists into one flat list of sentences
sentences = sum(df.review.apply(split_sentences), [])
# Train a word-embedding model with gensim
num_features = 300    # embedding dimensionality
min_word_count = 40   # ignore words rarer than this
num_workers = 4       # parallel training threads
context = 10          # context window size
downsampling = 1e-3   # downsampling rate for frequent words

# gensim >= 4 API; older versions use size= instead of vector_size=,
# and no longer need the deprecated init_sims(replace=True) call.
model = Word2Vec(sentences, workers=num_workers, vector_size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)
model.save(os.path.join("..", "models", "model_name"))
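# To reuse the trained embeddings later, reload from the same path:
# model = Word2Vec.load(os.path.join("..", "models", "model_name"))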
# Inspect the trained word vectors
print(model.wv.doesnt_match("man woman child kitchen".split()))  # expected: kitchen
print(model.wv.most_similar("man"))
df = load_dataset("labeled_train")

def to_review_vector(review):
    # Represent a review as the average of its in-vocabulary word vectors.
    # Note: np.array must be given a list, not a generator, or it produces
    # a useless 0-d object array.
    words = clean_text(review, remove_stopwords=True)
    vectors = np.array([model.wv[w] for w in words if w in model.wv])
    if len(vectors) == 0:
        # Guard against reviews with no in-vocabulary words
        return pd.Series(np.zeros(num_features))
    return pd.Series(vectors.mean(axis=0))

train_data_feature = df.review.apply(to_review_vector)
print(train_data_feature.head())
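# Each review is now a num_features-dimensional vector; with the full Kaggle
# labeled set (25,000 reviews) this gives a 25000 x 300 feature matrix.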
# Build a classifier with a random forest
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(train_data_feature, df["sentiment"])
print(confusion_matrix(df["sentiment"], forest.predict(train_data_feature)))
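# The confusion matrix above is computed on the training data, so it overstates
# accuracy. A minimal sketch of an out-of-sample estimate with scikit-learn's
# cross_val_score (the 5-fold choice is an assumption):
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(forest, train_data_feature, df["sentiment"], cv=5)
# print(scores.mean())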
# Free memory before loading the test set
del df
del train_data_feature

# Predict on the test set and assemble the submission
df = load_dataset("test")
test_data = df.review.apply(to_review_vector)
predict = forest.predict(test_data)
output = pd.DataFrame({"id": df["id"], "sentiment": predict})
# Save to a CSV file
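# Write the predictions in Kaggle submission format (an id column and a
# sentiment column); the output filename below is an assumption.
output.to_csv(os.path.join("..", "data", "word2vec_submission.csv"), index=False)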