Bag of Words Meets Bags of Popcorn
https://www.kaggle.com/c/word2vec-nlp-tutorial/data
版本 1 未用word2vec
4个文件 :
labeledTrainData testData unlabeledTrainData sampleSubmission
import 所需库
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer #简单计数
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.metrics import confusion_matrix #混淆矩阵 评估
import nltk
from nltk.corpus import stopwords
编译时报出no module named bz2 解决方法
http://stackoverflow.com/questions/8115280/importerror-no-module-named-bz2-for-python-2-7-2
关于bs的介绍
http://beautifulsoup.readthedocs.io/zh_CN/latest/
用pandas 读取数据
# Load the labeled training reviews (tab-separated, backslash as escape character).
datafile = os.path.join('labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
# print() with a single argument behaves the same on Python 2 and Python 3.
print(len(df))
df.head()
# Show the first raw review (plain ASCII quotes; typographic quotes are a syntax error).
print(df['review'][0])
里面有 `<br />` 这样的 HTML 标签
对数据进行预处理(清洗数据)
去掉html 去掉标点 token 去掉the a 重组句子
# Read one stop word per line; the context manager closes the file once read.
with open('stopwords.txt') as stopword_file:
    # NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords`
    # import at the top of the file.
    stopwords = {}.fromkeys([line.rstrip() for line in stopword_file])
# Set of stop words used for membership tests while cleaning reviews.
eng_stopwords = set(stopwords)
def clean_text(text):
    """Return `text` as a space-separated string of lowercase non-stop-words.

    HTML markup is stripped with BeautifulSoup, every character outside
    a-z/A-Z is replaced by a space, the result is lowercased and split on
    whitespace, and words present in the module-level `eng_stopwords` set
    are dropped.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()  # strip HTML markup
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # keep letters only
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    # Join with a single space: an empty separator would fuse the whole
    # review into one token, leaving nothing to split on downstream.
    return ' '.join(words)
# Store a cleaned copy of every review in a new column, then preview the frame.
df['clean_review'] = df['review'].apply(clean_text)
df.head()
得到清理后的数据
stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working one kid let alone whole bunch performing complex dance scene bottom line movie people like mj one level another think people stay away try give wholesome message ironically mj bestest buddy movie girl michael jackson truly one talented people ever grace planet guilty well attention gave subject hmmm well know people different behind closed doors know fact either extremely nice stupid guy one sickest liars hope latter
抽取 bag of words 特征 (用CountVectorizer 得到词频计数特征矩阵, 而非 one-hot)
# Bag-of-words counts with the vocabulary capped at 5000 entries.
vectorizer = CountVectorizer(max_features=5000)
# Learn the vocabulary from the cleaned training reviews and build a dense matrix.
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
# print() with a single argument behaves the same on Python 2 and Python 3.
print(train_data_features.shape)
训练模型
训练分类器
# Build a 100-tree random forest and fit it on the bag-of-words features.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features,
    df.sentiment,
)
在训练集上做predict
# Confusion matrix of the forest's predictions on the same rows it was fitted on.
confusion_matrix(df.sentiment,forest.predict(train_data_features))
处理testData 清洗数据
# Rebind `df` to the test set and clean its reviews the same way as the training set.
datafile = os.path.join('testData.tsv')
df = pd.read_csv(datafile, escapechar='\\', sep='\t')
df['clean_review'] = df['review'].apply(clean_text)
得到词频计数特征矩阵
# Only transform here, so the already-fitted vectorizer is reused as is.
test_data_features = vectorizer.transform(df['clean_review']).toarray()
fit_transform 和 transform 的区别: fit_transform 先调用 fit (从数据中学习词表等参数) 再调用 transform; 直接调用 transform 则需要之前 fit 得到的参数, 所以测试集只调用 transform, 以复用训练集上学到的词表
http://stackoverflow.com/questions/23838056/what-is-the-difference-between-transform-and-fit-transform-in-sklearn
# Predict a sentiment label for every test review and pair it with the review id.
result = forest.predict(test_data_features)
output = pd.DataFrame({'id': df['id'], 'sentiment': result})
得到结果
版本2 用word2vec
import 所需库
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import nltk
# NOTE(review): TreebankWordTokenizer.tokenize() yields word tokens, not sentences —
# confirm it is the right tool wherever sentence boundaries are needed.
tokenizer = TreebankWordTokenizer()
# Dataset loader (used below to read the unlabeled training data).
def load_dataset(name, nrows=None, data_dir=''):
    """Load one of the competition TSV files into a DataFrame.

    Args:
        name: One of 'unlabeled_train', 'labeled_train' or 'test'.
        nrows: Optional maximum number of rows to read; None reads all rows.
        data_dir: Directory containing the TSV files. The default '' resolves
            the file name relative to the current working directory.

    Returns:
        The pandas DataFrame parsed from the selected file.

    Raises:
        ValueError: If `name` is not a known dataset key.
    """
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv',
    }
    if name not in datasets:
        raise ValueError(
            'unknown dataset {!r}; expected one of {}'.format(name, sorted(datasets))
        )
    data_file = os.path.join(data_dir, datasets[name])
    # Files are tab-separated and use a backslash as the escape character.
    return pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
# Unlabeled reviews; their sentences become the Word2Vec training corpus below.
df = load_dataset('unlabeled_train')
# Text cleaning
def clean_text(text):
    """Return the lowercase words of `text` as a list.

    HTML markup is stripped with BeautifulSoup, every character outside
    a-z/A-Z is replaced by a space, and the lowercased result is split on
    whitespace.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    # `A-Z`, not `A-z`: the latter range also lets [ \ ] ^ _ and ` through.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower().split()
def split_sentences(review):
    """Split a review into sentences, each returned as a list of cleaned words.

    Requires NLTK's Punkt sentence models to be installed
    (e.g. via `nltk.download('punkt')`).

    Args:
        review: Raw review text.

    Returns:
        A list with one `clean_text` word list per non-empty sentence.
    """
    # Use NLTK's sentence splitter: a word tokenizer here would turn every
    # token into its own one-word "sentence", leaving Word2Vec no context window.
    raw_sentences = nltk.sent_tokenize(review.strip())
    return [clean_text(s) for s in raw_sentences if s]
# Flatten the per-review sentence lists into one corpus. A single comprehension
# is linear, whereas sum(lists, []) copies the growing list on every addition.
# (The IPython-only `%time` prefix is dropped so this is valid plain Python;
# add it back when timing the cell in a notebook.)
sentences = [
    sentence
    for review_sentences in df.review.apply(split_sentences)
    for sentence in review_sentences
]
# str.format() with a template; format(int, int) raises TypeError.
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))
设置word2vec model
# Word2Vec hyper-parameters; each is forwarded to the constructor below.
num_features = 300    # passed as `size`
min_word_count = 40   # passed as `min_count`
num_workers = 4       # passed as `workers`
context = 10          # passed as `window`
downsampling = 1e-3   # passed as `sample`

# NOTE(review): the original also passed min_word_count to format(), but the
# template has a single placeholder, so it was silently ignored. The file name
# is kept unchanged; confirm whether a second placeholder was intended.
model_name = '{}features_minwords'.format(num_features)
# `Word2Vec` is imported directly; the imports above never bind a `word2vec` name.
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
model.init_sims(replace=True)
model.save(os.path.join(model_name))
del df
# Quick sanity check of the trained vectors (plain ASCII quotes; typographic
# quotes are a syntax error).
print(model.doesnt_match("man woman child kitchen".split()))
读入labeled数据
# Rebind `df` to the labeled reviews (the previous frame was deleted above).
df = load_dataset('labeled_train')
def to_review_vector(review):
    """Return the mean word vector of a review as a pandas Series.

    The review is cleaned with `clean_text`; words missing from the
    module-level `model` are skipped and the remaining vectors are averaged.

    Args:
        review: Raw review text.

    Returns:
        A Series holding the element-wise mean of the in-vocabulary word
        vectors, or all zeros when the review has no in-vocabulary words.
    """
    words = clean_text(review)
    vectors = [model[w] for w in words if w in model]
    if not vectors:
        # The mean of an empty array is NaN with the wrong shape, which would
        # corrupt the feature matrix; fall back to a zero vector instead.
        return pd.Series(np.zeros(model.vector_size))
    return pd.Series(np.mean(vectors, axis=0))
用该数据建一个随机森林
# One averaged word vector per labeled review. The original bound this to the
# misspelled name `rain_data_features`, so the fit() below did not see it.
train_data_features = df.review.apply(to_review_vector)
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)
# Drop the training frame and features now that the forest is fitted.
del df
del train_data_features
读入test数据,应用到随机森林模型中,输出结果
# Vectorize the test reviews, predict their sentiment and write the submission file.
df = load_dataset('test')
test_data_feature = df['review'].apply(to_review_vector)
result = forest.predict(test_data_feature)
output = pd.DataFrame({'id': df['id'], 'sentiment': result})
output.to_csv(os.path.join('Word2Vec_model.csv'), index=False)
# Drop the large objects that are no longer needed.
del df, forest, test_data_feature