Getting Started with NLP: word2vec in Practice, Part 1 (English Text Analysis)

Bag of Words Meets Bags of Popcorn
https://www.kaggle.com/c/word2vec-nlp-tutorial/data

Version 1: without word2vec

4 files:
labeledTrainData, testData, unlabeledTrainData, sampleSubmission

Import the required libraries
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer  # simple word counts
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn.metrics import confusion_matrix  # confusion matrix for evaluation
import nltk
from nltk.corpus import stopwords

If the import fails with "no module named bz2", see:
http://stackoverflow.com/questions/8115280/importerror-no-module-named-bz2-for-python-2-7-2

An introduction to BeautifulSoup:
http://beautifulsoup.readthedocs.io/zh_CN/latest/

Read the data with pandas

datafile = os.path.join('labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')  # tab separator, '\' as escape character

print('Number of reviews: {}'.format(len(df)))
df.head()

print(df['review'][0])

The raw review still contains HTML tags such as <br />.
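As a quick illustration of what BeautifulSoup's get_text() does to such a review (a minimal sketch with a made-up string, not from the original post):

from bs4 import BeautifulSoup

raw = 'With all this stuff...<br /><br />Visually impressive.'
print(BeautifulSoup(raw, 'html.parser').get_text())
# prints: With all this stuff...Visually impressive.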

Preprocess (clean) the data:
strip the HTML, remove punctuation, tokenize, drop stopwords such as "the" and "a", and rejoin the words into one string.

# load the stopword list (one word per line in stopwords.txt)
eng_stopwords = set(line.rstrip() for line in open('stopwords.txt'))

def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()  # strip HTML
    text = re.sub(r'[^a-zA-Z]', ' ', text)                # keep letters only
    words = text.lower().split()                          # lowercase and split
    words = [w for w in words if w not in eng_stopwords]  # drop stopwords
    return ' '.join(words)                                # rejoin with spaces

df['clean_review'] = df.review.apply(clean_text)
df.head()

The cleaned data looks like this:
stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working one kid let alone whole bunch performing complex dance scene bottom line movie people like mj one level another think people stay away try give wholesome message ironically mj bestest buddy movie girl michael jackson truly one talented people ever grace planet guilty well attention gave subject hmmm well know people different behind closed doors know fact either extremely nice stupid guy one sickest liars hope latter

Extract bag-of-words features (CountVectorizer turns each cleaned review into a vector of word counts over a 5000-word vocabulary)

vectorizer = CountVectorizer(max_features=5000)
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
print(train_data_features.shape)
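To sanity-check the 5000-word vocabulary the vectorizer learned, you can list some feature names (a small optional step; get_feature_names() is the method on older scikit-learn releases, renamed get_feature_names_out() later):

vocab = vectorizer.get_feature_names()  # get_feature_names_out() on newer sklearn
print(vocab[:10])  # each column of train_data_features counts one of these words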

Train the model: a random forest classifier

forest = RandomForestClassifier(n_estimators=100)         
forest = forest.fit(train_data_features, df.sentiment)    

Predict on the training set

confusion_matrix(df.sentiment,forest.predict(train_data_features))
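The diagonal of the confusion matrix counts the correctly classified reviews, so training accuracy can be read straight off it (a quick sketch; near-perfect accuracy here mainly reflects a random forest memorizing its training set, not generalization):

cm = confusion_matrix(df.sentiment, forest.predict(train_data_features))
print(np.trace(cm) / float(cm.sum()))  # correct predictions / total predictions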

Process testData: clean it with the same function

datafile = os.path.join('testData.tsv')          
df = pd.read_csv(datafile, sep='\t',escapechar='\\')  
df['clean_review'] = df.review.apply(clean_text) 

Build the count feature matrix for the test set (with transform, not fit_transform)

test_data_features = vectorizer.transform(df.clean_review).toarray()

The difference between fit_transform and transform: fit_transform calls fit first (learning the vocabulary) and then transform; transform on its own reuses the vocabulary learned earlier, so it must be called on a vectorizer that has already been fitted.
http://stackoverflow.com/questions/23838056/what-is-the-difference-between-transform-and-fit-transform-in-sklearn
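A toy example of the difference (a minimal sketch): fit_transform learns the vocabulary from the corpus it is given, while transform reuses the vocabulary learned earlier, which is exactly why the test set above goes through transform only:

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
v.fit_transform(['good movie', 'bad movie'])       # learns vocabulary: bad, good, movie
print(v.transform(['good good movie']).toarray())  # counts against that same vocabulary
# prints: [[0 2 1]]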

result = forest.predict(test_data_features)                
output = pd.DataFrame({'id':df.id,'sentiment':result})     

This gives the results.
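The snippet stops at building the DataFrame; to actually produce a submission file you would also write it to disk (the filename below is my own choice, not from the original):

output.to_csv('Bag_of_Words_model.csv', index=False)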

Version 2: with word2vec

Import the required libraries
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import nltk

# punkt sentence splitter: word2vec training expects a list of sentences,
# each a list of words
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# load the unlabeled training data
def load_dataset(name,nrows=None):
    datasets = {
        'unlabeled_train':'unlabeledTrainData.tsv',
        'labeled_train' :'labeledTrainData.tsv',
        'test' : 'testData.tsv'
    }

    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join(datasets[name])
    df = pd.read_csv(data_file,sep='\t',escapechar='\\',nrows=nrows)
    # print format(len(df))
    return df

df = load_dataset('unlabeled_train')

# clean one sentence into a list of words
def clean_text(text):
    text = BeautifulSoup(text,'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]',' ',text)
    words = text.lower().split()
    return words


def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences

%time sentences = sum(df.review.apply(split_sentences), [])
print('{} reviews produced {} sentences'.format(len(df), len(sentences)))
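Note that sum(list_of_lists, []) re-copies the accumulated list on every addition and is quadratic in the number of reviews; for 50,000 unlabeled reviews, flattening with itertools.chain is much faster (an optional alternative, not in the original):

from itertools import chain
sentences = list(chain.from_iterable(df.review.apply(split_sentences)))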

Set up the word2vec model

num_features = 300    # word vector dimensionality
min_word_count = 40   # ignore words with fewer than 40 occurrences
num_workers = 4       # number of training threads
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

model_name = '{}features_{}minwords'.format(num_features, min_word_count)

model = Word2Vec(sentences, workers=num_workers, size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)

model.init_sims(replace=True)
model.save(os.path.join(model_name))
del df

print(model.doesnt_match('man woman child kitchen'.split()))
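Beyond doesnt_match, the trained model can be probed for nearest neighbors (these calls use the pre-4.0 gensim API that the rest of this post assumes; in gensim 4 they live on model.wv):

print(model.most_similar('man'))    # words closest to 'man' in vector space
print(model.most_similar('awful'))  # sanity check on a sentiment-bearing word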

Load the labeled data

df = load_dataset('labeled_train')
def to_review_vector(review):
    # represent a review as the mean of its words' vectors
    words = clean_text(review)
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))
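One caveat: if none of a review's words survived the min_count filter, array is empty and mean(axis=0) fails. A defensive variant (my own addition, not in the original) falls back to a zero vector:

def to_review_vector(review):
    words = clean_text(review)
    vecs = [model[w] for w in words if w in model]  # model.wv[...] in gensim 4
    if not vecs:  # no in-vocabulary words: return a zero vector of matching size
        return pd.Series(np.zeros(num_features))
    return pd.Series(np.array(vecs).mean(axis=0))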

Build a random forest on these review vectors

train_data_features = df.review.apply(to_review_vector)

forest = RandomForestClassifier(n_estimators = 100,random_state=42)
forest = forest.fit(train_data_features,df.sentiment)
del df
del train_data_features

Load the test data, run the forest model on it, and write out the results

df = load_dataset('test')
test_data_feature = df.review.apply(to_review_vector)

result = forest.predict(test_data_feature)
output =pd.DataFrame({'id':df.id,'sentiment':result})
output.to_csv(os.path.join('Word2Vec_model.csv'),index=False)
del df
del forest
del test_data_feature