Word2vec in Practice

This post walks through sentiment analysis on English movie reviews with Word2vec: clean the raw text, train word vectors with gensim, average them into per-review features, and train a random-forest classifier on those features.

Sentiment analysis on English text

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
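The stopword list and the punkt sentence splitter used below are separate NLTK downloads; if they are not already present, a one-time setup step (not shown in the original post) is:

import nltk
nltk.download("stopwords")  # English stopword list used in clean_text
nltk.download("punkt")      # punkt sentence tokenizer used in split_sentences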

def load_dataset(name,nrows=None):
    datasets={
        "unlabeled_train":"unlabeledTrainData.tsv",
        "labeled_train":"labeledTrainData.tsv",
        "test":"testData.tsv"
    }
    if name not in datasets:
        raise ValueError(name)
    data_file=os.path.join("..","data",datasets[name])
    df=pd.read_csv(data_file,sep="\t",escapechar="\\",nrows=nrows)
    print("number of reviews:{}".format(len(df)))
    return df

# load the unlabeled reviews (used only to train the word vectors)
df=load_dataset("unlabeled_train")
print(df.head())  # 50000 reviews

# preprocessing: stopword set for optional filtering
stopword=set(stopwords.words("english"))

def clean_text(text,remove_stopwords=False):
    # strip HTML tags, keep letters only, lowercase, optionally drop stopwords
    text=BeautifulSoup(text,"html.parser").get_text()
    text=re.sub(r"[^a-zA-Z]"," ",text)
    words=text.lower().split()
    if remove_stopwords:
        words=[w for w in words if w not in stopword]
    return words
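For intuition, here is what clean_text returns on a made-up snippet (the sample string is mine, not from the dataset):

sample="<br />This movie was GREAT, 10/10!"
print(clean_text(sample))                        # ['this', 'movie', 'was', 'great']
print(clean_text(sample,remove_stopwords=True))  # ['movie', 'great']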

# punkt sentence splitter, loaded once and reused
tokenizer=nltk.data.load("tokenizers/punkt/english.pickle")

def split_sentences(review):
    # split a review into sentences, then clean each sentence into a word list
    raw_sentences=tokenizer.tokenize(review.strip())
    sentences=[clean_text(s) for s in raw_sentences if s]
    return sentences

# tokenize every review in the dataframe: word lists per review, plus sentence lists for Word2Vec

df["clean_review"]=df.review.apply(clean_text)
sentences=sum(df.review.apply(split_sentences),[])
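sum(..., []) concatenates the per-review sentence lists into one flat list. On a 50,000-review corpus the repeated list concatenation is slow; an equivalent flattening with itertools.chain (my alternative, not part of the original post) avoids the quadratic cost:

from itertools import chain
sentences=list(chain.from_iterable(df.review.apply(split_sentences)))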

# train the word-embedding model with gensim
num_features=300      # dimensionality of the word vectors
min_word_count=40     # ignore words that occur fewer than 40 times
num_workers=4         # parallel worker threads
context=10            # context window size
downsampling=1e-3     # downsampling rate for very frequent words

# note: in gensim>=4.0 the size parameter is named vector_size
model=Word2Vec(sentences,workers=num_workers,size=num_features,min_count=min_word_count,
               window=context,sample=downsampling)

model.init_sims(replace=True)
model.save(os.path.join("..","models","model_name"))

# sanity-check the trained vectors
print(model.wv.doesnt_match("man woman child kitchen".split()))  # kitchen
print(model.wv.most_similar("man"))
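The saved model can later be reloaded without retraining; a minimal sketch, assuming the same ../models/model_name path used in the save call above (the query word "movie" is just an example):

from gensim.models.word2vec import Word2Vec
model=Word2Vec.load(os.path.join("..","models","model_name"))
print(model.wv.most_similar("movie",topn=5))  # nearest neighbours of an example word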


# load the labeled training data
df=load_dataset("labeled_train")

def to_review_vector(review):
    # represent a review as the average of its in-vocabulary word vectors
    words=clean_text(review,remove_stopwords=True)
    vectors=np.array([model.wv[w] for w in words if w in model.wv])
    if len(vectors)==0:
        return pd.Series(np.zeros(num_features))
    return pd.Series(vectors.mean(axis=0))

train_data_feature=df.review.apply(to_review_vector)
print(train_data_feature.head())

# fit a random-forest classifier on the averaged review vectors
forest=RandomForestClassifier(n_estimators=100,random_state=42)
forest.fit(train_data_feature,df["sentiment"])

print(confusion_matrix(df["sentiment"],forest.predict(train_data_feature)))
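The confusion matrix above is computed on the same data the forest was trained on, so it overstates accuracy. A quick sanity check with 5-fold cross-validation (my addition, using the same features and labels) gives a more honest estimate:

from sklearn.model_selection import cross_val_score
scores=cross_val_score(RandomForestClassifier(n_estimators=100,random_state=42),
                       train_data_feature,df["sentiment"],cv=5)
print("cross-validated accuracy: {:.3f}".format(scores.mean()))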

# release the training data before processing the test set
del df
del train_data_feature


df=load_dataset("test")
test_data=df.review.apply(to_review_vector)

predict=forest.predict(test_data)
output=pd.DataFrame({"id":df["id"],"sentiment":predict})
# save the predictions to a csv file (see below)
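The original post stops at the comment; writing the submission out with pandas would look like this (the file name is my choice, not specified in the post):

output.to_csv(os.path.join("..","data","word2vec_rf_submission.csv"),index=False)  # hypothetical file name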
