NLP Learning 4: IMDB Movie Review Sentiment Analysis

Kaggle competition: Bag of Words Meets Bags of Popcorn
Goal: extract text features from the given training set and train a classifier that correctly labels the sentiment of the reviews in the test set.

1. Bag-of-Words Features

For an explanation of the bag-of-words model, see the Naive Bayes chapter of my earlier post. Below I only cover how to build bag-of-words features with sklearn.
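To see what CountVectorizer actually produces before diving into the full pipeline, here is a minimal toy sketch (made-up documents, not part of the competition code):

from sklearn.feature_extraction.text import CountVectorizer
toy_vec = CountVectorizer()
toy_counts = toy_vec.fit_transform(['good good movie', 'bad movie'])
# columns are ordered alphabetically: bad, good, movie
print(toy_counts.toarray())  # [[0 2 1]
                             #  [1 0 1]]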

import numpy as np
import pandas as pd
train_data = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
# Arguments: header=0 uses the first row as column names
#            delimiter: the field separator ('\t' for a TSV file)
#            quoting=3: ignore double quotes (csv.QUOTE_NONE)
train_data.shape  # (25000, 3): 25000 rows, 3 columns
train_data.columns.values  # column names of the dataset
train_data.head(10)   # first 10 rows
train_data['review'][0]   # the review text of row 0

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# If the stopword list has not been downloaded yet, run nltk.download('stopwords') first
# Data-cleaning function: returns the cleaned review as one space-separated string of words
def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()  # strip HTML markup
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)     # regex: keep only letters; replace digits and other symbols with spaces
    words = letters_only.lower().split()  # lowercase and split into words
    stops = set(stopwords.words('english'))  # load the English stopword list
    meaningful_words = [w for w in words if w not in stops]  # remove stopwords
    return (" ".join(meaningful_words))
# Clean the training set
num_reviews = train_data['review'].size
clean_train_review = []
for i in range(0, num_reviews):
    if (i + 1) % 1000 == 0:
        print('Review %d of %d\n' % (i + 1, num_reviews))
    clean_train_review.append(review_to_words(train_data['review'][i]))    
# Build the bag-of-words features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word',
	tokenizer=None, # no custom tokenizer (tokenization is discussed later)
	preprocessor=None,
	stop_words=None, # stopwords were already removed during cleaning, so none are set here
	max_features=5000)  # limit the vocabulary to the 5000 most frequent words
train_data_features = vectorizer.fit_transform(clean_train_review)
train_data_features = train_data_features.toarray()
print(train_data_features.shape)    # (25000, 5000): the bag-of-words features are ready
# Next, pick a classifier from sklearn. I tried a random forest and an SVM; the final Kaggle scores were 0.56752 and 0.51308 respectively.
from sklearn.svm import SVC
train_label = train_data['sentiment']
train_label = np.array(train_label) # shape=(25000, )
svm_model = SVC(kernel='rbf', gamma=0.1)
svm_model.fit(train_data_features, train_label) # train
test_data = pd.read_csv('testData.tsv', header=0, delimiter='\t', quoting=3) # load the test data
num_test_reviews = test_data['review'].size
clean_test_reviews = []
for i in range(0, num_test_reviews):
    if (i + 1) % 1000 == 0:
        print('Review %d of %d\n' % (i + 1, num_test_reviews))
    clean_test_reviews.append(review_to_words(test_data['review'][i]))
test_features = vectorizer.transform(clean_test_reviews)  # transform, not fit_transform, so the training vocabulary is reused
test_features = test_features.toarray()
pred = svm_model.predict(test_features)  # predict on the test set
output = pd.DataFrame(data={'id':test_data['id'], 'sentiment':pred})
output.to_csv('Bag_of_words_model.csv', index=False, quoting=3)

#from sklearn.ensemble import RandomForestClassifier
#forest = RandomForestClassifier(n_estimators = 100)
#forest = forest.fit( train_data_features, train_data["sentiment"] )
#pred = forest.predict(test_features)    

The weakness of bag-of-words features is that they break a sentiment-laden sentence into unrelated, order-free words, so they carry little of the context that matters for sentiment analysis.
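For instance (a toy sketch with made-up sentences), the two sentences below express opposite opinions yet get identical bag-of-words vectors, because only word counts survive:

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
counts = vec.fit_transform(['the plot is good not bad', 'the plot is bad not good'])
print((counts.toarray()[0] == counts.toarray()[1]).all())  # True: word order is lost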

2. Word Vectors

For background on word vectors, see my posts NLP Learning 2 and 3. In short, word vectors preserve the contextual relationships between words, so semantically related words end up close to each other. Here we train word vectors with gensim.

import pandas as pd
train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):  # when training word vectors it is better to keep the stopwords
    review_text = BeautifulSoup(review, 'lxml').get_text()
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()
    if remove_stopwords:
       stops = set(stopwords.words('english'))
       words = [w for w in words if w not in stops]
    return words
import nltk.data
import nltk
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # sentence splitter; english.pickle is the pretrained English Punkt model
# Note: in English text, a sentence-ending period is followed by a space.
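# A quick look at what the sentence splitter does (made-up example, illustrative only):
print(tokenizer.tokenize('This movie is great. I watched it twice!'))
# expected: ['This movie is great.', 'I watched it twice!']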
# Sentence-splitting function
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())    # split the review into individual sentences
    sentences = [] # holds the sentences of this review; one review may contain several sentences
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
           sentences.append(review_to_wordlist(raw_sentence))
    return sentences

sentences = []
num_review = train['review'].size
for i in range(num_review):
    if (i+1)%5000 == 0:
       print("Review %d of %d\n" % (i+1, num_review)) 
    sentences += (review_to_sentences(train['review'][i], tokenizer))
    # Note: append() cannot be used here, because review_to_sentences returns a nested list of the
    # form [[...], [...]] (one inner list per sentence); append() would add that whole structure as a
    # single element, while Word2Vec needs a flat list of sentences, so we concatenate with +=.
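    # Toy illustration of the difference (made-up lists, not part of the pipeline):
    # demo = []; demo.append([['a', 'b'], ['c']])  ->  demo == [[['a', 'b'], ['c']]]  (one nested element)
    # demo = []; demo += [['a', 'b'], ['c']]       ->  demo == [['a', 'b'], ['c']]    (flat, as Word2Vec expects)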
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # log training progress
from gensim.models import Word2Vec
model = Word2Vec(sentences, 
		workers=4, 
		size=300, # dimensionality of the word vectors (number of hidden-layer units)
		min_count = 40,  # ignore words that occur fewer than 40 times
		window = 10,  # context window size
		sample = 1e-3,  # threshold for randomly downsampling very frequent words
		sg=1)  # sg=1: skip-gram model, sg=0: CBOW model
model.init_sims(replace=True) # precompute the L2-normalized vectors and discard the raw ones; saves memory, but the model can no longer be trained further
model_name = "300features_40minwords_10context_1" 
model.save(model_name)		    # training runs for 5 epochs by default
# After training we can run a few sanity checks on the word vectors
model.similarity('man', 'woman') # cosine similarity between the two words
model.most_similar('awful')  # the 10 words most similar to 'awful' (topn defaults to 10; this is unrelated to the window size)
model.doesnt_match('man woman child kitchen'.split())  # the word that fits least with the others; returns 'kitchen' here
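model.similarity is simply the cosine similarity of the two word vectors; a minimal check (a sketch, assuming the same old-style gensim API used above):

import numpy as np
v1, v2 = model['man'], model['woman']
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(np.isclose(cos, model.similarity('man', 'woman')))  # True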

3. Bag-of-Centroids Features

from gensim.models import Word2Vec
model = Word2Vec.load('300features_40minwords_10context_1')
model.wv.syn0.shape  # (8306, 300): 8306 words in the vocabulary
model['awful'][:20] # first 20 components of the word vector for 'awful'
# array([ 0.01825871,  0.06728431, -0.0239216 ,  0.01360764, -0.0137817 ,
      # -0.00200418,  0.09867329,  0.00460091, -0.0635394 , -0.02026206,
       # 0.08574579,  0.02123835,  0.00635097,  0.0113341 , -0.00884318,
      # -0.02041932,  0.01206732, -0.01385165,  0.00291235,  0.02749209],
     # dtype=float32)
import numpy as np
import pandas as pd
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords     
def review_to_wordlist(review, remove_stopwords=False): 
    review_text = BeautifulSoup(review, 'lxml').get_text()
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()
    if remove_stopwords:
       stops = set(stopwords.words('english'))
       words = [w for w in words if w not in stops]
    return words

clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))
clean_test_reviews = []    
for review in test['review']:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))

# First, cluster the word vectors with KMeans so that semantically similar words end up in the same cluster
from sklearn.cluster import KMeans
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] // 5    # number of clusters: roughly 5 words per cluster on average
kmeans_clustering = KMeans( n_clusters = num_clusters, n_jobs=-2 )
idx = kmeans_clustering.fit_predict( word_vectors )  # assign every word to a cluster; idx has shape (8306,)
word_centroid_map = dict(zip( model.wv.index2word, idx )) # map each word to its cluster index
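# Optional sanity check (illustrative only): print the words assigned to the first 3 clusters
for cluster_id in range(3):
    print([word for word, c in word_centroid_map.items() if c == cluster_id])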
# Function that builds the bag-of-centroids feature vector for one review
def create_bag_of_centroids( wordlist, word_centroid_map ):
    num_centroids = max( word_centroid_map.values() ) + 1  # total number of clusters
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    for word in wordlist:
        if word in word_centroid_map:
           index = word_centroid_map[word]
           bag_of_centroids[index] += 1  # analogous to bag-of-words: instead of one column per vocabulary word there is one column per cluster, and a cluster's count is incremented for every word of the review that belongs to it
    return bag_of_centroids
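# Toy illustration with a made-up word_centroid_map (3 hypothetical clusters, not the real KMeans output):
toy_map = {'good': 0, 'great': 0, 'awful': 1, 'movie': 2}
print(create_bag_of_centroids(['good', 'great', 'movie'], toy_map))
# expected: [2. 0. 1.] -- cluster 0 appears twice, cluster 1 never, cluster 2 once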

train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids( review, word_centroid_map )
    counter += 1           
test_centroids = np.zeros(( test["review"].size, num_clusters), dtype="float32" )
counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map )
    counter += 1
# Feature extraction is done; classify with a random forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_centroids,train["sentiment"])
result = forest.predict(test_centroids)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )        