Kaggle竞赛:Bag of Words Meets Bags of Popcorn
目的:根据给出的训练数据集提取相关文本特征,训练一个合适的分类模型能够对测试集的review正确分类。
一、词袋特征
有关词袋模型的解释请看我的博文朴素贝叶斯一章。下面我只讲解用sklearn构造词袋特征。
import numpy as np
import pandas as pd
train_data = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
# Parameter meaning: header specifies which row to use as column names; the default is 0
# delimiter: field separator ('\t' because the file is a TSV)
# quoting=3 (csv.QUOTE_NONE): ignore double quotes inside fields
train_data.shape # (25000, 3) 2-D data: 25000 rows, 3 columns
train_data.columns.values # show the dataset's column names
train_data.head(10) # show the first 10 rows of the dataset
train_data['review'][0] # show the review in row 0
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# If stopwords has not been downloaded yet, run nltk.download('stopwords') first
# Data-cleaning helper: returns the cleaned words of one review
def review_to_words(raw_review):
    """Clean one raw review and return it as a single space-joined string.

    Strips HTML markup, replaces every non-letter with a space,
    lower-cases the text, and removes English stopwords.
    """
    text = BeautifulSoup(raw_review, 'lxml').get_text()  # strip HTML markup
    text = re.sub('[^a-zA-Z]', ' ', text)  # regex: keep only a-z/A-Z; other symbols and digits become spaces
    stop_set = set(stopwords.words('english'))  # load the stopword list
    kept = (token for token in text.lower().split() if token not in stop_set)
    return " ".join(kept)
# Clean the training set
num_reviews = train_data['review'].size
clean_train_review = []
for position, raw_text in enumerate(train_data['review'], start=1):
    if position % 1000 == 0:  # progress report every 1000 reviews
        print('Review %d of %d\n' % (position, num_reviews))
    clean_train_review.append(review_to_words(raw_text))
# Build the bag-of-words features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word',
tokenizer=None, # no custom tokenizer set; tokenizers are discussed later
preprocessor=None,
stop_words=None, # stopwords were already removed above, so none are set here
max_features=5000) # vocabulary size capped at 5000
train_data_features = vectorizer.fit_transform(clean_train_review)
train_data_features = train_data_features.toarray()
print(train_data_features.shape) # (25000, 5000) -- the bag-of-words features are now ready
# Next pick a classification model from sklearn. I tried both a random forest
# and a support vector machine; their final scores were 0.56752 and 0.51308.
from sklearn.svm import SVC
train_label = train_data['sentiment']
train_label = np.array(train_label) # shape=(25000, )
svm_model = SVC(kernel='rbf', gamma=0.1)
svm_model.fit(train_data_features, train_label) # train
test_data = pd.read_csv('testData.tsv', header=0, delimiter='\t', quoting=3)  # load the test data
num_test_reviews = test_data['review'].size
clean_test_reviews = []
for i in range(0, num_test_reviews):
    if (i + 1) % 1000 == 0:
        # Bug fix: the progress message previously reported num_reviews
        # (the training-set size) instead of the test-set size.
        print('Review %d of %d\n' % (i + 1, num_test_reviews))
    clean_test_reviews.append(review_to_words(test_data['review'][i]))
# Bug fix: use transform(), NOT fit_transform(), on the test set.
# fit_transform() re-learns a brand-new vocabulary from the test reviews,
# so column j would no longer correspond to the word the SVM was trained on;
# transform() reuses the vocabulary fitted on the training set.
test_features = vectorizer.transform(clean_test_reviews)
test_features = test_features.toarray()
pred = svm_model.predict(test_features)  # predict
output = pd.DataFrame(data={'id':test_data['id'], 'sentiment':pred})
output.to_csv('Bag_of_words_model.csv', index=False, quoting=3)
#from sklearn.ensemble import RandomForestClassifier
#forest = RandomForestClassifier(n_estimators = 100)
#forest = forest.fit( train_data_features, train_data["sentiment"] )
#pred = forest.predict(test_features)
词袋特征的缺点是将各个有情感的句子分成毫不相干的词语,这样的特征对情感分析帮助不大。
二、词向量
关于词向量的理解可以看我的博文NLP学习2和3,总而言之词向量可以很好的保留上下文的关系,使词语之间的语义更加完善。这里用gensim实现词向量。
import pandas as pd
train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
from bs4 import BeautifulSoup
# Bug fix: the next two import statements were garbled into
# "import refrom nltk.corpus" / "import stopwords"; restored to the
# two intended statements so that re and stopwords are actually usable.
import re
from nltk.corpus import stopwords
def review_to_wordlist(review, remove_stopwords=False):
    """Clean one review and return it as a list of lower-case word tokens.

    When training word vectors it is usually better to KEEP stopwords,
    so removal is off by default.
    """
    text = BeautifulSoup(review, 'lxml').get_text()
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    if not remove_stopwords:
        return tokens
    stop_set = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_set]
import nltk.data
import nltk
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # sentence splitter: breaks text at sentence-ending punctuation; english.pickle is the English Punkt model
# Note: in English prose, a sentence end is followed by a space.
# Sentence-splitting helper
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split one review into sentences and tokenize each sentence.

    Returns a list of word lists -- one inner list per sentence, so a
    single review may contribute several entries.

    Bug fix: remove_stopwords was previously accepted but never forwarded
    to review_to_wordlist, so passing True had no effect; it is now passed
    through (the default False preserves the old behavior).
    """
    raw_sentences = tokenizer.tokenize(review.strip())  # split the review into sentences first
    sentences = []  # holds sentences, not reviews: one review may yield several sentences
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:  # skip empty sentence strings
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences
sentences = []
num_review = train['review'].size
for i in range(num_review):
if (i+1)%5000 == 0:
print("Review %d of %d\n" % (i+1, num_review))
sentences += (review_to_sentences(train['review'][i], tokenizer))
# Note: append() cannot be used here. review_to_sentences returns each review
# as a nested 2-D list of the form [[...], [...]]; append() would insert that
# whole nested list as a single element, while we need one flat list of
# sentences -- hence += (i.e. list extend).
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # training log
from gensim.models import Word2Vec
model = Word2Vec(sentences,
workers=4,
size=300, # embedding dimension, i.e. number of hidden-layer units
min_count = 40, # ignore words whose total frequency is below 40
window = 10, # context window size
sample = 1e-3, # threshold for random down-sampling of high-frequency words
sg=1) # sg=1 selects the skip-gram model, sg=0 selects CBOW
model.init_sims(replace=True) # precompute L2-normalized vectors; replace=True discards the raw vectors to save memory, so the model cannot be trained further (NOTE(review): deprecated in newer gensim versions)
model_name = "300features_40minwords_10context_1"
model.save(model_name) # trains for 5 epochs by default
# After training we can run a few checks to see how good the word vectors are
model.similarity('man', 'woman') # similarity score between the two words
model.most_similar('awful') # the 10 words most similar to 'awful' (10 is most_similar's default topn, not the window size)
model.doesnt_match('man woman child kitchen'.split()) # the word that fits least with the others; returns kitchen here
三、bag-of-centroids特征
from gensim.models import Word2Vec
model = Word2Vec.load('300features_40minwords_10context_1')
model.wv.syn0.shape # (8306, 300), i.e. 8306 words in the vocabulary
model['awful'][:20] # inspect the first 20 components of the vector for 'awful'
# array([ 0.01825871, 0.06728431, -0.0239216 , 0.01360764, -0.0137817 ,
# -0.00200418, 0.09867329, 0.00460091, -0.0635394 , -0.02026206,
# 0.08574579, 0.02123835, 0.00635097, 0.0113341 , -0.00884318,
# -0.02041932, 0.01206732, -0.01385165, 0.00291235, 0.02749209],
# dtype=float32)
# Bug fix: this section uses np.zeros (below) and RandomForestClassifier
# (at the end) but never imported numpy or sklearn.ensemble -- both would
# raise NameError when the script reaches them.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
def review_to_wordlist(review, remove_stopwords=False):
    """Return one review as a list of lower-case alphabetic tokens.

    HTML markup is stripped and non-letters are replaced by spaces;
    English stopwords are filtered out only when remove_stopwords is True.
    """
    plain = BeautifulSoup(review, 'lxml').get_text()
    words = re.sub('[^a-zA-Z]', ' ', plain).lower().split()
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]
    return words
# Clean every training and test review, removing stopwords this time
clean_train_reviews = [review_to_wordlist(review, remove_stopwords=True)
                       for review in train['review']]
clean_test_reviews = [review_to_wordlist(review, remove_stopwords=True)
                      for review in test['review']]
# First run K-means on the word vectors so that semantically similar words
# are grouped into the same cluster
from sklearn.cluster import KMeans
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] // 5 # number of clusters: roughly 5 words per cluster
kmeans_clustering = KMeans( n_clusters = num_clusters, n_jobs=-2 ) # NOTE(review): n_jobs was removed from KMeans in newer scikit-learn versions
idx = kmeans_clustering.fit_predict( word_vectors ) # cluster id assigned to every word; idx has shape (8306, )
word_centroid_map = dict(zip( model.wv.index2word, idx )) # map each vocabulary word to its cluster id
# Helper that extracts the bag-of-centroids feature vector for one review
def create_bag_of_centroids( wordlist, word_centroid_map ):
    """Count, per K-means cluster, how many review words fall into it.

    Analogous to bag-of-words: instead of one column per vocabulary word
    there is one column per cluster, and every known word increments the
    count of the cluster it belongs to.
    """
    num_centroids = max( word_centroid_map.values() ) + 1  # total number of clusters
    counts = np.zeros( num_centroids, dtype="float32" )
    for token in wordlist:
        cluster = word_centroid_map.get(token)  # None for out-of-vocabulary words
        if cluster is not None:
            counts[cluster] += 1
    return counts
# One bag-of-centroids row per review, for training and test sets alike
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)
# Feature extraction done; classify with a random forest
forest = RandomForestClassifier(n_estimators = 100) # NOTE(review): requires "from sklearn.ensemble import RandomForestClassifier", which this section never imports
forest = forest.fit(train_centroids,train["sentiment"])
result = forest.predict(test_centroids)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )