import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
使用GloVe模型预训练词向量。
获取地址:
http://nlp.stanford.edu/data/glove.twitter.27B.zip
实验发现使用50维向量时效果最佳,因为提高向量维度并不会提高准确性,但会带来较大的性能损失
# Load the 50-d GloVe Twitter vectors into a {token: np.ndarray} lookup dict.
with open("./input/glove-global-vectors-for-word-representation/glove.twitter.27B.50d.txt", "rb") as vec_file:
    w2v = {}
    for raw_line in vec_file:
        fields = raw_line.split()
        w2v[fields[0].decode("utf-8")] = np.array([float(v) for v in fields[1:]])
数据集获取地址:Kaggle "Tweet Sentiment Extraction" 竞赛数据集(tweet-sentiment-extraction)
# Load the competition data.
train = pd.read_csv('./input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('./input/tweet-sentiment-extraction/test.csv')
sample = pd.read_csv('./input/tweet-sentiment-extraction/sample_submission.csv')
# Drop rows with missing text. The original code displayed the NaN rows as a
# no-op expression and then hard-coded `train.drop(314)`; deriving the index
# from the isna() mask keeps the cleanup correct if the dataset changes.
train.drop(train[train['text'].isna()].index, inplace=True)
# 使用单词替换连续出现的符号
def replace_symbol_word(text, symbol, word):
starIdx = text.find(symbol)
count = 0
while starIdx > -1 and count < 20:
firstIdx = starIdx
while(starIdx+1 < len(text) and text[starIdx+1] == symbol):
starIdx += 1
text = text[:firstIdx] + " " + word + " " + text[starIdx+1:]
starIdx = -1
starIdx = text.find(symbol)
count += 1
return text
# 文本预处理,删除url,数字,标点符号
# 英文字典中没有单词连续有3个相同的字母,故将一个单词中连续出现3次及以上的同一字母直接缩减为2个
# 例如,将cooooool变为cool,将yummmmy变为yummy
# 试图用单词替换符号,但是却使性能变差。尝试了几个不同的单词,但找不到适合单词嵌入的单词
# 比如将‘*’替换为‘abusive’,将‘!’替换为‘exclaim’
# text = replace_symbol_word(text, '*', 'abusive')
# text = replace_symbol_word(text, '!', 'exclaim')
# Precompiled patterns and the punctuation->space table are hoisted to module
# level: clean_text is invoked once per candidate span inside
# calc_selected_text's O(n^2) loop, so per-call table rebuilding was overhead.
_URL_RE = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')
_QUOTES_RE = re.compile(r'[\\\'`"]')   # backslash, apostrophe, backtick, double quote
_DIGITS_RE = re.compile(r'[0-9]+')
_REPEAT_RE = re.compile(r'(.)\1{2,}')  # 3+ repeats of any char -> 2
_PUNCT_TABLE = str.maketrans({c: " " for c in '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'})

def clean_text(text):
    """Normalize a tweet: strip URLs, quote chars and digits, lowercase,
    collapse 3+ repeated characters to 2 (no English word triples a letter,
    so e.g. 'cooooool' -> 'cool', 'yummmmy' -> 'yummy'), replace punctuation
    with spaces, and collapse whitespace.

    Returns the cleaned, single-spaced, lowercase string.
    """
    text = _URL_RE.sub('', text)
    # Remove [\], ['], [`] and ["].
    text = _QUOTES_RE.sub('', text)
    # Remove digits.
    text = _DIGITS_RE.sub('', text)
    # Lowercase before collapsing repeats so case variants collapse together.
    text = text.strip().lower()
    text = _REPEAT_RE.sub(r'\1\1', text)
    # Map remaining punctuation to spaces, then normalise whitespace.
    text = text.translate(_PUNCT_TABLE)
    return ' '.join(text.split())
# Clean both the annotated span and the full tweet text.
for src_col, dst_col in (('selected_text', 'clean_selected_text'),
                         ('text', 'clean_text')):
    train[dst_col] = train[src_col].apply(clean_text)
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val = train_test_split(train, train_size=0.80, random_state=0)
X_train = X_train.copy()
X_val = X_val.copy()
X_train.head()
textID | text | selected_text | sentiment | clean_selected_text | clean_text | |
---|---|---|---|---|---|---|
24567 | a5ca70509c | Cant stop playin` in my head -- pussycat doll... | Cant stop playin` in my head -- pussycat doll... | neutral | cant stop playin in my head pussycat dolls jai... | cant stop playin in my head pussycat dolls jai... |
24619 | f18b75e863 | I hate you | I hate you | negative | i hate you | i hate you |
19766 | 649e31adcc | Starbucks I`m lovin` it | Starbucks I`m lovin` it | positive | starbucks im lovin it | starbucks im lovin it |
21737 | 8891d08a8c | Ben and Jerry...yummmmy!!! | .yummmmy! | positive | yummy | ben and jerry yummy |
8980 | 7fb24b4a56 | wow.. purple leopard skin. fieeerrceee.. | wow.. purple leopard skin. fieeerrceee.. | neutral | wow purple leopard skin fieerrcee | wow purple leopard skin fieerrcee |
# Partition the training split by sentiment label.
_by_sentiment = {label: X_train[X_train['sentiment'] == label]
                 for label in ('positive', 'neutral', 'negative')}
pos_train = _by_sentiment['positive']
neutral_train = _by_sentiment['neutral']
neg_train = _by_sentiment['negative']
# print(pos_train)
# Build word counts. n = 1 means unigrams; raise n for higher-order n-grams.
n = 1
# CountVectorizer.fit_transform turns the corpus into a term-frequency matrix:
# entry [i][j] is the count of vocabulary word j in document i. The vocabulary
# is exposed via get_feature_names(); toarray() densifies the sparse matrix.
cv = CountVectorizer(ngram_range=(n, n), max_df=0.8, min_df=2,
                     max_features=None,
                     stop_words='english')  # bag-of-words model
# Fit the vocabulary on (and vectorize) all cleaned selected_text.
X_train_cv = cv.fit_transform(X_train['clean_selected_text'])
# Sparse print format: (doc_index, vocab_index) count
# print(X_train_cv[0:3])
X_pos = cv.transform(pos_train['clean_selected_text'])
X_neutral = cv.transform(neutral_train['clean_selected_text'])
X_neg = cv.transform(neg_train['clean_selected_text'])
# print(X_pos.shape)
# print(X_neg.shape)
# Dense per-sentiment count frames, one column per vocabulary word.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 — on
# newer versions this must become get_feature_names_out(); confirm the
# environment's sklearn version.
pos_count_df = pd.DataFrame(X_pos.toarray(), columns=cv.get_feature_names())
neutral_count_df = pd.DataFrame(X_neutral.toarray(), columns=cv.get_feature_names())
neg_count_df = pd.DataFrame(X_neg.toarray(), columns=cv.get_feature_names())
# print(neg_count_df)
# Per-sentiment word totals over selected_text ({word: count}), plus each
# total normalised by that sentiment's sample count ({word: proportion}).
_vocab = cv.get_feature_names()
pos_words = {w: pos_count_df[w].sum() for w in _vocab}
neut_words = {w: neutral_count_df[w].sum() for w in _vocab}
neg_words = {w: neg_count_df[w].sum() for w in _vocab}
pos_words_proportion = {w: pos_words[w] / pos_train.shape[0] for w in _vocab}
neutral_words_proportion = {w: neut_words[w] / neutral_train.shape[0] for w in _vocab}
neg_words_proportion = {w: neg_words[w] / neg_train.shape[0] for w in _vocab}
# print(pos_words_proportion)
# Adjusted score per word: its proportion within one sentiment minus its
# proportions in the other two, discounting words common to all sentiments.
neg_words_adj = {w: p - (neutral_words_proportion[w] + pos_words_proportion[w])
                 for w, p in neg_words_proportion.items()}
pos_words_adj = {w: p - (neutral_words_proportion[w] + neg_words_proportion[w])
                 for w, p in pos_words_proportion.items()}
neutral_words_adj = {w: p - (neg_words_proportion[w] + pos_words_proportion[w])
                     for w, p in neutral_words_proportion.items()}
# Jaccard similarity between two strings' word sets.
def jaccard(str1, str2):
    """Case-insensitive Jaccard similarity of the two strings' word sets.

    Returns |A ∩ B| / |A ∪ B| as a float. When both strings contain no words
    the sets are identical, so 1.0 is returned — the original formula raised
    ZeroDivisionError on that input.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union = len(a | b)
    if union == 0:
        return 1.0
    return len(a & b) / union
class MeanEmbeddingVectorizer(object):
    """Averages pre-trained word vectors per sentiment and scores candidate
    spans by cosine similarity against the per-sentiment mean vectors.

    NOTE(review): transform() reads the module-level globals pos_words_adj,
    neg_words_adj and pos_words instead of the dicts passed to __init__ —
    this coupling means the class only works inside this script as written.
    """
    def __init__(self, word2vec, pos_words, neg_words, neut_words):
        # word2vec: {token: 1-D np.ndarray} embedding lookup.
        # pos_words / neg_words / neut_words: {word: count} per sentiment.
        self.pos_words = pos_words
        self.neg_words = neg_words
        self.neut_words = neut_words
        self.word2vec = word2vec
        # Embedding dimensionality taken from an arbitrary entry — an empty
        # text must map to an all-zero vector of the same size.
        self.dim = len(next(iter(word2vec.items()))[1])

    # X: clean_selected_text values; y: sentiment labels.
    def fit(self, X, y):
        """Compute and store the average embedding for each sentiment class;
        returns self (so calls can be chained)."""
        ratio = 0.8
        self.average_positive = self.get_average_vector(X[y == 'positive'], 'positive', ratio)
        self.average_neutral = self.get_average_vector(X[y == 'neutral'], 'neutral', ratio)
        self.average_negative = self.get_average_vector(X[y == 'negative'], 'negative', ratio)
        # Diagnostic: cosine similarity between the negative and positive means.
        print(np.dot(self.average_negative, self.average_positive)/(np.linalg.norm(self.average_negative)*np.linalg.norm(self.average_positive)))
        return self

    # `ratio` selects words that occur predominantly within this sentiment;
    # X is expected to come from clean_selected_text.
    def get_average_vector(self, X, sentiment, ratio):
        """Mean embedding over X of words whose within-sentiment frequency
        share exceeds `ratio`."""
        numerator_dict = (self.pos_words if sentiment == 'positive' else self.neg_words if sentiment == 'negative' else self.neut_words)
        denominator_dict = {k: self.pos_words[k] + self.neut_words[k] + self.neg_words[k] for k in self.neut_words.keys()}
        # defaultdict(float) maps unseen words to 0.0, so words that appear
        # in clean_text but not in the vocabulary are filtered out below.
        word_proportion_dict = defaultdict(float)
        for k in numerator_dict.keys():
            word_proportion_dict[k] = numerator_dict[k]/denominator_dict[k]
        sent_vec_list = []
        for sent in X:
            sent_word_vecs = []
            for w in sent.split(" "):
                if w in self.word2vec and word_proportion_dict[w] > ratio:
                    # Keep the word's vector only if we have an embedding for
                    # it AND its sentiment share is high enough.
                    sent_word_vecs.append(self.word2vec[w])
            if(len(sent_word_vecs) > 0):
                # Average the kept word vectors into one vector per text.
                sent_vec_list.append(np.mean(sent_word_vecs, axis=0))
        # Average over all texts (axis 0) to get one 50-d vector per
        # sentiment. Frequent words appear in many texts and thus weigh more
        # implicitly — which is why counts are not used as explicit weights.
        return np.mean(np.array(sent_vec_list), axis=0)

    # `sent` is already a list of words — no splitting needed here.
    def transform(self, sent, sentiment):
        """Embed one tokenised sentence, scaling each word vector by its
        adjusted sentiment proportion. Returns shape (1, dim), or a flat
        zeros(dim) when no word could be embedded.

        NOTE(review): any sentiment other than 'positive' falls through to
        the negative scalars; callers only pass 'positive'/'negative'.
        """
        sent_vec_list = []
        scalars = pos_words_adj if sentiment == 'positive' else neg_words_adj
        sent_word_vecs = [[x * scalars[w] for x in self.word2vec[w]] for w in sent if (w in self.word2vec and w in pos_words.keys())]
        if(len(sent_word_vecs) > 0):
            sent_vec_list.append(np.mean(sent_word_vecs, axis=0))
        # Guarantee a vector-shaped output; fall back to all zeros.
        if(len(sent_vec_list)):
            return np.array(sent_vec_list)
        return np.zeros(self.dim)

    # Cosine similarity between the sentence vector and the three class means.
    def get_sent_dist(self, sent, sentiment):
        """Return (positive, neutral, negative) cosine similarities between
        `sent` and the fitted per-sentiment average vectors."""
        sent_vect = self.transform(sent, sentiment)
        if sent_vect.sum() != 0.0:
            # cosine similarity = dot(vec1, vec2) / (norm(vec1) * norm(vec2))
            sim_pos = np.dot(sent_vect, self.average_positive)/(np.linalg.norm(sent_vect)*np.linalg.norm(self.average_positive))
            sim_neut = np.dot(sent_vect, self.average_neutral)/(np.linalg.norm(sent_vect)*np.linalg.norm(self.average_neutral))
            sim_neg = np.dot(sent_vect, self.average_negative)/(np.linalg.norm(sent_vect)*np.linalg.norm(self.average_negative))
            return sim_pos[0], sim_neut[0], sim_neg[0]
        # No embeddable words in the sentence: similarity 0 to everything.
        return 0, 0, 0
# Build the embedding vectorizer and fit the per-sentiment average vectors
# (fit prints the negative/positive mean-vector cosine similarity and
# returns self, so the calls chain).
mev = MeanEmbeddingVectorizer(w2v, pos_words, neg_words, neut_words).fit(
    X_train['clean_selected_text'], X_train['sentiment'])
0.6451794880372194
def calc_selected_text(df_row):
    """Predict the selected_text span for one tweet row.

    df_row: mapping with 'text' and 'sentiment' fields.
    Returns the contiguous word span of the raw text whose cleaned embedding
    is most cosine-similar to the fitted mean vector for the row's sentiment.
    """
    words_in_tweet = df_row['text'].split()
    sentiment = df_row['sentiment']
    # Neutral tweets (and tweets under 3 words) almost always use the entire
    # text as selected_text — returning it directly saves the O(n^2) scan and
    # scores ~0.77 Jaccard on average for short tweets.
    if sentiment == 'neutral' or len(words_in_tweet) < 3:
        return df_row['text']
    # Every contiguous word span is a candidate selected_text.
    word_subsets = [words_in_tweet[i:j+1]
                    for i in range(len(words_in_tweet)) for j in range(i, len(words_in_tweet))]
    # Shortest first, so among equal scores the shortest span wins.
    sorted_subsets = sorted(word_subsets, key=len)
    max_val = float('-inf')  # was a magic -10000000 sentinel
    final_subset = []
    # Score each candidate by cosine similarity against the sentiment's mean
    # vector; keep the best.
    for subset in sorted_subsets:
        cleaned_text = clean_text(' '.join(subset)).split(" ")
        pos, neut, neg = mev.get_sent_dist(cleaned_text, sentiment)
        val_to_check = pos if sentiment == 'positive' else neg
        if val_to_check > max_val:
            max_val = val_to_check
            final_subset = subset
    # Return the raw (un-cleaned) words: the answer must match original text.
    return " ".join(final_subset)
def calc_jaccard_df(data):
    """Predict selected_text for every row and report the mean Jaccard score.

    Mutates `data` in place, adding 'predicted_selection' and 'jaccard'
    columns, then prints the mean Jaccard over the set.
    """
    data['predicted_selection'] = ''
    data['jaccard'] = 0.0
    for index, row in data.iterrows():
        # Write via the row's own index instead of re-scanning the whole frame
        # for the textID each iteration — the old boolean-mask lookup made
        # this loop O(n^2). (Assumes textID is unique per row, as an ID
        # column should be; the old code would have written duplicates too.)
        data.at[index, 'predicted_selection'] = calc_selected_text(row)
    data['jaccard'] = data.apply(lambda x: jaccard(x['selected_text'], x['predicted_selection']), axis = 1)
    print('The jaccard score for the validation set is:', np.mean(data['jaccard']))
# Evaluate the extraction pipeline on the held-out validation split
# (prints the mean Jaccard score).
calc_jaccard_df(X_val)
The jaccard score for the validation set is: 0.6254443337432527
# Predict selected_text for every test row and write it into the submission
# frame, matching rows across the two frames by textID.
for index, row in test.iterrows():
    selected_text = calc_selected_text(row)
    sample.loc[sample['textID'] == row['textID'], ['selected_text']] = selected_text
# Write the submission file (no index column, per Kaggle's expected format).
sample.to_csv('submission.csv', index = False)
sample.head(5)