Word Embedding + Cosine Similarity

import pandas as pd 
import numpy as np
import re
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

Pre-trained word vectors from the GloVe model are used.
Download link:
http://nlp.stanford.edu/data/glove.twitter.27B.zip
Experiments showed that the 50-dimensional vectors work best: raising the dimensionality does not improve accuracy, but it does add a noticeable performance cost.

with open("./input/glove-global-vectors-for-word-representation/glove.twitter.27B.50d.txt", "rb") as lines:
    w2v = {line.split()[0].decode("utf-8"): np.array([float(value) for value in line.split()[1:]])
           for line in lines}
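
A quick sanity check on the loaded embeddings (a minimal sketch; 'happy' is only an example token that is assumed to be in the Twitter GloVe vocabulary, hence the .get fallback):

# Every entry maps a token to a 50-dimensional numpy vector
print(len(w2v))                                # vocabulary size
print(w2v.get('happy', np.zeros(50)).shape)    # (50,)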

Dataset download: tweet dataset (the Kaggle Tweet Sentiment Extraction competition data)

# Read the data
train = pd.read_csv('./input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('./input/tweet-sentiment-extraction/test.csv')
sample = pd.read_csv('./input/tweet-sentiment-extraction/sample_submission.csv')
# Inspect and drop the blank row (text is NaN at index 314)
train[train['text'].isna()]
train.drop(314, inplace = True)

# Replace a run of consecutive occurrences of a symbol with a single word
def replace_symbol_word(text, symbol, word):
    starIdx = text.find(symbol)
    count = 0
    while starIdx > -1 and count < 20:
        firstIdx = starIdx
        # Walk to the end of the run of repeated symbols
        while(starIdx+1 < len(text) and text[starIdx+1] == symbol):
            starIdx += 1
        text = text[:firstIdx] + " " + word + " " + text[starIdx+1:]
        starIdx = text.find(symbol)
        count += 1

    return text
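
A small usage sketch of this helper on a hypothetical input (it leaves extra whitespace behind, which clean_text's final ' '.join(text.split()) would collapse):

print(replace_symbol_word('wow!!! so good', '!', 'exclaim'))   # 'wow exclaim  so good'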
# Text preprocessing: remove URLs, digits and punctuation.
# No English word has the same letter three times in a row, so any letter repeated three or more
# times inside a word is reduced to two, e.g. cooooool -> cool, yummmmy -> yummy.
# Replacing symbols with words was also tried, but it made performance worse; several candidate
# words were tested, but none fit the word embeddings well,
# e.g. replacing '*' with 'abusive' and '!' with 'exclaim':
#     text = replace_symbol_word(text, '*', 'abusive')
#     text = replace_symbol_word(text, '!', 'exclaim')
def clean_text(text):    
    text = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', text)
    
    # Remove [\], ['], [`], and ["]
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\`", "", text)
    text = re.sub(r"\"", "", text)
    
    # Remove digits
    text = re.sub(r"[0-9]+", "", text)
    
    # Convert to lowercase
    text = text.strip().lower()
    
    # Reduce any character repeated 3+ times to two (e.g. cooooool -> cool)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    
    # Replace punctuation with spaces
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    text = ' '.join(text.split())
    
    return text
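
A quick check of the cleaning steps on a made-up tweet, matching the cooooool -> cool rule described above (hypothetical input, expected output in the comment):

print(clean_text('Sooooo cooooool!!! check http://example.com 123'))   # 'soo cool check'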
# Clean the text and selected_text columns
train['clean_selected_text'] = train['selected_text'].apply(clean_text)
train['clean_text'] = train['text'].apply(clean_text)


# Split into training and validation sets
X_train, X_val = train_test_split(
    train, train_size = 0.80, random_state = 0)
X_train = X_train.copy()
X_val = X_val.copy()

X_train.head()
| index | textID     | text                                             | selected_text                                    | sentiment | clean_selected_text                                | clean_text                                         |
|-------|------------|--------------------------------------------------|--------------------------------------------------|-----------|----------------------------------------------------|-----------------------------------------------------|
| 24567 | a5ca70509c | Cant stop playin` in my head -- pussycat doll... | Cant stop playin` in my head -- pussycat doll... | neutral   | cant stop playin in my head pussycat dolls jai...  | cant stop playin in my head pussycat dolls jai...   |
| 24619 | f18b75e863 | I hate you                                       | I hate you                                       | negative  | i hate you                                         | i hate you                                          |
| 19766 | 649e31adcc | Starbucks I`m lovin` it                          | Starbucks I`m lovin` it                          | positive  | starbucks im lovin it                              | starbucks im lovin it                               |
| 21737 | 8891d08a8c | Ben and Jerry...yummmmy!!!                       | .yummmmy!                                        | positive  | yummy                                              | ben and jerry yummy                                 |
| 8980  | 7fb24b4a56 | wow.. purple leopard skin. fieeerrceee..         | wow.. purple leopard skin. fieeerrceee..         | neutral   | wow purple leopard skin fieerrcee                  | wow purple leopard skin fieerrcee                   |
# Split the data by sentiment
pos_train = X_train[X_train['sentiment'] == 'positive']
neutral_train = X_train[X_train['sentiment'] == 'neutral']
neg_train = X_train[X_train['sentiment'] == 'negative']
# print(pos_train)
# Get word counts
n = 1  # unigrams
# CountVectorizer.fit_transform converts the texts into a term-frequency matrix, where element a[i][j]
# is the count of word j in document i, i.e. how often each word occurs.
# get_feature_names() lists the vocabulary, and toarray() materialises the matrix as a dense array.

cv = CountVectorizer(ngram_range=(n, n), max_df=0.8, min_df=2,
                                         max_features=None,
                                         stop_words='english')  # build the bag-of-words vocabulary

# Vectorize all cleaned selected texts
X_train_cv = cv.fit_transform(X_train['clean_selected_text'])
# Sparse output format: (document index, vocabulary index)  count
# print(X_train_cv[0:3])
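
A tiny toy illustration of this sparse count format on a toy corpus (not the project data; on newer scikit-learn versions get_feature_names_out() replaces get_feature_names()):

toy_cv = CountVectorizer(ngram_range=(1, 1))
toy_counts = toy_cv.fit_transform(['i love this', 'love love it'])
print(toy_cv.get_feature_names())   # ['it', 'love', 'this'] ('i' is dropped by the default token pattern)
print(toy_counts.toarray())         # [[0 1 1]
                                    #  [1 2 0]]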


X_pos = cv.transform(pos_train['clean_selected_text'])
X_neutral = cv.transform(neutral_train['clean_selected_text'])
X_neg = cv.transform(neg_train['clean_selected_text'])
# print(X_pos.shape)
# print(X_neg.shape)

# Build a per-sentiment count DataFrame (one column per vocabulary word)
pos_count_df = pd.DataFrame(X_pos.toarray(), columns=cv.get_feature_names())
neutral_count_df = pd.DataFrame(X_neutral.toarray(), columns=cv.get_feature_names())
neg_count_df = pd.DataFrame(X_neg.toarray(), columns=cv.get_feature_names())
# print(neg_count_df)

# These 3 dicts hold the word counts per sentiment: {word: occurrence count}
pos_words = {}
neut_words = {}
neg_words = {}

# Proportions
pos_words_proportion = {}
neutral_words_proportion = {}
neg_words_proportion = {}

for k in cv.get_feature_names():
    pos_words[k] = pos_count_df[k].sum()
    neut_words[k] = neutral_count_df[k].sum()
    neg_words[k] = neg_count_df[k].sum()
    
    # Divide the count by the number of samples to get a proportion
    pos_words_proportion[k] = pos_words[k]/pos_train.shape[0]
    neutral_words_proportion[k] = neut_words[k]/neutral_train.shape[0]
    neg_words_proportion[k] = neg_words[k]/neg_train.shape[0]
# print(pos_words_proportion)

neg_words_adj = {}
pos_words_adj = {}
neutral_words_adj = {}

# Adjust the proportions to account for words that also appear in texts of the other sentiments
for key, value in neg_words_proportion.items():
    neg_words_adj[key] = neg_words_proportion[key] - (neutral_words_proportion[key] + pos_words_proportion[key])

for key, value in pos_words_proportion.items():
    pos_words_adj[key] = pos_words_proportion[key] - (neutral_words_proportion[key] + neg_words_proportion[key])

for key, value in neutral_words_proportion.items():
    neutral_words_adj[key] = neutral_words_proportion[key] - (neg_words_proportion[key] + pos_words_proportion[key])
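
A worked example of the adjustment with hypothetical numbers: if 'love' appears in 5% of positive selected texts, 1% of neutral ones and 0.5% of negative ones, then pos_words_adj['love'] = 0.05 - (0.01 + 0.005) = 0.035, i.e. the word is treated as strongly positive. A quick way to eyeball the result on the real data (the actual output depends on the training split):

print(sorted(pos_words_adj.items(), key=lambda kv: kv[1], reverse=True)[:10])   # most positive-leaning unigrams
print(sorted(neg_words_adj.items(), key=lambda kv: kv[1], reverse=True)[:10])   # most negative-leaning unigrams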
# Jaccard similarity
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
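
A quick example of the metric that is used for evaluation later on:

print(jaccard('i hate you', 'hate you'))   # |{hate, you}| / |{i, hate, you} ∪ {hate, you}| = 2/3 ≈ 0.667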
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec, pos_words, neg_words, neut_words):
        self.pos_words = pos_words
        self.neg_words = neg_words
        self.neut_words = neut_words
        self.word2vec = word2vec
        # If a text is empty, an all-zero vector with the same dimensionality as every other vector should be returned
        self.dim = len(next(iter(word2vec.items()))[1])

    #  X : clean_selected_text, y : sentiment.
    def fit(self, X, y):
        ratio = 0.8
        self.average_positive = self.get_average_vector(X[y == 'positive'], 'positive', ratio)
        self.average_neutral = self.get_average_vector(X[y == 'neutral'], 'neutral', ratio)
        self.average_negative = self.get_average_vector(X[y == 'negative'], 'negative', ratio)
       
        # Sanity check: cosine similarity between the average negative and positive vectors
        print(np.dot(self.average_negative, self.average_positive)/(np.linalg.norm(self.average_negative)*np.linalg.norm(self.average_positive)))
        return self
    
    # ratio decides which words count as frequent enough; the sentiment's average vector is computed from them.
    # Note that the input X comes from clean_selected_text.
    def get_average_vector(self, X, sentiment, ratio):

        numerator_dict = (self.pos_words if sentiment == 'positive' else self.neg_words if sentiment == 'negative' else self.neut_words)
        denominator_dict = {k: self.pos_words[k] + self.neut_words[k] + self.neg_words[k] for k in self.neut_words.keys()}
        # A defaultdict handles unseen words: words that appear in the clean text but not in this
        # dictionary simply get a proportion of 0
        word_proportion_dict = defaultdict(float)
        for k in numerator_dict.keys():
            word_proportion_dict[k] = numerator_dict[k]/denominator_dict[k]
                
        sent_vec_list = []
        for sent in X:
            sent_word_vecs = []
            for w in sent.split(" "):
                if w in self.word2vec and word_proportion_dict[w] > ratio:
                    # If we have a vector for this word and its ratio is high enough to use, add it
                    sent_word_vecs.append(self.word2vec[w])
            if(len(sent_word_vecs) > 0):
                # Once all words are added, if at least one word was kept, take the mean for this tweet and append it to the list
                sent_vec_list.append(np.mean(sent_word_vecs, axis=0))
        
        # Return the mean over axis 0 across all tweets, giving a single 50-d vector: the average of all
        # frequently occurring words in this sentiment's selected_text. Frequent words are included many
        # times and therefore carry more weight, which is why the counts are not weighted explicitly.
        return np.mean(np.array(sent_vec_list), axis=0)

    # Convert one sentence into a vector; sent is a list of words (one item per word), so no splitting is needed here
    def transform(self, sent, sentiment):
        sent_vec_list = []
        scalars = pos_words_adj if sentiment == 'positive' else neg_words_adj
        
        sent_word_vecs = [[x * scalars[w] for x in self.word2vec[w]]  for w in sent if (w in self.word2vec and w in pos_words.keys())]
        if(len(sent_word_vecs) > 0):
            
            sent_vec_list.append(np.mean(sent_word_vecs, axis=0))
        
        # If we obtained a vector, return it; otherwise return an all-zero vector
        if(len(sent_vec_list)):
            return np.array(sent_vec_list)
        return np.zeros(self.dim)
    
    # Cosine similarity between the given sentence and the 3 average sentiment vectors
    def get_sent_dist(self, sent, sentiment):
        sent_vect = self.transform(sent, sentiment)
                     
        if sent_vect.sum() != 0.0:
            # cosine similarity = dot(vec1, vec2) / (norm(vec1) * norm(vec2))
            sim_pos = np.dot(sent_vect, self.average_positive)/(np.linalg.norm(sent_vect)*np.linalg.norm(self.average_positive))
            sim_neut = np.dot(sent_vect, self.average_neutral)/(np.linalg.norm(sent_vect)*np.linalg.norm(self.average_neutral))
            sim_neg = np.dot(sent_vect, self.average_negative)/(np.linalg.norm(sent_vect)*np.linalg.norm(self.average_negative))
            return sim_pos[0], sim_neut[0], sim_neg[0]
        # If no usable words could be extracted from the sentence, report a similarity of 0 for all three
        return 0, 0, 0
        
# Create the vectorizer and compute the average vector of each sentiment's texts from the data
mev = MeanEmbeddingVectorizer(w2v, pos_words, neg_words, neut_words)
mev = mev.fit(X_train['clean_selected_text'], X_train['sentiment'])
0.6451794880372194
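A usage sketch for the fitted vectorizer on one hand-made phrase (the actual similarity values depend on the training split; words outside the CountVectorizer vocabulary are simply ignored, and (0, 0, 0) is returned if none remain):

example_tokens = clean_text('so happy today').split(' ')
print(mev.get_sent_dist(example_tokens, 'positive'))   # (sim_pos, sim_neut, sim_neg)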
def calc_selected_text(df_row):
    
    words_in_tweet = df_row['text'].split()
    sentiment = df_row['sentiment']
    
    # If the tweet's sentiment is neutral, or the tweet has fewer than 3 words, return the whole text:
    # almost all neutral selected_texts are the full tweet, and most short tweets end up using every word anyway.
    # This mainly saves computation time and adds a very small accuracy gain, since returning short tweets
    # whole scores roughly 0.77 Jaccard on average.
    if sentiment == 'neutral' or len(words_in_tweet) < 3:
        return df_row['text']
    
    # Build every contiguous word span of the tweet; selected_text is chosen from these candidates
    word_subsets = [words_in_tweet[i:j+1]
                    for i in range(len(words_in_tweet)) for j in range(i, len(words_in_tweet))]

    sorted_subsets = sorted(word_subsets, key=len)

    max_val = -10000000
    final_subset = []

    # For each candidate span, compute the cosine similarity between that span and the sentiment's
    # average vector; the most similar span is the one returned as selected_text
    for subset in sorted_subsets:
        cleaned_text = clean_text(' '.join(subset)).split(" ")
        
        # Get the cosine similarities
        pos, neut, neg = mev.get_sent_dist(cleaned_text, sentiment)
#         print(pos, neut, neg)
        # Keep the span with the highest similarity for the relevant sentiment
        val_to_check = pos if sentiment == 'positive' else neg
        if val_to_check > max_val:
            max_val = val_to_check
            final_subset = subset

    # Return the best span.
    # Note that the raw, unpreprocessed words are returned, since that is the required answer format.
    return " ".join(final_subset)
def calc_jaccard_df(data):
    data['predicted_selection'] = ''
    data['jaccard'] = 0.0
    
    # For every sample in the data, predict the selected text; the Jaccard similarity between
    # selected_text and predicted_selection is computed afterwards
    for index, row in data.iterrows():
        selected_text = calc_selected_text(row)
        data.loc[data['textID'] == row['textID'], ['predicted_selection']] = selected_text

    data['jaccard'] = data.apply(lambda x: jaccard(x['selected_text'], x['predicted_selection']), axis = 1)
    print('The jaccard score for the validation set is:', np.mean(data['jaccard']))
    
calc_jaccard_df(X_val)
The jaccard score for the validation set is: 0.6254443337432527
# Iterate over the test set, compute a prediction for each sample, and write it back into sample
for index, row in test.iterrows():
    selected_text = calc_selected_text(row)
    sample.loc[sample['textID'] == row['textID'], ['selected_text']] = selected_text
# write the sample dataframe to a submissions file
sample.to_csv('submission.csv', index = False)
sample.head(5)