NLP: word2vec Implementation


### Source: Kaggle

import os
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import re #regex
import nltk
import string
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import seaborn as sns
from scipy.sparse import coo_matrix
train_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/SentimentAnalysis/train.csv').copy()
test_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/SentimentAnalysis/test.csv').copy()
sample = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/SentimentAnalysis/sample_submission.csv').copy()


# Pre-processing step - from https://www.kaggle.com/rajaram1988/ignored-stop-words-using-only-word-counts
# drop entries with no text
train_data = train_data.dropna()
train_data[train_data['text'].isna()]  # sanity check: should now return an empty frame
test_data = test_data.dropna()

# convert text to lowercase
train_data['text'] = train_data['text'].map(lambda x: x.lower())
test_data['text'] = test_data['text'].map(lambda x: x.lower())

# remove newlines; re.sub(pattern, replacement, string) substitutes every match of pattern in string
train_data['text'] = train_data['text'].map(lambda x: re.sub('\\n', '', str(x)))  # applied element-wise to the column
test_data['text'] = test_data['text'].map(lambda x: re.sub('\\n', ' ', str(x)))

# remove any text starting with [[User...
train_data['text'] = train_data['text'].map(lambda x: re.sub(r"\[\[User.*", '', str(x)))
test_data['text'] = test_data['text'].map(lambda x: re.sub(r"\[\[User.*", '', str(x)))

# remove IP addresses or user IDs
train_data['text'] = train_data['text'].map(lambda x: re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", '', str(x)))
test_data['text'] = test_data['text'].map(lambda x: re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", '', str(x)))

# remove http links in the text
train_data['text'] = train_data['text'].map(lambda x: re.sub(r"(http://.*?\s)|(http://.*)", '', str(x)))
test_data['text'] = test_data['text'].map(lambda x: re.sub(r"(http://.*?\s)|(http://.*)", '', str(x)))


x_train,x_val = train_test_split(train_data,train_size=0.8,random_state=23)

## set aside positive/negative/neutral tweets
positive_tweets = x_train[x_train['sentiment'] == 'positive']
negative_tweets = x_train[x_train['sentiment'] == 'negative']
neutral_tweets = x_train[x_train['sentiment'] == 'neutral']

# get lengths of 'selected_text' for non-neutral tweets
# count the words in each sample's 'selected_text'; split() breaks a string into individual words
pos_selected_lengths = positive_tweets['selected_text'].map(lambda x: len(x.split()))
neg_selected_lengths = negative_tweets['selected_text'].map(lambda x: len(x.split()))
#plot 'selected_text' lengths against frequency in sentiment
plt.figure(figsize=(12, 6))
# a kernel density estimate (KDE) plot gives an intuitive view of how the data are distributed
p1 = sns.kdeplot(pos_selected_lengths, shade=True, color="b").set_title('Selected Text lengths across Positive and Negative Sentiments')
p2 = sns.kdeplot(neg_selected_lengths, shade=True, color="r")
plt.legend(labels=['positive', 'negative'])
plt.show()
plt.clf()
# based on this plot we can see that 'selected_text' usually contains fewer words in positive tweets

(Figure: KDE plot of 'selected_text' word counts for positive vs. negative tweets)

# get lengths of 'text' for non-neutral tweets
# likewise, look at the kernel density of the 'text' feature
pos_lengths = positive_tweets['text'].map(lambda x: len(x.split()))
neg_lengths = negative_tweets['text'].map(lambda x: len(x.split()))

# let's look at the lengths of positive vs negative tweets
plt.figure(figsize=(12,6))
p1 = sns.kdeplot(pos_lengths, shade=True, color="b").set_title('Text Lengths across Positive and Negative Sentiments')
p2 = sns.kdeplot(neg_lengths, shade=True, color="r")
plt.legend(labels=['positive', 'negative'])
plt.show()
plt.clf()
# this doesn't seem all that useful: tweet lengths are distributed similarly for positive and negative tweets

(Figure: KDE plot of 'text' word counts for positive vs. negative tweets)

# create feature vectors that include ngrams of size max_ngram
# so we can select a word or phrase as our 'selected_text'
# take the largest word count of 'selected_text' over the positive and negative samples as max_ngram, and likewise the smallest as min_ngram
max_ngram = max(pos_selected_lengths) if max(pos_selected_lengths) > max(neg_selected_lengths) else max(neg_selected_lengths)#----30
min_ngram = min(pos_selected_lengths) if min(pos_selected_lengths) > min(neg_selected_lengths) else min(neg_selected_lengths)#----1
# this ended up being fruitless

1. CountVectorizer turns the words in a corpus into a term-frequency matrix; fit_transform counts how often each word occurs, and the result can be inspected with toarray().
   Note: some words have a high frequency in a document yet carry little information, which is where TF-IDF comes in.
2. TF-IDF is a statistic that measures how important a word is to one document within a collection or corpus: a word's importance grows with how often it appears in the document, but drops with how often it appears across the corpus.
3. SVM is a binary classifier; its goal is to find a hyperplane such that the two classes lie as far from it as possible, so that new data is classified more reliably and the classifier is more robust.
Here the SVM is fit on the vectorized training texts and their sentiment labels.

TF-IDF preprocessing with sklearn
Method 1: vectorize with the CountVectorizer class first, then call the TfidfTransformer class for the TF-IDF weighting.
Method 2: use TfidfVectorizer to do the vectorization and the TF-IDF weighting in a single step.
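
As a minimal sketch of the two routes (on a made-up three-document corpus, not the competition data), both produce the same TF-IDF matrix under the default settings:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import numpy as np

demo_docs = ["good movie", "bad movie", "good plot good acting"]  # hypothetical toy corpus

# Method 1: count first, then re-weight with TF-IDF
demo_counts = CountVectorizer().fit_transform(demo_docs)
tfidf_a = TfidfTransformer(smooth_idf=True, use_idf=True).fit_transform(demo_counts)

# Method 2: TfidfVectorizer does both steps at once
tfidf_b = TfidfVectorizer(smooth_idf=True, use_idf=True).fit_transform(demo_docs)

print(np.allclose(tfidf_a.toarray(), tfidf_b.toarray()))  # True: both routes give the same matrix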

Counting occurrences and computing frequencies are both very practical, but they have a limitation: the vocabulary can become very large, which in turn requires very long vectors to encode each document, placing heavy demands on memory and slowing the algorithm down.


# instantiate CountVectorizer
vectorizer = CountVectorizer(
    max_df=0.85,           # threshold: a term appearing in more than this fraction of documents is not kept as a feature
    min_df=2,              # threshold: a term appearing in fewer than this many documents is not kept as a feature
    stop_words="english",  # 'english' uses the built-in English stop-word list; a list gives custom stop words; None disables stop words
    max_features=10000,    # keep only the max_features terms with the highest term frequency
)
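
A quick, hypothetical illustration (toy corpus, not the tweet data) of how max_df and min_df prune the vocabulary:

toy_docs = ["the movie was good", "the movie was bad", "the plot was good"]
toy_cv = CountVectorizer(max_df=0.85, min_df=2)
toy_counts = toy_cv.fit_transform(toy_docs)
# 'the' and 'was' appear in every document (df = 1.0 > max_df) and are dropped;
# 'bad' and 'plot' appear in only one document (below min_df) and are dropped;
# only 'good' and 'movie' survive
print(sorted(toy_cv.vocabulary_))   # ['good', 'movie']
print(toy_counts.toarray())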

# let's remove all neutral tweets from x_train so that we can train the svm properly
non_neutral = x_train[x_train['sentiment'] != 'neutral']
# vectorize the 'text' feature with CountVectorizer (bag of words)
# fit the vectorizer to the non_neutral training data
train_vectors = vectorizer.fit_transform(non_neutral['text'])
# vectorize the 'text' of the positive and negative subsets separately
x_pos = vectorizer.transform(positive_tweets['text'])
x_neg = vectorizer.transform(negative_tweets['text'])
# re-weight the counts with TfidfTransformer (term frequency - inverse document frequency)
# instantiate the TF-IDF transformer
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
# train_vectors holds raw counts from CountVectorizer; transform them into TF-IDF weights
tfidf_vectors = tfidf_transformer.fit_transform(train_vectors)
# instantiate the SVM; C is the penalty parameter balancing margin width against misclassified samples, default C = 1.0
supportVector = svm.SVC(C=1.0, kernel='linear')  # kernel options: rbf, linear, poly, sigmoid; default is 'rbf'
# fit the SVM: tfidf_vectors (TF-IDF weighted training data) against the target non_neutral['sentiment']
supportVector.fit(tfidf_vectors, non_neutral['sentiment'])
# weights_dict maps each word (key) to the weight (value) the linear SVM assigned to it
weights_dict = {}
features = vectorizer.get_feature_names()  # 5979 features in this run (use get_feature_names_out() on newer scikit-learn versions)
for i in range(len(features)):
    feature = features[i]
    weights_dict[feature] = supportVector.coef_[0, i]
    
# get bag of words and weights of key words/phrases in negative tweets
# build (word, weight) pairs from weights_dict, then sort them by weight from high to low
weights_list = [(word, weights_dict[word]) for word in features]
weights_sorted = sorted(weights_list, key=lambda x: x[1], reverse=True)


# get the top 50 words and plot them 
# take the first 50 entries of the sorted list and plot word (x-axis) against weight (y-axis)
top_50_words = weights_sorted[:50]
weight_top_df = pd.DataFrame(top_50_words)  # turn the list into a DataFrame
weight_top_df.columns = ["Word", "Weight"]  # name the columns (otherwise they default to 0, 1, ...)

sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Weight", data=weight_top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=60)
plt.show()
plt.clf()

(Figure: bar plot of the 50 highest-weighted words)

# get the bottom 50 words and plot them
# likewise, take the last 50 entries
bot_50_words = weights_sorted[len(weights_sorted) - 50:]  # equivalently weights_sorted[-50:]
weight_bot_df = pd.DataFrame(bot_50_words)
weight_bot_df.columns = ["Word", "Weight"]

sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Weight", data=weight_bot_df)
g.set_xticklabels(g.get_xticklabels(), rotation=60)
plt.show()
plt.clf()

(Figure: bar plot of the 50 lowest-weighted words)

# this part seems suspicious:
# it looks like positive words get a negative weight and negative words a positive weight
# so we should calculate selected_text based on sentiment
# flip the sign of every weight (multiply by -1) to get an inverted copy to use for negative tweets
inv_weights_dict = {}
for key in weights_dict.keys():
    inv_weights_dict[key] = weights_dict[key] * -1
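
One likely explanation for the sign flip: for a binary classifier in scikit-learn, positive decision values correspond to classes_[1] (classes_ is sorted, so 'negative' comes before 'positive'), and that ordering fixes the sign convention of coef_. A minimal sketch on made-up toy data (not the competition tweets) to check which way the coefficients actually point:

check_texts = ["awesome great love", "great love happy", "terrible awful hate", "awful hate sad"]
check_labels = ["positive", "positive", "negative", "negative"]
check_vec = CountVectorizer()
check_X = check_vec.fit_transform(check_texts)
check_clf = svm.SVC(C=1.0, kernel='linear')
check_clf.fit(check_X, check_labels)
print(check_clf.classes_)                                  # class order used by the decision function
print(check_clf.coef_[0, check_vec.vocabulary_['great']])  # coefficient of a clearly positive word
print(check_clf.coef_[0, check_vec.vocabulary_['awful']])  # coefficient of a clearly negative word
print(check_clf.predict(check_vec.transform(['great'])))   # what the model predicts for that word alone
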
# x: one row of the DataFrame (a Series, so columns can be accessed by name); tol: score tolerance (0.0015 here); a: length penalty (5 here)
def calc_selected_text(x, tol, a):
    tweet = x['text']  # the full tweet text
    sentiment = x['sentiment']  # the sentiment label of this tweet

    if sentiment == 'neutral':  # neutral tweets are returned unchanged
        return tweet
    if sentiment == 'positive':  # positive tweets use the original weights
        weights = weights_dict
    if sentiment == 'negative':  # negative tweets use the sign-flipped weights, so the relevant words end up with positive weights
        weights = inv_weights_dict

    text = tweet.split()  # split the tweet into individual words
    text_len = len(text)  # number of words
    # build every contiguous span of words: for each start index i, take all spans text[i:j+1]
    subsets = [text[i:j+1] for i in range(text_len) for j in range(i, text_len)]
    subsets = sorted(subsets, key=len)

    score = 0
    selected = ''
    # iterate over the candidate spans (each element of subsets is itself a list of words)
    for i in range(len(subsets)):
        subtr_sum = 0
        # iterate over the words inside this span
        for p in range(len(subsets[i])):
            # str.translate(str.maketrans('', '', string.punctuation)) strips any punctuation from the word
            words_in_substr = subsets[i][p].translate(str.maketrans('', '', string.punctuation))
            # look the cleaned word up in the weight dictionary (not every token made it into the vocabulary)
            if(words_in_substr in weights.keys()):
                # We noticed that our selected strings were ~375% longer than they should be, so we implemented a "cost function"
                # to encourage smaller strings
                # accumulate the span's total weight, penalizing each word by its length relative to the tweet's word count
                subtr_sum += weights[words_in_substr] - a * (len(words_in_substr) / text_len)

        # tol = tol * 5  # increase the tolerance a bit each time we choose a selection
        if(subtr_sum > score + tol):
            score = subtr_sum
            selected = subsets[i]
    # if no span beat the threshold, fall back to the whole tweet
    if len(selected) == 0:
        selected = text

    return ' '.join(selected)
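
To see what the subsets comprehension above actually builds, here is a tiny illustration on a made-up sentence: every contiguous span of words, sorted from shortest to longest.

demo_words = "so sad today".split()
demo_len = len(demo_words)
demo_subsets = sorted([demo_words[i:j+1] for i in range(demo_len) for j in range(i, demo_len)], key=len)
print(demo_subsets)
# [['so'], ['sad'], ['today'], ['so', 'sad'], ['sad', 'today'], ['so', 'sad', 'today']]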

# from https://www.kaggle.com/rajaram1988/ignored-stop-words-using-only-word-counts
def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # print("{} - {}".format(str1, str2))
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
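
For instance, on two made-up strings the metric behaves as expected:

print(jaccard("so sad today", "sad today"))   # intersection {sad, today}, union {so, sad, today} -> 0.666...
print(jaccard("happy day", "happy day"))      # identical strings -> 1.0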


# to prevent warnings appearing in the console
pd.options.mode.chained_assignment = None

# some parameters for the text selector
tol = 0.0015
a = 5
print("tol = {}".format(tol))#----0.0015
print("a = {}".format(a))#------5
# make predictions on training set
x_train['prediction'] = ''  # add a prediction column to the training set
for key, row in x_train.iterrows():  # iterrows() walks the DataFrame; key is the row index, row is the row content as a Series
    # selected is the word or phrase carrying the key sentiment information in this tweet, e.g. 'sad!'
    selected = calc_selected_text(row, tol, a)
    # fill in the prediction for the matching sample (matched by textID)
    x_train.loc[x_train['textID'] == row['textID'], ['prediction']] = selected
# the Jaccard coefficient lies in [0, 1] and measures how similar two sets are; the closer to 1, the more similar
x_train['jaccard'] = x_train.apply(lambda x: jaccard(x['selected_text'], x['prediction']), axis=1)
# print the mean Jaccard score on the training set
print('Jaccard for training set = ', np.mean(x_train['jaccard']))


# make predictions on validation set
# do the same for the validation split x_val
x_val['prediction'] = ''
for key, row in x_val.iterrows():
    selected = calc_selected_text(row, tol, a)

    x_val.loc[x_val['textID'] == row['textID'], ['prediction']] = selected

x_val['jaccard'] = x_val.apply(lambda x: jaccard(x['selected_text'], x['prediction']), axis=1)

print('Jaccard for validation set = ', np.mean(x_val['jaccard']))


# make final submission
# use the fitted pipeline to make real predictions on the test set
test_data['prediction'] = ''
for index, row in test_data.iterrows():
    selected_text = calc_selected_text(row, tol, a)

    sample.loc[sample['textID'] == row['textID'], ['selected_text']] = selected_text

sample.to_csv('submission.csv', index=False)