Word2vec ——The shapes of stories

word2vec可以用来捕捉词之间的关系。
这里写图片描述

word2vec也可以通过一组反义词对构造出一个语义维度(比如性别、阶层)[1][2],然后看其他词在这个维度上的投影。

这里写图片描述

这里写图片描述

这里写图片描述

受此启发,我们尝试刻画一个“幸运—不幸运”的维度,来观察故事在这个维度上随时间发展的轨迹。

def flushPrint(s):
    """Write a carriage return followed by ``s`` to stdout and flush.

    ``s`` is rendered with ``%s`` formatting. No newline is written, so
    successive calls keep printing on the same console line.
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()

def normalize(v):
    """Divide ``v`` by its norm as computed by ``np.linalg.norm``.

    When the norm is exactly zero, ``v`` itself is returned unchanged so
    that no division by zero takes place.
    """
    length = np.linalg.norm(v)
    return v if length == 0 else v / length


def cultureD2(model, words):
    """Build a unit "dimension" vector from pairs of opposing words.

    For every pair in ``words`` whose two members are both known to the
    model, the difference ``vector(pair[0]) - vector(pair[1])`` is
    normalized and collected. The mean of the collected differences is
    normalized again and returned. Pairs with an unknown member are
    printed and skipped.

    Args:
        model: Word2vec model exposing its word vectors as ``model.wv``
            (gensim-style; membership test and lookup go through ``wv``
            so the code works with both gensim 3.x and 4.x).
        words: Iterable of two-item sequences ``[word_a, word_b]``.

    Returns:
        The normalized mean difference vector. If no pair is usable the
        mean is taken over an empty list, which NumPy evaluates to NaN
        (with a RuntimeWarning).
    """
    vectors = model.wv
    cultures = []
    for word in words:
        if word[0] in vectors and word[1] in vectors:
            cultures.append(normalize(vectors[word[0]] - vectors[word[1]]))
        else:
            # Show which pairs could not contribute to the dimension.
            print(word)
    culturev = normalize(np.mean(cultures, axis=0))
    return culturev

def avglead(model, results):
    """Return the normalized mean vector of the leading words in ``results``.

    Args:
        model: Word2vec model exposing its word vectors as ``model.wv``.
        results: Sliceable sequence whose items carry a word at index 0
            (presumably ``(word, score)`` pairs from a similarity query;
            confirm against the caller).

    Returns:
        The normalized mean of the vectors of those words, among the
        first 100 entries of ``results``, that the model knows. If none
        is known, the mean of an empty list is NaN.
    """
    vectors = model.wv
    # Slicing instead of indexing 0..99 tolerates fewer than 100 results.
    leadlist = [item[0] for item in results[:100]]
    veclist = [vectors[w] for w in leadlist if w in vectors]
    avgv = normalize(np.mean(veclist, axis=0))
    return avgv

def storyv(model, words):
    """Return the normalized mean word vector of a sequence of tokens.

    Averaging the vectors of all tokens in one story segment gives a
    single vector describing that segment.

    Args:
        model: Word2vec model exposing its word vectors as ``model.wv``.
        words: Iterable of tokens; tokens unknown to the model are
            ignored.

    Returns:
        The normalized mean of the known tokens' vectors. If no token is
        known, the mean of an empty list is NaN.
    """
    vectors = model.wv
    veclist = [vectors[w] for w in words if w in vectors]
    # Named differently from the function to avoid shadowing it locally.
    story_vector = normalize(np.mean(veclist, axis=0))
    return story_vector

def projection(word, culturev):
    """Return the cosine similarity between ``word`` and ``culturev``.

    ``word`` is normalized before the comparison; ``culturev`` is used
    as passed in. The similarity is one minus SciPy's cosine distance.
    """
    unit_word = normalize(word)
    cosine_distance = spatial.distance.cosine(unit_word, culturev)
    return 1 - cosine_distance


def cosine2angle(i):
    """Convert a cosine value ``i`` to the corresponding angle in degrees."""
    radians = np.arccos(i)
    return radians / np.pi * 180

# Antonym pairs about being lucky vs. unlucky; each entry lists the
# positive word first and its negative counterpart second.
swords = [
    ['success', 'failure'], ['succeed', 'fail'],
    ['lucky', 'unlucky'], ['fortunate', 'unfortunate'],
    ['happy', 'unhappy'], ['glad', 'sad'],
    ['joy', 'sorrow'], ['smile', 'tear'],
]

def savitzky_golay(y, window_size, order, deriv=0, rate=1):
    """Smooth (or differentiate) a 1-D signal with a Savitzky-Golay filter.

    A polynomial of degree ``order`` is least-squares fitted inside a
    sliding window of ``window_size`` samples; the filter output is the
    value (or ``deriv``-th derivative) of that polynomial at the window
    centre.

    Args:
        y: 1-D signal; an array or any sequence ``np.asarray`` accepts.
        window_size: Window length; must be a positive odd integer and
            at least ``order + 2``.
        order: Degree of the fitted polynomial.
        deriv: Order of the derivative to compute (0 means smoothing).
        rate: Scale factor applied as ``rate**deriv``.

    Returns:
        The filtered signal as a NumPy array. It has the same length as
        ``y`` provided ``y`` has more than ``window_size // 2`` samples.

    Raises:
        ValueError: If ``window_size`` or ``order`` cannot be converted
            to ``int``.
        TypeError: If ``window_size`` is not a positive odd number or is
            too small for ``order``.
    """
    import numpy as np
    from math import factorial

    try:
        # The builtin int replaces np.int, which NumPy 1.24 removed.
        window_size = abs(int(window_size))
        order = abs(int(order))
    except ValueError as err:
        raise ValueError("window_size and order have to be of type int") from err
    if window_size % 2 != 1 or window_size < 1:
        raise TypeError("window_size size must be a positive odd number")
    if window_size < order + 2:
        raise TypeError("window_size is too small for the polynomials order")

    # Allow plain lists/tuples: the padding below needs array arithmetic.
    y = np.asarray(y)
    order_range = range(order + 1)
    half_window = (window_size - 1) // 2

    # Precompute the filter coefficients: row ``deriv`` of the
    # pseudo-inverse of the design matrix (a plain ndarray; np.mat was
    # removed in NumPy 2.0).
    b = np.array(
        [[k**i for i in order_range] for k in range(-half_window, half_window + 1)],
        dtype=float,
    )
    m = np.linalg.pinv(b)[deriv] * rate**deriv * factorial(deriv)

    # Pad the signal at both extremes with values taken from the signal
    # itself (reflected about the first and last sample).
    firstvals = y[0] - np.abs(y[1:half_window + 1][::-1] - y[0])
    lastvals = y[-1] + np.abs(y[-half_window - 1:-1][::-1] - y[-1])
    y = np.concatenate((firstvals, y, lastvals))

    # np.convolve flips its first argument, so reversing ``m`` applies
    # the coefficients in window order.
    return np.convolve(m[::-1], y, mode='valid')

# Use the story of Cinderella as the example.
# The text is the input corpus, so spelling matters: "shoe" (not "shoot")
# is the word whose vector should be looked up.
film_script_list=["Cinderella was a lovely girl",
"Her old mother died",
"Her father married a wife", 
"and the stepmother has brought two elder sisters",
"She does a lot of housework everyday",
"So the girl's clothes were very dirty",
"Everybody called her Cinderella",
"The prince gave all the females an invitation", 
"asked them to come in the palace to join the dance party", 
"But Cinderella’s stepmother didn’t let her go",
"The day of the party arrived. Cinderella met a fairy godmother",
"She gave Cinderella a coach, two shoes, two horses and a beautiful dress",
"But magic will stop working at midnight",
"Cinderella arrived at the palace and walked into the dance hall", 
"The Prince saw her. He danced only with Cinderella, and he fell in love with her.",
"A clock chiming reminded Cinderella of her fairy godmother's warning",
"She ran out of the palace. The Prince only found Cinderella’s one glass shoe",
"The next morning, The Prince proclaimed: Whoever the shoe fits, shall be wife to him",
"He arrived at the stepmother's house.",
"And Cinderella’s sisters can’t fit the shoe. But she can.",
"Then Cinderella took the other shoe from her pocket and put it on",
"Finally, the Prince and Cinderella were married. They lived happily ever after"]

# Pre-process the corpus: join the sentences into one string, lower-case
# it and tokenize it into a single flat list of tokens.
film_script_str = ','.join(film_script_list)
tknzr = WordPunctTokenizer()
corpus = tknzr.tokenize(film_script_str.lower())

# Cut the token list into n_split consecutive segments (story "time
# steps"). Slice ends are exclusive, so each segment ends exactly where
# the next one starts and no token is skipped; integer arithmetic avoids
# float rounding in the boundaries.
n_split = 10
corpus = [
    corpus[len(corpus) * i // n_split:len(corpus) * (i + 1) // n_split]
    for i in range(n_split)
]

# Reference word for the lucky/unlucky direction: the vocabulary word
# closest to the dimension built from the antonym pairs in `swords`.
# NOTE(review): `success_word` was used below without a visible
# definition; it is reconstructed here by analogy with `story_word` --
# confirm against the original notebook.
success_word = model.wv.similar_by_vector(cultureD2(model, swords), 1)[0][0]

anglesi = []
for wordlist in corpus:
    # Mean vector of all known tokens in this story segment.
    story_vector = storyv(model, wordlist)
    # Closest vocabulary word to the segment vector. A single key (not a
    # one-item list) keeps the looked-up vector 1-D, which SciPy's cosine
    # distance requires.
    story_word = model.wv.similar_by_vector(story_vector, 1)[0][0]

    # 90 degrees minus the angle between the two vectors: positive when
    # they point the same way, negative when they point apart.
    anglesi.append(90 - cosine2angle(projection(model.wv[success_word], model.wv[story_word])))

# Smooth the per-segment angles with a Savitzky-Golay filter
# (window size 11, polynomial order 3).
exa = savitzky_golay(np.array(anglesi), 11, 3)
# Cubic polynomial fitted to the smoothed curve; not used by the plot below.
p30 = np.poly1d(np.polyfit(range(len(anglesi)), exa, 3))

plt.plot(range(len(anglesi)), exa, label='smooth')
plt.title('Cinderella')

# Hand-placed captions marking plot events along the curve.
plt.text(0.3, 0.3, 'stepmother')
plt.text(2.5, 10.8, 'dance with prince')
plt.text(6.6, 9.4, 'marry with prince')

# plt.legend()
plt.show()

这里写图片描述

[1]: Kozlowski A C, Taddy M, Evans J A. The Geometry of Culture: Analyzing Meaning through Word Embeddings[J]. 2018.
[2]: Garg N, Schiebinger L, Jurafsky D, et al. Word embeddings quantify 100 years of gender and ethnic stereotypes.[J]. Proceedings of the National Academy of Sciences of the United States of America, 2018, 115(16):E3635.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值