tensorflow实现Word2Vec(找到目标英文单词的相近词)

最新推荐文章于 2023-02-24 13:30:55 发布

秦玉坤_nj

最新推荐文章于 2023-02-24 13:30:55 发布

阅读量1.6k

点赞数

分类专栏：深度学习 matplotlib

本文链接：https://blog.csdn.net/qq_39131592/article/details/79056185

版权

根据自己的理解写的读书笔记。

import collections
import math
import os
import random
import zipfile
import urllib
import numpy as np
import tensorflow as tf

# Helper that downloads the text corpus. It is disabled (commented out);
# the script uses the local archive path assigned to `filename` below instead.
# url = 'http://mattmahoney.net/dc/'
#
# def maybe_download(filename,expected_bytes):
#     if not os.path.exists(filename):
#         filename,_ = urllib.request.urlretrieve(url + filename,filename)
#     statinfo = os.stat(filename)  # Fetch the file's metadata.
#     if statinfo.st_size == expected_bytes:  # File size, in bytes.
#         print('Found and verified(验证）',filename)
#     else:
#         print(statinfo.st_size)
#         raise Exception('Failed to verify(验证）' + filename + 'Can you get to it with a browser(浏览器)?')
#     return filename
#
# filename = maybe_download('text8.zip',31344016)

# Path of the zipped corpus that is passed to read_data().
filename = './text8.zip'

# Unzip the archive and convert its text into a list of words.
def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Args:
        filename: Path of the zip archive to open.

    Returns:
        A list of strings: the first archive member, converted with
        ``tf.compat.as_str`` and split on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        # Only the first member listed in the archive is read.
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

# Load the corpus as a flat list of word tokens.
words = read_data(filename)
# print('Data size',len(words))
# print(words)

# Vocabulary size: build_dataset() keeps an 'UNK' entry plus the
# (vocabulary_size - 1) most frequent words and stores them in a dictionary.
vocabulary_size = 50000

def build_dataset(words, vocab_size=None):
    """Map a token sequence onto integer ids of a frequency-capped vocabulary.

    Id 0 is reserved for the ``'UNK'`` placeholder; ids 1.. are assigned to
    the most frequent words in descending order of frequency.

    Args:
        words: Sequence of word tokens (strings).
        vocab_size: Total number of vocabulary entries, including ``'UNK'``.
            When omitted, the module-level ``vocabulary_size`` is used.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)`` where
        ``data`` is the list of ids, one per token of ``words`` (0 for
        tokens outside the vocabulary);
        ``count`` is ``[['UNK', n_unknown], (word, frequency), ...]``;
        ``dictionary`` maps word -> id;
        ``reverse_dictionary`` maps id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    frequencies = collections.Counter(words)
    # A literal 'UNK' token in the corpus must not claim a second vocabulary
    # slot: it would overwrite the reserved id 0 and make the next word share
    # an id with it. Such tokens are simply treated as unknown words.
    frequencies.pop('UNK', None)

    # The first entry is a list (not a tuple) so its count can be filled in
    # once the unknown tokens have been tallied.
    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocab_size - 1))

    # Entries of `count` are unique, so the position in `count` is the id.
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # Tokens missing from the vocabulary fall back to id 0 ('UNK').
    data = [dictionary.get(word, 0) for word in words]
    count[0][1] = data.count(0)

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Encode the corpus as ids and keep the frequency table and both lookup maps.
data,count,dictionary,reverse_dictionary = build_dataset(words)

# Delete the original word list to save memory. The prints below show the
# vocabulary so the word frequencies can be inspected.
del words
# print('Most common words (+UNK)',count[:5])
# print('Sample data',data[:10],[reverse_dictionary[i] for i in data[:10]])

# Everything above is data preprocessing: it yields each word's frequency and its index in the dictionary.

# Skip-gram mode: infer the context from the target word.

最低0.47元/天解锁文章

秦玉坤_nj

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
tensorflow实现Word2Vec(找到目标英文单词的相近词)

根据自己的理解写的读书笔记。 import collections; import math; import os; import random; import zipfile; import urllib; import numpy as np; import tensorflow as tf  # 定义下载文本数据的函数  # url = 'http://mattmahoney.net/dc/'  # # de
复制链接

扫一扫