Word2Vec

最新推荐文章于 2021-09-27 09:49:26 发布

神坑教无心

最新推荐文章于 2021-09-27 09:49:26 发布

阅读量264

点赞数

分类专栏： tensorflow

本文链接：https://blog.csdn.net/qq_34000894/article/details/80384513

版权

#encoding:utf-8
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf
# url = 'http://mattmahoney.net/dc/'
# def maybe_download(filename, expected_bytes):
#     if not os.path.exists(filename):
#         filename, _ = urllib.request.urlretrieve(url + filename, filename)
#     statinfo = os.stat(filename)
#     if statinfo.st_size == expected_bytes:
#         print('Found and verified', filename)
#     else:
#         print(statinfo.st_size)
#         raise Exception(        'Failed to verify ' + filename + '. Can you get to it with a browser?')
#     return filename
#
# filename = maybe_download('text8.zip', 31344016)

# Path of the local zip archive to read; the download helper above is
# commented out, so the file must already exist at this location.
filename = './text8.zip'
def read_data(filename):
    """Read the first member of a zip archive and return its tokens.

    The member's bytes are converted to ``str`` and split on whitespace,
    so the result is a flat list of word strings.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
        text = tf.compat.as_str(raw_bytes)
    tokens = text.split()
    return tokens

# Load the archive's contents as a list of tokens and report how many were read.
words = read_data(filename)
print('Data size', len(words))

# Total number of ids in the vocabulary, including the 'UNK' slot at id 0.
vocabulary_size = 50000


def build_dataset(words):
    """Convert a token list into integer ids over the most frequent words.

    The ``vocabulary_size - 1`` most common tokens receive ids 1..N in
    descending frequency order; every other token is mapped to id 0
    ('UNK').

    Args:
        words: sequence of token strings.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the list of ids, one per input token; ``count`` is a
        list of ``[word, frequency]`` pairs starting with
        ``['UNK', <number of out-of-vocabulary tokens>]``; ``dictionary``
        maps word -> id; ``reverse_dictionary`` maps id -> word.
    """
    # The UNK frequency is a placeholder until the corpus has been scanned.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            # Out-of-vocabulary tokens share the UNK id.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    # The original return statement was cut off (`return data, coun`), which
    # raised NameError; return every structure the function builds.
    return data, count, dictionary, reverse_dictionary

最低0.47元/天解锁文章

神坑教无心

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Word2Vec

#encoding:utf-8import collectionsimport mathimport osimport randomimport zipfileimport numpy as npimport urllibimport tensorflow as tf# url = 'http://mattmahoney.net/dc/'# def maybe_download...
复制链接

扫一扫

专栏目录