#encoding:utf-8
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf
# url = 'http://mattmahoney.net/dc/'
# def maybe_download(filename, expected_bytes):
# if not os.path.exists(filename):
# filename, _ = urllib.request.urlretrieve(url + filename, filename)
# statinfo = os.stat(filename)
# if statinfo.st_size == expected_bytes:
# print('Found and verified', filename)
# else:
# print(statinfo.st_size)
# raise Exception( 'Failed to verify ' + filename + '. Can you get to it with a browser?')
# return filename
#
# filename = maybe_download('text8.zip', 31344016)
# Path to a local copy of the text8 archive; the download helper above is
# commented out, so the file must already exist at this location.
filename = './text8.zip'
def read_data(filename):
    """Read the first member of a zip archive as a list of tokens.

    The member's bytes are decoded as UTF-8 and split on any run of
    whitespace, so the result is a flat list of words with no empty strings.

    Args:
        filename: Path to the zip archive.

    Returns:
        list[str]: The whitespace-separated tokens of the first archive member.

    Raises:
        IndexError: If the archive contains no members.
        UnicodeDecodeError: If the member is not valid UTF-8.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # bytes.decode is what tf.compat.as_str does for bytes input; using it
        # directly keeps the loader independent of TensorFlow.
        text = archive.read(first_member).decode("utf-8")
    return text.split()
# Load the corpus as a flat list of tokens and report how many were read.
words = read_data(filename)
print('Data size', len(words))
# Vocabulary cap read by build_dataset: the 'UNK' entry plus the
# (vocabulary_size - 1) most frequent words.
vocabulary_size = 50000
def build_dataset(words, vocab_size=None):
    """Encode a token sequence as integer ids over a frequency-capped vocabulary.

    Id 0 is assigned to the placeholder 'UNK'; the most frequent words get
    ids 1, 2, ... in descending order of frequency. Any word outside the
    vocabulary is encoded as 0 and tallied in the 'UNK' count.

    Args:
        words: Re-iterable sequence of tokens (it is traversed twice: once to
            count frequencies and once to encode).
        vocab_size: Total vocabulary size including 'UNK'. When None, the
            module-level ``vocabulary_size`` is used.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
        data: list of ids, one per input token.
        count: ``[['UNK', n_unknown], (word, freq), ...]`` in id order.
        dictionary: mapping word -> id.
        reverse_dictionary: mapping id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # The first entry is a list (not a tuple) so the unknown-word tally can
    # be filled in after the encoding pass.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Out-of-vocabulary word: encode as 'UNK' and count it.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
# Word2Vec
# (web-page residue: "Latest recommended article published 2023-09-21 21:05:05")