根据自己的理解写的读书笔记。
import collections
import math
import os
import random
import zipfile
import urllib

import numpy as np
import tensorflow as tf

# Helper for downloading the text corpus. It is kept disabled here because a
# local copy of text8.zip is used instead.
# NOTE(review): if this is re-enabled it should use `import urllib.request`;
# a bare `import urllib` does not by itself guarantee the submodule is loaded.
# url = 'http://mattmahoney.net/dc/'
#
# def maybe_download(filename, expected_bytes):
#     if not os.path.exists(filename):
#         filename, _ = urllib.request.urlretrieve(url + filename, filename)
#     statinfo = os.stat(filename)  # detailed information about the file
#     if statinfo.st_size == expected_bytes:  # file size in bytes
#         print('Found and verified', filename)
#     else:
#         print(statinfo.st_size)
#         raise Exception('Failed to verify ' + filename +
#                         '. Can you get to it with a browser?')
#     return filename
#
# filename = maybe_download('text8.zip', 31344016)
filename = './text8.zip'


def read_data(filename):
    """Unzip an archive and return the words of its first member.

    Args:
        filename: Path of the zip archive to open.

    Returns:
        A list of the whitespace-separated tokens found in the first file
        listed in the archive.
    """
    with zipfile.ZipFile(filename) as f:
        # Take the first name in the archive, read its raw bytes, convert
        # them to str via tf.compat.as_str, then split on whitespace.
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


words = read_data(filename)
# print('Data size', len(words))
# print(words)

# Vocabulary: the most frequent words are kept, 50000 entries in total
# including the 'UNK' placeholder.
vocabulary_size = 50000


def build_dataset(words):
    """Encode a word list as integer indices and build the lookup tables.

    Index 0 is reserved for the 'UNK' placeholder. The following indices are
    given to the ``vocabulary_size - 1`` most frequent words, in the order
    returned by ``Counter.most_common``.

    A literal ``'UNK'`` token in ``words`` is treated like any other
    out-of-vocabulary word: it maps to index 0 and is included in the
    unknown-word total.

    Args:
        words: Sequence of word strings.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)`` where
        ``data`` is the list of indices (one per input word, 0 for words
        outside the vocabulary), ``count`` is a list starting with
        ``['UNK', <number of unknown words>]`` followed by
        ``(word, frequency)`` tuples, ``dictionary`` maps word -> index and
        ``reverse_dictionary`` maps index -> word.
    """
    frequencies = collections.Counter(words)
    # If the corpus itself contained the token 'UNK', it would be inserted
    # into the dictionary twice; len(dictionary) would then stop advancing
    # and two different words would end up sharing one index. Removing it
    # from the frequency table keeps index 0 exclusively for the placeholder.
    frequencies.pop('UNK', None)

    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocabulary_size - 1))
    # Example recorded in the original notes (using most_common(10)):
    # [['UNK', -1], ('the', 1061396), ('of', 593677), ('and', 416629),
    #  ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376),
    #  ('zero', 264975), ('nine', 250430), ('two', 192644)]

    # Every entry of count is now a distinct word, so its position in the
    # list is its index.
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    # Matching example from the original notes:
    # {'UNK': 0, 'the': 1, 'of': 2, 'and': 3, 'one': 4, 'in': 5, 'a': 6,
    #  'to': 7, 'zero': 8, 'nine': 9, 'two': 10}

    data = []
    unk_count = 0  # number of words that fall outside the vocabulary
    for word in words:
        # Words missing from the dictionary get index 0.
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)

    # Replace the -1 placeholder with the real number of unknown words.
    count[0][1] = unk_count
    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(words)
# Delete the raw word list to save memory.
del words
# Print the vocabulary to get a feel for the word frequencies.
# print('Most common words (+UNK)', count[:5])
# print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
# Everything above is data preprocessing: it produces each word's frequency
# and its index in the dictionary.
# Skip-gram mode: infer the context from the target word.