import collections
import math
import random

# Count how often each token appears in the corpus.
counter = collections.Counter([token for sentence in raw_dataset for token in sentence])
# print(counter.items())

# Keep only tokens that appear at least 5 times.
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))

# [token for sentence in raw_dataset for token in sentence] is equivalent to:
# a = []
# for sentence in raw_dataset:
#     for token in sentence:
#         a.append(token)

# Map each token to an integer index.
idx_to_token = [token for token, _ in counter.items()]
# print(idx_to_token)
# ['pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as',
#  'a', 'nonexecutive', 'director', 'nov.', ...
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
# print(token_to_idx)
# {'pierre': 0, '<unk>': 1, 'N': 2, 'years': 3, 'old': 4, 'will': 5, 'join': 6,
#  'the': 7, 'board': 8, 'as': 9, ...
# Convert every sentence to a list of integer indices, dropping tokens that were
# filtered out of the vocabulary.
dataset = [[token_to_idx[token] for token in sentence if token in token_to_idx]
           for sentence in raw_dataset]
num_tokens = sum([len(sentence) for sentence in dataset])
# print(num_tokens)  # before subsampling: 887100

# The comprehension above is equivalent to:
# print(len(dataset))
# print(dataset)
# b = []
# for sentence in raw_dataset:
#     c = []
#     for token in sentence:
#         if token in token_to_idx:
#             c.append(token_to_idx[token])
#     b.append(c)
# print(len(b))
# print(b)
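As a sanity check, the same vocabulary pipeline can be run on a tiny made-up corpus (the toy sentences and the min count of 2 below are illustrative, not part of the PTB setup above); the final assert verifies that token_to_idx and idx_to_token invert each other:

toy_corpus = [['the', 'cat', 'sat'], ['the', 'cat', 'ran'], ['dogs', 'ran']]
toy_counter = collections.Counter(tok for sent in toy_corpus for tok in sent)
toy_counter = {tok: c for tok, c in toy_counter.items() if c >= 2}  # min count 2
toy_idx_to_token = list(toy_counter)
toy_token_to_idx = {tok: i for i, tok in enumerate(toy_idx_to_token)}
toy_dataset = [[toy_token_to_idx[t] for t in s if t in toy_token_to_idx]
               for s in toy_corpus]
print(toy_dataset)  # [[0, 1], [0, 1, 2], [2]] -- 'sat' and 'dogs' were filtered out
# The two mappings invert each other.
assert all(toy_idx_to_token[toy_token_to_idx[t]] == t for t in toy_token_to_idx)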
1.2 Subsampling
# 1.2 Subsampling
# A token w is discarded with probability max(0, 1 - sqrt(t / f(w))), where
# f(w) = counter[w] / num_tokens is its relative frequency and t = 1e-4, so
# very frequent words are dropped far more often than rare ones.
def discard(idx):
    return random.uniform(0, 1) < 1 - math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)
subsampled_dataset = [[token for token in sentence if not discard(token)]
                      for sentence in dataset]
num_tokens_2 = sum([len(sentence) for sentence in subsampled_dataset])
print('after subsampling:', num_tokens_2)  # after subsampling: 375930

# Compare how often a word appears in the dataset before and after subsampling.
def compare_counts(token):
    return '# %s: before=%d, after=%d' % (
        token,
        sum([st.count(token_to_idx[token]) for st in dataset]),
        sum([st.count(token_to_idx[token]) for st in subsampled_dataset]))

# print(compare_counts('the'))   # the: before=50770, after=2089
# print(compare_counts('join'))  # join: before=45, after=45
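Those before/after counts match what the discard formula predicts. As a rough check (the helper expected_after below is hypothetical, added here for illustration): for 'the', f ≈ 50770 / 887100 ≈ 0.057, so the keep probability sqrt(1e-4 / f) ≈ 0.042 and we expect roughly 2.1k surviving occurrences, close to the observed 2089; 'join' makes up less than t = 1e-4 of the corpus, so it is never discarded.

def expected_after(token, t=1e-4):
    # Expected number of surviving occurrences: count * min(1, sqrt(t / f(w))).
    before = counter[token]
    keep_prob = min(1.0, math.sqrt(t / (before / num_tokens)))
    return before * keep_prob

print(expected_after('the'))   # ~2122, close to the observed after=2089
print(expected_after('join'))  # 45.0 -- f('join') < t, so 'join' is never dropped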
1.3 Extracting Center Words and Context Words
# 1.3 Extracting center words and context words
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        # A sentence needs at least 2 words to form a center-context pair.
        if len(st) < 2:
            continue
        centers += st
        for center_i in range(len(st)):
            # Sample a random window size between 1 and max_window_size.
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)  # exclude the center word from its contexts
            contexts.append([st[idx] for idx in indices])
    return centers, contexts
# tiny_dataset = [list(range(7)), list(range(7, 10))]
# print('dataset', tiny_dataset)
# for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
#     print('center', center, 'has contexts', context)
#
# # dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
# # center 0 has contexts [1]
# # center 1 has contexts [0, 2]
# # center 2 has contexts [0, 1, 3, 4]
# # center 3 has contexts [1, 2, 4, 5]
# # center 4 has contexts [3, 5]
# # center 5 has contexts [3, 4, 6]
# # center 6 has contexts [4, 5]
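Beyond the toy example, a quick consistency check can be run on the real data (calling with subsampled_dataset and max_window_size=5 below is an illustrative choice, not fixed by the code above): the two returned lists are parallel, one context list per center word, and every context list holds between 1 and 2 * max_window_size indices.

all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)
assert len(all_centers) == len(all_contexts)                 # one context list per center
assert all(1 <= len(ctx) <= 2 * 5 for ctx in all_contexts)   # window bounds hold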