深度学习视觉
公众号:深度学习视觉
import operator
import re

import pandas as pd
from tqdm import tqdm
# Load the vocabulary file (one token per line) into a token -> index dict.
dict_path = '../bertModel/vocab.txt'
token_dict = getTokenDict(dict_path)
# Build one text per training row by concatenating its two query columns.
train_data_file = './tcdata/train.csv'
train_data = pd.read_csv(train_data_file)
# Columns are selected by label and added element-wise. The previous
# row-wise apply used x[0] / x[1], i.e. integer keys on a label-indexed row,
# which pandas only resolves through its deprecated positional fallback.
sentences = (train_data['query1'] + train_data['query2']).values
相关函数
# Build a frequency dictionary from the texts.
def build_vocab(sentences):
    '''
    Count how often each element (character) occurs across all sentences.

    sentences: iterable of sentences; every sentence is itself iterated
        element by element, so a str such as "w1w2w3w4,w5w6,w7." is
        counted character by character.
    return: dict mapping each element to its number of occurrences.
    '''
    vocab = {}
    for sentence in tqdm(sentences):
        for word in sentence:
            # dict.get supplies the 0 start value for unseen keys, so no
            # catch-all except (which would also hide KeyboardInterrupt)
            # is needed to initialise a counter.
            vocab[word] = vocab.get(word, 0) + 1
    return vocab
def check_coverage(vocab, embeddings_index):
    '''
    Report how much of the text vocabulary is present in embeddings_index.

    Prints two ratios: the share of distinct words that were found, and the
    share of all word occurrences (weighted by frequency) that the found
    words account for. Both ratios are reported as 0 when there is nothing
    to divide by (empty vocab or all-zero frequencies).

    vocab: mapping word -> frequency in the text.
    embeddings_index: object indexed as embeddings_index[word]; a word counts
        as missing when that lookup raises KeyError.
    return: list of (word, frequency) tuples for the words that were not
        found, ordered from most to least frequent.
    '''
    iv = {}  # in vocab: word -> value found in embeddings_index
    oov = {}  # out of vocab: word -> frequency in the text
    covered_count = 0
    missing_count = 0
    for word in tqdm(vocab):
        try:
            iv[word] = embeddings_index[word]
        except KeyError:
            # Only a failed lookup means "not covered"; any other error
            # is a real problem and is allowed to propagate.
            oov[word] = vocab[word]
            missing_count += vocab[word]
        else:
            covered_count += vocab[word]
    total_count = covered_count + missing_count
    vocab_ratio = len(iv) / len(vocab) if vocab else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for {:.2%} of all text'.format(text_ratio))
    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the ordering of equal-frequency words exactly as it was before.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x
def getTokenDict(dict_path, encoding='utf-8'):
    '''
    Read a vocabulary file and map every token to an integer index.

    dict_path: path of the vocabulary file, one token per line.
    encoding: text encoding used to open the file.
    return: dict token -> index. Each line is stripped of surrounding
        whitespace and assigned the current size of the dict, so a repeated
        token is re-assigned rather than given a fresh slot.
    '''
    with open(dict_path, encoding=encoding) as vocab_file:
        stripped_lines = [raw_line.strip() for raw_line in vocab_file]
    mapping = {}
    for entry in stripped_lines:
        # The index is the dict size at assignment time (not the line
        # number), which matters when the file contains duplicates.
        mapping[entry] = len(mapping)
    return mapping
def clean_numbers(x):
    '''
    Mask runs of digits in a string with '#' placeholders.

    A run of five or more digits becomes '#####'; runs of four, three and
    two digits become the same number of '#'. A single isolated digit is
    left unchanged.

    x: input text (str).
    return: the text with the digit runs replaced.
    '''
    # Order matters: the longest pattern runs first so that a long run is
    # masked as a whole instead of being split by the shorter patterns.
    replacements = (
        (r'[0-9]{5,}', '#####'),
        (r'[0-9]{4}', '####'),
        (r'[0-9]{3}', '###'),
        (r'[0-9]{2}', '##'),
    )
    for pattern, mask in replacements:
        x = re.sub(pattern, mask, x)
    return x