记录下一些常用的框架代码,以便后续快速使用
embedding
查看覆盖率
import operator
import re

import pandas as pd
from gensim.models import KeyedVectors
from tqdm import tqdm
tqdm.pandas()
# Path to the pretrained GoogleNews word2vec binary (Kaggle input layout).
news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# Load the vectors once at module level; the binary is large, so this is
# slow and memory-hungry — expect it to dominate startup time.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)
# vocab is a dict mapping {word: frequency}
def check_coverage(vocab, embeddings_index):
    """Report how much of the vocabulary the embeddings cover.

    Prints two coverage figures — unique-word coverage and token (frequency-
    weighted) coverage — and returns the out-of-vocabulary words.

    Args:
        vocab: dict mapping word -> occurrence count.
        embeddings_index: mapping word -> vector; missing words must raise
            KeyError (gensim KeyedVectors does).

    Returns:
        List of (word, count) pairs for OOV words, most frequent first.
    """
    known = {}
    oov = {}
    covered_tokens = 0
    missed_tokens = 0
    for word in tqdm(vocab):
        try:
            known[word] = embeddings_index[word]
            covered_tokens += vocab[word]
        except KeyError:  # word absent from the embeddings — was a bare except
            oov[word] = vocab[word]
            missed_tokens += vocab[word]
    print('Found embeddings for {:.2%} of vocab'.format(len(known) / len(vocab)))
    print('Found embeddings for {:.2%} of all text'.format(covered_tokens / (covered_tokens + missed_tokens)))
    # Descending by frequency so the most impactful OOV words come first.
    return sorted(oov.items(), key=operator.itemgetter(1), reverse=True)
文本清洗
def clean_text(x):
    """Normalise punctuation in *x*.

    '/', '-' and apostrophes become spaces, '&' is padded with spaces so it
    survives tokenisation, and every other punctuation mark is stripped.
    """
    text = str(x)
    # One translation table does the work of the three chained replace loops:
    # per-character substitutions are order-independent here because the
    # inserted characters (space, '&') are not in the removal set.
    table = {ch: '' for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'}
    table.update({ch: ' ' for ch in "/-'"})
    table['&'] = ' & '
    return text.translate(str.maketrans(table))
# The pretrained embeddings tokenise numbers of different lengths as
# different '#' patterns, so digit runs are masked to match.
def clean_numbers(x):
    """Mask digit runs in *x* with '#' placeholders.

    2-, 3- and 4-digit runs become '##', '###' and '####'; runs of five or
    more digits collapse to '#####'. Requires `re` — note the original file
    never imported it (fixed in the import block).

    Args:
        x: the input string.

    Returns:
        The string with digit runs replaced.
    """
    # Longest pattern first so e.g. a 4-digit year is not eaten by the
    # 2-digit rule.
    for pattern, mask in ((r'[0-9]{5,}', '#####'),
                          (r'[0-9]{4}', '####'),
                          (r'[0-9]{3}', '###'),
                          (r'[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
# Example: mask digit runs in every question. The function reference is
# passed directly — the original `lambda x: clean_numbers(x)` wrapper added
# nothing but call overhead.
train["question_text"] = train["question_text"].progress_apply(clean_numbers)
# 不同表达词的转换
def _get_mispell(mispell_dict):
mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
return mispell_dict, mispell_re
# Normalisation map: British spellings, missing-apostrophe contractions and
# brand names mapped to forms more likely to exist in the embedding vocabulary.
mispell_dict = {'colour':'color',
'centre':'center',
'didnt':'did not',
'doesnt':'does not',
'isnt':'is not',
'shouldnt':'should not',
'favourite':'favorite',
'travelling':'traveling',
'counselling':'counseling',
'theatre':'theater',
'cancelled':'canceled',
'labour':'labor',
'organisation':'organization',
'wwii':'world war 2',
'citicise':'criticize',
'instagram': 'social medium',
'whatsapp': 'social medium',
'snapchat': 'social medium'
}
# Build the lookup dict and its compiled alternation pattern once, at import time.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Rewrite known spelling variants in *text*.

    Uses the module-level `mispellings` map and its precompiled pattern
    `mispellings_re`; each regex match is replaced by its dictionary value.
    """
    return mispellings_re.sub(lambda m: mispellings[m.group(0)], text)
train["question_text"] = train["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
sentences = train["question_text"].progress_apply(lambda x: x.split())
# 停用词的去除
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
evaluation
F1 score
Not sure it has been shared here yet, but @faron had implemented a very effective F1 score optimization based on the paper Optimizing F-measure: A Tale of Two Approaches.
# Tunes the decision threshold for a binary classifier.
def bestThresshold(y_train, train_preds):
    """Grid-search the probability threshold that maximises F1.

    Scans thresholds in [0.20, 0.35) with step 0.01, scoring the binarised
    predictions against *y_train* at each step.

    Returns:
        (best_threshold, best_f1) — (0, 0) if no threshold scores above zero.
    """
    best_thresh = 0
    best_f1 = 0
    for thresh in tqdm(np.arange(0.2, 0.35, 0.01)):
        score = f1_score(y_train, np.array(train_preds) > thresh)
        if score > best_f1:
            best_thresh = thresh
            best_f1 = score
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(best_thresh, best_f1))
    return best_thresh, best_f1