挑战内容是编写一个算法来玩 Hangman(猜单词)游戏。你的算法应当优于我们提供的 baseline 算法,理想情况下准确率超过 50%。随本邮件附上了训练字典文件,以及一个 Jupyter Notebook 模板,模板演示了如何编写、运行并提交你的算法。
游戏规则:给定单词长度,让你逐个字母地猜出这个单词。猜对的字母会保留并显示在对应位置上;每猜错一个字母就会用掉一条命,每个单词共有六条命。
模型背景:已知游戏规则,并且有一个单词字典作为训练集。
这个游戏有两种思路:一种比较依赖 GPU 硬件,即用 RNN 学习字典中的信息,做 NLP 式的预测;另一种是基于游戏规则,从训练字典中提取统计信息,再利用这些先验信息来预测字典之外的单词。由于硬件限制,我采用的是后者:以单词长度、已猜对的字母及其所在位置、尚未猜出的字母所在位置等作为特征进行训练。
# A highlighted block
#my code
def update_counter(lm, data, order):
    """Add next-character counts for every ``order``-length context in ``data``.

    At each position the ``order`` characters starting there form the context
    and the character immediately after them is the observed successor. A pair
    is skipped when the context contains a space or the successor is a space.

    Args:
        lm: Mapping of context -> per-character counts. Missing contexts must
            be created on first access (e.g. ``defaultdict(Counter)``).
        data: String to scan.
        order: Number of characters in each context.

    Returns:
        The same ``lm`` object, updated in place.
    """
    for start in range(len(data) - order):
        stop = start + order
        context = data[start:stop]
        successor = data[stop]
        if ' ' in context or successor == ' ':
            continue
        lm[context][successor] += 1
    return lm
def train_char_lm(dic, order=3):
    """Train a masked character n-gram model from a space-separated corpus.

    Every window of ``order`` characters that contains neither a space nor an
    underscore is expanded into all patterns obtained by replacing between 1
    and ``order - 1`` of its characters with ``'_'``. For each pattern, the
    most frequent hidden character (the leftmost one on ties) is counted as
    the letter that the pattern predicts.

    Args:
        dic: Training text; words are expected to be separated by single
            spaces so that windows never straddle two words.
        order: Window length in characters.

    Returns:
        dict mapping each masked pattern (a string of length ``order``) to a
        list of ``(letter, probability)`` tuples whose probabilities sum to 1.
    """
    lm = defaultdict(Counter)
    data = dic

    # The sets of positions to hide depend only on ``order``, so build them
    # once instead of once per window. Larger masks come first, matching the
    # order in which patterns were generated originally.
    masks = [
        frozenset(positions)
        for hidden_count in range(order - 1, 0, -1)
        for positions in combinations(range(order), hidden_count)
    ]

    # "+ 1" so that the final window of the text is included; without it the
    # last gram of the corpus was silently dropped.
    for start in range(len(data) - order + 1):
        gram = data[start:start + order]
        if ' ' in gram or '_' in gram:
            continue
        for mask in masks:
            history = "".join(
                '_' if idx in mask else ch for idx, ch in enumerate(gram)
            )
            hidden = [ch for idx, ch in enumerate(gram) if idx in mask]
            # Only one hidden letter is credited per pattern: the most
            # frequent one, with ties resolved in favour of the leftmost.
            char = Counter(hidden).most_common(1)[0][0]
            lm[history][char] += 1

    def normalize(counter):
        # Turn raw counts into relative frequencies.
        total = float(sum(counter.values()))
        return [(c, cnt / total) for c, cnt in counter.items()]

    return {hist: normalize(chars) for hist, chars in lm.items()}
def check_guessed(sorted_letter_count, guessed_letters):
    """Return the first ``(letter, score)`` pair that is still guessable.

    Args:
        sorted_letter_count: Iterable of ``(letter, score)`` pairs, already
            ordered from most to least preferred.
        guessed_letters: Collection of letters that have been tried already.

    Returns:
        The first pair whose letter is not ``'_'`` and has not been guessed,
        as a ``(letter, score)`` tuple, or ``None`` when no such pair exists.
    """
    for letter, instance_prob in sorted_letter_count:
        # Compare by value: an identity test against a string literal depends
        # on interning and triggers a SyntaxWarning on Python 3.8+.
        if letter != '_' and letter not in guessed_letters:
            return (letter, instance_prob)
    return None
def update_candidate(candidate_list, curr_guess):
    """Accumulate the score of a proposed letter into the candidate table.

    Args:
        candidate_list: dict mapping letter -> accumulated score; modified in
            place.
        curr_guess: ``(letter, score)`` pair, or ``None`` when there is
            nothing to add.

    Returns:
        The same ``candidate_list`` object.
    """
    if curr_guess is None:
        return candidate_list

    letter = curr_guess[0]
    weight = curr_guess[1]
    if letter not in candidate_list:
        candidate_list[letter] = weight
    else:
        candidate_list[letter] += weight
    return candidate_list
def generate_letter(full_dict, guess, lm, d, guessed_letters, order = 3):
    """Choose the next letter to guess for the current board.

    Candidates are collected from up to three sources, and the letter with
    the highest accumulated score wins (the earliest-added letter on ties):

    1. If the board is still entirely blank, the most frequent unguessed
       letter among training words of the same length.
    2. For every window of the board that is a known pattern in ``lm``, the
       best-scoring unguessed letter for that pattern.
    3. If nothing was found, the same-length letter frequencies again, and
       after that the letter frequencies over every word in ``d``.

    Args:
        full_dict: Unused; kept so existing callers keep working.
        guess: Current board as a string with ``'_'`` for unknown positions.
        lm: Mapping from masked pattern to an iterable of ``(letter, score)``
            pairs.
        d: Mapping from word length to the list of training words of that
            length.
        guessed_letters: Letters that have already been tried.
        order: Window length used when looking patterns up in ``lm``.

    Returns:
        The chosen letter.

    Raises:
        ValueError: If no unguessed letter can be proposed from any source.
    """
    candidate_list = {}
    print (guess)

    def frequency_guess(words):
        # Most frequent not-yet-guessed letter across ``words`` (or None).
        counts = collections.Counter("".join(words)).most_common()
        return check_guessed(counts, guessed_letters)

    # .get() keeps a defaultdict from growing an empty bucket for a word
    # length that never occurred in training.
    same_length_words = d.get(len(guess), [])
    used_length_prior = False

    ## the first guess is based on the letter frequencies of all words with
    ## the length of the target word
    if len(set(guess)) == 1 and guess[0] == '_':
        candidate_list = update_candidate(
            candidate_list, frequency_guess(same_length_words))
        used_length_prior = True

    ## look up every window of the current board in the language model
    for i in range(len(guess)):
        stem = guess[i:i+order]
        if stem in lm:
            ranked = sorted(lm[stem], key=lambda item: item[1], reverse=True)
            candidate_list = update_candidate(
                candidate_list, check_guessed(ranked, guessed_letters))

    ## no pattern matched: fall back to the same-length letter frequencies
    ## (skipped when they were already tried above and yielded nothing)
    if not candidate_list and not used_length_prior:
        candidate_list = update_candidate(
            candidate_list, frequency_guess(same_length_words))

    ## still nothing (e.g. a word length never seen in training): fall back
    ## to the letter frequencies of the whole training dictionary
    if not candidate_list:
        every_word = (word for words in d.values() for word in words)
        candidate_list = update_candidate(
            candidate_list, frequency_guess(every_word))

    # Previously max() failed here with an opaque "empty sequence" error.
    if not candidate_list:
        raise ValueError(
            "no unguessed letter available for pattern %r" % (guess,))

    return max(candidate_list, key=candidate_list.get)
def play(answer, lm, d, order, nTrials=6):
    """Play one game against ``answer`` and report the outcome.

    Args:
        answer: Target word in spaced form, i.e. every letter followed by one
            space (for example ``"c a t "``).
        lm: Language model forwarded to ``generate_letter``.
        d: Length -> words mapping forwarded to ``generate_letter``.
        order: Window length forwarded to ``generate_letter``.
        nTrials: Number of wrong guesses allowed before the game is lost.

    Returns:
        ``(guess, flag)`` where ``guess`` is the final board in the same
        spaced form as ``answer`` and ``flag`` is True when every letter was
        revealed.
    """
    board = "_ " * (len(answer) // 2)
    visible = board[::2].replace(" ", "")
    # Module-level word list; it is only forwarded to generate_letter.
    full_dict = full_dictionary
    tried = []
    misses = 0
    turn = 0
    solved = False
    separator = "-------------"

    while misses < nTrials:
        c = generate_letter(full_dict, visible, lm, d, tried, order)
        tried.append(c)

        if c in answer:
            # Reveal the letter at every position where it occurs.
            for pos, ch in enumerate(answer):
                if ch == c:
                    board = board[:pos] + c + board[pos + 1:]
        else:
            misses += 1

        # The progress line shows the board as it was before this guess.
        print(separator)
        print(turn, misses, c, ': ', visible)
        print(separator)

        visible = board[::2]
        if '_' not in visible:
            solved = True
            break
        turn += 1

    return board, solved
def build_dictionary(dictionary_file_location):
    """Read a word list from a text file, one word per line.

    Args:
        dictionary_file_location: Path of the dictionary file.

    Returns:
        list of the file's lines with line endings removed.
    """
    # The context manager closes the file even if reading raises.
    with open(dictionary_file_location, "r") as text_file:
        return text_file.read().splitlines()
# Load the complete training word list.
full_dictionary_location = "words_250000_train.txt"
full_dictionary = build_dictionary(full_dictionary_location)

# Train on 95% of the words and hold out the remaining 5% as test answers.
full_dic, answers = train_test_split(full_dictionary, test_size=0.05)

# Masked 5-gram model over the space-separated training words.
lm = train_char_lm(" ".join(full_dic), 5)

# Group the training words by length for the frequency-based fallback.
d = defaultdict(list)
for word in full_dic:
    d[len(word)].append(word)

N = len(answers)
success = 0
for answer in answers:
    # play() expects every letter to be followed by a single space.
    answer = " ".join(answer) + " "
    res, flag = play(answer, lm, d, order=5)
    if not flag:
        print("failed!, the answer is " + answer + " the guess is " + res)
        continue
    success += 1
    print("success!, the answer is " + res)

acc = success / float(N) * 100
print("success rate is %0.2f%%" % acc)
# The author reports a success rate of about 51% here.
通过这个方法,在每个单词有 6 条命(6 lives)的情况下,正确率大概可以提高到 50% 左右。以上是我参加 trex 机试时的上机题,有空再写详细点吧。