hangman游戏

挑战是编写一个算法来玩 Hangman(猜单词)游戏。你的算法应该优于我们提供的 baseline 算法,理想情况下准确率超过 50%。随本邮件附上了训练字典文件,以及一个 Jupyter 笔记本模板,演示如何编写、运行并提交你的算法。

游戏的规则,给定单词长度,让你猜单词,猜对的字母可以保留,猜错的字母将用掉你一条命,每个单词你有六条命。

模型背景,已知游戏规则, 还有一个单词字典作为训练集。

这个游戏有两种思路:一种比较依赖 GPU 硬件,即用 RNN 在字典上训练模型来做 NLP 预测;另一种是基于游戏规则,从训练字典中提取统计信息,再用这些先验信息来预测字典外的单词。由于硬件限制,我用的是后者:以单词长度、已猜对字母所在的位置、尚未猜出的字母所在的位置等作为特征进行训练。

// A highlighted block

 #my code
def update_counter(lm, data, order):
    """Count which character follows each ``order``-long context in ``data``.

    A window slides over ``data`` one character at a time: the ``order``
    characters inside the window are the context and the character right
    after it is the observation. A pair is skipped when the context
    contains a space or the observed character is a space.

    Args:
        lm: nested counter indexed as ``lm[context][char]``; updated in place.
        data: text to scan.
        order: context length in characters.

    Returns:
        The same ``lm`` object that was passed in.
    """
    for start in range(len(data) - order):
        stop = start + order
        context = data[start:stop]
        following = data[stop]
        if ' ' in context or following == ' ':
            continue
        lm[context][following] += 1
    return lm

def train_char_lm(dic, order=3):
    """Build a masked n-gram letter model from a training string.

    Every window of ``order`` characters that contains neither a space nor
    an underscore is expanded into all patterns obtained by replacing
    between 1 and ``order - 1`` of its positions with ``'_'``. For each
    pattern, the most frequent masked character (the earliest one on ties)
    is counted as the letter hidden behind the pattern.

    Args:
        dic: training text; the script in this file passes the training
            words joined by single spaces.
        order: window length in characters.

    Returns:
        A dict mapping each masked pattern to a list of
        ``(letter, probability)`` tuples. The probabilities for one pattern
        are its counts divided by the pattern's total count.
    """
    lm = defaultdict(Counter)
    data = dic
    # "+ 1" so the final window (the tail of the last word) is visited too;
    # range(len(data) - order) silently dropped it.
    for start in range(len(data) - order + 1):
        gram = data[start:start + order]
        if ' ' in gram or '_' in gram:
            # Window spans a word boundary or already holds a placeholder.
            continue
        # Mask from the most hidden positions (order - 1) down to one.
        for n_masked in range(order - 1, 0, -1):
            for masked in combinations(range(len(gram)), n_masked):
                history = "".join(
                    '_' if idx in masked else ch for idx, ch in enumerate(gram)
                )
                hidden = "".join(
                    ch for idx, ch in enumerate(gram) if idx in masked
                )
                # A pattern is credited with a single letter: the most
                # common of the characters it hides.
                char = Counter(hidden).most_common()[0][0]
                lm[history][char] += 1

    def normalize(counter):
        # Turn raw counts into relative frequencies.
        total = float(sum(counter.values()))
        return [(c, cnt / total) for c, cnt in counter.items()]

    return {hist: normalize(chars) for hist, chars in lm.items()}
def check_guessed(sorted_letter_count, guessed_letters):
    """Return the first listed letter that has not been guessed yet.

    Args:
        sorted_letter_count: iterable of ``(letter, score)`` pairs in
            priority order; the callers in this file pass them sorted with
            the highest score first.
        guessed_letters: letters that were already tried.

    Returns:
        The first ``(letter, score)`` pair whose letter is not in
        ``guessed_letters`` and is not the placeholder ``'_'``, or ``None``
        when no such pair exists.
    """
    for letter, instance_prob in sorted_letter_count:
        # Compare by value: ``is not '_'`` tests object identity, which only
        # works by accident of string interning and triggers a SyntaxWarning
        # on Python 3.8+.
        if letter != '_' and letter not in guessed_letters:
            return (letter, instance_prob)
    return None

def update_candidate(candidate_list, curr_guess):
    """Add one proposed letter and its score to the candidate table.

    Args:
        candidate_list: dict mapping letter -> accumulated score; updated
            in place.
        curr_guess: ``(letter, score)`` pair, or ``None`` when there is
            nothing to add.

    Returns:
        The same ``candidate_list`` object.
    """
    if curr_guess is None:
        return candidate_list

    letter = curr_guess[0]
    score = curr_guess[1]
    if letter not in candidate_list:
        candidate_list[letter] = score
    else:
        # The letter was proposed before: accumulate its score.
        candidate_list[letter] += score
    return candidate_list

def generate_letter(full_dict, guess, lm, d, guessed_letters, order = 3):
    """Choose the next letter to guess for a partially revealed word.

    Candidates come from every ``order``-long slice of ``guess`` that is a
    key of ``lm``: each such slice proposes its highest-probability letter
    that has not been guessed, and the scores of a letter proposed by
    several slices are summed. The highest-scoring letter wins.

    When no slice yields a candidate, the function falls back, in order, to:
    the most frequent unguessed letter among training words of the same
    length, the most frequent unguessed letter among all words in ``d``,
    and finally the first unguessed letter of the lowercase alphabet.
    Previously an empty candidate table (for example a word length absent
    from ``d``) made ``max()`` raise an opaque ``ValueError``.

    The current ``guess`` is printed as a side effect.

    Args:
        full_dict: unused; kept so existing callers keep working.
        guess: current word state, one character per position, with ``'_'``
            for letters that are still hidden.
        lm: mapping pattern -> list of ``(letter, probability)`` tuples, as
            returned by ``train_char_lm``.
        d: mapping word length -> list of training words of that length.
        guessed_letters: letters that were already tried.
        order: slice length used to look patterns up in ``lm``.

    Returns:
        The letter to guess next.

    Raises:
        ValueError: if every fallback is exhausted, i.e. all lowercase
            ASCII letters have already been guessed.
    """
    def most_frequent_unguessed(words):
        # Best unguessed letter by raw occurrence count across ``words``.
        ranked = collections.Counter("".join(words)).most_common()
        return check_guessed(ranked, guessed_letters)

    candidate_list = {}
    print (guess)
    # .get() so that a plain dict without this length does not raise KeyError.
    same_length_words = d.get(len(guess), [])

    # First guess (nothing revealed yet): rely on letter frequency among
    # training words of the same length.
    if len(set(guess)) == 1 and guess[0] == '_':
        candidate_list = update_candidate(
            candidate_list, most_frequent_unguessed(same_length_words))

    # Look every slice of the current state up in the language model.
    for i in range(len(guess)):
        stem = guess[i:i+order]
        if stem in lm:
            ranked = sorted(lm[stem], key=lambda item: item[1], reverse=True)
            candidate_list = update_candidate(
                candidate_list, check_guessed(ranked, guessed_letters))

    # No pattern matched: same-length letter frequency.
    if not candidate_list:
        candidate_list = update_candidate(
            candidate_list, most_frequent_unguessed(same_length_words))

    # No training word of this length (or all its letters were tried):
    # letter frequency across every length in ``d``.
    if not candidate_list:
        all_words = [word for words in d.values() for word in words]
        candidate_list = update_candidate(
            candidate_list, most_frequent_unguessed(all_words))

    if not candidate_list:
        # Last resort. Assumes the words use lowercase ASCII letters --
        # TODO confirm against the dictionary file.
        for letter in 'abcdefghijklmnopqrstuvwxyz':
            if letter not in guessed_letters:
                return letter
        raise ValueError("no unguessed letter left to propose")

    return max(candidate_list, key=lambda k: candidate_list[k])

def play(answer, lm,  d, order, nTrials=6):
    """Play one hangman round against ``answer``.

    The board is kept in a spaced layout: ``"_ "`` repeated
    ``len(answer) // 2`` times, so ``answer`` is expected in the same layout
    (the script in this file passes each letter followed by a space). Every
    second character of the board forms the compact state handed to
    ``generate_letter``.

    Progress is printed after each guess; the printed state is the one from
    before the current guess was applied.

    Args:
        answer: the target word in spaced layout.
        lm: language model forwarded to ``generate_letter``.
        d: word-length -> words mapping forwarded to ``generate_letter``.
        order: pattern length forwarded to ``generate_letter``.
        nTrials: number of wrong guesses allowed.

    Returns:
        ``(guess, flag)``: the final board in spaced layout, and whether no
        ``'_'`` remained in the compact state before ``nTrials`` wrong
        guesses were used up.
    """
    guess = "_ " * (len(answer) // 2)
    guess_clean = guess[::2].replace(" ", "")
    # Read from the module-level global; it must exist before play() is called.
    full_dict = full_dictionary
    guessed_letters = []
    errors = 0
    count = 0
    while errors < nTrials:
        c = generate_letter(full_dict, guess_clean, lm, d, guessed_letters, order)
        guessed_letters.append(c)
        if c in answer:
            # Reveal the letter at every position where it occurs.
            for j, char in enumerate(answer):
                if char == c:
                    guess = guess[:j] + c + guess[j + 1:]
        else:
            errors += 1
        print ("-------------")
        print (count, errors, c, ':  ', guess_clean)
        print ("-------------")
        guess_clean = guess[::2]
        if '_' not in guess_clean:
            return guess, True
        count += 1
    return guess, False
def build_dictionary(dictionary_file_location):
    """Read a text file and return its lines as a list of strings.

    Args:
        dictionary_file_location: path of the file to read.

    Returns:
        The file's lines without their line terminators, in file order.
    """
    # ``with`` closes the file even when reading raises; the previous
    # explicit close() was skipped on error.
    with open(dictionary_file_location, "r") as text_file:
        return text_file.read().splitlines()

# Script section: load the word list, train the model, and measure the
# success rate on held-out words.
# NOTE(review): defaultdict, Counter, collections, combinations and
# train_test_split are used in this file but no import is visible in this
# snippet -- confirm they are imported elsewhere (train_test_split is
# presumably sklearn.model_selection.train_test_split).
full_dictionary_location = "words_250000_train.txt"
# full_dictionary is also read as a global inside play().
full_dictionary = build_dictionary(full_dictionary_location)
# 95% of the words are used for training, 5% are held out as test answers.
full_dic, answers = train_test_split(full_dictionary, test_size = 0.05)
# Train the masked 5-gram model on the training words joined by spaces.
lm = train_char_lm(" ".join(full_dic), 5)
# Group the training words by length.
d=defaultdict(list)
for word in full_dic:
    d[len(word)].append(word)
N = len(answers)
success = 0
for answer in answers:
    # play() works on a spaced layout: each letter followed by a space.
    answer = " ".join(answer) + " "
    res, flag = play(answer, lm, d, order = 5)
    if flag:
        success += 1
        print ("success!, the answer is " + res)
    else:
        print("failed!, the answer is " + answer + " the guess is " + res)
# Success rate as a percentage of the held-out words.
acc = success/(N*1.0)*100
print ("success rate is %0.2f%%"%acc)
# The author reports a success rate of about 51% here.
通过这个方法,在每个单词有 6 条命(最多猜错 6 次)的情况下,正确率大概可以提高到 50% 左右。以上是我做 trex 机试时的上机题,有空再写详细点吧。

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值