见2021 week12 task1
建立字典来承载文本文档的内容,此段代码要放在dictionary.py文件里。
通过从hash_table文件引入已经实现好的LinearProbeHashTable,同时也要从Python标准库的typing模块里引入Tuple,还要再引入timeit来计时。
from hash_table import LinearProbeHashTable
from typing import Tuple
import timeit
class Dictionary:
    """Case-insensitive set of words stored as keys of a hash table.

    Every word is lower-cased before it is stored or looked up, and the value
    stored against each word is always ``1``.
    """

    DEFAULT_ENCODING = 'utf-8'

    def __init__(self, hash_base: int, table_size: int) -> None:
        """Create an empty dictionary.

        :param hash_base: forwarded unchanged to ``LinearProbeHashTable``.
        :param table_size: forwarded unchanged to ``LinearProbeHashTable``.
        """
        self.hash_table = LinearProbeHashTable(hash_base, table_size)

    def load_dictionary(self, filename: str, time_limit: int = None) -> int:
        """Add the words of a one-word-per-line text file to the dictionary.

        Each line is stripped of surrounding whitespace and lower-cased, so
        that loaded words follow the same convention as ``add_word`` and can
        be found by ``find_word``. Lines that are empty after stripping are
        skipped and not counted.

        :param filename: path of the file, read as ``DEFAULT_ENCODING``.
        :param time_limit: maximum number of seconds the load may take;
            ``None`` (the default) disables the check.
        :return: the number of words stored (duplicates are counted each time
            they appear in the file).
        :raises TimeoutError: if the elapsed time exceeds ``time_limit``. Words
            stored before the limit was hit remain in the table.
        :raises OSError: propagated from ``open`` (e.g. ``FileNotFoundError``).
        """
        start_time = timeit.default_timer()
        words = 0
        with open(filename, 'r', encoding=Dictionary.DEFAULT_ENCODING) as file:
            for line in file:
                word = line.strip().lower()
                if not word:
                    # A blank line is not a word; storing '' would make
                    # find_word('') succeed.
                    continue
                self.hash_table[word] = 1
                # The limit is checked after each insertion, so a slow table
                # is detected as soon as it falls behind.
                if time_limit is not None and timeit.default_timer() - start_time > time_limit:
                    raise TimeoutError("Exceeded time limit: " + str(time_limit))
                words += 1
        return words

    def add_word(self, word: str) -> None:
        """Store ``word`` (lower-cased) in the dictionary."""
        self.hash_table[word.lower()] = 1

    def find_word(self, word: str) -> bool:
        """Return whether ``word`` (compared in lower case) is in the dictionary."""
        return word.lower() in self.hash_table

    def delete_word(self, word: str) -> None:
        """Remove ``word`` (lower-cased) from the dictionary.

        Whatever the underlying table raises for a missing key is propagated.
        """
        del self.hash_table[word.lower()]
def process_option(dictionary: Dictionary, method_name: str) -> None:
    """Prompt for the input one menu option needs, run it and print the outcome.

    :param dictionary: the dictionary the option operates on.
    :param method_name: one of ``'read_file'``, ``'add_word'``,
        ``'find_word'`` or ``'delete_word'``. Any other value still prompts
        for a word but then does nothing.
    """
    if method_name == 'read_file':
        filename = input('Enter filename: ')
        try:
            dictionary.load_dictionary(filename)
        except FileNotFoundError as e:
            print(e)
        else:
            print('Successfully read file')
        return

    word = input('Enter word: ')
    if method_name == 'add_word':
        # The word is added exactly once, inside the try, so that an
        # IndexError from the dictionary is reported instead of propagating.
        try:
            dictionary.add_word(word)
        except IndexError as e:
            print('[{}] {}'.format(word, e))
        else:
            print('[{}] {}'.format(word, 'Successfully added'))
    elif method_name == 'find_word':
        if dictionary.find_word(word):
            print('[{}] {}'.format(word, 'Found in dictionary'))
        else:
            print('[{}] {}'.format(word, 'Not found in dictionary'))
    elif method_name == 'delete_word':
        try:
            dictionary.delete_word(word)
        except KeyError:
            print('[{}] {}'.format(word, 'Not found in dictionary'))
        else:
            print('[{}] {}'.format(word, 'Deleted from dictionary'))
def menu(dictionary: Dictionary):
    """Run the interactive text menu for ``dictionary`` until 'Exit' is chosen.

    The options are numbered from 1 in the order they are declared below. An
    answer that is not an integer in that range is reported and the menu is
    shown again; any valid answer other than the exit number is dispatched to
    ``process_option`` under its key.
    """
    menu_options = {'read_file': 'Read File',
                    'add_word': 'Add Word',
                    'find_word': 'Find Word',
                    'delete_word': 'Delete Word',
                    'exit': 'Exit'}
    option_keys = list(menu_options)
    exit_option = option_keys.index('exit') + 1
    separator = '---------------------'

    option = None
    while option != exit_option:
        print(separator)
        for number, label in enumerate(menu_options.values(), start=1):
            print('{}. {}'.format(number, label))
        print(separator)
        try:
            option = int(input("Enter option: "))
            if not 1 <= option <= exit_option:
                raise ValueError('Option must be between 1 and ' + str(exit_option))
        except ValueError as e:
            print('[{}] {}'.format('menu', e))
            continue
        if option != exit_option:
            process_option(dictionary, option_keys[option - 1])
    print("---------------------")
if __name__ == '__main__':
    # When run as a script, start the interactive menu on a new Dictionary
    # built with hash base 31 and table size 250727.
    dictionary = Dictionary(31, 250727)
    menu(dictionary)
将此段代码放在frequency.py文件里:
from enum import Enum
from string import punctuation
from dictionary import Dictionary
from hash_table import LinearProbeHashTable
class Rarity(Enum):
    """Rarity band assigned to a word by ``Frequency.rarity``."""

    COMMON = 0
    UNCOMMON = 1
    RARE = 2
    MISSPELT = 3


class Frequency:
    """Counts how often dictionary words occur in the text files it is given.

    Attributes:
        hash_table: maps each counted word (lower case) to its occurrence count.
        dictionary: the ``Dictionary`` used to decide which tokens are words.
        max_word: ``(word, count)`` of the most frequent word seen so far, or
            ``('', 0)`` before anything has been counted.
    """

    def __init__(self) -> None:
        """Create the count table and load ``english_large.txt`` as the dictionary.

        The dictionary load is given a time limit of 10 seconds; whatever
        ``Dictionary.load_dictionary`` raises is propagated.
        """
        self.hash_base = 27183
        self.table_size = 250727
        self.hash_table = LinearProbeHashTable(self.hash_base, self.table_size)
        self.dictionary = Dictionary(self.hash_base, self.table_size)
        self.dictionary.load_dictionary('english_large.txt', 10)
        self.max_word = ('', 0)

    def add_file(self, filename: str) -> None:
        """Count the dictionary words that occur in a UTF-8 text file.

        The text is split on whitespace; each token has leading and trailing
        punctuation removed and is lower-cased. Tokens that become empty or
        are not in the dictionary are ignored. Counts accumulate across calls.

        One pass over the tokens of the file, with one dictionary lookup and
        one table update per token.

        :param filename: path of the file to read.
        """
        with open(filename, mode='r', encoding='utf-8') as f:
            tokens = f.read().split()

        for token in tokens:
            word = token.strip(punctuation).lower()
            # A token made only of punctuation (e.g. "--") strips to ''.
            if not word or not self.dictionary.find_word(word):
                continue
            count = self.hash_table[word] + 1 if word in self.hash_table else 1
            self.hash_table[word] = count
            # Checked on first insertion too, so that max_word is correct
            # even when no word occurs more than once.
            if count > self.max_word[1]:
                self.max_word = (word, count)

    def rarity(self, word: str) -> Rarity:
        """Classify ``word`` by its count relative to the most frequent word.

        With ``m`` the highest count recorded in ``max_word``:

        * ``MISSPELT`` if the word has never been counted,
        * ``COMMON`` if its count is at least ``max(m / 100, 1)``,
        * ``UNCOMMON`` if its count is at least ``max(m / 1000, 1)``,
        * ``RARE`` otherwise.

        The lookup is case-insensitive because counts are keyed by lower-case
        words. Cost is a single table lookup.

        :param word: the word to classify.
        :return: the matching ``Rarity`` member.
        """
        word = word.lower()
        # Membership is tested first so that an unseen word yields MISSPELT
        # rather than whatever the table raises for a missing key.
        cnt = self.hash_table[word] if word in self.hash_table else 0
        if cnt == 0:
            return Rarity.MISSPELT

        highest = self.max_word[1]
        if cnt >= max(highest / 100, 1):
            return Rarity.COMMON
        if cnt >= max(highest / 1000, 1):
            return Rarity.UNCOMMON
        return Rarity.RARE
def frequency_analysis() -> None:
    """Entry point called when this module is run as a script.

    Not implemented yet: the body is a placeholder that does nothing.
    """
    # TODO
    pass
if __name__ == '__main__':
    # Run the analysis only when executed as a script, not on import.
    frequency_analysis()
有几点需要注意:#TODO部分是根据要求新添加的功能。例如,把原来的raise NotImplementedError替换为:
self.hash_base = 27183
self.table_size = 250727
定义好hash_base和table_size的大小。
定义__init__后,执行实例化的过程就是调用Frequency()(如果__init__还声明了其他参数,则要写成Frequency(arg1)的形式)。新建的实例本身,连带调用时给出的参数,会一并传给__init__函数,并自动执行它。所以__init__函数的参数列表会在开头多出一项,它永远指代新建的那个实例对象;Python语法要求这个参数必须要有,而名称随意,习惯上就命名为self。
接下来要使用已经定义好的hash_table.py、list_adt.py和referential_array.py。
使用test_frequency.py文件来测试字典对于文本文档内容的承载以及修改是否成功,通过传入不同的参数来测试各个方法的定义是否正确。
"""Unit Testing for Task 1 and 2"""
import unittest
import sys
from hash_table import LinearProbeHashTable
from frequency import Frequency, Rarity
class TestFrequency(unittest.TestCase):
    """Unit tests for ``Frequency``.

    ``Frequency()`` loads ``english_large.txt`` and the tests read
    ``215-0.txt`` and ``84-0.txt``, all by relative path, so the tests must be
    run from a directory containing those files.
    """

    def setUp(self) -> None:
        """Build a fresh ``Frequency`` so counts never leak between tests."""
        self.frequency = Frequency()

    def test_init(self) -> None:
        """A new instance has a linear-probe count table and a loaded dictionary."""
        self.assertIsInstance(self.frequency.hash_table, LinearProbeHashTable)
        self.assertTrue(self.frequency.dictionary.find_word('test'))

    def test_add_file(self) -> None:
        """Counts are recorded per word and accumulate over successive files."""
        # Expected values are specific to the contents of the two data files.
        self.frequency.add_file('215-0.txt')
        self.assertEqual(self.frequency.hash_table['warm'], 2)
        self.frequency.add_file('84-0.txt')
        self.assertEqual(self.frequency.hash_table['warm'], 11)

    def test_rarity(self) -> None:
        """Words are banded relative to the most frequent word in the file."""
        self.frequency.add_file('215-0.txt')
        self.assertEqual(self.frequency.rarity('warm'), Rarity.UNCOMMON)
        self.assertEqual(self.frequency.rarity('the'), Rarity.COMMON)
if __name__ == '__main__':
    # Run the test cases defined in this module when executed as a script.
    unittest.main()