import re

def tokenlize(content):
    # content = content.decode()
    content = content.replace("'s", " is")    # expand contractions: "it's" -> "it is"
    content = content.replace("n't", " not")  # "don't" -> "do not"
    content = re.sub("<.*?>", " ", content)   # strip HTML tags such as <br />
    # characters to drop; raw strings escape the regex metacharacters ( ) . ?
    filters = [r"\(", r"\)", "\t", "\n", "\x97", "\x96", "#", "$", "%", "\x93",
               "&", r"\.", ",", "!", ":", "\"", "'", r"\?", "\x95"]
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens
Note where the content passed to this function comes from in main.py:
import os
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    ws = Word2Sequence()  # the project's own vocabulary builder, defined elsewhere
    path = r"D:\data\Desktop\aclImdb_v1.tar\aclImdb_v1\aclImdb\train"
    temp_data_path = [os.path.join(path, "pos"), os.path.join(path, "neg")]
    for data_path in temp_data_path:
        file_paths = [os.path.join(data_path, file_name)
                      for file_name in os.listdir(data_path)
                      if file_name.endswith("txt")]
        for file_path in tqdm(file_paths):
            sentence = tokenlize(open(file_path, 'rb').read())
            ws.fit(sentence)
    ws.build_vocab(min=10, max_feature=5000)
    pickle.dump(ws, open("../pythonProject/ws.pkl", 'wb'))
    print(ws)
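Word2Sequence itself is not part of this post; its real implementation lives elsewhere in the project. For context only, a minimal sketch consistent with the fit()/build_vocab(min=..., max_feature=...) calls above might look like this (the counting logic here is an assumption, not the author's code):

class Word2Sequence:
    # minimal sketch of a vocabulary builder (assumed behavior)
    def __init__(self):
        self.count = {}  # word -> raw frequency, accumulated by fit()
        self.dict = {}   # word -> index, filled by build_vocab()

    def fit(self, sentence):
        # accumulate word frequencies across all tokenized sentences
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max_feature=None):
        # keep words seen at least `min` times, cap the vocabulary at `max_feature`
        words = [w for w, c in self.count.items() if c >= min]
        if max_feature is not None:
            words = sorted(words, key=lambda w: self.count[w], reverse=True)[:max_feature]
        self.dict = {word: index for index, word in enumerate(words)}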
The call
tokenlize(open(file_path, 'rb').read())
passes a byte stream read straight from disk, so the data arriving in tokenlize() is of type bytes.
It must first be converted to str with decode() before the replace/regex substitution work on the text can proceed:
content = content.decode()
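A minimal repro makes the failure concrete (the byte string is a made-up sample): calling a str-style replace on a bytes object raises a TypeError, and decoding first makes it work.

raw = b"It's a test"           # what open(file_path, 'rb').read() returns
try:
    raw.replace("'s", " is")   # str arguments on a bytes object
except TypeError as e:
    print(e)                   # a bytes-like object is required, not 'str'

text = raw.decode()            # bytes -> str (UTF-8 by default)
print(text.replace("'s", " is"))  # It is a test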
The updated tokenlize() function:
def tokenlize(content):
    content = content.decode()                # bytes -> str (UTF-8 by default)
    content = content.replace("'s", " is")
    content = content.replace("n't", " not")
    content = re.sub("<.*?>", " ", content)   # strip HTML tags such as <br />
    filters = [r"\(", r"\)", "\t", "\n", "\x97", "\x96", "#", "$", "%", "\x93",
               "&", r"\.", ",", "!", ":", "\"", "'", r"\?", "\x95"]
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens
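A quick sanity check with a made-up review snippet (not taken from the dataset) exercises the whole pipeline: decode, contraction expansion, tag stripping, punctuation removal, lowercasing.

raw = b"I didn't like it.<br /><br />It's bad!"
print(tokenlize(raw))
# ['i', 'did', 'not', 'like', 'it', 'it', 'is', 'bad']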