读取 tar.gz 压缩包
这种读取方式不太理想
# Load every data-bearing member of the 20 Newsgroups archive into `text`,
# one list of lines per member, keeping only lines longer than 5 characters.
# `path` is the current working directory with a trailing separator.
path = os.getcwd() + os.path.sep
text = []
# The context manager closes the archive even if reading a member fails.
with tarfile.open(path + '20news-19997.tar.gz', mode='r:gz') as tar:
    for member in tar.getmembers():
        f = tar.extractfile(member)
        # extractfile() returns None for members with no file data
        # (e.g. directories), so those are skipped.
        if f is None:
            continue
        with f:
            content = f.read()  # raw bytes
        # Undecodable bytes are dropped rather than raising.
        lines = content.decode('utf-8', errors='ignore').splitlines()
        # Drop lines of 5 characters or fewer.
        text.append([line for line in lines if len(line) > 5])