利用 Python 切片处理文本非常方便。下面是一个简单的例子,用于进行分词统计。
(需要读取的文件为 UTF-8 编码,运行环境为 Windows,Python 版本为 Python 3。)
# -*- coding: utf-8 -*-
import re
import os
Total = 0; # total number of letters
words = []
# Collect all words
# Open Data.txt for reading as UTF-8 text (default read mode).
# NOTE(review): no close() is visible in this part of the script - confirm the handle is released later.
readfile = open('Data.txt', encoding = 'utf-8')
for line in readfile.readlines():
lineArr = line.strip().split()
for word in lineArr:
data = re.findall(r'[a-zA-Z]*', word)
for