功能:读取已分词(词语之间以空格分隔)的文章,将每个不同的词语用一个唯一的数字表示。
用途:将词语转换成数字后,简化后续处理。如相似度计算、朴素贝叶斯分类。
下面贴代码:
运行命令:python **.py youzi_data data(其中 **.py 为本脚本的文件名;读取 youzi_data 文件夹下的所有文件,结果输出到 data.txt 中)
# encoding: utf-8
"""Replace every token of a pre-segmented corpus with an integer ID.

Usage:
    python <this_script>.py <input_dir> <output_name>

Every regular file directly inside <input_dir> is read as UTF-8 text whose
tokens are separated by whitespace.  The result is written to
"<output_name>.txt":

* one line per input file holding the space-separated IDs of its tokens;
* followed by one "token,id" line per distinct token, in ID order.
"""
import os
import sys

# Distinct tokens in first-seen order, and the token -> 1-based ID mapping.
# They stay at module level because the original script exposed them there;
# word_num() refills both in place on every call.
wordList = []
word_dic = {}


def word_num(inpath=None, out_file_name=None):
    """Number the tokens of every file in a directory and write the result.

    Args:
        inpath: Directory holding the segmented text files.  Defaults to
            ``sys.argv[1]`` so that a bare ``word_num()`` call keeps working.
        out_file_name: Output path without extension; ".txt" is appended.
            Defaults to ``sys.argv[2]``.

    Returns:
        A ``(file_count, unique_word_count)`` tuple.

    Raises:
        IndexError: If an argument is omitted and the command line does not
            provide it either.
        OSError: If the directory or one of the files cannot be accessed.
    """
    if inpath is None:
        inpath = sys.argv[1]
    if out_file_name is None:
        out_file_name = sys.argv[2]

    # Start from a clean vocabulary so repeated calls are independent.
    del wordList[:]
    word_dic.clear()

    file_count = 0
    with open(out_file_name + ".txt", "w", encoding="utf-8") as out_file:
        # Sorted so that the IDs are reproducible from run to run;
        # os.listdir() returns entries in arbitrary order.
        for filename in sorted(os.listdir(inpath)):
            path = os.path.join(inpath, filename)
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text; skip them.
                continue
            file_count += 1

            # Undecodable bytes are dropped, as the original decode(...,
            # 'ignore') did.
            with open(path, encoding="utf-8", errors="ignore") as infile:
                # split() without arguments discards empty tokens and any
                # stray whitespace, so "foo" and "foo\t" share one ID.
                words = infile.read().split()

            ids = []
            for word in words:
                if word not in word_dic:
                    # A new token gets the next free ID (1-based).
                    wordList.append(word)
                    word_dic[word] = len(wordList)
                ids.append(str(word_dic[word]))

            # One line per document keeps document boundaries visible and
            # keeps the vocabulary section below from being fused onto the
            # ID sequence.
            out_file.write(" ".join(ids) + "\n")

        # Vocabulary section: the token that each ID stands for.
        for word in wordList:
            out_file.write("%s,%s\n" % (word, word_dic[word]))

    print(file_count, "files load")
    print(len(wordList), "unique words found!")
    return file_count, len(wordList)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: python %s <input_dir> <output_name>" % sys.argv[0])
    word_num(sys.argv[1], sys.argv[2])