这里还是使用 sklearn,不多做解释:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace by
# the site module at startup, so sys is reloaded first to get it back. The
# order of the next two statements therefore matters.
reload(sys)
sys.setdefaultencoding("utf8")
# The default encoding is switched to UTF-8 above to avoid UnicodeEncodeError.
#get all file names in the "ParentFolder"
def GetFilesInFolder(ParentFolder):
    """Return the names of every entry directly inside ``ParentFolder``.

    Each name is printed to stdout as it is collected.  The names are bare
    entry names (not joined with ``ParentFolder``) and come back in whatever
    order ``os.listdir`` yields them; subdirectory names are included, since
    no file/directory filtering is done.

    Args:
        ParentFolder: Path of the directory to list.

    Returns:
        A list of entry names; empty when the directory is empty.

    Raises:
        OSError: If ``ParentFolder`` does not exist or cannot be read.
    """
    # Kept as a local import so the function does not rely on the module
    # header importing os.
    import os

    filenameList = []
    for filename in os.listdir(ParentFolder):
        # Parenthesised single-argument form: prints the same text under
        # Python 2 (where it is a print statement) and parses under Python 3.
        print(filename)
        filenameList.append(filename)
    return filenameList
# Accumulator for the per-file loop that follows (presumably one item per
# input file -- confirm against the loop body).
dataList = []

# Directory holding the input files, relative to the current working directory.
ParentFolder = "wikiData"

# Names of the entries found in that directory (also printed by the helper).
filenameList = GetFilesInFolder(ParentFolder)
for fileName in filenameList:
f=open(ParentFolder+"/"+fileName,"r")
f