首先我的原始数据是这样的,关于爬虫请看http://blog.csdn.net/jemila/article/details/61196863
我的数据链接:http://pan.baidu.com/s/1hskNlEO 密码:dxv5
加载以下模块
# Script preamble (Python 2): imports for filesystem ops, Chinese word
# segmentation, and interpreter-level encoding setup.
import os
import jieba
import sys
# HACK: force Python 2's default str<->unicode codec to UTF-8 so implicit
# conversions of Chinese text don't raise UnicodeDecodeError. reload(sys)
# is required because site.py deletes sys.setdefaultencoding at startup.
# This is a well-known global workaround — it affects the whole process.
reload(sys)
sys.setdefaultencoding("utf-8")
# langconv supplies Converter, used below for Traditional -> Simplified
# Chinese conversion.
from langconv import *
加载 langconv 模块(上面的 `from langconv import *`)是为了进行简繁转换
加载停用词
# Load the stopword list: one word per line, GBK-encoded file.
# Use `with` so the file handle is always closed (the original leaked it),
# and strip the trailing newline from each entry before decoding to unicode.
with open(r'C:/Users/user/Desktop/stopword.txt') as f:
    stopwords = [line.replace("\n", "").decode("gbk") for line in f]
定义一个分词函数
def sent2word(sentence):
    """
    Segment a sentence into words with jieba, drop stopwords, and
    convert each remaining word to Simplified Chinese.

    :param sentence: raw sentence text (str/unicode)
    :return: list of unicode words, stopwords removed, simplified
    """
    newSent = []
    # jieba.cut is a generator; iterate it directly instead of copying
    # every token into an intermediate list first (the original built
    # segResult only to loop over it again).
    for word in jieba.cut(sentence):
        # NOTE(review): `stopwords` is a module-level list, so this
        # membership test is O(len(stopwords)) per word — a set would be
        # faster, but the list is defined elsewhere; left as-is.
        if word in stopwords:
            continue
        # Traditional -> Simplified conversion via langconv's Converter.
        # NOTE(review): .decode('utf-8') assumes `word` is a byte string;
        # if jieba yields unicode this relies on the setdefaultencoding
        # hack above — confirm against the jieba version in use.
        newSent.append(Converter('zh-hans').convert(word.decode('utf-8')))
    return newSent
定义一个新建文件夹函数
def mkdir(path):
# 引入模块
import os
# 去除首位空格
path&#