1.安装nltk
pip install nltk
2.下载文本到本地
----wordnet汉语开放词网,可从以下网址下载:
http://compling.hss.ntu.edu.sg/cow/
----停用词:参考以下网页,另外加入常用标点符号
http://blog.csdn.net/u010533386/article/details/51458591
3.下载WordNet语料库
import nltk
nltk.download()
---运行后,出现图形界面,选择第二项"all-corpora",然后点击 Download
4.代码
# encoding=utf-8
import jieba
import importlib, sys
import codecs
importlib.reload(sys)
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
def doSeg(filename, stopwords_path=r"D:\Work\Python学习\SVM\stop_words.txt"):
    """Segment a text file with jieba and drop stop words and blank tokens.

    Args:
        filename: Path of the text file to segment.
        stopwords_path: Path of the stop-word list, one word per line.
            Defaults to the location this script originally hard-coded.

    Returns:
        A list of the remaining tokens, in the order jieba produced them.
    """
    # NOTE(review): both files are read with the platform default encoding,
    # as before; confirm that this matches how the files were saved.
    # Read-only mode is sufficient; 'r+' needlessly required write access.
    with open(filename, 'r') as f:
        text = f.read()

    # A set gives O(1) membership tests, and the context manager closes the
    # handle that the original left open.
    with open(stopwords_path, "r") as stop_file:
        stopwords = {line.strip() for line in stop_file}

    blanks = {' ', '', '\n', "\n\n"}
    # Tokens and stop words are both str, so compare them directly. Encoding
    # the token to bytes first made the "not in" test always true, which
    # meant that no stop word was ever filtered out.
    return [
        seg for seg in jieba.cut(text)
        if seg not in stopwords and seg not in blanks
    ]
def loadWordNet(path=r"D:\Work\Python学习\SVM\cow-not-full.txt"):
    """Load (synset, lemma) pairs from a tab-separated wordnet file.

    Lines starting with '#' and blank lines are ignored. Each remaining line
    must have two fields (synset, lemma) or three (synset, lemma, status);
    a missing status is treated as 'Y'. Lines with any other field count are
    reported on stdout and skipped.

    Args:
        path: UTF-8 encoded data file. Defaults to the location this script
            originally hard-coded.

    Returns:
        A set of (synset, lemma) tuples whose status is accepted.
    """
    known = set()
    # The context manager guarantees the file is closed, even on error.
    with codecs.open(path, "rb", "utf-8") as f:
        for l in f:
            entry = l.strip()
            if l.startswith('#') or not entry:
                continue
            row = entry.split("\t")
            if len(row) == 3:
                synset, lemma, status = row
            elif len(row) == 2:
                synset, lemma = row
                status = 'Y'
            else:
                print("illformed line: " + entry)
                # Skip the line: without this the code fell through and
                # reused the previous line's fields (or raised NameError
                # when the very first data line was malformed).
                continue
            # NOTE(review): "0" is the digit zero; confirm against the data
            # file's status codes whether the letter "O" was intended.
            if status in ('Y', "0"):
                # set.add already ignores duplicates.
                known.add((synset.strip(), lemma.strip()))
    return known
def findWordNet(known, key):
    """Return the synset IDs of every entry in `known` whose lemma is `key`.

    Args:
        known: Iterable of (synset, lemma) pairs.
        key: Lemma to look up.

    Returns:
        A list with the first element of each pair whose second element
        equals `key`, in iteration order; empty when nothing matches.
    """
    return [entry[0] for entry in known if entry[1] == key]
def id2ss(ID):
    """Return the WordNet synset identified by `ID`.

    Args:
        ID: Identifier whose first eight characters are the numeric synset
            offset and whose last character is the part-of-speech tag.

    Returns:
        The synset object that NLTK's WordNet reader finds for that
        part of speech and offset.
    """
    pos = str(ID[-1:])
    offset = int(ID[:8])
    # Use the public lookup method; the underscore-prefixed name is only a
    # deprecated alias for it in NLTK.
    return wn.synset_from_pos_and_offset(pos, offset)
def getSenti(word):
    """Return the SentiWordNet entry for a synset.

    Args:
        word: Object exposing ``name()``; the script passes the synset
            returned by ``id2ss``.

    Returns:
        Whatever ``swn.senti_synset`` returns for that name.
    """
    synset_name = word.name()
    return swn.senti_synset(synset_name)
if __name__ == '__main__':
    # Script entry point: segment the file named on the command line, look
    # every token up in the wordnet table, and sum the per-token average
    # positive / negative SentiWordNet scores.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: <script> <text-file>")

    known = loadWordNet()
    words = doSeg(sys.argv[1])
    n = 0
    p = 0
    for word in words:
        ll = findWordNet(known, word)
        if not ll:
            # Token has no synset in the wordnet table; nothing to score.
            continue
        n1 = 0.0
        p1 = 0.0
        for wid in ll:
            swninfo = getSenti(id2ss(wid))
            p1 += swninfo.pos_score()
            n1 += swninfo.neg_score()
        # Average over all synsets found for this token.
        avg_n = n1 / len(ll)
        avg_p = p1 / len(ll)
        if p1 != 0.0 or n1 != 0.0:
            print(f"{word}-> n{avg_n}, p {avg_p}")
        p += avg_p
        n += avg_n
    print(f"n:{n}, p:{p}")
5.参考