Includes preprocessing; uses TF-IDF for term weighting.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/05/15 上午10:48
import itertools
import re
import jieba
from six.moves import xrange
from sklearn.feature_extraction.text import TfidfVectorizer
def load_stopwords():
    # Load the stop-word list (one UTF-8 encoded word per line) into a frozenset.
    path = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/stopwords'
    with open(path, 'rb') as fin:
        content = fin.read().decode('utf-8')
    return frozenset(content.splitlines())
stopwords = load_stopwords()
# Tokens may contain only ASCII alphanumerics, underscores, or CJK ideographs.
# (u'...' instead of the Python-2-only ur'...' so the pattern also compiles on Python 3.)
chinese = re.compile(u'^[0-9a-zA-Z_\u4e00-\u9fa5]+$')
def filter_func(word):
    # Keep a token only if it matches the character whitelist above
    # and is not a stop word.
    return chinese.match(word) is not None and word not in stopwords
def my_tokenizer(sentence):
    # Segment with jieba, then drop stop words and unwanted tokens.
    # A list (rather than a lazy filter object) keeps Python 2/3 behaviour identical.
    words = jieba.lcut(sentence)
    return [word for word in words if filter_func(word)]
def word_and_weight(corpus):
    vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, norm='l1')
    # Cut off after "tfidf_" in the original; the rest is reconstructed from the
    # function name: fit on the corpus, return the vocabulary and L1-normalised weights.
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return vectorizer.get_feature_names(), tfidf_matrix.toarray()
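# A minimal usage sketch, not part of the original script: the two sentences
# below are made-up sample data, and the loop prints every non-zero term
# weight per document (each row sums to 1 because of norm='l1').
if __name__ == '__main__':
    corpus = [u'我喜欢自然语言处理', u'自然语言处理和机器学习都很有趣']
    words, weights = word_and_weight(corpus)
    for row in weights:
        for idx in xrange(len(words)):
            if row[idx] > 0:
                print('%s\t%.4f' % (words[idx], row[idx]))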