Conclusion:
Data:
ham_data.txt
spam_data.txt
stop_words.utf8
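ham_data.txt and spam_data.txt hold the legitimate (ham) and spam messages, and stop_words.utf8 is a UTF-8 stop-word list. A minimal loading sketch, assuming one message per line (the file layout and the helper name get_data are assumptions, not stated in the source):

import random

def get_data(ham_path="ham_data.txt", spam_path="spam_data.txt"):
    # Hypothetical loader: one message per line is an assumption
    with open(ham_path, encoding="utf8") as f:
        ham_texts = [line.strip() for line in f if line.strip()]
    with open(spam_path, encoding="utf8") as f:
        spam_texts = [line.strip() for line in f if line.strip()]
    corpus = ham_texts + spam_texts
    labels = [0] * len(ham_texts) + [1] * len(spam_texts)  # 0 = ham, 1 = spam
    # Shuffle so ham and spam are interleaved before any train/test split
    paired = list(zip(corpus, labels))
    random.shuffle(paired)
    corpus, labels = map(list, zip(*paired))
    return corpus, labels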
Data processing:
"""
@author: liushuchun
"""
import re
import string
import jieba
# Load the stop-word list (strip newlines so membership tests match)
with open("dict/stop_words.utf8", encoding="utf8") as f:
    stopword_list = [word.strip() for word in f.readlines()]


def tokenize_text(text):
    # Segment the text into tokens with jieba
    tokens = jieba.cut(text)
    tokens = [token.strip() for token in tokens]
    return tokens


def remove_special_characters(text):
    # Strip ASCII punctuation from each token, then drop tokens left empty;
    # fullwidth Chinese punctuation is left for the stop-word filter
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def remove_stopwords(text):
    # Drop any token found in the stop-word list
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    # Keep tokens space-separated so the vectorizer can split them later
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            # Optionally return token lists instead of joined strings
            text = tokenize_text(text)
        normalized_corpus.append(text)
    return normalized_corpus
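A quick usage sketch (the two example messages are invented; the exact output depends on jieba's segmentation and on the contents of stop_words.utf8):

corpus = ["您好,我司可代开发票,价格优惠!",
          "明天上午十点开会,请准时参加。"]
norm_corpus = normalize_corpus(corpus)
print(norm_corpus)  # punctuation and stop words removed, tokens space-separated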
Feature extraction:
"""
@author: liushuchun
"""
from sklearn.feature_extraction.text import CountVectorizer
def bow_extractor(corpus, ngram_range=(1, 1)):
    # Fit a bag-of-words model and return it with the term-count matrix
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
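For instance, on a tiny already-normalized corpus (made-up sentences):

toy_corpus = ['代开 发票 价格 优惠', '明天 上午 开会 准时 参加']
bow_vectorizer, bow_features = bow_extractor(toy_corpus)
print(bow_vectorizer.get_feature_names_out())  # learned vocabulary (get_feature_names() on sklearn < 1.0)
print(bow_features.todense())                  # document-term count matrix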
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    # Re-weight raw term counts with TF-IDF (smoothed IDF, L2-normalized rows)
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
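Chaining the two steps, reusing bow_features from the sketch above:

tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
print(tfidf_features.todense())  # each row is an L2-normalized TF-IDF vector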
The listing ends mid-import here; presumably a one-step TfidfVectorizer-based extractor followed, mirroring bow_extractor. A minimal reconstruction (the name tfidf_extractor and its parameters are assumptions, not verbatim source):
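from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_extractor(corpus, ngram_range=(1, 1)):
    # Fit TF-IDF directly on the raw corpus in a single step
    # (reconstruction; not verbatim from the source)
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features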