NLTK machine learning: a complete text-classification walkthrough

import csv

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def preprocessing(text):
    # text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # lowercase first, so that capitalized stopwords are matched as well
    tokens = [token.lower() for token in tokens]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


# Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/
sms = open(r'C:\Users\pc\Desktop\SMSSpamCollection', encoding='utf8')  # check the structure of this file!
sms_data = []
sms_labels = []
csv_reader = csv.reader(sms, delimiter='\t')
for line in csv_reader:
    # first column is the label (ham/spam)
    sms_labels.append(line[0])
    # second column is the message text, cleaned by preprocessing()
    sms_data.append(preprocessing(line[1]))
sms.close()

# 6.3 Sampling: 70:30 train/test split
import numpy as np

trainset_size = int(round(len(sms_data) * 0.70))
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array(sms_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
# slice from trainset_size (not trainset_size + 1), otherwise one sample is silently dropped
x_test = np.array(sms_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])
print(x_train)
print(y_train)

from sklearn.feature_extraction.text import CountVectorizer

# bag-of-words demo on the full corpus (sms_data is already preprocessed,
# so the second preprocessing pass from the original is unnecessary)
vectorizer = CountVectorizer(min_df=1, encoding='utf-8')
X_exp = vectorizer.fit_transform(sms_data)
print("||".join(vectorizer.get_feature_names_out()))  # get_feature_names() in scikit-learn < 1.0
print(X_exp.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

# 6.3.1 Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print(y_nb_predicted)
print('\n confusion_matrix \n')
cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)
print('\n Here is the classification report:')
print(classification_report(y_test, y_nb_predicted))

# least/most informative features; feature_log_prob_ replaces the coef_
# attribute that older scikit-learn versions exposed on MultinomialNB
feature_names = vectorizer.get_feature_names_out()
coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
n = 20
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

# 6.3.2 Decision tree
from sklearn import tree

clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_tree_predicted = clf.predict(X_test.toarray())
print(y_tree_predicted)
print('\n Here is the classification report:')
print(classification_report(y_test, y_tree_predicted))

# 6.3.3 Stochastic gradient descent
from sklearn.linear_model import SGDClassifier

# max_iter replaces the n_iter parameter removed from newer scikit-learn
clf = SGDClassifier(alpha=0.0001, max_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_pred))
print(' \n confusion_matrix \n')
cm = confusion_matrix(y_test, y_pred)
print(cm)

# 6.3.4 Logistic regression
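# The original post leaves this section empty after the heading. Below is a
# minimal illustrative sketch in the style of the surrounding sections,
# assuming scikit-learn's LogisticRegression with default settings; log_clf
# and y_log_predicted are names introduced here, not from the original.
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression().fit(X_train, y_train)
y_log_predicted = log_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_log_predicted))
cm = confusion_matrix(y_test, y_log_predicted)
print(cm)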
# 6.3.5 Support vector machine
from sklearn.svm import LinearSVC

svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_svm_predicted))
cm = confusion_matrix(y_test, y_svm_predicted)  # was y_pred (the SGD predictions) in the original
print(cm)

# 6.4 Random forest
from sklearn.ensemble import RandomForestClassifier

RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, predicted))
cm = confusion_matrix(y_test, predicted)  # was y_pred (the SGD predictions) in the original
print(cm)

# 6.5 Text clustering: K-means
from collections import defaultdict
from sklearn.cluster import KMeans, MiniBatchKMeans

true_k = 5
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                        init_size=1000, batch_size=1000, verbose=2)
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)
print("For K-means clustering ")
# map each cluster label to the indices of the documents it contains
clustering = defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print("For K-means mini-batch clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)

# 6.6 Topic modeling in text
# https://pypi.python.org/pypi/gensim#downloads
from gensim import corpora, models
from nltk.corpus import stopwords

documents = list(sms_data)
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
print(texts)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
# print(lsi.print_topics(20))
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
for i in range(0, n_topics):
    # show_topic returns (term, weight) pairs for topic i
    terms = [term for term, weight in lda.show_topic(i, 10)]
    print("Top 10 terms for topic #" + str(i) + ": " + ",".join(terms))
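# Not in the original: a quick end-to-end sanity check on unseen text. A raw
# message is preprocessed the same way as the training data, vectorized with
# the fitted TfidfVectorizer, and classified with the trained LinearSVC from
# section 6.3.5. The two sample messages below are invented for illustration.
new_messages = [
    "WINNER!! You have been selected to receive a cash prize, call now!",
    "Are we still meeting for lunch tomorrow?",
]
new_vectors = vectorizer.transform(preprocessing(m) for m in new_messages)
for message, label in zip(new_messages, svm_classifier.predict(new_vectors)):
    print('%s => %s' % (label, message))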