NLTK machine learning: a complete text-classification walkthrough

import csv

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def preprocessing(text):
    # text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # lowercase first, so that capitalized stopwords are matched as well
    tokens = [token.lower() for token in tokens]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


# Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/
sms = open(r'C:\Users\pc\Desktop\SMSSpamCollection', encoding='utf8')  # check the structure of this file!
sms_data = []
sms_labels = []
csv_reader = csv.reader(sms, delimiter='\t')
for line in csv_reader:
    # first column is the label (ham/spam)
    sms_labels.append(line[0])
    # second column is the message text, cleaned by preprocessing()
    sms_data.append(preprocessing(line[1]))
sms.close()

# 6.3 Sampling: 70:30 train/test split
import numpy as np

trainset_size = int(round(len(sms_data) * 0.70))
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array(sms_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
# slice from trainset_size (not trainset_size + 1), otherwise one sample is silently dropped
x_test = np.array(sms_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])
print(x_train)
print(y_train)

from sklearn.feature_extraction.text import CountVectorizer

# bag-of-words demo on the full corpus (sms_data is already preprocessed,
# so the second preprocessing pass from the original is unnecessary)
vectorizer = CountVectorizer(min_df=1, encoding='utf-8')
X_exp = vectorizer.fit_transform(sms_data)
print("||".join(vectorizer.get_feature_names_out()))  # get_feature_names() in scikit-learn < 1.0
print(X_exp.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

# 6.3.1 Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print(y_nb_predicted)
print('\n confusion_matrix \n')
cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)
print('\n Here is the classification report:')
print(classification_report(y_test, y_nb_predicted))

# least/most informative features; feature_log_prob_ replaces the coef_
# attribute that older scikit-learn versions exposed on MultinomialNB
feature_names = vectorizer.get_feature_names_out()
coefs_with_fns = sorted(zip(clf.feature_log_prob_[1], feature_names))
n = 20
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

# 6.3.2 Decision tree
from sklearn import tree

clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_tree_predicted = clf.predict(X_test.toarray())
print(y_tree_predicted)
print('\n Here is the classification report:')
print(classification_report(y_test, y_tree_predicted))

# 6.3.3 Stochastic gradient descent
from sklearn.linear_model import SGDClassifier

# max_iter replaces the n_iter parameter removed from newer scikit-learn
clf = SGDClassifier(alpha=0.0001, max_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_pred))
print(' \n confusion_matrix \n')
cm = confusion_matrix(y_test, y_pred)
print(cm)

# 6.3.4 Logistic regression
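# The original post leaves this section empty after the heading. Below is a
# minimal illustrative sketch in the style of the surrounding sections,
# assuming scikit-learn's LogisticRegression with default settings; log_clf
# and y_log_predicted are names introduced here, not from the original.
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression().fit(X_train, y_train)
y_log_predicted = log_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_log_predicted))
cm = confusion_matrix(y_test, y_log_predicted)
print(cm)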
# 6.3.5 Support vector machine
from sklearn.svm import LinearSVC

svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_svm_predicted))
cm = confusion_matrix(y_test, y_svm_predicted)  # was y_pred (the SGD predictions) in the original
print(cm)

# 6.4 Random forest
from sklearn.ensemble import RandomForestClassifier

RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, predicted))
cm = confusion_matrix(y_test, predicted)  # was y_pred (the SGD predictions) in the original
print(cm)

# 6.5 Text clustering: K-means
from collections import defaultdict
from sklearn.cluster import KMeans, MiniBatchKMeans

true_k = 5
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                        init_size=1000, batch_size=1000, verbose=2)
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)
print("For K-means clustering ")
# map each cluster label to the indices of the documents it contains
clustering = defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print("For K-means mini-batch clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)

# 6.6 Topic modeling in text
# https://pypi.python.org/pypi/gensim#downloads
from gensim import corpora, models
from nltk.corpus import stopwords

documents = list(sms_data)
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
print(texts)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
# print(lsi.print_topics(20))
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
for i in range(0, n_topics):
    # show_topic returns (term, weight) pairs for topic i
    terms = [term for term, weight in lda.show_topic(i, 10)]
    print("Top 10 terms for topic #" + str(i) + ": " + ",".join(terms))
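# Not in the original: a quick end-to-end sanity check on unseen text. A raw
# message is preprocessed the same way as the training data, vectorized with
# the fitted TfidfVectorizer, and classified with the trained LinearSVC from
# section 6.3.5. The two sample messages below are invented for illustration.
new_messages = [
    "WINNER!! You have been selected to receive a cash prize, call now!",
    "Are we still meeting for lunch tomorrow?",
]
new_vectors = vectorizer.transform(preprocessing(m) for m in new_messages)
for message, label in zip(new_messages, svm_classifier.predict(new_vectors)):
    print('%s => %s' % (label, message))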