naive bayes/svm算法解释网上很多,这里主要讲讲文本分类。
这里主要集中在英文分类,基于词袋模型(bag of words),而中文文本分类还有一个分词问题。
数据集是twenty-news-groups,
c/c++代码是参考http://blog.sina.com.cn/s/blog_6b36e6750100zf6l.html,但是测试文本有点简单,并且是直接写在main中的。当时要求对twenty-news-groups进行K折交叉验证,我用了system函数进行文件操作,导致速度很慢,还有内存泄露问题,所以c的代码就不贴了。后来从github找了个python代码稍微改了改;其实python的sklearn已经封装了twenty-news-groups数据集的加载和常用分类算法。
NB_SVM.py
import sys

import sklearn.cross_validation
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.svm
def refine_single_email(email):
    """
    Strip routing headers from a single raw email string.

    Drops header lines beginning with 'Path:', 'Newsgroups:' or 'Xref:'.
    Once a line starting with 'Lines:' is seen, the header section is
    considered over and every remaining line is kept verbatim
    (look at the dataset: 'Lines:' is the last header we care about).

    parameter is a string; returns the filtered string.
    """
    kept = []
    in_header = True
    for line in email.split('\n'):
        if not in_header:
            # past the header: keep the body untouched
            kept.append(line)
            continue
        if not line.startswith(('Path:', 'Newsgroups:', 'Xref:')):
            kept.append(line)
        if line.startswith('Lines:'):
            in_header = False
    return '\n'.join(kept)
def refine_all_emails(file_data):
    """
    Apply `refine_single_email` to every email in the list, in place.

    parameter `file_data` is a list of strings; each entry is replaced
    by its header-stripped version.
    returns NOTHING!
    """
    # enumerate replaces the original zip(range(len(...)), ...) anti-idiom
    for i, email in enumerate(file_data):
        file_data[i] = refine_single_email(email)
def bagOfWords(files_data):
    """
    Convert a list of document strings into a bag-of-words count matrix.

    parameter 'files_data' is a list of strings (loaded from files).
    returns a sparse term-count matrix (`scipy.sparse.coo_matrix`).
    """
    vectorizer = sklearn.feature_extraction.text.CountVectorizer()
    counts = vectorizer.fit_transform(files_data)
    return counts
def cross_validation(data, target, classifier, cv=5):
    """
    Score `classifier` with k-fold cross validation.

    parameters:
    - `data`: array-like, shape=[n_samples, n_features]
        Training vectors
    - `target`: array-like, shape=[n_samples]
        Target values for corresponding training vectors
    - `classifier`: any scikit-learn style estimator
    - `cv`: number of folds for the cross validation (default=5)
    returns an array of `cv` accuracy scores.
    """
    scores = sklearn.cross_validation.cross_val_score(
        classifier, data, target, cv=cv)
    return scores
def pretty_print_scores(scores):
"""
Prints mean and std of a list of scores, pretty and colorful!
parameter `scores` is a list of numbers.
"""
print (" ", 'white', 'on_white')
print (" Mean accuracy: %0.3f (+/- %0.3f std) " % (scores.mean(), scores.std() / 2), 'magenta', 'on_white')
print (" ", 'white', 'on_white')
def test_classifier(X, y, clf, test_size=0.4, y_names=None, confusion=False):
    """
    Train `clf` on a random train split of (X, y) and report on the rest.

    parameters:
    - `X`: array-like, shape=[n_samples, n_features] feature matrix
    - `y`: array-like, shape=[n_samples] target labels
    - `clf`: a scikit-learn style classifier
    - `test_size`: fraction of samples held out for testing (default=0.4)
    - `y_names`: optional list of class names used in the report
    - `confusion`: if True print a confusion matrix, otherwise a
      per-class precision/recall/F1 classification report
    """
    # train-test split
    # BUG FIX: headers such as ('Classification report:', 'magenta') were
    # printed as tuples (leftover termcolor arguments); print plain strings.
    print('test size is: %2.0f%%' % (test_size * 100))
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        X, y, test_size=test_size)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    if not confusion:
        print('Classification report:')
        print(sklearn.metrics.classification_report(
            y_test, y_predicted, target_names=y_names))
    else:
        print('Confusion Matrix:')
        print(sklearn.metrics.confusion_matrix(y_test, y_predicted))
def main():
    """
    Load a 4-category subset of twenty-newsgroups, build TF-IDF features,
    then evaluate Naive Bayes and linear SVM classifiers on them.

    The dataset folder path is read from stdin; it is expected to contain
    one sub-folder per category:
    c1_alt.atheism, c2_sci.crypt, c3_talk.politics.guns,
    c4_comp.sys.mac.hardware
    """
    # get the dataset path from the user
    print("\nPlease input the path of 4 categories folder:\n")
    ans = sys.stdin.readline()
    # remove any newlines or spaces around the input
    container_path = ans.strip()
    # container_path = "F:\\PycharmProjects\\SupportVectorMachine\\data"

    # BUG FIX: the keyword was misspelled 'charse_error'; the correct
    # sklearn.datasets.load_files argument is 'charset_error'
    # (renamed to encoding/decode_error in later sklearn versions).
    files = sklearn.datasets.load_files(container_path,
                                        charset='utf-8',
                                        charset_error='ignore')
    refine_all_emails(files.data)
    word_counts = bagOfWords(files.data)

    # TF-IDF weighting of the raw term counts
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    # cross validation
    clf_NB = sklearn.naive_bayes.MultinomialNB()
    clf_SVM = sklearn.svm.LinearSVC()

    # evaluation for NB
    print("\nEvaluation for Naive Bayes\n")
    scores_NB = cross_validation(X, files.target, clf_NB, cv=5)
    pretty_print_scores(scores_NB)
    test_classifier(X, files.target, clf_NB, test_size=0.2,
                    y_names=files.target_names, confusion=False)

    # evaluation for SVM (fixed typo: was "Supported Vector Machine")
    print("\nEvaluation for Support Vector Machine\n")
    scores_SVM = cross_validation(X, files.target, clf_SVM, cv=5)
    pretty_print_scores(scores_SVM)
    test_classifier(X, files.target, clf_SVM, test_size=0.2,
                    y_names=files.target_names, confusion=False)

if __name__ == '__main__':
    main()
import NB_SVM
def main():
    """Entry point: delegate all work to the NB_SVM module."""
    NB_SVM.main()


if __name__ == '__main__':
    main()