人工智能作业homework5------naive bayes/svm文本分类

naive bayes/svm算法解释网上很多,这里主要讲讲文本分类。         


这里主要集中在英文分类,基于词袋模型(bag of words),而中文文本分类还有一个分词问题。

数据集是twenty-news-groups(即20 Newsgroups新闻组数据集)。


c/c++代码参考了http://blog.sina.com.cn/s/blog_6b36e6750100zf6l.html,但其测试文本比较简单,并且是直接写在main中的。当时要求对twenty-news-groups进行K折交叉验证,我用了system函数进行文件操作,导致速度很慢,还有内存泄露问题,所以c的代码就不贴了。后来从github找了一份python代码稍作修改;其实python的sklearn本身就内置了twenty-news-groups数据集的加载接口。


NB_SVM.py

import sys

import sklearn.cross_validation
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.svm


def refine_single_email(email):
	"""
	Strip unwanted header fields from a single email.

	Drops header lines that begin with 'Path:', 'Newsgroups:' or 'Xref:'.
	Once a line starting with 'Lines:' has been seen, the header is
	considered finished and every remaining line is kept verbatim
	(look at the dataset: 'Lines:' is the last header field we care about).
	parameter is a string.
	returns a string.
	"""
	header_done = False
	kept = []
	for line in email.split('\n'):
		if header_done:
			kept.append(line)
		else:
			# str.startswith accepts a tuple of prefixes
			if not line.startswith(('Path:', 'Newsgroups:', 'Xref:')):
				kept.append(line)
			if line.startswith('Lines:'):
				header_done = True
	return '\n'.join(kept)

def refine_all_emails(file_data):
	"""
	Does `refine_single_email` for every single email included in the list.
	The list is modified in place.
	parameter is a list of strings
	returns NOTHING!
	"""
	# enumerate is the idiomatic replacement for zip(range(len(...)), ...)
	for i, email in enumerate(file_data):
		file_data[i] = refine_single_email(email)


def bagOfWords(files_data):
	"""
	Converts a list of strings (which are loaded from files) to a BOW
	(bag-of-words term count) representation of it.
	parameter 'files_data' is a list of strings
	returns a sparse term-count matrix (`scipy.sparse` format, one row per document)
	"""
	# Bug fix: the module-level imports never pull in
	# `sklearn.feature_extraction`, so attribute access on the bare
	# `sklearn` package can raise AttributeError. Import it explicitly.
	import sklearn.feature_extraction.text
	count_vector = sklearn.feature_extraction.text.CountVectorizer()
	return count_vector.fit_transform(files_data)

def cross_validation(data, target, classifier, cv=5):
	"""
	Run k-fold cross validation with the given classifier.

	parameters:
		- `data`: array-like, shape=[n_samples, n_features]
			Training vectors
		- `target`: array-like, shape=[n_samples]
			Target values for corresponding training vectors
		- `classifier`: A classifier from the scikit-learn family would work!
		- `cv`: number of times to do the cross validation. (default=5)
	return a list of numbers, where the length of the list is equal to `cv` argument.
	"""
	scorer = sklearn.cross_validation.cross_val_score
	return scorer(classifier, data, target, cv=cv)


def pretty_print_scores(scores):
	"""
	Prints the mean and std of a collection of scores.
	parameter `scores` is an array of numbers exposing .mean() and .std()
	(e.g. the numpy array returned by cross_val_score).
	"""
	# Bug fix: the original passed leftover termcolor arguments
	# ('magenta', 'on_white', ...) directly to print, which printed a
	# tuple instead of colored text. Print just the message.
	print("                                      ")
	print(" Mean accuracy: %0.3f (+/- %0.3f std) " % (scores.mean(), scores.std() / 2))
	print("                                      ")

def test_classifier(X, y, clf, test_size=0.4, y_names=None, confusion=False):
	"""
	Train `clf` on a random train/test split of (X, y) and print an evaluation.

	parameters:
		- `X`: feature matrix, shape=[n_samples, n_features]
		- `y`: labels, shape=[n_samples]
		- `clf`: a classifier from the scikit-learn family
		- `test_size`: fraction of samples held out for testing (default=0.4)
		- `y_names`: optional class names for the classification report
		- `confusion`: if True, print a confusion matrix instead of a report
	"""
	# train-test split
	print('test size is: %2.0f%%' % (test_size * 100))
	X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=test_size)

	clf.fit(X_train, y_train)
	y_predicted = clf.predict(X_test)

	# Bug fix: the original passed a stray termcolor argument ('magenta')
	# directly to print, which printed a tuple instead of a colored heading.
	if not confusion:
		print('Classification report:')
		print(sklearn.metrics.classification_report(y_test, y_predicted, target_names=y_names))
	else:
		print('Confusion Matrix:')
		print(sklearn.metrics.confusion_matrix(y_test, y_predicted))


def main():
	"""
	Load a 4-category subset of the 20-newsgroups dataset from a folder given
	on stdin, build TF-IDF features, then evaluate Naive Bayes and a linear
	SVM with 5-fold cross validation and a held-out test split.
	"""
	# get the dataset path from the user
	print("\nPlease input the path of 4 categories folder:\n")
	ans = sys.stdin.readline()
	# remove any newlines or spaces at the end of the input
	container_path = ans.strip('\n').rstrip(' ')

	#container_path="F:\\PycharmProjects\\SupporVectorMachine\\data"

	categories = ['c1_alt.atheism','c2_sci.crypt','c3_talk.politics.guns','c4_comp.sys.mac.hardware']
	# Bug fixes: the keyword was misspelled 'charse_error' (a TypeError at
	# runtime; this sklearn generation spells it 'charset_error' -- newer
	# releases renamed the pair to encoding/decode_error), and `categories`
	# was built but never passed to load_files.
	files = sklearn.datasets.load_files(container_path, categories=categories, charset='utf-8', charset_error='ignore')
	refine_all_emails(files.data)
	word_counts = bagOfWords(files.data)
	# TF-IDF weighting on top of the raw term counts
	tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit(word_counts)
	X = tf_transformer.transform(word_counts)
	# classifiers under comparison
	clf_NB = sklearn.naive_bayes.MultinomialNB()
	clf_SVM = sklearn.svm.LinearSVC()
	# evaluation for NB
	print("\nEvaluation for Naive Bayes\n")
	scores_NB = cross_validation(X, files.target, clf_NB, cv=5)
	pretty_print_scores(scores_NB)
	test_classifier(X, files.target, clf_NB, test_size=0.2, y_names=files.target_names, confusion=False)
	# evaluation for SVM ("Supported Vector Machine" typo fixed)
	print("\nEvaluation for Support Vector Machine\n")
	scores_SVM = cross_validation(X, files.target, clf_SVM, cv=5)
	pretty_print_scores(scores_SVM)
	test_classifier(X, files.target, clf_SVM, test_size=0.2, y_names=files.target_names, confusion=False)

# Run the full classification pipeline only when executed as a script.
if __name__ == '__main__':
	main()


main.py

import NB_SVM

def main():
    """Entry point: delegate to the NB_SVM module's pipeline."""
    NB_SVM.main()

# Run the classification pipeline when this wrapper is executed directly.
if __name__ == '__main__':
	main()



  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值