实验框架图见《libsvm文本分类:二分类(二) 实验框架图》一文。
下面是主模块代码,暂不公布全部代码。
代码:
# -*- coding: cp936 -*-
# coding gb2312
from SVM import FoldersCreation
import os
# #############################################################################################
# Experiment parameters
N = 100 # N: half of total corpus size
vfold = 5 # vfold: number of cross-validation rounds
featureDimension = 2000 # featureDimension: feature dimensionality of the VSM model
toCalInfoGain = 0 # whether to compute information gain of the bag-of-words vocabulary; 1 means do NOT compute it
count_done_research_times = 0 # number of experiment runs already completed
# N, count_done_research_times: arguments for CorpusPartition.moveAccordingPartition
# featureDimension, toCalInfoGain, 2*N/vfold: arguments for FeatureSelectionModel.featureSelectionIG
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# #############创建文件夹########################################################################
os.mkdir(r ' D:\TextCategorization ' )
FoldersCreation.CreateAssist()
print ' 创建文件夹模块运行结束 '
print ' *************************************************************************** '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ###############处理文档集合,对文档集合进行划分,区分测试集合和训练集合###############################
from SVM import CorpusPartition
CorpusPartition.MoveCorpus(N)
CorpusPartition.moveAccordingPartition(N,count_done_research_times)
print ' 分割文本集模块运行结束 '
print ' ******************************************************************* '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ########################文档集合分词##########################################################
from SVM import DataManager
from ctypes import *
import os
import cPickle as p
import re
roots = [r ' D:\TextCategorization\training ' ,r ' D:\TextCategorization\testing ' ]
rootfinals = [r ' D:\TextCategorization\segmented ' ,r ' D:\TextCategorization\tsegmented ' ]
# root=r'D:\TextCategorization\training'
# rootfinal=r'D:\TextCategorization\segmented'
for i in range(0, 2 ):
dm = DataManager.DataManager(roots[i])
subdir = dm.GetSubDir()
filepathstotalsrc = []
for sub in subdir:
dm.SetFilePathsFromsubDir(roots[i] + os.sep + sub)
filepaths = dm.GetFilePaths()
filepathsassist = [sub + os.sep + path for path in filepaths ]
filepathstotalsrc = filepathstotalsrc + filepathsassist
for path in filepathstotalsrc:
myfile = file(roots[i] + os.sep + path)
s = myfile.read()
myfile.close()
dll = cdll.LoadLibrary( " ICTCLAS30.dll " )
dll.ICTCLAS_Init(c_char_p( " . " ))
bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(s),0)
segmented = c_char_p(bSuccess).value
segmentedtmp = re.sub( " \s+ " , ' | ' ,segmented,0)
segmentedfinal = re.sub( ' \xa1\xa1 ' , '' ,segmentedtmp)
fid = file(rootfinals[i] + os.sep + path, ' w ' )
fid.write(segmentedfinal)
fid.close()
dll.ICTCLAS_Exit()
# print 'finalfinish congratulations!'
print ' 文档集分词模块运行结束 '
print ' ********************************************************************** '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# #################建立词袋子模型######################################################################
from SVM import BagOfWordsConstruction
BagOfWordsConstruction.BagOfWordsConstruction(r ' D:\TextCategorization\segmented ' )
print ' 建立词袋子模型模块运行结束 '
print ' *********************************************************************************** '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ######################特征词选择##################################################################
from SVM import FeatureSelectionModel
featurewords = FeatureSelectionModel.featureSelectionIG(featureDimension,toCalInfoGain, 2 * N / vfold) # feature
import cPickle as mypickle
fid = file(r ' D:\TextCategorization\VITData\keywords.dat ' , ' w ' )
mypickle.dump(featurewords,fid)
fid.close()
print ' 特征词选择模块运行结束 '
print ' ******************************************************************************************* '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ######################文档向量模型建立模块##############################################################
from SVM import VSMformation
root1 = r ' D:\TextCategorization\segmented '
root2 = r ' D:\TextCategorization\tsegmented '
print ' begin..... '
VSMformation.LibSVMFormat(r ' D:\TextCategorization\data\train.libsvm ' ,root1)
print ' 训练语料库转化完毕 '
VSMformation.LibSVMFormat(r ' D:\TextCategorization\data\test.libsvm ' ,root2)
print ' 测试语料库转化完毕 '
print ' 文档向量模型建立模块运行结束 '
print ' 批处理完毕,congratulations! '
# coding gb2312
from SVM import FoldersCreation
import os
# #############################################################################################
# Experiment parameters
N = 100 # N: half of total corpus size
vfold = 5 # vfold: number of cross-validation rounds
featureDimension = 2000 # featureDimension: feature dimensionality of the VSM model
toCalInfoGain = 0 # whether to compute information gain of the bag-of-words vocabulary; 1 means do NOT compute it
count_done_research_times = 0 # number of experiment runs already completed
# N, count_done_research_times: arguments for CorpusPartition.moveAccordingPartition
# featureDimension, toCalInfoGain, 2*N/vfold: arguments for FeatureSelectionModel.featureSelectionIG
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# #############创建文件夹########################################################################
os.mkdir(r ' D:\TextCategorization ' )
FoldersCreation.CreateAssist()
print ' 创建文件夹模块运行结束 '
print ' *************************************************************************** '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ###############处理文档集合,对文档集合进行划分,区分测试集合和训练集合###############################
from SVM import CorpusPartition
CorpusPartition.MoveCorpus(N)
CorpusPartition.moveAccordingPartition(N,count_done_research_times)
print ' 分割文本集模块运行结束 '
print ' ******************************************************************* '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ########################文档集合分词##########################################################
from SVM import DataManager
from ctypes import *
import os
import cPickle as p
import re
roots = [r ' D:\TextCategorization\training ' ,r ' D:\TextCategorization\testing ' ]
rootfinals = [r ' D:\TextCategorization\segmented ' ,r ' D:\TextCategorization\tsegmented ' ]
# root=r'D:\TextCategorization\training'
# rootfinal=r'D:\TextCategorization\segmented'
for i in range(0, 2 ):
dm = DataManager.DataManager(roots[i])
subdir = dm.GetSubDir()
filepathstotalsrc = []
for sub in subdir:
dm.SetFilePathsFromsubDir(roots[i] + os.sep + sub)
filepaths = dm.GetFilePaths()
filepathsassist = [sub + os.sep + path for path in filepaths ]
filepathstotalsrc = filepathstotalsrc + filepathsassist
for path in filepathstotalsrc:
myfile = file(roots[i] + os.sep + path)
s = myfile.read()
myfile.close()
dll = cdll.LoadLibrary( " ICTCLAS30.dll " )
dll.ICTCLAS_Init(c_char_p( " . " ))
bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(s),0)
segmented = c_char_p(bSuccess).value
segmentedtmp = re.sub( " \s+ " , ' | ' ,segmented,0)
segmentedfinal = re.sub( ' \xa1\xa1 ' , '' ,segmentedtmp)
fid = file(rootfinals[i] + os.sep + path, ' w ' )
fid.write(segmentedfinal)
fid.close()
dll.ICTCLAS_Exit()
# print 'finalfinish congratulations!'
print ' 文档集分词模块运行结束 '
print ' ********************************************************************** '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# #################建立词袋子模型######################################################################
from SVM import BagOfWordsConstruction
BagOfWordsConstruction.BagOfWordsConstruction(r ' D:\TextCategorization\segmented ' )
print ' 建立词袋子模型模块运行结束 '
print ' *********************************************************************************** '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ######################特征词选择##################################################################
from SVM import FeatureSelectionModel
featurewords = FeatureSelectionModel.featureSelectionIG(featureDimension,toCalInfoGain, 2 * N / vfold) # feature
import cPickle as mypickle
fid = file(r ' D:\TextCategorization\VITData\keywords.dat ' , ' w ' )
mypickle.dump(featurewords,fid)
fid.close()
print ' 特征词选择模块运行结束 '
print ' ******************************************************************************************* '
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ######################文档向量模型建立模块##############################################################
from SVM import VSMformation
root1 = r ' D:\TextCategorization\segmented '
root2 = r ' D:\TextCategorization\tsegmented '
print ' begin..... '
VSMformation.LibSVMFormat(r ' D:\TextCategorization\data\train.libsvm ' ,root1)
print ' 训练语料库转化完毕 '
VSMformation.LibSVMFormat(r ' D:\TextCategorization\data\test.libsvm ' ,root2)
print ' 测试语料库转化完毕 '
print ' 文档向量模型建立模块运行结束 '
print ' 批处理完毕,congratulations! '