Author: finallyliuyu. Please credit the source when reposting or otherwise using this material.
Purpose: quickly generate LibSVM-format data for any desired document-collection size, number of feature words, and number of cross-validation folds.
For a data demo, see: 新闻文本分类libsvm格式数据 (news text classification data in LibSVM format).
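For readers new to the target format: a LibSVM data file holds one document per line, written as a class label followed by sparse index:value feature pairs (zero-valued features are simply omitted), for example:

1 3:2 17:1 204:5
0 1:1 9:3 204:2

The files generated below follow this layout, with tabs between the pairs.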
I will not describe the individual submodules in detail here; they work along much the same lines as those in my (already completed) K-means text clustering series on this blog.
The code for each submodule is given below:
# -*- coding: cp936 -*-
########################################################################
#
# Automatically create the folder hierarchy
#
########################################################################
def CreateFolders(path):
    import os
    curpath=os.getcwd()
    os.mkdir(path)
    os.chdir(path)
    os.mkdir('class1')
    os.mkdir('class2')
    os.chdir(curpath)

def CreateAssist(toCalInfoGain):
    import os
    if toCalInfoGain==0:
        folders=[r'D:\TextCategorization\corpus',r'D:\TextCategorization\testing',r'D:\TextCategorization\training',r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
        for myfolder in folders:
            CreateFolders(myfolder)
        os.mkdir(r'D:\TextCategorization\VITdata')
        os.mkdir(r'D:\TextCategorization\data')
        os.mkdir(r'D:\TextCategorization\VITdata\data')
        os.mkdir(r'D:\TextCategorization\data\data')
        print 'finish,congratulations'

if __name__=="__main__":
    import os
    os.mkdir(r'D:\TextCategorization')
    CreateAssist(0)# pass 0 so the folders are actually created
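For orientation, running this module with toCalInfoGain==0 creates the following tree (read off the mkdir calls above):

D:\TextCategorization
    corpus\class1, corpus\class2          (the raw corpus)
    training\class1, training\class2      (training split)
    testing\class1, testing\class2        (test split)
    segmented\class1, segmented\class2    (word-segmented training split)
    tsegmented\class1, tsegmented\class2  (word-segmented test split)
    VITdata\data                          (bag-of-words, IG and keyword data)
    data\data                             (the generated .libsvm files)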
Document-set partitioning: deciding how many articles go into the training set and how many into the test set. Note that the parameter N is the number of articles per class in the whole document collection (training plus test). The two classes are given equal numbers of articles, so the whole collection contains 2N articles.
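A quick worked example: with N=500 per class and vfold=5, step = 500/5 = 100, so the run with count=0 moves articles 0.txt through 99.txt of each class into the training directory and the remaining 400 into the testing directory; count=1 would use 100.txt through 199.txt for training, and so on.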
# -*- coding: cp936 -*-
# This module pre-processes the text corpus.
###################################################
# origidir: source corpus directory, e.g. E:\新闻语料\EntireTrainingSet\ClassFile\C000024
# destidir: destination corpus directory, e.g. D:\corpus\class1
# N: number of text files to copy
#####################################################
def CorpusFormation(origidir,destidir,N):
    import os,shutil
    paths=[]
    for i in range(0,N):# build the paths of the files to be copied
        paths.append(origidir+'\\'+str(i)+'.txt')
    for mypath in paths:
        shutil.copy(mypath,destidir)
        #print 'finish %s'%mypath
#####################################################
# Copy the corpus into place
#####################################################
def MoveCorpus(N,toCalInfoGain):
    if toCalInfoGain==0:
        originaldirs=[r'E:\新闻语料\EntireTrainingSet\ClassFile\C000024',r'E:\新闻语料\EntireTrainingSet\ClassFile\C000013']
        destinationdirs=[r'D:\TextCategorization\corpus\class1',r'D:\TextCategorization\corpus\class2']
        for i in range(0,2):
            CorpusFormation(originaldirs[i],destinationdirs[i],N)
        print 'finish'
#####################################################################
# origidir: source corpus directory, e.g. D:\corpus\class1
# destdir1: destination directory, e.g. D:\training\class1
# destidir2: destination directory, e.g. D:\testing\class1
# vfold: number of cross-validation folds
# count: index of the current run (count = 0, 1, ..., vfold-1)
# N: number of articles per class in the corpus
#####################################################################
def CorpusPartition(origidir,destdir1,destdir2,count,N,vfold=5):
    import os,shutil
    step=N/vfold
    paths=[]
    for i in range(0+count*step,step+count*step):
        paths.append(origidir+'\\'+str(i)+'.txt')
    for mypath in paths:
        shutil.move(mypath,destdir1)
        #print 'finish %s'%mypath
    paths=[]
    pathstemp=os.listdir(origidir)
    for m in pathstemp:
        paths.append(origidir+os.sep+m)
    for mypath in paths:
        shutil.move(mypath,destdir2)
        #print 'finish %s'%mypath
    #print 'finalfinish congratulations!'

def moveAccordingPartition(N,count,toCalInfoGain):
    if toCalInfoGain==0:
        originaldirs=[r'D:\TextCategorization\corpus\class1',r'D:\TextCategorization\corpus\class2']
        destidirs1=[r'D:\TextCategorization\training\class1',r'D:\TextCategorization\training\class2']
        destidirs2=[r'D:\TextCategorization\testing\class1',r'D:\TextCategorization\testing\class2']
        for i in range(0,2):
            CorpusPartition(originaldirs[i],destidirs1[i],destidirs2[i],count,N)
            print 'class %s finished'%i

if __name__=="__main__":
    #MoveCorpus(500,0)
    N=500
    count=0
    moveAccordingPartition(N,count,0)
# -*- coding: cp936 -*-
# This module builds the bag-of-words model.
def BagOfWordsConstruction(root,toCalInfoGain):
    if toCalInfoGain==0:
        import cPickle as mypickle
        file_dest=file(r'D:\TextCategorization\VITdata\vocabularystatistics.dat','w')
        rawVSMMatrix=TrainingFileProcess(root)
        vocabularystatistics={}# maps each word to a list of (document index, in-document frequency) pairs
        templist=[]
        for rawVSM in rawVSMMatrix:
            templist=templist+rawVSM
        wordscollection=list(set(templist))
        for word in wordscollection:
            index=0
            for rawVSM in rawVSMMatrix:
                count=rawVSM.count(word)
                if count>0:
                    if word not in vocabularystatistics:
                        vocabularystatistics[word]=[]
                    vocabularystatistics[word].append((index,count))
                index=index+1
        mypickle.dump(vocabularystatistics,file_dest)
        print len(vocabularystatistics)
        file_dest.close()
        print 'BagOfWordsConstructionFinish'
############################################################################################
# Turn the content of an article into a list of words
def FilePreProcess(rawtext):
    import re
    listresult=rawtext.split("|")
    finalrawVSM=[]
    stopwordlist=FilterNoiseWord(r'C:\Python26\SVM\stopwords.txt')
    for m in listresult:
        # drop empty tokens, stop words, and GBK full-width punctuation (bytes \xa3\xa1 - \xa3\xfe)
        if m!='' and m not in stopwordlist and re.search('\xa3[\xa1-\xfe]',m)==None:
            finalrawVSM.append(m)
    return finalrawVSM
#################################################################################################
# Pre-process the training-set documents
def TrainingFileProcess(root):
    from SVM import DataManager
    import cPickle as mypickle
    import os
    rawVSMMatrix=[]# holds the whole document collection
    dm=DataManager.DataManager(root)
    subdir=dm.GetSubDir()
    for sub in subdir:
        dm.SetFilePathsFromsubDir(root+os.sep+sub)
        filepaths=dm.GetFilePaths()
        for path in filepaths:
            myfile=file(root+os.sep+sub+os.sep+path)
            rawtext=myfile.read()
            myfile.close()
            rawVSM=FilePreProcess(rawtext)
            rawVSMMatrix.append(rawVSM)
    return rawVSMMatrix
####################################################################################
# Build the stop-word list
def FilterNoiseWord(stopword_file_name):
    import re
    f=file(stopword_file_name)
    stopword=f.read()
    f.close()
    stopwordlist=re.split('\n',stopword)
    return stopwordlist

if __name__=="__main__":
    BagOfWordsConstruction(r'D:\TextCategorization\segmented',0)
    #fid=file(r'D:\3011.txt')
    #rawtext=fid.read()
    #fid.close()
    #FilePreProcess(rawtext)
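One practical note: the counting loop in BagOfWordsConstruction calls list.count once per (word, document) pair, which is O(|vocabulary| x |corpus|). A minimal single-pass sketch of my own (not part of the original series) that produces the same word -> [(document index, frequency), ...] dictionary:

from collections import defaultdict

def BuildVocabularyStatistics(rawVSMMatrix):
    '''Single-pass equivalent of the counting loop above.'''
    vocabularystatistics=defaultdict(list)
    for index,rawVSM in enumerate(rawVSMMatrix):
        freqs=defaultdict(int)
        for word in rawVSM:# tally this document's term frequencies
            freqs[word]+=1
        for word,count in freqs.iteritems():
            vocabularystatistics[word].append((index,count))
    return dict(vocabularystatistics)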
class IG:
    '''
    This module computes information gain.
    '''
    #######################################################################################
    def __init__(self,n_size):
        '''Constructor: initializes the class's data members.
        mykeys stores the terms;
        labelOneNums stores each term's total frequency over the class-1 articles;
        labelTwoNums stores each term's total frequency over the class-2 articles
        (an article-count variant is left commented out below).
        '''
        import cPickle as mypickle
        mydict=mypickle.load(file(r'D:\TextCategorization\VITData\vocabularystatistics.dat'))
        self.mykeys=[]
        self.labelOneNums=[]
        self.labelTwoNums=[]
        self.probs=[]
        self.conProbs=[]
        #self.informationgain=[]
        for key,value in mydict.iteritems():
            self.mykeys.append(key)
            class1_count=0# occurrences of this term in class 1
            class2_count=0# occurrences of this term in class 2
            for val in value:
                if val[0]<n_size/2:# document indices below n_size/2 belong to class 1
                    class1_count=class1_count+val[1]
                    #class1_count=class1_count+1
                else:# the remaining indices belong to class 2
                    class2_count=class2_count+val[1]
                    #class2_count=class2_count+1
            self.labelOneNums.append(class1_count)
            self.labelTwoNums.append(class2_count)
        # test code
        #fid=file('1.txt','a')
        #for m in self.labelOneNums:
        #    print>>fid,m
        #fid.flush()
        #fid.close()
        #print len([m for m in self.labelOneNums if m>0])
        #print len(self.labelTwoNums)
        #print len(self.mykeys)
    ###################################################################################
    #def GetConditionProbabilityBaseC(self,n_size,termcount):
    #    conditionPtxC=float(termcount+1)/(n_size/2+len(self.mykeys))
    #    return conditionPtxC
    def GetConditionProbabilityBaseC(self,index,termcount):
        '''
        Compute P(t|C) with add-one (Laplace) smoothing; index selects the class (1 or 2).
        '''
        if index==1:
            conditionPtxC=float(termcount+1)/(len(self.mykeys)+sum(self.labelOneNums))
        else:
            conditionPtxC=float(termcount+1)/(len(self.mykeys)+sum(self.labelTwoNums))
        return conditionPtxC
    def GetTermProbability(self,n_size):
        '''
        Compute the prior probability of each term.
        '''
        #sumtotal=sum(self.labelOneNums)+sum(self.labelTwoNums)
        for i in range(0,len(self.mykeys)):
            prob=0.5*self.GetConditionProbabilityBaseC(1,self.labelOneNums[i])+0.5*self.GetConditionProbabilityBaseC(2,self.labelTwoNums[i])
            self.probs.append(prob)
        # test code
        #fid=file('prob.txt','a')
        #for m in self.probs:
        #    print>>fid,m
        #fid.flush()
        #fid.close()
    ###################################################################################
    def GetCategoryProbConditionTerm(self,n_size):
        '''
        Store, for each term, the probability that an article belongs to each class,
        conditioned on whether the term occurs.
        '''
        for i in range(0,len(self.mykeys)):
            # conprob1: probability of class 1 given that term t occurs
            conprob1=self.GetConditionProbabilityBaseC(1,self.labelOneNums[i])*0.5/self.probs[i]
            # conprob2: probability of class 2 given that term t occurs
            conprob2=self.GetConditionProbabilityBaseC(2,self.labelTwoNums[i])*0.5/self.probs[i]
            # nonconprob1: probability of class 1 given that term t does not occur
            nonconprob1=1-conprob1
            # nonconprob2: probability of class 2 given that term t does not occur
            nonconprob2=1-conprob2
            self.conProbs.append((conprob1,conprob2,nonconprob1,nonconprob2))
        # test code
        #fid=file('conprob.txt','a')
        #for m in self.conProbs:
        #    print>>fid,m
        #fid.flush()
        #fid.close()
    ########################################################################################
    def CalInformationGain(self,n_size):
        '''
        Compute the information gain of every word.
        '''
        import math
        import cPickle as mypickle
        self.GetTermProbability(n_size)
        self.GetCategoryProbConditionTerm(n_size)
        infoGain={}
        for i in range(0,len(self.mykeys)):
            temp=0# accumulator for the "term occurs" entropy part
            nontemp=0# accumulator for the "term absent" entropy part
            conprob1=self.conProbs[i][0]
            conprob2=self.conProbs[i][1]
            nonconprob1=self.conProbs[i][2]
            nonconprob2=self.conProbs[i][3]
            if conprob1!=0:
                temp=temp+conprob1*math.log(conprob1,2)
            if conprob2!=0:
                temp=temp+conprob2*math.log(conprob2,2)
            if nonconprob1!=0:
                nontemp=nontemp+nonconprob1*math.log(nonconprob1,2)
            if nonconprob2!=0:
                nontemp=nontemp+nonconprob2*math.log(nonconprob2,2)
            # the leading constant only shifts all scores equally, so it does not change the ranking
            igval=2+self.probs[i]*temp+(1-self.probs[i])*nontemp
            infoGain[self.mykeys[i]]=igval
        #infoGain.sort(key=lambda d:d[1],reverse=True)
        infoGainResult=sorted(infoGain.iteritems(),key=lambda item:item[1],reverse=True)
        print 'computed IG values for %s words'%len(infoGainResult)
        #for m in infoGainResult:
        #    print '%s,%s'%(m[0],m[1])
        fid=file(r'D:\TextCategorization\VITData\infoGain.dat','w')
        mypickle.dump(infoGainResult,fid)
        fid.close()

if __name__=="__main__":
    MyIG=IG(200)
    MyIG.CalInformationGain(200)
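For reference, the quantity CalInformationGain approximates is the usual information-gain score for a term t over classes c:

IG(t) = -sum_c P(c)*log2 P(c)
        + P(t)  * sum_c P(c|t)  * log2 P(c|t)
        + P(~t) * sum_c P(c|~t) * log2 P(c|~t)

With two equiprobable classes the first term is a constant, so it does not affect the ranking of the words. Two shortcuts in the code above are worth noting: P(t|c) is estimated with add-one (Laplace) smoothing, and P(c|~t) is approximated as 1 - P(c|t) rather than estimated from the documents that do not contain t.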
# -*- coding: cp936 -*-
'''
This module selects feature words according to their information gain.
'''
###########################################################################
def featureSelectionIG(N,flag,n_size):
    '''
    Recompute the IG data if necessary, and return the set of feature words.
    flag==0 means the information gain has not been computed yet.
    '''
    from SVM import InformationGain
    import cPickle as mypickle
    if flag==0:
        MyIG=InformationGain.IG(n_size)
        MyIG.CalInformationGain(n_size)
    featurewords=[]
    infoGainResult=mypickle.load(file(r'D:\TextCategorization\VITdata\infoGain.dat'))
    print 'length of infoGainResult: %s'%len(infoGainResult)
    #N=1000# determines the feature dimensionality
    infoGainfinal=infoGainResult[0:N]
    print 'length of infoGainfinal: %s'%len(infoGainfinal)
    featurewords=[m[0] for m in infoGainfinal]
    print '%s feature words in total'%len(featurewords)
    return featurewords
#####################################################################
if __name__=="__main__":
    featurewords=featureSelectionIG(1000,0,200)
    import cPickle as mypickle
    fid=file(r'D:\TextCategorization\VITData\data\keywords.dat','w')
    mypickle.dump(featurewords,fid)
    fid.close()
# -*- coding: cp936 -*-
'''
This module builds the document vector model.
'''
################################################################
def FormatVSM(sub,root,keywordsaddress):
    '''
    Build the vector-space model of the document set, stored in a two-dimensional list.
    '''
    from SVM import DataManager
    import cPickle as mypickle
    import re
    import os
    #root=r'D:\tsegmented'
    keywords=mypickle.load(file(keywordsaddress))
    dm=DataManager.DataManager(root)# helper object for reading the data
    VSMMatrix=[]
    dm.SetFilePathsFromsubDir(root+os.sep+sub)
    filepaths=dm.GetFilePaths()
    for path in filepaths:
        myfile=file(root+os.sep+sub+os.sep+path)
        rawtext=myfile.read()
        myfile.close()
        textwordslist=FilePreProcess(rawtext)
        VSM=[]
        for i in range(0,len(keywords)):
            count=textwordslist.count(keywords[i])
            VSM.append((i+1,count))# LibSVM feature indices start at 1
        VSMMatrix.append(VSM)
    return VSMMatrix
####################################################################
def LibSVMFormat(dest,root,keywordsaddress):
    '''
    Write the document vectors to dest in LibSVM format.
    '''
    fid=file(dest,'a')
    VSMMatrix=FormatVSM('class1',root,keywordsaddress)
    for VSM in VSMMatrix:
        s='1'# class 1 gets label 1
        for elem in VSM:
            if elem[1]!=0:# LibSVM is a sparse format: zero-count features are omitted
                s=s+' \t'+str(elem[0])+':'+str(elem[1])
        s=s+' \t\n'
        fid.write(s)
    VSMMatrix=FormatVSM('class2',root,keywordsaddress)
    for VSM in VSMMatrix:
        s='0'# class 2 gets label 0
        for elem in VSM:
            if elem[1]!=0:
                s=s+' \t'+str(elem[0])+':'+str(elem[1])
        s=s+' \t\n'
        fid.write(s)
    #print 'finish'
    fid.close()
    print 'functionfinish'
##############################################################
def FilePreProcess(rawtext):
    # here the raw text only needs to be split back into tokens;
    # stop words need not be removed, since only the selected keywords are counted
    listresult=rawtext.split("|")
    return listresult
###################################################################
if __name__=="__main__":
    root1=r'D:\TextCategorization\segmented'
    root2=r'D:\TextCategorization\tsegmented'
    keywordsaddress=r'D:\TextCategorization\VITData\data\keywords.dat'
    print 'begin.....'
    LibSVMFormat(r'D:\TextCategorization\data\train.libsvm',root1,keywordsaddress)
    print 'training corpus converted'
    LibSVMFormat(r'D:\TextCategorization\data\test.libsvm',root2,keywordsaddress)
    print 'test corpus converted'
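Once train.libsvm and test.libsvm have been generated, they can be passed straight to the LIBSVM command-line tools, for example (the model and prediction file names here are arbitrary):

svm-train train.libsvm news.model
svm-predict test.libsvm news.model predict_result.txt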
The main driver module for text preprocessing. It calls the submodules above to carry out the task stated at the top: "quickly generate LibSVM-format data for any desired document-collection size, number of feature words, and number of cross-validation folds."
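Reading off the archiving code at the bottom of the script: for each feature dimension the three output files (keywords.dat, train.libsvm, test.libsvm) are moved into D:\TextCategorization\TextCategorization_&lt;run&gt;_&lt;N&gt;_&lt;dim&gt;\data, and once all feature dimensions for a given corpus size are done, the whole working folder D:\TextCategorization is renamed to &lt;run&gt;_&lt;N&gt;_rfinish.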
# -*- coding: cp936 -*-
from SVM import FoldersCreation
import os
##############################################################################################
# Parameter settings
corpus_size=[1500]
#N: half of the total corpus size, i.e. the number of articles per class
vfold=5 #vfold: number of cross-validation folds
featureDimensions=[10,20,30,40,50,60,70,80,90,100,110,120,130,140,150] #featureDimension: dimensionality of the VSM feature space
toCalInfoGain=0# whether to compute information gain over the bag-of-words vocabulary; =1 means skip the computation
times=[2]# indices of the experiment runs to perform
#count_done_research_times=0# number of runs already performed
# N and count_done_research_times are the parameters of CorpusPartition.moveAccordingPartition
# featureDimension, toCalInfoGain and 2*N/vfold are the parameters of FeatureSelectionModel.featureSelectionIG
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
for count_done_research_times in times:
    for N in corpus_size:
        print 'current corpus size: %s'%N
        print 'runs already performed at this corpus size: %s'%count_done_research_times
        for featureDimension in featureDimensions:
            if featureDimension>10:
                toCalInfoGain=1# after the first dimension, reuse the corpus split, segmentation and bag of words
            print 'current feature dimensionality: %s'%featureDimension
            ##############Create the folders################################################################
            if toCalInfoGain==0:
                os.mkdir(r'D:\TextCategorization')
                FoldersCreation.CreateAssist(toCalInfoGain)
            print 'folder-creation module finished'
            print '***************************************************************************'
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
            ################Partition the document collection into training and test sets####################
            from SVM import CorpusPartition
            CorpusPartition.MoveCorpus(N,toCalInfoGain)
            CorpusPartition.moveAccordingPartition(N,count_done_research_times,toCalInfoGain)
            print 'corpus-partitioning module finished'
            print '*******************************************************************'
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
            #########################Word-segment the document collections####################################
            from SVM import DataManager
            from ctypes import *
            import os
            import cPickle as p
            import re
            if toCalInfoGain==0:
                roots=[r'D:\TextCategorization\training',r'D:\TextCategorization\testing']
                rootfinals=[r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
                for i in range(0,2):
                    dm=DataManager.DataManager(roots[i])
                    subdir=dm.GetSubDir()
                    filepathstotalsrc=[]
                    for sub in subdir:
                        dm.SetFilePathsFromsubDir(roots[i]+os.sep+sub)
                        filepaths=dm.GetFilePaths()
                        filepathsassist=[sub+os.sep+path for path in filepaths]
                        filepathstotalsrc=filepathstotalsrc+filepathsassist
                    for path in filepathstotalsrc:
                        myfile=file(roots[i]+os.sep+path)
                        s=myfile.read()
                        myfile.close()
                        dll=cdll.LoadLibrary("ICTCLAS30.dll")
                        dll.ICTCLAS_Init(c_char_p("."))
                        bSuccess=dll.ICTCLAS_ParagraphProcess(c_char_p(s),0)
                        segmented=c_char_p(bSuccess).value
                        segmentedtmp=re.sub("\s+",'|',segmented,0)# join the tokens with '|'
                        segmentedfinal=re.sub('\xa1\xa1','',segmentedtmp)# strip GBK full-width spaces
                        fid=file(rootfinals[i]+os.sep+path,'w')
                        fid.write(segmentedfinal)
                        fid.close()
                        dll.ICTCLAS_Exit()
                #print 'finalfinish congratulations!'
            print 'word-segmentation module finished'
            print '**********************************************************************'
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
            ##################Build the bag-of-words model####################################################
            from SVM import BagOfWordsConstruction
            BagOfWordsConstruction.BagOfWordsConstruction(r'D:\TextCategorization\segmented',toCalInfoGain)
            print 'bag-of-words module finished'
            print '***********************************************************************************'
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
            #######################Feature-word selection#####################################################
            from SVM import FeatureSelectionModel
            featurewords=FeatureSelectionModel.featureSelectionIG(featureDimension,toCalInfoGain,2*N/vfold)
            import cPickle as mypickle
            fid=file(r'D:\TextCategorization\VITData\data\keywords.dat','w')
            mypickle.dump(featurewords,fid)
            fid.close()
            print 'feature-selection module finished'
            print '*******************************************************************************************'
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
            #######################Build the document vector model############################################
            from SVM import VSMformation
            import shutil
            root1=r'D:\TextCategorization\segmented'
            root2=r'D:\TextCategorization\tsegmented'
            keywordsaddress=r'D:\TextCategorization\VITData\data\keywords.dat'
            print 'begin.....'
            VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\train.libsvm',root1,keywordsaddress)
            print 'training corpus converted'
            VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\test.libsvm',root2,keywordsaddress)
            print 'test corpus converted'
            print 'document-vector-model module finished'
            print 'batch run finished, congratulations!'
            # archive this run's output under a folder named after (run index, N, featureDimension)
            os.chdir(r'C:\Python26')
            os.chdir(r'D:\TextCategorization')
            new_dir='TextCategorization_'+str(count_done_research_times)+'_'+str(N)+'_'+str(featureDimension)
            os.mkdir(new_dir)
            os.chdir(new_dir)
            os.mkdir('data')
            os.chdir(r'C:\Python26')
            print os.getcwd()
            shutil.move(r'D:\TextCategorization\VITdata\data\keywords.dat','D:\\TextCategorization\\'+new_dir+'\\data')
            shutil.move(r'D:\TextCategorization\data\data\train.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
            shutil.move(r'D:\TextCategorization\data\data\test.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
            print 'congratulations, the output files have been archived'
            print '###########################finish##################################'
        # all feature dimensions done for this corpus size: rename the working folder and reset the flag
        os.chdir('D:\\')
        print os.getcwd()
        if os.path.isdir('TextCategorization'):
            os.rename('TextCategorization',str(count_done_research_times)+'_'+str(N)+'_rfinish')
        os.chdir(r'C:\Python26')
        toCalInfoGain=0
        print str(count_done_research_times)+'_'+str(N)+' finish'