文本挖掘过程的特征提取

最新推荐文章于 2024-06-18 19:51:45 发布

rocky_zheng

最新推荐文章于 2024-06-18 19:51:45 发布

阅读量2.3k

点赞数

分类专栏：文本数据挖掘文章标签： python 分词文本挖掘数据

本文链接：https://blog.csdn.net/sinat_16233463/article/details/36375067

版权

文本数据挖掘专栏收录该内容

2 篇文章 0 订阅

订阅专栏

文本是非结构化的数据，通常采用向量空间模型（VSM）将文本转换成结构化的数据。通常用文中的词作为文本的特征，但并不是所有的词都能很好地代表文本的特性；如果把所有的词都用上，会导致维度很高、计算量增加，并引入很多噪音。所以需要通过特征提取来选择部分词作为特征项。选好特征之后，还需要给每个特征赋予一定的权重。比如二维向量（2,3），根据 2 和 3 这两个值就能在平面上确定该向量的方向和长度。在多维空间中同样需要为每个维度确定一个值，从而将每个文本表示成一个向量。特征提取的函数有很多种，在此不一一列举，可在网上找到。对于文本分类，效果较好的是卡方（CHI）和信息增益。权重的计算公式中比较经典的是经验公式 tf-idf。在此也不详细说明此公式的具体算法，直接搜索就能找到，当然它还有很多种变形公式。

下面贴上我用搜狗提供的语料库完成的文本向量化的 Python 代码。其中包括分词，分词是用上一篇博文的方法做的，可参考我的第一篇博文。其中还有词性选择、停用词等文本数据的处理。中科院张华平博士的分词工具可以标出词的词性，所以将需要保留的词性保存在一个文本文件中，停用词表也保存在文档中。代码中只实现了卡方函数，你可以很容易地实现信息增益等。

import os
import sys
import re
import math
import csv
from SplitWord import *


###处理文本并完成分词
def readDoc(fileName):
    """Return the text of a file as one string with line breaks removed.

    Every line has its leading and trailing newline characters stripped and
    the pieces are concatenated with no separator.

    Args:
        fileName: Path of the file to read. It is opened in text mode with
            the platform default encoding.

    Returns:
        The concatenated text of all lines ("" for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content cannot be decoded.
    """
    # The context manager closes the handle even when decoding fails half-way;
    # previously the handle was never closed at all.
    with open(fileName) as fr:
        # join() avoids the quadratic cost of repeated string concatenation.
        return "".join(line.strip('\n') for line in fr)


def LoadData():
    """Segment every corpus document and write one line per document.

    Each sub-directory of the corpus root is treated as one class. For every
    readable document inside it, a line of the form
    ``<class index> <segmented text>`` is written to the output file, where
    the class index is the position of the sub-directory in the directory
    listing and the segmented text is whatever ``SplitWord`` returns.

    Documents that cannot be read are skipped.
    """
    corpusRoot = "..\\文本分类语料库"
    # The with-block guarantees the output is flushed and closed; the handle
    # used to be left open, which can lose buffered lines.
    with open("..\\result\\crop.txt", 'w', encoding='utf-8') as fw01:
        for classIdx, book in enumerate(os.listdir(corpusRoot)):
            classDir = corpusRoot + "\\" + book
            for doc in os.listdir(classDir):
                try:
                    content = readDoc(classDir + "\\" + doc)
                except (OSError, ValueError):
                    # Best effort: skip files that are unreadable or not in the
                    # expected encoding (UnicodeDecodeError is a ValueError),
                    # without also swallowing KeyboardInterrupt/SystemExit.
                    continue
                content = SplitWord(content)
                fw01.write(str(classIdx) + " ")
                fw01.write(content)
                fw01.write('\n')

###############start here###############
##去停用词，选择词性
#LoadData()
def loadStopWords(path="SupportFile\\stopwordsall.txt"):
    """Load the stop-word list, one entry per line of the file.

    Args:
        path: Location of the stop-word file. Defaults to the path the script
            has always used, so existing zero-argument calls are unaffected.

    Returns:
        A list with one string per line, newline characters stripped. Blank
        lines are kept as empty strings.
    """
    # Close the handle deterministically; it used to be leaked.
    with open(path) as fr:
        return [line.strip('\n') for line in fr]

def loadKeepNature(path="SupportFile\\wordNature.txt"):
    """Load the part-of-speech tags that should be kept, one per line.

    Args:
        path: Location of the tag file. Defaults to the path the script has
            always used, so existing zero-argument calls are unaffected.

    Returns:
        A list with one string per line, newline characters stripped. Blank
        lines are kept as empty strings.
    """
    # Close the handle deterministically; it used to be leaked.
    with open(path) as fr:
        return [line.strip('\n') for line in fr]

def cleanData(inContent, LowLength=2, UpLength=5, natureList=None, stopWords=None):
    """Filter segmented tokens by part-of-speech tag, stop words and length.

    Each token is expected to look like ``word/tag``. The tag is taken as the
    text from the first lowercase ASCII letter to the end of the token, and
    the word as everything up to the last ``/`` with all slashes removed.

    Args:
        inContent: Iterable of token strings.
        LowLength: Minimum word length (inclusive).
        UpLength: Maximum word length (inclusive).
        natureList: Tags to keep. When None they are read with
            ``loadKeepNature()``.
        stopWords: Words to drop. When None they are read with
            ``loadStopWords()``.

    Returns:
        The surviving words, in input order (duplicates preserved).
    """
    # Callers that clean many documents can pass both collections in to avoid
    # re-reading the support files on every call.
    keepNatures = set(loadKeepNature() if natureList is None else natureList)
    stopSet = set(loadStopWords() if stopWords is None else stopWords)
    # Compiled once per call instead of being looked up for every token.
    naturePattern = re.compile('[a-z].*$')
    wordPattern = re.compile('^.*/')

    ret = []
    for token in inContent:
        if not token:
            continue
        nature = ''.join(naturePattern.findall(token))
        if nature not in keepNatures:
            continue
        word = ''.join(wordPattern.findall(token)).replace('/', '')
        if word not in stopSet and LowLength <= len(word) <= UpLength:
            ret.append(word)
    return ret
            

##计算tf,tfidf,df
def catTf(content):
    """Count how many times each word occurs in ``content``.

    Args:
        content: Iterable of hashable words.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        order of first appearance.
    """
    counts = {}
    for word in content:
        counts[word] = counts.get(word, 0) + 1
    return counts

def caculateF():
    """Read the segmented corpus and build per-document term frequencies.

    Every line of the corpus file is split on single spaces. The first field
    is taken as the class label and the remaining fields are cleaned with
    ``cleanData`` and counted with ``catTf``.

    Returns:
        A tuple ``(docNum, classLabel, wordTf)`` where ``docNum`` is the
        number of lines read, ``classLabel`` is the list of labels and
        ``wordTf`` is the list of term-frequency dicts, both in file order.
    """
    docNum = 0        # total number of documents
    classLabel = []   # class label of each document
    wordTf = []       # term-frequency dict of each document
    # The with-block closes the file even if cleaning a line raises, and
    # iterating the handle streams the corpus instead of loading it whole.
    with open("..\\result\\crop.txt", encoding='utf-8') as corpus:
        for line in corpus:
            docNum += 1
            fields = line.split(' ')
            classLabel.append(fields[0])
            wordTf.append(catTf(cleanData(fields[1:])))
    return docNum, classLabel, wordTf
    
#文档频数
def termDf(docNum, classLabel, wordTf):
    """Compute the document frequency of every term.

    Args:
        docNum: Number of documents to scan (the first ``docNum`` entries of
            ``wordTf``).
        classLabel: Unused by this function.
        wordTf: List of per-document term-frequency dicts.

    Returns:
        A dict mapping each term to the number of documents containing it.
    """
    docFreq = {}
    for docIdx in range(docNum):
        for term in wordTf[docIdx]:
            docFreq[term] = docFreq.get(term, 0) + 1
    return docFreq

#tfidf
def wordTfidf(docNum, wordTf, termDf):
    """Weight every term of every document by tf * log2(docNum / df).

    Args:
        docNum: Number of documents; also the numerator of the idf ratio.
        wordTf: List of per-document term-frequency dicts.
        termDf: Dict mapping each term to its document frequency.

    Returns:
        A list, parallel to ``wordTf``, of dicts mapping term to weight.
    """
    return [
        {
            term: tf * math.log2(docNum / termDf[term])
            for term, tf in wordTf[docIdx].items()
        }
        for docIdx in range(docNum)
    ]

#词在不同类中的文档频数
def termClassDf(docNum, classLabel, wordTf):
    """Compute, for every term, its document frequency inside each class.

    Args:
        docNum: Number of documents to scan.
        classLabel: Class label of each document.
        wordTf: List of per-document term-frequency dicts.

    Returns:
        A nested dict ``{term: {label: documents of that class containing
        term}}``. Classes in which a term never occurs have no entry.
    """
    perClass = {}
    for docIdx in range(docNum):
        for term in wordTf[docIdx]:
            byLabel = perClass.setdefault(term, {})
            label = classLabel[docIdx]
            byLabel[label] = byLabel.get(label, 0) + 1
    return perClass

##特征提取
#Chi
def CHI(docNum, classLabel, termDf, termClassDf, selectNum=500):
    """Select features with the chi-square statistic, class by class.

    For every class and every term a 2x2 contingency table is built:

        n11: documents of the class that contain the term
        n12: documents of other classes that contain the term
        n21: documents of the class that lack the term
        n22: documents of other classes that lack the term

    and scored with
    ``N * (n11*n22 - n12*n21)**2 / ((n11+n12)*(n11+n21)*(n12+n22)*(n21+n22))``.
    The ``selectNum`` best terms of each class are merged into one list.

    Args:
        docNum: Total number of documents (N).
        classLabel: Class label of each document.
        termDf: Dict mapping term to document frequency.
        termClassDf: Dict mapping term to ``{label: document frequency}``.
        selectNum: Maximum number of terms taken from each class.

    Returns:
        A list of distinct terms. Classes are visited in order of first
        appearance in ``classLabel`` and, within a class, terms are ordered
        by descending score (ties keep ``termDf`` order).
    """
    docClassCount = {}
    for cl in classLabel:
        docClassCount[cl] = docClassCount.get(cl, 0) + 1

    termSelect = []
    alreadySelected = set()  # O(1) duplicate check alongside the ordered list
    # Iterating the count dict (insertion-ordered) instead of set(classLabel)
    # makes the output order reproducible between runs.
    for cl, classDocs in docClassCount.items():
        scores = {}
        for w, df in termDf.items():
            n11 = termClassDf[w].get(cl, 0)
            n12 = df - n11
            n21 = classDocs - n11
            n22 = docNum - n11 - n12 - n21
            # The whole product is the denominator. Without the parentheses
            # only (n11+n12) divided and the other three factors multiplied.
            denominator = (n11 + n12) * (n11 + n21) * (n12 + n22) * (n21 + n22)
            if denominator:
                scores[w] = docNum * (n11 * n22 - n12 * n21) ** 2 / denominator
            else:
                # A zero marginal (single class, or a term present in every
                # document) carries no discriminative information.
                scores[w] = 0.0
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        # Slicing tolerates vocabularies smaller than selectNum.
        for w, _ in ranked[:max(selectNum, 0)]:
            if w not in alreadySelected:
                alreadySelected.add(w)
                termSelect.append(w)
    return termSelect

def writeTerm(termSelect):
    """Write the selected terms to the term file, one per line (UTF-8).

    Args:
        termSelect: Iterable of term strings.
    """
    # The with-block closes the file even if a write fails part-way.
    with open("..\\result\\term_select.txt", 'w', encoding='utf-8') as fw:
        fw.writelines(k + '\n' for k in termSelect)

def vSpaceModel(docNum, termSelect, wordTfidf, classlabel):
    """Write the document vectors to the vector-space CSV file.

    One row is written per document: the class label followed by the weight
    of every selected term in ``termSelect`` order, with 0 for terms the
    document does not contain.

    Args:
        docNum: Number of documents to write.
        termSelect: Ordered list of selected terms (the vector dimensions).
        wordTfidf: List of per-document ``{term: weight}`` dicts.
        classlabel: Class label of each document.
    """
    with open("..\\result\\vSpace.csv", 'w', newline='', encoding='utf-8') as fw:
        wr = csv.writer(fw)
        # Rows are streamed one at a time so the full docNum x len(termSelect)
        # matrix is never held in memory. The with-block closes the file, so
        # no explicit close() is needed.
        for docIdx in range(docNum):
            weights = wordTfidf[docIdx]
            row = [classlabel[docIdx]]
            row.extend(weights.get(w, 0) for w in termSelect)
            wr.writerow(row)
    
## Build the segmented corpus file; this only needs to be run once
#LoadData

## Walk the corpus: per-document class labels and term frequencies
docNum,classLabel,wordTf=caculateF()
## Compute df
# NOTE: this assignment, and the ones to termClassDf and wordTfidf below,
# rebind the module-level function names to their results, so each of those
# functions can be called only once per run.
termDf=termDf(docNum,classLabel,wordTf)
termClassDf=termClassDf(docNum,classLabel,wordTf)#document frequency of each word within each class

## Compute tfidf
wordTfidf=wordTfidf(docNum,wordTf,termDf)
## Feature selection (up to 1000 terms per class)
termSelect=CHI(docNum,classLabel,termDf,termClassDf,1000)

## Write out the training data set
writeTerm(termSelect)
vSpaceModel(docNum,termSelect,wordTfidf,classLabel)