基于 TensorFlow 的 CNN 实现文本分类

最新推荐文章于 2022-11-19 23:47:50 发布

小木头1209

最新推荐文章于 2022-11-19 23:47:50 发布

阅读量1.5k

点赞数 1

分类专栏： python学习

本文链接：https://blog.csdn.net/jiasudu1234/article/details/78338957

版权

该博客介绍了一个基于 TensorFlow 的卷积神经网络（CNN）实现文本分类的示例。首先进行数据预处理，包括读取文本、分词和去除特殊字符。然后，使用预训练的词向量模型进行特征提取。接下来，构建 CNN 模型，包含多个卷积层和池化层，最后是全连接层和输出层。模型经过训练后，在测试集上评估其性能。此外，还提到了使用 HanLP 进行中文处理的相关操作，如分词、关键词提取和自动摘要。

摘要由CSDN通过智能技术生成

# coding: utf-8

# In[72]:


import os, xlrd
import codecs, re
import jieba
import rarfile  
import os  
import jieba.analyse


# In[22]:


# Root directory of the corpus to read.
file_name = '/mfsdata/pachong/cnn/Data_MeiTi'
# Names of the entries directly under file_name; each one is later used both
# as a sub-directory to read and as the class label of the texts inside it.
files=os.listdir(file_name)


# In[32]:


# Data-reading helper
def Read_content(title):
    """Read every entry of a directory as UTF-8 text.

    Args:
        title: Path of the directory whose entries are read.

    Returns:
        A list holding the full decoded content of each entry, in the order
        reported by ``os.listdir``.  Empty when the directory is empty.
    """
    All_content = []
    for singe_file in os.listdir(title):
        path = title + '/' + singe_file
        # Close each handle as soon as it has been read so that a large
        # folder cannot exhaust the process's file descriptors.
        with codecs.open(path, 'r', 'utf-8') as Parse:
            All_content.append(Parse.read())
    return All_content
      


# In[135]:


#正则表达式去除标点符号，数字等
import re
#from zhon.hanzi import punctuation
# Characters treated as punctuation/noise.  The string is spliced verbatim
# into a regex character class, so inside the class the two backslashes act as
# escapes for the character that follows them and ",-." forms a range.  A
# literal backslash is therefore NOT a member of the class, which is why
# backslashes get their own pass below.
_RE_PUNCTUATION = u'▼◆！．※\\×·■★〓!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、\ue65c〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。'

# Compiled once at import time; every pattern is a raw string so that regex
# escapes such as \s are never interpreted as (invalid) string escapes.
_PUNCT_RUN = re.compile(r"[%s]+" % _RE_PUNCTUATION)
_DIGIT_RUN = re.compile(r'\d+')
_ASCII_ALNUM = re.compile(r'[A-Za-z0-9]')
_SPACE_RUN = re.compile(r'\s+')
_LONE_CHAR = re.compile(r'\b.\b')
_BACKSLASH_RUN = re.compile(r'\\+')


def Re(line):
    """Replace punctuation, digits, ASCII letters and lone characters by spaces.

    Args:
        line: Text to clean.

    Returns:
        The cleaned text in which every run of whitespace has been collapsed
        to one space (a single leading or trailing space may remain).
    """
    line = _PUNCT_RUN.sub(' ', line)
    line = _DIGIT_RUN.sub(' ', line)
    line = _ASCII_ALNUM.sub(' ', line)
    # Collapse first so the lone-character pass sees single separators.
    line = _SPACE_RUN.sub(' ', line)
    # Blank any single character that has a word boundary on both sides,
    # e.g. a one-character token standing between two spaces.
    line = _LONE_CHAR.sub(' ', line)
    line = _BACKSLASH_RUN.sub(' ', line)
    return _SPACE_RUN.sub(' ', line)


# In[109]:


# Word-segmentation helper
def Content_deal(line):
    """Segment one labelled document with jieba and clean it with ``Re``.

    Args:
        line: Raw text followed by the marker '__Label__' and the class label.

    Returns:
        The space-joined, cleaned tokens followed by '__lable__' and the label.

    Raises:
        IndexError: If ``line`` contains no '__Label__' marker.
    """
    # Split on the LAST marker only: the label is appended at the very end of
    # each record, so a document whose own text happens to contain '__Label__'
    # keeps its full text and its real label.
    parts = line.rsplit('__Label__', 1)
    contents = Re(' '.join(jieba.cut(parts[0])))
    # NOTE: the output marker is spelled '__lable__' on purpose -- Word_Vec
    # splits on exactly this spelling.
    return contents + '__lable__' + parts[1]


# In[67]:


# Every document of the corpus, each suffixed with '__Label__' + folder name.
All_content = []
for i in files:
    title = file_name + "/" + i
    singe__All_content = Read_content(title)
    # The folder name doubles as the class label.
    # NOTE(review): .decode('gbk') assumes os.listdir returned GBK-encoded
    # byte strings (Python 2) -- confirm against the data directory.
    singe__All = [text + '__Label__' + i.decode('gbk') for text in singe__All_content]
    # Grow the list in place instead of rebuilding it for every folder.
    All_content.extend(singe__All)


# In[ ]:





# In[113]:


#from tqdm import tqdm
#Deal_content=[]

#for single_content in tqdm(All_content):
  #  Deal_content.append(Content_deal(single_content))
    
    


# In[152]:


# Segment every text and strip special characters, using 5 worker processes.
import time
import multiprocessing
pool = multiprocessing.Pool(processes=5)
t1 = time.time()
try:
    Deal_Data = pool.map(Content_deal, All_content)
    t2 = time.time()
finally:
    # Always release the worker processes, even when a task raises.
    pool.close()
    pool.join()
# Elapsed wall-clock seconds; the parenthesised form prints the same value
# under the Python 2 print statement and the Python 3 print function.
print(t2 - t1)


# In[150]:


# Write the segmented, cleaned records out to a text file.
import pandas as pd
# Wrap the list returned by pool.map in a DataFrame so pandas can serialise it.
Deal_Data1=pd.DataFrame(Deal_Data,index=None)
# header=None / index=None suppress the column-header row and the index column.
Deal_Data1.to_csv("/mfsdata/pachong/cnn/MeiTi_Deal_data.txt",header=None,index=None,encoding='utf-8')


# In[160]:


from gensim.models import Word2Vec
# Load the word-vector model from disk.
# NOTE(review): the file name suggests Sogou-trained vectors -- unverified.
S_model = Word2Vec.load('/mfsdata/pachong/cnn/Sougou.model')


# In[202]:


# Show each folder name, decoded from GBK so it prints as readable text.
for f in files:
    print(f.decode('gbk'))


# In[968]:


#词向量及其词标签label 词标签，num_label 词标签向量化。词向量矩阵
def Word_Vec(sentent):
    sentent=sentent.split('__lable__')
    dictL={u'游戏':0,u'星座':1,u'时尚':2,u'娱乐':3,u'养生':4}#,u'其他':5}
    content=sentent[0].split(' ')
    label=sentent[1]
    num

最低0.47元/天解锁文章

小木头1209

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
基于tensorflow 的cnn实现文本分类

# coding: utf-8# In[72]:import os, xlrdimport codecs, reimport jiebaimport rarfile import os import jieba.analyse# In[22]:file_name = '/mfsdata/pachong/cnn/Data_MeiTi'#读取文件路径#files
复制链接

扫一扫

专栏目录