Python 读取 doc、docx、pdf、txt、excel

最新推荐文章于 2024-08-05 11:15:55 发布

stay_foolish12

最新推荐文章于 2024-08-05 11:15:55 发布

阅读量418

点赞数

分类专栏：自然语言处理　文章标签：pdf txt docx doc

本文链接：https://blog.csdn.net/stay_foolish12/article/details/114366371

版权

自然语言处理专栏收录该内容

233 篇文章 21 订阅

订阅专栏

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import docx

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

from win32com import client
import sys
# Python 2-only idiom: reload(sys) is called so that sys.setdefaultencoding is
# available again, then the interpreter-wide default codec used for implicit
# str <-> unicode conversions is switched to GB2312.
# NOTE(review): this affects every module in the process and neither call
# exists on Python 3 -- confirm it is still required.
reload(sys)
sys.setdefaultencoding('gb2312')

def readDocx(docxPath):
    """Return the paragraph text of a .docx file as one string.

    Args:
        docxPath: path handed to ``docx.Document``.

    Returns:
        The text of every paragraph in ``Document.paragraphs``, each with
        leading/trailing whitespace stripped, joined with newlines.
    """
    document = docx.Document(docxPath)
    stripped = [para.text.strip() for para in document.paragraphs]
    return '\n'.join(stripped)
def readPdf(pdfPath):
    """Extract the text of a PDF file with pdfminer.

    Args:
        pdfPath: path of the PDF file; it is opened in binary mode.

    Returns:
        Everything pdfminer's ``TextConverter`` wrote into the in-memory
        buffer while processing the pages (encoded with the 'utf-8' codec
        passed to the converter).

    Raises:
        IOError/OSError: if ``pdfPath`` cannot be opened.
        Any exception raised by pdfminer is propagated; the file, the
        converter and the buffer are still closed in that case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() replaces the Python 2-only file() builtin, and ``with``
            # guarantees the handle is closed even if a page fails to parse.
            with open(pdfPath, 'rb') as fp:
                # Empty page set and maxpages=0 are the same arguments as
                # before: no page filter and no page limit.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the converter is closed, matching the
        # original call order.
        return retstr.getvalue()
    finally:
        retstr.close()
def readDoc(docPath):
    """Return the text of a legacy .doc file by converting it through Word.

    The document is opened in Microsoft Word via COM, saved as plain text
    into a private temporary directory, and that text file is read back as
    GBK.

    Args:
        docPath: path of the .doc file, passed unchanged to
            ``Documents.Open``.

    Returns:
        The lines of the exported text file, each stripped of surrounding
        whitespace, joined with newlines.

    Raises:
        Any COM error from Word and any I/O or decoding error from reading
        the exported file is propagated; Word is still shut down and the
        temporary directory is still removed.
    """
    import io
    import os
    import shutil
    import tempfile

    # A fresh temporary directory replaces the hard-coded 'c:/temp.txt', so
    # runs cannot collide, the drive root need not be writable, and nothing
    # is left behind.
    tmpDir = tempfile.mkdtemp()
    txtPath = os.path.join(tmpDir, 'temp.txt')
    try:
        word = client.Dispatch('Word.Application')
        try:
            # Open an existing document.
            doc = word.Documents.Open(docPath)
            try:
                # 2 is Word's plain-text save format (wdFormatText).
                doc.SaveAs(txtPath, 2)
            finally:
                doc.Close()
        finally:
            # Always shut Word down so a failed conversion does not leave a
            # background Word process running.
            word.Quit()

        # io.open decodes while reading on both Python 2 and 3; newline='\n'
        # splits only on '\n' and strip() removes any remaining '\r'.
        with io.open(txtPath, 'r', encoding='gbk', newline='\n') as f:
            fullText = [line.strip() for line in f]
    finally:
        shutil.rmtree(tmpDir, ignore_errors=True)
    return '\n'.join(fullText)
if __name__ == '__main__':
    # Usage examples for the other readers (left disabled, as before):
    #docxValue = readDocx('d:/1.docx')
    #print(docxValue)
    #pdfValue = readPdf('d:/3.pdf')
    #print(pdfValue)
    docValue = readDoc('d:/2.doc')
    # The parenthesised form prints the same value on Python 2 and is also
    # valid syntax on Python 3, unlike the print statement.
    print(docValue)