#!/usr/bin/env python
# -*- coding: utf-8 -*-
import docx
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from win32com import client
import sys
# Python 2 only: reloading ``sys`` makes ``sys.setdefaultencoding`` callable
# again, and the call below switches the interpreter-wide implicit
# str <-> unicode codec to GB2312. This affects every implicit conversion in
# the process (including the ``print`` in the ``__main__`` section), so it
# must run before any of the reader functions are used.
reload(sys)
sys.setdefaultencoding('gb2312')
def readDocx(docxPath):
    """Return the text of a .docx file as a single string.

    Every paragraph exposed by ``Document.paragraphs`` contributes one line;
    leading and trailing whitespace is stripped from each paragraph and the
    lines are joined with ``'\\n'``.

    :param docxPath: path of the .docx file to open.
    :return: the paragraph texts joined by newlines.
    """
    document = docx.Document(docxPath)
    stripped = [para.text.strip() for para in document.paragraphs]
    return '\n'.join(stripped)
def readPdf(pdfPath):
    """Extract the text of every page of a PDF file with pdfminer.

    The pages are rendered through a ``TextConverter`` into an in-memory
    buffer using the ``utf-8`` codec and default ``LAParams`` layout settings.

    :param pdfPath: path of the PDF file to read.
    :return: the contents of the converter's output buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``with`` guarantees the file handle is released even when a
            # page fails to parse.
            with open(pdfPath, 'rb') as fp:
                # An empty page-number set and maxpages=0 place no
                # restriction on which pages are processed; the document is
                # opened with an empty password.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed (same order as
        # before) and before the buffer itself is closed.
        return retstr.getvalue()
    finally:
        retstr.close()
def readDoc(docPath, tmpPath='c:/temp.txt', encoding='gbk'):
    """Extract the text of a legacy .doc file by automating Microsoft Word.

    Word is started through COM, the document is saved as a text file at
    ``tmpPath``, and that file is read back line by line. The text file is
    left on disk afterwards.

    :param docPath: path of the .doc file to open in Word.
    :param tmpPath: where Word writes the intermediate text file
        (defaults to the previously hard-coded ``c:/temp.txt``).
    :param encoding: codec used to decode the lines of the text file
        (defaults to the previously hard-coded ``gbk``).
    :return: the decoded, whitespace-stripped lines joined by ``'\\n'``.
    """
    word = client.Dispatch('Word.Application')
    try:
        # Open an existing file.
        doc = word.Documents.Open(docPath)
        try:
            # File format 2 is passed straight to Word's SaveAs
            # (wdFormatText in the Word object model).
            doc.SaveAs(tmpPath, 2)
        finally:
            # Close the document even if the export failed.
            doc.Close()
    finally:
        # Always shut Word down so a failure does not leave an orphaned
        # Word process running in the background.
        word.Quit()

    with open(tmpPath, 'r') as f:
        return '\n'.join([line.decode(encoding).strip() for line in f])
if __name__ == '__main__':
    # Manual smoke test: convert a sample .doc file and print its text.
    # The other readers can be tried the same way, e.g.
    # readDocx('d:/1.docx') or readPdf('d:/3.pdf').
    print(readDoc('d:/2.doc'))
# Source article title: "Python: reading doc, docx, pdf, txt, excel"
# Page footer: latest recommended article published on 2024-08-05 11:15:55