python解析pdf和docx
代码
// A highlighted block
import os
from win32com import client as wc
# Batch-convert the Word documents found in `path` to .docx by driving
# Microsoft Word through COM (Windows only), deleting each source file once
# its converted copy has been written.
w = wc.Dispatch('Word.Application')
path = r'E:\工作日志\6.5\2'
name = os.listdir(path)
try:
    for n in name:
        src = os.path.join(path, n)
        # splitext only strips the final extension, so a name such as
        # "report.v2.doc" keeps its full stem instead of collapsing to "report".
        n1, ext = os.path.splitext(n)
        # Skip sub-directories, and skip files that are already .docx:
        # for those the target equals the source, so the os.remove() below
        # would delete the only copy of the document.
        if not os.path.isfile(src) or ext.lower() == '.docx':
            continue
        doc = w.Documents.Open(src)  # Word must be given an absolute path
        try:
            # 16 is WdSaveFormat.wdFormatDocumentDefault (the .docx format).
            doc.SaveAs(os.path.join(path, n1 + '.docx'), 16)
        finally:
            # Always release the document, even when the save fails.
            doc.Close()
        # Only reached when SaveAs succeeded, so the source is never
        # removed without a converted copy on disk.
        os.remove(src)
finally:
    # Shut Word down so no background WINWORD process is left running.
    w.Quit()
import docx
# Load the Word document (an absolute path is used here).
doc = docx.Document(r'E:\工作日志\6.3\建设银行净值\104.docx')

# Print the text of every paragraph that sits outside the tables.
for paragraph in doc.paragraphs:
    print(paragraph.text)

# Gather the text of every non-empty table cell, one list per table row,
# and concatenate the string form of those lists into `str1`.
tables = doc.tables
row_dumps = []
for table in tables:
    for row in table.rows:
        row_temp = []
        for cell in row.cells:
            if cell.text == '':
                continue
            row_temp.append(cell.text)
            # Blank the cell (in memory only; the document is never saved)
            # once it has been read. NOTE(review): presumably this keeps a
            # merged cell, which can show up more than once in row.cells,
            # from being collected twice - confirm against python-docx.
            cell.text = ''
        row_dumps.append(str(row_temp))
str1 = ''.join(row_dumps)
# Optional compact form of the output:
# print(str1.replace('][',',').replace(' ','').replace('\\n',''))
print(str1)
print(type(str1))
#解析pdf
import pdfplumber
path = r'sdfsdfsdf\C1315820000027.pdf'
# Open the PDF through a context manager so the underlying file handle is
# released even if extraction raises.
with pdfplumber.open(path) as pdf:
    # Pass 1: the full text of every page, with spaces removed.
    # NOTE(review): depending on the pdfplumber version, extract_text() can
    # return None for a page without text; fall back to '' so .replace()
    # does not fail on such pages.
    str1 = ''.join(
        (page.extract_text() or '').replace(' ', '')
        for page in pdf.pages
    )
    print(str1)

    # Pass 2: table contents only. `str1` is rebuilt from scratch here;
    # previously it still held the full page text from pass 1, so the
    # "tables only" output repeated the whole document first.
    str1 = ''.join(
        str(page.extract_tables())
        .replace('\\n', '')
        .replace('None', '')
        .replace(' ', '')
        .replace(',,', '')
        .replace('[', '')
        .replace(']', '\n')
        for page in pdf.pages
    )
    print(str1)
chart.js/