第一步,从PDF转成TXT
注意:去除空格、空行等
__author__ = 'wangfei'
# -*- coding: utf-8 -*-
import sys
import os
reload(sys)
sys.setdefaultencoding('utf-8')
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
# Step 1 script: extract the text boxes of one PDF and write them to a TXT
# file, one text box per line.
#
# The input path is hard-coded for now.
# TODO: read every PDF found in a given folder instead.
# The `with` block guarantees the PDF handle is closed even if parsing fails;
# all page processing stays inside it because the parser reads from `fp`.
with open('pdf/sln.pdf', 'rb') as fp:
    # Build a PDF parser on top of the open file object.
    parser = PDFParser(fp)
    # Create the PDF document from the parser.
    doc = PDFDocument(parser)
    # Stop with an exception if the document does not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Resource manager that holds resources shared between pages.
    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters and the aggregator device that collects
    # each page's layout.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process the document one page at a time. After process_page(), the
    # device returns the page layout (an LTPage), which contains the objects
    # parsed from that page, typically LTTextBox, LTFigure, LTImage,
    # LTTextBoxHorizontal and so on. Only text boxes are kept here.
    # (Named `text_blocks` rather than `all` so the builtin is not shadowed.)
    text_blocks = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBox):
                # Drop the newlines inside a text box so that each box
                # becomes a single line, then trim surrounding whitespace.
                string = x.get_text().replace('\n', '')
                text_blocks.append(string.strip())

# Write the collected text into the txt directory, one text box per line.
ls = os.linesep
with open('txt/sln.txt', 'wb') as fObj:
    fObj.writelines(['%s%s' % (x, ls) for x in text_blocks])
第二步,从TXT到XML
首先是XML处理类:
__author__ = 'wangfei'
import xml.dom.minidom as Dom
class XMLGenerator:
    """Small helper around ``xml.dom.minidom`` that builds one XML document
    and writes it to a file.

    Attributes:
        doc: the ``minidom`` Document being built.
        xml_name: path of the file that ``genXML`` writes.
    """

    def __init__(self, xml_name):
        """Start an empty document that will be saved to *xml_name*."""
        self.doc = Dom.Document()
        self.xml_name = xml_name

    def createNode(self, node_name):
        """Return a new, not yet attached element named *node_name*."""
        return self.doc.createElement(node_name)

    def addNode(self, node, pre_node=None):
        """Attach *node* and return it.

        *node* is appended to *pre_node* when one is given, otherwise to the
        document itself (i.e. it becomes the root element).
        """
        if pre_node is not None:
            pre_node.appendChild(node)
        else:
            self.doc.appendChild(node)
        return node

    def setNodeAttr(self, node, att_name, value):
        """Set attribute *att_name* of *node* to *value*."""
        node.setAttribute(att_name, value)

    def setNodeValue(self, cur_node, value):
        """Append *value* to *cur_node* as a text node."""
        cur_node.appendChild(self.doc.createTextNode(value))

    def genXML(self):
        """Write the document to ``self.xml_name`` as pretty-printed UTF-8."""
        # toprettyxml(encoding=...) returns already-encoded bytes, so the file
        # must be opened in binary mode; `with` closes it even if the write
        # fails.
        with open(self.xml_name, "wb") as f:
            f.write(self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
根据文本信息处理TXT文档,文本信息包括章节标题标记、图像、公式等
#! /usr/bin/env python
#coding:utf-8
import sys
import linecache
import re
from XMLGenerator import *
reload(sys)
sys.setdefaultencoding('utf-8')
# Step 2 script: turn the TXT file produced by step 1 into an XML file.
#
# Line 1 of the TXT becomes the title, line 2 the introduction. Every later
# non-blank line is classified with the patterns below as a "2.N" section
# heading, a "2.N.M" subsection heading, or a paragraph, and is added to the
# XML tree as a <div>. The output file is named after the lower-cased title.
fileName = "txt/sln.txt"

# Line classifiers, compiled once instead of on every input line.
p_set = re.compile(r'^2\.(\d+)')           # "2.N"   section heading
p_sub = re.compile(r'^2\.(\d+)\.(\d+)')    # "2.N.M" subsection heading
p_num = re.compile(r'^\d\d+')              # starts with two or more digits
p_fig = re.compile(r'^Fig')                # starts with "Fig"
p_ch = re.compile(r'^Chapter')             # starts with "Chapter"
p_h = re.compile(r'^H\.')                  # starts with "H."

try:
    fobj = open(fileName, 'r')
except IOError as e:
    # Format the message explicitly: under Python 2 the original
    # print("...", e) printed a tuple instead of the message.
    print("*** file open error: %s" % e)
else:
    # `with` closes the input file even if building the XML raises.
    with fobj:
        tittle = linecache.getline(fileName, 1).lower().strip('\n')
        xmlFile = XMLGenerator(tittle + ".xml")

        # Root node: the article.
        node_article = xmlFile.createNode("div")
        xmlFile.setNodeAttr(node_article, "id", "article")
        xmlFile.addNode(node=node_article)

        # Title node, child of the article. The "tittle" id is emitted into
        # the XML, so its spelling is left untouched.
        node_tittle = xmlFile.createNode("div")
        xmlFile.setNodeAttr(node_tittle, "id", "tittle")
        xmlFile.setNodeAttr(node_tittle, "class", "ltx_title ltx_title_document")
        xmlFile.addNode(node_tittle, node_article)

        # Introduction node, child of the title node.
        node_intru = xmlFile.createNode("div")
        xmlFile.setNodeAttr(node_intru, "id", "intru")
        xmlFile.setNodeAttr(node_intru, "class", "ltx_p")
        xmlFile.addNode(node_intru, node_tittle)

        # Placeholders for the current section / subsection. They are not
        # attached to the tree until a real heading replaces them.
        # NOTE(review): paragraphs seen before the first "2.N" heading are
        # appended to the unattached node_sub and so never reach the output;
        # confirm that this is intended.
        node_section = xmlFile.createNode("div")
        node_sub = xmlFile.createNode("div")

        # n_sec[N][0] holds the running index k of section 2.N and
        # n_sec[N][M] that of subsection 2.N.M; -1 means "not seen".
        n_sec = [([-1] * 15) for i in range(22)]
        section = []      # text of every accepted line, in order
        k = 0             # running index of accepted lines
        sec = 0           # value of k at the most recent section heading
        sub = 0           # value of k at the most recent subsection heading
        sec_info = 0      # N of the most recent "2.N" heading
        sub_info_1 = 0    # N of the most recent "2.N.M" heading
        sub_info_2 = 0    # M of the most recent "2.N.M" heading

        for (num, eachLine) in enumerate(fobj):
            if num == 0:
                xmlFile.setNodeValue(node_tittle, eachLine.strip())
            elif num == 1:
                xmlFile.setNodeValue(node_intru, eachLine.strip())
            elif eachLine != "\n":
                words = len(eachLine.split(' '))
                m_h = p_h.search(eachLine)
                m_ch = p_ch.search(eachLine)
                m_fig = p_fig.search(eachLine)
                m_num = p_num.search(eachLine)
                m_set = p_set.search(eachLine)
                m_sub = p_sub.search(eachLine)

                # A line is kept when it is a "2.N..." heading, or when it has
                # at least 9 space-separated words and matches none of the
                # "H." / digits / "Fig" / "Chapter" prefixes. The parentheses
                # only make the original and/or precedence explicit.
                is_body_text = (m_h is None and m_num is None
                                and m_fig is None and m_ch is None
                                and words >= 9)
                if is_body_text or m_set is not None:
                    k = k + 1
                    section.append(eachLine.strip())

                    if m_set is not None and m_sub is None:
                        # Section heading "2.N".
                        sec_info = int(m_set.group(1))
                        # Guard the fixed-size table so an unexpectedly large
                        # number cannot raise IndexError.
                        if sec_info < len(n_sec):
                            n_sec[sec_info][0] = k
                        sec = k
                        # New section <div> under the article.
                        node_section = xmlFile.createNode("div")
                        xmlFile.setNodeAttr(node_section, "id", "s2.ss" + m_set.group(1))
                        xmlFile.setNodeAttr(node_section, "class", "ltx_section")
                        xmlFile.addNode(node_section, node_article)
                        # Heading text goes into an <h2> inside the section.
                        node_st = xmlFile.createNode("h2")
                        xmlFile.setNodeAttr(node_st, "class", "ltx_title ltx_titleh_section")
                        xmlFile.addNode(node_st, node_section)
                        xmlFile.setNodeValue(node_st, eachLine.strip())
                    elif m_set is not None and m_sub is not None:
                        # Subsection heading "2.N.M".
                        sub_info_1 = int(m_sub.group(1))
                        sub_info_2 = int(m_sub.group(2))
                        print(sub_info_1)
                        print(sub_info_2)
                        if (sub_info_1 < len(n_sec)
                                and sub_info_2 < len(n_sec[sub_info_1])):
                            n_sec[sub_info_1][sub_info_2] = k
                        sub = k
                        # New subsection <div> under the current section.
                        node_sub = xmlFile.createNode("div")
                        xmlFile.setNodeAttr(node_sub, "id", "s" + "2.ss" + m_sub.group(1) + ".sss" + m_sub.group(2))
                        xmlFile.setNodeAttr(node_sub, "class", "ltx_subsection")
                        xmlFile.addNode(node_sub, node_section)
                        node_st = xmlFile.createNode("h2")
                        xmlFile.setNodeAttr(node_st, "class", "ltx_title ltx_titleh_subsection")
                        xmlFile.addNode(node_st, node_sub)
                        xmlFile.setNodeValue(node_st, eachLine.strip())
                    else:
                        # Paragraph: attach it to whichever heading came last.
                        if sec > sub:
                            # Latest heading is a section; number the
                            # paragraph relative to it.
                            n_para = str(k - sec)
                            node_para = xmlFile.createNode("div")
                            xmlFile.setNodeAttr(node_para, "id", "s2.ss" + str(sec_info) + ".p" + n_para)
                            # Fixed: the class belongs on the new paragraph,
                            # not on node_sub (which lost "ltx_subsection").
                            xmlFile.setNodeAttr(node_para, "class", "ltx_para")
                            xmlFile.addNode(node_para, node_section)
                            xmlFile.setNodeValue(node_para, eachLine.strip())
                        else:
                            # Latest heading is a subsection.
                            n_para = str(k - sub)
                            node_para = xmlFile.createNode("div")
                            # Fixed: the second component is the subsection
                            # number (sub_info_2), not sub_info_1 repeated,
                            # which made ids collide across subsections.
                            xmlFile.setNodeAttr(node_para, "id", "s2." + str(sub_info_1) + "." + str(sub_info_2) + ".p" + n_para)
                            xmlFile.setNodeAttr(node_para, "class", "ltx_para")
                            xmlFile.addNode(node_para, node_sub)
                            xmlFile.setNodeValue(node_para, eachLine.strip())

        # Write the finished tree to "<title>.xml".
        xmlFile.genXML()