php 读取docx,PHP读取docx文档内容

PHP读取并转换docx内容的实现
这段代码展示了一个名为`Docx2Text`的PHP类,用于读取docx文件的内容并将其转换为PHP可识别的字符串。类中提供两个公有方法:`setDocx`用来设置docx文件,`extract`用来提取内容;另有`table`、`listNumbering`等私有方法处理表格和列表。此外,还处理了脚注和尾注的转换。通过这个类,可以将docx文档的格式化内容转换为纯文本输出。

客户需求: 需要从docx文档读取内容并且做简单格式化。难点就在于如何读取docx格式并且转换为PHP可以识别的字符串形式。惯例先贴代码:

class Docx2Text

{

// Column separator appended after each non-empty table cell by table().
const SEPARATOR_TAB = "\t";

// ZipArchive handle of the opened .docx file; assigned in setDocx().
private $docx;

// DomDocument built from the main document XML; created in extract().
private $domDocument;

// Raw contents of word/document.xml as read from the archive.
private $_document;

// Raw contents of word/numbering.xml as read by listNumbering().
private $_numbering;

// Raw contents of word/footnotes.xml as read by loadFootNote().
private $_footnote;

// Raw contents of word/endnotes.xml as read by loadEndNote().
private $_endnote;

// Endnote text keyed by the note's w:id attribute.
private $endnotes;

// Footnote text keyed by the note's w:id attribute.
private $footnotes;

// Initialised to an empty array by the constructor; not read anywhere else in this class.
private $relations;

// Marker lists built by listNumbering(), indexed by numId and then by level 0-3.
private $numberingList;

// Accumulated plain-text result of extract() (property name is misspelled in the original API).
private $textOuput;

// Conversion switches taken from the constructor's option array; each defaults to true.
private $chart2text;

private $table2text;

private $list2text;

private $paragraph2text;

private $footnote2text;

private $endnote2text;

/**
 * Configure which document parts are converted and reset the working state.
 *
 * @param array $boolTransforms Optional switches keyed by 'table', 'list',
 *                              'paragraph', 'footnote', 'endnote' and 'chart'.
 *                              A key that is absent (or null) defaults to true.
 */
public function __construct($boolTransforms = array())
{
    // Option key => name of the property that stores the switch.
    $switches = array(
        'table'     => 'table2text',
        'list'      => 'list2text',
        'paragraph' => 'paragraph2text',
        'footnote'  => 'footnote2text',
        'endnote'   => 'endnote2text',
        'chart'     => 'chart2text',
    );

    foreach ($switches as $option => $property) {
        $this->$property = isset($boolTransforms[$option]) ? $boolTransforms[$option] : true;
    }

    // Clean slate: no archive opened, nothing cached, no output collected.
    $this->docx = null;
    $this->textOuput = '';
    $this->_numbering = '';
    $this->numberingList = array();
    $this->endnotes = array();
    $this->footnotes = array();
    $this->relations = array();
}

/**
 * Convert the body of the loaded document to plain text.
 *
 * Tables are rendered through table() when table conversion is enabled;
 * every other body child is rendered through printWP(). The script is
 * terminated with exit() when no document has been loaded, when the
 * document XML cannot be parsed or has no <w:body>, or when the output
 * file cannot be written.
 *
 * @param string $filename Optional path. When non-empty, the extracted
 *                         text is also written to this file.
 * @return string The extracted text.
 */
public function extract($filename = '')
{
    if (empty($this->_document)) {
        exit('There is no content');
    }

    $wordNs = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    $this->domDocument = new DomDocument();
    $bodyNode = null;
    if ($this->domDocument->loadXML($this->_document)) {
        $bodyNode = $this->domDocument->getElementsByTagNameNS($wordNs, 'body')->item(0);
    }
    if ($bodyNode === null) {
        // Malformed XML or no <w:body>: there is nothing to walk.
        exit('There is no content');
    }

    // Start from an empty buffer so a repeated call does not duplicate the text.
    $this->textOuput = '';

    foreach ($bodyNode->childNodes as $child) {
        // nodeName exists on every node type, whereas tagName is only defined on elements.
        if ($this->table2text && $child->nodeName === 'w:tbl') {
            $this->textOuput .= $this->table($child) . $this->separator();
        } else {
            $this->textOuput .= $this->printWP($child) . ($this->paragraph2text ? $this->separator() : '');
        }
    }

    if (!empty($filename)) {
        // Written directly: the class defines no writeFile() helper.
        if (file_put_contents($filename, $this->textOuput) === false) {
            exit('failed to write output file');
        }
    }

    return $this->textOuput;
}

/**
 * Open a .docx archive and read its main document part.
 *
 * Any archive opened by an earlier call is closed first, and everything
 * cached from it (notes, numbering, collected output) is discarded so the
 * new file is not mixed with data from the old one. The script is
 * terminated with exit('failed') when the archive cannot be opened.
 *
 * @param string $filename Path of the .docx file.
 * @return void
 */
public function setDocx($filename)
{
    // A non-null handle always refers to a successfully opened archive,
    // because a failed open terminates the script below.
    if ($this->docx instanceof ZipArchive) {
        $this->docx->close();
    }

    // Drop state derived from the previous document.
    $this->_document = null;
    $this->_footnote = null;
    $this->_endnote = null;
    $this->_numbering = '';
    $this->footnotes = array();
    $this->endnotes = array();
    $this->numberingList = array();
    $this->textOuput = '';

    $this->docx = new ZipArchive();
    if ($this->docx->open($filename) !== true) {
        exit('failed');
    }

    // getFromName() yields false when the part is missing; extract() treats that as "no content".
    $this->_document = $this->docx->getFromName('word/document.xml');
}

/**
 * Populate $this->endnotes from word/endnotes.xml, if not already done.
 *
 * Each <w:endnote> element contributes one entry: its w:id attribute is
 * the key and its trimmed text content is the value. Nothing happens when
 * the archive has no endnotes part.
 *
 * @return void
 */
private function loadEndNote()
{
    if (!empty($this->endnotes)) {
        return;
    }

    if (empty($this->_endnote)) {
        $this->_endnote = $this->docx->getFromName('word/endnotes.xml');
    }
    if (empty($this->_endnote)) {
        return;
    }

    $domDocument = new DomDocument();
    if (!$domDocument->loadXML($this->_endnote)) {
        return;
    }

    $wordNs = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
    foreach ($domDocument->getElementsByTagNameNS($wordNs, 'endnote') as $endnote) {
        // textContent yields decoded text; stripping tags from serialized XML
        // would leave escapes such as "&amp;" in the note.
        $this->endnotes[$endnote->getAttribute('w:id')] = trim($endnote->textContent);
    }
}

/**
 * Populate $this->footnotes from word/footnotes.xml, if not already done.
 *
 * Each <w:footnote> element contributes one entry: its w:id attribute is
 * the key and its trimmed text content is the value. Nothing happens when
 * the archive has no footnotes part.
 *
 * @return void
 */
private function loadFootNote()
{
    if (!empty($this->footnotes)) {
        return;
    }

    if (empty($this->_footnote)) {
        $this->_footnote = $this->docx->getFromName('word/footnotes.xml');
    }
    if (empty($this->_footnote)) {
        return;
    }

    $domDocument = new DomDocument();
    if (!$domDocument->loadXML($this->_footnote)) {
        return;
    }

    $wordNs = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
    foreach ($domDocument->getElementsByTagNameNS($wordNs, 'footnote') as $footnote) {
        // textContent yields decoded text; stripping tags from serialized XML
        // would leave escapes such as "&amp;" in the note.
        $this->footnotes[$footnote->getAttribute('w:id')] = trim($footnote->textContent);
    }
}

/**
 * Build $this->numberingList from word/numbering.xml.
 *
 * For every <w:num> the referenced <w:abstractNum> is looked up and the
 * first <w:numFmt> found in its levels decides the marker set: the
 * numbers 1-10 for "decimal", fixed bullet characters otherwise. Four
 * levels (0-3) are filled per numId. Nothing happens when the archive has
 * no numbering part or the part cannot be parsed.
 *
 * @return void
 */
private function listNumbering()
{
    $this->_numbering = $this->docx->getFromName('word/numbering.xml');
    if (empty($this->_numbering)) {
        return;
    }

    $domDocument = new DomDocument();
    if (!$domDocument->loadXML($this->_numbering)) {
        return;
    }

    $wordNs = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
    $root = $domDocument->getElementsByTagNameNS($wordNs, 'numbering')->item(0);
    if ($root === null) {
        return;
    }

    $formats = array();       // abstractNumId => value of its first w:numFmt
    $numToAbstract = array(); // numId => abstractNumId it refers to

    foreach ($root->childNodes as $child) {
        // nodeName is safe on text/comment nodes, which have no tagName.
        if ($child->nodeName === 'w:abstractNum') {
            $abstractId = $child->getAttribute('w:abstractNumId');
            foreach ($child->childNodes as $level) {
                if ($level->nodeName !== 'w:lvl') {
                    continue;
                }
                foreach ($level->childNodes as $property) {
                    if ($property->nodeName === 'w:numFmt') {
                        $formats[$abstractId] = $property->getAttribute('w:val');
                        // Only the first format of the definition is used.
                        break 2;
                    }
                }
            }
        } elseif ($child->nodeName === 'w:num') {
            foreach ($child->childNodes as $reference) {
                if ($reference->nodeName === 'w:abstractNumId') {
                    // Keyed by numId so that several <w:num> sharing one
                    // abstract definition do not overwrite each other.
                    $numToAbstract[$child->getAttribute('w:numId')] = $reference->getAttribute('w:val');
                }
            }
        }
    }

    foreach ($numToAbstract as $numId => $abstractId) {
        $isDecimal = isset($formats[$abstractId]) && $formats[$abstractId] === 'decimal';

        if ($isDecimal) {
            for ($level = 0; $level < 4; $level++) {
                $this->numberingList[$numId][$level] = range(1, 10);
            }
        } else {
            // Same marker bytes and list lengths as the literal tables they replace.
            $this->numberingList[$numId][0] = array_fill(0, 17, '*');
            $this->numberingList[$numId][1] = array_fill(0, 12, chr(175));
            $this->numberingList[$numId][2] = array_fill(0, 12, chr(237));
            $this->numberingList[$numId][3] = array_fill(0, 11, chr(248));
        }
    }
}

/**
 * Render one node of the document (a body child or a table-cell child) as text.
 *
 * The node's own text comes from toText(). When endnote/footnote
 * conversion is enabled, the text of every note referenced from the
 * node's runs is appended as "[note text] ". A line separator is always
 * appended at the end.
 *
 * @param DOMNode $node Node belonging to $this->domDocument.
 * @return string
 */
private function printWP($node)
{
    // Numbering definitions are parsed lazily, the first time list support is needed.
    if ($this->list2text && empty($this->numberingList)) {
        $this->listNumbering();
    }

    $ret = $this->toText($node);

    // The note lookups need XPath regardless of the list2text switch,
    // so the helper is created unconditionally.
    $xpath = new DOMXPath($this->domDocument);

    // NOTE: chart2text contributes no text; no chart data is resolved here.

    if ($this->endnote2text) {
        if (empty($this->endnotes)) {
            $this->loadEndNote();
        }

        $references = $xpath->query('w:r/w:endnoteReference', $node);
        if ($references !== false) {
            foreach ($references as $note) {
                $id = $note->getAttribute('w:id');
                // An unknown id renders as empty brackets instead of raising a notice.
                $ret .= '[' . (isset($this->endnotes[$id]) ? $this->endnotes[$id] : '') . '] ';
            }
        }
    }

    if ($this->footnote2text) {
        if (empty($this->footnotes)) {
            $this->loadFootNote();
        }

        $references = $xpath->query('w:r/w:footnoteReference', $node);
        if ($references !== false) {
            foreach ($references as $note) {
                $id = $note->getAttribute('w:id');
                $ret .= '[' . (isset($this->footnotes[$id]) ? $this->footnotes[$id] : '') . '] ';
            }
        }
    }

    // Every rendered node is terminated by a line separator.
    return $ret . $this->separator();
}

/**
 * Line terminator used when assembling the text output.
 *
 * @return string Always the CR LF pair.
 */
private function separator()
{
    $lineBreak = "\r\n";

    return $lineBreak;
}

/**
 * Render a <w:tbl> node as tab-separated text.
 *
 * Every child of each non-empty <w:tc> cell is rendered with printWP(),
 * and a tab follows the cell. A line separator is appended after each
 * direct child of the table.
 *
 * @param DOMNode $node The table node.
 * @return string
 */
private function table($node)
{
    $output = '';

    foreach ($node->childNodes as $child) {
        // nodeName is used because tagName is undefined on text and comment nodes.
        if ($child->nodeName === 'w:tr') {
            foreach ($child->childNodes as $cell) {
                if ($cell->nodeName === 'w:tc' && $cell->hasChildNodes()) {
                    foreach ($cell->childNodes as $p) {
                        $output .= $this->printWP($p);
                    }
                    $output .= self::SEPARATOR_TAB;
                }
            }
        }

        // Emitted for every direct child, not only for rows; left as-is so
        // the produced text keeps its existing layout.
        $output .= $this->separator();
    }

    return $output;
}

/**
 * Return the plain text contained in a node, trimmed.
 *
 * The DOM text content is used rather than stripping tags from the
 * serialized XML, so escapes such as "&amp;" come out as the characters
 * they stand for.
 *
 * @param DOMNode $node Node to read.
 * @return string
 */
private function toText($node)
{
    return trim($node->textContent);
}

}

// Usage example: convert ./1.docx with all default switches and dump the text.
$converter = new Docx2Text();
$converter->setDocx('./1.docx');

$plainText = $converter->extract();

var_dump($plainText);

代码中处理docx的类来自这里

其实docx就是一个ZIP压缩包, 里面包含若干XML文件(例如正文所在的word/document.xml), 所以用ZipArchive解包后再按XML解析即可.

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值