一个简单的word文档阅读类,使用正则实现简单的docx文档阅读,下面是代码
<?php
/**
 * Minimal .docx reader that converts the main document part to bare HTML.
 *
 * @author sdxjwkq
 */
class Lib
{
    /**
     * Read a .docx file and convert it to an HTML string.
     *
     * Only paragraphs, tables and text-box text are kept; styling is dropped.
     * The result starts with a small <style> block for tables, followed by one
     * <p> per paragraph and one <table> per table, in document order. Text
     * found in text boxes is emitted inline in the paragraph that hosts them.
     * All document text is HTML-escaped before being placed in the markup.
     *
     * @param string $file Path to the .docx file.
     * @return string Generated HTML.
     * @throws \RuntimeException If the path is empty, the archive cannot be
     *                           opened, word/document.xml is missing, or the
     *                           XML is not well-formed.
     */
    public function docxToHtml($file)
    {
        $dom = $this->parseXml($this->readDocumentXml($file));
        $root = $dom->documentElement;
        // The root element (w:document) lives in the WordprocessingML
        // namespace, whichever variant of the format the file uses, so its
        // URI is used to recognise w:p / w:tbl / w:t below.
        $ns = $root->namespaceURI;

        $html = $this->styleBlock();
        foreach ($this->findNearest($root, array('p', 'tbl'), $ns) as $block) {
            if ($block->localName === 'tbl') {
                $html .= $this->renderTable($block, $ns);
            } else {
                $html .= '<p>' . $this->escape($this->paragraphText($block, $ns)) . '</p>';
            }
        }

        return $html;
    }

    /**
     * Extract the raw XML of word/document.xml from the docx (zip) archive.
     *
     * @param string $file Path to the .docx file.
     * @return string XML source of the main document part.
     * @throws \RuntimeException If the archive or the part cannot be read.
     */
    private function readDocumentXml($file)
    {
        if (!is_string($file) || $file === '') {
            throw new \RuntimeException('A non-empty docx file path is required.');
        }

        $zip = new \ZipArchive();
        $status = $zip->open($file);
        if ($status !== true) {
            throw new \RuntimeException(
                "Unable to open docx archive '{$file}' (ZipArchive status " . var_export($status, true) . ').'
            );
        }

        $xml = $zip->getFromName('word/document.xml');
        $zip->close();
        if ($xml === false) {
            throw new \RuntimeException("'{$file}' does not contain word/document.xml.");
        }

        return $xml;
    }

    /**
     * Parse the document XML into a DOM tree.
     *
     * @param string $xml XML source.
     * @return \DOMDocument
     * @throws \RuntimeException If the XML is empty or not well-formed.
     */
    private function parseXml($xml)
    {
        // Collect libxml errors internally instead of emitting PHP warnings,
        // and restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        // LIBXML_NONET: never fetch external resources while parsing.
        $loaded = $xml !== '' && $dom->loadXML($xml, LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            throw new \RuntimeException('word/document.xml is not well-formed XML.');
        }

        return $dom;
    }

    /**
     * Fixed <style> block that precedes the converted content.
     *
     * @return string
     */
    private function styleBlock()
    {
        return implode("\n", array(
            '<style>',
            'table{',
            'background-color:#000;',
            '}',
            'table td{',
            'padding:5px 5px 5px 5px;',
            '}',
            'table tr{',
            'background-color:#fff;',
            '}',
            '</style>',
        ));
    }

    /**
     * Render a w:tbl element as an HTML table.
     *
     * Each cell contains the text of its paragraphs joined with <br>.
     *
     * @param \DOMNode    $table The w:tbl element.
     * @param string|null $ns    WordprocessingML namespace URI.
     * @return string
     */
    private function renderTable(\DOMNode $table, $ns)
    {
        $html = "<table border='0' cellspacing='1' cellpadding='0'>";
        foreach ($this->findNearest($table, array('tr'), $ns) as $row) {
            $html .= '<tr>';
            foreach ($this->findNearest($row, array('tc'), $ns) as $cell) {
                $lines = array();
                foreach ($this->findNearest($cell, array('p'), $ns) as $paragraph) {
                    $lines[] = $this->escape($this->paragraphText($paragraph, $ns));
                }
                $html .= '<td>' . implode('<br>', $lines) . '</td>';
            }
            $html .= '</tr>';
        }

        return $html . '</table>';
    }

    /**
     * Concatenate the text of every w:t element below a paragraph.
     *
     * This includes w:t elements nested in text boxes anchored in the
     * paragraph, so text-box content appears inline.
     *
     * @param \DOMNode    $paragraph The w:p element.
     * @param string|null $ns        WordprocessingML namespace URI.
     * @return string Unescaped text.
     */
    private function paragraphText(\DOMNode $paragraph, $ns)
    {
        $text = '';
        foreach ($this->findNearest($paragraph, array('t'), $ns) as $textNode) {
            $text .= $textNode->textContent;
        }

        return $text;
    }

    /**
     * Find, in document order, the closest descendants with one of the given
     * local names in the given namespace. The search does not descend into a
     * matching element, so nested matches are left to the caller.
     *
     * @param \DOMNode    $node  Element to search below.
     * @param string[]    $names Local element names to match.
     * @param string|null $ns    Namespace URI the elements must be in.
     * @return \DOMNode[]
     */
    private function findNearest(\DOMNode $node, array $names, $ns)
    {
        $found = array();
        foreach ($this->effectiveChildren($node) as $child) {
            if ($child->namespaceURI === $ns && in_array($child->localName, $names, true)) {
                $found[] = $child;
            } else {
                $found = array_merge($found, $this->findNearest($child, $names, $ns));
            }
        }

        return $found;
    }

    /**
     * Element children of a node, with every AlternateContent wrapper replaced
     * by the children of the single branch chosen for it.
     *
     * Without this, content that a document stores twice (once per branch)
     * would be emitted twice.
     *
     * @param \DOMNode $node
     * @return \DOMNode[]
     */
    private function effectiveChildren(\DOMNode $node)
    {
        $children = array();
        foreach ($node->childNodes as $child) {
            if ($child->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }
            if ($child->localName !== 'AlternateContent') {
                $children[] = $child;
                continue;
            }
            $branch = $this->pickAlternateBranch($child);
            if ($branch !== null) {
                $children = array_merge($children, $this->effectiveChildren($branch));
            }
        }

        return $children;
    }

    /**
     * Choose which branch of an AlternateContent element to read: the
     * Fallback when there is one (it is the branch that contains the w:pict
     * text boxes), otherwise the first Choice.
     *
     * @param \DOMNode $alternate The AlternateContent element.
     * @return \DOMNode|null The chosen branch, or null when it has none.
     */
    private function pickAlternateBranch(\DOMNode $alternate)
    {
        $firstChoice = null;
        foreach ($alternate->childNodes as $branch) {
            if ($branch->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }
            if ($branch->localName === 'Fallback') {
                return $branch;
            }
            if ($firstChoice === null && $branch->localName === 'Choice') {
                $firstChoice = $branch;
            }
        }

        return $firstChoice;
    }

    /**
     * Escape document text for safe inclusion in HTML.
     *
     * @param string $text
     * @return string
     */
    private function escape($text)
    {
        return htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
    }
}