// Read the text content of a Word file, converting .doc files to .docx first.
use PhpOffice\PhpWord\IOFactory;
/**
 * Read a Word document and return its extracted content.
 *
 * When $ends is "doc" (case-insensitive) the file is first converted to
 * .docx with LibreOffice, because the front-end preview plugin cannot
 * display legacy .doc files. The original .doc is deleted afterwards and
 * the converted .docx is the file that gets parsed and reported back.
 *
 * @param string      $path            Path of the Word file to read.
 * @param string      $ends            File type; "doc" triggers the conversion step,
 *                                     any other value loads $path as-is.
 * @param string|null $libreofficePath Path of the soffice executable. When null a
 *                                     default is chosen for the current OS.
 * @return array ['code' => 200, 'content' => array, 'path' => string] on success,
 *               ['code' => 400] when the .doc could not be converted.
 */
public function readWord($path, $ends = 'docx', $libreofficePath = null)
{
    $return = [];

    if (strtolower((string) $ends) === 'doc') {
        if ($libreofficePath === null) {
            // Single quotes keep the backslashes of the Windows path literal.
            $libreofficePath = stripos(PHP_OS, 'WIN') === 0
                ? 'F:\www\LibreOffice\program\soffice.exe'
                : '/usr/bin/soffice';
        }

        // LibreOffice writes "<outdir>/<name without extension>.docx"; derive that
        // name instead of appending "x" so that e.g. "report.DOC" is found as well.
        $extension = pathinfo($path, PATHINFO_EXTENSION);
        $stem = $extension === '' ? $path : substr($path, 0, -(strlen($extension) + 1));
        $docxPath = $stem . '.docx';

        // Quote every argument separately so that paths containing spaces or
        // shell metacharacters cannot break or alter the command.
        $command = escapeshellarg($libreofficePath)
            . ' --headless --convert-to docx --outdir ' . escapeshellarg(dirname($path))
            . ' ' . escapeshellarg($path);

        // exec() captures the converter's console output; system() would echo it
        // straight into the HTTP response.
        $output = [];
        $exitCode = 1;
        exec($command, $output, $exitCode);

        // Check the result through file metadata rather than loading both files
        // into memory.
        clearstatcache(true, $docxPath);
        $converted = $exitCode === 0 && is_file($docxPath) && filesize($docxPath) > 0;

        // The source .doc is removed whether or not the conversion worked.
        if ($docxPath !== $path && is_file($path)) {
            unlink($path);
        }

        if (!$converted) {
            // Drop a partial or empty result, if any, and report the failure.
            if ($docxPath !== $path && is_file($docxPath)) {
                unlink($docxPath);
            }
            $return['code'] = 400;
            return $return;
        }

        $path = $docxPath;
    }

    $return['code'] = 200;
    $phpWord = IOFactory::load($path);
    // Extract the document content section by section.
    $return['content'] = $this->getNodeContent($phpWord);
    $return['path'] = $path;
    return $return;
}
/**
 * Collect the content of every section of a loaded document.
 *
 * Each text-like element (text run, standalone text, standalone image)
 * contributes one string, and each table row contributes one string holding
 * the concatenated content of its cells. Other element types are skipped.
 *
 * @param object $word Loaded document exposing getSections()
 *                     (readWord passes the result of IOFactory::load()).
 * @return array List of strings in document order.
 */
public function getNodeContent($word)
{
    $return = [];

    foreach ($word->getSections() as $section) {
        if (!($section instanceof \PhpOffice\PhpWord\Element\Section)) {
            continue;
        }

        foreach ($section->getElements() as $element) {
            if ($element instanceof \PhpOffice\PhpWord\Element\Table) {
                // One entry per table row.
                foreach ($element->getRows() as $row) {
                    $return[] = $this->getTableNode($row);
                }
            } elseif (
                $element instanceof \PhpOffice\PhpWord\Element\TextRun
                || $element instanceof \PhpOffice\PhpWord\Element\Text
                || $element instanceof \PhpOffice\PhpWord\Element\Image
            ) {
                // Text and Image elements may sit directly in the section rather
                // than inside a TextRun; getTextNode() handles all three kinds,
                // matching what is already done for table cells.
                $return[] = $this->getTextNode($element);
            }
        }
    }

    return $return;
}
/**
 * Turn a single document node into a string.
 *
 * Text nodes yield their text, image nodes yield an inline <img> tag built
 * by pic2text(), and text runs yield the concatenation of their children
 * (processed recursively). Any other node type yields an empty string.
 *
 * @param mixed $node Node to convert.
 * @return string
 */
public function getTextNode($node)
{
    // Plain text.
    if ($node instanceof \PhpOffice\PhpWord\Element\Text) {
        return '' . $node->getText();
    }

    // Embedded image.
    if ($node instanceof \PhpOffice\PhpWord\Element\Image) {
        return '' . $this->pic2text($node);
    }

    // Anything that is not a text run contributes nothing.
    if (!($node instanceof \PhpOffice\PhpWord\Element\TextRun)) {
        return '';
    }

    // Text run: walk its children recursively.
    $collected = '';
    foreach ($node->getElements() as $child) {
        $collected .= $this->getTextNode($child);
    }

    return $collected;
}
/**
 * Turn a table row or table cell into a string.
 *
 * A row yields the concatenation of its cells; a cell yields the
 * concatenation of its elements as converted by getTextNode(). Any other
 * node type yields an empty string.
 *
 * @param mixed $node Row or cell to convert.
 * @return string
 */
public function getTableNode($node)
{
    $isRow = $node instanceof \PhpOffice\PhpWord\Element\Row;

    // Neither a row nor a cell: nothing to extract.
    if (!$isRow && !($node instanceof \PhpOffice\PhpWord\Element\Cell)) {
        return '';
    }

    $collected = '';

    if ($isRow) {
        // Row: descend into each cell.
        foreach ($node->getCells() as $cell) {
            $collected .= $this->getTableNode($cell);
        }

        return $collected;
    }

    // Cell: convert each contained element.
    foreach ($node->getElements() as $child) {
        $collected .= $this->getTextNode($child);
    }

    return $collected;
}
/**
 * Render an image node as an HTML <img> tag with an inline data URI.
 *
 * @param object $node Image node exposing getImageStringData() and getImageType().
 * @return string The <img> tag.
 */
public function pic2text($node)
{
    // Image payload, requested with the base64 flag set.
    $encoded = $node->getImageStringData(true);
    // The image type is used as the media type of the data URI.
    $mediaType = $node->getImageType();

    $dataUri = 'data:' . $mediaType . ';base64,' . $encoded;

    return '<img src="' . $dataUri . '">';
}