PHP代码读取word文档内容并转为txt文档

Casual_1573

已于 2023-04-27 10:13:26 修改

阅读量315

点赞数

分类专栏： PHP 文章标签： php word

于 2023-04-26 14:02:20 首次发布

本文链接：https://blog.csdn.net/qq_36345633/article/details/130384651

版权

PHP 专栏收录该内容

21 篇文章 0 订阅

订阅专栏

方法一

<?php
/**
 * Converts the main body of a DOCX file into a simple HTML string.
 *
 * The class opens the DOCX as a zip archive, loads word/document.xml and
 * word/_rels/document.xml.rels, and walks the document with XMLReader.
 * Paragraphs, headings (styles Heading1..Heading6), basic run formatting,
 * hyperlinks, inline images and tables are translated to HTML.
 *
 * Requires the zip, dom, xmlreader, mbstring and (for images) gd extensions.
 */
class WordPHP
{
	/** @var bool When truthy, readZipPart() and readDocument() echo diagnostic output. */
	private $debug = false;
	/** @var string|null Path of the DOCX file currently being converted. */
	private $file;
	/** @var DOMDocument|null Parsed word/_rels/document.xml.rels. */
	private $rels_xml;
	/** @var DOMDocument|null Parsed word/document.xml. */
	private $doc_xml;
	/** @var array Names of the image entries found in the archive (name => name). */
	private $doc_media = [];
	/** @var string Not read anywhere in this class. */
	private $last = 'none';
	/** @var string Target encoding of the HTML returned by readDocument(). */
	private $encoding = 'UTF-8';
	/** @var string Base directory; extracted images are written to "<tmpDir>/tmp/". */
	private $tmpDir = 'tmp';

	/**
	 * CONSTRUCTOR
	 *
	 * @param bool|null   $debug_   Debug mode or not (null keeps the default: off)
	 * @param string|null $encoding Output encoding (null keeps the default: UTF-8)
	 */
	public function __construct($debug_=null, $encoding=null)
	{
		// Loose comparisons are deliberate: null, false, 0 and '' all keep the defaults.
		if ($debug_ != null) {
			$this->debug = $debug_;
		}
		if ($encoding != null) {
			$this->encoding = $encoding;
		}
		$this->tmpDir = __DIR__;
	}

	/**
	 * Sets the base directory under which images will be stored ("<tmp>/tmp/").
	 *
	 * Public so that callers can actually use it; it was private and never
	 * called, which made the image location impossible to configure.
	 *
	 * @param string $tmp The location
	 * @return void
	 */
	public function setTmpDir($tmp)
	{
		$this->tmpDir = $tmp;
	}

	/**
	 * Parses one XML part of the package into a DOMDocument.
	 *
	 * No encoding is forced: loadXML() honours the encoding declared by the
	 * part itself. formatOutput only makes the debug dump readable; the
	 * traversal code below does not depend on the inserted whitespace.
	 *
	 * @param string $xml      The raw XML of the part
	 * @param string $partName Name of the part, used in the error message
	 * @return DOMDocument
	 * @throws RuntimeException When the XML cannot be parsed
	 */
	private function loadXmlPart($xml, $partName)
	{
		$dom = new DOMDocument();
		$dom->preserveWhiteSpace = false;
		$dom->formatOutput = true;
		if ($dom->loadXML($xml) === false) {
			throw new RuntimeException('invalid XML in ' . $partName);
		}
		return $dom;
	}

	/**
	 * READS the document and relationships parts out of the DOCX archive
	 * and records the names of the embedded images.
	 *
	 * @param string $filename The DOCX file name
	 * @return void
	 * @throws RuntimeException When the file is not a zip archive or has no word/document.xml
	 */
	private function readZipPart($filename)
	{
		$zip = new ZipArchive();
		if ($zip->open($filename) !== true) {
			throw new RuntimeException('non zip file');
		}

		$xml = $zip->getFromName('word/document.xml');
		$xml_rels = $zip->getFromName('word/_rels/document.xml.rels');

		// remember all images if they exist
		$this->doc_media = [];
		for ($i = 0; $i < $zip->numFiles; $i++) {
			$entry = $zip->statIndex($i);
			if ($entry !== false && preg_match('/\S+\.(?:jpe?g|png|gif|bmp)$/i', $entry['name'])) {
				$this->doc_media[$entry['name']] = $entry['name'];
			}
		}
		$zip->close();

		if ($xml === false || $xml === '') {
			throw new RuntimeException('word/document.xml not found in ' . $filename);
		}
		if ($xml_rels === false || $xml_rels === '') {
			// A document without relationships is valid; use an empty relationship list.
			$xml_rels = '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>';
		}

		$this->doc_xml = $this->loadXmlPart($xml, 'word/document.xml');
		$this->rels_xml = $this->loadXmlPart($xml_rels, 'word/_rels/document.xml.rels');

		if ($this->debug) {
			foreach ([$this->doc_xml, $this->rels_xml] as $dom) {
				echo "<textarea style='width:100%; height: 200px;'>";
				// Escaped so the textarea shows the XML source exactly.
				echo htmlspecialchars($dom->saveXML(), ENT_QUOTES, 'UTF-8');
				echo "</textarea>";
			}
		}
	}

	/**
	 * Looks up the Target of a relationship by its Id.
	 *
	 * @param string $relId The relationship Id (e.g. "rId5")
	 * @return string|null The Target attribute, or null when the Id is unknown
	 */
	private function findRelationshipTarget($relId)
	{
		foreach ($this->rels_xml->getElementsByTagName('Relationship') as $rel) {
			if ($rel->getAttribute('Id') === $relId) {
				return $rel->getAttribute('Target');
			}
		}
		return null;
	}

	/**
	 * Tells whether an on/off run property (w:b, w:i, ...) is switched on.
	 * The property is on when w:val is absent or anything but 0/false/off.
	 *
	 * @param XMLReader $reader Reader positioned on the property element
	 * @return bool
	 */
	private function isToggleOn($reader)
	{
		$val = $reader->getAttribute('w:val');
		if ($val === null || $val === '') {
			return true;
		}
		return !in_array(strtolower($val), ['0', 'false', 'off'], true);
	}

	/**
	 * Builds an <img> tag for an extracted image.
	 *
	 * @param string $src Location of the image file
	 * @return string HTML code
	 */
	private function imageTag($src)
	{
		return "<img src='" . htmlspecialchars($src, ENT_QUOTES, 'UTF-8') . "' />";
	}

	/**
	 * CHECKS THE FONT FORMATTING OF A GIVEN ELEMENT
	 * Currently checks and formats: bold, italic, underline, text color,
	 * background color and font family. Text is collected from w:t elements
	 * only, so layout whitespace, deleted text and field codes are left out.
	 * Line breaks become <br>, tabs become a space, drawings become <img>.
	 *
	 * @param XMLReader $xml Reader positioned on a w:r (or w:hyperlink) element
	 * @return string HTML formatted code (a single <span>)
	 */
	private function checkFormating($xml)
	{
		$node = trim($xml->readOuterXML());
		if ($node === '') {
			return '';
		}

		$reader = new XMLReader();
		$reader->XML($node);

		$styles = [];  // CSS property => value; keyed so repeated runs do not duplicate entries
		$content = '';

		while ($reader->read()) {
			if ($reader->nodeType !== XMLReader::ELEMENT) {
				continue;
			}
			switch ($reader->name) {
				case 'w:b':
					if ($this->isToggleOn($reader)) {
						$styles['font-weight'] = 'bold';
					}
					break;
				case 'w:i':
					if ($this->isToggleOn($reader)) {
						$styles['font-style'] = 'italic';
					}
					break;
				case 'w:u':
					if ($reader->getAttribute('w:val') !== 'none') {
						$styles['text-decoration'] = 'underline';
					}
					break;
				case 'w:color':
					$color = (string) $reader->getAttribute('w:val');
					// "auto" and anything that is not RRGGBB cannot be written as #color
					if (preg_match('/^[0-9A-Fa-f]{6}$/', $color)) {
						$styles['color'] = '#' . $color;
					}
					break;
				case 'w:rFonts':
					$font = str_replace(['"', "'", ';'], '', (string) $reader->getAttribute('w:ascii'));
					if ($font !== '') {
						$styles['font-family'] = '"' . $font . '"';
					}
					break;
				case 'w:shd':
					$fill = (string) $reader->getAttribute('w:fill');
					if ($reader->getAttribute('w:val') != 'clear'
						&& $fill != '000000'
						&& preg_match('/^[0-9A-Fa-f]{6}$/', $fill)
					) {
						$styles['background-color'] = '#' . $fill;
					}
					break;
				case 'w:drawing':
					$src = $this->checkImageFormating($reader);
					if ($src !== null) {
						$content .= $this->imageTag($src);
					}
					break;
				case 'w:br':
				case 'w:cr':
					$content .= '<br>';
					break;
				case 'w:tab':
					$content .= ' ';
					break;
				case 'w:t':
					$content .= htmlentities($reader->readString(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
					break;
			}
		}
		$reader->close();

		$declarations = [];
		foreach ($styles as $property => $value) {
			$declarations[] = $property . ': ' . $value;
		}
		// Declarations are separated by ';' (a ',' would invalidate the whole style).
		$style = htmlspecialchars(implode('; ', $declarations), ENT_QUOTES, 'UTF-8');

		return "<span style='" . $style . "'>" . $content . "</span>";
	}

	/**
	 * CHECKS THE ELEMENT FOR LIST PROPERTIES
	 * Currently under development: w:numId 1 is treated as an ordered list
	 * and w:numId 2 as an unordered list; numbering.xml is not consulted.
	 *
	 * @param XMLReader $xml Reader positioned on a w:pPr element
	 * @return array|null ['open' => ..., 'close' => ...] or null when the paragraph is no list item
	 */
	private function getListFormating($xml)
	{
		$node = trim($xml->readOuterXML());
		if ($node === '') {
			return null;
		}

		$reader = new XMLReader();
		$reader->XML($node);
		$ret = null;

		while ($reader->read()) {
			if ($reader->nodeType !== XMLReader::ELEMENT || $reader->name !== 'w:numId') {
				continue;
			}
			switch ((string) $reader->getAttribute('w:val')) {
				case '1':
					$ret = ['open' => '<ol><li>', 'close' => '</li></ol>'];
					break;
				case '2':
					$ret = ['open' => '<ul><li>', 'close' => '</li></ul>'];
					break;
			}
		}
		$reader->close();

		return $ret;
	}

	/**
	 * CHECKS IF THERE IS AN IMAGE PRESENT
	 * Resolves the a:blip relationship of a drawing, extracts the image from
	 * the archive and stores it on disk.
	 *
	 * @param XMLReader $xml Reader positioned on a w:drawing element
	 * @return string|null The location of the image, or null when none could be extracted
	 */
	private function checkImageFormating($xml)
	{
		$content = trim($xml->readInnerXml());
		if ($content === '') {
			return null;
		}

		$relId = null;
		$reader = new XMLReader();
		$reader->XML($content);
		while ($reader->read()) {
			if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'a:blip') {
				$relId = $reader->getAttribute('r:embed');
				break;
			}
		}
		$reader->close();

		if ($relId === null || $relId === '') {
			return null;
		}

		// image id found, get the image location
		$target = $this->findRelationshipTarget($relId);
		if ($target === null || $target === '') {
			return null;
		}
		// A leading "/" marks a package-absolute part name; otherwise the
		// target is relative to the word/ folder.
		$link = ($target[0] === '/') ? ltrim($target, '/') : 'word/' . $target;

		$zip = new ZipArchive();
		if ($zip->open($this->file) !== true) {
			return null;
		}
		$data = $zip->getFromName($link);
		$zip->close();

		if ($data === false) {
			return null;
		}
		return $this->createImage($data, $relId, $link);
	}

	/**
	 * Creates an image in the filesystem
	 *
	 * @param string $image The raw image bytes
	 * @param string $relId The image relationship Id (used as file name)
	 * @param string $name  The image name inside the archive (decides the format)
	 * @return string|null Path of the written file, or null when the format is
	 *                     unsupported, the data cannot be decoded or the file cannot be written
	 */
	private function createImage($image, $relId, $name)
	{
		$ext = strtolower(pathinfo($name, PATHINFO_EXTENSION));
		if (!in_array($ext, ['png', 'bmp', 'gif', 'jpeg', 'jpg'], true)) {
			return null;
		}

		$im = imagecreatefromstring($image);
		if ($im === false) {
			return null;
		}

		$dir = $this->tmpDir . '/tmp';
		if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
			return null;
		}

		// The id comes from the document; keep it from escaping the directory.
		$safeId = preg_replace('/[^A-Za-z0-9_-]/', '_', (string) $relId);
		$fname = $dir . '/' . $safeId . '.' . $ext;

		switch ($ext) {
			case 'png':
				imagesavealpha($im, true); // keep transparency when re-encoding
				$written = imagepng($im, $fname);
				break;
			case 'bmp':
				$written = imagebmp($im, $fname);
				break;
			case 'gif':
				$written = imagegif($im, $fname);
				break;
			default: // jpeg, jpg
				$written = imagejpeg($im, $fname);
				break;
		}

		return $written ? $fname : null;
	}

	/**
	 * CHECKS IF ELEMENT IS AN HYPERLINK
	 *
	 * @param XMLReader $xml Reader positioned on a w:hyperlink element
	 * @return array HTML open and closing tag; both are empty strings when the
	 *               element has no resolvable r:id (e.g. an internal anchor)
	 */
	private function getHyperlink($xml)
	{
		$ret = ['open' => '', 'close' => ''];

		// getAttribute() leaves the reader on the element, unlike moveToNextAttribute().
		$relId = $xml->getAttribute('r:id');
		if ($relId === null || $relId === '') {
			return $ret;
		}

		$link = $this->findRelationshipTarget($relId);
		if ($link === null || $link === '') {
			return $ret;
		}

		$ret['open'] = "<a href='" . htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . "' target='_blank'>";
		$ret['close'] = "</a>";

		return $ret;
	}

	/**
	 * PROCESS TABLE CONTENT
	 * Emits one <tr> per w:tr and one <td> per w:tc. Content of nested
	 * tables is flattened into the cell that contains them.
	 *
	 * @param XMLReader $xml Reader over the XML of a w:tbl element
	 * @return string The HTML code of the table
	 */
	private function checkTableFormating($xml)
	{
		$table = '<table style="border-collapse:collapse;"><tbody>';

		$more = $xml->read();
		while ($more) {
			if ($xml->nodeType === XMLReader::ELEMENT && $xml->name === 'w:tr') { //table row
				$row = new XMLReader();
				$row->XML(trim($xml->readOuterXML()));
				$cells = '';

				$rowMore = $row->read();
				while ($rowMore) {
					if ($row->nodeType === XMLReader::ELEMENT && $row->name === 'w:tc') { //table cell
						$cells .= $this->processTableRow(trim($row->readOuterXML()));
						// next() already lands on the following node: test it without read()
						$rowMore = $row->next();
					} else {
						$rowMore = $row->read();
					}
				}
				$row->close();

				$table .= '<tr>' . $cells . '</tr>';
				$more = $xml->next();
			} else {
				$more = $xml->read();
			}
		}

		$table .= "</tbody></table>";
		return $table;
	}

	/**
	 * PROCESS THE TABLE CELL STYLE
	 * Translates the left/top/bottom/right entries of w:tcBorders to CSS.
	 * w:sz is expressed in eighths of a point.
	 *
	 * @param string $content The XML of a w:tcPr element
	 * @return string CSS declarations (possibly empty)
	 */
	private function processTableStyle($content)
	{
		if ($content === '') {
			return '';
		}

		$tc = new XMLReader();
		$tc->XML($content);
		$style = '';

		while ($tc->read()) {
			if ($tc->nodeType !== XMLReader::ELEMENT || $tc->name !== 'w:tcBorders') {
				continue;
			}

			$bordersDepth = $tc->depth;
			while ($tc->read() && $tc->depth > $bordersDepth) {
				if ($tc->nodeType !== XMLReader::ELEMENT || $tc->depth !== $bordersDepth + 1) {
					continue;
				}
				if (!in_array($tc->name, ['w:left', 'w:top', 'w:bottom', 'w:right'], true)) {
					continue;
				}

				$side = substr($tc->name, 2);
				$line = $this->convertLine($tc->getAttribute('w:val'));
				if ($line === 'none') {
					$style .= 'border-' . $side . ':none;';
					continue;
				}

				$declaration = 'border-' . $side . ':';
				$size = (int) $tc->getAttribute('w:sz');
				if ($size > 0) {
					$declaration .= number_format($size / 8, 2, '.', '') . 'pt ';
				}
				$declaration .= $line;
				$color = (string) $tc->getAttribute('w:color');
				if (preg_match('/^[0-9A-Fa-f]{6}$/', $color)) {
					$declaration .= ' #' . $color;
				}
				$style .= $declaration . ';';
			}
			break;
		}
		$tc->close();

		return $style;
	}

	/**
	 * Maps a WordprocessingML border style to the closest CSS border-style.
	 *
	 * @param string|null $in The w:val of a border element
	 * @return string One of dotted, dashed, double, none, solid
	 */
	private function convertLine($in)
	{
		if ($in === 'dotted')
			return "dotted";

		if (in_array($in, ['dotDash','dotdotDash','dashDotStroked','dashed','dashSmallGap'], true))
			return "dashed";

		if (in_array($in, ['double','triple','threeDEmboss','threeDEngrave','thick'], true))
			return "double";

		if (in_array($in, ['nil','none'], true))
			return "none";

		return "solid";
	}

	/**
	 * PROCESS ONE TABLE CELL
	 * (The method keeps its historical name; it renders a single w:tc.)
	 *
	 * @param string $content The XML of a w:tc element
	 * @return string The HTML code of the cell (exactly one <td>)
	 */
	private function processTableRow($content)
	{
		if ($content === '') {
			return '<td></td>';
		}

		$tc = new XMLReader();
		$tc->XML($content);
		$style = '';
		$ct = '';
		$paragraphs = 0;

		$more = $tc->read();
		while ($more) {
			if ($tc->nodeType !== XMLReader::ELEMENT) {
				$more = $tc->read();
			} elseif ($tc->name === 'w:tcPr' && $tc->depth === 1) {
				// only the cell's own properties, not those of nested tables
				$style = $this->processTableStyle(trim($tc->readOuterXML()));
				$more = $tc->next();
			} elseif ($tc->name === 'w:r') {
				$ct .= $this->checkFormating($tc);
				$more = $tc->next();
			} else {
				if ($tc->name === 'w:p') {
					// separate the paragraphs of a cell
					if ($paragraphs > 0 && $ct !== '') {
						$ct .= '<br>';
					}
					$paragraphs++;
				}
				$more = $tc->read();
			}
		}
		$tc->close();

		$attr = ($style === '') ? '' : ' style="' . htmlspecialchars($style, ENT_QUOTES, 'UTF-8') . '"';
		return '<td' . $attr . '>' . $ct . '</td>';
	}

	/**
	 * Renders one paragraph as <p> or, for the styles Heading1..Heading6, as <h1>..<h6>.
	 *
	 * @param string $p The XML of a w:p element
	 * @return string HTML code
	 */
	private function renderParagraph($p)
	{
		if (trim($p) === '') {
			return '';
		}

		// The level is computed per paragraph so a heading never leaks into
		// the paragraphs that follow it.
		$level = 0;
		if (preg_match('/<w:pStyle w:val="Heading([1-6])"/', $p, $matches)) {
			$level = (int) $matches[1];
		}
		$tag = ($level > 0) ? 'h' . $level : 'p';

		$paragraph = new XMLReader();
		$paragraph->XML($p);

		$html = '<' . $tag . '>';
		$list_format = null;

		$more = $paragraph->read();
		while ($more) {
			if ($paragraph->nodeType !== XMLReader::ELEMENT) {
				$more = $paragraph->read();
			} elseif ($paragraph->name === 'w:r') {
				$run = $this->checkFormating($paragraph);
				if ($list_format !== null) {
					// only the first run after the list properties is wrapped
					$run = $list_format['open'] . $run . $list_format['close'];
					$list_format = null;
				}
				$html .= $run;
				$more = $paragraph->next();
			} elseif ($paragraph->name === 'w:pPr') { //lists
				$list_format = $this->getListFormating($paragraph);
				$more = $paragraph->next();
			} elseif ($paragraph->name === 'w:drawing') { //images outside of a run
				$src = $this->checkImageFormating($paragraph);
				if ($src !== null) {
					$html .= $this->imageTag($src);
				}
				$more = $paragraph->next();
			} elseif ($paragraph->name === 'w:hyperlink') {
				$hyperlink = $this->getHyperlink($paragraph);
				$html .= $hyperlink['open'];
				$html .= $this->checkFormating($paragraph);
				$html .= $hyperlink['close'];
				$more = $paragraph->next();
			} else {
				$more = $paragraph->read();
			}
		}
		$paragraph->close();

		return $html . '</' . $tag . '>';
	}

	/**
	 * READS THE GIVEN DOCX FILE INTO HTML FORMAT
	 *
	 * @param string $filename The DOCX file name
	 * @return string HTML code, converted to the configured encoding
	 * @throws RuntimeException When the file cannot be opened or parsed
	 */
	public function readDocument($filename)
	{
		$this->file = $filename;
		$this->readZipPart($filename);

		$reader = new XMLReader();
		$reader->XML($this->doc_xml->saveXML());

		$text = '';

		// loop through docx xml dom
		$more = $reader->read();
		while ($more) {
			if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'w:p') {
				$text .= $this->renderParagraph($reader->readOuterXML());
				// skip the subtree: it has been rendered from its own reader
				$more = $reader->next();
			} elseif ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'w:tbl') { //tables
				$table = new XMLReader();
				$table->XML($reader->readOuterXML());
				$text .= $this->checkTableFormating($table);
				$table->close();
				$more = $reader->next();
			} else {
				$more = $reader->read();
			}
		}
		$reader->close();

		// XMLReader always hands out UTF-8, so name the source encoding explicitly.
		$html = mb_convert_encoding($text, $this->encoding, 'UTF-8');

		if ($this->debug) {
			echo "<div style='width:100%; height: 200px;'>";
			echo $html;
			echo "</div>";
		}
		return $html;
	}
}

调用（先将上面的类保存为 class.wordphp.php，与下面的脚本放在同一目录）


<?php
// Converts ces.docx to HTML (ces.html) and then to plain text (word.txt).
require_once 'class.wordphp.php';

// Hide notices and warnings raised while parsing, but keep fatal errors
// (including uncaught exceptions) visible; error_reporting(0) would make
// every failure silent.
error_reporting(E_ERROR | E_PARSE);

$rt = new WordPHP();
$text = $rt->readDocument('ces.docx');

// Keep the intermediate HTML on disk.
if (file_put_contents('ces.html', $text) === false) {
    throw new RuntimeException('cannot write ces.html');
}

// Strip script/iframe/style blocks including their content, then every remaining tag.
$line = preg_replace(
    [
        "@<script(.*?)</script>@is",
        "@<iframe(.*?)</iframe>@is",
        "@<style(.*?)</style>@is",
        "@<(.*?)>@is",
    ],
    "",
    $text
);

// Decode entities exactly once. A second pass (htmlspecialchars_decode) would
// turn text that literally contains "&lt;" or "&amp;" into "<" or "&".
$line = html_entity_decode($line, ENT_QUOTES, 'UTF-8');

$line = str_replace(["\n", "\t", "\r"], "", $line); // remove line breaks and tabs
$line = preg_replace('/\s(?=\s)/', '', $line);      // collapse runs of whitespace to one character

echo $line;

if (file_put_contents('word.txt', $line) === false) {
    throw new RuntimeException('cannot write word.txt');
}

方法二

<?php
// 引入PHPWord库
require_once 'vendor/autoload.php';

/**
 * Extracts the plain text of a Word document with PHPWord.
 *
 * Text is collected from Text elements that sit directly in a section and
 * from Text elements inside TextRun containers; other element types
 * (tables, links, images, ...) are ignored. Despite the name, the result
 * is plain text, not HTML.
 *
 * The text is returned as PHPWord delivers it. It is no longer passed
 * through GBK and back, because that round trip replaced every character
 * GBK cannot represent.
 *
 * @param string $source Path of the document to read
 * @return string The concatenated text (presumably UTF-8 — confirm against the PHPWord version in use)
 */
function readWordToHtml($source)
{
    $phpWord = \PhpOffice\PhpWord\IOFactory::load($source);
    $text = '';

    foreach ($phpWord->getSections() as $section) {
        foreach ($section->getElements() as $element) {
            if ($element instanceof \PhpOffice\PhpWord\Element\Text) {
                // some reader versions add single-run paragraphs directly to the section
                $text .= (string) $element->getText();
                continue;
            }
            if (!($element instanceof \PhpOffice\PhpWord\Element\TextRun)) {
                continue;
            }
            foreach ($element->getElements() as $child) {
                if ($child instanceof \PhpOffice\PhpWord\Element\Text) {
                    $text .= (string) $child->getText();
                }
            }
        }
    }

    return $text;
}

    // Option 1: read directly from a file path
    $file = 'ces.docx';
    $text = readWordToHtml($file);

    // Option 2: the document is only available as binary data (a file stream)
    $word_binary_data = file_get_contents($file);   // the binary data
    if ($word_binary_data === false) {
        throw new RuntimeException('cannot read ' . $file);
    }

    $temp_word_file = tempnam(sys_get_temp_dir(), 'word');     // temporary file
    if ($temp_word_file === false) {
        throw new RuntimeException('cannot create a temporary file');
    }

    try {
        if (file_put_contents($temp_word_file, $word_binary_data) === false) {
            throw new RuntimeException('cannot write ' . $temp_word_file);
        }
        $text = readWordToHtml($temp_word_file);
    } finally {
        unlink($temp_word_file);    // delete the temporary file even when parsing fails
    }

    echo $text;