PHP代码读取word文档内容并转为txt文档

方法一

<?php
/**
 * Minimal .docx (WordprocessingML) to HTML converter.
 *
 * The class opens the docx zip archive, loads word/document.xml and
 * word/_rels/document.xml.rels, and walks the document with XMLReader,
 * emitting HTML for paragraphs, headings, simple lists, hyperlinks,
 * embedded images and tables.
 *
 * Required PHP extensions: zip, dom, xmlreader, mbstring.
 */
class WordPHP
{
	/** @var bool When true, the raw XML parts and the generated HTML are echoed. */
	private $debug = false;
	/** @var string Path of the docx file currently being read. */
	private $file;
	/** @var DOMDocument Parsed word/_rels/document.xml.rels. */
	private $rels_xml;
	/** @var DOMDocument Parsed word/document.xml. */
	private $doc_xml;
	/** @var array Names of the image entries found in the archive (name => name). */
	private $doc_media = [];
	/** @var string Encoding of the HTML returned by readDocument(). */
	private $encoding = 'UTF-8';
	/** @var string Base directory; extracted images are written to "<tmpDir>/tmp". */
	private $tmpDir = 'tmp';

	/**
	 * CONSTRUCTOR
	 *
	 * @param bool|null   $debug_   Debug mode or not (null keeps the default: off)
	 * @param string|null $encoding Output encoding (null keeps the default: UTF-8)
	 */
	public function __construct($debug_=null, $encoding=null)
	{
		if ($debug_ !== null) {
			$this->debug = (bool) $debug_;
		}
		if ($encoding !== null && $encoding !== '') {
			$this->encoding = $encoding;
		}
		$this->tmpDir = __DIR__;
	}

	/**
	 * Sets the base directory under which extracted images are stored
	 * (images go to "<dir>/tmp/<relationship id>.<ext>").
	 *
	 * Made public: as a private method nothing could ever call it.
	 *
	 * @param string $tmp The location
	 * @return void
	 */
	public function setTmpDir($tmp)
	{
		$trimmed = rtrim((string) $tmp, '/\\');
		// Keep "/" usable as a base directory.
		$this->tmpDir = ($trimmed === '' && (string) $tmp !== '') ? (string) $tmp : $trimmed;
	}

	/**
	 * Escapes a value for use in HTML text or inside a quoted attribute.
	 *
	 * Flags and charset are explicit so the result does not depend on the
	 * PHP version's htmlentities() defaults.
	 *
	 * @param string $value Raw text
	 * @return string Escaped text
	 */
	private function escapeHtml($value)
	{
		return htmlentities((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
	}

	/**
	 * Parses one XML part of the package into a DOMDocument.
	 *
	 * @param string|false $xml  The raw XML
	 * @param string       $part Name of the part (used in the error message)
	 * @return DOMDocument
	 * @throws RuntimeException When the part is empty or not well-formed
	 */
	private function loadXmlPart($xml, $part)
	{
		$dom = new DOMDocument();
		// Collect libxml errors internally instead of emitting PHP warnings.
		$previous = libxml_use_internal_errors(true);
		$loaded = is_string($xml) && $xml !== '' && $dom->loadXML($xml, LIBXML_NONET);
		libxml_clear_errors();
		libxml_use_internal_errors($previous);

		if (!$loaded) {
			throw new RuntimeException("Unable to parse '{$part}' as XML");
		}
		return $dom;
	}

	/**
	 * READS The Document and Relationships parts out of the docx archive
	 *
	 * A missing relationships part is tolerated (the document then simply has
	 * no hyperlinks/images that can be resolved); a missing document part is not.
	 *
	 * @param String $filename The filename
	 * @return void
	 * @throws RuntimeException When the file is not a readable docx archive
	 */
	private function readZipPart($filename)
	{
		$_xml = 'word/document.xml';
		$_xml_rels = 'word/_rels/document.xml.rels';

		$zip = new ZipArchive();
		if ($zip->open($filename) !== true) {
			throw new RuntimeException("Unable to open '{$filename}' as a zip (docx) archive");
		}

		$xml = $zip->getFromName($_xml);
		$xml_rels = $zip->getFromName($_xml_rels);

		// remember all images if they exist
		$this->doc_media = [];
		for ($i = 0; $i < $zip->numFiles; $i++) {
			$name = $zip->getNameIndex($i);
			if ($name !== false && preg_match('/\S\.(?:jpe?g|png|gif|bmp)$/i', $name)) {
				$this->doc_media[$name] = $name;
			}
		}
		$zip->close();

		if ($xml === false) {
			throw new RuntimeException("'{$filename}' does not contain {$_xml}");
		}
		if ($xml_rels === false || $xml_rels === '') {
			$xml_rels = '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>';
		}

		$this->doc_xml = $this->loadXmlPart($xml, $_xml);
		$this->rels_xml = $this->loadXmlPart($xml_rels, $_xml_rels);

		if ($this->debug) {
			echo "<textarea style='width:100%; height: 200px;'>";
			echo htmlspecialchars($this->doc_xml->saveXML(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
			echo "</textarea>";
			echo "<textarea style='width:100%; height: 200px;'>";
			echo htmlspecialchars($this->rels_xml->saveXML(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
			echo "</textarea>";
		}
	}

	/**
	 * Looks up a relationship of the document part by its Id.
	 *
	 * @param string|null $id The relationship Id (e.g. "rId5")
	 * @return array|null ['target' => string, 'external' => bool] or null when not found
	 */
	private function findRelationship($id)
	{
		if ($id === null || $id === '' || !($this->rels_xml instanceof DOMDocument)) {
			return null;
		}
		foreach ($this->rels_xml->getElementsByTagName('Relationship') as $rel) {
			if ($rel->getAttribute('Id') === $id) {
				return [
					'target' => $rel->getAttribute('Target'),
					'external' => strcasecmp($rel->getAttribute('TargetMode'), 'External') === 0,
				];
			}
		}
		return null;
	}

	/**
	 * Tells whether a toggle property (w:b, w:i) on the current element is on.
	 * The property is on unless its w:val attribute explicitly switches it off.
	 *
	 * @param XMLReader $reader Reader positioned on the property element
	 * @return bool
	 */
	private function isToggleOn($reader)
	{
		$val = $reader->getAttribute('w:val');
		if ($val === null || $val === '') {
			return true;
		}
		return !in_array(strtolower($val), ['0', 'false', 'off'], true);
	}

	/**
	 * CHECKS THE FONT FORMATTING OF A GIVEN ELEMENT
	 * Currently checks and formats: bold, italic, underline, text color,
	 * background color and font family. Line breaks (w:br) and embedded
	 * images (w:drawing) are rendered in document order.
	 *
	 * Only text inside w:t elements is emitted, so field codes and the
	 * numeric contents of drawing elements do not leak into the output.
	 * The cursor of the given reader is not moved.
	 *
	 * @param XMLReader $xml Reader positioned on a w:r (or w:hyperlink) element
	 * @return String HTML formatted code
	 */
	private function checkFormating($xml)
	{
		$node = trim($xml->readOuterXml());
		if ($node === '') {
			return '';
		}

		$reader = new XMLReader();
		$reader->XML($node);

		$styles = [];
		$content = '';
		$inText = false;
		$textTypes = [
			XMLReader::TEXT,
			XMLReader::CDATA,
			XMLReader::WHITESPACE,
			XMLReader::SIGNIFICANT_WHITESPACE,
		];

		$more = $reader->read();
		while ($more) {
			$skipSubtree = false;
			$type = $reader->nodeType;

			if ($type === XMLReader::ELEMENT) {
				switch ($reader->name) {
					case 'w:b':
						if ($this->isToggleOn($reader)) {
							$styles['font-weight'] = 'bold';
						}
						break;
					case 'w:i':
						if ($this->isToggleOn($reader)) {
							$styles['font-style'] = 'italic';
						}
						break;
					case 'w:u':
						if ($reader->getAttribute('w:val') !== 'none') {
							$styles['text-decoration'] = 'underline';
						}
						break;
					case 'w:color':
						$color = (string) $reader->getAttribute('w:val');
						// "auto" and other non-hex values cannot be expressed as #rrggbb
						if (preg_match('/^[0-9A-Fa-f]{6}$/', $color)) {
							$styles['color'] = '#'.$color;
						}
						break;
					case 'w:rFonts':
						$font = (string) preg_replace('/[^\p{L}\p{N} _\-]/u', '', (string) $reader->getAttribute('w:ascii'));
						if ($font !== '') {
							$styles['font-family'] = "'".$font."'";
						}
						break;
					case 'w:shd':
						$fill = (string) $reader->getAttribute('w:fill');
						if ($reader->getAttribute('w:val') != 'clear' && $fill != '000000'
							&& preg_match('/^[0-9A-Fa-f]{6}$/', $fill)) {
							$styles['background-color'] = '#'.$fill;
						}
						break;
					case 'w:br':
						$content .= '<br>';
						break;
					case 'w:t':
						$inText = !$reader->isEmptyElement;
						break;
					case 'w:drawing':
						$src = $this->checkImageFormating($reader);
						if ($src !== null) {
							$content .= '<img src="'.$this->escapeHtml($src).'" alt="" />';
						}
						// nothing inside a drawing is run text
						$skipSubtree = true;
						break;
				}
			} elseif ($type === XMLReader::END_ELEMENT) {
				if ($reader->name === 'w:t') {
					$inText = false;
				}
			} elseif ($inText && in_array($type, $textTypes, true)) {
				$content .= $this->escapeHtml($reader->value);
			}

			// next() already lands on the node after the subtree, so it must
			// not be followed by an extra read().
			$more = $skipSubtree ? $reader->next() : $reader->read();
		}
		$reader->close();

		$css = [];
		foreach ($styles as $property => $value) {
			$css[] = $property.': '.$value;
		}
		$open = $css ? '<span style="'.implode('; ', $css).'">' : '<span>';

		return $open.$content.'</span>';
	}

	/**
	 * CHECKS THE ELEMENT FOR LIST NUMBERING
	 * Currently under development: numbering id 1 is treated as an ordered
	 * list and id 2 as an unordered list; other ids are not treated as lists.
	 *
	 * @param XMLReader $xml Reader positioned on a w:pPr element
	 * @return array|null ['open' => string, 'close' => string], or null when
	 *                    the paragraph is not a recognised list item
	 */
	private function getListFormating($xml)
	{
		$node = trim($xml->readOuterXml());
		if ($node === '') {
			return null;
		}

		$reader = new XMLReader();
		$reader->XML($node);
		$ret = null;

		while ($reader->read()) {
			if ($reader->nodeType !== XMLReader::ELEMENT || $reader->name !== 'w:numId') {
				continue;
			}
			switch ((string) $reader->getAttribute('w:val')) {
				case '1':
					$ret = ['open' => '<ol><li>', 'close' => '</li></ol>'];
					break;
				case '2':
					$ret = ['open' => '<ul><li>', 'close' => '</li></ul>'];
					break;
			}
		}
		$reader->close();

		return $ret;
	}

	/**
	 * CHECKS IF THERE IS AN IMAGE PRESENT
	 * Finds the first a:blip in the drawing, resolves its r:embed relationship
	 * and extracts the referenced archive entry to the file system.
	 *
	 * @param XMLReader $xml Reader positioned on a w:drawing element
	 * @return String|null The location of the extracted image, or null when
	 *                     there is no embedded image that can be extracted
	 */
	private function checkImageFormating($xml)
	{
		$content = trim($xml->readOuterXml());
		if ($content === '') {
			return null;
		}

		$reader = new XMLReader();
		$reader->XML($content);
		$relId = null;
		while ($reader->read()) {
			if ($reader->nodeType === XMLReader::ELEMENT && $reader->name === 'a:blip') {
				$relId = $reader->getAttribute('r:embed');
				break;
			}
		}
		$reader->close();

		$rel = $this->findRelationship($relId);
		if ($rel === null || $rel['external'] || $rel['target'] === '') {
			return null;
		}

		// Targets are relative to the word/ folder unless they start with "/".
		$target = $rel['target'];
		$entry = ($target[0] === '/') ? substr($target, 1) : 'word/'.$target;

		$zip = new ZipArchive();
		if ($zip->open($this->file) !== true) {
			return null;
		}
		$bytes = $zip->getFromName($entry);
		$zip->close();

		if ($bytes === false || $bytes === '') {
			return null;
		}
		return $this->createImage($bytes, $relId, $entry);
	}

	/**
	 * Creates an image in the filesystem
	 *
	 * The bytes are written unchanged (no decode/re-encode), so transparency
	 * and JPEG quality are preserved.
	 *
	 * @param string $image The raw image bytes
	 * @param string $relId The image relationship Id (used as file name)
	 * @param string $name The image name inside the archive (gives the extension)
	 * @return string|null Path of the written file, or null when the type is
	 *                     unsupported or the file could not be written
	 */
	private function createImage($image, $relId, $name)
	{
		$ext = strtolower((string) pathinfo($name, PATHINFO_EXTENSION));
		if (!in_array($ext, ['png', 'bmp', 'gif', 'jpeg', 'jpg'], true)) {
			return null;
		}

		$dir = $this->tmpDir.'/tmp';
		if (!is_dir($dir) && !mkdir($dir, 0775, true) && !is_dir($dir)) {
			return null;
		}

		// The id comes from the document; never let it steer the path.
		$safeId = preg_replace('/[^A-Za-z0-9_\-]/', '_', (string) $relId);
		$fname = $dir.'/'.$safeId.'.'.$ext;

		if (file_put_contents($fname, $image) === false) {
			return null;
		}
		return $fname;
	}

	/**
	 * CHECKS IF ELEMENT IS AN HYPERLINK
	 *
	 * The cursor of the given reader is not moved, so the caller can still
	 * read the element's content afterwards.
	 *
	 * @param XMLReader $xml Reader positioned on a w:hyperlink element
	 * @return Array With HTML open and closing tag definition; both are empty
	 *               strings when the link has no usable target
	 */
	private function getHyperlink($xml)
	{
		$ret = ['open' => '', 'close' => ''];

		$rel = $this->findRelationship($xml->getAttribute('r:id'));
		if ($rel === null) {
			return $ret;
		}

		$link = trim($rel['target']);
		// Script-capable schemes from an untrusted document must not become links.
		if ($link === '' || preg_match('/^(?:javascript|vbscript|data):/i', $link)) {
			return $ret;
		}

		$ret['open'] = "<a href='".$this->escapeHtml($link)."' target='_blank' rel='noopener noreferrer'>";
		$ret['close'] = "</a>";

		return $ret;
	}

	/**
	 * PROCESS TABLE CONTENT
	 *
	 * @param XMLReader $xml Reader loaded with the XML of one w:tbl element
	 * @return String The HTML code of the table
	 */
	private function checkTableFormating($xml)
	{
		$table = "<table><tbody>";

		$more = $xml->read();
		while ($more) {
			$skipSubtree = false;
			if ($xml->nodeType === XMLReader::ELEMENT && $xml->name === 'w:tr') { //table row
				$table .= $this->processTableRow(trim($xml->readOuterXml()));
				$skipSubtree = true;
			}
			$more = $skipSubtree ? $xml->next() : $xml->read();
		}

		$table .= "</tbody></table>";
		return $table;
	}

	/**
	 * Renders the table whose XML is given (used for top-level and nested tables).
	 *
	 * @param string $content The XML of a w:tbl element
	 * @return String The HTML code of the table
	 */
	private function renderTable($content)
	{
		$content = trim($content);
		if ($content === '') {
			return '';
		}
		$reader = new XMLReader();
		$reader->XML($content);
		$html = $this->checkTableFormating($reader);
		$reader->close();
		return $html;
	}

	/**
	 * PROCESS THE TABLE CELL STYLE
	 * Translates the cell borders (w:tcBorders) into CSS border declarations.
	 * Border widths (w:sz) are given in eighths of a point.
	 *
	 * @param string $content The XML of a w:tcPr element
	 * @return String The CSS declarations for the cell
	 */
	private function processTableStyle($content)
	{
		$style = "border-collapse:collapse;";
		if (trim($content) === '') {
			return $style;
		}

		$sides = [
			'left' => 'left', 'start' => 'left',
			'right' => 'right', 'end' => 'right',
			'top' => 'top', 'bottom' => 'bottom',
		];

		$tc = new XMLReader();
		$tc->XML($content);
		$inBorders = false;

		while ($tc->read()) {
			if ($tc->name === 'w:tcBorders') {
				// true on the opening tag, false again on the closing tag
				$inBorders = ($tc->nodeType === XMLReader::ELEMENT && !$tc->isEmptyElement);
				continue;
			}
			if (!$inBorders || $tc->nodeType !== XMLReader::ELEMENT || !isset($sides[$tc->localName])) {
				continue;
			}

			$css = ' border-'.$sides[$tc->localName].':';
			$line = $this->convertLine($tc->getAttribute('w:val'));
			if ($line === 'none') {
				$style .= $css.'none;';
				continue;
			}

			$size = (int) $tc->getAttribute('w:sz');
			$width = '';
			if ($size > 0) {
				$width = rtrim(rtrim(number_format($size / 8, 2, '.', ''), '0'), '.').'pt ';
			}
			$color = (string) $tc->getAttribute('w:color');
			$color = preg_match('/^[0-9A-Fa-f]{6}$/', $color) ? ' #'.$color : '';

			$style .= $css.$width.$line.$color.';';
		}
		$tc->close();

		return $style;
	}

	/**
	 * Maps a WordprocessingML border style name to a CSS border-style value.
	 *
	 * @param string|null $in The w:val of a border element
	 * @return string One of: dotted, dashed, double, none, solid
	 */
	private function convertLine($in)
	{
		if ($in === 'dotted') {
			return "dotted";
		}
		if (in_array($in, ['dotDash','dotdotDash','dotDotDash','dashDotStroked','dashed','dashSmallGap'], true)) {
			return "dashed";
		}
		if (in_array($in, ['double','triple','threeDEmboss','threeDEngrave','thick'], true)) {
			return "double";
		}
		if (in_array($in, ['nil','none'], true)) {
			return "none";
		}
		return "solid";
	}

	/**
	 * PROCESS THE TABLE ROW
	 *
	 * @param string $content The XML of a w:tr element
	 * @return String The HTML code of the row (one td per w:tc)
	 */
	private function processTableRow($content)
	{
		if ($content === '') {
			return '';
		}

		$tr = new XMLReader();
		$tr->XML($content);
		$cells = "";

		$more = $tr->read();
		while ($more) {
			$skipSubtree = false;
			if ($tr->nodeType === XMLReader::ELEMENT && $tr->name === 'w:tc') { //table cell
				$cells .= $this->processTableCell(trim($tr->readOuterXml()));
				$skipSubtree = true;
			}
			$more = $skipSubtree ? $tr->next() : $tr->read();
		}
		$tr->close();

		return '<tr>'.$cells.'</tr>';
	}

	/**
	 * PROCESS ONE TABLE CELL
	 * The cell's own borders style the td; its paragraphs and nested tables
	 * become the td content.
	 *
	 * @param string $content The XML of a w:tc element
	 * @return String The HTML code of the cell
	 */
	private function processTableCell($content)
	{
		if ($content === '') {
			return '';
		}

		$tc = new XMLReader();
		$tc->XML($content);
		$style = "";
		$inner = "";

		$more = $tc->read();
		while ($more) {
			$skipSubtree = false;
			if ($tc->nodeType === XMLReader::ELEMENT) {
				switch ($tc->name) {
					case 'w:tcPr':
						$style = $this->processTableStyle(trim($tc->readOuterXml()));
						$skipSubtree = true;
						break;
					case 'w:p':
						$inner .= $this->renderParagraph($tc->readOuterXml());
						$skipSubtree = true;
						break;
					case 'w:tbl':
						$inner .= $this->renderTable($tc->readOuterXml());
						$skipSubtree = true;
						break;
				}
			}
			$more = $skipSubtree ? $tc->next() : $tc->read();
		}
		$tc->close();

		return ($style !== '' ? '<td style="'.$style.'">' : '<td>').$inner.'</td>';
	}

	/**
	 * Renders one paragraph.
	 *
	 * A paragraph styled Heading1..Heading6 becomes h1..h6; a recognised list
	 * item becomes a one-item list; everything else becomes a p element.
	 * The heading level is determined per paragraph, so it never carries over
	 * to the paragraphs that follow a heading.
	 *
	 * @param string $p The XML of a w:p element
	 * @return String The HTML code of the paragraph
	 */
	private function renderParagraph($p)
	{
		$p = trim($p);
		if ($p === '') {
			return '';
		}

		$header = preg_match('/<w:pStyle w:val="Heading([1-6])"/', $p, $matches) === 1 ? (int) $matches[1] : 0;

		$paragraph = new XMLReader();
		$paragraph->XML($p);
		$list_format = null;
		$inner = '';

		$more = $paragraph->read();
		while ($more) {
			$skipSubtree = false;
			if ($paragraph->nodeType === XMLReader::ELEMENT) {
				switch ($paragraph->name) {
					case 'w:pPr': //lists
						$list_format = $this->getListFormating($paragraph);
						$skipSubtree = true;
						break;
					case 'w:r':
						$inner .= $this->checkFormating($paragraph);
						$skipSubtree = true;
						break;
					case 'w:drawing': //images
						$src = $this->checkImageFormating($paragraph);
						if ($src !== null) {
							$inner .= '<img src="'.$this->escapeHtml($src).'" alt="" />';
						}
						$skipSubtree = true;
						break;
					case 'w:hyperlink':
						$hyperlink = $this->getHyperlink($paragraph);
						$inner .= $hyperlink['open'];
						$inner .= $this->checkFormating($paragraph);
						$inner .= $hyperlink['close'];
						$skipSubtree = true;
						break;
				}
			}
			// Handled elements are skipped as a whole; using next() OR read()
			// (never both) keeps directly adjacent runs from being missed.
			$more = $skipSubtree ? $paragraph->next() : $paragraph->read();
		}
		$paragraph->close();

		if ($header > 0) {
			return '<h'.$header.'>'.$inner.'</h'.$header.'>';
		}
		if ($list_format !== null) {
			return $list_format['open'].$inner.$list_format['close'];
		}
		return '<p>'.$inner.'</p>';
	}

	/**
	 * READS THE GIVEN DOCX FILE INTO HTML FORMAT
	 *
	 * @param String $filename The DOCX file name
	 * @return String With HTML code, in the encoding given to the constructor
	 * @throws RuntimeException When the file is not a readable docx archive
	 */
	public function readDocument($filename)
	{
		$this->file = $filename;
		$this->readZipPart($filename);

		$reader = new XMLReader();
		$reader->XML($this->doc_xml->saveXML());

		$text = '';

		// loop through docx xml dom
		$more = $reader->read();
		while ($more) {
			$skipSubtree = false;
			if ($reader->nodeType === XMLReader::ELEMENT) {
				if ($reader->name === 'w:p') {
					$text .= $this->renderParagraph($reader->readOuterXml());
					$skipSubtree = true;
				} elseif ($reader->name === 'w:tbl') { //tables
					$text .= $this->renderTable($reader->readOuterXml());
					$skipSubtree = true;
				}
			}
			$more = $skipSubtree ? $reader->next() : $reader->read();
		}
		$reader->close();

		// The document is processed as UTF-8; convert only when another
		// output encoding was requested.
		if (strcasecmp($this->encoding, 'UTF-8') !== 0) {
			$text = mb_convert_encoding($text, $this->encoding, 'UTF-8');
		}

		if ($this->debug) {
			echo "<div style='width:100%; height: 200px;'>";
			echo $text;
			echo "</div>";
		}
		return $text;
	}
}

调用

​
<?php
// Usage example: convert ces.docx to HTML (ces.html) and then to plain text (word.txt).
require_once 'class.wordphp.php';
// Silences PHP notices/warnings so they cannot end up in the echoed output;
// remove this line while debugging.
error_reporting(0);

$rt = new WordPHP();
$text = $rt->readDocument('ces.docx');

if (file_put_contents('ces.html', $text) === false) {
    exit('Unable to write ces.html');
}

// Work on the HTML already in memory; re-reading ces.html would only repeat I/O.
$line = $text;
$line = preg_replace("@<script(.*?)</script>@is", "", $line);
$line = preg_replace("@<iframe(.*?)</iframe>@is", "", $line);
$line = preg_replace("@<style(.*?)</style>@is", "", $line);
$line = preg_replace("@<(.*?)>@is", "", $line);
// Decode entities exactly once. A second decoding pass would turn text that
// literally reads "&lt;" in the document into "<".
$line = html_entity_decode($line, ENT_QUOTES, 'UTF-8');
$line = str_replace(["\n", "\t", "\r"], "", $line); // drop line breaks and tabs
$line = preg_replace('/\s(?=\s)/', '', $line);      // collapse runs of whitespace to one character

echo $line;

if (file_put_contents('word.txt', $line) === false) {
    exit('Unable to write word.txt');
}


​

方法二

<?php
// 引入PHPWord库
require_once 'vendor/autoload.php';

/**
 * Extracts the plain text of a Word document with PHPWord.
 *
 * Text is collected from the Text elements inside each TextRun of every
 * section, and from Text elements placed directly in a section. Other
 * element types (tables, lists, ...) are not visited.
 *
 * The text is returned as PHPWord delivers it. The previous
 * UTF-8 -> GBK -> UTF-8 round trip has been removed: it replaced every
 * character that GBK cannot represent with "?".
 *
 * @param string $source Path of the document to read
 * @return string The concatenated text (despite the function name, no HTML markup is produced)
 */
function readWordToHtml($source)
{
    $phpWord = \PhpOffice\PhpWord\IOFactory::load($source);
    $text = '';

    foreach ($phpWord->getSections() as $section) {
        foreach ($section->getElements() as $element) {
            if ($element instanceof \PhpOffice\PhpWord\Element\TextRun) {
                foreach ($element->getElements() as $child) {
                    if ($child instanceof \PhpOffice\PhpWord\Element\Text) {
                        $text .= (string) $child->getText();
                    }
                }
            } elseif ($element instanceof \PhpOffice\PhpWord\Element\Text) {
                $text .= (string) $element->getText();
            }
        }
    }

    return $text;
}

    // Option 1: read directly from a file on disk.
    $file = 'ces.docx';
    $text = readWordToHtml($file);

    // Option 2: the document is only available as binary data (a stream),
    // so it is written to a temporary file first.
    $word_binary_data = file_get_contents($file);   // the binary document data
    if ($word_binary_data === false) {
        throw new RuntimeException("Unable to read '{$file}'");
    }

    $temp_word_file = tempnam(sys_get_temp_dir(), 'word');     // temporary file
    if ($temp_word_file === false) {
        throw new RuntimeException('Unable to create a temporary file');
    }

    try {
        if (file_put_contents($temp_word_file, $word_binary_data) === false) {
            throw new RuntimeException("Unable to write '{$temp_word_file}'");
        }
        $text = readWordToHtml($temp_word_file);
    } finally {
        // Remove the temporary file even when loading the document fails.
        unlink($temp_word_file);
    }

    echo $text;

PHPWord Beta 0.6.2 开发者指南 目录 首先我们要了解文档最基本的信息和设置: 4 计量单位:缇(twips) 4 字体设置 4 文档属性设置 4 新建文档 5 添加页面 5 页面样式 5 页面样式属性 6 文本 7 添加文本 7 添加文本资源 7 文本样式 8 样式属性列表 9 添加换行符 10 添加分页符 10 列表 10 添加列表 10 列表样式 11 列表样式属性列表 11 超链接 11 添加超链接 11 超链接样式 12 图片 13 添加图片 13 图片样式 13 图片样式属性 13 添加GD生成图片 14 添加水印 14 添加对象 15 添加标题 15 添加目录 16 表格 17 添加表格 17 添加行 17 添加单元格 17 单元格样式 19 表格样式 20 页脚 22 页眉 23 模版 23 其他问题修改 25 解决文本缩进问题 25 表格对齐和表格缩进 27 图片缩进和绝对相对悬浮定位 30 首先我们要了解文档最基本的信息和设置: 因为是国外编辑的类库,存在对中文支持的问题,使用前,我们需要进行一些修正: 1、解决编码问题,PHPWord 会对输入的文字进行 utf8_encode 编码转化,如果你使用GBK、GB2312或者UTF-8编码的话就会出现乱码。如果你用UTF-8编码,就查找类库中所有方法中的 utf8_encode 转码并将其删除;如果你采用GBK或者GB2312编码,则使用 iconv 进行编码转换。 2、解决中文字体支持,在 writer/word2007/base.php 中第312行添加 $objWriter->writeAttribute('w:eastAsia',$font); 3、启用 PHP 的 zip 支持,Windows 环境下在 PHP 配置文件 php.ini 中,将 extension=php_zip.dll 前面的分号“;”去除(如果没有,请添加 extension=php_zip.dll 此行并确保 php_zip.dll 文件存在于相应的目录),然后同样在 php.ini 文件中,将 zlib.output_compression = Off 改为 zlib.output_compression = On。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值