php html word_php实现将HTML页面转换成word并且保存的方法

本文实例讲述了php实现将HTML页面转换成word并且保存的方法。分享给大家供大家参考,具体如下:

这里使用到一个PHP的工具,叫:PHPWord。

生成Word的原理是,将一堆规定好了的xml压缩成一个zip包,并且把后缀名改成doc或者docx即可。

所以使用PHPWord,需要你的PHP环境安装zip.dll压缩扩展,我写了一个demo.

功能说明:

20150507 — HTML中的 &lt;p&gt; 标签和 &lt;ol&gt; 列表标签的获取

20150508 — 新增获取文章中的图片功能

20150509 — 新增行间距,并且过滤一下错误图片

20150514 — 新增表格处理,并且将代码改成面向对象

20150519 — 新增GD库处理网络图片

require_once 'PHPWord.php';

require_once 'SimpleHtmlDom.class.php';

/**
 * Fetches an HTML page, extracts paragraphs, list items, images and tables
 * from it, and writes the result to a .docx file through PHPWord.
 *
 * All work happens in the constructor: the page is downloaded and parsed,
 * images are downloaded into a per-host directory under the current working
 * directory, and finally the Word document is built and streamed to the client.
 */
class Word{

    /** Source page URL given to the constructor. */
    private $url;

    /**
     * Parsed content. Integer keys hold element records (arrays with a "tag"
     * key) in encounter order; the string key "table" holds the table rows.
     */
    private $LinetextArr = array();

    /** Working directory at construction time; image directories are created below it. */
    public $CurrentDir;

    /** "scheme://host" of the source page, used to resolve relative image URLs. */
    public $host = "";

    /** Directory the most recently processed image is stored in. */
    public $ImgDir = "";

    /** Collected PHP errors (entries as returned by error_get_last()). */
    public $error = array();

    /** Output file name; derived from the page host when left empty. */
    public $filename = null;

    /** Selector list of the top-level tags that are extracted from the page. */
    public $Allowtag = "p,ol,ul,table";

    /** Statistics **/

    /** Number of images downloaded successfully. */
    public $DownImg = 0;

    /** Seconds spent fetching and parsing (rounded to 2 decimals). */
    public $expendTime = 0;

    /** Number of HTTP requests attempted. */
    public $HttpRequestTime = 0;

    public $ContentLen = 0;

    /** URLs of all HTTP requests attempted. */
    public $HttpRequestArr = array();

    /** Memory growth during parsing, in units of 1000 bytes. */
    public $expendmemory = 0;

    /**
     * Downloads and parses $url, then builds and sends the Word document.
     *
     * @param string $url Absolute URL of the HTML page to convert.
     */
    public function __construct($url)
    {
        $startTime = $this->_Time();
        $startMemory = $this->_memory();

        $this->url = $url;
        $UrlArr = parse_url($this->url);
        // parse_url() returns false on malformed input and omits missing parts.
        if(isset($UrlArr["scheme"]) && isset($UrlArr["host"]))
        {
            $this->host = $UrlArr["scheme"]."://".$UrlArr["host"];
        }

        $this->CurrentDir = getcwd();
        $this->LinetextArr["table"] = array();

        $html = new simple_html_dom($this->url);
        $this->HttpRequestArr[] = $this->url;
        $this->HttpRequestTime++;

        foreach($html->find($this->Allowtag) as $value)
        {
            if($value->tag == "table")
            {
                $this->ParseTable($value,0,$this->LinetextArr["table"]);
            }
            else
            {
                $this->AnalysisHtmlDom($value);
            }
            $this->_recordLastError();
        }

        $endTime = $this->_Time();
        $endMemory = $this->_memory();

        $this->expendTime = round(($endTime-$startTime),2); // seconds
        $this->expendmemory = round(($endMemory-$startMemory)/1000,2); // thousands of bytes

        $this->CreateWordDom();
    }

    /**
     * Current time in seconds as a float ("usec sec" from microtime() summed).
     */
    private function _Time()
    {
        return array_sum(explode(" ", microtime()));
    }

    /**
     * Current memory usage in bytes.
     */
    private function _memory()
    {
        return memory_get_usage();
    }

    /**
     * Appends the last PHP error to $this->error, skipping "no error" and
     * an error that is identical to the one recorded last.
     */
    private function _recordLastError()
    {
        $last = error_get_last();
        if($last === null)
        {
            return;
        }
        $count = count($this->error);
        if($count > 0 && $this->error[$count-1] === $last)
        {
            return;
        }
        $this->error[] = $last;
    }

    /**
     * Parses a table node into rows of cells, descending through
     * table/tbody/thead/tfoot/tr wrappers.
     *
     * @param object $value DOM node being visited.
     * @param int    $i     Nesting depth of $value; rows are appended in
     *                      encounter order, so this is informational only.
     * @param array  $Arr   Row list to append to. Passed by reference: the
     *                      caller's array is the one that must receive the rows.
     */
    private function ParseTable($value,$i,&$Arr)
    {
        $first = $value->firstChild();

        if($first && in_array($first->tag,array("table","tbody","thead","tfoot","tr"),true))
        {
            foreach($value->children as $v)
            {
                $this->ParseTable($v,$i+1,$Arr);
            }
            return;
        }

        $row = array();
        foreach($value->children as $v)
        {
            $child = $v->firstChild();
            // A cell that wraps a nested table is skipped here.
            if($child && $child->tag == "table")
            {
                continue;
            }
            $row[] = array("tag"=>$v->tag,"text"=>trim($v->plaintext));
        }

        if(!empty($row))
        {
            $Arr[] = $row;
        }
    }

    /**
     * Walks a DOM node down to its leaf elements and records each leaf
     * (link, image or plain text) in $this->LinetextArr.
     *
     * @param object $value DOM node being visited.
     */
    private function AnalysisHtmlDom($value)
    {
        if($value->has_child())
        {
            foreach($value->children as $v)
            {
                $this->AnalysisHtmlDom($v);
            }
            return;
        }

        $tmp = array();

        if($value->tag == "a")
        {
            $tmp = array("tag"=>$value->tag,"href"=>$value->href,"text"=>$value->innertext);
        }
        else if($value->tag == "img")
        {
            $src = $this->unescape($value->src);
            $UrlArr = parse_url($src);

            if(!isset($UrlArr['host']))
            {
                // Relative URL: resolve against the page host, inserting the
                // separator when the path does not start with one.
                $separator = (substr($src,0,1) == "/") ? "" : "/";
                $src = $this->host.$separator.$src;
                $UrlArr = parse_url($src);
            }

            $src = $this->getImageFromNet($src,$UrlArr); // remote image, has to be downloaded

            if($src)
            {
                $imgsArr = $this->GD($src);
                // Files that cannot be read as an image are dropped.
                if($imgsArr)
                {
                    $tmp = array("tag"=>$value->tag,"src"=>$src,"text"=>$value->alt,"width"=>$imgsArr['width'],"height"=>$imgsArr['height']);
                }
            }
        }
        else
        {
            $tmp = array("tag"=>$value->tag,"text"=>strip_tags($value->innertext));
        }

        // Do not store empty placeholders for images that failed.
        if(!empty($tmp))
        {
            $this->LinetextArr[] = $tmp;
        }
    }

    /**
     * Reads an image's dimensions and halves them until both are at most 800.
     *
     * @param string $src Path of the image file.
     * @return array|false array("width"=>..., "height"=>...), or false when
     *                     the file cannot be read as an image.
     */
    private function GD($src)
    {
        $info = getimagesize($src);
        if($info === false)
        {
            $this->_recordLastError();
            return false;
        }

        $width = $info[0];
        $height = $info[1];

        while($width > 800 || $height > 800)
        {
            $width = $width/2;
            $height = $height/2;
        }

        return array("width"=>$width,"height"=>$height);
    }

    /**
     * Converts escaped Unicode sequences back to UTF-8 characters.
     *
     * Handles %uXXXX, &#xXXXX; (hexadecimal entity) and &#NNNN; (decimal
     * entity) after percent-decoding the string.
     *
     * @param string $str Escaped string.
     * @return string Decoded UTF-8 string.
     */
    public function unescape($str) {
        $str = rawurldecode($str);

        // Ungreedy: every escape sequence becomes one match, everything else
        // is matched one character at a time.
        preg_match_all("/(?:%u.{4})|&#x.{4};|&#\d+;|.+/U",$str,$r);
        $ar = $r[0];

        foreach($ar as $k=>$v) {
            if(substr($v,0,2) == "%u"){
                $ar[$k] = iconv("UCS-2BE","UTF-8",pack("H4",substr($v,-4)));
            }
            elseif(substr($v,0,3) == "&#x"){
                $ar[$k] = iconv("UCS-2BE","UTF-8",pack("H4",substr($v,3,-1)));
            }
            elseif(substr($v,0,2) == "&#"){
                $ar[$k] = iconv("UCS-2BE","UTF-8",pack("n",substr($v,2,-1)));
            }
        }

        return join("",$ar);
    }

    /**
     * Downloads an image into <CurrentDir>/<host>/.
     *
     * @param string      $Src    Absolute URL of the image.
     * @param array|false $UrlArr parse_url() result for $Src.
     * @return string|false Path "<host>/<md5 of URL path>.<ext>" relative to
     *                      the working directory, or false when the URL is
     *                      unusable, the extension is unsupported, or the
     *                      download or write fails.
     */
    private function getImageFromNet($Src,$UrlArr)
    {
        if(!is_array($UrlArr) || empty($UrlArr['host']) || empty($UrlArr['path']))
        {
            return false;
        }

        $this->ImgDir = $this->CurrentDir."/".$UrlArr['host'];

        // pathinfo() takes the last extension, so "a.b.JPG" is accepted too.
        $ext = strtolower(pathinfo($UrlArr['path'],PATHINFO_EXTENSION));
        $_supportedImageTypes = array('jpg', 'jpeg', 'gif', 'png', 'bmp', 'tif', 'tiff');

        if(!in_array($ext,$_supportedImageTypes,true))
        {
            return false;
        }

        $this->HttpRequestArr[] = $Src;
        $this->HttpRequestTime++;

        $file = file_get_contents($Src);
        if($file === false)
        {
            $this->_recordLastError();
            return false;
        }

        if(!$this->_mkdir()) // create the directory, or collect the error
        {
            return false;
        }

        $imgName = md5($UrlArr['path']).".".$ext;

        if(file_put_contents($this->ImgDir."/".$imgName,$file) === false)
        {
            $this->_recordLastError();
            return false;
        }

        $this->DownImg++;

        return $UrlArr['host']."/".$imgName;
    }

    /**
     * Creates $this->ImgDir when it does not exist yet.
     *
     * @return bool True when the directory exists afterwards.
     */
    private function _mkdir()
    {
        if(is_dir($this->ImgDir))
        {
            return true;
        }

        // The mode must be an octal integer; a string such as "7777" is read
        // as a decimal number and produces unrelated permission bits.
        if(!mkdir($this->ImgDir,0777))
        {
            $this->_recordLastError();
            return false;
        }

        return true;
    }

    /**
     * Builds the Word document from the parsed content and sends it.
     */
    private function CreateWordDom()
    {
        $PHPWord = new PHPWord();
        $PHPWord->setDefaultFontName('宋体');
        $PHPWord->setDefaultFontSize(11);

        $styleTable = array('borderSize'=>6, 'borderColor'=>'006699', 'cellMargin'=>120);

        // New portrait section
        $section = $PHPWord->createSection();
        $section->addText($this->Details(),array(),array('spacing'=>120));

        // Emit the collected content
        foreach($this->LinetextArr as $key=>$lineArr)
        {
            // Strict comparison: with "==" the integer key 0 equals "table"
            // on PHP versions before 8.
            if($key === "table")
            {
                if(empty($lineArr))
                {
                    continue;
                }

                $PHPWord->addTableStyle('myOwnTableStyle', $styleTable);
                $table = $section->addTable("myOwnTableStyle");

                foreach($lineArr as $tr)
                {
                    $table->addRow();
                    foreach($tr as $td)
                    {
                        $table->addCell(2000)->addText($td['text']);
                    }
                }
                continue;
            }

            if(!isset($lineArr['tag']))
            {
                continue;
            }

            if($lineArr['tag'] == "li")
            {
                $section->addListItem($lineArr['text'],0,"","",array('spacing'=>120));
            }
            else if($lineArr['tag'] == "img")
            {
                $section->addImage($lineArr['src'],array('width'=>$lineArr['width'], 'height'=>$lineArr['height'], 'align'=>'center'));
            }
            else if($lineArr['tag'] == "p")
            {
                $section->addText($lineArr['text'],array(),array('spacing'=>120));
            }
        }

        $this->downFile($PHPWord);
    }

    /**
     * Returns the one-line statistics summary placed at the top of the document.
     *
     * @return string
     */
    public function Details()
    {
        $msg = "一共请求:{$this->HttpRequestTime}次,共下载的图片有{$this->DownImg}张,并且下载完成大约使用时间:{$this->expendTime}秒,整个程序执行大约消耗内存是:{$this->expendmemory}KB,";

        return $msg;
    }

    /**
     * Saves the document to $this->filename and streams it to the client as
     * a download.
     *
     * @param PHPWord $PHPWord Document to write.
     */
    public function downFile($PHPWord)
    {
        if(empty($this->filename))
        {
            $UrlArr = parse_url($this->url);
            $host = isset($UrlArr['host']) ? $UrlArr['host'] : "document";
            $this->filename = $host.".docx";
        }

        // Save File
        $objWriter = PHPWord_IOFactory::createWriter($PHPWord, 'Word2007');
        $objWriter->save($this->filename);

        header("Pragma: public");
        header("Expires: 0");
        // header() replaces a previous header of the same name by default, so
        // the second Cache-Control line is the one that is actually sent.
        header("Cache-Control: must-revalidate, post-check=0, pre-check=0");
        header("Cache-Control: public");
        header("Content-Description: File Transfer");

        // The writer produces Word2007 (docx), so announce the docx MIME type.
        header('Content-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document');

        // Force the download; the file name is quoted and stripped of any path.
        header('Content-Disposition: attachment; filename="'.basename($this->filename).'"');

        if(is_readable($this->filename))
        {
            header("Content-Length: ".filesize($this->filename));
            readfile($this->filename);
        }
        else
        {
            $this->_recordLastError();
        }
    }

}

上面的代码重点感觉不是word生成,而是Simplehtmldom的使用,这是一个开源的HTML解析器,之前有提到,这几天在看他的代码,

引出了两个学习方向

① 正则表达式

② 这个扩展的函数整理

看源代码的收获:

PHP的异常是可以捕获的,而且PHP的错误也是可以捕获的。

error_get_last() //用这个函数可以捕获页面中的PHP错误,不谢。

希望本文所述对大家PHP程序设计有所帮助。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值