php写入文本检查重复,利用原生php方法，把docx文件内容项重复的查出来

最新推荐文章于 2022-04-16 15:28:46 发布

Rabenda

最新推荐文章于 2022-04-16 15:28:46 发布

阅读量251

点赞数

文章标签： php写入文本检查重复

原本想用phpword来读我的word文件的，奈何死活都不成功，也不知道是为什么，因此在互联网的大洋中寻找解决方案，终于找到了一个可行的方案。

思路解读

大概看了一下这个类的代码基本是利用原生的zip包来解决的，但是至于原理的话我也不是太懂，所以就抄了实现了需求先。

目前只支持docx，如果要读doc文件，我试了直接强转成docx，问题不大。

主要用这个功能来解决的问题就是：例如有1000个人每人发了一份报名表给你，那你怎样快速地辨别这些报名表中是否有重复的？如果是一份份登记出来的话，那得搞到猴年马月，加班加到你怀疑人生，而且一直在做重复的操作。

/**
 * Created: jiangshiwen
 * Date: 2021/1/6
 * Time: 15:03
 * Theme:
 *
 * Reads a DOCX file (a zip archive) with ZipArchive, DOMDocument and XMLReader
 * and converts word/document.xml into a simple HTML string.
 *
 * NOTE(review): the copy of this class that was reviewed had lost its HTML
 * string literals, docblock terminators and several braces. The literals
 * below were reconstructed from context; each reconstructed spot is marked.
 */
class WordPHP
{
    /** @var bool When true, the intermediate XML and the final HTML are echoed. */
    private $debug = false;

    /** @var string Path of the DOCX file currently being read. */
    private $file;

    /** @var DOMDocument Parsed word/_rels/document.xml.rels. */
    private $rels_xml;

    /** @var DOMDocument Parsed word/document.xml. */
    private $doc_xml;

    /** @var array Image entries found in the archive, keyed and valued by entry name. */
    private $doc_media = [];

    private $last = 'none';

    // private $encoding = 'ISO-8859-1';
    /** @var string Target encoding passed to mb_convert_encoding() for the result. */
    private $encoding = 'UTF-8';

    /** @var string Base directory; images are written to "<tmpDir>/tmp/". */
    private $tmpDir = 'tmp';

    /**
     * CONSTRUCTOR
     *
     * @param bool|null   $debug_   Debug mode or not (null keeps the default: off)
     * @param string|null $encoding Output encoding (null/empty keeps the default: UTF-8)
     */
    public function __construct($debug_ = null, $encoding = null)
    {
        if ($debug_ !== null) {
            $this->debug = (bool) $debug_;
        }
        // An empty encoding name would make mb_convert_encoding() fail later.
        if (is_string($encoding) && $encoding !== '') {
            $this->encoding = $encoding;
        }
        $this->tmpDir = dirname(__FILE__);
    }

    /**
     * Sets the tmp directory where images will be stored
     *
     * Images end up in "<tmp>/tmp/". Public so callers can actually use it;
     * as a private method nothing in the class ever called it.
     *
     * @param string $tmp The location
     * @return void
     */
    public function setTmpDir($tmp)
    {
        $this->tmpDir = $tmp;
    }

    /**
     * Parses an XML string into a DOMDocument stored in the given variable.
     *
     * @param DOMDocument|null $object   The class variable to set as DOMDocument
     * @param string           $xml      The XML source
     * @param string|false     $encoding The encoding to be used (ignored when not a non-empty string)
     * @return void
     * @throws RuntimeException When the XML is missing or cannot be parsed
     */
    private function setXmlParts(&$object, $xml, $encoding)
    {
        $object = new DOMDocument();
        // mb_detect_encoding() may return false; only forward a usable name.
        if (is_string($encoding) && $encoding !== '') {
            $object->encoding = $encoding;
        }
        $object->preserveWhiteSpace = false;
        $object->formatOutput = true;

        if (!is_string($xml) || $xml === '' || !$object->loadXML($xml)) {
            throw new RuntimeException('invalid or missing XML part');
        }
    }

    /**
     * READS The Document and Relationships into separated XML files
     *
     * Also records the names of image entries in $this->doc_media.
     *
     * @param string $filename The filename
     * @return void
     * @throws RuntimeException When the file is not a zip archive or has no word/document.xml
     */
    private function readZipPart($filename)
    {
        $zip = new ZipArchive();
        $_xml = 'word/document.xml';
        $_xml_rels = 'word/_rels/document.xml.rels';

        if (true !== $zip->open($filename)) {
            throw new RuntimeException('non zip file');
        }

        $xml = false;
        $xml_rels = false;

        if (($index = $zip->locateName($_xml)) !== false) {
            $xml = $zip->getFromIndex($index);
        }
        // Get the relationships
        if (($index = $zip->locateName($_xml_rels)) !== false) {
            $xml_rels = $zip->getFromIndex($index);
        }
        // load all images if they exist
        for ($i = 0; $i < $zip->numFiles; $i++) {
            $zip_element = $zip->statIndex($i);
            if ($zip_element !== false
                && preg_match("([^\s]+(\.(?i)(jpg|jpeg|png|gif|bmp))$)", $zip_element['name'])
            ) {
                $this->doc_media[$zip_element['name']] = $zip_element['name'];
            }
        }
        $zip->close();

        if (!is_string($xml) || $xml === '') {
            throw new RuntimeException('word/document.xml not found in ' . $filename);
        }
        // Without a relationships part there are simply no images or links to resolve.
        if (!is_string($xml_rels) || $xml_rels === '') {
            $xml_rels = '<?xml version="1.0" encoding="UTF-8"?><Relationships/>';
        }

        $enc = mb_detect_encoding($xml);
        $this->setXmlParts($this->doc_xml, $xml, $enc);
        $this->setXmlParts($this->rels_xml, $xml_rels, $enc);

        if ($this->debug) {
            // NOTE(review): the wrapper markup was stripped from the reviewed copy;
            // textarea wrappers are a reconstruction - confirm.
            echo "<textarea style='width:100%; height: 200px;'>";
            echo $this->doc_xml->saveXML();
            echo "</textarea><textarea style='width:100%; height: 200px;'>";
            echo $this->rels_xml->saveXML();
            echo "</textarea>";
        }
    }

    /**
     * Tells whether an on/off run property (w:b, w:i, ...) is switched on.
     *
     * @param XMLReader $reader Reader positioned on the property element
     * @return bool False only when w:val explicitly disables the property
     */
    private function isToggleOn($reader)
    {
        $val = $reader->getAttribute('w:val');
        return $val === null || !in_array(strtolower($val), ['0', 'false', 'off', 'none'], true);
    }

    /**
     * CHECKS THE FONT FORMATTING OF A GIVEN ELEMENT
     * Currently checks and formats: bold, italic, underline, text color,
     * background color and font family
     *
     * NOTE(review): the style-collecting loop was missing from the reviewed copy
     * (only its tail survived); it was rebuilt from this docblock's feature list.
     *
     * @param XMLReader $xml Reader positioned on the element to render
     * @return string HTML formatted code (a <span> with inline style)
     */
    private function checkFormating(&$xml)
    {
        $node = trim($xml->readOuterXML());
        if ($node === '') {
            return '';
        }

        $t = '';
        // add <br> tags
        if (preg_match('/<w:br[\s\/>]/', $node)) {
            $t = '<br>';
        }

        // look for formatting tags
        $styles = [];
        $img = null;
        $reader = new XMLReader();
        $reader->XML($node);

        while ($reader->read()) {
            if ($reader->nodeType != XMLReader::ELEMENT) {
                continue;
            }
            switch ($reader->name) {
                case 'w:b':
                    if ($this->isToggleOn($reader)) {
                        $styles[] = 'font-weight: bold';
                    }
                    break;
                case 'w:i':
                    if ($this->isToggleOn($reader)) {
                        $styles[] = 'font-style: italic';
                    }
                    break;
                case 'w:u':
                    if ($this->isToggleOn($reader)) {
                        $styles[] = 'text-decoration: underline';
                    }
                    break;
                case 'w:color':
                    $color = (string) $reader->getAttribute('w:val');
                    // Only hex colors are emitted; anything else would yield invalid CSS.
                    if (preg_match('/^[0-9A-Fa-f]{6}$/', $color)) {
                        $styles[] = 'color: #' . $color;
                    }
                    break;
                case 'w:rFonts':
                    // Strip everything that could break out of the single-quoted style attribute.
                    $font = (string) preg_replace('/[^\p{L}\p{N} _-]/u', '', (string) $reader->getAttribute('w:ascii'));
                    if ($font !== '') {
                        $styles[] = 'font-family: ' . $font;
                    }
                    break;
                case 'w:shd':
                    $fill = (string) $reader->getAttribute('w:fill');
                    if (preg_match('/^[0-9A-Fa-f]{6}$/', $fill) && $fill !== '000000') {
                        $styles[] = 'background-color: #' . $fill;
                    }
                    break;
                case 'w:drawing':
                    if (trim($reader->readInnerXml()) !== '') {
                        $r = $this->checkImageFormating($reader);
                        $img = $r !== null
                            ? "<img src='" . htmlspecialchars($r, ENT_QUOTES, 'UTF-8') . "' />"
                            : null;
                    }
                    break;
            }
        }
        $reader->close();

        $f = "<span style='" . implode('; ', $styles) . "'>";

        if ($img !== null) {
            $t .= $img;
        } else {
            $expanded = $xml->expand();
            $plain = $expanded ? $expanded->textContent : '';
            $t .= htmlentities($plain, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        }

        return $f . $t . "</span>";
    }

    /**
     * CHECKS THE ELEMENT FOR UL ELEMENTS
     * Currently under development
     *
     * NOTE(review): the open/close markup was stripped from the reviewed copy;
     * numId 1 => ordered list and numId 2 => unordered list is a reconstruction - confirm.
     *
     * @param XMLReader $xml Reader positioned on a w:pPr element
     * @return array Empty when the paragraph is not a recognised list item,
     *               otherwise ['open' => string, 'close' => string]
     */
    private function getListFormating(&$xml)
    {
        $node = trim($xml->readOuterXML());
        $ret = [];
        if ($node === '') {
            return $ret;
        }

        $reader = new XMLReader();
        $reader->XML($node);

        while ($reader->read()) {
            if ($reader->name == "w:numId" && $reader->hasAttributes) {
                $numId = (string) $reader->getAttribute("w:val");
                if ($numId === '1') {
                    $ret['open'] = "<ol><li>";
                    $ret['close'] = "</li></ol>";
                } elseif ($numId === '2') {
                    $ret['open'] = "<ul><li>";
                    $ret['close'] = "</li></ul>";
                }
            }
        }
        $reader->close();

        return $ret;
    }

    /**
     * CHECKS IF THERE IS AN IMAGE PRESENT
     * Currently under development
     *
     * Looks up the a:blip relationship id inside the drawing, resolves it through
     * the relationships document and extracts the image to disk.
     *
     * @param XMLReader $xml Reader positioned on a w:drawing element
     * @return string|null The location of the extracted image, or null when none could be produced
     */
    private function checkImageFormating(&$xml)
    {
        $content = trim($xml->readInnerXml());
        if ($content === '') {
            return null;
        }

        $relId = null;
        $reader = new XMLReader();
        $reader->XML($content);
        while ($reader->read()) {
            if ($reader->nodeType == XMLReader::ELEMENT && $reader->name == "a:blip") {
                $relId = $reader->getAttribute("r:embed");
                break;
            }
        }
        $reader->close();

        if (!$relId) {
            return null;
        }

        // image id found, get the image location
        $link = null;
        $rels = new XMLReader();
        $rels->XML($this->rels_xml->saveXML());
        while ($rels->read()) {
            if ($rels->nodeType == XMLReader::ELEMENT
                && $rels->name == 'Relationship'
                && $rels->getAttribute("Id") == $relId
            ) {
                $link = "word/" . $rels->getAttribute('Target');
                break;
            }
        }
        $rels->close();

        // The relationship may be absent; without a target there is nothing to extract.
        if ($link === null) {
            return null;
        }

        $zip = new ZipArchive();
        if (true !== $zip->open($this->file)) {
            return null;
        }
        $data = $zip->getFromName($link);
        $zip->close();

        return $this->createImage($data, $relId, $link);
    }

    /**
     * Creates an image in the filesystem
     *
     * @param string|false $image The raw image bytes
     * @param string       $relId The image relationship Id (used as file name)
     * @param string       $name  The image name inside the archive (used for the extension)
     * @return string|null Path of the written file, or null when the format is
     *                     unsupported or the image could not be decoded/written
     */
    private function createImage($image, $relId, $name)
    {
        if (!is_string($image) || $image === '') {
            return null;
        }

        $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION));
        if (!in_array($ext, ['png', 'bmp', 'gif', 'jpeg', 'jpg'], true)) {
            return null;
        }

        $im = imagecreatefromstring($image);
        if ($im === false) {
            return null;
        }

        $dir = $this->tmpDir . '/tmp';
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            return null;
        }

        // The id comes from the document, so keep it from escaping the target directory.
        $safeId = preg_replace('/[^A-Za-z0-9_.-]/', '_', (string) $relId);
        $fname = $dir . '/' . $safeId . '.' . $ext;

        switch ($ext) {
            case 'png':
                $ok = imagepng($im, $fname);
                break;
            case 'bmp':
                $ok = imagebmp($im, $fname);
                break;
            case 'gif':
                $ok = imagegif($im, $fname);
                break;
            default: // jpeg, jpg
                $ok = imagejpeg($im, $fname);
                break;
        }

        return $ok ? $fname : null;
    }

    /**
     * CHECKS IF ELEMENT IS AN HYPERLINK
     *
     * NOTE(review): the default open/close markup was stripped from the reviewed
     * copy; '<ul>'/'</ul>' is a reconstruction - confirm.
     *
     * @param XMLReader $xml Reader positioned on a w:hyperlink element
     * @return array With HTML open and closing tag definition ('open', 'close')
     */
    private function getHyperlink(&$xml)
    {
        $ret = ['open' => '<ul>', 'close' => '</ul>'];
        $link = '';

        // getAttribute() leaves the cursor on the element, so the caller can
        // still read the element's outer XML afterwards.
        $attribute = (string) $xml->getAttribute('r:id');

        if ($attribute !== '') {
            $reader = new XMLReader();
            $reader->XML($this->rels_xml->saveXML());
            while ($reader->read()) {
                if ($reader->nodeType == XMLReader::ELEMENT
                    && $reader->name == 'Relationship'
                    && $reader->getAttribute("Id") == $attribute
                ) {
                    $link = (string) $reader->getAttribute('Target');
                    break;
                }
            }
            $reader->close();
        }

        if ($link != "") {
            $ret['open'] = "<a href='" . htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . "' target='_blank'>";
            $ret['close'] = "</a>";
        }

        return $ret;
    }

    /**
     * PROCESS TABLE CONTENT
     *
     * @param XMLReader $xml Reader loaded with the table's XML
     * @return string The HTML code of the table
     */
    private function checkTableFormating(&$xml)
    {
        $table = "<table><tbody>";

        while ($xml->read()) {
            if ($xml->nodeType == XMLReader::ELEMENT && $xml->name === 'w:tr') { // table row
                $tc = $ts = "";
                $tr = new XMLReader();
                $tr->XML(trim($xml->readOuterXML()));

                while ($tr->read()) {
                    if ($tr->nodeType != XMLReader::ELEMENT) {
                        continue;
                    }
                    if ($tr->name === 'w:tcPr') { // table element properties
                        $ts = $this->processTableStyle(trim($tr->readOuterXML()));
                    } elseif ($tr->name === 'w:tc') { // table column
                        $tc .= $this->processTableRow(trim($tr->readOuterXML()));
                    }
                }
                $tr->close();

                $table .= '<tr style="' . $ts . '">' . $tc . '</tr>';
            }
        }

        $table .= "</tbody></table>";

        return $table;
    }

    /**
     * PROCESS THE TABLE ROW STYLE
     *
     * Builds an inline CSS string from the w:tcBorders children, e.g.
     * "border-collapse:collapse; border-top:6px double #FF0000;".
     *
     * @param string $content The XML node content (a w:tcPr element)
     * @return string The CSS declarations
     */
    private function processTableStyle($content)
    {
        $style = "border-collapse:collapse;";
        if ($content === '') {
            return $style;
        }

        $tc = new XMLReader();
        $tc->XML($content);

        $more = $tc->read();
        while ($more) {
            if ($tc->nodeType == XMLReader::ELEMENT && $tc->name === "w:tcBorders") {
                $tc2 = new SimpleXMLElement($tc->readOuterXML());
                foreach ($tc2->children('w', true) as $ch) {
                    $side = $ch->getName();
                    if (in_array($side, ['left', 'top', 'bottom', 'right'], true)) {
                        $line = $this->convertLine($ch['val']);
                        $size = (int) $ch['sz'];
                        $color = (string) $ch['color'];
                        // Non-hex values (e.g. "auto") fall back to black to keep the CSS valid.
                        if (!preg_match('/^[0-9A-Fa-f]{6}$/', $color)) {
                            $color = '000000';
                        }
                        $style .= " border-" . $side . ":" . $size . "px $line #" . $color . ";";
                    }
                }
                // Skip the subtree just handled; next() already advances the cursor.
                $more = $tc->next();
            } else {
                $more = $tc->read();
            }
        }
        $tc->close();

        return $style;
    }

    /**
     * Maps a Word border line type to a CSS border-style keyword.
     *
     * @param mixed $in The w:val of the border (anything castable to string)
     * @return string One of "dotted", "dashed", "double", "none", "solid"
     */
    private function convertLine($in)
    {
        $in = (string) $in;

        if ($in === 'dotted') {
            return "dotted";
        }
        if (in_array($in, ['dotDash', 'dotdotDash', 'dotDotDash', 'dashDotStroked', 'dashed', 'dashSmallGap'], true)) {
            return "dashed";
        }
        if (in_array($in, ['double', 'triple', 'threeDEmboss', 'threeDEngrave', 'thick'], true)) {
            return "double";
        }
        if (in_array($in, ['nil', 'none'], true)) {
            return "none";
        }

        return "solid";
    }

    /**
     * PROCESS THE TABLE ROW
     *
     * Despite its name this is called once per w:tc and renders that cell:
     * all runs inside it are concatenated into a single <td>.
     *
     * @param string $content The XML node content (a w:tc element)
     * @return string The HTML code of the cell
     */
    private function processTableRow($content)
    {
        $ct = "";
        if ($content === '') {
            return "<td></td>";
        }

        $tc = new XMLReader();
        $tc->XML($content);

        $more = $tc->read();
        while ($more) {
            if ($tc->nodeType == XMLReader::ELEMENT && $tc->name === "w:r") {
                $ct .= $this->checkFormating($tc);
                $more = $tc->next();
            } else {
                $more = $tc->read();
            }
        }
        $tc->close();

        return "<td>" . $ct . "</td>";
    }

    /**
     * Renders one w:p element as <p>...</p> or <hN>...</hN>.
     *
     * @param string $p Outer XML of the paragraph
     * @return string HTML for the paragraph
     */
    private function renderParagraph($p)
    {
        // Decided per paragraph, so a heading never leaks into the paragraphs after it.
        $header = 0;
        if (preg_match('/<w:pStyle w:val="Heading([1-6])"/', $p, $matches)) {
            $header = (int) $matches[1];
        }

        // open h-tag or paragraph
        $text = ($header > 0) ? '<h' . $header . '>' : '<p>';

        $list_format = [];
        $paragraph = new XMLReader();
        $paragraph->XML($p);

        // loop through paragraph dom; next() already moves the cursor, so it
        // must not be followed by another read() or a sibling would be skipped.
        $more = $paragraph->read();
        while ($more) {
            $isElement = ($paragraph->nodeType == XMLReader::ELEMENT);
            $name = $paragraph->name;

            if ($isElement && $name === 'w:r') {
                $open = isset($list_format['open']) ? $list_format['open'] : '';
                $close = isset($list_format['close']) ? $list_format['close'] : '';
                $text .= $open . $this->checkFormating($paragraph) . $close;
                $list_format = [];
                $more = $paragraph->next();
            } elseif ($isElement && $name === 'w:pPr') { // lists
                $list_format = $this->getListFormating($paragraph);
                $more = $paragraph->next();
            } elseif ($isElement && $name === 'w:drawing') { // images
                $text .= (string) $this->checkImageFormating($paragraph);
                $more = $paragraph->next();
            } elseif ($isElement && $name === 'w:hyperlink') {
                $hyperlink = $this->getHyperlink($paragraph);
                $text .= $hyperlink['open'];
                $text .= $this->checkFormating($paragraph);
                $text .= $hyperlink['close'];
                $more = $paragraph->next();
            } else {
                $more = $paragraph->read();
            }
        }
        $paragraph->close();

        $text .= ($header > 0) ? '</h' . $header . '>' : '</p>';

        return $text;
    }

    /**
     * READS THE GIVEN DOCX FILE INTO HTML FORMAT
     *
     * @param string $filename The DOCX file name
     * @return string With HTML code, converted to the configured encoding
     * @throws RuntimeException When the file cannot be opened or parsed
     */
    public function readDocument($filename)
    {
        $this->file = $filename;
        $this->readZipPart($filename);

        $reader = new XMLReader();
        $reader->XML($this->doc_xml->saveXML());
        $text = '';

        // loop through docx xml dom
        $more = $reader->read();
        while ($more) {
            $isElement = ($reader->nodeType == XMLReader::ELEMENT);

            if ($isElement && $reader->name === 'w:p') {
                // each paragraph is parsed independently from its own outer XML
                $text .= $this->renderParagraph($reader->readOuterXML());
                $more = $reader->read();
            } elseif ($isElement && $reader->name === 'w:tbl') { // tables
                $table = new XMLReader();
                $table->XML($reader->readOuterXML());
                $text .= $this->checkTableFormating($table);
                $table->close();
                // The table was rendered as a whole; skip its subtree.
                $more = $reader->next();
            } else {
                $more = $reader->read();
            }
        }
        $reader->close();

        $html = mb_convert_encoding($text, $this->encoding);

        if ($this->debug) {
            // NOTE(review): wrapper markup reconstructed - confirm.
            echo "<div style='width:100%; height: 200px;'>";
            echo $html;
            echo "</div>";
        }

        return $html;
    }
}

// Usage sketch: extract the name field from one registration form.
$rt = new WordPHP();

// 'xxx.docx' is a placeholder; in practice iterate over the files of a
// directory with PHP's filesystem functions and run this for each one.
$text = $rt->readDocument('xxx.docx');

// Captures everything between the "姓名" (name) and "性别" (gender) labels;
// adjust the pattern to the actual layout of your form.
$patt1 = '/姓名([\w\W]*?)性别/';

$res = [[]];
if (preg_match_all($patt1, $text, $rs) > 0) {
    // Keep only the runs of Chinese characters from the first captured section.
    preg_match_all('/[\x{4e00}-\x{9fa5}]+/u', $rs[1][0], $res);
}

// The follow-up (collecting names across files and reporting duplicates) is
// not implemented here; this only outlines the approach.
var_dump($res[0]);

Rabenda

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
php写入文本检查重复,利用原生php方法，把docx文件内容项重复的查出来

原本想用phpword来读我的word文件的，奈何死活都不成功，也不知道是为什么，因此在互联网的大洋中寻找解决方案，终于是找到了一个解决的方案了，思路解读大概看了一下这个类的代码基本是利用原生的zip包来解决的，但是至于原理的话我也不是太懂，所以就抄了实现了需求先。目前只支撑docx，如果要读doc文件，我试了直接强转成docx，问题不大主要用这个功能来解决的问题就是，例如有1000个人每人发了一...
复制链接

扫一扫