用 PHP 读取 Word 文档:没有找到合适的 PHP 读取 Word 的包,于是改用读取 PDF 的包。先用 WPS 把 Word 转成 PDF,再用 PHP 读取 PDF 里面的内容,并对内容进行过滤。
composer require smalot/pdfparser
网址: https://github.com/smalot/pdfparser
<?php
require 'vendor/autoload.php';
// Import a word list from a PDF into the `words` table.
//
// Steps: extract the text of the PDF with smalot/pdfparser, split it into
// lines, split every line into whitespace-separated columns, then either
// insert a new row or update the existing row that has the same `spell`.
// Progress and the generated SQL are appended to log files in the current
// working directory; a one-line summary is echoed at the end.
$inputFileName = './5000.pdf';

$mysql = new mysqli("xxx","user","pwd","db");
if ($mysql->connect_errno) {
    // Only reachable when mysqli is configured to report errors by return
    // value instead of throwing.
    exit("数据库连接失败:{$mysql->connect_error}");
}

$parser = new \Smalot\PdfParser\Parser();
$pdf = $parser->parseFile($inputFileName);
$text = $pdf->getText();

// Split on every newline convention. PHP_EOL describes the host OS, not the
// extracted text, so exploding on it can leave the whole document on one line.
$arr = preg_split('/\r\n|\r|\n/', $text);
if ($arr === false) {
    $arr = array();
}

header('content-type:text/html;charset=utf-8');

// The existence check uses a bound parameter so that a word containing a
// quote (e.g. "o'clock") cannot break or alter the query.
$lookup = $mysql->prepare("select `id` from `words` where spell=? limit 1");
if ($lookup === false) {
    exit("查询语句预处理失败:{$mysql->error}");
}

$index = 0;
$addNum = $updateNum = 0;

foreach ($arr as $k => $v) {
    // Skip blank lines and the very first line of the document.
    // NOTE(review): the first line is presumably a table header - confirm.
    if (empty($v) || $k == 0) {
        continue;
    }

    // The original author noted that explode() did not work here, so the
    // line is split on any single whitespace character instead.
    $pieces = preg_split("/\s/", $v);
    if ($pieces === false) {
        $pieces = array();
    }

    $temp_word = array();
    foreach ($pieces as $k2 => $piece) {
        // Drop empty pieces (produced by consecutive whitespace) and drop the
        // first piece when it compares lower than 1.
        // NOTE(review): presumably this filters a non-rank leading token - confirm.
        if (empty($piece) || ($k2 == 0 && $piece < 1)) {
            continue;
        }
        $temp_word[] = $piece;
    }

    // Columns 1..4 are read below; a shorter line cannot be mapped and would
    // otherwise be stored as a row with empty fields.
    if (count($temp_word) < 5) {
        file_put_contents("skip.log","...列数不足,已跳过...line:{$k}".PHP_EOL,FILE_APPEND);
        continue;
    }

    // Only lines that yielded a complete row are counted in the total.
    $index++;

    $insert = array(
        "spell"      => $temp_word[1],
        "type"       => $temp_word[2],
        "frequency"  => $temp_word[3],
        "dispersion" => $temp_word[4],
    );

    // Look up an existing row for this spelling.
    $spell = $insert['spell'];
    $lookup->bind_param('s', $spell);
    if (!$lookup->execute()) {
        file_put_contents("error.log","...查询失败...word:{$spell} error:{$lookup->error}".PHP_EOL,FILE_APPEND);
        continue;
    }
    // Buffer the result so the connection is free for the insert/update below.
    $lookup->store_result();
    $rowId = null;
    $lookup->bind_result($rowId);
    $found = $lookup->fetch();
    $lookup->free_result();

    $exist_id = ($found === true && !empty($rowId)) ? $rowId : "";

    if ($exist_id) {
        file_put_contents("update.log","...数据存在,正在更新...word:{$insert['spell']}".PHP_EOL,FILE_APPEND);
        $sql = update_sql($insert," where id= {$exist_id} ");
        file_put_contents("sql.txt",$sql.PHP_EOL,FILE_APPEND);
        if ($mysql->query($sql) === false) {
            file_put_contents("error.log","...更新失败...word:{$insert['spell']} error:{$mysql->error}".PHP_EOL,FILE_APPEND);
            continue;
        }
        $updateNum++;
    } else {
        $sql = insert_sql($insert);
        // This branch handles words that are NOT in the table yet, so the log
        // line says "inserting" rather than repeating the update message.
        file_put_contents("log.log","...数据不存在,正在新增...word:{$insert['spell']}".PHP_EOL,FILE_APPEND);
        file_put_contents("sql.txt",$sql.PHP_EOL,FILE_APPEND);
        if ($mysql->query($sql) === false) {
            file_put_contents("error.log","...新增失败...word:{$insert['spell']} error:{$mysql->error}".PHP_EOL,FILE_APPEND);
            continue;
        }
        $addNum++;
    }
}

$lookup->close();

echo "总数量{$index},新增数量:{$addNum},更新数量:{$updateNum}";
/**
 * Escape a value so it can be placed inside a quoted SQL string literal.
 *
 * Backslashes are doubled first, then double quotes, single quotes and
 * backticks are prefixed with a backslash, and finally the literal marker
 * `_x000D_` is removed.
 *
 * The backslash must be handled before the quote characters: otherwise a
 * value ending in `\` (or containing `\'`) would neutralise the escaping of
 * the following quote and terminate the SQL literal early.
 *
 * @param mixed $str Value to escape; it is converted to a string first.
 * @return string The escaped text.
 */
function replace_string($str=''){
    // str_replace() applies the pairs in order, so the backslashes added for
    // the quote characters are not doubled again.
    return str_replace(
        array('\\',   '"',   "'",   '`',   '_x000D_'),
        array('\\\\', '\\"', "\\'", '\\`', ''),
        (string) $str
    );
}
/**
 * Build an INSERT statement from a column => value map.
 *
 * Numeric values are written unquoted. Every other value is passed through
 * replace_string() and wrapped in single quotes, unless the escaped result
 * is itself numeric, in which case it is also written unquoted.
 *
 * @param array  $insert Column name => value pairs.
 * @param string $table  Target table name.
 * @return string|false The SQL text, or false when $insert is empty.
 */
function insert_sql($insert=array(),$table='words'){
    if (empty($insert)) {
        return false;
    }

    $columns = array();
    $literals = array();

    foreach ($insert as $column => $raw) {
        $columns[] = "`{$column}`";

        // Escape only non-numeric values.
        $literal = is_numeric($raw) ? $raw : replace_string($raw);

        // The numeric check is repeated on the escaped value on purpose:
        // escaping may turn a string into a purely numeric one.
        $literals[] = is_numeric($literal) ? $literal : "'{$literal}'";
    }

    $columnList = implode(',', $columns);
    $valueList = implode(',', $literals);

    return "insert into `{$table}` ({$columnList}) values ({$valueList});";
}
/**
 * Build an UPDATE statement from a column => value map.
 *
 * Every value is wrapped in single quotes; non-numeric values are passed
 * through replace_string() first. The $where text is appended verbatim, so
 * the caller supplies the complete clause including the `where` keyword.
 *
 * @param array  $update Column name => new value pairs.
 * @param string $where  Text appended after the SET list.
 * @param string $table  Target table name.
 * @return string|false The SQL text, or false when $update is empty.
 */
function update_sql($update=array(),$where="",$table='words'){
    if (empty($update)) {
        return false;
    }

    $assignments = array();
    foreach ($update as $column => $raw) {
        // Numeric values are used as they are; everything else is escaped.
        $escaped = is_numeric($raw) ? $raw : replace_string($raw);
        $assignments[] = "`{$column}`='{$escaped}'";
    }

    $setClause = implode(',', $assignments);

    return "update `{$table}` set {$setClause} {$where};";
}