php读取word文档,没找到合适的php读取word的包,然后找了个读取pdf文档,wps先把word转成pdf,然后php读取pdf里面的内容,过滤内容

用 PHP 读取 Word 文档:没有找到合适的 PHP 读取 Word 的扩展包,于是改用读取 PDF 的方案——先用 WPS 把 Word 转成 PDF,再用 PHP 读取 PDF 里面的内容,并对内容进行过滤。
composer require smalot/pdfparser 
网址: https://github.com/smalot/pdfparser


<?php

require 'vendor/autoload.php';
// Import script: extracts the text of a PDF, splits every line into
// whitespace-separated tokens and inserts or updates one row of the `words`
// table per line, keyed by the `spell` column.
$inputFileName = './5000.pdf';

// Report mysqli failures through return values (checked below) so that a single
// bad row is logged and skipped instead of aborting the whole import.
mysqli_report(MYSQLI_REPORT_OFF);
$mysql = new mysqli("xxx","user","pwd","db");
if ($mysql->connect_errno) {
    exit("MySQL connection failed: {$mysql->connect_error}");
}

$parser = new \Smalot\PdfParser\Parser();
$pdf = $parser->parseFile($inputFileName);

$text = $pdf->getText();

// Split on any line-break sequence; PHP_EOL depends on the host OS and does not
// necessarily match the line endings present in the extracted text.
$arr = preg_split('/\R/', $text);
if ($arr === false) {
    $arr = [];
}
header('content-type:text/html;charset=utf-8');
$index = 0;
$addNum = $updateNum = 0;

// Prepared once and reused for every line: the word is bound as a parameter, so
// spellings that contain quotes (e.g. "n't") cannot break the lookup query.
$lookup = $mysql->prepare("select `id` from `words` where `spell` = ? limit 1");
if ($lookup === false) {
    exit("Failed to prepare lookup query: {$mysql->error}");
}

foreach ($arr as $k => $v) {
    // The first line and blank lines carry no data row.
    if ($k == 0 || trim($v) === '') {
        continue;
    }

    // The original author noted that explode() did not work for this split,
    // so the line is split with a whitespace regex instead.
    $tokens = preg_split("/\s/", $v);
    if ($tokens === false) {
        $tokens = [];
    }

    $temp_word = [];
    foreach ($tokens as $k2 => $token) {
        // Compare against '' rather than empty(): empty("0") is true and would
        // silently drop a legitimate "0" column, shifting the remaining ones.
        // NOTE(review): the `< 1` test on the first token is kept from the
        // original; string-vs-int comparison differs between PHP 7 and PHP 8 —
        // confirm which leading tokens are meant to be discarded.
        if ($token === '' || ($k2 == 0 && $token < 1)) {
            continue;
        }
        $temp_word[] = $token;
    }

    // Indexes 1..4 are read below, so a usable line needs at least five tokens.
    if (count($temp_word) < 5) {
        error_log("skipping malformed line {$k}: {$v}");
        continue;
    }
    $index++;

    $insert = [
        "spell"      => $temp_word[1],
        "type"       => $temp_word[2],
        "frequency"  => $temp_word[3],
        "dispersion" => $temp_word[4],
    ];

    // Look up an existing row for this spelling.
    $spell = $insert['spell'];
    if (!$lookup->bind_param('s', $spell) || !$lookup->execute()) {
        error_log("lookup failed for word {$spell}: {$lookup->error}");
        continue;
    }
    $foundId = null;
    $lookup->bind_result($foundId);
    $exist_id = ($lookup->fetch() === true && !empty($foundId)) ? $foundId : "";
    // Release the result set so the statement can be executed again.
    $lookup->free_result();

    if ($exist_id) {
        file_put_contents("update.log","...数据存在,正在更新...word:{$insert['spell']}".PHP_EOL,FILE_APPEND);

        $sql = update_sql($insert," where id= {$exist_id} ");
        file_put_contents("sql.txt",$sql.PHP_EOL,FILE_APPEND);

        // Count the row only when the statement actually succeeded.
        if ($mysql->query($sql)) {
            $updateNum++;
        } else {
            error_log("update failed for word {$spell}: {$mysql->error}");
        }
    } else {
        $sql = insert_sql($insert);

        // This branch inserts a new row; the message previously claimed the row
        // already existed and was being updated (copy-paste from the branch above).
        file_put_contents("log.log","...数据不存在,正在新增...word:{$insert['spell']}".PHP_EOL,FILE_APPEND);
        file_put_contents("sql.txt",$sql.PHP_EOL,FILE_APPEND);

        if ($mysql->query($sql)) {
            $addNum++;
        } else {
            error_log("insert failed for word {$spell}: {$mysql->error}");
        }
    }
}
$lookup->close();
echo "总数量{$index},新增数量:{$addNum},更新数量:{$updateNum}";

/**
 * Escape a value so it can be embedded inside a quoted SQL string literal.
 *
 * Backslash-escapes the backslash itself, double quotes, single quotes and
 * backticks, and strips the literal marker `_x000D_` from the value.
 * The backslash must be escaped too: otherwise a value ending in `\`, or
 * containing `\'`, terminates the surrounding literal early.
 *
 * NOTE(review): prepared statements or mysqli::real_escape_string() are the
 * safer choice wherever a live connection is available.
 *
 * @param mixed $str Value to escape; it is cast to string first.
 * @return string The escaped text.
 */
function replace_string($str=''){
    // strtr() performs all substitutions in a single pass, so backslashes that
    // are added for the quote characters are never escaped a second time.
    return strtr((string) $str, [
        '\\'      => '\\\\',
        '"'       => '\\"',
        "'"       => "\\'",
        '`'       => '\\`',
        '_x000D_' => '',
    ]);
}
/**
 * Build an INSERT statement for the given column => value map.
 *
 * Native ints and floats are emitted as bare numeric literals. Every string is
 * emitted as a quoted literal, including numeric-looking strings such as "007"
 * or "1e3": left unquoted, the database would reinterpret them as numbers and
 * a text column would lose the original spelling. Non-numeric values are
 * passed through replace_string() before being quoted.
 *
 * @param array  $insert Column name => value pairs.
 * @param string $table  Target table name.
 * @return string|false The SQL statement, or false when $insert is empty.
 */
function insert_sql($insert=array(),$table='words'){
    if(empty($insert))return false;

    $fields = [];
    $values = [];
    foreach($insert as $k=>$v){
        $fields[] = "`{$k}`";

        if(is_int($v) || is_float($v)){
            // Real numbers need no quoting.
            $values[] = $v;
        }elseif(is_numeric($v)){
            // A numeric string contains nothing that needs escaping; quote it as-is.
            $values[] = "'{$v}'";
        }else{
            $values[] = "'".replace_string((string) $v)."'";
        }
    }

    $fieldList = implode(',', $fields);
    $valueList = implode(',', $values);

    return "insert  into `{$table}` ($fieldList) values ($valueList);";
}

/**
 * Build an UPDATE statement from a column => value map.
 *
 * Every value is written as a quoted literal; non-numeric values go through
 * replace_string() first. The $where fragment is appended verbatim.
 *
 * @param array  $update Column name => new value pairs.
 * @param string $where  Raw WHERE clause, including the "where" keyword.
 * @param string $table  Target table name.
 * @return string|false The SQL statement, or false when $update is empty.
 */
function update_sql($update=array(),$where="",$table='words'){
    if (empty($update)) {
        return false;
    }

    $assignments = [];
    foreach ($update as $column => $raw) {
        // Numeric values are used as-is; everything else is escaped.
        $escaped = is_numeric($raw) ? $raw : replace_string($raw);
        $assignments[] = "`{$column}`='{$escaped}'";
    }

    $setClause = implode(',', $assignments);

    return "update `{$table}` set {$setClause} {$where};";
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值