PHP 识别复杂 PDF 文档
使用阿里云的 PDF 文档识别服务实现。
前提:需先在阿里云视觉智能平台开通“文字识别 - PDF识别”服务。
/**
 * Initialise the OCR client from an AccessKey ID / AccessKey Secret pair.
 *
 * The credentials passed in are the ones placed in the client configuration;
 * the endpoint is fixed to the cn-shanghai OCR endpoint.
 *
 * @param string $accessKeyId     AccessKey ID used to authenticate requests.
 * @param string $accessKeySecret AccessKey Secret paired with the AccessKey ID.
 * @return Ocr OCR client built from the configuration above.
 */
public function createClient($accessKeyId, $accessKeySecret){
    $config = new Config([
        // Required: the caller-supplied AccessKey ID (previously a hard-coded placeholder).
        "accessKeyId" => $accessKeyId,
        // Required: the caller-supplied AccessKey Secret (previously a hard-coded placeholder).
        "accessKeySecret" => $accessKeySecret,
    ]);
    // Endpoint reference: https://api.aliyun.com/product/ocr
    $config->endpoint = "ocr.cn-shanghai.aliyuncs.com";

    return new Ocr($config);
}
/**
 * Run PDF OCR over a local PDF file and return the recognised text.
 *
 * The recognition API handles 5 pages per request, so the input file is split with
 * pdftk into 5-page chunk files named "{path}_{i}.pdf". Each chunk is submitted by
 * URL ("{request domain}/{path}_{i}.pdf") and the recognised words are concatenated,
 * each followed by "\n ". Chunk files are always removed before returning.
 *
 * @param array $param Expected keys:
 *                     - 'url'       local filesystem path of the PDF to recognise;
 *                     - 'path'      prefix for the temporary chunk files; it is also appended
 *                                   to the request domain to build the URL sent to the API;
 *                     - 'is_delete' optional (default 0); truthy to delete the file at 'url'
 *                                   once processing has finished.
 * @return array ['code' => 200, 'content' => recognised text] on success, or
 *               ['code' => 400, 'content' => error message] on failure.
 */
public function getContent($param){
    set_time_limit(0);
    $inputPdf = $param['url'];
    $param['is_delete'] = $param['is_delete'] ?? 0;

    // Both paths end up inside shell commands, so they must be quoted as single arguments.
    // NOTE(review): escapeshellarg() is locale-sensitive for multibyte characters; if paths may
    // contain non-ASCII names, confirm LC_CTYPE is set to a UTF-8 locale.
    $inputArg = escapeshellarg($inputPdf);

    // Number of pages sent to the recognition API per request.
    $pagesPerRequest = 5;

    // Ask pdftk for the page count to work out how many chunk files are needed.
    // shell_exec() yields null on failure, which intval() turns into 0.
    $totalPages = intval(shell_exec("pdftk $inputArg dump_data | grep NumberOfPages | awk '{print \$2}'"));
    $numFiles = intval(ceil($totalPages / $pagesPerRequest));
    $domain = request()->domain();

    // The environment variables ALIBABA_CLOUD_ACCESS_KEY_ID and ALIBABA_CLOUD_ACCESS_KEY_SECRET
    // must be set. Leaked project code can leak the AccessKey and endanger every resource of the
    // account; reading the key from the environment is for reference only, and the STS approach
    // is recommended. See https://help.aliyun.com/document_detail/311677.html
    $client = self::createClient(getenv("ALIBABA_CLOUD_ACCESS_KEY_ID"), getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'));
    $runtime = new RuntimeOptions([]);
    $data = [];

    try {
        if ($totalPages < 1) {
            // Without a page count nothing can be split; report an error instead of an empty success.
            throw new \RuntimeException('Unable to read the page count of the PDF file');
        }

        $str = '';
        for ($i = 0; $i < $numFiles; $i++) {
            $startPage = ($i * $pagesPerRequest) + 1;
            $endPage = min($startPage + $pagesPerRequest - 1, $totalPages);
            $chunkPath = $param['path'].'_'.$i.'.pdf';

            // Split the current page range into its own file. exec() appends to $output,
            // so it is reset on every iteration.
            $output = [];
            $returnVar = 0;
            exec("pdftk A=$inputArg cat A{$startPage}-{$endPage} output ".escapeshellarg($chunkPath), $output, $returnVar);
            if ($returnVar !== 0) {
                throw new \RuntimeException("pdftk failed to extract pages {$startPage}-{$endPage} (exit code {$returnVar})");
            }

            // Submit the chunk for recognition by its URL.
            $recognizePdfRequest = new RecognizePdfRequest([
                "fileURL" => $domain.'/'.$chunkPath
            ]);
            $result = $client->recognizePdfWithOptions($recognizePdfRequest, $runtime);
            $content = $result->body->data->wordsInfo ?? [];

            // Append every recognised word, dropping a single leading '>' or '<'.
            foreach ($content as $wordInfo) {
                $text = (string) ($wordInfo->word ?? '');
                if (in_array(substr($text, 0, 1), ['>', '<'], true)) {
                    $text = substr($text, 1);
                }
                $str .= $text."\n ";
            }
        }
        $data['code'] = 200;
        $data['content'] = $str;
    }
    catch (Exception $error) {
        // Normalise every failure to a TeaError so the message is read the same way.
        if (!($error instanceof TeaError)) {
            $error = new TeaError([], $error->getMessage(), $error->getCode(), $error);
        }
        $data['code'] = 400;
        $data['content'] = $error->message;
    }
    finally {
        // Always remove the generated chunk files, even if an exception escapes the catch above.
        for ($i = 0; $i < $numFiles; $i++) {
            $del_url = $param['path'].'_'.$i.'.pdf';
            if (file_exists($del_url)) {
                unlink($del_url);
            }
        }
    }

    // Delete the original file when requested.
    // NOTE(review): this also runs after a failed recognition (code 400), although the original
    // comment described it as happening on success; confirm which behaviour callers expect.
    if ($param['is_delete'] && is_file($inputPdf)) {
        unlink($inputPdf);
    }

    return $data;
}