OCR提取学历证信息

最新推荐文章于 2024-06-02 01:13:17 发布

shanesu

最新推荐文章于 2024-06-02 01:13:17 发布

阅读量932

点赞数

文章标签： ocr python 人工智能

本文链接：https://blog.csdn.net/qq_36080693/article/details/131485974

版权

import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen on some Windows/conda installs -- confirm it is still needed.
# It is set before the paddleocr import so the value is already in the
# environment when that package is loaded.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

from paddleocr import PaddleOCR
# Build one OCR engine for the whole script: Chinese models (lang="ch"),
# with the text-angle classifier switched on and GPU inference requested.
ocr = PaddleOCR(use_angle_cls=True, lang="ch",use_gpu=True)

[2023/07/01 01:12:07] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\37740/.paddleocr/whl\\det\\ch\\ch_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\37740/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_char_dict_path='C:\\Users\\37740\\AppData\\Roaming\\Python\\Python310\\site-packages\\paddleocr\\ppocr\\utils\\ppocr_keys_v1.txt', use_space_char=True, vis_font_path='./doc/fonts/simfang.ttf', drop_score=0.5, e2e_algorithm='PGNet', e2e_model_dir=None, e2e_limit_side_len=768, e2e_limit_type='max', e2e_pgnet_score_thresh=0.5, e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_pgnet_valid_set='totaltext', e2e_pgnet_mode='fast', use_angle_cls=True, cls_model_dir='C:\\Users\\37740/.paddleocr/whl\\cls\\ch_ppocr_mobile_v2.0_cls_infer', cls_image_shape='3, 48, 192', label_list=['0', '180'], cls_batch_num=6, cls_thresh=0.9, enable_mkldnn=False, cpu_threads=10, use_pdserving=False, warmup=False, sr_model_dir=None, sr_image_shape='3, 32, 128', sr_batch_num=1, draw_img_save_dir='./inference_results', save_crop_res=False, crop_res_save_dir='./output', use_mp=False, total_process_num=1, process_id=0, benchmark=False, save_log_path='./log_output/', show_log=True, use_onnx=False, 
output='./output', table_max_len=488, table_algorithm='TableAttn', table_model_dir=None, merge_no_span_structure=True, table_char_dict_path=None, layout_model_dir=None, layout_dict_path=None, layout_score_threshold=0.5, layout_nms_threshold=0.5, kie_algorithm='LayoutXLM', ser_model_dir=None, re_model_dir=None, use_visual_backbone=True, ser_dict_path='../train_data/XFUND/class_list_xfun.txt', ocr_order_method=None, mode='structure', image_orientation=False, layout=True, table=True, ocr=True, recovery=False, use_pdf2docx_api=False, lang='ch', det=True, rec=True, type='ocr', ocr_version='PP-OCRv3', structure_version='PP-StructureV2')

# Path of the image to recognize (raw string, so the Windows backslashes
# are kept literally):
img_path = r"C:\Users\37740\Pictures\Screenshots\1.png"

（图片：待识别的学位证书截图，即上文 img_path 指向的 1.png）

# Recognition result: run OCR on the image with the angle classifier
# enabled (cls=True). The loops below read the detections of this image
# from result[0].
result = ocr.ocr(img_path, cls=True)

[2023/07/01 01:13:30] ppocr DEBUG: dt_boxes num : 18, elapse : 7.974095821380615
[2023/07/01 01:13:30] ppocr DEBUG: cls num  : 18, elapse : 0.03300142288208008
[2023/07/01 01:13:30] ppocr DEBUG: rec_res num  : 18, elapse : 0.2569854259490967

# Dump every raw detection entry of the image, one per output line.
detections = result[0]
for detection in detections:
    print(detection)

[[[541.0, 227.0], [590.0, 227.0], [590.0, 243.0], [541.0, 243.0]], ('1927', 0.8937745094299316)]
[[[358.0, 268.0], [768.0, 268.0], [768.0, 312.0], [358.0, 312.0]], ('学士学位证书', 0.9404186606407166)]
[[[281.0, 346.0], [308.0, 346.0], [308.0, 365.0], [281.0, 365.0]], ('女', 0.5978937149047852)]
[[[334.0, 345.0], [630.0, 346.0], [630.0, 369.0], [334.0, 368.0]], ('1990年11月16日生：在', 0.9189730286598206)]
[[[662.0, 347.0], [806.0, 347.0], [806.0, 367.0], [662.0, 367.0]], ('南京晓庄学院', 0.8395994305610657)]
[[[198.0, 400.0], [415.0, 400.0], [415.0, 420.0], [198.0, 420.0]], ('经济学（金融与保险）', 0.886256992816925)]
[[[521.0, 400.0], [842.0, 400.0], [842.0, 420.0], [521.0, 420.0]], ('专业完成本科学习计划业已', 0.9024704098701477)]
[[[106.0, 455.0], [858.0, 447.0], [858.0, 474.0], [106.0, 481.0]], ('毕业，经审核符合《中华人民共和国学位条例》的规定授予经济学', 0.9644796252250671)]
[[[106.0, 509.0], [211.0, 506.0], [212.0, 534.0], [106.0, 536.0]], ('学士学位', 0.8394070863723755)]
[[[508.0, 552.0], [622.0, 552.0], [622.0, 581.0], [508.0, 581.0]], ('院长', 0.9888302087783813)]
[[[197.0, 597.0], [346.0, 594.0], [347.0, 618.0], [197.0, 621.0]], ('南京晓庄学院', 0.9270262718200684)]
[[[484.0, 589.0], [657.0, 589.0], [657.0, 612.0], [484.0, 612.0]], ('学位评定委员会主席', 0.9949430823326111)]
[[[743.0, 644.0], [999.0, 638.0], [999.0, 663.0], [744.0, 669.0]], ('〇六年六月三十日', 0.8751929998397827)]
[[[101.0, 661.0], [186.0, 657.0], [187.0, 681.0], [102.0, 685.0]], ('证书编号', 0.9989465475082397)]
[[[476.0, 685.0], [667.0, 681.0], [667.0, 701.0], [476.0, 704.0]], ('普通高等教育本科毕业生', 0.9703378081321716)]
[[[938.0, 749.0], [1044.0, 749.0], [1044.0, 777.0], [938.0, 777.0]], ('迈成教育', 0.9693202376365662)]
[[[12.0, 773.0], [208.0, 774.0], [208.0, 799.0], [12.0, 797.0]], ('搜狐号@转本小达人', 0.9803941249847412)]

# Show the results: print only the recognized text of each detection.
# Each entry holds the text at index [1][0].
for detection in result[0]:
    recognized_text = detection[1][0]
    print(recognized_text)

1927
学士学位证书
女
1990年11月16日生：在
南京晓庄学院
经济学（金融与保险）
专业完成本科学习计划业已
毕业，经审核符合《中华人民共和国学位条例》的规定授予经济学
学士学位
院长
南京晓庄学院
学位评定委员会主席
〇六年六月三十日
证书编号
普通高等教育本科毕业生
迈成教育
搜狐号@转本小达人

# Show the results as a single string: every recognized text fragment is
# prefixed with "#" and the pieces are concatenated in detection order.
# Built with one str.join call instead of repeated `istr = istr + ...`,
# which re-copies the growing string on every iteration.
istr = "".join("#" + line[1][0] for line in result[0])
print(istr)

#1927#学士学位证书#女#1990年11月16日生：在#南京晓庄学院#经济学（金融与保险）#专业完成本科学习计划业已#毕业，经审核符合《中华人民共和国学位条例》的规定授予经济学#学士学位#院长#南京晓庄学院#学位评定委员会主席#〇六年六月三十日#证书编号#普通高等教育本科毕业生#迈成教育#搜狐号@转本小达人

import openai

def get_completion_from_messages(prompt,
                                 model="gpt-3.5-turbo",
                                 temperature=0,
                                 max_tokens=500):
    """Send one user prompt to the chat-completion endpoint and return the reply.

    Args:
        prompt: Text sent as the content of a single "user" message.
        model: Name of the chat model to query.
        temperature: Degree of randomness of the model's output.
        max_tokens: Maximum number of tokens the model may output.

    Returns:
        The "content" of the message in the first returned choice.

    NOTE(review): ``openai.ChatCompletion`` is the interface of the pre-1.0
    ``openai`` package -- confirm the installed version before reuse.
    """
    request_options = dict(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    completion = openai.ChatCompletion.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message["content"]

# Extraction prompt: asks the model to pull the listed fields out of the OCR
# text and answer with a JSON object, using "unknown" for missing values.
# The OCR text (istr) is interpolated between two "###" markers.
# NOTE(review): istr itself starts with "#", so the rendered text opens with
# four "#" characters rather than the three the instructions describe, and
# "#" is also the separator inside the text -- consider a delimiter that
# cannot collide with the separator.
prompt = f"""

从文本段落中提取以下信息：姓名、出生日期、学校名称、毕业形式、学历、专业、毕业日期。 \
文本段落通过前后三个井号来界定范围。 \
将您的反馈结果格式化为一个JSON对象，以"姓名"、"出生日期"、"学校名称"、"毕业形式"、"学历"、"专业"、"毕业日期"作为key。
如果信息未提供, 使用"unknown" 作为值。
文本段落：###{istr}###

"""

# Notebook-style echo of the rendered prompt; as a bare expression it has no
# effect when the code is run as a plain script.
prompt

'\n\n从文本段落中提取以下信息：姓名、出生日期、学校名称、毕业形式、学历、专业、毕业日期。 文本段落通过前后三个井号来界定范围。 将您的反馈结果格式化为一个JSON对象，以"姓名"、"出生日期"、"学校名称"、"毕业形式"、"学历"、"专业"、"毕业日期"作为key。\n如果信息未提供, 使用"unknown" 作为值。\n文本段落：####1927#学士学位证书#女#1990年11月16日生：在#南京晓庄学院#经济学（金融与保险）#专业完成本科学习计划业已#毕业，经审核符合《中华人民共和国学位条例》的规定授予经济学#学士学位#院长#南京晓庄学院#学位评定委员会主席#〇六年六月三十日#证书编号#普通高等教育本科毕业生#迈成教育#搜狐号@转本小达人###\n\n'

# Send the extraction prompt with the helper's default model and settings,
# then print the text of the reply.
response = get_completion_from_messages(prompt)
print(response)

{
  "姓名": "unknown",
  "出生日期": "1990年11月16日",
  "学校名称": "南京晓庄学院",
  "毕业形式": "unknown",
  "学历": "学士学位",
  "专业": "经济学（金融与保险）",
  "毕业日期": "2006年6月30日"
}

shanesu

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
OCR提取学历证信息

【代码】OCR提取学历证信息。
复制链接

扫一扫