python 医学文献检索（包括文献图片中的文字）

小吴学不会

已于 2023-01-13 00:39:03 修改

阅读量919

点赞数 3

文章标签：数据结构 python pandas

于 2023-01-13 00:31:42 首次发布

本文链接：https://blog.csdn.net/qq_52556429/article/details/128668303

版权

python 医学文献检索（包括文献图片中的文字）

实现条件
具体实现
结果
文献引用

实现条件

图像转文字

import pytesseract
from PIL import Image

# Minimal OCR demo: read one image and print the text Tesseract recognises.
# The context manager closes the image file as soon as OCR is finished.
with Image.open('a01.png') as img:
    text = pytesseract.image_to_string(img)
print(text)

文献检索

import os

# Demo: scan every file in a folder and report the lines that contain one of
# the search terms.
path = "./aa"  # folder to scan
files = os.listdir(path)  # names of all entries in the folder
print(files)
search_terms = ("wxy", "222")  # checked in order; only the first match per line is reported
for file in files:
    full_path = path + "/" + file
    if not os.path.isfile(full_path):
        # Sub-directories cannot be opened as text files.
        continue
    # The context manager closes each file even if decoding fails midway.
    with open(full_path, "r", encoding="utf-8") as logfile:
        for line in logfile:
            for term in search_terms:
                if term in line:
                    print(term + " exist in file:" + file)
                    break

dataframe创建并插入行

import pandas as pd

# Demo: create an empty two-column table, append one row and save it as CSV.
df = pd.DataFrame(data=None, columns=["文献名称", "位置"])
# Assigning to the label len(df.index) appends a new row at the end.
df.loc[len(df.index)] = [1, 7]
print(df)
# index=False leaves the row index out of the CSV file.
df.to_csv('data.csv', encoding='gbk', index=False)

具体实现

目录展示

在这里插入图片描述

导入相关库

import os
import fitz
import time
import re
import pytesseract
from PIL import Image
from jupyterlab_widgets import data
from pandas import DataFrame
from tqdm import tqdm

pdf转图片函数

def pdf2pic(path, pic_path, keywords, df, lang=None):
    """Extract the images embedded in a PDF, OCR them and record keyword hits.

    Every image object found in the PDF is saved as a PNG file in
    ``pic_path`` and run through Tesseract. When the recognised text contains
    ``keywords``, a row ``[path, image_number]`` is appended to ``df``.

    :param path: path of the PDF file
    :param pic_path: directory where the extracted images are saved
        (created if it does not exist)
    :param keywords: text to search for in the OCR result
    :param df: DataFrame with two columns that receives one row per hit
    :param lang: optional Tesseract language code(s) passed to pytesseract,
        e.g. ``"chi_sim"``; ``None`` keeps Tesseract's default language.
        The matching language data must be installed for a non-default value.
    :return: None
    """
    start = time.perf_counter()
    # An image object's definition contains both "/Type /XObject" and
    # "/Subtype /Image"; both patterns must match.
    check_xo = r"/Type(?= */XObject)"
    check_im = r"/Subtype(?= */Image)"

    os.makedirs(pic_path, exist_ok=True)
    imgcount = 0

    # The context manager closes the document even if extraction fails.
    with fitz.open(path) as doc:
        len_xref = doc.xref_length()  # number of entries in the xref table
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), len_xref - 1))

        for xref in range(1, len_xref):
            obj_def = doc.xref_object(xref)
            if not re.search(check_xo, obj_def) or not re.search(check_im, obj_def):
                continue
            imgcount += 1
            pix = fitz.Pixmap(doc, xref)

            # Anything that is not gray or RGB (e.g. CMYK) is converted to RGB
            # so that it can be written as PNG. After this step the pixmap
            # always has fewer than 5 components, so no second conversion
            # is needed.
            if pix.colorspace.name not in (fitz.csGRAY.name, fitz.csRGB.name):
                pix = fitz.Pixmap(fitz.csRGB, pix)

            # Build a flat file name from the PDF path: path separators and
            # drive colons are not allowed inside a file name.
            new_name = path.replace('/', '_').replace('\\', '_')
            new_name = new_name.replace(':', '') + "_img{}.png".format(imgcount)
            img_file = os.path.join(pic_path, new_name)
            pix.save(img_file)
            pix = None  # release the pixmap memory before the OCR step

            # ---------------------- image to text ----------------------
            with Image.open(img_file) as img:
                text = pytesseract.image_to_string(img, lang=lang)
            print(text)

            # ---------------------- record keyword hits ----------------
            if keywords in text:
                print("exist in file:" + path)
                df.loc[len(df.index)] = [path, imgcount]

    # Summary is printed once per document, after all images are processed.
    elapsed = time.perf_counter() - start
    print("运行时间:{}s".format(elapsed))
    print("提取了{}张图片".format(imgcount))
    print("------------------------------------------")

pdf识别text函数

def pdf2text(path, keywords, df):
    """Search the text layer of every page of a PDF for a keyword.

    For each page whose extracted text contains ``keywords``, a row
    ``[path, page_number]`` (page numbers start at 1) is appended to ``df``.
    The extracted text of every page is also printed.

    :param path: path of the PDF file
    :param keywords: text to search for
    :param df: DataFrame with two columns that receives one row per hit
    :return: None
    """
    with fitz.open(path) as pdf:
        # tqdm shows a progress bar while the pages are processed.
        for page_index, page in enumerate(tqdm(pdf), start=1):
            # Plain-text extraction; page.get_text('html') would return HTML.
            text = page.get_text()
            if keywords in text:
                print("exist in file:" + path)
                df.loc[len(df.index)] = [path, page_index]
            # Printed inside the loop so every page is shown and ``text`` is
            # never referenced for a document without pages.
            print(f'第{page_index}页解析内容:\n{text}')

main函数

要搜索的关键字定义在main代码块的第一行，通过更改keywords字符串即可检索不同的内容。
path是存放pdf文件的文件夹，程序只检索其中pdf文件的内容（包括pdf内图片中的文字）。

if __name__ == '__main__':
    # Script entry point: search every PDF in ``path`` for ``keywords`` and
    # write the hits (document, position) to data.csv.
    keywords = "病例"  # keyword to search for

    path = "./books"  # folder that holds the PDF files
    pic_path = r'img'  # folder that receives the extracted images
    # Only PDF files are searched; other entries in the folder are ignored.
    files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".pdf")
    )
    print(files)
    os.makedirs(pic_path, exist_ok=True)
    df = DataFrame(data=None, columns=["文献名称", "位置"])

    for file in files:
        pdf_file = path + "/" + file
        # Both functions append their hits to df and return nothing.
        pdf2pic(pdf_file, pic_path, keywords, df)
        pdf2text(pdf_file, keywords, df)

    print(df)
    # index=False leaves the row index out of the CSV file.
    df.to_csv('data.csv', encoding='gbk', index=False)

结果

识别pdf中文字和pdf中图片中文字，检索出含有关键字的文件，dataframe展示位置和具体页码

输出的dataframe

在这里插入图片描述

输出的data.csv

在这里插入图片描述

输出的图片

存放在img文件夹中
在这里插入图片描述

文献引用

作业要求-医学文献智能识别与检索系统设计
 灵感来源
 python中pdf处理库fitz的简单使用
 Python如何读取pdf中的图片
 python 之遍历文件夹下的所有文件并查找特定内容
 SCIHUB你不得不知道的python超级文献搜索工具

小吴学不会

关注

3
点赞
踩
13

收藏

觉得还不错? 一键收藏
0
评论
python 医学文献检索（包括文献图片中的文字）

识别pdf中文字和pdf中图片中文字，检索出含有关键字的文件，dataframe展示位置和具体页码。要搜索的关键字放在第一行了，通过更改keywords字符串来检索不同的内容。path是存放pdf的文件夹，我们检索的只是pdf中的内容。输出的图片存放在img文件夹中。
复制链接

扫一扫