Python分析pdf简历

最新推荐文章于 2024-08-13 04:21:57 发布

湾区人工智能

最新推荐文章于 2024-08-13 04:21:57 发布

阅读量2.4k

点赞数

分类专栏：python项目　文章标签：简历

本文链接：https://blog.csdn.net/BTUJACK/article/details/88069366

版权

本文介绍如何使用Python库pdfplumber解析PDF简历：按文字的页边距区分简历左侧的标题与右侧的内容，提取两者的对应关系，并用openpyxl将结果保存为Excel文件，帮助进行自动化人才筛选和数据分析。

摘要由CSDN通过智能技术生成

# -*- coding:utf-8 -*-


import pdfplumber #解析pdf文件，尤其带有表格的文件
from openpyxl import Workbook #读写Excel的文件
import xlrd
from xlutils.copy import copy

    

# 解析：按照文字页边距判断级别（标题或内容）
def parse(pdf):
    """Extract ``{left-column text: right-column text}`` pairs from a PDF.

    Every page is processed independently:

    1. Word fragments that sit next to each other on the same line are
       merged (see ``_merge_adjacent_words``).
    2. A word whose ``x0`` lies within ``x_dis`` of the page's smallest
       ``x0`` is treated as a key (left column). Any other word is appended
       to the value of the most recent key on that page.

    Args:
        pdf: Object exposing ``pages``; each page must provide
            ``extract_words(x_tolerance=...)`` returning a list of dicts
            with the keys ``'x0'``, ``'x1'``, ``'top'`` and ``'text'``.

    Returns:
        dict mapping each key text to its accumulated value text. A key text
        that shows up again resets its value to ``''``. Words that appear on
        a page before that page's first key are ignored.
    """
    tolerance = 5  # max horizontal gap for merging fragments (empirical)
    x_dis = 5      # max offset from the left margin for a word to be a key
    y_dis = 3      # max vertical difference for two words to share a line

    targets = {}
    for page in pdf.pages:
        words = page.extract_words(x_tolerance=5)
        if not words:
            # A page may yield no words; min() below would raise on an
            # empty sequence.
            continue

        # Left margin of the page: smallest (truncated) x0 of any word.
        first_dis = min(int(word['x0']) for word in words)

        cur_text = None  # current key (left column text)
        cur_top = None   # 'top' of the current key
        pre_top = 0      # 'top' of the value line currently being extended

        # Iterating the merged list means a fragment that was folded into
        # its predecessor is emitted once, not a second time on its own.
        for word in _merge_adjacent_words(words, tolerance):
            x0, top, text = word['x0'], word['top'], word['text']

            if abs(x0 - first_dis) < x_dis:
                # Word starts at the left margin: it opens a new key.
                targets[text] = ''
                cur_text = text
                cur_top = top
            elif cur_text is not None:
                if abs(cur_top - top) < y_dis:
                    # Same line as the key: append directly.
                    targets[cur_text] += text
                elif abs(pre_top - top) < y_dis:
                    # Same line as the previous value word: join by a space.
                    targets[cur_text] += ' ' + text
                else:
                    # A new line inside the same value block.
                    pre_top = top
                    targets[cur_text] += '\n' + text
    return targets


def _merge_adjacent_words(words, tolerance=5):
    """Join word fragments that sit next to each other on the same line.

    A word is folded into the most recently kept word when its ``top``
    differs from the preceding word's ``top`` by less than 1 and its ``x0``
    lies within ``tolerance`` of the preceding word's ``x1`` (for example a
    phone number split into "0176" and "81470662").

    Args:
        words: List of word dicts with ``'x0'``, ``'x1'``, ``'top'`` and
            ``'text'`` keys.
        tolerance: Maximum horizontal gap for two fragments to be merged.

    Returns:
        A new list of copied word dicts with merged ``'text'``; the input
        dicts are left untouched.
    """
    merged = []
    pre_x = None
    pre_top = None
    for word in words:
        x0, top = word['x0'], word['top']
        if merged and abs(top - pre_top) < 1 and abs(x0 - pre_x) < tolerance:
            merged[-1]['text'] += word['text']
        else:
            merged.append(dict(word))
        # Always compare against the immediately preceding word, whether or
        # not it was merged, so that chains of fragments keep merging.
        pre_x = word['x1']
        pre_top = top
    return merged


# 保存
def save(targets, out_path, sheet_name='targets'):
    """Write ``targets`` to an Excel workbook at ``out_path``.

    The active sheet is renamed to ``sheet_name`` and receives two rows:
    the keys of ``targets`` first, then the values in the same order.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

    # Row 1 holds the keys, row 2 the matching values.
    for row in (targets.keys(), targets.values()):
        sheet.append(list(row))

    workbook.save(out_path)


# 主函数入口
# NOTE(review): prints the module docstring; no module docstring is visible
# in this file, so this presumably prints "None" - confirm whether it is needed.
print(__doc__)
path = r'/Users/apple/Documents/ST/python/俞伟简历.pdf'
out_path = r'/Users/apple/Documents/ST/python/俞伟简历.xlsx'
# Open through the context manager so the PDF file handle is closed as soon
# as parsing is done, including when parse() raises.
with pdfplumber.open(path) as pdf:
    targets = parse(pdf)
save(targets, out_path)

print('运行结束！')










'''
words两列内容，
每页是一列内容；每个列表由无数个字典组成，每个字典由一些key，value对组成；简历里面的每个内容都会有一个字典包括，字典指明了这个内容的前后左右页边距；字典里text对应的value就是pdf内容了，提取出来就行。
x0是text内容左侧距离左边的距离，x1是text内容右侧距离左边的距离；如何保证把左侧放在表头，右侧放在表头下面呢？
[{'x0': Decimal('251.090'), 'x1': Decimal('355.490'), 'top': Decimal('68.012'), 'bottom': Decimal('94.052'), 'text': '个人简历'}, {'x0': Decimal('68.064'), 'x1': Decimal('116.304'), 'top': Decimal('107.912'), 'bottom': Decimal('119.912'), 'text': '基本信息'}, {'x0': Decimal('68.064'), 'x1': Decimal('104.172'), 'top': Decimal('141.152'), 'bottom': Decimal('153.152'), 'text': '姓名：'}, {'x0': Decimal('215.090'), 'x1': Decimal('239.090'), 'top': Decimal('141.152'), 'bottom': Decimal('153.152'), 'text': '俞伟'}, {'x0': Decimal('68.064'),