python pdf 加密无法复制读取解决

最新推荐文章于 2024-08-23 16:53:34 发布

Sanfor

最新推荐文章于 2024-08-23 16:53:34 发布

阅读量1.5k

点赞数

分类专栏：日常工具 文章标签：python

本文链接：https://blog.csdn.net/weixin_42452716/article/details/123840625

版权

日常工具专栏收录该内容

3 篇文章 0 订阅

订阅专栏

加密分两种：
1、打开文件需要密码的
2、复制编辑需要权限密码的

pdf哈希值获取（解决1）

#!python3.8
# coding=utf-8
'''
 Author: Sanfor Chow
 Date: 2022-03-29 17:02:43
 LastEditors: Sanfor Chow
 LastEditTime: 2022-03-29 17:02:43
 FilePath: /knowledgeGraph/demo/pdf2john.py
'''

# Copyright (c) 2013 Shane Quigley, < shane at softwareontheside.info >

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sys
import os
from xml.dom import minidom

# True when the running interpreter's major version is 3.
PY3 = sys.version_info[0] == 3

class PdfParser:
    """Build a John the Ripper ``$pdf$`` hash line from an encrypted PDF.

    The file is read into memory as bytes and inspected with regular
    expressions; no PDF object model is built. ``parse()`` is the entry
    point and writes a single line to stdout.

    Attributes:
        file_name: the path given to the constructor.
        encrypted: raw bytes of the file.
        process: False when no ``PDF-x.y`` header was found, in which case
            ``parse()`` does nothing.
        pdf_spec: the matched header bytes such as ``b"PDF-1.7"`` (``b""``
            when no header was found).
    """

    # Single-character escapes of PDF literal strings, as byte values:
    # \n \r \t \b \f \( \) \\ . Any other escaped character stands for
    # itself (the backslash is dropped).
    _ESCAPE_BYTES = {
        0x6E: 0x0A, 0x72: 0x0D, 0x74: 0x09, 0x62: 0x08, 0x66: 0x0C,
        0x28: 0x28, 0x29: 0x29, 0x5C: 0x5C,
    }

    def __init__(self, file_name):
        """Read *file_name* and record whether it carries a PDF header."""
        self.file_name = file_name
        with open(file_name, 'rb') as f:
            self.encrypted = f.read()
        self.process = True
        self.pdf_spec = b""
        header = re.search(br'PDF-\d\.\d', self.encrypted)
        if header is None:
            sys.stderr.write("%s is not a PDF file!\n" % file_name)
            self.process = False
        else:
            self.pdf_spec = header.group(0)

    def parse(self):
        """Write the hash line for this file to stdout.

        A missing trailer or an unencrypted file is reported on stderr and
        the method returns normally.

        Raises:
            RuntimeError: when the encryption object, /V, /R, /P or the
                trailer /ID cannot be located.
        """
        if not self.process:
            return

        try:
            trailer = self.get_trailer()
        except RuntimeError as e:
            sys.stderr.write("%s : %s\n" % (self.file_name, str(e)))
            return
        object_id = self.get_object_id(b'Encrypt', trailer)
        if not object_id:
            raise RuntimeError("Could not find object id")
        encryption_dictionary = self.get_encryption_dictionary(object_id)

        v = self._required_field(br'/V (\d)', encryption_dictionary, "/V")
        r = self._required_field(br'/R (\d)', encryption_dictionary, "/R")
        # According to the docs:
        # Length : (Optional; PDF 1.4; only if V is 2 or 3). Default value: 40
        # Several /Length keys may appear in the object; the largest wins.
        length = b'40'
        longest = 0
        for digits in re.findall(br'/Length (\d+)', encryption_dictionary):
            if int(digits) > longest:
                longest = int(digits)
                length = digits
        p = self._required_field(br'/P (-?\d+)', encryption_dictionary, "/P")
        meta_encrypted = self.is_meta_data_encrypted(encryption_dictionary)
        i_d = self._document_id(trailer)
        passwords = self.get_passwords_for_JtR(encryption_dictionary)

        output = '*'.join([
            '$pdf$' + v.decode('ascii'),
            r.decode('ascii'),
            length.decode('ascii'),
            p.decode('ascii'),
            '1' if meta_encrypted else '0',
            str(len(i_d) // 2),
            i_d,
            passwords,
        ])

        # NOTE: the fields are handed to "%s" as encoded bytes, exactly as
        # before. Under Python 3 each field is therefore rendered as b'...'.
        # The emitted format is kept unchanged on purpose so that anything
        # already post-processing this line keeps working.
        base_name = os.path.basename(self.file_name.encode('UTF-8'))
        if meta_encrypted:
            sys.stdout.write("%s:%s:::::%s\n" % (base_name, output.encode('UTF-8'), self.file_name.encode('UTF-8')))
        else:
            gecos = self.parse_meta_data(trailer)
            sys.stdout.write("%s:%s:::%s::%s\n" % (base_name, output.encode('UTF-8'), gecos.encode('UTF-8'), self.file_name.encode('UTF-8')))

    def _required_field(self, pattern, data, label):
        """Return group 1 of *pattern* in *data* or raise RuntimeError."""
        match = re.search(pattern, data)
        if match is None:
            raise RuntimeError("Could not find " + label)
        return match.group(1)

    def _document_id(self, trailer):
        """Return the first element of the trailer /ID array as hex text."""
        match = re.search(br'/ID\s*\[\s*<(\w+)>\s*<\w+>\s*\]', trailer)
        if match is not None:
            return match.group(1).lower().decode('ascii')
        # some pdf files use () instead of <>: hex-encode the string bytes
        opener = re.search(br'/ID\s*\[\s*\(', trailer)
        if opener is not None:
            raw = self._find_literal_string(trailer, opener.end() - 1)
            if raw:
                decoded = self._decode_literal_string(raw)
                return ''.join('%02x' % b for b in decoded)
        raise RuntimeError("Could not find /ID tag")

    def get_passwords_for_JtR(self, encryption_dictionary):
        """Return the U/O (and UE/OE) entries as ``len*hex`` joined by ``*``.

        Each key is looked up first as a literal string ``( ... )`` and then
        as a hex string ``< ... >``; keys that are absent are skipped.
        """
        letters = [b"U", b"O"]
        if b"1.7" in self.pdf_spec or self.pdf_spec.startswith(b"PDF-2"):
            letters = [b"U", b"O", b"UE", b"OE"]
        fields = []
        for let in letters:
            opener = re.search(b'/' + let + br'\s*\(', encryption_dictionary)
            if opener is not None:
                raw = self._find_literal_string(encryption_dictionary,
                                                opener.end() - 1)
                if raw:
                    fields.append(self._format_password(raw))
                    continue
            hex_match = re.search(b'/' + let + br'\s*<(\w+)>',
                                  encryption_dictionary)
            if hex_match is None:
                continue
            digits = hex_match.group(1).lower()
            fields.append(str(len(digits) // 2) + '*' + digits.decode('ascii'))
        return '*'.join(fields)

    def is_meta_data_encrypted(self, encryption_dictionary):
        """Return False only when ``/EncryptMetadata false`` is present."""
        match = re.search(br'/EncryptMetadata\s(\w+)', encryption_dictionary)
        return match is None or match.group(1) != b"false"

    def parse_meta_data(self, trailer):
        """Return the XMP summary text reachable via /Root -> /Metadata.

        Returns "" when either reference is missing.
        """
        root_object_id = self.get_object_id(b'Root', trailer)
        if not root_object_id:
            return ""
        root_object = self.get_pdf_object(root_object_id)
        object_id = self.get_object_id(b'Metadata', root_object)
        if not object_id:
            return ""
        xmp_metadata_object = self.get_pdf_object(object_id)
        return self.get_xmp_values(xmp_metadata_object)

    def get_xmp_values(self, xmp_metadata_object):
        """Return title, creator, description, subject and creation year.

        The values are joined with spaces and every ':' is removed, since
        ':' is the field separator of the emitted hash line. Returns "" when
        the stream content cannot be parsed as XML.
        """
        xmp_metadata_object = xmp_metadata_object.partition(b"stream")[2]
        xmp_metadata_object = xmp_metadata_object.partition(b"endstream")[0]
        try:
            xml_metadata = minidom.parseString(xmp_metadata_object)
        except Exception:
            # Best effort: metadata is optional decoration of the hash line.
            return ""
        values = [self.get_dc_value(tag, xml_metadata)
                  for tag in ("title", "creator", "description", "subject")]
        created = xml_metadata.getElementsByTagName("xmp:CreateDate")
        if created:
            stamp = getattr(created[0].firstChild, "data", None)
            if stamp is not None:
                values.append(str(stamp[0:4]))
        return " ".join(values).replace(":", "")

    def get_dc_value(self, value, xml_metadata):
        """Return the text of the first ``rdf:li`` under ``dc:<value>``.

        Returns "" when the element, its ``rdf:li`` child or its text is
        missing.
        """
        found = xml_metadata.getElementsByTagName("dc:" + value)
        if not found:
            return ""
        items = found[0].getElementsByTagName("rdf:li")
        if not items:
            return ""
        return getattr(items[0].firstChild, "data", "")

    def get_encryption_dictionary(self, object_id):
        """Return the text of object *object_id* up to its ``endobj``."""
        encryption_dictionary = self.get_pdf_object(object_id)
        for o in encryption_dictionary.split(b"endobj"):
            if object_id + b" obj" in o:
                encryption_dictionary = o
        return encryption_dictionary

    def get_object_id(self, name, trailer):
        """Return ``b"<num> <gen>"`` of the reference ``/<name> num gen R``.

        Returns ``b""`` when *trailer* holds no such reference. The two
        numbers are always joined with a single space, whatever whitespace
        separated them in the file.
        """
        match = re.search(b'/' + name + br'\s(\d+)\s(\d)\sR', trailer)
        if match is None:
            return b""
        return match.group(1) + b" " + match.group(2)

    def get_pdf_object(self, object_id):
        """Return the bytes from ``<object_id> obj`` through ``endobj``.

        The object header must follow a CR or an LF. When it is not found
        the result is just the header followed by ``endobj``.
        """
        marker = object_id + b" obj"
        rest = b""
        for eol in (b"\r", b"\n"):
            rest = self.encrypted.partition(eol + marker)[2]
            if rest:
                break
        return (marker + rest).partition(b"endobj")[0] + b"endobj"

    def get_trailer(self):
        """Return the trailer text, or the xref-stream dictionary fallback.

        Raises:
            RuntimeError: "Can't find trailer" when neither form is found,
                "File not encrypted" when it has no Encrypt entry.
        """
        trailer = self.get_data_between(b"trailer", b">>", b"/ID")
        if not trailer:
            trailer = self.get_data_between(b"DecodeParms", b"stream", b"")
            if not trailer:
                raise RuntimeError("Can't find trailer")
        if b"Encrypt" not in trailer:
            raise RuntimeError("File not encrypted")
        return trailer

    def get_data_between(self, s1, s2, tag):
        """Concatenate lines from one containing *s1* to one containing *s2*.

        Line terminators are dropped. When *tag* is non-empty and the
        collected text does not contain it, the text is discarded and the
        search restarts on the following lines.
        """
        output = b""
        inside_first = False
        for line in re.split(br'\n|\r', self.encrypted):
            if not inside_first and s1 not in line:
                continue
            inside_first = True
            output += line
            if s2 in line:
                if tag == b"" or tag in output:
                    break
                output = b""
                inside_first = False
        return output

    def get_hex_byte(self, o_or_u, i):
        """Return byte *i* of *o_or_u* as lowercase hex without padding."""
        # bytearray yields ints on both Python 2 and Python 3.
        return '%x' % bytearray(o_or_u)[i]

    def get_password_from_byte_string(self, o_or_u):
        """Convert ``/U (...)``-style bytes into ``<byte count>*<hex>``.

        *o_or_u* is the key followed by a literal string, for example
        ``b"/U (abc)"`` or ``b"/UE(abc)"``. Escape sequences inside the
        parentheses are decoded before the bytes are hex-encoded.

        Raises:
            RuntimeError: when *o_or_u* contains no ``(``.
        """
        start = o_or_u.find(b'(')
        if start == -1:
            raise RuntimeError("Could not find literal string")
        raw = self._find_literal_string(o_or_u, start)
        if raw is None:
            # Unterminated string: use everything after the parenthesis.
            raw = o_or_u[start + 1:]
        return self._format_password(raw)

    def _format_password(self, raw):
        """Decode literal-string content and format it as ``len*hex``."""
        decoded = self._decode_literal_string(raw)
        return str(len(decoded)) + '*' + ''.join('%02x' % b for b in decoded)

    def _find_literal_string(self, data, start):
        """Return the bytes inside the literal string opening at *start*.

        ``data[start]`` must be ``(``. Escaped bytes are skipped and nested
        unescaped parentheses are balanced. Returns None when the string is
        not terminated.
        """
        buf = bytearray(data)
        depth = 0
        i = start
        while i < len(buf):
            c = buf[i]
            if c == 0x5C:      # backslash: the next byte is escaped
                i += 2
                continue
            if c == 0x28:      # (
                depth += 1
            elif c == 0x29:    # )
                depth -= 1
                if depth == 0:
                    return data[start + 1:i]
            i += 1
        return None

    def _decode_literal_string(self, raw):
        """Return *raw* as a bytearray with escape sequences resolved."""
        src = bytearray(raw)
        out = bytearray()
        i = 0
        n = len(src)
        while i < n:
            c = src[i]
            i += 1
            if c != 0x5C:
                out.append(c)
                continue
            if i >= n:          # dangling backslash at the very end
                break
            c = src[i]
            if 0x30 <= c <= 0x37:
                # Octal escape: one to three digits, high overflow ignored.
                code = 0
                digits = 0
                while digits < 3 and i < n and 0x30 <= src[i] <= 0x37:
                    code = code * 8 + (src[i] - 0x30)
                    i += 1
                    digits += 1
                out.append(code & 0xFF)
            elif c in (0x0A, 0x0D):
                # Backslash + end-of-line is a line continuation.
                i += 1
                if c == 0x0D and i < n and src[i] == 0x0A:
                    i += 1
            else:
                out.append(self._ESCAPE_BYTES.get(c, c))
                i += 1
        return out

    def unescape(self, esc):
        """Return the character denoted by the two-character escape *esc*.

        *esc* is a backslash followed by one character, e.g. ``"\\\\n"``.
        Characters without a special meaning are returned unchanged, i.e.
        the backslash is ignored.
        """
        if len(esc) < 2:
            return ""
        code = ord(esc[1])
        return chr(self._ESCAPE_BYTES.get(code, code))

if __name__ == "__main__":
    # At least one PDF path is required on the command line.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <PDF file(s)>\n" %os.path.basename(sys.argv[0]))
        sys.exit(-1)
    for argument in sys.argv[1:]:
        # Python 2 passes arguments as byte strings; decode them there.
        filename = argument if PY3 else argument.decode('UTF-8')
        parser = PdfParser(filename)
        try:
            parser.parse()
        except RuntimeError as error:
            # Report the failure and carry on with the next file.
            sys.stderr.write("%s : %s\n" % (filename, str(error)))

使用一个在线的网站也可以。
https://www.onlinehashcrack.com/tools-pdf-hash-extractor.php

# Extract the hash: keep only the second ':'-separated field of the
# pdf2john.py output, then drop its leading b' and trailing ' (the wrapper
# Python 3 puts around the field).
pdf2john.py foo-protected.pdf | sed "s/::.*$//" | sed "s/^.*://" | sed -r 's/^.{2}//' | sed 's/.\{1\}$//' > hash 
# Crack with hashcat. The hash and the masks are single-quoted so the shell
# does not expand $pdf / $4 or glob the ? and * characters.
# -m 10500 is the mode for $pdf$4*4*128 (PDF 1.4 - 1.6) hashes.
hashcat -m 10500 -a 3 '$pdf$4*4*128*-4*1*16*f8e0565b3c4d9795db089448fcf5426e*32*6cff45e6bdfcecaf9039ba17cdb06ae700000000000000000000000000000000*32*f78487d59915fc93bd17f9c687bf84d1360e98df8e382132d494ed78dc3518b1' -1 '?l?u?d' '?1?1?1?1?1?1'

复制编辑加密

import PyPDF2
import pikepdf
import textract
import pikepdf
from tqdm import tqdm

def pdf_decrypt(pdf_file):
    '''
        Rewrite a PDF in place without its copy/edit restriction.

        The file is opened with an empty password, so this only works for
        documents that open without a password prompt and are merely
        protected by a permissions password.

        The result is first written to a temporary file next to the
        original and then moved over it, so the original is never deleted
        before the new copy has been written successfully.

        pdf_file: path of the PDF to rewrite.
    '''
    tmp_file = os.fspath(pdf_file) + '.tmp'
    try:
        # The context manager closes the source before it is replaced.
        with pikepdf.open(pdf_file, password='') as pdf:
            pdf.save(tmp_file)
        os.replace(tmp_file, pdf_file)
    except Exception:
        # Do not leave a partial temporary file behind on failure.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        raise

    # password = None
    # wordlist = "dict/rockyou.txt"                #密码字典路径
    # n_words = len(list(open(wordlist, 'rb')))
    # fp = open(pdf_file, "rb+")
    # pdfFile = PyPDF2.PdfFileReader(fp)

    # with open(wordlist, "rb") as wordlist:
    #     if pdfFile.isEncrypted:
    #         for word in tqdm(wordlist, total=n_words, unit="word"):
    #             try:
    #                 pdf = pikepdf.open(pdf_file, password=word.strip())
    #             except:
    #                 continue
    #     else:
    #         password = word.decode().strip()
    #         print("[+] Password found:", password)
    #         exit(0)

pdf读取

    # num_pages = pdfReader.numPages
    # count = 0
    # text = ""
    # # while 循环会读取每一页
    # while count < num_pages:
    #     pageObj = pdfReader.getPage(count)
    #     count += 1
    #     text += pageObj.extractText()
    # #这里的if语句用以检查上面的库是否返回了词汇，因为PyPDF2 无法读取扫描文本
    # if text != "":
    #     text = text
    # # 如果上面返回False，就运行库textract 将PDF扫描文件转换为文本
    # else:
    #     text = textract.process(fileurl, method='tesseract', language='eng')
    # return text

def pdf_content(pdf_file):
    '''
        Read the text content of a PDF file and return it as one string.

        NOTE(review): PDFResourceManager, LAParams, TextConverter,
        PDFPageInterpreter, PDFPage and StringIO are not imported in the
        visible snippet; presumably they come from pdfminer and io.
    '''
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(
        rsrcmgr=resource_manager,
        outfp=text_buffer,
        codec='utf-8',
        laparams=LAParams(),
    )
    with open(pdf_file, 'rb') as pdf_stream:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # Empty page set, maxpages=0 and an empty password are passed through
        # unchanged; every page the iterator yields is rendered into the buffer.
        pages = PDFPage.get_pages(
            pdf_stream,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in pages:
            page_interpreter.process_page(page)
    extracted_text = text_buffer.getvalue()
    converter.close()
    text_buffer.close()
    return extracted_text

Sanfor

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
python pdf 加密无法复制读取解决

加密分两种：1、打开文件需要密码的2、复制编辑需要权限密码的pdf哈希值获取（解决1）#!python3.8# coding=utf-8''' Author: Sanfor Chow Date: 2022-03-29 17:02:43 LastEditors: Sanfor Chow LastEditTime: 2022-03-29 17:02:43 FilePath: /knowledgeGraph/demo/pdf2john.py'''# Copyright (c) 2013
复制链接

扫一扫

专栏目录