【chatGPT】用chatGPT写代码（二）----在docx文档里面提取指定字符之间的内容，并把内容保存到对应excel的独立单元格。

最新推荐文章于 2024-07-24 00:52:05 发布

大道生

最新推荐文章于 2024-07-24 00:52:05 发布

阅读量181

点赞数

文章标签： python

本文链接：https://blog.csdn.net/hellotzx/article/details/130801679

版权

# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal

# 需要解析的PDF文件路径
pdf_file_path = 'D:\SVN\kernel_project\产品管理\产品技术规范\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'

def main():
# 打开PDF文件并解析内容
with open(pdf_file_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# 用于保存PDF内容的列表
content_list = []

# 遍历PDF页面，查找特定字段
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBoxHorizontal):
# 这里需要根据实际情况设置需要查找的字段和相应的条件
if '[SWS_Os' in lt_obj.get_text() and '⌋ ( )' in lt_obj.get_text():
# 将匹配到的字段按照空格分割，并去除多余的空格和换行符
fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
# 将字段添加到内容列表中
content_list.append(fields)

# 将内容保存到Excel表格中
df = pd.DataFrame(content_list)
df.to_excel('output.xlsx', index=False, header=False)
'''

'''
from docx import Document
import xlwt

def extract_fields_from_word(docx_path, fields, output_path):
doc = Document(docx_path)

# 创建 Excel 工作簿和工作表
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Extracted Fields')

# 设置 Excel 表头
worksheet.write(0, 0, 'Word File')
for row, field in enumerate(fields):
worksheet.write(row + 1, 0, field)

# 提取字段内容并保存到 Excel 表格
for col, field in enumerate(fields):
worksheet.write(0, col + 1, field)

for paragraph in doc.paragraphs:
if field in paragraph.text:
extracted_text = paragraph.text.replace(field, "").strip()
worksheet.write(fields.index(field) + 1, col + 1, extracted_text)

# 保存 Excel 表格
workbook.save(output_path)

# 使用示例
def main():
docx_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx" # 替换为实际的 Word 文档路径
fields = ['[SWS_Os', '⌋ ( )'] # 替换为要提取的字段列表
output_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.xls" # 替换为输出的 Excel 表格路径

extract_fields_from_word(docx_path, fields, output_path)
'''

import mysql.connector
import xlwt
from docx import Document

'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
doc = Document(docx_path)
extracted_text = ""
is_between_fields = False

for paragraph in doc.paragraphs:
#if field1 in paragraph.text and field2 in paragraph.text:
if field1 in paragraph.text:
extracted_text += paragraph.text + "\n"
is_between_fields = True
####
elif field2 in paragraph.text:
extracted_text += paragraph.text + "\n"
is_between_fields = False
elif is_between_fields:
extracted_text += paragraph.text + "\n"

mydb = mysql.connector.connect(
host="localhost", # 数据库主机地址
user="yourusername", # 数据库用户名
passwd="yourpassword" # 数据库密码
)

print(mydb)
####
# 创建 Excel 工作簿和工作表
try:
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Extracted Text')

# 将提取的文本保存到 Excel 表格中的单元格
worksheet.write(0, 0, 'Extracted Text')
worksheet.write(1, 0, extracted_text)

# 保存 Excel 表格
workbook.save(output_path)
print("Excel 表格已经保存至",output_path)

except Exception as e:
print("创建 Excel表格时出现错误",str(e))
'''

def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Write each ``field1`` ... ``field2`` span of a Word document to its own Excel row.

    A span opens at a paragraph containing ``field1`` and runs through the
    first paragraph in which ``field2`` occurs (searched after ``field1``
    when both markers share a paragraph). The paragraphs of one span are
    joined with newlines and written to column 0 of a sheet named
    'Extracted Text', one span per row.

    Args:
        docx_path: Path of the Word document, passed to ``Document``.
        field1: Text that marks the start of a span.
        field2: Text that marks the end of a span.
        output_path: Path handed to ``workbook.save``.

    Returns:
        None. Errors raised while building or saving the workbook are caught
        and printed; errors raised while opening the document propagate.
    """
    doc = Document(docx_path)
    extracted_text = _collect_spans(
        (paragraph.text for paragraph in doc.paragraphs), field1, field2
    )

    # Best-effort export: any failure in the workbook code is reported, not raised.
    # NOTE(review): xlwt is believed to reject over-long cell strings with a
    # plain Exception, which is why the broad catch is kept - confirm.
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # One span per row, always in the first column.
        for row, text in enumerate(extracted_text):
            worksheet.write(row, 0, text)

        workbook.save(output_path)
        print("Excel 表格已经保存至", output_path)

    except Exception as e:
        print("创建 Excel表格时出现错误", str(e))


def _collect_spans(paragraph_texts, start_marker, end_marker):
    """Group paragraph texts into newline-joined start-marker/end-marker spans."""
    spans = []
    current = None  # paragraphs of the open span, or None while outside a span

    for text in paragraph_texts:
        start_at = text.find(start_marker)
        if start_at != -1:
            if current is not None:
                # The previous span never met its end marker; keep what was gathered.
                spans.append("\n".join(current))
            current = [text]
            # Only an end marker located after the start marker closes this span.
            search_from = start_at + len(start_marker)
        elif current is not None:
            current.append(text)
            search_from = 0
        else:
            continue  # text outside any span is ignored

        if text.find(end_marker, search_from) != -1:
            spans.append("\n".join(current))
            current = None

    if current is not None:
        # The document ended while a span was still open.
        spans.append("\n".join(current))
    return spans

# 使用示例
def main():
    """Run the extraction with the example paths and markers hard-coded below."""
    # Raw strings: these Windows paths contain backslash pairs such as "\W"
    # and "\A" that are not valid escape sequences in an ordinary literal.
    docx_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx"  # replace with the real Word document path
    field1 = "[SWS_Os_"  # replace with the start marker
    field2 = "⌋"  # replace with the end marker
    output_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS_1.1.xls"  # replace with the Excel output path

    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` on standard output."""
    print(f'Hi, {name}')

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Script entry point: print the greeting, then run the extraction.
    print_hi('PyCharm')
    main()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal

# 需要解析的PDF文件路径
pdf_file_path = 'D:\SVN\kernel_project\产品管理\产品技术规范\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'

def main():
    # 打开PDF文件并解析内容
    with open(pdf_file_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # 用于保存PDF内容的列表
        content_list = []

        # 遍历PDF页面，查找特定字段
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    # 这里需要根据实际情况设置需要查找的字段和相应的条件
                    if '[SWS_Os' in lt_obj.get_text() and '⌋ ( )' in lt_obj.get_text():
                        # 将匹配到的字段按照空格分割，并去除多余的空格和换行符
                        fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
                        # 将字段添加到内容列表中
                        content_list.append(fields)

    # 将内容保存到Excel表格中
    df = pd.DataFrame(content_list)
    df.to_excel('output.xlsx', index=False, header=False)
'''

'''
from docx import Document
import xlwt


def extract_fields_from_word(docx_path, fields, output_path):
    doc = Document(docx_path)

    # 创建 Excel 工作簿和工作表
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Extracted Fields')

    # 设置 Excel 表头
    worksheet.write(0, 0, 'Word File')
    for row, field in enumerate(fields):
        worksheet.write(row + 1, 0, field)

    # 提取字段内容并保存到 Excel 表格
    for col, field in enumerate(fields):
        worksheet.write(0, col + 1, field)

        for paragraph in doc.paragraphs:
            if field in paragraph.text:
                extracted_text = paragraph.text.replace(field, "").strip()
                worksheet.write(fields.index(field) + 1, col + 1, extracted_text)

    # 保存 Excel 表格
    workbook.save(output_path)


# 使用示例
def main():
    docx_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx"  # 替换为实际的 Word 文档路径
    fields = ['[SWS_Os', '⌋ ( )']  # 替换为要提取的字段列表
    output_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.xls"  # 替换为输出的 Excel 表格路径

    extract_fields_from_word(docx_path, fields, output_path)
'''

import mysql.connector
import xlwt
from docx import Document

'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
    doc = Document(docx_path)
    extracted_text = ""
    is_between_fields = False

    for paragraph in doc.paragraphs:
        #if field1 in paragraph.text and field2 in paragraph.text:
        if field1 in paragraph.text:
            extracted_text += paragraph.text + "\n"
            is_between_fields = True
        ####
        elif field2 in paragraph.text:
            extracted_text += paragraph.text + "\n"
            is_between_fields = False
        elif is_between_fields:
            extracted_text += paragraph.text + "\n"


    mydb = mysql.connector.connect(
        host="localhost",  # 数据库主机地址
        user="yourusername",  # 数据库用户名
        passwd="yourpassword"  # 数据库密码
    )

    print(mydb)
    ####
    # 创建 Excel 工作簿和工作表
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # 将提取的文本保存到 Excel 表格中的单元格
        worksheet.write(0, 0, 'Extracted Text')
        worksheet.write(1, 0, extracted_text)

        # 保存 Excel 表格
        workbook.save(output_path)
        print("Excel 表格已经保存至",output_path)

    except Exception as e:
        print("创建 Excel表格时出现错误",str(e))
'''

def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Write each ``field1`` ... ``field2`` span of a Word document to its own Excel row.

    A span opens at a paragraph containing ``field1`` and runs through the
    first paragraph in which ``field2`` occurs (searched after ``field1``
    when both markers share a paragraph). The paragraphs of one span are
    joined with newlines and written to column 0 of a sheet named
    'Extracted Text', one span per row.

    Args:
        docx_path: Path of the Word document, passed to ``Document``.
        field1: Text that marks the start of a span.
        field2: Text that marks the end of a span.
        output_path: Path handed to ``workbook.save``.

    Returns:
        None. Errors raised while building or saving the workbook are caught
        and printed; errors raised while opening the document propagate.
    """
    doc = Document(docx_path)
    extracted_text = _collect_spans(
        (paragraph.text for paragraph in doc.paragraphs), field1, field2
    )

    # Best-effort export: any failure in the workbook code is reported, not raised.
    # NOTE(review): xlwt is believed to reject over-long cell strings with a
    # plain Exception, which is why the broad catch is kept - confirm.
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # One span per row, always in the first column.
        for row, text in enumerate(extracted_text):
            worksheet.write(row, 0, text)

        workbook.save(output_path)
        print("Excel 表格已经保存至", output_path)

    except Exception as e:
        print("创建 Excel表格时出现错误", str(e))


def _collect_spans(paragraph_texts, start_marker, end_marker):
    """Group paragraph texts into newline-joined start-marker/end-marker spans."""
    spans = []
    current = None  # paragraphs of the open span, or None while outside a span

    for text in paragraph_texts:
        start_at = text.find(start_marker)
        if start_at != -1:
            if current is not None:
                # The previous span never met its end marker; keep what was gathered.
                spans.append("\n".join(current))
            current = [text]
            # Only an end marker located after the start marker closes this span.
            search_from = start_at + len(start_marker)
        elif current is not None:
            current.append(text)
            search_from = 0
        else:
            continue  # text outside any span is ignored

        if text.find(end_marker, search_from) != -1:
            spans.append("\n".join(current))
            current = None

    if current is not None:
        # The document ended while a span was still open.
        spans.append("\n".join(current))
    return spans


# 使用示例
def main():
    """Run the extraction with the example paths and markers hard-coded below."""
    # Raw strings: these Windows paths contain backslash pairs such as "\W"
    # and "\A" that are not valid escape sequences in an ordinary literal.
    docx_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx"  # replace with the real Word document path
    field1 = "[SWS_Os_"  # replace with the start marker
    field2 = "⌋"  # replace with the end marker
    output_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS_1.1.xls"  # replace with the Excel output path

    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` on standard output."""
    greeting = f'Hi, {name}'
    print(greeting)


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Script entry point: print the greeting first, then run the extraction.
    print_hi('PyCharm')
    main()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/