【ChatGPT】用 ChatGPT 写代码(二)——在 Word(docx)文档中提取指定字符之间的内容,并将每段内容保存到 Excel 中对应的独立单元格。

# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal

# 需要解析的PDF文件路径
pdf_file_path = 'D:\SVN\kernel_project\产品管理\产品技术规范\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'

def main():
# 打开PDF文件并解析内容
with open(pdf_file_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# 用于保存PDF内容的列表
content_list = []

# 遍历PDF页面,查找特定字段
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBoxHorizontal):
# 这里需要根据实际情况设置需要查找的字段和相应的条件
if '[SWS_Os' in lt_obj.get_text() and '⌋ ( )' in lt_obj.get_text():
# 将匹配到的字段按照空格分割,并去除多余的空格和换行符
fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
# 将字段添加到内容列表中
content_list.append(fields)

# 将内容保存到Excel表格中
df = pd.DataFrame(content_list)
df.to_excel('output.xlsx', index=False, header=False)
'''

'''
from docx import Document
import xlwt


def extract_fields_from_word(docx_path, fields, output_path):
doc = Document(docx_path)

# 创建 Excel 工作簿和工作表
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Extracted Fields')

# 设置 Excel 表头
worksheet.write(0, 0, 'Word File')
for row, field in enumerate(fields):
worksheet.write(row + 1, 0, field)

# 提取字段内容并保存到 Excel 表格
for col, field in enumerate(fields):
worksheet.write(0, col + 1, field)

for paragraph in doc.paragraphs:
if field in paragraph.text:
extracted_text = paragraph.text.replace(field, "").strip()
worksheet.write(fields.index(field) + 1, col + 1, extracted_text)

# 保存 Excel 表格
workbook.save(output_path)


# 使用示例
def main():
docx_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx" # 替换为实际的 Word 文档路径
fields = ['[SWS_Os', '⌋ ( )'] # 替换为要提取的字段列表
output_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.xls" # 替换为输出的 Excel 表格路径

extract_fields_from_word(docx_path, fields, output_path)
'''

import mysql.connector
import xlwt
from docx import Document

'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
doc = Document(docx_path)
extracted_text = ""
is_between_fields = False

for paragraph in doc.paragraphs:
#if field1 in paragraph.text and field2 in paragraph.text:
if field1 in paragraph.text:
extracted_text += paragraph.text + "\n"
is_between_fields = True
####
elif field2 in paragraph.text:
extracted_text += paragraph.text + "\n"
is_between_fields = False
elif is_between_fields:
extracted_text += paragraph.text + "\n"


mydb = mysql.connector.connect(
host="localhost", # 数据库主机地址
user="yourusername", # 数据库用户名
passwd="yourpassword" # 数据库密码
)

print(mydb)
####
# 创建 Excel 工作簿和工作表
try:
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Extracted Text')

# 将提取的文本保存到 Excel 表格中的单元格
worksheet.write(0, 0, 'Extracted Text')
worksheet.write(1, 0, extracted_text)

# 保存 Excel 表格
workbook.save(output_path)
print("Excel 表格已经保存至",output_path)

except Exception as e:
print("创建 Excel表格时出现错误",str(e))
'''

def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Copy every paragraph containing ``field1`` from a Word file to an .xls sheet.

    The document at ``docx_path`` is opened with ``Document`` and its
    paragraphs are scanned in order. Each paragraph whose text contains
    ``field1`` is written unmodified to its own row (column 0) of a sheet
    named 'Extracted Text', and the workbook is saved to ``output_path``.

    Args:
        docx_path: Path of the .docx file to read.
        field1: Marker substring; a paragraph is kept when it contains it.
        field2: Closing marker. Accepted so existing callers keep working,
            but it is not consulted: only the paragraph that contains
            ``field1`` is kept, so text continuing in later paragraphs is
            not captured.
        output_path: Destination path handed to ``workbook.save``.

    Returns:
        None. Errors raised while building or saving the workbook are caught
        and reported on standard output; errors raised while opening or
        reading the document propagate to the caller.
    """
    doc = Document(docx_path)

    # Read each paragraph's text once instead of once for the test and once
    # again for the append.
    paragraph_texts = (paragraph.text for paragraph in doc.paragraphs)
    extracted_text = [text for text in paragraph_texts if field1 in text]

    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # One matching paragraph per row, all in the first column.
        for row, text in enumerate(extracted_text):
            worksheet.write(row, 0, text)

        workbook.save(output_path)
        print("Excel 表格已经保存至",output_path)

    except Exception as e:
        # Best-effort by design: report the failure instead of aborting.
        print("创建 Excel表格时出现错误",str(e))


# 使用示例
def main():
    """Run the extraction with the hard-coded sample paths and markers."""
    # Raw strings: "\W" and "\A" are not valid escape sequences, and Python
    # warns about them in ordinary literals. The resulting path values are
    # exactly the same as before.
    docx_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx"  # replace with the real Word document path
    field1 = "[SWS_Os_"  # replace with the marker for field 1
    field2 = "⌋"  # replace with the marker for field 2
    output_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS_1.1.xls"  # replace with the output Excel path

    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output.

    Args:
        name: Value interpolated into the greeting.
    """
    print(f'Hi, {name}')


# Script entry point: runs only when the file is executed directly, not when
# it is imported. Prints the greeting first, then performs the extraction.
if __name__ == '__main__':
    print_hi('PyCharm')
    main()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

# This is a sample Python script.

# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
'''
import pandas as pd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBoxHorizontal

# 需要解析的PDF文件路径
pdf_file_path = 'D:\SVN\kernel_project\产品管理\产品技术规范\Autosar\CP\R4.2.2\AUTOSAR_SWS_OS.pdf'

def main():
    # 打开PDF文件并解析内容
    with open(pdf_file_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # 用于保存PDF内容的列表
        content_list = []

        # 遍历PDF页面,查找特定字段
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    # 这里需要根据实际情况设置需要查找的字段和相应的条件
                    if '[SWS_Os' in lt_obj.get_text() and '⌋ ( )' in lt_obj.get_text():
                        # 将匹配到的字段按照空格分割,并去除多余的空格和换行符
                        fields = [field.strip() for field in lt_obj.get_text().split(' ') if field.strip()]
                        # 将字段添加到内容列表中
                        content_list.append(fields)

    # 将内容保存到Excel表格中
    df = pd.DataFrame(content_list)
    df.to_excel('output.xlsx', index=False, header=False)
'''

'''
from docx import Document
import xlwt


def extract_fields_from_word(docx_path, fields, output_path):
    doc = Document(docx_path)

    # 创建 Excel 工作簿和工作表
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Extracted Fields')

    # 设置 Excel 表头
    worksheet.write(0, 0, 'Word File')
    for row, field in enumerate(fields):
        worksheet.write(row + 1, 0, field)

    # 提取字段内容并保存到 Excel 表格
    for col, field in enumerate(fields):
        worksheet.write(0, col + 1, field)

        for paragraph in doc.paragraphs:
            if field in paragraph.text:
                extracted_text = paragraph.text.replace(field, "").strip()
                worksheet.write(fields.index(field) + 1, col + 1, extracted_text)

    # 保存 Excel 表格
    workbook.save(output_path)


# 使用示例
def main():
    docx_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx"  # 替换为实际的 Word 文档路径
    fields = ['[SWS_Os', '⌋ ( )']  # 替换为要提取的字段列表
    output_path = "D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.xls"  # 替换为输出的 Excel 表格路径

    extract_fields_from_word(docx_path, fields, output_path)
'''

import mysql.connector
import xlwt
from docx import Document

'''
def extract_text_between_fields(docx_path, field1, field2, output_path):
    doc = Document(docx_path)
    extracted_text = ""
    is_between_fields = False

    for paragraph in doc.paragraphs:
        #if field1 in paragraph.text and field2 in paragraph.text:
        if field1 in paragraph.text:
            extracted_text += paragraph.text + "\n"
            is_between_fields = True
        ####
        elif field2 in paragraph.text:
            extracted_text += paragraph.text + "\n"
            is_between_fields = False
        elif is_between_fields:
            extracted_text += paragraph.text + "\n"


    mydb = mysql.connector.connect(
        host="localhost",  # 数据库主机地址
        user="yourusername",  # 数据库用户名
        passwd="yourpassword"  # 数据库密码
    )

    print(mydb)
    ####
    # 创建 Excel 工作簿和工作表
    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # 将提取的文本保存到 Excel 表格中的单元格
        worksheet.write(0, 0, 'Extracted Text')
        worksheet.write(1, 0, extracted_text)

        # 保存 Excel 表格
        workbook.save(output_path)
        print("Excel 表格已经保存至",output_path)

    except Exception as e:
        print("创建 Excel表格时出现错误",str(e))
'''

def extract_text_between_fields(docx_path, field1, field2, output_path):
    """Copy every paragraph containing ``field1`` from a Word file to an .xls sheet.

    The document at ``docx_path`` is opened with ``Document`` and its
    paragraphs are scanned in order. Each paragraph whose text contains
    ``field1`` is written unmodified to its own row (column 0) of a sheet
    named 'Extracted Text', and the workbook is saved to ``output_path``.

    Args:
        docx_path: Path of the .docx file to read.
        field1: Marker substring; a paragraph is kept when it contains it.
        field2: Closing marker. Accepted so existing callers keep working,
            but it is not consulted: only the paragraph that contains
            ``field1`` is kept, so text continuing in later paragraphs is
            not captured.
        output_path: Destination path handed to ``workbook.save``.

    Returns:
        None. Errors raised while building or saving the workbook are caught
        and reported on standard output; errors raised while opening or
        reading the document propagate to the caller.
    """
    doc = Document(docx_path)

    # Read each paragraph's text once instead of once for the test and once
    # again for the append.
    paragraph_texts = (paragraph.text for paragraph in doc.paragraphs)
    extracted_text = [text for text in paragraph_texts if field1 in text]

    try:
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('Extracted Text')

        # One matching paragraph per row, all in the first column.
        for row, text in enumerate(extracted_text):
            worksheet.write(row, 0, text)

        workbook.save(output_path)
        print("Excel 表格已经保存至",output_path)

    except Exception as e:
        # Best-effort by design: report the failure instead of aborting.
        print("创建 Excel表格时出现错误",str(e))


# 使用示例
def main():
    """Run the extraction with the hard-coded sample paths and markers."""
    # Raw strings: "\W" and "\A" are not valid escape sequences, and Python
    # warns about them in ordinary literals. The resulting path values are
    # exactly the same as before.
    docx_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS.docx"  # replace with the real Word document path
    field1 = "[SWS_Os_"  # replace with the marker for field 1
    field2 = "⌋"  # replace with the marker for field 2
    output_path = r"D:\WorkspaceTZX\实习期工作\AUTOSAR_SWS_OS_1.1.xls"  # replace with the output Excel path

    extract_text_between_fields(docx_path, field1, field2, output_path)
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output.

    Args:
        name: Value interpolated into the greeting.
    """
    greeting = 'Hi, {}'.format(name)
    print(greeting)


# Script entry point: runs only when the file is executed directly, not when
# it is imported. Prints the greeting first, then performs the extraction.
if __name__ == "__main__":
    print_hi("PyCharm")
    main()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

大道生

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值