如何在windows电脑上加载pdf文档到本地大模型上从无到有实现

1.环境配置

1.1 ollama配置

        在ollama官网下载ollama软件并安装,安装时需将安装路径添加到环境变量path里面。

1.2 ollama加载模型

        ollama官网上有对应大模型,通过ollama run 模型名称,即可下载。例如 ollama run qwen:7b;下载完成后即可进行聊天对话功能。模型大小根据自己的电脑性能进行选择,当前示例16G内存可运行。

1.3 python环境搭建,langchain 安装,vscode软件下载

2.pdf文档处理

2.1 扫描版pdf文字识别并保存到Word

当前大模型在处理扫描版pdf时,对扫描版的处理会报错,需先将其转换成可进行识别的pdf文件

代码如下,运行前需先进行相应包的安装,通过pip install 对应安装包即可:



from docx import Document

from paddleocr import PaddleOCR  

import docx

import os

from docx import Document

from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

from docx.shared import  Cm

import fitz

from PIL import Image, ImageEnhance, ImageFilter

# Initialize the OCR model.
# BUG FIX: the original passed lang='ch' or 'en'. In Python, 'ch' or 'en'
# always evaluates to 'ch' (a non-empty string is truthy), so the 'en'
# alternative was dead code. PaddleOCR takes a single language code; the
# Chinese model also recognises embedded Latin text.
ocr = PaddleOCR(use_gpu=True, lang='ch')

def enhance_and_process_image(image_path, lang="chi_sim+eng"):

    """Enhance the image at *image_path* in place and overwrite the file.

    NOTE(review): despite the original docstring, no OCR happens here and
    nothing is returned — the function only boosts contrast, applies a
    median filter, and saves back over the source file.  The *lang*
    parameter is unused (it looks like a leftover Tesseract option —
    confirm before removing).
    """

    try:

        with Image.open(image_path) as im:

            # Image enhancement (tune the factors for your scans).

            enhancer = ImageEnhance.Contrast(im)

            im = enhancer.enhance(1.5)  # boost contrast

            #enhancer = ImageEnhance.Brightness(im)

            #im = enhancer.enhance(1.2)  # brighten

            im = im.filter(ImageFilter.MedianFilter(size=3))  # reduce speckle noise

            im.save(image_path)  # overwrites the original image file

    except FileNotFoundError:

        print(f"文件未找到: {image_path}")

    except Exception as e:

        print(f"处理{image_path}时发生未知错误:{e}")

     

def img_deal(image_path):
    """Run PaddleOCR on one image and collect its text regions.

    Returns a flat list of (raw_bbox, text) tuples, one per detected
    region across all result pages; returns an empty list when nothing
    is detected or when OCR raises.
    """
    try:
        result = ocr.ocr(image_path)
        if not result:
            print(f"No text detected in image: {image_path}")
            return []
        extracted = []
        for page_regions in result:
            for bbox, (text, _score) in page_regions:
                extracted.append((bbox, text))
        return extracted
    except Exception as e:
        print(f"An error occurred processing image {image_path}: {e}")
        return []

   

def coord_to_indent(coord, scale_factor=1000):
    """Map an (x, y) pixel coordinate to a Word left-indent in centimetres.

    Only the x component is used: x / scale_factor is interpreted as the
    indent in cm.  The y component is deliberately ignored (no vertical
    positioning is attempted).
    """
    x_val, _y_val = coord
    return Cm(x_val / scale_factor)


 

def process_and_save_images_to_word(image_paths, output_word_path):
    """OCR every image and append the recognized text to one Word document.

    Each image's regions become left-indented paragraphs (indent derived
    from the region's x coordinate), followed by a page break.  The
    document is re-saved to a temp file after every image so progress
    survives a crash, then promoted to *output_word_path* at the end.

    :param image_paths: ordered list of image file paths.
    :param output_word_path: final .docx destination path.
    """
    doc = Document()
    # BUG FIX: the temp path was originally assigned inside the loop, so an
    # empty image_paths list made the final os.rename raise NameError.
    temp_output_path = f"{output_word_path}_temp.docx"
    all_recognized_data = []
    for image_path in image_paths:
        print(image_path)
        #enhance_and_process_image(image_path)  # optional pre-enhancement
        recognized_data = img_deal(image_path)
        if recognized_data:  # only write pages that produced text
            all_recognized_data.append(recognized_data)
            for bbox, text in recognized_data:
                # bbox[0] is the region's first corner point (x, y).
                left_indent = coord_to_indent(bbox[0])
                paragraph = doc.add_paragraph(style='Normal')
                paragraph.paragraph_format.left_indent = left_indent
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
                run = paragraph.add_run(text)
                run.font.size = docx.shared.Pt(12)
            doc.add_page_break()  # one page per source image
        # Checkpoint after every image.
        doc.save(temp_output_path)

    # Ensure the temp file exists even when no images were processed.
    if not image_paths:
        doc.save(temp_output_path)
    os.rename(temp_output_path, output_word_path)
    print(f"Text with position saved to Word document at: {output_word_path}")

if __name__ == "__main__":
    pdf_file_path = 'F:/ai/output'

    image_folder_path = "F:/ai/output/images/"

    # Collect the page images in name order so pages stay sequential.
    valid_exts = ('.png', '.jpg', '.jpeg')
    image_paths = sorted(
        os.path.join(image_folder_path, name)
        for name in os.listdir(image_folder_path)
        if name.endswith(valid_exts)
    )

    output_word_path = os.path.splitext(pdf_file_path)[0] + '.docx'
    process_and_save_images_to_word(image_paths, output_word_path)

​

2.2 Word文档转pdf

可将doc文件打开另存为pdf,亦可通过代码直接转换

2.3 pdf分割,实现多进程并行处理加载

通过将大文件分解成小文件,再通过多进程并行处理的方式提升向量化建库速度,代码如下:

import os
import fitz  # PyMuPDF

def split_pdf(input_path, output_dir, max_pages=20):
    """Split every PDF in a directory into chunks of at most *max_pages* pages.

    Output files keep the source document's base name, e.g.
    ``report_split_1-20.pdf``.

    :param input_path: directory containing the source PDF files
                       (the original docstring wrongly described this
                       parameter as a single file path).
    :param output_dir: directory that receives the split PDFs.
    :param max_pages: maximum page count per output file, default 20.
    """
    for entry in os.listdir(input_path):
        # Skip non-PDF entries — fitz.open on arbitrary files would raise
        # and abort the whole batch.
        if not entry.lower().endswith('.pdf'):
            continue
        source_file = os.path.join(input_path, entry)
        base_name = os.path.splitext(entry)[0]  # file name without extension
        doc = fitz.open(source_file)
        try:
            total_pages = doc.page_count
            for start in range(0, total_pages, max_pages):
                # End of the current chunk, clamped to the document length.
                end_page = min(start + max_pages, total_pages)
                new_doc = fitz.Document()
                # insert_pdf copies a whole page range in one call; the
                # original looped page-by-page and also called load_page
                # without ever using the returned page.
                new_doc.insert_pdf(doc, from_page=start, to_page=end_page - 1)
                output_filename = f"{base_name}_split_{start+1}-{end_page}.pdf"
                new_doc.save(os.path.join(output_dir, output_filename))
                new_doc.close()
        finally:
            # Close the source even if saving a chunk fails.
            doc.close()

# Example usage: split every PDF found under the source directory.
input_pdf_path = r'F:\ai\DB\文件\原文件2'
output_directory =r'F:\ai\DB\文件\文件2'

# Create the destination directory if it does not exist yet.
os.makedirs(output_directory, exist_ok=True)

split_pdf(input_pdf_path, output_directory)

3.开始pdf文件向量化建库(嵌入并构建FAISS索引),并将其保存,代码如下。该代码的优势是不仅可以对新文档进行向量化保存,亦可加载原有已建好的索引,进而将新旧两个索引合并后保存:

import os
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from concurrent.futures import ProcessPoolExecutor
import time
import threading
import concurrent.futures

import pickle
from typing import List, Tuple


def process_pdf(file_path):
    """Load one PDF and split its text into overlapping chunks."""
    pages = PDFPlumberLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=128)
    return splitter.split_documents(pages)


def batch_process_pdfs(paths, embeddings):
    """Process PDFs in small parallel batches and build a FAISS index.

    Files are handled two at a time in worker processes; failures are
    reported and collected without aborting the batch.

    :param paths: list of PDF file paths to index.
    :param embeddings: embedding model used to vectorise the chunks.
    :return: the FAISS database, or None when nothing was processed
             successfully or an unexpected error occurred.
    """
    all_docs = []
    failed_files = []  # paths whose processing raised an exception

    batch_size = 2  # parallelism cap — keeps memory usage bounded
    print(len(paths))
    for i in range(0, len(paths), batch_size):
        current_batch_paths = paths[i:i + batch_size]

        try:
            with ProcessPoolExecutor(max_workers=batch_size) as executor:
                futures = {executor.submit(process_pdf, path): path
                           for path in current_batch_paths}
                # Collect each result as it finishes.  The original also
                # called concurrent.futures.wait(futures) AFTER the with
                # block, which was redundant: the executor's __exit__
                # already joins all workers.
                for future in concurrent.futures.as_completed(futures):
                    path = futures[future]
                    try:
                        all_docs.extend(future.result())
                    except Exception as e:
                        print(f"处理文件 {path} 时发生错误: {e}")
                        failed_files.append(path)
        except Exception as e:
            print(f"处理过程中发生未知错误: {e}")
            return None

    if not all_docs:
        print("没有成功处理的文档,无法继续构建数据库。")
        return None

    print(f"处理完成,共处理 {len(all_docs)} 个文档。")
    db = FAISS.from_documents(all_docs, embeddings)
    print(f"成功构建数据库")
    # NOTE(review): despite the .pkl name, FAISS.save_local writes a
    # directory; main() saves the merged DB again afterwards.
    db.save_local("faiss_index.pkl")
    print(f"处理失败的文件有: {failed_files}")
    return db

def save_db_async(db, path):
    """Persist *db* to *path*, tolerating a missing database.

    NOTE: despite the name this runs synchronously; it only guards
    against a None db and reports (without re-raising) save errors.
    """
    try:
        if db is not None:
            db.save_local(path)
        else:
            print("db is not initialized properly.")
    except Exception as e:
        print(f"异步保存数据库时发生错误: {e}")

def load_existing_db(file_path, embeddings):
    """Try to load a previously saved FAISS index; return None on failure."""
    try:
        return FAISS.load_local(
            file_path,
            embeddings=embeddings,
            # The index files must come from a trusted source.
            allow_dangerous_deserialization=True,
        )
    except FileNotFoundError:
        print("未找到现有数据库,将创建新的数据库。")
    except Exception as e:
        print(f"加载现有数据库时发生其他错误: {e}")
    return None


def merge_dbs(old_db_data, new_db):
    """Merge the freshly built vector DB into an existing one.

    :param old_db_data: previously loaded FAISS DB, or None/falsy.
    :param new_db: newly built FAISS DB.
    :return: the merged DB (the old object, mutated in place) when an old
             DB exists, otherwise *new_db* unchanged.
    """
    if not old_db_data:
        return new_db
    old_db_data.merge_from(new_db)
    # Removed the original debug print of old_db_data.docstore._dict —
    # dumping the entire docstore floods stdout on any non-trivial index.
    return old_db_data
        

def main():
    """End-to-end pipeline: embed all PDFs, merge with any existing index, save."""
    start_time = time.time()
    path = r"F:\ai\DB\文件\文件2"
    pdf_files = [os.path.join(path, filename)
                 for filename in os.listdir(path)
                 if filename.endswith(".pdf")]

    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")

    # Load a previously built index, if one exists.
    path1 = r"F:\ai\DB\IMAGE"
    existing_db = load_existing_db(path1, embeddings)

    # Embed and index every PDF found above.
    vector_db = batch_process_pdfs(pdf_files, embeddings)
    if vector_db is None:
        print("警告:处理文档时出现问题,无法继续。")
        return

    # Fold the new vectors into the old index (if any) and persist.
    final_db = merge_dbs(existing_db, vector_db)
    save_db_async(final_db, r"F:\ai\DB\IMAGE")

    elapsed_time = time.time() - start_time
    print(f"处理完成,总耗时: {elapsed_time:.2f} 秒")
    # Hard-exit so any lingering worker processes cannot keep the
    # interpreter alive; this intentionally skips normal cleanup.
    os._exit(0)

# Entry-point guard: required on Windows because ProcessPoolExecutor
# re-imports this module in every worker process.
if __name__ == '__main__':
    main()

4.加载训练模型并对话

4.1 单次多轮对话,并添加有记忆功能,代码如下:
 

# -*- coding: utf-8 -*-
import logging
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.callbacks.base import BaseCallbackManager
from typing import List


# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)



def load_file(path):
    """Build the retrieval-QA chain from a saved FAISS index.

    Returns (qa_chain, retriever).  NOTE(review):
    allow_dangerous_deserialization=True means the index files are
    treated as trusted input — never point this at untrusted files.
    """
    embeddings = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    db = FAISS.load_local(
        path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 24})

    # Stream tokens to stdout as the model generates them.
    manager = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    ollama_llm = Ollama(
        model="qwen:7b",
        temperature=0.2,
        top_p=0.9,
        callback_manager=manager,
    )

    # Keep the last 5 exchanges as conversational context.
    memory = ConversationBufferWindowMemory(
        memory_key="history", k=5, return_messages=True
    )

    # Memory is passed straight into the chain.
    qa_chain = RetrievalQA.from_chain_type(
        llm=ollama_llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        verbose=True,
    )
    return qa_chain, retriever


def handle_user_query(qa, retriever, user_query: str):
    """Answer *user_query* with the QA chain, print up to 4 sources, return the answer.

    :param qa: RetrievalQA-like object with .invoke(query, documents=...).
    :param retriever: retriever with .invoke(query) -> list of documents.
    :param user_query: the user's question.
    :return: the raw answer (dict with 'result' on the happy path), or
             None when an exception was logged.
    """
    try:
        docs = retriever.invoke(user_query)
        answer = qa.invoke(user_query, documents=docs)

        if isinstance(answer, dict) and 'result' in answer:
            # Does the answer quote any retrieved document verbatim?
            # BUG FIX: the original first tested
            # ``answer['result'] not in docs`` — a string can never be a
            # member of a Document list, so that branch was meaningless —
            # and, when a reference WAS found, printed "没有找到直接的参考文献"
            # and returned None, dropping the answer.  The messages are
            # now matched to the condition and the answer is always returned.
            referenced = any(doc.page_content in answer['result'] for doc in docs)
            if referenced:
                print("\n参考文献或相关文档:")
            else:
                print("回答未直接基于现有文件。")

            # List at most 4 distinct sources.
            unique_sources = set()
            for doc in docs:
                unique_sources.add(doc.metadata.get('source', '未知'))
            for source in list(unique_sources)[:4]:
                print(f"来源: {source}")
        else:
            print("未预期的响应类型,请检查qa.invoke的返回值。")
        return answer

    except Exception as e:
        logging.error(f"Error handling user query: {e}")

if __name__ == "__main__":
    path = r"F:\ai\DB\IMAGE"
    qa_chain, retriever = load_file(path)
    if not (qa_chain and retriever):
        logging.warning("Initialization failed. QA Chain or Retriever not properly loaded.")
    else:
        # Interactive loop: keep answering until the user types '退出'.
        while True:
            user_query = input("请输入您的问题(输入'退出'以结束): ")
            if user_query.lower() == '退出':
                break
            handle_user_query(qa_chain, retriever, user_query)

4.2 通过读取word文档,添加辅助内容,进行多轮问答,并将问和答写入在word文档中,代码如下:

# -*- coding: utf-8 -*-
import logging
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from typing import Dict, List

from docx import Document

# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)

def read_docx_lines(file_path):
    """Read a .docx file and return its non-empty, stripped text lines."""
    raw_lines = []
    for para in Document(file_path).paragraphs:
        # A single paragraph may contain manual line breaks.
        raw_lines += para.text.split('\n')
    return [text.strip() for text in raw_lines if text.strip()]

def save_answers_to_word(answers, output_path):
    """Write each question followed by its answer into a new Word document.

    :param answers: mapping of question -> answer text.
    :param output_path: destination .docx path.
    """
    doc = Document()
    for question, answer in answers.items():
        # Removed the stray print(answer) debug line — the model output is
        # already streamed to the console while answering.
        doc.add_paragraph(question)
        doc.add_paragraph(answer)

    doc.save(output_path)
    print(f"问答已保存至: {output_path}")

def process_doc_and_query_ai(file_path, qa_chain, retriever):
    """Query the model with every line of a .docx file; return {question: answer}.

    Each line is suffixed with a fixed instruction asking the model to
    consult the medical-device usability engineering review guideline.

    :param file_path: path to the .docx file containing one question per line.
    :param qa_chain: the RetrievalQA chain.
    :param retriever: the document retriever.
    :return: dict mapping each original line to the model's answer text.
    """
    lines = read_docx_lines(file_path)
    answers = {}
    # BUG FIX: the original literal was terminated with a full-width quote
    # (”) instead of ", which is a SyntaxError — the script could not run.
    additional_info = (
        "请结合以下信息进行回答:结合医疗器械可用性工程注册审查指导原则;"
    )

    for line in lines:
        if not line:  # skip empty lines
            continue
        # Compose the question plus the fixed instruction.
        full_query = f"{line} {additional_info}"
        answer = handle_user_query(qa_chain, retriever, full_query)
        # handle_user_query may return None or a non-dict on error; guard
        # so one bad answer does not abort the whole batch with TypeError.
        if isinstance(answer, dict) and 'result' in answer:
            answers[line] = answer['result']
        else:
            answers[line] = ""
    return answers


def load_file(path):
    """Load the FAISS index at *path* and wire up a streaming RetrievalQA chain.

    Returns (qa_chain, retriever).  allow_dangerous_deserialization=True
    means the index directory must come from a trusted source.
    """
    embedding_model = OllamaEmbeddings(model="shaw/dmeta-embedding-zh:latest")
    vector_db = FAISS.load_local(
        path, embeddings=embedding_model, allow_dangerous_deserialization=True
    )
    retriever = vector_db.as_retriever(
        search_type="similarity", search_kwargs={"k": 24}
    )

    # Stream generated tokens straight to stdout.
    callbacks = BaseCallbackManager([StreamingStdOutCallbackHandler()])
    llm = Ollama(model="qwen:7b", temperature=0.2, top_p=0.9,
                 callback_manager=callbacks)

    # Remember the last five exchanges.
    history = ConversationBufferWindowMemory(memory_key="history", k=5,
                                             return_messages=True)

    # Memory is passed straight into the chain.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        memory=history,
        verbose=True,
    )
    return qa_chain, retriever

def handle_user_query(qa, retriever, user_query):
    """Answer the query, print reference info and up to 4 sources, return the answer.

    :param qa: RetrievalQA-like object with .invoke(query, documents=...).
    :param retriever: retriever with .invoke(query) -> list of documents.
    :param user_query: the question string.
    :return: the raw answer object (dict with 'result' on the happy path).
    """
    # Retrieve the most relevant chunks for the query.
    docs = retriever.invoke(user_query)

    # Build the answer.
    answer = qa.invoke(user_query, documents=docs)

    if isinstance(answer, dict) and 'result' in answer:
        # Does the answer quote any retrieved document verbatim?
        # BUG FIX: the original tested ``answer['result'] not in docs``
        # (a string is never a member of a Document list, so that branch
        # was meaningless) and, exactly when a reference WAS found,
        # printed "没有找到直接的参考文献" and returned None — which made
        # the caller (process_doc_and_query_ai) crash on answer['result'].
        referenced = any(doc.page_content in answer['result'] for doc in docs)
        if referenced:
            print("\n参考文献或相关文档:")
        else:
            print("回答未直接基于现有文件。")

        # Show at most 4 distinct sources.
        unique_sources = {doc.metadata.get('source', '未知') for doc in docs}
        for source in list(unique_sources)[:4]:
            print(f"来源: {source}")
    else:
        print("未预期的响应类型,请检查qa.invoke的返回值。")
    return answer
if __name__ == "__main__":
    path = r"F:\ai\DB\inventor\db_faiss"
    # Build the QA chain and retriever from the saved index.
    qa_chain, retriever = load_file(path)

    # Questions come from this template document, one per line.
    docx_file_path = r"F:\ai\path\可用性模板文件.docx"
    processed_answers = process_doc_and_query_ai(docx_file_path, qa_chain, retriever)

    # Persist the question/answer pairs.
    output_docx_path = r"F:\ai\path\output.docx"
    save_answers_to_word(processed_answers, output_docx_path)


  • 4
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
下面是一个示例代码片段,演示如何在 Three.js 中加载外部骨骼动画并绑定到本地模型上: ```javascript // 加载模型和动画 var loader = new THREE.GLTFLoader(); loader.load('model.gltf', function (gltf) { var model = gltf.scene; var animation = gltf.animations[0]; // 创建动画混合器 var mixer = new THREE.AnimationMixer(model); // 将动画剪辑添加到动画混合器 var action = mixer.clipAction(animation); action.play(); // 加载骨骼动画 var loader = new THREE.FBXLoader(); loader.load('animation.fbx', function (object) { var animation = object.animations[0]; // 获取模型的骨骼和蒙皮网格 var skeleton = new THREE.SkeletonHelper(model); var skinnedMesh = model.children[0]; // 绑定骨骼和蒙皮网格 skinnedMesh.add(skeleton); skinnedMesh.bind(skeleton.skeleton); // 将骨骼动画添加到动画混合器 var action = mixer.clipAction(animation); action.play(); // 将模型添加到场景中 scene.add(model); // 更新动画混合器 function update() { requestAnimationFrame(update); mixer.update(deltaTime); } update(); }); }); ``` 其中,`model.gltf`是包含本地模型数据的 GLTF 文件,`animation.fbx`是包含骨骼动画数据的 FBX 文件。在加载完成后,我们可以获取模型、动画和骨骼网格,并创建一个动画混合器将动画剪辑添加到其中。然后,我们加载骨骼动画并将其添加到动画混合器中。接下来,我们将骨骼和蒙皮网格绑定在一起,并将模型添加到场景中。最后,我们创建一个循环函数来更新动画混合器。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值