以下内容来自 instructor 官方文档,在此记录一下。

参考使用

  • 代码
from typing import List
from pydantic import BaseModel
from openai import OpenAI
import instructor
 
# One PII item; this is the element type of PIIDataExtraction.private_data.
# NOTE(review): documented with `#` comments rather than a docstring because
# pydantic may surface a class docstring in the generated schema — confirm
# before adding one.
class Data(BaseModel):
    index: int  # integer index carried with the item; not read by scrub_data, which uses its own enumerate counter
    data_type: str  # label for the kind of PII; used as the prefix of the scrub placeholder
    pii_value: str  # literal text that scrub_data looks for and replaces
 
# Holds every PII item extracted from one document and can mask those items in
# a piece of text.
# NOTE(review): the class docstring is left verbatim because pydantic /
# instructor are expected to forward it to the model as the schema
# description — confirm before rewording it.
class PIIDataExtraction(BaseModel):
    """
    Extracted PII data from a document, all data_types should try to have consistent property names
    """

    private_data: List[Data]

    def scrub_data(self, content: str) -> str:
        """Return ``content`` with every known PII value replaced by a placeholder.

        Each value is replaced by ``<{data_type}_{i}>``, where ``i`` is the
        position of the item in ``private_data``.

        All values are substituted in one pass, longest first, so that:

        * an empty ``pii_value`` is ignored (``str.replace("", ...)`` would
          otherwise insert a placeholder between every character);
        * a value that is a substring of a longer one (``"John"`` versus
          ``"John Doe"``) cannot split the longer match;
        * text inside an already inserted placeholder is never rewritten.

        Args:
            content: The text to scrub.

        Returns:
            The scrubbed text, or ``content`` unchanged when there is nothing
            to replace.
        """
        import re  # local import keeps this block self-contained

        # Map each distinct value to its placeholder. The first item carrying
        # a given value wins, which matches sequential str.replace behaviour.
        placeholders = {}
        for i, data in enumerate(self.private_data):
            if data.pii_value:
                placeholders.setdefault(data.pii_value, f"<{data.data_type}_{i}>")

        if not placeholders:
            return content

        # Regex alternation tries branches left to right, so listing the
        # longest values first makes the longest candidate win at a position.
        longest_first = sorted(placeholders, key=len, reverse=True)
        pattern = re.compile("|".join(map(re.escape, longest_first)))
        return pattern.sub(lambda match: placeholders[match.group(0)], content)
 
 
 
 
# Plain OpenAI SDK client pointed at the endpoint on localhost:4000.
# NOTE(review): the API key is hard-coded in source — consider loading it from
# the environment or a secrets store instead.
_openai_client = OpenAI(
    base_url="http://localhost:4000",
    api_key="sk-ZTp5zuetNQoJNgG4xHgGzw",
)

# The instructor-wrapped client is the one used for the completion request.
client = instructor.from_openai(_openai_client)
 
# Sample input: a single line of Chinese text with x-masked name, address,
# phone number, ID number and e-mail, wrapped in a leading and trailing newline.
EXAMPLE_DOCUMENT = (
    "\n"
    "我叫xxx,家住xxxxx,我的电话号码是xxxxxx,我的身份证号码是xxxxx,我的邮箱是xxxx"
    "\n"
)
 
# Conversation sent to the model: a system instruction followed by the sample
# document as the user turn.
_messages = [
    dict(
        role="system",
        content="You are a world class PII scrubbing model, Extract the PII data from the following document",
    ),
    dict(role="user", content=EXAMPLE_DOCUMENT),
]

# Passing `response_model` requests the result as a PIIDataExtraction.
pii_data = client.chat.completions.create(
    model="dalongdemov3",
    response_model=PIIDataExtraction,
    messages=_messages,
)

# Print a header line, then the extraction serialised as JSON.
print("Extracted PII Data:")
_extraction_json = pii_data.model_dump_json()
print(_extraction_json)
  • 效果

instructor 进行PII 数据处理_参考资料

说明

利用 LLM 的结构化输出,可以解决不少以前感觉比较费事的问题。

参考资料

 https://python.useinstructor.com/examples/pii/#defining-the-structures