Python运用讯飞公文校对接口对Word文档进行校对

Andiya_dv
已于 2024-01-26 16:58:25 修改
阅读量978
点赞数 19
文章标签： python word
于 2024-01-25 16:41:24 首次发布
本文链接：https://blog.csdn.net/qq_45955249/article/details/135843926
版权
人狠话不多，直接上代码。
# -*- coding:utf-8 -*-
from datetime import datetime
from wsgiref.handlers import format_date_time
from time import mktime
import hashlib
import base64
import hmac
from urllib.parse import urlencode
import json
import requests
import easygui
import docx
from spire.doc import *
from spire.doc.common import *
from tkinter.messagebox import *


class AssembleHeaderException(Exception):
    """Signals that a request URL could not be split for signing.

    The text passed in is kept on ``message`` and is also the exception's
    only positional argument.
    """

    def __init__(self, msg):
        super().__init__(msg)
        self.message = msg


class Url:
    """Plain record holding the three parts of a split request URL."""

    def __init__(self, host, path, schema):
        # Stored verbatim; no validation or normalisation happens here.
        self.schema = schema
        self.path = path
        self.host = host


class WebsocketDemo:
    """Client for the proofreading endpoint stored in ``self.url``.

    Despite the class name, the request is a single HTTPS POST made with
    ``requests``. The class builds an HMAC-SHA256 signed URL, posts the
    base64-encoded text, and decodes the base64 JSON found in the reply.
    """

    # Upper bound, in seconds, on how long get_result waits for the service.
    # Without a timeout ``requests`` may block indefinitely on a stalled
    # connection.
    REQUEST_TIMEOUT = 60

    def __init__(self, APPId, APISecret, APIKey, Text):
        """Store the credentials and the text to be proofread.

        Args:
            APPId: Application id sent in the request body and headers.
            APISecret: Secret used as the HMAC key when signing.
            APIKey: Key embedded in the authorization string.
            Text: The text (``str``) to proofread.
        """
        self.appid = APPId
        self.apisecret = APISecret
        self.apikey = APIKey
        self.text = Text
        self.url = 'https://cn-huadong-1.xf-yun.com/v1/private/s37b42a45'

    def sha256base64(self, data):
        """Return the base64 text of the SHA-256 digest of ``data`` (bytes)."""
        sha256 = hashlib.sha256()
        sha256.update(data)
        return base64.b64encode(sha256.digest()).decode(encoding='utf-8')

    def parse_url(self, requset_url):
        """Split a URL into scheme, host and path.

        Args:
            requset_url: Full URL such as ``"https://host/path"``.

        Returns:
            A ``Url`` whose ``schema`` keeps the trailing ``"://"``, whose
            ``host`` is everything up to the first ``"/"`` after the scheme,
            and whose ``path`` is the rest (starting with ``"/"``).

        Raises:
            AssembleHeaderException: If the URL has no ``"://"``, an empty
                host, or no path.
        """
        # str.find is used instead of str.index so that a missing separator
        # reaches the AssembleHeaderException below rather than escaping as
        # a ValueError.
        stidx = requset_url.find("://")
        if stidx < 0:
            raise AssembleHeaderException("invalid request url:" + requset_url)
        schema = requset_url[:stidx + 3]
        remainder = requset_url[stidx + 3:]
        edidx = remainder.find("/")
        if edidx <= 0:
            # -1: no path at all; 0: empty host.
            raise AssembleHeaderException("invalid request url:" + requset_url)
        return Url(remainder[:edidx], remainder[edidx:], schema)

    def assemble_ws_auth_url(self, requset_url, method="POST", api_key="", api_secret=""):
        """Return ``requset_url`` with signed authentication query parameters.

        Args:
            requset_url: Endpoint URL to sign.
            method: HTTP method placed in the signed request line.
            api_key: Key embedded in the authorization string.
            api_secret: Secret used as the HMAC-SHA256 key.

        Returns:
            The URL followed by ``?host=...&date=...&authorization=...``.

        Raises:
            AssembleHeaderException: If ``requset_url`` cannot be split.
        """
        u = self.parse_url(requset_url)
        host = u.host
        path = u.path
        # HTTP-date in GMT, e.g. "Thu, 12 Dec 2019 01:57:27 GMT".
        date = format_date_time(mktime(datetime.now().timetuple()))
        # The signature covers exactly these three lines, in this order.
        signature_origin = "host: {}\ndate: {}\n{} {} HTTP/1.1".format(host, date, method, path)
        signature_sha = hmac.new(api_secret.encode('utf-8'), signature_origin.encode('utf-8'),
                                 digestmod=hashlib.sha256).digest()
        signature_sha = base64.b64encode(signature_sha).decode(encoding='utf-8')
        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            api_key, "hmac-sha256", "host date request-line", signature_sha)
        authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
        values = {
            "host": host,
            "date": date,
            "authorization": authorization
        }

        return requset_url + "?" + urlencode(values)

    def get_body(self):
        """Build the JSON-serialisable request body for ``self.text``.

        The text is UTF-8 encoded and then base64 encoded before being
        placed under ``payload.text.text``.
        """
        body = {
            "header": {
                "app_id": self.appid,
                "status": 3,
                # "uid":"your_uid"
            },
            "parameter": {
                "midu_correct": {
                    # "res_id":"your_res_id",
                    "output_result": {
                        "encoding": "utf8",
                        "compress": "raw",
                        "format": "json"
                    }
                }
            },
            "payload": {
                "text": {
                    "encoding": "utf8",
                    "compress": "raw",
                    "format": "plain",
                    "status": 3,
                    "text": base64.b64encode(self.text.encode("utf-8")).decode('utf-8')
                }
            }
        }
        return body

    def get_result(self):
        """POST the text to the service and return the decoded result.

        Returns:
            The object obtained by base64-decoding
            ``payload.output_result.text`` of the reply and parsing it as
            JSON.

        Raises:
            KeyError: If the reply lacks ``payload.output_result.text``.
            requests.exceptions.RequestException: On network failure or when
                ``REQUEST_TIMEOUT`` elapses.
        """
        request_url = self.assemble_ws_auth_url(self.url, "POST", self.apikey, self.apisecret)
        # NOTE(review): this Host header differs from the host in self.url;
        # kept unchanged - confirm against the service documentation.
        headers = {'content-type': "application/json", 'host': 'api.xf-yun.com', 'app_id': self.appid}
        body = self.get_body()
        response = requests.post(request_url, data=json.dumps(body), headers=headers,
                                 timeout=self.REQUEST_TIMEOUT)
        tempResult = json.loads(response.content.decode())
        par_jch = base64.b64decode(tempResult['payload']['output_result']['text']).decode()
        return json.loads(par_jch)


def out_result(input_text):
    """Proofread one piece of text and return the service's parsed result.

    Args:
        input_text: The text to be checked.

    Returns:
        Whatever ``WebsocketDemo.get_result`` returns for that text.
    """
    # The three credentials are placeholders; real values come from the
    # iFlytek cloud console.
    client = WebsocketDemo(
        APPId="********",
        APISecret="abcdefghijklmn",
        APIKey="abcdefg123456789",
        Text=input_text,
    )
    return client.get_result()


def get_text(input_file):
    """Return the non-blank paragraphs of a Word document.

    Args:
        input_file: Document location, handed unchanged to ``docx.Document``.

    Returns:
        A list of strings in document order, one per paragraph. Every ASCII
        space (" ") is removed from each paragraph's text, and paragraphs
        that are empty afterwards are dropped. Other whitespace (tabs,
        full-width spaces) is left untouched.
    """
    document = docx.Document(input_file)

    print("正在纠错，请等待......")

    # Read ``paragraphs`` once instead of re-reading it on every iteration;
    # python-docx appears to rebuild that list on each access - TODO confirm.
    collapsed = (paragraph.text.replace(" ", "") for paragraph in document.paragraphs)
    return [text for text in collapsed if text != ""]


def result_cleaning(intput_result_list):
    """Flatten proofreading results into two index-aligned lists.

    Args:
        intput_result_list: Iterable of result dicts. Each must contain
            ``['data']['checklist']``, a list of entries with the keys
            ``'suggest'`` (list of str), ``'type'`` (dict with ``'name'``),
            ``'word'`` and ``'context'``.

    Returns:
        ``(original_text_list, proofreading_results_list)``: the ``context``
        of every checklist entry and, at the same index, a formatted remark
        (error type, offending word, suggestions). Entries keep the order in
        which they appear in the input.

    Raises:
        KeyError: If a result or entry lacks one of the keys above.
    """
    original_text_list = []
    proofreading_results_list = []
    for the_result in intput_result_list:
        # Iterate the entries directly. Indexing with ``position - 1`` would
        # start at -1 and emit the last entry first.
        for entry in the_result['data']['checklist']:
            suggestions = entry['suggest']
            # "无" stands in when the service offers no replacement.
            suggest_word = "或".join(suggestions) if suggestions else "无"
            proofreading_results_list.append(
                "错误类型：{a}\n错误词：{b}\n建议：“{c}”".format(
                    a=entry['type']['name'],
                    b=entry['word'],
                    c=suggest_word))
            original_text_list.append(entry['context'])
    return original_text_list, proofreading_results_list


def get_file():
    """Ask the user to pick a file and return its path.

    The dialog opens at the ``Desktop`` folder under the current user's home
    directory.

    Returns:
        The selected path as a string.

    Raises:
        FileNotFoundError: If the dialog is dismissed without a selection.
            Previously the literal string "None" was returned in that case
            and failed later as a bogus path.
    """
    # Imported here so the function does not depend on a star import
    # happening to expose ``os``.
    import os

    start_dir = os.path.join(os.path.expanduser("~"), 'Desktop\\')
    chosen = easygui.fileopenbox(default=start_dir)
    if chosen is None:
        raise FileNotFoundError("未选择任何文件")
    return str(chosen)


def add_result(word_list, the_result_list, word_list_len, into_file):
    """Attach proofreading remarks to a Word document as comments.

    The annotated document is saved to the ``Desktop`` folder under the
    current user's home directory as ``添加批注--<name of into_file>``.

    Args:
        word_list: Text snippets to search for in the document.
        the_result_list: Comment text for each snippet, index-aligned with
            ``word_list``.
        word_list_len: Iterable of integers. Each value ``i`` selects entry
            ``i - 1`` of both lists, so a 0-based ``range(n)`` still covers
            every entry (``-1`` wraps to the last one, handled first).
        into_file: Path of the Word document to load.

    Entries that cannot be annotated (text not found, bad index, library
    error) are reported on stdout and skipped; the rest are still saved.
    """
    # Imported here so the function does not depend on a star import
    # happening to expose ``os``.
    import os

    doc = Document()
    doc.LoadFromFile(into_file)
    try:
        for i in word_list_len:
            try:
                target_text = word_list[i - 1]
                remark = the_result_list[i - 1]

                # NOTE(review): the two True flags are presumably
                # case-sensitive / whole-word matching - confirm against the
                # Spire.Doc documentation.
                selection = doc.FindString(target_text, True, True)
                if selection is None:
                    print("未找到文本，跳过批注：{a}".format(a=target_text))
                    continue

                # Build the comment with its text and author.
                comment = Comment(doc)
                comment.Body.AddParagraph().Text = remark
                comment.Format.Author = "讯飞公文校对"

                # Treat the match as one range and locate its paragraph.
                text_range = selection.GetAsOneRange()
                paragraph = text_range.OwnerParagraph
                children = paragraph.ChildObjects

                # Place the comment right after the matched text.
                children.Insert(children.IndexOf(text_range) + 1, comment)

                # Start/end marks share the comment's id so they bracket it.
                commentStart = CommentMark(doc, CommentMarkType.CommentStart)
                commentEnd = CommentMark(doc, CommentMarkType.CommentEnd)
                commentStart.CommentId = comment.Format.CommentId
                commentEnd.CommentId = comment.Format.CommentId

                # Start mark goes before the text, end mark right after it.
                # IndexOf is re-evaluated because each insert shifts positions.
                children.Insert(children.IndexOf(text_range), commentStart)
                children.Insert(children.IndexOf(text_range) + 1, commentEnd)
            except Exception as exc:
                # Best effort per entry, but no longer silent, and
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print("第{a}条批注添加失败：{b}".format(a=i, b=exc))

        doc.SaveToFile(r"{a}添加批注--{b}".format(
            a=os.path.join(os.path.expanduser("~"), 'Desktop\\'),
            b=os.path.basename(into_file)))
    finally:
        # Release the document even if saving fails.
        doc.Close()


def running_correct():
    """Run the whole workflow: pick a file, proofread it, annotate a copy.

    Shows an info box, lets the user choose a document, sends each extracted
    paragraph to the proofreading service, then writes the findings back as
    comments.
    """
    showinfo(title="提示", message="请选择要修改的文件！")
    chosen_file = get_file()

    # One service call per paragraph, results kept in paragraph order.
    responses = [out_result(paragraph) for paragraph in get_text(chosen_file)]

    contexts, remarks = result_cleaning(responses)
    add_result(contexts, remarks, range(len(contexts)), chosen_file)


# Start the interactive proofreading workflow only when run as a script,
# not when this module is imported.
if __name__ == '__main__':
    running_correct()