基于星火大模型的群聊对话分角色要素提取挑战赛|#AI夏令营#Datawhale#夏令营-笔记

往今生

已于 2024-07-02 23:08:26 修改

阅读量264

点赞数 10

文章标签：笔记 python

于 2024-07-02 23:03:00 首次发布

本文链接：https://blog.csdn.net/ao_wj/article/details/140138231

版权

链接奉上：
https://challenge.xfyun.cn/topic/info?type=role-element-extraction&option=phb

先看看赛题任务：

从给定的<客服>与<客户>的群聊对话中，提取出指定的字段信息，待提取的全部字段见下方数据说明。

然后数据说明：

赛题方提供了184条真实场景的群聊对话数据以及人工标注后的字段提取结果，其中训练数据129条，测试数据 55条。

心得：

我刚开始的理解就是直接提供一个大模型让它自己找，后来发现不行，会超时。一看数据集，很多内容都是重复的，尤其是里面的引用和链接（可能就相当于我们的多选转发），带来了很多重复信息。这些信息我感觉是没有用的，都需要去除。后来看了大佬一个简单的数据清洗的思路，下面上链接：https://blog.csdn.net/qq_44511981/article/details/140043813?csdn_share_tail
他讲得比较全面，我就只做了一点数据清洗。刚学，记录记录。

代码

import os
import json
import re
import matplotlib.pyplot as plt



def read_json_file(file_path):
    """Load the JSON document stored at ``file_path``.

    The file is read as UTF-8 text and parsed. If the file is missing, is
    not valid JSON, or any other error occurs, a diagnostic message is
    printed instead of raising, and ``None`` is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file {file_path}.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    else:
        return parsed
    # Every failure path ends up here after printing its message.
    return None


def clean_message(message):
    """Strip placeholder tags, links and markup remnants from one message.

    Each regular expression below is applied in turn, in the listed order,
    and every match is replaced by the empty string. Leading and trailing
    whitespace of the final result is removed; whitespace left in the middle
    of the text by a deletion is kept.
    """
    noise_patterns = (
        r'\[图片\]',  # the literal "[图片]" (image) placeholder
        r'\[玫瑰\]',  # the literal "[玫瑰]" (rose) placeholder
        r'\[链接\]',  # the literal "[链接]" (link) placeholder
        r'https?://\S+',  # http/https URLs, up to the next whitespace
        r"data-online-sheet-link='.*?'",  # single-quoted sheet-link attribute
        r"data-sheet-href='.*?'",  # single-quoted sheet-href attribute
        r"data-sheets-hyperlink='.*?'",  # single-quoted hyperlink attribute
        r'data-\w+=[\'\"].*?[\'\"]',  # any data-<word> attribute with a quoted value
        r'title=".*?"',  # title="..." attributes
        r'style=".*?"',  # style="..." attributes
        r'\[[^\]]+\]',  # any non-empty [bracketed] tag, e.g. [收集表]
        r'\".*?\"',  # anything wrapped in double quotes
        r'\'[^\']*?\''  # anything wrapped in single quotes
    )
    cleaned = message
    for noise in noise_patterns:
        cleaned = re.compile(noise).sub('', cleaned)
    return cleaned.strip()


def delete_repeat(text):
    """Drop entries whose ``'message'`` value has already appeared.

    ``text`` is an iterable of dict-like entries with a ``'message'`` key.
    Only the first entry for each distinct message is kept, and the kept
    entries are returned as a list in their original order.
    """
    # Dicts preserve insertion order, and setdefault only stores the first
    # entry seen for a given message, so later duplicates are ignored.
    first_by_message = {}
    for entry in text:
        first_by_message.setdefault(entry['message'], entry)
    return list(first_by_message.values())


def merge_message(chat_text):
    """Clean a chat transcript and merge consecutive lines by the same speaker.

    ``chat_text`` holds one "speaker：message" entry per line, separated by a
    full-width colon. Blank lines and lines without that colon are skipped.
    Every message is passed through ``clean_message``; messages from the same
    speaker on adjacent lines are concatenated into a single turn. Turns whose
    merged message duplicates an earlier one are then removed by
    ``delete_repeat``, and the result is rendered back as "speaker：message"
    lines joined with newlines.
    """
    turns = []
    active_speaker = ""
    pieces = []

    for raw_line in chat_text.split('\n'):
        if not raw_line.strip():
            continue
        speaker, separator, body = raw_line.partition("：")
        if not separator:
            # No full-width colon on this line, so it is not a chat entry.
            continue
        body = clean_message(body)
        if speaker != active_speaker:
            # Speaker changed: flush the finished turn, then start a new one.
            if active_speaker:
                turns.append({"speaker": active_speaker, "message": "".join(pieces)})
            active_speaker = speaker
            pieces = []
        pieces.append(body)

    # Flush the turn that was still open when the input ended.
    if active_speaker:
        turns.append({"speaker": active_speaker, "message": "".join(pieces)})

    turns = delete_repeat(turns)
    return "\n".join(f"{turn['speaker']}：{turn['message']}" for turn in turns)


def main(input_name="test_data.json", output_name="my_test.json", data_dir="./dataset"):
    """Clean every ``chat_text`` in a dataset file and save the result.

    Reads ``data_dir/input_name`` with ``read_json_file``, replaces each
    record's ``'chat_text'`` with the output of ``merge_message``, writes the
    cleaned records to ``data_dir/output_name`` as UTF-8 JSON, and finally
    shows a histogram of the cleaned text lengths.

    To clean the training split instead of the test split, call
    ``main("train.json", "my_train.json")``.

    Raises:
        SystemExit: if the input file could not be loaded.
    """
    records = read_json_file(os.path.join(data_dir, input_name))
    if records is None:
        # read_json_file has already printed why loading failed; stop here
        # rather than failing later with an unrelated TypeError.
        raise SystemExit(1)

    lengths = []
    for record in records:
        record['chat_text'] = merge_message(record['chat_text'])
        lengths.append(len(record['chat_text']))

    # Save before plotting: plt.show() blocks until the window is closed, and
    # the cleaned data should not be lost if the process is interrupted there.
    with open(os.path.join(data_dir, output_name), "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    plt.hist(lengths)
    plt.show()


if __name__ == "__main__":
    main()