#AI夏令营 #Datawhale #夏令营
baseline 已跑通，但对主要流程还不太熟悉。
这里记录调整数据集后跑通的代码。最终得分反而比昨日更低，原因尚不清楚，还需要进一步学习。
轰隆隆:
from dataclasses import dataclass
from sparkai.llm.llm import ChatSparkLLM, ChunkPrintHandler
from sparkai.core.messages import ChatMessage
import pandas as pd
import os
import json
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
from math import ceil
import numpy as np
from copy import deepcopy
import random
# Register tqdm's progress-bar helpers (e.g. ``progress_apply``) on pandas.
tqdm.pandas()

# Select a font for matplotlib figures (STFangsong is presumably chosen so
# that Chinese labels render) and disable the Unicode minus sign so negative
# tick labels use a plain ASCII hyphen.
plt.rcParams['font.family'] = ['STFangsong']
plt.rcParams['axes.unicode_minus'] = False

# Location and names of the dataset files.
data_dir = "./data"
train_file = "train.json"
test_file = "test_data.json"

# Load both splits once. The original notes repeated this identical load
# three times in a row; the repeats re-read the same files into the same
# names, so a single load leaves the module in the same state.
train_data = pd.read_json(os.path.join(data_dir, train_file))
test_data = pd.read_json(os.path.join(data_dir, test_file))
轰隆隆:
# Location and names of the dataset files.
data_dir = "./data"
train_file = "train.json"
test_file = "test_data.json"

# (Re)load both splits from disk into DataFrames.
train_data = pd.read_json(os.path.join(data_dir, train_file))
test_data = pd.read_json(os.path.join(data_dir, test_file))
轰隆隆:
def process(excemple):
    """Remove long repeated runs of lines from one record's ``chat_text``.

    The text is split on newlines. For every anchor index ``s`` the lines
    after it are scanned for a run that repeats, line by line, the sequence
    starting at ``s``. A repeated run of more than 10 lines is treated as a
    duplicated block and dropped; the first occurrence is kept.

    NOTE(review): the pasted source had lost its indentation, so the nesting
    below is a reconstruction. Two defects of that reconstruction are fixed:
    the line immediately before a duplicated block is no longer deleted with
    it, and a duplicated block that reaches the end of the text is now
    removed as well.

    Args:
        excemple: Mapping-like record (a dict or a DataFrame row) that has a
            ``"chat_text"`` string entry.

    Returns:
        The chat text with duplicated runs removed, joined with newlines.
    """
    chat_list = excemple["chat_text"].split("\n")
    total = len(chat_list)
    min_run = 10  # a repeat must be longer than this many lines to be dropped
    duplicated = set()  # indices of lines to drop; set gives O(1) lookups

    for s in range(total):
        # ``i`` walks the reference sequence starting at ``s``; ``j`` walks the
        # candidate repeat. ``i < j`` always holds, so bounding ``j`` suffices.
        i, j = s, s + 1
        start_j = j  # first index of the candidate run currently matched
        while j < total:
            if chat_list[i] == chat_list[j]:
                i += 1
            else:
                # The run (if any) ended just before ``j``.
                if i != s and j - start_j > min_run:
                    duplicated.update(range(start_j, j))
                i = s
                # ``j`` itself mismatched, so the next candidate run can only
                # start at the following line.
                start_j = j + 1
            j += 1
        # Flush a run that was still matching when the text ended.
        if i != s and j - start_j > min_run:
            duplicated.update(range(start_j, j))

    return "\n".join(
        line for idx, line in enumerate(chat_list) if idx not in duplicated
    )
# Strip duplicated chat blocks from every row of both splits; ``process``
# receives each row (axis=1) and returns the cleaned ``chat_text``.
train_data["chat_text"] = train_data.apply(process, axis=1)
test_data["chat_text"] = test_data.apply(process, axis=1)

# NOTE: this writes the cleaned data back over the input files it was loaded
# from, so the original, uncleaned data is not kept on disk after this step.
train_data.to_json(os.path.join(data_dir, train_file), orient='records')
test_data.to_json(os.path.join(data_dir, test_file), orient='records')