Package Release
Overall structure:

- data
  - download: default download/storage path for raw data
  - files: default storage path for processed data
- util: common utility code
  - cleandata_helper: stop-word removal, HTML-tag stripping, re-joining text after tokenization, etc.
- logs: log files
- models: model files needed for data cleaning or testing
- resources: static resources such as word-list files
- examples: usage examples for the various data-processing operations
- core: business-workflow code
  - savedata_to_mysql: field validation and database insertion
  - getdata_from_dataset: fetch a dataset and inspect its basic information
  - data_convert: convert a dataset to pandas or JSONL
  - cleandata_for_llm: text cleaning for LLM data
  - cleandata_for_vlm: text cleaning for VLM data
  - deduped_by_simhash: deduplication with SimHash
  - deduped_by_minhash: deduplication with MinHash
  - readdata_from_mysql: read data from the database
  - datatest_with_model: load a model to test or validate data
- requirements.txt: development-environment dependencies

1. Getting data — getLLMData.py
- getLLMData.readByDataSetName: read a dataset given its Hugging Face dataset name
- getLLMData.readByDataSetUrl: download a dataset from a Hugging Face URL to a specified path

Usage notes

1. dbOperate: database-related NLP utilities (see the usage sketch below)
   1) The default database is deepseek_data on the 29 server (10.239.121.29); to use another database, modify the corresponding class variables.
   2) util helper class mysqlHelper:
      - mysqlHelper.AllSize: total size (GB) and row count of the whole table
      - mysqlHelper.SourceSize: size (MB) and row count of the data for a given source
      - mysqlHelper.SqlQuery: run an arbitrary SQL statement
   3) extractDataFromMysql: extract test-case data for a given source by length or ratio
   4) Upload
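A minimal usage sketch of the helpers described above. The import paths, argument names, and return values shown here are assumptions for illustration only and may not match the actual package:

# Hypothetical usage sketch -- import paths and signatures are assumed, not confirmed by the package.
import getLLMData
import mysqlHelper

# Read a Hugging Face dataset by name, or download one from a URL to a local directory.
dataset = getLLMData.readByDataSetName("KodCode/KodCode-V1")
getLLMData.readByDataSetUrl("https://huggingface.co/datasets/KodCode/KodCode-V1", r"D:/data/download")

# Inspect table sizes and run ad-hoc SQL against the default deepseek_data database.
print(mysqlHelper.AllSize())                 # total table size (GB) and row count
print(mysqlHelper.SourceSize("KodCode-V1"))  # size (MB) and row count for one source
rows = mysqlHelper.SqlQuery("SELECT id, source FROM deepseek_r1 LIMIT 10")

The scripts that follow are working examples of these operations: SimHash deduplication, writing to and sampling from MySQL, and the nltk-based feature extraction used by SimHash.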
# SimHash deduplication within a single parquet dataset
# (run here on aime-2024.parquet; filePath is only used to name the detail output file).
import os.path
import json

from datasets import load_dataset
from simhash import Simhash, SimhashIndex

from getFeature_nltk_plus_end import get_features


def text_to_simhash(text):
    features = get_features(text)
    return Simhash(features)


# Alternative input: a jsonl dataset
# filePath = r'C:\Users\A25073\.cache\huggingface\hub\datasets--jiaxin-wen--CodePlan\snapshots\e8d56b9509ca4f2771ff72dbc5e3c7b0feb277af\train.jsonl'
# dataset = load_dataset('json', data_files=filePath, split='train')
filePath = r'D:\data\DapoMath\dapo-math-17k.parquet'
# dataset = load_dataset("parquet", data_files={'train': r'D:\data\DapoMath\dapo-math-17k.parquet'})['train']
dataset = load_dataset("parquet", data_files={'test': r'D:\data\DapoMath\aime-2024.parquet'})['test']
print(len(dataset))

index_obj = SimhashIndex([], k=3)  # Hamming-distance threshold k=3
not_unique_rows = []
unique_indexs = []
for i, item in enumerate(dataset):
    text = item['prompt'][0]['content']  # each record stores the question text under prompt[0]['content']
    text_simhash = text_to_simhash(text)
    near_duplicates = index_obj.get_near_dups(text_simhash)
    if not near_duplicates:
        index_obj.add(i, text_simhash)
        unique_indexs.append(i)
    else:
        print('near-duplicate hit:', near_duplicates)
        indexs_take = [int(j) for j in near_duplicates]
        not_unique_rows.append({'已有文本index': indexs_take,
                                '已有文本': [dataset[m]['prompt'][0]['content'] for m in indexs_take],
                                '重复文本index': i,
                                '重复文本': dataset[i]['prompt'][0]['content']})

# Write the duplicate details to json, then keep only the unique rows.
detailPath = os.path.basename(filePath).split('.')[0]
detailFile = r'D:\data\%s_19去重结果明细alnum和不替换数字.json' % detailPath
with open(detailFile, 'w', encoding='utf-8') as f:
    json.dump(not_unique_rows, f, ensure_ascii=False, indent=4)

filtered_dataset = dataset.select(unique_indexs)
filtered_dataset.to_json(r'D:/data/20filter_jsonl.json')
# Build a SimHash index over the KodCode-V1 'question' column, then drop near-duplicate
# rows from a cleaned Leetcode CSV.
import pandas as pd
from datasets import load_dataset
from simhash import Simhash, SimhashIndex

from getFeature_nltk_plus import get_features


def text_to_simhash(text):
    features = get_features(text)
    return Simhash(features)


# The 14 train shards plus the use_with_caution shard of the local KodCode-V1 snapshot.
kod_dir = r'D:\data\KodeCode\datasets--KodCode--KodCode-V1\snapshots\0350cf9d8d66005e4962d5fc2d224c438740f517\data'
files = [rf'{kod_dir}\train-{i:05d}-of-00014.parquet' for i in range(14)]
files.append(rf'{kod_dir}\use_with_caution-00000-of-00001.parquet')

dataset = load_dataset('parquet', data_files=files)['train']
print(dataset)

# Keep only the 'question' column and convert it to pandas.
columns_to_keep = ['question']
selected_dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
batch_set = selected_dataset.select(range(0, len(dataset)))
df_batch = batch_set.to_pandas()
df_batch.reset_index(inplace=True)
print(len(df_batch))

df_batch['question_simhash'] = df_batch['question'].apply(text_to_simhash)

# SimhashIndex with Hamming-distance threshold k=3.
index_obj = SimhashIndex([], k=3)
for row in df_batch.itertuples(index=False):
    index_obj.add(str(row.index), row.question_simhash)

# Keep only the Leetcode rows whose question has no near duplicate (within k=3) in the index.
df_toDeal = pd.read_csv(r'D:/data/Leetcode_us_cleaned9.csv', encoding='utf8')
unique_rows = []
for idx, row2 in df_toDeal.iterrows():
    question_simhash2 = text_to_simhash(row2['question'])
    near_duplicates = index_obj.get_near_dups(question_simhash2)
    if not near_duplicates:
        unique_rows.append(row2)
    else:
        print('near-duplicate hit:', near_duplicates)

unique_df = pd.DataFrame(unique_rows, columns=df_toDeal.columns)
print(len(unique_df))
unique_df.to_csv('leetcode_us_solutions_deal32.csv', index=False, encoding='utf-8-sig')
# Read a cleaned Leetcode CSV, recompute token lengths with the Qwen2.5-3B tokenizer,
# and insert the rows into the deepseek_r1_copy MySQL table.
from transformers import AutoTokenizer
import mysql.connector
import json
import pandas as pd

mydb = mysql.connector.connect(
    host="10.239.121.29",        # database host
    user="root",                 # user name
    password="autel_wxp",        # password
    database="deepseek_data"     # database name
)

tokenizer = AutoTokenizer.from_pretrained(r"D:/soft/model/Qwen2.5-3B")
df = pd.read_csv(r'D:/data/leetcode_us_solutions_deal2.csv', encoding='utf8')

# Quick sanity check of the CSV before inserting.
print(sum(df['type'] == 'reasoning'))
print(len(df))
print(df['question_length'].max(), df['question_length'].min(), df['question_length'].mean())
print(df['thought_length'].max(), df['thought_length'].min(), df['thought_length'].mean())
print(df['answer_length'].max(), df['answer_length'].min(), df['answer_length'].mean())

mycursor = mydb.cursor()
insert_count = 0
for idx, row in df.iterrows():
    if idx >= 100:  # only insert the first 100 rows here
        break
    source = row['source']
    type_ = row['type']
    category = row['category']
    question = row['question']
    thought = row['thought']
    answer = row['answer']
    conversations_json = json.dumps(row['conversations'])
    difficulty = None
    question_length = len(tokenizer(question)["input_ids"])
    thought_length = len(tokenizer(thought)["input_ids"])
    answer_length = len(tokenizer(answer)["input_ids"])
    language = 'en'
    test_json = ''  # the 'test' field is left empty for this source

    sql = ("INSERT INTO deepseek_r1_copy "
           "(source, type, category, question, thought, answer, conversations, difficulty, "
           "question_length, thought_length, answer_length, language, test) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    val = (source, type_, category, question, thought, answer, conversations_json, difficulty,
           question_length, thought_length, answer_length, language, test_json)
    try:
        mycursor.execute(sql, val)
        mydb.commit()
        insert_count += 1
        print(f"Inserted record {insert_count}.")
    except mysql.connector.Error as err:
        print(f"Insert failed: {err}")

mycursor.close()
mydb.close()
# Randomly sample rows of a given source from the deepseek_r1 table and export them to CSV.
import mysql.connector
import numpy as np
import pandas as pd


class ExtractDataFromMysql:
    __host = "10.239.121.29"       # database host
    __user = "root"                # user name
    __password = "autel_wxp"       # password
    __database = "deepseek_data"   # database name

    def __init__(self, test_length, source, csv_name):
        self.test_length = test_length
        self.source = source
        self.csv_name = csv_name

    def extractData(self, mydb):
        myCursor = mydb.cursor()
        try:
            # Total row count for this source.
            myCursor.execute("SELECT count(*) FROM deepseek_r1 WHERE source=%s", (self.source,))
            length = myCursor.fetchall()[0][0]
            # First id of this source (the sampling assumes ids of one source are contiguous).
            myCursor.execute("SELECT id FROM deepseek_r1 WHERE source=%s LIMIT 1", (self.source,))
            startIndex = myCursor.fetchall()[0][0]
            print(f'Start index: {startIndex}, total rows: {length}, sample size: {self.test_length}')
            id_list = self.getRandomIndex(length, startIndex, self.test_length)
            extractSql = (f"SELECT * FROM deepseek_r1 "
                          f"WHERE id IN ({','.join(['%s'] * len(id_list))}) AND source=%s")
            myCursor.execute(extractSql, tuple(id_list) + (self.source,))
            extractResult = myCursor.fetchall()
            df_result = pd.DataFrame(extractResult, columns=columns)
            df_result.to_csv('%s.csv' % self.csv_name, index=None)
        except mysql.connector.Error as err:
            print(f"Query failed: {err}")
        finally:
            myCursor.close()

    @classmethod
    def ConnectMysql(cls):
        try:
            mydb = mysql.connector.connect(
                host=cls.__host,
                user=cls.__user,
                password=cls.__password,
                database=cls.__database
            )
        except Exception as e:
            print('Failed to connect to the database')
            raise e
        return mydb

    @staticmethod
    def getRandomIndex(range_num, start_num, size_num):
        # Sample size_num distinct indices from [start_num, start_num + range_num);
        # replace=False guarantees no duplicates.
        index = np.random.choice(np.arange(start_num, start_num + range_num), size=size_num, replace=False)
        return index.astype(int).tolist()


columns = ['id', 'source', 'type', 'category', 'question', 'thought', 'answer', 'conversations_json',
           'difficulty', 'question_length', 'thought_length', 'answer_length', 'language', 'test']
test_length = 100
source = 'Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B'
csv_name = 'Llama-70B_100冷启动'

if __name__ == '__main__':
    mydb = ExtractDataFromMysql.ConnectMysql()
    extractor = ExtractDataFromMysql(test_length, source, csv_name)
    extractor.extractData(mydb)
    mydb.close()
# author: fudashuang
# Download/load the OpenThoughts .arrow shards, deduplicate 'conversations' with SimHash,
# and write the filtered dataset back to disk.
import json
import os
import shutil
import tempfile

import pandas as pd
import paramiko
import pyarrow as pa
import pyarrow.ipc as ipc
from datasets import Dataset, concatenate_datasets
from simhash import Simhash, SimhashIndex

from getFeature_nltk_plus import get_features

data_frames = []

# SFTP connection parameters
hostname = "10.239.121.25"
port = 22
username = "fudashuang"
password = "fudashuang"
remote_dir = "/data4/dataset/openthoughts-Math-220k/default"
temp_dir = r'D:/temp'


def sftp_connect(hostname, port, username, password):
    sftp = None
    transport = None
    try:
        transport = paramiko.Transport((hostname, port))
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
    except Exception as e:
        print(e)
    return sftp, transport


def sftp_disconnect(sftp, transport):
    try:
        sftp.close()
        transport.close()
    except Exception:
        pass


# Download the .arrow shards over SFTP and load each one as a Dataset.
def sftp_download(remote_dir, temp_dir):
    dataset_lists = []
    try:
        sftp, transport = sftp_connect(hostname, port, username, password)
        if sftp is not None:
            for filename in sftp.listdir(remote_dir):
                if filename.endswith('.arrow'):
                    file_fullname = '%s/%s' % (remote_dir, filename)
                    print(file_fullname)
                    with tempfile.NamedTemporaryFile(delete=True, dir=temp_dir) as temp_file:
                        local_file_path = temp_file.name
                        sftp.get(file_fullname, local_file_path)
                        dataset_lists.append(Dataset.from_file(local_file_path))
    except Exception as e:
        print(e)
    finally:
        sftp_disconnect(sftp, transport)
    return dataset_lists


# Load the .arrow shards from a local directory instead of SFTP.
def local_download():
    dataset_lists = []
    dataset_files = []
    try:
        for local_file_path in os.listdir(r'D:\data\openthoughts'):
            if local_file_path.endswith('.arrow'):
                local_file_path = os.path.join(r'D:\data\openthoughts', local_file_path)
                print(local_file_path)
                dataset_lists.append(Dataset.from_file(local_file_path))
                dataset_files.append(local_file_path)
    except Exception as e:
        print(e)
    return dataset_lists, dataset_files


# Merge all shards into a single pandas DataFrame, batch by batch, keeping only 'conversations'
# and tagging each row with a 'new_uuid' of the form '<shard index>_<row index>'.
def datasetlists_to_pandas(dataset_lists):
    columns_to_keep = ['conversations']
    for j, dataset in enumerate(dataset_lists):
        selected_dataset = dataset.remove_columns(
            [col for col in dataset.column_names if col not in columns_to_keep])
        batch_size = 1000
        for i in range(0, len(dataset), batch_size):
            batch_set = selected_dataset.select(range(i, min(i + batch_size, len(dataset))))
            df_batch = batch_set.to_pandas()
            df_batch['conversations'] = df_batch['conversations'].apply(lambda x: x[0]['value'])
            df_batch.reset_index(inplace=True)
            df_batch['new_uuid'] = df_batch['index'].apply(lambda x: str(j) + '_' + str(int(x) + i))
            data_frames.append(df_batch)
        print(len(data_frames), 'batches accumulated')
    return pd.concat(data_frames, ignore_index=True)


# Build a Simhash fingerprint for one text.
def text_to_simhash(text):
    features = get_features(text)
    return Simhash(features)


# SimHash deduplication: return the deduplicated DataFrame plus the duplicates,
# and write the duplicate details and counts to disk.
def SimHashDeduped(df):
    df['conversations_simhash'] = df['conversations'].apply(lambda x: text_to_simhash(x))
    index = SimhashIndex([], k=4)  # Hamming-distance threshold k=4
    unique_rows = []
    not_unique_rows = []
    duplicate_rows = []
    for row in df.itertuples(index=False):
        conversations_simhash = row.conversations_simhash
        near_duplicates = index.get_near_dups(conversations_simhash)
        if not near_duplicates:
            unique_rows.append(row)
            index.add(str(row.new_uuid), row.conversations_simhash)
        else:
            print(near_duplicates)
            insdex = df[df['new_uuid'] == near_duplicates[0]].index.tolist()  # may be empty or ['None']
            if len(insdex) >= 1 and insdex[0] != 'None':
                not_unique_rows.append({'已有文本new_uuid': near_duplicates,
                                        '已有文本': df.at[int(insdex[0]), 'conversations'],
                                        '重复文本new_uuid': row.new_uuid,
                                        '重复文本': row.conversations})
                duplicate_rows.append(row)

    detail_file = r'D:\data\openthoughts_nltk_plus\nltk_plus_去重结果明细.json'
    if os.path.exists(detail_file):
        os.remove(detail_file)
    with open(detail_file, 'w', encoding='utf-8') as f:
        json.dump(not_unique_rows, f, ensure_ascii=False, indent=4)

    unique_df = pd.DataFrame(unique_rows, columns=df.columns)
    duplicate_df = pd.DataFrame(duplicate_rows, columns=df.columns)

    count_file = r'D:\data\openthoughts_nltk_plus\count_results_去重统计及说明.txt'
    if os.path.exists(count_file):
        os.remove(count_file)
    with open(count_file, 'a+') as a:
        a.write('nltk_plus处理前的数据条数:\n')
        a.write('%s\n' % str(len(df)))
        a.write('nltk_plus处理后的数据条数:\n')
        a.write('%s\n' % str(len(unique_df)))
    print(duplicate_df)
    return unique_df, duplicate_df


# Write a dataset to a single .arrow file in chunks.
def save_sharded_dataset(dataset, output_file, chunksize=1000):
    num_rows = dataset.num_rows
    df = dataset.to_pandas()
    arrow_schema = pa.Table.from_pandas(df).schema
    sink = pa.OSFile(output_file, 'wb')
    writer = ipc.new_file(sink, arrow_schema)
    try:
        for start in range(0, len(df), chunksize):
            df_chunk = df.iloc[start:start + chunksize]
            # Keep every chunk on the same schema as the full table.
            table_chunk = pa.Table.from_pandas(df_chunk, schema=arrow_schema)
            writer.write(table_chunk)
    except pa.lib.ArrowInvalid as e:
        print(f"ArrowInvalid error occurred: {e}")
    finally:
        # Close the writer and sink even if an exception occurred.
        writer.close()
        sink.close()
    print(f"Saved {num_rows} rows to {output_file}")


def save_dataset(dataset, output_dir):
    dataset.save_to_disk(output_dir)
    # To save manually as csv/json/arrow/parquet instead, see DealDataSetEnd_arrow.py and swap the file engine.
    print(f"Saved {len(dataset)} rows to {output_dir}")


if __name__ == '__main__':
    if not os.path.exists(r'D:\data\openthoughts_nltk_plus'):
        os.makedirs(r'D:\data\openthoughts_nltk_plus')
    dataset_lists, dataset_files = local_download()
    df = datasetlists_to_pandas(dataset_lists)
    unique_df, duplicate_df = SimHashDeduped(df)

    listA = []
    for file_index, (data_set, dataset_file) in enumerate(zip(dataset_lists, dataset_files)):
        bool_filter = [True] * len(data_set)
        remove_indices = [int(x.split('_')[1]) for x in duplicate_df['new_uuid'].tolist()
                          if int(x.split('_')[0]) == file_index]
        print(remove_indices)
        # Mark the rows to drop as False, then filter the shard with the boolean list.
        for index in remove_indices:
            bool_filter[index] = False
        filter_data_set = data_set.filter(lambda _, i: bool_filter[i], with_indices=True)
        with open(r'D:\data\openthoughts_nltk_plus\count_results_去重统计及说明.txt', 'a+') as a:
            a.write(f'文件:{os.path.basename(dataset_file)}\n')
            a.write(f'原来行数为{data_set.num_rows}\n')
            a.write(f'过滤后的行数为{filter_data_set.num_rows}\n\n')
        # Alternatively, write each filtered shard back to its own .arrow file:
        # save_sharded_dataset(filter_data_set, dataset_file.replace('openthoughts', 'openthoughts_nltk_plus'))
        listA.append(filter_data_set)

    dataset_end = concatenate_datasets(listA)
    save_dataset(dataset_end, r'D:/data/openthoughts_nltk_plus')

    # Copy the non-.arrow companion files into the output directory as well.
    for local_file_path in os.listdir(r'D:\data\openthoughts'):
        if not local_file_path.endswith('.arrow'):
            file_path = os.path.join(r'D:\data\openthoughts', local_file_path)
            shutil.copy(file_path, file_path.replace('openthoughts', 'openthoughts_nltk_plus', 1))

    # List any datasets cache files that ended up in the output directory.
    for file in os.listdir(r'D:\data\openthoughts_nltk_plus'):
        if file.startswith('cache-'):
            print(file)
# Feature extraction for SimHash: tokenize with nltk, drop stop words,
# and normalize math operators before fingerprinting.
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)


def remove_repeated_words(text):
    # Remove phrases that appear more than 4 times in the cleaned, re-joined string
    # (ignoring punctuation/digit separators between them).
    def get_phrases(text):
        # \b matches word boundaries, so operators and punctuation are stripped automatically.
        return re.findall(r'\b\w+\b', text.lower())

    phrases = get_phrases(text)
    # A plain list keeps insertion order; each normalized phrase is kept at most 4 times.
    seen = []
    for phrase in phrases:
        normalized_phrase = re.sub(r'[\W\d]+', '', phrase)
        if seen.count(normalized_phrase) < 4:
            seen.append(normalized_phrase)

    # Re-assemble the retained phrases in order, preserving the original separators.
    delimiter = r'([\W\d]+)'
    parts = re.split(delimiter, text)
    cleaned_parts = []
    for part in parts:
        cleaned_part = re.sub(r'[\W\d]+', '', part).lower()
        if cleaned_part in seen:
            cleaned_parts.append(part)
            seen.remove(cleaned_part)
        elif re.match(delimiter, part):
            cleaned_parts.append(part)
    cleaned_text = ''.join(cleaned_parts).strip()
    return cleaned_text


def deal_math(text):
    # Replace math operators with tokenizable markers so they survive tokenization.
    text = text.replace('*', 'mul')
    # text = text.replace('^', 'mat')
    text = text.replace('+', 'add')
    text = text.replace('-', 'sub')
    return text


# Tokenize with nltk, then split again on '\\' to build the token list.
# Single-character non-alphabetic tokens and tokens of 20+ characters (usually garbage) are dropped.
# To satisfy the Simhash input limit, only the first 399 characters of the result are used as the feature string.
def get_features(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    special_math_words = {}
    # Lowercase, split on '\\', then drop stop words and domain-specific math words.
    _words = [word.lower() for word in words]
    _words1 = [i.split('\\') for i in _words]
    words_new = [item for sublist in _words1
                 for item in (sublist if isinstance(sublist, list) else [sublist])]
    processed_words = [word for word in words_new
                       if word not in stop_words and word not in special_math_words
                       and 1 < len(word) < 20]
    processed_words_add = [word for word in words_new
                           if word.isalpha() and word not in stop_words
                           and word not in special_math_words and len(word) == 1]
    words_new = ' '.join(processed_words + processed_words_add)
    words_new = deal_math(words_new)
    return words_new if len(words_new) < 399 else words_new[:399]
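A minimal sketch of how get_features feeds the SimHash deduplication above, assuming the module is in scope; the two sample questions are made up for illustration:

from simhash import Simhash, SimhashIndex

# Two made-up, nearly identical questions.
text_a = "Find the sum of all positive integers n such that n * n + 1 is divisible by 5."
text_b = "Find the sum of every positive integer n such that n * n + 1 is divisible by 5."

hash_a = Simhash(get_features(text_a))
hash_b = Simhash(get_features(text_b))

# Hamming distance between the two 64-bit fingerprints; SimhashIndex treats anything
# within its threshold k (3 or 4 in the scripts above) as a near duplicate.
print(hash_a.distance(hash_b))

index = SimhashIndex([('a', hash_a)], k=3)
print(index.get_near_dups(hash_b))  # expected to contain 'a' if the distance is within k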