数据可视化

import os
import matplotlib.pyplot as plt
import pandas as pd

# Root folder that contains one sub-folder per domain.
root_folder_path = '/data3/zhouqiang/LLM_scratch/dataset/COIG-CQIA'

# Collect every domain sub-folder under the root.
domains = [d for d in os.listdir(root_folder_path) if os.path.isdir(os.path.join(root_folder_path, d))]
domain_counts = []

# Count the data files in each domain folder.
# NOTE(review): the filter matches '.jsonl' (as the sampling scripts below do),
# although the chart labels say "JSON Files" — labels kept unchanged on purpose.
for domain in domains:
    domain_path = os.path.join(root_folder_path, domain)
    jsonl_files = [f for f in os.listdir(domain_path) if f.endswith('.jsonl')]
    domain_counts.append(len(jsonl_files))

# One row per domain: (domain name, number of .jsonl files).
df = pd.DataFrame({
    'Domain': domains,
    'Counts': domain_counts
})

# Output folder for the generated images.
output_folder = '/data3/zhouqiang/LLM_scratch/dataset/output_images'
# exist_ok=True is idempotent and avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(output_folder, exist_ok=True)

# Draw and save one bar chart per domain (single bar each).
for _, row in df.iterrows():
    plt.figure()  # start a fresh figure for this domain
    plt.bar(row['Domain'], row['Counts'], color='skyblue')
    plt.title(f'Number of JSON Files in {row["Domain"]}')
    plt.xlabel('Domain')
    plt.ylabel('Number of JSON Files')
    plt.xticks(rotation=45)
    plt.tight_layout()  # adjust layout so rotated labels are not clipped

    # Destination path for this domain's chart.
    file_path = os.path.join(output_folder, f'{row["Domain"]}.jpg')
    plt.savefig(file_path)
    plt.close()  # release the figure to avoid accumulating memory
import os
import random

# Tuning factor for inverse-proportional sampling: a domain holding `count`
# records is sampled at rate k / count (capped at 1.0), so small domains are
# over-represented relative to their raw size. Adjust k to the dataset scale.
k = 500

# Root folder that contains one sub-folder per domain.
root_folder_path = '/data3/zhouqiang/LLM_scratch/dataset/COIG-CQIA'
domain_data_counts = {}   # domain name -> total line (record) count
total_data_count = 0

# Count total records per domain: one JSONL line == one record.
for domain in os.listdir(root_folder_path):
    domain_path = os.path.join(root_folder_path, domain)
    if os.path.isdir(domain_path):
        jsonl_files = [f for f in os.listdir(domain_path) if f.endswith('.jsonl')]
        domain_total = 0
        for jsonl_file in jsonl_files:
            file_path = os.path.join(domain_path, jsonl_file)
            with open(file_path, 'r', encoding='utf-8') as file:
                # Stream the file; no need to hold the lines for counting.
                domain_total += sum(1 for _ in file)
        domain_data_counts[domain] = domain_total
        total_data_count += domain_total

# Destination JSONL file holding the merged sample.
output_file_path = '/data3/zhouqiang/LLM_scratch/dataset/inverse_sampled_data.jsonl'
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Sample each domain at its inverse-proportional rate and merge into one file.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for domain, count in domain_data_counts.items():
        if count > 0:  # guard against division by zero for empty domains
            sample_rate = min(k / count, 1)  # inverse rate, capped at 100%
            domain_path = os.path.join(root_folder_path, domain)
            jsonl_files = [f for f in os.listdir(domain_path) if f.endswith('.jsonl')]
            for jsonl_file in jsonl_files:
                file_path = os.path.join(domain_path, jsonl_file)
                with open(file_path, 'r', encoding='utf-8') as file:
                    lines = file.readlines()
                if not lines:
                    # BUG FIX: random.sample([], max(1, 0)) would raise
                    # "Sample larger than population" on an empty file.
                    continue
                # At least 1 line per non-empty file, never more than exist.
                n_samples = min(len(lines), max(1, int(len(lines) * sample_rate)))
                output_file.writelines(random.sample(lines, n_samples))
        else:
            print(f"No data to sample in domain: {domain}")

print(f"Inverse sampling completed and combined data is stored in: {output_file_path}")

固定比率的采样

import os
import random

# Fixed sampling rate: 5% of every file's lines (the original comment said
# "10%" while the value was 0.05 — the value is authoritative).
sample_rate = 0.05

# Root folder that contains one sub-folder per domain.
root_folder_path = '/data3/zhouqiang/LLM_scratch/dataset/COIG-CQIA'
total_data_count = 0

# Load every .jsonl file and count the dataset's total number of lines.
# NOTE(review): this keeps *all* lines of *all* files in memory at once;
# acceptable for modest datasets, revisit for very large corpora.
file_line_counts = []  # list of (file_path, lines) pairs
for domain in os.listdir(root_folder_path):
    domain_path = os.path.join(root_folder_path, domain)
    if os.path.isdir(domain_path):
        jsonl_files = [f for f in os.listdir(domain_path) if f.endswith('.jsonl')]
        for jsonl_file in jsonl_files:
            file_path = os.path.join(domain_path, jsonl_file)
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                total_data_count += len(lines)
                file_line_counts.append((file_path, lines))

# Report the pre-sampling size.
print("Total lines before sampling:", total_data_count)

# Destination JSONL file holding the merged sample.
output_file_path = '/data3/zhouqiang/LLM_scratch/dataset/combined_sampled_data.jsonl'
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

sampled_total_count = 0  # number of lines written after sampling

# Sample every file at the fixed rate and merge into one output file.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for file_path, lines in file_line_counts:
        if not lines:
            # BUG FIX: random.sample([], max(1, 0)) would raise
            # "Sample larger than population" on an empty file.
            continue
        # At least 1 line per non-empty file, never more than exist.
        n_samples = min(len(lines), max(1, int(len(lines) * sample_rate)))
        sampled_lines = random.sample(lines, n_samples)
        sampled_total_count += len(sampled_lines)
        output_file.writelines(sampled_lines)

# Report the post-sampling size.
print("Total lines after sampling:", sampled_total_count)

print(f"Sampling completed and combined data is stored in: {output_file_path}")

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值