基于Qwen2-VL模型针对 ImageToText 任务进行微调训练 - 数据处理
flyfish
给定的图像生成一段自然语言描述。它的目标是生成一个或多个句子,能够准确地描述图像中的主要内容、物体、动作、场景等信息。例如,对于一张包含一只狗在草地上奔跑的图像,ImageToText 可能会生成 “一只狗在绿色的草地上快乐地奔跑” 这样的文字描述。
数据集描述
Image-caption task的数据集,包含train和valid
数据集简介
mscoco 2014的image caption数据集。
数据集支持的任务
支持image caption任务
数据集的格式和结构
数据格式
包含image_id, caption, image等信息。
数据集加载方式
from modelscope.msdatasets import MsDataset
ds = MsDataset.load("coco_2014_caption", namespace="modelscope", split="train")
print(ds[0])
生成的数据集格式
[
{
"id": "identity_12801",
"conversations": [
{
"from": "user",
"value": "/home/sss/datasets/1/coco_2014_caption/467265.jpg"
},
{
"from": "assistant",
"value": "A group of young people standing in the middle of a street."
}
]
},
{
"id": "identity_12802",
"conversations": [
{
"from": "user",
"value": "/home/sss/datasets/1/coco_2014_caption/227117.jpg"
},
{
"from": "assistant",
"value": "Oven light on in a kitchen with wooden countertops. "
}
]
},
......
]
完整代码如下
import os
import pandas as pd
import json
import argparse
from modelscope.msdatasets import MsDataset
class CocoCaptionProcessor:
    """Processor for the COCO 2014 image-caption dataset.

    Downloads the dataset from ModelScope, saves each image to disk while
    recording (image_path, caption) pairs in a CSV index, then converts that
    index into conversation-format JSON files split into train/validation
    sets for fine-tuning an image-to-text model.
    """

    def __init__(self, max_data_number=500, dataset_dir='coco_2014_caption', csv_file='coco2014.csv'):
        """
        Initialize the processor.

        :param max_data_number: maximum number of samples to process
        :param dataset_dir: directory in which downloaded images are saved
        :param csv_file: path of the CSV index file (columns: image_path, caption)
        """
        self.max_data_number = max_data_number
        self.dataset_dir = dataset_dir
        self.csv_file = csv_file
        # Parallel lists filled by download_and_process().
        self.image_paths = []
        self.captions = []

    def download_and_process(self):
        """Download the COCO 2014 caption split from ModelScope and build the CSV index.

        Skips all work when ``dataset_dir`` already exists. NOTE(review): in
        that case the CSV is not (re)created, so generate_conversations_json()
        relies on a CSV left over from a previous run.
        """
        # Guard clause: nothing to do when the dataset was already downloaded.
        if os.path.exists(self.dataset_dir):
            print(f'{self.dataset_dir} directory already exists, skipping data processing.')
            return
        ds = MsDataset.load('modelscope/coco_2014_caption', subset_name='coco_2014_caption', split='train')
        total = min(self.max_data_number, len(ds))
        os.makedirs(self.dataset_dir, exist_ok=True)
        # Reset accumulators so a repeated call does not duplicate rows.
        self.image_paths = []
        self.captions = []
        for i in range(total):
            item = ds[i]
            # Absolute path so downstream JSON files work from any cwd.
            image_path = os.path.abspath(os.path.join(self.dataset_dir, f"{item['image_id']}.jpg"))
            item['image'].save(image_path)
            self.image_paths.append(image_path)
            self.captions.append(item['caption'])
            # Lightweight progress report every 50 images.
            if (i + 1) % 50 == 0:
                print(f'Processing {i+1}/{total} images ({(i+1)/total*100:.1f}%)')
        df = pd.DataFrame({
            'image_path': self.image_paths,
            'caption': self.captions
        })
        df.to_csv(self.csv_file, index=False)
        print(f'Data processing completed, processed a total of {total} images.')

    def generate_conversations_json(self, output_file='coco2014.json', train_ratio=0.8):
        """Convert the CSV index into conversation-format JSON files.

        Writes two files derived from ``output_file`` (``*_train.json`` and
        ``*_val.json``), splitting the rows at ``train_ratio``.

        :param output_file: base name used to derive the train/val file names
        :param train_ratio: fraction of rows assigned to the training set
        :raises FileNotFoundError: if the CSV index does not exist
        """
        df = pd.read_csv(self.csv_file)
        # itertuples avoids the per-row .iloc scalar lookups of an index loop.
        conversations = [
            {
                "id": f"identity_{i + 1}",
                "conversations": [
                    {"from": "user", "value": row.image_path},
                    {"from": "assistant", "value": row.caption}
                ]
            }
            for i, row in enumerate(df.itertuples(index=False))
        ]
        # Split into train and validation sets.
        split_index = int(len(conversations) * train_ratio)
        train_conversations = conversations[:split_index]
        val_conversations = conversations[split_index:]
        with open(output_file.replace('.json', '_train.json'), 'w', encoding='utf-8') as f:
            json.dump(train_conversations, f, ensure_ascii=False, indent=2)
        with open(output_file.replace('.json', '_val.json'), 'w', encoding='utf-8') as f:
            json.dump(val_conversations, f, ensure_ascii=False, indent=2)
        print('Generated JSON files for training and validation sets.')
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process COCO 2014 Caption Dataset.",
epilog="Example usage: python script.py --max_data_number 100 --dataset_dir ./data --csv_file ./output.csv"
)
# 参数分组,让帮助信息更清晰
data_options = parser.add_argument_group('Data Options')
data_options.add_argument('--max_data_number', type=int, default=16000,
help='Maximum number of data entries to process (default: %(default)s)')
data_options.add_argument('--dataset_dir', type=str, default='coco_2014_caption',
help='Directory to save the dataset (default: %(default)s)')
data_options.add_argument('--csv_file', type=str, default='./coco2014.csv',
help='Path to save the CSV file (default: %(default)s)')
output_options = parser.add_argument_group('Output Options')
output_options.add_argument('--output_file', type=str, default='coco2014.json',
help='Base name for output JSON files (default: %(default)s)')
output_options.add_argument('--train_ratio', type=float, default=0.8,
help='Ratio of data to use for training set (default: %(default)s)')
args = parser.parse_args()
processor = CocoCaptionProcessor(
max_data_number=args.max_data_number,
dataset_dir=args.dataset_dir,
csv_file=args.csv_file
)
processor.download_and_process()
processor.generate_conversations_json(output_file=args.output_file, train_ratio=args.train_ratio)