使用 Ray 分布式计算框架读取 JSONL (JSON Lines) 格式的文件。
直接读取 .jsonl 文件时可能遇到如下报错 (可能是默认的文件扩展名过滤未包含 .jsonl, 可按下面的示例显式设置 partition_filter):
ValueError: No input files found to read. Please double check that 'partition_filter' field is set properly.
import ray
from ray.data.datasource import FileExtensionFilter
from pyarrow import json as pyarrow_json
# Parse the JSON input in 10 MiB blocks (10 * 1024 * 1024 bytes).
json_block_size = 10 * 1024 * 1024
read_options = pyarrow_json.ReadOptions(block_size=json_block_size)

# Accept files whose extension is "jsonl" or "json".
# NOTE(review): newer Ray releases appear to offer a `file_extensions`
# argument on `read_json` in place of `FileExtensionFilter` -- confirm
# against the installed Ray version before switching.
extension_filter = FileExtensionFilter(['jsonl', 'json'])

ds = ray.data.read_json(
    args.path,
    partition_filter=extension_filter,
    read_options=read_options,  # sets the block size
)