import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Read JSON, JSONL and Parquet files
def read_json_file(file_path):
    """Load a single JSON document from a UTF-8 text file.

    This is a best-effort reader: failures are reported on stdout instead of
    being raised.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file is missing, is not
        valid JSON, or any other error occurs while reading it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except json.JSONDecodeError:
        print(f"File {file_path} is not a valid JSON file.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None


def read_jsonl_file(file_path):
    """Load a JSON Lines file (one JSON value per line) into a list.

    Blank lines are skipped. Unlike ``read_json_file`` this reader is strict:
    a missing file or a malformed line raises.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one decoded value per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message names
            the file and the 1-based line number; the underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # Blank lines carry no record; tolerate them instead of failing.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Report where the bad record is rather than crashing with an
                # unrelated exception type.
                raise ValueError(
                    f"Invalid JSON on line {line_number} of {file_path}: "
                    f"{line.rstrip()!r}"
                ) from err
    return data
def read_praquet_file(file_path):
    """Read a Parquet file and return its rows as a list.

    Args:
        file_path: Path of the Parquet file, passed to ``pq.read_table``.

    Returns:
        A list holding the row object yielded by ``DataFrame.iterrows`` for
        every row of the table, in order.
    """
    frame = pq.read_table(file_path).to_pandas()
    rows = []
    # iterrows yields (index, row) pairs; only the row part is kept.
    for _index, record in frame.iterrows():
        rows.append(record)
    return rows
# Write JSON, JSONL and Parquet files
def save_json(file_path, data):
    """Serialize ``data`` as indented JSON into a UTF-8 text file.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and a
    confirmation line is printed once the file has been written.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Any value accepted by ``json.dump``.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        json.dump(data, out, indent=4, ensure_ascii=False)
    print(f'Save {file_path} is ok!')


def save_jsonl(file_path, data):
    """Write every item of ``data`` as one JSON line in a UTF-8 text file.

    Best-effort: any error raised while opening, serializing or writing is
    printed rather than propagated.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of values accepted by ``json.dumps``.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + '\n' for record in data
            )
        print(f"Data saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the data: {e}")


def save_parquet(file_path, data):
    """Write ``data`` to a Parquet file.

    Args:
        file_path: Destination path handed to ``pq.write_table``.
        data: A pandas DataFrame, or a list that ``pd.DataFrame`` can turn
            into one.

    Raises:
        ValueError: If ``data`` is neither a list nor a DataFrame.
    """
    frame = pd.DataFrame(data) if isinstance(data, list) else data
    if not isinstance(frame, pd.DataFrame):
        raise ValueError("data must be a pandas DataFrame or a list of lists")
    table = pa.Table.from_pandas(frame)
    pq.write_table(table, file_path)
    print(f'Save {file_path} is ok!')


# Example data for writing a Parquet file
data = dict(col1=[1, 2, 3], col2=['a', 'b', 'c'])
df = pd.DataFrame(data)

# Save the example frame to a Parquet file
save_parquet('output.parquet', df)
# Read a JSONL file and save it as a Parquet file
def convert_lists_to_json(df):
    """Replace list values in a DataFrame with their JSON string form.

    Only columns that contain at least one ``list`` value are touched; within
    those columns, non-list values are left unchanged. Strings are encoded
    with ``ensure_ascii=False`` so non-ASCII text stays readable, matching the
    other JSON writers in this file.

    Note:
        The conversion assigns the new columns onto ``df`` itself, so the
        frame passed in is modified.

    Args:
        df: The pandas DataFrame to convert.

    Returns:
        The same ``df`` object, after conversion.
    """
    def _encode(value):
        # Serialize lists only; every other value passes through untouched.
        if isinstance(value, list):
            return json.dumps(value, ensure_ascii=False)
        return value

    for column in df.columns:
        if df[column].apply(lambda value: isinstance(value, list)).any():
            df[column] = df[column].apply(_encode)
    return df
def save_parquet(file_path, data):
    """Write ``data`` to a Parquet file, storing list values as JSON strings.

    List values are converted with ``convert_lists_to_json`` before writing.
    When a DataFrame is passed in, the conversion runs on a copy so the
    caller's frame is left unmodified.

    Args:
        file_path: Destination path handed to ``pq.write_table``.
        data: A pandas DataFrame, or a list that ``pd.DataFrame`` can turn
            into one.

    Raises:
        ValueError: If ``data`` is neither a list nor a DataFrame.
    """
    if isinstance(data, list):
        frame = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        # convert_lists_to_json assigns columns in place; work on a copy so
        # saving does not rewrite the caller's DataFrame as a side effect.
        frame = data.copy()
    else:
        raise ValueError("data must be a pandas DataFrame or a list of lists")

    # Convert lists to JSON strings before saving to Parquet
    frame = convert_lists_to_json(frame)
    pq.write_table(pa.Table.from_pandas(frame), file_path)
    print(f'Save {file_path} is ok!')
# Source JSONL file and destination Parquet file for the conversion
save_path = 'output.parquet'
file_path = 'input.jsonl'

# Load the records, wrap them in a DataFrame, then write the Parquet file
data = read_jsonl_file(file_path)
df = pd.DataFrame(data)
save_parquet(save_path, df)