# 使用webapi的方式,对pandas.DataFrame数据进行收发
- client端使用requests包,发送接收数据。
- server端使用fastapi响应请求。
import io
import os
import fastapi
import requests
import pandas as pd
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
## client发送 + server接收
# client,发送数据
# 直接发送df
# client side: serialize a DataFrame in memory and POST it as a parquet upload
def send_df(df, filename="data.parquet"):
    """Serialize *df* to parquet (in memory) and POST it as a file upload.

    Args:
        df: the DataFrame to send.
        filename: name reported for the uploaded file. (The original code
            interpolated an undefined ``factor_cid`` variable here — fixed by
            making the name a parameter with a default.)

    Returns:
        The ``requests.Response`` from the POST.

    NOTE(review): ``url`` is a free variable — it must be defined in the
    enclosing scope before this function runs; confirm at call site.
    """
    buffer = io.BytesIO()
    df.to_parquet(buffer, index=False)
    # requests accepts a (filename, bytes) tuple for in-memory uploads
    file_obj = (filename, buffer.getvalue())
    return requests.post(url, files={"file": file_obj})
# 发送文件
# client side: POST an existing file on disk as a multipart upload
def send_file(path):
    """POST the file at *path* as a multipart file upload.

    Bug fixes vs. the original: the parameter was named ``path`` but the body
    read an undefined ``file_path``; and the file handle was opened without
    ever being closed — now managed by a ``with`` block.

    Returns:
        The ``requests.Response`` from the POST.

    NOTE(review): ``url`` is a free variable — it must be defined in the
    enclosing scope before this function runs; confirm at call site.
    """
    with open(path, "rb") as file_obj:
        return requests.post(url, files={"file": file_obj})
# server,接收数据
# 接收,并写入文件
# server side: receive an uploaded file and persist its raw bytes to disk
async def received_file2file(file: UploadFile = File(...)):
    """Write the uploaded file's bytes to ``output_path`` unchanged.

    NOTE(review): ``output_path`` is a free variable — it must be defined in
    the enclosing scope before this handler runs; confirm at call site.
    """
    with open(output_path, "wb") as f:
        f.write(await file.read())
# 接收,并转为dataframe
# server side: receive an uploaded parquet file and deserialize it
async def received_file2df(file: UploadFile = File(...)):
    """Read the uploaded parquet bytes and parse them into a DataFrame.

    Returns:
        ``pd.DataFrame`` parsed from the uploaded parquet payload.
        (Bug fix: the original built ``df`` but never returned it.)
    """
    contents = await file.read()
    buffer = io.BytesIO(contents)
    df = pd.read_parquet(buffer)
    return df
## server回复 + client解析
# server,返回数据
# 返回文件数据
# server side: return a file already on disk as the HTTP response body
async def rsp_file():
    """Serve the file at ``factor_path`` as a binary download.

    NOTE(review): ``factor_path`` and ``filename`` are free variables — they
    must be defined in the enclosing scope before this handler runs; confirm
    at call site.
    """
    return FileResponse(factor_path, media_type="application/octet-stream", filename=filename)
# 返回字节流数据
# server side: serialize a DataFrame to parquet and stream the bytes back
async def rsp_bytes():
    """Serialize the DataFrame ``df`` to parquet and stream it in chunks.

    Passing the raw BytesIO straight to ``StreamingResponse`` proved slow in
    testing (per the note that followed the original), so the buffer is fed
    through a chunked generator instead.

    NOTE(review): ``df`` is a free variable — it must be defined in the
    enclosing scope before this handler runs; confirm at call site.

    Returns:
        ``StreamingResponse`` with ``application/octet-stream`` content.
    """
    buffer = io.BytesIO()
    df.to_parquet(buffer, index=False)
    # Rewind BEFORE handing the buffer to the generator, so streaming starts
    # at the beginning of the parquet payload. (The original seeked after
    # constructing the response — equivalent only because the generator is
    # lazy; doing it here removes that ordering dependency.)
    buffer.seek(0)

    # Bug fix: the original chunk size used an undefined name ``n``.
    chunk_size = 4 * 1024 * 1024  # stream in 4 MB chunks

    def _yield_buffer(buf):
        # Yield the buffer's contents chunk by chunk until exhausted.
        while True:
            chunk = buf.read(chunk_size)
            if not chunk:
                break
            yield chunk

    return StreamingResponse(
        _yield_buffer(buffer),          # iterator over the full buffer
        media_type="application/octet-stream",
    )
# client,解析数据
# client side: fetch a parquet byte stream and parse it into a DataFrame
def get_data(url=None):
    """GET *url* and parse the parquet response body into a DataFrame.

    Args:
        url: endpoint to fetch. (Bug fix: the original called
            ``requests.get()`` with no URL at all, which raises ``TypeError``;
            the URL is now a parameter.)

    Returns:
        The parsed ``pd.DataFrame`` when the server replies with
        ``application/octet-stream``; otherwise ``None`` (implicit).
    """
    response = requests.get(url)
    # The server streams parquet bytes with this content type (see rsp_bytes).
    if response.headers.get('Content-Type') == 'application/octet-stream':
        file_obj = io.BytesIO(response.content)
        df = pd.read_parquet(file_obj)
        return df
    # NOTE(review): falls through to None for any other content type —
    # callers should check for that.
## 优化历程
针对server回复 + client解析这个需求,之前有个方案是,server端将df数据转成json
格式,进行return。client端转json与dataframe初始化。
# --- Legacy JSON-based approach (illustrative snippet, kept for reference) ---
# server: return the DataFrame serialized as a JSON string
return df.to_json()
# client received and parse
# NOTE(review): this snippet additionally needs ``import json`` and a URL
# argument to ``requests.get()`` to actually run.
response = requests.get()
json_data = json.loads(response.content)
df = pd.DataFrame(json_data)
但该方案性能很差(测试用的df很大,200w行+)。耗时主要在client端的json to df
转换。不推荐使用。