import requests
from starlette.requests import Request
from typing import Dict
from ray import serve
# 1: Define a Ray Serve application.
@serve.deployment
class MyModelDeployment:
    def __init__(self, msg: str):
        # Initialize model state: could be very large neural net weights.
        self._msg = msg

    def __call__(self, request: Request) -> Dict:
        return {"result": self._msg}
app = MyModelDeployment.bind(msg="Hello world!")
# 2: Deploy the application locally.
serve.run(app, route_prefix="/")
# 3: Query the application and print the result.
print(requests.get("http://localhost:8000/").json())
# {'result': 'Hello world!'}
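Deployment settings can also be changed without editing the class. A minimal sketch, assuming Ray Serve's Deployment.options API (the scaled variant is hypothetical, not part of the quickstart):

# Redeploy the same class with two replicas instead of one.
scaled_app = MyModelDeployment.options(num_replicas=2).bind(msg="Hello world!")
serve.run(scaled_app, route_prefix="/")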
Server script:
# File name: serve_quickstart.py
from starlette.requests import Request
import ray
from ray import serve
from transformers import pipeline
@serve.deployment(num_replicas=2, ray_actor_options={"num_cpus": 0.2, "num_gpus": 0})
class Translator:
    def __init__(self):
        # Load model
        self.model = pipeline("translation_en_to_fr", model="t5-small")

    def translate(self, text: str) -> str:
        # Run inference
        model_output = self.model(text)
        # Post-process output to return only the translation text
        translation = model_output[0]["translation_text"]
        return translation

    async def __call__(self, http_request: Request) -> str:
        english_text: str = await http_request.json()
        return self.translate(english_text)
translator_app = Translator.bind()
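Each replica above reserves 0.2 CPUs and no GPUs through ray_actor_options. As a hedged sketch (a hypothetical variant, not part of the quickstart), the same deployment could instead be placed on GPUs via Deployment.options:

# Hypothetical GPU variant: give each replica one GPU.
translator_gpu_app = Translator.options(
    ray_actor_options={"num_gpus": 1}
).bind()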
Start the server script with the Serve CLI:

serve run serve_quickstart:translator_app

By default, the application runs at http://127.0.0.1:8000/.
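While the server script is running, you can inspect the application's state from another terminal with the Serve CLI (this assumes the default local cluster):

serve status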
Client script:
# File name: model_client.py
import requests
english_text = "Hello world!"
response = requests.post("http://127.0.0.1:8000/", json=english_text)
french_text = response.text
print(french_text)
Test it:
python model_client.py
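Because the deployment was configured with num_replicas=2, concurrent requests are load-balanced across the two replicas. A minimal sketch with hypothetical input texts:

# File name: model_client_many.py (hypothetical)
import requests

texts = ["Hello world!", "How are you today?"]
for text in texts:
    # Each POST may be handled by either replica.
    print(requests.post("http://127.0.0.1:8000/", json=text).text)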
Model composition: the Summarizer deployment below calls the Translator through a DeploymentHandle, so each model can be scaled and configured independently.
# File name: serve_quickstart_composed.py
from starlette.requests import Request
import ray
from ray import serve
from ray.serve.handle import DeploymentHandle
from transformers import pipeline
@serve.deployment
class Translator:
    def __init__(self):
        # Load model
        self.model = pipeline("translation_en_to_fr", model="t5-small")

    def translate(self, text: str) -> str:
        # Run inference
        model_output = self.model(text)
        # Post-process output to return only the translation text
        translation = model_output[0]["translation_text"]
        return translation


@serve.deployment
class Summarizer:
    def __init__(self, translator: DeploymentHandle):
        self.translator = translator
        # Load model.
        self.model = pipeline("summarization", model="t5-small")

    def summarize(self, text: str) -> str:
        # Run inference
        model_output = self.model(text, min_length=5, max_length=15)
        # Post-process output to return only the summary text
        summary = model_output[0]["summary_text"]
        return summary

    async def __call__(self, http_request: Request) -> str:
        english_text: str = await http_request.json()
        summary = self.summarize(english_text)
        translation = await self.translator.translate.remote(summary)
        return translation
app = Summarizer.bind(Translator.bind())
Start the composed application:

serve run serve_quickstart_composed:app

Client script:
# File name: composed_client.py
import requests
english_text = (
"It was the best of times, it was the worst of times, it was the age "
"of wisdom, it was the age of foolishness, it was the epoch of belief"
)
response = requests.post("http://127.0.0.1:8000/", json=english_text)
french_text = response.text
print(french_text)
Test it:
python composed_client.py
Result:
c'était le meilleur des temps, c'était le pire des temps .
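The composed application can also be exercised without HTTP. A minimal sketch, assuming serve.run returns a DeploymentHandle (the Ray 2.7+ handle API) and that the script above is importable:

# Run in a Python session instead of the Serve CLI.
from ray import serve
from serve_quickstart_composed import app

handle = serve.run(app)
# Call only the Summarizer's summarize method, bypassing HTTP.
response = handle.summarize.remote("It was the best of times, it was the worst of times.")
print(response.result())  # Blocks until the summary is ready.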