From ac212409277837232744eacd508b614ae11495a2 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 13 Nov 2025 19:19:14 -0800 Subject: [PATCH 01/18] initial draft --- beginner_source/5_Intro_Serve.ipynb | 881 +++++++++++++++++++++++++ beginner_source/ray_serve_tutorial.rst | 529 +++++++++++++++ 2 files changed, 1410 insertions(+) create mode 100644 beginner_source/5_Intro_Serve.ipynb create mode 100644 beginner_source/ray_serve_tutorial.rst diff --git a/beginner_source/5_Intro_Serve.ipynb b/beginner_source/5_Intro_Serve.ipynb new file mode 100644 index 00000000000..e7fa8cb0988 --- /dev/null +++ b/beginner_source/5_Intro_Serve.ipynb @@ -0,0 +1,881 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "682e224e-1bb9-470c-b363-386ede0785a4", + "metadata": {}, + "source": [ + "# Intro to Ray Serve\n", + "\n", + "This notebook will introduce you to Ray Serve, a framework for building and deploying scalable ML applications.\n", + "\n", + "
\n", + " \n", + "Here is the roadmap for this notebook:\n", + "\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "1060aea0", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "099b7710", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from torchvision import transforms\n", + "\n", + "import json\n", + "import numpy as np\n", + "import ray\n", + "import requests\n", + "import torch\n", + "from ray import serve\n", + "from matplotlib import pyplot as plt\n", + "from fastapi import FastAPI\n", + "from starlette.requests import Request" + ] + }, + { + "cell_type": "markdown", + "id": "7250cc03-e52c-4e30-a262-8d8e0a5a0837", + "metadata": {}, + "source": [ + "## 1. Overview of Ray Serve\n", + "\n", + "Serve is a framework for serving ML applications. \n", + "\n", + "Here is a high-level overview of the architecture of a Ray Serve Application.\n", + "\n", + "\n", + "\n", + "An Application is a collection of one or more Deployments that are deployed together.\n", + "\n", + "### Deployments\n", + "\n", + "`Deployment` is the fundamental developer-facing element of serve.\n", + "\n", + "\n", + "\n", + "Each deployment can have multiple replicas. \n", + "\n", + "A replica is implemented as a Ray actor with a queue to process incoming requests.\n", + "\n", + "Each replica can be configured with a set of compute resources. " + ] + }, + { + "cell_type": "markdown", + "id": "6380b141", + "metadata": {}, + "source": [ + "### When to use Ray Serve?\n", + "\n", + "Ray Serve is designed to be used in the following scenarios:\n", + "- Build end-to-end ML applications with a flexible and programmable python API\n", + "- Flexibly scale up and down your compute resources to meet the demand of your application\n", + "- Easy to develop on a local machine, and scale to a multi-node GPU cluster\n", + "\n", + "#### Key Ray Serve Features\n", + "Ray Serve provides the following key features and optimizations:\n", + "- [response streaming](https://docs.ray.io/en/latest/serve/tutorials/streaming.html)\n", + "- [dynamic request batching](https://docs.ray.io/en/latest/serve/advanced-guides/dyn-req-batch.html)\n", + "- [multi-node/multi-GPU serving](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html)\n", + "- [model multiplexing](https://docs.ray.io/en/latest/serve/model-multiplexing.html)\n", + "- [fractional compute resource usage](https://docs.ray.io/en/latest/serve/configure-serve-deployment.html)" + ] + }, + { + "cell_type": "markdown", + "id": "a43da1a6", + "metadata": {}, + "source": [ + "## 2. Implement an MNISTClassifier service\n", + "\n", + "Let’s jump right in and get a simple ML service up and running on Ray Serve. \n", + "\n", + "Recall the `MNISTClassifier` we built to perform batch inference on the `MNIST` dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14fb17a6-a71c-4a11-8ea8-b1b350a5fa1c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class OfflineMNISTClassifier:\n", + " def __init__(self, local_path: str):\n", + " self.model = torch.jit.load(local_path)\n", + " self.model.to(\"cuda\")\n", + " self.model.eval()\n", + "\n", + " def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, np.ndarray]:\n", + " return self.predict(batch)\n", + " \n", + " def predict(self, batch: dict[str, np.ndarray]) -> dict[str, np.ndarray]:\n", + " images = torch.tensor(batch[\"image\"]).float().to(\"cuda\")\n", + "\n", + " with torch.no_grad():\n", + " logits = self.model(images).cpu().numpy()\n", + "\n", + " batch[\"predicted_label\"] = np.argmax(logits, axis=1)\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48d148b8", + "metadata": {}, + "outputs": [], + "source": [ + "# We download the model from s3 to the EFS storage\n", + "!aws s3 cp s3://anyscale-public-materials/ray-ai-libraries/mnist/model/model.pt /mnt/cluster_storage/model.pt" + ] + }, + { + "cell_type": "markdown", + "id": "e1a79961", + "metadata": {}, + "source": [ + "Here is how we can use the `OfflineMNISTClassifier` to perform batch inference on a dataset of random images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41b16400", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataset of random images\n", + "ds = ray.data.from_items([{\"image\": np.random.rand(1, 28, 28)} for _ in range(100)])\n", + "\n", + "# Map the OfflineMNISTClassifier to the dataset\n", + "ds = ds.map_batches(\n", + " OfflineMNISTClassifier,\n", + " fn_constructor_kwargs={\"local_path\": \"/mnt/cluster_storage/model.pt\"},\n", + " concurrency=1,\n", + " num_gpus=1,\n", + " batch_size=10\n", + ")\n", + "\n", + "# Take a look at the first 10 predictions\n", + "ds.take_batch(10)[\"predicted_label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "fbb1a687", + "metadata": {}, + "source": [ + "Now, if want to migrate to an online inference setting, we can transform this into a Ray Serve Deployment by applying the `@serve.deployment` decorator \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c68888dd", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment() # this is the decorator to add\n", + "class OnlineMNISTClassifier:\n", + " def __init__(self, local_path: str):\n", + " self.model = torch.jit.load(local_path)\n", + " self.model.to(\"cuda\")\n", + " self.model.eval()\n", + "\n", + " async def __call__(self, request: Request) -> dict[str, Any]: # __call__ now takes a Starlette Request object\n", + " batch = json.loads(await request.json()) # we will need to parse the JSON body of the request\n", + " return await self.predict(batch)\n", + " \n", + " async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, np.ndarray]:\n", + " # same code as OfflineMNISTClassifier.predict except we added async to the method\n", + " images = torch.tensor(batch[\"image\"]).float().to(\"cuda\")\n", + "\n", + " with torch.no_grad():\n", + " logits = self.model(images).cpu().numpy()\n", + "\n", + " batch[\"predicted_label\"] = np.argmax(logits, axis=1)\n", + " return batch" + ] + }, + { + "cell_type": "markdown", + "id": "2cf85ff1", + "metadata": {}, + "source": [ + "We can now instantiate the `OnlineMNISTClassifier` as a Ray Serve Application using `.bind`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df46ddd7", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_deployment = OnlineMNISTClassifier.options(\n", + " num_replicas=1,\n", + " ray_actor_options={\"num_gpus\": 1},\n", + ")\n", + "\n", + "mnist_app = mnist_deployment.bind(local_path=\"/mnt/cluster_storage/model.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "098e8ac4", + "metadata": {}, + "source": [ + "
\n", + "\n", + "**Note:** `.bind` is a method that takes in the arguments to pass to the Deployment constructor.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "e3e70529", + "metadata": {}, + "source": [ + "We can then run the application " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e96056cd", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_deployment_handle = serve.run(mnist_app, name='mnist_classifier', blocking=False)" + ] + }, + { + "cell_type": "markdown", + "id": "5f4a0cdb-822a-4439-aeab-9916dd8d059c", + "metadata": {}, + "source": [ + "We can test it as an HTTP endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c0a80e9-c26f-48d2-8985-ef4eab4dc580", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "images = np.random.rand(2, 1, 28, 28).tolist()\n", + "json_request = json.dumps({\"image\": images})\n", + "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", + "response.json()[\"predicted_label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "7cd2cb01", + "metadata": {}, + "source": [ + "We can also test it as a gRPC endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "342928ea", + "metadata": {}, + "outputs": [], + "source": [ + "batch = {\"image\": np.random.rand(10, 1, 28, 28)}\n", + "response = await mnist_deployment_handle.predict.remote(batch)\n", + "response[\"predicted_label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "4e170084", + "metadata": {}, + "source": [ + "## 3. Advanced features of Ray Serve" + ] + }, + { + "cell_type": "markdown", + "id": "da2b22a2", + "metadata": {}, + "source": [ + "### Using fractions of a GPU\n", + "\n", + "With Ray we can specify fractional compute resources for each deployment's replica. \n", + "\n", + "This is useful to help us fully utilize a GPU especially when running small models like our `MNISTClassifier` model.\n", + "\n", + "Here is how to specify only 10% of a GPU's compute resources for our `MNISTClassifier` model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "230f9ff2", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_app = OnlineMNISTClassifier.options(\n", + " num_replicas=4, # we can scale to up to 10 replicas on a single GPU\n", + " ray_actor_options={\"num_gpus\": 0.1}, \n", + ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "b35a8d83", + "metadata": {}, + "source": [ + "Next we update the running application by running serve.run with the new options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e9ad6fa", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_deployment_handle = serve.run(mnist_app, name='mnist_classifier', blocking=False)" + ] + }, + { + "cell_type": "markdown", + "id": "b196a535", + "metadata": {}, + "source": [ + "We can test the new application by sending a sample request." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5aad97c", + "metadata": {}, + "outputs": [], + "source": [ + "images = np.random.rand(2, 1, 28, 28).tolist()\n", + "json_request = json.dumps({\"image\": images})\n", + "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", + "response.json()[\"predicted_label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "05041234", + "metadata": {}, + "source": [ + "### Customizing autoscaling\n", + "\n", + "Ray Serve provides a simple way to autoscale the number of replicas in a deployment. It is primarily based on the target number of ongoing requests per replica.\n", + "\n", + "i.e. 
here is how we can set the autoscaling config for our `OnlineMNISTClassifier` deployment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e356f749", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_app = OnlineMNISTClassifier.options(\n", + " ray_actor_options={\"num_gpus\": 0.1}, \n", + " autoscaling_config={\n", + " \"target_ongoing_requests\": 10,\n", + " },\n", + ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "1ae8a244", + "metadata": {}, + "source": [ + "We can also control more granularly the autoscaling logic by setting:\n", + "- the upscale and downscale delays\n", + "- the intervals at which the replica sends metrics reports about the current number of ongoing requests\n", + "- the look-back period used to evaluate the current number of ongoing requests\n", + "\n", + "Here is an example of how to set these options:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e5594d5", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_app = OnlineMNISTClassifier.options(\n", + " ray_actor_options={\"num_gpus\": 0.1}, \n", + " autoscaling_config={\n", + " \"target_ongoing_requests\": 10,\n", + " \"upscale_delay_s\": 10,\n", + " \"downscale_delay_s\": 10,\n", + " \"metrics_interval_s\": 10,\n", + " \"look_back_period_s\": 10, \n", + " },\n", + ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "e8a643b4", + "metadata": {}, + "source": [ + "We can additionally control the minimum and maximum number of replicas that can be scaled up and down. \n", + "\n", + "We can even specify to start scaling up from 0 replicas." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebea6c15", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_app = OnlineMNISTClassifier.options(\n", + " ray_actor_options={\"num_gpus\": 0.1}, \n", + " autoscaling_config={\n", + " \"target_ongoing_requests\": 10,\n", + " \"initial_replicas\": 0, # scale up from 0 replicas\n", + " \"min_replicas\": 0,\n", + " \"max_replicas\": 10,\n", + " # extreme upscale speeds\n", + " \"upscale_delay_s\": 0,\n", + " \"metrics_interval_s\": 0.1,\n", + " \"look_back_period_s\": 0.1,\n", + " },\n", + ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "e040d6ac", + "metadata": {}, + "source": [ + "Let's run the application with the new autoscaling config." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbe684a4", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_deployment_handle = serve.run(mnist_app, name='mnist_classifier', blocking=False)" + ] + }, + { + "cell_type": "markdown", + "id": "75be6e25", + "metadata": {}, + "source": [ + "Looking at the Ray Serve dashboard, we can see we are currently at 0 replicas - i.e. no GPU resources are being used.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "761fd6a6", + "metadata": {}, + "source": [ + "We can send out a larger number of requests to the `OnlineMNISTClassifier` deployment to see the autoscaling in action." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a7a834f", + "metadata": {}, + "outputs": [], + "source": [ + "batch = {\"image\": np.random.rand(10, 1, 28, 28)}\n", + "[\n", + " mnist_deployment_handle.predict.remote(batch)\n", + " for _ in range(100)\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "91a4e5e9", + "metadata": {}, + "source": [ + "Looking at the Ray Serve dashboard, we can see that the number of replicas has scaled up to 10 as expected.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "c52225df", + "metadata": {}, + "source": [ + "Let's shutdown the service for now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d53e06b", + "metadata": {}, + "outputs": [], + "source": [ + "serve.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "2e2d1c58", + "metadata": {}, + "source": [ + "### Composing Deployments\n", + "\n", + "Ray Serve allows us to compose Deployments together to build more complex applications.\n", + "\n", + "Lets compose our `OnlineMNISTClassifier` with an `OnlineMNISTPreprocessor` deployment that performs the necessary transformations on the input data.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67670984", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment\n", + "class OnlineMNISTPreprocessor:\n", + " def __init__(self):\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.5,), (0.5,))\n", + " ])\n", + " \n", + " async def run(self, batch: dict[str, Any]) -> dict[str, Any]:\n", + " images = batch[\"image\"]\n", + " images = [self.transform(np.array(image, dtype=np.uint8)).cpu().numpy() for image in images]\n", + " return {\"image\": images}\n", + "\n", + "preprocessor_app = OnlineMNISTPreprocessor.bind()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b0dc24f", + "metadata": {}, + "outputs": [], + "source": [ + "preprocessor_handle = serve.run(preprocessor_app, name='mnist_preprocessor', blocking=False, route_prefix=\"/preprocess\")" + ] + }, + { + "cell_type": "markdown", + "id": "92daf899", + "metadata": {}, + "source": [ + "Let's load an image and pass it to the `ImageTransformDeployment`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "441a8762", + "metadata": {}, + "outputs": [], + "source": [ + "ds = ray.data.read_images(\"s3://anyscale-public-materials/ray-ai-libraries/mnist/50_per_index/\", include_paths=True)\n", + "image_batch = ds.take_batch(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1289797c", + "metadata": {}, + "outputs": [], + "source": [ + "# plot the first image using matplotlib\n", + "plt.imshow(image_batch[\"image\"][0], cmap=\"gray\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94df4e63", + "metadata": {}, + "outputs": [], + "source": [ + "normalized_batch = await preprocessor_handle.run.remote(image_batch)\n", + "\n", + "for image in normalized_batch[\"image\"]:\n", + " assert image.shape == (1, 28, 28) # channel, height, width\n", + " assert image.min() >= -1 and image.max() <= 1 # normalized to [-1, 1]" + ] + }, + { + "cell_type": "markdown", + "id": "da2848fc", + "metadata": {}, + "source": [ + "We will proceed to shutdown the preprocessor application to prove it will be automatically created by the ingress.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0ac5957", + "metadata": {}, + "outputs": 
[], + "source": [ + "serve.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "b0e44763", + "metadata": {}, + "source": [ + "Let's now build an ingress for our application that composes the `ImageTransformDeployment` and `OnlineMNISTClassifier`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88340028", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment\n", + "class ImageServiceIngress:\n", + " def __init__(self, preprocessor: OnlineMNISTPreprocessor, model: OnlineMNISTClassifier):\n", + " self.preprocessor = preprocessor\n", + " self.model = model\n", + "\n", + " async def __call__(self, request: Request):\n", + " batch = json.loads(await request.json())\n", + " response = await self.preprocessor.run.remote(batch)\n", + " return await self.model.predict.remote(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "affcac11", + "metadata": {}, + "outputs": [], + "source": [ + "image_classifier_ingress = ImageServiceIngress.bind(\n", + " preprocessor=OnlineMNISTPreprocessor.bind(),\n", + " model=OnlineMNISTClassifier.options(\n", + " num_replicas=1,\n", + " ray_actor_options={\"num_gpus\": 0.1},\n", + " ).bind(local_path=\"/mnt/cluster_storage/model.pt\"),\n", + ")\n", + "\n", + "handle = serve.run(image_classifier_ingress, name='image_classifier', blocking=False)" + ] + }, + { + "cell_type": "markdown", + "id": "aa81a51f", + "metadata": {}, + "source": [ + "Let's test the application by sending a sample HTTP request to our ingress endpoint.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d084ee7", + "metadata": {}, + "outputs": [], + "source": [ + "json_request = json.dumps({\"image\": image_batch[\"image\"].tolist()}) \n", + "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", + "response.json()[\"predicted_label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "1fbe8773", + "metadata": {}, + "source": [ + "### Integrating with FastAPI\n", + "\n", + "Ray Serve can be integrated with FastAPI to provide:\n", + "- HTTP routing\n", + "- Pydantic model validation\n", + "- OpenAPI documentation\n", + "\n", + "To integrate a Deployment with FastAPI, we can use the `@serve.ingress` decorator to designate a FastAPI app as the entrypoint for HTTP requests to our Serve application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d163431", + "metadata": {}, + "outputs": [], + "source": [ + "app = FastAPI()\n", + "\n", + "@serve.deployment\n", + "@serve.ingress(app)\n", + "class ImageServiceIngress:\n", + " def __init__(self, preprocessor: OnlineMNISTPreprocessor, model: OnlineMNISTClassifier):\n", + " self.preprocessor = preprocessor\n", + " self.model = model\n", + " \n", + " @app.post(\"/predict\")\n", + " async def predict(self, request: Request):\n", + " batch = json.loads(await request.json())\n", + " response = await self.preprocessor.run.remote(batch)\n", + " out = await self.model.predict.remote(response)\n", + " return {\"predicted_label\": out[\"predicted_label\"].tolist()}" + ] + }, + { + "cell_type": "markdown", + "id": "a3a31b87", + "metadata": {}, + "source": [ + "We now can build the application and run it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0371807e", + "metadata": {}, + "outputs": [], + "source": [ + "image_classifier_ingress = ImageServiceIngress.bind(\n", + " preprocessor=OnlineMNISTPreprocessor.bind(),\n", + " model=OnlineMNISTClassifier.options(\n", + " num_replicas=1,\n", + " ray_actor_options={\"num_gpus\": 0.1},\n", + " ).bind(local_path=\"/mnt/cluster_storage/model.pt\"),\n", + ")\n", + "\n", + "handle = serve.run(image_classifier_ingress, name='image_classifier', blocking=False)" + ] + }, + { + "cell_type": "markdown", + "id": "012894c6", + "metadata": {}, + "source": [ + "After running the application, we can get test it as an HTTP endpoint programmatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e217336", + "metadata": {}, + "outputs": [], + "source": [ + "json_request = json.dumps({\"image\": image_batch[\"image\"].tolist()}) \n", + "response = requests.post(\"http://localhost:8000/predict\", json=json_request)\n", + "response.json()[\"predicted_label\"]" + ] + }, + { + "cell_type": "markdown", + "id": "287ff14a", + "metadata": {}, + "source": [ + "We can also visit the auto-generated FastAPI docs at http://localhost:8000/docs to get an interactive UI to test our endpoint." + ] + }, + { + "cell_type": "markdown", + "id": "5e2af689", + "metadata": {}, + "source": [ + "## 4. Ray Serve in Production\n", + "\n", + "1. Klaviyo built their model serving platform with Ray Serve. See [this article from Klaviyo Engineering](https://klaviyo.tech/how-klaviyo-built-a-robust-model-serving-platform-with-ray-serve-c02ec65788b3)\n", + "2. Samsara uses Ray Serve to bridge the gap of development to deployment of their models. See [this article from Samsara Engineering](https://www.samsara.com/blog/building-a-modern-machine-learning-platform-with-ray)" + ] + }, + { + "cell_type": "markdown", + "id": "d59f4a09", + "metadata": {}, + "source": [ + "## Clean up \n", + "\n", + "Let's shutdown the application and clean up the resources we created." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0e8131d", + "metadata": {}, + "outputs": [], + "source": [ + "serve.shutdown()\n", + "!rm -rf /mnt/cluster_storage/model.pt" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/beginner_source/ray_serve_tutorial.rst b/beginner_source/ray_serve_tutorial.rst new file mode 100644 index 00000000000..44aeacb6ace --- /dev/null +++ b/beginner_source/ray_serve_tutorial.rst @@ -0,0 +1,529 @@ +Serving PyTorch Models at Scale with Ray Serve +============================================== + +**Author:** `Ricardo Decal `_ + +This tutorial introduces you to Ray Serve, a scalable framework for serving machine learning models in production. +Ray Serve is part of `Ray Distributed `_, which is a PyTorch Foundation project. + +In this tutorial, you'll learn how to: + +1. Deploy a PyTorch model as a web service using Ray Serve +2. Scale your model serving with multiple replicas +3. Use advanced features like autoscaling and request batching +4. Compose multiple deployments into a complete ML application +5. 
Handle concurrent requests efficiently + +What is Ray Serve? +------------------ + +`Ray Serve `_ is a framework for building and deploying scalable ML applications. +It provides a simple Python API for turning your PyTorch models into production-ready services that can handle thousands +of concurrent requests. + +Ray Serve is designed for: + +- **Flexible ML Applications**: Build end-to-end ML pipelines with a programmable Python API +- **Scalable Serving**: Easily scale from a laptop to a multi-node GPU cluster +- **Production Features**: Dynamic request batching, response streaming, and autoscaling out of the box + +Key Features +~~~~~~~~~~~~ + +- **Response Streaming**: Stream results back to clients for long-running inference +- **Dynamic Request Batching**: Automatically batch requests to maximize GPU utilization +- **Multi-GPU Serving**: Distribute models across multiple GPUs and nodes +- **Fractional GPU Usage**: Efficiently use GPU resources by allocating fractions to replicas +- **Model Composition**: Chain multiple models together into complex pipelines + +Prerequisites +------------- + +This tutorial assumes basic familiarity with PyTorch and Python. You'll need to install Ray Serve: + +.. code-block:: bash + + pip install "ray[serve]" torch torchvision + +Setup +----- + +Let's start by importing the necessary libraries: + +.. code-block:: python + + import asyncio + import json + import time + from typing import Any, Dict, List + + import aiohttp + import numpy as np + import requests + import torch + import torch.nn as nn + from ray import serve + from starlette.requests import Request + from torchvision import transforms + +Part 1: Deploy a Simple PyTorch Model +-------------------------------------- + +We'll start with a simple convolutional neural network for MNIST digit classification. +First, let's define our model architecture: + +.. code-block:: python + + class MNISTNet(nn.Module): + """Simple CNN for MNIST digit classification""" + def __init__(self): + super(MNISTNet, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = nn.functional.relu(x) + x = self.conv2(x) + x = nn.functional.relu(x) + x = nn.functional.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = nn.functional.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + return nn.functional.log_softmax(x, dim=1) + +Creating a Ray Serve Deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To deploy this model with Ray Serve, we wrap it in a class and add the ``@serve.deployment`` decorator. +The deployment handles incoming HTTP requests and runs inference: + +.. code-block:: python + + @serve.deployment + class MNISTClassifier: + def __init__(self, model_path: str = None): + """Initialize the model. 
If model_path is provided, load weights.""" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + + if model_path: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) + + self.model.eval() + + async def __call__(self, request: Request) -> Dict[str, Any]: + """Handle incoming HTTP requests""" + # Parse the JSON request body + data = await request.json() + batch = json.loads(data) + + # Run inference + return await self.predict(batch) + + async def predict(self, batch: Dict[str, np.ndarray]) -> Dict[str, Any]: + """Run inference on a batch of images""" + # Convert numpy array to tensor + images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) + + # Run inference + with torch.no_grad(): + logits = self.model(images) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + return { + "predicted_label": predictions.tolist(), + "logits": logits.cpu().numpy().tolist() + } + +Running the Deployment +~~~~~~~~~~~~~~~~~~~~~~ + +Now let's deploy and run our model: + +.. code-block:: python + + # Create the deployment (but don't run it yet) + mnist_app = MNISTClassifier.bind() + + # Start the Ray Serve application + handle = serve.run(mnist_app, name="mnist_classifier") + +Testing the Deployment +~~~~~~~~~~~~~~~~~~~~~~ + +Let's test our deployment with some random data: + +.. code-block:: python + + # Create a batch of random images (simulating MNIST format: 28x28 grayscale) + images = np.random.rand(2, 1, 28, 28).tolist() + json_request = json.dumps({"image": images}) + + # Send HTTP request + response = requests.post("http://localhost:8000/", json=json_request) + print(f"Predictions: {response.json()['predicted_label']}") + +Part 2: Scaling with Multiple Replicas +--------------------------------------- + +One of Ray Serve's key features is the ability to scale your deployment across multiple replicas. +Each replica is an independent instance of your model that can handle requests in parallel. + +Configuring Replicas +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + # Create deployment with 4 replicas + mnist_app = MNISTClassifier.options( + num_replicas=4, + ray_actor_options={"num_gpus": 0.25} # Each replica uses 1/4 of a GPU + ).bind() + + # Update the running deployment + handle = serve.run(mnist_app, name="mnist_classifier") + +This configuration creates 4 replicas, each using 25% of a GPU. This allows you to serve 4 models +on a single GPU, maximizing resource utilization for small models. + +Part 3: Autoscaling +------------------- + +Ray Serve can automatically scale the number of replicas based on incoming traffic. +This is useful for handling variable workloads without over-provisioning resources. + +Configuring Autoscaling +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + mnist_app = MNISTClassifier.options( + autoscaling_config={ + "target_ongoing_requests": 10, # Target 10 requests per replica + "min_replicas": 0, # Scale down to 0 when idle + "max_replicas": 10, # Scale up to 10 replicas max + "upscale_delay_s": 5, # Wait 5s before scaling up + "downscale_delay_s": 30, # Wait 30s before scaling down + }, + ray_actor_options={"num_gpus": 0.1} + ).bind() + + handle = serve.run(mnist_app, name="mnist_classifier") + +With this configuration, Ray Serve will: + +- Start with 0 replicas (no resources used when idle) +- Scale up when requests arrive (targeting 10 concurrent requests per replica) +- Scale down after 30 seconds of low traffic + +Testing Autoscaling with Concurrent Requests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To see autoscaling in action, we need to send many concurrent requests. Using ``aiohttp``, +we can fire requests asynchronously: + +.. code-block:: python + + async def send_request(session, url, data): + """Send a single async HTTP request""" + async with session.post(url, json=data) as response: + return await response.json() + + async def send_concurrent_requests(num_requests=100): + """Send many requests concurrently""" + url = "http://localhost:8000/" + + # Create sample data + images = np.random.rand(10, 1, 28, 28).tolist() + json_request = json.dumps({"image": images}) + + # Send all requests concurrently + async with aiohttp.ClientSession() as session: + tasks = [ + send_request(session, url, json_request) + for _ in range(num_requests) + ] + responses = await asyncio.gather(*tasks) + + return responses + + # Run the concurrent requests + start_time = time.time() + responses = asyncio.run(send_concurrent_requests(100)) + elapsed = time.time() - start_time + + print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") + print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") + +This approach allows Ray Serve to buffer and batch process the requests efficiently, +automatically scaling replicas as needed. + +Part 4: Dynamic Request Batching +--------------------------------- + +Dynamic request batching is a powerful optimization that groups multiple incoming requests +and processes them together, maximizing GPU utilization. + +Implementing Batching +~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + @serve.deployment + class BatchedMNISTClassifier: + def __init__(self, model_path: str = None): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + + if model_path: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) + + self.model.eval() + + @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) + async def predict_batch(self, images: List[np.ndarray]) -> List[Dict[str, Any]]: + """Process a batch of images together""" + print(f"Processing batch of size: {len(images)}") + + # Stack all images into a single tensor + batch_tensor = torch.tensor( + np.stack(images), + dtype=torch.float32 + ).to(self.device) + + # Run inference on the entire batch + with torch.no_grad(): + logits = self.model(batch_tensor) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + # Return individual results + return [ + { + "predicted_label": int(pred), + "logits": logit.cpu().numpy().tolist() + } + for pred, logit in zip(predictions, logits) + ] + + async def __call__(self, request: Request) -> Dict[str, Any]: + data = await request.json() + batch = json.loads(data) + + # Extract single image and pass to batch handler + image = np.array(batch["image"]) + result = await self.predict_batch(image) + + return result + +The ``@serve.batch`` decorator automatically: + +- Collects up to ``max_batch_size`` requests +- Waits up to ``batch_wait_timeout_s`` seconds for more requests +- Processes them together in a single forward pass + +This can dramatically improve throughput, especially for GPU inference. + +Part 5: Composing Multiple Deployments +--------------------------------------- + +Real-world ML applications often involve multiple steps: preprocessing, inference, and postprocessing. +Ray Serve makes it easy to compose multiple deployments into a pipeline. + +Creating a Preprocessing Deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + @serve.deployment + class ImagePreprocessor: + def __init__(self): + """Initialize preprocessing transforms""" + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and std + ]) + + async def preprocess(self, images: List[np.ndarray]) -> np.ndarray: + """Preprocess a batch of images""" + processed = [] + for img in images: + # Convert to PIL-compatible format if needed + if img.dtype != np.uint8: + img = (img * 255).astype(np.uint8) + + # Apply transforms + tensor = self.transform(img) + processed.append(tensor.numpy()) + + return np.stack(processed) + +Creating an Ingress Deployment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ingress deployment orchestrates the pipeline, routing requests through preprocessing +and then to the model: + +.. code-block:: python + + @serve.deployment + class MLPipeline: + def __init__(self, preprocessor, classifier): + """Initialize with handles to other deployments""" + self.preprocessor = preprocessor + self.classifier = classifier + + async def __call__(self, request: Request) -> Dict[str, Any]: + """Handle end-to-end inference""" + # Parse request + data = await request.json() + batch = json.loads(data) + images = batch["image"] + + # Step 1: Preprocess + processed_images = await self.preprocessor.preprocess.remote(images) + + # Step 2: Run inference + result = await self.classifier.predict.remote({ + "image": processed_images.tolist() + }) + + return result + +Deploying the Pipeline +~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + # Build the application graph + preprocessor = ImagePreprocessor.bind() + classifier = MNISTClassifier.options( + num_replicas=2, + ray_actor_options={"num_gpus": 0.5} + ).bind() + + pipeline = MLPipeline.bind( + preprocessor=preprocessor, + classifier=classifier + ) + + # Deploy the entire pipeline + handle = serve.run(pipeline, name="ml_pipeline") + +Now when you send a request to the pipeline, it automatically flows through preprocessing +and inference: + +.. code-block:: python + + # Send request to the pipeline + images = [np.random.rand(28, 28) for _ in range(5)] + json_request = json.dumps({"image": images}) + + response = requests.post("http://localhost:8000/", json=json_request) + print(response.json()) + +Part 6: Integration with FastAPI +--------------------------------- + +Ray Serve integrates seamlessly with FastAPI, giving you access to: + +- HTTP routing and path parameters +- Request validation with Pydantic models +- Automatic OpenAPI documentation + +.. code-block:: python + + from fastapi import FastAPI + from pydantic import BaseModel + + app = FastAPI() + + class PredictionRequest(BaseModel): + image: List[List[List[float]]] # Batch of images + + class PredictionResponse(BaseModel): + predicted_label: List[int] + + @serve.deployment + @serve.ingress(app) + class FastAPIMNISTService: + def __init__(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + self.model.eval() + + @app.post("/predict", response_model=PredictionResponse) + async def predict(self, request: PredictionRequest): + """Predict digit from image""" + images = torch.tensor( + request.image, + dtype=torch.float32 + ).to(self.device) + + with torch.no_grad(): + logits = self.model(images) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + return PredictionResponse(predicted_label=predictions.tolist()) + + @app.get("/health") + async def health(self): + """Health check endpoint""" + return {"status": "healthy"} + + # Deploy with FastAPI + fastapi_app = FastAPIMNISTService.bind() + handle = serve.run(fastapi_app, name="fastapi_mnist") + +After deploying, you can: + +- Visit ``http://localhost:8000/docs`` for interactive API documentation +- Use the ``/predict`` endpoint for inference +- Use the ``/health`` endpoint for health checks + +Cleanup +------- + +When you're done, shut down the Ray Serve application: + +.. code-block:: python + + serve.shutdown() + +Summary +------- + +In this tutorial, you learned how to: + +- Deploy PyTorch models as web services with Ray Serve +- Scale deployments with multiple replicas and fractional GPU usage +- Configure autoscaling to handle variable workloads +- Use dynamic request batching to maximize throughput +- Compose multiple deployments into ML pipelines +- Send concurrent requests efficiently with async HTTP +- Integrate with FastAPI for production-ready APIs + +Ray Serve provides a powerful, flexible framework for serving PyTorch models at scale. +Its Python-first API makes it easy to go from a trained model to a production service. 
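+
+To recap how these pieces fit together, here is a minimal end-to-end sketch. It assumes Ray Serve is
+installed and reuses the ``MNISTNet`` module defined in Part 1; the deployment name and payload below
+are illustrative only.
+
+.. code-block:: python
+
+    import json
+
+    import numpy as np
+    import requests
+    import torch
+    from ray import serve
+    from starlette.requests import Request
+
+
+    @serve.deployment
+    class MiniMNISTService:
+        def __init__(self):
+            # MNISTNet is defined in Part 1; its weights are randomly initialized here.
+            self.model = MNISTNet()
+            self.model.eval()
+
+        async def __call__(self, request: Request) -> dict:
+            # Same request format used throughout this tutorial:
+            # the body is a JSON-encoded string containing an "image" batch.
+            batch = json.loads(await request.json())
+            images = torch.tensor(batch["image"], dtype=torch.float32)
+            with torch.no_grad():
+                predictions = torch.argmax(self.model(images), dim=1)
+            return {"predicted_label": predictions.tolist()}
+
+
+    # Deploy, query, and shut down the service.
+    serve.run(MiniMNISTService.bind(), name="mini_mnist")
+
+    payload = json.dumps({"image": np.random.rand(2, 1, 28, 28).tolist()})
+    print(requests.post("http://localhost:8000/", json=payload).json()["predicted_label"])
+
+    serve.shutdown()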
+ +Next Steps +---------- + +- Read the `Ray Serve documentation `_ for more advanced features +- Learn about `Ray `_, the distributed computing framework that powers Ray Serve +- Explore `response streaming `_ for long-running inference +- Try `model multiplexing `_ to serve multiple model versions + +Additional Resources +-------------------- + +- `Ray Serve GitHub `_ +- `Ray Documentation `_ +- `PyTorch Projects - Ray `_ From b45b2ccb168217ba26705f967e11344f7fa048f2 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Mon, 17 Nov 2025 12:49:46 -0800 Subject: [PATCH 02/18] progress --- beginner_source/ray_serve_tutorial.rst | 60 +++++++++++--------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/beginner_source/ray_serve_tutorial.rst b/beginner_source/ray_serve_tutorial.rst index 44aeacb6ace..bfc074b971e 100644 --- a/beginner_source/ray_serve_tutorial.rst +++ b/beginner_source/ray_serve_tutorial.rst @@ -1,40 +1,37 @@ Serving PyTorch Models at Scale with Ray Serve ============================================== - **Author:** `Ricardo Decal `_ -This tutorial introduces you to Ray Serve, a scalable framework for serving machine learning models in production. -Ray Serve is part of `Ray Distributed `_, which is a PyTorch Foundation project. +This tutorial introduces `Ray Serve `_, a scalable framework for serving machine learning models in production. Ray Serve is part of `Ray Distributed `_, an open-source PyTorch Foundation project. -In this tutorial, you'll learn how to: +Introduction +------------ -1. Deploy a PyTorch model as a web service using Ray Serve -2. Scale your model serving with multiple replicas -3. Use advanced features like autoscaling and request batching -4. Compose multiple deployments into a complete ML application -5. Handle concurrent requests efficiently +`Ray Serve `_ is an online serving library that helps you deploy machine learning models in production. -What is Ray Serve? ------------------- +Production-ready features +************************* -`Ray Serve `_ is a framework for building and deploying scalable ML applications. -It provides a simple Python API for turning your PyTorch models into production-ready services that can handle thousands -of concurrent requests. +Ray Serve provides the following production-ready features: -Ray Serve is designed for: +- Handle thousands of concurrent requests efficiently with dynamic request batching +- Autoscale your endpoint to handle variable traffic +- Buffer requests when the endpoint is busy +- Compose multiple models along with business logic into a complete ML application +- Gracefully heal the deployment when nodes are lost +- Handle multi-node/multi-GPU serving +- Flexibly allocate heterogenous compute resources and fractional GPUs +- `LLM-specific features `_ such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more. -- **Flexible ML Applications**: Build end-to-end ML pipelines with a programmable Python API -- **Scalable Serving**: Easily scale from a laptop to a multi-node GPU cluster -- **Production Features**: Dynamic request batching, response streaming, and autoscaling out of the box +Ray Serve also has LLM-specific features such as response streaming, model multiplexing, dynamic request batching, and more. 
-Key Features -~~~~~~~~~~~~ +In this tutorial, you'll learn how to: -- **Response Streaming**: Stream results back to clients for long-running inference -- **Dynamic Request Batching**: Automatically batch requests to maximize GPU utilization -- **Multi-GPU Serving**: Distribute models across multiple GPUs and nodes -- **Fractional GPU Usage**: Efficiently use GPU resources by allocating fractions to replicas -- **Model Composition**: Chain multiple models together into complex pipelines +1. Deploy a PyTorch model as a web service using Ray Serve +2. Scale the deployment with multiple replicas +3. Use advanced features like autoscaling and request batching +4. Compose multiple deployments into a complete ML application +5. Handle concurrent requests efficiently Prerequisites ------------- @@ -516,14 +513,5 @@ Its Python-first API makes it easy to go from a trained model to a production se Next Steps ---------- -- Read the `Ray Serve documentation `_ for more advanced features -- Learn about `Ray `_, the distributed computing framework that powers Ray Serve -- Explore `response streaming `_ for long-running inference -- Try `model multiplexing `_ to serve multiple model versions - -Additional Resources --------------------- - -- `Ray Serve GitHub `_ -- `Ray Documentation `_ -- `PyTorch Projects - Ray `_ +- For more information on Ray Serve, read the `Ray Serve documentation `_. +- Learn about `Ray Distributed `_, the distributed computing framework that powers Ray Serve. From 0c1c18ff12701f00d1a159045232c2484cfdf4cb Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 25 Nov 2025 10:45:31 -0800 Subject: [PATCH 03/18] formatting --- beginner_source/ray_serve_tutorial.rst | 66 ++++++++++++-------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/beginner_source/ray_serve_tutorial.rst b/beginner_source/ray_serve_tutorial.rst index bfc074b971e..fcf1091e5e1 100644 --- a/beginner_source/ray_serve_tutorial.rst +++ b/beginner_source/ray_serve_tutorial.rst @@ -1,22 +1,18 @@ -Serving PyTorch Models at Scale with Ray Serve +Serving PyTorch models at scale with Ray Serve ============================================== **Author:** `Ricardo Decal `_ This tutorial introduces `Ray Serve `_, a scalable framework for serving machine learning models in production. Ray Serve is part of `Ray Distributed `_, an open-source PyTorch Foundation project. -Introduction ------------- - -`Ray Serve `_ is an online serving library that helps you deploy machine learning models in production. Production-ready features -************************* +~~~~~~~~~~~~~~~~~~~~~~~~~ Ray Serve provides the following production-ready features: - Handle thousands of concurrent requests efficiently with dynamic request batching -- Autoscale your endpoint to handle variable traffic -- Buffer requests when the endpoint is busy +- Autoscale your endpoints in response to variable traffic +- Buffer incoming requests when the endpoints are busy - Compose multiple models along with business logic into a complete ML application - Gracefully heal the deployment when nodes are lost - Handle multi-node/multi-GPU serving @@ -45,14 +41,14 @@ This tutorial assumes basic familiarity with PyTorch and Python. You'll need to Setup ----- -Let's start by importing the necessary libraries: +Start by importing the necessary libraries: .. 
code-block:: python import asyncio import json import time - from typing import Any, Dict, List + from typing import Any import aiohttp import numpy as np @@ -63,7 +59,7 @@ Let's start by importing the necessary libraries: from starlette.requests import Request from torchvision import transforms -Part 1: Deploy a Simple PyTorch Model +Part 1: Deploy a simple PyTorch model -------------------------------------- We'll start with a simple convolutional neural network for MNIST digit classification. @@ -96,7 +92,7 @@ First, let's define our model architecture: x = self.fc2(x) return nn.functional.log_softmax(x, dim=1) -Creating a Ray Serve Deployment +Creating a Ray Serve deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To deploy this model with Ray Serve, we wrap it in a class and add the ``@serve.deployment`` decorator. @@ -116,7 +112,7 @@ The deployment handles incoming HTTP requests and runs inference: self.model.eval() - async def __call__(self, request: Request) -> Dict[str, Any]: + async def __call__(self, request: Request) -> dict[str, Any]: """Handle incoming HTTP requests""" # Parse the JSON request body data = await request.json() @@ -125,7 +121,7 @@ The deployment handles incoming HTTP requests and runs inference: # Run inference return await self.predict(batch) - async def predict(self, batch: Dict[str, np.ndarray]) -> Dict[str, Any]: + async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]: """Run inference on a batch of images""" # Convert numpy array to tensor images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) @@ -140,7 +136,7 @@ The deployment handles incoming HTTP requests and runs inference: "logits": logits.cpu().numpy().tolist() } -Running the Deployment +Running the deployment ~~~~~~~~~~~~~~~~~~~~~~ Now let's deploy and run our model: @@ -153,7 +149,7 @@ Now let's deploy and run our model: # Start the Ray Serve application handle = serve.run(mnist_app, name="mnist_classifier") -Testing the Deployment +Testing the deployment ~~~~~~~~~~~~~~~~~~~~~~ Let's test our deployment with some random data: @@ -168,13 +164,13 @@ Let's test our deployment with some random data: response = requests.post("http://localhost:8000/", json=json_request) print(f"Predictions: {response.json()['predicted_label']}") -Part 2: Scaling with Multiple Replicas +Part 2: Scaling with multiple replicas --------------------------------------- One of Ray Serve's key features is the ability to scale your deployment across multiple replicas. Each replica is an independent instance of your model that can handle requests in parallel. -Configuring Replicas +Configuring replicas ~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -197,7 +193,7 @@ Part 3: Autoscaling Ray Serve can automatically scale the number of replicas based on incoming traffic. This is useful for handling variable workloads without over-provisioning resources. -Configuring Autoscaling +Configuring autoscaling ~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -221,7 +217,7 @@ With this configuration, Ray Serve will: - Scale up when requests arrive (targeting 10 concurrent requests per replica) - Scale down after 30 seconds of low traffic -Testing Autoscaling with Concurrent Requests +Testing autoscaling with concurrent requests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To see autoscaling in action, we need to send many concurrent requests. 
Using ``aiohttp``, @@ -263,13 +259,13 @@ we can fire requests asynchronously: This approach allows Ray Serve to buffer and batch process the requests efficiently, automatically scaling replicas as needed. -Part 4: Dynamic Request Batching +Part 4: Dynamic request batching --------------------------------- Dynamic request batching is a powerful optimization that groups multiple incoming requests and processes them together, maximizing GPU utilization. -Implementing Batching +Implementing batching ~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -286,7 +282,7 @@ Implementing Batching self.model.eval() @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) - async def predict_batch(self, images: List[np.ndarray]) -> List[Dict[str, Any]]: + async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: """Process a batch of images together""" print(f"Processing batch of size: {len(images)}") @@ -310,7 +306,7 @@ Implementing Batching for pred, logit in zip(predictions, logits) ] - async def __call__(self, request: Request) -> Dict[str, Any]: + async def __call__(self, request: Request) -> dict[str, Any]: data = await request.json() batch = json.loads(data) @@ -328,13 +324,13 @@ The ``@serve.batch`` decorator automatically: This can dramatically improve throughput, especially for GPU inference. -Part 5: Composing Multiple Deployments +Part 5: Composing multiple deployments --------------------------------------- Real-world ML applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline. -Creating a Preprocessing Deployment +Creating a preprocessing deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -348,7 +344,7 @@ Creating a Preprocessing Deployment transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and std ]) - async def preprocess(self, images: List[np.ndarray]) -> np.ndarray: + async def preprocess(self, images: list[np.ndarray]) -> np.ndarray: """Preprocess a batch of images""" processed = [] for img in images: @@ -362,7 +358,7 @@ Creating a Preprocessing Deployment return np.stack(processed) -Creating an Ingress Deployment +Creating an ingress deployment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ingress deployment orchestrates the pipeline, routing requests through preprocessing @@ -377,7 +373,7 @@ and then to the model: self.preprocessor = preprocessor self.classifier = classifier - async def __call__(self, request: Request) -> Dict[str, Any]: + async def __call__(self, request: Request) -> dict[str, Any]: """Handle end-to-end inference""" # Parse request data = await request.json() @@ -394,7 +390,7 @@ and then to the model: return result -Deploying the Pipeline +Deploying the pipeline ~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -442,11 +438,11 @@ Ray Serve integrates seamlessly with FastAPI, giving you access to: app = FastAPI() - class PredictionRequest(BaseModel): - image: List[List[List[float]]] # Batch of images + class PredictionRequest(BaseModel): + image: list[list[list[float]]] # Batch of images - class PredictionResponse(BaseModel): - predicted_label: List[int] + class PredictionResponse(BaseModel): + predicted_label: list[int] @serve.deployment @serve.ingress(app) @@ -510,7 +506,7 @@ In this tutorial, you learned how to: Ray Serve provides a powerful, flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service. 
-Next Steps +Next steps ---------- - For more information on Ray Serve, read the `Ray Serve documentation `_. From 7f2e70a12e4392115c0fac65d01ca739888d6bc0 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 11:11:48 -0800 Subject: [PATCH 04/18] bump Ray version --- .ci/docker/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 086633cf043..d9e7b338cfd 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -32,7 +32,7 @@ bs4 awscliv2==2.1.1 flask spacy==3.4.1 -ray[tune]==2.7.2 +ray[tune]==2.52.1 tensorboard jinja2==3.1.3 pytorch-lightning From 2b7b56005a01a667632bb8b26b9c5fc2fe9e8a77 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 9 Dec 2025 11:44:05 -0800 Subject: [PATCH 05/18] Create paired tutorial docs --- beginner_source/5_Intro_Serve.ipynb | 881 ----------------------- beginner_source/ray_serve_tutorial.ipynb | 710 ++++++++++++++++++ beginner_source/ray_serve_tutorial.md | 501 +++++++++++++ beginner_source/ray_serve_tutorial.py | 495 +++++++++++++ beginner_source/ray_serve_tutorial.rst | 513 ------------- 5 files changed, 1706 insertions(+), 1394 deletions(-) delete mode 100644 beginner_source/5_Intro_Serve.ipynb create mode 100644 beginner_source/ray_serve_tutorial.ipynb create mode 100644 beginner_source/ray_serve_tutorial.md create mode 100644 beginner_source/ray_serve_tutorial.py delete mode 100644 beginner_source/ray_serve_tutorial.rst diff --git a/beginner_source/5_Intro_Serve.ipynb b/beginner_source/5_Intro_Serve.ipynb deleted file mode 100644 index e7fa8cb0988..00000000000 --- a/beginner_source/5_Intro_Serve.ipynb +++ /dev/null @@ -1,881 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "682e224e-1bb9-470c-b363-386ede0785a4", - "metadata": {}, - "source": [ - "# Intro to Ray Serve\n", - "\n", - "This notebook will introduce you to Ray Serve, a framework for building and deploying scalable ML applications.\n", - "\n", - "
\n", - " \n", - "Here is the roadmap for this notebook:\n", - "\n", - "
    \n", - "
  • Part 1: Overview of Ray Serve
  • \n", - "
  • Part 2: Implement an MNISTClassifier service
  • \n", - "
  • Part 3: Advanced features of Ray Serve
  • \n", - "
  • Part 4: Ray Serve in Production
  • \n", - "
\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "id": "1060aea0", - "metadata": {}, - "source": [ - "## Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "099b7710", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "from torchvision import transforms\n", - "\n", - "import json\n", - "import numpy as np\n", - "import ray\n", - "import requests\n", - "import torch\n", - "from ray import serve\n", - "from matplotlib import pyplot as plt\n", - "from fastapi import FastAPI\n", - "from starlette.requests import Request" - ] - }, - { - "cell_type": "markdown", - "id": "7250cc03-e52c-4e30-a262-8d8e0a5a0837", - "metadata": {}, - "source": [ - "## 1. Overview of Ray Serve\n", - "\n", - "Serve is a framework for serving ML applications. \n", - "\n", - "Here is a high-level overview of the architecture of a Ray Serve Application.\n", - "\n", - "\n", - "\n", - "An Application is a collection of one or more Deployments that are deployed together.\n", - "\n", - "### Deployments\n", - "\n", - "`Deployment` is the fundamental developer-facing element of serve.\n", - "\n", - "\n", - "\n", - "Each deployment can have multiple replicas. \n", - "\n", - "A replica is implemented as a Ray actor with a queue to process incoming requests.\n", - "\n", - "Each replica can be configured with a set of compute resources. " - ] - }, - { - "cell_type": "markdown", - "id": "6380b141", - "metadata": {}, - "source": [ - "### When to use Ray Serve?\n", - "\n", - "Ray Serve is designed to be used in the following scenarios:\n", - "- Build end-to-end ML applications with a flexible and programmable python API\n", - "- Flexibly scale up and down your compute resources to meet the demand of your application\n", - "- Easy to develop on a local machine, and scale to a multi-node GPU cluster\n", - "\n", - "#### Key Ray Serve Features\n", - "Ray Serve provides the following key features and optimizations:\n", - "- [response streaming](https://docs.ray.io/en/latest/serve/tutorials/streaming.html)\n", - "- [dynamic request batching](https://docs.ray.io/en/latest/serve/advanced-guides/dyn-req-batch.html)\n", - "- [multi-node/multi-GPU serving](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html)\n", - "- [model multiplexing](https://docs.ray.io/en/latest/serve/model-multiplexing.html)\n", - "- [fractional compute resource usage](https://docs.ray.io/en/latest/serve/configure-serve-deployment.html)" - ] - }, - { - "cell_type": "markdown", - "id": "a43da1a6", - "metadata": {}, - "source": [ - "## 2. Implement an MNISTClassifier service\n", - "\n", - "Let’s jump right in and get a simple ML service up and running on Ray Serve. \n", - "\n", - "Recall the `MNISTClassifier` we built to perform batch inference on the `MNIST` dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14fb17a6-a71c-4a11-8ea8-b1b350a5fa1c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class OfflineMNISTClassifier:\n", - " def __init__(self, local_path: str):\n", - " self.model = torch.jit.load(local_path)\n", - " self.model.to(\"cuda\")\n", - " self.model.eval()\n", - "\n", - " def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, np.ndarray]:\n", - " return self.predict(batch)\n", - " \n", - " def predict(self, batch: dict[str, np.ndarray]) -> dict[str, np.ndarray]:\n", - " images = torch.tensor(batch[\"image\"]).float().to(\"cuda\")\n", - "\n", - " with torch.no_grad():\n", - " logits = self.model(images).cpu().numpy()\n", - "\n", - " batch[\"predicted_label\"] = np.argmax(logits, axis=1)\n", - " return batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48d148b8", - "metadata": {}, - "outputs": [], - "source": [ - "# We download the model from s3 to the EFS storage\n", - "!aws s3 cp s3://anyscale-public-materials/ray-ai-libraries/mnist/model/model.pt /mnt/cluster_storage/model.pt" - ] - }, - { - "cell_type": "markdown", - "id": "e1a79961", - "metadata": {}, - "source": [ - "Here is how we can use the `OfflineMNISTClassifier` to perform batch inference on a dataset of random images." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41b16400", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a dataset of random images\n", - "ds = ray.data.from_items([{\"image\": np.random.rand(1, 28, 28)} for _ in range(100)])\n", - "\n", - "# Map the OfflineMNISTClassifier to the dataset\n", - "ds = ds.map_batches(\n", - " OfflineMNISTClassifier,\n", - " fn_constructor_kwargs={\"local_path\": \"/mnt/cluster_storage/model.pt\"},\n", - " concurrency=1,\n", - " num_gpus=1,\n", - " batch_size=10\n", - ")\n", - "\n", - "# Take a look at the first 10 predictions\n", - "ds.take_batch(10)[\"predicted_label\"]" - ] - }, - { - "cell_type": "markdown", - "id": "fbb1a687", - "metadata": {}, - "source": [ - "Now, if want to migrate to an online inference setting, we can transform this into a Ray Serve Deployment by applying the `@serve.deployment` decorator \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c68888dd", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment() # this is the decorator to add\n", - "class OnlineMNISTClassifier:\n", - " def __init__(self, local_path: str):\n", - " self.model = torch.jit.load(local_path)\n", - " self.model.to(\"cuda\")\n", - " self.model.eval()\n", - "\n", - " async def __call__(self, request: Request) -> dict[str, Any]: # __call__ now takes a Starlette Request object\n", - " batch = json.loads(await request.json()) # we will need to parse the JSON body of the request\n", - " return await self.predict(batch)\n", - " \n", - " async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, np.ndarray]:\n", - " # same code as OfflineMNISTClassifier.predict except we added async to the method\n", - " images = torch.tensor(batch[\"image\"]).float().to(\"cuda\")\n", - "\n", - " with torch.no_grad():\n", - " logits = self.model(images).cpu().numpy()\n", - "\n", - " batch[\"predicted_label\"] = np.argmax(logits, axis=1)\n", - " return batch" - ] - }, - { - "cell_type": "markdown", - "id": "2cf85ff1", - "metadata": {}, - "source": [ - "We can now instantiate the `OnlineMNISTClassifier` as a Ray Serve Application using `.bind`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df46ddd7", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_deployment = OnlineMNISTClassifier.options(\n", - " num_replicas=1,\n", - " ray_actor_options={\"num_gpus\": 1},\n", - ")\n", - "\n", - "mnist_app = mnist_deployment.bind(local_path=\"/mnt/cluster_storage/model.pt\")" - ] - }, - { - "cell_type": "markdown", - "id": "098e8ac4", - "metadata": {}, - "source": [ - "
\n", - "\n", - "**Note:** `.bind` is a method that takes in the arguments to pass to the Deployment constructor.\n", - "\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "id": "e3e70529", - "metadata": {}, - "source": [ - "We can then run the application " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e96056cd", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_deployment_handle = serve.run(mnist_app, name='mnist_classifier', blocking=False)" - ] - }, - { - "cell_type": "markdown", - "id": "5f4a0cdb-822a-4439-aeab-9916dd8d059c", - "metadata": {}, - "source": [ - "We can test it as an HTTP endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c0a80e9-c26f-48d2-8985-ef4eab4dc580", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "images = np.random.rand(2, 1, 28, 28).tolist()\n", - "json_request = json.dumps({\"image\": images})\n", - "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", - "response.json()[\"predicted_label\"]" - ] - }, - { - "cell_type": "markdown", - "id": "7cd2cb01", - "metadata": {}, - "source": [ - "We can also test it as a gRPC endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "342928ea", - "metadata": {}, - "outputs": [], - "source": [ - "batch = {\"image\": np.random.rand(10, 1, 28, 28)}\n", - "response = await mnist_deployment_handle.predict.remote(batch)\n", - "response[\"predicted_label\"]" - ] - }, - { - "cell_type": "markdown", - "id": "4e170084", - "metadata": {}, - "source": [ - "## 3. Advanced features of Ray Serve" - ] - }, - { - "cell_type": "markdown", - "id": "da2b22a2", - "metadata": {}, - "source": [ - "### Using fractions of a GPU\n", - "\n", - "With Ray we can specify fractional compute resources for each deployment's replica. \n", - "\n", - "This is useful to help us fully utilize a GPU especially when running small models like our `MNISTClassifier` model.\n", - "\n", - "Here is how to specify only 10% of a GPU's compute resources for our `MNISTClassifier` model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "230f9ff2", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_app = OnlineMNISTClassifier.options(\n", - " num_replicas=4, # we can scale to up to 10 replicas on a single GPU\n", - " ray_actor_options={\"num_gpus\": 0.1}, \n", - ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" - ] - }, - { - "cell_type": "markdown", - "id": "b35a8d83", - "metadata": {}, - "source": [ - "Next we update the running application by running serve.run with the new options." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e9ad6fa", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_deployment_handle = serve.run(mnist_app, name='mnist_classifier', blocking=False)" - ] - }, - { - "cell_type": "markdown", - "id": "b196a535", - "metadata": {}, - "source": [ - "We can test the new application by sending a sample request." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5aad97c", - "metadata": {}, - "outputs": [], - "source": [ - "images = np.random.rand(2, 1, 28, 28).tolist()\n", - "json_request = json.dumps({\"image\": images})\n", - "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", - "response.json()[\"predicted_label\"]" - ] - }, - { - "cell_type": "markdown", - "id": "05041234", - "metadata": {}, - "source": [ - "### Customizing autoscaling\n", - "\n", - "Ray Serve provides a simple way to autoscale the number of replicas in a deployment. It is primarily based on the target number of ongoing requests per replica.\n", - "\n", - "i.e. 
here is how we can set the autoscaling config for our `OnlineMNISTClassifier` deployment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e356f749", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_app = OnlineMNISTClassifier.options(\n", - " ray_actor_options={\"num_gpus\": 0.1}, \n", - " autoscaling_config={\n", - " \"target_ongoing_requests\": 10,\n", - " },\n", - ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" - ] - }, - { - "cell_type": "markdown", - "id": "1ae8a244", - "metadata": {}, - "source": [ - "We can also control more granularly the autoscaling logic by setting:\n", - "- the upscale and downscale delays\n", - "- the intervals at which the replica sends metrics reports about the current number of ongoing requests\n", - "- the look-back period used to evaluate the current number of ongoing requests\n", - "\n", - "Here is an example of how to set these options:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e5594d5", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_app = OnlineMNISTClassifier.options(\n", - " ray_actor_options={\"num_gpus\": 0.1}, \n", - " autoscaling_config={\n", - " \"target_ongoing_requests\": 10,\n", - " \"upscale_delay_s\": 10,\n", - " \"downscale_delay_s\": 10,\n", - " \"metrics_interval_s\": 10,\n", - " \"look_back_period_s\": 10, \n", - " },\n", - ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" - ] - }, - { - "cell_type": "markdown", - "id": "e8a643b4", - "metadata": {}, - "source": [ - "We can additionally control the minimum and maximum number of replicas that can be scaled up and down. \n", - "\n", - "We can even specify to start scaling up from 0 replicas." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebea6c15", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_app = OnlineMNISTClassifier.options(\n", - " ray_actor_options={\"num_gpus\": 0.1}, \n", - " autoscaling_config={\n", - " \"target_ongoing_requests\": 10,\n", - " \"initial_replicas\": 0, # scale up from 0 replicas\n", - " \"min_replicas\": 0,\n", - " \"max_replicas\": 10,\n", - " # extreme upscale speeds\n", - " \"upscale_delay_s\": 0,\n", - " \"metrics_interval_s\": 0.1,\n", - " \"look_back_period_s\": 0.1,\n", - " },\n", - ").bind(local_path=\"/mnt/cluster_storage/model.pt\")" - ] - }, - { - "cell_type": "markdown", - "id": "e040d6ac", - "metadata": {}, - "source": [ - "Let's run the application with the new autoscaling config." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbe684a4", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_deployment_handle = serve.run(mnist_app, name='mnist_classifier', blocking=False)" - ] - }, - { - "cell_type": "markdown", - "id": "75be6e25", - "metadata": {}, - "source": [ - "Looking at the Ray Serve dashboard, we can see we are currently at 0 replicas - i.e. no GPU resources are being used.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "761fd6a6", - "metadata": {}, - "source": [ - "We can send out a larger number of requests to the `OnlineMNISTClassifier` deployment to see the autoscaling in action." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a7a834f", - "metadata": {}, - "outputs": [], - "source": [ - "batch = {\"image\": np.random.rand(10, 1, 28, 28)}\n", - "[\n", - " mnist_deployment_handle.predict.remote(batch)\n", - " for _ in range(100)\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "91a4e5e9", - "metadata": {}, - "source": [ - "Looking at the Ray Serve dashboard, we can see that the number of replicas has scaled up to 10 as expected.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "c52225df", - "metadata": {}, - "source": [ - "Let's shutdown the service for now." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d53e06b", - "metadata": {}, - "outputs": [], - "source": [ - "serve.shutdown()" - ] - }, - { - "cell_type": "markdown", - "id": "2e2d1c58", - "metadata": {}, - "source": [ - "### Composing Deployments\n", - "\n", - "Ray Serve allows us to compose Deployments together to build more complex applications.\n", - "\n", - "Lets compose our `OnlineMNISTClassifier` with an `OnlineMNISTPreprocessor` deployment that performs the necessary transformations on the input data.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67670984", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment\n", - "class OnlineMNISTPreprocessor:\n", - " def __init__(self):\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.5,), (0.5,))\n", - " ])\n", - " \n", - " async def run(self, batch: dict[str, Any]) -> dict[str, Any]:\n", - " images = batch[\"image\"]\n", - " images = [self.transform(np.array(image, dtype=np.uint8)).cpu().numpy() for image in images]\n", - " return {\"image\": images}\n", - "\n", - "preprocessor_app = OnlineMNISTPreprocessor.bind()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b0dc24f", - "metadata": {}, - "outputs": [], - "source": [ - "preprocessor_handle = serve.run(preprocessor_app, name='mnist_preprocessor', blocking=False, route_prefix=\"/preprocess\")" - ] - }, - { - "cell_type": "markdown", - "id": "92daf899", - "metadata": {}, - "source": [ - "Let's load an image and pass it to the `ImageTransformDeployment`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "441a8762", - "metadata": {}, - "outputs": [], - "source": [ - "ds = ray.data.read_images(\"s3://anyscale-public-materials/ray-ai-libraries/mnist/50_per_index/\", include_paths=True)\n", - "image_batch = ds.take_batch(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1289797c", - "metadata": {}, - "outputs": [], - "source": [ - "# plot the first image using matplotlib\n", - "plt.imshow(image_batch[\"image\"][0], cmap=\"gray\")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94df4e63", - "metadata": {}, - "outputs": [], - "source": [ - "normalized_batch = await preprocessor_handle.run.remote(image_batch)\n", - "\n", - "for image in normalized_batch[\"image\"]:\n", - " assert image.shape == (1, 28, 28) # channel, height, width\n", - " assert image.min() >= -1 and image.max() <= 1 # normalized to [-1, 1]" - ] - }, - { - "cell_type": "markdown", - "id": "da2848fc", - "metadata": {}, - "source": [ - "We will proceed to shutdown the preprocessor application to prove it will be automatically created by the ingress.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0ac5957", - "metadata": {}, - "outputs": 
[], - "source": [ - "serve.shutdown()" - ] - }, - { - "cell_type": "markdown", - "id": "b0e44763", - "metadata": {}, - "source": [ - "Let's now build an ingress for our application that composes the `ImageTransformDeployment` and `OnlineMNISTClassifier`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88340028", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment\n", - "class ImageServiceIngress:\n", - " def __init__(self, preprocessor: OnlineMNISTPreprocessor, model: OnlineMNISTClassifier):\n", - " self.preprocessor = preprocessor\n", - " self.model = model\n", - "\n", - " async def __call__(self, request: Request):\n", - " batch = json.loads(await request.json())\n", - " response = await self.preprocessor.run.remote(batch)\n", - " return await self.model.predict.remote(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "affcac11", - "metadata": {}, - "outputs": [], - "source": [ - "image_classifier_ingress = ImageServiceIngress.bind(\n", - " preprocessor=OnlineMNISTPreprocessor.bind(),\n", - " model=OnlineMNISTClassifier.options(\n", - " num_replicas=1,\n", - " ray_actor_options={\"num_gpus\": 0.1},\n", - " ).bind(local_path=\"/mnt/cluster_storage/model.pt\"),\n", - ")\n", - "\n", - "handle = serve.run(image_classifier_ingress, name='image_classifier', blocking=False)" - ] - }, - { - "cell_type": "markdown", - "id": "aa81a51f", - "metadata": {}, - "source": [ - "Let's test the application by sending a sample HTTP request to our ingress endpoint.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d084ee7", - "metadata": {}, - "outputs": [], - "source": [ - "json_request = json.dumps({\"image\": image_batch[\"image\"].tolist()}) \n", - "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", - "response.json()[\"predicted_label\"]" - ] - }, - { - "cell_type": "markdown", - "id": "1fbe8773", - "metadata": {}, - "source": [ - "### Integrating with FastAPI\n", - "\n", - "Ray Serve can be integrated with FastAPI to provide:\n", - "- HTTP routing\n", - "- Pydantic model validation\n", - "- OpenAPI documentation\n", - "\n", - "To integrate a Deployment with FastAPI, we can use the `@serve.ingress` decorator to designate a FastAPI app as the entrypoint for HTTP requests to our Serve application." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d163431", - "metadata": {}, - "outputs": [], - "source": [ - "app = FastAPI()\n", - "\n", - "@serve.deployment\n", - "@serve.ingress(app)\n", - "class ImageServiceIngress:\n", - " def __init__(self, preprocessor: OnlineMNISTPreprocessor, model: OnlineMNISTClassifier):\n", - " self.preprocessor = preprocessor\n", - " self.model = model\n", - " \n", - " @app.post(\"/predict\")\n", - " async def predict(self, request: Request):\n", - " batch = json.loads(await request.json())\n", - " response = await self.preprocessor.run.remote(batch)\n", - " out = await self.model.predict.remote(response)\n", - " return {\"predicted_label\": out[\"predicted_label\"].tolist()}" - ] - }, - { - "cell_type": "markdown", - "id": "a3a31b87", - "metadata": {}, - "source": [ - "We now can build the application and run it." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0371807e", - "metadata": {}, - "outputs": [], - "source": [ - "image_classifier_ingress = ImageServiceIngress.bind(\n", - " preprocessor=OnlineMNISTPreprocessor.bind(),\n", - " model=OnlineMNISTClassifier.options(\n", - " num_replicas=1,\n", - " ray_actor_options={\"num_gpus\": 0.1},\n", - " ).bind(local_path=\"/mnt/cluster_storage/model.pt\"),\n", - ")\n", - "\n", - "handle = serve.run(image_classifier_ingress, name='image_classifier', blocking=False)" - ] - }, - { - "cell_type": "markdown", - "id": "012894c6", - "metadata": {}, - "source": [ - "After running the application, we can get test it as an HTTP endpoint programmatically." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e217336", - "metadata": {}, - "outputs": [], - "source": [ - "json_request = json.dumps({\"image\": image_batch[\"image\"].tolist()}) \n", - "response = requests.post(\"http://localhost:8000/predict\", json=json_request)\n", - "response.json()[\"predicted_label\"]" - ] - }, - { - "cell_type": "markdown", - "id": "287ff14a", - "metadata": {}, - "source": [ - "We can also visit the auto-generated FastAPI docs at http://localhost:8000/docs to get an interactive UI to test our endpoint." - ] - }, - { - "cell_type": "markdown", - "id": "5e2af689", - "metadata": {}, - "source": [ - "## 4. Ray Serve in Production\n", - "\n", - "1. Klaviyo built their model serving platform with Ray Serve. See [this article from Klaviyo Engineering](https://klaviyo.tech/how-klaviyo-built-a-robust-model-serving-platform-with-ray-serve-c02ec65788b3)\n", - "2. Samsara uses Ray Serve to bridge the gap of development to deployment of their models. See [this article from Samsara Engineering](https://www.samsara.com/blog/building-a-modern-machine-learning-platform-with-ray)" - ] - }, - { - "cell_type": "markdown", - "id": "d59f4a09", - "metadata": {}, - "source": [ - "## Clean up \n", - "\n", - "Let's shutdown the application and clean up the resources we created." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0e8131d", - "metadata": {}, - "outputs": [], - "source": [ - "serve.shutdown()\n", - "!rm -rf /mnt/cluster_storage/model.pt" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/beginner_source/ray_serve_tutorial.ipynb b/beginner_source/ray_serve_tutorial.ipynb new file mode 100644 index 00000000000..ee6610f4b7b --- /dev/null +++ b/beginner_source/ray_serve_tutorial.ipynb @@ -0,0 +1,710 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "682e224e-1bb9-470c-b363-386ede0785a4", + "metadata": {}, + "source": [ + "# Serve PyTorch models at scale with Ray Serve\n", + "\n", + "**Author:** [Ricardo Decal](https://github.com/crypdick)\n", + "\n", + "This tutorial introduces [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), a scalable framework for serving machine learning models in production. 
Ray Serve is part of [Ray Distributed](https://pytorch.org/projects/ray/), an open-source PyTorch Foundation project.\n", + "\n", + "## Production-ready features\n", + "\n", + "Ray Serve provides the following production-ready features:\n", + "\n", + "- Handle thousands of concurrent requests efficiently with dynamic request batching.\n", + "- Autoscale endpoints in response to variable traffic.\n", + "- Buffer incoming requests when the endpoints are busy.\n", + "- Compose multiple models along with business logic into a complete machine learning application.\n", + "- Gracefully heal the deployment when nodes are lost.\n", + "- Handle multi-node and multi-GPU serving.\n", + "- Flexibly allocate heterogeneous compute resources and fractional GPUs.\n", + "- Use [LLM-specific features](https://docs.ray.io/en/latest/serve/llm/index.html) such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more.\n", + "\n", + "
\n", + " \n", + "Roadmap for this notebook:\n", + "\n", + "
    \n", + "
  • Part 1: Deploy a simple PyTorch model.
  • \n", + "
  • Part 2: Scale with multiple replicas.
  • \n", + "
  • Part 3: Configure autoscaling.
  • \n", + "
  • Part 4: Use dynamic request batching.
  • \n", + "
  • Part 5: Compose multiple deployments.
  • \n", + "
  • Part 6: Integrate with FastAPI.
  • \n", + "
\n", + "
\n", + "\n", + "## Prerequisites\n", + "\n", + "This tutorial assumes basic familiarity with PyTorch and Python. Install Ray Serve:\n", + "\n", + "```bash\n", + "pip install \"ray[serve]\" torch torchvision\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "1060aea0", + "metadata": {}, + "source": [ + "## Set up environment\n", + "\n", + "Start by importing the required libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "099b7710", + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import json\n", + "import time\n", + "from typing import Any\n", + "\n", + "import aiohttp\n", + "import numpy as np\n", + "import requests\n", + "import torch\n", + "import torch.nn as nn\n", + "from ray import serve\n", + "from starlette.requests import Request\n", + "from torchvision import transforms" + ] + }, + { + "cell_type": "markdown", + "id": "7250cc03-e52c-4e30-a262-8d8e0a5a0837", + "metadata": {}, + "source": [ + "## Part 1: Deploy a simple PyTorch model\n", + "\n", + "Use a simple convolutional neural network for MNIST digit classification. First, define the model architecture." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14fb17a6-a71c-4a11-8ea8-b1b350a5fa1c", + "metadata": {}, + "outputs": [], + "source": [ + "class MNISTNet(nn.Module):\n", + " \"\"\"Convolutional neural network for MNIST digit classification.\"\"\"\n", + " def __init__(self):\n", + " super(MNISTNet, self).__init__()\n", + " self.conv1 = nn.Conv2d(1, 32, 3, 1)\n", + " self.conv2 = nn.Conv2d(32, 64, 3, 1)\n", + " self.dropout1 = nn.Dropout(0.25)\n", + " self.dropout2 = nn.Dropout(0.5)\n", + " self.fc1 = nn.Linear(9216, 128)\n", + " self.fc2 = nn.Linear(128, 10)\n", + "\n", + " def forward(self, x):\n", + " x = self.conv1(x)\n", + " x = nn.functional.relu(x)\n", + " x = self.conv2(x)\n", + " x = nn.functional.relu(x)\n", + " x = nn.functional.max_pool2d(x, 2)\n", + " x = self.dropout1(x)\n", + " x = torch.flatten(x, 1)\n", + " x = self.fc1(x)\n", + " x = nn.functional.relu(x)\n", + " x = self.dropout2(x)\n", + " x = self.fc2(x)\n", + " return nn.functional.log_softmax(x, dim=1)" + ] + }, + { + "cell_type": "markdown", + "id": "e1a79961", + "metadata": {}, + "source": [ + "### Create a Ray Serve deployment\n", + "\n", + "To deploy this model with Ray Serve, wrap it in a class and add the `@serve.deployment` decorator. The deployment handles incoming HTTP requests and runs inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c68888dd", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment\n", + "class MNISTClassifier:\n", + " def __init__(self, model_path: str = None):\n", + " \"\"\"Initialize the model and optionally load weights from ``model_path``.\"\"\"\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " self.model = MNISTNet().to(self.device)\n", + " \n", + " if model_path:\n", + " self.model.load_state_dict(torch.load(model_path, map_location=self.device))\n", + " \n", + " self.model.eval()\n", + "\n", + " async def __call__(self, request: Request) -> dict[str, Any]:\n", + " \"\"\"Handle an incoming HTTP request.\"\"\"\n", + " # Parse the JSON request body.\n", + " data = await request.json()\n", + " batch = json.loads(data)\n", + " \n", + " # Run inference.\n", + " return await self.predict(batch)\n", + " \n", + " async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]:\n", + " \"\"\"Run inference on a batch of images.\"\"\"\n", + " # Convert NumPy array to tensor.\n", + " images = torch.tensor(batch[\"image\"], dtype=torch.float32).to(self.device)\n", + " \n", + " # Run inference.\n", + " with torch.no_grad():\n", + " logits = self.model(images)\n", + " predictions = torch.argmax(logits, dim=1).cpu().numpy()\n", + " \n", + " return {\n", + " \"predicted_label\": predictions.tolist(),\n", + " \"logits\": logits.cpu().numpy().tolist()\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "2cf85ff1", + "metadata": {}, + "source": [ + "### Run the deployment\n", + "\n", + "Deploy and run the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df46ddd7", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the deployment (but do not run it yet).\n", + "mnist_app = MNISTClassifier.bind()\n", + "\n", + "# Start the Ray Serve application.\n", + "handle = serve.run(mnist_app, name=\"mnist_classifier\")" + ] + }, + { + "cell_type": "markdown", + "id": "098e8ac4", + "metadata": {}, + "source": [ + "### Test the deployment\n", + "\n", + "Test the deployment with some random data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c0a80e9-c26f-48d2-8985-ef4eab4dc580", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a batch of random images (MNIST format: 28x28 grayscale).\n", + "images = np.random.rand(2, 1, 28, 28).tolist()\n", + "json_request = json.dumps({\"image\": images})\n", + "\n", + "# Send HTTP request.\n", + "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", + "print(f\"Predictions: {response.json()['predicted_label']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7cd2cb01", + "metadata": {}, + "source": [ + "## Part 2: Scale with multiple replicas\n", + "\n", + "One of Ray Serve's key features is the ability to scale the deployment across multiple replicas. 
Each replica is an independent instance of the model that can handle requests in parallel.\n", + "\n", + "### Configure replicas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "230f9ff2", + "metadata": {}, + "outputs": [], + "source": [ + "# Create deployment with 4 replicas.\n", + "mnist_app = MNISTClassifier.options(\n", + " num_replicas=4,\n", + " ray_actor_options={\"num_gpus\": 0.25} # Each replica uses one quarter of a GPU.\n", + ").bind()\n", + "\n", + "# Update the running deployment.\n", + "handle = serve.run(mnist_app, name=\"mnist_classifier\")" + ] + }, + { + "cell_type": "markdown", + "id": "b35a8d83", + "metadata": {}, + "source": [ + "This configuration creates 4 replicas, each using 25% of a GPU. This configuration allows you to serve 4 models on a single GPU and maximize resource utilization for small models.\n", + "\n", + "## Part 3: Configure autoscaling\n", + "\n", + "Ray Serve can automatically scale the number of replicas based on incoming traffic. This behavior is useful for handling variable workloads without over-provisioning resources.\n", + "\n", + "### Configure autoscaling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e356f749", + "metadata": {}, + "outputs": [], + "source": [ + "mnist_app = MNISTClassifier.options(\n", + " autoscaling_config={\n", + " \"target_ongoing_requests\": 10, # Target 10 requests per replica.\n", + " \"min_replicas\": 0, # Scale down to 0 when idle.\n", + " \"max_replicas\": 10, # Scale up to 10 replicas maximum.\n", + " \"upscale_delay_s\": 5, # Wait 5 seconds before scaling up.\n", + " \"downscale_delay_s\": 30, # Wait 30 seconds before scaling down.\n", + " },\n", + " ray_actor_options={\"num_gpus\": 0.1}\n", + ").bind()\n", + "\n", + "handle = serve.run(mnist_app, name=\"mnist_classifier\")" + ] + }, + { + "cell_type": "markdown", + "id": "1ae8a244", + "metadata": {}, + "source": [ + "With this configuration, Ray Serve:\n", + "\n", + "- Starts with 0 replicas (no resources used when idle).\n", + "- Scales up when requests arrive (targeting 10 concurrent requests per replica).\n", + "- Scales down after 30 seconds of low traffic.\n", + "\n", + "### Test autoscaling with concurrent requests\n", + "\n", + "To see autoscaling in action, send many concurrent requests. Using `aiohttp`, you can send requests asynchronously." 
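While (or right after) the load test in the next cell runs, you can watch Ray Serve add replicas. One option is the Ray dashboard; another is `serve.status()` from Python. A rough sketch is below; the exact fields of the returned status object may differ slightly between Ray versions.

```python
from ray import serve

status = serve.status()
app_status = status.applications["mnist_classifier"]
print(f"Application status: {app_status.status}")

for name, deployment in app_status.deployments.items():
    # replica_states maps a replica state (for example, RUNNING) to a count.
    print(name, deployment.status, dict(deployment.replica_states))
```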
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e5594d5", + "metadata": {}, + "outputs": [], + "source": [ + "async def send_request(session, url, data):\n", + " \"\"\"Send a single asynchronous HTTP request.\"\"\"\n", + " async with session.post(url, json=data) as response:\n", + " return await response.json()\n", + "\n", + "async def send_concurrent_requests(num_requests=100):\n", + " \"\"\"Send many requests concurrently.\"\"\"\n", + " url = \"http://localhost:8000/\"\n", + " \n", + " # Create sample data.\n", + " images = np.random.rand(10, 1, 28, 28).tolist()\n", + " json_request = json.dumps({\"image\": images})\n", + " \n", + " # Send all requests concurrently.\n", + " async with aiohttp.ClientSession() as session:\n", + " tasks = [\n", + " send_request(session, url, json_request)\n", + " for _ in range(num_requests)\n", + " ]\n", + " responses = await asyncio.gather(*tasks)\n", + " \n", + " return responses\n", + "\n", + "# Run the concurrent requests.\n", + "start_time = time.time()\n", + "responses = asyncio.run(send_concurrent_requests(100))\n", + "elapsed = time.time() - start_time\n", + "\n", + "print(f\"Processed {len(responses)} requests in {elapsed:.2f} seconds\")\n", + "print(f\"Throughput: {len(responses)/elapsed:.2f} requests/second\")" + ] + }, + { + "cell_type": "markdown", + "id": "e040d6ac", + "metadata": {}, + "source": [ + "This approach allows Ray Serve to buffer and batch process the requests efficiently and automatically scale replicas as needed.\n", + "\n", + "## Part 4: Use dynamic request batching\n", + "\n", + "Dynamic request batching is an optimization that groups multiple incoming requests and processes them together to maximize GPU utilization.\n", + "\n", + "### Implement batching" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebea6c15", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment\n", + "class BatchedMNISTClassifier:\n", + " def __init__(self, model_path: str = None):\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " self.model = MNISTNet().to(self.device)\n", + " \n", + " if model_path:\n", + " self.model.load_state_dict(torch.load(model_path, map_location=self.device))\n", + " \n", + " self.model.eval()\n", + "\n", + " @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1)\n", + " async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]:\n", + " \"\"\"Process a batch of images together.\"\"\"\n", + " print(f\"Processing batch of size: {len(images)}\")\n", + " \n", + " # Stack all images into a single tensor.\n", + " batch_tensor = torch.tensor(\n", + " np.stack(images), \n", + " dtype=torch.float32\n", + " ).to(self.device)\n", + " \n", + " # Run inference on the entire batch.\n", + " with torch.no_grad():\n", + " logits = self.model(batch_tensor)\n", + " predictions = torch.argmax(logits, dim=1).cpu().numpy()\n", + " \n", + " # Return individual results.\n", + " return [\n", + " {\n", + " \"predicted_label\": int(pred),\n", + " \"logits\": logit.cpu().numpy().tolist()\n", + " }\n", + " for pred, logit in zip(predictions, logits)\n", + " ]\n", + "\n", + " async def __call__(self, request: Request) -> dict[str, Any]:\n", + " data = await request.json()\n", + " batch = json.loads(data)\n", + " \n", + " # Extract single image and pass it to the batch handler.\n", + " image = np.array(batch[\"image\"])\n", + " result = await self.predict_batch(image)\n", + " \n", + " return result" + ] + }, + { + "cell_type": 
"markdown", + "id": "75be6e25", + "metadata": {}, + "source": [ + "The `@serve.batch` decorator automatically:\n", + "\n", + "- Collects up to `max_batch_size` requests.\n", + "- Waits up to `batch_wait_timeout_s` seconds for more requests.\n", + "- Processes them together in a single forward pass.\n", + "\n", + "This behavior can improve throughput, especially for GPU inference.\n", + "\n", + "## Part 5: Compose multiple deployments\n", + "\n", + "Real-world machine learning applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline.\n", + "\n", + "### Create a preprocessing deployment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67670984", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment\n", + "class ImagePreprocessor:\n", + " def __init__(self):\n", + " \"\"\"Initialize preprocessing transforms.\"\"\"\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and standard deviation.\n", + " ])\n", + " \n", + " async def preprocess(self, images: list[np.ndarray]) -> np.ndarray:\n", + " \"\"\"Preprocess a batch of images.\"\"\"\n", + " processed = []\n", + " for img in images:\n", + " # Convert to PIL-compatible format if needed.\n", + " if img.dtype != np.uint8:\n", + " img = (img * 255).astype(np.uint8)\n", + " \n", + " # Apply transforms.\n", + " tensor = self.transform(img)\n", + " processed.append(tensor.numpy())\n", + " \n", + " return np.stack(processed)" + ] + }, + { + "cell_type": "markdown", + "id": "92daf899", + "metadata": {}, + "source": [ + "### Create an ingress deployment\n", + "\n", + "The ingress deployment orchestrates the pipeline and routes requests through preprocessing and then to the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88340028", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment\n", + "class MLPipeline:\n", + " def __init__(self, preprocessor, classifier):\n", + " \"\"\"Initialize with handles to other deployments.\"\"\"\n", + " self.preprocessor = preprocessor\n", + " self.classifier = classifier\n", + " \n", + " async def __call__(self, request: Request) -> dict[str, Any]:\n", + " \"\"\"Handle end-to-end inference.\"\"\"\n", + " # Parse request.\n", + " data = await request.json()\n", + " batch = json.loads(data)\n", + " images = batch[\"image\"]\n", + " \n", + " # Step 1: Preprocess.\n", + " processed_images = await self.preprocessor.preprocess.remote(images)\n", + " \n", + " # Step 2: Run inference.\n", + " result = await self.classifier.predict.remote({\n", + " \"image\": processed_images.tolist()\n", + " })\n", + " \n", + " return result" + ] + }, + { + "cell_type": "markdown", + "id": "b0e44763", + "metadata": {}, + "source": [ + "### Deploy the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "affcac11", + "metadata": {}, + "outputs": [], + "source": [ + "# Build the application graph.\n", + "preprocessor = ImagePreprocessor.bind()\n", + "classifier = MNISTClassifier.options(\n", + " num_replicas=2,\n", + " ray_actor_options={\"num_gpus\": 0.5}\n", + ").bind()\n", + "\n", + "pipeline = MLPipeline.bind(\n", + " preprocessor=preprocessor,\n", + " classifier=classifier\n", + ")\n", + "\n", + "# Deploy the entire pipeline.\n", + "handle = serve.run(pipeline, name=\"ml_pipeline\")" + ] + }, + { + "cell_type": "markdown", + "id": "aa81a51f", + "metadata": {}, + "source": [ + "When you send a request to the pipeline, the request automatically flows through preprocessing and inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d084ee7", + "metadata": {}, + "outputs": [], + "source": [ + "# Send request to the pipeline.\n", + "images = [np.random.rand(28, 28) for _ in range(5)]\n", + "json_request = json.dumps({\"image\": images})\n", + "\n", + "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", + "print(response.json())" + ] + }, + { + "cell_type": "markdown", + "id": "1fbe8773", + "metadata": {}, + "source": [ + "## Part 6: Integrate with FastAPI\n", + "\n", + "Ray Serve integrates with FastAPI and gives you access to:\n", + "\n", + "- HTTP routing and path parameters.\n", + "- Request validation with Pydantic models.\n", + "- Automatic OpenAPI documentation." 
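Once the FastAPI-backed service defined below is running, its extra routes are plain HTTP endpoints. For example, a readiness check can poll the health route; this sketch assumes the default Serve HTTP address on port 8000.

```python
import requests

# Poll the /health route exposed by the FastAPI ingress defined below.
resp = requests.get("http://localhost:8000/health", timeout=5)
resp.raise_for_status()
print(resp.json())  # Expected: {"status": "healthy"}
```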
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d163431", + "metadata": {}, + "outputs": [], + "source": [ + "from fastapi import FastAPI\n", + "from pydantic import BaseModel\n", + "\n", + "app = FastAPI()\n", + "\n", + "class PredictionRequest(BaseModel):\n", + " image: list[list[list[float]]] # Batch of images.\n", + "\n", + "class PredictionResponse(BaseModel):\n", + " predicted_label: list[int]\n", + "\n", + "@serve.deployment\n", + "@serve.ingress(app)\n", + "class FastAPIMNISTService:\n", + " def __init__(self):\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " self.model = MNISTNet().to(self.device)\n", + " self.model.eval()\n", + " \n", + " @app.post(\"/predict\", response_model=PredictionResponse)\n", + " async def predict(self, request: PredictionRequest):\n", + " \"\"\"Predict a digit from an image.\"\"\"\n", + " images = torch.tensor(\n", + " request.image, \n", + " dtype=torch.float32\n", + " ).to(self.device)\n", + " \n", + " with torch.no_grad():\n", + " logits = self.model(images)\n", + " predictions = torch.argmax(logits, dim=1).cpu().numpy()\n", + " \n", + " return PredictionResponse(predicted_label=predictions.tolist())\n", + " \n", + " @app.get(\"/health\")\n", + " async def health(self):\n", + " \"\"\"Return health status.\"\"\"\n", + " return {\"status\": \"healthy\"}\n", + "\n", + "# Deploy with FastAPI.\n", + "fastapi_app = FastAPIMNISTService.bind()\n", + "handle = serve.run(fastapi_app, name=\"fastapi_mnist\")" + ] + }, + { + "cell_type": "markdown", + "id": "a3a31b87", + "metadata": {}, + "source": [ + "After deploying, you can:\n", + "\n", + "- Visit `http://localhost:8000/docs` for interactive API documentation.\n", + "- Use the `/predict` endpoint for inference.\n", + "- Use the `/health` endpoint for health checks." + ] + }, + { + "cell_type": "markdown", + "id": "5e2af689", + "metadata": {}, + "source": [ + "## Clean up resources\n", + "\n", + "When you finish, shut down the Ray Serve application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0e8131d", + "metadata": {}, + "outputs": [], + "source": [ + "serve.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "d59f4a09", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "In this tutorial, you learned how to:\n", + "\n", + "- Deploy PyTorch models as web services with Ray Serve.\n", + "- Scale deployments with multiple replicas and fractional GPU usage.\n", + "- Configure autoscaling to handle variable workloads.\n", + "- Use dynamic request batching to maximize throughput.\n", + "- Compose multiple deployments into machine learning pipelines.\n", + "- Send concurrent requests efficiently with asynchronous HTTP.\n", + "- Integrate with FastAPI for production-ready APIs.\n", + "\n", + "Ray Serve provides a flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service.\n", + "\n", + "## Next steps\n", + "\n", + "- For more information on Ray Serve, read the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html).\n", + "- Learn about [Ray Distributed](https://docs.ray.io/en/latest/ray-overview.html), the distributed computing framework that powers Ray Serve." 
+ ] + } + ], + "metadata": { + "jupytext": { + "default_lexer": "ipython3", + "formats": "ipynb,md,py:sphinx" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/beginner_source/ray_serve_tutorial.md b/beginner_source/ray_serve_tutorial.md new file mode 100644 index 00000000000..56f7f48e97b --- /dev/null +++ b/beginner_source/ray_serve_tutorial.md @@ -0,0 +1,501 @@ +--- +jupyter: + jupytext: + default_lexer: ipython3 + formats: ipynb,md,py:sphinx + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.18.1 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + + +# Serve PyTorch models at scale with Ray Serve + +**Author:** [Ricardo Decal](https://github.com/crypdick) + +This tutorial introduces [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), a scalable framework for serving machine learning models in production. Ray Serve is part of [Ray Distributed](https://pytorch.org/projects/ray/), an open-source PyTorch Foundation project. + +## Production-ready features + +Ray Serve provides the following production-ready features: + +- Handle thousands of concurrent requests efficiently with dynamic request batching. +- Autoscale endpoints in response to variable traffic. +- Buffer incoming requests when the endpoints are busy. +- Compose multiple models along with business logic into a complete machine learning application. +- Gracefully heal the deployment when nodes are lost. +- Handle multi-node and multi-GPU serving. +- Flexibly allocate heterogeneous compute resources and fractional GPUs. +- Use [LLM-specific features](https://docs.ray.io/en/latest/serve/llm/index.html) such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more. + +
+Roadmap for this notebook:
+
+- Part 1: Deploy a simple PyTorch model.
+- Part 2: Scale with multiple replicas.
+- Part 3: Configure autoscaling.
+- Part 4: Use dynamic request batching.
+- Part 5: Compose multiple deployments.
+- Part 6: Integrate with FastAPI.
+ +## Prerequisites + +This tutorial assumes basic familiarity with PyTorch and Python. Install Ray Serve: + +```bash +pip install "ray[serve]" torch torchvision +``` + + +## Set up environment + +Start by importing the required libraries. + +```python +import asyncio +import json +import time +from typing import Any + +import aiohttp +import numpy as np +import requests +import torch +import torch.nn as nn +from ray import serve +from starlette.requests import Request +from torchvision import transforms +``` + +## Part 1: Deploy a simple PyTorch model + +Use a simple convolutional neural network for MNIST digit classification. First, define the model architecture. + +```python +class MNISTNet(nn.Module): + """Convolutional neural network for MNIST digit classification.""" + def __init__(self): + super(MNISTNet, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = nn.functional.relu(x) + x = self.conv2(x) + x = nn.functional.relu(x) + x = nn.functional.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = nn.functional.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + return nn.functional.log_softmax(x, dim=1) +``` + +### Create a Ray Serve deployment + +To deploy this model with Ray Serve, wrap it in a class and add the `@serve.deployment` decorator. The deployment handles incoming HTTP requests and runs inference. + +```python +@serve.deployment +class MNISTClassifier: + def __init__(self, model_path: str = None): + """Initialize the model and optionally load weights from ``model_path``.""" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + + if model_path: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) + + self.model.eval() + + async def __call__(self, request: Request) -> dict[str, Any]: + """Handle an incoming HTTP request.""" + # Parse the JSON request body. + data = await request.json() + batch = json.loads(data) + + # Run inference. + return await self.predict(batch) + + async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]: + """Run inference on a batch of images.""" + # Convert NumPy array to tensor. + images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) + + # Run inference. + with torch.no_grad(): + logits = self.model(images) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + return { + "predicted_label": predictions.tolist(), + "logits": logits.cpu().numpy().tolist() + } +``` + +### Run the deployment + +Deploy and run the model. + +```python +# Create the deployment (but do not run it yet). +mnist_app = MNISTClassifier.bind() + +# Start the Ray Serve application. +handle = serve.run(mnist_app, name="mnist_classifier") +``` + +### Test the deployment + +Test the deployment with some random data. + +```python +# Create a batch of random images (MNIST format: 28x28 grayscale). +images = np.random.rand(2, 1, 28, 28).tolist() +json_request = json.dumps({"image": images}) + +# Send HTTP request. +response = requests.post("http://localhost:8000/", json=json_request) +print(f"Predictions: {response.json()['predicted_label']}") +``` + +## Part 2: Scale with multiple replicas + +One of Ray Serve's key features is the ability to scale the deployment across multiple replicas. 
Each replica is an independent instance of the model that can handle requests in parallel. + +### Configure replicas + +```python +# Create deployment with 4 replicas. +mnist_app = MNISTClassifier.options( + num_replicas=4, + ray_actor_options={"num_gpus": 0.25} # Each replica uses one quarter of a GPU. +).bind() + +# Update the running deployment. +handle = serve.run(mnist_app, name="mnist_classifier") +``` + +This configuration creates 4 replicas, each using 25% of a GPU. This configuration allows you to serve 4 models on a single GPU and maximize resource utilization for small models. + +## Part 3: Configure autoscaling + +Ray Serve can automatically scale the number of replicas based on incoming traffic. This behavior is useful for handling variable workloads without over-provisioning resources. + +### Configure autoscaling + +```python +mnist_app = MNISTClassifier.options( + autoscaling_config={ + "target_ongoing_requests": 10, # Target 10 requests per replica. + "min_replicas": 0, # Scale down to 0 when idle. + "max_replicas": 10, # Scale up to 10 replicas maximum. + "upscale_delay_s": 5, # Wait 5 seconds before scaling up. + "downscale_delay_s": 30, # Wait 30 seconds before scaling down. + }, + ray_actor_options={"num_gpus": 0.1} +).bind() + +handle = serve.run(mnist_app, name="mnist_classifier") +``` + +With this configuration, Ray Serve: + +- Starts with 0 replicas (no resources used when idle). +- Scales up when requests arrive (targeting 10 concurrent requests per replica). +- Scales down after 30 seconds of low traffic. + +### Test autoscaling with concurrent requests + +To see autoscaling in action, send many concurrent requests. Using `aiohttp`, you can send requests asynchronously. + +```python +async def send_request(session, url, data): + """Send a single asynchronous HTTP request.""" + async with session.post(url, json=data) as response: + return await response.json() + +async def send_concurrent_requests(num_requests=100): + """Send many requests concurrently.""" + url = "http://localhost:8000/" + + # Create sample data. + images = np.random.rand(10, 1, 28, 28).tolist() + json_request = json.dumps({"image": images}) + + # Send all requests concurrently. + async with aiohttp.ClientSession() as session: + tasks = [ + send_request(session, url, json_request) + for _ in range(num_requests) + ] + responses = await asyncio.gather(*tasks) + + return responses + +# Run the concurrent requests. +start_time = time.time() +responses = asyncio.run(send_concurrent_requests(100)) +elapsed = time.time() - start_time + +print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") +print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") +``` + +This approach allows Ray Serve to buffer and batch process the requests efficiently and automatically scale replicas as needed. + +## Part 4: Use dynamic request batching + +Dynamic request batching is an optimization that groups multiple incoming requests and processes them together to maximize GPU utilization. 
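Before the full classifier below, here is a toy sketch of the mechanics using a hypothetical `Doubler` deployment (not part of this tutorial): the method decorated with `@serve.batch` receives a list of inputs collected from separate callers and must return a list of outputs in the same order.

```python
from ray import serve
from starlette.requests import Request


@serve.deployment
class Doubler:
    @serve.batch(max_batch_size=8, batch_wait_timeout_s=0.1)
    async def double(self, numbers: list[float]) -> list[float]:
        # `numbers` holds one item per pending request in the current batch.
        return [n * 2 for n in numbers]

    async def __call__(self, request: Request) -> float:
        value = float((await request.body()).decode())
        # Each caller passes a single value; Serve groups concurrent calls
        # into one invocation of `double`.
        return await self.double(value)
```

With this pattern, ten concurrent HTTP requests that each carry one number would typically execute as one or two batched calls to `double` rather than ten separate forward passes.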
+ +### Implement batching + +```python +@serve.deployment +class BatchedMNISTClassifier: + def __init__(self, model_path: str = None): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + + if model_path: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) + + self.model.eval() + + @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) + async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: + """Process a batch of images together.""" + print(f"Processing batch of size: {len(images)}") + + # Stack all images into a single tensor. + batch_tensor = torch.tensor( + np.stack(images), + dtype=torch.float32 + ).to(self.device) + + # Run inference on the entire batch. + with torch.no_grad(): + logits = self.model(batch_tensor) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + # Return individual results. + return [ + { + "predicted_label": int(pred), + "logits": logit.cpu().numpy().tolist() + } + for pred, logit in zip(predictions, logits) + ] + + async def __call__(self, request: Request) -> dict[str, Any]: + data = await request.json() + batch = json.loads(data) + + # Extract single image and pass it to the batch handler. + image = np.array(batch["image"]) + result = await self.predict_batch(image) + + return result +``` + +The `@serve.batch` decorator automatically: + +- Collects up to `max_batch_size` requests. +- Waits up to `batch_wait_timeout_s` seconds for more requests. +- Processes them together in a single forward pass. + +This behavior can improve throughput, especially for GPU inference. + +## Part 5: Compose multiple deployments + +Real-world machine learning applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline. + +### Create a preprocessing deployment + +```python +@serve.deployment +class ImagePreprocessor: + def __init__(self): + """Initialize preprocessing transforms.""" + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and standard deviation. + ]) + + async def preprocess(self, images: list[np.ndarray]) -> np.ndarray: + """Preprocess a batch of images.""" + processed = [] + for img in images: + # Convert to PIL-compatible format if needed. + if img.dtype != np.uint8: + img = (img * 255).astype(np.uint8) + + # Apply transforms. + tensor = self.transform(img) + processed.append(tensor.numpy()) + + return np.stack(processed) +``` + +### Create an ingress deployment + +The ingress deployment orchestrates the pipeline and routes requests through preprocessing and then to the model. + +```python +@serve.deployment +class MLPipeline: + def __init__(self, preprocessor, classifier): + """Initialize with handles to other deployments.""" + self.preprocessor = preprocessor + self.classifier = classifier + + async def __call__(self, request: Request) -> dict[str, Any]: + """Handle end-to-end inference.""" + # Parse request. + data = await request.json() + batch = json.loads(data) + images = batch["image"] + + # Step 1: Preprocess. + processed_images = await self.preprocessor.preprocess.remote(images) + + # Step 2: Run inference. + result = await self.classifier.predict.remote({ + "image": processed_images.tolist() + }) + + return result +``` + +### Deploy the pipeline + +```python +# Build the application graph. 
+preprocessor = ImagePreprocessor.bind() +classifier = MNISTClassifier.options( + num_replicas=2, + ray_actor_options={"num_gpus": 0.5} +).bind() + +pipeline = MLPipeline.bind( + preprocessor=preprocessor, + classifier=classifier +) + +# Deploy the entire pipeline. +handle = serve.run(pipeline, name="ml_pipeline") +``` + +When you send a request to the pipeline, the request automatically flows through preprocessing and inference. + +```python +# Send request to the pipeline. +images = [np.random.rand(28, 28) for _ in range(5)] +json_request = json.dumps({"image": images}) + +response = requests.post("http://localhost:8000/", json=json_request) +print(response.json()) +``` + +## Part 6: Integrate with FastAPI + +Ray Serve integrates with FastAPI and gives you access to: + +- HTTP routing and path parameters. +- Request validation with Pydantic models. +- Automatic OpenAPI documentation. + +```python +from fastapi import FastAPI +from pydantic import BaseModel + +app = FastAPI() + +class PredictionRequest(BaseModel): + image: list[list[list[float]]] # Batch of images. + +class PredictionResponse(BaseModel): + predicted_label: list[int] + +@serve.deployment +@serve.ingress(app) +class FastAPIMNISTService: + def __init__(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + self.model.eval() + + @app.post("/predict", response_model=PredictionResponse) + async def predict(self, request: PredictionRequest): + """Predict a digit from an image.""" + images = torch.tensor( + request.image, + dtype=torch.float32 + ).to(self.device) + + with torch.no_grad(): + logits = self.model(images) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + return PredictionResponse(predicted_label=predictions.tolist()) + + @app.get("/health") + async def health(self): + """Return health status.""" + return {"status": "healthy"} + +# Deploy with FastAPI. +fastapi_app = FastAPIMNISTService.bind() +handle = serve.run(fastapi_app, name="fastapi_mnist") +``` + +After deploying, you can: + +- Visit `http://localhost:8000/docs` for interactive API documentation. +- Use the `/predict` endpoint for inference. +- Use the `/health` endpoint for health checks. + + +## Clean up resources + +When you finish, shut down the Ray Serve application. + +```python +serve.shutdown() +``` + +## Summary + +In this tutorial, you learned how to: + +- Deploy PyTorch models as web services with Ray Serve. +- Scale deployments with multiple replicas and fractional GPU usage. +- Configure autoscaling to handle variable workloads. +- Use dynamic request batching to maximize throughput. +- Compose multiple deployments into machine learning pipelines. +- Send concurrent requests efficiently with asynchronous HTTP. +- Integrate with FastAPI for production-ready APIs. + +Ray Serve provides a flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service. + +## Next steps + +- For more information on Ray Serve, read the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html). +- Learn about [Ray Distributed](https://docs.ray.io/en/latest/ray-overview.html), the distributed computing framework that powers Ray Serve. 
diff --git a/beginner_source/ray_serve_tutorial.py b/beginner_source/ray_serve_tutorial.py new file mode 100644 index 00000000000..e3a9775266c --- /dev/null +++ b/beginner_source/ray_serve_tutorial.py @@ -0,0 +1,495 @@ +# --- +# jupyter: +# jupytext: +# default_lexer: ipython3 +# formats: ipynb,md,py:sphinx +# text_representation: +# extension: .py +# format_name: sphinx +# format_version: '1.1' +# jupytext_version: 1.18.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +""" +# Serve PyTorch models at scale with Ray Serve + +**Author:** [Ricardo Decal](https://github.com/crypdick) + +This tutorial introduces [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), a scalable framework for serving machine learning models in production. Ray Serve is part of [Ray Distributed](https://pytorch.org/projects/ray/), an open-source PyTorch Foundation project. + +## Production-ready features + +Ray Serve provides the following production-ready features: + +- Handle thousands of concurrent requests efficiently with dynamic request batching. +- Autoscale endpoints in response to variable traffic. +- Buffer incoming requests when the endpoints are busy. +- Compose multiple models along with business logic into a complete machine learning application. +- Gracefully heal the deployment when nodes are lost. +- Handle multi-node and multi-GPU serving. +- Flexibly allocate heterogeneous compute resources and fractional GPUs. +- Use [LLM-specific features](https://docs.ray.io/en/latest/serve/llm/index.html) such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more. + +
+Roadmap for this notebook:
+
+- Part 1: Deploy a simple PyTorch model.
+- Part 2: Scale with multiple replicas.
+- Part 3: Configure autoscaling.
+- Part 4: Use dynamic request batching.
+- Part 5: Compose multiple deployments.
+- Part 6: Integrate with FastAPI.
+ +## Prerequisites + +This tutorial assumes basic familiarity with PyTorch and Python. Install Ray Serve: + +```bash +pip install "ray[serve]" torch torchvision +``` +""" + +############################################################################### +# ## Set up environment +# +# Start by importing the required libraries. + +import asyncio +import json +import time +from typing import Any + +import aiohttp +import numpy as np +import requests +import torch +import torch.nn as nn +from ray import serve +from starlette.requests import Request +from torchvision import transforms + + +############################################################################### +# ## Part 1: Deploy a simple PyTorch model +# +# Use a simple convolutional neural network for MNIST digit classification. First, define the model architecture. + +class MNISTNet(nn.Module): + """Convolutional neural network for MNIST digit classification.""" + def __init__(self): + super(MNISTNet, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = nn.functional.relu(x) + x = self.conv2(x) + x = nn.functional.relu(x) + x = nn.functional.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = nn.functional.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + return nn.functional.log_softmax(x, dim=1) + + +############################################################################### +# ### Create a Ray Serve deployment +# +# To deploy this model with Ray Serve, wrap it in a class and add the `@serve.deployment` decorator. The deployment handles incoming HTTP requests and runs inference. + +@serve.deployment +class MNISTClassifier: + def __init__(self, model_path: str = None): + """Initialize the model and optionally load weights from ``model_path``.""" + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + + if model_path: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) + + self.model.eval() + + async def __call__(self, request: Request) -> dict[str, Any]: + """Handle an incoming HTTP request.""" + # Parse the JSON request body. + data = await request.json() + batch = json.loads(data) + + # Run inference. + return await self.predict(batch) + + async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]: + """Run inference on a batch of images.""" + # Convert NumPy array to tensor. + images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) + + # Run inference. + with torch.no_grad(): + logits = self.model(images) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + return { + "predicted_label": predictions.tolist(), + "logits": logits.cpu().numpy().tolist() + } + + +############################################################################### +# ### Run the deployment +# +# Deploy and run the model. + +# Create the deployment (but do not run it yet). +mnist_app = MNISTClassifier.bind() + +# Start the Ray Serve application. +handle = serve.run(mnist_app, name="mnist_classifier") + +############################################################################### +# ### Test the deployment +# +# Test the deployment with some random data. + +# Create a batch of random images (MNIST format: 28x28 grayscale). 
+images = np.random.rand(2, 1, 28, 28).tolist() +json_request = json.dumps({"image": images}) + +# Send HTTP request. +response = requests.post("http://localhost:8000/", json=json_request) +print(f"Predictions: {response.json()['predicted_label']}") + +############################################################################### +# ## Part 2: Scale with multiple replicas +# +# One of Ray Serve's key features is the ability to scale the deployment across multiple replicas. Each replica is an independent instance of the model that can handle requests in parallel. +# +# ### Configure replicas + +# Create deployment with 4 replicas. +mnist_app = MNISTClassifier.options( + num_replicas=4, + ray_actor_options={"num_gpus": 0.25} # Each replica uses one quarter of a GPU. +).bind() + +# Update the running deployment. +handle = serve.run(mnist_app, name="mnist_classifier") + +############################################################################### +# This configuration creates 4 replicas, each using 25% of a GPU. This configuration allows you to serve 4 models on a single GPU and maximize resource utilization for small models. +# +# ## Part 3: Configure autoscaling +# +# Ray Serve can automatically scale the number of replicas based on incoming traffic. This behavior is useful for handling variable workloads without over-provisioning resources. +# +# ### Configure autoscaling + +mnist_app = MNISTClassifier.options( + autoscaling_config={ + "target_ongoing_requests": 10, # Target 10 requests per replica. + "min_replicas": 0, # Scale down to 0 when idle. + "max_replicas": 10, # Scale up to 10 replicas maximum. + "upscale_delay_s": 5, # Wait 5 seconds before scaling up. + "downscale_delay_s": 30, # Wait 30 seconds before scaling down. + }, + ray_actor_options={"num_gpus": 0.1} +).bind() + +handle = serve.run(mnist_app, name="mnist_classifier") + + +############################################################################### +# With this configuration, Ray Serve: +# +# - Starts with 0 replicas (no resources used when idle). +# - Scales up when requests arrive (targeting 10 concurrent requests per replica). +# - Scales down after 30 seconds of low traffic. +# +# ### Test autoscaling with concurrent requests +# +# To see autoscaling in action, send many concurrent requests. Using `aiohttp`, you can send requests asynchronously. + +async def send_request(session, url, data): + """Send a single asynchronous HTTP request.""" + async with session.post(url, json=data) as response: + return await response.json() + +async def send_concurrent_requests(num_requests=100): + """Send many requests concurrently.""" + url = "http://localhost:8000/" + + # Create sample data. + images = np.random.rand(10, 1, 28, 28).tolist() + json_request = json.dumps({"image": images}) + + # Send all requests concurrently. + async with aiohttp.ClientSession() as session: + tasks = [ + send_request(session, url, json_request) + for _ in range(num_requests) + ] + responses = await asyncio.gather(*tasks) + + return responses + +# Run the concurrent requests. +start_time = time.time() +responses = asyncio.run(send_concurrent_requests(100)) +elapsed = time.time() - start_time + +print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") +print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") + + +############################################################################### +# This approach allows Ray Serve to buffer and batch process the requests efficiently and automatically scale replicas as needed. 
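###############################################################################
# To see how the autoscaler reacted to the burst of traffic, you can inspect
# the application state programmatically. The snippet below is a minimal
# sketch: the exact schema returned by ``serve.status()`` can differ between
# Ray versions, so treat the attribute access as an assumption rather than a
# stable contract.

# Print the replica states for every deployment in the running application.
status = serve.status()
for app_name, app_status in status.applications.items():
    for deployment_name, deployment_status in app_status.deployments.items():
        print(f"{app_name}/{deployment_name}: {deployment_status.replica_states}")

###############################################################################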
+# +# ## Part 4: Use dynamic request batching +# +# Dynamic request batching is an optimization that groups multiple incoming requests and processes them together to maximize GPU utilization. +# +# ### Implement batching + +@serve.deployment +class BatchedMNISTClassifier: + def __init__(self, model_path: str = None): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + + if model_path: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) + + self.model.eval() + + @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) + async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: + """Process a batch of images together.""" + print(f"Processing batch of size: {len(images)}") + + # Stack all images into a single tensor. + batch_tensor = torch.tensor( + np.stack(images), + dtype=torch.float32 + ).to(self.device) + + # Run inference on the entire batch. + with torch.no_grad(): + logits = self.model(batch_tensor) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + # Return individual results. + return [ + { + "predicted_label": int(pred), + "logits": logit.cpu().numpy().tolist() + } + for pred, logit in zip(predictions, logits) + ] + + async def __call__(self, request: Request) -> dict[str, Any]: + data = await request.json() + batch = json.loads(data) + + # Extract single image and pass it to the batch handler. + image = np.array(batch["image"]) + result = await self.predict_batch(image) + + return result + + +############################################################################### +# The `@serve.batch` decorator automatically: +# +# - Collects up to `max_batch_size` requests. +# - Waits up to `batch_wait_timeout_s` seconds for more requests. +# - Processes them together in a single forward pass. +# +# This behavior can improve throughput, especially for GPU inference. +# +# ## Part 5: Compose multiple deployments +# +# Real-world machine learning applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline. +# +# ### Create a preprocessing deployment + +@serve.deployment +class ImagePreprocessor: + def __init__(self): + """Initialize preprocessing transforms.""" + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and standard deviation. + ]) + + async def preprocess(self, images: list[np.ndarray]) -> np.ndarray: + """Preprocess a batch of images.""" + processed = [] + for img in images: + # Convert to PIL-compatible format if needed. + if img.dtype != np.uint8: + img = (img * 255).astype(np.uint8) + + # Apply transforms. + tensor = self.transform(img) + processed.append(tensor.numpy()) + + return np.stack(processed) + + +############################################################################### +# ### Create an ingress deployment +# +# The ingress deployment orchestrates the pipeline and routes requests through preprocessing and then to the model. + +@serve.deployment +class MLPipeline: + def __init__(self, preprocessor, classifier): + """Initialize with handles to other deployments.""" + self.preprocessor = preprocessor + self.classifier = classifier + + async def __call__(self, request: Request) -> dict[str, Any]: + """Handle end-to-end inference.""" + # Parse request. + data = await request.json() + batch = json.loads(data) + images = batch["image"] + + # Step 1: Preprocess. 
+ processed_images = await self.preprocessor.preprocess.remote(images) + + # Step 2: Run inference. + result = await self.classifier.predict.remote({ + "image": processed_images.tolist() + }) + + return result + + +############################################################################### +# ### Deploy the pipeline + +# Build the application graph. +preprocessor = ImagePreprocessor.bind() +classifier = MNISTClassifier.options( + num_replicas=2, + ray_actor_options={"num_gpus": 0.5} +).bind() + +pipeline = MLPipeline.bind( + preprocessor=preprocessor, + classifier=classifier +) + +# Deploy the entire pipeline. +handle = serve.run(pipeline, name="ml_pipeline") + +############################################################################### +# When you send a request to the pipeline, the request automatically flows through preprocessing and inference. + +# Send request to the pipeline. +images = [np.random.rand(28, 28) for _ in range(5)] +json_request = json.dumps({"image": images}) + +response = requests.post("http://localhost:8000/", json=json_request) +print(response.json()) + +############################################################################### +# ## Part 6: Integrate with FastAPI +# +# Ray Serve integrates with FastAPI and gives you access to: +# +# - HTTP routing and path parameters. +# - Request validation with Pydantic models. +# - Automatic OpenAPI documentation. + +from fastapi import FastAPI +from pydantic import BaseModel + +app = FastAPI() + +class PredictionRequest(BaseModel): + image: list[list[list[float]]] # Batch of images. + +class PredictionResponse(BaseModel): + predicted_label: list[int] + +@serve.deployment +@serve.ingress(app) +class FastAPIMNISTService: + def __init__(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + self.model.eval() + + @app.post("/predict", response_model=PredictionResponse) + async def predict(self, request: PredictionRequest): + """Predict a digit from an image.""" + images = torch.tensor( + request.image, + dtype=torch.float32 + ).to(self.device) + + with torch.no_grad(): + logits = self.model(images) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + return PredictionResponse(predicted_label=predictions.tolist()) + + @app.get("/health") + async def health(self): + """Return health status.""" + return {"status": "healthy"} + +# Deploy with FastAPI. +fastapi_app = FastAPIMNISTService.bind() +handle = serve.run(fastapi_app, name="fastapi_mnist") + +############################################################################### +# After deploying, you can: +# +# - Visit `http://localhost:8000/docs` for interactive API documentation. +# - Use the `/predict` endpoint for inference. +# - Use the `/health` endpoint for health checks. + +############################################################################### +# ## Clean up resources +# +# When you finish, shut down the Ray Serve application. + +serve.shutdown() + +############################################################################### +# ## Summary +# +# In this tutorial, you learned how to: +# +# - Deploy PyTorch models as web services with Ray Serve. +# - Scale deployments with multiple replicas and fractional GPU usage. +# - Configure autoscaling to handle variable workloads. +# - Use dynamic request batching to maximize throughput. +# - Compose multiple deployments into machine learning pipelines. +# - Send concurrent requests efficiently with asynchronous HTTP. 
+# - Integrate with FastAPI for production-ready APIs. +# +# Ray Serve provides a flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service. +# +# ## Next steps +# +# - For more information on Ray Serve, read the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html). +# - Learn about [Ray Distributed](https://docs.ray.io/en/latest/ray-overview.html), the distributed computing framework that powers Ray Serve. diff --git a/beginner_source/ray_serve_tutorial.rst b/beginner_source/ray_serve_tutorial.rst deleted file mode 100644 index fcf1091e5e1..00000000000 --- a/beginner_source/ray_serve_tutorial.rst +++ /dev/null @@ -1,513 +0,0 @@ -Serving PyTorch models at scale with Ray Serve -============================================== -**Author:** `Ricardo Decal `_ - -This tutorial introduces `Ray Serve `_, a scalable framework for serving machine learning models in production. Ray Serve is part of `Ray Distributed `_, an open-source PyTorch Foundation project. - - -Production-ready features -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Ray Serve provides the following production-ready features: - -- Handle thousands of concurrent requests efficiently with dynamic request batching -- Autoscale your endpoints in response to variable traffic -- Buffer incoming requests when the endpoints are busy -- Compose multiple models along with business logic into a complete ML application -- Gracefully heal the deployment when nodes are lost -- Handle multi-node/multi-GPU serving -- Flexibly allocate heterogenous compute resources and fractional GPUs -- `LLM-specific features `_ such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more. - -Ray Serve also has LLM-specific features such as response streaming, model multiplexing, dynamic request batching, and more. - -In this tutorial, you'll learn how to: - -1. Deploy a PyTorch model as a web service using Ray Serve -2. Scale the deployment with multiple replicas -3. Use advanced features like autoscaling and request batching -4. Compose multiple deployments into a complete ML application -5. Handle concurrent requests efficiently - -Prerequisites -------------- - -This tutorial assumes basic familiarity with PyTorch and Python. You'll need to install Ray Serve: - -.. code-block:: bash - - pip install "ray[serve]" torch torchvision - -Setup ------ - -Start by importing the necessary libraries: - -.. code-block:: python - - import asyncio - import json - import time - from typing import Any - - import aiohttp - import numpy as np - import requests - import torch - import torch.nn as nn - from ray import serve - from starlette.requests import Request - from torchvision import transforms - -Part 1: Deploy a simple PyTorch model --------------------------------------- - -We'll start with a simple convolutional neural network for MNIST digit classification. -First, let's define our model architecture: - -.. 
code-block:: python - - class MNISTNet(nn.Module): - """Simple CNN for MNIST digit classification""" - def __init__(self): - super(MNISTNet, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = nn.functional.relu(x) - x = self.conv2(x) - x = nn.functional.relu(x) - x = nn.functional.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = nn.functional.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - return nn.functional.log_softmax(x, dim=1) - -Creating a Ray Serve deployment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To deploy this model with Ray Serve, we wrap it in a class and add the ``@serve.deployment`` decorator. -The deployment handles incoming HTTP requests and runs inference: - -.. code-block:: python - - @serve.deployment - class MNISTClassifier: - def __init__(self, model_path: str = None): - """Initialize the model. If model_path is provided, load weights.""" - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - - if model_path: - self.model.load_state_dict(torch.load(model_path, map_location=self.device)) - - self.model.eval() - - async def __call__(self, request: Request) -> dict[str, Any]: - """Handle incoming HTTP requests""" - # Parse the JSON request body - data = await request.json() - batch = json.loads(data) - - # Run inference - return await self.predict(batch) - - async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]: - """Run inference on a batch of images""" - # Convert numpy array to tensor - images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) - - # Run inference - with torch.no_grad(): - logits = self.model(images) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - return { - "predicted_label": predictions.tolist(), - "logits": logits.cpu().numpy().tolist() - } - -Running the deployment -~~~~~~~~~~~~~~~~~~~~~~ - -Now let's deploy and run our model: - -.. code-block:: python - - # Create the deployment (but don't run it yet) - mnist_app = MNISTClassifier.bind() - - # Start the Ray Serve application - handle = serve.run(mnist_app, name="mnist_classifier") - -Testing the deployment -~~~~~~~~~~~~~~~~~~~~~~ - -Let's test our deployment with some random data: - -.. code-block:: python - - # Create a batch of random images (simulating MNIST format: 28x28 grayscale) - images = np.random.rand(2, 1, 28, 28).tolist() - json_request = json.dumps({"image": images}) - - # Send HTTP request - response = requests.post("http://localhost:8000/", json=json_request) - print(f"Predictions: {response.json()['predicted_label']}") - -Part 2: Scaling with multiple replicas ---------------------------------------- - -One of Ray Serve's key features is the ability to scale your deployment across multiple replicas. -Each replica is an independent instance of your model that can handle requests in parallel. - -Configuring replicas -~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - # Create deployment with 4 replicas - mnist_app = MNISTClassifier.options( - num_replicas=4, - ray_actor_options={"num_gpus": 0.25} # Each replica uses 1/4 of a GPU - ).bind() - - # Update the running deployment - handle = serve.run(mnist_app, name="mnist_classifier") - -This configuration creates 4 replicas, each using 25% of a GPU. 
This allows you to serve 4 models -on a single GPU, maximizing resource utilization for small models. - -Part 3: Autoscaling -------------------- - -Ray Serve can automatically scale the number of replicas based on incoming traffic. -This is useful for handling variable workloads without over-provisioning resources. - -Configuring autoscaling -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - mnist_app = MNISTClassifier.options( - autoscaling_config={ - "target_ongoing_requests": 10, # Target 10 requests per replica - "min_replicas": 0, # Scale down to 0 when idle - "max_replicas": 10, # Scale up to 10 replicas max - "upscale_delay_s": 5, # Wait 5s before scaling up - "downscale_delay_s": 30, # Wait 30s before scaling down - }, - ray_actor_options={"num_gpus": 0.1} - ).bind() - - handle = serve.run(mnist_app, name="mnist_classifier") - -With this configuration, Ray Serve will: - -- Start with 0 replicas (no resources used when idle) -- Scale up when requests arrive (targeting 10 concurrent requests per replica) -- Scale down after 30 seconds of low traffic - -Testing autoscaling with concurrent requests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To see autoscaling in action, we need to send many concurrent requests. Using ``aiohttp``, -we can fire requests asynchronously: - -.. code-block:: python - - async def send_request(session, url, data): - """Send a single async HTTP request""" - async with session.post(url, json=data) as response: - return await response.json() - - async def send_concurrent_requests(num_requests=100): - """Send many requests concurrently""" - url = "http://localhost:8000/" - - # Create sample data - images = np.random.rand(10, 1, 28, 28).tolist() - json_request = json.dumps({"image": images}) - - # Send all requests concurrently - async with aiohttp.ClientSession() as session: - tasks = [ - send_request(session, url, json_request) - for _ in range(num_requests) - ] - responses = await asyncio.gather(*tasks) - - return responses - - # Run the concurrent requests - start_time = time.time() - responses = asyncio.run(send_concurrent_requests(100)) - elapsed = time.time() - start_time - - print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") - print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") - -This approach allows Ray Serve to buffer and batch process the requests efficiently, -automatically scaling replicas as needed. - -Part 4: Dynamic request batching ---------------------------------- - -Dynamic request batching is a powerful optimization that groups multiple incoming requests -and processes them together, maximizing GPU utilization. - -Implementing batching -~~~~~~~~~~~~~~~~~~~~~ - -.. 
code-block:: python - - @serve.deployment - class BatchedMNISTClassifier: - def __init__(self, model_path: str = None): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - - if model_path: - self.model.load_state_dict(torch.load(model_path, map_location=self.device)) - - self.model.eval() - - @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) - async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: - """Process a batch of images together""" - print(f"Processing batch of size: {len(images)}") - - # Stack all images into a single tensor - batch_tensor = torch.tensor( - np.stack(images), - dtype=torch.float32 - ).to(self.device) - - # Run inference on the entire batch - with torch.no_grad(): - logits = self.model(batch_tensor) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - # Return individual results - return [ - { - "predicted_label": int(pred), - "logits": logit.cpu().numpy().tolist() - } - for pred, logit in zip(predictions, logits) - ] - - async def __call__(self, request: Request) -> dict[str, Any]: - data = await request.json() - batch = json.loads(data) - - # Extract single image and pass to batch handler - image = np.array(batch["image"]) - result = await self.predict_batch(image) - - return result - -The ``@serve.batch`` decorator automatically: - -- Collects up to ``max_batch_size`` requests -- Waits up to ``batch_wait_timeout_s`` seconds for more requests -- Processes them together in a single forward pass - -This can dramatically improve throughput, especially for GPU inference. - -Part 5: Composing multiple deployments ---------------------------------------- - -Real-world ML applications often involve multiple steps: preprocessing, inference, and postprocessing. -Ray Serve makes it easy to compose multiple deployments into a pipeline. - -Creating a preprocessing deployment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - @serve.deployment - class ImagePreprocessor: - def __init__(self): - """Initialize preprocessing transforms""" - self.transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and std - ]) - - async def preprocess(self, images: list[np.ndarray]) -> np.ndarray: - """Preprocess a batch of images""" - processed = [] - for img in images: - # Convert to PIL-compatible format if needed - if img.dtype != np.uint8: - img = (img * 255).astype(np.uint8) - - # Apply transforms - tensor = self.transform(img) - processed.append(tensor.numpy()) - - return np.stack(processed) - -Creating an ingress deployment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ingress deployment orchestrates the pipeline, routing requests through preprocessing -and then to the model: - -.. code-block:: python - - @serve.deployment - class MLPipeline: - def __init__(self, preprocessor, classifier): - """Initialize with handles to other deployments""" - self.preprocessor = preprocessor - self.classifier = classifier - - async def __call__(self, request: Request) -> dict[str, Any]: - """Handle end-to-end inference""" - # Parse request - data = await request.json() - batch = json.loads(data) - images = batch["image"] - - # Step 1: Preprocess - processed_images = await self.preprocessor.preprocess.remote(images) - - # Step 2: Run inference - result = await self.classifier.predict.remote({ - "image": processed_images.tolist() - }) - - return result - -Deploying the pipeline -~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code-block:: python - - # Build the application graph - preprocessor = ImagePreprocessor.bind() - classifier = MNISTClassifier.options( - num_replicas=2, - ray_actor_options={"num_gpus": 0.5} - ).bind() - - pipeline = MLPipeline.bind( - preprocessor=preprocessor, - classifier=classifier - ) - - # Deploy the entire pipeline - handle = serve.run(pipeline, name="ml_pipeline") - -Now when you send a request to the pipeline, it automatically flows through preprocessing -and inference: - -.. code-block:: python - - # Send request to the pipeline - images = [np.random.rand(28, 28) for _ in range(5)] - json_request = json.dumps({"image": images}) - - response = requests.post("http://localhost:8000/", json=json_request) - print(response.json()) - -Part 6: Integration with FastAPI ---------------------------------- - -Ray Serve integrates seamlessly with FastAPI, giving you access to: - -- HTTP routing and path parameters -- Request validation with Pydantic models -- Automatic OpenAPI documentation - -.. code-block:: python - - from fastapi import FastAPI - from pydantic import BaseModel - - app = FastAPI() - - class PredictionRequest(BaseModel): - image: list[list[list[float]]] # Batch of images - - class PredictionResponse(BaseModel): - predicted_label: list[int] - - @serve.deployment - @serve.ingress(app) - class FastAPIMNISTService: - def __init__(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - self.model.eval() - - @app.post("/predict", response_model=PredictionResponse) - async def predict(self, request: PredictionRequest): - """Predict digit from image""" - images = torch.tensor( - request.image, - dtype=torch.float32 - ).to(self.device) - - with torch.no_grad(): - logits = self.model(images) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - return PredictionResponse(predicted_label=predictions.tolist()) - - @app.get("/health") - async def health(self): - """Health check endpoint""" - return {"status": "healthy"} - - # Deploy with FastAPI - fastapi_app = FastAPIMNISTService.bind() - handle = serve.run(fastapi_app, name="fastapi_mnist") - -After deploying, you can: - -- Visit ``http://localhost:8000/docs`` for interactive API documentation -- Use the ``/predict`` endpoint for inference -- Use the ``/health`` endpoint for health checks - -Cleanup -------- - -When you're done, shut down the Ray Serve application: - -.. code-block:: python - - serve.shutdown() - -Summary -------- - -In this tutorial, you learned how to: - -- Deploy PyTorch models as web services with Ray Serve -- Scale deployments with multiple replicas and fractional GPU usage -- Configure autoscaling to handle variable workloads -- Use dynamic request batching to maximize throughput -- Compose multiple deployments into ML pipelines -- Send concurrent requests efficiently with async HTTP -- Integrate with FastAPI for production-ready APIs - -Ray Serve provides a powerful, flexible framework for serving PyTorch models at scale. -Its Python-first API makes it easy to go from a trained model to a production service. - -Next steps ----------- - -- For more information on Ray Serve, read the `Ray Serve documentation `_. -- Learn about `Ray Distributed `_, the distributed computing framework that powers Ray Serve. 
From 9a64caafe1b4093c6164ca8ff81dd9587e57fcf9 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 11 Dec 2025 18:06:22 -0800 Subject: [PATCH 06/18] ignore cursor configs --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3f1f927ee33..9158514018d 100644 --- a/.gitignore +++ b/.gitignore @@ -126,6 +126,7 @@ cleanup.sh # VSCode *.vscode +.cursor/ # pyspelling dictionary.dic From 608aa192a785bc156ac7fa9e6e7884b7d4809db1 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 11 Dec 2025 18:06:33 -0800 Subject: [PATCH 07/18] add serve to docker reqs --- .ci/docker/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index d9e7b338cfd..9df7a6f9f79 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -32,7 +32,7 @@ bs4 awscliv2==2.1.1 flask spacy==3.4.1 -ray[tune]==2.52.1 +ray[serve,tune]==2.52.1 tensorboard jinja2==3.1.3 pytorch-lightning From 92578272053748e1ba2abbc466f05b1b1e955d8c Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 11 Dec 2025 22:23:53 -0800 Subject: [PATCH 08/18] Add completed ray serve tutorial --- .ci/docker/requirements.txt | 5 +- beginner_source/ray_serve_tutorial.py | 569 ++++++++------------------ 2 files changed, 184 insertions(+), 390 deletions(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 9df7a6f9f79..401a81a1e01 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -21,6 +21,8 @@ pytorch_sphinx_theme2==0.2.0 # Tutorial dependencies tqdm==4.66.1 numpy==1.24.4 +pydantic>=2.0 +fastapi matplotlib librosa torch==2.9 @@ -31,7 +33,7 @@ PyHamcrest bs4 awscliv2==2.1.1 flask -spacy==3.4.1 +spacy>=3.7.0 ray[serve,tune]==2.52.1 tensorboard jinja2==3.1.3 @@ -59,6 +61,7 @@ sphinxcontrib.katex boto3 pandas requests +aiohttp scikit-image scipy==1.11.1 numba==0.57.1 diff --git a/beginner_source/ray_serve_tutorial.py b/beginner_source/ray_serve_tutorial.py index e3a9775266c..30fa5223422 100644 --- a/beginner_source/ray_serve_tutorial.py +++ b/beginner_source/ray_serve_tutorial.py @@ -1,96 +1,67 @@ -# --- -# jupyter: -# jupytext: -# default_lexer: ipython3 -# formats: ipynb,md,py:sphinx -# text_representation: -# extension: .py -# format_name: sphinx -# format_version: '1.1' -# jupytext_version: 1.18.1 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - """ -# Serve PyTorch models at scale with Ray Serve - -**Author:** [Ricardo Decal](https://github.com/crypdick) +Serve PyTorch models at scale with Ray Serve +============================================ -This tutorial introduces [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), a scalable framework for serving machine learning models in production. Ray Serve is part of [Ray Distributed](https://pytorch.org/projects/ray/), an open-source PyTorch Foundation project. +**Author:** `Ricardo Decal `__ -## Production-ready features +This tutorial shows how to deploy a PyTorch model using Ray Serve with +production-ready features. -Ray Serve provides the following production-ready features: +`Ray Serve `__ is a +scalable framework for serving machine learning models in production +built on top of Ray. `Ray `__, +a project of the PyTorch Foundation, is an open-source unified framework +for scaling AI and Python applications. Ray simplifies distributed +workloads by handling the complexity of distributed computing. 
-- Handle thousands of concurrent requests efficiently with dynamic request batching. -- Autoscale endpoints in response to variable traffic. -- Buffer incoming requests when the endpoints are busy. -- Compose multiple models along with business logic into a complete machine learning application. -- Gracefully heal the deployment when nodes are lost. -- Handle multi-node and multi-GPU serving. -- Flexibly allocate heterogeneous compute resources and fractional GPUs. -- Use [LLM-specific features](https://docs.ray.io/en/latest/serve/llm/index.html) such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more. +In this tutorial, you’ll learn how to deploy a PyTorch model with Ray +Serve and use its production-ready features. Ray Serve allows you to +easily scale your model inference across multiple nodes and GPUs, +providing features like dynamic batching, autoscaling, fault tolerance, +and observability out of the box. -
-Roadmap for this notebook:
-
-- Part 1: Deploy a simple PyTorch model.
-- Part 2: Scale with multiple replicas.
-- Part 3: Configure autoscaling.
-- Part 4: Use dynamic request batching.
-- Part 5: Compose multiple deployments.
-- Part 6: Integrate with FastAPI.
+Install the dependencies: -## Prerequisites +.. code-block:: bash -This tutorial assumes basic familiarity with PyTorch and Python. Install Ray Serve: + pip install "ray[serve]" torch torchvision -```bash -pip install "ray[serve]" torch torchvision -``` """ -############################################################################### -# ## Set up environment -# -# Start by importing the required libraries. +###################################################################### +# Start by importing the required libraries: import asyncio -import json import time from typing import Any +from fastapi import FastAPI +from pydantic import BaseModel import aiohttp import numpy as np -import requests import torch import torch.nn as nn from ray import serve -from starlette.requests import Request -from torchvision import transforms - +from torchvision.transforms import v2 -############################################################################### -# ## Part 1: Deploy a simple PyTorch model +###################################################################### +# Define a PyTorch model +# ---------------------- # -# Use a simple convolutional neural network for MNIST digit classification. First, define the model architecture. +# We will define a simple convolutional neural network for MNIST digit +# classification: class MNISTNet(nn.Module): - """Convolutional neural network for MNIST digit classification.""" def __init__(self): - super(MNISTNet, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) + self.conv2 = nn.Conv2d(32, 64, 3, 1) self.fc1 = nn.Linear(9216, 128) + self.dropout2 = nn.Dropout(0.5) self.fc2 = nn.Linear(128, 10) def forward(self, x): @@ -108,139 +79,155 @@ def forward(self, x): return nn.functional.log_softmax(x, dim=1) -############################################################################### -# ### Create a Ray Serve deployment +###################################################################### +# Define the Ray Serve deployment +# ------------------------------- +# +# To deploy this model with Ray Serve, wrap the model in a Python class +# and decorate it with ``@serve.deployment``. # -# To deploy this model with Ray Serve, wrap it in a class and add the `@serve.deployment` decorator. The deployment handles incoming HTTP requests and runs inference. +# Processing requests in batches is more efficient than processing +# requests one by one, especially when using GPUs. Ray Serve provides +# built-in support for **dynamic request batching**, where individual +# incoming requests are opportunistically batched. The ``@serve.batch`` +# decorator on the ``predict_batch`` method below enables this. 
+ +app = FastAPI() + +class ImageRequest(BaseModel): # Used for request validation and documentation + image: list[list[float]] | list[list[list[float]]] @serve.deployment +@serve.ingress(app) class MNISTClassifier: - def __init__(self, model_path: str = None): - """Initialize the model and optionally load weights from ``model_path``.""" + def __init__(self): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model = MNISTNet().to(self.device) - - if model_path: - self.model.load_state_dict(torch.load(model_path, map_location=self.device)) - + self.transform = v2.Compose([ + v2.ToImage(), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.1307], std=[0.3013]), + ]) + self.model.eval() - async def __call__(self, request: Request) -> dict[str, Any]: - """Handle an incoming HTTP request.""" - # Parse the JSON request body. - data = await request.json() - batch = json.loads(data) - - # Run inference. - return await self.predict(batch) - - async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]: - """Run inference on a batch of images.""" - # Convert NumPy array to tensor. - images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) + # batch_wait_timeout_s is the maximum time to wait for a full batch. + @serve.batch(max_batch_size=128, batch_wait_timeout_s=0.1) + async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: + # Stack all images into a single tensor. + batch_tensor = torch.cat([ + self.transform(img).unsqueeze(0) + for img in images + ]).to(self.device).float() - # Run inference. + # Run inference on the entire batch. with torch.no_grad(): - logits = self.model(images) + logits = self.model(batch_tensor) predictions = torch.argmax(logits, dim=1).cpu().numpy() - return { - "predicted_label": predictions.tolist(), - "logits": logits.cpu().numpy().tolist() - } - - -############################################################################### -# ### Run the deployment -# -# Deploy and run the model. - -# Create the deployment (but do not run it yet). -mnist_app = MNISTClassifier.bind() - -# Start the Ray Serve application. -handle = serve.run(mnist_app, name="mnist_classifier") + # Unbatch the results and preserve their original order. + return [ + { + "predicted_label": int(pred), + "logits": logit.cpu().numpy().tolist() + } + for pred, logit in zip(predictions, logits) + ] -############################################################################### -# ### Test the deployment -# -# Test the deployment with some random data. + @app.post("/") + async def handle_request(self, request: ImageRequest): + """Handle an incoming HTTP request using FastAPI. + + Inputs are automatically validated using the Pydantic model. + """ + # Process the single request. + image_array = np.array(request.image) -# Create a batch of random images (MNIST format: 28x28 grayscale). -images = np.random.rand(2, 1, 28, 28).tolist() -json_request = json.dumps({"image": images}) + # Ray Serve's @serve.batch will automatically batch requests. + result = await self.predict_batch(image_array) + + return result -# Send HTTP request. 
-response = requests.post("http://localhost:8000/", json=json_request) -print(f"Predictions: {response.json()['predicted_label']}") -############################################################################### -# ## Part 2: Scale with multiple replicas +###################################################################### +# This is a FastAPI app, which gives us batteries-included features like +# automatic request validation (via Pydantic), OpenAPI-style API +# documentation, and more. # -# One of Ray Serve's key features is the ability to scale the deployment across multiple replicas. Each replica is an independent instance of the model that can handle requests in parallel. +# Configure autoscaling and resource allocation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# ### Configure replicas - -# Create deployment with 4 replicas. -mnist_app = MNISTClassifier.options( - num_replicas=4, - ray_actor_options={"num_gpus": 0.25} # Each replica uses one quarter of a GPU. -).bind() - -# Update the running deployment. -handle = serve.run(mnist_app, name="mnist_classifier") - -############################################################################### -# This configuration creates 4 replicas, each using 25% of a GPU. This configuration allows you to serve 4 models on a single GPU and maximize resource utilization for small models. +# In production, traffic can vary significantly. Ray Serve’s +# **autoscaling** feature automatically adjusts the number of replicas +# based on traffic load, ensuring you have enough capacity during peaks +# while saving resources during quiet periods. # -# ## Part 3: Configure autoscaling +# You can also specify **resource allocation** per replica, such as the +# number of CPUs or GPUs. Ray Serve handles the orchestration of these +# resources across your cluster. # -# Ray Serve can automatically scale the number of replicas based on incoming traffic. This behavior is useful for handling variable workloads without over-provisioning resources. -# -# ### Configure autoscaling +# Below is a sample configuration with autoscaling and resource +# allocation: mnist_app = MNISTClassifier.options( autoscaling_config={ - "target_ongoing_requests": 10, # Target 10 requests per replica. - "min_replicas": 0, # Scale down to 0 when idle. - "max_replicas": 10, # Scale up to 10 replicas maximum. - "upscale_delay_s": 5, # Wait 5 seconds before scaling up. - "downscale_delay_s": 30, # Wait 30 seconds before scaling down. + "target_ongoing_requests": 50, # Target 50 ongoing requests per replica. + "min_replicas": 1, # Keep at least 1 replica alive. + "max_replicas": 5, # Scale up to 5 replicas to maintain target_ongoing_requests. + "upscale_delay_s": 5, # Wait 5s before scaling up. + "downscale_delay_s": 30, # Wait 30s before scaling down. }, - ray_actor_options={"num_gpus": 0.1} + # Max concurrent requests per replica before queueing. + # If the queue fills the shared cluster memory, future requests are backpressured until memory is freed. + max_ongoing_requests=100, + ray_actor_options={"num_cpus": 1, "num_gpus": 1} ).bind() -handle = serve.run(mnist_app, name="mnist_classifier") +###################################################################### +# The app is now ready to be deployed. +# +# Testing the endpoint with with concurrent requests +# -------------------------------------------------- +# +# To deploy the app, use the ``serve.run`` function: +# Start the Ray Serve application. 
+handle = serve.run(mnist_app, name="mnist_classifier") -############################################################################### -# With this configuration, Ray Serve: +###################################################################### +# You should see an output similar to: # -# - Starts with 0 replicas (no resources used when idle). -# - Scales up when requests arrive (targeting 10 concurrent requests per replica). -# - Scales down after 30 seconds of low traffic. +# .. code-block:: bash # -# ### Test autoscaling with concurrent requests +# Started Serve in namespace "serve". +# Registering autoscaling state for deployment Deployment(name='MNISTClassifier', app='mnist_classifier') +# Deploying new version of Deployment(name='MNISTClassifier', app='mnist_classifier') (initial target replicas: 1). +# Proxy starting on node ... (HTTP port: 8000). +# Got updated endpoints: {}. +# Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=None)}. +# Started . +# Adding 1 replica to Deployment(name='MNISTClassifier', app='mnist_classifier'). +# Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=['/', '/docs', '/docs/oauth2-redirect', '/openapi.json', '/redoc'])}. +# Application 'mnist_classifier' is ready at http://127.0.0.1:8000/. + +###################################################################### +# The app is now listening for requests on port 8000. # -# To see autoscaling in action, send many concurrent requests. Using `aiohttp`, you can send requests asynchronously. +# To test the batching, you can send many requests concurrently using +# ``aiohttp``. Below is a sample function that sends 2000 concurrent +# requests to the app: -async def send_request(session, url, data): - """Send a single asynchronous HTTP request.""" +async def send_single_request(session, url, data): async with session.post(url, json=data) as response: return await response.json() -async def send_concurrent_requests(num_requests=100): - """Send many requests concurrently.""" - url = "http://localhost:8000/" +async def send_concurrent_requests(num_requests): + image = np.random.rand(28, 28).tolist() - # Create sample data. - images = np.random.rand(10, 1, 28, 28).tolist() - json_request = json.dumps({"image": images}) - - # Send all requests concurrently. + print(f"Sending {num_requests} concurrent requests...") async with aiohttp.ClientSession() as session: tasks = [ - send_request(session, url, json_request) + send_single_request(session, url="http://localhost:8000/", data={"image": image}) for _ in range(num_requests) ] responses = await asyncio.gather(*tasks) @@ -249,247 +236,51 @@ async def send_concurrent_requests(num_requests=100): # Run the concurrent requests. start_time = time.time() -responses = asyncio.run(send_concurrent_requests(100)) +responses = asyncio.run(send_concurrent_requests(2000)) elapsed = time.time() - start_time print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") - -############################################################################### -# This approach allows Ray Serve to buffer and batch process the requests efficiently and automatically scale replicas as needed. 
+###################################################################### +# You should see high throughput numbers, confirming that requests are +# being batched and processed in parallel across the replicas. # -# ## Part 4: Use dynamic request batching +# Monitoring the deployment +# ------------------------- # -# Dynamic request batching is an optimization that groups multiple incoming requests and processes them together to maximize GPU utilization. -# -# ### Implement batching - -@serve.deployment -class BatchedMNISTClassifier: - def __init__(self, model_path: str = None): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - - if model_path: - self.model.load_state_dict(torch.load(model_path, map_location=self.device)) - - self.model.eval() - - @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) - async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: - """Process a batch of images together.""" - print(f"Processing batch of size: {len(images)}") - - # Stack all images into a single tensor. - batch_tensor = torch.tensor( - np.stack(images), - dtype=torch.float32 - ).to(self.device) - - # Run inference on the entire batch. - with torch.no_grad(): - logits = self.model(batch_tensor) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - # Return individual results. - return [ - { - "predicted_label": int(pred), - "logits": logit.cpu().numpy().tolist() - } - for pred, logit in zip(predictions, logits) - ] - - async def __call__(self, request: Request) -> dict[str, Any]: - data = await request.json() - batch = json.loads(data) - - # Extract single image and pass it to the batch handler. - image = np.array(batch["image"]) - result = await self.predict_batch(image) - - return result - - -############################################################################### -# The `@serve.batch` decorator automatically: -# -# - Collects up to `max_batch_size` requests. -# - Waits up to `batch_wait_timeout_s` seconds for more requests. -# - Processes them together in a single forward pass. -# -# This behavior can improve throughput, especially for GPU inference. -# -# ## Part 5: Compose multiple deployments -# -# Real-world machine learning applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline. -# -# ### Create a preprocessing deployment - -@serve.deployment -class ImagePreprocessor: - def __init__(self): - """Initialize preprocessing transforms.""" - self.transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and standard deviation. - ]) - - async def preprocess(self, images: list[np.ndarray]) -> np.ndarray: - """Preprocess a batch of images.""" - processed = [] - for img in images: - # Convert to PIL-compatible format if needed. - if img.dtype != np.uint8: - img = (img * 255).astype(np.uint8) - - # Apply transforms. - tensor = self.transform(img) - processed.append(tensor.numpy()) - - return np.stack(processed) - - -############################################################################### -# ### Create an ingress deployment -# -# The ingress deployment orchestrates the pipeline and routes requests through preprocessing and then to the model. 
- -@serve.deployment -class MLPipeline: - def __init__(self, preprocessor, classifier): - """Initialize with handles to other deployments.""" - self.preprocessor = preprocessor - self.classifier = classifier - - async def __call__(self, request: Request) -> dict[str, Any]: - """Handle end-to-end inference.""" - # Parse request. - data = await request.json() - batch = json.loads(data) - images = batch["image"] - - # Step 1: Preprocess. - processed_images = await self.preprocessor.preprocess.remote(images) - - # Step 2: Run inference. - result = await self.classifier.predict.remote({ - "image": processed_images.tolist() - }) - - return result - - -############################################################################### -# ### Deploy the pipeline - -# Build the application graph. -preprocessor = ImagePreprocessor.bind() -classifier = MNISTClassifier.options( - num_replicas=2, - ray_actor_options={"num_gpus": 0.5} -).bind() - -pipeline = MLPipeline.bind( - preprocessor=preprocessor, - classifier=classifier -) - -# Deploy the entire pipeline. -handle = serve.run(pipeline, name="ml_pipeline") - -############################################################################### -# When you send a request to the pipeline, the request automatically flows through preprocessing and inference. - -# Send request to the pipeline. -images = [np.random.rand(28, 28) for _ in range(5)] -json_request = json.dumps({"image": images}) - -response = requests.post("http://localhost:8000/", json=json_request) -print(response.json()) - -############################################################################### -# ## Part 6: Integrate with FastAPI -# -# Ray Serve integrates with FastAPI and gives you access to: -# -# - HTTP routing and path parameters. -# - Request validation with Pydantic models. -# - Automatic OpenAPI documentation. - -from fastapi import FastAPI -from pydantic import BaseModel - -app = FastAPI() - -class PredictionRequest(BaseModel): - image: list[list[list[float]]] # Batch of images. - -class PredictionResponse(BaseModel): - predicted_label: list[int] - -@serve.deployment -@serve.ingress(app) -class FastAPIMNISTService: - def __init__(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - self.model.eval() - - @app.post("/predict", response_model=PredictionResponse) - async def predict(self, request: PredictionRequest): - """Predict a digit from an image.""" - images = torch.tensor( - request.image, - dtype=torch.float32 - ).to(self.device) - - with torch.no_grad(): - logits = self.model(images) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - return PredictionResponse(predicted_label=predictions.tolist()) - - @app.get("/health") - async def health(self): - """Return health status.""" - return {"status": "healthy"} - -# Deploy with FastAPI. -fastapi_app = FastAPIMNISTService.bind() -handle = serve.run(fastapi_app, name="fastapi_mnist") - -############################################################################### -# After deploying, you can: -# -# - Visit `http://localhost:8000/docs` for interactive API documentation. -# - Use the `/predict` endpoint for inference. -# - Use the `/health` endpoint for health checks. - -############################################################################### -# ## Clean up resources -# -# When you finish, shut down the Ray Serve application. 
- -serve.shutdown() - -############################################################################### -# ## Summary +# Ray Serve provides built-in monitoring tools to help you track the +# status and performance of your deployment. This dashboard lets you view +# Serving metrics like request throughput, latency, and error rates, as +# well as cluster status and resource utilization. For more information, +# see the `Ray Serve monitoring +# documentation `__. + +###################################################################### +# Summary +# ------- # # In this tutorial, you learned how to: # -# - Deploy PyTorch models as web services with Ray Serve. -# - Scale deployments with multiple replicas and fractional GPU usage. -# - Configure autoscaling to handle variable workloads. -# - Use dynamic request batching to maximize throughput. -# - Compose multiple deployments into machine learning pipelines. -# - Send concurrent requests efficiently with asynchronous HTTP. -# - Integrate with FastAPI for production-ready APIs. +# - Deploy PyTorch models using Ray Serve with production best practices. +# - Enable **dynamic request batching** to optimize performance. +# - Configure **autoscaling** to handle traffic spikes. +# - Test the service with concurrent asynchronous requests. +# +# Further reading +# --------------- # -# Ray Serve provides a flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service. +# Ray Serve has more production features that are out of scope for this +# tutorial, but are worth checking out: # -# ## Next steps +# - Specialized **LLM serving APIs** that handles complexities like +# managing KV caches and continuous batching. +# - **Model multiplexing** to dynamically load and serve many different +# models (e.g., per-user fine-tuned models) on a single deployment. +# - **Composed Deployments** to orchestrate multiple deployments into a +# single application. # -# - For more information on Ray Serve, read the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html). -# - Learn about [Ray Distributed](https://docs.ray.io/en/latest/ray-overview.html), the distributed computing framework that powers Ray Serve. +# For more information, see the `Ray Serve +# documentation `__ and +# `Ray Serve +# examples `__. From 035d4f1722ca079194be090c14af83d97cfaa4fd Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 11 Dec 2025 22:30:36 -0800 Subject: [PATCH 09/18] add tutorial to index --- ecosystem.rst | 8 ++++++++ index.rst | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/ecosystem.rst b/ecosystem.rst index da2a926851a..ab865590423 100644 --- a/ecosystem.rst +++ b/ecosystem.rst @@ -35,6 +35,13 @@ to production deployment. :link: beginner/hyperparameter_tuning_tutorial.html :tags: Model-Optimization,Best-Practice,Ecosystem +.. customcarditem:: + :header: Serving PyTorch Tutorial + :card_description: Deploy and scale a PyTorch model with Ray Serve. + :image: _static/img/ray-serve.png + :link: beginner/ray_serve_tutorial.html + :tags: Production,Best-Practice,Ray-Distributed,Ecosystem + .. customcarditem:: :header: Multi-Objective Neural Architecture Search with Ax :card_description: Learn how to use Ax to search over architectures find optimal tradeoffs between accuracy and latency. @@ -65,6 +72,7 @@ to production deployment. 
:hidden: beginner/hyperparameter_tuning_tutorial + beginner/ray_serve_tutorial intermediate/ax_multiobjective_nas_tutorial intermediate/tensorboard_profiler_tutorial intermediate/realtime_rpi diff --git a/index.rst b/index.rst index 5a5e80abfbb..b3ba2cc00ab 100644 --- a/index.rst +++ b/index.rst @@ -323,6 +323,13 @@ Welcome to PyTorch Tutorials .. Deploying PyTorch Models in Production +.. customcarditem:: + :header: Serving PyTorch Tutorial + :card_description: Deploy and scale a PyTorch model with Ray Serve. + :image: _static/img/ray-tune.png + :link: beginner/ray_serve_tutorial.html + :tags: Production,Best-Practice,Ray-Distributed,Ecosystem + .. customcarditem:: :header: Profiling PyTorch :card_description: Learn how to profile a PyTorch application From 1f3e44e37afc1a8b63f6740530a12eca241eeda9 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 11 Dec 2025 22:37:04 -0800 Subject: [PATCH 10/18] unstage ipynb and md --- beginner_source/ray_serve_tutorial.ipynb | 710 ----------------------- beginner_source/ray_serve_tutorial.md | 501 ---------------- 2 files changed, 1211 deletions(-) delete mode 100644 beginner_source/ray_serve_tutorial.ipynb delete mode 100644 beginner_source/ray_serve_tutorial.md diff --git a/beginner_source/ray_serve_tutorial.ipynb b/beginner_source/ray_serve_tutorial.ipynb deleted file mode 100644 index ee6610f4b7b..00000000000 --- a/beginner_source/ray_serve_tutorial.ipynb +++ /dev/null @@ -1,710 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "682e224e-1bb9-470c-b363-386ede0785a4", - "metadata": {}, - "source": [ - "# Serve PyTorch models at scale with Ray Serve\n", - "\n", - "**Author:** [Ricardo Decal](https://github.com/crypdick)\n", - "\n", - "This tutorial introduces [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), a scalable framework for serving machine learning models in production. Ray Serve is part of [Ray Distributed](https://pytorch.org/projects/ray/), an open-source PyTorch Foundation project.\n", - "\n", - "## Production-ready features\n", - "\n", - "Ray Serve provides the following production-ready features:\n", - "\n", - "- Handle thousands of concurrent requests efficiently with dynamic request batching.\n", - "- Autoscale endpoints in response to variable traffic.\n", - "- Buffer incoming requests when the endpoints are busy.\n", - "- Compose multiple models along with business logic into a complete machine learning application.\n", - "- Gracefully heal the deployment when nodes are lost.\n", - "- Handle multi-node and multi-GPU serving.\n", - "- Flexibly allocate heterogeneous compute resources and fractional GPUs.\n", - "- Use [LLM-specific features](https://docs.ray.io/en/latest/serve/llm/index.html) such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more.\n", - "\n", - "
\n", - " \n", - "Roadmap for this notebook:\n", - "\n", - "
    \n", - "
  • Part 1: Deploy a simple PyTorch model.
  • \n", - "
  • Part 2: Scale with multiple replicas.
  • \n", - "
  • Part 3: Configure autoscaling.
  • \n", - "
  • Part 4: Use dynamic request batching.
  • \n", - "
  • Part 5: Compose multiple deployments.
  • \n", - "
  • Part 6: Integrate with FastAPI.
  • \n", - "
\n", - "
\n", - "\n", - "## Prerequisites\n", - "\n", - "This tutorial assumes basic familiarity with PyTorch and Python. Install Ray Serve:\n", - "\n", - "```bash\n", - "pip install \"ray[serve]\" torch torchvision\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "1060aea0", - "metadata": {}, - "source": [ - "## Set up environment\n", - "\n", - "Start by importing the required libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "099b7710", - "metadata": {}, - "outputs": [], - "source": [ - "import asyncio\n", - "import json\n", - "import time\n", - "from typing import Any\n", - "\n", - "import aiohttp\n", - "import numpy as np\n", - "import requests\n", - "import torch\n", - "import torch.nn as nn\n", - "from ray import serve\n", - "from starlette.requests import Request\n", - "from torchvision import transforms" - ] - }, - { - "cell_type": "markdown", - "id": "7250cc03-e52c-4e30-a262-8d8e0a5a0837", - "metadata": {}, - "source": [ - "## Part 1: Deploy a simple PyTorch model\n", - "\n", - "Use a simple convolutional neural network for MNIST digit classification. First, define the model architecture." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14fb17a6-a71c-4a11-8ea8-b1b350a5fa1c", - "metadata": {}, - "outputs": [], - "source": [ - "class MNISTNet(nn.Module):\n", - " \"\"\"Convolutional neural network for MNIST digit classification.\"\"\"\n", - " def __init__(self):\n", - " super(MNISTNet, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 32, 3, 1)\n", - " self.conv2 = nn.Conv2d(32, 64, 3, 1)\n", - " self.dropout1 = nn.Dropout(0.25)\n", - " self.dropout2 = nn.Dropout(0.5)\n", - " self.fc1 = nn.Linear(9216, 128)\n", - " self.fc2 = nn.Linear(128, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.conv1(x)\n", - " x = nn.functional.relu(x)\n", - " x = self.conv2(x)\n", - " x = nn.functional.relu(x)\n", - " x = nn.functional.max_pool2d(x, 2)\n", - " x = self.dropout1(x)\n", - " x = torch.flatten(x, 1)\n", - " x = self.fc1(x)\n", - " x = nn.functional.relu(x)\n", - " x = self.dropout2(x)\n", - " x = self.fc2(x)\n", - " return nn.functional.log_softmax(x, dim=1)" - ] - }, - { - "cell_type": "markdown", - "id": "e1a79961", - "metadata": {}, - "source": [ - "### Create a Ray Serve deployment\n", - "\n", - "To deploy this model with Ray Serve, wrap it in a class and add the `@serve.deployment` decorator. The deployment handles incoming HTTP requests and runs inference." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c68888dd", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment\n", - "class MNISTClassifier:\n", - " def __init__(self, model_path: str = None):\n", - " \"\"\"Initialize the model and optionally load weights from ``model_path``.\"\"\"\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " self.model = MNISTNet().to(self.device)\n", - " \n", - " if model_path:\n", - " self.model.load_state_dict(torch.load(model_path, map_location=self.device))\n", - " \n", - " self.model.eval()\n", - "\n", - " async def __call__(self, request: Request) -> dict[str, Any]:\n", - " \"\"\"Handle an incoming HTTP request.\"\"\"\n", - " # Parse the JSON request body.\n", - " data = await request.json()\n", - " batch = json.loads(data)\n", - " \n", - " # Run inference.\n", - " return await self.predict(batch)\n", - " \n", - " async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]:\n", - " \"\"\"Run inference on a batch of images.\"\"\"\n", - " # Convert NumPy array to tensor.\n", - " images = torch.tensor(batch[\"image\"], dtype=torch.float32).to(self.device)\n", - " \n", - " # Run inference.\n", - " with torch.no_grad():\n", - " logits = self.model(images)\n", - " predictions = torch.argmax(logits, dim=1).cpu().numpy()\n", - " \n", - " return {\n", - " \"predicted_label\": predictions.tolist(),\n", - " \"logits\": logits.cpu().numpy().tolist()\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "2cf85ff1", - "metadata": {}, - "source": [ - "### Run the deployment\n", - "\n", - "Deploy and run the model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df46ddd7", - "metadata": {}, - "outputs": [], - "source": [ - "# Create the deployment (but do not run it yet).\n", - "mnist_app = MNISTClassifier.bind()\n", - "\n", - "# Start the Ray Serve application.\n", - "handle = serve.run(mnist_app, name=\"mnist_classifier\")" - ] - }, - { - "cell_type": "markdown", - "id": "098e8ac4", - "metadata": {}, - "source": [ - "### Test the deployment\n", - "\n", - "Test the deployment with some random data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c0a80e9-c26f-48d2-8985-ef4eab4dc580", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a batch of random images (MNIST format: 28x28 grayscale).\n", - "images = np.random.rand(2, 1, 28, 28).tolist()\n", - "json_request = json.dumps({\"image\": images})\n", - "\n", - "# Send HTTP request.\n", - "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", - "print(f\"Predictions: {response.json()['predicted_label']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "7cd2cb01", - "metadata": {}, - "source": [ - "## Part 2: Scale with multiple replicas\n", - "\n", - "One of Ray Serve's key features is the ability to scale the deployment across multiple replicas. 
Each replica is an independent instance of the model that can handle requests in parallel.\n", - "\n", - "### Configure replicas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "230f9ff2", - "metadata": {}, - "outputs": [], - "source": [ - "# Create deployment with 4 replicas.\n", - "mnist_app = MNISTClassifier.options(\n", - " num_replicas=4,\n", - " ray_actor_options={\"num_gpus\": 0.25} # Each replica uses one quarter of a GPU.\n", - ").bind()\n", - "\n", - "# Update the running deployment.\n", - "handle = serve.run(mnist_app, name=\"mnist_classifier\")" - ] - }, - { - "cell_type": "markdown", - "id": "b35a8d83", - "metadata": {}, - "source": [ - "This configuration creates 4 replicas, each using 25% of a GPU. This configuration allows you to serve 4 models on a single GPU and maximize resource utilization for small models.\n", - "\n", - "## Part 3: Configure autoscaling\n", - "\n", - "Ray Serve can automatically scale the number of replicas based on incoming traffic. This behavior is useful for handling variable workloads without over-provisioning resources.\n", - "\n", - "### Configure autoscaling" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e356f749", - "metadata": {}, - "outputs": [], - "source": [ - "mnist_app = MNISTClassifier.options(\n", - " autoscaling_config={\n", - " \"target_ongoing_requests\": 10, # Target 10 requests per replica.\n", - " \"min_replicas\": 0, # Scale down to 0 when idle.\n", - " \"max_replicas\": 10, # Scale up to 10 replicas maximum.\n", - " \"upscale_delay_s\": 5, # Wait 5 seconds before scaling up.\n", - " \"downscale_delay_s\": 30, # Wait 30 seconds before scaling down.\n", - " },\n", - " ray_actor_options={\"num_gpus\": 0.1}\n", - ").bind()\n", - "\n", - "handle = serve.run(mnist_app, name=\"mnist_classifier\")" - ] - }, - { - "cell_type": "markdown", - "id": "1ae8a244", - "metadata": {}, - "source": [ - "With this configuration, Ray Serve:\n", - "\n", - "- Starts with 0 replicas (no resources used when idle).\n", - "- Scales up when requests arrive (targeting 10 concurrent requests per replica).\n", - "- Scales down after 30 seconds of low traffic.\n", - "\n", - "### Test autoscaling with concurrent requests\n", - "\n", - "To see autoscaling in action, send many concurrent requests. Using `aiohttp`, you can send requests asynchronously." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e5594d5", - "metadata": {}, - "outputs": [], - "source": [ - "async def send_request(session, url, data):\n", - " \"\"\"Send a single asynchronous HTTP request.\"\"\"\n", - " async with session.post(url, json=data) as response:\n", - " return await response.json()\n", - "\n", - "async def send_concurrent_requests(num_requests=100):\n", - " \"\"\"Send many requests concurrently.\"\"\"\n", - " url = \"http://localhost:8000/\"\n", - " \n", - " # Create sample data.\n", - " images = np.random.rand(10, 1, 28, 28).tolist()\n", - " json_request = json.dumps({\"image\": images})\n", - " \n", - " # Send all requests concurrently.\n", - " async with aiohttp.ClientSession() as session:\n", - " tasks = [\n", - " send_request(session, url, json_request)\n", - " for _ in range(num_requests)\n", - " ]\n", - " responses = await asyncio.gather(*tasks)\n", - " \n", - " return responses\n", - "\n", - "# Run the concurrent requests.\n", - "start_time = time.time()\n", - "responses = asyncio.run(send_concurrent_requests(100))\n", - "elapsed = time.time() - start_time\n", - "\n", - "print(f\"Processed {len(responses)} requests in {elapsed:.2f} seconds\")\n", - "print(f\"Throughput: {len(responses)/elapsed:.2f} requests/second\")" - ] - }, - { - "cell_type": "markdown", - "id": "e040d6ac", - "metadata": {}, - "source": [ - "This approach allows Ray Serve to buffer and batch process the requests efficiently and automatically scale replicas as needed.\n", - "\n", - "## Part 4: Use dynamic request batching\n", - "\n", - "Dynamic request batching is an optimization that groups multiple incoming requests and processes them together to maximize GPU utilization.\n", - "\n", - "### Implement batching" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebea6c15", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment\n", - "class BatchedMNISTClassifier:\n", - " def __init__(self, model_path: str = None):\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " self.model = MNISTNet().to(self.device)\n", - " \n", - " if model_path:\n", - " self.model.load_state_dict(torch.load(model_path, map_location=self.device))\n", - " \n", - " self.model.eval()\n", - "\n", - " @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1)\n", - " async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]:\n", - " \"\"\"Process a batch of images together.\"\"\"\n", - " print(f\"Processing batch of size: {len(images)}\")\n", - " \n", - " # Stack all images into a single tensor.\n", - " batch_tensor = torch.tensor(\n", - " np.stack(images), \n", - " dtype=torch.float32\n", - " ).to(self.device)\n", - " \n", - " # Run inference on the entire batch.\n", - " with torch.no_grad():\n", - " logits = self.model(batch_tensor)\n", - " predictions = torch.argmax(logits, dim=1).cpu().numpy()\n", - " \n", - " # Return individual results.\n", - " return [\n", - " {\n", - " \"predicted_label\": int(pred),\n", - " \"logits\": logit.cpu().numpy().tolist()\n", - " }\n", - " for pred, logit in zip(predictions, logits)\n", - " ]\n", - "\n", - " async def __call__(self, request: Request) -> dict[str, Any]:\n", - " data = await request.json()\n", - " batch = json.loads(data)\n", - " \n", - " # Extract single image and pass it to the batch handler.\n", - " image = np.array(batch[\"image\"])\n", - " result = await self.predict_batch(image)\n", - " \n", - " return result" - ] - }, - { - "cell_type": 
"markdown", - "id": "75be6e25", - "metadata": {}, - "source": [ - "The `@serve.batch` decorator automatically:\n", - "\n", - "- Collects up to `max_batch_size` requests.\n", - "- Waits up to `batch_wait_timeout_s` seconds for more requests.\n", - "- Processes them together in a single forward pass.\n", - "\n", - "This behavior can improve throughput, especially for GPU inference.\n", - "\n", - "## Part 5: Compose multiple deployments\n", - "\n", - "Real-world machine learning applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline.\n", - "\n", - "### Create a preprocessing deployment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67670984", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment\n", - "class ImagePreprocessor:\n", - " def __init__(self):\n", - " \"\"\"Initialize preprocessing transforms.\"\"\"\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and standard deviation.\n", - " ])\n", - " \n", - " async def preprocess(self, images: list[np.ndarray]) -> np.ndarray:\n", - " \"\"\"Preprocess a batch of images.\"\"\"\n", - " processed = []\n", - " for img in images:\n", - " # Convert to PIL-compatible format if needed.\n", - " if img.dtype != np.uint8:\n", - " img = (img * 255).astype(np.uint8)\n", - " \n", - " # Apply transforms.\n", - " tensor = self.transform(img)\n", - " processed.append(tensor.numpy())\n", - " \n", - " return np.stack(processed)" - ] - }, - { - "cell_type": "markdown", - "id": "92daf899", - "metadata": {}, - "source": [ - "### Create an ingress deployment\n", - "\n", - "The ingress deployment orchestrates the pipeline and routes requests through preprocessing and then to the model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88340028", - "metadata": {}, - "outputs": [], - "source": [ - "@serve.deployment\n", - "class MLPipeline:\n", - " def __init__(self, preprocessor, classifier):\n", - " \"\"\"Initialize with handles to other deployments.\"\"\"\n", - " self.preprocessor = preprocessor\n", - " self.classifier = classifier\n", - " \n", - " async def __call__(self, request: Request) -> dict[str, Any]:\n", - " \"\"\"Handle end-to-end inference.\"\"\"\n", - " # Parse request.\n", - " data = await request.json()\n", - " batch = json.loads(data)\n", - " images = batch[\"image\"]\n", - " \n", - " # Step 1: Preprocess.\n", - " processed_images = await self.preprocessor.preprocess.remote(images)\n", - " \n", - " # Step 2: Run inference.\n", - " result = await self.classifier.predict.remote({\n", - " \"image\": processed_images.tolist()\n", - " })\n", - " \n", - " return result" - ] - }, - { - "cell_type": "markdown", - "id": "b0e44763", - "metadata": {}, - "source": [ - "### Deploy the pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "affcac11", - "metadata": {}, - "outputs": [], - "source": [ - "# Build the application graph.\n", - "preprocessor = ImagePreprocessor.bind()\n", - "classifier = MNISTClassifier.options(\n", - " num_replicas=2,\n", - " ray_actor_options={\"num_gpus\": 0.5}\n", - ").bind()\n", - "\n", - "pipeline = MLPipeline.bind(\n", - " preprocessor=preprocessor,\n", - " classifier=classifier\n", - ")\n", - "\n", - "# Deploy the entire pipeline.\n", - "handle = serve.run(pipeline, name=\"ml_pipeline\")" - ] - }, - { - "cell_type": "markdown", - "id": "aa81a51f", - "metadata": {}, - "source": [ - "When you send a request to the pipeline, the request automatically flows through preprocessing and inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d084ee7", - "metadata": {}, - "outputs": [], - "source": [ - "# Send request to the pipeline.\n", - "images = [np.random.rand(28, 28) for _ in range(5)]\n", - "json_request = json.dumps({\"image\": images})\n", - "\n", - "response = requests.post(\"http://localhost:8000/\", json=json_request)\n", - "print(response.json())" - ] - }, - { - "cell_type": "markdown", - "id": "1fbe8773", - "metadata": {}, - "source": [ - "## Part 6: Integrate with FastAPI\n", - "\n", - "Ray Serve integrates with FastAPI and gives you access to:\n", - "\n", - "- HTTP routing and path parameters.\n", - "- Request validation with Pydantic models.\n", - "- Automatic OpenAPI documentation." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d163431", - "metadata": {}, - "outputs": [], - "source": [ - "from fastapi import FastAPI\n", - "from pydantic import BaseModel\n", - "\n", - "app = FastAPI()\n", - "\n", - "class PredictionRequest(BaseModel):\n", - " image: list[list[list[float]]] # Batch of images.\n", - "\n", - "class PredictionResponse(BaseModel):\n", - " predicted_label: list[int]\n", - "\n", - "@serve.deployment\n", - "@serve.ingress(app)\n", - "class FastAPIMNISTService:\n", - " def __init__(self):\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - " self.model = MNISTNet().to(self.device)\n", - " self.model.eval()\n", - " \n", - " @app.post(\"/predict\", response_model=PredictionResponse)\n", - " async def predict(self, request: PredictionRequest):\n", - " \"\"\"Predict a digit from an image.\"\"\"\n", - " images = torch.tensor(\n", - " request.image, \n", - " dtype=torch.float32\n", - " ).to(self.device)\n", - " \n", - " with torch.no_grad():\n", - " logits = self.model(images)\n", - " predictions = torch.argmax(logits, dim=1).cpu().numpy()\n", - " \n", - " return PredictionResponse(predicted_label=predictions.tolist())\n", - " \n", - " @app.get(\"/health\")\n", - " async def health(self):\n", - " \"\"\"Return health status.\"\"\"\n", - " return {\"status\": \"healthy\"}\n", - "\n", - "# Deploy with FastAPI.\n", - "fastapi_app = FastAPIMNISTService.bind()\n", - "handle = serve.run(fastapi_app, name=\"fastapi_mnist\")" - ] - }, - { - "cell_type": "markdown", - "id": "a3a31b87", - "metadata": {}, - "source": [ - "After deploying, you can:\n", - "\n", - "- Visit `http://localhost:8000/docs` for interactive API documentation.\n", - "- Use the `/predict` endpoint for inference.\n", - "- Use the `/health` endpoint for health checks." - ] - }, - { - "cell_type": "markdown", - "id": "5e2af689", - "metadata": {}, - "source": [ - "## Clean up resources\n", - "\n", - "When you finish, shut down the Ray Serve application." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0e8131d", - "metadata": {}, - "outputs": [], - "source": [ - "serve.shutdown()" - ] - }, - { - "cell_type": "markdown", - "id": "d59f4a09", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "In this tutorial, you learned how to:\n", - "\n", - "- Deploy PyTorch models as web services with Ray Serve.\n", - "- Scale deployments with multiple replicas and fractional GPU usage.\n", - "- Configure autoscaling to handle variable workloads.\n", - "- Use dynamic request batching to maximize throughput.\n", - "- Compose multiple deployments into machine learning pipelines.\n", - "- Send concurrent requests efficiently with asynchronous HTTP.\n", - "- Integrate with FastAPI for production-ready APIs.\n", - "\n", - "Ray Serve provides a flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service.\n", - "\n", - "## Next steps\n", - "\n", - "- For more information on Ray Serve, read the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html).\n", - "- Learn about [Ray Distributed](https://docs.ray.io/en/latest/ray-overview.html), the distributed computing framework that powers Ray Serve." 
- ] - } - ], - "metadata": { - "jupytext": { - "default_lexer": "ipython3", - "formats": "ipynb,md,py:sphinx" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/beginner_source/ray_serve_tutorial.md b/beginner_source/ray_serve_tutorial.md deleted file mode 100644 index 56f7f48e97b..00000000000 --- a/beginner_source/ray_serve_tutorial.md +++ /dev/null @@ -1,501 +0,0 @@ ---- -jupyter: - jupytext: - default_lexer: ipython3 - formats: ipynb,md,py:sphinx - text_representation: - extension: .md - format_name: markdown - format_version: '1.3' - jupytext_version: 1.18.1 - kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - - -# Serve PyTorch models at scale with Ray Serve - -**Author:** [Ricardo Decal](https://github.com/crypdick) - -This tutorial introduces [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), a scalable framework for serving machine learning models in production. Ray Serve is part of [Ray Distributed](https://pytorch.org/projects/ray/), an open-source PyTorch Foundation project. - -## Production-ready features - -Ray Serve provides the following production-ready features: - -- Handle thousands of concurrent requests efficiently with dynamic request batching. -- Autoscale endpoints in response to variable traffic. -- Buffer incoming requests when the endpoints are busy. -- Compose multiple models along with business logic into a complete machine learning application. -- Gracefully heal the deployment when nodes are lost. -- Handle multi-node and multi-GPU serving. -- Flexibly allocate heterogeneous compute resources and fractional GPUs. -- Use [LLM-specific features](https://docs.ray.io/en/latest/serve/llm/index.html) such as response streaming, LoRA multiplexing, prefill-decode disaggregation, and more. - -
-
-Roadmap for this notebook:
-
-  • Part 1: Deploy a simple PyTorch model.
-  • Part 2: Scale with multiple replicas.
-  • Part 3: Configure autoscaling.
-  • Part 4: Use dynamic request batching.
-  • Part 5: Compose multiple deployments.
-  • Part 6: Integrate with FastAPI.
-
- -## Prerequisites - -This tutorial assumes basic familiarity with PyTorch and Python. Install Ray Serve: - -```bash -pip install "ray[serve]" torch torchvision -``` - - -## Set up environment - -Start by importing the required libraries. - -```python -import asyncio -import json -import time -from typing import Any - -import aiohttp -import numpy as np -import requests -import torch -import torch.nn as nn -from ray import serve -from starlette.requests import Request -from torchvision import transforms -``` - -## Part 1: Deploy a simple PyTorch model - -Use a simple convolutional neural network for MNIST digit classification. First, define the model architecture. - -```python -class MNISTNet(nn.Module): - """Convolutional neural network for MNIST digit classification.""" - def __init__(self): - super(MNISTNet, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = nn.functional.relu(x) - x = self.conv2(x) - x = nn.functional.relu(x) - x = nn.functional.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = nn.functional.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - return nn.functional.log_softmax(x, dim=1) -``` - -### Create a Ray Serve deployment - -To deploy this model with Ray Serve, wrap it in a class and add the `@serve.deployment` decorator. The deployment handles incoming HTTP requests and runs inference. - -```python -@serve.deployment -class MNISTClassifier: - def __init__(self, model_path: str = None): - """Initialize the model and optionally load weights from ``model_path``.""" - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - - if model_path: - self.model.load_state_dict(torch.load(model_path, map_location=self.device)) - - self.model.eval() - - async def __call__(self, request: Request) -> dict[str, Any]: - """Handle an incoming HTTP request.""" - # Parse the JSON request body. - data = await request.json() - batch = json.loads(data) - - # Run inference. - return await self.predict(batch) - - async def predict(self, batch: dict[str, np.ndarray]) -> dict[str, Any]: - """Run inference on a batch of images.""" - # Convert NumPy array to tensor. - images = torch.tensor(batch["image"], dtype=torch.float32).to(self.device) - - # Run inference. - with torch.no_grad(): - logits = self.model(images) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - return { - "predicted_label": predictions.tolist(), - "logits": logits.cpu().numpy().tolist() - } -``` - -### Run the deployment - -Deploy and run the model. - -```python -# Create the deployment (but do not run it yet). -mnist_app = MNISTClassifier.bind() - -# Start the Ray Serve application. -handle = serve.run(mnist_app, name="mnist_classifier") -``` - -### Test the deployment - -Test the deployment with some random data. - -```python -# Create a batch of random images (MNIST format: 28x28 grayscale). -images = np.random.rand(2, 1, 28, 28).tolist() -json_request = json.dumps({"image": images}) - -# Send HTTP request. -response = requests.post("http://localhost:8000/", json=json_request) -print(f"Predictions: {response.json()['predicted_label']}") -``` - -## Part 2: Scale with multiple replicas - -One of Ray Serve's key features is the ability to scale the deployment across multiple replicas. 
Each replica is an independent instance of the model that can handle requests in parallel. - -### Configure replicas - -```python -# Create deployment with 4 replicas. -mnist_app = MNISTClassifier.options( - num_replicas=4, - ray_actor_options={"num_gpus": 0.25} # Each replica uses one quarter of a GPU. -).bind() - -# Update the running deployment. -handle = serve.run(mnist_app, name="mnist_classifier") -``` - -This configuration creates 4 replicas, each using 25% of a GPU. This configuration allows you to serve 4 models on a single GPU and maximize resource utilization for small models. - -## Part 3: Configure autoscaling - -Ray Serve can automatically scale the number of replicas based on incoming traffic. This behavior is useful for handling variable workloads without over-provisioning resources. - -### Configure autoscaling - -```python -mnist_app = MNISTClassifier.options( - autoscaling_config={ - "target_ongoing_requests": 10, # Target 10 requests per replica. - "min_replicas": 0, # Scale down to 0 when idle. - "max_replicas": 10, # Scale up to 10 replicas maximum. - "upscale_delay_s": 5, # Wait 5 seconds before scaling up. - "downscale_delay_s": 30, # Wait 30 seconds before scaling down. - }, - ray_actor_options={"num_gpus": 0.1} -).bind() - -handle = serve.run(mnist_app, name="mnist_classifier") -``` - -With this configuration, Ray Serve: - -- Starts with 0 replicas (no resources used when idle). -- Scales up when requests arrive (targeting 10 concurrent requests per replica). -- Scales down after 30 seconds of low traffic. - -### Test autoscaling with concurrent requests - -To see autoscaling in action, send many concurrent requests. Using `aiohttp`, you can send requests asynchronously. - -```python -async def send_request(session, url, data): - """Send a single asynchronous HTTP request.""" - async with session.post(url, json=data) as response: - return await response.json() - -async def send_concurrent_requests(num_requests=100): - """Send many requests concurrently.""" - url = "http://localhost:8000/" - - # Create sample data. - images = np.random.rand(10, 1, 28, 28).tolist() - json_request = json.dumps({"image": images}) - - # Send all requests concurrently. - async with aiohttp.ClientSession() as session: - tasks = [ - send_request(session, url, json_request) - for _ in range(num_requests) - ] - responses = await asyncio.gather(*tasks) - - return responses - -# Run the concurrent requests. -start_time = time.time() -responses = asyncio.run(send_concurrent_requests(100)) -elapsed = time.time() - start_time - -print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") -print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") -``` - -This approach allows Ray Serve to buffer and batch process the requests efficiently and automatically scale replicas as needed. - -## Part 4: Use dynamic request batching - -Dynamic request batching is an optimization that groups multiple incoming requests and processes them together to maximize GPU utilization. 
- -### Implement batching - -```python -@serve.deployment -class BatchedMNISTClassifier: - def __init__(self, model_path: str = None): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - - if model_path: - self.model.load_state_dict(torch.load(model_path, map_location=self.device)) - - self.model.eval() - - @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1) - async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: - """Process a batch of images together.""" - print(f"Processing batch of size: {len(images)}") - - # Stack all images into a single tensor. - batch_tensor = torch.tensor( - np.stack(images), - dtype=torch.float32 - ).to(self.device) - - # Run inference on the entire batch. - with torch.no_grad(): - logits = self.model(batch_tensor) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - # Return individual results. - return [ - { - "predicted_label": int(pred), - "logits": logit.cpu().numpy().tolist() - } - for pred, logit in zip(predictions, logits) - ] - - async def __call__(self, request: Request) -> dict[str, Any]: - data = await request.json() - batch = json.loads(data) - - # Extract single image and pass it to the batch handler. - image = np.array(batch["image"]) - result = await self.predict_batch(image) - - return result -``` - -The `@serve.batch` decorator automatically: - -- Collects up to `max_batch_size` requests. -- Waits up to `batch_wait_timeout_s` seconds for more requests. -- Processes them together in a single forward pass. - -This behavior can improve throughput, especially for GPU inference. - -## Part 5: Compose multiple deployments - -Real-world machine learning applications often involve multiple steps: preprocessing, inference, and postprocessing. Ray Serve makes it easy to compose multiple deployments into a pipeline. - -### Create a preprocessing deployment - -```python -@serve.deployment -class ImagePreprocessor: - def __init__(self): - """Initialize preprocessing transforms.""" - self.transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and standard deviation. - ]) - - async def preprocess(self, images: list[np.ndarray]) -> np.ndarray: - """Preprocess a batch of images.""" - processed = [] - for img in images: - # Convert to PIL-compatible format if needed. - if img.dtype != np.uint8: - img = (img * 255).astype(np.uint8) - - # Apply transforms. - tensor = self.transform(img) - processed.append(tensor.numpy()) - - return np.stack(processed) -``` - -### Create an ingress deployment - -The ingress deployment orchestrates the pipeline and routes requests through preprocessing and then to the model. - -```python -@serve.deployment -class MLPipeline: - def __init__(self, preprocessor, classifier): - """Initialize with handles to other deployments.""" - self.preprocessor = preprocessor - self.classifier = classifier - - async def __call__(self, request: Request) -> dict[str, Any]: - """Handle end-to-end inference.""" - # Parse request. - data = await request.json() - batch = json.loads(data) - images = batch["image"] - - # Step 1: Preprocess. - processed_images = await self.preprocessor.preprocess.remote(images) - - # Step 2: Run inference. - result = await self.classifier.predict.remote({ - "image": processed_images.tolist() - }) - - return result -``` - -### Deploy the pipeline - -```python -# Build the application graph. 
-preprocessor = ImagePreprocessor.bind() -classifier = MNISTClassifier.options( - num_replicas=2, - ray_actor_options={"num_gpus": 0.5} -).bind() - -pipeline = MLPipeline.bind( - preprocessor=preprocessor, - classifier=classifier -) - -# Deploy the entire pipeline. -handle = serve.run(pipeline, name="ml_pipeline") -``` - -When you send a request to the pipeline, the request automatically flows through preprocessing and inference. - -```python -# Send request to the pipeline. -images = [np.random.rand(28, 28) for _ in range(5)] -json_request = json.dumps({"image": images}) - -response = requests.post("http://localhost:8000/", json=json_request) -print(response.json()) -``` - -## Part 6: Integrate with FastAPI - -Ray Serve integrates with FastAPI and gives you access to: - -- HTTP routing and path parameters. -- Request validation with Pydantic models. -- Automatic OpenAPI documentation. - -```python -from fastapi import FastAPI -from pydantic import BaseModel - -app = FastAPI() - -class PredictionRequest(BaseModel): - image: list[list[list[float]]] # Batch of images. - -class PredictionResponse(BaseModel): - predicted_label: list[int] - -@serve.deployment -@serve.ingress(app) -class FastAPIMNISTService: - def __init__(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - self.model.eval() - - @app.post("/predict", response_model=PredictionResponse) - async def predict(self, request: PredictionRequest): - """Predict a digit from an image.""" - images = torch.tensor( - request.image, - dtype=torch.float32 - ).to(self.device) - - with torch.no_grad(): - logits = self.model(images) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - return PredictionResponse(predicted_label=predictions.tolist()) - - @app.get("/health") - async def health(self): - """Return health status.""" - return {"status": "healthy"} - -# Deploy with FastAPI. -fastapi_app = FastAPIMNISTService.bind() -handle = serve.run(fastapi_app, name="fastapi_mnist") -``` - -After deploying, you can: - -- Visit `http://localhost:8000/docs` for interactive API documentation. -- Use the `/predict` endpoint for inference. -- Use the `/health` endpoint for health checks. - - -## Clean up resources - -When you finish, shut down the Ray Serve application. - -```python -serve.shutdown() -``` - -## Summary - -In this tutorial, you learned how to: - -- Deploy PyTorch models as web services with Ray Serve. -- Scale deployments with multiple replicas and fractional GPU usage. -- Configure autoscaling to handle variable workloads. -- Use dynamic request batching to maximize throughput. -- Compose multiple deployments into machine learning pipelines. -- Send concurrent requests efficiently with asynchronous HTTP. -- Integrate with FastAPI for production-ready APIs. - -Ray Serve provides a flexible framework for serving PyTorch models at scale. Its Python-first API makes it easy to go from a trained model to a production service. - -## Next steps - -- For more information on Ray Serve, read the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html). -- Learn about [Ray Distributed](https://docs.ray.io/en/latest/ray-overview.html), the distributed computing framework that powers Ray Serve. 
From 83e68e7725aa8c827984dae19e5133c9da567d4b Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 00:02:40 -0800 Subject: [PATCH 11/18] tmp checkpt --- beginner_source/ray_serve_tutorial.md | 258 ++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 beginner_source/ray_serve_tutorial.md diff --git a/beginner_source/ray_serve_tutorial.md b/beginner_source/ray_serve_tutorial.md new file mode 100644 index 00000000000..c3fa785db0c --- /dev/null +++ b/beginner_source/ray_serve_tutorial.md @@ -0,0 +1,258 @@ +--- +jupyter: + jupytext: + default_lexer: ipython3 + formats: ipynb,md + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.18.1 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Serve PyTorch models at scale with Ray Serve + +**Author:** [Ricardo Decal](https://github.com/crypdick) + +This tutorial shows how to deploy a PyTorch model using Ray Serve with production-ready features. + +[Ray Serve](https://docs.ray.io/en/latest/serve/index.html) is a scalable framework for serving machine learning models in production built on top of Ray. [Ray](https://docs.ray.io/en/latest/index.html), a project of the PyTorch Foundation, is an open-source unified framework for scaling AI and Python applications. Ray simplifies distributed workloads by handling the complexity of distributed computing. + +In this tutorial, you'll learn how to deploy a PyTorch model with Ray Serve and use its production-ready features. Ray Serve allows you to easily scale your model inference across multiple nodes and GPUs, providing features like dynamic batching, autoscaling, fault tolerance, and observability out of the box. + +## Setup + +Install the dependencies: + +```bash +pip install "ray[serve]" torch torchvision +``` + +Start by importing the required libraries: + +```python +import asyncio +import json +import time +from typing import Any + +from fastapi import FastAPI +from pydantic import BaseModel +import aiohttp +import numpy as np +import torch +import torch.nn as nn +from ray import serve +from torchvision.transforms import v2 +``` + +## Define a PyTorch model + +Define a simple convolutional neural network for MNIST digit classification: + +```python +class MNISTNet(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.fc1 = nn.Linear(9216, 128) + self.dropout2 = nn.Dropout(0.5) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = nn.functional.relu(x) + x = self.conv2(x) + x = nn.functional.relu(x) + x = nn.functional.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = nn.functional.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + return nn.functional.log_softmax(x, dim=1) + +``` + +## Define the Ray Serve deployment + +To deploy this model with Ray Serve, wrap the model in a Python class and decorate it with `@serve.deployment`. + +Processing requests in batches is more efficient than processing requests one by one, especially when using GPUs. Ray Serve provides built-in support for **dynamic request batching**, where individual incoming requests are opportunistically batched. The `@serve.batch` decorator on the `predict_batch` method below enables this. 
+ +```python +app = FastAPI() + +class ImageRequest(BaseModel): # Used for request validation and documentation + image: list[list[float]] | list[list[list[float]]] + +@serve.deployment +@serve.ingress(app) +class MNISTClassifier: + def __init__(self): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = MNISTNet().to(self.device) + self.transform = v2.Compose([ + v2.ToImage(), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.1307], std=[0.3013]), + ]) + + self.model.eval() + + # batch_wait_timeout_s is the maximum time to wait for a full batch. + @serve.batch(max_batch_size=128, batch_wait_timeout_s=0.1) + async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: + # Stack all images into a single tensor. + batch_tensor = torch.cat([ + self.transform(img).unsqueeze(0) + for img in images + ]).to(self.device).float() + + # Run inference on the entire batch. + with torch.no_grad(): + logits = self.model(batch_tensor) + predictions = torch.argmax(logits, dim=1).cpu().numpy() + + # Unbatch the results and preserve their original order. + return [ + { + "predicted_label": int(pred), + "logits": logit.cpu().numpy().tolist() + } + for pred, logit in zip(predictions, logits) + ] + + @app.post("/") + async def handle_request(self, request: ImageRequest): + """Handle an incoming HTTP request using FastAPI. + + Inputs are automatically validated using the Pydantic model. + """ + # Process the single request. + image_array = np.array(request.image) + + # Ray Serve's @serve.batch will automatically batch requests. + result = await self.predict_batch(image_array) + + return result + +``` + +This is a FastAPI app, which gives us batteries-included features like automatic request validation (via Pydantic), OpenAPI-style API documentation, and more. + +### Configure autoscaling and resource allocation + +In production, traffic can vary significantly. Ray Serve's **autoscaling** feature automatically adjusts the number of replicas based on traffic load, ensuring you have enough capacity during peaks while saving resources during quiet periods. + +You can also specify **resource allocation** per replica, such as the number of CPUs or GPUs. Ray Serve handles the orchestration of these resources across your cluster. + +Below is a sample configuration with autoscaling and resource allocation: + +```python +mnist_app = MNISTClassifier.options( + autoscaling_config={ + "target_ongoing_requests": 50, # Target 50 ongoing requests per replica. + "min_replicas": 1, # Keep at least 1 replica alive. + "max_replicas": 5, # Scale up to 5 replicas to maintain target_ongoing_requests. + "upscale_delay_s": 5, # Wait 5s before scaling up. + "downscale_delay_s": 30, # Wait 30s before scaling down. + }, + # Max concurrent requests per replica before queueing. + # If the queue fills the shared cluster memory, future requests are backpressured until memory is freed. + max_ongoing_requests=100, + ray_actor_options={"num_cpus": 1, "num_gpus": 1} +).bind() +``` + +The app is now ready to be deployed. + +## Testing the endpoint with with concurrent requests + +To deploy the app, use the `serve.run` function: + +```python +# Start the Ray Serve application. +handle = serve.run(mnist_app, name="mnist_classifier") +``` + +You should see an output similar to: + +```bash +Started Serve in namespace "serve". 
+Registering autoscaling state for deployment Deployment(name='MNISTClassifier', app='mnist_classifier') +Deploying new version of Deployment(name='MNISTClassifier', app='mnist_classifier') (initial target replicas: 1). +Proxy starting on node ... (HTTP port: 8000). +Got updated endpoints: {}. +Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=None)}. +Started . +Adding 1 replica to Deployment(name='MNISTClassifier', app='mnist_classifier'). +Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=['/', '/docs', '/docs/oauth2-redirect', '/openapi.json', '/redoc'])}. +Application 'mnist_classifier' is ready at http://127.0.0.1:8000/. +``` + +The app is now listening for requests on port 8000. + +To test the batching, you can send many requests concurrently using `aiohttp`. Below is a sample function that sends 2000 concurrent requests to the app: + +```python +async def send_single_request(session, url, data): + async with session.post(url, json=data) as response: + return await response.json() + +async def send_concurrent_requests(num_requests): + image = np.random.rand(28, 28).tolist() + + print(f"Sending {num_requests} concurrent requests...") + async with aiohttp.ClientSession() as session: + tasks = [ + send_single_request(session, url="http://localhost:8000/", data={"image": image}) + for _ in range(num_requests) + ] + responses = await asyncio.gather(*tasks) + + return responses + +# Run the concurrent requests. +start_time = time.time() +responses = asyncio.run(send_concurrent_requests(2000)) +elapsed = time.time() - start_time + +print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") +print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") +``` + +You should see high throughput numbers, confirming that requests are being batched and processed in parallel across the replicas. + +## Monitoring the deployment + +Ray Serve provides built-in monitoring tools to help you track the status and performance of your deployment. +This dashboard lets you view Serving metrics like request throughput, latency, and error rates, as well as cluster status and resource utilization. For more information, see the [Ray Serve monitoring documentation](https://docs.ray.io/en/latest/serve/monitoring.html). + + +## Summary + +In this tutorial, you learned how to: + +- Deploy PyTorch models using Ray Serve with production best practices. +- Enable **dynamic request batching** to optimize performance. +- Configure **autoscaling** to handle traffic spikes. +- Test the service with concurrent asynchronous requests. + + +## Further reading + +Ray Serve has more production features that are out of scope for this tutorial, but are worth checking out: + +* Specialized **LLM serving APIs** that handles complexities like managing KV caches and continuous batching. +* **Model multiplexing** to dynamically load and serve many different models (e.g., per-user fine-tuned models) on a single deployment. +* **Composed Deployments** to orchestrate multiple deployments into a single application. + +For more information, see the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html) and [Ray Serve examples](https://docs.ray.io/en/latest/serve/examples/index.html). 
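
When you finish experimenting, shut the application down so that its replicas and ports are released. The following is a minimal cleanup sketch; it assumes you run it in the same Python session that started the deployment:

```python
from ray import serve

# Stop all Serve applications on the cluster and release their resources.
serve.shutdown()
```
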
From fa5d2e0ebb2b3efd4571ed7f4d54fea825e63def Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 10:44:21 -0800 Subject: [PATCH 12/18] sync --- .../{ray_serve_tutorial.md => serving_tutorial.md} | 9 +++------ .../{ray_serve_tutorial.py => serving_tutorial.py} | 0 2 files changed, 3 insertions(+), 6 deletions(-) rename beginner_source/{ray_serve_tutorial.md => serving_tutorial.md} (96%) rename beginner_source/{ray_serve_tutorial.py => serving_tutorial.py} (100%) diff --git a/beginner_source/ray_serve_tutorial.md b/beginner_source/serving_tutorial.md similarity index 96% rename from beginner_source/ray_serve_tutorial.md rename to beginner_source/serving_tutorial.md index c3fa785db0c..b9c927f3b1a 100644 --- a/beginner_source/ray_serve_tutorial.md +++ b/beginner_source/serving_tutorial.md @@ -36,7 +36,6 @@ Start by importing the required libraries: ```python import asyncio -import json import time from typing import Any @@ -236,7 +235,6 @@ You should see high throughput numbers, confirming that requests are being batch Ray Serve provides built-in monitoring tools to help you track the status and performance of your deployment. This dashboard lets you view Serving metrics like request throughput, latency, and error rates, as well as cluster status and resource utilization. For more information, see the [Ray Serve monitoring documentation](https://docs.ray.io/en/latest/serve/monitoring.html). - ## Summary In this tutorial, you learned how to: @@ -246,13 +244,12 @@ In this tutorial, you learned how to: - Configure **autoscaling** to handle traffic spikes. - Test the service with concurrent asynchronous requests. - ## Further reading Ray Serve has more production features that are out of scope for this tutorial, but are worth checking out: -* Specialized **LLM serving APIs** that handles complexities like managing KV caches and continuous batching. -* **Model multiplexing** to dynamically load and serve many different models (e.g., per-user fine-tuned models) on a single deployment. -* **Composed Deployments** to orchestrate multiple deployments into a single application. +- Specialized **LLM serving APIs** that handles complexities like managing KV caches and continuous batching. +- **Model multiplexing** to dynamically load and serve many different models (e.g., per-user fine-tuned models) on a single deployment. +- **Composed Deployments** to orchestrate multiple deployments into a single application. For more information, see the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html) and [Ray Serve examples](https://docs.ray.io/en/latest/serve/examples/index.html). 
diff --git a/beginner_source/ray_serve_tutorial.py b/beginner_source/serving_tutorial.py similarity index 100% rename from beginner_source/ray_serve_tutorial.py rename to beginner_source/serving_tutorial.py From 700292099a71945b8376a8ee7ce36378f04e6557 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 10:44:43 -0800 Subject: [PATCH 13/18] ignore more data files; ignore intermediates --- .gitignore | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9158514018d..153099c2833 100644 --- a/.gitignore +++ b/.gitignore @@ -8,8 +8,9 @@ prototype /unstable sg_execution_times.rst -#data things +# Datasets _data/ +data/ advanced_source/images/ advanced_source/data/ beginner_source/.data/ @@ -133,3 +134,15 @@ dictionary.dic # linters /.lintbin + +# Intermediate tutorial files +beginner_source/*/*_tutorial.md +beginner_source/*/*_tutorial.ipynb +intermediate_source/*/*_tutorial.md +intermediate_source/*/*_tutorial.ipynb +advanced_source/*/*_tutorial.md +advanced_source/*/*_tutorial.ipynb +recipes_source/*/*_tutorial.md +recipes_source/*/*_tutorial.ipynb +prototype_source/*/*_tutorial.md +prototype_source/*/*_tutorial.ipynb \ No newline at end of file From 3d7a1f0ee43b2fcc7f1488019d28869f3f1e9939 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 11:26:38 -0800 Subject: [PATCH 14/18] ignore more files --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 153099c2833..3ee1e62c82d 100644 --- a/.gitignore +++ b/.gitignore @@ -106,6 +106,7 @@ target/ celerybeat-schedule # dotenv .env +.envrc # virtualenv venv/ @@ -125,9 +126,10 @@ cleanup.sh # PyTorch things *.pt -# VSCode +# IDEs *.vscode .cursor/ +.devtools/ # pyspelling dictionary.dic @@ -145,4 +147,4 @@ advanced_source/*/*_tutorial.ipynb recipes_source/*/*_tutorial.md recipes_source/*/*_tutorial.ipynb prototype_source/*/*_tutorial.md -prototype_source/*/*_tutorial.ipynb \ No newline at end of file +prototype_source/*/*_tutorial.ipynb From 06378e2fefae78f1d8cba93d340cffe8e868498e Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 14:11:47 -0800 Subject: [PATCH 15/18] edit gitignore for symlink --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3ee1e62c82d..a0e9a85a9cf 100644 --- a/.gitignore +++ b/.gitignore @@ -128,7 +128,7 @@ cleanup.sh # IDEs *.vscode -.cursor/ +.cursor .devtools/ # pyspelling From 69dfe1cd4147367943492f8be5255d7dbf8fe18f Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 14:12:04 -0800 Subject: [PATCH 16/18] finalize serving tutorial --- beginner_source/serving_tutorial.py | 172 ++++++++++++++++------------ 1 file changed, 97 insertions(+), 75 deletions(-) diff --git a/beginner_source/serving_tutorial.py b/beginner_source/serving_tutorial.py index 30fa5223422..63c9033d444 100644 --- a/beginner_source/serving_tutorial.py +++ b/beginner_source/serving_tutorial.py @@ -8,29 +8,28 @@ production-ready features. `Ray Serve `__ is a -scalable framework for serving machine learning models in production -built on top of Ray. `Ray `__, -a project of the PyTorch Foundation, is an open-source unified framework -for scaling AI and Python applications. Ray simplifies distributed -workloads by handling the complexity of distributed computing. +scalable framework for serving machine learning models in production. 
+It’s built on top of `Ray `__, +which is a unified framework for scaling AI and Python applications that +simplifies the complexities of distributed computing. Ray is also open +source and part of the PyTorch Foundation. In this tutorial, you’ll learn how to deploy a PyTorch model with Ray -Serve and use its production-ready features. Ray Serve allows you to -easily scale your model inference across multiple nodes and GPUs, -providing features like dynamic batching, autoscaling, fault tolerance, -and observability out of the box. +Serve and use its production-ready features. Ray Serve lets you scale +your model inference across thousands of nodes and GPUs, and it provides +features like dynamic batching, autoscaling, fault tolerance, and model +multiplexing. Setup ----- -Install the dependencies: - -.. code-block:: bash - - pip install "ray[serve]" torch torchvision +To install the dependencies: """ +# %%bash +# pip install "ray[serve]" torch torchvision + ###################################################################### # Start by importing the required libraries: @@ -51,8 +50,8 @@ # Define a PyTorch model # ---------------------- # -# We will define a simple convolutional neural network for MNIST digit -# classification: +# Define a simple convolutional neural network for the MNIST digit +# classification dataset: class MNISTNet(nn.Module): def __init__(self): @@ -78,7 +77,6 @@ def forward(self, x): x = self.fc2(x) return nn.functional.log_softmax(x, dim=1) - ###################################################################### # Define the Ray Serve deployment # ------------------------------- @@ -89,13 +87,13 @@ def forward(self, x): # Processing requests in batches is more efficient than processing # requests one by one, especially when using GPUs. Ray Serve provides # built-in support for **dynamic request batching**, where individual -# incoming requests are opportunistically batched. The ``@serve.batch`` -# decorator on the ``predict_batch`` method below enables this. +# incoming requests are opportunistically batched. Enable dynamic batching +# using the ``@serve.batch`` decorator as shown in the following code: app = FastAPI() -class ImageRequest(BaseModel): # Used for request validation and documentation - image: list[list[float]] | list[list[list[float]]] +class ImageRequest(BaseModel): # Used for request validation and generating API documentation + image: list[list[float]] | list[list[list[float]]] # 2D or 3D array @serve.deployment @serve.ingress(app) @@ -103,15 +101,18 @@ class MNISTClassifier: def __init__(self): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model = MNISTNet().to(self.device) + # Define the transformation pipeline for the input images. self.transform = v2.Compose([ v2.ToImage(), v2.ToDtype(torch.float32, scale=True), + # Mean and standard deviation of the MNIST training subset. v2.Normalize(mean=[0.1307], std=[0.3013]), ]) self.model.eval() - # batch_wait_timeout_s is the maximum time to wait for a full batch. + # batch_wait_timeout_s is the maximum time to wait for a full batch, + # trading off latency for throughput. @serve.batch(max_batch_size=128, batch_wait_timeout_s=0.1) async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: # Stack all images into a single tensor. @@ -120,7 +121,7 @@ async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: for img in images ]).to(self.device).float() - # Run inference on the entire batch. 
+ # Single forward pass on the entire batch at once. with torch.no_grad(): logits = self.model(batch_tensor) predictions = torch.argmax(logits, dim=1).cpu().numpy() @@ -143,51 +144,60 @@ async def handle_request(self, request: ImageRequest): # Process the single request. image_array = np.array(request.image) - # Ray Serve's @serve.batch will automatically batch requests. + # Ray Serve's @serve.batch automatically batches requests. result = await self.predict_batch(image_array) return result ###################################################################### -# This is a FastAPI app, which gives us batteries-included features like -# automatic request validation (via Pydantic), OpenAPI-style API -# documentation, and more. +# This is a FastAPI app, which extends Ray Serve with features like +# automatic request validation with Pydantic, auto-generated OpenAPI-style +# API documentation, and more. # # Configure autoscaling and resource allocation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# --------------------------------------------- # # In production, traffic can vary significantly. Ray Serve’s # **autoscaling** feature automatically adjusts the number of replicas # based on traffic load, ensuring you have enough capacity during peaks -# while saving resources during quiet periods. +# while saving resources during quiet periods. Ray Serve scales to very +# large deployments with thousands of nodes and replicas. # # You can also specify **resource allocation** per replica, such as the # number of CPUs or GPUs. Ray Serve handles the orchestration of these -# resources across your cluster. +# resources across your cluster. Ray also supports `fractional +# GPUs `__, +# allowing multiple replicas to share a single GPU when models are small +# enough to fit in memory together. # -# Below is a sample configuration with autoscaling and resource +# The following is a sample configuration with autoscaling and resource # allocation: mnist_app = MNISTClassifier.options( autoscaling_config={ "target_ongoing_requests": 50, # Target 50 ongoing requests per replica. "min_replicas": 1, # Keep at least 1 replica alive. - "max_replicas": 5, # Scale up to 5 replicas to maintain target_ongoing_requests. + "max_replicas": 80, # Scale up to 80 replicas to maintain target_ongoing_requests. "upscale_delay_s": 5, # Wait 5s before scaling up. "downscale_delay_s": 30, # Wait 30s before scaling down. }, # Max concurrent requests per replica before queueing. # If the queue fills the shared cluster memory, future requests are backpressured until memory is freed. max_ongoing_requests=100, - ray_actor_options={"num_cpus": 1, "num_gpus": 1} + ray_actor_options={"num_cpus": 1, "num_gpus": 0.5} # Each replica uses half a GPU. ).bind() ###################################################################### -# The app is now ready to be deployed. +# The app is ready to deploy. Suppose you ran this on a cluster of 10 +# machines, each with 4 GPUs. With ``num_gpus=0.5``, Ray schedules 2 +# replicas per GPU, giving you 80 replicas across the cluster. This +# configuration permits the deployment to elastically scale up to 80 +# replicas as needed to handle traffic spikes and scale back down to 1 +# replica when traffic subsides. 
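+
+######################################################################
+# If you are following along on a single machine without GPUs, you can
+# run a scaled-down variant of the same deployment. The following
+# sketch is illustrative and is not part of the configuration above: it
+# reuses the ``MNISTClassifier`` defined earlier, pins a fixed
+# ``num_replicas`` instead of autoscaling, and sets ``num_gpus=0`` so
+# each replica runs on CPU.
+
+# A minimal CPU-only configuration for local experimentation.
+mnist_app_local = MNISTClassifier.options(
+    num_replicas=2,  # Fixed replica count; no autoscaling_config needed.
+    max_ongoing_requests=100,  # Max concurrent requests per replica before queueing.
+    ray_actor_options={"num_cpus": 1, "num_gpus": 0},  # CPU-only replicas.
+).bind()
+
+######################################################################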
# -# Testing the endpoint with with concurrent requests -# -------------------------------------------------- +# Test the endpoint with concurrent requests +# ------------------------------------------ # # To deploy the app, use the ``serve.run`` function: @@ -195,26 +205,25 @@ async def handle_request(self, request: ImageRequest): handle = serve.run(mnist_app, name="mnist_classifier") ###################################################################### -# You should see an output similar to: -# -# .. code-block:: bash -# -# Started Serve in namespace "serve". -# Registering autoscaling state for deployment Deployment(name='MNISTClassifier', app='mnist_classifier') -# Deploying new version of Deployment(name='MNISTClassifier', app='mnist_classifier') (initial target replicas: 1). -# Proxy starting on node ... (HTTP port: 8000). -# Got updated endpoints: {}. -# Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=None)}. -# Started . -# Adding 1 replica to Deployment(name='MNISTClassifier', app='mnist_classifier'). -# Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=['/', '/docs', '/docs/oauth2-redirect', '/openapi.json', '/redoc'])}. -# Application 'mnist_classifier' is ready at http://127.0.0.1:8000/. +# You will see output similar to: + +# %%bash +# Started Serve in namespace "serve". +# Registering autoscaling state for deployment Deployment(name='MNISTClassifier', app='mnist_classifier') +# Deploying new version of Deployment(name='MNISTClassifier', app='mnist_classifier') (initial target replicas: 1). +# Proxy starting on node ... (HTTP port: 8000). +# Got updated endpoints: {}. +# Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=None)}. +# Started . +# Adding 1 replica to Deployment(name='MNISTClassifier', app='mnist_classifier'). +# Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=['/', '/docs', '/docs/oauth2-redirect', '/openapi.json', '/redoc'])}. +# Application 'mnist_classifier' is ready at http://127.0.0.1:8000/. ###################################################################### # The app is now listening for requests on port 8000. # -# To test the batching, you can send many requests concurrently using -# ``aiohttp``. Below is a sample function that sends 2000 concurrent +# To test the deployment, you can send many requests concurrently using +# ``aiohttp``. The following code demonstrates how to send 2000 concurrent # requests to the app: async def send_single_request(session, url, data): @@ -243,42 +252,55 @@ async def send_concurrent_requests(num_requests): print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") ###################################################################### -# You should see high throughput numbers, confirming that requests are -# being batched and processed in parallel across the replicas. +# Ray Serve automatically buffers and load balances requests across the +# replicas. # -# Monitoring the deployment -# ------------------------- +# Monitor the deployment +# ---------------------- +# +# Monitoring is critical when running large-scale deployments. 
The `Ray +# dashboard `__ +# displays Serve metrics like request throughput, latency, and error +# rates. It also shows cluster resource usage and replica status in real +# time. The dashboard also lets you inspect logs from individual replicas +# across the cluster. # -# Ray Serve provides built-in monitoring tools to help you track the -# status and performance of your deployment. This dashboard lets you view -# Serving metrics like request throughput, latency, and error rates, as -# well as cluster status and resource utilization. For more information, -# see the `Ray Serve monitoring +# For debugging, Ray offers `distributed debugging +# tools `__ +# that let you attach a debugger to running replicas across the cluster. +# For more information, see the `Ray Serve monitoring # documentation `__. - -###################################################################### +# # Summary # ------- # -# In this tutorial, you learned how to: +# In this tutorial, you: # -# - Deploy PyTorch models using Ray Serve with production best practices. -# - Enable **dynamic request batching** to optimize performance. -# - Configure **autoscaling** to handle traffic spikes. -# - Test the service with concurrent asynchronous requests. +# - Deployed a PyTorch model using Ray Serve with production best +# practices. +# - Enabled **dynamic request batching** to optimize performance. +# - Configured **autoscaling** and **fractional GPU allocation** to +# efficiently scale across a cluster. +# - Tested the service with concurrent asynchronous requests. # # Further reading # --------------- # # Ray Serve has more production features that are out of scope for this -# tutorial, but are worth checking out: +# tutorial but are worth checking out: # -# - Specialized **LLM serving APIs** that handles complexities like -# managing KV caches and continuous batching. -# - **Model multiplexing** to dynamically load and serve many different -# models (e.g., per-user fine-tuned models) on a single deployment. -# - **Composed Deployments** to orchestrate multiple deployments into a -# single application. +# - Specialized `large language model (LLM) serving +# APIs `__ that +# handle complexities like managing key-value (KV) caches and continuous +# batching. +# - `Model +# multiplexing `__ +# to dynamically load and serve many different models on the same +# deployment. This is useful for serving per-user fine-tuned models, for +# example. +# - `Composed +# deployments `__ +# to orchestrate multiple deployments into a single app. 
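+#
+# As a taste of composed deployments, the following rough sketch shows
+# the general pattern: one deployment receives handles to others in its
+# constructor and awaits calls on them. ``ImagePreprocessor`` and
+# ``Pipeline`` are hypothetical names used only for illustration, and
+# the snippet assumes the imports and ``MNISTClassifier`` defined
+# earlier in this tutorial; see the guide linked above for the full
+# API.
+#
+# .. code-block:: python
+#
+#     @serve.deployment
+#     class ImagePreprocessor:
+#         def transform(self, image: np.ndarray) -> np.ndarray:
+#             # Placeholder preprocessing step.
+#             return image / 255.0
+#
+#     @serve.deployment
+#     class Pipeline:
+#         def __init__(self, preprocessor, classifier):
+#             # Ray Serve injects handles to the bound child deployments.
+#             self.preprocessor = preprocessor
+#             self.classifier = classifier
+#
+#         async def __call__(self, image: np.ndarray):
+#             cleaned = await self.preprocessor.transform.remote(image)
+#             return await self.classifier.predict_batch.remote(cleaned)
+#
+#     # Bind the child deployments into a single application graph.
+#     composed_app = Pipeline.bind(ImagePreprocessor.bind(), MNISTClassifier.bind())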
# # For more information, see the `Ray Serve # documentation `__ and From 0ac13d638244d22900e0d71af26d4d230115f6c9 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 14:14:29 -0800 Subject: [PATCH 17/18] rm tutorial intermediates --- beginner_source/serving_tutorial.md | 255 ---------------------------- 1 file changed, 255 deletions(-) delete mode 100644 beginner_source/serving_tutorial.md diff --git a/beginner_source/serving_tutorial.md b/beginner_source/serving_tutorial.md deleted file mode 100644 index b9c927f3b1a..00000000000 --- a/beginner_source/serving_tutorial.md +++ /dev/null @@ -1,255 +0,0 @@ ---- -jupyter: - jupytext: - default_lexer: ipython3 - formats: ipynb,md - text_representation: - extension: .md - format_name: markdown - format_version: '1.3' - jupytext_version: 1.18.1 - kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -# Serve PyTorch models at scale with Ray Serve - -**Author:** [Ricardo Decal](https://github.com/crypdick) - -This tutorial shows how to deploy a PyTorch model using Ray Serve with production-ready features. - -[Ray Serve](https://docs.ray.io/en/latest/serve/index.html) is a scalable framework for serving machine learning models in production built on top of Ray. [Ray](https://docs.ray.io/en/latest/index.html), a project of the PyTorch Foundation, is an open-source unified framework for scaling AI and Python applications. Ray simplifies distributed workloads by handling the complexity of distributed computing. - -In this tutorial, you'll learn how to deploy a PyTorch model with Ray Serve and use its production-ready features. Ray Serve allows you to easily scale your model inference across multiple nodes and GPUs, providing features like dynamic batching, autoscaling, fault tolerance, and observability out of the box. - -## Setup - -Install the dependencies: - -```bash -pip install "ray[serve]" torch torchvision -``` - -Start by importing the required libraries: - -```python -import asyncio -import time -from typing import Any - -from fastapi import FastAPI -from pydantic import BaseModel -import aiohttp -import numpy as np -import torch -import torch.nn as nn -from ray import serve -from torchvision.transforms import v2 -``` - -## Define a PyTorch model - -Define a simple convolutional neural network for MNIST digit classification: - -```python -class MNISTNet(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.fc1 = nn.Linear(9216, 128) - self.dropout2 = nn.Dropout(0.5) - self.fc2 = nn.Linear(128, 10) - - def forward(self, x): - x = self.conv1(x) - x = nn.functional.relu(x) - x = self.conv2(x) - x = nn.functional.relu(x) - x = nn.functional.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = nn.functional.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - return nn.functional.log_softmax(x, dim=1) - -``` - -## Define the Ray Serve deployment - -To deploy this model with Ray Serve, wrap the model in a Python class and decorate it with `@serve.deployment`. - -Processing requests in batches is more efficient than processing requests one by one, especially when using GPUs. Ray Serve provides built-in support for **dynamic request batching**, where individual incoming requests are opportunistically batched. The `@serve.batch` decorator on the `predict_batch` method below enables this. 
- -```python -app = FastAPI() - -class ImageRequest(BaseModel): # Used for request validation and documentation - image: list[list[float]] | list[list[list[float]]] - -@serve.deployment -@serve.ingress(app) -class MNISTClassifier: - def __init__(self): - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model = MNISTNet().to(self.device) - self.transform = v2.Compose([ - v2.ToImage(), - v2.ToDtype(torch.float32, scale=True), - v2.Normalize(mean=[0.1307], std=[0.3013]), - ]) - - self.model.eval() - - # batch_wait_timeout_s is the maximum time to wait for a full batch. - @serve.batch(max_batch_size=128, batch_wait_timeout_s=0.1) - async def predict_batch(self, images: list[np.ndarray]) -> list[dict[str, Any]]: - # Stack all images into a single tensor. - batch_tensor = torch.cat([ - self.transform(img).unsqueeze(0) - for img in images - ]).to(self.device).float() - - # Run inference on the entire batch. - with torch.no_grad(): - logits = self.model(batch_tensor) - predictions = torch.argmax(logits, dim=1).cpu().numpy() - - # Unbatch the results and preserve their original order. - return [ - { - "predicted_label": int(pred), - "logits": logit.cpu().numpy().tolist() - } - for pred, logit in zip(predictions, logits) - ] - - @app.post("/") - async def handle_request(self, request: ImageRequest): - """Handle an incoming HTTP request using FastAPI. - - Inputs are automatically validated using the Pydantic model. - """ - # Process the single request. - image_array = np.array(request.image) - - # Ray Serve's @serve.batch will automatically batch requests. - result = await self.predict_batch(image_array) - - return result - -``` - -This is a FastAPI app, which gives us batteries-included features like automatic request validation (via Pydantic), OpenAPI-style API documentation, and more. - -### Configure autoscaling and resource allocation - -In production, traffic can vary significantly. Ray Serve's **autoscaling** feature automatically adjusts the number of replicas based on traffic load, ensuring you have enough capacity during peaks while saving resources during quiet periods. - -You can also specify **resource allocation** per replica, such as the number of CPUs or GPUs. Ray Serve handles the orchestration of these resources across your cluster. - -Below is a sample configuration with autoscaling and resource allocation: - -```python -mnist_app = MNISTClassifier.options( - autoscaling_config={ - "target_ongoing_requests": 50, # Target 50 ongoing requests per replica. - "min_replicas": 1, # Keep at least 1 replica alive. - "max_replicas": 5, # Scale up to 5 replicas to maintain target_ongoing_requests. - "upscale_delay_s": 5, # Wait 5s before scaling up. - "downscale_delay_s": 30, # Wait 30s before scaling down. - }, - # Max concurrent requests per replica before queueing. - # If the queue fills the shared cluster memory, future requests are backpressured until memory is freed. - max_ongoing_requests=100, - ray_actor_options={"num_cpus": 1, "num_gpus": 1} -).bind() -``` - -The app is now ready to be deployed. - -## Testing the endpoint with with concurrent requests - -To deploy the app, use the `serve.run` function: - -```python -# Start the Ray Serve application. -handle = serve.run(mnist_app, name="mnist_classifier") -``` - -You should see an output similar to: - -```bash -Started Serve in namespace "serve". 
-Registering autoscaling state for deployment Deployment(name='MNISTClassifier', app='mnist_classifier') -Deploying new version of Deployment(name='MNISTClassifier', app='mnist_classifier') (initial target replicas: 1). -Proxy starting on node ... (HTTP port: 8000). -Got updated endpoints: {}. -Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=None)}. -Started . -Adding 1 replica to Deployment(name='MNISTClassifier', app='mnist_classifier'). -Got updated endpoints: {Deployment(name='MNISTClassifier', app='mnist_classifier'): EndpointInfo(route='/', app_is_cross_language=False, route_patterns=['/', '/docs', '/docs/oauth2-redirect', '/openapi.json', '/redoc'])}. -Application 'mnist_classifier' is ready at http://127.0.0.1:8000/. -``` - -The app is now listening for requests on port 8000. - -To test the batching, you can send many requests concurrently using `aiohttp`. Below is a sample function that sends 2000 concurrent requests to the app: - -```python -async def send_single_request(session, url, data): - async with session.post(url, json=data) as response: - return await response.json() - -async def send_concurrent_requests(num_requests): - image = np.random.rand(28, 28).tolist() - - print(f"Sending {num_requests} concurrent requests...") - async with aiohttp.ClientSession() as session: - tasks = [ - send_single_request(session, url="http://localhost:8000/", data={"image": image}) - for _ in range(num_requests) - ] - responses = await asyncio.gather(*tasks) - - return responses - -# Run the concurrent requests. -start_time = time.time() -responses = asyncio.run(send_concurrent_requests(2000)) -elapsed = time.time() - start_time - -print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds") -print(f"Throughput: {len(responses)/elapsed:.2f} requests/second") -``` - -You should see high throughput numbers, confirming that requests are being batched and processed in parallel across the replicas. - -## Monitoring the deployment - -Ray Serve provides built-in monitoring tools to help you track the status and performance of your deployment. -This dashboard lets you view Serving metrics like request throughput, latency, and error rates, as well as cluster status and resource utilization. For more information, see the [Ray Serve monitoring documentation](https://docs.ray.io/en/latest/serve/monitoring.html). - -## Summary - -In this tutorial, you learned how to: - -- Deploy PyTorch models using Ray Serve with production best practices. -- Enable **dynamic request batching** to optimize performance. -- Configure **autoscaling** to handle traffic spikes. -- Test the service with concurrent asynchronous requests. - -## Further reading - -Ray Serve has more production features that are out of scope for this tutorial, but are worth checking out: - -- Specialized **LLM serving APIs** that handles complexities like managing KV caches and continuous batching. -- **Model multiplexing** to dynamically load and serve many different models (e.g., per-user fine-tuned models) on a single deployment. -- **Composed Deployments** to orchestrate multiple deployments into a single application. - -For more information, see the [Ray Serve documentation](https://docs.ray.io/en/latest/serve/index.html) and [Ray Serve examples](https://docs.ray.io/en/latest/serve/examples/index.html). 
From aa881f3a7b76463bbe6545e2f1c8d98de02f474b Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 18 Dec 2025 15:21:48 -0800 Subject: [PATCH 18/18] minor fixes --- .gitignore | 2 ++ beginner_source/serving_tutorial.py | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index a0e9a85a9cf..2b2d167da07 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ var/ *.egg-info/ .installed.cfg *.egg +.git # PyInstaller # Usually these files are written by a python script from a template @@ -110,6 +111,7 @@ celerybeat-schedule # virtualenv venv/ +.venv/ ENV/ # Spyder project settings diff --git a/beginner_source/serving_tutorial.py b/beginner_source/serving_tutorial.py index 63c9033d444..0d7a8202fa8 100644 --- a/beginner_source/serving_tutorial.py +++ b/beginner_source/serving_tutorial.py @@ -174,6 +174,8 @@ async def handle_request(self, request: ImageRequest): # The following is a sample configuration with autoscaling and resource # allocation: +num_cpus_per_replica = 1 +num_gpus_per_replica = 1 # Set to 0 to run the model on CPUs instead of GPUs. mnist_app = MNISTClassifier.options( autoscaling_config={ "target_ongoing_requests": 50, # Target 50 ongoing requests per replica. @@ -184,8 +186,8 @@ async def handle_request(self, request: ImageRequest): }, # Max concurrent requests per replica before queueing. # If the queue fills the shared cluster memory, future requests are backpressured until memory is freed. - max_ongoing_requests=100, - ray_actor_options={"num_cpus": 1, "num_gpus": 0.5} # Each replica uses half a GPU. + max_ongoing_requests=200, + ray_actor_options={"num_cpus": num_cpus_per_replica, "num_gpus": num_gpus_per_replica} ).bind() ###################################################################### @@ -223,7 +225,7 @@ async def handle_request(self, request: ImageRequest): # The app is now listening for requests on port 8000. # # To test the deployment, you can send many requests concurrently using -# ``aiohttp``. The following code demonstrates how to send 2000 concurrent +# ``aiohttp``. The following code demonstrates how to send 1000 concurrent # requests to the app: async def send_single_request(session, url, data): @@ -245,7 +247,7 @@ async def send_concurrent_requests(num_requests): # Run the concurrent requests. start_time = time.time() -responses = asyncio.run(send_concurrent_requests(2000)) +responses = asyncio.run(send_concurrent_requests(1000)) elapsed = time.time() - start_time print(f"Processed {len(responses)} requests in {elapsed:.2f} seconds")