diff --git a/README.md b/README.md
index f629ce61e3..1a5891e4dd 100644
--- a/README.md
+++ b/README.md
@@ -85,9 +85,9 @@ curl -X POST -d '{"prompt":"count from 1 to 9 in french ", "max_tokens": 100}' -
 
 ```bash
 #export token=
-docker build --pull . -f docker/Dockerfile.llm -t ts/llm
+docker build --pull . -f docker/Dockerfile.vllm -t ts/vllm
 
-docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
+docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
 
 # Try it out
 curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
diff --git a/benchmarks/utils/system_under_test.py b/benchmarks/utils/system_under_test.py
index 3f20f8b0d2..85820203b8 100644
--- a/benchmarks/utils/system_under_test.py
+++ b/benchmarks/utils/system_under_test.py
@@ -113,6 +113,7 @@ def start(self):
         execute("torchserve --stop", wait=True)
         click.secho("*Setting up model store...", fg="green")
         self._prepare_local_dependency()
+        self._clear_neuron_cache_if_exists()
         click.secho("*Starting local Torchserve instance...", fg="green")
 
         ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
             if "Model server started" in str(line).strip():
                 break
 
+    def _clear_neuron_cache_if_exists(self):
+        cache_dir = "/var/tmp/neuron-compile-cache/"
+
+        # Check if the directory exists
+        if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
+            click.secho(
+                f"Directory {cache_dir} exists. Clearing contents...", fg="green"
+            )
+
+            # Remove the directory contents
+            for filename in os.listdir(cache_dir):
+                file_path = os.path.join(cache_dir, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
+            click.secho(f"Cache cleared: {cache_dir}", fg="green")
+        else:
+            click.secho(
+                f"Directory {cache_dir} does not exist. No action taken.", fg="green"
+            )
+
     def stop(self):
         click.secho("*Terminating Torchserve instance...", fg="green")
         execute("torchserve --stop", wait=True)
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 71d48f58fe..94f4a1ba99 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -73,7 +73,7 @@ COPY ./ serve
 RUN \
     if echo "$LOCAL_CHANGES" | grep -q "false"; then \
         rm -rf serve;\
-        git clone --recursive $REPO_URL -b $BRANCH_NAME; \
+        git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
 
@@ -238,7 +238,7 @@ COPY ./ serve
 RUN \
     if echo "$LOCAL_CHANGES" | grep -q "false"; then \
         rm -rf serve;\
-        git clone --recursive $REPO_URL -b $BRANCH_NAME; \
+        git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
 COPY --from=compile-image /home/venv /home/venv
diff --git a/docker/Dockerfile.llm b/docker/Dockerfile.vllm
similarity index 100%
rename from docker/Dockerfile.llm
rename to docker/Dockerfile.vllm
diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md
index 282dd558fe..2a7bfc8742 100644
--- a/docs/llm_deployment.md
+++ b/docs/llm_deployment.md
@@ -11,7 +11,7 @@ The launcher can either be used standalone or in combination with our provided T
 To launch the docker we first need to build it:
 ```bash
-docker build . -f docker/Dockerfile.llm -t ts/llm
+docker build . -f docker/Dockerfile.vllm -t ts/vllm
 ```
 
 Models are usually loaded from the HuggingFace hub and are cached in a [docker volume](https://docs.docker.com/storage/volumes/) for faster reload.
 
@@ -22,7 +22,7 @@ export token=
 You can then go ahead and launch a TorchServe instance serving your selected model:
 ```bash
-docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
+docker run --rm -ti --shm-size 1g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
 ```
 
 To change the model you just need to exchange the identifier given to the `--model_id` parameter.
diff --git a/examples/large_models/utils/test_llm_streaming_response.py b/examples/large_models/utils/test_llm_streaming_response.py
index 55f9129bc3..94e6b373e5 100644
--- a/examples/large_models/utils/test_llm_streaming_response.py
+++ b/examples/large_models/utils/test_llm_streaming_response.py
@@ -196,7 +196,7 @@ def parse_args():
         "--model-version",
         type=str,
         default="1.0",
-        help="Model vesion. Default: 1.0",
+        help="Model version. Default: 1.0",
     )
 
     return parser.parse_args()
diff --git a/examples/large_models/vllm/llama3/Readme.md b/examples/large_models/vllm/llama3/Readme.md
index fb80f7a3e3..ac8ea048d4 100644
--- a/examples/large_models/vllm/llama3/Readme.md
+++ b/examples/large_models/vllm/llama3/Readme.md
@@ -9,7 +9,7 @@ To leverage the power of vLLM we fist need to install it using pip in out develo
 ```bash
 python -m pip install -r ../requirements.txt
 ```
-For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.llm).
+For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.vllm).
 
 ### Step 1: Download Model from HuggingFace
 
diff --git a/examples/large_models/vllm/lora/Readme.md b/examples/large_models/vllm/lora/Readme.md
index 0469d53567..0b855261e3 100644
--- a/examples/large_models/vllm/lora/Readme.md
+++ b/examples/large_models/vllm/lora/Readme.md
@@ -9,7 +9,7 @@ To leverage the power of vLLM we fist need to install it using pip in out develo
 ```bash
 python -m pip install -r ../requirements.txt
 ```
-For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.llm).
+For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.vllm).
 
 ### Step 1: Download Model from HuggingFace
 
diff --git a/examples/large_models/vllm/mistral/Readme.md b/examples/large_models/vllm/mistral/Readme.md
index 4816adcae5..d7c504a54c 100644
--- a/examples/large_models/vllm/mistral/Readme.md
+++ b/examples/large_models/vllm/mistral/Readme.md
@@ -9,7 +9,7 @@ To leverage the power of vLLM we fist need to install it using pip in out develo
 ```bash
 python -m pip install -r ../requirements.txt
 ```
-For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.llm).
+For later deployments we can make vLLM part of the deployment environment by adding the requirements.txt while building the model archive in step 2 (see [here](../../../../model-archiver/README.md#model-specific-custom-python-requirements) for details) or we can make it part of a docker image like [here](../../../../docker/Dockerfile.vllm).
 
 ### Step 1: Download Model from HuggingFace
 
diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java
index c83da0523c..1693bc8424 100644
--- a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java
+++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java
@@ -11,6 +11,9 @@ public class ModelConfig {
     private static final Logger logger = LoggerFactory.getLogger(ModelConfig.class);
 
+    public static final int defaultStartupTimeout = 120; // unit: sec
+    public static final int defaultResponseTimeout = 120; // unit: sec
+
     /** the minimum number of workers of a model */
     private int minWorkers;
     /** the maximum number of workers of a model */
@@ -20,9 +23,9 @@ public class ModelConfig {
     /** the maximum delay in msec of a batch of a model */
     private int maxBatchDelay;
     /** the timeout in sec of a specific model's response. */
-    private int responseTimeout = 120; // unit: sec
+    private int responseTimeout = defaultResponseTimeout;
     /** the timeout in sec of a specific model's startup. */
-    private int startupTimeout = 120; // unit: sec
+    private int startupTimeout = defaultStartupTimeout;
     /**
      * the device type where the model is loaded. It can be gpu, cpu. The model is loaded on CPU if
      * deviceType: "cpu" is set on a GPU host.
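
The `defaultStartupTimeout` / `defaultResponseTimeout` constants introduced in `ModelConfig` above are consumed by the `Model.setModelState` hunk further down, so snapshots that do not carry a `startupTimeout` or `responseTimeout` entry (for example, ones written before the field existed) fall back to 120 seconds instead of failing on a missing JSON key. Below is a minimal, self-contained sketch of that fallback pattern; the class, method, and constant names are illustrative only and are not part of this change.

```java
import com.google.gson.JsonObject;

// Sketch of the null-safe timeout lookup applied in Model.setModelState:
// use the snapshot value only when the key exists and is non-null,
// otherwise fall back to the ModelConfig default of 120 seconds.
public class TimeoutFallbackSketch {

    static final int DEFAULT_TIMEOUT_SEC = 120; // mirrors ModelConfig.defaultStartupTimeout

    static int readTimeout(JsonObject modelInfo, String key) {
        return modelInfo.has(key) && !modelInfo.get(key).isJsonNull()
                ? modelInfo.get(key).getAsInt()
                : DEFAULT_TIMEOUT_SEC;
    }

    public static void main(String[] args) {
        JsonObject oldSnapshot = new JsonObject(); // no "startupTimeout" key at all
        JsonObject newSnapshot = new JsonObject();
        newSnapshot.addProperty("startupTimeout", 300);

        System.out.println(readTimeout(oldSnapshot, "startupTimeout")); // prints 120
        System.out.println(readTimeout(newSnapshot, "startupTimeout")); // prints 300
    }
}
```
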
diff --git a/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java b/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java
index f5b02ee222..90f983333a 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java
@@ -193,7 +193,8 @@ private void initModelStore() throws InvalidSnapshotException, IOException {
                 String fileName = file.getName();
                 if (file.isFile()
                         && !fileName.endsWith(".mar")
-                        && !fileName.endsWith(".model")) {
+                        && !fileName.endsWith(".model")
+                        && !fileName.endsWith(".tar.gz")) {
                     continue;
                 }
                 try {
diff --git a/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java b/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java
index 5d216f0465..f307188647 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java
@@ -203,7 +203,7 @@ private static Operation getSetDefaultOperation() {
         MediaType error = getErrorResponse();
 
         operation.addResponse(
-                new Response("200", "Default vesion succsesfully updated for model", status));
+                new Response("200", "Default version successfully updated for model", status));
         operation.addResponse(
                 new Response("404", "Model not found or Model version not found", error));
         operation.addResponse(new Response("500", "Internal Server Error", error));
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java
index 46c476affd..70f5a1c644 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java
@@ -98,7 +98,7 @@ public static String setDefault(String modelName, String newModelVersion)
         ModelManager modelManager = ModelManager.getInstance();
         modelManager.setDefaultVersion(modelName, newModelVersion);
         String msg =
-                "Default vesion succsesfully updated for model \""
+                "Default version successfully updated for model \""
                         + modelName
                         + "\" to \""
                         + newModelVersion
diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java
index c41a495260..32ae5bcc06 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java
@@ -193,9 +193,17 @@ public void setModelState(JsonObject modelInfo) {
         minWorkers = modelInfo.get(MIN_WORKERS).getAsInt();
         maxWorkers = modelInfo.get(MAX_WORKERS).getAsInt();
         maxBatchDelay = modelInfo.get(MAX_BATCH_DELAY).getAsInt();
-        responseTimeout = modelInfo.get(RESPONSE_TIMEOUT).getAsInt();
-        startupTimeout = modelInfo.get(STARTUP_TIMEOUT).getAsInt();
         batchSize = modelInfo.get(BATCH_SIZE).getAsInt();
+        responseTimeout =
+                modelInfo.has(RESPONSE_TIMEOUT) && !modelInfo.get(RESPONSE_TIMEOUT).isJsonNull()
+                        ? modelInfo.get(RESPONSE_TIMEOUT).getAsInt()
+                        : modelArchive.getModelConfig()
+                                .defaultResponseTimeout; // default value for responseTimeout
+        startupTimeout =
+                modelInfo.has(STARTUP_TIMEOUT) && !modelInfo.get(STARTUP_TIMEOUT).isJsonNull()
+                        ? modelInfo.get(STARTUP_TIMEOUT).getAsInt()
+                        : modelArchive.getModelConfig()
+                                .defaultStartupTimeout; // default value for startupTimeout
 
         JsonElement runtime = modelInfo.get(RUNTIME_TYPE);
         String runtime_str = Manifest.RuntimeType.PYTHON.getValue();
diff --git a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
index 518143ec04..f419a26657 100644
--- a/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
+++ b/frontend/server/src/test/java/org/pytorch/serve/ModelServerTest.java
@@ -429,7 +429,7 @@ public void testSetDefaultVersionNoop() throws InterruptedException {
         StatusResponse resp = JsonUtils.GSON.fromJson(TestUtils.getResult(), StatusResponse.class);
         Assert.assertEquals(
                 resp.getStatus(),
-                "Default vesion succsesfully updated for model \"noopversioned\" to \"1.2.1\"");
+                "Default version successfully updated for model \"noopversioned\" to \"1.2.1\"");
     }
 
     @Test(
diff --git a/frontend/server/src/test/resources/management_open_api.json b/frontend/server/src/test/resources/management_open_api.json
index 352cc1bd83..4e926297a1 100644
--- a/frontend/server/src/test/resources/management_open_api.json
+++ b/frontend/server/src/test/resources/management_open_api.json
@@ -1671,7 +1671,7 @@
         ],
         "responses": {
           "200": {
-            "description": "Default vesion succsesfully updated for model",
+            "description": "Default version successfully updated for model",
             "content": {
               "application/json": {
                 "schema": {
diff --git a/frontend/server/src/test/resources/model_management_api.json b/frontend/server/src/test/resources/model_management_api.json
index 9ca6badc69..93cb7bae6b 100644
--- a/frontend/server/src/test/resources/model_management_api.json
+++ b/frontend/server/src/test/resources/model_management_api.json
@@ -1216,7 +1216,7 @@
         ],
         "responses": {
           "200": {
-            "description": "Default vesion succsesfully updated for model",
+            "description": "Default version successfully updated for model",
             "content": {
               "application/json": {
                 "schema": {
diff --git a/kubernetes/kserve/build_image.sh b/kubernetes/kserve/build_image.sh
index 670915f1fc..39a005b8f0 100755
--- a/kubernetes/kserve/build_image.sh
+++ b/kubernetes/kserve/build_image.sh
@@ -66,5 +66,5 @@ cp -r ../../third_party .
 if [ "${MULTI}" == "true" ]; then
   DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE --platform "${ARCH}" -t "$DOCKER_TAG" --push .
 else
-  DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --load .
+  DOCKER_BUILDKIT=1 docker buildx build --file "$DOCKER_FILE" --build-arg BASE_IMAGE=$BASE_IMAGE -t "$DOCKER_TAG" --push .
 fi
diff --git a/kubernetes/kserve/build_upload_release.py b/kubernetes/kserve/build_upload_release.py
index 0c7c481671..e9dd311610 100644
--- a/kubernetes/kserve/build_upload_release.py
+++ b/kubernetes/kserve/build_upload_release.py
@@ -39,11 +39,6 @@
         dry_run,
     )
 
-    for image in [
-        f"{organization}/torchserve-kfs:{check_ts_version()}-gpu",
-    ]:
-        try_and_handle(f"docker push {image}", dry_run)
-
     # Cleanup built images
     if args.cleanup:
         try_and_handle(f"docker system prune --all --volumes -f", dry_run)
diff --git a/kubernetes/kserve/docker_nightly.py b/kubernetes/kserve/docker_nightly.py
index d5d3f13b76..38c3bdea19 100644
--- a/kubernetes/kserve/docker_nightly.py
+++ b/kubernetes/kserve/docker_nightly.py
@@ -43,22 +43,17 @@
         dry_run,
     )
 
-    # Push Nightly images to official PyTorch Dockerhub account
-    try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
-
     # Tag nightly images with latest
     try_and_handle(
         f"docker buildx imagetools create --tag {organization}/{project}:latest-cpu {organization}/{cpu_version}",
         dry_run,
     )
+
     try_and_handle(
-        f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
+        f"docker buildx imagetools create --tag {organization}/{project}:latest-gpu {organization}/{gpu_version}",
         dry_run,
     )
 
-    # Push images with latest tag
-    try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
-
     # Cleanup built images
     if args.cleanup:
         try_and_handle(f"docker system prune --all --volumes -f", dry_run)