From ae34bc24d7aba747b4459bc93d38f71cdf5d2967 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 11 Jul 2024 16:58:41 -0700
Subject: [PATCH] remove ray for launching api server

---
 tests/async_engine/test_openapi_server_ray.py | 14 ++------------
 tests/distributed/test_pipeline_parallel.py   |  6 ++----
 tests/entrypoints/openai/test_chat.py         | 14 ++------------
 tests/entrypoints/openai/test_completion.py   | 14 ++------------
 tests/entrypoints/openai/test_embedding.py    | 12 ++----------
 tests/entrypoints/openai/test_models.py       | 14 ++------------
 tests/entrypoints/openai/test_vision.py       | 10 +---------
 tests/tensorizer_loader/test_tensorizer.py    |  7 +------
 8 files changed, 14 insertions(+), 77 deletions(-)

diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index cc05d79e56874..26d4e6a359644 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -1,24 +1,14 @@
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
     return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 4dd3de9167758..1014174f5d2bf 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -2,11 +2,8 @@
 
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # downloading lora to test lora requests
 
@@ -19,6 +16,7 @@
 
 pytestmark = pytest.mark.asyncio
 
+
 @pytest.fixture(scope="module")
 def server():
     args = [
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 3e80214f24dc5..f901031778b59 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -6,15 +6,12 @@
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -76,14 +73,7 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
     return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 52a848b7831d5..40195ae3009f8 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -6,9 +6,6 @@
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
@@ -16,7 +13,7 @@
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -78,14 +75,7 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
     return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index f8aa1c9143a3b..8865d32d417ad 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -3,22 +3,14 @@
 import numpy as np
 import openai
 import pytest
-import ray
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def embedding_server(ray_ctx):
+def embedding_server():
     return RemoteOpenAIServer([
         "--model",
         EMBEDDING_MODEL_NAME,
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 914ef6e19e109..da147756ac61f 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -1,12 +1,9 @@
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,14 +18,7 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
     return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index b869717608d0f..ed47bd7094b60 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -3,7 +3,6 @@
 import openai
 import pytest
 import pytest_asyncio
-import ray
 
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
 
@@ -23,14 +22,7 @@
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
     return RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index b2ebcc15cd0fc..3e4ec0e116bc1 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -6,7 +6,6 @@
 
 import openai
 import pytest
-import ray
 import torch
 
 from tensorizer import EncryptionParams
@@ -22,7 +21,7 @@
                                                           tensorize_vllm_model)
 
 from ..conftest import VllmRunner, cleanup
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # yapf conflicts with isort for this docstring
 
@@ -220,8 +219,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
         json.dumps(model_loader_extra_config),
     ]
 
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-
     server = RemoteOpenAIServer(openai_args)
     print("Server ready.")
 
@@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
     base_model.model.llm_engine.model_executor.shutdown()
     del base_model
     cleanup()
-    ray.shutdown()
 
     # load model with two shards and serialize with encryption
     model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
     assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
     assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
     cleanup()
-    ray.shutdown()
 
     loaded_vllm_model = vllm_runner(
         model_ref,