(ServeController pid=9277) Traceback (most recent call last):
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/serve/_private/deployment_state.py", line 656, in check_ready
(ServeController pid=9277) _, self._version = ray.get(self._ready_obj_ref)
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
(ServeController pid=9277) return fn(*args, **kwargs)
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
(ServeController pid=9277) return func(*args, **kwargs)
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/_private/worker.py", line 2624, in get
(ServeController pid=9277) raise value.as_instanceof_cause()
(ServeController pid=9277) ray.exceptions.RayTaskError(RuntimeError): ray::Qwen--Qwen1.5-72B-Chat-GGUF.initialize_and_get_metadata() (pid=9483, ip=172.17.0.3, actor_id=b5fcde3ad8e5c6c8e719d32404000000, repr=<ray.serve._private.replica.ServeReplica:Qwen--Qwen1.5-72B-Chat-GGUF:Qwen--Qwen1.5-72B-Chat-GGUF object at 0x7fa4048274c0>)
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/concurrent/futures/_base.py", line 451, in result
(ServeController pid=9277) return self.__get_result()
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
(ServeController pid=9277) raise self._exception
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/serve/_private/replica.py", line 455, in initialize_and_get_metadata
(ServeController pid=9277) raise RuntimeError(traceback.format_exc()) from None
(ServeController pid=9277) RuntimeError: Traceback (most recent call last):
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/serve/_private/replica.py", line 445, in initialize_and_get_metadata
(ServeController pid=9277) await self.replica.update_user_config(
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/ray/serve/_private/replica.py", line 724, in update_user_config
(ServeController pid=9277) await reconfigure_method(user_config)
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/server/app.py", line 151, in reconfigure
(ServeController pid=9277) await self.rollover(
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/predictor.py", line 64, in rollover
(ServeController pid=9277) self.new_worker_group = await self._create_worker_group(
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/predictor.py", line 154, in _create_worker_group
(ServeController pid=9277) engine = await self.engine.launch_engine(scaling_config, self.pg, scaling_options)
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/engines/generic.py", line 333, in launch_engine
(ServeController pid=9277) await asyncio.gather(
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/asyncio/tasks.py", line 650, in _wrap_awaitable
(ServeController pid=9277) return (yield from awaitable.__await__())
(ServeController pid=9277) ray.exceptions.RayTaskError(ValueError): ray::PredictionWorker.init_model() (pid=9703, ip=172.17.0.3, actor_id=5691b4ad8e1d62a67ddc668004000000, repr=PredictionWorker:Qwen/Qwen1.5-72B-Chat-GGUF)
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/concurrent/futures/_base.py", line 451, in result
(ServeController pid=9277) return self.__get_result()
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
(ServeController pid=9277) raise self._exception
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/engines/generic.py", line 217, in init_model
(ServeController pid=9277) self.generator = init_model(
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/utils.py", line 159, in inner
(ServeController pid=9277) ret = func(*args, **kwargs)
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/engines/generic.py", line 133, in init_model
(ServeController pid=9277) resp_batch = generate(
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/utils.py", line 159, in inner
(ServeController pid=9277) ret = func(*args, **kwargs)
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/engines/generic.py", line 168, in generate
(ServeController pid=9277) outputs = pipeline(
(ServeController pid=9277) File "/data/llm-inference/llmserve/backend/llm/pipelines/llamacpp/llamacpp_pipeline.py", line 141, in __call__
(ServeController pid=9277) output = self.model(input, **kwargs)
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/llama_cpp/llama.py", line 1547, in __call__
(ServeController pid=9277) return self.create_completion(
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/llama_cpp/llama.py", line 1480, in create_completion
(ServeController pid=9277) completion: Completion = next(completion_or_chunks) # type: ignore
(ServeController pid=9277) File "/root/miniconda3/envs/yons/lib/python3.10/site-packages/llama_cpp/llama.py", line 959, in _create_completion
(ServeController pid=9277) raise ValueError(
(ServeController pid=9277) ValueError: Requested tokens (818) exceed context window of 512
(ServeController pid=9277) INFO 2024-04-05 11:16:41,444 controller 9277 deployment_state.py:2185 - Replica Qwen--Qwen1.5-72B-Chat-GGUF#Qwen--Qwen1.5-72B-Chat-GGUF#ZgAOMG is stopped.
(ServeController pid=9277) INFO 2024-04-05 11:16:41,445 controller 9277 deployment_state.py:1831 - Adding 1 replica to deployment Qwen--Qwen1.5-72B-Chat-GGUF in application 'Qwen--Qwen1.5-72B-Chat-GGUF'.