1717import asyncio
1818import os
1919import sys
20+ import tempfile
2021import threading
2122import time
2223import types
@@ -39,7 +40,11 @@ def enable_torch_proxy(scope=None):
3940 paddle .compat = _PaddleCompat ()
4041
4142from fastdeploy .engine .args_utils import EngineArgs
42- from fastdeploy .engine .common_engine import EngineService
43+ from fastdeploy .engine .common_engine import (
44+ EngineService ,
45+ _format_worker_launch_failure_message ,
46+ _read_latest_worker_traceback ,
47+ )
4348from fastdeploy .engine .request import (
4449 ControlRequest ,
4550 ControlResponse ,
@@ -3720,3 +3725,87 @@ def fake_time():
37203725
37213726 eng .resource_manager .recycle_abort_task .assert_called_with ("req-1_0" )
37223727 self ._detach_finalizer (eng )
3728+
3729+
class TestWorkerTracebackFunctions(unittest.TestCase):
    """Tests for _read_latest_worker_traceback and _format_worker_launch_failure_message."""

    # Arbitrary timestamp far in the past (2001-09-09 UTC), used to make a log
    # file unambiguously older than one written "now".
    _OLD_MTIME = 1_000_000_000

    @staticmethod
    def _write_worker_log(log_dir, index, content, mtime=None):
        """Create ``workerlog.<index>`` inside ``log_dir`` and return its path.

        Args:
            log_dir: Directory in which the log file is created.
            index: Numeric suffix of the log file name.
            content: Text written to the file (UTF-8).
            mtime: Optional epoch timestamp. When given, the file's access and
                modification times are pinned to it via ``os.utime``.

        Returns:
            The full path of the file that was written.
        """
        path = os.path.join(log_dir, f"workerlog.{index}")
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(content)
        if mtime is not None:
            os.utime(path, (mtime, mtime))
        return path

    def test_read_latest_worker_traceback_finds_traceback(self):
        """A traceback present in a workerlog file is returned."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                0,
                "Some normal log output\n"
                "Traceback (most recent call last):\n"
                '  File "worker_process.py", line 1, in <module>\n'
                "    run_worker_proc()\n"
                "ValueError: The total number of blocks cannot be less than zero.\n",
            )

            result = _read_latest_worker_traceback(temp_dir)

        self.assertIsNotNone(result)
        self.assertIn("Traceback (most recent call last):", result)
        self.assertIn("ValueError:", result)

    def test_read_latest_worker_traceback_returns_none_when_no_traceback(self):
        """None is returned when the workerlog file contains no traceback."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(temp_dir, 0, "Normal log output without any errors\n")

            result = _read_latest_worker_traceback(temp_dir)

        self.assertIsNone(result)

    def test_read_latest_worker_traceback_returns_none_when_no_files(self):
        """None is returned when the directory holds no workerlog files."""
        with tempfile.TemporaryDirectory() as temp_dir:
            result = _read_latest_worker_traceback(temp_dir)

        self.assertIsNone(result)

    def test_read_latest_worker_traceback_returns_none_for_nonexistent_dir(self):
        """None is returned when the log directory does not exist."""
        result = _read_latest_worker_traceback("/nonexistent/path")
        self.assertIsNone(result)

    def test_read_latest_worker_traceback_picks_latest_file(self):
        """With several workerlog files, the most recent one is used."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Older file: its mtime is pinned explicitly instead of sleeping
            # between writes, so the ordering does not depend on the
            # filesystem's timestamp granularity.
            # NOTE(review): assumes "latest" is decided by modification time -
            # confirm against _read_latest_worker_traceback.
            self._write_worker_log(
                temp_dir,
                0,
                "Traceback (most recent call last):\nOldError: old error\n",
                mtime=self._OLD_MTIME,
            )

            # Newer file: keeps its natural (current) timestamp.
            self._write_worker_log(
                temp_dir,
                1,
                "Traceback (most recent call last):\nNewError: new error\n",
            )

            result = _read_latest_worker_traceback(temp_dir)

        self.assertIsNotNone(result)
        self.assertIn("NewError", result)

    def test_format_worker_launch_failure_message_with_traceback(self):
        """The failure message embeds the traceback found in the worker log."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                0,
                "Traceback (most recent call last):\nValueError: Test error message\n",
            )

            result = _format_worker_launch_failure_message(temp_dir)

        self.assertIn("Failed to launch worker processes", result)
        self.assertIn("workerlog.*", result)
        self.assertIn("Traceback (most recent call last):", result)
        self.assertIn("ValueError: Test error message", result)

    def test_format_worker_launch_failure_message_without_traceback(self):
        """The failure message omits any traceback when none is available."""
        with tempfile.TemporaryDirectory() as temp_dir:
            result = _format_worker_launch_failure_message(temp_dir)

        self.assertIn("Failed to launch worker processes", result)
        self.assertIn("workerlog.*", result)
        self.assertNotIn("Traceback", result)
0 commit comments