Skip to content

Commit bb7fdd1

Browse files
committed
include MetricsUnavailable condition to Complete in Trial
It is not easy for users to find out why a Trial failed when the training code outputs logs in an incorrect format, because the trial-controller marks the Trial's Succeeded condition as Failed if metrics are unavailable in the Katib DB, as described in #1343. So we also treat the MetricsUnavailable condition as a completed state of the Trial.
1 parent a9d92bd commit bb7fdd1

File tree

20 files changed

+388
-245
lines changed

20 files changed

+388
-245
lines changed

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ ifndef HAS_SETUP_ENVTEST
2424
endif
2525
echo "setup-envtest has already installed"
2626

27-
2827
check: generate fmt vet lint
2928

3029
fmt:

pkg/apis/controller/experiments/v1beta1/experiment_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ type ExperimentStatus struct {
100100
// List of trial names which have been early stopped.
101101
EarlyStoppedTrialList []string `json:"earlyStoppedTrialList,omitempty"`
102102

103+
// List of trial names whose metrics are unavailable.
104+
MetricsUnavailableTrialList []string `json:"metricsUnavailableTrialList,omitempty"`
105+
103106
// Trials is the total number of trials owned by the experiment.
104107
Trials int32 `json:"trials,omitempty"`
105108

@@ -120,6 +123,9 @@ type ExperimentStatus struct {
120123

121124
// How many trials are currently early stopped.
122125
TrialsEarlyStopped int32 `json:"trialsEarlyStopped,omitempty"`
126+
127+
// How many trials currently have unavailable metrics.
128+
TrialMetricsUnavailable int32 `json:"trialMetricsUnavailable,omitempty"`
123129
}
124130

125131
// OptimalTrial is the metrics and assignments of the best trial.

pkg/apis/controller/experiments/v1beta1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/controller/trials/v1beta1/trial_types.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,13 @@ type TrialCondition struct {
113113
type TrialConditionType string
114114

115115
const (
116-
TrialCreated TrialConditionType = "Created"
117-
TrialRunning TrialConditionType = "Running"
118-
TrialSucceeded TrialConditionType = "Succeeded"
119-
TrialKilled TrialConditionType = "Killed"
120-
TrialFailed TrialConditionType = "Failed"
121-
TrialEarlyStopped TrialConditionType = "EarlyStopped"
116+
TrialCreated TrialConditionType = "Created"
117+
TrialRunning TrialConditionType = "Running"
118+
TrialSucceeded TrialConditionType = "Succeeded"
119+
TrialKilled TrialConditionType = "Killed"
120+
TrialFailed TrialConditionType = "Failed"
121+
TrialMetricsUnavailable TrialConditionType = "MetricsUnavailable"
122+
TrialEarlyStopped TrialConditionType = "EarlyStopped"
122123
)
123124

124125
// +genclient

pkg/apis/controller/trials/v1beta1/util.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,15 +86,11 @@ func (trial *Trial) IsKilled() bool {
8686

8787
// IsMetricsUnavailable returns true if Trial metrics are not available
8888
func (trial *Trial) IsMetricsUnavailable() bool {
89-
cond := getCondition(trial, TrialSucceeded)
90-
if cond != nil && cond.Status == v1.ConditionFalse {
91-
return true
92-
}
93-
return false
89+
return hasCondition(trial, TrialMetricsUnavailable)
9490
}
9591

9692
func (trial *Trial) IsCompleted() bool {
97-
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped()
93+
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped() || trial.IsMetricsUnavailable()
9894
}
9995

10096
func (trial *Trial) IsEarlyStopped() bool {
@@ -158,3 +154,11 @@ func (trial *Trial) MarkTrialStatusKilled(reason, message string) {
158154
}
159155
trial.setCondition(TrialKilled, v1.ConditionTrue, reason, message)
160156
}
157+
158+
func (trial *Trial) MarkTrialStatusMetricsUnavailable(reason, message string) {
159+
currentCond := getCondition(trial, TrialRunning)
160+
if currentCond != nil {
161+
trial.setCondition(TrialRunning, v1.ConditionFalse, currentCond.Reason, currentCond.Message)
162+
}
163+
trial.setCondition(TrialMetricsUnavailable, v1.ConditionTrue, reason, message)
164+
}

pkg/apis/manager/v1beta1/api.pb.go

Lines changed: 137 additions & 132 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/manager/v1beta1/api.proto

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,16 +226,17 @@ message ParameterAssignment {
226226
* Current Trial status. It contains Trial's latest condition, start time, completion time, observation.
227227
*/
228228
message TrialStatus {
229-
// Trial can be in one of 6 conditions.
229+
// Trial can be in one of 8 conditions.
230230
// TODO (andreyvelich): Remove unused conditions.
231231
enum TrialConditionType {
232232
CREATED = 0;
233233
RUNNING = 1;
234234
SUCCEEDED = 2;
235235
KILLED = 3;
236236
FAILED = 4;
237-
EARLYSTOPPED = 5;
238-
UNKNOWN = 6;
237+
METRICSUNAVAILABLE = 5;
238+
EARLYSTOPPED = 6;
239+
UNKNOWN = 7;
239240
}
240241
string start_time = 1; // Trial start time in RFC3339 format
241242
string completion_time = 2; // Trial completion time in RFC3339 format

pkg/apis/manager/v1beta1/gen-doc/api.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ Types of value for HyperParameter.
771771
<a name="api-v1-beta1-TrialStatus-TrialConditionType"></a>
772772

773773
### TrialStatus.TrialConditionType
774-
Trial can be in one of 6 conditions.
774+
Trial can be in one of 8 conditions.
775775
TODO (andreyvelich): Remove unused conditions.
776776

777777
| Name | Number | Description |
@@ -781,8 +781,9 @@ TODO (andreyvelich): Remove unused conditions.
781781
| SUCCEEDED | 2 | |
782782
| KILLED | 3 | |
783783
| FAILED | 4 | |
784-
| EARLYSTOPPED | 5 | |
785-
| UNKNOWN | 6 | |
784+
| METRICSUNAVAILABLE | 5 | |
785+
| EARLYSTOPPED | 6 | |
786+
| UNKNOWN | 7 | |
786787

787788

788789

pkg/apis/manager/v1beta1/gen-doc/index.html

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,7 +1777,7 @@ <h3 id="api.v1.beta1.ParameterType">ParameterType</h3>
17771777
</table>
17781778

17791779
<h3 id="api.v1.beta1.TrialStatus.TrialConditionType">TrialStatus.TrialConditionType</h3>
1780-
<p>Trial can be in one of 6 conditions.</p><p>TODO (andreyvelich): Remove unused conditions.</p>
1780+
<p>Trial can be in one of 8 conditions.</p><p>TODO (andreyvelich): Remove unused conditions.</p>
17811781
<table class="enum-table">
17821782
<thead>
17831783
<tr><td>Name</td><td>Number</td><td>Description</td></tr>
@@ -1815,17 +1815,23 @@ <h3 id="api.v1.beta1.TrialStatus.TrialConditionType">TrialStatus.TrialConditionT
18151815
</tr>
18161816

18171817
<tr>
1818-
<td>EARLYSTOPPED</td>
1818+
<td>METRICSUNAVAILABLE</td>
18191819
<td>5</td>
18201820
<td><p></p></td>
18211821
</tr>
18221822

18231823
<tr>
1824-
<td>UNKNOWN</td>
1824+
<td>EARLYSTOPPED</td>
18251825
<td>6</td>
18261826
<td><p></p></td>
18271827
</tr>
18281828

1829+
<tr>
1830+
<td>UNKNOWN</td>
1831+
<td>7</td>
1832+
<td><p></p></td>
1833+
</tr>
1834+
18291835
</tbody>
18301836
</table>
18311837

pkg/apis/manager/v1beta1/python/api_pb2.py

Lines changed: 64 additions & 60 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)