Skip to content

Commit ab2f596

Browse files
authored
Include MetricsUnavailable condition to Complete in Trial (#1877)
* Include the MetricsUnavailable condition in the Complete state of a Trial. It is not easy for users to find out why a Trial failed when the training code outputs logs in an incorrect format, because the trial-controller sets the Succeeded condition to False on the Trial if metrics are unavailable in the Katib DB, as described in #1343. So we also include the MetricsUnavailable condition in the Complete state of a Trial. * Add gh-actions tasks to verify generated code * Fix gh-actions workflow * When the number of Failed Trials reaches maxTrialCount, the experiment-controller sets the Experiment status to Failed * Fix e2e test * To avoid being set Failed in Experiment status when and is equal to 0, we need to add condition,
1 parent c9001d8 commit ab2f596

File tree

22 files changed

+429
-258
lines changed

22 files changed

+429
-258
lines changed

.github/workflows/test-go.yaml

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,14 @@ on:
55
- pull_request
66

77
jobs:
8-
test:
9-
name: Test
8+
generatetests:
9+
name: Generate And Format Test
1010
runs-on: ubuntu-latest
1111
env:
1212
GOPATH: ${{ github.workspace }}/go
1313
defaults:
1414
run:
1515
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
16-
1716
steps:
1817
- name: Check out code
1918
uses: actions/checkout@v2
@@ -28,19 +27,47 @@ jobs:
2827
# Verify that go.mod and go.sum is synchronized
2928
- name: Check Go modules
3029
run: |
31-
if [[ ! -z $(go mod tidy && git diff --exit-code) ]]; then
32-
echo "Please run "go mod tidy" to sync Go modules"
33-
exit 1
34-
fi
30+
go mod tidy &&
31+
git add go.* &&
32+
git diff --cached --exit-code || (echo 'Please run "go mod tidy" to sync Go modules' && exit 1)
3533
36-
- name: Run Go test
34+
- name: Run Generate And Go Format Test
3735
run: |
38-
go mod download
39-
make check
40-
make test
36+
go mod download &&
37+
make check &&
38+
git add pkg/apis hack/gen-python-sdk &&
39+
git diff --cached --exit-code || (echo 'Please run "make check" to generate codes and to format Go codes' && exit 1)
40+
41+
unittests:
42+
name: Unit Test
43+
runs-on: ubuntu-latest
44+
env:
45+
GOPATH: ${{ github.workspace }}/go
46+
defaults:
47+
run:
48+
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
49+
steps:
50+
- name: Check out code
51+
uses: actions/checkout@v2
52+
with:
53+
path: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
54+
55+
- name: Setup Go
56+
uses: actions/setup-go@v2
57+
with:
58+
go-version: 1.17.10
59+
60+
- name: Run Go test
61+
run: go mod download && make test ENVTEST_K8S_VERSION=${{ matrix.kubernetes-version }}
4162

4263
- name: Coveralls report
4364
uses: shogo82148/actions-goveralls@v1
4465
with:
4566
path-to-profile: coverage.out
4667
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/katib
68+
69+
strategy:
70+
fail-fast: false
71+
matrix:
72+
# Detail: `setup-envtest list --arch amd64`
73+
kubernetes-version: ["1.21.4", "1.22.1", "1.23.5"]

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ ifndef HAS_SETUP_ENVTEST
2424
endif
2525
@echo "setup-envtest has already installed"
2626

27-
2827
check: generate fmt vet lint
2928

3029
fmt:

pkg/apis/controller/experiments/v1beta1/experiment_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ type ExperimentStatus struct {
100100
// List of trial names which have been early stopped.
101101
EarlyStoppedTrialList []string `json:"earlyStoppedTrialList,omitempty"`
102102

103+
// List of names of trials whose metrics are unavailable.
104+
MetricsUnavailableTrialList []string `json:"metricsUnavailableTrialList,omitempty"`
105+
103106
// Trials is the total number of trials owned by the experiment.
104107
Trials int32 `json:"trials,omitempty"`
105108

@@ -120,6 +123,9 @@ type ExperimentStatus struct {
120123

121124
// How many trials are currently early stopped.
122125
TrialsEarlyStopped int32 `json:"trialsEarlyStopped,omitempty"`
126+
127+
// How many trials currently have unavailable metrics.
128+
TrialMetricsUnavailable int32 `json:"trialMetricsUnavailable,omitempty"`
123129
}
124130

125131
// OptimalTrial is the metrics and assignments of the best trial.

pkg/apis/controller/experiments/v1beta1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/controller/trials/v1beta1/trial_types.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,13 @@ type TrialCondition struct {
113113
type TrialConditionType string
114114

115115
const (
116-
TrialCreated TrialConditionType = "Created"
117-
TrialRunning TrialConditionType = "Running"
118-
TrialSucceeded TrialConditionType = "Succeeded"
119-
TrialKilled TrialConditionType = "Killed"
120-
TrialFailed TrialConditionType = "Failed"
121-
TrialEarlyStopped TrialConditionType = "EarlyStopped"
116+
TrialCreated TrialConditionType = "Created"
117+
TrialRunning TrialConditionType = "Running"
118+
TrialSucceeded TrialConditionType = "Succeeded"
119+
TrialKilled TrialConditionType = "Killed"
120+
TrialFailed TrialConditionType = "Failed"
121+
TrialMetricsUnavailable TrialConditionType = "MetricsUnavailable"
122+
TrialEarlyStopped TrialConditionType = "EarlyStopped"
122123
)
123124

124125
// +genclient

pkg/apis/controller/trials/v1beta1/util.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,15 +86,11 @@ func (trial *Trial) IsKilled() bool {
8686

8787
// IsMetricsUnavailable returns true if Trial metrics are not available
8888
func (trial *Trial) IsMetricsUnavailable() bool {
89-
cond := getCondition(trial, TrialSucceeded)
90-
if cond != nil && cond.Status == v1.ConditionFalse {
91-
return true
92-
}
93-
return false
89+
return hasCondition(trial, TrialMetricsUnavailable)
9490
}
9591

9692
func (trial *Trial) IsCompleted() bool {
97-
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped()
93+
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped() || trial.IsMetricsUnavailable()
9894
}
9995

10096
func (trial *Trial) IsEarlyStopped() bool {
@@ -158,3 +154,11 @@ func (trial *Trial) MarkTrialStatusKilled(reason, message string) {
158154
}
159155
trial.setCondition(TrialKilled, v1.ConditionTrue, reason, message)
160156
}
157+
158+
func (trial *Trial) MarkTrialStatusMetricsUnavailable(reason, message string) {
159+
currentCond := getCondition(trial, TrialRunning)
160+
if currentCond != nil {
161+
trial.setCondition(TrialRunning, v1.ConditionFalse, currentCond.Reason, currentCond.Message)
162+
}
163+
trial.setCondition(TrialMetricsUnavailable, v1.ConditionTrue, reason, message)
164+
}

0 commit comments

Comments
 (0)