diff --git a/.github/workflows/darts-cifar10-e2e-test.yaml b/.github/workflows/darts-cifar10-e2e-test.yaml index 7a82b1b8fb6..980056e484c 100644 --- a/.github/workflows/darts-cifar10-e2e-test.yaml +++ b/.github/workflows/darts-cifar10-e2e-test.yaml @@ -24,6 +24,7 @@ jobs: experiments: ${{ matrix.experiments }} # Comma Delimited trial-images: darts-cnn-cifar10-cpu + database-type: ${{ matrix.database-type }} strategy: fail-fast: false @@ -33,3 +34,4 @@ jobs: kubernetes-version: ["v1.21.13", "v1.22.10", "v1.23.7"] # Comma Delimited experiments: ["darts-cpu"] + database-type: ["mysql", "postgres"] diff --git a/.github/workflows/enas-cifar10-e2e-test.yaml b/.github/workflows/enas-cifar10-e2e-test.yaml index db36d0ff7c1..f1aae47c4db 100644 --- a/.github/workflows/enas-cifar10-e2e-test.yaml +++ b/.github/workflows/enas-cifar10-e2e-test.yaml @@ -24,6 +24,7 @@ jobs: experiments: ${{ matrix.experiments }} # Comma Delimited trial-images: enas-cnn-cifar10-cpu + database-type: ${{ matrix.database-type }} strategy: fail-fast: false @@ -33,3 +34,4 @@ jobs: kubernetes-version: ["v1.21.13", "v1.22.10", "v1.23.7"] # Comma Delimited experiments: ["enas-cpu"] + database-type: ["mysql", "postgres"] diff --git a/.github/workflows/mxnet-mnist-e2e-test.yaml b/.github/workflows/mxnet-mnist-e2e-test.yaml index 2506098f156..2befac9ea79 100644 --- a/.github/workflows/mxnet-mnist-e2e-test.yaml +++ b/.github/workflows/mxnet-mnist-e2e-test.yaml @@ -24,6 +24,7 @@ jobs: experiments: ${{ matrix.experiments }} # Comma Delimited trial-images: mxnet-mnist + database-type: ${{ matrix.database-type }} strategy: fail-fast: false @@ -39,3 +40,4 @@ jobs: # others - "grid,bayesian-optimization,tpe" - "multivariate-tpe,cma-es,hyperband" + database-type: ["mysql", "postgres"] diff --git a/.github/workflows/pytorch-mnist-e2e-test.yaml b/.github/workflows/pytorch-mnist-e2e-test.yaml index 65a1a38d54c..3f96f1b976a 100644 --- a/.github/workflows/pytorch-mnist-e2e-test.yaml +++ b/.github/workflows/pytorch-mnist-e2e-test.yaml @@ -25,6 +25,7 @@ jobs: training-operator: true # Comma Delimited trial-images: pytorch-mnist-cpu + database-type: ${{ matrix.database-type }} strategy: fail-fast: false @@ -36,3 +37,4 @@ jobs: experiments: - "file-metrics-collector,pytorchjob-mnist" - "median-stop-with-json-format,file-metrics-collector-with-json-format" + database-type: ["mysql", "postgres"] diff --git a/.github/workflows/simple-pbt-e2e-test.yaml b/.github/workflows/simple-pbt-e2e-test.yaml index 798efeb0fc2..983f968665d 100644 --- a/.github/workflows/simple-pbt-e2e-test.yaml +++ b/.github/workflows/simple-pbt-e2e-test.yaml @@ -24,6 +24,7 @@ jobs: experiments: ${{ matrix.experiments }} # Comma Delimited trial-images: simple-pbt + database-type: ${{ matrix.database-type }} strategy: fail-fast: false @@ -34,3 +35,4 @@ jobs: kubernetes-version: ["v1.21.12", "v1.22.9", "v1.23.6"] # Comma Delimited experiments: ["simple-pbt"] + database-type: ["mysql", "postgres"] diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index 130af6c594c..342eb5428e5 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -7,6 +7,7 @@ inputs: training-operator: required: false type: boolean + default: false trial-images: required: true type: string @@ -14,6 +15,10 @@ inputs: required: true type: boolean default: false + database-type: + required: false + type: string + default: mysql runs: using: composite @@ -24,7 +29,7 @@ runs: - name: Set Up Katib shell: bash - run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} + run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }} - name: Run E2E Experiment shell: bash diff --git a/.github/workflows/tf-mnist-with-summaries-e2e-test.yaml b/.github/workflows/tf-mnist-with-summaries-e2e-test.yaml index 4d38889c6e2..14752f80149 100644 --- a/.github/workflows/tf-mnist-with-summaries-e2e-test.yaml +++ b/.github/workflows/tf-mnist-with-summaries-e2e-test.yaml @@ -25,6 +25,7 @@ jobs: training-operator: true # Comma Delimited trial-images: tf-mnist-with-summaries + database-type: ${{ matrix.database-type }} strategy: fail-fast: false @@ -34,3 +35,4 @@ jobs: kubernetes-version: ["v1.21.13", "v1.22.10", "v1.23.7"] # Comma Delimited experiments: ["tfjob-mnist-with-summaries"] + database-type: ["mysql", "postgres"] diff --git a/Makefile b/Makefile index 336c564eea2..3fb53269617 100755 --- a/Makefile +++ b/Makefile @@ -59,7 +59,7 @@ update: # Deploy Katib v1beta1 manifests using Kustomize into a k8s cluster. deploy: - bash scripts/v1beta1/deploy.sh + bash scripts/v1beta1/deploy.sh $(WITH_DATABASE_TYPE) # Undeploy Katib v1beta1 manifests using Kustomize from a k8s cluster undeploy: diff --git a/go.mod b/go.mod index bd7cec3aa98..6bb3b8c59c8 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20211222182933-7c19fa370dbd github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/hpcloud/tail v1.0.1-0.20180514194441-a1dbeea552b7 + github.com/lib/pq v1.10.6 github.com/mattbaird/jsonpatch v0.0.0-20171005235357-81af80346b1a github.com/onsi/gomega v1.17.0 github.com/prometheus/client_golang v1.11.0 diff --git a/go.sum b/go.sum index 2d2a9909766..5fe63c5fa82 100644 --- a/go.sum +++ b/go.sum @@ -869,6 +869,8 @@ github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.8.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.9.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.10.3/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.10.6 h1:jbk+ZieJ0D7EVGJYpL9QTz7/YW6UHbmdnZWYyK5cdBs= +github.com/lib/pq v1.10.6/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= diff --git a/manifests/v1beta1/components/postgres/kustomization.yaml b/manifests/v1beta1/components/postgres/kustomization.yaml new file mode 100644 index 00000000000..0d18841b32e --- /dev/null +++ b/manifests/v1beta1/components/postgres/kustomization.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - postgres.yaml + - pvc.yaml + - secret.yaml + - service.yaml diff --git a/manifests/v1beta1/components/postgres/postgres.yaml b/manifests/v1beta1/components/postgres/postgres.yaml new file mode 100644 index 00000000000..0d94d7b7e10 --- /dev/null +++ b/manifests/v1beta1/components/postgres/postgres.yaml @@ -0,0 +1,42 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: katib-postgres + namespace: kubeflow + labels: + katib.kubeflow.org/component: postgres +spec: + replicas: 1 + selector: + matchLabels: + katib.kubeflow.org/component: postgres + strategy: + type: Recreate + template: + metadata: + labels: + katib.kubeflow.org/component: postgres + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - name: katib-postgres + image: postgres:14.5-alpine + envFrom: + - secretRef: + name: katib-postgres-secrets + env: + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + ports: + - name: postgres + containerPort: 5432 + protocol: TCP + volumeMounts: + - name: katib-postgres + mountPath: /var/lib/postgresql/data + volumes: + - name: katib-postgres + persistentVolumeClaim: + claimName: katib-postgres diff --git a/manifests/v1beta1/components/postgres/pvc.yaml b/manifests/v1beta1/components/postgres/pvc.yaml new file mode 100644 index 00000000000..d3e45a763bc --- /dev/null +++ b/manifests/v1beta1/components/postgres/pvc.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: katib-postgres + namespace: kubeflow +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 3Gi diff --git a/manifests/v1beta1/components/postgres/secret.yaml b/manifests/v1beta1/components/postgres/secret.yaml new file mode 100644 index 00000000000..184df5a4e8e --- /dev/null +++ b/manifests/v1beta1/components/postgres/secret.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: v1 +kind: Secret +type: Opaque +metadata: + name: katib-postgres-secrets +data: + POSTGRES_USER: a2F0aWI= # katib + POSTGRES_PASSWORD: a2F0aWI= # katib + POSTGRES_DB: a2F0aWI= # katib diff --git a/manifests/v1beta1/components/postgres/service.yaml b/manifests/v1beta1/components/postgres/service.yaml new file mode 100644 index 00000000000..b95d5fcc7ff --- /dev/null +++ b/manifests/v1beta1/components/postgres/service.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: katib-postgres + namespace: kubeflow + labels: + katib.kubeflow.org/component: postgres +spec: + type: ClusterIP + ports: + - port: 5432 + protocol: TCP + name: dbapi + selector: + katib.kubeflow.org/component: postgres diff --git a/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml b/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml new file mode 100644 index 00000000000..30a5b7193f3 --- /dev/null +++ b/manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml @@ -0,0 +1,41 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kubeflow +resources: + # Namespace. + - ../../components/namespace/ + # Katib controller. + - ../../components/controller/ + # Katib CRDs. + - ../../components/crd/ + # Katib DB manager. + - ../../components/db-manager/ + # Katib DB postgres. + - ../../components/postgres/ + # Katib UI. + - ../../components/ui/ + # Katib Cert Generator + - ../../components/cert-generator/ + # Katib webhooks. + - ../../components/webhook/ +images: + - name: docker.io/kubeflowkatib/katib-controller + newName: docker.io/kubeflowkatib/katib-controller + newTag: latest + - name: docker.io/kubeflowkatib/katib-db-manager + newName: docker.io/kubeflowkatib/katib-db-manager + newTag: latest + - name: docker.io/kubeflowkatib/katib-ui + newName: docker.io/kubeflowkatib/katib-ui + newTag: latest + - name: docker.io/kubeflowkatib/cert-generator + newName: docker.io/kubeflowkatib/cert-generator + newTag: latest +patchesJson6902: + - target: + group: apps + version: v1 + kind: Deployment + name: katib-db-manager + path: ./patches/db-manager.yaml diff --git a/manifests/v1beta1/installs/katib-standalone-postgres/patches/db-manager.yaml b/manifests/v1beta1/installs/katib-standalone-postgres/patches/db-manager.yaml new file mode 100644 index 00000000000..1077f91b0ef --- /dev/null +++ b/manifests/v1beta1/installs/katib-standalone-postgres/patches/db-manager.yaml @@ -0,0 +1,8 @@ +--- +- op: replace + path: /spec/template/spec/containers/0/env + value: + - name: DB_NAME + value: "postgres" + - name: DB_PASSWORD + value: "katib" diff --git a/pkg/db/v1beta1/common/connection.go b/pkg/db/v1beta1/common/connection.go new file mode 100644 index 00000000000..2f65e19ce96 --- /dev/null +++ b/pkg/db/v1beta1/common/connection.go @@ -0,0 +1,48 @@ +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package common + +import ( + "database/sql" + "fmt" + "time" + + "k8s.io/klog" +) + +func OpenSQLConn(driverName string, dataSourceName string, interval time.Duration, + timeout time.Duration) (*sql.DB, error) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + + timeoutC := time.After(timeout) + for { + select { + case <-ticker.C: + if db, err := sql.Open(driverName, dataSourceName); err == nil { + if err = db.Ping(); err == nil { + return db, nil + } + klog.Errorf("Ping to Katib db failed: %v", err) + } else { + klog.Errorf("Open sql connection failed: %v", err) + } + case <-timeoutC: + return nil, fmt.Errorf("Timeout waiting for DB conn successfully opened.") + } + } +} diff --git a/pkg/db/v1beta1/common/const.go b/pkg/db/v1beta1/common/const.go index f75ffad3e1a..97927c07731 100644 --- a/pkg/db/v1beta1/common/const.go +++ b/pkg/db/v1beta1/common/const.go @@ -16,15 +16,18 @@ limitations under the License. package common +import "time" + const ( - DBUserEnvName = "DB_USER" + ConnectInterval = 5 * time.Second + ConnectTimeout = 60 * time.Second - DBNameEnvName = "DB_NAME" + DBUserEnvName = "DB_USER" + DBNameEnvName = "DB_NAME" + DBPasswordEnvName = "DB_PASSWORD" MySqlDBNameEnvValue = "mysql" - DBPasswordEnvName = "DB_PASSWORD" - MySQLDBHostEnvName = "KATIB_MYSQL_DB_HOST" MySQLDBPortEnvName = "KATIB_MYSQL_DB_PORT" MySQLDatabase = "KATIB_MYSQL_DB_DATABASE" @@ -33,4 +36,15 @@ const ( DefaultMySQLDatabase = "katib" DefaultMySQLHost = "katib-mysql" DefaultMySQLPort = "3306" + + PostgresSQLDBNameEnvValue = "postgres" + + PostgreSQLDBHostEnvName = "KATIB_POSTGRESQL_DB_HOST" + PostgreSQLDBPortEnvName = "KATIB_POSTGRESQL_DB_PORT" + PostgreSQLDatabase = "KATIB_POSTGRESQL_DB_DATABASE" + + DefaultPostgreSQLUser = "katib" + DefaultPostgreSQLDatabase = "katib" + DefaultPostgreSQLHost = "katib-postgres" + DefaultPostgreSQLPort = "5432" ) diff --git a/pkg/db/v1beta1/db.go b/pkg/db/v1beta1/db.go index 84c36a50ca6..7d48069b589 100644 --- a/pkg/db/v1beta1/db.go +++ b/pkg/db/v1beta1/db.go @@ -21,12 +21,18 @@ import ( "github.com/kubeflow/katib/pkg/db/v1beta1/common" "github.com/kubeflow/katib/pkg/db/v1beta1/mysql" + "github.com/kubeflow/katib/pkg/db/v1beta1/postgres" + "k8s.io/klog" ) func NewKatibDBInterface(dbName string) (common.KatibDBInterface, error) { if dbName == common.MySqlDBNameEnvValue { + klog.Info("Using MySQL") return mysql.NewDBInterface() + } else if dbName == common.PostgresSQLDBNameEnvValue { + klog.Info("Using Postgres") + return postgres.NewDBInterface() } return nil, errors.New("Invalid DB Name") } diff --git a/pkg/db/v1beta1/mysql/mysql.go b/pkg/db/v1beta1/mysql/mysql.go index 93078f0d396..4e7b4427b95 100644 --- a/pkg/db/v1beta1/mysql/mysql.go +++ b/pkg/db/v1beta1/mysql/mysql.go @@ -38,9 +38,6 @@ const ( //dbNameTmpl = "root:%s@tcp(%s:%s)/%s?timeout=5s" dbNameTmpl = "%s:%s@tcp(%s:%s)/%s?timeout=5s" mysqlTimeFmt = "2006-01-02 15:04:05.999999" - - connectInterval = 5 * time.Second - connectTimeout = 60 * time.Second ) type dbConn struct { @@ -62,29 +59,6 @@ func getDbName() string { return fmt.Sprintf(dbNameTmpl, dbUser, dbPass, dbHost, dbPort, dbName) } -func openSQLConn(driverName string, dataSourceName string, interval time.Duration, - timeout time.Duration) (*sql.DB, error) { - ticker := time.NewTicker(interval) - defer ticker.Stop() - - timeoutC := time.After(timeout) - for { - select { - case <-ticker.C: - if db, err := sql.Open(driverName, dataSourceName); err == nil { - if err = db.Ping(); err == nil { - return db, nil - } - klog.Errorf("Ping to Katib db failed: %v", err) - } else { - klog.Errorf("Open sql connection failed: %v", err) - } - case <-timeoutC: - return nil, fmt.Errorf("Timeout waiting for DB conn successfully opened.") - } - } -} - func NewWithSQLConn(db *sql.DB) (common.KatibDBInterface, error) { d := new(dbConn) d.db = db @@ -100,7 +74,7 @@ func NewWithSQLConn(db *sql.DB) (common.KatibDBInterface, error) { } func NewDBInterface() (common.KatibDBInterface, error) { - db, err := openSQLConn(dbDriver, getDbName(), connectInterval, connectTimeout) + db, err := common.OpenSQLConn(dbDriver, getDbName(), common.ConnectInterval, common.ConnectTimeout) if err != nil { return nil, fmt.Errorf("DB open failed: %v", err) } diff --git a/pkg/db/v1beta1/postgres/init.go b/pkg/db/v1beta1/postgres/init.go new file mode 100644 index 00000000000..3ebfad40a46 --- /dev/null +++ b/pkg/db/v1beta1/postgres/init.go @@ -0,0 +1,47 @@ +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package postgres + +import ( + "fmt" + + "k8s.io/klog" +) + +func (d *dbConn) DBInit() { + db := d.db + klog.Info("Initializing v1beta1 DB schema") + + _, err := db.Exec(`CREATE TABLE IF NOT EXISTS observation_logs + (trial_name VARCHAR(255) NOT NULL, + id serial PRIMARY KEY, + time TIMESTAMP(6), + metric_name VARCHAR(255) NOT NULL, + value TEXT NOT NULL)`) + if err != nil { + klog.Fatalf("Error creating observation_logs table: %v", err) + } +} + +func (d *dbConn) SelectOne() error { + db := d.db + _, err := db.Exec(`SELECT 1`) + if err != nil { + return fmt.Errorf("Error `SELECT 1` probing: %v", err) + } + return nil +} diff --git a/pkg/db/v1beta1/postgres/postgres.go b/pkg/db/v1beta1/postgres/postgres.go new file mode 100644 index 00000000000..05781a4dfe2 --- /dev/null +++ b/pkg/db/v1beta1/postgres/postgres.go @@ -0,0 +1,201 @@ +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package postgres + +import ( + crand "crypto/rand" + "database/sql" + "fmt" + "math/big" + "math/rand" + "os" + "time" + + _ "github.com/lib/pq" + + v1beta1 "github.com/kubeflow/katib/pkg/apis/manager/v1beta1" + "github.com/kubeflow/katib/pkg/db/v1beta1/common" + "github.com/kubeflow/katib/pkg/util/v1beta1/env" + "k8s.io/klog" +) + +const dbDriver = "postgres" + +type dbConn struct { + db *sql.DB +} + +func getDbName() string { + dbPassEnvName := common.DBPasswordEnvName + dbPass := os.Getenv(dbPassEnvName) + + dbUser := env.GetEnvOrDefault( + common.DBUserEnvName, common.DefaultPostgreSQLUser) + dbHost := env.GetEnvOrDefault( + common.PostgreSQLDBHostEnvName, common.DefaultPostgreSQLHost) + dbPort := env.GetEnvOrDefault( + common.PostgreSQLDBPortEnvName, common.DefaultPostgreSQLPort) + dbName := env.GetEnvOrDefault(common.DefaultPostgreSQLDatabase, + common.DefaultPostgreSQLDatabase) + + psqlInfo := fmt.Sprintf("host=%s port=%s user=%s "+ + "password=%s dbname=%s sslmode=disable", + dbHost, dbPort, dbUser, dbPass, dbName) + + return psqlInfo +} + +func NewDBInterface() (common.KatibDBInterface, error) { + db, err := common.OpenSQLConn(dbDriver, getDbName(), common.ConnectInterval, common.ConnectTimeout) + if err != nil { + return nil, fmt.Errorf("DB open failed: %v", err) + } + return NewWithSQLConn(db) +} + +func NewWithSQLConn(db *sql.DB) (common.KatibDBInterface, error) { + d := new(dbConn) + d.db = db + seed, err := crand.Int(crand.Reader, big.NewInt(1<<63-1)) + if err != nil { + return nil, fmt.Errorf("RNG initialization failed: %v", err) + } + // We can do the following instead, but it creates a locking issue + //d.rng = rand.New(rand.NewSource(seed.Int64())) + rand.Seed(seed.Int64()) + + return d, nil +} + +func (d *dbConn) RegisterObservationLog(trialName string, observationLog *v1beta1.ObservationLog) error { + statement := "INSERT INTO observation_logs (trial_name, time, metric_name, value) VALUES " + values := []interface{}{} + + index_of_qparam := 1 + for _, mlog := range observationLog.MetricLogs { + if mlog.TimeStamp == "" { + continue + } + t, err := time.Parse(time.RFC3339Nano, mlog.TimeStamp) + if err != nil { + return fmt.Errorf("Error parsing start time %s: %v", mlog.TimeStamp, err) + } + sqlTimeStr := t.UTC().Format(time.RFC3339Nano) + + statement += fmt.Sprintf("($%d, $%d, $%d, $%d),", + index_of_qparam, index_of_qparam+1, index_of_qparam+2, index_of_qparam+3, + ) + values = append(values, trialName, sqlTimeStr, mlog.Metric.Name, mlog.Metric.Value) + index_of_qparam += 4 + } + + statement = statement[:len(statement)-1] + + // Prepare the statement + stmt, err := d.db.Prepare(statement) + if err != nil { + return fmt.Errorf("Prepare SQL statement failed: %v", err) + } + + // Defer Close the statement + defer stmt.Close() + + // Execute INSERT + _, err = stmt.Exec(values...) + if err != nil { + return fmt.Errorf("Execute SQL INSERT failed: %v", err) + } + + return nil +} + +func (d *dbConn) GetObservationLog(trialName string, metricName string, startTime string, endTime string) (*v1beta1.ObservationLog, error) { + qfield := []interface{}{trialName} + qstr := "" + index_of_qparam := 1 + + base_stmt := fmt.Sprintf("SELECT time, metric_name, value FROM observation_logs WHERE trial_name = $%d", index_of_qparam) + index_of_qparam += 1 + + if metricName != "" { + qstr += fmt.Sprintf(" AND metric_name = $%d", index_of_qparam) + qfield = append(qfield, metricName) + index_of_qparam += 1 + } + + if startTime != "" { + s_time, err := time.Parse(time.RFC3339Nano, startTime) + if err != nil { + return nil, fmt.Errorf("Error parsing start time %s: %v", startTime, err) + } + formattedStartTime := s_time.UTC().Format(time.RFC3339Nano) + qstr += fmt.Sprintf(" AND time >= $%d", index_of_qparam) + qfield = append(qfield, formattedStartTime) + index_of_qparam += 1 + } + if endTime != "" { + e_time, err := time.Parse(time.RFC3339Nano, endTime) + if err != nil { + return nil, fmt.Errorf("Error parsing completion time %s: %v", endTime, err) + } + formattedEndTime := e_time.UTC().Format(time.RFC3339Nano) + qstr += fmt.Sprintf(" AND time <= $%d", index_of_qparam) + qfield = append(qfield, formattedEndTime) + // index_of_qparam += 1 // if any other filters are added, this should be incremented + } + + rows, err := d.db.Query(base_stmt+qstr+" ORDER BY time", qfield...) + if err != nil { + return nil, fmt.Errorf("Failed to get ObservationLogs %v", err) + } + + // Defer Close the rows + defer rows.Close() + + result := &v1beta1.ObservationLog{ + MetricLogs: []*v1beta1.MetricLog{}, + } + for rows.Next() { + var mname, mvalue, sqlTimeStr string + err := rows.Scan(&sqlTimeStr, &mname, &mvalue) + if err != nil { + klog.Errorf("Error scanning log: %v", err) + continue + } + ptime, err := time.Parse(time.RFC3339Nano, sqlTimeStr) + if err != nil { + klog.Errorf("Error parsing time %s: %v", sqlTimeStr, err) + continue + } + timeStamp := ptime.UTC().Format(time.RFC3339Nano) + result.MetricLogs = append(result.MetricLogs, &v1beta1.MetricLog{ + TimeStamp: timeStamp, + Metric: &v1beta1.Metric{ + Name: mname, + Value: mvalue, + }, + }) + } + + return result, nil +} + +func (d *dbConn) DeleteObservationLog(trialName string) error { + _, err := d.db.Exec("DELETE FROM observation_logs WHERE trial_name = $1", trialName) + + return err +} diff --git a/pkg/db/v1beta1/postgres/postgres_test.go b/pkg/db/v1beta1/postgres/postgres_test.go new file mode 100644 index 00000000000..bf6c4191b91 --- /dev/null +++ b/pkg/db/v1beta1/postgres/postgres_test.go @@ -0,0 +1,142 @@ +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package postgres + +import ( + "fmt" + "os" + "testing" + + sqlmock "github.com/DATA-DOG/go-sqlmock" + _ "github.com/lib/pq" + + api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1" + "github.com/kubeflow/katib/pkg/db/v1beta1/common" +) + +var dbInterface common.KatibDBInterface +var mock sqlmock.Sqlmock + +func TestMain(m *testing.M) { + db, sm, err := sqlmock.New() + mock = sm + if err != nil { + fmt.Printf("error opening db: %v\n", err) + os.Exit(1) + } + dbInterface, err = NewWithSQLConn(db) + if err != nil { + fmt.Printf("error NewWithSQLConn: %v\n", err) + } + mock.ExpectExec("CREATE TABLE IF NOT EXISTS observation_logs").WithArgs().WillReturnResult(sqlmock.NewResult(1, 1)) + dbInterface.DBInit() + mock.ExpectExec("SELECT 1").WithArgs().WillReturnResult(sqlmock.NewResult(1, 1)) + err = dbInterface.SelectOne() + if err != nil { + fmt.Printf("error `SELECT 1` probing: %v\n", err) + } + m.Run() +} + +func TestRegisterObservationLog(t *testing.T) { + obsLog := &api_pb.ObservationLog{ + MetricLogs: []*api_pb.MetricLog{ + { + TimeStamp: "2016-12-31T20:01:05.123456Z", + Metric: &api_pb.Metric{ + Name: "f1_score", + Value: "88.95", + }, + }, + { + TimeStamp: "2016-12-31T20:02:05.123456Z", + Metric: &api_pb.Metric{ + Name: "loss", + Value: "0.5", + }, + }, + }, + } + mock.ExpectPrepare("INSERT") + mock.ExpectExec( + "INSERT", + ).WithArgs( + "test1_trial1", + "2016-12-31T20:01:05.123456Z", + "f1_score", + "88.95", + "test1_trial1", + "2016-12-31T20:02:05.123456Z", + "loss", + "0.5", + ).WillReturnResult(sqlmock.NewResult(1, 1)) + + err := dbInterface.RegisterObservationLog("test1_trial1", obsLog) + if err != nil { + t.Errorf("RegisterExperiment failed: %v", err) + } + +} + +func TestGetObservationLog(t *testing.T) { + mock.ExpectQuery("SELECT").WillReturnRows( + sqlmock.NewRows([]string{"time", "metric_name", "value"}).AddRow( + "2016-12-31T20:01:05.123456Z", + "loss", + "0.9", + ).AddRow( + "2016-12-31T20:02:05.123456Z", + "loss", + "0.9", + ), + ) + obsLog, err := dbInterface.GetObservationLog( + "test1_trial1", + "loss", + "2016-12-31T20:01:05.123456Z", + "2016-12-31T20:02:05.123456Z", + ) + if err != nil { + t.Errorf("GetObservationLog failed %v", err) + } else if len(obsLog.MetricLogs) != 2 { + t.Errorf("GetObservationLog incorrect return %v", obsLog) + } + +} + +func TestDeleteObservationLog(t *testing.T) { + trialName := "test1_trial1" + + mock.ExpectExec( + "DELETE FROM observation_logs", + ).WithArgs(trialName).WillReturnResult(sqlmock.NewResult(1, 1)) + + err := dbInterface.DeleteObservationLog(trialName) + if err != nil { + t.Errorf("DeleteObservationLog failed: %v", err) + } +} + +func TestGetDbName(t *testing.T) { + // dbName := "root:@tcp(katib-mysql:3306)/katib?timeout=5s" + dbName := "host=katib-postgres port=5432 user=katib password= dbname=katib sslmode=disable" + + if getDbName() != dbName { + t.Errorf("getDbName returns wrong value %v", getDbName()) + } + +} diff --git a/scripts/v1beta1/deploy.sh b/scripts/v1beta1/deploy.sh index 20d428506f2..3438dbde112 100755 --- a/scripts/v1beta1/deploy.sh +++ b/scripts/v1beta1/deploy.sh @@ -20,4 +20,15 @@ set -o errexit SCRIPT_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.." cd "${SCRIPT_ROOT}" -kustomize build manifests/v1beta1/installs/katib-standalone | kubectl apply -f - + +WITH_DATABASE_TYPE=${1:-mysql} + +# if mysql, use below kustomize, else use postgres +if [ "$WITH_DATABASE_TYPE" == "mysql" ]; then + kustomize build manifests/v1beta1/installs/katib-standalone | kubectl apply -f - +elif [ "$WITH_DATABASE_TYPE" == "postgres" ]; then + kustomize build manifests/v1beta1/installs/katib-standalone-postgres | kubectl apply -f - +else + echo "Unknown database type: $WITH_DATABASE_TYPE" + exit 1 +fi diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index fbf239acd15..191efc06616 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -22,6 +22,8 @@ cd "$(dirname "$0")" DEPLOY_KATIB_UI=${1:-false} DEPLOY_TRAINING_OPERATOR=${2:-false} +WITH_DATABASE_TYPE=${3:-mysql} + E2E_TEST_IMAGE_TAG="e2e-test" TRAINING_OPERATOR_VERSION="v1.5.0-rc.0" @@ -30,32 +32,50 @@ echo "Start to install Katib" # Update Katib images with `e2e-test`. cd ../../../../../ && make update-images OLD_PREFIX="docker.io/kubeflowkatib/" NEW_PREFIX="docker.io/kubeflowkatib/" TAG="$E2E_TEST_IMAGE_TAG" && cd - +# first declare the which kustomization file to use, by default use mysql. +KUSTOMIZATION_FILE="../../../../../manifests/v1beta1/installs/katib-standalone/kustomization.yaml" +PVC_FILE="../../../../../manifests/v1beta1/components/mysql/pvc.yaml" + +# If the database type is postgres, then use postgres. +if [ "$WITH_DATABASE_TYPE" == "postgres" ]; then + KUSTOMIZATION_FILE="../../../../../manifests/v1beta1/installs/katib-standalone-postgres/kustomization.yaml" + PVC_FILE="../../../../../manifests/v1beta1/components/postgres/pvc.yaml" +fi + +# If the user wants to deploy Katib UI, then use the kustomization file for Katib UI. if ! "$DEPLOY_KATIB_UI"; then - index="$(yq eval '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' ../../../../../manifests/v1beta1/installs/katib-standalone/kustomization.yaml)" - index="$index" yq eval -i 'del(.resources.[env(index)])' ../../../../../manifests/v1beta1/installs/katib-standalone/kustomization.yaml + index="$(yq eval '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)" + index="$index" yq eval -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE fi -yq eval -i '.spec.resources.requests.storage|="2Gi"' ../../../../../manifests/v1beta1/components/mysql/pvc.yaml +# Since e2e test doesn't need to large storage, we use a small PVC for Katib. +yq eval -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE echo -e "\n The Katib will be deployed with the following configs" -cat ../../../../../manifests/v1beta1/installs/katib-standalone/kustomization.yaml +cat $KUSTOMIZATION_FILE cat ../../../../../manifests/v1beta1/components/controller/katib-config.yaml +# If the user wants to deploy training operator, then use the kustomization file for training operator. if "$DEPLOY_TRAINING_OPERATOR"; then echo "Deploying Training Operator $TRAINING_OPERATOR_VERSION" kustomize build "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION" | kubectl apply -f - fi echo "Deploying Katib" -cd ../../../../../ && make deploy && cd - +cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - # Wait until all Katib pods is running. TIMEOUT=120s kubectl wait --for=condition=complete --timeout=${TIMEOUT} -l katib.kubeflow.org/component=cert-generator -n kubeflow job || (kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1) -kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in (controller,db-manager,mysql,ui)" -n kubeflow pod || + +kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in ($WITH_DATABASE_TYPE,controller,db-manager,ui)" -n kubeflow pod || (kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1) +# Wait until all Katib pods is actually ready. +# Since Katib-controller does not use Readinessprobe yet, just wait for a while. +sleep 30 + echo "All Katib components are running." echo "Katib deployments" kubectl -n kubeflow get deploy @@ -68,6 +88,7 @@ kubectl -n kubeflow get pod kubectl apply -f ../../testdata/valid-experiment.yaml kubectl delete -f ../../testdata/valid-experiment.yaml +# Check the ValidatingWebhookConfiguration works well. set +o errexit kubectl apply -f ../../testdata/invalid-experiment.yaml if [ $? -ne 1 ]; then