@@ -17,7 +17,10 @@ limitations under the License.
1717package v1beta1
1818
1919import (
20+ "bytes"
21+ "context"
2022 "encoding/json"
23+ "io"
2124 "log"
2225 "net/http"
2326 "path/filepath"
@@ -27,8 +30,15 @@ import (
2730 "sigs.k8s.io/controller-runtime/pkg/client"
2831
2932 experimentv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1"
33+ trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1"
3034 api_pb_v1beta1 "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
3135 "github.com/kubeflow/katib/pkg/util/v1beta1/katibclient"
36+
37+ apiv1 "k8s.io/api/core/v1"
38+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
39+ "k8s.io/apimachinery/pkg/types"
40+ corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
41+ "sigs.k8s.io/controller-runtime/pkg/client/config"
3242)
3343
3444func NewKatibUIHandler (dbManagerAddr string ) * KatibUIHandler {
@@ -421,3 +431,87 @@ func (k *KatibUIHandler) FetchTrial(w http.ResponseWriter, r *http.Request) {
421431 return
422432 }
423433}
434+
435+ // FetchTrialLogs fetches logs for a trial in specific namespace.
436+ func (k * KatibUIHandler ) FetchTrialLogs (w http.ResponseWriter , r * http.Request ) {
437+ log .Printf ("Requesting logs" )
438+
439+ trialName := r .URL .Query ()["trialName" ][0 ]
440+ namespace := r .URL .Query ()["namespace" ][0 ]
441+ log .Printf ("Requesting logs" )
442+
443+ logs , err := getTrialLogs (k , trialName , namespace )
444+ if err != nil {
445+ log .Printf ("GetLogs failed: %v" , err )
446+ http .Error (w , err .Error (), http .StatusInternalServerError )
447+ return
448+ }
449+ response , err := json .Marshal (logs )
450+ if err != nil {
451+ log .Printf ("Marshal logs failed: %v" , err )
452+ http .Error (w , err .Error (), http .StatusInternalServerError )
453+ return
454+ }
455+ w .Write (response )
456+ }
457+
458+ // GetTrialLogs returns logs of a master Pod for the given job name and namespace
459+ func getTrialLogs (k * KatibUIHandler , trialName string , namespace string ) (string , error ) {
460+ cfg , err := config .GetConfig ()
461+ if err != nil {
462+ return "" , err
463+ }
464+
465+ clientset , err := corev1 .NewForConfig (cfg )
466+ if err != nil {
467+ return "" , err
468+ }
469+
470+ trial := & trialsv1beta1.Trial {}
471+ if err := k .katibClient .GetClient ().Get (context .TODO (), types.NamespacedName {Name : trialName , Namespace : namespace }, trial ); err != nil {
472+ return "" , err
473+ }
474+
475+ selectionLabel := "training.kubeflow.org/job-name=" + trialName + ",training.kubeflow.org/job-role=master"
476+ if trial .Spec .RunSpec .GetKind () == "Job" {
477+ selectionLabel = "job-name=" + trialName
478+ }
479+
480+ podList , err := clientset .Pods (namespace ).List (context .Background (), metav1.ListOptions {LabelSelector : selectionLabel })
481+ if err != nil {
482+ return "" , err
483+ }
484+
485+ if len (podList .Items ) == 0 {
486+ message := `Logs for the trial could not be found.
487+ Was 'retain: true' specified in the Experiment definition?
488+ An example can be found here: https://github.com/kubeflow/katib/blob/master/examples/v1beta1/argo/argo-workflow.yaml#L33`
489+
490+ return message , nil
491+ }
492+
493+ podLogOpts := apiv1.PodLogOptions {}
494+ podLogOpts .Container = trial .Spec .PrimaryContainerName
495+ for container := range podList .Items [0 ].Spec .Containers {
496+ if podList .Items [0 ].Spec .Containers [container ].Name == "metrics-logger-and-collector" {
497+ podLogOpts .Container = "metrics-logger-and-collector"
498+ break
499+ }
500+ }
501+
502+ req := clientset .Pods (namespace ).GetLogs (podList .Items [0 ].Name , & podLogOpts )
503+ podLogs , err := req .Stream (context .Background ())
504+ if err != nil {
505+ return "" , err
506+ }
507+ defer podLogs .Close ()
508+
509+ buf := new (bytes.Buffer )
510+ _ , err = io .Copy (buf , podLogs )
511+ if err != nil {
512+ return "" , err
513+ }
514+ str := buf .String ()
515+
516+ return str , nil
517+ }
0 commit comments