diff --git a/.github/workflows/build-deploy.yaml b/.github/workflows/build-deploy.yaml index 9f3f7c7..ea049a5 100644 --- a/.github/workflows/build-deploy.yaml +++ b/.github/workflows/build-deploy.yaml @@ -18,7 +18,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.18.1 + go-version: ^1.22 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 @@ -48,7 +48,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.18.1 + go-version: ^1.22 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 diff --git a/.github/workflows/helm.yaml b/.github/workflows/helm.yaml index 665dc85..6e7adf5 100644 --- a/.github/workflows/helm.yaml +++ b/.github/workflows/helm.yaml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.18.1 + go-version: ^1.22 - name: GHCR Login if: (github.event_name != 'pull_request') uses: docker/login-action@v2 diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 2f3f411..9e7be03 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -34,7 +34,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v3 with: - go-version: ^1.20 + go-version: ^1.22 - name: fmt check run: make fmt @@ -88,7 +88,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v3 with: - go-version: ^1.20 + go-version: ^1.22 - name: Start minikube uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 6fb6a4e..fae946e 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -34,7 +34,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v3 with: - go-version: ^1.20 + go-version: ^1.22 - name: Start minikube uses: medyagh/setup-minikube@697f2b7aaed5f70bf2a94ee21a4ec3dde7b12f92 # v0.0.9 diff --git a/.github/workflows/release.yaml 
b/.github/workflows/release.yaml index 81cc2a4..9bb8aec 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -21,7 +21,7 @@ jobs: echo "tag=${{ inputs.release_tag }}" >> ${GITHUB_ENV} - uses: actions/setup-go@v3 with: - go-version: ^1.20 + go-version: ^1.22 - name: GHCR Login uses: docker/login-action@v2 with: @@ -51,7 +51,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.20 + go-version: ^1.22 - name: Set tag run: | echo "Tag for release is ${{ inputs.release_tag }}" @@ -86,7 +86,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: ^1.20 + go-version: ^1.22 - name: Set tag run: | echo "Tag for release is ${{ inputs.release_tag }}" diff --git a/Dockerfile b/Dockerfile index a14a18f..d02e96d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM golang:1.20 as builder +FROM golang:1.22 as builder ARG TARGETOS ARG TARGETARCH diff --git a/Makefile b/Makefile index f8bd25a..15777ae 100644 --- a/Makefile +++ b/Makefile @@ -173,6 +173,19 @@ deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. $(KUSTOMIZE) build config/default | kubectl delete --ignore-not-found=$(ignore-not-found) -f - + +.PHONY: test-deploy +test-deploy: manifests kustomize + docker build --no-cache -t ${DEVIMG} . 
+ docker push ${DEVIMG} + cd config/manager && $(KUSTOMIZE) edit set image controller=${DEVIMG} + $(KUSTOMIZE) build config/default > examples/dist/metrics-operator-dev.yaml + +.PHONY: test-deploy-recreate +test-deploy-recreate: test-deploy + kubectl delete -f ./examples/dist/metrics-operator-dev.yaml || echo "Already deleted" + kubectl apply -f ./examples/dist/metrics-operator-dev.yaml + ##@ Build Dependencies ## Location to install dependencies to @@ -187,7 +200,7 @@ ENVTEST ?= $(LOCALBIN)/setup-envtest ## Tool Versions KUSTOMIZE_VERSION ?= v3.8.7 -CONTROLLER_TOOLS_VERSION ?= v0.11.1 +CONTROLLER_TOOLS_VERSION ?= v0.14.0 KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" .PHONY: kustomize @@ -205,6 +218,7 @@ $(CONTROLLER_GEN): $(LOCALBIN) test -s $(LOCALBIN)/controller-gen && $(LOCALBIN)/controller-gen --version | grep -q $(CONTROLLER_TOOLS_VERSION) || \ GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION) + .PHONY: envtest envtest: $(ENVTEST) ## Download envtest-setup locally if necessary. $(ENVTEST): $(LOCALBIN) diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index 21a05ff..cd9bb75 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -1,5 +1,4 @@ //go:build !ignore_autogenerated -// +build !ignore_autogenerated /* Copyright 2023. 
@@ -147,7 +146,8 @@ func (in *Metric) DeepCopyInto(out *Metric) { if val == nil { (*out)[key] = nil } else { - in, out := &val, &outVal + inVal := (*in)[key] + in, out := &inVal, &outVal *out = make([]intstr.IntOrString, len(*in)) copy(*out, *in) } @@ -162,7 +162,8 @@ func (in *Metric) DeepCopyInto(out *Metric) { if val == nil { (*out)[key] = nil } else { - in, out := &val, &outVal + inVal := (*in)[key] + in, out := &inVal, &outVal *out = make(map[string]intstr.IntOrString, len(*in)) for key, val := range *in { (*out)[key] = val @@ -203,7 +204,8 @@ func (in *MetricAddon) DeepCopyInto(out *MetricAddon) { if val == nil { (*out)[key] = nil } else { - in, out := &val, &outVal + inVal := (*in)[key] + in, out := &inVal, &outVal *out = make([]intstr.IntOrString, len(*in)) copy(*out, *in) } @@ -218,7 +220,8 @@ func (in *MetricAddon) DeepCopyInto(out *MetricAddon) { if val == nil { (*out)[key] = nil } else { - in, out := &val, &outVal + inVal := (*in)[key] + in, out := &inVal, &outVal *out = make(map[string]intstr.IntOrString, len(*in)) for key, val := range *in { (*out)[key] = val diff --git a/config/crd/bases/flux-framework.org_metricsets.yaml b/config/crd/bases/flux-framework.org_metricsets.yaml index 4467753..7a1823d 100644 --- a/config/crd/bases/flux-framework.org_metricsets.yaml +++ b/config/crd/bases/flux-framework.org_metricsets.yaml @@ -3,8 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.11.1 - creationTimestamp: null + controller-gen.kubebuilder.io/version: v0.14.0 name: metricsets.flux-framework.org spec: group: flux-framework.org @@ -21,14 +20,19 @@ spec: description: MetricSet is the Schema for the metrics API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -37,7 +41,8 @@ spec: properties: deadlineSeconds: default: 31500000 - description: Should the job be limited to a particular number of seconds? + description: |- + Should the job be limited to a particular number of seconds? Approximately one year. This cannot be zero or job won't start format: int64 type: integer @@ -45,13 +50,14 @@ spec: description: Don't set JobSet FQDN type: boolean logging: - description: Logging spec, preparing for other kinds of logging Right - now we just include an interactive option + description: |- + Logging spec, preparing for other kinds of logging + Right now we just include an interactive option properties: interactive: - description: Don't allow the application, metric, or storage test - to finish This adds sleep infinity at the end to allow for interactive - mode. 
+ description: |- + Don't allow the application, metric, or storage test to finish + This adds sleep infinity at the end to allow for interactive mode. type: boolean type: object metrics: @@ -60,15 +66,15 @@ spec: items: properties: addons: - description: A Metric addon can be storage (volume) or an application, - It's an additional entity that can customize a replicated - job, either adding assets / features or entire containers - to the pod + description: |- + A Metric addon can be storage (volume) or an application, + It's an additional entity that can customize a replicated job, + either adding assets / features or entire containers to the pod items: - description: 'A Metric addon is an interface that exposes - extra volumes for a metric. Examples include: A storage - volume to be mounted on one or more of the replicated jobs - A single application container.' + description: |- + A Metric addon is an interface that exposes extra volumes for a metric. Examples include: + A storage volume to be mounted on one or more of the replicated jobs + A single application container. 
properties: listOptions: additionalProperties: @@ -129,7 +135,9 @@ spec: - type: string x-kubernetes-int-or-string: true type: array - description: Metric List Options Metric specific options + description: |- + Metric List Options + Metric specific options type: object mapOptions: additionalProperties: @@ -149,7 +157,9 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true - description: Metric Options Metric specific options + description: |- + Metric Options + Metric specific options type: object resources: description: Resources include limits and requests for the metric diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 43d29f0..a595dd2 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -2,7 +2,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - creationTimestamp: null name: manager-role rules: - apiGroups: diff --git a/controllers/metric/metric_controller.go b/controllers/metric/metric_controller.go index 4785302..ddeebca 100644 --- a/controllers/metric/metric_controller.go +++ b/controllers/metric/metric_controller.go @@ -129,7 +129,7 @@ func (r *MetricSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // Ensure the metricset is mapped to a JobSet. For design: // 1. If an application is provided, we pair the application at some scale with each metric as a contaienr - // 2. If storage is provided, we create the volumes for the metric containers + // 2. 
If storage or other addons are provided, we create the volumes for the metric containers result, err := r.ensureMetricSet(ctx, &spec, &set) if err != nil { r.Log.Error(err, "🟥️ Issue ensuring metric set") diff --git a/docs/_static/data/metrics.json b/docs/_static/data/metrics.json index 5d1bc46..86502ec 100644 --- a/docs/_static/data/metrics.json +++ b/docs/_static/data/metrics.json @@ -20,6 +20,13 @@ "image": "ghcr.io/converged-computing/metric-cabanapic:latest", "url": "https://github.com/ECP-copa/CabanaPIC" }, + { + "name": "app-custom", + "description": "Provide a custom application for MPI trace", + "family": "proxyapp", + "image": "", + "url": "https://converged-computing.github.io/metrics-operator" + }, { "name": "app-hpl", "description": "High-Performance Linpack (HPL)", diff --git a/docs/getting_started/addons.md b/docs/getting_started/addons.md index 2e100d9..b5e3205 100644 --- a/docs/getting_started/addons.md +++ b/docs/getting_started/addons.md @@ -270,7 +270,7 @@ wrapper to the actual executable. ### perf-mpitrace - - *[perf-mpitrace](https://github.com/converged-computing/metrics-operator/tree/main/examples/addons/perf-mpitrace)* + - *[perf-mpitrace](https://github.com/converged-computing/metrics-operator/tree/main/examples/addons/mpitrace-lammps)* This metric provides [mpitrace](https://github.com/IBM/mpitrace) to wrap an MPI application. The setup is the same as hpctoolkit, and we currently only provide a rocky base (please let us know if you need another). It works by way of wrapping the mpirun command with `LD_PRELOAD`. 
diff --git a/docs/getting_started/metrics.md b/docs/getting_started/metrics.md index f8bd87a..fd07113 100644 --- a/docs/getting_started/metrics.md +++ b/docs/getting_started/metrics.md @@ -251,6 +251,49 @@ Here are some useful resources for the benchmarks: - [HPC Council](https://hpcadvisorycouncil.atlassian.net/wiki/spaces/HPCWORKS/pages/1284538459/OSU+Benchmark+Tuning+for+2nd+Gen+AMD+EPYC+using+HDR+InfiniBand+over+HPC-X+MPI) - [AWS Tutorials](https://www.hpcworkshops.com/08-efa/04-complie-run-osu.html) +### app-custom + +A custom application can support any application to be used as a metric app. For the following parameters, "command" and "container" are required. + +| Name | Description | Option Key | Type | Default | +|-----|-------------|------------|------|---------| +| command | The full mpirun command | options->command | string | unset | +| workdir | The working directory for the command | options->workdir | string | unset | +| soleTenancy | require each pod to have sole tenancy | options->soleTenancy | string | "false" | + +As an example, here is running mpitrace (an addon) with a custom container. 
+ +```yaml +apiVersion: flux-framework.org/v1alpha2 +kind: MetricSet +metadata: + labels: + app.kubernetes.io/name: metricset + app.kubernetes.io/instance: metricset-sample + name: metricset-sample +spec: + # Number of pods for the application (one launcher, the rest workers) + pods: 4 + metrics: + - name: app-custom + image: ghcr.io/converged-computing/ + options: + command: mpirun --hostfile ./hostlist.txt -mca orte_keep_fqdn_hostnames t -np 4 --map-by socket + workdir: + + # Add on mpitrace, will mount a volume and wrap the application + addons: + - name: perf-mpitrace + options: + mount: /opt/mnt + image: ghcr.io/converged-computing/metric-mpitrace:ubuntu-jammy + workdir: + # this is the target of the replicated job "l" means launcher + target: l + # This is the target container, with full name "launcher" + containerTarget: launcher +``` + ### app-lammps - *[app-lammps](https://github.com/converged-computing/metrics-operator/tree/main/examples/tests/app-lammps)* diff --git a/examples/addons/mpitrace-lammps/metrics-rocky.yaml b/examples/addons/mpitrace-lammps/metrics-rocky.yaml index 578dec9..2d5881e 100644 --- a/examples/addons/mpitrace-lammps/metrics-rocky.yaml +++ b/examples/addons/mpitrace-lammps/metrics-rocky.yaml @@ -22,7 +22,6 @@ spec: command: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 4 --map-by socket lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite workdir: /opt/lammps/examples/reaxff/HNS - # Add on hpctoolkit, will mount a volume and wrap lammps addons: - name: perf-mpitrace options: diff --git a/examples/dist/metrics-operator-arm.yaml b/examples/dist/metrics-operator-arm.yaml index 0afd08a..1e04cce 100644 --- a/examples/dist/metrics-operator-arm.yaml +++ b/examples/dist/metrics-operator-arm.yaml @@ -15,8 +15,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.11.1 - creationTimestamp: null + controller-gen.kubebuilder.io/version: v0.14.0 name: 
metricsets.flux-framework.org spec: group: flux-framework.org @@ -33,10 +32,19 @@ spec: description: MetricSet is the Schema for the metrics API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -45,17 +53,23 @@ spec: properties: deadlineSeconds: default: 31500000 - description: Should the job be limited to a particular number of seconds? Approximately one year. This cannot be zero or job won't start + description: |- + Should the job be limited to a particular number of seconds? + Approximately one year. 
This cannot be zero or job won't start format: int64 type: integer dontSetFQDN: description: Don't set JobSet FQDN type: boolean logging: - description: Logging spec, preparing for other kinds of logging Right now we just include an interactive option + description: |- + Logging spec, preparing for other kinds of logging + Right now we just include an interactive option properties: interactive: - description: Don't allow the application, metric, or storage test to finish This adds sleep infinity at the end to allow for interactive mode. + description: |- + Don't allow the application, metric, or storage test to finish + This adds sleep infinity at the end to allow for interactive mode. type: boolean type: object metrics: @@ -63,9 +77,15 @@ spec: items: properties: addons: - description: A Metric addon can be storage (volume) or an application, It's an additional entity that can customize a replicated job, either adding assets / features or entire containers to the pod + description: |- + A Metric addon can be storage (volume) or an application, + It's an additional entity that can customize a replicated job, + either adding assets / features or entire containers to the pod items: - description: 'A Metric addon is an interface that exposes extra volumes for a metric. Examples include: A storage volume to be mounted on one or more of the replicated jobs A single application container.' + description: |- + A Metric addon is an interface that exposes extra volumes for a metric. Examples include: + A storage volume to be mounted on one or more of the replicated jobs + A single application container. 
properties: listOptions: additionalProperties: @@ -126,7 +146,9 @@ spec: - type: string x-kubernetes-int-or-string: true type: array - description: Metric List Options Metric specific options + description: |- + Metric List Options + Metric specific options type: object mapOptions: additionalProperties: @@ -146,7 +168,9 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true - description: Metric Options Metric specific options + description: |- + Metric Options + Metric specific options type: object resources: description: Resources include limits and requests for the metric container @@ -280,7 +304,6 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - creationTimestamp: null name: metrics-manager-role rules: - apiGroups: diff --git a/examples/dist/metrics-operator.yaml b/examples/dist/metrics-operator.yaml index c82cbcd..1fefb29 100644 --- a/examples/dist/metrics-operator.yaml +++ b/examples/dist/metrics-operator.yaml @@ -15,8 +15,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.11.1 - creationTimestamp: null + controller-gen.kubebuilder.io/version: v0.14.0 name: metricsets.flux-framework.org spec: group: flux-framework.org @@ -33,10 +32,19 @@ spec: description: MetricSet is the Schema for the metrics API properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object @@ -45,17 +53,23 @@ spec: properties: deadlineSeconds: default: 31500000 - description: Should the job be limited to a particular number of seconds? Approximately one year. This cannot be zero or job won't start + description: |- + Should the job be limited to a particular number of seconds? + Approximately one year. This cannot be zero or job won't start format: int64 type: integer dontSetFQDN: description: Don't set JobSet FQDN type: boolean logging: - description: Logging spec, preparing for other kinds of logging Right now we just include an interactive option + description: |- + Logging spec, preparing for other kinds of logging + Right now we just include an interactive option properties: interactive: - description: Don't allow the application, metric, or storage test to finish This adds sleep infinity at the end to allow for interactive mode. + description: |- + Don't allow the application, metric, or storage test to finish + This adds sleep infinity at the end to allow for interactive mode. 
type: boolean type: object metrics: @@ -63,9 +77,15 @@ spec: items: properties: addons: - description: A Metric addon can be storage (volume) or an application, It's an additional entity that can customize a replicated job, either adding assets / features or entire containers to the pod + description: |- + A Metric addon can be storage (volume) or an application, + It's an additional entity that can customize a replicated job, + either adding assets / features or entire containers to the pod items: - description: 'A Metric addon is an interface that exposes extra volumes for a metric. Examples include: A storage volume to be mounted on one or more of the replicated jobs A single application container.' + description: |- + A Metric addon is an interface that exposes extra volumes for a metric. Examples include: + A storage volume to be mounted on one or more of the replicated jobs + A single application container. properties: listOptions: additionalProperties: @@ -126,7 +146,9 @@ spec: - type: string x-kubernetes-int-or-string: true type: array - description: Metric List Options Metric specific options + description: |- + Metric List Options + Metric specific options type: object mapOptions: additionalProperties: @@ -146,7 +168,9 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true - description: Metric Options Metric specific options + description: |- + Metric Options + Metric specific options type: object resources: description: Resources include limits and requests for the metric container @@ -280,7 +304,6 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - creationTimestamp: null name: metrics-manager-role rules: - apiGroups: diff --git a/go.mod b/go.mod index 65dcc54..d0343ed 100644 --- a/go.mod +++ b/go.mod @@ -1,12 +1,12 @@ module github.com/converged-computing/metrics-operator -go 1.20 +go 1.22 require ( github.com/go-logr/logr v1.2.4 github.com/onsi/ginkgo/v2 v2.11.0 github.com/onsi/gomega v1.27.8 - 
gopkg.in/yaml.v2 v2.4.0 + go.uber.org/zap v1.24.0 k8s.io/api v0.27.3 k8s.io/apimachinery v0.27.3 k8s.io/client-go v0.27.3 @@ -51,7 +51,6 @@ require ( github.com/spf13/pflag v1.0.5 // indirect go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.24.0 // indirect golang.org/x/net v0.10.0 // indirect golang.org/x/oauth2 v0.8.0 // indirect golang.org/x/sys v0.9.0 // indirect @@ -65,6 +64,7 @@ require ( google.golang.org/grpc v1.51.0 // indirect google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.27.2 // indirect k8s.io/component-base v0.27.2 // indirect diff --git a/go.sum b/go.sum index dc2ee6b..c1735d7 100644 --- a/go.sum +++ b/go.sum @@ -39,6 +39,7 @@ github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.m github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= +github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.6.0 h1:b91NhWfaz02IuVxO9faSllyAtNXHMPkC5J8sJCLunww= github.com/evanphx/json-patch/v5 v5.6.0/go.mod h1:G79N1coSVB93tBe7j6PhzjmR3/2VvlbKOFpnXhI9Bw4= github.com/flowstack/go-jsonschema v0.1.1/go.mod h1:yL7fNggx1o8rm9RlgXv7hTBWxdBM0rVwpMwimd3F3N0= @@ -114,6 +115,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -149,6 +151,7 @@ github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+Pymzi github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= @@ -164,6 +167,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= @@ -176,6 +180,7 @@ go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod 
h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= +go.uber.org/goleak v1.2.1/go.mod h1:qlT2yGI9QafXHhZZLxlSuNsMw3FFLxBr+tBRlmO1xH4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= @@ -193,6 +198,7 @@ golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= +golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= diff --git a/pkg/addons/mpitrace.go b/pkg/addons/mpitrace.go index 89f9cf1..106eac2 100644 --- a/pkg/addons/mpitrace.go +++ b/pkg/addons/mpitrace.go @@ -93,7 +93,10 @@ func (a *MPITrace) CustomizeEntrypoints( cs []*specs.ContainerSpec, rjs []*jobset.ReplicatedJob, ) { + logger.Infof("🟧️ Customizing entrypoints for %s\n", rjs) + for _, rj := range rjs { + logger.Infof("🟧️ Comparing job target %s vs job name %s\n", a.target, rj.Name) // Only customize if the replicated job name matches the target if a.target != "" && a.target != rj.Name { @@ -101,7 +104,6 @@ func (a *MPITrace) CustomizeEntrypoints( } a.customizeEntrypoint(cs, rj) } - } // CustomizeEntrypoint for a single replicated job @@ -120,7 +122,7 @@ echo "%s" wget -q 
https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs
 chmod +x ./wait-fs
 mv ./wait-fs /usr/bin/goshare-wait-fs
- 
+
 # Ensure spack view is on the path, wherever it is mounted
 viewbase="%s"
 software="${viewbase}/software"
@@ -205,6 +207,6 @@ func init() {
 	}
 	app := ApplicationAddon{AddonBase: base}
 	spack := SpackView{ApplicationAddon: app}
-	toolkit := MPITrace{SpackView: spack}
-	Register(&toolkit)
+	tracer := MPITrace{SpackView: spack}
+	Register(&tracer)
 }
diff --git a/pkg/metrics/app/custom.go b/pkg/metrics/app/custom.go
new file mode 100644
index 0000000..d8a1f6b
--- /dev/null
+++ b/pkg/metrics/app/custom.go
@@ -0,0 +1,137 @@
+/*
+Copyright 2024 Lawrence Livermore National Security, LLC
+ (c.f. AUTHORS, NOTICE.LLNS, COPYING)
+
+SPDX-License-Identifier: MIT
+*/
+
+package application
+
+import (
+	"fmt"
+
+	api "github.com/converged-computing/metrics-operator/api/v1alpha2"
+	"k8s.io/apimachinery/pkg/util/intstr"
+
+	"github.com/converged-computing/metrics-operator/pkg/metadata"
+	metrics "github.com/converged-computing/metrics-operator/pkg/metrics"
+	"github.com/converged-computing/metrics-operator/pkg/specs"
+)
+
+const (
+	customIdentifier = "app-custom"
+	customSummary    = "Provide a custom application for MPI trace"
+)
+
+// CustomApp is a launcher/worker metric that runs the configured command.
+type CustomApp struct {
+	metrics.LauncherWorker
+}
+
+// Url returns the documentation URL for this metric.
+func (m CustomApp) Url() string {
+	return "https://converged-computing.github.io/metrics-operator"
+}
+
+// Family returns metrics.ProxyAppFamily.
+func (m CustomApp) Family() string {
+	return metrics.ProxyAppFamily
+}
+
+// SetOptions sets custom options / attributes for the metric.
+func (m *CustomApp) SetOptions(metric *api.Metric) {
+
+	m.Identifier = customIdentifier
+	m.Summary = customSummary
+
+	// Ensure we set sole tenancy if desired. Parentheses make the grouping
+	// explicit, since && binds tighter than ||.
+	st, ok := metric.Options["soleTenancy"]
+	if ok && (st.StrVal == "true" || st.StrVal == "yes") {
+		m.SoleTenancy = true
+	}
+
+	// We require both a command and a container
+	m.SetDefaultOptions(metric)
+	if m.Command == "" || m.Container == "" {
+		fmt.Printf("Either \"command\" or \"container\" is not defined - this will not work as expected\n")
+	}
+}
+
+// Validate always passes: we don't know if the app can run on one node or not.
+func (m CustomApp) Validate(spec *api.MetricSet) bool {
+	return true
+}
+
+// Options returns the exported options (command, workdir, soleTenancy) as strings.
+func (m CustomApp) Options() map[string]intstr.IntOrString {
+	values := map[string]intstr.IntOrString{
+		"command":     intstr.FromString(m.Command),
+		"workdir":     intstr.FromString(m.Workdir),
+		"soleTenancy": intstr.FromString("false"),
+	}
+	if m.SoleTenancy {
+		values["soleTenancy"] = intstr.FromString("true")
+	}
+	return values
+}
+
+// PrepareContainers prepares containers with jobs and entrypoint scripts.
+func (m CustomApp) PrepareContainers(
+	spec *api.MetricSet,
+	metric *metrics.Metric,
+) []*specs.ContainerSpec {
+
+	// Metadata to add to beginning of run
+	meta := metrics.Metadata(spec, metric)
+	hosts := m.GetHostlist(spec)
+	prefix := m.GetCommonPrefix(meta, m.Command, hosts)
+
+	// Template blocks for launcher script
+	preBlock := `
+echo "%s"
+`
+
+	postBlock := `
+echo "%s"
+%s
+`
+	interactive := metadata.Interactive(spec.Spec.Logging.Interactive)
+	preBlock = prefix + fmt.Sprintf(preBlock, metadata.Separator)
+	postBlock = fmt.Sprintf(postBlock, metadata.CollectionEnd, interactive)
+
+	// Entrypoint for the launcher
+	launcherEntrypoint := specs.EntrypointScript{
+		Name:    specs.DeriveScriptKey(m.LauncherScript),
+		Path:    m.LauncherScript,
+		Pre:     preBlock,
+		Command: m.Command,
+		Post:    postBlock,
+	}
+
+	// Entrypoint for the worker
+	// Just has a sleep infinity added to the prefix
+	workerEntrypoint := specs.EntrypointScript{
+		Name:    specs.DeriveScriptKey(m.WorkerScript),
+		Path:    m.WorkerScript,
+		Pre:     prefix,
+		Command: "sleep infinity",
+	}
+
+	// These are associated with replicated jobs via JobName
+	launcherContainer := m.GetLauncherContainerSpec(launcherEntrypoint)
+	workerContainer := m.GetWorkerContainerSpec(workerEntrypoint)
+
+	// Return the script templates for each of launcher and worker
+	return []*specs.ContainerSpec{&launcherContainer, &workerContainer}
+}
+
+func init() {
+	base := metrics.BaseMetric{
+		Identifier: customIdentifier,
+		Summary:    customSummary,
+	}
+	launcher := metrics.LauncherWorker{BaseMetric: base}
+	custom := CustomApp{LauncherWorker: launcher}
+	metrics.Register(&custom)
+}
diff --git a/pkg/metrics/base.go b/pkg/metrics/base.go
index 7c6413a..ce94e0d 100644
--- a/pkg/metrics/base.go
+++ b/pkg/metrics/base.go
@@ -116,6 +116,8 @@ func (m *BaseMetric) ReplicatedJobs(spec *api.MetricSet) ([]*jobset.ReplicatedJo
 }
 
 // SetDefaultOptions that are shared (possibly)
+// TODO this doesn't do anything given an interface and needs
+// a different placement.
 func (m BaseMetric) SetDefaultOptions(metric *api.Metric) {
 	st, ok := metric.Options["soleTenancy"]
 	if ok && st.StrVal == "false" || st.StrVal == "no" {
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index d43f083..2da7894 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -88,7 +88,7 @@ func GetMetric(metric *api.Metric, set *api.MetricSet) (Metric, error) {
 		logger.Infof("Attempting to add addon %s", a.Name)
 		addon, err := addons.GetAddon(&a, set)
 		if err != nil {
-			return nil, fmt.Errorf("Addon %s for metric %s did not validate", a.Name, metric.Name)
+			return nil, fmt.Errorf("addon %s for metric %s did not validate", a.Name, metric.Name)
 		}
 		logger.Infof("Registering addon %s", a.Name)
 		m.RegisterAddon(&addon)