Skip to content

Commit

Permalink
custom: add support for custom container
Browse files Browse the repository at this point in the history
We should be able to support custom containers, and
configuration of addons to them. I am not liking the
design to have addons defined in parallel, and want to
refactor so they are part of the metric. I am also
wondering if the metrics themselves are more akin to
apps. I have not looked at this project in a bit and
need to think about it.

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Sep 24, 2024
1 parent 8835f15 commit f87207c
Show file tree
Hide file tree
Showing 15 changed files with 247 additions and 39 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Build the manager binary
FROM golang:1.20 as builder
FROM golang:1.22 as builder
ARG TARGETOS
ARG TARGETARCH

Expand Down
16 changes: 15 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,19 @@ deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in
undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
$(KUSTOMIZE) build config/default | kubectl delete --ignore-not-found=$(ignore-not-found) -f -


# Build and push a development image, then render the full operator manifest
# for it into examples/dist/metrics-operator-dev.yaml.
# DEVIMG is expected to be defined elsewhere in this Makefile (or passed on the
# command line, e.g. `make test-deploy DEVIMG=registry/name:tag`).
# NOTE: `kustomize edit set image` rewrites config/manager/kustomization.yaml
# in place; take care not to commit the resulting development image/tag.
.PHONY: test-deploy
test-deploy: manifests kustomize ## Build and push the dev image (DEVIMG) and render examples/dist/metrics-operator-dev.yaml.
	docker build --no-cache -t ${DEVIMG} .
	docker push ${DEVIMG}
	cd config/manager && $(KUSTOMIZE) edit set image controller=${DEVIMG}
	mkdir -p examples/dist
	$(KUSTOMIZE) build config/default > examples/dist/metrics-operator-dev.yaml

# Rebuild the development image and manifest (via test-deploy), then remove any
# previous deployment of it from the current cluster and apply it fresh.
# --ignore-not-found tolerates resources that are already gone while still
# surfacing real failures, instead of masking every error from `kubectl delete`.
.PHONY: test-deploy-recreate
test-deploy-recreate: test-deploy ## Rebuild the dev manifest, then delete and re-apply it on the cluster in ~/.kube/config.
	kubectl delete --ignore-not-found=true -f ./examples/dist/metrics-operator-dev.yaml
	kubectl apply -f ./examples/dist/metrics-operator-dev.yaml

##@ Build Dependencies

## Location to install dependencies to
Expand All @@ -187,7 +200,7 @@ ENVTEST ?= $(LOCALBIN)/setup-envtest

## Tool Versions
KUSTOMIZE_VERSION ?= v3.8.7
CONTROLLER_TOOLS_VERSION ?= v0.11.1
CONTROLLER_TOOLS_VERSION ?= v0.14.0

KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh"
.PHONY: kustomize
Expand All @@ -205,6 +218,7 @@ $(CONTROLLER_GEN): $(LOCALBIN)
test -s $(LOCALBIN)/controller-gen && $(LOCALBIN)/controller-gen --version | grep -q $(CONTROLLER_TOOLS_VERSION) || \
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION)


.PHONY: envtest
envtest: $(ENVTEST) ## Download envtest-setup locally if necessary.
$(ENVTEST): $(LOCALBIN)
Expand Down
58 changes: 34 additions & 24 deletions config/crd/bases/flux-framework.org_metricsets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.11.1
creationTimestamp: null
controller-gen.kubebuilder.io/version: v0.14.0
name: metricsets.flux-framework.org
spec:
group: flux-framework.org
Expand All @@ -21,14 +20,19 @@ spec:
description: MetricSet is the Schema for the metrics API
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
Expand All @@ -37,21 +41,23 @@ spec:
properties:
deadlineSeconds:
default: 31500000
description: Should the job be limited to a particular number of seconds?
description: |-
Should the job be limited to a particular number of seconds?
Approximately one year. This cannot be zero or job won't start
format: int64
type: integer
dontSetFQDN:
description: Don't set JobSet FQDN
type: boolean
logging:
description: Logging spec, preparing for other kinds of logging Right
now we just include an interactive option
description: |-
Logging spec, preparing for other kinds of logging
Right now we just include an interactive option
properties:
interactive:
description: Don't allow the application, metric, or storage test
to finish This adds sleep infinity at the end to allow for interactive
mode.
description: |-
Don't allow the application, metric, or storage test to finish
This adds sleep infinity at the end to allow for interactive mode.
type: boolean
type: object
metrics:
Expand All @@ -60,15 +66,15 @@ spec:
items:
properties:
addons:
description: A Metric addon can be storage (volume) or an application,
It's an additional entity that can customize a replicated
job, either adding assets / features or entire containers
to the pod
description: |-
A Metric addon can be storage (volume) or an application,
It's an additional entity that can customize a replicated job,
either adding assets / features or entire containers to the pod
items:
description: 'A Metric addon is an interface that exposes
extra volumes for a metric. Examples include: A storage
volume to be mounted on one or more of the replicated jobs
A single application container.'
description: |-
A Metric addon is an interface that exposes extra volumes for a metric. Examples include:
A storage volume to be mounted on one or more of the replicated jobs
A single application container.
properties:
listOptions:
additionalProperties:
Expand Down Expand Up @@ -129,7 +135,9 @@ spec:
- type: string
x-kubernetes-int-or-string: true
type: array
description: Metric List Options Metric specific options
description: |-
Metric List Options
Metric specific options
type: object
mapOptions:
additionalProperties:
Expand All @@ -149,7 +157,9 @@ spec:
- type: integer
- type: string
x-kubernetes-int-or-string: true
description: Metric Options Metric specific options
description: |-
Metric Options
Metric specific options
type: object
resources:
description: Resources include limits and requests for the metric
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ kind: Kustomization
images:
- name: controller
newName: ghcr.io/converged-computing/metrics-operator
newTag: latest
newTag: test
1 change: 0 additions & 1 deletion config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
creationTimestamp: null
name: manager-role
rules:
- apiGroups:
Expand Down
2 changes: 1 addition & 1 deletion controllers/metric/metric_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func (r *MetricSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (

// Ensure the metricset is mapped to a JobSet. For design:
// 1. If an application is provided, we pair the application at some scale with each metric as a container
// 2. If storage is provided, we create the volumes for the metric containers
// 2. If storage or other addons are provided, we create the volumes for the metric containers
result, err := r.ensureMetricSet(ctx, &spec, &set)
if err != nil {
r.Log.Error(err, "🟥️ Issue ensuring metric set")
Expand Down
2 changes: 1 addition & 1 deletion docs/getting_started/addons.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ wrapper to the actual executable.

### perf-mpitrace

- *[perf-mpitrace](https://github.com/converged-computing/metrics-operator/tree/main/examples/addons/perf-mpitrace)*
- *[perf-mpitrace](https://github.com/converged-computing/metrics-operator/tree/main/examples/addons/mpitrace-lammps)*

This metric provides [mpitrace](https://github.com/IBM/mpitrace) to wrap an MPI application. The setup is the same as hpctoolkit, and we
currently only provide a rocky base (please let us know if you need another). It works by way of wrapping the mpirun command with `LD_PRELOAD`.
Expand Down
40 changes: 40 additions & 0 deletions docs/getting_started/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,46 @@ Here are some useful resources for the benchmarks:
- [HPC Council](https://hpcadvisorycouncil.atlassian.net/wiki/spaces/HPCWORKS/pages/1284538459/OSU+Benchmark+Tuning+for+2nd+Gen+AMD+EPYC+using+HDR+InfiniBand+over+HPC-X+MPI)
- [AWS Tutorials](https://www.hpcworkshops.com/08-efa/04-complie-run-osu.html)

### app-custom

The custom application metric allows any containerized application to be used as a metric app. Of the following parameters, "command" is required, along with the container to run (the metric `image`).

| Name | Description | Option Key | Type | Default |
|-----|-------------|------------|------|---------|
| command | The full mpirun command | options->command |string | unset |
| workdir | The working directory for the command | options->workdir | string | unset |
| soleTenancy | require each pod to have sole tenancy | options->soleTenancy | string | "false" |

As an example, here is running mpitrace (an addon) with a custom container.

```yaml
apiVersion: flux-framework.org/v1alpha2
kind: MetricSet
metadata:
labels:
app.kubernetes.io/name: metricset
app.kubernetes.io/instance: metricset-sample
name: metricset-sample
spec:
# Number of pods for the application (one launcher, the rest workers)
pods: 4
metrics:
- name: app-custom
image: ghcr.io/converged-computing/<your-container>
options:
command: mpirun --hostfile ./hostlist.txt -mca orte_keep_fqdn_hostnames t -np 4 --map-by socket <app> <options>
workdir: <workdir>

# Add on mpitrace, will mount a volume and wrap the application command
addons:
- name: perf-mpitrace
options:
mount: /opt/mnt
image: ghcr.io/converged-computing/metric-mpitrace:ubuntu-jammy
workdir: <workdir>
containerTarget: launcher
```

### app-lammps
- *[app-lammps](https://github.com/converged-computing/metrics-operator/tree/main/examples/tests/app-lammps)*
Expand Down
1 change: 0 additions & 1 deletion examples/addons/mpitrace-lammps/metrics-rocky.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ spec:
command: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 4 --map-by socket lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
workdir: /opt/lammps/examples/reaxff/HNS

# Add on hpctoolkit, will mount a volume and wrap lammps
addons:
- name: perf-mpitrace
options:
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
module github.com/converged-computing/metrics-operator

go 1.20
go 1.22

require (
github.com/go-logr/logr v1.2.4
github.com/onsi/ginkgo/v2 v2.11.0
github.com/onsi/gomega v1.27.8
gopkg.in/yaml.v2 v2.4.0
go.uber.org/zap v1.24.0
k8s.io/api v0.27.3
k8s.io/apimachinery v0.27.3
k8s.io/client-go v0.27.3
Expand Down Expand Up @@ -51,7 +51,6 @@ require (
github.com/spf13/pflag v1.0.5 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/net v0.10.0 // indirect
golang.org/x/oauth2 v0.8.0 // indirect
golang.org/x/sys v0.9.0 // indirect
Expand All @@ -65,6 +64,7 @@ require (
google.golang.org/grpc v1.51.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.27.2 // indirect
k8s.io/component-base v0.27.2 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.m
github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84=
github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.6.0 h1:b91NhWfaz02IuVxO9faSllyAtNXHMPkC5J8sJCLunww=
github.com/evanphx/json-patch/v5 v5.6.0/go.mod h1:G79N1coSVB93tBe7j6PhzjmR3/2VvlbKOFpnXhI9Bw4=
github.com/flowstack/go-jsonschema v0.1.1/go.mod h1:yL7fNggx1o8rm9RlgXv7hTBWxdBM0rVwpMwimd3F3N0=
Expand Down Expand Up @@ -114,6 +115,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand Down Expand Up @@ -149,6 +151,7 @@ github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+Pymzi
github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM=
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
Expand All @@ -164,6 +167,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU=
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
Expand All @@ -176,6 +180,7 @@ go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ=
go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A=
go.uber.org/goleak v1.2.1/go.mod h1:qlT2yGI9QafXHhZZLxlSuNsMw3FFLxBr+tBRlmO1xH4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
Expand All @@ -193,6 +198,7 @@ golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk=
golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
Expand Down
10 changes: 6 additions & 4 deletions pkg/addons/mpitrace.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,17 @@ func (a *MPITrace) CustomizeEntrypoints(
cs []*specs.ContainerSpec,
rjs []*jobset.ReplicatedJob,
) {
logger.Infof("🟧️ Customizing entrypoints for %s\n", rjs)

for _, rj := range rjs {
logger.Infof("🟧️ Comparing %s vs %s\n", a.target, rj.Name)

// Only customize if the replicated job name matches the target
if a.target != "" && a.target != rj.Name {
continue
}
a.customizeEntrypoint(cs, rj)
}

}

// CustomizeEntrypoint for a single replicated job
Expand All @@ -120,7 +122,7 @@ echo "%s"
wget -q https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs
chmod +x ./wait-fs
mv ./wait-fs /usr/bin/goshare-wait-fs
# Ensure spack view is on the path, wherever it is mounted
viewbase="%s"
software="${viewbase}/software"
Expand Down Expand Up @@ -205,6 +207,6 @@ func init() {
}
app := ApplicationAddon{AddonBase: base}
spack := SpackView{ApplicationAddon: app}
toolkit := MPITrace{SpackView: spack}
Register(&toolkit)
tracer := MPITrace{SpackView: spack}
Register(&tracer)
}
Loading

0 comments on commit f87207c

Please sign in to comment.