diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index bf8217a..daa0ede 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -21,19 +21,31 @@ jobs:
     strategy:
       matrix:
         k8s:
-          - v1.21.2
-          - v1.22.1
-          - v1.23.0
+          - v1.22.7
+          - v1.23.6
+          - v1.24.0
     steps:
       - name: Checkout
         uses: actions/checkout@v1
       - name: Create kind ${{ matrix.k8s }} cluster
-        uses: helm/kind-action@v1.1.0
+        uses: helm/kind-action@v1.2.0
         with:
-          version: v0.11.1
+          version: v0.13.0
           node_image: kindest/node:${{ matrix.k8s }}
       - name: Set up chart-testing
         uses: helm/chart-testing-action@v2.2.1
+      # Replace the placeholder credentials in every chart's ci/*.yaml values
+      # file. Secrets reach the shell through env vars instead of being expanded
+      # into the script text, so quotes or `$` in a value cannot alter the
+      # command. `|` is the sed delimiter so a `/` in a value is harmless.
+      - name: Inject secrets
+        env:
+          AGENTK8SGLUE_KEY: ${{ secrets.agentk8sglueKey }}
+          AGENTK8SGLUE_SECRET: ${{ secrets.agentk8sglueSecret }}
+        run: |
+          find ./charts/*/ci/*.yaml -type f -exec sed -i \
+            -e "s|AGENTK8SGLUEKEY|${AGENTK8SGLUE_KEY}|g" \
+            -e "s|AGENTK8SGLUESECRET|${AGENTK8SGLUE_SECRET}|g" {} \;
       - name: Run chart-testing (list-changed)
         id: list-changed
         run: |
diff --git a/charts/clearml-agent/.helmignore b/charts/clearml-agent/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/charts/clearml-agent/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/charts/clearml-agent/Chart.yaml b/charts/clearml-agent/Chart.yaml
new file mode 100644
index 0000000..72c9add
--- /dev/null
+++ b/charts/clearml-agent/Chart.yaml
@@ -0,0 +1,19 @@
+apiVersion: v2
+name: clearml-agent
+description: MLOps platform
+type: application
+version: "1.0.0"
+appVersion: "1.21"
+kubeVersion: ">= 1.19.0-0 < 1.25.0-0"
+home: https://clear.ml
+icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg
+sources:
+  - https://github.com/allegroai/clearml-helm-charts
+  - https://github.com/allegroai/clearml
+maintainers:
+  - name: valeriano-manassero
+    url: https://github.com/valeriano-manassero
+keywords:
+  - clearml
+  - "machine learning"
+  - mlops
diff --git a/charts/clearml-agent/README.md b/charts/clearml-agent/README.md
new file mode 100644
index 0000000..51310a7
--- /dev/null
+++ b/charts/clearml-agent/README.md
@@ -0,0 +1,57 @@
+# clearml-agent
+
+![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.21](https://img.shields.io/badge/AppVersion-1.21-informational?style=flat-square)
+
+MLOps platform
+
+**Homepage:** <https://clear.ml>
+
+## Maintainers
+
+| Name | Email | Url |
+| ---- | ------ | --- |
+| valeriano-manassero |  | <https://github.com/valeriano-manassero> |
+
+## Source Code
+
+* <https://github.com/allegroai/clearml-helm-charts>
+* <https://github.com/allegroai/clearml>
+
+## Requirements
+
+Kubernetes: `>= 1.19.0-0 < 1.25.0-0`
+
+## Values
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| agentk8sglue | object | `{"apiServerUrlReference":"https://api.clear.ml","defaultContainerImage":"ubuntu:18.04","fileServerUrlReference":"https://files.clear.ml","id":"k8s-agent","image":{"repository":"allegroai/clearml-agent-k8s","tag":"base-1.21"},"maxPods":10,"podTemplate":{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumes":[]},"queue":"default","replicaCount":1,"serviceAccountName":"default","webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue |
+| agentk8sglue.apiServerUrlReference | string | `"https://api.clear.ml"` | Reference to Api server url |
+| agentk8sglue.defaultContainerImage | string | `"ubuntu:18.04"` | default container image for ClearML Task pod |
+| agentk8sglue.fileServerUrlReference | string | `"https://files.clear.ml"` | Reference to File server url |
+| agentk8sglue.id | string | `"k8s-agent"` | ClearML worker ID (must be unique across the entire ClearML environment) |
+| agentk8sglue.image | object | `{"repository":"allegroai/clearml-agent-k8s","tag":"base-1.21"}` | Glue Agent image configuration |
+| agentk8sglue.maxPods | int | `10` | maximum concurrent consume ClearML Task pod |
+| agentk8sglue.podTemplate | object | `{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumes":[]}` | template for pods spawned to consume ClearML Task |
+| agentk8sglue.podTemplate.env | list | `[]` | environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) |
+| agentk8sglue.podTemplate.nodeSelector | object | `{}` | nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments) |
+| agentk8sglue.podTemplate.resources | object | `{}` | resources declaration for pods spawned to consume ClearML Task (example in values.yaml comments) |
+| agentk8sglue.podTemplate.tolerations | list | `[]` | tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments) |
+| agentk8sglue.podTemplate.volumes | list | `[]` | volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments) |
+| agentk8sglue.queue | string | `"default"` | ClearML queue this agent will consume |
+| agentk8sglue.replicaCount | int | `1` | Glue Agent number of pods |
+| agentk8sglue.serviceAccountName | string | `"default"` | serviceAccountName for pods spawned to consume ClearML Task |
+| agentk8sglue.webServerUrlReference | string | `"https://app.clear.ml"` | Reference to Web server url |
+| clearml | object | `{"agentk8sglueKey":"ACCESSKEY","agentk8sglueSecret":"SECRETKEY"}` | ClearML generic configurations |
+| clearml.agentk8sglueKey | string | `"ACCESSKEY"` | Agent k8s Glue basic auth key |
+| clearml.agentk8sglueSecret | string | `"SECRETKEY"` | Agent k8s Glue basic auth secret |
+| imageCredentials | object | `{"email":"someone@host.com","enabled":false,"existingSecret":"","password":"pwd","registry":"docker.io","username":"someone"}` | Private image registry configuration |
+| imageCredentials.email | string | `"someone@host.com"` | Email |
+| imageCredentials.enabled | bool | `false` | Use private authentication mode |
+| imageCredentials.existingSecret | string | `""` | If this is set, chart will not generate a secret but will use what is defined here |
+| imageCredentials.password | string | `"pwd"` | Registry password |
+| imageCredentials.registry | string | `"docker.io"` | Registry name |
+| imageCredentials.username | string | `"someone"` | Registry username |
+
+----------------------------------------------
+Autogenerated from chart metadata using [helm-docs v1.10.0](https://github.com/norwoodj/helm-docs/releases/v1.10.0)
diff --git a/charts/clearml-agent/ci/default-values.yaml b/charts/clearml-agent/ci/default-values.yaml
new file mode 100644
index 0000000..f9d9205
--- /dev/null
+++ b/charts/clearml-agent/ci/default-values.yaml
@@ -0,0 +1,3 @@
+clearml:
+  agentk8sglueKey: "AGENTK8SGLUEKEY"
+  agentk8sglueSecret: "AGENTK8SGLUESECRET"
diff --git a/charts/clearml-agent/templates/NOTES.txt b/charts/clearml-agent/templates/NOTES.txt
new file mode 100644
index 0000000..1c2bfce
--- /dev/null
+++ b/charts/clearml-agent/templates/NOTES.txt
@@ -0,0 +1 @@
+Glue Agent deployed.
diff --git a/charts/clearml-agent/templates/_helpers.tpl b/charts/clearml-agent/templates/_helpers.tpl
new file mode 100644
index 0000000..ea06a29
--- /dev/null
+++ b/charts/clearml-agent/templates/_helpers.tpl
@@ -0,0 +1,86 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "clearml.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "clearml.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "clearml.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "clearml.labels" -}}
+helm.sh/chart: {{ include "clearml.chart" . }}
+{{ include "clearml.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "clearml.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "clearml.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Reference Name (agentk8sglue)
+*/}}
+{{- define "agentk8sglue.referenceName" -}}
+{{- include "clearml.name" . }}-agentk8sglue
+{{- end }}
+
+{{/*
+Selector labels (agentk8sglue)
+*/}}
+{{- define "agentk8sglue.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "clearml.name" . }}
+app.kubernetes.io/instance: {{ include "agentk8sglue.referenceName" . }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "clearml.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "clearml.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create secret to access docker registry
+*/}}
+{{- define "imagePullSecret" }}
+{{- with .Values.imageCredentials }}
+{{- printf "{\"auths\":{\"%s\":{\"username\":\"%s\",\"password\":\"%s\",\"email\":\"%s\",\"auth\":\"%s\"}}}" .registry .username .password .email (printf "%s:%s" .username .password | b64enc) | b64enc }}
+{{- end }}
+{{- end }}
diff --git a/charts/clearml-agent/templates/agentk8sglue-configmap.yaml b/charts/clearml-agent/templates/agentk8sglue-configmap.yaml
new file mode 100644
index 0000000..c7ca83e
--- /dev/null
+++ b/charts/clearml-agent/templates/agentk8sglue-configmap.yaml
@@ -0,0 +1,58 @@
+# Pod template handed to the k8s glue agent. The agent Deployment mounts this
+# ConfigMap at /root/template and is started with
+# --template-yaml /root/template/template.yaml.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: k8sagent-pod-template
+data:
+  template.yaml: |
+    apiVersion: v1
+    metadata:
+      namespace: {{ .Release.Namespace }}
+    spec:
+      serviceAccountName: {{ .Values.agentk8sglue.serviceAccountName }}
+      volumes:
+        {{- range .Values.agentk8sglue.podTemplate.volumes }}
+        - name: {{ .name }}
+          persistentVolumeClaim:
+            claimName: {{ .name }}
+        {{- end }}
+      containers:
+      - resources:
+          {{- toYaml .Values.agentk8sglue.podTemplate.resources | nindent 10 }}
+        ports:
+        - containerPort: 10022
+        volumeMounts:
+          {{- range .Values.agentk8sglue.podTemplate.volumes }}
+          - mountPath: {{ .path }}
+            name: {{ .name }}
+          {{- end }}
+        env:
+        - name: CLEARML_API_HOST
+          value: {{ .Values.agentk8sglue.apiServerUrlReference | quote }}
+        - name: CLEARML_WEB_HOST
+          value: {{ .Values.agentk8sglue.webServerUrlReference | quote }}
+        - name: CLEARML_FILES_HOST
+          value: {{ .Values.agentk8sglue.fileServerUrlReference | quote }}
+        - name: CLEARML_API_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: clearml-agent-conf
+              key: agentk8sglue_key
+        - name: CLEARML_API_SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: clearml-agent-conf
+              key: agentk8sglue_secret
+        {{- if .Values.agentk8sglue.podTemplate.env }}
+        {{- toYaml .Values.agentk8sglue.podTemplate.env | nindent 8 }}
+        {{- end }}
+      {{- with .Values.agentk8sglue.podTemplate.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.agentk8sglue.podTemplate.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/charts/clearml-agent/templates/agentk8sglue-deployment.yaml b/charts/clearml-agent/templates/agentk8sglue-deployment.yaml
new file mode 100644
index 0000000..28ff2ed
--- /dev/null
+++ b/charts/clearml-agent/templates/agentk8sglue-deployment.yaml
@@ -0,0 +1,93 @@
+# Deployment running the ClearML k8s glue agent. An init container blocks
+# until the configured API, file and web servers answer before it starts.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "agentk8sglue.referenceName" . }}
+  labels:
+    {{- include "clearml.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.agentk8sglue.replicaCount }}
+  selector:
+    matchLabels:
+      {{- include "agentk8sglue.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      annotations:
+        checksum/config: {{ printf "%s" .Values.clearml | sha256sum }}
+      labels:
+        {{- include "agentk8sglue.selectorLabels" . | nindent 8 }}
+    spec:
+      {{- if .Values.imageCredentials.enabled }}
+      imagePullSecrets:
+      {{- if .Values.imageCredentials.existingSecret }}
+      - name: {{ .Values.imageCredentials.existingSecret | quote }}
+      {{- else }}
+      - name: clearml-agent-registry-key
+      {{- end }}
+      {{- end }}
+      initContainers:
+        - name: init-k8s-glue
+          image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
+          command:
+            - /bin/sh
+            - -c
+            - >
+              set -x;
+              while [ $(curl -sw '%{http_code}' "{{.Values.agentk8sglue.apiServerUrlReference}}/debug.ping" -o /dev/null) -ne 200 ] ; do
+                echo "waiting for apiserver" ;
+                sleep 5 ;
+              done;
+              while [ $(curl -sw '%{http_code}' "{{.Values.agentk8sglue.fileServerUrlReference}}/" -o /dev/null) -ne 403 ] ; do
+                echo "waiting for fileserver" ;
+                sleep 5 ;
+              done;
+              while [ $(curl -sw '%{http_code}' "{{.Values.agentk8sglue.webServerUrlReference}}/" -o /dev/null) -ne 200 ] ; do
+                echo "waiting for webserver" ;
+                sleep 5 ;
+              done
+      containers:
+        - name: k8s-glue
+          image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
+          imagePullPolicy: Always
+          command: ["/bin/bash", "-c", "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"]
+          volumeMounts:
+            - name: k8sagent-pod-template
+              mountPath: /root/template
+          env:
+            - name: CLEARML_API_HOST
+              value: "{{.Values.agentk8sglue.apiServerUrlReference}}"
+            - name: CLEARML_WEB_HOST
+              value: "{{.Values.agentk8sglue.webServerUrlReference}}"
+            - name: CLEARML_FILES_HOST
+              value: "{{.Values.agentk8sglue.fileServerUrlReference}}"
+            - name: K8S_GLUE_MAX_PODS
+              value: "{{.Values.agentk8sglue.maxPods}}"
+            - name: K8S_GLUE_QUEUE
+              value: "{{.Values.agentk8sglue.queue}}"
+            - name: K8S_GLUE_EXTRA_ARGS
+              value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml"
+            - name: K8S_DEFAULT_NAMESPACE
+              value: "{{ .Release.Namespace }}"
+            - name: CLEARML_API_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: clearml-agent-conf
+                  key: agentk8sglue_key
+            - name: CLEARML_API_SECRET_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: clearml-agent-conf
+                  key: agentk8sglue_secret
+            - name: CLEARML_WORKER_ID
+              value: "{{.Values.agentk8sglue.id}}"
+            - name: CLEARML_AGENT_UPDATE_REPO
+              value: ""
+            - name: FORCE_CLEARML_AGENT_REPO
+              value: ""
+            - name: CLEARML_DOCKER_IMAGE
+              value: "{{.Values.agentk8sglue.defaultContainerImage}}"
+      volumes:
+        - name: k8sagent-pod-template
+          configMap:
+            name: k8sagent-pod-template
diff --git a/charts/clearml-agent/templates/agentk8sglue-rbac.yaml b/charts/clearml-agent/templates/agentk8sglue-rbac.yaml
new file mode 100644
index 0000000..b2b5d44
--- /dev/null
+++ b/charts/clearml-agent/templates/agentk8sglue-rbac.yaml
@@ -0,0 +1,26 @@
+# Lets the namespace's `default` ServiceAccount manage pods in the release
+# namespace. The glue agent Deployment sets no serviceAccountName, so that is
+# the account it runs as.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: k8sagent-pods-access
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - pods
+    verbs: ["get", "list", "watch", "create", "patch", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: k8sagent-pods-access
+subjects:
+  - kind: ServiceAccount
+    name: default
+    namespace: {{ .Release.Namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: k8sagent-pods-access
diff --git a/charts/clearml-agent/templates/clearml-secrets.yaml b/charts/clearml-agent/templates/clearml-secrets.yaml
new file mode 100644
index 0000000..7939625
--- /dev/null
+++ b/charts/clearml-agent/templates/clearml-secrets.yaml
@@ -0,0 +1,21 @@
+# Agent API credentials, read by the glue agent and by the pods it spawns
+# through secretKeyRef entries named clearml-agent-conf.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: clearml-agent-conf
+data:
+  agentk8sglue_key: {{ .Values.clearml.agentk8sglueKey | b64enc | quote }}
+  agentk8sglue_secret: {{ .Values.clearml.agentk8sglueSecret | b64enc | quote }}
+---
+{{- if .Values.imageCredentials.enabled }}
+{{- if not .Values.imageCredentials.existingSecret }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: clearml-agent-registry-key
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: {{ template "imagePullSecret" . }}
+{{- end }}
+{{- end }}
diff --git a/charts/clearml-agent/values.yaml b/charts/clearml-agent/values.yaml
new file mode 100644
index 0000000..d6ec5d1
--- /dev/null
+++ b/charts/clearml-agent/values.yaml
@@ -0,0 +1,81 @@
+# -- Private image registry configuration
+imageCredentials:
+  # -- Use private authentication mode
+  enabled: false
+  # -- If this is set, chart will not generate a secret but will use what is defined here
+  existingSecret: ""
+  # -- Registry name
+  registry: docker.io
+  # -- Registry username
+  username: someone
+  # -- Registry password
+  password: pwd
+  # -- Email
+  email: someone@host.com
+
+# -- ClearML generic configurations
+clearml:
+  # -- Agent k8s Glue basic auth key
+  agentk8sglueKey: "ACCESSKEY"
+  # -- Agent k8s Glue basic auth secret
+  agentk8sglueSecret: "SECRETKEY"
+
+# -- This agent will spawn queued experiments in new pods, a good use case is to combine this with
+# GPU autoscaling nodes.
+# https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue
+agentk8sglue:
+  # -- Glue Agent image configuration
+  image:
+    repository: "allegroai/clearml-agent-k8s"
+    tag: "base-1.21"
+
+  # -- Glue Agent number of pods
+  replicaCount: 1
+
+  # -- Reference to Api server url
+  apiServerUrlReference: "https://api.clear.ml"
+  # -- Reference to File server url
+  fileServerUrlReference: "https://files.clear.ml"
+  # -- Reference to Web server url
+  webServerUrlReference: "https://app.clear.ml"
+
+  # -- serviceAccountName for pods spawned to consume ClearML Task
+  serviceAccountName: default
+  # -- maximum concurrent consume ClearML Task pod
+  maxPods: 10
+  # -- default container image for ClearML Task pod
+  defaultContainerImage: ubuntu:18.04
+  # -- ClearML queue this agent will consume
+  queue: default
+
+  # -- ClearML worker ID (must be unique across the entire ClearML environment)
+  id: k8s-agent
+
+  # -- template for pods spawned to consume ClearML Task
+  podTemplate:
+    # -- volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments)
+    volumes: []
+      # - name: "yourvolume"
+      #   path: "/yourpath"
+    # -- environment variables for pods spawned to consume ClearML Task (example in values.yaml comments)
+    env: []
+      # # to setup access to private repo, setup secret with git credentials:
+      # - name: CLEARML_AGENT_GIT_USER
+      #   value: mygitusername
+      # - name: CLEARML_AGENT_GIT_PASS
+      #   valueFrom:
+      #     secretKeyRef:
+      #       name: git-password
+      #       key: git-password
+    # -- resources declaration for pods spawned to consume ClearML Task (example in values.yaml comments)
+    resources: {}
+      # limits:
+      #   nvidia.com/gpu: 1
+    # -- tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments)
+    tolerations: []
+      # - key: "nvidia.com/gpu"
+      #   operator: Exists
+      #   effect: "NoSchedule"
+    # -- nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments)
+    nodeSelector: {}
+      # fleet: gpu-nodes