Skip to content

Commit

Permalink
Merge pull request #21 from converged-computing/add-range-algorithm
Browse files Browse the repository at this point in the history
feat: support for selection and match algorithms
  • Loading branch information
vsoch authored Mar 30, 2024
2 parents 3ccc698 + 4475624 commit 1771f53
Show file tree
Hide file tree
Showing 43 changed files with 2,513 additions and 256 deletions.
7 changes: 5 additions & 2 deletions cmd/rainbow/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@ var (
)

// Run will init a new config
func RunInit(path string) error {
func RunInit(
path string,
clusterName, selectAlgo, matchAlgo string,
) error {

if path == "" {
path = defaultConfigFile
}

// Generate an empty config - providing an empty filename ensures we don't read an existing one
// This defaults to an in-memory vanilla database
cfg, err := config.NewRainbowClientConfig("", "rainbow-cluster", "chocolate-cookies", "", "random")
cfg, err := config.NewRainbowClientConfig("", clusterName, "chocolate-cookies", "", selectAlgo, matchAlgo)
if err != nil {
return err
}
Expand Down
15 changes: 10 additions & 5 deletions cmd/rainbow/rainbow.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ import (
"github.com/converged-computing/rainbow/pkg/types"

// Register database backends and selection algorithms
_ "github.com/converged-computing/rainbow/plugins/algorithms/random"
_ "github.com/converged-computing/rainbow/plugins/algorithms/match"
_ "github.com/converged-computing/rainbow/plugins/algorithms/range"
_ "github.com/converged-computing/rainbow/plugins/backends/memory"
_ "github.com/converged-computing/rainbow/plugins/selection/random"
)

var (
Expand Down Expand Up @@ -49,7 +51,8 @@ func main() {
host := parser.String("", "host", &argparse.Options{Default: "localhost:50051", Help: "Scheduler server address (host:port)"})
clusterName := parser.String("", "cluster-name", &argparse.Options{Help: "Name of cluster to register"})
graphDatabase := parser.String("", "graph-database", &argparse.Options{Help: "Graph database backend to use"})
selectionAlgorithm := parser.String("", "select-algorithm", &argparse.Options{Default: "random", Help: "Selection algorithm for graph database (defaults to random)"})
selectAlgo := parser.String("", "select-algorithm", &argparse.Options{Default: "random", Help: "Selection algorithm for final cluster selection (defaults to random)"})
matchAlgo := parser.String("", "match-algorithm", &argparse.Options{Default: "match", Help: "Match algorithm for graph database (defaults to match)"})

// Receive Jobs
clusterSecret := receiveCmd.String("", "request-secret", &argparse.Options{Help: "Cluster 'secret' to retrieve jobs"})
Expand Down Expand Up @@ -83,7 +86,7 @@ func main() {
}

if configCmd.Happened() && configInitCmd.Happened() {
err := config.RunInit(*cfg)
err := config.RunInit(*cfg, *clusterName, *selectAlgo, *matchAlgo)
if err != nil {
log.Fatalf("Issue with config: %s\n", err)
}
Expand Down Expand Up @@ -111,7 +114,8 @@ func main() {
*cfg,
*graphDatabase,
*subsystem,
*selectionAlgorithm,
*selectAlgo,
*matchAlgo,
)
if err != nil {
log.Fatalf("Issue with register: %s\n", err)
Expand Down Expand Up @@ -143,7 +147,8 @@ func main() {
*clusterName,
*graphDatabase,
*cfg,
*selectionAlgorithm,
*selectAlgo,
*matchAlgo,
)
if err != nil {
log.Fatal(err.Error())
Expand Down
2 changes: 1 addition & 1 deletion cmd/rainbow/receive/receive.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func Run(
}

// Read in the config, if provided, TODO we need a set of tokens here?
cfg, err := config.NewRainbowClientConfig(cfgFile, cluster, secret, "", "")
cfg, err := config.NewRainbowClientConfig(cfgFile, cluster, secret, "", "", "")
if err != nil {
return err
}
Expand Down
10 changes: 10 additions & 0 deletions cmd/rainbow/register/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package register

import (
"context"
"fmt"
"log"
"os"

Expand All @@ -20,20 +21,25 @@ func Run(
graphDatabase,
subsystem,
selectionAlgorithm string,
matchAlgorithm string,
) error {

c, err := client.NewClient(host)
if err != nil {
return err
}

if clusterName == "" {
return fmt.Errorf("s --cluster-name is required")
}
// Read in the config, if provided, command line takes preference
cfg, err := config.NewRainbowClientConfig(
cfgFile,
clusterName,
secret,
graphDatabase,
selectionAlgorithm,
matchAlgorithm,
)
if err != nil {
return err
Expand Down Expand Up @@ -62,6 +68,10 @@ func Run(
if saveSecret && cfgFile != "" {
log.Printf("Saving cluster secret to %s\n", cfgFile)
cfg.Cluster = config.ClusterCredential{Secret: response.Secret, Name: clusterName}

// Assume we want to submit to our cluster too
newCluster := config.ClusterCredential{Token: response.Token, Name: clusterName}
cfg.Clusters = []config.ClusterCredential{newCluster}
yaml, err := cfg.ToYaml()
if err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion cmd/rainbow/register/subsystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func RegisterSubsystem(
return fmt.Errorf("a subsystem name is required to register")
}
// Read in the config, if provided, command line takes preference
cfg, err := config.NewRainbowClientConfig(cfgFile, "", "", "", "")
cfg, err := config.NewRainbowClientConfig(cfgFile, "", "", "", "", "")
if err != nil {
return err
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/rainbow/submit/submit.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func Run(
nodes, tasks int,
token, jobspec, clusterName,
database, cfgFile string,
selectionAlgorithm string,
selectAlgo, matchAlgo string,
) error {

c, err := client.NewClient(host)
Expand Down Expand Up @@ -49,7 +49,7 @@ func Run(
}

// Read in the config, if provided, TODO we need a set of tokens here?
cfg, err := config.NewRainbowClientConfig(cfgFile, "", "", database, selectionAlgorithm)
cfg, err := config.NewRainbowClientConfig(cfgFile, "", "", database, selectAlgo, matchAlgo)
if err != nil {
return err
}
Expand Down
12 changes: 8 additions & 4 deletions cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,19 @@ import (
"github.com/converged-computing/rainbow/pkg/types"

// Register database backends
_ "github.com/converged-computing/rainbow/plugins/algorithms/random"
_ "github.com/converged-computing/rainbow/plugins/algorithms/match"
_ "github.com/converged-computing/rainbow/plugins/algorithms/range"
_ "github.com/converged-computing/rainbow/plugins/backends/memory"
_ "github.com/converged-computing/rainbow/plugins/selection/random"
)

var (
host string
name = "rainbow"
sqliteFile = "rainbow.db"
configFile = ""
algorithm = "random"
matchAlgo = "match"
selectAlgo = "random"
database = ""
cleanup = false
secret = "chocolate-cookies"
Expand All @@ -33,13 +36,14 @@ func main() {
flag.StringVar(&globalToken, "global-token", name, "global token for cluster access (not recommended)")
flag.StringVar(&secret, "secret", secret, "secret to validate registration (default: chocolate-cookies)")
flag.StringVar(&database, "graph-database", database, "graph database backend (defaults to memory)")
flag.StringVar(&algorithm, "select-algorithm", algorithm, "selection algorithm for graph (defaults to random)")
flag.StringVar(&selectAlgo, "select-algorithm", selectAlgo, "selection algorithm for final cluster selection (defaults to random)")
flag.StringVar(&matchAlgo, "match-algorithm", matchAlgo, "match algorithm for graph (defaults to random)")
flag.StringVar(&configFile, "config", configFile, "rainbow config file")
flag.BoolVar(&cleanup, "cleanup", cleanup, "cleanup previous sqlite database (default: false)")
flag.Parse()

// Load (or generate a default) config file here, if provided
cfg, err := config.NewRainbowClientConfig(configFile, name, secret, database, algorithm)
cfg, err := config.NewRainbowClientConfig(configFile, name, secret, database, selectAlgo, matchAlgo)
if err != nil {
log.Fatalf("error while creating server: %v", err)
}
Expand Down
101 changes: 90 additions & 11 deletions docs/algorithms.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
# Algorithms

This is a brief summary of notes about current algorithms.
This is a brief summary of notes about current interfaces that support algorithms. While algorithms are an important part of rainbow, they are implemented via interfaces. There are currently three kinds of interfaces:

## Memory Graph
- [Backends](#graph-backends) are graph database backends. These backends use match algorithms directly
- [Match Algorithms](#match-algorithms) are used by the graph databases to determine how to match a subsystem to a slot. Each graph backend can have a default and (likely) support a subset
- [Selection](#selection-algorithms) are the last set that are given a set of cluster matches and allowed to decide on a final assignment, usually from stateful data.

These sections will go through the different interfaces and algorithms afforded by each.

## Graph Backends

We currently only support a custom memory graph backend. It would be good to get fluxion in here soon, when it's ready.

### Memory Graph

The "memory" graph backend is an in-memory graph database that is a custom implementation (by @vsoch). Although it is primarily intended for learning, it serves as a good base for development and prototyping too, and warrants a discussion of algorithms involved. For design, see the [design](design.md) document. This will detail basics about the search.

### Depth First Search
#### Depth First Search

While Fluxion uses depth first search and up (to support an adjacency list), since we are just using this graph for prototyping, we instead use recursion, which means we can traverse (depth) and not need to find our way back up, because we can return from a recursive call.

#### 1. Quick Check
##### 1. Quick Check

We start with a heuristic that says "if I know the totals that are needed for this Jobspec are not available across the cluster, bail out before doing any search." That works as follows.

Expand All @@ -21,13 +31,13 @@ We start with a hieuristic that says "if I know the totals that are needed for t

At the end, we have a summary of the total resources requested by the jobspec, and do a quick check to see if any clusters have less than that amount (the totals we already have cached from registration) OR if the clusters are missing a resource entirely. Note that this is only for the dominant subsystem. If a cluster passes these checks, it proceeds into depth first search.

#### 2. Depth First Search
##### 2. Depth First Search

Depth first search is going to do checks from the perspective of a slot, because this (as I understand it) is the level where we are "pinning" the request. Thus, we start our search by creating a lookup of slots, which we do from the "tasks" section of the jobspec. We do this because as we are traversing we are going to be randomly hitting slots defined by the user, and we need to be able to look up details about it.

Note that this search is still rooted in the dominant subsystem, and for other subsystem resources (e.g., IO) these are going to linked off of vertices here. For each cluster in our matches, we then start at the root, which is generally just a node named by the cluster. We get that vertex, because since this memory database has an object oriented design, all children vertices are going to be edges off of that.

##### findSlots
###### findSlots

We then define a recursive function `findSlots` that is going to recurse into a slot resource and recurse into child resources under that to count what it finds. For example, if the Jobspec is saying that it wants some number of cores per slot, the `findSlots` function will start at a vertex where the slot is, and then figure out if we have that number. It returns a number that represents that count. Specifically, the function works as follows:

Expand All @@ -39,7 +49,7 @@ We then define a recursive function `findSlots` that is going to recurse into a

The function `findSlots` will (should) return with the number of matches for a specific resource type below a vertex in the graph, allowing us to determine if a subtree can match a request that is specific to a slot.

##### satisfies
###### satisfies

Satisfies is a recursive function that determines if a vertex can satisfy a resource need.
Given a resource and a vertex root, it returns the count of vertices under the root that satisfy the request. This function uses `findSlots` because as it is traversing, when it finds a `resource.Type`
Expand All @@ -51,7 +61,7 @@ of type "slot" it will call that function. Akin to `findSlots`, it works as foll

The result of satisfies is returning the count for some resource that is satisfied starting at some root, accounting for slots too.

##### traverseResource
###### traverseResource

The traverse resource is the main (also recursive function) to handle traversing the graph. It starts at the top level resource from the Jobspec, and instead of returning a count, returns a boolean to indicate if the match is a yes or no. It has two cases:

Expand All @@ -75,7 +85,7 @@ if isMatch is true here, add the cluster to matches

At this point, the basic list of clusters is returned to the calling function (the interface in rainbow) and passed on to a selection algorithm, which can take some logic about the clusters (likely state) and make a final decision. We currently just randomly select from the set (random is the only selection algorithm available, mainly for development).

## Jobspec Resources
#### Jobspec Resources

While we need to have more [discussion](https://github.com/flux-framework/flux-sched/discussions/1153#discussioncomment-8726678) on what constitutes a request for subsystem resources, I am taking a simple approach that will satisfy an initial need to run experiments with compatibility metadata (relevant to subsystems) that use a scheduler. The approach I am taking is the following. You can read about the [design](design.md) and I'll repeat the high level points here. When we register a subsystem, it is a separate graph that (at the highest level) is still organized by cluster name. However, each node in the graph needs to be attached to another node known to itself, or to a vertex in the dominant subsystem graph. When asking for a subsystem resource, we are asking for a check at a specific vertex (defined by the slot) that is relevant for a specific subsystem and resource type. We do this by way of defining "resources" under a task, as shown below:

Expand All @@ -91,8 +101,8 @@ resources:
with:
- count: 2
type: core
tasks:
- command:
task:
command:
- ior
slot: default
count:
Expand All @@ -107,4 +117,73 @@ In the above, we are saying that when we find a slot, we need to see if the vert
I understand this is likely not perfect for what everyone wants, but I believe it to be a reasonable first shot, and within the ability of what I can prototype without having fluxion ready yet.
## Match Algorithms
### Match
The explicit "match" type is going to look exactly at the type of a subsystem node, and return true (match) if it matches what the subsystem needs. For example, given this task:
```yaml
task:
command:
- ior
slot: default
count:
per_slot: 1
resources:
io:
match:
- type: shm
```
We would look for a node of type "shm" in the io subsystem that is directly attached (an edge) to a node in the dominant subsystem graph.
### Range
Range is designed primarily to handle package versions. You *must* specify a field that is to be inspected on the subsystem metadata, and you must specify at least one of "min" or "max" (or both). For example:
```yaml
task:
command:
- spack
slot: default
count:
per_slot: 1
resources:
spack:
range:
- field: version
min: "0.5.1"
max: "0.5.5"
```
The above would expect to look for the field `version` defined for a slot, and use semantic version comparison to determine whether the value falls within that range. Here is what the subsystem node might look like. In this case, the node is saying "the dominant subsystem node that I'm connected to has this package with this metadata":

```json
"spack1": {
"label": "spack1",
"metadata": {
"basename": "package",
"exclusive": true,
"id": 1,
"name": "package0",
"paths": {
"containment": "/spack0/package0"
},
"size": 1,
"type": "package",
"uniq_id": 1,
"version": "0.5.2"
}
},
```

In the above, the field is "version" and it is an arbitrary metadata field in the "metadata" section of a node. For the time being, the match algorithm is the determination of types allowed there. For example, the range algorithm interface is expecting to parse a string in a semantic version format. Different plugins might expect differently.

## Selection Algorithms

### Random

This algorithm speaks for itself. Given a listing of contender clusters (where all clusters have a match) we randomly choose.

[home](/README.md#rainbow-scheduler)
Loading

0 comments on commit 1771f53

Please sign in to comment.