diff --git a/clusterloader2/pkg/chaos/monkey.go b/clusterloader2/pkg/chaos/monkey.go index dbcca368e1..39909e234c 100644 --- a/clusterloader2/pkg/chaos/monkey.go +++ b/clusterloader2/pkg/chaos/monkey.go @@ -17,6 +17,9 @@ limitations under the License. package chaos import ( + "fmt" + "strings" + clientset "k8s.io/client-go/kubernetes" "k8s.io/perf-tests/clusterloader2/api" ) @@ -47,3 +50,13 @@ func (m *Monkey) Init(config api.ChaosMonkeyConfig, stopCh <-chan struct{}) erro return nil } + +// Summary logs Monkey execution +func (m *Monkey) Summary() string { + var sb strings.Builder + if m.nodeKiller != nil { + sb.WriteString(fmt.Sprintf("Summary of Chaos Monkey execution\n")) + sb.WriteString(m.nodeKiller.Summary()) + } + return sb.String() +} diff --git a/clusterloader2/pkg/chaos/nodes.go b/clusterloader2/pkg/chaos/nodes.go index 43c839fca3..74ab1ffee7 100644 --- a/clusterloader2/pkg/chaos/nodes.go +++ b/clusterloader2/pkg/chaos/nodes.go @@ -18,20 +18,29 @@ package chaos import ( "fmt" + "math" "math/rand" + "strings" "sync" "time" "k8s.io/perf-tests/clusterloader2/api" + "k8s.io/perf-tests/clusterloader2/pkg/framework/client" "k8s.io/perf-tests/clusterloader2/pkg/util" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog" ) +const ( + monitoringNamespace = "monitoring" + prometheusLabel = "prometheus=k8s" +) + // NodeKiller is a utility to simulate node failures. type NodeKiller struct { config api.NodeFailureConfig @@ -39,6 +48,36 @@ type NodeKiller struct { provider string // killedNodes stores names of the nodes that have been killed by NodeKiller. 
killedNodes sets.String + recorder *eventRecorder +} + +type nodeAction string + +const ( + stopServices nodeAction = "stopService" + rebootNode = "rebootNode" +) + +type event struct { + time time.Time + action nodeAction + nodeName string +} + +type eventRecorder struct { + events []event + mux sync.Mutex +} + +func newEventRecorder() *eventRecorder { + return &eventRecorder{[]event{}, sync.Mutex{}} +} + +func (r *eventRecorder) record(a nodeAction, nodeName string) { + e := event{time.Now(), a, nodeName} + r.mux.Lock() + r.events = append(r.events, e) + r.mux.Unlock() } // NewNodeKiller creates new NodeKiller. @@ -46,7 +85,7 @@ func NewNodeKiller(config api.NodeFailureConfig, client clientset.Interface, pro if provider != "gce" && provider != "gke" { return nil, fmt.Errorf("provider %q is not supported by NodeKiller", provider) } - return &NodeKiller{config, client, provider, sets.NewString()}, nil + return &NodeKiller{config, client, provider, sets.NewString(), newEventRecorder()}, nil } // Run starts NodeKiller until stopCh is closed. @@ -68,18 +107,37 @@ func (k *NodeKiller) pickNodes() ([]v1.Node, error) { if err != nil { return nil, err } + + prometheusPods, err := client.ListPodsWithOptions(k.client, monitoringNamespace, metav1.ListOptions{ + LabelSelector: prometheusLabel, + }) + if err != nil { + return nil, err + } + nodesHasPrometheusPod := sets.NewString() + for i := range prometheusPods { + if prometheusPods[i].Spec.NodeName != "" { + nodesHasPrometheusPod.Insert(prometheusPods[i].Spec.NodeName) + klog.Infof("%s: Node %s removed from killing. 
Runs pod %s", k, prometheusPods[i].Spec.NodeName, prometheusPods[i].Name) + } + } + nodes := allNodes[:0] for _, node := range allNodes { - if !k.killedNodes.Has(node.Name) { + if !nodesHasPrometheusPod.Has(node.Name) && !k.killedNodes.Has(node.Name) { nodes = append(nodes, node) } } rand.Shuffle(len(nodes), func(i, j int) { nodes[i], nodes[j] = nodes[j], nodes[i] }) - numNodes := int(k.config.FailureRate * float64(len(nodes))) + numNodes := int(math.Ceil(k.config.FailureRate * float64(len(nodes)))) + klog.Infof("%s: %d nodes available, wants to fail %d nodes", k, len(nodes), numNodes) if len(nodes) > numNodes { - return nodes[:numNodes], nil + nodes = nodes[:numNodes] + } + for _, node := range nodes { + klog.Infof("%s: Node %q scheduled for failure", k, node.Name) } return nodes, nil } @@ -94,6 +152,7 @@ func (k *NodeKiller) kill(nodes []v1.Node) { defer wg.Done() klog.Infof("%s: Stopping docker and kubelet on %q to simulate failure", k, node.Name) + k.addStopServicesEvent(node.Name) err := util.SSH("sudo systemctl stop docker kubelet", &node, nil) if err != nil { klog.Errorf("%s: ERROR while stopping node %q: %v", k, node.Name, err) @@ -103,7 +162,19 @@ func (k *NodeKiller) kill(nodes []v1.Node) { time.Sleep(time.Duration(k.config.SimulatedDowntime)) klog.Infof("%s: Rebooting %q to repair the node", k, node.Name) - err = util.SSH("sudo reboot", &node, nil) + // Scheduling a reboot in one second, then disconnecting. + // + // Bash command explanation: + // 'nohup' - Making sure that end of SSH connection signal will not break sudo + // 'sudo' - Elevated privileges, required by 'shutdown' + // 'shutdown' - Control machine power + // '-r' - Making 'shutdown' to reboot, instead of power-off + // '+1s' - Parameter to 'shutdown', to wait 1 second before rebooting. 
+ // '> /dev/null 2> /dev/null < /dev/null' - File descriptor redirect, all three I/O to avoid ssh hanging, + // see https://web.archive.org/web/20090429074212/http://www.openssh.com/faq.html#3.10 + // '&' - Execute command in background, end without waiting for result + k.addRebootEvent(node.Name) + err = util.SSH("nohup sudo shutdown -r +1s > /dev/null 2> /dev/null < /dev/null &", &node, nil) if err != nil { klog.Errorf("%s: Error while rebooting node %q: %v", k, node.Name, err) return @@ -113,6 +184,24 @@ func (k *NodeKiller) kill(nodes []v1.Node) { wg.Wait() } +func (k *NodeKiller) addStopServicesEvent(nodeName string) { + k.recorder.record(stopServices, nodeName) +} + +func (k *NodeKiller) addRebootEvent(nodeName string) { + k.recorder.record(rebootNode, nodeName) +} + +// Summary returns a textual summary of NodeKiller execution +func (k *NodeKiller) Summary() string { + var sb strings.Builder + sb.WriteString(fmt.Sprintf("%s: Recorded following events\n", k)) + for _, e := range k.recorder.events { + sb.WriteString(fmt.Sprintf("%s: At %v %v happened for node %s\n", k, e.time.Format(time.UnixDate), e.action, e.nodeName)) + } + return sb.String() +} + func (k *NodeKiller) String() string { return "NodeKiller" } diff --git a/clusterloader2/pkg/framework/client/objects.go b/clusterloader2/pkg/framework/client/objects.go index f4c03a9ac4..6d5d20cbbb 100644 --- a/clusterloader2/pkg/framework/client/objects.go +++ b/clusterloader2/pkg/framework/client/objects.go @@ -131,6 +131,23 @@ func RetryFunction(f func() error, options ...*ApiCallOptions) wait.ConditionFun } } +// ListPodsWithOptions lists the pods using the provided options. 
+func ListPodsWithOptions(c clientset.Interface, namespace string, listOpts metav1.ListOptions) ([]apiv1.Pod, error) { + var pods []apiv1.Pod + listFunc := func() error { + podsList, err := c.CoreV1().Pods(namespace).List(listOpts) + if err != nil { + return err + } + pods = podsList.Items + return nil + } + if err := RetryWithExponentialBackOff(RetryFunction(listFunc)); err != nil { + return pods, err + } + return pods, nil +} + // ListNodes returns list of cluster nodes. func ListNodes(c clientset.Interface) ([]apiv1.Node, error) { return ListNodesWithOptions(c, metav1.ListOptions{}) diff --git a/clusterloader2/pkg/test/simple_test_executor.go b/clusterloader2/pkg/test/simple_test_executor.go index 09c5144502..49daebba27 100644 --- a/clusterloader2/pkg/test/simple_test_executor.go +++ b/clusterloader2/pkg/test/simple_test_executor.go @@ -102,6 +102,7 @@ func (ste *simpleTestExecutor) ExecuteTest(ctx Context, conf *api.Config) *error } } } + klog.Info(ctx.GetChaosMonkey().Summary()) return errList } diff --git a/clusterloader2/testing/load/config.yaml b/clusterloader2/testing/load/config.yaml index d7d075ea83..21d9711f6d 100644 --- a/clusterloader2/testing/load/config.yaml +++ b/clusterloader2/testing/load/config.yaml @@ -64,8 +64,8 @@ tuningSets: chaosMonkey: nodeFailure: failureRate: 0.01 - interval: 1m - jitterFactor: 10.0 + interval: 5m + jitterFactor: 2.0 simulatedDowntime: 10m {{end}} steps: