Skip to content

Commit

Permalink
Cherry-pick NodeKiller changes
Browse files Browse the repository at this point in the history
- [NodeKiller] Schedule reboot and disconnect from node
- [NodeKiller] Change rounding of a number of selected nodes
- [NodeKiller] Add summary
- Node-killer avoids killing nodes where prometheus is running
  (change by mozhuli)
  • Loading branch information
jprzychodzen committed Mar 9, 2020
1 parent 4baf007 commit 03ef2b1
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 7 deletions.
13 changes: 13 additions & 0 deletions clusterloader2/pkg/chaos/monkey.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ limitations under the License.
package chaos

import (
"fmt"
"strings"

clientset "k8s.io/client-go/kubernetes"
"k8s.io/perf-tests/clusterloader2/api"
)
Expand Down Expand Up @@ -47,3 +50,13 @@ func (m *Monkey) Init(config api.ChaosMonkeyConfig, stopCh <-chan struct{}) erro

return nil
}

// Summary returns a human-readable report of the chaos Monkey's execution.
// Currently it only covers NodeKiller activity; when no NodeKiller is
// configured it returns the empty string.
func (m *Monkey) Summary() string {
	var sb strings.Builder
	if m.nodeKiller != nil {
		// Fprintln avoids fmt.Sprintf with no format arguments (staticcheck S1039).
		fmt.Fprintln(&sb, "Summary of Chaos Monkey execution")
		sb.WriteString(m.nodeKiller.Summary())
	}
	return sb.String()
}
99 changes: 94 additions & 5 deletions clusterloader2/pkg/chaos/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,35 +18,74 @@ package chaos

import (
"fmt"
"math"
"math/rand"
"strings"
"sync"
"time"

"k8s.io/perf-tests/clusterloader2/api"
"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
"k8s.io/perf-tests/clusterloader2/pkg/util"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog"
)

const (
	// monitoringNamespace is the namespace Prometheus pods are looked up in;
	// nodes running such pods are excluded from killing.
	monitoringNamespace = "monitoring"
	// prometheusLabel is the label selector used to find Prometheus pods.
	prometheusLabel = "prometheus=k8s"
)

// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
	// config controls failure rate, interval and simulated downtime.
	config api.NodeFailureConfig
	// client is used to list nodes and pods in the target cluster.
	client clientset.Interface
	// provider is the cloud provider name (only "gce"/"gke" are supported).
	provider string
	// killedNodes stores names of the nodes that have been killed by NodeKiller.
	killedNodes sets.String
	// recorder accumulates events (service stops, reboots) for Summary.
	recorder *eventRecorder
}

// nodeAction identifies the kind of failure NodeKiller performed on a node.
type nodeAction string

const (
	// stopServices records that docker and kubelet were stopped on a node.
	// NOTE(review): the stored value is the historical singular "stopService";
	// kept as-is because it is emitted in summaries.
	stopServices nodeAction = "stopService"
	// rebootNode records that a reboot was scheduled on a node.
	// Explicit nodeAction type added: with an initializer present the constant
	// would otherwise be an untyped string, not a nodeAction.
	rebootNode nodeAction = "rebootNode"
)

// event is a single recorded NodeKiller action: what was done, to which
// node, and when.
type event struct {
	// time is when the action was initiated (set via time.Now in record).
	time time.Time
	// action is the kind of failure injected (stopServices or rebootNode).
	action nodeAction
	// nodeName is the name of the affected node.
	nodeName string
}

// eventRecorder is an append-only, concurrency-safe log of NodeKiller events.
type eventRecorder struct {
	events []event
	// mux guards events; record may be called from multiple goroutines.
	mux sync.Mutex
}

// newEventRecorder returns an empty, ready-to-use event recorder.
func newEventRecorder() *eventRecorder {
	// The zero-value mutex is ready to use; only the slice needs initializing.
	return &eventRecorder{events: []event{}}
}

// record appends a timestamped event for nodeName under the recorder's lock.
func (r *eventRecorder) record(a nodeAction, nodeName string) {
	// Capture the timestamp before taking the lock, as the original did.
	e := event{time: time.Now(), action: a, nodeName: nodeName}
	r.mux.Lock()
	defer r.mux.Unlock()
	r.events = append(r.events, e)
}

// NewNodeKiller creates new NodeKiller.
// Only the "gce" and "gke" providers are supported; any other provider
// yields an error. The returned killer starts with no killed nodes and an
// empty event recorder.
// NOTE(review): the diff left both the old and the new return statement in
// place; only the updated one (with the event recorder) is kept.
func NewNodeKiller(config api.NodeFailureConfig, client clientset.Interface, provider string) (*NodeKiller, error) {
	if provider != "gce" && provider != "gke" {
		return nil, fmt.Errorf("provider %q is not supported by NodeKiller", provider)
	}
	return &NodeKiller{config, client, provider, sets.NewString(), newEventRecorder()}, nil
}

// Run starts NodeKiller until stopCh is closed.
Expand All @@ -68,18 +107,37 @@ func (k *NodeKiller) pickNodes() ([]v1.Node, error) {
if err != nil {
return nil, err
}

prometheusPods, err := client.ListPodsWithOptions(k.client, monitoringNamespace, metav1.ListOptions{
LabelSelector: prometheusLabel,
})
if err != nil {
return nil, err
}
nodesHasPrometheusPod := sets.NewString()
for i := range prometheusPods {
if prometheusPods[i].Spec.NodeName != "" {
nodesHasPrometheusPod.Insert(prometheusPods[i].Spec.NodeName)
klog.Infof("%s: Node %s removed from killing. Runs pod %s", k, prometheusPods[i].Spec.NodeName, prometheusPods[i].Name)
}
}

nodes := allNodes[:0]
for _, node := range allNodes {
if !k.killedNodes.Has(node.Name) {
if !nodesHasPrometheusPod.Has(node.Name) && !k.killedNodes.Has(node.Name) {
nodes = append(nodes, node)
}
}
rand.Shuffle(len(nodes), func(i, j int) {
nodes[i], nodes[j] = nodes[j], nodes[i]
})
numNodes := int(k.config.FailureRate * float64(len(nodes)))
numNodes := int(math.Ceil(k.config.FailureRate * float64(len(nodes))))
klog.Infof("%s: %d nodes available, wants to fail %d nodes", k, len(nodes), numNodes)
if len(nodes) > numNodes {
return nodes[:numNodes], nil
nodes = nodes[:numNodes]
}
for _, node := range nodes {
klog.Infof("%s: Node %q schedule for failure", k, node.Name)
}
return nodes, nil
}
Expand All @@ -94,6 +152,7 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
defer wg.Done()

klog.Infof("%s: Stopping docker and kubelet on %q to simulate failure", k, node.Name)
k.addStopServicesEvent(node.Name)
err := util.SSH("sudo systemctl stop docker kubelet", &node, nil)
if err != nil {
klog.Errorf("%s: ERROR while stopping node %q: %v", k, node.Name, err)
Expand All @@ -103,7 +162,19 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
time.Sleep(time.Duration(k.config.SimulatedDowntime))

klog.Infof("%s: Rebooting %q to repair the node", k, node.Name)
err = util.SSH("sudo reboot", &node, nil)
// Scheduling a reboot in one second, then disconnecting.
//
// Bash command explanation:
// 'nohup' - Making sure that end of SSH connection signal will not break sudo
// 'sudo' - Elevated privileges, required by 'shutdown'
// 'shutdown' - Control machine power
// '-r' - Making 'shutdown' to reboot, instead of power-off
// '+1s' - Parameter to 'reboot', to wait 1 second before rebooting.
// '> /dev/null 2> /dev/null < /dev/null' - File descriptor redirect, all three I/O to avoid ssh hanging,
// see https://web.archive.org/web/20090429074212/http://www.openssh.com/faq.html#3.10
// '&' - Execute command in background, end without waiting for result
k.addRebootEvent(node.Name)
err = util.SSH("nohup sudo shutdown -r +1s > /dev/null 2> /dev/null < /dev/null &", &node, nil)
if err != nil {
klog.Errorf("%s: Error while rebooting node %q: %v", k, node.Name, err)
return
Expand All @@ -113,6 +184,24 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
wg.Wait()
}

// addStopServicesEvent records that docker and kubelet were stopped on the
// named node.
func (k *NodeKiller) addStopServicesEvent(nodeName string) {
	k.recorder.record(stopServices, nodeName)
}

// addRebootEvent records that a reboot was scheduled on the named node.
func (k *NodeKiller) addRebootEvent(nodeName string) {
	k.recorder.record(rebootNode, nodeName)
}

// Summary returns a human-readable report of every node failure event
// recorded during execution, one line per event.
// NOTE(review): recorder.events is read without holding recorder.mux; this is
// safe only if Summary is called after all killing goroutines have stopped —
// confirm against the caller.
func (k *NodeKiller) Summary() string {
	var sb strings.Builder
	sb.WriteString(fmt.Sprintf("%s: Recorded following events\n", k))
	for _, e := range k.recorder.events {
		// Fixed typo in emitted message: "happend" -> "happened".
		sb.WriteString(fmt.Sprintf("%s: At %v %v happened for node %s\n", k, e.time.Format(time.UnixDate), e.action, e.nodeName))
	}
	return sb.String()
}

// String returns the tag used to prefix NodeKiller log lines; it makes
// NodeKiller satisfy fmt.Stringer.
func (k *NodeKiller) String() string {
	const name = "NodeKiller"
	return name
}
17 changes: 17 additions & 0 deletions clusterloader2/pkg/framework/client/objects.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,23 @@ func RetryFunction(f func() error, options ...*ApiCallOptions) wait.ConditionFun
}
}

// ListPodsWithOptions lists the pods in the given namespace using the
// provided list options, retrying transient failures with exponential
// backoff.
func ListPodsWithOptions(c clientset.Interface, namespace string, listOpts metav1.ListOptions) ([]apiv1.Pod, error) {
	var pods []apiv1.Pod
	err := RetryWithExponentialBackOff(RetryFunction(func() error {
		list, listErr := c.CoreV1().Pods(namespace).List(listOpts)
		if listErr != nil {
			return listErr
		}
		pods = list.Items
		return nil
	}))
	// On failure pods is whatever was captured (possibly nil), matching the
	// original's (pods, err) return.
	return pods, err
}

// ListNodes returns list of cluster nodes.
func ListNodes(c clientset.Interface) ([]apiv1.Node, error) {
return ListNodesWithOptions(c, metav1.ListOptions{})
Expand Down
1 change: 1 addition & 0 deletions clusterloader2/pkg/test/simple_test_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func (ste *simpleTestExecutor) ExecuteTest(ctx Context, conf *api.Config) *error
}
}
}
klog.Infof(ctx.GetChaosMonkey().Summary())
return errList
}

Expand Down
4 changes: 2 additions & 2 deletions clusterloader2/testing/load/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ tuningSets:
chaosMonkey:
nodeFailure:
failureRate: 0.01
interval: 1m
jitterFactor: 10.0
interval: 5m
jitterFactor: 2.0
simulatedDowntime: 10m
{{end}}
steps:
Expand Down

0 comments on commit 03ef2b1

Please sign in to comment.