Skip to content

Commit

Permalink
Merge pull request #9172 from rwsu/AGENT-967-improve-monitor-log-output
Browse files Browse the repository at this point in the history
AGENT-967: Improve monitoring output for multi-node
  • Loading branch information
openshift-merge-bot[bot] authored Nov 9, 2024
2 parents c3e9b51 + ff3b202 commit 640266e
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 80 deletions.
2 changes: 1 addition & 1 deletion cmd/node-joiner/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func nodeJoiner() error {
nodesAddCmd.Flags().BoolP("pxe", "p", false, "Instead of an ISO, generates PXE artifacts that can be used to boot the configured nodes to let them join an existing cluster")

nodesMonitorCmd := &cobra.Command{
Use: "monitor-add-nodes",
Use: "monitor-add-nodes <ip-addresses>",
Short: "Monitors the configured nodes while they are joining an existing cluster",
RunE: func(cmd *cobra.Command, args []string) error {
dir, kubeConfig, err := getCommonFlags(cmd)
Expand Down
27 changes: 11 additions & 16 deletions pkg/agent/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {

// Agent Rest API is available
if agentRestAPILive {
exitOnErr, err := czero.MonitorStatusFromAssistedService()
exitOnErr, err := czero.MonitorStatusFromAssistedService(nil)
if err != nil {
return false, exitOnErr, err
}
Expand All @@ -214,17 +214,16 @@ func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {
// After cluster or host installation has started, new events from
// the Assisted Service API are also logged and updated to the cluster's
// install history.
func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) {
func (czero *Cluster) MonitorStatusFromAssistedService(ch chan logEntry) (bool, error) {
logger := logrus.StandardLogger()
resource := "cluster"
logPrefix := ""
if czero.workflow == workflow.AgentWorkflowTypeAddNodes {
resource = "host"
logPrefix = fmt.Sprintf("Node %s: ", czero.API.Rest.NodeZeroIP)
}

// First time we see the agent Rest API
if !czero.installHistory.RestAPISeen {
logrus.Debugf("%sAgent Rest API Initialized", logPrefix)
log(Debug, "Agent Rest API Initialized", logger, ch)
czero.installHistory.RestAPISeen = true
czero.installHistory.NotReadyTime = time.Now()
}
Expand Down Expand Up @@ -256,18 +255,14 @@ func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) {
return false, errors.New("cluster metadata returned nil from Agent Rest API")
}

czero.PrintInstallStatus(clusterMetadata)
czero.PrintInstallStatus(clusterMetadata, ch)

// If status indicates pending action, log host info to help pinpoint what is missing
if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) &&
(*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) {
for _, host := range clusterMetadata.Hosts {
if *host.Status == models.ClusterStatusInstallingPendingUserAction {
if logPrefix != "" {
logrus.Warningf("%s%s %s", logPrefix, host.RequestedHostname, *host.StatusInfo)
} else {
logrus.Warningf("Host %s %s", host.RequestedHostname, *host.StatusInfo)
}
log(Warning, fmt.Sprintf("Host %s %s", host.RequestedHostname, *host.StatusInfo), logger, ch)
}
}
}
Expand All @@ -293,7 +288,7 @@ func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) {
}
}

validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger(), logPrefix)
validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logger, ch)
if validationsErr != nil {
return false, errors.Wrap(validationsErr, "host validations failed")
}
Expand All @@ -310,9 +305,9 @@ func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) {
// Don't print the same status message back to back
if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage {
if *mostRecentEvent.Severity == models.EventSeverityInfo {
logrus.Infof("%s%s", logPrefix, *mostRecentEvent.Message)
log(Info, *mostRecentEvent.Message, logger, ch)
} else {
logrus.Warnf("%s%s", logPrefix, *mostRecentEvent.Message)
log(Warning, *mostRecentEvent.Message, logger, ch)
}
}
czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message
Expand Down Expand Up @@ -473,11 +468,11 @@ func (czero *Cluster) PrintInstallationComplete() error {
}

// PrintInstallStatus prints a human-friendly message using the models from the Agent Rest API.
func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster) {
func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster, ch chan logEntry) {
friendlyStatus := czero.humanFriendlyClusterInstallStatus(*cluster.Status)
// Don't print the same status message back to back
if *cluster.Status != czero.installHistory.RestAPIPreviousClusterStatus {
logrus.Info(friendlyStatus)
log(Info, friendlyStatus, logrus.StandardLogger(), ch)
}
}

Expand Down
84 changes: 84 additions & 0 deletions pkg/agent/logging.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package agent

import (
"fmt"
"sync"
"time"

"github.com/sirupsen/logrus"
)

// Log level names carried in a logEntry and accepted by log,
// listed from least to most severe.
const (
	Trace   = "Trace"
	Debug   = "Debug"
	Info    = "Info"
	Warning = "Warning"
	Error   = "Error"
)

// logInterval is the pause, in seconds, between successive passes over the
// per-node log channels in printLogs. It is kept as an untyped constant
// because printLogs multiplies it by time.Second.
const logInterval = 5

// logEntry is a single log message queued on a channel: level holds the
// level name (for example "Info" or "Warning") and message holds the text
// to be logged.
type logEntry struct {
	level, message string
}

// log records message at the given level.
//
// When ch is non-nil the message is queued on ch as a logEntry and logger is
// not used; the send blocks until ch can accept the entry. When ch is nil the
// message is written directly to logger.
//
// level is expected to be one of the level constants (Trace, Debug, Info,
// Warning, Error). An unrecognized level is written at Info so that the
// message is never silently dropped.
func log(level, message string, logger *logrus.Logger, ch chan logEntry) {
	if ch != nil {
		ch <- logEntry{level: level, message: message}
		return
	}

	switch level {
	case Trace:
		logger.Trace(message)
	case Debug:
		logger.Debug(message)
	case Warning:
		logger.Warn(message)
	case Error:
		logger.Error(message)
	default:
		// Info, plus any level name this function does not know about.
		logger.Info(message)
	}
}

// printChannelLogs prints the entries currently buffered in ch to the standard
// logrus logger, prefixing each message with "Node <ip>: ".
//
// The loop is driven by len(ch), so it returns as soon as the buffer is empty
// instead of waiting for new entries. Because len is always 0 for an
// unbuffered channel, ch must be buffered for anything to be printed.
//
// Each entry is emitted at the level it was queued with; an unrecognized
// level name is emitted at Info.
func printChannelLogs(ip string, ch chan logEntry) {
	for len(ch) > 0 {
		entry := <-ch
		message := fmt.Sprintf("Node %s: %s", ip, entry.message)
		switch entry.level {
		case Trace:
			logrus.Trace(message)
		case Debug:
			logrus.Debug(message)
		case Warning:
			logrus.Warn(message)
		case Error:
			logrus.Error(message)
		default:
			// Info, plus any level name not handled above.
			logrus.Info(message)
		}
	}
}

// printLogs periodically prints the queued log entries of every node in
// ipChanMap (node IP -> log channel) until every channel has been closed.
//
// On each pass it drains each channel with printChannelLogs and then probes
// the channel without blocking: a closed channel is removed from ipChanMap,
// and an entry that arrived after the drain is printed rather than discarded.
// Between passes it sleeps for logInterval seconds. ipChanMap is modified in
// place, and wg.Done is called when the function returns.
func printLogs(wg *sync.WaitGroup, ipChanMap map[string]chan logEntry) {
	defer wg.Done()
	for len(ipChanMap) > 0 {
		for ip, ch := range ipChanMap {
			// Print everything that is already buffered for this node.
			printChannelLogs(ip, ch)

			// A receive is the only way to observe that a channel is closed.
			// Use select/default so an open, empty channel neither blocks
			// the other nodes' output nor loses the next entry sent on it.
			select {
			case entry, ok := <-ch:
				if !ok {
					// Closed and fully drained: stop monitoring this node.
					// Deleting the current key during range is safe in Go.
					delete(ipChanMap, ip)
					continue
				}
				// Entry arrived between the drain above and this probe.
				emitNodeLog(ip, entry)
			default:
				// Channel is open and currently empty; check again next pass.
			}
		}
		if len(ipChanMap) == 0 {
			// Nothing left to monitor; skip the final sleep.
			return
		}
		time.Sleep(logInterval * time.Second)
	}
}

// emitNodeLog writes one entry to the standard logrus logger with the
// "Node <ip>: " prefix, at the level the entry was queued with (Info for an
// unrecognized level name).
func emitNodeLog(ip string, entry logEntry) {
	message := fmt.Sprintf("Node %s: %s", ip, entry.message)
	switch entry.level {
	case Trace:
		logrus.Trace(message)
	case Debug:
		logrus.Debug(message)
	case Warning:
		logrus.Warn(message)
	case Error:
		logrus.Error(message)
	default:
		logrus.Info(message)
	}
}
Loading

0 comments on commit 640266e

Please sign in to comment.