Skip to content

Commit

Permalink
add check task
Browse files Browse the repository at this point in the history
  • Loading branch information
wayyoungboy committed Jan 22, 2025
1 parent be05ba5 commit 281ed4b
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 0 deletions.
45 changes: 45 additions & 0 deletions plugins/check/tasks/observer/cluster/ls_nu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Reports tenants owning a log stream whose number of rows in
# __all_virtual_log_stat is lower than the paxos_replica_num reported by that
# log stream's leader row.
info: 'Check whether any log stream has fewer replicas than its paxos_replica_num.'
task:
  - version: "[4.0.0.0,*]"
    steps:
    - type: sql
      # LeaderInfo: paxos_replica_num taken from each log stream's LEADER row.
      # RowCounts:  number of rows present per (tenant_id, ls_id).
      # The final SELECT returns the tenant ids where row_count < paxos_replica_num.
      sql: "
        WITH LeaderInfo AS (
        SELECT
        tenant_id,
        ls_id,
        paxos_replica_num
        FROM
        oceanbase.__all_virtual_log_stat
        WHERE
        role = 'LEADER'
        ),
        RowCounts AS (
        SELECT
        tenant_id,
        ls_id,
        COUNT(*) as row_count
        FROM
        oceanbase.__all_virtual_log_stat
        GROUP BY
        tenant_id,
        ls_id
        )
        SELECT
        GROUP_CONCAT(DISTINCT L.tenant_id)
        FROM
        LeaderInfo L
        JOIN
        RowCounts R
        ON
        L.tenant_id = R.tenant_id AND L.ls_id = R.ls_id
        WHERE
        R.row_count < L.paxos_replica_num;"
      result:
        set_value: not_enough_replica
        # The check passes only when the query produced no tenant id.
        verify: '[ -z "${not_enough_replica}" ]'
        err_msg: 'There are log streams without enough replicas, tenant_id: #{not_enough_replica}. Please check as soon as possible.'




12 changes: 12 additions & 0 deletions plugins/check/tasks/observer/cluster/no_leader.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Reports tenants that own at least one log stream with no LEADER row in
# GV$OB_LOG_STAT.
info: 'Check cluster tenant ls leader.'
task:
  - version: "[4.0.0.0,*]"
    steps:
    - type: sql
      # HAVING without GROUP BY treats the whole view as a single group, so the
      # check would only fire when the entire cluster has no leader at all.
      # Group per (TENANT_ID, LS_ID) first, then collect the affected tenants.
      sql: "SELECT GROUP_CONCAT(DISTINCT TENANT_ID)
        FROM (
        SELECT TENANT_ID, LS_ID
        FROM oceanbase.GV$OB_LOG_STAT
        GROUP BY TENANT_ID, LS_ID
        HAVING COUNT(CASE WHEN ROLE = 'LEADER' THEN 1 END) = 0
        ) NO_LEADER_LS;"
      result:
        set_value: no_leader_tenant_id
        # The check passes only when the query produced no tenant id.
        verify: '[ -z "${no_leader_tenant_id}" ]'
        err_msg: 'there is no leader tenant, please check the cluster. tenant_id: #{no_leader_tenant_id}. You can use "obdiag rca run --scene=log_error" to get more information.'
14 changes: 14 additions & 0 deletions plugins/check/tasks/observer/cluster/zone_not_active.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Reports every zone whose STATUS in dba_ob_zones is not ACTIVE.
info: 'Check whether there is any zone not in the ACTIVE state.'
task:
  - version: "[4.0.0.0,*]"
    steps:
    - type: sql
      # Use a single-quoted SQL string literal: a double-quoted "ACTIVE" is
      # parsed as an identifier when the ANSI_QUOTES SQL mode is enabled.
      sql: "select GROUP_CONCAT(DISTINCT ZONE) from oceanbase.dba_ob_zones where STATUS<>'ACTIVE';"
      result:
        set_value: not_ACTIVE
        # The check passes only when the query produced no zone name.
        verify: '[ -z "${not_ACTIVE}" ]'
        err_msg: 'There is #{not_ACTIVE} not_ACTIVE zone, please check as soon as possible.'




38 changes: 38 additions & 0 deletions plugins/check/tasks/observer/network/network_offset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Measures the clock offset to a node with clockdiff, run through the
# local_ssh step type, and fails when the offset exceeds 50 ms.
info: 'Check cluster info about network clockdiff offset'
task:
  - version: "[4.0.0.0,*]"
    steps:
    # Step 1: run clockdiff and fail when its output reports the host as down.
    - type: local_ssh
      ssh: 'clockdiff -o #{remote_ip}'
      result:
        set_value: clockdiff
        # Braced and quoted variable form, consistent with the other check
        # tasks' verify expressions.
        verify: '[[ ! "${clockdiff}" == *"is down"* ]]'
        #report_type: warning
        err_msg: "node: #{remote_ip} can not get clock offset by 'clockdiff -o #{remote_ip}', doc: https://www.oceanbase.com/knowledge-base/ocp-ee-1000000000346970?back=kb"
    # Step 2: take the second field of the clockdiff output as the offset.
    # The absolute value is emitted so that a large negative offset cannot
    # slip under the "max 50" verification. Nothing is printed when the
    # output has fewer than two fields.
    # NOTE(review): assumes field 2 is the signed delta in ms - confirm
    # against the clockdiff version deployed.
    - type: local_ssh
      ssh: "echo \"#{clockdiff}\" | awk 'NF >= 2 {print ($2 < 0 ? -$2 : $2)}'"
      result:
        set_value: offset
        verify_type: max
        verify: 50
        err_msg: "node: #{remote_ip} clock offset is #{offset}, it is over 50ms, issue: https://github.com/oceanbase/obdiag/issues/701"




















64 changes: 64 additions & 0 deletions src/handler/checker/step/local_ssh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*
# Copyright (c) 2022 OceanBase
# OceanBase Diagnostic Tool is licensed under Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
# http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
"""
@time: 2025/01/21
@file: local_ssh.py
@desc:
"""
from src.common.ssh_client.local_client import LocalClient
from src.handler.checker.check_exception import StepExecuteFailException
from src.handler.checker.check_report import TaskReport
from src.common.tool import StringUtils
from src.common.tool import Util


class StepLocalHandler:
    """Check step that runs a command through a ``LocalClient``.

    The command is read from ``step["ssh"]``, expanded against the task
    variable dict with ``StringUtils.build_str_on_expr_by_dict`` and executed
    with ``LocalClient.exec_cmd``. When ``step["result"]["set_value"]`` is
    present, the converted output is stored in the task variable dict under
    that name.
    """

    def __init__(self, context, step, node, task_variable_dict):
        """Create the handler and its local client.

        Args:
            context: handler context; ``context.stdio`` is used for logging.
            step: mapping describing the step (``ssh`` and optional ``result``).
            node: kept on the instance and used in error messages only; the
                client is always built with ``{"ssh_type": "local"}``.
            task_variable_dict: variables shared between the steps of a task;
                read for command expansion and updated with the step result.

        Raises:
            Exception: if the ``LocalClient`` cannot be created.
        """
        self.context = context
        self.stdio = context.stdio
        self.ssh_report_value = None
        self.parameters = None
        self.step = step
        self.node = node
        try:
            self.ssh_client = LocalClient(self.context, node={"ssh_type": "local"})
        except Exception as e:
            self.stdio.error("StepLocalHandler init fail. Please check the NODES conf. node: {0}. Exception : {1} .".format(node, e))
            raise Exception("StepLocalHandler init fail. Please check the NODES conf node: {0} Exception : {1} .".format(node, e)) from e
        self.task_variable_dict = task_variable_dict
        self.parameter = []
        self.report = TaskReport

    def execute(self):
        """Run the step's command and record its result.

        Raises:
            StepExecuteFailException: if ``ssh`` is missing from the step or
                anything fails while building or running the command.
        """
        # Bound before the try block so the finally clause can always log it,
        # even when the failure happens before the command has been run.
        ssh_report_value = None
        try:
            if "ssh" not in self.step:
                raise StepExecuteFailException("StepLocalHandler execute ssh is not set")
            ssh_cmd = StringUtils.build_str_on_expr_by_dict(self.step["ssh"], self.task_variable_dict)
            self.stdio.verbose("step StepLocalHandler execute :{0} ".format(ssh_cmd))
            ssh_report_value = self.ssh_client.exec_cmd(ssh_cmd)
            if ssh_report_value is None:
                ssh_report_value = ""
            ssh_report_value = ssh_report_value.strip()
            self.ssh_report_value = ssh_report_value
            converted_value = Util.convert_to_number(ssh_report_value)
            self.stdio.verbose("ssh result:{0}".format(converted_value))
            if "result" in self.step and "set_value" in self.step["result"]:
                variable_name = self.step["result"]["set_value"]
                # Two placeholders: the original format string silently
                # dropped the value argument.
                self.stdio.verbose("ssh result set {0}: {1}".format(variable_name, converted_value))
                self.task_variable_dict[variable_name] = converted_value
        except Exception as e:
            error_message = "ssh execute Exception:{0}".format(e).strip()
            self.stdio.error(error_message)
            raise StepExecuteFailException(error_message) from e
        finally:
            self.ssh_client.ssh_close()
            self.stdio.verbose("step StepLocalHandler ssh_report_value:{0}".format(ssh_report_value))

    def update_step_variable_dict(self):
        """Return the task variable dict, including any value set by ``execute``."""
        return self.task_variable_dict
3 changes: 3 additions & 0 deletions src/handler/checker/step/stepbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from src.handler.checker.step.data_size import DataSizeHandler
from src.handler.checker.step.get_system_parameter import GetSystemParameterHandler
from src.handler.checker.result.result import CheckResult
from src.handler.checker.step.local_ssh import StepLocalHandler
from src.handler.checker.step.ssh import SshHandler
from src.handler.checker.step.sql import StepSQLHandler
from src.common.ssh_client.ssh import SshClient
Expand Down Expand Up @@ -58,6 +59,8 @@ def execute(self, report):
handler = StepSQLHandler(self.context, self.step, task_variable_dict=self.task_variable_dict)
elif self.step["type"] == "data_size":
handler = DataSizeHandler(self.context, self.step, self.cluster, self.task_variable_dict)
elif self.step["type"] == "local_ssh":
handler = StepLocalHandler(self.context, self.step, self.cluster, self.task_variable_dict)
else:
raise StepExecuteFailException("the type not support: {0}".format(self.step["type"]))
self.stdio.verbose("task execute and result")
Expand Down

0 comments on commit 281ed4b

Please sign in to comment.