
Commit b2d1d23

NSFS | NC | Health script | distinguish between temporary and persistent errors

Signed-off-by: naveenpaul1 <[email protected]>
naveenpaul1 committed Jan 18, 2024
1 parent d7b172c commit b2d1d23
Showing 5 changed files with 239 additions and 115 deletions.
2 changes: 2 additions & 0 deletions config.js
@@ -717,6 +717,8 @@ config.NSFS_WHITELIST = [];
// NSFS_RESTORE_ENABLED can override internal autodetection and will force
// the use of restore for all objects.
config.NSFS_RESTORE_ENABLED = false;
config.NSFS_HEALTH_ENDPOINT_RETRY_COUNT = 3;
config.NSFS_HEALTH_ENDPOINT_RETRY_DELAY = 10;

//Quota
config.QUOTA_LOW_THRESHOLD = 80;
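
The two settings added above are consumed later in this commit by the health script, which passes them to `P.retry` as `attempts` and `delay_ms` when probing the endpoint. As a rough sketch of those semantics (this is an illustration only, not the actual `P.retry` implementation), the helper below retries a probe a fixed number of times with a fixed delay:

```js
// Minimal sketch of the retry semantics behind the two values above: try the
// probe up to `attempts` times, sleeping `delay_ms` milliseconds between
// failures. Illustration only - not NooBaa's P.retry utility.
async function retry_probe(probe, attempts, delay_ms) {
    let last_err;
    for (let i = 0; i < attempts; i += 1) {
        try {
            return await probe();
        } catch (err) {
            last_err = err;
            if (i < attempts - 1) {
                await new Promise(resolve => setTimeout(resolve, delay_ms));
            }
        }
    }
    throw last_err;
}

// Hypothetical usage with the new config values:
// await retry_probe(check_endpoint, config.NSFS_HEALTH_ENDPOINT_RETRY_COUNT,
//     config.NSFS_HEALTH_ENDPOINT_RETRY_DELAY);
```
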
87 changes: 50 additions & 37 deletions docs/non_containerized_NSFS.md
@@ -226,7 +226,7 @@ NSFS Health status can be fetched using the command line. Run `--help` to get al
NOTE - health script execution requires root permissions.

```
sudo node usr/local/noobaa-core/src/cmd/health [--https_port,--all_account_details, --all_bucket_details]
sudo node usr/local/noobaa-core/src/cmd/health [--https_port, --all_account_details, --all_bucket_details]
```

output:
@@ -244,12 +244,14 @@ NOTE - health script execution requires root permissions.
{
"name": "nsfs",
"service_status": "active",
"pid": "1204"
"pid": "1204",
"error_type": "PERSISTENT"
},
{
"name": "rsyslog",
"service_status": "inactive",
"pid": "0"
"pid": "0",
"error_type": "PERSISTENT"
}
],
"endpoint": {
@@ -258,41 +260,50 @@ NOTE - health script execution requires root permissions.
"response_code": 200,
"response_message": "Endpoint running successfuly."
},
"total_fork_count": 0,
"running_workers": []
}
},
"invalid_accounts": [
{
"name": "naveen",
"storage_path": "/tmp/nsfs_root_invalid/",
"code": "STORAGE_NOT_EXIST"
}
],
"valid_accounts": [
{
"name": "naveen",
"storage_path": "/tmp/nsfs_root"
}
],
"invalid_buckets": [
{
"name": "bucket1.json",
"config_path": "/etc/noobaa.conf.d/buckets/bucket1.json",
"code": "INVALID_CONFIG"
"total_fork_count": 1,
"running_workers": [
"1"
]
},
{
"name": "bucket3",
"storage_path": "/tmp/nsfs_root/bucket3",
"code": "STORAGE_NOT_EXIST"
}
],
"valid_buckets": [
{
"name": "bucket2",
"storage_path": "/tmp/nsfs_root/bucket2"
}
]
"error_type": "TEMPORARY"
},
"accounts_status": {
"invalid_accounts": [
{
"name": "naveen",
"storage_path": "/tmp/nsfs_root_invalid/",
"code": "STORAGE_NOT_EXIST"
}
],
"valid_accounts": [
{
"name": "naveen",
"storage_path": "/tmp/nsfs_root"
}
],
"error_type": "PERSISTENT"
},
"buckets_status": {
"invalid_buckets": [
{
"name": "bucket1.json",
"config_path": "/etc/noobaa.conf.d/buckets/bucket1.json",
"code": "INVALID_CONFIG"
},
{
"name": "bucket3",
"storage_path": "/tmp/nsfs_root/bucket3",
"code": "STORAGE_NOT_EXIST"
}
],
"valid_buckets": [
{
"name": "bucket2",
"storage_path": "/tmp/nsfs_root/bucket2"
}
],
"error_type": "PERSISTENT"
}
}
}
```
@@ -320,6 +331,8 @@ NOTE - health script execution requires root permissions.

`valid_buckets`: List all the valid buckets if `all_bucket_details` flag is `true`.

`error_type`: This property has one of two values, `PERSISTENT` or `TEMPORARY`, indicating whether a retry might resolve the issue. For `TEMPORARY` errors, Noobaa performs several retries before marking the status as failed. Currently only the Noobaa endpoint check reports the `TEMPORARY` error type.
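
As an illustration (a hypothetical consumer script, not part of this commit), a wrapper could alert immediately on `PERSISTENT` failures and defer `TEMPORARY` ones for a re-check. Field names follow the sample output above; `health.json` is an assumed file holding the JSON printed by the health script:

```js
// Hypothetical consumer of the health output shown above (not part of NooBaa).
'use strict';
const fs = require('fs');

function failing_components(checks) {
    const failing = [];
    for (const svc of checks.services || []) {
        if (svc.service_status !== 'active') {
            failing.push({ name: svc.name, error_type: svc.error_type });
        }
    }
    const response = checks.endpoint && checks.endpoint.endpoint_state &&
        checks.endpoint.endpoint_state.response;
    if (response && response.response_code !== 200) {
        failing.push({ name: 'endpoint', error_type: checks.endpoint.error_type });
    }
    if (checks.accounts_status && (checks.accounts_status.invalid_accounts || []).length > 0) {
        failing.push({ name: 'accounts', error_type: checks.accounts_status.error_type });
    }
    if (checks.buckets_status && (checks.buckets_status.invalid_buckets || []).length > 0) {
        failing.push({ name: 'buckets', error_type: checks.buckets_status.error_type });
    }
    return failing;
}

// health.json is assumed to contain the health script's JSON output.
const health = JSON.parse(fs.readFileSync('health.json', 'utf8'));
for (const item of failing_components(health.checks || {})) {
    if (item.error_type === 'PERSISTENT') {
        console.error(`ALERT: ${item.name} reports a persistent failure`);
    } else {
        console.warn(`${item.name} reports a temporary failure; re-check after the retry window`);
    }
}
```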

In this health output, `bucket3`'s storage path is invalid and the directory mentioned in `new_buckets_path` for the account `naveen` is missing or not accessible. The endpoint curl command returns an error response (`"endpoint_response": 404`) if one or more buckets point to an invalid bucket storage path.

### Health Error Codes
79 changes: 66 additions & 13 deletions src/cmd/health.js
@@ -89,6 +89,11 @@ const fork_response_code = {
},
};

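// Error classification used by the health report: PERSISTENT means a retry
// will not help; TEMPORARY means the health script retries before reporting a failure.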
const health_errors_types = {
PERSISTENT: 'PERSISTENT',
TEMPORARY: 'TEMPORARY',
};

//suppress aws sdk related commands.
process.env.AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE = '1';

@@ -129,24 +134,61 @@ class NSFSHealth {
name: NSFS_SERVICE,
service_status: service_status,
pid: pid,
error_type: health_errors_types.PERSISTENT,
},
{
name: RSYSLOG_SERVICE,
service_status: rsyslog.service_status,
pid: rsyslog.pid,
error_type: health_errors_types.PERSISTENT,
}],
endpoint: {
endpoint_state
endpoint_state,
error_type: health_errors_types.TEMPORARY,
},
invalid_accounts: account_details.invalid_storages,
valid_accounts: account_details.valid_storages,
invalid_buckets: bucket_details.invalid_storages,
valid_buckets: bucket_details.valid_storages,
accounts_status: {
invalid_accounts: account_details.invalid_storages,
valid_accounts: account_details.valid_storages,
error_type: health_errors_types.PERSISTENT,
},
buckets_status: {
invalid_buckets: bucket_details.invalid_storages,
valid_buckets: bucket_details.valid_storages,
error_type: health_errors_types.PERSISTENT,
}
}
};
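// Omit the account/bucket sections unless the corresponding
// --all_account_details / --all_bucket_details flag was passed.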
if (!this.all_account_details) {
delete health.checks.accounts_status;
}
if (!this.all_bucket_details) {
delete health.checks.buckets_status;
}
return health;
}

async get_endpoint_response() {
let endpoint_state;
try {
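// Probe the endpoint up to NSFS_HEALTH_ENDPOINT_RETRY_COUNT times,
// waiting NSFS_HEALTH_ENDPOINT_RETRY_DELAY ms between attempts (see config.js).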
await P.retry({
attempts: config.NSFS_HEALTH_ENDPOINT_RETRY_COUNT,
delay_ms: config.NSFS_HEALTH_ENDPOINT_RETRY_DELAY,
func: async () => {
endpoint_state = await this.get_endpoint_fork_response();
if (endpoint_state.response.response_code === fork_response_code.NOT_RUNNING.response_code) {
throw new Error('Noobaa endpoint is not running, all the retries failed');
}
}
});
} catch (err) {
console.log('Error while pinging endpoint host: ' + HOSTNAME + ', port ' + this.https_port, err);
return {
response: fork_response_code.NOT_RUNNING.response_code,
};
}
return endpoint_state;
}

async get_error_code(nsfs_status, pid, rsyslog_status, endpoint_response_code) {
if (nsfs_status !== "active" || pid === "0") {
return health_errors.NSFS_SERVICE_FAILED;
@@ -188,7 +230,7 @@ class NSFSHealth {
}
}

async get_endpoint_response() {
async get_endpoint_fork_response() {
let url_path = '/total_fork_count';
const worker_ids = [];
let total_fork_count = 0;
@@ -257,24 +299,35 @@
};
}

async get_bucket_storage_status(config_root) {
async get_bucket_storage_status(config_root) {
const bucket_details = await this.get_storage_status(config_root, 'bucket', this.all_bucket_details);
return bucket_details;
}

async get_account_storage_status(config_root) {
async get_account_storage_status(config_root) {
const account_details = await this.get_storage_status(config_root, 'account', this.all_account_details);
return account_details;
}

async get_storage_status(config_root, type, all_details) {
async get_storage_status(config_root, type, all_details) {
const fs_context = this.get_root_fs_context();
const entries = await nb_native().fs.readdir(fs_context, this.get_config_path(config_root, type));
const config_files = entries.filter(entree => !native_fs_utils.isDirectory(entree) && entree.name.endsWith('.json'));
const config_root_type_path = this.get_config_path(config_root, type);
const invalid_storages = [];
const valid_storages = [];
//check for account and buckets dir paths
try {
await nb_native().fs.stat(fs_context, config_root_type_path);
} catch (err) {
dbg.log1(`Config root path missing ${type} folder in ${config_root_type_path}`);
return {
invalid_storages: invalid_storages,
valid_storages: valid_storages
};
}
const entries = await nb_native().fs.readdir(fs_context, config_root_type_path);
const config_files = entries.filter(entree => !native_fs_utils.isDirectory(entree) && entree.name.endsWith('.json'));
for (const config_file of config_files) {
const config_file_path = path.join(this.get_config_path(config_root, type), config_file.name);
const config_file_path = path.join(config_root_type_path, config_file.name);
let config_data;
let storage_path;
try {
@@ -315,7 +368,7 @@
};
}

get_config_path(config_root, type) {
get_config_path(config_root, type) {
return path.join(config_root, type === 'bucket' ? '/buckets' : '/accounts');
}
}
9 changes: 4 additions & 5 deletions src/deploy/standalone/noobaa_rsyslog.conf
@@ -6,8 +6,7 @@
#### MODULES ####

# The imjournal module below is now used as a message source instead of imuxsock.
module(load="imuxsock" # provides support for local system logging (e.g. via logger command)
SysSock.Use="off") # Turn off message reception via local log socket;
# $ModLoad imuxsock # Turn off message reception via local log socket;
#$ModLoad imjournal # provides access to the systemd journal
#$ModLoad imklog # reads kernel messages (the same are read from journald)
#$ModLoad immark # provides --MARK-- message capability
@@ -24,17 +23,17 @@ module(load="imuxsock" # provides support for local system logging (e.g. via
#### GLOBAL DIRECTIVES ####

# Where to place auxiliary files
global(workDirectory="/var/lib/rsyslog")
$WorkDirectory /var/lib/rsyslog

# Use default timestamp format
module(load="builtin:omfile" Template="RSYSLOG_TraditionalFileFormat")
# $ActionFileDefaultTemplate RSYSLOG_TraditionalFileFormat

# File syncing capability is disabled by default. This feature is usually not required,
# not useful and an extreme performance hit
#$ActionFileEnableSync on

# Include all config files in /etc/rsyslog.d/
include(file="/etc/rsyslog.d/*.conf" mode="optional")
# $IncludeConfig /etc/rsyslog.d/*.conf

# Turn off message reception via local log socket;
# local messages are retrieved through imjournal now.

