diff --git a/.github/workflows/functional_test_v2.yaml b/.github/workflows/functional_test_v2.yaml index 9a63c41028..31b17d8dad 100644 --- a/.github/workflows/functional_test_v2.yaml +++ b/.github/workflows/functional_test_v2.yaml @@ -6,8 +6,12 @@ on: branches: [ main ] workflow_dispatch: inputs: - UPDATE_EXPECTED_RESULTS: - description: 'Set this to true to update expected results and collect updated test output as a Github workflow artifact.' + UPLOAD_UPDATED_EXPECTED_RESULTS: + description: 'Set this to true to upload updated golden file expected results and upload these results as a Github workflow artifact.' + required: false + default: false + UPLOAD_KUBERNETES_DEBUG_INFO: + description: 'Set this to true to collect the debug info of the k8s cluster and upload this info as a Github workflow artifact.' required: false default: false @@ -20,7 +24,8 @@ jobs: env: KUBECONFIG: /tmp/kube-config-splunk-otel-collector-chart-functional-testing KUBE_TEST_ENV: kind - UPDATE_EXPECTED_RESULTS: ${{ github.event.inputs.UPDATE_EXPECTED_RESULTS || 'false' }} + UPLOAD_UPDATED_EXPECTED_RESULTS: ${{ github.event.inputs.UPLOAD_UPDATED_EXPECTED_RESULTS || 'false' }} + UPLOAD_KUBERNETES_DEBUG_INFO: ${{ github.event.inputs.UPLOAD_KUBERNETES_DEBUG_INFO || 'false' }} strategy: fail-fast: false matrix: @@ -71,13 +76,28 @@ jobs: run: | make cert-manager - name: run functional tests + id: run-functional-tests env: K8S_VERSION: ${{ matrix.k8s-version }} run: | cd functional_tests - TEARDOWN_BEFORE_SETUP=true UPDATE_EXPECTED_RESULTS=${{ env.UPDATE_EXPECTED_RESULTS }} go test -v -tags ${{ matrix.test-job }} - - name: 'Upload test results' - if: always() && env.UPDATE_EXPECTED_RESULTS == 'true' + TEARDOWN_BEFORE_SETUP=true UPDATE_EXPECTED_RESULTS=${{ env.UPLOAD_UPDATED_EXPECTED_RESULTS }} go test -v -tags ${{ matrix.test-job }} + - name: Collect Kubernetes Cluster debug info on failure + if: always() && (steps.run-functional-tests.outcome == 'failure' || env.UPLOAD_KUBERNETES_DEBUG_INFO == 'true') + id: collect-debug-info + run: | + echo "Functional tests failed. Collecting debug info for current state of the Kubernetes cluster..." + cd tools + ./splunk_kubernetes_debug_info.sh + - name: Upload Kubernetes Cluster debug info + if: always() && (steps.run-functional-tests.outcome == 'failure' || env.UPLOAD_KUBERNETES_DEBUG_INFO == 'true') + uses: actions/upload-artifact@v4 + with: + name: k8s-debug-info-${{ matrix.test-job }}-${{ matrix.k8s-version }} + path: tools/splunk_kubernetes_debug_info_* + retention-days: 5 + - name: Upload test results + if: always() && env.UPLOAD_UPDATED_EXPECTED_RESULTS == 'true' uses: actions/upload-artifact@v4 with: name: functional_tests-${{ matrix.test-job }}-${{ matrix.k8s-version }} diff --git a/.gitignore b/.gitignore index 893e47be6a..843d2761e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .idea *.iml .DS_Store +*splunk_kubernetes_debug_info_* # Helm **/charts/*.tgz diff --git a/tools/splunk_kubernetes_debug_info.sh b/tools/splunk_kubernetes_debug_info.sh index 69e2988c9e..826ca55f17 100755 --- a/tools/splunk_kubernetes_debug_info.sh +++ b/tools/splunk_kubernetes_debug_info.sh @@ -3,7 +3,7 @@ # Description: # This script collects debugging information from a Kubernetes cluster. # It retrieves networking, firewall, security policies, custom resource definitions (CRDs), -# and logs from specified pods and secrets (sanitized). The outputs are saved to files for each namespace and object type. +# and logs from specified pods. The outputs are saved to files for each namespace and object type. # This helps in diagnosing and troubleshooting cluster configurations. # Finally, it compresses all the collected files into a ZIP archive. # @@ -28,7 +28,7 @@ # # Objects Scraped: # - Pod logs for agent, cluster-receiver, certmanager, operator, gateway, splunk pods -# - Deployments, daemonsets, secrets, Helm releases matching K8S_OBJECT_NAME_FILTER +# - Deployments, daemonsets, Helm releases matching K8S_OBJECT_NAME_FILTER # - NetworkPolicies, Services, Ingress resources, Endpoints, Roles, RoleBindings, Security contexts # - OpenTelemetry Instrumentation objects # - Custom Resource Definitions (CRDs), Pod Security Policies (PSPs), Security Context Constraints (SCCs) @@ -87,10 +87,10 @@ write_output() { collect_data_namespace() { local ns=$1 - object_types=("deployments" "daemonsets" "configmaps" "secrets" "networkpolicies" "svc" "ingress" "endpoints" "roles" "rolebindings" "otelinst") + object_types=("configmaps" "daemonsets" "deployments" "endpoints" "events" "ingress" "jobs" "networkpolicies" "otelinst" "rolebindings" "roles" "svc") for type in "${object_types[@]}"; do stdbuf -oL echo "Collecting $type data for $ns namespace with $k8s_object_name_filter name filter" - if [[ "$type" == "deployment" || "$type" == "daemonset" || "$type" == "configmaps" || "$type" == "secrets" ]]; then + if [[ "$type" == "deployment" || "$type" == "daemonset" || "$type" == "configmaps" ]]; then kubectl get "$type" -n "$ns" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep -E "$k8s_object_name_filter" | while read object; do cmd="kubectl get $type $object -n $ns -o yaml" output=$(eval "$cmd") @@ -201,21 +201,6 @@ collect_data_cluster() { output=$(eval "$cmd") write_output "$output" "$temp_dir/cluster_custom_resource_definitions.yaml" "$cmd" - echo "Collecting pod security policies..." - cmd="kubectl get psp -o yaml" - output=$(eval "$cmd") - write_output "$output" "$temp_dir/cluster_pod_security_policies.yaml" "$cmd" - - echo "Collecting security context constraints..." - cmd="kubectl get scc -o yaml" - output=$(eval "$cmd") - write_output "$output" "$temp_dir/cluster_security_context_constraints.yaml" "$cmd" - - echo "Collecting MutatingWebhookConfiguration objects..." - cmd="kubectl get mutatingwebhookconfiguration.admissionregistration.k8s.io -o yaml; kubectl describe mutatingwebhookconfiguration.admissionregistration.k8s.io; kubectl get --raw /metrics | grep apiserver_admission_webhook_rejection_count;" - output=$(eval "$cmd") - write_output "$output" "$temp_dir/cluster_webhooks.yaml" "$cmd" - echo "Checking for cert-manager installation..." cert_manager_pods=$(kubectl get pods --all-namespaces -l app=cert-manager --no-headers) if [ -n "$cert_manager_pods" ]; then @@ -233,6 +218,33 @@ collect_data_cluster() { done } +collect_cluster_resources() { + # List of cluster-scoped resource types to collect + cluster_object_types=( + "crds" + "psp" + "scc" + "mutatingwebhookconfiguration.admissionregistration.k8s.io" + "validatingwebhookconfiguration.admissionregistration.k8s.io" + ) + + for type in "${cluster_object_types[@]}"; do + echo "Collecting $type cluster-scoped resources..." + + # Fetch each object's name + kubectl get "$type" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | while read object; do + # Get the API version for this object, fallback to "unknown" + api_version=$(kubectl get "$type" "$object" -o jsonpath='{.apiVersion}' 2>/dev/null || echo "unknown") + api_version=${api_version//\//_} # Sanitize slashes in API version + + # Collect YAML output + cmd="kubectl get $type $object -o yaml" + output=$(eval "$cmd") + write_output "$output" "$temp_dir/cluster_${type//./_}_${api_version}_${object}.yaml" "$cmd" + done + done +} + # Parse input parameters namespaces="" k8s_object_name_filter="splunk|collector|otel|certmanager|test|sck|sock" @@ -279,9 +291,12 @@ script_start_time=$(date +"%Y-%m-%d %H:%M:%S") echo "Script start time: $script_start_time" echo "Script start time: $script_start_time" >> "$output_file" -# Collect cluster-wide data +# Collect cluster instance specific data collect_data_cluster +# Collect cluster scoped resources data +collect_cluster_resources + # Function to manage parallel processing of namespaces collect_data_namespace_namespaces() { local parallelism=20