diff --git a/configure.ac b/configure.ac index ca964d9039a0..3151bff17a92 100644 --- a/configure.ac +++ b/configure.ac @@ -741,6 +741,33 @@ exit 1 fi + LIBHWLOC="" + AC_ARG_ENABLE(hwloc, + AS_HELP_STRING([--enable-hwloc], [Enable hwloc support [default=no]]), + [enable_hwloc=$enableval],[enable_hwloc=no]) + AS_IF([test "x$enable_hwloc" = "xyes"], [ + PKG_CHECK_MODULES([HWLOC], [hwloc >= 2.0.0], + [AC_DEFINE([HAVE_HWLOC], [1], [Define if hwloc library is present and meets version requirements])], + LIBHWLOC="no") + + if test "$LIBHWLOC" = "no"; then + echo + echo " ERROR! hwloc library version >= 2.0.0 not found, go get it" + echo " from https://www.open-mpi.org/projects/hwloc/ " + echo " or your distribution:" + echo + echo " Ubuntu: apt-get install hwloc libhwloc-dev" + echo " Fedora: dnf install hwloc hwloc-devel" + echo " CentOS/RHEL: yum install hwloc hwloc-devel" + echo + exit 1 + else + CFLAGS="${CFLAGS} ${HWLOC_CFLAGS}" + LDFLAGS="${LDFLAGS} ${HWLOC_LIBS}" + enable_hwloc="yes" + fi + ]) + # libpthread AC_ARG_WITH(libpthread_includes, [ --with-libpthread-includes=DIR libpthread include directory], @@ -2561,6 +2588,7 @@ SURICATA_BUILD_CONF="Suricata Configuration: JA4 support: ${enable_ja4} Non-bundled htp: ${enable_non_bundled_htp} Hyperscan support: ${enable_hyperscan} + Hwloc support: ${enable_hwloc} Libnet support: ${enable_libnet} liblz4 support: ${enable_liblz4} Landlock support: ${enable_landlock} diff --git a/doc/userguide/configuration/suricata-yaml.rst b/doc/userguide/configuration/suricata-yaml.rst index 33b3c5528fed..17a68d14067e 100644 --- a/doc/userguide/configuration/suricata-yaml.rst +++ b/doc/userguide/configuration/suricata-yaml.rst @@ -917,6 +917,7 @@ per available CPU/CPU core. threading: set-cpu-affinity: yes + autopin: no cpu-affinity: management-cpu-set: cpu: [ 0 ] # include only these cpus in affinity settings @@ -933,6 +934,13 @@ per available CPU/CPU core. 
medium: [ "1-2" ] high: [ 3 ] default: "medium" + interface-specific-cpu-set: + - interface: "enp4s0f0" # 0000:3b:00.0 # net_bonding0 # ens1f0 + cpu: [ 1,3,5,7,9 ] + mode: "exclusive" + prio: + high: [ "all" ] + default: "medium" verdict-cpu-set: cpu: [ 0 ] prio: @@ -969,6 +977,80 @@ Runmode Workers:: worker-cpu-set - used for receive,streamtcp,decode,detect,output(logging),respond/reject, verdict +Interface-specific CPU affinity settings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using the new configuration format introduced in Suricata 8.0 it is possible +to set CPU affinity settings per interface. This can be useful +when you have multiple interfaces and you want to dedicate specific CPU cores +to specific interfaces. This can be useful for example when Suricata runs on +multiple NUMA nodes and reads from interfaces on each NUMA node. + +Interface-specific affinity settings can be configured for the worker-cpu-set +and the receive-cpu-set (only used in autofp mode). +This feature is available for capture modes which work with interfaces +(af-packet, dpdk, etc.). The value of the interface key can be the kernel +interface name (e.g. eth0 for af-packet), the PCI address of the interface +(e.g. 0000:3b:00.0 for DPDK capture mode), or the name of the virtual device +interface (e.g. net_bonding0 for DPDK capture mode). +The interface names need to be unique and be located under the capture mode +configuration. + +The interface-specific settings will override the global settings for the +worker-cpu-set and receive-cpu-set. The CPUs do not need to be contained in +the parent node settings. If the interface-specific settings are not defined, +the global settings will be used. 
+ +:: + + threading: + set-cpu-affinity: yes + cpu-affinity: + worker-cpu-set: + interface-specific-cpu-set: + - interface: "eth0" # 0000:3b:00.0 # net_bonding0 + cpu: [ 1,3,5,7,9 ] + mode: "exclusive" + prio: + high: [ "all" ] + default: "medium" + +Automatic NUMA-aware CPU core pinning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When Suricata is running on a system with multiple NUMA nodes, it is possible +to automatically use CPUs from the same NUMA node as the network capture +interface. +CPU cores on the same NUMA nodes as the network capture interface have +reduced memory access latency and increase the performance of Suricata. +This is enabled by setting the `autopin` option to `yes` in the threading +section. This option is available for worker-cpu-set and receive-cpu-set. + +:: + + threading: + set-cpu-affinity: yes + autopin: yes + cpu-affinity: + worker-cpu-set: + cpu: [ "all" ] + mode: "exclusive" + prio: + high: [ "all" ] + +Consider 2 interfaces defined in the capture mode configuration, one on each +NUMA node. The `autopin` option is enabled to automatically use CPUs from the +same NUMA node as the interface. The worker-cpu-set is set to use all CPUs. +When an interface on the first NUMA node is used, the worker threads will be +pinned to CPUs on the first NUMA node. When an interface on the second NUMA node +is used, the worker threads will be pinned to CPUs on the second NUMA node. +If the number of CPU cores on a given NUMA node is exhausted then the worker +threads will be pinned to CPUs on the other NUMA node. + +The option `threading.autopin` can be combined with the interface-specific CPU +affinity settings. +To use the `autopin` option, the system must have the `hwloc` +dependency installed and pass `--enable-hwloc` to the configure script. 
IP Defrag --------- diff --git a/doc/userguide/upgrade.rst b/doc/userguide/upgrade.rst index fa08c8d14280..a738224d990a 100644 --- a/doc/userguide/upgrade.rst +++ b/doc/userguide/upgrade.rst @@ -99,6 +99,20 @@ Major changes + worker-cpu-set: + cpu: [0, 1] + - The `threading.cpu-affinity` configuration has been extended to support + interface-specific CPU affinity settings. This allows you to specify + CPU affinity settings for each interface separately. + The new configuration format is described in :ref:`suricata-yaml-threading`. + The old configuration format does not support this extension and will be + removed in Suricata 9.0. + - The `threading.cpu-affinity` configuration now supports autopinning + worker or receive threads to the same NUMA node as the network capture + interface is located on. + This can be enabled by setting `threading.autopin` to `yes`. + See :ref:`suricata-yaml-threading` for more information. + This requires the hwloc dependency to be installed and `--enable-hwloc` + to be passed to the configure script. + Removals ~~~~~~~~ - The ssh keywords ``ssh.protoversion`` and ``ssh.softwareversion`` have been removed. 
diff --git a/src/runmode-dpdk.c b/src/runmode-dpdk.c index 6bbe3c1f2ed6..32d8a1ff255b 100644 --- a/src/runmode-dpdk.c +++ b/src/runmode-dpdk.c @@ -368,12 +368,17 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str) SCReturnInt(-EINVAL); } - ThreadsAffinityType *wtaf = GetAffinityTypeFromName("worker-cpu-set"); + bool wtaf_periface = true; + ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", iconf->iface); if (wtaf == NULL) { - SCLogError("Specify worker-cpu-set list in the threading section"); - SCReturnInt(-EINVAL); + wtaf_periface = false; + wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL); // mandatory + if (wtaf == NULL) { + SCLogError("Specify worker-cpu-set list in the threading section"); + SCReturnInt(-EINVAL); + } } - ThreadsAffinityType *mtaf = GetAffinityTypeFromName("management-cpu-set"); + ThreadsAffinityType *mtaf = GetAffinityTypeForNameAndIface("management-cpu-set", NULL); if (mtaf == NULL) { SCLogError("Specify management-cpu-set list in the threading section"); SCReturnInt(-EINVAL); @@ -406,7 +411,12 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str) } if (strcmp(entry_str, "auto") == 0) { - iconf->threads = (uint16_t)sched_cpus / LiveGetDeviceCount(); + if (wtaf_periface) { + iconf->threads = (uint16_t)sched_cpus; + SCLogConfig("%s: auto-assigned %u threads", iconf->iface, iconf->threads); + SCReturnInt(0); + } + iconf->threads = (uint16_t)sched_cpus / LiveGetDeviceCountWithoutAssignedThreading(); if (iconf->threads == 0) { SCLogError("Not enough worker CPU cores with affinity were configured"); SCReturnInt(-ERANGE); @@ -416,7 +426,8 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str) iconf->threads++; remaining_auto_cpus--; } else if (remaining_auto_cpus == -1) { - remaining_auto_cpus = (int32_t)sched_cpus % LiveGetDeviceCount(); + remaining_auto_cpus = + (int32_t)sched_cpus % LiveGetDeviceCountWithoutAssignedThreading(); if 
(remaining_auto_cpus > 0) { iconf->threads++; remaining_auto_cpus--; @@ -844,23 +855,46 @@ static int ConfigLoad(DPDKIfaceConfig *iconf, const char *iface) SCReturnInt(0); } -static int32_t ConfigValidateThreads(uint16_t iface_threads) +static bool ConfigThreadsGenericIsValid(uint16_t iface_threads, ThreadsAffinityType *wtaf) { static uint32_t total_cpus = 0; total_cpus += iface_threads; - ThreadsAffinityType *wtaf = GetAffinityTypeFromName("worker-cpu-set"); if (wtaf == NULL) { SCLogError("Specify worker-cpu-set list in the threading section"); - return -1; + return false; } if (total_cpus > UtilAffinityGetAffinedCPUNum(wtaf)) { - SCLogError("Interfaces requested more cores than configured in the threading section " - "(requested %d configured %d", + SCLogError("Interfaces requested more cores than configured in the worker-cpu-set " + "threading section (requested %d configured %d", total_cpus, UtilAffinityGetAffinedCPUNum(wtaf)); - return -1; + return false; } - return 0; + return true; +} + +static bool ConfigThreadsInterfaceIsValid(uint16_t iface_threads, ThreadsAffinityType *itaf) +{ + if (iface_threads > UtilAffinityGetAffinedCPUNum(itaf)) { + SCLogError("Interface requested more cores than configured in the interface-specific " + "threading section (requested %d configured %d", + iface_threads, UtilAffinityGetAffinedCPUNum(itaf)); + return false; + } + + return true; +} + +static bool ConfigIsThreadingValid(uint16_t iface_threads, const char *iface) +{ + ThreadsAffinityType *itaf = GetAffinityTypeForNameAndIface("worker-cpu-set", iface); + ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL); + if (itaf && !ConfigThreadsInterfaceIsValid(iface_threads, itaf)) { + return false; + } else if (itaf == NULL && !ConfigThreadsGenericIsValid(iface_threads, wtaf)) { + return false; + } + return true; } static DPDKIfaceConfig *ConfigParse(const char *iface) @@ -873,7 +907,7 @@ static DPDKIfaceConfig *ConfigParse(const char *iface) 
ConfigInit(&iconf); retval = ConfigLoad(iconf, iface); - if (retval < 0 || ConfigValidateThreads(iconf->threads) != 0) { + if (retval < 0 || !ConfigIsThreadingValid(iconf->threads, iface)) { iconf->DerefFunc(iconf); SCReturnPtr(NULL, "void *"); } diff --git a/src/suricata.c b/src/suricata.c index ee9dfc0b5b69..b0b1721ccab0 100644 --- a/src/suricata.c +++ b/src/suricata.c @@ -111,6 +111,7 @@ #include "tmqh-packetpool.h" #include "tm-queuehandlers.h" +#include "util-affinity.h" #include "util-byte.h" #include "util-conf.h" #include "util-coredump-config.h" @@ -2298,6 +2299,9 @@ void PostRunDeinit(const int runmode, struct timeval *start_time) StreamTcpFreeConfig(STREAM_VERBOSE); DefragDestroy(); HttpRangeContainersDestroy(); +#ifdef HAVE_HWLOC + TopologyDestroy(); +#endif /* HAVE_HWLOC */ TmqResetQueues(); #ifdef PROFILING diff --git a/src/threadvars.h b/src/threadvars.h index 6f339e9839d5..471714a254c4 100644 --- a/src/threadvars.h +++ b/src/threadvars.h @@ -136,6 +136,9 @@ typedef struct ThreadVars_ { struct FlowQueue_ *flow_queue; bool break_loop; + /** Interface-specific thread affinity */ + char *iface_name; + Storage storage[]; } ThreadVars; diff --git a/src/tm-threads.c b/src/tm-threads.c index 07f9a9390df0..20ea29120e0d 100644 --- a/src/tm-threads.c +++ b/src/tm-threads.c @@ -865,8 +865,24 @@ TmEcode TmThreadSetupOptions(ThreadVars *tv) TmThreadSetPrio(tv); if (tv->thread_setup_flags & THREAD_SET_AFFTYPE) { ThreadsAffinityType *taf = &thread_affinity[tv->cpu_affinity]; + bool use_iface_affinity = RunmodeIsAutofp() && tv->cpu_affinity == RECEIVE_CPU_SET && + FindAffinityByInterface(taf, tv->iface_name) != NULL; + use_iface_affinity |= RunmodeIsWorkers() && tv->cpu_affinity == WORKER_CPU_SET && + FindAffinityByInterface(taf, tv->iface_name) != NULL; + + if (use_iface_affinity) { + taf = FindAffinityByInterface(taf, tv->iface_name); + } + + if (CPU_COUNT(&taf->cpu_set) == 0) { + if (!taf->nocpu_warned) { + SCLogWarning("No CPU affinity set for %s", 
AffinityGetYamlPath(taf)); + taf->nocpu_warned = true; + } + } + if (taf->mode_flag == EXCLUSIVE_AFFINITY) { - uint16_t cpu = AffinityGetNextCPU(taf); + uint16_t cpu = AffinityGetNextCPU(tv, taf); SetCPUAffinity(cpu); /* If CPU is in a set overwrite the default thread prio */ if (CPU_ISSET(cpu, &taf->lowprio_cpu)) { @@ -1600,6 +1616,10 @@ static void TmThreadFree(ThreadVars *tv) SCFree(tv->printable_name); } + if (tv->iface_name) { + SCFree(tv->iface_name); + } + if (tv->stream_pq_local) { BUG_ON(tv->stream_pq_local->len); SCMutexDestroy(&tv->stream_pq_local->mutex_q); diff --git a/src/util-affinity.c b/src/util-affinity.c index ee365372702a..e78fd7e8af61 100644 --- a/src/util-affinity.c +++ b/src/util-affinity.c @@ -31,50 +31,169 @@ #include "util-cpu.h" #include "util-byte.h" #include "util-debug.h" +#include "util-dpdk.h" ThreadsAffinityType thread_affinity[MAX_CPU_SET] = { { .name = "receive-cpu-set", .mode_flag = EXCLUSIVE_AFFINITY, .prio = PRIO_MEDIUM, - .lcpu = 0, + .lcpu = { 0 }, }, { .name = "worker-cpu-set", .mode_flag = EXCLUSIVE_AFFINITY, .prio = PRIO_MEDIUM, - .lcpu = 0, + .lcpu = { 0 }, }, { .name = "verdict-cpu-set", .mode_flag = BALANCED_AFFINITY, .prio = PRIO_MEDIUM, - .lcpu = 0, + .lcpu = { 0 }, }, { .name = "management-cpu-set", .mode_flag = BALANCED_AFFINITY, .prio = PRIO_MEDIUM, - .lcpu = 0, + .lcpu = { 0 }, }, }; int thread_affinity_init_done = 0; +#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun +#ifdef HAVE_HWLOC +static hwloc_topology_t topology = NULL; +#endif /* HAVE_HWLOC */ +#endif /* OS_WIN32 and __OpenBSD__ */ + +static ThreadsAffinityType *AllocAndInitAffinityType( + const char *name, const char *interface_name, ThreadsAffinityType *parent) +{ + ThreadsAffinityType *new_affinity = SCCalloc(1, sizeof(ThreadsAffinityType)); + if (new_affinity == NULL) { + FatalError("Unable to allocate memory for new CPU affinity type"); + } + + new_affinity->name = SCStrdup(interface_name); + if (new_affinity->name 
== NULL) { + FatalError("Unable to allocate memory for new CPU affinity type name"); + } + new_affinity->parent = parent; + new_affinity->mode_flag = EXCLUSIVE_AFFINITY; + new_affinity->prio = PRIO_MEDIUM; + for (int i = 0; i < MAX_NUMA_NODES; i++) { + new_affinity->lcpu[i] = 0; + } + + if (parent != NULL) { + if (parent->nb_children == parent->nb_children_capacity) { + if (parent->nb_children_capacity == 0) { + parent->nb_children_capacity = 2; + } else { + parent->nb_children_capacity *= 2; + } + void *p = SCRealloc( + parent->children, parent->nb_children_capacity * sizeof(ThreadsAffinityType *)); + if (p == NULL) { + FatalError("Unable to reallocate memory for children CPU affinity types"); + } + parent->children = p; + } + parent->children[parent->nb_children++] = new_affinity; + } + + return new_affinity; +} + +ThreadsAffinityType *FindAffinityByInterface( + ThreadsAffinityType *parent, const char *interface_name) +{ + for (uint32_t i = 0; i < parent->nb_children; i++) { + if (interface_name && strcmp(parent->children[i]->name, interface_name) == 0) { + return parent->children[i]; + } + } + return NULL; +} + +/** + * \brief Find affinity by name (*-cpu-set name) and an interface name. + * \param name the name of the affinity (e.g. worker-cpu-set, receive-cpu-set). + * The name is required and cannot be NULL. + * \param interface_name the name of the interface. + * If NULL, the affinity is looked up by name only. 
+ * \retval a pointer to the affinity or NULL if not found + */ +ThreadsAffinityType *GetAffinityTypeForNameAndIface(const char *name, const char *interface_name) +{ + int i; + ThreadsAffinityType *parent_affinity = NULL; + + for (i = 0; i < MAX_CPU_SET; i++) { + if (strcmp(thread_affinity[i].name, name) == 0) { + parent_affinity = &thread_affinity[i]; + break; + } + } + + if (parent_affinity == NULL) { + SCLogError("CPU affinity with name \"%s\" not found", name); + return NULL; + } + + if (interface_name != NULL) { + ThreadsAffinityType *child_affinity = + FindAffinityByInterface(parent_affinity, interface_name); + // found or not found, it is returned + return child_affinity; + } + + return parent_affinity; +} + /** - * \brief find affinity by its name + * \brief Finds affinity by its name and interface name. + * Interfaces are children of cpu-set names. If the queried interface is not + * found, then it is allocated, initialized and assigned to the queried cpu-set. + * \param name the name of the affinity (e.g. worker-cpu-set, receive-cpu-set). + * The name is required and cannot be NULL. + * \param interface_name the name of the interface. + * If NULL, the affinity is looked up by name only. 
* \retval a pointer to the affinity or NULL if not found */ -ThreadsAffinityType * GetAffinityTypeFromName(const char *name) +ThreadsAffinityType *GetOrAllocAffinityTypeForIfaceOfName( + const char *name, const char *interface_name) { int i; + ThreadsAffinityType *parent_affinity = NULL; + for (i = 0; i < MAX_CPU_SET; i++) { - if (!strcmp(thread_affinity[i].name, name)) { - return &thread_affinity[i]; + if (strcmp(thread_affinity[i].name, name) == 0) { + parent_affinity = &thread_affinity[i]; + break; } } - return NULL; + + if (parent_affinity == NULL) { + SCLogError("CPU affinity with name \"%s\" not found", name); + return NULL; + } + + if (interface_name != NULL) { + ThreadsAffinityType *child_affinity = + FindAffinityByInterface(parent_affinity, interface_name); + if (child_affinity != NULL) { + return child_affinity; + } + + // If not found, allocate and initialize a new child affinity + return AllocAndInitAffinityType(name, interface_name, parent_affinity); + } + + return parent_affinity; } #if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun @@ -275,38 +394,114 @@ static void SetupAffinityThreads(ThreadsAffinityType *taf, ConfNode *affinity) } } -static bool AllCPUsUsed(ThreadsAffinityType *taf) +/** + * \brief Get the YAML path for the given affinity type. + * The path is built using the parent name (if available) and the affinity name. + * Do not free the returned string. 
+ * \param taf the affinity type - if NULL, the path is built for the root node + * \return a string containing the YAML path, or NULL if the path is too long + */ +char *AffinityGetYamlPath(ThreadsAffinityType *taf) { - if (taf->lcpu < UtilCpuGetNumProcessorsOnline()) { - return false; + static char rootpath[] = "threading.cpu-affinity"; + static char path[1024] = { 0 }; + char subpath[256] = { 0 }; + + if (taf == NULL) { + return rootpath; + } + + if (taf->parent != NULL) { + long r = snprintf( + subpath, sizeof(subpath), "%s.interface-specific-cpu-set.", taf->parent->name); + if (r < 0 || r >= (long)sizeof(subpath)) { + FatalError("Unable to build YAML path for CPU affinity %s.%s", taf->parent->name, + taf->name); + } + } else { + subpath[0] = '\0'; } - return true; + + long r = snprintf(path, sizeof(path), "%s.%s%s", rootpath, subpath, taf->name); + if (r < 0 || r >= (long)sizeof(path)) { + FatalError("Unable to build YAML path for CPU affinity %s", taf->name); + } + + return path; } static void ResetCPUs(ThreadsAffinityType *taf) { - taf->lcpu = 0; + for (int i = 0; i < MAX_NUMA_NODES; i++) { + taf->lcpu[i] = 0; + } } -static uint16_t GetNextAvailableCPU(ThreadsAffinityType *taf) +/** + * \brief Check if the set name corresponds to a worker CPU set. + */ +static bool IsWorkerCpuSet(const char *setname) { - uint16_t cpu = taf->lcpu; - int attempts = 0; + return (strcmp(setname, "worker-cpu-set") == 0); +} - while (!CPU_ISSET(cpu, &taf->cpu_set) && attempts < 2) { - cpu = (cpu + 1) % UtilCpuGetNumProcessorsOnline(); - if (cpu == 0) - attempts++; +/** + * \brief Check if the set name corresponds to a receive CPU set. + */ +static bool IsReceiveCpuSet(const char *setname) +{ + return (strcmp(setname, "receive-cpu-set") == 0); +} + +/** + * \brief Set up affinity configuration for a single interface. 
+ */ +static void SetupSingleIfaceAffinity(ThreadsAffinityType *taf, ConfNode *iface_node) +{ + // offload to Setup function + ConfNode *child_node; + const char *interface_name = NULL; + TAILQ_FOREACH (child_node, &iface_node->head, next) { + if (strcmp(child_node->name, "interface") == 0) { + interface_name = child_node->val; + break; + } + } + if (interface_name == NULL) { + return; + } + + ThreadsAffinityType *iface_taf = + GetOrAllocAffinityTypeForIfaceOfName(taf->name, interface_name); + if (iface_taf == NULL) { + FatalError("Unknown CPU affinity type for interface: %s", interface_name); } - taf->lcpu = cpu + 1; + SetupCpuSets(iface_taf, iface_node, interface_name); + SetupAffinityPriority(iface_taf, iface_node, interface_name); + SetupAffinityMode(iface_taf, iface_node); + SetupAffinityThreads(iface_taf, iface_node); +} - if (attempts == 2) { - SCLogError( - "cpu_set does not contain available CPUs, CPU affinity configuration is invalid"); +/** + * \brief Set up per-interface affinity configurations. 
+ */ +static void SetupPerIfaceAffinity(ThreadsAffinityType *taf, ConfNode *affinity) +{ + char if_af[] = "interface-specific-cpu-set"; + ConfNode *per_iface_node = ConfNodeLookupChild(affinity, if_af); + if (per_iface_node == NULL) { + return; } - return cpu; + ConfNode *iface_node; + TAILQ_FOREACH (iface_node, &per_iface_node->head, next) { + if (strcmp(iface_node->val, "interface") == 0) { + SetupSingleIfaceAffinity(taf, iface_node); + } else { + SCLogWarning("Unknown node in %s: %s", if_af, iface_node->name); + } + } } /** @@ -323,9 +518,8 @@ static bool AffinityConfigIsDeprecated(void) return threading_affinity_deprecated; } - ConfNode *root = ConfGetNode("threading.cpu-affinity"); + ConfNode *root = ConfGetNode(AffinityGetYamlPath(NULL)); if (root == NULL) { - threading_affinity_deprecated = false; initialized = true; return threading_affinity_deprecated; } @@ -357,18 +551,17 @@ void AffinitySetupLoadFromConfig(void) AffinitySetupInit(); thread_affinity_init_done = 1; if (AffinityConfigIsDeprecated()) { - SCLogWarning("CPU affinity configuration uses a deprecated structure and will become " - "obsolete in a future major release (Suricata 9.0). Please update your " - "threading.cpu-affinity to the new format. " - "See notes in %s/upgrade.html#upgrading-7-0-to-8-0", - GetDocURL()); + SCLogWarning("CPU affinity configuration uses a deprecated structure and will not be " + "supported in a future major release (Suricata 9.0). Please update your " + "%s to the new format. 
See notes in %s/upgrade.html#upgrading-7-0-to-8-0", + AffinityGetYamlPath(NULL), GetDocURL()); } } - SCLogDebug("Loading threading.cpu-affinity from config"); - ConfNode *root = ConfGetNode("threading.cpu-affinity"); + SCLogDebug("Loading %s from config", AffinityGetYamlPath(NULL)); + ConfNode *root = ConfGetNode(AffinityGetYamlPath(NULL)); if (root == NULL) { - SCLogInfo("Cannot find threading.cpu-affinity node in config"); + SCLogInfo("Cannot find %s node in config", AffinityGetYamlPath(NULL)); return; } @@ -380,7 +573,7 @@ void AffinitySetupLoadFromConfig(void) continue; } - ThreadsAffinityType *taf = GetAffinityTypeFromName(setname); + ThreadsAffinityType *taf = GetOrAllocAffinityTypeForIfaceOfName(setname, NULL); if (taf == NULL) { FatalError("Unknown CPU affinity type: %s", setname); } @@ -393,25 +586,372 @@ void AffinitySetupLoadFromConfig(void) SetupAffinityPriority(taf, aff_query_node, setname); SetupAffinityMode(taf, aff_query_node); SetupAffinityThreads(taf, aff_query_node); + + if (!AffinityConfigIsDeprecated() && + (IsWorkerCpuSet(setname) || IsReceiveCpuSet(setname))) { + SetupPerIfaceAffinity(taf, affinity); + } } #endif /* OS_WIN32 and __OpenBSD__ */ } +#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun +#ifdef HAVE_HWLOC +static int HwLocDeviceNumaGet(hwloc_topology_t topo, hwloc_obj_t obj) +{ +#if HWLOC_VERSION_MAJOR >= 2 && HWLOC_VERSION_MINOR >= 5 + hwloc_obj_t nodes[MAX_NUMA_NODES]; + unsigned num_nodes = MAX_NUMA_NODES; + struct hwloc_location location; + + location.type = HWLOC_LOCATION_TYPE_OBJECT; + location.location.object = obj; + + int result = hwloc_get_local_numanode_objs(topo, &location, &num_nodes, nodes, 0); + if (result == 0 && num_nodes > 0 && num_nodes <= MAX_NUMA_NODES) { + return nodes[0]->logical_index; + } + return -1; +#endif /* HWLOC_VERSION_MAJOR >= 2 && HWLOC_VERSION_MINOR >= 5 */ + + hwloc_obj_t non_io_ancestor = hwloc_get_non_io_ancestor_obj(topo, obj); + if (non_io_ancestor == NULL) { 
+ return -1; + } + + // Iterate over NUMA nodes and check their nodeset + hwloc_obj_t numa_node = NULL; + while ((numa_node = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_NUMANODE, numa_node)) != NULL) { + if (hwloc_bitmap_isset(non_io_ancestor->nodeset, numa_node->os_index)) { + return numa_node->logical_index; + } + } + + return -1; +} + +static hwloc_obj_t HwLocDeviceGetByKernelName(hwloc_topology_t topo, const char *interface_name) +{ + hwloc_obj_t obj = NULL; + + while ((obj = hwloc_get_next_osdev(topo, obj)) != NULL) { + if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK && + strcmp(obj->name, interface_name) == 0) { + hwloc_obj_t parent = obj->parent; + while (parent) { + if (parent->type == HWLOC_OBJ_PCI_DEVICE) { + return parent; + } + parent = parent->parent; + } + } + } + return NULL; +} + +// Static function to deparse PCIe interface string name to individual components /** - * \brief Return next cpu to use for a given thread family - * \retval the cpu to used given by its id + * \brief Parse PCIe address string to individual components + * \param[in] pcie_address PCIe address string + * \param[out] domain Domain component + * \param[out] bus Bus component + * \param[out] device Device component + * \param[out] function Function component */ -uint16_t AffinityGetNextCPU(ThreadsAffinityType *taf) +static int PcieAddressToComponents(const char *pcie_address, unsigned int *domain, + unsigned int *bus, unsigned int *device, unsigned int *function) { - uint16_t ncpu = 0; -#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun + // Handle both full and short PCIe address formats + if (sscanf(pcie_address, "%x:%x:%x.%x", domain, bus, device, function) != 4) { + if (sscanf(pcie_address, "%x:%x.%x", bus, device, function) != 3) { + return -1; + } + *domain = 0; // Default domain to 0 if not provided + } + return 0; +} + +// Function to convert PCIe address to hwloc object +static hwloc_obj_t HwLocDeviceGetByPcie(hwloc_topology_t 
topo, const char *pcie_address) +{ + hwloc_obj_t obj = NULL; + unsigned int domain, bus, device, function; + int r = PcieAddressToComponents(pcie_address, &domain, &bus, &device, &function); + if (r == 0) { + while ((obj = hwloc_get_next_pcidev(topo, obj)) != NULL) { + if (obj->attr->pcidev.domain == domain && obj->attr->pcidev.bus == bus && + obj->attr->pcidev.dev == device && obj->attr->pcidev.func == function) { + return obj; + } + } + } + return NULL; +} + +static void HwlocObjectDump(hwloc_obj_t obj, const char *iface_name) +{ + if (!obj) { + SCLogDebug("No object found for the given PCIe address.\n"); + return; + } + + static char pcie_address[32]; + snprintf(pcie_address, sizeof(pcie_address), "%04x:%02x:%02x.%x", obj->attr->pcidev.domain, + obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func); + SCLogDebug("Interface (%s / %s) has NUMA ID %d", iface_name, pcie_address, + HwLocDeviceNumaGet(topology, obj)); + + SCLogDebug("Object type: %s\n", hwloc_obj_type_string(obj->type)); + SCLogDebug("Logical index: %u\n", obj->logical_index); + SCLogDebug("Depth: %u\n", obj->depth); + SCLogDebug("Attributes:\n"); + if (obj->type == HWLOC_OBJ_PCI_DEVICE) { + SCLogDebug(" Domain: %04x\n", obj->attr->pcidev.domain); + SCLogDebug(" Bus: %02x\n", obj->attr->pcidev.bus); + SCLogDebug(" Device: %02x\n", obj->attr->pcidev.dev); + SCLogDebug(" Function: %01x\n", obj->attr->pcidev.func); + SCLogDebug(" Class ID: %04x\n", obj->attr->pcidev.class_id); + SCLogDebug(" Vendor ID: %04x\n", obj->attr->pcidev.vendor_id); + SCLogDebug(" Device ID: %04x\n", obj->attr->pcidev.device_id); + SCLogDebug(" Subvendor ID: %04x\n", obj->attr->pcidev.subvendor_id); + SCLogDebug(" Subdevice ID: %04x\n", obj->attr->pcidev.subdevice_id); + SCLogDebug(" Revision: %02x\n", obj->attr->pcidev.revision); + SCLogDebug(" Link speed: %f GB/s\n", obj->attr->pcidev.linkspeed); + } else { + SCLogDebug(" No PCI device attributes available.\n"); + } +} + +static bool 
TopologyShouldAutopin(ThreadVars *tv, ThreadsAffinityType *taf) +{ + bool cond; SCMutexLock(&taf->taf_mutex); - ncpu = GetNextAvailableCPU(taf); + cond = tv->type == TVT_PPT && tv->iface_name && + (strcmp(tv->iface_name, taf->name) == 0 || + (strcmp("worker-cpu-set", taf->name) == 0 && RunmodeIsWorkers()) || + (strcmp("receive-cpu-set", taf->name) == 0 && RunmodeIsAutofp())); + SCMutexUnlock(&taf->taf_mutex); + return cond; +} + +static void TopologyInitialize(void) +{ + if (topology == NULL) { + if (hwloc_topology_init(&topology) == -1) { + FatalError("Failed to initialize topology"); + } + + if (hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) == -1 || + hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL) == -1 || + hwloc_topology_load(topology) == -1) { + FatalError("Failed to set/load topology"); + } + } +} + +void TopologyDestroy() +{ + if (topology != NULL) { + hwloc_topology_destroy(topology); + topology = NULL; + } +} + +static int InterfaceGetNumaNode(ThreadVars *tv) +{ + hwloc_obj_t if_obj = HwLocDeviceGetByKernelName(topology, tv->iface_name); + if (if_obj == NULL) { + if_obj = HwLocDeviceGetByPcie(topology, tv->iface_name); + } + + if (if_obj != NULL && SCLogGetLogLevel() == SC_LOG_DEBUG) { + HwlocObjectDump(if_obj, tv->iface_name); + } + + int32_t numa_id = HwLocDeviceNumaGet(topology, if_obj); + if (numa_id < 0 && SCRunmodeGet() == RUNMODE_DPDK) { + // DPDK fallback for e.g. 
net_bonding (vdev) PMDs + int32_t r = DPDKDeviceNameSetSocketID(tv->iface_name, &numa_id); + if (r < 0) { + numa_id = -1; + } + } + + if (numa_id < 0) { + SCLogDebug("Unable to find NUMA node for interface %s", tv->iface_name); + } + + return numa_id; +} +#endif /* HAVE_HWLOC */ + +static bool CPUIsFromNuma(uint16_t ncpu, uint16_t numa) +{ +#ifdef HAVE_HWLOC + int core_id = ncpu; + int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); + hwloc_obj_t numa_node = NULL; + + while ((numa_node = hwloc_get_next_obj_by_depth(topology, depth, numa_node)) != NULL) { + hwloc_cpuset_t cpuset = hwloc_bitmap_alloc(); + hwloc_bitmap_copy(cpuset, numa_node->cpuset); + + if (hwloc_bitmap_isset(cpuset, core_id)) { + SCLogDebug("Core %d - NUMA %d", core_id, numa_node->logical_index); + hwloc_bitmap_free(cpuset); + break; + } + hwloc_bitmap_free(cpuset); + } + + if (numa_node != NULL && numa == numa_node->logical_index) { + return true; + } + +#endif /* HAVE_HWLOC */ + + return false; +} + +static int16_t FindCPUInNumaNode(int numa_node, ThreadsAffinityType *taf) +{ + if (numa_node < 0) { + return -1; + } + + if (taf->lcpu[numa_node] >= UtilCpuGetNumProcessorsOnline()) { + return -1; + } + + uint16_t cpu = taf->lcpu[numa_node]; + while (cpu < UtilCpuGetNumProcessorsOnline() && + (!CPU_ISSET(cpu, &taf->cpu_set) || !CPUIsFromNuma(cpu, (uint16_t)numa_node))) { + cpu++; + } + + taf->lcpu[numa_node] = + (CPU_ISSET(cpu, &taf->cpu_set) && CPUIsFromNuma(cpu, (uint16_t)numa_node)) + ? cpu + 1 + : UtilCpuGetNumProcessorsOnline(); + return (CPU_ISSET(cpu, &taf->cpu_set) && CPUIsFromNuma(cpu, (uint16_t)numa_node)) ? 
(int16_t)cpu + : -1; +} + +static int16_t CPUSelectFromNuma(int iface_numa, ThreadsAffinityType *taf) +{ + if (iface_numa != -1) { + return FindCPUInNumaNode(iface_numa, taf); + } + return -1; +} + +static int16_t CPUSelectAlternative(int iface_numa, ThreadsAffinityType *taf) +{ + for (int nid = 0; nid < MAX_NUMA_NODES; nid++) { + if (iface_numa == nid) { + continue; + } + + int16_t cpu = FindCPUInNumaNode(nid, taf); + if (cpu != -1) { + SCLogPerf("CPU %d from NUMA %d assigned to a network interface located on NUMA %d", cpu, + nid, iface_numa); + return cpu; + } + } + return -1; +} + +/** + * \brief Select the next available CPU for the given affinity type. + * taf->cpu_set is a bit array where each bit represents a CPU core. + * The function iterates over the bit array and returns the first available CPU. + * If last used CPU core index is higher than the indexes of available cores, + * we reach the end of the array, and we reset the CPU selection. + * On the second reset attempt, the function bails out with a default value. + * The second attempt should only happen with an empty CPU set. 
+ */ +static uint16_t CPUSelectDefault(ThreadsAffinityType *taf) +{ + uint16_t cpu = taf->lcpu[0]; + int attempts = 0; + while (!CPU_ISSET(cpu, &taf->cpu_set) && attempts < 2) { + cpu = (cpu + 1) % UtilCpuGetNumProcessorsOnline(); + if (cpu == 0) { + attempts++; + } + } - if (AllCPUsUsed(taf)) { - ResetCPUs(taf); + taf->lcpu[0] = cpu + 1; + return cpu; +} + +static uint16_t CPUSelectFromNumaOrDefault(int iface_numa, ThreadsAffinityType *taf) +{ + uint16_t attempts = 0; + int16_t cpu = -1; + while (attempts < 2) { + cpu = CPUSelectFromNuma(iface_numa, taf); + if (cpu == -1) { + cpu = CPUSelectAlternative(iface_numa, taf); + if (cpu == -1) { + // All CPUs from all NUMAs are used at this point + ResetCPUs(taf); + attempts++; + } + } + + if (cpu >= 0) { + return (uint16_t)cpu; + } + } + return CPUSelectDefault(taf); +} + +static uint16_t GetNextAvailableCPU(int iface_numa, ThreadsAffinityType *taf) +{ + if (iface_numa < 0) { + return CPUSelectDefault(taf); } + return CPUSelectFromNumaOrDefault(iface_numa, taf); +} + +static bool AutopinEnabled(void) +{ + int autopin = 0; + if (ConfGetBool("threading.autopin", &autopin) != 1) { + return false; + } + return (bool)autopin; +} + +#endif /* OS_WIN32 and __OpenBSD__ */ + +uint16_t AffinityGetNextCPU(ThreadVars *tv, ThreadsAffinityType *taf) +{ + uint16_t ncpu = 0; +#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun + int iface_numa = -1; + if (AutopinEnabled()) { +#ifdef HAVE_HWLOC + if (TopologyShouldAutopin(tv, taf)) { + TopologyInitialize(); + iface_numa = InterfaceGetNumaNode(tv); + } +#else + static bool printed = false; + if (!printed) { + printed = true; + SCLogWarning( + "threading.autopin option is enabled but hwloc support is not compiled in. 
" + "Make sure to pass --enable-nfqueue to configure when building Suricata."); + } +#endif /* HAVE_HWLOC */ + } + + SCMutexLock(&taf->taf_mutex); + ncpu = GetNextAvailableCPU(iface_numa, taf); SCLogDebug("Setting affinity on CPU %d", ncpu); SCMutexUnlock(&taf->taf_mutex); #endif /* OS_WIN32 and __OpenBSD__ */ diff --git a/src/util-affinity.h b/src/util-affinity.h index 2fa4509ffa2c..ceca26dfeff9 100644 --- a/src/util-affinity.h +++ b/src/util-affinity.h @@ -26,6 +26,11 @@ #include "suricata-common.h" #include "conf.h" #include "threads.h" +#include "threadvars.h" + +#ifdef HAVE_HWLOC +#include +#endif /* HAVE_HWLOC */ #if defined OS_FREEBSD #include @@ -62,10 +67,12 @@ enum { MAX_AFFINITY }; +#define MAX_NUMA_NODES 16 + typedef struct ThreadsAffinityType_ { const char *name; uint8_t mode_flag; - uint16_t lcpu; /* use by exclusive mode */ + uint16_t lcpu[MAX_NUMA_NODES]; /* use by exclusive mode */ int prio; uint32_t nb_threads; SCMutex taf_mutex; @@ -76,6 +83,12 @@ typedef struct ThreadsAffinityType_ { cpu_set_t medprio_cpu; cpu_set_t hiprio_cpu; #endif + struct ThreadsAffinityType_ **children; + uint32_t nb_children; + uint32_t nb_children_capacity; + struct ThreadsAffinityType_ *parent; + // a flag to avoid multiple warnings when no CPU is set + bool nocpu_warned; } ThreadsAffinityType; /** store thread affinity mode for all type of threads */ @@ -83,10 +96,16 @@ typedef struct ThreadsAffinityType_ { extern ThreadsAffinityType thread_affinity[MAX_CPU_SET]; #endif +char *AffinityGetYamlPath(ThreadsAffinityType *taf); void AffinitySetupLoadFromConfig(void); -ThreadsAffinityType * GetAffinityTypeFromName(const char *name); +ThreadsAffinityType *GetOrAllocAffinityTypeForIfaceOfName( + const char *name, const char *interface_name); +ThreadsAffinityType *GetAffinityTypeForNameAndIface(const char *name, const char *interface_name); +ThreadsAffinityType *FindAffinityByInterface( + ThreadsAffinityType *parent, const char *interface_name); -uint16_t 
AffinityGetNextCPU(ThreadsAffinityType *taf); +void TopologyDestroy(void); +uint16_t AffinityGetNextCPU(ThreadVars *tv, ThreadsAffinityType *taf); uint16_t UtilAffinityGetAffinedCPUNum(ThreadsAffinityType *taf); #ifdef HAVE_DPDK uint16_t UtilAffinityCpusOverlap(ThreadsAffinityType *taf1, ThreadsAffinityType *taf2); diff --git a/src/util-device.c b/src/util-device.c index fd4cf5685f0b..ec1e91b41374 100644 --- a/src/util-device.c +++ b/src/util-device.c @@ -24,6 +24,7 @@ #include "device-storage.h" #include "util-debug.h" +#include "util-affinity.h" #define MAX_DEVNAME 10 @@ -173,6 +174,20 @@ int LiveGetDeviceCount(void) return i; } +int LiveGetDeviceCountWithoutAssignedThreading(void) +{ + int i = 0; + LiveDevice *pd; + + TAILQ_FOREACH (pd, &live_devices, next) { + if (GetAffinityTypeForNameAndIface("worker-cpu-set", pd->dev) == NULL) { + i++; + } + } + + return i; +} + /** * \brief Get a pointer to the device name at idx * diff --git a/src/util-device.h b/src/util-device.h index 0774825385a3..075c21567c81 100644 --- a/src/util-device.h +++ b/src/util-device.h @@ -85,6 +85,7 @@ void LiveDevAddBypassStats(LiveDevice *dev, uint64_t cnt, int family); void LiveDevSubBypassStats(LiveDevice *dev, uint64_t cnt, int family); void LiveDevAddBypassFail(LiveDevice *dev, uint64_t cnt, int family); void LiveDevAddBypassSuccess(LiveDevice *dev, uint64_t cnt, int family); +int LiveGetDeviceCountWithoutAssignedThreading(void); int LiveGetDeviceCount(void); const char *LiveGetDeviceName(int number); LiveDevice *LiveGetDevice(const char *dev); diff --git a/src/util-runmodes.c b/src/util-runmodes.c index f78e857abfc6..be4da6bd49ee 100644 --- a/src/util-runmodes.c +++ b/src/util-runmodes.c @@ -175,6 +175,14 @@ int RunModeSetLiveCaptureAutoFp(ConfigIfaceParserFunc ConfigParser, FatalError("TmThreadsCreate failed"); } tv_receive->printable_name = printable_threadname; + if (dev) { + tv_receive->iface_name = SCStrdup(dev); + if (tv_receive->iface_name == NULL) { + FatalError("Failed to 
allocate memory for iface name"); + } + } else { + tv_receive->iface_name = NULL; + } TmModule *tm_module = TmModuleGetByName(recv_mod_name); if (tm_module == NULL) { FatalError("TmModuleGetByName failed for %s", recv_mod_name); @@ -283,6 +291,14 @@ static int RunModeSetLiveCaptureWorkersForDevice(ConfigIfaceThreadsCountFunc Mod FatalError("TmThreadsCreate failed"); } tv->printable_name = printable_threadname; + if (live_dev) { + tv->iface_name = SCStrdup(live_dev); + if (tv->iface_name == NULL) { + FatalError("Failed to allocate memory for iface name"); + } + } else { + tv->iface_name = NULL; + } tm_module = TmModuleGetByName(recv_mod_name); if (tm_module == NULL) { diff --git a/suricata.yaml.in b/suricata.yaml.in index 954acde2d3ae..c4319f02b45b 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -1777,6 +1777,7 @@ spm-algo: auto # Suricata is multi-threaded. Here the threading can be influenced. threading: set-cpu-affinity: no + autopin: no # Tune cpu affinity of threads. Each family of threads can be bound # to specific CPUs. # @@ -1793,6 +1794,13 @@ threading: cpu: [ 0 ] # include only these CPUs in affinity settings receive-cpu-set: cpu: [ 0 ] # include only these CPUs in affinity settings + # interface-specific-cpu-set: + # - interface: "enp4s0f0" + # cpu: [ 1,3,5,7,9 ] + # mode: "exclusive" + # prio: + # high: [ "all" ] + # default: "medium" worker-cpu-set: cpu: [ "all" ] mode: "exclusive" @@ -1804,6 +1812,13 @@ threading: medium: [ "1-2" ] high: [ 3 ] default: "medium" + interface-specific-cpu-set: + - interface: "enp4s0f0" # 0000:3b:00.0 # net_bonding0 # ens1f0 + cpu: [ 1,3,5,7,9 ] + mode: "exclusive" + prio: + high: [ "all" ] + default: "medium" #verdict-cpu-set: # cpu: [ 0 ] # prio: