Commit
Feature/metrics for 1.7 release (#1356)
* Feature/monitoring (#1306)

* ongoing metrics redefinition

* configured kafka-ui, re-sorted services

* change visibility

* wip: new gauge

* new logging

* more logging

* #1251: move EventMessageCountingMonitor setup to the constructor; add /actuator/prometheus to permitted paths

* create additional metrics for tasks, reduce logging in integration tests

* configured monitoring

* tasklist service: fix prometheus path

* remove unneeded config

* rename metric to match the sender side

* rename label

---------

Co-authored-by: stephan.strehler <[email protected]>
Co-authored-by: Simon Hirtreiter <[email protected]>

* configurable metrics, fix #1338 (#1353)

* configurable metrics, fix #1338

* Update digiwf-engine/digiwf-engine-service/src/main/resources/application.yml

Co-authored-by: Simon Hirtreiter <[email protected]>

* Update digiwf-libs/digiwf-camunda-prometheus/digiwf-camunda-prometheus-starter/src/main/java/de/muenchen/oss/digiwf/camunda/prometheus/MetricsProviderSchedulerAutoConfiguration.java

Co-authored-by: Simon Hirtreiter <[email protected]>

* Update digiwf-libs/digiwf-camunda-prometheus/digiwf-camunda-prometheus-starter/src/main/java/de/muenchen/oss/digiwf/camunda/prometheus/CamundaPrometheusProperties.java

Co-authored-by: Simon Hirtreiter <[email protected]>

---------

Co-authored-by: Simon Hirtreiter <[email protected]>

---------

Co-authored-by: Simon Zambrovski <[email protected]>
Co-authored-by: stephan.strehler <[email protected]>
3 people authored Feb 23, 2024
1 parent 36e9d4c commit e0f5fa5
Showing 50 changed files with 935 additions and 218 deletions.
6 changes: 6 additions & 0 deletions digiwf-engine/digiwf-engine-service/pom.xml
@@ -83,6 +83,12 @@
<version>${project.version}</version>
</dependency>

<dependency>
<groupId>de.muenchen.oss.digiwf</groupId>
<artifactId>digiwf-camunda-prometheus-starter</artifactId>
<version>${project.version}</version>
</dependency>


<!-- Streaming -->
<dependency>
@@ -12,11 +12,6 @@
# logging:
# requests: all

spring:
sleuth:
web:
enabled: false

server:
error:
include-exception: true
@@ -185,6 +185,16 @@ digiwf:
s3service:
topic: 'dwf-s3-${DIGIWF_ENV}'
httpAPI: ${DIGIWF_S3_HTTPAPI:http://localhost:8086}
prometheus:
process-engine:
update-interval: 30000
providers:
fniAndEde: true
incident: true
job: true
process: false # deactivate the process definition metrics
task: true

io:
muenchendigital:
digiwf:
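The digiwf.prometheus block above is what makes the individual metrics providers switchable and the update interval configurable. As a rough, hypothetical illustration (the actual CamundaPrometheusProperties class in digiwf-camunda-prometheus-starter is not shown in this excerpt and may be shaped differently), such a YAML block could bind to a Spring configuration-properties class along these lines:

package de.muenchen.oss.digiwf.camunda.prometheus;

import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;

// Hypothetical sketch: prefix, nesting and field names are assumptions mirroring the YAML
// above, not the real CamundaPrometheusProperties from this PR.
@ConfigurationProperties(prefix = "digiwf.prometheus")
@Data
public class PrometheusPropertiesSketch {

    private final ProcessEngine processEngine = new ProcessEngine();
    private final Providers providers = new Providers();

    @Data
    public static class ProcessEngine {
        // Milliseconds between scheduled refreshes of the metrics providers.
        private long updateInterval = 30000L;
    }

    @Data
    public static class Providers {
        // One switch per provider, matching the 'providers' keys in application.yml.
        private boolean fniAndEde = true;
        private boolean incident = true;
        private boolean job = true;
        private boolean process = true;
        private boolean task = true;
    }
}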
@@ -0,0 +1,11 @@
logging:
level:
org.apache.kafka.clients.admin: WARN
org.apache.kafka.clients.consumer: WARN
org.apache.kafka.clients.producer: WARN

# overwrite vars
DIGIWF_ENV: itest
#KAFKA_BOOTSTRAP_SERVER: localhost
#KAFKA_BOOTSTRAP_SERVER_PORT: 19999
#KAFKA_SECURITY_PROTOCOL: PLAINTEXT
@@ -0,0 +1,11 @@
logging:
level:
org.apache.kafka.clients.admin: WARN
org.apache.kafka.clients.consumer: WARN
org.apache.kafka.clients.producer: WARN

# overwrite vars
DIGIWF_ENV: itest
#KAFKA_BOOTSTRAP_SERVER: localhost
#KAFKA_BOOTSTRAP_SERVER_PORT: 19999
#KAFKA_SECURITY_PROTOCOL: PLAINTEXT
@@ -5,7 +5,9 @@
import de.muenchen.oss.digiwf.s3.integration.client.api.FolderApiApi;
import de.muenchen.oss.digiwf.s3.integration.client.properties.S3IntegrationClientProperties;
import de.muenchen.oss.digiwf.s3.integration.client.service.ApiClientFactory;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
@@ -39,10 +41,16 @@
})
@RequiredArgsConstructor
@EnableConfigurationProperties(S3IntegrationClientProperties.class)
@Slf4j
public class S3IntegrationClientAutoConfiguration {

public final S3IntegrationClientProperties s3IntegrationClientProperties;

@PostConstruct
public void init() {
log.info("[DIGIWF-S3-INTEGRATION-CLIENT]: Starting integration client, security is {}.", s3IntegrationClientProperties.isEnableSecurity() ? "enabled" : "disabled");
}

@Bean
@ConditionalOnProperty(prefix = "io.muenchendigital.digiwf.s3.client", name = "securityEnabled", havingValue = "true")
public ApiClientFactory securedApiClientFactory(final ClientRegistrationRepository clientRegistrationRepository,
@@ -18,6 +18,11 @@
<artifactId>camunda-engine</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>io.micrometer</groupId>
@@ -31,7 +36,6 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>

</dependencies>

</project>
@@ -0,0 +1,54 @@
package de.muenchen.oss.digiwf.camunda.prometheus;

import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Counter;
import lombok.RequiredArgsConstructor;
import org.camunda.bpm.engine.RepositoryService;
import org.camunda.bpm.engine.delegate.DelegateExecution;
import org.springframework.context.event.EventListener;

@RequiredArgsConstructor
public class ExecutionEventReporter implements MetricsReporter {

private final RepositoryService repositoryService;
private Counter startExecutionEvents;
private Counter endExecutionEvents;

@Override
public void registerMetrics(CollectorRegistry collectorRegistry) {
this.startExecutionEvents = Counter.build()
.name("camunda_start_process_events_count")
.help("Start process events by process definition.")
.labelNames("processDefinitionKey")
.register(collectorRegistry);
this.endExecutionEvents = Counter.build()
.name("camunda_end_process_events_count")
.help("End process events by process definition.")
.labelNames("processDefinitionKey")
.register(collectorRegistry);
}

@EventListener(condition = "#execution.eventName.equals('start')")
public void countStartEvent(DelegateExecution execution) {
if (isModelElementOfType("startEvent", execution)) {
startExecutionEvents.labels(getProcessDefinitionKey(execution)).inc();
}
}

@EventListener(condition = "#execution.eventName.equals('end')")
public void countEndEvent(DelegateExecution execution) {
if (isModelElementOfType("endEvent", execution)) {
endExecutionEvents.labels(getProcessDefinitionKey(execution)).inc();
}
}

private static boolean isModelElementOfType(String typeName, DelegateExecution execution) {
return execution.getBpmnModelElementInstance() != null
&& typeName.equals(execution.getBpmnModelElementInstance().getElementType().getTypeName());
}

private String getProcessDefinitionKey(DelegateExecution execution) {
return repositoryService.getProcessDefinition(execution.getProcessDefinitionId()).getKey();
}

}
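For the two @EventListener methods above to fire, the reporter has to be a Spring bean and the Camunda Spring Boot eventing bridge has to publish execution events (camunda.bpm.eventing.execution, enabled by default in the Camunda starter). A minimal, assumed wiring sketch, not taken from this commit (the starter's own auto-configuration may differ):

package de.muenchen.oss.digiwf.camunda.prometheus;

import io.prometheus.client.CollectorRegistry;
import org.camunda.bpm.engine.RepositoryService;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

// Assumed wiring: expose the reporter as a bean so Spring delivers the engine's
// execution events to it, and register its counters once against the shared registry.
@Configuration
public class ExecutionEventReporterWiringSketch {

    @Bean
    public ExecutionEventReporter executionEventReporter(final RepositoryService repositoryService,
                                                         final CollectorRegistry collectorRegistry) {
        final ExecutionEventReporter reporter = new ExecutionEventReporter(repositoryService);
        reporter.registerMetrics(collectorRegistry); // creates and registers both counters
        return reporter;
    }
}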
@@ -3,21 +3,36 @@
import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Gauge;
import lombok.RequiredArgsConstructor;
import org.camunda.bpm.engine.HistoryService;
import org.camunda.bpm.engine.ManagementService;
import org.camunda.bpm.engine.RepositoryService;
import org.camunda.bpm.engine.repository.ProcessDefinition;

import java.util.List;

@RequiredArgsConstructor
public class FniAndEdeMetricsProvider implements MetricsProvider {

private final ManagementService managementService;
private final RepositoryService repositoryService;
private final HistoryService historyService;

private Gauge fniCount;
private Gauge edeCount;

@Override
public void updateMetrics() {
// Get total number of FNIs
this.fniCount.labels("total").set(this.managementService.createMetricsQuery().name("activity-instance-start").sum());

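// Aggregate series: both labels (processDefinitionKey, processDefinitionId) are set to "total".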
this.fniCount
.labels("total", "total")
.set(this.managementService.createMetricsQuery().name("activity-instance-start").sum());

// Get FNIs by process definition (this only includes the currently deployed definitions and may be lower than the total count)
repositoryService.createProcessDefinitionQuery().list().forEach(processDefinition ->
this.fniCount
.labels(processDefinition.getKey(), processDefinition.getId())
.set(historyService.createHistoricActivityInstanceQuery().processDefinitionId(processDefinition.getId()).count())
);
// Get total number of EDEs
this.edeCount.set(this.managementService.createMetricsQuery().name("executed-decision-elements").sum());
}
@@ -27,7 +42,7 @@ public void registerMetrics(final CollectorRegistry collectorRegistry) {
this.fniCount = Gauge.build()
.name("camunda_activity_instances")
.help("Number of activity instances (BPMN FNI) in total and by deployed process definition.")
.labelNames("processDefinitionId")
.labelNames("processDefinitionKey", "processDefinitionId")
.register(collectorRegistry);

this.edeCount = Gauge.build()
@@ -0,0 +1,44 @@
package de.muenchen.oss.digiwf.camunda.prometheus;

import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Summary;
import org.camunda.bpm.engine.impl.history.event.HistoricActivityInstanceEventEntity;
import org.springframework.context.event.EventListener;

public class HistoryEventReporter implements MetricsReporter {

private Summary camundaActivityTime;

@Override
public void registerMetrics(CollectorRegistry collectorRegistry) {
camundaActivityTime = Summary.build()
.quantile(0.5, 0.05)
.quantile(0.9, 0.01)
.quantile(0.99, 0.001)
.name("camunda_activity_execution_time_milliseconds")
.labelNames("processDefinitionKey", "activityType", "activityName")
.help("Duration of activities in milliseconds.")
.register(collectorRegistry);
}

/**
* Observes the duration of any ended Camunda activity.
*
* @param historyEvent the caught event
*/
@EventListener(condition = "#historyEvent != null && #historyEvent.eventType.equals('end')")
public void handleEvent(HistoricActivityInstanceEventEntity historyEvent) {

String activityName = historyEvent.getActivityName();
if (activityName == null || activityName.isEmpty()) {
activityName = historyEvent.getActivityId();
}

if (historyEvent.getDurationInMillis() != null) {
camundaActivityTime
.labels(historyEvent.getProcessDefinitionKey(), historyEvent.getActivityType(), activityName)
.observe(historyEvent.getDurationInMillis());
}
}

}
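The handler above is only reached if Camunda's history events are published as Spring application events (camunda.bpm.eventing.history in the Camunda Spring Boot starter, enabled by default). Purely as a hypothetical illustration, not part of this commit, the reporter can also be exercised directly with a synthetic end event:

package de.muenchen.oss.digiwf.camunda.prometheus;

import io.prometheus.client.CollectorRegistry;
import org.camunda.bpm.engine.impl.history.event.HistoricActivityInstanceEventEntity;

public class HistoryEventReporterUsageSketch {

    public static void main(final String[] args) {
        final CollectorRegistry registry = new CollectorRegistry();
        final HistoryEventReporter reporter = new HistoryEventReporter();
        reporter.registerMetrics(registry);

        // Synthetic 'end' event; all values below are made up for the example.
        final HistoricActivityInstanceEventEntity event = new HistoricActivityInstanceEventEntity();
        event.setEventType("end");
        event.setProcessDefinitionKey("exampleProcess");
        event.setActivityType("userTask");
        event.setActivityName("Approve request");
        event.setDurationInMillis(1234L);

        // Records 1234 ms into camunda_activity_execution_time_milliseconds with the
        // labels exampleProcess / userTask / Approve request.
        reporter.handleEvent(event);
    }
}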
@@ -3,26 +3,43 @@
import io.prometheus.client.CollectorRegistry;
import io.prometheus.client.Gauge;
import lombok.RequiredArgsConstructor;
import org.camunda.bpm.engine.RepositoryService;
import org.camunda.bpm.engine.RuntimeService;
import org.camunda.bpm.engine.repository.ProcessDefinition;

import static java.util.stream.Collectors.toMap;

@RequiredArgsConstructor
public class IncidentMetricsProvider implements MetricsProvider {

private final RuntimeService runtimeService;
private final RepositoryService repositoryService;
private static final String NOT_PROCESS_INCIDENT = "__no_process_definition_key";

private Gauge openIncidents;

@Override
public void updateMetrics() {
this.openIncidents.set(this.runtimeService.createIncidentQuery().count());
var processDefinitions = repositoryService.createProcessDefinitionQuery().list();
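// Pre-seed every deployed definition key (plus a fallback bucket for incidents without a
// process definition) with 0 so gauges drop back to zero once incidents are resolved.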
var incidentCounts = processDefinitions.stream().collect(toMap(ProcessDefinition::getKey, ignored -> 0L, Long::sum));
incidentCounts.put(NOT_PROCESS_INCIDENT, 0L);
var processDefinitionKeys = processDefinitions.stream().collect(toMap(ProcessDefinition::getId, ProcessDefinition::getKey));

runtimeService.createIncidentQuery().list().forEach(incident -> {
var processDefinitionKey = incident.getProcessDefinitionId() == null ? NOT_PROCESS_INCIDENT : processDefinitionKeys.get(incident.getProcessDefinitionId());
incidentCounts.merge(processDefinitionKey, 1L, Long::sum);
});
incidentCounts.forEach(
(processDefinitionKey, incidents) -> this.openIncidents.labels(processDefinitionKey).set(incidents));
}

@Override
public void registerMetrics(final CollectorRegistry collectorRegistry) {
this.openIncidents = Gauge.build()
.name("camunda_incidents_open")
.help("Number of open incidents.")
.register(collectorRegistry);
.name("camunda_incidents_open")
.labelNames("processDefinitionKey")
.help("Number of open incidents by process definition key.")
.register(collectorRegistry);
}

}
@@ -4,6 +4,7 @@
import io.prometheus.client.Gauge;
import lombok.RequiredArgsConstructor;
import org.camunda.bpm.engine.ManagementService;
import org.camunda.bpm.engine.management.Metrics;

import java.util.Date;

@@ -16,36 +17,48 @@ public class JobMetricsProvider implements MetricsProvider {
private Gauge jobsFuture;
private Gauge jobsNoRetries;
private Gauge jobsSuspended;
private Gauge jobsSuccess;
private Gauge jobsFailed;


@Override
public void updateMetrics() {
this.jobsExecutable.set(this.managementService.createJobQuery().executable().count());
this.jobsFuture.set(this.managementService.createJobQuery().duedateHigherThan(new Date()).count());
this.jobsNoRetries.set(this.managementService.createJobQuery().noRetriesLeft().count());
this.jobsSuspended.set(this.managementService.createJobQuery().suspended().count());
this.jobsSuccess.set(managementService.createMetricsQuery().name(Metrics.JOB_SUCCESSFUL).sum());
this.jobsFailed.set(managementService.createMetricsQuery().name(Metrics.JOB_FAILED).sum());

}

@Override
public void registerMetrics(final CollectorRegistry collectorRegistry) {
this.jobsExecutable = Gauge.build()
.name("camunda_jobs_executable")
.help("Number of jobs which are executable, ie. retries > 0 and due date is null or due date is in the past")
.register(collectorRegistry);
this.jobsFuture = Gauge.build()
.name("camunda_jobs_future")
.help("Number of jobs where the due date is in the future")
.register(collectorRegistry);
this.jobsNoRetries = Gauge.build()
.name("camunda_jobs_out_of_retries")
.help("Number of jobs with no retries left")
.register(collectorRegistry);
this.jobsSuspended = Gauge.build()
.name("camunda_jobs_suspended")
.help("Number of suspended jobs")
.register(collectorRegistry);
this.jobsSuccess = Gauge.build()
.name("camunda_jobs_successfully_executed")
.help("The number of jobs successfully executed.")
.register(collectorRegistry);
this.jobsFailed = Gauge.build()
.name("camunda_jobs_failed")
.help("The number of jobs that failed to execute and were submitted for retry.")
.register(collectorRegistry);

}

}
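Gauge-based providers such as JobMetricsProvider only refresh their values when updateMetrics() is called, so something must invoke them periodically at the configured update interval; in this PR that is the job of MetricsProviderSchedulerAutoConfiguration, whose contents are not shown in this excerpt. A rough, assumed sketch of such a driver:

package de.muenchen.oss.digiwf.camunda.prometheus;

import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

// Hypothetical scheduler: the real auto-configuration may rely on Spring scheduling and the
// digiwf.prometheus update-interval property instead of a hand-rolled executor.
public class MetricsProviderSchedulerSketch {

    private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();

    // Calls updateMetrics() on every enabled provider at a fixed rate.
    public void start(final List<MetricsProvider> providers, final long updateIntervalMillis) {
        executor.scheduleAtFixedRate(
                () -> providers.forEach(MetricsProvider::updateMetrics),
                0L, updateIntervalMillis, TimeUnit.MILLISECONDS);
    }

    public void stop() {
        executor.shutdown();
    }
}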