From 48f27b37e515f21aaa36a147044ce30517be93ad Mon Sep 17 00:00:00 2001 From: Yiran Date: Mon, 6 Jan 2025 11:00:55 +0800 Subject: [PATCH] docs: integrate Kafka to open source GreptimeDB (#1420) --- blog/release-0-7-2.md | 2 + docs/greptimecloud/integrations/kafka.md | 5 + docs/user-guide/ingest-data/for-iot/kafka.md | 8 + .../ingest-data/for-observerbility/kafka.md | 172 ++++++++++++++++++ .../ingest-data/for-observerbility/vector.md | 10 +- docs/user-guide/integrations/kafka.md | 10 + docusaurus.config.ts | 2 +- .../greptimecloud/integrations/kafka.md | 5 + .../user-guide/ingest-data/for-iot/kafka.md | 9 + .../ingest-data/for-observerbility/kafka.md | 170 +++++++++++++++++ .../ingest-data/for-observerbility/vector.md | 12 +- .../current/user-guide/integrations/kafka.md | 9 + sidebars.ts | 3 + 13 files changed, 414 insertions(+), 3 deletions(-) create mode 100644 docs/user-guide/ingest-data/for-iot/kafka.md create mode 100644 docs/user-guide/ingest-data/for-observerbility/kafka.md create mode 100644 docs/user-guide/integrations/kafka.md create mode 100644 i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-iot/kafka.md create mode 100644 i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/kafka.md create mode 100644 i18n/zh/docusaurus-plugin-content-docs/current/user-guide/integrations/kafka.md diff --git a/blog/release-0-7-2.md b/blog/release-0-7-2.md index 6a0be4646..87c61ace7 100644 --- a/blog/release-0-7-2.md +++ b/blog/release-0-7-2.md @@ -2,6 +2,8 @@ keywords: [release, GreptimeDB, changelog, v0.7.2] description: GreptimeDB v0.7.2 Changelog date: 2024-04-08 +keywords: [release notes, greptimedb, 0.7.2] +description: GreptimeDB 0.7.2 release notes. --- # v0.7.2 diff --git a/docs/greptimecloud/integrations/kafka.md b/docs/greptimecloud/integrations/kafka.md index ae3f7016e..0dfe9bf61 100644 --- a/docs/greptimecloud/integrations/kafka.md +++ b/docs/greptimecloud/integrations/kafka.md @@ -63,3 +63,8 @@ username = "" password = "" tls = {} ``` + +## Reference + +For detailed information on the data ingestion process, please refer to the [Ingest Data via Kafka](https://docs.greptime.com/nightly/user-guide/ingest-data/for-observerbility/kafka) guide. + diff --git a/docs/user-guide/ingest-data/for-iot/kafka.md b/docs/user-guide/ingest-data/for-iot/kafka.md new file mode 100644 index 000000000..c7200e76f --- /dev/null +++ b/docs/user-guide/ingest-data/for-iot/kafka.md @@ -0,0 +1,8 @@ +--- +keywords: [Kafka, Data Ingestion] +description: Write data from Kafka to GreptimeDB. +--- + +# Kafka + +Please refer to the [Kafka documentation](/user-guide/ingest-data/for-observerbility/kafka.md) for instructions on how to ingest data from Kafka into GreptimeDB. diff --git a/docs/user-guide/ingest-data/for-observerbility/kafka.md b/docs/user-guide/ingest-data/for-observerbility/kafka.md new file mode 100644 index 000000000..5d5b9c61b --- /dev/null +++ b/docs/user-guide/ingest-data/for-observerbility/kafka.md @@ -0,0 +1,172 @@ +--- +keywords: [Kafka, data ingestion, observability, metrics, logs, JSON logs, text logs, Vector, InfluxDB line protocol] +description: Learn how to ingest observability data from Kafka into GreptimeDB using Vector. This guide covers metrics and logs ingestion, including JSON and text log formats, with detailed configuration examples. +--- + +# Kafka + +If you are using Kafka or Kafka-compatible message queue for observability data +transporting, it's possible to ingest data into GreptimeDB directly. 
+
+Here we use Vector as the tool to transport data from Kafka to GreptimeDB.
+
+## Metrics
+
+When ingesting metrics from Kafka into GreptimeDB, messages should be formatted in InfluxDB line protocol. For example:
+
+```txt
+census,location=klamath,scientist=anderson bees=23 1566086400000000000
+```
+
+Then configure Vector to use the `influxdb` decoding codec to process these messages.
+
+```toml
+[sources.metrics_mq]
+# Specifies that the source type is Kafka
+type = "kafka"
+# The consumer group ID for Kafka
+group_id = "vector0"
+# The list of Kafka topics to consume messages from
+topics = ["test_metric_topic"]
+# The address of the Kafka broker to connect to
+bootstrap_servers = "kafka:9092"
+# The `influxdb` codec means the messages are expected to be in InfluxDB line protocol format
+decoding.codec = "influxdb"
+
+[sinks.metrics_in]
+inputs = ["metrics_mq"]
+# Specifies that the sink type is `greptimedb_metrics`
+type = "greptimedb_metrics"
+# The endpoint of the GreptimeDB server.
+# Replace <host> with the actual hostname or IP address.
+endpoint = "<host>:4001"
+dbname = "<dbname>"
+username = "<username>"
+password = "<password>"
+tls = {}
+```
+
+For details on how InfluxDB line protocol metrics are mapped to GreptimeDB data, please refer to the [Data Model](/user-guide/ingest-data/for-iot/influxdb-line-protocol.md#data-model) section in the InfluxDB line protocol documentation.
+
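+To verify the metrics path end to end, you can publish a sample message to the topic before starting Vector. The snippet below is a minimal sketch that assumes the standard Kafka CLI tool `kafka-console-producer.sh` is available and that the broker address and topic match the configuration above; adjust the names for your environment.
+
+```bash
+# Publish one metric in InfluxDB line protocol to the topic consumed by Vector.
+# Assumes the Kafka CLI tools are on the PATH and the broker is reachable at kafka:9092.
+# (Older Kafka versions use --broker-list instead of --bootstrap-server.)
+echo 'census,location=klamath,scientist=anderson bees=23 1566086400000000000' | \
+  kafka-console-producer.sh --bootstrap-server kafka:9092 --topic test_metric_topic
+```
+
+Once Vector is running with the configuration above, the sample metric should become queryable in GreptimeDB as a `census` table.
+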
+## Logs
+
+Developers commonly work with two types of logs: JSON logs and plain text logs.
+Consider the following examples sent from Kafka.
+
+A plain text log:
+
+```txt
+127.0.0.1 - - [25/May/2024:20:16:37 +0000] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+```
+
+Or a JSON log:
+
+```json
+{
+  "timestamp": "2024-12-23T10:00:00Z",
+  "level": "INFO",
+  "message": "Service started"
+}
+```
+
+GreptimeDB transforms these logs into structured data with multiple columns and automatically creates the necessary tables.
+A pipeline parses each log into that structured form before it is ingested into GreptimeDB, and different log formats require different [Pipelines](/user-guide/logs/quick-start.md#write-logs-by-pipeline). See the following sections for details.
+
+### Logs with JSON format
+
+For logs in JSON format (e.g., `{"timestamp": "2024-12-23T10:00:00Z", "level": "INFO", "message": "Service started"}`),
+you can use the built-in [`greptime_identity`](/user-guide/logs/manage-pipelines.md#greptime_identity) pipeline for direct ingestion.
+This pipeline creates columns automatically based on the fields in your JSON log message.
+
+Simply configure Vector's `transforms` settings to parse the JSON message and use the `greptime_identity` pipeline as shown in the following example:
+
+```toml
+[sources.logs_in]
+type = "kafka"
+# The consumer group ID for Kafka
+group_id = "vector0"
+# The list of Kafka topics to consume messages from
+topics = ["test_log_topic"]
+# The address of the Kafka broker to connect to
+bootstrap_servers = "kafka:9092"
+
+# Transform the log to JSON format
+[transforms.logs_json]
+type = "remap"
+inputs = ["logs_in"]
+source = '''
+. = parse_json!(.message)
+'''
+
+[sinks.logs_out]
+# Specifies that this sink will receive data from the `logs_json` source
+inputs = ["logs_json"]
+# Specifies that the sink type is `greptimedb_logs`
+type = "greptimedb_logs"
+# The endpoint of the GreptimeDB server
+endpoint = "http://<host>:4000"
+compression = "gzip"
+# Replace <dbname>, <username>, and <password> with the actual values
+dbname = "<dbname>"
+username = "<username>"
+password = "<password>"
+# The table name in GreptimeDB, if it doesn't exist, it will be created automatically
+table = "demo_logs"
+# Use the built-in `greptime_identity` pipeline
+pipeline_name = "greptime_identity"
+```
+
+### Logs with text format
+
+For logs in text format, such as the access log format below, you'll need to create a custom pipeline to parse them:
+
+```
+127.0.0.1 - - [25/May/2024:20:16:37 +0000] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+```
+
+#### Create a pipeline
+
+To create a custom pipeline,
+please refer to the [Create Pipeline](/user-guide/logs/quick-start.md#create-a-pipeline)
+and [Pipeline Configuration](/user-guide/logs/pipeline-config.md) documentation for detailed instructions.
+
+#### Ingest data
+
+After creating the pipeline, specify its name in the `pipeline_name` field of the Vector configuration file.
+
+```toml
+# sample.toml
+[sources.log_mq]
+# Specifies that the source type is Kafka
+type = "kafka"
+# The consumer group ID for Kafka
+group_id = "vector0"
+# The list of Kafka topics to consume messages from
+topics = ["test_log_topic"]
+# The address of the Kafka broker to connect to
+bootstrap_servers = "kafka:9092"
+
+[sinks.sink_greptime_logs]
+# Specifies that the sink type is `greptimedb_logs`
+type = "greptimedb_logs"
+# Specifies that this sink will receive data from the `log_mq` source
+inputs = [ "log_mq" ]
+# Use `gzip` compression to save bandwidth
+compression = "gzip"
+# The endpoint of the GreptimeDB server
+# Replace <host> with the actual hostname or IP address
+endpoint = "http://<host>:4000"
+dbname = "<dbname>"
+username = "<username>"
+password = "<password>"
+# The table name in GreptimeDB, if it doesn't exist, it will be created automatically
+table = "demo_logs"
+# The custom pipeline name that you created
+pipeline_name = "your_custom_pipeline"
+```
+
+## Demo
+
+For a runnable demo of data transformation and ingestion, please refer to the [Kafka Ingestion Demo](https://github.com/GreptimeTeam/demo-scene/tree/main/kafka-ingestion).
+
diff --git a/docs/user-guide/ingest-data/for-observerbility/vector.md b/docs/user-guide/ingest-data/for-observerbility/vector.md
index 18785a8d7..566753c6d 100644
--- a/docs/user-guide/ingest-data/for-observerbility/vector.md
+++ b/docs/user-guide/ingest-data/for-observerbility/vector.md
@@ -11,7 +11,7 @@ sink.
 With vector, you can ingest metrics data from various sources, including
 Prometheus, OpenTelemetry, StatsD and many more. GreptimeDB can be used as a
 Vector Sink component to receive metrics.
 
-## Collect metrics +## Collect host metrics ### Configuration @@ -60,3 +60,11 @@ The following rules are used when storing Vector metrics into GreptimeDB: - For AggregatedSummary metrics, the values of each percentile are stored in the `pxx` column, where xx is the percentile, and the `sum/count` columns are also stored; - For Sketch metrics, the values of each percentile are stored in the `pxx` column, where xx is the percentile, and the `min/max/avg/sum` columns are also stored; +## Collect metrics with InfluxDB line protocol format + +Vector can collect metrics in the InfluxDB line protocol format and send them to GreptimeDB. For more information, refer to the [Kafka guide](/user-guide/ingest-data/for-observerbility/kafka.md#metrics). + +## Collect logs + +Vector can also collect logs and send them to GreptimeDB. For more details, refer to the [Kafka guide](/user-guide/ingest-data/for-observerbility/kafka.md#logs). + diff --git a/docs/user-guide/integrations/kafka.md b/docs/user-guide/integrations/kafka.md new file mode 100644 index 000000000..67c04764e --- /dev/null +++ b/docs/user-guide/integrations/kafka.md @@ -0,0 +1,10 @@ +--- +keywords: [Kafka, data ingestion, observability, metrics, logs] +description: Learn how to ingest observability data from Kafka into GreptimeDB using Vector. +--- + +# Kafka + +Vector can be used as a tool to transport data from Kafka to GreptimeDB. +For more information, please refer to the [Ingest Data via Kafka](/user-guide/ingest-data/for-observerbility/kafka.md) guide. + diff --git a/docusaurus.config.ts b/docusaurus.config.ts index 5485bf731..30018ddd4 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -207,7 +207,7 @@ const config: Config = { prism: { theme: prismThemes.github, darkTheme: prismThemes.dracula, - additionalLanguages: ['java'], + additionalLanguages: ['java', 'toml'], }, algolia: algoliaMap[locale] //, diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/greptimecloud/integrations/kafka.md b/i18n/zh/docusaurus-plugin-content-docs/current/greptimecloud/integrations/kafka.md index d2b7ace2e..30108f9ef 100644 --- a/i18n/zh/docusaurus-plugin-content-docs/current/greptimecloud/integrations/kafka.md +++ b/i18n/zh/docusaurus-plugin-content-docs/current/greptimecloud/integrations/kafka.md @@ -60,3 +60,8 @@ username = "" password = "" tls = {} ``` + +## 参考文档 + +请参考[通过 Kafka 写入数据](https://docs.greptime.cn/nightly/user-guide/ingest-data/for-observerbility/kafka)获取数据写入过程的详细信息。 + diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-iot/kafka.md b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-iot/kafka.md new file mode 100644 index 000000000..bde191ee7 --- /dev/null +++ b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-iot/kafka.md @@ -0,0 +1,9 @@ +--- +keywords: [Kafka, 数据写入] +description: 将数据从 Kafka 写入到 GreptimeDB. 
+---
+
+# Kafka
+
+请参考 [Kafka 文档](/user-guide/ingest-data/for-observerbility/kafka.md)了解如何将数据从 Kafka 写入到 GreptimeDB。
+
diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/kafka.md b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/kafka.md
new file mode 100644
index 000000000..1cd1a3667
--- /dev/null
+++ b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/kafka.md
@@ -0,0 +1,170 @@
+---
+keywords: [Kafka, 数据提取, 可观察性, 指标, 日志, JSON 日志, 文本日志, Vector, InfluxDB 行协议]
+description: 了解如何使用 Vector 将可观察性数据从 Kafka 写入到 GreptimeDB。本指南涵盖指标和日志提取,包括 JSON 和文本日志格式,并附有详细的配置示例。
+---
+
+# Kafka
+
+如果你使用 Kafka 或兼容 Kafka 的消息队列进行可观测性数据传输,可以直接将数据写入到 GreptimeDB 中。
+
+这里我们使用 Vector 作为工具将数据从 Kafka 传输到 GreptimeDB。
+
+## 指标
+
+从 Kafka 写入指标到 GreptimeDB 时,消息应采用 InfluxDB 行协议格式。例如:
+
+```txt
+census,location=klamath,scientist=anderson bees=23 1566086400000000000
+```
+
+然后配置 Vector 使用 `influxdb` 解码器来处理这些消息。
+
+```toml
+[sources.metrics_mq]
+# 指定源类型为 Kafka
+type = "kafka"
+# Kafka 的消费者组 ID
+group_id = "vector0"
+# 要消费消息的 Kafka 主题列表
+topics = ["test_metric_topic"]
+# 要连接的 Kafka 地址
+bootstrap_servers = "kafka:9092"
+# `influxdb` 表示消息应采用 InfluxDB 行协议格式
+decoding.codec = "influxdb"
+
+[sinks.metrics_in]
+inputs = ["metrics_mq"]
+# 指定接收器类型为 `greptimedb_metrics`
+type = "greptimedb_metrics"
+# GreptimeDB 服务器的端点
+# 将 <host> 替换为实际的主机名或 IP 地址
+endpoint = "<host>:4001"
+dbname = "<dbname>"
+username = "<username>"
+password = "<password>"
+tls = {}
+```
+
+有关 InfluxDB 行协议指标如何映射到 GreptimeDB 数据的详细信息,请参阅 InfluxDB 行协议文档中的[数据模型](/user-guide/ingest-data/for-iot/influxdb-line-protocol.md#数据模型)部分。
+
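+如果想端到端地验证指标链路,可以在启动 Vector 之前先向上面配置的主题发送一条示例消息。下面是一个最小示例,假设你已安装标准的 Kafka 命令行工具 `kafka-console-producer.sh`,并且 broker 地址和主题与上面的配置一致,请根据实际环境调整:
+
+```bash
+# 向 Vector 消费的主题发送一条 InfluxDB 行协议格式的指标消息
+# 假设 Kafka 命令行工具已在 PATH 中,且 broker 地址为 kafka:9092
+# (较旧的 Kafka 版本使用 --broker-list 代替 --bootstrap-server)
+echo 'census,location=klamath,scientist=anderson bees=23 1566086400000000000' | \
+  kafka-console-producer.sh --bootstrap-server kafka:9092 --topic test_metric_topic
+```
+
+Vector 使用上述配置运行后,这条示例指标应当可以在 GreptimeDB 的 `census` 表中查询到。
+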
+## 日志
+
+开发人员通常处理两种类型的日志:JSON 日志和纯文本日志。
+例如以下从 Kafka 发送的日志示例。
+
+纯文本日志:
+
+```txt
+127.0.0.1 - - [25/May/2024:20:16:37 +0000] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+```
+
+或 JSON 日志:
+
+```json
+{
+  "timestamp": "2024-12-23T10:00:00Z",
+  "level": "INFO",
+  "message": "Service started"
+}
+```
+
+GreptimeDB 将这些日志转换为具有多个列的结构化数据,并自动创建必要的表。
+Pipeline 在写入到 GreptimeDB 之前将日志处理为结构化数据。
+不同的日志格式需要不同的 [Pipeline](/user-guide/logs/quick-start.md#write-logs-by-pipeline) 来解析,详情请继续阅读下面的内容。
+
+### JSON 格式的日志
+
+对于 JSON 格式的日志(例如 `{"timestamp": "2024-12-23T10:00:00Z", "level": "INFO", "message": "Service started"}`),
+你可以使用内置的 [`greptime_identity`](/user-guide/logs/manage-pipelines.md#greptime_identity) pipeline 直接写入日志。
+此 pipeline 根据 JSON 日志消息中的字段自动创建列。
+
+你只需要配置 Vector 的 `transforms` 设置以解析 JSON 消息,并使用 `greptime_identity` pipeline,如以下示例所示:
+
+```toml
+[sources.logs_in]
+type = "kafka"
+# Kafka 的消费者组 ID
+group_id = "vector0"
+# 要消费消息的 Kafka 主题列表
+topics = ["test_log_topic"]
+# 要连接的 Kafka 代理地址
+bootstrap_servers = "kafka:9092"
+
+# 将日志转换为 JSON 格式
+[transforms.logs_json]
+type = "remap"
+inputs = ["logs_in"]
+source = '''
+. = parse_json!(.message)
+'''
+
+[sinks.logs_out]
+# 指定此接收器将接收来自 `logs_json` 源的数据
+inputs = ["logs_json"]
+# 指定接收器类型为 `greptimedb_logs`
+type = "greptimedb_logs"
+# GreptimeDB 服务器的端点
+endpoint = "http://<host>:4000"
+compression = "gzip"
+# 将 <dbname>、<username> 和 <password> 替换为实际值
+dbname = "<dbname>"
+username = "<username>"
+password = "<password>"
+# GreptimeDB 中的表名,如果不存在,将自动创建
+table = "demo_logs"
+# 使用内置的 `greptime_identity` 管道
+pipeline_name = "greptime_identity"
+```
+
+### 文本格式的日志
+
+对于文本格式的日志,例如下面的访问日志格式,你需要创建自定义 pipeline 来解析它们:
+
+```
+127.0.0.1 - - [25/May/2024:20:16:37 +0000] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+```
+
+#### 创建 pipeline
+
+要创建自定义 pipeline,
+请参阅[创建 pipeline](/user-guide/logs/quick-start.md#创建-pipeline) 和 [pipeline 配置](/user-guide/logs/pipeline-config.md)文档获取详细说明。
+
+#### 写入数据
+
+创建 pipeline 后,将其名称配置到 Vector 配置文件的 `pipeline_name` 字段。
+
+```toml
+# sample.toml
+[sources.log_mq]
+# 指定源类型为 Kafka
+type = "kafka"
+# Kafka 的消费者组 ID
+group_id = "vector0"
+# 要消费消息的 Kafka 主题列表
+topics = ["test_log_topic"]
+# 要连接的 Kafka 地址
+bootstrap_servers = "kafka:9092"
+
+[sinks.sink_greptime_logs]
+# 指定接收器类型为 `greptimedb_logs`
+type = "greptimedb_logs"
+# 指定此接收器将接收来自 `log_mq` 源的数据
+inputs = [ "log_mq" ]
+# 使用 `gzip` 压缩以节省带宽
+compression = "gzip"
+# GreptimeDB 服务器的端点
+# 将 <host> 替换为实际的主机名或 IP 地址
+endpoint = "http://<host>:4000"
+dbname = "<dbname>"
+username = "<username>"
+password = "<password>"
+# GreptimeDB 中的表名,如果不存在,将自动创建
+table = "demo_logs"
+# 你创建的自定义管道名称
+pipeline_name = "your_custom_pipeline"
+```
+
+## Demo
+
+有关数据转换和写入的可运行演示,请参阅 [Kafka Ingestion Demo](https://github.com/GreptimeTeam/demo-scene/tree/main/kafka-ingestion)。
+
diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/vector.md b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/vector.md
index 91214dda0..f987406ab 100644
--- a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/vector.md
+++ b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/ingest-data/for-observerbility/vector.md
@@ -10,7 +10,7 @@ Vector 是高性能的可观测数据管道。
 通过 Vector,你可以从各种来源接收指标数据,包括 Prometheus、OpenTelemetry、StatsD 等。
 GreptimeDB 可以作为 Vector 的 Sink 组件来接收指标数据。
 
-## 收集指标
+## 收集主机指标
 
 ### 配置
 
@@ -57,3 +57,13 @@ vector -c sample.toml
 - AggregatedHistoragm 类型,每个 bucket 的数值将被存入 `bxx` 列,其中 xx 是 bucket 数值的上限,此外我们还会记录 `sum/count` 列;
 - AggregatedSummary 类型,各个百分位数值点分别存入 `pxx` 列,其中 xx 是 quantile 数值,此外我们还会记录 `sum/count` 列;
 - Sketch 类型,各个百分位数值点分别存入 `pxx` 列,其中 xx 是 quantile 数值,此外我们还会记录 `min/max/avg/sum` 列;
+
+## 收集 InfluxDB 行协议格式的指标
+
+Vector 可以收集 InfluxDB 行协议格式的指标并将其发送到 GreptimeDB。更多信息请参考 [Kafka 指南](/user-guide/ingest-data/for-observerbility/kafka.md#指标)。
+
+
+## 收集日志
+
+Vector 可以收集日志并发送到 GreptimeDB。更多信息请参考 [Kafka 指南](/user-guide/ingest-data/for-observerbility/kafka.md#日志)。
+
diff --git a/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/integrations/kafka.md b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/integrations/kafka.md
new file mode 100644
index 000000000..c3eacdc5c
--- /dev/null
+++ b/i18n/zh/docusaurus-plugin-content-docs/current/user-guide/integrations/kafka.md
@@ -0,0 +1,9 @@
+---
+keywords: [Kafka, 数据传输, 可观测性, 指标, 日志]
+description: 从 Kafka 写入数据到 GreptimeDB。
+---
+
+# Kafka
+
+你可以使用 Vector 作为从 Kafka 到 GreptimeDB 的数据传输工具。
+请前往[通过 Kafka 写入数据](/user-guide/ingest-data/for-observerbility/kafka.md)了解更多信息。
diff --git a/sidebars.ts b/sidebars.ts
index 6702c944f..088ee3f22 100644
--- a/sidebars.ts
+++ 
b/sidebars.ts @@ -57,6 +57,7 @@ const sidebars: SidebarsConfig = { 'user-guide/ingest-data/for-observerbility/vector', 'user-guide/ingest-data/for-observerbility/opentelemetry', 'user-guide/ingest-data/for-observerbility/influxdb-line-protocol', + 'user-guide/ingest-data/for-observerbility/kafka', 'user-guide/ingest-data/for-observerbility/loki', 'user-guide/ingest-data/for-observerbility/alloy', ], @@ -77,6 +78,7 @@ const sidebars: SidebarsConfig = { ], }, 'user-guide/ingest-data/for-iot/influxdb-line-protocol', + 'user-guide/ingest-data/for-iot/kafka', 'user-guide/ingest-data/for-iot/emqx', 'user-guide/ingest-data/for-iot/opentsdb', ], @@ -103,6 +105,7 @@ const sidebars: SidebarsConfig = { 'user-guide/integrations/overview', 'user-guide/integrations/prometheus', 'user-guide/integrations/vector', + 'user-guide/integrations/kafka', 'user-guide/integrations/grafana', 'user-guide/integrations/superset', 'user-guide/integrations/metabase',