diff --git a/changelog/fragments/1726572104-enable-persistence-by-default.yaml b/changelog/fragments/1726572104-enable-persistence-by-default.yaml new file mode 100644 index 00000000000..e07766a099f --- /dev/null +++ b/changelog/fragments/1726572104-enable-persistence-by-default.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Enable persistence in the configuration provided with our OTel Collector distribution. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent,otel + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). 
+# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/cmd/otel.go b/internal/pkg/agent/cmd/otel.go index d23c308d30f..171369badbf 100644 --- a/internal/pkg/agent/cmd/otel.go +++ b/internal/pkg/agent/cmd/otel.go @@ -9,12 +9,15 @@ package cmd import ( "context" goerrors "errors" + "os" + "path/filepath" "sync" "github.com/spf13/cobra" "github.com/spf13/pflag" "github.com/elastic/elastic-agent-libs/service" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/errors" "github.com/elastic/elastic-agent/internal/pkg/cli" "github.com/elastic/elastic-agent/internal/pkg/otel" @@ -30,6 +33,9 @@ func newOtelCommandWithArgs(args []string, streams *cli.IOStreams) *cobra.Comman if err != nil { return err } + if err := prepareEnv(); err != nil { + return err + } return runCollector(cmd.Context(), cfgFiles) }, PreRun: func(c *cobra.Command, args []string) { @@ -118,3 +124,18 @@ func runCollector(cmdCtx context.Context, configFiles []string) error { return goerrors.Join(errs...) } + +func prepareEnv() error { + if _, ok := os.LookupEnv("STORAGE_DIR"); !ok { + // STORAGE_DIR is not set. Set it to ${path.Top()}/otel_registry because we do not want to use any of the paths, that are also used by Beats or Agent + // because a standalone OTel collector must be able to run alongside them without issue. + + // The filestorage extension will handle directory creation since create_directory: true is set by default. + // If the user hasn’t specified the env:STORAGE_DIR in filestorage, they may have opted for a custom path, and the extension will create the directory accordingly. + // In this case, setting env:STORAGE_DIR will have no effect. 
+ if err := os.Setenv("STORAGE_DIR", filepath.Join(paths.Top(), "otel_registry")); err != nil { + return err + } + } + return nil +} diff --git a/internal/pkg/otel/README.md b/internal/pkg/otel/README.md index 46edef344fc..081b42f63d6 100644 --- a/internal/pkg/otel/README.md +++ b/internal/pkg/otel/README.md @@ -77,4 +77,52 @@ This section provides a summary of components included in the Elastic Distributi | Component | Version | |---|---| -| [spanmetricsconnector](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/connector/spanmetricsconnector/v0.111.0/connector/spanmetricsconnector/README.md) | v0.111.0 | +| [spanmetricsconnector](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/connector/spanmetricsconnector/v0.111.0/connector/spanmetricsconnector/README.md) | v0.111.0 | +## Persistence in OpenTelemetry Collector + +By default, the OpenTelemetry Collector is stateless, which means it doesn't store offsets on disk while reading files. As a result, if you restart the collector, it won't retain the last read offset, potentially leading to data duplication or loss. However, we have configured persistence in the settings provided with the Elastic Agent package. + +To enable persistence for the `filelogreceiver`, we add the `file_storage` extension and activate it for `filelog`. +Execute `export STORAGE_DIR=/path/to/store/otel/offsets` and use the following configuration to enable persistence: + +```yaml +receivers: + filelog/platformlogs: + include: [ /var/log/system.log ] + start_at: beginning + storage: file_storage/filelogreceiver +extensions: + file_storage/filelogreceiver: + directory: ${env:STORAGE_DIR} + create_directory: true +exporters: + ... +processors: + ... +service: + extensions: [file_storage/filelogreceiver] + pipelines: + logs/platformlogs: + receivers: [filelog/platformlogs] + processors: [...] + exporters: [...] 
+``` + +> [!WARNING] +Removing the storage key from the filelog section will disable persistence, which will lead to data duplication or loss when the collector restarts. + +> [!IMPORTANT] +If you remove the `create_directory: true` option, you'll need to manually create a directory to store the data. You can ignore this option if the directory already exists. + +### Persistence in standalone Docker mode + +By default, when running Elastic Distribution for OpenTelemetry Collector in Docker, checkpoints are stored in `/usr/share/elastic-agent/otel_registry`. To ensure data persists across container restarts, you can use the following command: + +```bash +docker run --rm -ti --entrypoint="elastic-agent" --mount type=bind,source=/path/on/host,target=/usr/share/elastic-agent/otel_registry docker.elastic.co/beats/elastic-agent:9.0.0-SNAPSHOT otel +``` + +### Known issues: +- You face the following `failed to build extensions: failed to create extension "file_storage/filelogreceiver": mkdir ...: permission denied` error while running in otel mode + - Cause: This issue is likely because the user running the executable lacks sufficient permissions to create the directory. + - Resolution: You can either create the directory manually or specify a path with the necessary permissions. diff --git a/internal/pkg/otel/templates/README.md.tmpl b/internal/pkg/otel/templates/README.md.tmpl index 7216aec1c43..0343594e145 100644 --- a/internal/pkg/otel/templates/README.md.tmpl +++ b/internal/pkg/otel/templates/README.md.tmpl @@ -74,3 +74,53 @@ This section provides a summary of components included in the Elastic Distributi | [{{ .Name }}]({{ .Link }}) | {{ .Version }} | {{ end -}} {{ end -}} + + +## Persistence in OpenTelemetry Collector + +By default, the OpenTelemetry Collector is stateless, which means it doesn't store offsets on disk while reading files. 
As a result, if you restart the collector, it won't retain the last read offset, potentially leading to data duplication or loss. However, we have configured persistence in the settings provided with the Elastic Agent package. + +To enable persistence for the `filelogreceiver`, we add the `file_storage` extension and activate it for `filelog`. +Execute `export STORAGE_DIR=/path/to/store/otel/offsets` and use the following configuration to enable persistence: + +```yaml +receivers: + filelog/platformlogs: + include: [ /var/log/system.log ] + start_at: beginning + storage: file_storage/filelogreceiver +extensions: + file_storage/filelogreceiver: + directory: ${env:STORAGE_DIR} + create_directory: true +exporters: + ... +processors: + ... +service: + extensions: [file_storage/filelogreceiver] + pipelines: + logs/platformlogs: + receivers: [filelog/platformlogs] + processors: [...] + exporters: [...] +``` + +> [!WARNING] +Removing the storage key from the filelog section will disable persistence, which will lead to data duplication or loss when the collector restarts. + +> [!IMPORTANT] +If you remove the `create_directory: true` option, you'll need to manually create a directory to store the data. You can ignore this option if the directory already exists. + +### Persistence in standalone Docker mode + +By default, when running Elastic Distribution for OpenTelemetry Collector in Docker, checkpoints are stored in `/usr/share/elastic-agent/otel_registry`. 
To ensure data persists across container restarts, you can use the following command: + +```bash +docker run --rm -ti --entrypoint="elastic-agent" --mount type=bind,source=/path/on/host,target=/usr/share/elastic-agent/otel_registry docker.elastic.co/beats/elastic-agent:9.0.0-SNAPSHOT otel +``` + +### Known issues: +- You face the following `failed to build extensions: failed to create extension "file_storage/filelogreceiver": mkdir ...: permission denied` error while running in otel mode + - Cause: This issue is likely because the user running the executable lacks sufficient permissions to create the directory. + - Resolution: You can either create the directory manually or specify a path with the necessary permissions. diff --git a/otel.yml b/otel.yml index e21750f6e19..87dd26d5d97 100644 --- a/otel.yml +++ b/otel.yml @@ -2,6 +2,7 @@ receivers: filelog: include: [ /var/log/system.log ] start_at: beginning + storage: file_storage/filelogreceiver processors: resource: @@ -24,9 +25,12 @@ extensions: health_check: endpoint: "localhost:13133" path: "/health/status" + file_storage/filelogreceiver: + create_directory: true + directory: ${env:STORAGE_DIR} service: - extensions: [health_check, memory_limiter] + extensions: [health_check, memory_limiter, file_storage/filelogreceiver] pipelines: logs: receivers: [filelog]