From 2a9f34962d9085787d4c80cd6546ff89ddca7d6d Mon Sep 17 00:00:00 2001 From: ahrav Date: Thu, 7 Sep 2023 09:03:37 -0700 Subject: [PATCH] Add optional param to Chunks (#1747) * Add interface for targeted chunking. * use optional args. * update Chunks method signature. * update tests. * fix test. * update QueryCriteria type. --- pkg/sources/circleci/circleci.go | 2 +- pkg/sources/docker/docker.go | 2 +- pkg/sources/filesystem/filesystem.go | 2 +- pkg/sources/gcs/gcs.go | 2 +- pkg/sources/git/git.go | 2 +- pkg/sources/github/github.go | 2 +- pkg/sources/gitlab/gitlab.go | 2 +- pkg/sources/s3/s3.go | 2 +- pkg/sources/source_manager_test.go | 19 +++++++++++-------- pkg/sources/sources.go | 18 ++++++++++++++++-- pkg/sources/syslog/syslog.go | 2 +- 11 files changed, 36 insertions(+), 19 deletions(-) diff --git a/pkg/sources/circleci/circleci.go b/pkg/sources/circleci/circleci.go index 2ae7e6b3b805..301f102a135a 100644 --- a/pkg/sources/circleci/circleci.go +++ b/pkg/sources/circleci/circleci.go @@ -75,7 +75,7 @@ func (s *Source) Init(_ context.Context, name string, jobId, sourceId int64, ver } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { projects, err := s.projects(ctx) if err != nil { return fmt.Errorf("error getting projects: %w", err) diff --git a/pkg/sources/docker/docker.go b/pkg/sources/docker/docker.go index 667dbd60001c..8bd5d48d2b57 100644 --- a/pkg/sources/docker/docker.go +++ b/pkg/sources/docker/docker.go @@ -85,7 +85,7 @@ type layerInfo struct { } // Chunks emits data over a channel that is decoded and scanned for secrets. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { ctx = context.WithValues(ctx, "source_type", s.Type(), "source_name", s.name) workers := new(errgroup.Group) diff --git a/pkg/sources/filesystem/filesystem.go b/pkg/sources/filesystem/filesystem.go index 49f504006f2e..2af668a5898c 100644 --- a/pkg/sources/filesystem/filesystem.go +++ b/pkg/sources/filesystem/filesystem.go @@ -77,7 +77,7 @@ func (s *Source) WithFilter(filter *common.Filter) { } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { for i, path := range s.paths { logger := ctx.Logger().WithValues("path", path) if common.IsDone(ctx) { diff --git a/pkg/sources/gcs/gcs.go b/pkg/sources/gcs/gcs.go index dfb0a40f3a93..ab867c3b5c9d 100644 --- a/pkg/sources/gcs/gcs.go +++ b/pkg/sources/gcs/gcs.go @@ -248,7 +248,7 @@ func (s *Source) enumerate(ctx context.Context) error { } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { persistableCache := s.setupCache(ctx) objectCh, err := s.gcsManager.ListObjects(ctx) diff --git a/pkg/sources/git/git.go b/pkg/sources/git/git.go index 76301cbae94b..81560cbdb6b5 100644 --- a/pkg/sources/git/git.go +++ b/pkg/sources/git/git.go @@ -153,7 +153,7 @@ func (s *Source) Init(aCtx context.Context, name string, jobId, sourceId int64, } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { if err := s.scanRepos(ctx, chunksChan); err != nil { return err } diff --git a/pkg/sources/github/github.go b/pkg/sources/github/github.go index 3b5d1f016cf5..511a0ec84351 100644 --- a/pkg/sources/github/github.go +++ b/pkg/sources/github/github.go @@ -413,7 +413,7 @@ func (s *Source) visibilityOf(ctx context.Context, repoURL string) (visibility s } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { apiEndpoint := s.conn.Endpoint if len(apiEndpoint) == 0 || endsWithGithub.MatchString(apiEndpoint) { apiEndpoint = "https://api.github.com" diff --git a/pkg/sources/gitlab/gitlab.go b/pkg/sources/gitlab/gitlab.go index 9cfacea4dfff..06c5f5d00d67 100644 --- a/pkg/sources/gitlab/gitlab.go +++ b/pkg/sources/gitlab/gitlab.go @@ -139,7 +139,7 @@ func (s *Source) Init(_ context.Context, name string, jobId, sourceId int64, ver } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { // Start client. apiClient, err := s.newClient() if err != nil { diff --git a/pkg/sources/s3/s3.go b/pkg/sources/s3/s3.go index 07b1a19dd067..a9f80732e388 100644 --- a/pkg/sources/s3/s3.go +++ b/pkg/sources/s3/s3.go @@ -232,7 +232,7 @@ func (s *Source) scanBuckets(ctx context.Context, client *s3.S3, role string, bu } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { visitor := func(c context.Context, defaultRegionClient *s3.S3, roleArn string, buckets []string) { s.scanBuckets(c, defaultRegionClient, roleArn, buckets, chunksChan) } diff --git a/pkg/sources/source_manager_test.go b/pkg/sources/source_manager_test.go index 9a196a897bb5..31378eb02dfd 100644 --- a/pkg/sources/source_manager_test.go +++ b/pkg/sources/source_manager_test.go @@ -6,10 +6,11 @@ import ( "testing" "github.com/stretchr/testify/assert" + "google.golang.org/protobuf/types/known/anypb" + "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/context" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb" - "google.golang.org/protobuf/types/known/anypb" ) // DummySource implements Source and is used for testing a SourceManager. @@ -31,7 +32,7 @@ func (d *DummySource) GetProgress() *Progress { return nil } // Interface to easily test different chunking methods. type chunker interface { - Chunks(context.Context, chan *Chunk) error + Chunks(context.Context, chan *Chunk, ...ChunkingTarget) error ChunkUnit(ctx context.Context, unit SourceUnit, reporter ChunkReporter) error Enumerate(ctx context.Context, reporter UnitReporter) error } @@ -42,7 +43,7 @@ type counterChunker struct { count int } -func (c *counterChunker) Chunks(ctx context.Context, ch chan *Chunk) error { +func (c *counterChunker) Chunks(ctx context.Context, ch chan *Chunk, _ ...ChunkingTarget) error { for i := 0; i < c.count; i++ { select { case ch <- &Chunk{Data: []byte{c.chunkCounter}}: @@ -75,9 +76,9 @@ func (c *counterChunker) ChunkUnit(ctx context.Context, unit SourceUnit, reporte // Chunk method that always returns an error. type errorChunker struct{ error } -func (c errorChunker) Chunks(context.Context, chan *Chunk) error { return c } -func (c errorChunker) Enumerate(context.Context, UnitReporter) error { return c } -func (c errorChunker) ChunkUnit(context.Context, SourceUnit, ChunkReporter) error { return c } +func (c errorChunker) Chunks(context.Context, chan *Chunk, ...ChunkingTarget) error { return c } +func (c errorChunker) Enumerate(context.Context, UnitReporter) error { return c } +func (c errorChunker) ChunkUnit(context.Context, SourceUnit, ChunkReporter) error { return c } // enrollDummy is a helper function to enroll a DummySource with a SourceManager. func enrollDummy(mgr *SourceManager, chunkMethod chunker) (handle, error) { @@ -176,7 +177,7 @@ type unitChunk struct { type unitChunker struct{ steps []unitChunk } -func (c *unitChunker) Chunks(ctx context.Context, ch chan *Chunk) error { +func (c *unitChunker) Chunks(ctx context.Context, ch chan *Chunk, _ ...ChunkingTarget) error { for _, step := range c.steps { if step.err != "" { continue @@ -294,7 +295,9 @@ type callbackChunker struct { cb func(context.Context, chan *Chunk) error } -func (c callbackChunker) Chunks(ctx context.Context, ch chan *Chunk) error { return c.cb(ctx, ch) } +func (c callbackChunker) Chunks(ctx context.Context, ch chan *Chunk, _ ...ChunkingTarget) error { + return c.cb(ctx, ch) +} func (c callbackChunker) Enumerate(context.Context, UnitReporter) error { return nil } func (c callbackChunker) ChunkUnit(context.Context, SourceUnit, ChunkReporter) error { return nil } diff --git a/pkg/sources/sources.go b/pkg/sources/sources.go index 07da7ccaf521..eba823a6cfc4 100644 --- a/pkg/sources/sources.go +++ b/pkg/sources/sources.go @@ -30,6 +30,16 @@ type Chunk struct { Verify bool } +// ChunkingTarget specifies criteria for a targeted chunking process. +// Instead of collecting data indiscriminately, this struct allows the caller +// to specify particular subsets of data they're interested in. This becomes +// especially useful when one needs to verify or recheck specific data points +// without processing the entire dataset. +type ChunkingTarget struct { + // QueryCriteria represents specific parameters or conditions to target the chunking process. + QueryCriteria source_metadatapb.MetaData +} + // Source defines the interface required to implement a source chunker. type Source interface { // Type returns the source type, used for matching against configuration and jobs. @@ -40,8 +50,12 @@ type Source interface { JobID() int64 // Init initializes the source. Init(aCtx context.Context, name string, jobId, sourceId int64, verify bool, connection *anypb.Any, concurrency int) error - // Chunks emits data over a channel that is decoded and scanned for secrets. - Chunks(ctx context.Context, chunksChan chan *Chunk) error + // Chunks emits data over a channel which is then decoded and scanned for secrets. + // By default, data is obtained indiscriminately. However, by providing one or more + // ChunkingTarget parameters, the caller can direct the function to retrieve + // specific chunks of data. This targeted approach allows for efficient and + // intentional data processing, beneficial when verifying or rechecking specific data points. + Chunks(ctx context.Context, chunksChan chan *Chunk, targets ...ChunkingTarget) error // GetProgress is the completion progress (percentage) for Scanned Source. GetProgress() *Progress } diff --git a/pkg/sources/syslog/syslog.go b/pkg/sources/syslog/syslog.go index d9be9d11752c..76814b03ed3d 100644 --- a/pkg/sources/syslog/syslog.go +++ b/pkg/sources/syslog/syslog.go @@ -182,7 +182,7 @@ func (s *Source) verifyConnectionConfig() error { } // Chunks emits chunks of bytes over a channel. -func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk) error { +func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error { switch { case s.conn.TlsCert != nilString || s.conn.TlsKey != nilString: cert, err := tls.X509KeyPair([]byte(s.conn.TlsCert), []byte(s.conn.TlsKey))