optimize sequence number calculation and reduce search requests in do…

…c level monitor execution (opensearch-project#1445) * optimize sequence number calculation and reduce search requests by n where n is number of shards being queried in the executino Signed-off-by: Surya Sashank Nistala <snistala@amazon.com> * fix tests Signed-off-by: Surya Sashank Nistala <snistala@amazon.com> * optimize check indices and execute to query only write index of aliases and datastreams during monitor creation Signed-off-by: Surya Sashank Nistala <snistala@amazon.com> * fix test Signed-off-by: Surya Sashank Nistala <snistala@amazon.com> * add javadoc Signed-off-by: Surya Sashank Nistala <snistala@amazon.com> * add tests to verify seq_no calculation Signed-off-by: Surya Sashank Nistala <snistala@amazon.com> --------- Signed-off-by: Surya Sashank Nistala <snistala@amazon.com>
engechas · Mar 14, 2024 · 9824ef0 · 9824ef0
1 parent 313248a
commit 9824ef0
Show file tree

Hide file tree

Showing 9 changed files with 298 additions and 112 deletions.
diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/AlertingPlugin.kt b/alerting/src/main/kotlin/org/opensearch/alerting/AlertingPlugin.kt
@@ -39,6 +39,7 @@ import org.opensearch.alerting.resthandler.RestSearchEmailGroupAction
 import org.opensearch.alerting.resthandler.RestSearchMonitorAction
 import org.opensearch.alerting.script.TriggerScript
 import org.opensearch.alerting.settings.AlertingSettings
+import org.opensearch.alerting.settings.AlertingSettings.Companion.DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE
 import org.opensearch.alerting.settings.DestinationSettings
 import org.opensearch.alerting.settings.LegacyOpenDistroAlertingSettings
 import org.opensearch.alerting.settings.LegacyOpenDistroDestinationSettings
@@ -276,6 +277,7 @@ internal class AlertingPlugin : PainlessExtension, ActionPlugin, ScriptPlugin, R
             AlertingSettings.ALERT_HISTORY_RETENTION_PERIOD,
             AlertingSettings.ALERTING_MAX_MONITORS,
             AlertingSettings.PERCOLATE_QUERY_DOCS_SIZE_MEMORY_PERCENTAGE_LIMIT,
+            DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE,
             AlertingSettings.PERCOLATE_QUERY_MAX_NUM_DOCS_IN_MEMORY,
             AlertingSettings.REQUEST_TIMEOUT,
             AlertingSettings.MAX_ACTION_THROTTLE_VALUE,

diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/DocumentLevelMonitorRunner.kt b/alerting/src/main/kotlin/org/opensearch/alerting/DocumentLevelMonitorRunner.kt
@@ -14,8 +14,8 @@ import org.opensearch.action.search.SearchAction
 import org.opensearch.action.search.SearchRequest
 import org.opensearch.action.search.SearchResponse
 import org.opensearch.action.support.WriteRequest
-import org.opensearch.alerting.model.DocumentExecutionContext
 import org.opensearch.alerting.model.DocumentLevelTriggerRunResult
+import org.opensearch.alerting.model.IndexExecutionContext
 import org.opensearch.alerting.model.InputRunResults
 import org.opensearch.alerting.model.MonitorMetadata
 import org.opensearch.alerting.model.MonitorRunResult
@@ -47,6 +47,8 @@ import org.opensearch.index.IndexNotFoundException
 import org.opensearch.index.query.BoolQueryBuilder
 import org.opensearch.index.query.Operator
 import org.opensearch.index.query.QueryBuilders
+import org.opensearch.index.seqno.SequenceNumbers
+import org.opensearch.indices.IndexClosedException
 import org.opensearch.percolator.PercolateQueryBuilderExt
 import org.opensearch.rest.RestStatus
 import org.opensearch.search.SearchHit
@@ -190,7 +192,7 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
                     }
 
                     // Prepare updatedLastRunContext for each index
-                    val indexUpdatedRunContext = updateLastRunContext(
+                    val indexUpdatedRunContext = initializeNewLastRunContext(
                         indexLastRunContext.toMutableMap(),
                         monitorCtx,
                         concreteIndexName,
@@ -238,24 +240,28 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
                                     "${fieldsToBeQueried.joinToString()} instead of entire _source of documents"
                             )
                     }
-
-                    // Prepare DocumentExecutionContext for each index
-                    val docExecutionContext = DocumentExecutionContext(queries, indexLastRunContext, indexUpdatedRunContext)
+                    val indexExecutionContext = IndexExecutionContext(
+                        queries,
+                        indexLastRunContext,
+                        indexUpdatedRunContext,
+                        updatedIndexName,
+                        concreteIndexName,
+                        conflictingFields.toList(),
+                    )
 
                     fetchShardDataAndMaybeExecutePercolateQueries(
                         monitor,
                         monitorCtx,
-                        docExecutionContext,
-                        updatedIndexName,
-                        concreteIndexName,
-                        conflictingFields.toList(),
+                        indexExecutionContext,
                         monitorMetadata,
                         inputRunResults,
                         docsToQueries,
                         updatedIndexNames,
                         concreteIndicesSeenSoFar,
                         ArrayList(fieldsToBeQueried)
-                    )
+                    ) { shard, maxSeqNo -> // function passed to update last run context with new max sequence number
+                        indexExecutionContext.updatedLastRunContext[shard] = maxSeqNo
+                    }
                 }
             }
             /* if all indices are covered still in-memory docs size limit is not breached we would need to submit
@@ -468,7 +474,7 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         return finding.id
     }
 
-    private suspend fun updateLastRunContext(
+    private fun initializeNewLastRunContext(
         lastRunContext: Map<String, Any>,
         monitorCtx: MonitorRunnerExecutionContext,
         index: String,
@@ -477,8 +483,7 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         val updatedLastRunContext = lastRunContext.toMutableMap()
         for (i: Int in 0 until count) {
             val shard = i.toString()
-            val maxSeqNo: Long = getMaxSeqNo(monitorCtx.client!!, index, shard)
-            updatedLastRunContext[shard] = maxSeqNo.toString()
+            updatedLastRunContext[shard] = SequenceNumbers.UNASSIGNED_SEQ_NO.toString()
         }
         return updatedLastRunContext
     }
@@ -497,25 +502,6 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         }
     }
 
-    suspend fun createRunContext(
-        clusterService: ClusterService,
-        client: Client,
-        index: String,
-        createdRecently: Boolean = false
-    ): HashMap<String, Any> {
-        val lastRunContext = HashMap<String, Any>()
-        lastRunContext["index"] = index
-        val count = getShardsCount(clusterService, index)
-        lastRunContext["shards_count"] = count
-
-        for (i: Int in 0 until count) {
-            val shard = i.toString()
-            val maxSeqNo: Long = if (createdRecently) -1L else getMaxSeqNo(client, index, shard)
-            lastRunContext[shard] = maxSeqNo
-        }
-        return lastRunContext
-    }
-
     // Checks if the index was created from the last execution run or when the monitor was last updated to ensure that
     // new index is monitored from the beginning of that index
     private fun createdRecently(
@@ -529,33 +515,6 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         return indexCreationDate > lastExecutionTime.toEpochMilli()
     }
 
-    /**
-     * Get the current max seq number of the shard. We find it by searching the last document
-     *  in the primary shard.
-     */
-    private suspend fun getMaxSeqNo(client: Client, index: String, shard: String): Long {
-        val request: SearchRequest = SearchRequest()
-            .indices(index)
-            .preference("_shards:$shard")
-            .source(
-                SearchSourceBuilder()
-                    .version(true)
-                    .sort("_seq_no", SortOrder.DESC)
-                    .seqNoAndPrimaryTerm(true)
-                    .query(QueryBuilders.matchAllQuery())
-                    .size(1)
-            )
-        val response: SearchResponse = client.suspendUntil { client.search(request, it) }
-        if (response.status() !== RestStatus.OK) {
-            throw IOException("Failed to get max seq no for shard: $shard")
-        }
-        nonPercolateSearchesTimeTakenStat += response.took.millis
-        if (response.hits.hits.isEmpty())
-            return -1L
-
-        return response.hits.hits[0].seqNo
-    }
-
     private fun getShardsCount(clusterService: ClusterService, index: String): Int {
         val allShards: List<ShardRouting> = clusterService!!.state().routingTable().allShards(index)
         return allShards.filter { it.primary() }.size
@@ -569,50 +528,79 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
     private suspend fun fetchShardDataAndMaybeExecutePercolateQueries(
         monitor: Monitor,
         monitorCtx: MonitorRunnerExecutionContext,
-        docExecutionCtx: DocumentExecutionContext,
-        indexName: String,
-        concreteIndexName: String,
-        conflictingFields: List<String>,
+        indexExecutionCtx: IndexExecutionContext,
         monitorMetadata: MonitorMetadata,
         inputRunResults: MutableMap<String, MutableSet<String>>,
         docsToQueries: MutableMap<String, MutableList<String>>,
         monitorInputIndices: List<String>,
         concreteIndices: List<String>,
         fieldsToBeQueried: List<String>,
+        updateLastRunContext: (String, String) -> Unit
     ) {
-        val count: Int = docExecutionCtx.updatedLastRunContext["shards_count"] as Int
+        val count: Int = indexExecutionCtx.updatedLastRunContext["shards_count"] as Int
         for (i: Int in 0 until count) {
             val shard = i.toString()
             try {
-                val maxSeqNo: Long = docExecutionCtx.updatedLastRunContext[shard].toString().toLong()
-                val prevSeqNo = docExecutionCtx.lastRunContext[shard].toString().toLongOrNull()
-
-                val hits: SearchHits = searchShard(
-                    monitorCtx,
-                    concreteIndexName,
-                    shard,
-                    prevSeqNo,
-                    maxSeqNo,
-                    null,
-                    fieldsToBeQueried
-                )
-                val startTime = System.currentTimeMillis()
-                transformedDocs.addAll(
-                    transformSearchHitsAndReconstructDocs(
-                        hits,
-                        indexName,
-                        concreteIndexName,
-                        monitor.id,
-                        conflictingFields,
+                val prevSeqNo = indexExecutionCtx.lastRunContext[shard].toString().toLongOrNull()
+                val from = prevSeqNo ?: SequenceNumbers.NO_OPS_PERFORMED
+                var to: Long = Long.MAX_VALUE
+                while (to >= from) {
+                    val hits: SearchHits = searchShard(
+                        monitorCtx,
+                        indexExecutionCtx.concreteIndexName,
+                        shard,
+                        from,
+                        to,
+                        indexExecutionCtx.docIds,
+                        fieldsToBeQueried,
                     )
-                )
-                docTransformTimeTakenStat += System.currentTimeMillis() - startTime
+                    if (hits.hits.isEmpty()) {
+                        if (to == Long.MAX_VALUE) {
+                            updateLastRunContext(shard, (prevSeqNo ?: SequenceNumbers.NO_OPS_PERFORMED).toString()) // didn't find any docs
+                        }
+                        break
+                    }
+                    if (to == Long.MAX_VALUE) { // max sequence number of shard needs to be computed
+                        updateLastRunContext(shard, hits.hits[0].seqNo.toString())
+                    }
+                    val leastSeqNoFromHits = hits.hits.last().seqNo
+                    to = leastSeqNoFromHits - 1
+                    val startTime = System.currentTimeMillis()
+                    transformedDocs.addAll(
+                        transformSearchHitsAndReconstructDocs(
+                            hits,
+                            indexExecutionCtx.indexName,
+                            indexExecutionCtx.concreteIndexName,
+                            monitor.id,
+                            indexExecutionCtx.conflictingFields,
+                        )
+                    )
+                    if (
+                        transformedDocs.isNotEmpty() &&
+                        shouldPerformPercolateQueryAndFlushInMemoryDocs(transformedDocs.size, monitorCtx)
+                    ) {
+                        performPercolateQueryAndResetCounters(
+                            monitorCtx,
+                            monitor,
+                            monitorMetadata,
+                            monitorInputIndices,
+                            concreteIndices,
+                            inputRunResults,
+                            docsToQueries,
+                        )
+                    }
+                    docTransformTimeTakenStat += System.currentTimeMillis() - startTime
+                }
             } catch (e: Exception) {
                 logger.error(
                     "Monitor ${monitor.id} :" +
-                        " Failed to run fetch data from shard [$shard] of index [$concreteIndexName]. Error: ${e.message}",
+                        "Failed to run fetch data from shard [$shard] of index [${indexExecutionCtx.concreteIndexName}]. " +
+                        "Error: ${e.message}",
                     e
                 )
+                if (e is IndexClosedException) {
+                    throw e
+                }
             }
             if (
                 transformedDocs.isNotEmpty() &&
@@ -704,8 +692,10 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
             .source(
                 SearchSourceBuilder()
                     .version(true)
+                    .sort("_seq_no", SortOrder.DESC)
+                    .seqNoAndPrimaryTerm(true)
                     .query(boolQueryBuilder)
-                    .size(10000)
+                    .size(monitorCtx.docLevelMonitorShardFetchSize)
             )
 
         if (monitorCtx.fetchOnlyQueryFieldNames && fieldsToFetch.isNotEmpty()) {
@@ -716,7 +706,6 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         }
         val response: SearchResponse = monitorCtx.client!!.suspendUntil { monitorCtx.client!!.search(request, it) }
         if (response.status() !== RestStatus.OK) {
-            logger.error("Failed search shard. Response: $response")
             throw IOException("Failed to search shard: [$shard] in index [$index]. Response status is ${response.status()}")
         }
         nonPercolateSearchesTimeTakenStat += response.took.millis

diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/MonitorRunnerExecutionContext.kt b/alerting/src/main/kotlin/org/opensearch/alerting/MonitorRunnerExecutionContext.kt
@@ -54,4 +54,6 @@ data class MonitorRunnerExecutionContext(
     @Volatile var percQueryMaxNumDocsInMemory: Int = AlertingSettings.DEFAULT_PERCOLATE_QUERY_NUM_DOCS_IN_MEMORY,
     @Volatile var percQueryDocsSizeMemoryPercentageLimit: Int =
         AlertingSettings.DEFAULT_PERCOLATE_QUERY_DOCS_SIZE_MEMORY_PERCENTAGE_LIMIT,
+    @Volatile var docLevelMonitorShardFetchSize: Int =
+        AlertingSettings.DEFAULT_DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE,
 )
diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/MonitorRunnerService.kt b/alerting/src/main/kotlin/org/opensearch/alerting/MonitorRunnerService.kt
@@ -25,6 +25,7 @@ import org.opensearch.alerting.script.TriggerExecutionContext
 import org.opensearch.alerting.settings.AlertingSettings.Companion.ALERT_BACKOFF_COUNT
 import org.opensearch.alerting.settings.AlertingSettings.Companion.ALERT_BACKOFF_MILLIS
 import org.opensearch.alerting.settings.AlertingSettings.Companion.DOC_LEVEL_MONITOR_FETCH_ONLY_QUERY_FIELDS_ENABLED
+import org.opensearch.alerting.settings.AlertingSettings.Companion.DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE
 import org.opensearch.alerting.settings.AlertingSettings.Companion.INDEX_TIMEOUT
 import org.opensearch.alerting.settings.AlertingSettings.Companion.MAX_ACTIONABLE_ALERT_COUNT
 import org.opensearch.alerting.settings.AlertingSettings.Companion.MOVE_ALERTS_BACKOFF_COUNT
@@ -182,6 +183,13 @@ object MonitorRunnerService : JobRunner, CoroutineScope, AbstractLifecycleCompon
                 monitorCtx.percQueryDocsSizeMemoryPercentageLimit = it
             }
 
+        monitorCtx.docLevelMonitorShardFetchSize =
+            DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE.get(monitorCtx.settings)
+        monitorCtx.clusterService!!.clusterSettings
+            .addSettingsUpdateConsumer(DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE) {
+                monitorCtx.docLevelMonitorShardFetchSize = it
+            }
+
         return this
     }
 

diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/model/DocumentExecutionContext.kt b/alerting/src/main/kotlin/org/opensearch/alerting/model/DocumentExecutionContext.kt
diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/model/IndexExecutionContext.kt b/alerting/src/main/kotlin/org/opensearch/alerting/model/IndexExecutionContext.kt
@@ -0,0 +1,19 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.alerting.model
+
+import org.opensearch.commons.alerting.model.DocLevelQuery
+
+/** DTO that contains all the necessary context for fetching data from shard and performing percolate queries */
+data class IndexExecutionContext(
+    val queries: List<DocLevelQuery>,
+    val lastRunContext: MutableMap<String, Any>,
+    val updatedLastRunContext: MutableMap<String, Any>,
+    val indexName: String,
+    val concreteIndexName: String,
+    val conflictingFields: List<String>,
+    val docIds: List<String>? = null,
+)
diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/settings/AlertingSettings.kt b/alerting/src/main/kotlin/org/opensearch/alerting/settings/AlertingSettings.kt
@@ -20,6 +20,7 @@ class AlertingSettings {
         const val DEFAULT_FINDINGS_INDEXING_BATCH_SIZE = 1000
         const val DEFAULT_PERCOLATE_QUERY_NUM_DOCS_IN_MEMORY = 50000
         const val DEFAULT_PERCOLATE_QUERY_DOCS_SIZE_MEMORY_PERCENTAGE_LIMIT = 10
+        const val DEFAULT_DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE = 10000
 
         val ALERTING_MAX_MONITORS = Setting.intSetting(
             "plugins.alerting.monitor.max_monitors",
@@ -38,6 +39,16 @@ class AlertingSettings {
             Setting.Property.NodeScope, Setting.Property.Dynamic
         )
 
+        /** Purely a setting used to verify seq_no calculation
+         */
+        val DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE = Setting.intSetting(
+            "plugins.alerting.monitor.doc_level_monitor_shard_fetch_size",
+            DEFAULT_DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE,
+            1,
+            10000,
+            Setting.Property.NodeScope, Setting.Property.Dynamic
+        )
+
         /** Defines the threshold of the maximum number of docs accumulated in memory to query against percolate query index in document
          * level monitor execution. The docs are being collected from searching on shards of indices mentioned in the
          * monitor input indices field. When the number of in-memory docs reaches or exceeds threshold we immediately perform percolate

diff --git a/alerting/src/main/kotlin/org/opensearch/alerting/transport/TransportIndexMonitorAction.kt b/alerting/src/main/kotlin/org/opensearch/alerting/transport/TransportIndexMonitorAction.kt
@@ -187,7 +187,15 @@ class TransportIndexMonitorAction @Inject constructor(
             else (it as DocLevelMonitorInput).indices
             indices.addAll(inputIndices)
         }
-        val searchRequest = SearchRequest().indices(*indices.toTypedArray())
+        val updatedIndices = indices.map { index ->
+            if (IndexUtils.isAlias(index, clusterService.state()) || IndexUtils.isDataStream(index, clusterService.state())) {
+                val metadata = clusterService.state().metadata.indicesLookup[index]?.writeIndex
+                metadata?.index?.name ?: index
+            } else {
+                index
+            }
+        }
+        val searchRequest = SearchRequest().indices(*updatedIndices.toTypedArray())
             .source(SearchSourceBuilder.searchSource().size(1).query(QueryBuilders.matchAllQuery()))
         client.search(
             searchRequest,