import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.streaming.Trigger
import scala.collection.JavaConverters._
import com.datastax.spark.connector._
import org.apache.spark.sql.cassandra._
import org.apache.spark.sql.SaveMode._
import com.datastax.spark.connector.cql._
import com.datastax.oss.driver.api.core._
import org.apache.spark.sql.functions.rand
import com.amazonaws.services.glue.log.GlueLogger
import java.time.ZonedDateTime
import java.time.ZoneOffset
import java.time.temporal.ChronoUnit
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.functions._

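// AWS Glue job that updates the TTL of existing rows in an Amazon Keyspaces /
// Apache Cassandra table. It scans the source table with the Spark Cassandra Connector,
// reads each row's remaining TTL, adds (or subtracts) TTL_TIME_TO_ADD seconds, and
// rewrites the row with the new TTL using a lightweight transaction (LWT).
// Note: the UPDATE statement in updateRowWithLWT is written against the
// tlp_stress.keyvalue example table; adjust it to match KEYSPACE_NAME and TABLE_NAME.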
object GlueApp {

  //currentTTL is the time left on the record, in seconds
  //timeToAdd is the delta to add or subtract, in seconds. Use a negative number for subtraction.
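  //For example, addTimeToExistingTTL(3600, 7200) returns 10800, while
  //addTimeToExistingTTL(3600, -7200) is clamped to 1.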
  def addTimeToExistingTTL(currentTTL: Int, timeToAdd: Int): Int = {

    val finalTTLValue = currentTTL + timeToAdd

    // Guard against the adjusted TTL dropping to zero or below, e.g. when more time is
    // subtracted than remains (such as reducing a 90-day TTL to 60 days). Clamp to 1 second.
    // TODO: May be more efficient to just delete the row than to modify/expire it.
    Math.max(1, finalTTLValue)
  }

  //Update the row with the new TTL using a lightweight transaction (LWT).
  //To update the TTL we must overwrite the row using its existing values.
  //The LWT condition checks that the value has not changed since the row was read for the current TTL.
  def updateRowWithLWT(row: Row, connector: CassandraConnector): Unit = {
    //openSession() creates a session or updates the reference counter on the shared session.
    val session = connector.openSession()

    val query =
      """UPDATE tlp_stress.keyvalue
        |USING TTL ?
        |SET value = ?
        |WHERE key = ?
        |IF value = ?""".stripMargin

    //prepared statements are cached by the driver, so preparing on every call is not an issue.
    val prepared = session.prepare(query)

    val key = row.getAs[String]("key")
    val value = row.getAs[String]("value")
    val expectedValue = row.getAs[String]("value")
    val ttl = row.getAs[Int]("ttlCol")

    //bind the values to the prepared statement.
    val bound = prepared.bind(
      java.lang.Integer.valueOf(ttl),
      value, key, expectedValue)

    val result = session.execute(bound)

    // Optional: check whether the LWT succeeded
    if (!result.wasApplied()) {
      println(s"Conditional update failed for key=$key")
      // Here you may want to:
      //1. read the latest row and ttl
      //2. apply the correct ttl
      //3. use LWT to avoid conflicts
    }
    session.close()
  }
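
  // A minimal sketch (not wired into main) of the retry path described in the comment
  // above: re-read the row and its remaining TTL, recompute the target TTL, and retry
  // the conditional update once. The helper name and its timeToAdd parameter are
  // illustrative; the statements assume the same tlp_stress.keyvalue schema.
  def retryWithFreshTTL(key: String, timeToAdd: Int, connector: CassandraConnector): Unit = {
    val session = connector.openSession()

    // Re-read the current value together with its remaining TTL in seconds.
    val select = session.prepare(
      "SELECT value, TTL(value) AS remaining_ttl FROM tlp_stress.keyvalue WHERE key = ?")
    val current = session.execute(select.bind(key)).one()

    if (current != null) {
      val currentValue = current.getString("value")
      val newTTL = addTimeToExistingTTL(current.getInt("remaining_ttl"), timeToAdd)

      val update = session.prepare(
        """UPDATE tlp_stress.keyvalue
          |USING TTL ?
          |SET value = ?
          |WHERE key = ?
          |IF value = ?""".stripMargin)

      val applied = session.execute(update.bind(
        java.lang.Integer.valueOf(newTTL), currentValue, key, currentValue)).wasApplied()

      if (!applied) {
        println(s"Conditional update failed again for key=$key")
      }
    }
    session.close()
  }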

  def main(sysArgs: Array[String]): Unit = {

    val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME", "KEYSPACE_NAME", "TABLE_NAME", "DRIVER_CONF", "TTL_FIELD", "TTL_TIME_TO_ADD").toArray)

    val driverConfFileName = args("DRIVER_CONF")

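    // The connector settings below follow the write pattern typically used with Amazon
    // Keyspaces: LOCAL_QUORUM consistency for writes, unbatched single-row writes (batch
    // grouping off, batch size 1), and a driver configuration profile supplied via DRIVER_CONF.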
    val conf = new SparkConf()
      .setAll(
        Seq(
          ("spark.task.maxFailures", "100"),

          ("spark.cassandra.connection.config.profile.path", driverConfFileName),
          ("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions"),
          ("directJoinSetting", "on"),

          ("spark.cassandra.output.consistency.level", "LOCAL_QUORUM"), //WRITES
          ("spark.cassandra.input.consistency.level", "LOCAL_ONE"), //READS

          ("spark.cassandra.sql.inClauseToJoinConversionThreshold", "0"),
          ("spark.cassandra.sql.inClauseToFullScanConversionThreshold", "0"),
          ("spark.cassandra.concurrent.reads", "50"),

          ("spark.cassandra.output.concurrent.writes", "5"),
          ("spark.cassandra.output.batch.grouping.key", "none"),
          ("spark.cassandra.output.batch.size.rows", "1"),
          ("spark.cassandra.output.ignoreNulls", "true")
        ))

    val spark: SparkContext = new SparkContext(conf)
    val glueContext: GlueContext = new GlueContext(spark)
    val sparkSession: SparkSession = glueContext.getSparkSession

    import sparkSession.implicits._

    Job.init(args("JOB_NAME"), glueContext, args.asJava)

    val logger = new GlueLogger

    //validation steps for peers and partitioner
    val connector = CassandraConnector.apply(conf)
    val session = connector.openSession()
    val peersCount = session.execute("SELECT * FROM system.peers").all().size()

    val partitioner = session.execute("SELECT partitioner from system.local").one().getString("partitioner")

    logger.info("Total number of peers: " + peersCount)
    logger.info("Configured partitioner: " + partitioner)

    if(peersCount == 0){
      throw new Exception("No system peers found. Check required permissions to read from the system.peers table. If using VPC endpoints, check permissions for describing VPC endpoints. https://docs.aws.amazon.com/keyspaces/latest/devguide/vpc-endpoints.html")
    }

    if(partitioner.equals("com.amazonaws.cassandra.DefaultPartitioner")){
      throw new Exception("Spark requires the use of RandomPartitioner or Murmur3Partitioner. See 'Working with partitioners' in the Amazon Keyspaces documentation. https://docs.aws.amazon.com/keyspaces/latest/devguide/working-with-partitioners.html")
    }

    val tableName = args("TABLE_NAME")
    val keyspaceName = args("KEYSPACE_NAME")

    val tableDf = sparkSession.read
      .format("org.apache.spark.sql.cassandra")
      .options(Map("table" -> tableName,
                   "keyspace" -> keyspaceName,
                   "pushdown" -> "false")) //set to true when executing against Apache Cassandra, false when working with Keyspaces
      .load()
      //.filter("my_column=='somevalue' AND my_othercolumn=='someothervalue'")

    // Register the UDF for calculating TTL
    val calculateTTLUDF = udf((currentTTL: Int, timeToAdd: Int) => addTimeToExistingTTL(currentTTL, timeToAdd))

    val timeToAdd = args("TTL_TIME_TO_ADD").toInt
    val ttlField = args("TTL_FIELD")
    //val timeToAdd = 5 * 365 * 24 * 60 * 60  //add 5 years
    //val timeToAdd = -1 * 365 * 24 * 60 * 60 //subtract 1 year
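
    // ttl(...) below is the Spark Cassandra Connector's metadata function (brought in by
    // the org.apache.spark.sql.cassandra import); it returns the remaining TTL of the
    // given column, in seconds.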
    // Calculate TTL values
    val tableDfWithTTL = tableDf
      .withColumn("ttlCol", calculateTTLUDF(ttl(col(ttlField)), lit(timeToAdd)))

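    // updateRowWithLWT runs on the executors; CassandraConnector is serializable, and
    // openSession() reuses the shared, reference-counted session, so a new connection
    // is not opened for every row.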
    tableDfWithTTL.foreachPartition { partition: Iterator[Row] =>
      partition.foreach { row => updateRowWithLWT(row, connector) }
    }

    Job.commit()
  }
}
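
// Example job parameters (values are illustrative; the names match the options resolved
// by GlueArgParser in main):
//   --JOB_NAME          UpdateTTLJob
//   --DRIVER_CONF       keyspaces-application.conf
//   --KEYSPACE_NAME     tlp_stress
//   --TABLE_NAME        keyvalue
//   --TTL_FIELD         value
//   --TTL_TIME_TO_ADD   7776000   (adds 90 days, in seconds)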