diff --git a/content/02-deploy-pcm/01-deploy-pcm.md b/content/02-deploy-pcm/01-deploy-pcm.md index 8e421953..f0d3de10 100644 --- a/content/02-deploy-pcm/01-deploy-pcm.md +++ b/content/02-deploy-pcm/01-deploy-pcm.md @@ -1,6 +1,6 @@ +++ title = "a. Deploy ParallelCluster UI" -weight = 31 +weight = 21 tags = ["tutorial", "ParallelCluster"] +++ diff --git a/content/02-deploy-pcm/02-connect-pcm.md b/content/02-deploy-pcm/02-connect-pcm.md index 8c69e914..70957c4b 100644 --- a/content/02-deploy-pcm/02-connect-pcm.md +++ b/content/02-deploy-pcm/02-connect-pcm.md @@ -1,6 +1,6 @@ +++ title = "b. Connect to ParallelCluster UI" -weight = 32 +weight = 22 tags = ["tutorial", "cloud9", "ParallelCluster"] +++ diff --git a/content/02-deploy-pcm/03-summary.md b/content/02-deploy-pcm/03-summary.md index 5aeda75e..4ab42383 100644 --- a/content/02-deploy-pcm/03-summary.md +++ b/content/02-deploy-pcm/03-summary.md @@ -1,6 +1,6 @@ +++ title = "c. Summary" -weight = 32 +weight = 23 tags = ["tutorial", "cloud9", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/01-requirement_notes.md b/content/03-parallel-cluster-cli/01-requirement_notes.md index 49b4d688..b5d719d4 100644 --- a/content/03-parallel-cluster-cli/01-requirement_notes.md +++ b/content/03-parallel-cluster-cli/01-requirement_notes.md @@ -2,7 +2,7 @@ title = "Prerequisites" date = 2019-09-18T10:46:30-04:00 draft = false -weight = 21 +weight = 31 tags = ["tutorial", "Prerequisite", "ec2"] +++ diff --git a/content/03-parallel-cluster-cli/02-aws-console-login.md b/content/03-parallel-cluster-cli/02-aws-console-login.md index 169e60f9..f5ea2897 100644 --- a/content/03-parallel-cluster-cli/02-aws-console-login.md +++ b/content/03-parallel-cluster-cli/02-aws-console-login.md @@ -2,7 +2,7 @@ title = "a. 
Sign in to the Console" date = 2019-09-18T10:46:30-04:00 draft = false -weight = 22 +weight = 32 tags = ["tutorial", "aws console", "ec2"] +++ diff --git a/content/03-parallel-cluster-cli/03-start_cloud9.md b/content/03-parallel-cluster-cli/03-start_cloud9.md index 13a4e493..94f4b8f8 100644 --- a/content/03-parallel-cluster-cli/03-start_cloud9.md +++ b/content/03-parallel-cluster-cli/03-start_cloud9.md @@ -1,6 +1,6 @@ +++ title = "b. Create a Cloud9 Environment" -weight = 23 +weight = 33 tags = ["tutorial", "cloud9", "ParallelCluster"] +++ ![Cloud 9](/images/hpc-aws-parallelcluster-workshop/cloud9.png) diff --git a/content/03-parallel-cluster-cli/04-start-aws-cli.md b/content/03-parallel-cluster-cli/04-start-aws-cli.md index 23dbdc78..63bbb4e5 100644 --- a/content/03-parallel-cluster-cli/04-start-aws-cli.md +++ b/content/03-parallel-cluster-cli/04-start-aws-cli.md @@ -1,6 +1,6 @@ +++ title = "c. Work with the AWS CLI" -weight = 25 +weight = 34 tags = ["tutorial", "cloud9", "aws cli", "s3"] +++ diff --git a/content/03-parallel-cluster-cli/05-key-pair-create.md b/content/03-parallel-cluster-cli/05-key-pair-create.md index 7b911915..d06625cd 100644 --- a/content/03-parallel-cluster-cli/05-key-pair-create.md +++ b/content/03-parallel-cluster-cli/05-key-pair-create.md @@ -1,6 +1,6 @@ +++ title = "d. Create a Key Pair" -weight = 28 +weight = 35 tags = ["tutorial", "cloud9", "aws cli", "ec2", "key-pair"] +++ diff --git a/content/03-parallel-cluster-cli/06-install-pc.md b/content/03-parallel-cluster-cli/06-install-pc.md index 4cab8c51..9cbcf6d6 100644 --- a/content/03-parallel-cluster-cli/06-install-pc.md +++ b/content/03-parallel-cluster-cli/06-install-pc.md @@ -1,7 +1,7 @@ +++ title = "e. 
Install AWS ParallelCluster" date = 2019-09-18T10:46:30-04:00 -weight = 41 +weight = 36 tags = ["tutorial", "install", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/07-initialize-pcluster.md b/content/03-parallel-cluster-cli/07-initialize-pcluster.md index 78a8d5b1..96fe44be 100644 --- a/content/03-parallel-cluster-cli/07-initialize-pcluster.md +++ b/content/03-parallel-cluster-cli/07-initialize-pcluster.md @@ -1,7 +1,7 @@ +++ title = "f. (Optional) Create config with 'pcluster configure'" date = 2019-09-18T10:46:30-04:00 -weight = 42 +weight = 37 tags = ["tutorial", "initialize", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/08-configure-pcluster.md b/content/03-parallel-cluster-cli/08-configure-pcluster.md index 58436ead..588190c9 100644 --- a/content/03-parallel-cluster-cli/08-configure-pcluster.md +++ b/content/03-parallel-cluster-cli/08-configure-pcluster.md @@ -1,7 +1,7 @@ +++ title = "g. Create a Cluster Config" date = 2019-09-18T10:46:30-04:00 -weight = 43 +weight = 38 tags = ["tutorial", "initialize", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/09-launch-pc.md b/content/03-parallel-cluster-cli/09-launch-pc.md index 35befc8b..88ca0d7d 100644 --- a/content/03-parallel-cluster-cli/09-launch-pc.md +++ b/content/03-parallel-cluster-cli/09-launch-pc.md @@ -1,7 +1,7 @@ +++ title = "h. Build an HPC Cluster" date = 2019-09-18T10:46:30-04:00 -weight = 44 +weight = 39 tags = ["tutorial", "create", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/10-logon-pc.md b/content/03-parallel-cluster-cli/10-logon-pc.md index 0f0808db..376ee796 100644 --- a/content/03-parallel-cluster-cli/10-logon-pc.md +++ b/content/03-parallel-cluster-cli/10-logon-pc.md @@ -1,7 +1,7 @@ +++ title = "i. 
Log in to Your Cluster" date = 2019-09-18T10:46:30-04:00 -weight = 45 +weight = 40 tags = ["tutorial", "create", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/11-1stjob.md b/content/03-parallel-cluster-cli/11-1stjob.md index f078e7c1..f5194344 100644 --- a/content/03-parallel-cluster-cli/11-1stjob.md +++ b/content/03-parallel-cluster-cli/11-1stjob.md @@ -1,7 +1,7 @@ +++ title = "j. Submit your first HPC job" date = 2022-03-01T10:46:30-04:00 -weight = 54 +weight = 41 tags = ["tutorial", "create", "ParallelCluster"] +++ diff --git a/content/03-parallel-cluster-cli/99-summary.md b/content/03-parallel-cluster-cli/99-summary.md index 17bb8c9b..6505e463 100644 --- a/content/03-parallel-cluster-cli/99-summary.md +++ b/content/03-parallel-cluster-cli/99-summary.md @@ -1,6 +1,6 @@ +++ title = "Summary" -weight = 99 +weight = 42 tags = ["tutorial", "summary"] +++ diff --git a/content/05-create-cluster/01-create-cluster.md b/content/04-create-cluster/01-create-cluster.md similarity index 99% rename from content/05-create-cluster/01-create-cluster.md rename to content/04-create-cluster/01-create-cluster.md index 7a101bb7..d33d2726 100644 --- a/content/05-create-cluster/01-create-cluster.md +++ b/content/04-create-cluster/01-create-cluster.md @@ -1,6 +1,6 @@ --- title: "a. Create a Cluster" -weight: 51 +weight: 41 tags: ["tutorial", "cloud9", "ParallelCluster"] --- diff --git a/content/05-create-cluster/02-connect-cluster.md b/content/04-create-cluster/02-connect-cluster.md similarity index 99% rename from content/05-create-cluster/02-connect-cluster.md rename to content/04-create-cluster/02-connect-cluster.md index 8fd91a43..da1c6134 100644 --- a/content/05-create-cluster/02-connect-cluster.md +++ b/content/04-create-cluster/02-connect-cluster.md @@ -1,6 +1,6 @@ --- title: "b. 
Connect to the Cluster" -weight: 52 +weight: 42 tags: ["tutorial", "cloud9", "ParallelCluster"] --- diff --git a/content/05-create-cluster/03-get-to-know-cluster.md b/content/04-create-cluster/03-get-to-know-cluster.md similarity index 99% rename from content/05-create-cluster/03-get-to-know-cluster.md rename to content/04-create-cluster/03-get-to-know-cluster.md index f0d29255..ef93edb7 100644 --- a/content/05-create-cluster/03-get-to-know-cluster.md +++ b/content/04-create-cluster/03-get-to-know-cluster.md @@ -1,6 +1,6 @@ --- title: "c. Get to know your Cluster" -weight: 53 +weight: 43 tags: ["tutorial", "parallelcluster-ui", "ParallelCluster"] --- diff --git a/content/05-create-cluster/04-run-1stjob.md b/content/04-create-cluster/04-run-1stjob.md similarity index 99% rename from content/05-create-cluster/04-run-1stjob.md rename to content/04-create-cluster/04-run-1stjob.md index 29471715..310f6347 100644 --- a/content/05-create-cluster/04-run-1stjob.md +++ b/content/04-create-cluster/04-run-1stjob.md @@ -1,7 +1,7 @@ +++ title = "f. Submit your first HPC job" date = 2022-03-01T10:46:30-04:00 -weight = 54 +weight = 44 tags = ["tutorial", "create", "ParallelCluster"] +++ diff --git a/content/05-create-cluster/05-update-cluster.md b/content/04-create-cluster/05-update-cluster.md similarity index 99% rename from content/05-create-cluster/05-update-cluster.md rename to content/04-create-cluster/05-update-cluster.md index d8fcddf0..ab0a5b64 100644 --- a/content/05-create-cluster/05-update-cluster.md +++ b/content/04-create-cluster/05-update-cluster.md @@ -1,7 +1,7 @@ +++ title = "g. 
Update your cluster" date = 2022-03-01T10:46:30-04:00 -weight = 55 +weight = 45 tags = ["tutorial", "create", "ParallelCluster"] +++ diff --git a/content/05-create-cluster/06-delete-pc.md b/content/04-create-cluster/06-delete-pc.md similarity index 96% rename from content/05-create-cluster/06-delete-pc.md rename to content/04-create-cluster/06-delete-pc.md index 760b8072..ba75e359 100644 --- a/content/05-create-cluster/06-delete-pc.md +++ b/content/04-create-cluster/06-delete-pc.md @@ -1,7 +1,7 @@ +++ title = "h. Terminate Your Cluster" date = 2019-09-18T10:46:30-04:00 -weight = 56 +weight = 46 tags = ["tutorial", "create", "ParallelCluster"] +++ diff --git a/content/05-create-cluster/_index.md b/content/04-create-cluster/_index.md similarity index 99% rename from content/05-create-cluster/_index.md rename to content/04-create-cluster/_index.md index 7fd467d8..30779a13 100644 --- a/content/05-create-cluster/_index.md +++ b/content/04-create-cluster/_index.md @@ -1,7 +1,7 @@ --- title: "Create an HPC Cluster" date: 2019-01-24T09:05:54Z -weight: 50 +weight: 40 pre: "IV ⁃ " tags: ["HPC", "Overview"] --- diff --git a/content/06-fsx-for-lustre/01-create-cluster.md b/content/05-fsx-for-lustre/01-create-cluster.md similarity index 98% rename from content/06-fsx-for-lustre/01-create-cluster.md rename to content/05-fsx-for-lustre/01-create-cluster.md index cbce040b..04611848 100644 --- a/content/06-fsx-for-lustre/01-create-cluster.md +++ b/content/05-fsx-for-lustre/01-create-cluster.md @@ -1,7 +1,7 @@ +++ title = "a. 
Create HPC Cluster" date = 2019-09-18T10:46:30-04:00 -weight = 10 +weight = 51 tags = ["configuration", "FSx", "ParallelCluster"] +++ diff --git a/content/06-fsx-for-lustre/02-create-cluster-fsx.md b/content/05-fsx-for-lustre/02-create-cluster-fsx.md similarity index 99% rename from content/06-fsx-for-lustre/02-create-cluster-fsx.md rename to content/05-fsx-for-lustre/02-create-cluster-fsx.md index 92b7c389..ca00366c 100644 --- a/content/06-fsx-for-lustre/02-create-cluster-fsx.md +++ b/content/05-fsx-for-lustre/02-create-cluster-fsx.md @@ -1,7 +1,7 @@ +++ title = "b. Create FSx Lustre" date = 2019-09-18T10:46:30-04:00 -weight = 20 +weight = 52 tags = ["configuration", "FSx", "ParallelCluster"] +++ diff --git a/content/06-fsx-for-lustre/03-create-s3.md b/content/05-fsx-for-lustre/03-create-s3.md similarity index 99% rename from content/06-fsx-for-lustre/03-create-s3.md rename to content/05-fsx-for-lustre/03-create-s3.md index 88bcd416..d746ca5d 100644 --- a/content/06-fsx-for-lustre/03-create-s3.md +++ b/content/05-fsx-for-lustre/03-create-s3.md @@ -1,7 +1,7 @@ +++ title = "c. Create S3 Bucket" date = 2019-09-18T10:46:30-04:00 -weight = 30 +weight = 53 tags = ["configuration", "FSx", "ParallelCluster"] +++ diff --git a/content/06-fsx-for-lustre/04-link-s3-fsx.md b/content/05-fsx-for-lustre/04-link-s3-fsx.md similarity index 99% rename from content/06-fsx-for-lustre/04-link-s3-fsx.md rename to content/05-fsx-for-lustre/04-link-s3-fsx.md index a2726c69..53467201 100644 --- a/content/06-fsx-for-lustre/04-link-s3-fsx.md +++ b/content/05-fsx-for-lustre/04-link-s3-fsx.md @@ -1,7 +1,7 @@ +++ title = "d. 
Link S3 to FSx Lustre" date = 2019-09-18T10:46:30-04:00 -weight = 40 +weight = 54 tags = ["configuration", "FSx", "ParallelCluster"] +++ diff --git a/content/06-fsx-for-lustre/05-check-fs.md b/content/05-fsx-for-lustre/05-check-fs.md similarity index 99% rename from content/06-fsx-for-lustre/05-check-fs.md rename to content/05-fsx-for-lustre/05-check-fs.md index aee3cdfa..a70515be 100644 --- a/content/06-fsx-for-lustre/05-check-fs.md +++ b/content/05-fsx-for-lustre/05-check-fs.md @@ -1,7 +1,7 @@ +++ title = "e. Examine the File System" date = 2019-09-18T10:46:30-04:00 -weight = 50 +weight = 55 tags = ["tutorial", "HSM", "FSx"] +++ diff --git a/content/06-fsx-for-lustre/06-lazy-loading.md b/content/05-fsx-for-lustre/06-lazy-loading.md similarity index 99% rename from content/06-fsx-for-lustre/06-lazy-loading.md rename to content/05-fsx-for-lustre/06-lazy-loading.md index 11af69df..d7ca18a2 100644 --- a/content/06-fsx-for-lustre/06-lazy-loading.md +++ b/content/05-fsx-for-lustre/06-lazy-loading.md @@ -1,7 +1,7 @@ +++ title = "f. About Lazy File Loading" date = 2019-09-18T10:46:30-04:00 -weight = 60 +weight = 56 tags = ["tutorial", "HSM", "FSx", "Laxy Load"] +++ diff --git a/content/06-fsx-for-lustre/07-Install-IOR.md b/content/05-fsx-for-lustre/07-Install-IOR.md similarity index 98% rename from content/06-fsx-for-lustre/07-Install-IOR.md rename to content/05-fsx-for-lustre/07-Install-IOR.md index 2167fb19..13757e60 100644 --- a/content/06-fsx-for-lustre/07-Install-IOR.md +++ b/content/05-fsx-for-lustre/07-Install-IOR.md @@ -1,7 +1,7 @@ +++ title = "g. 
Install IOR Benchmark Tool" date = 2019-09-18T10:46:30-04:00 -weight = 70 +weight = 57 tags = ["tutorial", "install", "FSx", "Performances"] +++ diff --git a/content/06-fsx-for-lustre/08-cloudwatch.md b/content/05-fsx-for-lustre/08-cloudwatch.md similarity index 99% rename from content/06-fsx-for-lustre/08-cloudwatch.md rename to content/05-fsx-for-lustre/08-cloudwatch.md index eca160e2..f580c48e 100644 --- a/content/06-fsx-for-lustre/08-cloudwatch.md +++ b/content/05-fsx-for-lustre/08-cloudwatch.md @@ -1,7 +1,7 @@ +++ title = "h. View Metrics with CloudWatch" date = 2019-09-18T10:46:30-04:00 -weight = 80 +weight = 58 tags = ["tutorial", "IOR", "FSx", "metrics"] +++ diff --git a/content/06-fsx-for-lustre/09-performance-test.md b/content/05-fsx-for-lustre/09-performance-test.md similarity index 99% rename from content/06-fsx-for-lustre/09-performance-test.md rename to content/05-fsx-for-lustre/09-performance-test.md index 4980f4d4..f386f1d9 100644 --- a/content/06-fsx-for-lustre/09-performance-test.md +++ b/content/05-fsx-for-lustre/09-performance-test.md @@ -1,7 +1,7 @@ +++ title = "i. Test IO Performance" date = 2019-09-18T10:46:30-04:00 -weight = 90 +weight = 59 tags = ["tutorial", "IOR", "FSx", "Performances"] +++ diff --git a/content/06-fsx-for-lustre/10-summary.md b/content/05-fsx-for-lustre/10-summary.md similarity index 98% rename from content/06-fsx-for-lustre/10-summary.md rename to content/05-fsx-for-lustre/10-summary.md index 4845b83b..bd72ea34 100644 --- a/content/06-fsx-for-lustre/10-summary.md +++ b/content/05-fsx-for-lustre/10-summary.md @@ -1,7 +1,7 @@ +++ title = "j. 
Summary and Cleanup" date = 2019-09-18T10:46:30-04:00 -weight = 100 +weight = 60 tags = ["tutorial", "FSx", "summary"] +++ diff --git a/content/06-fsx-for-lustre/_index.md b/content/05-fsx-for-lustre/_index.md similarity index 99% rename from content/06-fsx-for-lustre/_index.md rename to content/05-fsx-for-lustre/_index.md index e6d1a23a..b2f878d8 100644 --- a/content/06-fsx-for-lustre/_index.md +++ b/content/05-fsx-for-lustre/_index.md @@ -1,7 +1,7 @@ --- title: "Build a High-Performance File System" date: 2019-01-24T09:05:54Z -weight: 200 +weight: 50 pre: "V ⁃ " tags: ["HPC", "Overview", "Batch"] --- diff --git a/content/06-nice-dcv/_index.md b/content/06-nice-dcv/_index.md index cc553f47..e3c7a112 100644 --- a/content/06-nice-dcv/_index.md +++ b/content/06-nice-dcv/_index.md @@ -1,7 +1,7 @@ --- title: "Remote Visualization using NICE DCV" date: 2019-01-24T09:05:54Z -weight: 400 +weight: 60 pre: "VI ⁃ " tags: ["HPC", "NICE", "Visualization", "Remote Desktop"] --- diff --git a/content/06-nice-dcv/pcluster/01-connect-dcv.md b/content/06-nice-dcv/pcluster/01-connect-dcv.md index dcea5f52..03ecb1ee 100644 --- a/content/06-nice-dcv/pcluster/01-connect-dcv.md +++ b/content/06-nice-dcv/pcluster/01-connect-dcv.md @@ -1,7 +1,7 @@ +++ title = "a. Connect to your NICE DCV Session" date = 2019-09-18T10:46:30-04:00 -weight = 80 +weight = 11 tags = ["tutorial", "NICE DCV", "ParallelCluster", "Remote Desktop"] +++ diff --git a/content/06-nice-dcv/pcluster/02-configure-pc-dcv.md b/content/06-nice-dcv/pcluster/02-configure-pc-dcv.md index a9893f82..5899de84 100644 --- a/content/06-nice-dcv/pcluster/02-configure-pc-dcv.md +++ b/content/06-nice-dcv/pcluster/02-configure-pc-dcv.md @@ -1,7 +1,7 @@ +++ title = "a. 
Create a cluster configured with NICE DCV" date = 2019-09-18T10:46:30-04:00 -weight = 50 +weight = 12 tags = ["tutorial", "initialize", "ParallelCluster", "DCV"] +++ diff --git a/content/06-nice-dcv/pcluster/_index.md b/content/06-nice-dcv/pcluster/_index.md index 3f8c693e..b3394122 100644 --- a/content/06-nice-dcv/pcluster/_index.md +++ b/content/06-nice-dcv/pcluster/_index.md @@ -1,7 +1,7 @@ --- title: "DCV Connect in ParallelCluster" date: 2019-01-24T09:05:54Z -weight: 20 +weight: 10 tags: ["HPC", "NICE", "Visualization", "Remote Desktop"] --- diff --git a/content/06-nice-dcv/queue/01-create-sg.md b/content/06-nice-dcv/queue/01-create-sg.md index 0518a969..6baef729 100644 --- a/content/06-nice-dcv/queue/01-create-sg.md +++ b/content/06-nice-dcv/queue/01-create-sg.md @@ -1,7 +1,7 @@ +++ title = "a. Create Security Group" date = 2019-09-18T10:46:30-04:00 -weight = 10 +weight = 21 tags = ["tutorial", "NICE DCV", "ParallelCluster", "Remote Desktop"] +++ diff --git a/content/06-nice-dcv/queue/02-edit-cluster.md b/content/06-nice-dcv/queue/02-edit-cluster.md index 5079d027..6e3a085c 100644 --- a/content/06-nice-dcv/queue/02-edit-cluster.md +++ b/content/06-nice-dcv/queue/02-edit-cluster.md @@ -1,7 +1,7 @@ +++ title = "b. Modify Cluster Configuration" date = 2019-09-18T10:46:30-04:00 -weight = 11 +weight = 22 tags = ["tutorial", "NICE DCV", "ParallelCluster", "Remote Desktop"] +++ diff --git a/content/06-nice-dcv/queue/03-create-dcv-session.md b/content/06-nice-dcv/queue/03-create-dcv-session.md index 72629bd5..51e2ab26 100644 --- a/content/06-nice-dcv/queue/03-create-dcv-session.md +++ b/content/06-nice-dcv/queue/03-create-dcv-session.md @@ -1,7 +1,7 @@ +++ title = "c. 
Create DCV Session" date = 2019-09-18T10:46:30-04:00 -weight = 12 +weight = 23 tags = ["tutorial", "NICE DCV", "ParallelCluster", "Remote Desktop"] +++ diff --git a/content/06-nice-dcv/queue/04-no-ingress-dcv.md b/content/06-nice-dcv/queue/04-no-ingress-dcv.md index e33eddcf..d02ba188 100644 --- a/content/06-nice-dcv/queue/04-no-ingress-dcv.md +++ b/content/06-nice-dcv/queue/04-no-ingress-dcv.md @@ -1,7 +1,7 @@ +++ title = "d. No-Ingress DCV Session" date = 2019-09-18T10:46:30-04:00 -weight = 13 +weight = 24 tags = ["tutorial", "NICE DCV", "ParallelCluster", "Remote Desktop"] +++ diff --git a/content/06-nice-dcv/queue/_index.md b/content/06-nice-dcv/queue/_index.md index 0d9db9ea..9fb11af4 100644 --- a/content/06-nice-dcv/queue/_index.md +++ b/content/06-nice-dcv/queue/_index.md @@ -1,7 +1,7 @@ --- title: "DCV Queue in ParallelCluster" date: 2019-01-24T09:05:54Z -weight: 30 +weight: 20 tags: ["HPC", "NICE", "Visualization", "Remote Desktop"] --- diff --git a/content/06-nice-dcv/standalone/08-deploy-ec2.md b/content/06-nice-dcv/standalone/08-deploy-ec2.md index 197904ad..2c6b6a42 100644 --- a/content/06-nice-dcv/standalone/08-deploy-ec2.md +++ b/content/06-nice-dcv/standalone/08-deploy-ec2.md @@ -1,7 +1,7 @@ +++ title = "a. Deploy EC2 instance with NICE DCV" date = 2019-01-24T09:05:54Z -weight = 140 +weight = 31 tags = ["HPC", "NICE", "Visualization", "Remote Desktop", "Native Client"] +++ diff --git a/content/06-nice-dcv/standalone/09-connect-ec2.md b/content/06-nice-dcv/standalone/09-connect-ec2.md index f8182892..bfb172e5 100644 --- a/content/06-nice-dcv/standalone/09-connect-ec2.md +++ b/content/06-nice-dcv/standalone/09-connect-ec2.md @@ -1,7 +1,7 @@ +++ title = "b. 
Connect to NICE DCV EC2 Instance" date = 2019-01-24T09:05:54Z -weight = 160 +weight = 32 tags = ["HPC", "NICE", "Visualization", "Remote Desktop", "Web Browser", "Native Client"] +++ diff --git a/content/06-nice-dcv/standalone/10-connect-dcv.md b/content/06-nice-dcv/standalone/10-connect-dcv.md index 71733f94..32a62e97 100644 --- a/content/06-nice-dcv/standalone/10-connect-dcv.md +++ b/content/06-nice-dcv/standalone/10-connect-dcv.md @@ -1,7 +1,7 @@ +++ title = "c. Connect to Remote Desktop Session" date = 2019-01-24T09:05:54Z -weight = 180 +weight = 33 tags = ["HPC", "NICE", "Visualization", "Remote Desktop", "Web Browser", "Native Client"] +++ diff --git a/content/06-nice-dcv/standalone/11-terminate-ec2-dcv.md b/content/06-nice-dcv/standalone/11-terminate-ec2-dcv.md index 0ccfca1f..0c8abe97 100644 --- a/content/06-nice-dcv/standalone/11-terminate-ec2-dcv.md +++ b/content/06-nice-dcv/standalone/11-terminate-ec2-dcv.md @@ -1,7 +1,7 @@ +++ title = "d. Terminate Your Instance" date = 2019-09-18T10:46:30-04:00 -weight = 200 +weight = 34 tags = ["HPC", "NICE", "Visualization", "Remote Desktop", "Web Browser", "Native Client"] +++ diff --git a/content/06-nice-dcv/standalone/_index.md b/content/06-nice-dcv/standalone/_index.md index 6128409a..e75ac2f8 100644 --- a/content/06-nice-dcv/standalone/_index.md +++ b/content/06-nice-dcv/standalone/_index.md @@ -1,7 +1,7 @@ --- title: "DCV using web browser/native client" date: 2019-01-24T09:05:54Z -weight: 120 +weight: 30 tags: ["HPC", "NICE", "Visualization", "Remote Desktop", "Native Client"] --- diff --git a/content/08-EFA/00-EFA-Basics.md b/content/07-EFA/00-EFA-Basics.md similarity index 99% rename from content/08-EFA/00-EFA-Basics.md rename to content/07-EFA/00-EFA-Basics.md index 49cdcd08..fca03dfe 100644 --- a/content/08-EFA/00-EFA-Basics.md +++ b/content/07-EFA/00-EFA-Basics.md @@ -1,7 +1,7 @@ --- title: "a. 
EFA Basics" date: 2020-05-13T10:00:58Z -weight : 5 +weight : 71 tags : ["EFA", "ParallelCluster", "basics",] --- diff --git a/content/08-EFA/01-create-efa-cluster.md b/content/07-EFA/01-create-efa-cluster.md similarity index 98% rename from content/08-EFA/01-create-efa-cluster.md rename to content/07-EFA/01-create-efa-cluster.md index eecd62f0..22d49679 100644 --- a/content/08-EFA/01-create-efa-cluster.md +++ b/content/07-EFA/01-create-efa-cluster.md @@ -1,7 +1,7 @@ --- title : "b. Create an HPC Cluster with EFA" date: 2020-05-12T10:00:58Z -weight : 10 +weight : 72 tags : ["configuration", "EFA", "ParallelCluster", "create"] --- diff --git a/content/08-EFA/02-check-efa.md b/content/07-EFA/02-check-efa.md similarity index 99% rename from content/08-EFA/02-check-efa.md rename to content/07-EFA/02-check-efa.md index 9185bdb4..decae795 100644 --- a/content/08-EFA/02-check-efa.md +++ b/content/07-EFA/02-check-efa.md @@ -1,7 +1,7 @@ --- title : "c. Examine an EFA enabled instance" date: 2020-05-12T10:00:58Z -weight : 20 +weight : 73 tags : ["tutorial", "EFA", "ec2", "fi_info", "mpi"] --- diff --git a/content/08-EFA/03-work-with-IntelMPI.md b/content/07-EFA/03-work-with-IntelMPI.md similarity index 99% rename from content/08-EFA/03-work-with-IntelMPI.md rename to content/07-EFA/03-work-with-IntelMPI.md index c608f9aa..d4b1cdbc 100644 --- a/content/08-EFA/03-work-with-IntelMPI.md +++ b/content/07-EFA/03-work-with-IntelMPI.md @@ -1,7 +1,7 @@ --- title: "d. Work With Intel MPI" date: 2020-05-12T12:57:20Z -weight : 30 +weight : 74 tags : ["tutorial", "EFA", "ec2", "IntelMPI", "MPI", "intel", "module"] --- diff --git a/content/08-EFA/04-complie-run-osu.md b/content/07-EFA/04-complie-run-osu.md similarity index 99% rename from content/08-EFA/04-complie-run-osu.md rename to content/07-EFA/04-complie-run-osu.md index d2f3f468..2a2aa2fd 100644 --- a/content/08-EFA/04-complie-run-osu.md +++ b/content/07-EFA/04-complie-run-osu.md @@ -1,7 +1,7 @@ --- title: "e. 
Download, compile and run the OSU Benchmark" date: 2020-05-12T13:27:03Z -weight : 40 +weight : 75 tags : ["tutorial", "EFA", "ec2", "OSU", "MPI", "Benchmark", "compile"] --- diff --git a/content/08-EFA/05-delete-cluster.md b/content/07-EFA/05-delete-cluster.md similarity index 97% rename from content/08-EFA/05-delete-cluster.md rename to content/07-EFA/05-delete-cluster.md index 9baeadc1..cd83fd28 100644 --- a/content/08-EFA/05-delete-cluster.md +++ b/content/07-EFA/05-delete-cluster.md @@ -1,7 +1,7 @@ --- title: "f. Delete Your EFA Cluster" date: 2020-05-13T09:52:23Z -weight : 50 +weight : 76 tags : ["tutorial", "delete", "ParallelCluster"] --- diff --git a/content/08-EFA/_index.md b/content/07-EFA/_index.md similarity index 98% rename from content/08-EFA/_index.md rename to content/07-EFA/_index.md index 85899e4c..3d37965b 100644 --- a/content/08-EFA/_index.md +++ b/content/07-EFA/_index.md @@ -1,8 +1,8 @@ --- title: "Elastic Fabric Adapter (EFA)" date: 2020-04-24T7:05:54Z -weight: 400 -pre: "VIII ⁃ " +weight: 70 +pre: "VII ⁃ " tags: ["HPC", "EFA", "Elastic Fabric Adapter", "Network", "MPI"] --- diff --git a/content/08-cost-controls/01-configure-slurm-accounting.md b/content/08-cost-controls/01-configure-slurm-accounting.md new file mode 100644 index 00000000..34a94bb6 --- /dev/null +++ b/content/08-cost-controls/01-configure-slurm-accounting.md @@ -0,0 +1,84 @@ +--- +title: "a. Configure Slurm Accounting and Prerequisites" +weight: 81 +--- + +In this section, you will modify the cluster created in Lab I to enable Slurm Accounting resource limits. + +### Modify Cluster Configuration for Cost Controls + +{{% notice note %}} +Make sure that you are in your Cloud9 terminal to start this lab. To access Cloud9, please refer to the instructions under *Connect to AWS Cloud9 Instance* in the **[Access Cloud9 Environment](/00-overview/03-access-cloud9)** section. +{{% /notice %}} + +#### 1. Open Terminal. 
+ +Load the terminal used to maintain your AWS ParallelCluster clusters. In earlier labs, **Cloud9** was used; if following a workshop, go back to your **Cloud9 terminal**. If you have closed the Cloud9 terminal, go back to the [Cloud9 console](https://eu-north-1.console.aws.amazon.com/cloud9control/home?region=eu-north-1#/) and re-open the terminal using the instructions found at [Access Cloud9 Environment](/00-overview/03-access-cloud9). + +#### 2. Enable Resource Limits on Slurm. +The cost control solution requires that you apply a CPU minutes Resource Limit to the cluster's Slurm scheduler. +[Resource Limits](https://slurm.schedmd.com/resource_limits.html) are used in Slurm to restrict job execution after a resource (CPU, RAM, etc.) usage limit has been reached. + +Run the command below to apply the `PriorityType` and `AccountingStorageEnforce` settings to the cluster configuration file. `yq` is used to automate the update of the YAML cluster configuration file. + +```bash +yq -i '(.Scheduling.SlurmSettings.CustomSlurmSettings[0].PriorityType="priority/multifactor") | + (.Scheduling.SlurmSettings.CustomSlurmSettings[1].AccountingStorageEnforce="limits")' \ + ~/environment/cluster-config.yaml +``` + +{{% notice note %}} +If you receive an error `bash: yq: command not found` it can be installed on the Cloud9 instance with the command `pip3 install yq`. +{{% /notice %}} + +{{< detail-tag "**[Optional Information-Click here for more]** Additional details about the Slurm customizations" >}} +The [Slurm Priority Multifactor Plugin](https://slurm.schedmd.com/priority_multifactor.html#intro) provides advanced Slurm queue management features and the Trackable Resource (TRES) scheduling factor is required to apply the GrpTRESMins limit. 
The [AccountingStorageEnforce](https://slurm.schedmd.com/slurm.conf.html#OPT_AccountingStorageEnforce) limits +setting is required to enforce limits on job submissions and prevent jobs from running that have exceeded the defined resource limits. +{{< /detail-tag >}} + +{{% notice note %}} +For ParallelCluster versions >= 3.6.0, you can define slurm.conf customizations as part of an AWS +ParallelCluster configuration. See instructions [here](https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-configuration-settings-v3.html). +{{% /notice %}} + +#### 3. Grant head node access to the [AWS Price List service](https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/price-changes.html). +Run the following command to update the cluster configuration so that an additional IAM policy that grants access to the AWS Price List service is applied to the head node. + +```bash +yq -i '(.HeadNode.Iam.AdditionalIamPolicies[1].Policy="arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess")' \ + ~/environment/cluster-config.yaml +``` + +{{< detail-tag "**[Optional Information-Click here for more]** Additional details about the need for the Price List service" >}} +The cost control Python script uses the AWS Price List service to determine the per hour cost of the cluster's EC2 compute nodes. +The Python script will query the AWS Price List service using the AWS Python SDK; however, the AWS Price List service API requires +the request to contain appropriate IAM credentials to access the Price List API service. EC2 instances can assume an IAM role, +called an [EC2 instance profile](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#ec2-instance-profile), +so that code/processes running on the instance have access to AWS IAM credentials. 
+{{< /detail-tag >}} + +{{% notice note %}} +You can define additional IAM policies for both your head and compute nodes by using the "AdditionalIamPolicies" option within your ParallelCluster configuration file. See details [here](https://docs.aws.amazon.com/parallelcluster/latest/ug/iam-roles-in-parallelcluster-v3.html#iam-roles-in-parallelcluster-v3-cluster-config-additionaliampolicies) +{{% /notice %}} + +### Apply Changes to Cluster + +#### 1. Update the cluster. +You have modified the configuration file in the previous steps for the required changes. However, these changes won't be applied until the cluster is updated. The `pcluster update-cluster` command below applies the changes in the configuration file to the cluster using the AWS CloudFormation service. + +```bash +source ~/environment/env_vars +pcluster update-cluster -n hpc --region ${AWS_REGION} -c ~/environment/cluster-config.yaml +``` + +#### 2. Wait for the cluster to be updated. +You can check the cluster update status using the `pcluster describe-cluster` command below. + +```bash +pcluster describe-cluster -n hpc --query clusterStatus --region ${AWS_REGION} +``` + +The cluster update will take **about 3 minutes**. You will know the cluster update is complete when you see an **UPDATE_COMPLETE** status. + +You have successfully updated the cluster with the required configuration changes. In the next section, you will +create cost controls on the cluster using Slurm Accounting and Resource Limits. \ No newline at end of file diff --git a/content/08-cost-controls/02-create-cost-controls.md b/content/08-cost-controls/02-create-cost-controls.md new file mode 100644 index 00000000..b77526a3 --- /dev/null +++ b/content/08-cost-controls/02-create-cost-controls.md @@ -0,0 +1,105 @@ +--- +title: "b. Create Cost Controls" +weight: 82 +--- + +In this section, you will implement a resource limit on the cluster using Slurm Accounting. 
[Slurm Resource Limits](https://slurm.schedmd.com/resource_limits.html) are used to enforce limits on the amount of resources that can be consumed. Our objective is to enforce a budget threshold in dollars; however, Slurm Accounting does not have a mechanism for understanding cloud costs associated with compute nodes. Therefore, you will convert a dollar budget to CPUMins group trackable resource minutes (GrpTRESMins) by using the AWS Price List API. You will then apply limits at the pcdefault Slurm account level for CPU Minutes in this lab. + +#### 1. Get your SSH key name. +Run the command below in your **Cloud9 terminal** to retrieve the name of the SSH key. + +```bash +source ~/environment/env_vars +echo ${SSH_KEY_NAME} +``` + +#### 2. Log in to the head node. +In this section, you will use the `pcluster ssh` command to connect to the head node from your AWS Cloud9 terminal. + +```bash +pcluster ssh -n hpc --region ${AWS_REGION} -i ~/.ssh/${SSH_KEY_NAME} +``` + +#### 3. Apply budget to cluster. +Upload the [attached Python File](/scripts/create_cluster_cost_controls.py) to a shared directory on the cluster such as the `/shared` directory. + +This script takes a single integer parameter that represents the US dollar budget limit that you would like to apply to the cluster. When executed, the script converts the budget into the number of CPU minutes that the compute nodes in the cluster can use. + +Install the boto3 module, which is a dependency of the Python script. + +```bash +pip3 install boto3 +``` + +[Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/index.html) is the AWS SDK for Python. +The `create_cluster_cost_controls.py` script will use the boto3 module to get AWS credentials and interact with AWS service APIs. 
+ +Execute the python script to apply a budget of *$1000* to the cluster using the command below: + +```bash +python3 create_cluster_cost_controls.py 1000 +``` + +The python script has taken in the $1000 budget, calculated the cost per minute per core for the compute node EC2 instance type of hpc6id.32xlarge, and determined a total number of minutes that compute nodes can run before reaching the $1000 budget. +The total minutes is then applied as a CPUMins resource limit in Slurm Accounting. + +{{< detail-tag "**[Optional Information-Click here for more]** Additional details about GrpTRESMins and CPUMins" >}} +GrpTRESMins (Group Trackable Resource Minutes) represents the total number of trackable resource minutes that can possibly be used by past, present, and future Slurm jobs running from an association and its children. If any limit is reached, all running jobs with that trackable resource in this group will be killed and no new jobs will be allowed to run. + +CPUMins (CPU Minutes) is a trackable resource representing the number of CPU minutes used by jobs. As an example, a node with 64 CPUs running for 2 minutes would result in 128 CPUMins. +{{< /detail-tag >}} + +#### 4. Verify that the budget is applied. +Use `sshare` to view the resource limit setting that you applied in the previous step. Note that the resource limit was applied at the cluster level so you must retrieve data for the overall *pcdefault* account in Slurm. + +```bash +sshare -u " " -A pcdefault -o account,user,GrpTRESMins,GrpTRESRaw +``` + +{{< detail-tag "**[Optional Information-Click here for more]** Additional details about sshare and associations" >}} +**sshare** is used with the Slurm Priority Multifactor plugin and Slurm Accounting to provide share information by association. 
This command is useful in that it allows you to view both the resource limits (CPU Minutes limit in this lab) and the association's usage against that limit (the *pcdefault* Slurm account is the association that you use in this lab). +[sshare documentation](https://slurm.schedmd.com/sshare.html) + +Slurm maintains a hierarchy of **association** entities that are used to group information: accounts, clusters, partitions, and users. +In this lab, you will focus on information at the *pcdefault* account level as this is the overarching account created by ParallelCluster. +[Slurm association documentation](https://slurm.schedmd.com/sacctmgr.html#OPT_association) +{{< /detail-tag >}} + +Sample Output: + +![sshare](/images/cost-controls/sshare_show_limit.png) + +Here you can see that a number of CPUMins (CPU minutes) has been applied as a limit to the overall cluster account, pcdefault. All of the GrpTRESRaw datapoints, which represent resource usage, are zero because you have not run any jobs since enabling Slurm resource limits. + +{{< detail-tag "**[Optional Information-Click here for more]** Where has this number for the limit come from?" >}} +The script we used to implement this limit takes as input the dollar budget, which in our case was 1000. +We then calculate the CPU minutes that this translates to with the following steps: +- Divide by the cost of the instance in the region ($6.0352 per hour) +- Multiply by 60 to switch from hours to minutes +- Multiply by 64, which is the number of CPUs per node +- Multiply by 0.9 to add a 10% safety factor in the calculation for other costs not related to compute + +This gives a total budget in CPUmins of 572640. +{{< /detail-tag >}} + +#### 5. Submit a new job. + +```bash +cd /fsx/OpenFOAM/motorBikeDemo/ +sbatch openfoam.sbatch +``` + +Now that resource limits are enabled, Slurm Accounting will begin to track resource usage. 
This configuration will be +tracking CPU Minutes (CPUMins) usage of the compute nodes in the cluster. + +#### 6. Wait for the job to complete. + +You can monitor the job state with `squeue`: + +```bash +squeue -i 5 +``` + +Wait for the job to complete - this will take **about 5 minutes**. You will know the job is complete once it disappears from the squeue output. Exit the infinite loop by doing `ctrl-c`. + +In the next section, you will test the behavior of Slurm Accounting resource limits. diff --git a/content/08-cost-controls/03-test-cost-controls.md b/content/08-cost-controls/03-test-cost-controls.md new file mode 100644 index 00000000..f0632e8f --- /dev/null +++ b/content/08-cost-controls/03-test-cost-controls.md @@ -0,0 +1,104 @@ +--- +title: "c. Test Cost Controls" +weight: 83 +--- + +In this section, we will query the Slurm accounting database, and then submit jobs to observe how the applied resource limits affect our job execution. + +#### 1. Observe job execution statistics. +Use the `sacct` command to observe the Slurm Accounting job execution statistics for the previously executed job: + +```bash +sacct --format=jobid,jobname,partition,account,state,elapsed,alloccpus,allocnodes,cputime --allocations --starttime now-2days +``` + +![sacct](/images/cost-controls/sacct.png) + +Note that you can see the runtime of the previously executed jobs as well as the number of nodes and CPUs that were +allocated to each job's execution. These values are used in Slurm Accounting to track against the CPUMins GrpTRESMins +limit. + +#### 2. Observe the current CPU resource limit. +Use the `sshare` command to observe the current GrpTRESMins CPU limit at the account level as well as how close the cluster is to approaching that limit. 
+ +```bash +sshare -u " " -A pcdefault -o account,user,GrpTRESMins,GrpTRESRaw +``` + +![sshare_usage](/images/cost-controls/sshare_show_usage.png) + +Because this limit is set at the cluster level, observe the output where `Account=pcdefault` +and `UserName=` as this provides the account-level data. The upper limit should be the same but +note that the current value may differ due to varying run times. + +In this case, we can see that job 2 ran for 1 minute and 22 seconds, which is 1.366 minutes. It ran on 2 nodes, which is 128 CPUs. Therefore, the consumed CPUmins (since reporting started) was 174, which is the reported value in this image. + +#### 3. Lower the budget limit. + +Re-run the cost control script but this time use an arbitrarily low budget threshold of *$.05*. + +```bash +python3 /shared/create_cluster_cost_controls.py .05 +``` + +You are doing this to force the resource limit to be reached so that you can observe Slurm's behavior. + +#### 4. Submit a new job. + +For testing we can submit a small test job. If you have run a small OpenFOAM compute job previously this can be run, alternatively the job in [Submit your first HPC job](/aws-hpc-tutorials/content/04-create-cluster/04-run-1stjob.md) will be a good test. + + ```bash +sbatch openfoam.sbatch + ``` + +#### 5. View job status. +Use the `squeue` command to view the status of the job that was just submitted. + ```bash +squeue + ``` + +![squeue_pending](/images/cost-controls/squeue_pending.png) + +Note how the job is stuck in the Pending state as denoted by ST=PD. You can also see that the reason for the job +being stuck in the pending state is `AssocGrpCPUMinutesLimit`, meaning that you have exceeded our CPUMins resource +limit threshold. Slurm will not allow new jobs to be executed until the resource limit is raised or reset. + +#### 6. Raise the budget limit. +Re-run the cluster cost controls python script to reset the applied budget back to *$1000*. 
+```bash +python3 /shared/create_cluster_cost_controls.py 1000 +``` +In the subsequent steps you will be able to see Slurm's behavior when the budget has been raised back. + +#### 7. Check the job status. +Note that Slurm will automatically start the job once the limit is reset, but it may take up to a couple of minutes before Slurm recognizes that the CPUMins limit has been increased and the job +executes. To save time, you will requeue the job. First, find the job_id using the `squeue` command. + ```bash +squeue + ``` + +#### 8. Requeue the job. +The `scontrol` command below re-queues job_id 3 but replace "3" with your job_id from the `squeue` command above in step 7. +```bash +scontrol requeue 3 +``` + +#### 9. Monitor the job state. + +```bash +squeue -i 5 +``` + +The job has been re-queued but it may take up to **3 minutes** for the job to start running due to a [BeginTime](https://slurm.schedmd.com/squeue.html#OPT_BeginTime) limitation. +When a Slurm job is re-queued, the re-queued job's begin time is moved forward a couple of minutes to ensure that the previous job is cleaned up before the re-queued job starts. + +![squeue_begintime](/images/cost-controls/squeue_begintime.png) + +Wait for the job to transition into the **running** state. You will know that the job is in the **running** state when you see the following, denoted by **ST=R**: + +![squeue_running](/images/cost-controls/squeue_running.png) + +#### 10. Exit the infinite loop by doing `ctrl-c`. + +You have learned how to apply Slurm Accounting resource limits to your cluster as a means to implement cost controls. +In the next section, you will learn how to visualize cost data in Amazon CloudWatch. \ No newline at end of file diff --git a/content/08-cost-controls/04-cloudwatch-monitoring.md b/content/08-cost-controls/04-cloudwatch-monitoring.md new file mode 100644 index 00000000..0b86d116 --- /dev/null +++ b/content/08-cost-controls/04-cloudwatch-monitoring.md @@ -0,0 +1,87 @@ +--- +title: "d. 
View Metrics in CloudWatch" +weight: 84 +--- + +ParallelCluster automatically creates a CloudWatch dashboard with metrics for each cluster. +In this section, you will add a custom CloudWatch metric to the existing dashboard. +You will do so using the AWS Python SDK to publish the custom CloudWatch metric, which captures the cost of the cluster's compute nodes over time. + +#### 1. Upload the python script. +Copy the [attached Python Script](/scripts/publish_cw_cost_metric.py) onto the cluster in a directory accessible by the headnode, such as `/shared/`. This python script calculates the total number of node minutes that have executed in the cluster, determines the cost per minute of the node instance type, and then publishes a total cost of the cluster's compute nodes as a CloudWatch metric where we can view the results in a dashboard. + +#### 2. Create a wrapper shell script. +The scheduled script needs to reference the settings in **/etc/profile** for it to work in cron scheduler. + +Therefore we will create a bash wrapper script in `/shared/` that sources the settings, and then runs the python script we uploaded above. + +```bash +cat > /shared/publish_cw_cost_metric.sh << EOF +#!/bin/bash + +source /etc/profile +python3 /shared/publish_cw_cost_metric.py +EOF +``` + +#### 3. Make the wrapper shell and python scripts executable: +```bash +chmod 755 /shared/publish_cw_cost_metric.sh +chmod 755 /shared/publish_cw_cost_metric.py +``` + +#### 4. Create a cron entry for the script. +Schedule the wrapper shell script to execute every minute so the dashboard is updated continuously using the command below: +```bash +cat > crontab_entry << EOF +* * * * * /shared/publish_cw_cost_metric.sh +EOF +crontab crontab_entry +``` + +#### 5. View the `cluster_cost` metric in CloudWatch. +In the AWS Management Console, navigate to **CloudWatch > All Metrics > ParallelCluster > ClusterName**. +Then select the checkbox for the metric named `cluster_cost` to plot it. Note that it can take up to 5 minutes for metrics to be displayed in CloudWatch. Please wait until you see the `cluster_cost` metric in the CloudWatch console, then select `cluster_cost`. 
+ +![CloudWatch Metric](/images/cost-controls/cloudwatch_metric.png) + +#### 6. Modify `cluster_cost` time period. +Select the **Graphed metrics** tab and modify the `cluster_cost` metric to show a period of one minute, as seen in the image below: + +![CloudWatch Period](/images/cost-controls/cloudwatch_period.png) +CloudWatch allows you to customize the way in which data is aggregated. In this lab, you have chosen a period of one minute because that is how frequently the custom metric data is published. + +#### 7. Add the cost graph to the dashboard. +The ParallelCluster CloudWatch dashboard is created as part of the standard ParallelCluster deployment for each cluster. You will add custom metric cost data to this dashboard to provide deeper insights into the overall health and cost of the cluster. + +Select **Actions** and choose **Add to Dashboard**. + +![CloudWatch Add to Dashboard](/images/cost-controls/cloudwatch_add_to_dashboard.png) + +Then choose the dashboard for your cluster and select **Add to Dashboard**. + +![CloudWatch Add to Dashboard2](/images/cost-controls/cloudwatch_add_to_dashboard_3.png) + +#### 8. View the ParallelCluster Dashboard. + +The cluster dashboard will now appear. Scroll to the bottom of the dashboard to view the `cluster_cost` graph. + +![CloudWatch Dashboard 1](/images/cost-controls/cloudwatch_dashboard_1.png) + +Move the `cluster_cost` graph to be under the "Cluster Health Metrics" section using the drag and drop functionality. +Hover your mouse over `cluster_cost` graph with your cursor, click, and drag the graph upwards to the "Cluster Health Metrics" section as seen in screenshot below: + +![CloudWatch Dashboard 2](/images/cost-controls/cloudwatch_dashboard_2.png) + +Your dashboard will now look like the image below, where you can visualize your cluster cost data alongside other relevant cluster metrics and logs. 
+ +![CloudWatch Dashboard Final](/images/cost-controls/cloudwatch_dashboard_final.png) + +You have successfully created a CloudWatch Dashboard to visualize the cluster cost using a custom CloudWatch Metric. Please continue on to the next section. \ No newline at end of file diff --git a/content/08-cost-controls/05-summary.md b/content/08-cost-controls/05-summary.md new file mode 100644 index 00000000..bdd6f433 --- /dev/null +++ b/content/08-cost-controls/05-summary.md @@ -0,0 +1,15 @@ +--- +title: "e. Summary" +date: 2022-04-10T10:46:30-04:00 +weight: 85 +tags: ["tutorial", "create", "ParallelCluster"] +--- + +Congratulations, you have completed the AWS ParallelCluster cost control lab! + +In this lab, you have: +- Configured Slurm Accounting Resource Limits in an HPC cluster. +- Converted a dollar budget to CPU minutes and applied that as a limit to the cluster. +- Published a custom CloudWatch metric for cluster cost and visualized the data in the Amazon CloudWatch ParallelCluster Dashboard. + +You can learn more about Slurm Accounting in AWS ParallelCluster by visiting the [documentation](https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-accounting-v3.html). diff --git a/content/08-cost-controls/_index.md b/content/08-cost-controls/_index.md new file mode 100644 index 00000000..ef0c23c9 --- /dev/null +++ b/content/08-cost-controls/_index.md @@ -0,0 +1,71 @@ +--- +title: "Cost controls" +date: 2019-01-24T09:05:54Z +weight: 80 +pre: "VIII ⁃ " +tags: ["HPC", "Introduction", "EC2", "Optional"] +--- + +{{% notice info %}}If you have not yet deployed a cluster, you can use the instructions for either the [ParallelCluster UI](02-deploy-pcm.html) or the [CLI](03-parallel-cluster-cli.html) to create one. Please note the cleanup instructions at the end and include cleanup of any accounting artefacts created within this chapter - particularly if working within your own account. 
+{{% /notice %}} + +![hpc_logo](/images/hpc-aws-parallelcluster-workshop/aws-parallelclusterlogo.png) + +In this lab, you will use Slurm Accounting within AWS ParallelCluster to create cost controls and monitor costs at the cluster level. + +This lab includes the following steps: + +1. Configure Slurm Accounting and prerequisites +2. Create cost controls using Slurm Accounting resource limits +3. Test cost controls by running sample jobs +4. View cluster cost data in Amazon CloudWatch + +### Design Overview + +Cloud HPC resources have ephemeral components, such as compute nodes, that only incur cost to the user when the resources are running. Users of cloud HPC are able to leverage the cloud's variable-cost model to cost optimize their clusters. However, the cloud's "pay for what you use" cost model can present challenges for HPC user groups that are used to operating in a fixed-cost model. In this lab, you will implement a solution that tracks +the cluster's compute node costs and prevents jobs from running if a budget threshold is reached. + +The cost control solution contains the following steps: + + 1. Execute the cost conversion python script with the desired budget (in US dollars) as an input parameter. + 2. Convert the input budget to compute node CPU minutes using the AWS Price List API. + 3. Apply CPU minutes as a Slurm Accounting resource limit. + 4. Prevent Slurm job execution if the CPU minutes resource limit is reached. + +Separately, you will publish cost metric data to Amazon CloudWatch to provide cost visibility in the ParallelCluster CloudWatch dashboard. + +You will create the below architecture during this lab: + +![Lab4 Architecture Diagram](/images/cost-controls/lab4_arch_diagram.png) + +The architecture presented in this lab has the following components: +- **Slurm Accounting** collects accounting information for each job and job step executed. 
[Slurm Accounting](https://slurm.schedmd.com/accounting.html#Overview) tracks resources and allows users to apply resource limits through a configuration called Group Trackable +Resource Minutes (GrpTRESMins). In this lab, you will use Slurm Accounting to track compute node usage within ParallelCluster and translate usage to cost. +- **Amazon Aurora** is a MySQL- and PostgreSQL-compatible database built for the cloud. If taking this lab as part of a workshop, an [Amazon Aurora](https://aws.amazon.com/rds/aurora/) MySQL-compatible database has been pre-provisioned within your AWS sandbox account and associated with your Slurm instance via AWS ParallelCluster configuration for use with Slurm Accounting. If this is being done outside a workshop event, a database can be deployed in US East 1 Region by clicking on this button: +{{% button href="https://us-east-1.console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/create/review?stackName=pcluster-slurm-db&templateURL=https://us-east-1-aws-parallelcluster.s3.amazonaws.com/templates/1-click/serverless-database.yaml" icon="fas fa-rocket" %}}Deploy Amazon Aurora database{{% /button %}} +- **AWS Price List API** provides a catalog of the products and prices for AWS services. The [AWS Price List API](https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/price-changes.html) is used in this lab to look up the dollar cost of EC2 compute node resources used by ParallelCluster. +- **Cost Conversion Python script** to define a cost limit in dollars, convert dollars to CPU minutes (CPUMins) using the AWS Price List API, and apply the CPUMins value as a GrpTRESMins limit within Slurm Accounting. +- **Cost Data CloudWatch Python script** to publish cluster cost data to the [Amazon CloudWatch](https://aws.amazon.com/cloudwatch/) service. This script extracts the total cluster compute usage from Slurm +Accounting, converts usage into dollars via Price List API, and publishes the cost as a custom metric to CloudWatch. 
+- **Amazon CloudWatch** collects and visualizes real-time logs, metrics, and event data in dashboards to streamline your infrastructure and application maintenance. +In this lab, you will use the [Amazon CloudWatch ParallelCluster Dashboard](https://docs.aws.amazon.com/parallelcluster/latest/ug/cloudwatch-dashboard-v3.html) that is created during ParallelCluster cluster creation. + +### Considerations and Limitations +The solution presented in this lab demonstrates the art of the possible for AWS ParallelCluster cost controls. +This solution, provided as-is under the MIT license, is **not intended for use in production environments**. +Those intending to use the concepts presented in this lab in their own environments are encouraged to build upon the capabilities +demonstrated in this lab to meet their own requirements. + +Capabilities that are not currently supported: +- multiple Slurm partitions (multiple ParallelCluster compute queues) +- varied compute pricing models: Spot, On-Demand, Reserved Instances, Savings Plans +- incorporating "always-on" costs including, but not limited to: head node, shared storage, networking +- handling partially used or idle compute nodes +- time-based (monthly, weekly, etc.) budgets + + +### Documentation/Links +Some supplemental documentation is available in the AWS Docs that were used to build and develop this solution: +- [Slurm accounting with AWS ParallelCluster](https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-accounting-v3.html) guides users through the processes to carry out SLURM accounting; including the changes brought with AWS ParallelCluster 3.10.0 where support was included for accounting with an external Slurmdbd. +- [Creating a cluster with an external Slurmdbd accounting](https://docs.aws.amazon.com/parallelcluster/latest/ug/external-slurmdb-accounting.html) is documentation detailing the external Slurmdbd approach. This includes a CloudFormation process to create a Slurmdbd stack. 
+- [AWS ParallelCluster tutorial for Creating a cluster with Slurm accounting](https://docs.aws.amazon.com/parallelcluster/latest/ug/tutorials_07_slurm-accounting-v3.html) \ No newline at end of file diff --git a/content/09-ml-on-parallelcluster/00-upload-training-data.md b/content/09-ml-on-parallelcluster/00-upload-training-data.md index e0de9cb4..df48bdda 100644 --- a/content/09-ml-on-parallelcluster/00-upload-training-data.md +++ b/content/09-ml-on-parallelcluster/00-upload-training-data.md @@ -1,7 +1,7 @@ --- title : "a. Upload training data to S3" date: 2020-09-04T15:58:58Z -weight : 5 +weight : 91 tags : ["configuration", "S3", "Conda", "data", "nccl", "efa"] --- diff --git a/content/09-ml-on-parallelcluster/01-create-ml-cluster.md b/content/09-ml-on-parallelcluster/01-create-ml-cluster.md index 150b76dc..ef63dc75 100644 --- a/content/09-ml-on-parallelcluster/01-create-ml-cluster.md +++ b/content/09-ml-on-parallelcluster/01-create-ml-cluster.md @@ -1,7 +1,7 @@ --- title : "b. Create a distributed ML cluster" date: 2020-09-04T15:58:58Z -weight : 10 +weight : 92 tags : ["configuration", "ML", "ParallelCluster", "create", "cluster"] --- diff --git a/content/09-ml-on-parallelcluster/02-data-preprocessing.md b/content/09-ml-on-parallelcluster/02-data-preprocessing.md index ed57be9a..112b8a6a 100644 --- a/content/09-ml-on-parallelcluster/02-data-preprocessing.md +++ b/content/09-ml-on-parallelcluster/02-data-preprocessing.md @@ -1,7 +1,7 @@ --- title : "c. Run single node data preprocessing with Slurm" date: 2020-09-04T15:58:58Z -weight : 20 +weight : 93 tags : ["preprocessing", "data", "ML", "srun", "slurm"] --- diff --git a/content/09-ml-on-parallelcluster/03-distributed-data-parallel.md b/content/09-ml-on-parallelcluster/03-distributed-data-parallel.md index 71140d2a..4e0c0aa5 100644 --- a/content/09-ml-on-parallelcluster/03-distributed-data-parallel.md +++ b/content/09-ml-on-parallelcluster/03-distributed-data-parallel.md @@ -1,7 +1,7 @@ --- title : "d. 
Run PyTorch Data Parallel training on ParallelCluster" date: 2020-09-04T15:58:58Z -weight : 30 +weight : 94 tags : ["training", "data parallel", "ML", "sbatch", "slurm", "multi node", "multi gpu"] --- diff --git a/content/09-ml-on-parallelcluster/04-delete-ml-cluster.md b/content/09-ml-on-parallelcluster/04-delete-ml-cluster.md index edfa3c91..ad1a8592 100644 --- a/content/09-ml-on-parallelcluster/04-delete-ml-cluster.md +++ b/content/09-ml-on-parallelcluster/04-delete-ml-cluster.md @@ -1,7 +1,7 @@ --- title : "e. Delete Distributed ML Cluster" date: 2020-09-04T15:58:58Z -weight : 40 +weight : 95 tags : ["cleanup", "parallelcluster", "ML"] --- diff --git a/content/09-ml-on-parallelcluster/_index.md b/content/09-ml-on-parallelcluster/_index.md index b5144e10..03c4d2b7 100644 --- a/content/09-ml-on-parallelcluster/_index.md +++ b/content/09-ml-on-parallelcluster/_index.md @@ -1,7 +1,7 @@ --- title: "Distributed Machine Learning" date: 2020-09-04T15:58:58Z -weight: 500 +weight: 90 pre: "IX ⁃ " tags: ["Machine Learning", "ML", "ParallelCluster", "EFA", "FSx", "Slurm"] --- diff --git a/content/authors.md b/content/authors.md index 79c5b003..a93ebc15 100644 --- a/content/authors.md +++ b/content/authors.md @@ -13,6 +13,7 @@ The AWS HPC Workshops website has been developed by: - **Karthik Raman** - DCV content - **Fabio Nonato de Paula, Ph.D.** - Distributed ML content - **Sean Smith** - Labs, content and site build +- **Max Starr, Ph.D.** - Labs & content #### Additional Authors diff --git a/layouts/shortcodes/detail-tag.html b/layouts/shortcodes/detail-tag.html new file mode 100644 index 00000000..dbfda987 --- /dev/null +++ b/layouts/shortcodes/detail-tag.html @@ -0,0 +1,4 @@ +
<details> + <summary>{{ (.Get 0) | markdownify }}</summary> + {{ .Inner | markdownify }} +</details>
\ No newline at end of file diff --git a/static/images/cost-controls/cloudwatch_add_to_dashboard.png b/static/images/cost-controls/cloudwatch_add_to_dashboard.png new file mode 100644 index 00000000..d992e744 Binary files /dev/null and b/static/images/cost-controls/cloudwatch_add_to_dashboard.png differ diff --git a/static/images/cost-controls/cloudwatch_add_to_dashboard_3.png b/static/images/cost-controls/cloudwatch_add_to_dashboard_3.png new file mode 100644 index 00000000..c4094003 Binary files /dev/null and b/static/images/cost-controls/cloudwatch_add_to_dashboard_3.png differ diff --git a/static/images/cost-controls/cloudwatch_dashboard_1.png b/static/images/cost-controls/cloudwatch_dashboard_1.png new file mode 100644 index 00000000..7c0c2ecc Binary files /dev/null and b/static/images/cost-controls/cloudwatch_dashboard_1.png differ diff --git a/static/images/cost-controls/cloudwatch_dashboard_2.png b/static/images/cost-controls/cloudwatch_dashboard_2.png new file mode 100644 index 00000000..8953b4cb Binary files /dev/null and b/static/images/cost-controls/cloudwatch_dashboard_2.png differ diff --git a/static/images/cost-controls/cloudwatch_dashboard_final.png b/static/images/cost-controls/cloudwatch_dashboard_final.png new file mode 100644 index 00000000..5ba713ae Binary files /dev/null and b/static/images/cost-controls/cloudwatch_dashboard_final.png differ diff --git a/static/images/cost-controls/cloudwatch_metric.png b/static/images/cost-controls/cloudwatch_metric.png new file mode 100644 index 00000000..877fa843 Binary files /dev/null and b/static/images/cost-controls/cloudwatch_metric.png differ diff --git a/static/images/cost-controls/cloudwatch_period.png b/static/images/cost-controls/cloudwatch_period.png new file mode 100644 index 00000000..f5b1e4d1 Binary files /dev/null and b/static/images/cost-controls/cloudwatch_period.png differ diff --git a/static/images/cost-controls/lab4_arch_diagram.png b/static/images/cost-controls/lab4_arch_diagram.png new 
file mode 100644 index 00000000..bc786a0d Binary files /dev/null and b/static/images/cost-controls/lab4_arch_diagram.png differ diff --git a/static/images/cost-controls/sacct.png b/static/images/cost-controls/sacct.png new file mode 100644 index 00000000..39f310ad Binary files /dev/null and b/static/images/cost-controls/sacct.png differ diff --git a/static/images/cost-controls/squeue_begintime.png b/static/images/cost-controls/squeue_begintime.png new file mode 100644 index 00000000..0476dd50 Binary files /dev/null and b/static/images/cost-controls/squeue_begintime.png differ diff --git a/static/images/cost-controls/squeue_pending.png b/static/images/cost-controls/squeue_pending.png new file mode 100644 index 00000000..ada1eb03 Binary files /dev/null and b/static/images/cost-controls/squeue_pending.png differ diff --git a/static/images/cost-controls/squeue_running.png b/static/images/cost-controls/squeue_running.png new file mode 100644 index 00000000..27ef9e51 Binary files /dev/null and b/static/images/cost-controls/squeue_running.png differ diff --git a/static/images/cost-controls/sshare_show_limit.png b/static/images/cost-controls/sshare_show_limit.png new file mode 100644 index 00000000..85052ad4 Binary files /dev/null and b/static/images/cost-controls/sshare_show_limit.png differ diff --git a/static/images/cost-controls/sshare_show_usage.png b/static/images/cost-controls/sshare_show_usage.png new file mode 100644 index 00000000..a6bb45b4 Binary files /dev/null and b/static/images/cost-controls/sshare_show_usage.png differ diff --git a/static/scripts/create_cluster_cost_controls.py b/static/scripts/create_cluster_cost_controls.py new file mode 100644 index 00000000..35328889 --- /dev/null +++ b/static/scripts/create_cluster_cost_controls.py @@ -0,0 +1,123 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""Convert a US-dollar budget into a Slurm CPU-minute limit.

Usage: ``python3 create_cluster_cost_controls.py <budget in US dollars>``

The script reads the compute instance type from the ParallelCluster fleet
configuration, looks up its On-Demand Linux price through the AWS Price List
API, converts the budget into CPU minutes and applies that number as the
``GrpTRESMins=cpu=...`` limit of the ``pcdefault`` Slurm account.
"""

import json
import os
import subprocess
import sys
from decimal import Decimal, InvalidOperation

# File the original script reads the compute instance type from.
FLEET_CONFIG_PATH = '/etc/parallelcluster/slurm_plugin/fleet-config.json'
# Region the pricing client itself is created in (unchanged from the original).
PRICING_API_REGION = 'us-east-1'
# Region whose prices were hard-coded in the original script; kept as fallback.
FALLBACK_REGION_CODE = 'eu-north-1'
# Slurm account that receives the limit.
SLURM_ACCOUNT = 'pcdefault'
# Built from a string so the factor is exactly 0.9 (Decimal(.9) inherits the
# binary floating-point error of the float literal).
DEFAULT_BUDGET_PADDING_FACTOR = Decimal('0.9')


def _default_region_code():
    """Return the region to price instances in.

    Uses ``AWS_DEFAULT_REGION`` or ``AWS_REGION`` when set and otherwise the
    region the original script hard-coded.
    NOTE(review): assumes these variables, when present on the head node,
    name the cluster's own region - confirm for your environment.
    """
    return (os.environ.get('AWS_DEFAULT_REGION')
            or os.environ.get('AWS_REGION')
            or FALLBACK_REGION_CODE)


def _pricing_client():
    """Create the Price List client on demand.

    boto3 is imported here rather than at module level so that the pure
    helpers in this file can be imported without the AWS SDK installed and
    without creating a client as an import side effect.
    """
    import boto3
    boto3.compat.filter_python_deprecation_warnings()
    return boto3.client('pricing', region_name=PRICING_API_REGION)


def _term_match(field, value):
    """Build one TERM_MATCH filter for ``get_products``."""
    return {'Field': field, 'Type': 'TERM_MATCH', 'Value': value}


def find_by_key(data, target):
    """Yield every non-dict value stored under ``target`` in nested dict ``data``.

    Dict values are always descended into (never yielded themselves); lists
    are not searched.
    """
    for key, value in data.items():
        if isinstance(value, dict):
            yield from find_by_key(value, target)
        elif key == target:
            yield value


def get_compute_type(config_path=FLEET_CONFIG_PATH):
    """Return the first instance type listed in the fleet configuration.

    Raises:
        LookupError: if the file contains no non-empty ``Instances`` list
            (the original silently returned ``None`` in that case).
    """
    with open(config_path, 'r', encoding='utf-8') as config_file:
        data = json.load(config_file)

    for instances in find_by_key(data, 'Instances'):
        if instances:
            return instances[0]['InstanceType']

    raise LookupError('No instance type found in {0}'.format(config_path))


def get_instance_type_pricing(instance_type, region_code=None, client=None):
    """Return the On-Demand price of ``instance_type`` per vCPU per minute.

    Args:
        instance_type: EC2 instance type, e.g. ``'hpc6id.32xlarge'``.
        region_code: region to price in; defaults to the environment's region
            (see ``_default_region_code``).
        client: optional pre-built boto3 ``pricing`` client.

    Raises:
        LookupError: if the Price List API returns no usable On-Demand price.
    """
    if region_code is None:
        region_code = _default_region_code()
    if client is None:
        client = _pricing_client()

    response = client.get_products(
        ServiceCode='AmazonEC2',
        Filters=[
            _term_match('instanceType', instance_type),
            _term_match('regionCode', region_code),
            _term_match('operatingSystem', 'Linux'),
            _term_match('tenancy', 'shared'),
            # Without these two filters the query can also match capacity
            # reservation SKUs and Linux images bundled with paid software.
            _term_match('capacitystatus', 'Used'),
            _term_match('preInstalledSw', 'NA'),
        ],
    )

    product_pricing = None
    for item in response['PriceList']:
        product = json.loads(item)
        if 'BoxUsage' in product['product']['attributes']['usagetype']:
            product_pricing = product

    if product_pricing is None:
        raise LookupError(
            'No On-Demand price found for {0} in {1}'.format(instance_type, region_code))

    hourly_price = next(find_by_key(product_pricing['terms']['OnDemand'], 'USD'), None)
    if hourly_price is None:
        raise LookupError(
            'No USD price found for {0} in {1}'.format(instance_type, region_code))

    vcpu_count = int(product_pricing['product']['attributes']['vcpu'])

    # hourly instance price -> price of one vCPU for one minute
    return Decimal(hourly_price) / 60 / vcpu_count


def convert_budget_to_minutes(budget, price_per_minute,
                              budget_padding_factor=DEFAULT_BUDGET_PADDING_FACTOR):
    """Return the whole number of CPU minutes that ``budget`` dollars buy.

    Args:
        budget: budget in US dollars (anything ``Decimal`` accepts).
        price_per_minute: ``Decimal`` price of one CPU minute.
        budget_padding_factor: fraction of the budget that is turned into the
            limit; ``0.9`` means 90% of the budget is used, leaving 10% of
            headroom for costs that are not compute.
    """
    padding = Decimal(str(budget_padding_factor))
    return int((Decimal(budget) / price_per_minute) * padding)


def apply_grpstresmins(minutes):
    """Set ``minutes`` as the CPU GrpTRESMins limit of the ``pcdefault`` account.

    Raises:
        Exception: if ``sacctmgr`` cannot be started or exits non-zero.
    """
    # Argument list instead of a shell string: nothing is interpreted by a shell.
    command = [
        'sacctmgr', 'modify', 'account', SLURM_ACCOUNT, 'set',
        'GrpTRESMins=cpu={0}'.format(int(minutes)), '-i',
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as err:
        raise Exception('Unable to apply GrpTRESMins via sacctmgr') from err
    if completed.returncode != 0:
        raise Exception('Unable to apply GrpTRESMins via sacctmgr')


def main(argv=None):
    """Run the budget-to-limit conversion; return a process exit code."""
    argv = sys.argv[1:] if argv is None else argv
    usage = 'usage: create_cluster_cost_controls.py <budget in US dollars>'

    if len(argv) != 1:
        print(usage, file=sys.stderr)
        return 2
    try:
        budget = Decimal(argv[0])
    except InvalidOperation:
        print(usage, file=sys.stderr)
        return 2
    if not budget.is_finite() or budget < 0:
        print(usage, file=sys.stderr)
        return 2

    # get instance type
    instance_type = get_compute_type()

    # query pricelist API
    price_per_minute = get_instance_type_pricing(instance_type)

    # convert price to minutes
    total_mins = convert_budget_to_minutes(budget, price_per_minute)

    # apply grptresmins
    apply_grpstresmins(total_mins)

    print('successfully applied {0} minute limit to sacctmgr'.format(total_mins))
    return 0


if __name__ == '__main__':
    sys.exit(main())

# ---------------------------------------------------------------------------
# Patch header of the next file, which shared this line in the original text:
# diff --git a/static/scripts/publish_cw_cost_metric.py b/static/scripts/publish_cw_cost_metric.py
# new file mode 100644 index 00000000..cc654c5a --- /dev/null +++ b/static/scripts/publish_cw_cost_metric.py @@ -0,0 +1,135 @@
# Copyright Amazon.com, Inc.
# or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""Publish an estimate of the last week's cluster compute cost to CloudWatch.

The script sums node-minutes from ``sacct --json`` for the past seven days,
multiplies them by the On-Demand per-minute price of the cluster's compute
instance type and publishes the result as the ``cluster_cost`` metric in the
``ParallelCluster`` CloudWatch namespace.
"""

import json
import subprocess
from decimal import Decimal

import boto3

boto3.compat.filter_python_deprecation_warnings()

# Fleet configuration written on the cluster; the instance type is read from it.
FLEET_CONFIG_PATH = '/etc/parallelcluster/slurm_plugin/fleet-config.json'

# Region used both for the price lookup and for the CloudWatch client.
DEFAULT_REGION_CODE = 'eu-north-1'

# Value of the ClusterName dimension attached to the published metric.
CLUSTER_NAME = 'hpc'

SECONDS_PER_MINUTE = 60

# The pricing endpoint region is independent of the region being priced; the
# priced region is selected by the regionCode filter below.
pricing_client = boto3.client('pricing', region_name='us-east-1')


def find_by_key(data, target):
    """Yield the values stored under ``target`` anywhere in a nested dict.

    Dict values are always descended into (even when their own key equals
    ``target``); only non-dict values are yielded. Lists are not searched.

    Args:
        data: A (possibly nested) dictionary.
        target: The key to look for.

    Yields:
        Each non-dict value whose key equals ``target``, in traversal order.
    """
    for key, value in data.items():
        if isinstance(value, dict):
            yield from find_by_key(value, target)
        elif key == target:
            yield value


def get_compute_type():
    """Return the instance type of the first ``Instances`` entry in the fleet config.

    Returns:
        The ``InstanceType`` string of the first instance listed under the
        first ``Instances`` key found in the fleet configuration.

    Raises:
        LookupError: If the configuration contains no ``Instances`` key.
            (Previously the function silently returned ``None`` here.)
    """
    with open(FLEET_CONFIG_PATH, 'r') as f:
        data = json.load(f)

    for instances in find_by_key(data, 'Instances'):
        return instances[0]['InstanceType']

    raise LookupError(
        'No "Instances" entry found in {0}'.format(FLEET_CONFIG_PATH))


def calculate_node_mins(sacct_output):
    """Sum node-minutes over all jobs in parsed ``sacct --json`` output.

    Jobs without any steps are skipped. For every other job the elapsed time
    is multiplied by the count of its allocated ``node`` TRES (0 when the job
    has no such entry).

    Args:
        sacct_output: The decoded JSON document produced by ``sacct --json``.

    Returns:
        Total node-minutes as a ``Decimal``.
    """
    node_minutes = Decimal(0)

    for job in sacct_output['jobs']:
        if not job['steps']:
            continue

        # sacct's JSON reports time.elapsed in seconds, so it has to be
        # converted before being priced per minute; the old code used the raw
        # value and overstated cost by a factor of 60.
        elapsed_seconds = job['time']['elapsed']

        node_count = 0
        for tres in job['tres']['allocated']:
            if tres['type'] == 'node':
                node_count = tres['count']

        node_minutes += Decimal(elapsed_seconds) * node_count / SECONDS_PER_MINUTE

    return node_minutes


def get_instance_type_pricing(instance_type, region_code=DEFAULT_REGION_CODE):
    """Return the On-Demand price per instance per minute for an instance type.

    Args:
        instance_type: EC2 instance type, e.g. the value from the fleet config.
        region_code: Region whose price list is queried.

    Returns:
        A ``Decimal``: the USD price found in the product's On-Demand terms
        divided by 60 (the price is treated as hourly).

    Raises:
        LookupError: If no matching ``BoxUsage`` product or no USD price is
            returned by the Price List API.
    """
    response = pricing_client.get_products(
        ServiceCode='AmazonEC2',
        Filters=[
            {'Field': 'instanceType', 'Type': 'TERM_MATCH', 'Value': instance_type},
            {'Field': 'regionCode', 'Type': 'TERM_MATCH', 'Value': region_code},
            {'Field': 'operatingSystem', 'Type': 'TERM_MATCH', 'Value': 'Linux'},
            {'Field': 'tenancy', 'Type': 'TERM_MATCH', 'Value': 'shared'},
            # Without this filter Linux products that bundle licensed software
            # also match, and which one gets picked depends on response order.
            {'Field': 'preInstalledSw', 'Type': 'TERM_MATCH', 'Value': 'NA'},
        ],
    )

    # Keep the last BoxUsage product, as the script has always done.
    product_pricing = None
    for item in response['PriceList']:
        json_item = json.loads(item)
        if 'BoxUsage' in json_item['product']['attributes']['usagetype']:
            product_pricing = json_item

    if product_pricing is None:
        raise LookupError(
            'No On-Demand BoxUsage price found for {0} in {1}'.format(
                instance_type, region_code))

    hourly_price = next(
        find_by_key(product_pricing['terms']['OnDemand'], 'USD'), None)
    if hourly_price is None:
        raise LookupError(
            'No USD price found for {0} in {1}'.format(instance_type, region_code))

    return Decimal(hourly_price) / 60


def main():
    """Compute the past week's compute cost and publish it to CloudWatch."""
    # sacct to get job statistics (one week of data); an argument list avoids
    # spawning a shell for a fixed command.
    output = subprocess.check_output(
        ['sacct', '--starttime', 'now-7days', '--json'])

    json_output = json.loads(output)
    node_mins = calculate_node_mins(json_output)

    # get instance type
    instance_type = get_compute_type()

    # query pricelist API
    price_per_minute = get_instance_type_pricing(instance_type)

    compute_budget_total = Decimal(price_per_minute) * Decimal(node_mins)

    print('total cost= {0}'.format(str(compute_budget_total)))

    cw_client = boto3.client('cloudwatch', region_name=DEFAULT_REGION_CODE)

    cw_client.put_metric_data(
        Namespace='ParallelCluster',
        MetricData=[
            {
                'MetricName': 'cluster_cost',
                'Dimensions': [
                    {
                        'Name': 'ClusterName',
                        'Value': CLUSTER_NAME,
                    },
                ],
                # CloudWatch metric values are floating point numbers.
                'Value': float(compute_budget_total),
            },
        ],
    )


if __name__ == '__main__':
    main()