From 54395e4979981dd27eb3ee2f131f99583fe1f170 Mon Sep 17 00:00:00 2001 From: Kun Wu <11816012+K-Wu@users.noreply.github.com> Date: Fri, 21 Apr 2023 01:17:54 -0500 Subject: [PATCH] dev ncu profiling for sgemm lab --- labs/sgemm-regtiled-coarsened/README.md | 3 +++ labs/sgemm-regtiled-coarsened/rai_build.yml | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/labs/sgemm-regtiled-coarsened/README.md b/labs/sgemm-regtiled-coarsened/README.md index 877c3f8..2a9ac5d 100644 --- a/labs/sgemm-regtiled-coarsened/README.md +++ b/labs/sgemm-regtiled-coarsened/README.md @@ -41,3 +41,6 @@ Your last RAI submission will be used for grading. Be sure that it passes all tests for full points (you may still lose points for bugs not exposed during testing). +## Use Nsight Compute to Profile Your Code + +Remove the `#` character in the last command in rai_build.yml to enable profiling. Note that you have to submit to the rai_amd64_exclusive queue by adding `--queue=rai_amd64_exclusive` to your RAI invocation. 
\ No newline at end of file diff --git a/labs/sgemm-regtiled-coarsened/rai_build.yml b/labs/sgemm-regtiled-coarsened/rai_build.yml index d40c529..d9abb36 100644 --- a/labs/sgemm-regtiled-coarsened/rai_build.yml +++ b/labs/sgemm-regtiled-coarsened/rai_build.yml @@ -1,7 +1,7 @@ # lab: Sgemm rai: version: 0.2 - image: raiproject/pumps2018:amd64-cuda100 + image: tonywukun/pumps2018:amd64-cuda100-nsighttest resources: cpu: architecture: amd64 @@ -14,4 +14,5 @@ commands: - cmake /src - make - ./sgemm -a + # - /usr/local/NVIDIA-Nsight-Compute/ncu --export "sgemm" --force-overwrite --target-processes all --kernel-name-base function --kernel-name regex:mysgemm --launch-skip-before-match 0 --section ComputeWorkloadAnalysis --section InstructionStats --section LaunchStats --section MemoryWorkloadAnalysis --section MemoryWorkloadAnalysis_Chart --section MemoryWorkloadAnalysis_Tables --section Occupancy --section SchedulerStats --section SourceCounters --section SpeedOfLight --section SpeedOfLight_HierarchicalSingleRooflineChart --section WarpStateStats --sampling-interval auto --sampling-max-passes 5 --sampling-buffer-size 33554432 --profile-from-start 1 --cache-control all --clock-control base ./sgemm -a