From d8f0d7e580c60f733a7c3fdf7e2cc7c457a24899 Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 31 Oct 2024 14:55:24 +0000
Subject: [PATCH 01/16] Update generate_final_report.py

---
 tools/submission/generate_final_report.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py
index ba2c368cdf..79d9fe0767 100644
--- a/tools/submission/generate_final_report.py
+++ b/tools/submission/generate_final_report.py
@@ -79,7 +79,7 @@ def main():
     df["p#"] = df.apply(lambda x: int(x["host_processors_per_node"]), axis=1)
 
     # details url
-    base_url = f"https://github.com/mlcommons/{args.repository}/tree/main"
+    base_url = f"https://github.com/{args.repository_owner}/{args.repository}/tree/{args.repository_branch}"
     df["Details"] = df.apply(
         lambda x: '=HYPERLINK("{}","details")'.format(
             "/".join(

From 6b1a0f87f46288d7b4b487f89e18f3151422694c Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 7 Nov 2024 11:54:48 +0000
Subject: [PATCH 02/16] Fix sdxl (#1911)

* Fix typo in fid_score.py, fail_safe for SDXL short runs

* [Automated Commit] Format Codebase

* Fix typo in fid_score.py, fail_safe for SDXL short runs

* Fix dlrmv2 reference implementation | Update run_local.sh
---
 recommendation/dlrm_v2/pytorch/run_local.sh |  4 +++-
 text_to_image/coco.py                       | 24 ++++++++++++---------
 text_to_image/tools/fid/fid_score.py        |  2 +-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/recommendation/dlrm_v2/pytorch/run_local.sh b/recommendation/dlrm_v2/pytorch/run_local.sh
index 0d054c6c45..3bc8ec667c 100755
--- a/recommendation/dlrm_v2/pytorch/run_local.sh
+++ b/recommendation/dlrm_v2/pytorch/run_local.sh
@@ -2,7 +2,9 @@
 
 source ./run_common.sh
 
-common_opt="--mlperf_conf ../../../mlperf.conf"
+#mlperf.conf is now automatically loaded by loadgen
+#common_opt="--mlperf_conf ../../../mlperf.conf"
+
 OUTPUT_DIR=`pwd`/output/$name
 if [ ! -d $OUTPUT_DIR ]; then
     mkdir -p $OUTPUT_DIR
diff --git a/text_to_image/coco.py b/text_to_image/coco.py
index cb3956a014..e9499b0e6c 100644
--- a/text_to_image/coco.py
+++ b/text_to_image/coco.py
@@ -176,20 +176,24 @@ def __call__(self, results, ids, expected=None, result_dict=None):
     def save_images(self, ids, ds):
         info = []
         idx = {}
-        for i, id in enumerate(self.content_ids):
-            if id in ids:
-                idx[id] = i
+        for i, image_id in enumerate(self.content_ids):
+            if image_id in ids:
+                idx[image_id] = i
         if not os.path.exists("images/"):
             os.makedirs("images/", exist_ok=True)
-        for id in ids:
-            caption = ds.get_caption(id)
-            generated = Image.fromarray(self.results[idx[id]])
-            image_path_tmp = f"images/{self.content_ids[idx[id]]}.png"
+        for image_id in ids:
+            if image_id not in idx:  # idx.get() would also skip position 0
+                print(
+                    f"image id {image_id} is missing in the results. Hence not saved.")
+                continue
+            caption = ds.get_caption(image_id)
+            generated = Image.fromarray(self.results[idx[image_id]])
+            image_path_tmp = f"images/{self.content_ids[idx[image_id]]}.png"
             generated.save(image_path_tmp)
-            info.append((self.content_ids[idx[id]], caption))
+            info.append((self.content_ids[idx[image_id]], caption))
         with open("images/captions.txt", "w+") as f:
-            for id, caption in info:
-                f.write(f"{id}  {caption}\n")
+            for image_id, caption in info:
+                f.write(f"{image_id}  {caption}\n")
 
     def start(self):
         self.results = []
diff --git a/text_to_image/tools/fid/fid_score.py b/text_to_image/tools/fid/fid_score.py
index febc12ff5d..8e486c8b7a 100644
--- a/text_to_image/tools/fid/fid_score.py
+++ b/text_to_image/tools/fid/fid_score.py
@@ -44,7 +44,7 @@
 import pathlib
 import os
 import sys
-sys.path.insert("..", 0)
+sys.path.insert(0, "..")
 from inception import InceptionV3  # noqa: E402
 
 

From a4ba51fb2244f2efc703c341b13411676297e299 Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 7 Nov 2024 20:07:07 +0530
Subject: [PATCH 03/16] Fixes for filtering invalid results

---
 tools/submission/preprocess_submission.py | 28 +++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
index 1e26b81ca4..9d44b91d63 100644
--- a/tools/submission/preprocess_submission.py
+++ b/tools/submission/preprocess_submission.py
@@ -141,6 +141,22 @@ def change_folder_name_in_path(path, old_folder_name, new_folder_name):
     new_path = os.path.join(*path_parts)
     return new_path
 
+def clean_model_dir(model_results_dir):
+    model_measurements_dir = change_folder_name_in_path(model_results_dir, "results", "measurements")
+    model_compliance_dir = change_folder_name_in_path(model_results_dir, "results", "compliance")
+
+    print(f"rmtree {model_results_dir}")
+    shutil.rmtree(model_results_dir)
+    shutil.rmtree(model_measurements_dir)
+    shutil.rmtree(model_compliance_dir)
+    sut_results_dir = os.path.dirname(model_results_dir)
+    if not os.listdir(sut_results_dir):
+        #clean sut dir
+        sut = os.path.basename(sut_results_dir)
+        print(f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}")
+        shutil.rmtree(sut_results_dir)
+        shutil.rmtree(os.path.dirname(model_measurements_dir))
+        shutil.rmtree(os.path.dirname(model_compliance_dir))
 
 def clean_invalid_results(args, log_path, config, system_desc, system_json,
                           model, mlperf_model, division, system_id_json, is_closed_or_network):
@@ -176,6 +192,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
         except Exception as e:
             log.warning(e)
             perf_is_valid = False
+        compliance_is_valid = False
         if perf_is_valid:
             power_path = os.path.join(scenario_path, "performance", "power")
             has_power = os.path.exists(power_path)
@@ -260,9 +277,11 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
                     # if only accuracy or compliance failed, result is valid
                     # for open
                     if not perf_is_valid:
-                        shutil.rmtree(scenario_path)
                         log.warning(
                             f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} and open divisions. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Removing it...")
+                        shutil.rmtree(scenario_path)
+                        scenario_measurements_path = change_folder_name_in_path(scenario_path, "results", "measurements")
+                        shutil.rmtree(scenario_measurements_path)
                     if not os.path.exists(target_results_path):
                         shutil.copytree(
                             model_results_path, target_results_path)
@@ -288,9 +307,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
                         log.warning(f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} division. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Compliance: {compliance_is_valid}. Moving other scenario results of {model} to open...")
                 else:
                     log.warning(f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} division. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Removing all dependent scenario results...")
-                shutil.rmtree(model_results_path)
-                shutil.rmtree(model_measurements_path)
-                shutil.rmtree(model_compliance_path)
+                clean_model_dir(model_results_path)
             else:  # delete this result
                 # delete other scenario results too
                 shutil.rmtree(scenario_path)
@@ -517,6 +534,9 @@ def main():
 
     infer_scenario_results(args, config)
 
+    if not args.nodelete_empty_dirs:
+        delete_empty_dirs(os.path.join(src_dir))
+
     return 0
 
 

From 451b310ef42a28e015ce5abc4e43ba6033ff8d4a Mon Sep 17 00:00:00 2001
From: arjunsuresh <arjunsuresh@users.noreply.github.com>
Date: Thu, 7 Nov 2024 14:44:42 +0000
Subject: [PATCH 04/16] [Automated Commit] Format Codebase

---
 tools/submission/preprocess_submission.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
index 9d44b91d63..7803cf5684 100644
--- a/tools/submission/preprocess_submission.py
+++ b/tools/submission/preprocess_submission.py
@@ -141,9 +141,12 @@ def change_folder_name_in_path(path, old_folder_name, new_folder_name):
     new_path = os.path.join(*path_parts)
     return new_path
 
+
 def clean_model_dir(model_results_dir):
-    model_measurements_dir = change_folder_name_in_path(model_results_dir, "results", "measurements")
-    model_compliance_dir = change_folder_name_in_path(model_results_dir, "results", "compliance")
+    model_measurements_dir = change_folder_name_in_path(
+        model_results_dir, "results", "measurements")
+    model_compliance_dir = change_folder_name_in_path(
+        model_results_dir, "results", "compliance")
 
     print(f"rmtree {model_results_dir}")
     shutil.rmtree(model_results_dir)
@@ -151,13 +154,15 @@ def clean_model_dir(model_results_dir):
     shutil.rmtree(model_compliance_dir)
     sut_results_dir = os.path.dirname(model_results_dir)
     if not os.listdir(sut_results_dir):
-        #clean sut dir
+        # clean sut dir
         sut = os.path.basename(sut_results_dir)
-        print(f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}")
+        print(
+            f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}")
         shutil.rmtree(sut_results_dir)
         shutil.rmtree(os.path.dirname(model_measurements_dir))
         shutil.rmtree(os.path.dirname(model_compliance_dir))
 
+
 def clean_invalid_results(args, log_path, config, system_desc, system_json,
                           model, mlperf_model, division, system_id_json, is_closed_or_network):
     # cleanup invalid results
@@ -280,7 +285,8 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json,
                         log.warning(
                             f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} and open divisions. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Removing it...")
                         shutil.rmtree(scenario_path)
-                        scenario_measurements_path = change_folder_name_in_path(scenario_path, "results", "measurements")
+                        scenario_measurements_path = change_folder_name_in_path(
+                            scenario_path, "results", "measurements")
                         shutil.rmtree(scenario_measurements_path)
                     if not os.path.exists(target_results_path):
                         shutil.copytree(

From 4c109ea8b5b17d0c422d4b8a08a55070142c68ae Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 7 Nov 2024 14:58:42 +0000
Subject: [PATCH 05/16] Update preprocess_submission.py

---
 tools/submission/preprocess_submission.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
index 7803cf5684..a1678c79d0 100644
--- a/tools/submission/preprocess_submission.py
+++ b/tools/submission/preprocess_submission.py
@@ -2,10 +2,6 @@
 Tool to infer scenario results and cleanup submission tree
 """
 
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
 import argparse
 import logging
 import os
@@ -156,7 +152,7 @@ def clean_model_dir(model_results_dir):
     if not os.listdir(sut_results_dir):
         # clean sut dir
         sut = os.path.basename(sut_results_dir)
-        print(
+        log.info(
             f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}")
         shutil.rmtree(sut_results_dir)
         shutil.rmtree(os.path.dirname(model_measurements_dir))

From 40c1fe0c28364b243b5944b3569000611ddf2b7d Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 7 Nov 2024 21:20:52 +0530
Subject: [PATCH 06/16] Added an option to pass in sample_ids.txt for SDXL
 accuracy check

---
 text_to_image/tools/accuracy_coco.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py
index 2d7c36506d..8740ee1726 100644
--- a/text_to_image/tools/accuracy_coco.py
+++ b/text_to_image/tools/accuracy_coco.py
@@ -51,6 +51,10 @@ def get_args():
         required=False,
         help="path to dump 10 stable diffusion xl compliance images",
     )
+    #Do not use for official MLPerf inference submissions as only the default one is valid
+    parser.add_argument(
+        "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')"
+    )
     parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"])
     parser.add_argument(
         "--low_memory",
@@ -97,8 +101,9 @@ def main():
             os.makedirs(args.compliance_images_path)
         dump_compliance_images = True
         compliance_images_idx_list = []
+        sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(os.path.dirname(__file__), "sample_ids.txt")
         with open(
-            os.path.join(os.path.dirname(__file__), "sample_ids.txt"), "r"
+            os.path.join(sample_ids_file_path, "r"
         ) as compliance_id_file:
             for line in compliance_id_file:
                 idx = int(line.strip())

From 89a2ffe257bc8c4c0d8e81cb5c1fec4e15080b2a Mon Sep 17 00:00:00 2001
From: arjunsuresh <arjunsuresh@users.noreply.github.com>
Date: Thu, 7 Nov 2024 15:51:36 +0000
Subject: [PATCH 07/16] [Automated Commit] Format Codebase

---
 text_to_image/tools/accuracy_coco.py | 88 ++++++++++++++--------------
 1 file changed, 45 insertions(+), 43 deletions(-)

diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py
index 8740ee1726..bc3f87d04b 100644
--- a/text_to_image/tools/accuracy_coco.py
+++ b/text_to_image/tools/accuracy_coco.py
@@ -51,7 +51,8 @@ def get_args():
         required=False,
         help="path to dump 10 stable diffusion xl compliance images",
     )
-    #Do not use for official MLPerf inference submissions as only the default one is valid
+    # Do not use for official MLPerf inference submissions as only the default
+    # one is valid
     parser.add_argument(
         "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')"
     )
@@ -101,12 +102,13 @@ def main():
             os.makedirs(args.compliance_images_path)
         dump_compliance_images = True
         compliance_images_idx_list = []
-        sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(os.path.dirname(__file__), "sample_ids.txt")
+        sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(
+            os.path.dirname(__file__), "sample_ids.txt")
         with open(
             os.path.join(sample_ids_file_path, "r"
         ) as compliance_id_file:
             for line in compliance_id_file:
-                idx = int(line.strip())
+                idx=int(line.strip())
                 compliance_images_idx_list.append(idx)
         # Dump caption.txt
         with open(
@@ -153,28 +155,28 @@ def compute_accuracy(
     statistics_path,
 ):
     # Load torchmetrics modules
-    clip = CLIPEncoder(device=device)
-    clip_scores = []
-    seen = set()
-    result_list = []
-    result_dict = {}
+    clip=CLIPEncoder(device=device)
+    clip_scores=[]
+    seen=set()
+    result_list=[]
+    result_dict={}
 
     # Load model outputs
     with open(mlperf_accuracy_file, "r") as f:
-        results = json.load(f)
+        results=json.load(f)
 
     for j in tqdm(results):
-        idx = j["qsl_idx"]
+        idx=j["qsl_idx"]
         if idx in seen:
             continue
         seen.add(idx)
 
         # Load generated image
-        generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
+        generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
             1024, 1024, 3
         )
         result_list.append(generated_img)
-        generated_img = Image.fromarray(generated_img)
+        generated_img=Image.fromarray(generated_img)
 
         # Dump compliance images
         if dump_compliance_images and idx in compliance_images_idx_list:
@@ -185,16 +187,16 @@ def compute_accuracy(
 
         # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device)
         # Load Ground Truth
-        caption = df_captions.iloc[idx]["caption"]
+        caption=df_captions.iloc[idx]["caption"]
         clip_scores.append(
             100 *
             clip.get_clip_score(
                 caption,
                 generated_img).item())
-    fid_score = compute_fid(result_list, statistics_path, device)
+    fid_score=compute_fid(result_list, statistics_path, device)
 
-    result_dict["FID_SCORE"] = fid_score
-    result_dict["CLIP_SCORE"] = np.mean(clip_scores)
+    result_dict["FID_SCORE"]=fid_score
+    result_dict["CLIP_SCORE"]=np.mean(clip_scores)
     print(f"Accuracy Results: {result_dict}")
 
     with open(output_file, "w") as fp:
@@ -216,43 +218,43 @@ def compute_accuracy_low_memory(
 ):
     if num_workers is None:
         try:
-            num_cpus = len(os.sched_getaffinity(0))
+            num_cpus=len(os.sched_getaffinity(0))
         except AttributeError:
             # os.sched_getaffinity is not available under Windows, use
             # os.cpu_count instead (which may not return the *available* number
             # of CPUs).
-            num_cpus = os.cpu_count()
+            num_cpus=os.cpu_count()
 
-        num_workers = min(num_cpus, 8) if num_cpus is not None else 0
+        num_workers=min(num_cpus, 8) if num_cpus is not None else 0
     else:
-        num_workers = num_workers
+        num_workers=num_workers
 
     # Load torchmetrics modules
-    block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims]
-    inception_model = InceptionV3([block_idx]).to(device)
-    clip_model = CLIPEncoder(device=device)
+    block_idx=InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims]
+    inception_model=InceptionV3([block_idx]).to(device)
+    clip_model=CLIPEncoder(device=device)
 
-    clip_scores = []
-    seen = set()
-    result_batch = []
-    result_dict = {}
-    activations = np.empty((0, inception_dims))
+    clip_scores=[]
+    seen=set()
+    result_batch=[]
+    result_dict={}
+    activations=np.empty((0, inception_dims))
 
     # Load model outputs
     with open(mlperf_accuracy_file, "r") as f:
-        results = ijson.items(f, "item")
+        results=ijson.items(f, "item")
 
         for j in tqdm(results):
-            idx = j["qsl_idx"]
+            idx=j["qsl_idx"]
             if idx in seen:
                 continue
             seen.add(idx)
 
             # Load generated image
-            generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
+            generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
                 1024, 1024, 3
             )
-            generated_img = Image.fromarray(generated_img)
+            generated_img=Image.fromarray(generated_img)
 
             # Dump compliance images
             if dump_compliance_images and idx in compliance_images_idx_list:
@@ -262,7 +264,7 @@ def compute_accuracy_low_memory(
                         f"{idx}.png"))
 
             # Load Ground Truth
-            caption = df_captions.iloc[idx]["caption"]
+            caption=df_captions.iloc[idx]["caption"]
             clip_scores.append(
                 100 * clip_model.get_clip_score(caption, generated_img).item()
             )
@@ -270,7 +272,7 @@ def compute_accuracy_low_memory(
             result_batch.append(generated_img.convert("RGB"))
 
             if len(result_batch) == batch_size:
-                act = get_activations(
+                act=get_activations(
                     result_batch,
                     inception_model,
                     batch_size,
@@ -278,12 +280,12 @@ def compute_accuracy_low_memory(
                     device,
                     num_workers,
                 )
-                activations = np.append(activations, act, axis=0)
+                activations=np.append(activations, act, axis=0)
                 result_batch.clear()
 
         # Remaining data for last batch
         if len(result_batch) > 0:
-            act = get_activations(
+            act=get_activations(
                 result_batch,
                 inception_model,
                 len(result_batch),
@@ -291,9 +293,9 @@ def compute_accuracy_low_memory(
                 device,
                 num_workers,
             )
-            activations = np.append(activations, act, axis=0)
+            activations=np.append(activations, act, axis=0)
 
-    m1, s1 = compute_statistics_of_path(
+    m1, s1=compute_statistics_of_path(
         statistics_path,
         inception_model,
         batch_size,
@@ -304,13 +306,13 @@ def compute_accuracy_low_memory(
         None,
     )
 
-    m2 = np.mean(activations, axis=0)
-    s2 = np.cov(activations, rowvar=False)
+    m2=np.mean(activations, axis=0)
+    s2=np.cov(activations, rowvar=False)
 
-    fid_score = calculate_frechet_distance(m1, s1, m2, s2)
+    fid_score=calculate_frechet_distance(m1, s1, m2, s2)
 
-    result_dict["FID_SCORE"] = fid_score
-    result_dict["CLIP_SCORE"] = np.mean(clip_scores)
+    result_dict["FID_SCORE"]=fid_score
+    result_dict["CLIP_SCORE"]=np.mean(clip_scores)
     print(f"Accuracy Results: {result_dict}")
 
     with open(output_file, "w") as fp:

From 69ffdc0aa783f9127af612a7de57c6329703c1dc Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 7 Nov 2024 20:19:13 +0000
Subject: [PATCH 08/16] Update accuracy_coco.py

---
 text_to_image/tools/accuracy_coco.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py
index bc3f87d04b..0d0c015607 100644
--- a/text_to_image/tools/accuracy_coco.py
+++ b/text_to_image/tools/accuracy_coco.py
@@ -105,7 +105,7 @@ def main():
         sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(
             os.path.dirname(__file__), "sample_ids.txt")
         with open(
-            os.path.join(sample_ids_file_path, "r"
+            os.path.join(sample_ids_file_path, "r")
         ) as compliance_id_file:
             for line in compliance_id_file:
                 idx=int(line.strip())

From d1d642e06f91e5b8f56088f8d1a4b127a65d962c Mon Sep 17 00:00:00 2001
From: arjunsuresh <arjunsuresh@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:19:47 +0000
Subject: [PATCH 09/16] [Automated Commit] Format Codebase

---
 text_to_image/tools/accuracy_coco.py | 82 ++++++++++++++--------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py
index 0d0c015607..d73325897b 100644
--- a/text_to_image/tools/accuracy_coco.py
+++ b/text_to_image/tools/accuracy_coco.py
@@ -108,7 +108,7 @@ def main():
             os.path.join(sample_ids_file_path, "r")
         ) as compliance_id_file:
             for line in compliance_id_file:
-                idx=int(line.strip())
+                idx = int(line.strip())
                 compliance_images_idx_list.append(idx)
         # Dump caption.txt
         with open(
@@ -155,28 +155,28 @@ def compute_accuracy(
     statistics_path,
 ):
     # Load torchmetrics modules
-    clip=CLIPEncoder(device=device)
-    clip_scores=[]
-    seen=set()
-    result_list=[]
-    result_dict={}
+    clip = CLIPEncoder(device=device)
+    clip_scores = []
+    seen = set()
+    result_list = []
+    result_dict = {}
 
     # Load model outputs
     with open(mlperf_accuracy_file, "r") as f:
-        results=json.load(f)
+        results = json.load(f)
 
     for j in tqdm(results):
-        idx=j["qsl_idx"]
+        idx = j["qsl_idx"]
         if idx in seen:
             continue
         seen.add(idx)
 
         # Load generated image
-        generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
+        generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
             1024, 1024, 3
         )
         result_list.append(generated_img)
-        generated_img=Image.fromarray(generated_img)
+        generated_img = Image.fromarray(generated_img)
 
         # Dump compliance images
         if dump_compliance_images and idx in compliance_images_idx_list:
@@ -187,16 +187,16 @@ def compute_accuracy(
 
         # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device)
         # Load Ground Truth
-        caption=df_captions.iloc[idx]["caption"]
+        caption = df_captions.iloc[idx]["caption"]
         clip_scores.append(
             100 *
             clip.get_clip_score(
                 caption,
                 generated_img).item())
-    fid_score=compute_fid(result_list, statistics_path, device)
+    fid_score = compute_fid(result_list, statistics_path, device)
 
-    result_dict["FID_SCORE"]=fid_score
-    result_dict["CLIP_SCORE"]=np.mean(clip_scores)
+    result_dict["FID_SCORE"] = fid_score
+    result_dict["CLIP_SCORE"] = np.mean(clip_scores)
     print(f"Accuracy Results: {result_dict}")
 
     with open(output_file, "w") as fp:
@@ -218,43 +218,43 @@ def compute_accuracy_low_memory(
 ):
     if num_workers is None:
         try:
-            num_cpus=len(os.sched_getaffinity(0))
+            num_cpus = len(os.sched_getaffinity(0))
         except AttributeError:
             # os.sched_getaffinity is not available under Windows, use
             # os.cpu_count instead (which may not return the *available* number
             # of CPUs).
-            num_cpus=os.cpu_count()
+            num_cpus = os.cpu_count()
 
-        num_workers=min(num_cpus, 8) if num_cpus is not None else 0
+        num_workers = min(num_cpus, 8) if num_cpus is not None else 0
     else:
-        num_workers=num_workers
+        num_workers = num_workers
 
     # Load torchmetrics modules
-    block_idx=InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims]
-    inception_model=InceptionV3([block_idx]).to(device)
-    clip_model=CLIPEncoder(device=device)
+    block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims]
+    inception_model = InceptionV3([block_idx]).to(device)
+    clip_model = CLIPEncoder(device=device)
 
-    clip_scores=[]
-    seen=set()
-    result_batch=[]
-    result_dict={}
-    activations=np.empty((0, inception_dims))
+    clip_scores = []
+    seen = set()
+    result_batch = []
+    result_dict = {}
+    activations = np.empty((0, inception_dims))
 
     # Load model outputs
     with open(mlperf_accuracy_file, "r") as f:
-        results=ijson.items(f, "item")
+        results = ijson.items(f, "item")
 
         for j in tqdm(results):
-            idx=j["qsl_idx"]
+            idx = j["qsl_idx"]
             if idx in seen:
                 continue
             seen.add(idx)
 
             # Load generated image
-            generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
+            generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape(
                 1024, 1024, 3
             )
-            generated_img=Image.fromarray(generated_img)
+            generated_img = Image.fromarray(generated_img)
 
             # Dump compliance images
             if dump_compliance_images and idx in compliance_images_idx_list:
@@ -264,7 +264,7 @@ def compute_accuracy_low_memory(
                         f"{idx}.png"))
 
             # Load Ground Truth
-            caption=df_captions.iloc[idx]["caption"]
+            caption = df_captions.iloc[idx]["caption"]
             clip_scores.append(
                 100 * clip_model.get_clip_score(caption, generated_img).item()
             )
@@ -272,7 +272,7 @@ def compute_accuracy_low_memory(
             result_batch.append(generated_img.convert("RGB"))
 
             if len(result_batch) == batch_size:
-                act=get_activations(
+                act = get_activations(
                     result_batch,
                     inception_model,
                     batch_size,
@@ -280,12 +280,12 @@ def compute_accuracy_low_memory(
                     device,
                     num_workers,
                 )
-                activations=np.append(activations, act, axis=0)
+                activations = np.append(activations, act, axis=0)
                 result_batch.clear()
 
         # Remaining data for last batch
         if len(result_batch) > 0:
-            act=get_activations(
+            act = get_activations(
                 result_batch,
                 inception_model,
                 len(result_batch),
@@ -293,9 +293,9 @@ def compute_accuracy_low_memory(
                 device,
                 num_workers,
             )
-            activations=np.append(activations, act, axis=0)
+            activations = np.append(activations, act, axis=0)
 
-    m1, s1=compute_statistics_of_path(
+    m1, s1 = compute_statistics_of_path(
         statistics_path,
         inception_model,
         batch_size,
@@ -306,13 +306,13 @@ def compute_accuracy_low_memory(
         None,
     )
 
-    m2=np.mean(activations, axis=0)
-    s2=np.cov(activations, rowvar=False)
+    m2 = np.mean(activations, axis=0)
+    s2 = np.cov(activations, rowvar=False)
 
-    fid_score=calculate_frechet_distance(m1, s1, m2, s2)
+    fid_score = calculate_frechet_distance(m1, s1, m2, s2)
 
-    result_dict["FID_SCORE"]=fid_score
-    result_dict["CLIP_SCORE"]=np.mean(clip_scores)
+    result_dict["FID_SCORE"] = fid_score
+    result_dict["CLIP_SCORE"] = np.mean(clip_scores)
     print(f"Accuracy Results: {result_dict}")
 
     with open(output_file, "w") as fp:

From 8d3b8ab09ac392b5a8656ad07d37fb8d7942595b Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Fri, 8 Nov 2024 03:44:42 +0530
Subject: [PATCH 10/16] Fix typo

---
 text_to_image/tools/accuracy_coco.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py
index d73325897b..42ef8efe34 100644
--- a/text_to_image/tools/accuracy_coco.py
+++ b/text_to_image/tools/accuracy_coco.py
@@ -105,7 +105,7 @@ def main():
         sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(
             os.path.dirname(__file__), "sample_ids.txt")
         with open(
-            os.path.join(sample_ids_file_path, "r")
+            sample_ids_file_path, "r"
         ) as compliance_id_file:
             for line in compliance_id_file:
                 idx = int(line.strip())

From b09b1efef4e5225d33618432cf71550ac135f501 Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Fri, 8 Nov 2024 15:47:17 +0530
Subject: [PATCH 11/16] Do not use a default for sample_ids.txt

---
 text_to_image/tools/accuracy_coco.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py
index 42ef8efe34..b5f1be3783 100644
--- a/text_to_image/tools/accuracy_coco.py
+++ b/text_to_image/tools/accuracy_coco.py
@@ -54,7 +54,7 @@ def get_args():
     # Do not use for official MLPerf inference submissions as only the default
     # one is valid
     parser.add_argument(
-        "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')"
+        "--ids-path", help="Path to 10 caption ids to dump as compliance images"
     )
     parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"])
     parser.add_argument(

From df5049d4dbec41862fef6dd7edf9fb064a779bd6 Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 14 Nov 2024 23:12:35 +0000
Subject: [PATCH 12/16] Update requirements.txt (#1907)

Updating the pip packages
---
 text_to_image/requirements.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/text_to_image/requirements.txt b/text_to_image/requirements.txt
index a0a8507731..857de950e8 100644
--- a/text_to_image/requirements.txt
+++ b/text_to_image/requirements.txt
@@ -1,8 +1,8 @@
-diffusers==0.21.2
-transformers==4.33.2
-accelerate==0.23.0
-open-clip-torch==2.7.0
-opencv-python==4.8.1.78
+diffusers==0.30.3
+transformers==4.45.2
+accelerate==1.0.1
+open-clip-torch==2.26.1
+opencv-python==4.10.0.84
 pycocotools==2.0.7
-torchmetrics[image]==1.2.0
-scipy==1.9.1
+torchmetrics[image]==1.4.3
+scipy==1.10.1

From a7e8c8ad2766e3fb64a31eb42c8cde724f7b055d Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Fri, 15 Nov 2024 16:46:53 +0530
Subject: [PATCH 13/16] Fix a bug in preprocess_submission

---
 tools/submission/preprocess_submission.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
index a1678c79d0..ec3aa1f7af 100644
--- a/tools/submission/preprocess_submission.py
+++ b/tools/submission/preprocess_submission.py
@@ -84,7 +84,6 @@ def delete_empty_dirs(src):
     """
     if not os.path.isdir(src):
         return False
-
     if all([delete_empty_dirs(os.path.join(src, file))
            for file in os.listdir(src)]):
         log.info("Removing empty dir: (%s)", src)
@@ -532,13 +531,16 @@ def main():
     if not args.nodelete_empty_dirs:
         delete_empty_dirs(os.path.join(src_dir))
 
+    run_dir = os.getcwd()
     os.chdir(src_dir)
 
     infer_scenario_results(args, config)
+    os.chdir(run_dir)
 
     if not args.nodelete_empty_dirs:
         delete_empty_dirs(os.path.join(src_dir))
 
+    
     return 0
 
 

From 8915a90ea0fed700afbffbc75908cd2fbf103104 Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Sat, 16 Nov 2024 22:04:18 +0000
Subject: [PATCH 14/16] Update submission_checker.py | Removed TEST05

---
 tools/submission/submission_checker.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 5f2e272673..deff9eb8c4 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -188,6 +188,7 @@
             "sample_index_rng_seed": 198141574272810017,
             "schedule_rng_seed": 7575108116881280410,
         },
+        # not required for v5.0+
         "test05_seeds": {
             # TODO: Update random seeds
             "qsl_rng_seed": 2376919268182438552,
@@ -2880,7 +2881,7 @@ def check_compliance_dir(
     compliance_perf_pass = True
     compliance_perf_dir_pass = True
     compliance_acc_pass = True
-    test_list = ["TEST01", "TEST04", "TEST05"]
+    test_list = ["TEST01", "TEST04"]
 
     if model in [
         "bert-99",
@@ -2899,7 +2900,7 @@ def check_compliance_dir(
     ]:
         test_list.remove("TEST04")
 
-    if model in [
+    if config.version in ["v4.0", "v4.1"] and model not in [
         "gptj-99",
         "gptj-99.9",
         "llama2-70b-99",
@@ -2907,7 +2908,7 @@ def check_compliance_dir(
         "stable-diffusion-xl",
         "mixtral-8x7b",
     ]:
-        test_list.remove("TEST05")
+        test_list.append("TEST05")
 
     if model in [
         "gptj-99",

From 941c0c484f4395e9b63611fbddafbde001a9de39 Mon Sep 17 00:00:00 2001
From: zixianwang2022 <goghelloworld@gmail.com>
Date: Sat, 16 Nov 2024 21:18:39 -0600
Subject: [PATCH 15/16] move changes to fork 4 pr

---
 text_to_image/backend_pytorch.py | 112 +++++++-------
 text_to_image/main.py            | 241 ++++++++++++++++++++++---------
 text_to_image/mlperf.conf        |  99 +++++++++++++
 3 files changed, 326 insertions(+), 126 deletions(-)
 create mode 100644 text_to_image/mlperf.conf

diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py
index 36e2b80090..f2af4d75c7 100644
--- a/text_to_image/backend_pytorch.py
+++ b/text_to_image/backend_pytorch.py
@@ -17,9 +17,9 @@ def __init__(
         model_id="xl",
         guidance=8,
         steps=20,
-        batch_size=1,
+        batch_size=2,
         device="cuda",
-        precision="fp32",
+        precision="fp16",
         negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude",
     ):
         super(BackendPytorch, self).__init__()
@@ -57,39 +57,41 @@ def image_format(self):
         return "NCHW"
 
     def load(self):
-        if self.model_path is None:
-            log.warning(
-                "Model path not provided, running with default hugging face weights\n"
-                "This may not be valid for official submissions"
-            )
-            self.scheduler = EulerDiscreteScheduler.from_pretrained(
-                self.model_id, subfolder="scheduler"
-            )
-            self.pipe = StableDiffusionXLPipeline.from_pretrained(
-                self.model_id,
-                scheduler=self.scheduler,
-                safety_checker=None,
-                add_watermarker=False,
-                variant="fp16" if (self.dtype == torch.float16) else None,
-                torch_dtype=self.dtype,
-            )
+        # if self.model_path is None:
+        #     log.warning(
+        #         "Model path not provided, running with default hugging face weights\n"
+        #         "This may not be valid for official submissions"
+        #     )
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            self.model_id, subfolder="scheduler"
+        )
+        self.pipe = StableDiffusionXLPipeline.from_pretrained(
+            self.model_id,
+            scheduler=self.scheduler,
+            safety_checker=None,
+            add_watermarker=False,
+            # variant="fp16" if (self.dtype == torch.float16) else None,
+            variant="fp16" ,
+            torch_dtype=self.dtype,
+        )
             # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True)
-        else:
-            self.scheduler = EulerDiscreteScheduler.from_pretrained(
-                os.path.join(self.model_path, "checkpoint_scheduler"),
-                subfolder="scheduler",
-            )
-            self.pipe = StableDiffusionXLPipeline.from_pretrained(
-                os.path.join(self.model_path, "checkpoint_pipe"),
-                scheduler=self.scheduler,
-                safety_checker=None,
-                add_watermarker=False,
-                torch_dtype=self.dtype,
-            )
+        # else:
+        #     self.scheduler = EulerDiscreteScheduler.from_pretrained(
+        #         os.path.join(self.model_path, "checkpoint_scheduler"),
+        #         subfolder="scheduler",
+        #     )
+        #     self.pipe = StableDiffusionXLPipeline.from_pretrained(
+        #         os.path.join(self.model_path, "checkpoint_pipe"),
+        #         scheduler=self.scheduler,
+        #         safety_checker=None,
+        #         add_watermarker=False,
+        #         variant="fp16" if (self.dtype == torch.float16) else None,
+        #         torch_dtype=self.dtype,
+        #     )
             # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True)
 
         self.pipe.to(self.device)
-        # self.pipe.set_progress_bar_config(disable=True)
+        #self.pipe.set_progress_bar_config(disable=True)
 
         self.negative_prompt_tokens = self.pipe.tokenizer(
             self.convert_prompt(self.negative_prompt, self.pipe.tokenizer),
@@ -210,15 +212,13 @@ def encode_tokens(
                     text_input_ids.to(device), output_hidden_states=True
                 )
 
-                # We are only ALWAYS interested in the pooled output of the
-                # final text encoder
+                # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
                     # "2" because SDXL always indexes from the penultimate layer.
-                    prompt_embeds = prompt_embeds.hidden_states[-(
-                        clip_skip + 2)]
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
 
@@ -234,8 +234,7 @@ def encode_tokens(
             and zero_out_negative_prompt
         ):
             negative_prompt_embeds = torch.zeros_like(prompt_embeds)
-            negative_pooled_prompt_embeds = torch.zeros_like(
-                pooled_prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
         elif do_classifier_free_guidance and negative_prompt_embeds is None:
             negative_prompt = negative_prompt or ""
             negative_prompt_2 = negative_prompt_2 or negative_prompt
@@ -262,35 +261,30 @@ def encode_tokens(
                     uncond_input.to(device),
                     output_hidden_states=True,
                 )
-                # We are only ALWAYS interested in the pooled output of the
-                # final text encoder
+                # We are only ALWAYS interested in the pooled output of the final text encoder
                 negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
 
                 negative_prompt_embeds_list.append(negative_prompt_embeds)
 
-            negative_prompt_embeds = torch.concat(
-                negative_prompt_embeds_list, dim=-1)
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
 
         if pipe.text_encoder_2 is not None:
             prompt_embeds = prompt_embeds.to(
                 dtype=pipe.text_encoder_2.dtype, device=device
             )
         else:
-            prompt_embeds = prompt_embeds.to(
-                dtype=pipe.unet.dtype, device=device)
+            prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device=device)
 
         bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps
-        # friendly method
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(
             bs_embed * num_images_per_prompt, seq_len, -1
         )
 
         if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per
-            # prompt, using mps friendly method
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
 
             if pipe.text_encoder_2 is not None:
@@ -322,7 +316,7 @@ def encode_tokens(
             pooled_prompt_embeds,
             negative_pooled_prompt_embeds,
         )
-
+    
     def prepare_inputs(self, inputs, i):
         if self.batch_size == 1:
             return self.encode_tokens(
@@ -337,7 +331,7 @@ def prepare_inputs(self, inputs, i):
             negative_prompt_embeds = []
             pooled_prompt_embeds = []
             negative_pooled_prompt_embeds = []
-            for prompt in inputs[i: min(i + self.batch_size, len(inputs))]:
+            for prompt in inputs[i:min(i+self.batch_size, len(inputs))]:
                 assert isinstance(prompt, dict)
                 text_input = prompt["input_tokens"]
                 text_input_2 = prompt["input_tokens_2"]
@@ -358,26 +352,19 @@ def prepare_inputs(self, inputs, i):
                 pooled_prompt_embeds.append(p_p_e)
                 negative_pooled_prompt_embeds.append(n_p_p_e)
 
+
             prompt_embeds = torch.cat(prompt_embeds)
             negative_prompt_embeds = torch.cat(negative_prompt_embeds)
             pooled_prompt_embeds = torch.cat(pooled_prompt_embeds)
-            negative_pooled_prompt_embeds = torch.cat(
-                negative_pooled_prompt_embeds)
-            return (
-                prompt_embeds,
-                negative_prompt_embeds,
-                pooled_prompt_embeds,
-                negative_pooled_prompt_embeds,
-            )
+            negative_pooled_prompt_embeds = torch.cat(negative_pooled_prompt_embeds)
+            return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
     def predict(self, inputs):
         images = []
         with torch.no_grad():
             for i in range(0, len(inputs), self.batch_size):
-                latents_input = [
-                    inputs[idx]["latents"]
-                    for idx in range(i, min(i + self.batch_size, len(inputs)))
-                ]
+                print (f'self.steps BEFORE pipe: {self.steps}')
+                latents_input = [inputs[idx]["latents"] for idx in range(i, min(i+self.batch_size, len(inputs)))]
                 latents_input = torch.cat(latents_input).to(self.device)
                 (
                     prompt_embeds,
@@ -392,8 +379,11 @@ def predict(self, inputs):
                     negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                     guidance_scale=self.guidance,
                     num_inference_steps=self.steps,
+                    # num_inference_steps=20,
                     output_type="pt",
                     latents=latents_input,
                 ).images
+                print (f'self.steps AFTER pipe: {self.steps}')
                 images.extend(generated)
         return images
+
diff --git a/text_to_image/main.py b/text_to_image/main.py
index 6aa7c15e75..7d4da2a0ba 100644
--- a/text_to_image/main.py
+++ b/text_to_image/main.py
@@ -24,6 +24,8 @@
 import dataset
 import coco
 
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("main")
 
@@ -73,34 +75,24 @@
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dataset",
-        choices=SUPPORTED_DATASETS.keys(),
-        help="dataset")
-    parser.add_argument(
-        "--dataset-path",
-        required=True,
-        help="path to the dataset")
+    parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset")
+    parser.add_argument("--dataset-path", required=True, help="path to the dataset")
     parser.add_argument(
         "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles"
     )
     parser.add_argument(
         "--scenario",
         default="SingleStream",
-        help="mlperf benchmark scenario, one of " +
-        str(list(SCENARIO_MAP.keys())),
+        help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())),
     )
     parser.add_argument(
         "--max-batchsize",
         type=int,
-        default=1,
+        default=2,
         help="max batch size in a single inference",
     )
     parser.add_argument("--threads", default=1, type=int, help="threads")
-    parser.add_argument(
-        "--accuracy",
-        action="store_true",
-        help="enable accuracy pass")
+    parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass")
     parser.add_argument(
         "--find-peak-performance",
         action="store_true",
@@ -111,10 +103,11 @@ def get_args():
     parser.add_argument("--output", default="output", help="test results")
     parser.add_argument("--qps", type=int, help="target qps")
     parser.add_argument("--model-path", help="Path to model weights")
+    parser.add_argument("--gpu-num", type=int, default=4, help="Number of gpus to run inference")
 
     parser.add_argument(
         "--dtype",
-        default="fp32",
+        default="fp16",
         choices=["fp32", "fp16", "bf16"],
         help="dtype of the model",
     )
@@ -131,6 +124,10 @@ def get_args():
         help="framework to load the latents",
     )
 
+    # file to use mlperf rules compliant parameters
+    parser.add_argument(
+        "--mlperf_conf", default="mlperf.conf", help="mlperf rules config"
+    )
     # file for user LoadGen settings such as target QPS
     parser.add_argument(
         "--user_conf",
@@ -145,20 +142,14 @@ def get_args():
     # pass this argument for official submission
     # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images")
     # do not modify this argument for official submission
-    parser.add_argument(
-        "--ids-path", help="Path to caption ids", default="tools/sample_ids.txt"
-    )
+    parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt")
 
-    # below will override mlperf rules compliant settings - don't use for
-    # official submission
+    # below will override mlperf rules compliant settings - don't use for official submission
     parser.add_argument("--time", type=int, help="time to scan in seconds")
     parser.add_argument("--count", type=int, help="dataset items to use")
     parser.add_argument("--debug", action="store_true", help="debug")
     parser.add_argument(
-        "--performance-sample-count",
-        type=int,
-        help="performance sample count",
-        default=5000,
+        "--performance-sample-count", type=int, help="performance sample count", default=5000
     )
     parser.add_argument(
         "--max-latency", type=float, help="mlperf max latency in pct tile"
@@ -271,9 +262,9 @@ def enqueue(self, query_samples):
         else:
             bs = self.max_batchsize
             for i in range(0, len(idx), bs):
-                data, label = self.ds.get_samples(idx[i: i + bs])
+                data, label = self.ds.get_samples(idx[i : i + bs])
                 self.run_one_item(
-                    Item(query_id[i: i + bs], idx[i: i + bs], data, label)
+                    Item(query_id[i : i + bs], idx[i : i + bs], data, label)
                 )
 
     def finish(self):
@@ -288,9 +279,7 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
         self.result_dict = {}
 
         for _ in range(self.threads):
-            worker = threading.Thread(
-                target=self.handle_tasks, args=(
-                    self.tasks,))
+            worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,))
             worker.daemon = True
             self.workers.append(worker)
             worker.start()
@@ -333,13 +322,29 @@ def main():
     log.info(args)
 
     # find backend
-    backend = get_backend(
-        args.backend,
-        precision=args.dtype,
-        device=args.device,
-        model_path=args.model_path,
-        batch_size=args.max_batchsize,
-    )
+    
+    # backend = get_backend(
+    #     args.backend,
+    #     precision=args.dtype,
+    #     device=args.device,
+    #     model_path=args.model_path,
+    #     batch_size=args.max_batchsize
+    # )
+    # Zixian: Oct 21: create a list of backends for multi-gpu
+    
+    # Zixian: Nov 13: Force max_batchsize=5 (overrides --max-batchsize) since command line doesn't work
+    args.max_batchsize = 5
+    
+    backends = [get_backend(
+                    args.backend,
+                    precision=args.dtype,
+                    device=f'cuda:{i}',
+                    model_path=args.model_path,
+                    batch_size=args.max_batchsize
+                ) 
+                for i in np.arange (args.gpu_num)]
+    
+    
     if args.dtype == "fp16":
         dtype = torch.float16
     elif args.dtype == "bf16":
@@ -355,7 +360,9 @@ def main():
         count_override = True
 
     # load model to backend
-    model = backend.load()
+    # model = backend.load()
+    # Zixian: Oct 21: create a list of models corresponding to each backend 
+    models = [backend.load() for backend in backends]
 
     # dataset to use
     dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
@@ -365,28 +372,37 @@ def main():
         pre_process=pre_proc,
         count=count,
         threads=args.threads,
-        pipe_tokenizer=model.pipe.tokenizer,
-        pipe_tokenizer_2=model.pipe.tokenizer_2,
+        # pipe_tokenizer=model.pipe.tokenizer,
+        # pipe_tokenizer_2=model.pipe.tokenizer_2,
+        pipe_tokenizer=models[0].pipe.tokenizer,
+        pipe_tokenizer_2=models[0].pipe.tokenizer_2,
         latent_dtype=dtype,
         latent_device=args.device,
         latent_framework=args.latent_framework,
         **kwargs,
     )
     final_results = {
-        "runtime": model.name(),
-        "version": model.version(),
+        # "runtime": model.name(),
+        # "version": model.version(),
+        "runtime": models[0].name(),
+        "version": models[0].version(),
         "time": int(time.time()),
         "args": vars(args),
         "cmdline": str(args),
     }
 
+    mlperf_conf = os.path.abspath(args.mlperf_conf)
+    if not os.path.exists(mlperf_conf):
+        log.error("{} not found".format(mlperf_conf))
+        sys.exit(1)
+
     user_conf = os.path.abspath(args.user_conf)
     if not os.path.exists(user_conf):
         log.error("{} not found".format(user_conf))
         sys.exit(1)
 
     audit_config = os.path.abspath(args.audit_conf)
-
+    
     if args.accuracy:
         ids_path = os.path.abspath(args.ids_path)
         with open(ids_path) as f:
@@ -405,16 +421,47 @@ def main():
     # warmup
     syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit"
     latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device)
-    warmup_samples = [
-        {
-            "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
-            "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
-            "latents": latents_pt,
-        }
-        for _ in range(args.max_batchsize)
-    ]
-    for i in range(5):
-        _ = backend.predict(warmup_samples)
+    # warmup_samples = [
+    #     {
+    #         "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
+    #         "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
+    #         "latents": latents_pt,
+    #     }
+    #     for _ in range(args.max_batchsize)
+    # ]
+    warmup_samples_gpus = [
+                    [
+                        {
+                            "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
+                            "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
+                            "latents": latents_pt,
+                        }
+                        for _ in range(int(args.max_batchsize))
+                    ]
+                    for model in models] * 3 # 3 times warmup samples
+    
+    # Zixian: Oct 21: warm up each backend 
+    # for idx, backend in enumerate (backends): 
+    #     for i in range(1):
+    #         _ = backend.predict(warmup_samples_gpus[idx])
+    
+    print (f'Start distributed warmup')
+    with ThreadPoolExecutor(max_workers=len(backends)) as executor:
+            # Submit each backend's warmup samples to its own worker thread
+            futures = {
+                executor.submit(backend.predict, queries): backend 
+                for backend, queries in zip(backends, warmup_samples_gpus)
+            }
+        
+            # Optionally process the results
+            for future in as_completed(futures):
+                backend = futures[future]
+                try:
+                    result = future.result()
+                    print(f'Warmup backend {backend} enqueued successfully.')
+                except Exception as exc:
+                    print(f'Warmup backend {backend} generated an exception: {exc}')
+
 
     scenario = SCENARIO_MAP[args.scenario]
     runner_map = {
@@ -423,12 +470,54 @@ def main():
         lg.TestScenario.Server: QueueRunner,
         lg.TestScenario.Offline: QueueRunner,
     }
-    runner = runner_map[scenario](
-        model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
-    )
-
+    
+    # Zixian: Oct 21: create a list of runner
+    # runner = runner_map[scenario](
+    #     model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
+    # )
+    runners = [runner_map[scenario](
+                                model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
+                            )
+                for model in models]
+
+    # def issue_queries(query_samples):
+    #     runner.enqueue(query_samples)
     def issue_queries(query_samples):
-        runner.enqueue(query_samples)
+        print (f'\n\n len (query_samples): {len (query_samples)} \n\n')
+        
+        query_samples_len = len (query_samples)
+        query_samples_seg_len = query_samples_len / len (runners)
+        splitted_query_samples = []
+        
+        
+        for idx in range (len (runners)): 
+            log.info (f'\n\n\n')
+            log.info (f'idx: {idx}')
+            log.info (f'query_samples_len: {query_samples_len}')
+            log.info (f'idx: {idx}')
+            # if idx == len (runners) -1: 
+            #     splitted_query_samples.append (query_samples[idx*query_samples_seg_len:])
+            # else:
+            #     splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len])
+            splitted_query_samples.append (query_samples [int(round(query_samples_seg_len * idx)): int(round(query_samples_seg_len * (idx + 1)))])
+        
+        # splitted_query_samples = [query_samples[int(round(avg * i)): int(round(avg * (i + 1)))] for i in range(b)]
+        
+        with ThreadPoolExecutor(max_workers=len(runners)) as executor:
+            # Map each runner to its respective sublist
+            futures = {
+                executor.submit(runner.enqueue, queries): runner 
+                for runner, queries in zip(runners, splitted_query_samples)
+            }
+        
+            # Optionally process the results
+            for future in as_completed(futures):
+                runner = futures[future]
+                try:
+                    result = future.result()
+                    print(f'Runner {runner} enqueued successfully.')
+                except Exception as exc:
+                    print(f'Runner {runner} generated an exception: {exc}')
 
     def flush_queries():
         pass
@@ -441,8 +530,7 @@ def flush_queries():
     log_settings.log_output = log_output_settings
 
     settings = lg.TestSettings()
-    # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
+    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
     settings.FromConfig(user_conf, args.model_name, args.scenario)
     if os.path.exists(audit_config):
         settings.FromConfig(audit_config, args.model_name, args.scenario)
@@ -458,6 +546,8 @@ def flush_queries():
         settings.min_duration_ms = args.time * MILLI_SEC
         settings.max_duration_ms = args.time * MILLI_SEC
 
+    # Zixian: Nov8: manually setting args.qps to 1
+    # args.qps=1.0
     if args.qps:
         qps = float(args.qps)
         settings.server_target_qps = qps
@@ -471,14 +561,15 @@ def flush_queries():
         settings.multi_stream_samples_per_query = args.samples_per_query
     if args.max_latency:
         settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
-        settings.multi_stream_expected_latency_ns = int(
-            args.max_latency * NANO_SEC)
+        settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC)
 
     performance_sample_count = (
         args.performance_sample_count
         if args.performance_sample_count
         else min(count, 500)
     )
+    
+    # count = 200
     sut = lg.ConstructSUT(issue_queries, flush_queries)
     qsl = lg.ConstructQSL(
         count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples
@@ -486,19 +577,39 @@ def flush_queries():
 
     log.info("starting {}".format(scenario))
     result_dict = {"scenario": str(scenario)}
-    runner.start_run(result_dict, args.accuracy)
+    for runner in runners: 
+        runner.start_run(result_dict, args.accuracy)
+    
+    # with ThreadPoolExecutor(max_workers=len(runners)) as executor:
+    #         # Map each runner to its respective sublist
+    #         futures = {
+    #             executor.submit(runner.finish(), (result_dict, args.accuracy)): runner 
+    #             for runner in runners 
+    #         }
+        
 
     lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config)
 
     if args.accuracy:
         post_proc.finalize(result_dict, ds, output_dir=args.output)
         final_results["accuracy_results"] = result_dict
-        post_proc.save_images(saved_images_ids, ds)
+        # post_proc.save_images(saved_images_ids, ds)
+
 
-    runner.finish()
     lg.DestroyQSL(qsl)
     lg.DestroySUT(sut)
 
+    # for runner in runners: 
+    #     runner.finish()
+    with ThreadPoolExecutor(max_workers=len(runners)) as executor:
+            # NOTE(review): runner.finish() is invoked eagerly here; submit() receives its return value, not the callable
+            futures = {
+                executor.submit(runner.finish()): runner 
+                for runner in runners 
+            }
+        
+        
+
     #
     # write final results
     #
diff --git a/text_to_image/mlperf.conf b/text_to_image/mlperf.conf
new file mode 100644
index 0000000000..0cea5351e1
--- /dev/null
+++ b/text_to_image/mlperf.conf
@@ -0,0 +1,99 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model may be '*' as a wildcard. In that case the value applies to all models.
+# All times are in milliseconds
+
+# Set performance_sample_count for each model.
+# User can optionally set this to higher values in user.conf.
+resnet50.*.performance_sample_count_override = 1024
+ssd-mobilenet.*.performance_sample_count_override = 256
+retinanet.*.performance_sample_count_override = 64
+bert.*.performance_sample_count_override = 10833
+dlrm.*.performance_sample_count_override = 204800
+dlrm-v2.*.performance_sample_count_override = 204800
+rnnt.*.performance_sample_count_override = 2513
+gptj.*.performance_sample_count_override = 13368
+llama2-70b.*.performance_sample_count_override = 24576
+stable-diffusion-xl.*.performance_sample_count_override = 5000
+# Set to 0 to use the entire sample set as the performance sample set
+3d-unet.*.performance_sample_count_override = 0
+
+# Set seeds. The seeds will be distributed two weeks before the submission.
+*.*.qsl_rng_seed = 3066443479025735752
+*.*.sample_index_rng_seed = 10688027786191513374
+*.*.schedule_rng_seed = 14962580496156340209
+# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
+*.*.test05_qsl_rng_seed = 16799458546791641818
+*.*.test05_sample_index_rng_seed = 5453809927556429288
+*.*.test05_schedule_rng_seed = 5435552105434836064
+
+
+*.SingleStream.target_latency_percentile = 90
+*.SingleStream.min_duration = 600000
+
+*.MultiStream.target_latency_percentile = 99
+*.MultiStream.samples_per_query = 8
+*.MultiStream.min_duration = 600000
+*.MultiStream.min_query_count = 662
+retinanet.MultiStream.target_latency = 528
+
+# 3D-UNet uses equal issue mode because it has non-uniform inputs
+3d-unet.*.sample_concatenate_permutation = 1
+
+# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenarios
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
+mixtral-8x7b.*.sample_concatenate_permutation = 1
+
+*.Server.target_latency = 10
+*.Server.target_latency_percentile = 99
+*.Server.target_duration = 0
+*.Server.min_duration = 600000
+resnet50.Server.target_latency = 15
+retinanet.Server.target_latency = 100
+bert.Server.target_latency = 130
+dlrm.Server.target_latency = 60
+dlrm-v2.Server.target_latency = 60
+rnnt.Server.target_latency = 1000
+gptj.Server.target_latency = 20000
+stable-diffusion-xl.Server.target_latency = 20000
+# The llama2-70b and mixtral-8x7b benchmarks measure token latencies
+llama2-70b.*.use_token_latencies = 1
+mixtral-8x7b.*.use_token_latencies = 1
+# gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
+# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7b benchmarks, therefore target_latency = 0
+llama2-70b.Server.target_latency = 0
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
+
+mixtral-8x7b.Server.target_latency = 0
+mixtral-8x7b.Server.ttft_latency = 2000
+mixtral-8x7b.Server.tpot_latency = 200
+
+*.Offline.target_latency_percentile = 90
+*.Offline.min_duration = 600000
+
+# In the Offline scenario, we always have one query, but LoadGen maps this to
+# min_sample_count internally. If the dataset size is larger than 24576, we
+# limit min_query_count to 24576; otherwise we use the dataset size as the
+# limit.
+
+resnet50.Offline.min_query_count = 24576
+retinanet.Offline.min_query_count = 24576
+dlrm-v2.Offline.min_query_count = 24576
+bert.Offline.min_query_count = 10833
+gptj.Offline.min_query_count = 13368
+rnnt.Offline.min_query_count = 2513
+3d-unet.Offline.min_query_count = 43
+stable-diffusion-xl.Offline.min_query_count = 4000
+llama2-70b.Offline.min_query_count = 24576
+mixtral-8x7b.Offline.min_query_count = 15000
+
+# These fields should be defined and overridden by user.conf.
+*.SingleStream.target_latency = 10
+*.MultiStream.target_latency = 80
+*.Server.target_qps = 1.0
+*.Offline.target_qps = 1.0
+

From dffdd592d5b191f7f23a45b0101490691fb5d9db Mon Sep 17 00:00:00 2001
From: zixianwang2022 <goghelloworld@gmail.com>
Date: Sat, 16 Nov 2024 21:27:21 -0600
Subject: [PATCH 16/16] update changes with fork 4 pr

---
 text_to_image/coco.py                  |  58 ++-
 text_to_image/main.py                  | 476 ++--------------------
 text_to_image/py_demo_server_lon.py    | 534 +++++++++++++++++++++++++
 text_to_image/sut_over_network_demo.py | 440 ++++++++++++++++++++
 4 files changed, 1034 insertions(+), 474 deletions(-)
 create mode 100644 text_to_image/py_demo_server_lon.py
 create mode 100644 text_to_image/sut_over_network_demo.py

diff --git a/text_to_image/coco.py b/text_to_image/coco.py
index e9499b0e6c..ac7f590f95 100644
--- a/text_to_image/coco.py
+++ b/text_to_image/coco.py
@@ -38,19 +38,23 @@ def __init__(
         **kwargs,
     ):
         super().__init__()
-        self.captions_df = pd.read_csv(
-            f"{data_path}/captions/captions.tsv", sep="\t")
+        self.captions_df = pd.read_csv(f"{data_path}/captions/captions.tsv", sep="\t")
         self.image_size = image_size
         self.preprocessed_dir = os.path.abspath(f"{data_path}/preprocessed/")
         self.img_dir = os.path.abspath(f"{data_path}/validation/data/")
         self.name = name
+        
+        self.pipe_tokenizer = pipe_tokenizer
+        self.pipe_tokenizer_2 = pipe_tokenizer_2
 
         # Preprocess prompts
         self.captions_df["input_tokens"] = self.captions_df["caption"].apply(
-            lambda x: self.preprocess(x, pipe_tokenizer)
+            # lambda x: self.preprocess(x, pipe_tokenizer)
+            lambda x: x
         )
         self.captions_df["input_tokens_2"] = self.captions_df["caption"].apply(
-            lambda x: self.preprocess(x, pipe_tokenizer_2)
+            # lambda x: self.preprocess(x, pipe_tokenizer_2)
+            lambda x: x
         )
         self.latent_dtype = latent_dtype
         self.latent_device = latent_device if torch.cuda.is_available() else "cpu"
@@ -117,10 +121,7 @@ def get_item_count(self):
         return len(self.captions_df)
 
     def get_img(self, id):
-        img = Image.open(
-            self.img_dir +
-            "/" +
-            self.captions_df.loc[id]["file_name"])
+        img = Image.open(self.img_dir + "/" + self.captions_df.loc[id]["file_name"])
         return self.image_to_tensor(img)
 
     def get_imgs(self, id_list):
@@ -141,11 +142,7 @@ def get_item_loc(self, id):
 
 class PostProcessCoco:
     def __init__(
-        self,
-        device="cpu",
-        dtype="uint8",
-        statistics_path=os.path.join(
-            os.path.dirname(__file__), "tools", "val2014.npz"),
+        self, device="cpu", dtype="uint8", statistics_path=os.path.join(os.path.dirname(__file__), "tools", "val2014.npz")
     ):
         self.results = []
         self.good = 0
@@ -167,33 +164,27 @@ def add_results(self, results):
     def __call__(self, results, ids, expected=None, result_dict=None):
         self.content_ids.extend(ids)
         return [
-            (t.cpu().permute(1, 2, 0).float().numpy() * 255)
-            .round()
-            .astype(self.numpy_dtype)
+            (t.cpu().permute(1, 2, 0).float().numpy() * 255).round().astype(self.numpy_dtype)
             for t in results
         ]
-
+    
     def save_images(self, ids, ds):
         info = []
         idx = {}
-        for i, image_id in enumerate(self.content_ids):
-            if image_id in ids:
-                idx[image_id] = i
+        for i, id in enumerate(self.content_ids):
+            if id in ids:
+                idx[id] = i
         if not os.path.exists("images/"):
             os.makedirs("images/", exist_ok=True)
-        for image_id in ids:
-            if not idx.get(image_id):
-                print(
-                    f"image id {image_id} is missing in the results. Hence not saved.")
-                continue
-            caption = ds.get_caption(image_id)
-            generated = Image.fromarray(self.results[idx[image_id]])
-            image_path_tmp = f"images/{self.content_ids[idx[image_id]]}.png"
+        for id in ids:
+            caption = ds.get_caption(id)
+            generated = Image.fromarray(self.results[idx[id]])
+            image_path_tmp = f"images/{self.content_ids[idx[id]]}.png"
             generated.save(image_path_tmp)
-            info.append((self.content_ids[idx[image_id]], caption))
+            info.append((self.content_ids[idx[id]], caption))
         with open("images/captions.txt", "w+") as f:
-            for image_id, caption in info:
-                f.write(f"{image_id}  {caption}\n")
+            for id, caption in info:
+                f.write(f"{id}  {caption}\n")
 
     def start(self):
         self.results = []
@@ -209,10 +200,7 @@ def finalize(self, result_dict, ds=None, output_dir=None):
                 100 * clip.get_clip_score(caption, generated).item()
             )
 
-        fid_score = compute_fid(
-            self.results,
-            self.statistics_path,
-            self.device)
+        fid_score = compute_fid(self.results, self.statistics_path, self.device)
         result_dict["FID_SCORE"] = fid_score
         result_dict["CLIP_SCORE"] = np.mean(self.clip_scores)
 
diff --git a/text_to_image/main.py b/text_to_image/main.py
index 7d4da2a0ba..72f5959a0b 100644
--- a/text_to_image/main.py
+++ b/text_to_image/main.py
@@ -1,3 +1,4 @@
+
 """
 mlperf inference benchmarking tool
 """
@@ -21,11 +22,16 @@
 import numpy as np
 import torch
 
+import subprocess
+from py_demo_server_lon import main as server_main
+
 import dataset
 import coco
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+# from sut_over_network_demo import main as 
+
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("main")
 
@@ -42,6 +48,13 @@
 }
 
 
+SCENARIO_MAP = {
+    "SingleStream": lg.TestScenario.SingleStream,
+    "MultiStream": lg.TestScenario.MultiStream,
+    "Server": lg.TestScenario.Server,
+    "Offline": lg.TestScenario.Offline,
+}
+
 SUPPORTED_PROFILES = {
     "defaults": {
         "dataset": "coco-1024",
@@ -65,16 +78,9 @@
     },
 }
 
-SCENARIO_MAP = {
-    "SingleStream": lg.TestScenario.SingleStream,
-    "MultiStream": lg.TestScenario.MultiStream,
-    "Server": lg.TestScenario.Server,
-    "Offline": lg.TestScenario.Offline,
-}
-
-
 def get_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument('--sut-server', default=['http://t004-005:8008', 'http://t004-006:8008'], nargs='+', help='A list of server address & port')
     parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset")
     parser.add_argument("--dataset-path", required=True, help="path to the dataset")
     parser.add_argument(
@@ -88,7 +94,7 @@ def get_args():
     parser.add_argument(
         "--max-batchsize",
         type=int,
-        default=2,
+        default=1,
         help="max batch size in a single inference",
     )
     parser.add_argument("--threads", default=1, type=int, help="threads")
@@ -103,11 +109,10 @@ def get_args():
     parser.add_argument("--output", default="output", help="test results")
     parser.add_argument("--qps", type=int, help="target qps")
     parser.add_argument("--model-path", help="Path to model weights")
-    parser.add_argument("--gpu-num", type=int, default=4, help="Number of gpus to run inference")
 
     parser.add_argument(
         "--dtype",
-        default="fp16",
+        default="fp32",
         choices=["fp32", "fp16", "bf16"],
         help="dtype of the model",
     )
@@ -179,444 +184,37 @@ def get_args():
     return args
 
 
-def get_backend(backend, **kwargs):
-    if backend == "pytorch":
-        from backend_pytorch import BackendPytorch
-
-        backend = BackendPytorch(**kwargs)
 
-    elif backend == "debug":
-        from backend_debug import BackendDebug
 
-        backend = BackendDebug()
-    else:
-        raise ValueError("unknown backend: " + backend)
-    return backend
-
-
-class Item:
-    """An item that we queue for processing by the thread pool."""
-
-    def __init__(self, query_id, content_id, inputs, img=None):
-        self.query_id = query_id
-        self.content_id = content_id
-        self.img = img
-        self.inputs = inputs
-        self.start = time.time()
-
-
-class RunnerBase:
-    def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
-        self.take_accuracy = False
-        self.ds = ds
-        self.model = model
-        self.post_process = post_proc
-        self.threads = threads
-        self.take_accuracy = False
-        self.max_batchsize = max_batchsize
-        self.result_timing = []
-
-    def handle_tasks(self, tasks_queue):
-        pass
-
-    def start_run(self, result_dict, take_accuracy):
-        self.result_dict = result_dict
-        self.result_timing = []
-        self.take_accuracy = take_accuracy
-        self.post_process.start()
-
-    def run_one_item(self, qitem: Item):
-        # run the prediction
-        processed_results = []
-        try:
-            results = self.model.predict(qitem.inputs)
-            processed_results = self.post_process(
-                results, qitem.content_id, qitem.inputs, self.result_dict
-            )
-            if self.take_accuracy:
-                self.post_process.add_results(processed_results)
-            self.result_timing.append(time.time() - qitem.start)
-        except Exception as ex:  # pylint: disable=broad-except
-            src = [self.ds.get_item_loc(i) for i in qitem.content_id]
-            log.error("thread: failed on contentid=%s, %s", src, ex)
-            # since post_process will not run, fake empty responses
-            processed_results = [[]] * len(qitem.query_id)
-        finally:
-            response_array_refs = []
-            response = []
-            for idx, query_id in enumerate(qitem.query_id):
-                response_array = array.array(
-                    "B", np.array(processed_results[idx], np.uint8).tobytes()
-                )
-                response_array_refs.append(response_array)
-                bi = response_array.buffer_info()
-                response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1]))
-            lg.QuerySamplesComplete(response)
-
-    def enqueue(self, query_samples):
-        idx = [q.index for q in query_samples]
-        query_id = [q.id for q in query_samples]
-        if len(query_samples) < self.max_batchsize:
-            data, label = self.ds.get_samples(idx)
-            self.run_one_item(Item(query_id, idx, data, label))
-        else:
-            bs = self.max_batchsize
-            for i in range(0, len(idx), bs):
-                data, label = self.ds.get_samples(idx[i : i + bs])
-                self.run_one_item(
-                    Item(query_id[i : i + bs], idx[i : i + bs], data, label)
-                )
-
-    def finish(self):
-        pass
-
-
-class QueueRunner(RunnerBase):
-    def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
-        super().__init__(model, ds, threads, post_proc, max_batchsize)
-        self.tasks = Queue(maxsize=threads * 4)
-        self.workers = []
-        self.result_dict = {}
-
-        for _ in range(self.threads):
-            worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,))
-            worker.daemon = True
-            self.workers.append(worker)
-            worker.start()
-
-    def handle_tasks(self, tasks_queue):
-        """Worker thread."""
-        while True:
-            qitem = tasks_queue.get()
-            if qitem is None:
-                # None in the queue indicates the parent want us to exit
-                tasks_queue.task_done()
-                break
-            self.run_one_item(qitem)
-            tasks_queue.task_done()
-
-    def enqueue(self, query_samples):
-        idx = [q.index for q in query_samples]
-        query_id = [q.id for q in query_samples]
-        if len(query_samples) < self.max_batchsize:
-            data, label = self.ds.get_samples(idx)
-            self.tasks.put(Item(query_id, idx, data, label))
-        else:
-            bs = self.max_batchsize
-            for i in range(0, len(idx), bs):
-                ie = i + bs
-                data, label = self.ds.get_samples(idx[i:ie])
-                self.tasks.put(Item(query_id[i:ie], idx[i:ie], data, label))
-
-    def finish(self):
-        # exit all threads
-        for _ in self.workers:
-            self.tasks.put(None)
-        for worker in self.workers:
-            worker.join()
-
-
-def main():
+def main(): 
+    
     args = get_args()
-
     log.info(args)
-
-    # find backend
-    
-    # backend = get_backend(
-    #     args.backend,
-    #     precision=args.dtype,
-    #     device=args.device,
-    #     model_path=args.model_path,
-    #     batch_size=args.max_batchsize
-    # )
-    # Zixian: Oct 21: create a list of backends for multi-gpu
     
-    # Zixian: Nov 13: Force batchsize=2 since command line doesn't work 
-    args.max_batchsize = 5
+    # Define the command and arguments
+    # command = ['python', 'script_to_run.py', '--num', '10', '--text', 'Hello, world!']
     
-    backends = [get_backend(
-                    args.backend,
-                    precision=args.dtype,
-                    device=f'cuda:{i}',
-                    model_path=args.model_path,
-                    batch_size=args.max_batchsize
-                ) 
-                for i in np.arange (args.gpu_num)]
+    server_main (args)
     
+    # command = ['python', 
+    #            'py_demo_server_lon.py', 
+    #            '--sut-server http://t007-001:8888 http://t006-001:8888',
+    #            '--dataset=coco-1024', 
+    #            '--dataset-path=/work1/zixian/ziw081/inference/text_to_image/coco2014',
+    #            '--profile=stable-diffusion-xl-pytorch',
+    #            '--dtype=fp16',
+    #            '--device=cuda',
+    #            '--time=30',
+    #            '--scenario=Offline',
+    #            '--max-batchsize=4'
+    #         ]
+
+
+    # # Run the command
+    # subprocess.run(command)
     
-    if args.dtype == "fp16":
-        dtype = torch.float16
-    elif args.dtype == "bf16":
-        dtype = torch.bfloat16
-    else:
-        dtype = torch.float32
-
-    # --count applies to accuracy mode only and can be used to limit the number of images
-    # for testing.
-    count_override = False
-    count = args.count
-    if count:
-        count_override = True
-
-    # load model to backend
-    # model = backend.load()
-    # Zixian: Oct 21: create a list of models corresponding to each backend 
-    models = [backend.load() for backend in backends]
-
-    # dataset to use
-    dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
-    ds = dataset_class(
-        data_path=args.dataset_path,
-        name=args.dataset,
-        pre_process=pre_proc,
-        count=count,
-        threads=args.threads,
-        # pipe_tokenizer=model.pipe.tokenizer,
-        # pipe_tokenizer_2=model.pipe.tokenizer_2,
-        pipe_tokenizer=models[0].pipe.tokenizer,
-        pipe_tokenizer_2=models[0].pipe.tokenizer_2,
-        latent_dtype=dtype,
-        latent_device=args.device,
-        latent_framework=args.latent_framework,
-        **kwargs,
-    )
-    final_results = {
-        # "runtime": model.name(),
-        # "version": model.version(),
-        "runtime": models[0].name(),
-        "version": models[0].version(),
-        "time": int(time.time()),
-        "args": vars(args),
-        "cmdline": str(args),
-    }
-
-    mlperf_conf = os.path.abspath(args.mlperf_conf)
-    if not os.path.exists(mlperf_conf):
-        log.error("{} not found".format(mlperf_conf))
-        sys.exit(1)
-
-    user_conf = os.path.abspath(args.user_conf)
-    if not os.path.exists(user_conf):
-        log.error("{} not found".format(user_conf))
-        sys.exit(1)
-
-    audit_config = os.path.abspath(args.audit_conf)
-    
-    if args.accuracy:
-        ids_path = os.path.abspath(args.ids_path)
-        with open(ids_path) as f:
-            saved_images_ids = [int(_) for _ in f.readlines()]
-
-    if args.output:
-        output_dir = os.path.abspath(args.output)
-        os.makedirs(output_dir, exist_ok=True)
-        os.chdir(output_dir)
-
-    #
-    # make one pass over the dataset to validate accuracy
-    #
-    count = ds.get_item_count()
-
-    # warmup
-    syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit"
-    latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device)
-    # warmup_samples = [
-    #     {
-    #         "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
-    #         "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
-    #         "latents": latents_pt,
-    #     }
-    #     for _ in range(args.max_batchsize)
-    # ]
-    warmup_samples_gpus = [
-                    [
-                        {
-                            "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
-                            "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
-                            "latents": latents_pt,
-                        }
-                        for _ in range(int(args.max_batchsize))
-                    ]
-                    for model in models] * 3 # 3 times warmup samples
-    
-    # Zixian: Oct 21: warm up each backend 
-    # for idx, backend in enumerate (backends): 
-    #     for i in range(1):
-    #         _ = backend.predict(warmup_samples_gpus[idx])
-    
-    print (f'Start distributed warmup')
-    with ThreadPoolExecutor(max_workers=len(backends)) as executor:
-            # Map each runner to its respective sublist
-            futures = {
-                executor.submit(backend.predict, queries): backend 
-                for backend, queries in zip(backends, warmup_samples_gpus)
-            }
-        
-            # Optionally process the results
-            for future in as_completed(futures):
-                backend = futures[future]
-                try:
-                    result = future.result()
-                    print(f'Warmup backend {backend} enqueued successfully.')
-                except Exception as exc:
-                    print(f'Warmup backend {backend} generated an exception: {exc}')
-
-
-    scenario = SCENARIO_MAP[args.scenario]
-    runner_map = {
-        lg.TestScenario.SingleStream: RunnerBase,
-        lg.TestScenario.MultiStream: QueueRunner,
-        lg.TestScenario.Server: QueueRunner,
-        lg.TestScenario.Offline: QueueRunner,
-    }
-    
-    # Zixian: Oct 21: create a list of runner
-    # runner = runner_map[scenario](
-    #     model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
-    # )
-    runners = [runner_map[scenario](
-                                model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
-                            )
-                for model in models]
-
-    # def issue_queries(query_samples):
-    #     runner.enqueue(query_samples)
-    def issue_queries(query_samples):
-        print (f'\n\n len (query_samples): {len (query_samples)} \n\n')
-        
-        query_samples_len = len (query_samples)
-        query_samples_seg_len = query_samples_len / len (runners)
-        splitted_query_samples = []
-        
-        
-        for idx in range (len (runners)): 
-            log.info (f'\n\n\n')
-            log.info (f'idx: {idx}')
-            log.info (f'query_samples_len: {query_samples_len}')
-            log.info (f'idx: {idx}')
-            # if idx == len (runners) -1: 
-            #     splitted_query_samples.append (query_samples[idx*query_samples_seg_len:])
-            # else:
-            #     splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len])
-            splitted_query_samples.append (query_samples [int(round(query_samples_seg_len * idx)): int(round(query_samples_seg_len * (idx + 1)))])
-        
-        # splitted_query_samples = [query_samples[int(round(avg * i)): int(round(avg * (i + 1)))] for i in range(b)]
-        
-        with ThreadPoolExecutor(max_workers=len(runners)) as executor:
-            # Map each runner to its respective sublist
-            futures = {
-                executor.submit(runner.enqueue, queries): runner 
-                for runner, queries in zip(runners, splitted_query_samples)
-            }
-        
-            # Optionally process the results
-            for future in as_completed(futures):
-                runner = futures[future]
-                try:
-                    result = future.result()
-                    print(f'Runner {runner} enqueued successfully.')
-                except Exception as exc:
-                    print(f'Runner {runner} generated an exception: {exc}')
-
-    def flush_queries():
-        pass
-
-    log_output_settings = lg.LogOutputSettings()
-    log_output_settings.outdir = output_dir
-    log_output_settings.copy_summary_to_stdout = False
-    log_settings = lg.LogSettings()
-    log_settings.enable_trace = args.debug
-    log_settings.log_output = log_output_settings
-
-    settings = lg.TestSettings()
-    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
-    settings.FromConfig(user_conf, args.model_name, args.scenario)
-    if os.path.exists(audit_config):
-        settings.FromConfig(audit_config, args.model_name, args.scenario)
-    settings.scenario = scenario
-    settings.mode = lg.TestMode.PerformanceOnly
-    if args.accuracy:
-        settings.mode = lg.TestMode.AccuracyOnly
-    if args.find_peak_performance:
-        settings.mode = lg.TestMode.FindPeakPerformance
-
-    if args.time:
-        # override the time we want to run
-        settings.min_duration_ms = args.time * MILLI_SEC
-        settings.max_duration_ms = args.time * MILLI_SEC
-
-    # Zixian: Nov8: manually setting args.qps to 1
-    # args.qps=1.0
-    if args.qps:
-        qps = float(args.qps)
-        settings.server_target_qps = qps
-        settings.offline_expected_qps = qps
-
-    if count_override:
-        settings.min_query_count = count
-        settings.max_query_count = count
-
-    if args.samples_per_query:
-        settings.multi_stream_samples_per_query = args.samples_per_query
-    if args.max_latency:
-        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
-        settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC)
-
-    performance_sample_count = (
-        args.performance_sample_count
-        if args.performance_sample_count
-        else min(count, 500)
-    )
     
-    # count = 200
-    sut = lg.ConstructSUT(issue_queries, flush_queries)
-    qsl = lg.ConstructQSL(
-        count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples
-    )
-
-    log.info("starting {}".format(scenario))
-    result_dict = {"scenario": str(scenario)}
-    for runner in runners: 
-        runner.start_run(result_dict, args.accuracy)
     
-    # with ThreadPoolExecutor(max_workers=len(runners)) as executor:
-    #         # Map each runner to its respective sublist
-    #         futures = {
-    #             executor.submit(runner.finish(), (result_dict, args.accuracy)): runner 
-    #             for runner in runners 
-    #         }
-        
-
-    lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config)
-
-    if args.accuracy:
-        post_proc.finalize(result_dict, ds, output_dir=args.output)
-        final_results["accuracy_results"] = result_dict
-        # post_proc.save_images(saved_images_ids, ds)
-
-
-    lg.DestroyQSL(qsl)
-    lg.DestroySUT(sut)
-
-    # for runner in runners: 
-    #     runner.finish()
-    with ThreadPoolExecutor(max_workers=len(runners)) as executor:
-            # Map each runner to its respective sublist
-            futures = {
-                executor.submit(runner.finish()): runner 
-                for runner in runners 
-            }
-        
-        
-
-    #
-    # write final results
-    #
-    if args.output:
-        with open("results.json", "w") as f:
-            json.dump(final_results, f, sort_keys=True, indent=4)
-
 
 if __name__ == "__main__":
     main()
diff --git a/text_to_image/py_demo_server_lon.py b/text_to_image/py_demo_server_lon.py
new file mode 100644
index 0000000000..2ea4283d5a
--- /dev/null
+++ b/text_to_image/py_demo_server_lon.py
@@ -0,0 +1,534 @@
+"""
+Python demo showing how to use the MLPerf Inference LoadGen over the Network bindings.
+This program runs on the LON Node side.
+It runs the demo in MLPerf server mode over the network.
+It communicates over the network with Network SUT nodes,
+which are running the networked SUT code.
+"""
+
+import argparse
+import threading
+import requests
+import array
+import time
+import json
+import array
+import collections
+import logging
+import os
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from absl import app
+# from absl import flags
+import mlperf_loadgen as lg
+import numpy as np
+import torch
+
+import struct
+
+import dataset
+import coco
+
+from queue import Queue
+
+# FLAGS = flags.FLAGS
+
+# flags.DEFINE_list(
+#     "sut_server", "http://localhost:8000", "Address of the server(s) under test."
+# )
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("main")
+
+NANO_SEC = 1e9  # nanoseconds per second
+MILLI_SEC = 1000  # milliseconds per second
+
+SUPPORTED_DATASETS = {  # name -> (dataset class, preprocess fn, post-processor, extra dataset kwargs)
+    "coco-1024": (
+        coco.Coco,
+        dataset.preprocess,
+        coco.PostProcessCoco(),
+        {"image_size": [3, 1024, 1024]},
+    )
+}
+
+SUPPORTED_PROFILES = {  # presets for --profile; "defaults" fills the options left unset
+    "defaults": {
+        "dataset": "coco-1024",
+        "backend": "pytorch",
+        "model-name": "stable-diffusion-xl",
+    },
+    "debug": {
+        "dataset": "coco-1024",
+        "backend": "debug",
+        "model-name": "stable-diffusion-xl",
+    },
+    "stable-diffusion-xl-pytorch": {
+        "dataset": "coco-1024",
+        "backend": "pytorch",
+        "model-name": "stable-diffusion-xl",
+    },
+    "stable-diffusion-xl-pytorch-dist": {
+        "dataset": "coco-1024",
+        "backend": "pytorch-dist",  # NOTE(review): get_backend() in this file has no "pytorch-dist" branch
+        "model-name": "stable-diffusion-xl",
+    },
+}
+
+SCENARIO_MAP = {  # --scenario value -> LoadGen scenario
+    "SingleStream": lg.TestScenario.SingleStream,
+    "MultiStream": lg.TestScenario.MultiStream,
+    "Server": lg.TestScenario.Server,
+    "Offline": lg.TestScenario.Offline,
+}
+
+def get_args():
+    """Parse the LON-node command line and return the resolved namespace.
+
+    Unset --dataset/--backend/--model-name fall back to --profile, then to
+    the "defaults" profile; an unknown --scenario exits via parser.error().
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--sut-server', required=True, nargs='+', help='A list of server address & port')
+    parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset")
+    parser.add_argument("--dataset-path", required=True, help="path to the dataset")
+    parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles")
+    parser.add_argument(
+        "--scenario",
+        default="SingleStream",
+        help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())),
+    )
+    parser.add_argument(
+        "--max-batchsize",
+        type=int,
+        default=1,
+        help="max batch size in a single inference",
+    )
+    parser.add_argument("--threads", default=1, type=int, help="threads")
+    parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass")
+    parser.add_argument(
+        "--find-peak-performance",
+        action="store_true",
+        help="enable finding peak performance pass",
+    )
+    parser.add_argument("--backend", help="Name of the backend")
+    parser.add_argument("--model-name", help="Name of the model")
+    parser.add_argument("--output", default="output", help="test results")
+    parser.add_argument("--qps", type=int, help="target qps")
+    parser.add_argument("--model-path", help="Path to model weights")
+
+    parser.add_argument(
+        "--dtype",
+        default="fp32",
+        choices=["fp32", "fp16", "bf16"],
+        help="dtype of the model",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda",
+        choices=["cuda", "cpu", "rocm"],
+        help="device to run the benchmark",
+    )
+    parser.add_argument(
+        "--latent-framework",
+        default="torch",
+        choices=["torch", "numpy"],
+        help="framework to load the latents",
+    )
+
+    # file to use mlperf rules compliant parameters
+    parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config")
+    # file for user LoadGen settings such as target QPS
+    parser.add_argument(
+        "--user_conf",
+        default="user.conf",
+        help="user config for user LoadGen settings such as target QPS",
+    )
+    # file for LoadGen audit settings
+    parser.add_argument(
+        "--audit_conf", default="audit.config", help="config for LoadGen audit settings"
+    )
+    # arguments to save images
+    # do not modify this argument for official submission
+    parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt")
+
+    # below will override mlperf rules compliant settings - don't use for official submission
+    parser.add_argument("--time", type=int, help="time to scan in seconds")
+    parser.add_argument("--count", type=int, help="dataset items to use")
+    parser.add_argument("--debug", action="store_true", help="debug")
+    parser.add_argument(
+        "--performance-sample-count", type=int, help="performance sample count", default=5000
+    )
+    parser.add_argument(
+        "--max-latency", type=float, help="mlperf max latency in pct tile"
+    )
+    parser.add_argument(
+        "--samples-per-query",
+        default=8,
+        type=int,
+        help="mlperf multi-stream samples per query",
+    )
+    args = parser.parse_args()
+
+    # don't use defaults in argparser. Instead we default to a dict, override that with a profile
+    # and take this as default unless the command line gives a value
+    # Copy so that the module-level profile table is never mutated.
+    defaults = dict(SUPPORTED_PROFILES["defaults"])
+
+    if args.profile:
+        profile = SUPPORTED_PROFILES[args.profile]
+        defaults.update(profile)
+    for k, v in defaults.items():
+        kc = k.replace("-", "_")
+        if getattr(args, kc) is None:
+            setattr(args, kc, v)
+
+    if args.scenario not in SCENARIO_MAP:
+        parser.error("valid scenarios:" + str(list(SCENARIO_MAP.keys())))
+    return args
+
+def get_backend(backend, **kwargs):
+    """Build the backend object selected by name.
+
+    "pytorch" forwards **kwargs to BackendPytorch; "debug" takes none.
+    Imports are deferred; any other name raises ValueError.
+    """
+    if backend == "pytorch":
+        from backend_pytorch import BackendPytorch
+        return BackendPytorch(**kwargs)
+    if backend == "debug":
+        from backend_debug import BackendDebug
+        return BackendDebug()
+    raise ValueError("unknown backend: " + backend)
+
+class QSL:
+    """Owns a LoadGen query sample library backed by dataset `ds`."""
+
+    def __init__(self, total_sample_count, performance_sample_count, ds=None):
+        # ds supplies the load/unload callbacks given to LoadGen; it is required.
+        self.qsl = lg.ConstructQSL(
+            total_sample_count, performance_sample_count,
+            ds.load_query_samples, ds.unload_query_samples,
+        )
+
+    def __del__(self):
+        # self.qsl is missing when __init__ raised before constructing it.
+        if getattr(self, "qsl", None) is not None:
+            lg.DestroyQSL(self.qsl)
+
+class QDL:
+    """Query dispatch library that proxies LoadGen queries to remote SUT nodes.
+
+    Every query is split across the configured SUT servers and posted to
+    each server's /predict/ endpoint over HTTP. The /getname/ endpoint is
+    currently not contacted; see client_get_name().
+    """
+
+    def __init__(self, qsl: QSL, sut_server_addr: list, ds=None):
+        """
+        Constructor for the QDL.
+        Args:
+            qsl: The QSL to use.
+            sut_server_addr: A list of addresses of the SUT.
+            ds: Dataset whose get_samples() supplies the request payloads.
+        """
+        # Set the state before registering the callbacks that read it.
+        self.qsl = qsl
+        self.sut_server_addr = sut_server_addr
+        self.ds = ds
+
+        # Construct QDL from the python binding
+        self.qdl = lg.ConstructQDL(
+            self.issue_query, self.flush_queries, self.client_get_name
+        )
+
+    def issue_query(self, query_samples):
+        """Hand the query to a background thread so the callback returns at once."""
+        # NOTE(review): the original author marked daemon=True for removal.
+        threading.Thread(
+            target=self.process_query_async,
+            args=[query_samples],
+            daemon=True,
+        ).start()
+
+    def flush_queries(self):
+        """Flush the queries. Dummy implementation."""
+        pass
+
+    def process_query_async(self, query_samples):
+        """Split the query across the SUT servers and send the parts concurrently.
+
+        Samples are cut into len(sut_server_addr) contiguous segments of equal
+        size; the last segment also takes the remainder. Empty segments (fewer
+        samples than servers) are skipped instead of being posted.
+
+        Worker failures are logged here. Nothing is returned: completion is
+        reported to LoadGen inside request_validate().
+        """
+        server_count = len(self.sut_server_addr)
+        seg_len = len(query_samples) // server_count
+        last = server_count - 1
+        segments = [
+            query_samples[i * seg_len : (i + 1) * seg_len] for i in range(last)
+        ]
+        segments.append(query_samples[last * seg_len :])
+
+        jobs = [
+            (addr, segment)
+            for addr, segment in zip(self.sut_server_addr, segments)
+            if segment
+        ]
+        if not jobs:
+            return
+
+        with ThreadPoolExecutor(max_workers=len(jobs)) as executor:
+            futures = {
+                executor.submit(
+                    self.request_validate, '{}/predict/'.format(addr), segment
+                ): addr
+                for addr, segment in jobs
+            }
+            for future in as_completed(futures):
+                # Without result() an exception raised in the worker would
+                # be dropped silently.
+                try:
+                    future.result()
+                except Exception:  # pylint: disable=broad-except
+                    log.exception("request to SUT %s failed", futures[future])
+
+    def request_validate(self, url, query_samples):
+        """Post one segment to a SUT, decode the reply and complete the samples.
+
+        Request body (JSON):
+            {"query_samples": [
+                {
+                    "index": <sample index>,
+                    "id": <sample id>,
+                    "data": {
+                        "input_tokens": ...,
+                        "input_tokens_2": ...,
+                        "latents": <latents converted with .tolist()>,
+                    },
+                },
+            ]}
+
+        Response body (binary), a sequence of records:
+            query id     struct format 'Q'
+            data length  struct format 'I'
+            data         <data length> bytes
+
+        Args:
+            url: Full URL of the SUT's predict endpoint.
+            query_samples: Samples of this segment (objects with .index and .id).
+
+        Raises:
+            requests.HTTPError: If the SUT answers with an error status.
+            ValueError: If a record's data is shorter than its declared length.
+        """
+        indexes = [q.index for q in query_samples]
+        ids = [q.id for q in query_samples]
+        data, _ = self.ds.get_samples(indexes)  # second item (label) is unused
+
+        payload = [
+            {
+                'index': index,
+                'id': sample_id,
+                'data': {
+                    'input_tokens': d['input_tokens'],
+                    'input_tokens_2': d['input_tokens_2'],
+                    'latents': d['latents'].tolist(),  # Convert tensor to a list
+                },
+            }
+            for index, sample_id, d in zip(indexes, ids, data)
+        ]
+
+        # Todo: The response got None object when we have 2 inference nodes
+        # This problem doesn't exist when we just inference on one node
+        response = requests.post(url, json={"query_samples": payload})
+        log.debug("returned from requests.post on predict at time %s", time.time())
+        response.raise_for_status()
+
+        response_bytes = response.content
+        total = len(response_bytes)
+        offset = 0
+        responses = []
+        # Keeps the arrays alive while LoadGen reads the raw addresses that
+        # buffer_info() hands out below.
+        response_array_refs = []
+
+        while offset < total:
+            # Unpack the query_id
+            query_id = struct.unpack_from('Q', response_bytes, offset)[0]
+            offset += 8
+
+            # Unpack the data length
+            data_length = struct.unpack_from('I', response_bytes, offset)[0]
+            offset += 4
+
+            # Extract the data
+            if offset + data_length > total:
+                raise ValueError("truncated response from {}".format(url))
+            data_bytes = response_bytes[offset:offset + data_length]
+            offset += data_length
+
+            # Convert bytes to array
+            sample_in_memory = array.array("B", data_bytes)
+            bi = sample_in_memory.buffer_info()
+            response_array_refs.append(sample_in_memory)
+
+            responses.append(lg.QuerySampleResponse(query_id, bi[0], bi[1]))
+
+        lg.QuerySamplesComplete(responses)
+
+    def client_get_name(self):
+        """Return the SUT name reported to LoadGen.
+
+        NOTE(review): the name is hard-coded and the SUT nodes are not
+        contacted, whatever the number of entries in sut_server_addr.
+        TODO: query each server's /getname/ endpoint instead.
+        """
+        return "Multi-node SUT: N1, N2"
+
+    def __del__(self):
+        # self.qdl is missing when __init__ raised before constructing it.
+        if getattr(self, "qdl", None) is not None:
+            lg.DestroyQDL(self.qdl)
+
+def main(args):
+    """Run the LoadGen-over-network test from the LON node.
+
+    Loads the backend model (its tokenizers are handed to the dataset),
+    builds the dataset, configures LoadGen from the mlperf/user/audit
+    config files plus the command-line overrides, then starts the test
+    with a QDL that forwards queries to the servers in args.sut_server.
+
+    Args:
+        args: Namespace produced by get_args(); parsed here when None.
+
+    Exits with status 1 when the mlperf or user config file is missing.
+    """
+    if args is None:
+        args = get_args()
+
+    # NOTE(review): the model device is fixed to cuda:0 here; --device is
+    # only passed to the dataset as latent_device.
+    backend = get_backend(
+        args.backend,
+        precision=args.dtype,
+        device='cuda:0',
+        model_path=args.model_path,
+        batch_size=args.max_batchsize,
+    )
+    model = backend.load()
+
+    if args.dtype == "fp16":
+        dtype = torch.float16
+    elif args.dtype == "bf16":
+        dtype = torch.bfloat16
+    else:
+        dtype = torch.float32
+
+    # --count applies to accuracy mode only and can be used to limit the number of images
+    # for testing.
+    count = args.count
+    count_override = bool(count)
+
+    scenario = SCENARIO_MAP[args.scenario]
+
+    dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
+    ds = dataset_class(
+        data_path=args.dataset_path,
+        name=args.dataset,
+        pre_process=pre_proc,
+        count=count,
+        threads=args.threads,
+        pipe_tokenizer=model.pipe.tokenizer,
+        pipe_tokenizer_2=model.pipe.tokenizer_2,
+        latent_dtype=dtype,
+        latent_device=args.device,
+        latent_framework=args.latent_framework,
+        **kwargs,
+    )
+    count = ds.get_item_count()
+
+    mlperf_conf = os.path.abspath(args.mlperf_conf)
+    if not os.path.exists(mlperf_conf):
+        log.error("{} not found".format(mlperf_conf))
+        sys.exit(1)
+
+    user_conf = os.path.abspath(args.user_conf)
+    if not os.path.exists(user_conf):
+        log.error("{} not found".format(user_conf))
+        sys.exit(1)
+
+    # Resolved before the chdir below so that a relative path keeps
+    # pointing at the directory the program was started from.
+    audit_config = os.path.abspath(args.audit_conf)
+
+    # Fall back to the current directory so output_dir is always defined.
+    output_dir = os.path.abspath(args.output) if args.output else os.getcwd()
+    if args.output:
+        os.makedirs(output_dir, exist_ok=True)
+        os.chdir(output_dir)
+
+    log_output_settings = lg.LogOutputSettings()
+    log_output_settings.outdir = output_dir
+    log_output_settings.copy_summary_to_stdout = False
+    log_settings = lg.LogSettings()
+    log_settings.enable_trace = args.debug
+    log_settings.log_output = log_output_settings
+
+    settings = lg.TestSettings()
+    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
+    settings.FromConfig(user_conf, args.model_name, args.scenario)
+    if os.path.exists(audit_config):
+        settings.FromConfig(audit_config, args.model_name, args.scenario)
+    settings.scenario = scenario
+    settings.mode = lg.TestMode.PerformanceOnly
+    if args.accuracy:
+        settings.mode = lg.TestMode.AccuracyOnly
+    if args.find_peak_performance:
+        settings.mode = lg.TestMode.FindPeakPerformance
+
+    if args.time:
+        # override the time we want to run
+        settings.min_duration_ms = args.time * MILLI_SEC
+        settings.max_duration_ms = args.time * MILLI_SEC
+
+    if args.qps:
+        qps = float(args.qps)
+        settings.server_target_qps = qps
+        settings.offline_expected_qps = qps
+
+    if count_override:
+        settings.min_query_count = count
+        settings.max_query_count = count
+
+    if args.samples_per_query:
+        settings.multi_stream_samples_per_query = args.samples_per_query
+    if args.max_latency:
+        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
+        settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC)
+
+    performance_sample_count = (
+        args.performance_sample_count
+        if args.performance_sample_count
+        else min(count, 500)
+    )
+
+    # QDL and QSL
+    qsl = QSL(count, performance_sample_count, ds=ds)
+    qdl = QDL(qsl, sut_server_addr=args.sut_server, ds=ds)
+
+    # Pass log_settings and audit_config explicitly; plain StartTest would
+    # ignore both objects prepared above.
+    lg.StartTestWithLogSettings(qdl.qdl, qsl.qsl, settings, log_settings, audit_config)
+
+    del qsl
+    del qdl
+
+
+if __name__ == "__main__":
+    # Parse the CLI here: main() reads attributes of args and cannot take None.
+    main(get_args())
\ No newline at end of file
diff --git a/text_to_image/sut_over_network_demo.py b/text_to_image/sut_over_network_demo.py
new file mode 100644
index 0000000000..a5265a19a5
--- /dev/null
+++ b/text_to_image/sut_over_network_demo.py
@@ -0,0 +1,440 @@
+"""
+MLPerf Inference Benchmarking Tool - SUT Node
+"""
+
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import array
+import collections
+import json
+import logging
+import os
+import sys
+import threading
+import time
+import socket
+import struct
+
+import numpy as np
+import torch
+
+from flask import Flask, request, jsonify, Response
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import subprocess
+
+import dataset
+import coco
+
+from queue import Queue
+
+import mlperf_loadgen as lg  # Only needed if you plan to run LoadGen locally
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("main")
+
+NANO_SEC = 1e9  # nanoseconds per second
+MILLI_SEC = 1000  # milliseconds per second
+
+SUPPORTED_DATASETS = {  # name -> (dataset class, preprocess fn, post-processor, extra dataset kwargs)
+    "coco-1024": (
+        coco.Coco,
+        dataset.preprocess,
+        coco.PostProcessCoco(),
+        {"image_size": [3, 1024, 1024]},
+    )
+}
+
+SUPPORTED_PROFILES = {  # presets for --profile; "defaults" fills the options left unset
+    "defaults": {
+        "dataset": "coco-1024",
+        "backend": "pytorch",
+        "model-name": "stable-diffusion-xl",
+    },
+    "debug": {
+        "dataset": "coco-1024",
+        "backend": "debug",
+        "model-name": "stable-diffusion-xl",
+    },
+    "stable-diffusion-xl-pytorch": {
+        "dataset": "coco-1024",
+        "backend": "pytorch",
+        "model-name": "stable-diffusion-xl",
+    },
+    "stable-diffusion-xl-pytorch-dist": {
+        "dataset": "coco-1024",
+        "backend": "pytorch-dist",  # NOTE(review): get_backend() in this file has no "pytorch-dist" branch
+        "model-name": "stable-diffusion-xl",
+    },
+}
+
+SCENARIO_MAP = {  # --scenario value -> LoadGen scenario
+    "SingleStream": lg.TestScenario.SingleStream,
+    "MultiStream": lg.TestScenario.MultiStream,
+    "Server": lg.TestScenario.Server,
+    "Offline": lg.TestScenario.Offline,
+}
+
+app = Flask(__name__)  # routes are registered on it by the decorators below
+
+# Module-level state, filled in by initialize()
+backends = []
+models = []
+runners = []
+ds = None
+args = None
+
+def get_args():
+    """Parse the SUT-node command line and return the resolved namespace.
+
+    Unset --dataset/--backend/--model-name fall back to --profile, then to
+    the "defaults" profile; an unknown --scenario exits via parser.error().
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset")
+    parser.add_argument("--dataset-path", required=True, help="path to the dataset")
+    parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles")
+    parser.add_argument(
+        "--scenario",
+        default="SingleStream",
+        help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())),
+    )
+    parser.add_argument(
+        "--max-batchsize",
+        type=int,
+        default=1,
+        help="max batch size in a single inference",
+    )
+    parser.add_argument("--threads", default=1, type=int, help="threads")
+    parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass")
+    parser.add_argument(
+        "--find-peak-performance",
+        action="store_true",
+        help="enable finding peak performance pass",
+    )
+    parser.add_argument("--backend", help="Name of the backend")
+    parser.add_argument("--model-name", help="Name of the model")
+    parser.add_argument("--output", default="output", help="test results")
+    parser.add_argument("--qps", type=int, help="target qps")
+    parser.add_argument("--model-path", help="Path to model weights")
+
+    parser.add_argument(
+        "--dtype",
+        default="fp32",
+        choices=["fp32", "fp16", "bf16"],
+        help="dtype of the model",
+    )
+    parser.add_argument(
+        "--device",
+        default="cuda",
+        choices=["cuda", "cpu", "rocm"],
+        help="device to run the benchmark",
+    )
+    parser.add_argument(
+        "--latent-framework",
+        default="torch",
+        choices=["torch", "numpy"],
+        help="framework to load the latents",
+    )
+
+    # file to use mlperf rules compliant parameters
+    parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config")
+    # file for user LoadGen settings such as target QPS
+    parser.add_argument(
+        "--user_conf",
+        default="user.conf",
+        help="user config for user LoadGen settings such as target QPS",
+    )
+    # file for LoadGen audit settings
+    parser.add_argument(
+        "--audit_conf", default="audit.config", help="config for LoadGen audit settings"
+    )
+    # arguments to save images
+    parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt")
+
+    # below will override mlperf rules compliant settings - don't use for official submission
+    parser.add_argument("--time", type=int, help="time to scan in seconds")
+    parser.add_argument("--count", type=int, help="dataset items to use")
+    parser.add_argument("--debug", action="store_true", help="debug")
+    parser.add_argument(
+        "--performance-sample-count", type=int, help="performance sample count", default=5000
+    )
+    parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile")
+    parser.add_argument(
+        "--samples-per-query",
+        default=8,
+        type=int,
+        help="mlperf multi-stream samples per query",
+    )
+    args = parser.parse_args()
+
+    # don't use defaults in argparser. Instead we default to a dict, override that with a profile
+    # and take this as default unless the command line gives a value
+    # Copy so that the module-level profile table is never mutated.
+    defaults = dict(SUPPORTED_PROFILES["defaults"])
+
+    if args.profile:
+        profile = SUPPORTED_PROFILES[args.profile]
+        defaults.update(profile)
+    for k, v in defaults.items():
+        kc = k.replace("-", "_")
+        if getattr(args, kc) is None:
+            setattr(args, kc, v)
+
+    if args.scenario not in SCENARIO_MAP:
+        parser.error("valid scenarios:" + str(list(SCENARIO_MAP.keys())))
+    return args
+
+def get_backend(backend, **kwargs):
+    """Build the backend object selected by name.
+
+    "pytorch" forwards **kwargs to BackendPytorch; "debug" takes none.
+    Imports are deferred; any other name raises ValueError.
+    """
+    if backend == "pytorch":
+        from backend_pytorch import BackendPytorch
+        return BackendPytorch(**kwargs)
+    if backend == "debug":
+        from backend_debug import BackendDebug
+        return BackendDebug()
+    raise ValueError("unknown backend: " + backend)
+
+class Item:
+    """One batch handed to a runner: query ids, content ids, inputs and an optional image."""
+
+    def __init__(self, query_id, content_id, inputs, img=None):
+        # Creation time; run_one_item() derives the batch latency from it.
+        self.start = time.time()
+        self.query_id = query_id
+        self.content_id = content_id
+        self.inputs, self.img = inputs, img
+
+class RunnerBase:
+    """Runs inference on one model; results are returned, not sent to LoadGen."""
+
+    def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
+        self.take_accuracy = False
+        self.ds = ds
+        self.model = model
+        self.post_process = post_proc
+        self.threads = threads
+        self.max_batchsize = max_batchsize
+        self.result_timing = []
+        self.result_dict = {}
+
+    def handle_tasks(self, tasks_queue):
+        """No-op placeholder."""
+        pass
+
+    def start_run(self, result_dict, take_accuracy):
+        """Reset the per-run state and start the post-processor."""
+        self.result_dict = result_dict
+        self.result_timing = []
+        self.take_accuracy = take_accuracy
+        self.post_process.start()
+
+    def run_one_item(self, qitem: Item):
+        """Run one batch and return a {'query_id', 'data'} dict per query id.
+
+        'data' is the post-processed result as a flat list of uint8 values;
+        it is empty for every sample when prediction failed.
+        """
+        # preprocess the prompts:
+        # NOTE(review): latents are always cast to fp16, whatever --dtype is.
+        qitem.inputs = [
+            {
+                "input_tokens": self.ds.preprocess(sample['input_tokens'], self.ds.pipe_tokenizer),
+                "input_tokens_2": self.ds.preprocess(sample['input_tokens_2'], self.ds.pipe_tokenizer_2),
+                "latents": torch.tensor(sample['latents']).half(),
+            }
+            for sample in qitem.inputs
+        ]
+
+        try:
+            results = self.model.predict(qitem.inputs)
+            processed_results = self.post_process(
+                results, qitem.content_id, qitem.inputs, self.result_dict
+            )
+            if self.take_accuracy:
+                self.post_process.add_results(processed_results)
+            self.result_timing.append(time.time() - qitem.start)
+        except Exception as ex:  # pylint: disable=broad-except
+            src = [self.ds.get_item_loc(i) for i in qitem.content_id]
+            log.error("thread: failed on contentid=%s, %s", src, ex)
+            # since post_process will not run, fake empty responses
+            processed_results = [[]] * len(qitem.query_id)
+
+        # Not in a `finally: return`, which would also swallow KeyboardInterrupt.
+        return [
+            {
+                'query_id': query_id,
+                'data': np.asarray(processed_results[idx], np.uint8).flatten().tolist(),
+            }
+            for idx, query_id in enumerate(qitem.query_id)
+        ]
+
+    def enqueue(self, query_samples):
+        """Run query_samples in batches of at most max_batchsize.
+
+        Errors are logged; the responses collected so far are returned.
+        """
+        responses = []  # bound before the try so the final return cannot fail
+        try:
+            idx = [q['index'] for q in query_samples]
+            query_id = [q['id'] for q in query_samples]
+            data = [q['data'] for q in query_samples]
+            bs = self.max_batchsize
+            for i in range(0, len(idx), bs):
+                responses.extend(
+                    self.run_one_item(
+                        Item(query_id[i : i + bs], idx[i : i + bs], data[i : i + bs])
+                    )
+                )
+        except Exception:  # pylint: disable=broad-except
+            log.exception("An error occured in enqueue")
+        return responses
+
+    def finish(self):
+        """No-op placeholder."""
+        pass
+
+def initialize():
+    """Parse the command line and build the module-level serving state.
+
+    Fills the globals `args`, `backends`, `models`, `ds`, `post_proc` and
+    `runners` (one backend, model and runner per GPU index), then calls
+    start_run() on every runner with one shared result dict.
+    """
+    global backends, models, runners, ds, args, post_proc
+    args = get_args()
+
+    log.info(args)
+
+    # One backend per GPU; adjust the index range as needed.
+    backends = [
+        get_backend(
+            args.backend,
+            precision=args.dtype,
+            device=f'cuda:{gpu_index}',
+            model_path=args.model_path,
+            batch_size=args.max_batchsize,
+        )
+        for gpu_index in range(4)
+    ]
+    models = [be.load() for be in backends]
+
+    torch_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16}
+    dtype = torch_dtypes.get(args.dtype, torch.float32)
+
+    # Load dataset; the tokenizers are taken from the first model's pipeline.
+    dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
+    first_pipe = models[0].pipe
+    ds = dataset_class(
+        data_path=args.dataset_path,
+        name=args.dataset,
+        pre_process=pre_proc,
+        count=args.count,
+        threads=args.threads,
+        pipe_tokenizer=first_pipe.tokenizer,
+        pipe_tokenizer_2=first_pipe.tokenizer_2,
+        latent_dtype=dtype,
+        latent_device=args.device,
+        latent_framework=args.latent_framework,
+        **kwargs,
+    )
+
+    scenario = SCENARIO_MAP[args.scenario]
+    # Every supported scenario is served by the same runner class.
+    runner_map = {known: RunnerBase for known in SCENARIO_MAP.values()}
+    runner_cls = runner_map[scenario]
+    runners = [
+        runner_cls(model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize)
+        for model in models
+    ]
+
+    # added because we need to pass result_dict to the runner class
+    log.info("starting {}".format(scenario))
+    result_dict = {"scenario": str(scenario)}
+    for runner in runners:
+        runner.start_run(result_dict, args.accuracy)
+
+@app.route('/predict/', methods=['POST'])
+def predict():
+    """Serve one inference request from the LON node.
+
+    The JSON body carries a 'query_samples' list. It is cut into one
+    contiguous segment per runner (the last one takes the remainder) and
+    the non-empty segments are run concurrently.
+
+    Returns:
+        An application/octet-stream response made of one record per result:
+        query id (struct 'Q'), data length (struct 'I'), then the data bytes.
+    """
+    query_data = request.get_json(force=True)
+    query_samples = query_data['query_samples']
+
+    # Distribute queries among runners
+    num_runners = len(runners)
+    seg_len = len(query_samples) // num_runners
+    last = num_runners - 1
+    segments = [
+        query_samples[i * seg_len : (i + 1) * seg_len] for i in range(last)
+    ]
+    segments.append(query_samples[last * seg_len :])
+
+    # Use ThreadPoolExecutor to run queries concurrently
+    responses = []
+    with ThreadPoolExecutor(max_workers=num_runners) as executor:
+        # Skip empty segments (fewer samples than runners) so that no
+        # runner is asked to process an empty batch.
+        futures = {
+            executor.submit(runner.enqueue, segment): runner
+            for runner, segment in zip(runners, segments)
+            if segment
+        }
+
+        for future in as_completed(futures):
+            runner = futures[future]
+            try:
+                responses.extend(future.result())
+            except Exception as exc:  # pylint: disable=broad-except
+                log.error(f'Runner {runner} generated an exception: {exc}')
+
+    log.info("response of len %d returned", len(responses))
+
+    started = time.time()
+    # Each record: the query_id (8 bytes), the length of data (4 bytes),
+    # then the data itself.
+    response_bytes = bytearray()
+    for resp in responses:
+        data_bytes = np.array(resp['data'], dtype=np.uint8).tobytes()
+        response_bytes += struct.pack('Q', resp['query_id'])
+        response_bytes += struct.pack('I', len(data_bytes))
+        response_bytes += data_bytes
+    log.debug("packed response in %s s", time.time() - started)
+
+    return Response(bytes(response_bytes), mimetype='application/octet-stream')
+
+@app.route('/getname/', methods=['POST', 'GET'])
+def getname():
+    """Return this node's name as JSON under the "name" key."""
+    return jsonify(name=f"SUT Node running on {socket.gethostname()}")
+
+def issue_queries(query_samples):
+    """Do nothing; this function is not used in the networked version."""
+
+def flush_queries():
+    """Do nothing."""
+
+if __name__ == "__main__":
+    initialize()
+
+    # Local address of the outbound route; a UDP connect() sends no packet.
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
+        probe.connect(("8.8.8.8", 80))
+        ip_address = probe.getsockname()[0]
+
+    # Change host ip addr and port number
+    app.run(host=ip_address, port=8008)