diff --git a/text_to_image/StableDiffusionMGX.py b/text_to_image/StableDiffusionMGX.py
new file mode 100644
index 0000000000..18186cdc62
--- /dev/null
+++ b/text_to_image/StableDiffusionMGX.py
@@ -0,0 +1,875 @@
+#  The MIT License (MIT)
+#
+#  Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the 'Software'), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+from argparse import ArgumentParser
+from diffusers import EulerDiscreteScheduler
+from transformers import CLIPTokenizer
+from PIL import Image
+
+import migraphx as mgx
+from functools import wraps
+from tqdm import tqdm
+from hip import hip
+from collections import namedtuple
+
+import os
+import sys
+import torch
+import time
+import logging
+import coco
+import dataset
+
+logging.basicConfig(level=logging.ERROR)
+log = logging.getLogger("mgx-base")
+# NOTE(review): `log` sets no level of its own, so it inherits ERROR from the root logger
+# and log.info() records never reach the INFO file handler below — confirm this is intended.
+formatter = logging.Formatter("{levelname} - {message}", style="{")
+file_handler = logging.FileHandler("mgx.log", mode="a", encoding="utf-8")
+file_handler.setLevel("INFO")
+file_handler.setFormatter(formatter)
+log.addHandler(file_handler)
+
+# Pair of HIP event handles marking the start and end of one profiled section.
+HipEventPair = namedtuple('HipEventPair', ['start', 'end'])
+
+
+def measure(fn):
+    """Decorator that prints the elapsed time of every call to *fn* in milliseconds."""
+    @wraps(fn)
+    def timed(*args, **kwargs):
+        began = time.perf_counter_ns()
+        outcome = fn(*args, **kwargs)
+        elapsed_ns = time.perf_counter_ns() - began
+        # perf_counter_ns() counts nanoseconds; 1e-6 converts them to milliseconds.
+        elapsed_ms = elapsed_ns * 1e-6
+        print(f"Elapsed time for {fn.__name__}: {elapsed_ms:.4f} ms\n")
+        return outcome
+
+    return timed
+
+
+def get_args():
+    """Parse the command-line options for the SDXL MIGraphX pipeline.
+
+    Returns the argparse.Namespace produced by parse_args() (reads sys.argv).
+    """
+    parser = ArgumentParser()
+    # Model compile
+    parser.add_argument(
+        "--pipeline-type",
+        type=str,
+        choices=["sdxl", "sdxl-opt", "sdxl-turbo"],
+        required=True,
+        help="Specify pipeline type. Options: `sdxl`, `sdxl-opt`, `sdxl-turbo`",
+    )
+
+    parser.add_argument(
+        "--onnx-model-path",
+        type=str,
+        default=None,
+        help="Path to onnx model files. Use it to override the default models/<sdxl*> path",
+    )
+
+    parser.add_argument(
+        "--compiled-model-path",
+        type=str,
+        default=None,
+        help="Path to compiled mxr model files. If not set, it will be saved next to the onnx model.",
+    )
+
+    parser.add_argument(
+        "--use-refiner",
+        action="store_true",
+        default=False,
+        help="Use the refiner model",
+    )
+
+    parser.add_argument(
+        "--refiner-onnx-model-path",
+        type=str,
+        default=None,
+        help="Path to refiner onnx model files. Use it to override the default models/sdxl-1.0-refiner path",
+    )
+
+    parser.add_argument(
+        "--refiner-compiled-model-path",
+        type=str,
+        default=None,
+        help="Path to compiled mxr model files. If not set, it will be saved next to the refiner onnx model.",
+    )
+
+    parser.add_argument(
+        "--fp16",
+        choices=[
+            "all", "vae", "clip", "clip2", "unetxl", "refiner_clip2",
+            "refiner_unetxl"
+        ],
+        nargs="+",
+        help="Quantize models with fp16 precision.",
+    )
+
+    parser.add_argument(
+        "--force-compile",
+        action="store_true",
+        default=False,
+        help="Ignore existing .mxr files and override them",
+    )
+
+    parser.add_argument(
+        "--exhaustive-tune",
+        action="store_true",
+        default=False,
+        help="Perform exhaustive tuning when compiling onnx models",
+    )
+
+    # Runtime
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed",
+    )
+
+    parser.add_argument(
+        "-t",
+        "--steps",
+        type=int,
+        default=20,
+        help="Number of steps",
+    )
+
+    parser.add_argument(
+        "--refiner-steps",
+        type=int,
+        default=20,
+        help="Number of refiner steps",
+    )
+
+    parser.add_argument(
+        "-p",
+        "--prompt",
+        type=str,
+        # Not required: the __main__ block below runs its own fixed prompt list.
+        help="Prompt",
+    )
+
+    parser.add_argument(
+        "-n",
+        "--negative-prompt",
+        type=str,
+        default="",
+        help="Negative prompt",
+    )
+
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=5.0,
+        help="Guidance scale",
+    )
+
+    parser.add_argument(
+        "--refiner-aesthetic-score",
+        type=float,
+        default=6.0,
+        help="aesthetic score for refiner",
+    )
+
+    parser.add_argument(
+        "--refiner-negative-aesthetic-score",
+        type=float,
+        default=2.5,
+        help="negative aesthetic score for refiner",
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default=None,
+        help="Output name",
+    )
+
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        default=False,
+        help="Log during run",
+    )
+    return parser.parse_args()
+
+
+model_shapes = {  # model key -> {input name: dims}; handed to mgx.parse_onnx as map_input_dims
+    "clip": {
+        "input_ids": [2, 77]  # 2 rows: tokenize() encodes [prompt, negative_prompt]
+    },
+    "clip2": {
+        "input_ids": [2, 77]
+    },
+    "unetxl": {
+        "sample": [2, 4, 128, 128],
+        "encoder_hidden_states": [2, 77, 2048],
+        "text_embeds": [2, 1280],
+        "time_ids": [2, 6],  # six values per row, built in run()
+        "timestep": [1],
+    },
+    "refiner_unetxl": {
+        "sample": [2, 4, 128, 128],
+        "encoder_hidden_states": [2, 77, 1280],
+        "text_embeds": [2, 1280],
+        "time_ids": [2, 5],  # run() passes four size/crop values plus one aesthetic score
+        "timestep": [1],
+    },
+    "vae": {
+        "latent_sample": [1, 4, 128, 128]  # run() reads the latent height/width from dims 2 and 3
+    },
+}
+
+model_names = {  # pipeline type -> {model key: directory name used by load_mgx_model}
+    "sdxl": {
+        "clip": "text_encoder",
+        "clip2": "text_encoder_2",
+        "unetxl": "unet",
+        "vae": "vae_decoder",
+    },
+    "sdxl-opt": {
+        "clip": "clip.opt.mod",
+        "clip2": "clip2.opt.mod",
+        "unetxl": "unetxl.opt",
+        "vae": "vae_decoder",
+    },
+    "sdxl-turbo": {
+        "clip": "text_encoder",
+        "clip2": "text_encoder_2",
+        "unetxl": "unet",
+        "vae": "vae_decoder",
+    },
+    "refiner": {  # no "clip"/"vae" entries: only clip2 and unetxl are loaded for the refiner
+        "clip2": "clip2.opt.mod",
+        "unetxl": "unetxl.opt",
+    },
+}
+
+default_model_paths = {  # fallback model directories used by StableDiffusionMGX.__init__
+    "sdxl": "models/sdxl-1.0-base",
+    "sdxl-opt": "models/sdxl-1.0-base",
+    "sdxl-turbo": "models/sdxl-turbo",
+    "refiner": "models/sdxl-1.0-refiner",
+}
+
+mgx_to_torch_dtype_dict = {  # keyed by the string returned from an mgx shape's type_string()
+    "bool_type": torch.bool,
+    "uint8_type": torch.uint8,
+    "int8_type": torch.int8,
+    "int16_type": torch.int16,
+    "int32_type": torch.int32,
+    "int64_type": torch.int64,
+    "float_type": torch.float32,
+    "double_type": torch.float64,
+    "half_type": torch.float16,
+}
+
+torch_to_mgx_dtype_dict = {  # inverse of the table above; used by tensor_to_arg
+    value: key
+    for (key, value) in mgx_to_torch_dtype_dict.items()
+}
+
+
+def tensor_to_arg(tensor):
+    """Build a MIGraphX argument from *tensor*'s dtype, sizes, strides and data pointer."""
+    # A KeyError here means the tensor dtype has no entry in torch_to_mgx_dtype_dict.
+    mgx_type = torch_to_mgx_dtype_dict[tensor.dtype]
+    layout = mgx.shape(type=mgx_type,
+                       lens=list(tensor.size()),
+                       strides=list(tensor.stride()))
+    return mgx.argument_from_pointer(layout, tensor.data_ptr())
+
+
+def tensors_to_args(tensors):  # {name: torch tensor} -> {name: MIGraphX argument}
+    return dict((label, tensor_to_arg(buf)) for label, buf in tensors.items())
+
+
+def get_output_name(idx):  # key under which output number *idx* is looked up in the tensor dicts
+    return f"main:#output_{idx}"
+
+
+def copy_tensor_sync(tensor, data):  # same copy as copy_tensor, then torch.cuda.synchronize()
+    tensor.copy_(data.to(dtype=tensor.dtype))
+    torch.cuda.synchronize()
+
+
+def copy_tensor(tensor, data):  # in-place write of *data*, cast to the destination dtype
+    tensor.copy_(data.to(dtype=tensor.dtype))
+
+
+def run_model_sync(model, args):  # runs the model, then calls mgx.gpu_sync() before returning
+    model.run(args)
+    mgx.gpu_sync()
+
+
+def run_model_async(model, args, stream):  # *stream* is passed on tagged as an "ihipStream_t"
+    model.run_async(args, stream, "ihipStream_t")
+
+
+def allocate_torch_tensors(model):
+    """Return {name: zero-filled CUDA tensor} for every model.get_parameter_shapes() entry."""
+    # Allocate directly on the GPU in the target dtype; this avoids a temporary
+    # default-dtype host tensor followed by a dtype conversion and a device copy.
+    return {
+        name: torch.zeros(shape.lens(), dtype=mgx_to_torch_dtype_dict[shape.type_string()], device="cuda")
+        for name, shape in model.get_parameter_shapes().items()
+    }
+
+
+class StableDiffusionMGX():
+    def __init__(self, pipeline_type, onnx_model_path, compiled_model_path,
+                 use_refiner, refiner_onnx_model_path,
+                 refiner_compiled_model_path, fp16, force_compile,
+                 exhaustive_tune, tokenizers=None, scheduler=None):
+        """Load (or compile) the MIGraphX models and allocate their GPU buffers.
+
+        A missing onnx_model_path falls back to default_model_paths; scheduler and
+        tokenizers are created with from_pretrained when not passed in; fp16=None
+        quantizes nothing and "all" selects every model key. Refiner models are
+        only loaded when use_refiner is true.
+        """
+        # onnx_model_path is joined into paths below; default it even if only a compiled path is given.
+        if not onnx_model_path:
+            onnx_model_path = default_model_paths[pipeline_type]
+
+        self.use_refiner = use_refiner
+        if not self.use_refiner and (refiner_onnx_model_path
+                                     or refiner_compiled_model_path):
+            print(
+                "WARN: Refiner model is provided, but was *not* enabled. Use --use-refiner to enable it."
+            )
+        if self.use_refiner and not (refiner_onnx_model_path
+                                     or refiner_compiled_model_path):
+            refiner_onnx_model_path = default_model_paths["refiner"]
+
+        is_turbo = "turbo" in pipeline_type
+        model_id = "stabilityai/sdxl-turbo" if is_turbo else "stabilityai/stable-diffusion-xl-base-1.0"
+        print(f"Using {model_id}")
+
+        if scheduler is None:
+            print("Creating EulerDiscreteScheduler scheduler")
+            self.scheduler = EulerDiscreteScheduler.from_pretrained(
+                model_id, subfolder="scheduler")
+        else:
+            self.scheduler = scheduler
+
+        print("Creating CLIPTokenizer tokenizers...")
+        if tokenizers is None:
+            tknz_path1 = os.path.join(onnx_model_path, "tokenizer")
+            tknz_path2 = os.path.join(onnx_model_path, "tokenizer_2")
+            self.tokenizers = {
+                "clip":
+                CLIPTokenizer.from_pretrained(tknz_path1),
+                "clip2":
+                CLIPTokenizer.from_pretrained(tknz_path2)
+            }
+        else:
+            self.tokenizers = tokenizers
+
+        if fp16 is None:
+            fp16 = []
+        elif "all" in fp16:
+            fp16 = [
+                "vae", "clip", "clip2", "unetxl", "refiner_clip2",
+                "refiner_unetxl"
+            ]
+
+        # Work on a copy so that picking the fp16 VAE does not alter the shared model_names table.
+        names = dict(model_names[pipeline_type])
+        if "vae" in fp16:
+            names["vae"] = "vae_decoder_fp16_fix"
+
+        log.info("Load models...")
+        self.models = {
+            "vae":
+            StableDiffusionMGX.load_mgx_model(
+                names["vae"],
+                model_shapes["vae"],
+                os.path.join(onnx_model_path, 'vae_decoder_fp16_fix'),
+                compiled_model_path=compiled_model_path,
+                use_fp16="vae" in fp16,
+                force_compile=force_compile,
+                exhaustive_tune=exhaustive_tune,
+                offload_copy=False),
+            "clip":
+            StableDiffusionMGX.load_mgx_model(
+                names["clip"],
+                model_shapes["clip"],
+                os.path.join(onnx_model_path, 'text_encoder'),
+                compiled_model_path=compiled_model_path,
+                use_fp16="clip" in fp16,
+                force_compile=force_compile,
+                exhaustive_tune=exhaustive_tune,
+                offload_copy=False),
+            "clip2":
+            StableDiffusionMGX.load_mgx_model(
+                names["clip2"],
+                model_shapes["clip2"],
+                os.path.join(onnx_model_path, 'text_encoder_2'),
+                compiled_model_path=compiled_model_path,
+                use_fp16="clip2" in fp16,
+                force_compile=force_compile,
+                exhaustive_tune=exhaustive_tune,
+                offload_copy=False),
+            "unetxl":
+            StableDiffusionMGX.load_mgx_model(
+                names["unetxl"],
+                model_shapes["unetxl"],
+                os.path.join(onnx_model_path, 'unet'),
+                compiled_model_path=compiled_model_path,
+                use_fp16="unetxl" in fp16,
+                force_compile=force_compile,
+                exhaustive_tune=exhaustive_tune,
+                offload_copy=False)
+        }
+
+        log.info("init: loaded models")
+
+        self.tensors = {
+            "clip": allocate_torch_tensors(self.models["clip"]),
+            "clip2": allocate_torch_tensors(self.models["clip2"]),
+            "unetxl": allocate_torch_tensors(self.models["unetxl"]),
+            "vae": allocate_torch_tensors(self.models["vae"]),
+        }
+
+        self.model_args = {
+            "clip": tensors_to_args(self.tensors["clip"]),
+            "clip2": tensors_to_args(self.tensors["clip2"]),
+            "unetxl": tensors_to_args(self.tensors["unetxl"]),
+            "vae": tensors_to_args(self.tensors["vae"]),
+        }
+
+        if self.use_refiner:
+            log.info(f"init: self.use_refiner: {self.use_refiner}")
+            # Note: there is no clip for refiner, only clip2
+            self.models["refiner_clip2"] = StableDiffusionMGX.load_mgx_model(
+                model_names["refiner"]["clip2"],
+                model_shapes["clip2"],
+                refiner_onnx_model_path,
+                compiled_model_path=refiner_compiled_model_path,
+                use_fp16="refiner_clip2" in fp16,
+                force_compile=force_compile,
+                exhaustive_tune=exhaustive_tune,
+                offload_copy=False)
+
+            log.info("init: load refiner clip2")
+
+            self.models["refiner_unetxl"] = StableDiffusionMGX.load_mgx_model(
+                model_names["refiner"]["unetxl"],
+                model_shapes[
+                    "refiner_unetxl"],  # this differ from the original unetxl
+                refiner_onnx_model_path,
+                compiled_model_path=refiner_compiled_model_path,
+                use_fp16="refiner_unetxl" in fp16,
+                force_compile=force_compile,
+                exhaustive_tune=exhaustive_tune,
+                offload_copy=False)
+
+            log.info("init: load refiner unet")
+
+            self.tensors["refiner_clip2"] = allocate_torch_tensors(
+                self.models["refiner_clip2"])
+            self.tensors["refiner_unetxl"] = allocate_torch_tensors(
+                self.models["refiner_unetxl"])
+            self.model_args["refiner_clip2"] = tensors_to_args(
+                self.tensors["refiner_clip2"])
+            self.model_args["refiner_unetxl"] = tensors_to_args(
+                self.tensors["refiner_unetxl"])
+
+        log.info("init: creating hip events")
+        # hipEventCreate returns a tuple (error, event); only the event is kept.
+        self.events = {
+            "warmup":
+            HipEventPair(start=hip.hipEventCreate()[1],
+                         end=hip.hipEventCreate()[1]),
+            "run":
+            HipEventPair(start=hip.hipEventCreate()[1],
+                         end=hip.hipEventCreate()[1]),
+            "clip":
+            HipEventPair(start=hip.hipEventCreate()[1],
+                         end=hip.hipEventCreate()[1]),
+            "denoise":
+            HipEventPair(start=hip.hipEventCreate()[1],
+                         end=hip.hipEventCreate()[1]),
+            "decode":
+            HipEventPair(start=hip.hipEventCreate()[1],
+                         end=hip.hipEventCreate()[1]),
+        }
+
+        self.stream = hip.hipStreamCreate()[1]
+
+    def cleanup(self):  # destroys every HIP event pair and then the HIP stream
+        for pair in self.events.values():
+            for handle in pair:  # a HipEventPair iterates as (start, end)
+                hip.hipEventDestroy(handle)
+        hip.hipStreamDestroy(self.stream)
+
+    def profile_start(self, name):  # no-op when *name* has no event pair
+        if (pair := self.events.get(name)) is not None:
+            hip.hipEventRecord(pair.start, None)
+
+    def profile_end(self, name):  # no-op when *name* has no event pair
+        if (pair := self.events.get(name)) is not None:
+            hip.hipEventRecord(pair.end, None)
+
+    # @measure
+    @torch.no_grad()
+    def run(self,
+            prompt,
+            steps=20,
+            negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude",
+            seed=42,
+            scale=5.0,
+            refiner_steps=20,
+            refiner_aesthetic_score=6.0,
+            refiner_negative_aesthetic_score=2.5,
+            verbose=False,
+            prompt_tokens=None,
+            latents_in=None,
+            device="cuda"):
+        """Generate an image for *prompt* and return the VAE output tensor.
+
+        prompt_tokens, when given, is used instead of tokenizing prompt and
+        negative_prompt. latents_in, when given, replaces the seeded random
+        latents; in both cases the starting latents are multiplied by the
+        scheduler's init_noise_sigma. The refiner pass runs only when
+        use_refiner is set and refiner_steps > 0. The result is the tensor
+        returned by decode(); convert_to_rgb_image() turns it into a PIL image.
+        """
+        torch.cuda.synchronize()
+        self.profile_start("run")
+        # need to set this for each run
+        self.scheduler.set_timesteps(steps, device=device)
+
+        if verbose:
+            print("Tokenizing prompts...")
+        # Pre-tokenized prompts passed by the caller are used as they are.
+        if prompt_tokens is None:
+            prompt_tokens = self.tokenize(prompt, negative_prompt)
+
+        if verbose:
+            print("Creating text embeddings...")
+        self.profile_start("clip")
+        hidden_states, text_embeddings = self.get_embeddings(prompt_tokens)
+        self.profile_end("clip")
+        sample_size = list(self.tensors["vae"]["latent_sample"].size())
+        if verbose:
+            print(
+                f"Creating random input data {sample_size} (latents) with {seed = }..."
+            )
+
+        height, width = sample_size[2:]
+        # input h/w crop h/w output h/w
+        time_id = [height * 8, width * 8, 0, 0, height * 8, width * 8]
+        time_ids = torch.tensor([time_id, time_id]).to(device=device)
+
+        if latents_in is None:
+            noise = torch.randn(
+                sample_size, generator=torch.manual_seed(seed)).to(device=device)
+            initial_latents = noise
+        else:
+            # Only the refiner's add_noise needs noise in this case; it is drawn there.
+            noise = None
+            initial_latents = latents_in
+        if verbose:
+            print("Apply initial noise sigma\n")
+        latents = initial_latents * self.scheduler.init_noise_sigma
+
+        if verbose:
+            print("Running denoising loop...")
+        self.profile_start("denoise")
+        for step, t in tqdm(enumerate(self.scheduler.timesteps),
+                    total=len(self.scheduler.timesteps),
+                    desc=f"Device {device} Denoising",
+                    ncols=100,
+                    leave=True):
+            if verbose:
+                print(f"#{step}/{len(self.scheduler.timesteps)} step")
+            latents = self.denoise_step(text_embeddings,
+                                        hidden_states,
+                                        latents,
+                                        t,
+                                        scale,
+                                        time_ids,
+                                        model="unetxl",
+                                        device=device)
+        self.profile_end("denoise")
+        if self.use_refiner and refiner_steps > 0:
+            hidden_states, text_embeddings = self.get_embeddings(
+                prompt_tokens, is_refiner=True)
+            # input h/w crop h/w scores
+            time_id_pos = time_id[:4] + [refiner_aesthetic_score]
+            time_id_neg = time_id[:4] + [refiner_negative_aesthetic_score]
+            time_ids = torch.tensor([time_id_pos,
+                                     time_id_neg]).to(device=device)
+            # need to set this for each run
+            self.scheduler.set_timesteps(refiner_steps, device=device)
+            if noise is None:
+                # latents_in was supplied, so no noise tensor exists yet; without this
+                # the add_noise call below failed with a NameError.
+                noise = torch.randn(
+                    sample_size, generator=torch.manual_seed(seed)).to(device=device)
+            # Add noise to latents using timesteps
+            latents = self.scheduler.add_noise(latents, noise,
+                                               self.scheduler.timesteps[:1])
+            if verbose:
+                print("Running refiner denoising loop...")
+            for step, t in enumerate(self.scheduler.timesteps):
+                if verbose:
+                    print(f"#{step}/{len(self.scheduler.timesteps)} step")
+                latents = self.denoise_step(text_embeddings,
+                                            hidden_states,
+                                            latents,
+                                            t,
+                                            scale,
+                                            time_ids,
+                                            model="refiner_unetxl",
+                                            device=device)
+        if verbose:
+            print("Scale denoised result...")
+        # NOTE(review): 0.18215 is kept unchanged; the published SDXL VAE config lists a
+        # scaling_factor of 0.13025 — confirm which value matches the VAE model loaded here.
+        latents = 1 / 0.18215 * latents
+
+        self.profile_start("decode")
+        if verbose:
+            print("Decode denoised result...")
+        image = self.decode(latents)
+        self.profile_end("decode")
+
+        torch.cuda.synchronize()
+        self.profile_end("run")
+        # assert image.shape == (1, 3, 1024, 1024), f"Actual shape of image is: {image.shape}"
+        return image
+
+    def print_summary(self, denoise_steps):
+        """Print the elapsed milliseconds measured for each profiled section.
+
+        denoise_steps only labels the UNet row; it is not used in any computation.
+        """
+        rows = (
+            ('WARMUP\t{:>9.2f} ms', (), 'warmup'),
+            ('CLIP\t{:>9.2f} ms', (), 'clip'),
+            ('UNetx{}\t{:>9.2f} ms', (str(denoise_steps), ), 'denoise'),
+            ('VAE-Dec\t{:>9.2f} ms', (), 'decode'),
+            ('RUN\t{:>9.2f} ms', (), 'run'),
+        )
+        for template, leading, key in rows:
+            pair = self.events[key]
+            # hipEventElapsedTime returns a tuple here; element 1 is the value printed.
+            elapsed = hip.hipEventElapsedTime(pair.start, pair.end)[1]
+            print(template.format(*leading, elapsed))
+
+    # @measure
+    @staticmethod
+    def load_mgx_model(name,
+                       shapes,
+                       onnx_model_path,
+                       compiled_model_path=None,
+                       use_fp16=False,
+                       force_compile=False,
+                       exhaustive_tune=False,
+                       offload_copy=True):
+        """Return a compiled MIGraphX program for the model called *name*.
+
+        Loads <compiled_model_path>/<name>/model_*.mxr if present and not force_compile; else
+        parses <onnx_model_path>/<name>/model.onnx, optionally quantizes to fp16, compiles and
+        saves that .mxr. Calls sys.exit(1) when no usable file is found.
+        """
+        log.info("Zixian: inside load_mgx_model")
+        print(f"Loading {name} model...")
+        if compiled_model_path is None:
+            compiled_model_path = onnx_model_path
+        onnx_file = f"{onnx_model_path}/{name}/model.onnx"
+        mxr_file = f"{compiled_model_path}/{name}/model_{'fp16' if use_fp16 else 'fp32'}_{'gpu' if not offload_copy else 'oc'}.mxr"
+        log.info(f"Zixian: mxr_file: {mxr_file}")
+
+        if not force_compile and os.path.isfile(mxr_file):
+            print(f"Found mxr, loading it from {mxr_file}")
+            model = mgx.load(mxr_file, format="msgpack")
+        elif os.path.isfile(onnx_file):
+            print(f"No mxr found at {mxr_file}")
+            print(f"[IMPORTANT] Parsing from {onnx_file}")
+            model = mgx.parse_onnx(onnx_file, map_input_dims=shapes)
+            if use_fp16:
+                print(f"[IMPORTANT] {name} quantizing to FP16...")
+                mgx.quantize_fp16(model)
+            model.compile(mgx.get_target("gpu"),
+                          exhaustive_tune=exhaustive_tune,
+                          offload_copy=offload_copy)
+            print(f"Saving {name} model to {mxr_file}")
+            os.makedirs(os.path.dirname(mxr_file), exist_ok=True)
+            mgx.save(model, mxr_file, format="msgpack")
+        else:
+            log.info("Zixian: no model found")
+            print(f"No {name} model found at {onnx_file} or {mxr_file}. Please download it and re-try.")
+            sys.exit(1)
+        return model
+
+    # @measure
+    def tokenize(self, prompt, negative_prompt):
+        """Encode [prompt, negative_prompt] with both tokenizers; returns (clip, clip2)."""
+        encoded = []
+        for key in ("clip", "clip2"):
+            tok = self.tokenizers[key]
+            # Pad and truncate to the tokenizer's own maximum length.
+            encoded.append(tok([prompt, negative_prompt],
+                               padding="max_length",
+                               max_length=tok.model_max_length,
+                               truncation=True,
+                               return_tensors="pt"))
+        return tuple(encoded)
+
+    # @measure
+    def get_embeddings(self, prompt_tokens, is_refiner=False):
+        """Run the text encoder(s) on *prompt_tokens*; return (hidden_states, text_embeds)."""
+        clip_tokens, clip2_tokens = prompt_tokens
+        clip2_key = "refiner_clip2" if is_refiner else "clip2"
+        # For the refiner only its clip2 encoder is run; otherwise clip runs first, then clip2.
+        jobs = [(clip2_key, clip2_tokens)]
+        if not is_refiner:
+            jobs.insert(0, ("clip", clip_tokens))
+        for key, tokens in jobs:
+            copy_tensor(self.tensors[key]["input_ids"], tokens.input_ids)
+            run_model_async(self.models[key], self.model_args[key], self.stream)
+
+        clip2_out = self.tensors[clip2_key]
+        hidden_states = clip2_out[get_output_name(1)]
+        if not is_refiner:
+            hidden_states = torch.concatenate(
+                (self.tensors["clip"][get_output_name(0)], hidden_states), axis=2)
+        text_embeds = clip2_out[get_output_name(0)]
+        return (hidden_states, text_embeds)
+
+    @staticmethod
+    def convert_to_rgb_image(image):  # maps image / 2 + 0.5 to uint8 and keeps batch item 0 only
+        unit_range = (image / 2 + 0.5).clamp(0, 1)
+        channels_last = unit_range.detach().cpu().permute(0, 2, 3, 1).numpy()
+        pixels = (channels_last * 255).round().astype("uint8")
+        return Image.fromarray(pixels[0])
+
+    @staticmethod
+    def save_image(pil_image, filename="output.png"):  # delegates to pil_image.save(filename)
+        pil_image.save(filename)
+
+    # @measure
+    def denoise_step(self, text_embeddings, hidden_states, latents, t, scale,
+                     time_ids, model, device):
+        """Run one guided UNet step with *model* and return the scheduler's prev_sample."""
+        buffers = self.tensors[model]
+        # The latents are stacked twice so the batch matches the two-row prompt inputs.
+        doubled = torch.cat((latents, latents))
+        scaled = self.scheduler.scale_model_input(doubled, t).to(device=device)
+        inputs = {
+            "sample": scaled,
+            "encoder_hidden_states": hidden_states,
+            "text_embeds": text_embeddings,
+            "timestep": torch.atleast_1d(t.to(device=device)),  # convert 0D -> 1D
+            "time_ids": time_ids,
+        }
+        for input_name, value in inputs.items():
+            copy_tensor(buffers[input_name], value)
+        run_model_async(self.models[model], self.model_args[model], self.stream)
+
+        # The first half of the output batch is treated as the prompt prediction.
+        cond, uncond = torch.tensor_split(buffers[get_output_name(0)], 2)
+        # perform guidance
+        guided = uncond + scale * (cond - uncond)
+
+        # compute the previous noisy sample x_t -> x_t-1
+        return self.scheduler.step(guided, t, latents).prev_sample
+
+    # @measure
+    def decode(self, latents):  # returns the VAE output tensor held in self.tensors, not a copy
+        vae_buffers = self.tensors["vae"]
+        copy_tensor(vae_buffers["latent_sample"], latents)
+        run_model_async(self.models["vae"], self.model_args["vae"], self.stream)
+        return vae_buffers[get_output_name(0)]
+
+    # @measure
+    def warmup(self, num_runs):
+        """Fill every model buffer with dummy data, then run each model *num_runs* times."""
+        self.profile_start("warmup")
+        for key in self.models:
+            # Models whose key contains "clip" get ones; all others get random normals.
+            make = torch.ones if "clip" in key else torch.randn
+            for buffer in self.tensors[key].values():
+                copy_tensor(buffer, make(buffer.size()))
+        for _ in range(num_runs):
+            for key, program in self.models.items():
+                run_model_async(program, self.model_args[key], self.stream)
+        self.profile_end("warmup")
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # NOTE: the pipeline is built with fixed settings ("sdxl", no refiner, exhaustive
+    # tuning, reuse of cached .mxr files). Of the CLI options only --onnx-model-path,
+    # --fp16, --seed, --steps and --scale are used below; the others are parsed but
+    # currently ignored.
+    sd = StableDiffusionMGX("sdxl", onnx_model_path=args.onnx_model_path,
+                            compiled_model_path=None, use_refiner=False,
+                            refiner_onnx_model_path=None,
+                            refiner_compiled_model_path=None, fp16=args.fp16,
+                            force_compile=False, exhaustive_tune=True)
+
+    # try/finally so the HIP events and stream are released even if a run fails.
+    try:
+        print("Warmup")
+        sd.warmup(5)
+        print("Run")
+
+        # [prompt, output file name] entries, rendered one after another;
+        # the commented-out entries are alternative prompts.
+        prompt_list = [
+            ["A young man in a white shirt is playing tennis.", "tennis.jpg"],
+            # ["Lorem ipsum dolor sit amet, consectetur adipiscing elit", "woman.jpg"],
+            ["Astronaut crashlanding in Madison Square Garden, cold color palette, muted colors, detailed, 8k", "crash_astro.jpg"],
+            # ["John Cena giving The Rock an Attitude Adjustment off the roof, warm color palette, vivid colors, detailed, 8k", "cena_rock.jpg"],
+        ]
+
+        for prompt, img_name in prompt_list:
+            # steps and scale come from the CLI (defaults 20 and 5.0) so that the
+            # step count shown by print_summary matches what was actually run.
+            result = sd.run(prompt=prompt, steps=args.steps, seed=args.seed,
+                            scale=args.scale, refiner_steps=0,
+                            refiner_aesthetic_score=0.0,
+                            refiner_negative_aesthetic_score=0.0, verbose=False)
+
+            print("Summary")
+            sd.print_summary(args.steps)
+
+            print("Convert result to rgb image...")
+            image = StableDiffusionMGX.convert_to_rgb_image(result)
+            StableDiffusionMGX.save_image(image, img_name)
+            print(f"Image saved to {img_name}")
+    finally:
+        print("Cleanup")
+        sd.cleanup()
\ No newline at end of file
diff --git a/text_to_image/backend_migraphx.py b/text_to_image/backend_migraphx.py
new file mode 100644
index 0000000000..2df2605a8d
--- /dev/null
+++ b/text_to_image/backend_migraphx.py
@@ -0,0 +1,301 @@
+from typing import Optional, List, Union
+import migraphx as mgx
+
+import os
+import torch
+import logging
+import sys
+import backend
+import time
+import random
+import json
+import re
+
+from hip import hip
+from PIL import Image
+from functools import wraps
+from collections import namedtuple
+from transformers import CLIPTokenizer, CLIPTextModelWithProjection, CLIPProcessor, CLIPFeatureExtractor
+from diffusers import StableDiffusionXLPipeline, EulerDiscreteScheduler
+from argparse import ArgumentParser
+from StableDiffusionMGX import StableDiffusionMGX
+from huggingface_hub import hf_hub_download, list_repo_files
+import numpy as np
+
+HipEventPair = namedtuple('HipEventPair', ['start', 'end'])
+
+logging.basicConfig(level=logging.ERROR)
+log = logging.getLogger("backend-mgx")
+
+# File handler for INFO+ records. NOTE(review): `log` sets no level of its own, so INFO only arrives if the root level allows it - confirm.
+formatter = logging.Formatter("{levelname} - {message}", style="{")
+file_handler = logging.FileHandler("backend_mgx.log", mode="a", encoding="utf-8")
+file_handler.setLevel("INFO")
+file_handler.setFormatter(formatter)
+log.addHandler(file_handler)
+
+def download_model(repo_id, model_path):
+    """Download the non-ONNX files of a Hugging Face repo into ``model_path``.
+
+    Nothing is fetched when ``model_path`` already exists. Files ending in
+    ``.onnx`` or ``model_fp32_gpu.mxr`` are skipped. A failed download is
+    logged and the remaining files are still attempted (best effort).
+
+    Args:
+        repo_id: Hugging Face repository id to list and download from.
+        model_path: Local directory that receives the files.
+    """
+    if os.path.exists(model_path):
+        log.info(f"MGX models already exists at {model_path}")
+        return
+    os.makedirs(model_path, exist_ok=True)
+
+    skipped_suffixes = (".onnx", "model_fp32_gpu.mxr")
+    files_to_download = [f for f in list_repo_files(repo_id) if not f.endswith(skipped_suffixes)]
+
+    for file_name in files_to_download:
+        local_file_path = os.path.join(model_path, file_name)
+        local_folder = os.path.dirname(local_file_path)
+        os.makedirs(local_folder, exist_ok=True)
+        try:
+            # NOTE(review): local_dir is the file's parent folder, not model_path; confirm the resulting layout.
+            hf_hub_download(repo_id=repo_id, filename=file_name, cache_dir=local_folder, local_dir=local_folder, local_dir_use_symlinks=False)
+        except Exception as e:
+            log.error(f"Failed to download {file_name}: {e}")
+        else:
+            print (f"Zixian_in_the_log: Downloaded {file_name} to {local_file_path}")
+
+#! Yalu Ouyang [Nov 10 2024] Keep this in case we aren't allowed to modify coco.py
+# class Decoder:
+#     def __init__(self, vocab_path):
+#         # Load the vocabulary with UTF-8 encoding to support non-ASCII characters
+#         with open(vocab_path, "r", encoding="utf-8") as f:
+#             vocab = json.load(f)
+        
+#         # Reverse the mapping: token_id -> word
+#         self.id_to_word = {int(id_): word for word, id_ in vocab.items()}
+    
+#     def decode_tokens(self, token_ids):
+#         # Ensure token_ids is a list, even if a tensor is passed
+#         if isinstance(token_ids, torch.Tensor):
+#             token_ids = token_ids.tolist()
+        
+#         # Handle both single sequences and batches
+#         if isinstance(token_ids[0], list):  # Batch of sequences
+#             decoded_texts = [self._decode_sequence(sequence) for sequence in token_ids]
+#             return decoded_texts
+#         else:  # Single sequence
+#             return self._decode_sequence(token_ids)
+    
+#     def _decode_sequence(self, token_ids):
+#         # Convert token IDs to words, handling any unknown tokens
+#         words = [self.id_to_word.get(token_id, "[UNK]") for token_id in token_ids]
+        
+#         # Remove special tokens and `</w>` markers
+#         text = " ".join(words)
+#         text = re.sub(r"(<\|startoftext\|>|<\|endoftext\|>)", "", text)  # Remove special tokens
+#         text = text.replace("</w>", "").strip()  # Remove `</w>` markers and extra whitespace
+#         return text
+
+class BackendMIGraphX(backend.Backend):
+    """SDXL text-to-image backend that runs inference through StableDiffusionMGX.
+
+    The constructor downloads the model files from the Hugging Face hub and
+    loads the scheduler and both CLIP tokenizers; ``load`` builds the
+    StableDiffusionMGX pipeline and ``predict`` turns samples (caption plus
+    latents) into image tensors.
+
+    Attributes set by ``__init__``:
+        model_path: Folder that holds the downloaded model files.
+        model_id: Hugging Face repo id the files are downloaded from.
+        pipeline_type: Pipeline name handed to StableDiffusionMGX ("sdxl").
+        device: Requested device, or "cpu" if torch.cuda.is_available() is false.
+        device_num: GPU index parsed from ``device``, -1 when there is none.
+        dtype: torch dtype selected by ``precision``.
+        mgx: StableDiffusionMGX instance; ``None`` until ``load`` is called.
+        pipe: Holder exposing ``tokenizer`` and ``tokenizer_2``.
+    """
+
+    def __init__(
+        self,
+        model_path=None,
+        model_id="xl",
+        guidance=5, #! To match the defaults of MiGraphX
+        steps=20,
+        batch_size=1,
+        device="cuda",
+        precision="fp32",
+        negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude",
+    ):
+        """Download the model files and set up the scheduler and tokenizers.
+
+        Args:
+            model_path: Currently ignored; files are always stored in
+                ``<cwd>/downloaded_model_folder``.
+            model_id: Model selector; only ``"xl"`` is supported.
+            guidance: Guidance scale passed to ``StableDiffusionMGX.run``.
+            steps: Step count passed to ``StableDiffusionMGX.run``.
+            batch_size: Number of samples grouped per ``predict`` iteration.
+            device: ``"cuda"``, ``"cpu"``, or a name ending in a GPU index
+                such as ``"cuda:1"``.
+            precision: ``"fp16"`` or ``"bf16"``; any other value selects fp32.
+            negative_prompt: Negative prompt used for every generation.
+
+        Raises:
+            ValueError: If ``model_id`` is not supported.
+        """
+        super(BackendMIGraphX, self).__init__()
+        # Zixian: Nov 10: the model folder is hard-coded under the current
+        # dir; the ``model_path`` argument is not consulted.
+        self.model_path = os.path.join(os.getcwd(), "downloaded_model_folder")
+
+        self.pipeline_type = None
+        if model_id == "xl":
+            self.model_id = "SeaSponge/scc24_mlperf_mgx_exhaustive"
+            self.pipeline_type = "sdxl"
+        else:
+            raise ValueError(f"{model_id} is not a valid model id")
+
+        download_model(self.model_id, self.model_path)
+        log.info(f"[mgx backend]: Returned from download_model")
+
+        self.device = device if torch.cuda.is_available() else "cpu"
+        self.device_num = self._parse_device_index(device)
+
+        if precision == "fp16":
+            self.dtype = torch.float16
+        elif precision == "bf16":
+            self.dtype = torch.bfloat16
+        else:
+            self.dtype = torch.float32
+
+        if torch.cuda.is_available():
+            self.local_rank = 0
+            self.world_size = 1
+
+        self.guidance = guidance
+        self.steps = steps
+        self.negative_prompt = negative_prompt
+        self.max_length_neg_prompt = 77
+        self.batch_size = batch_size
+
+        self.mgx = None
+        # Scheduler and tokenizers come from the upstream SDXL base repo, not the downloaded folder.
+        sdxl_repo = "stabilityai/stable-diffusion-xl-base-1.0"
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(sdxl_repo, subfolder="scheduler")
+        log.info(f"Zixian: Loaded scheduler")
+        self.pipe = self.Pipe()
+        self.pipe.tokenizer = CLIPTokenizer.from_pretrained(sdxl_repo, subfolder="tokenizer")
+        self.pipe.tokenizer_2 = CLIPTokenizer.from_pretrained(sdxl_repo, subfolder="tokenizer_2")
+        log.info(f"Zixian: Loaded tokenizer & tokenizer2")
+        self.tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2]
+
+    @staticmethod
+    def _parse_device_index(device):
+        """Return the GPU index at the end of ``device``, or -1 if there is none.
+
+        Reads all trailing digits, so ``"cuda:12"`` yields 12 rather than 2.
+        """
+        if device in ("cuda", "cpu"):
+            return -1
+        trailing_digits = re.search(r"\d+$", device)
+        return int(trailing_digits.group()) if trailing_digits else -1
+
+    class Pipe:
+        """Minimal holder for the ``tokenizer`` and ``tokenizer_2`` attributes."""
+
+        def __init__(self):
+            self.tokenizer = None
+            self.tokenizer_2 = None
+
+    def version(self):
+        """Return the installed PyTorch version string."""
+        return torch.__version__
+
+    def name(self):
+        """Return the backend name string."""
+        return "pytorch-SUT"
+
+    def image_format(self):
+        """Return the image format identifier."""
+        return "NCHW"
+
+    def load(self):
+        """Create the StableDiffusionMGX pipeline and return ``self``.
+
+        When ``device_num`` is not -1 the HIP device is selected first.
+
+        Raises:
+            SystemExit: If ``self.model_path`` is ``None``.
+        """
+        if self.model_path is None:
+            log.warning(
+                "Model path not provided, running with default hugging face weights\n"
+                "This may not be valid for official submissions"
+            )
+            raise SystemExit("Provide a valid Model Path to correctly run the program, exiting now...")
+
+        if self.device_num != -1:
+            hip.hipSetDevice(self.device_num)
+
+        # Request fp16 only when the backend was created with precision="fp16".
+        fp16 = "all" if self.dtype == torch.float16 else None
+        tokenizers = {"clip": self.tokenizers[0], "clip2": self.tokenizers[1]}
+
+        # The refiner model is not used, so every refiner path stays None.
+        # The compiled .mxr path is left as None, .onnx -> .mxr compilation
+        # is not forced, and exhaustive tuning is used when compiling.
+        self.mgx = StableDiffusionMGX(
+            self.pipeline_type,
+            onnx_model_path=self.model_path,
+            compiled_model_path=None,
+            use_refiner=False,
+            refiner_onnx_model_path=None,
+            refiner_compiled_model_path=None,
+            fp16=fp16,
+            force_compile=False,
+            exhaustive_tune=True,
+            tokenizers=tokenizers,
+            scheduler=self.scheduler,
+        )
+        return self
+
+    def predict(self, inputs):
+        """Generate images for every sample in ``inputs``.
+
+        Samples are walked in groups of ``batch_size`` but generated one at
+        a time, each with its own caption, latents and random seed.
+
+        Args:
+            inputs: Sequence of dicts holding a ``"caption"`` entry (used as
+                the prompt) and a ``"latents"`` tensor.
+
+        Returns:
+            List of image tensors, mapped through ``x / 2 + 0.5`` and
+            clamped to ``[0, 1]``.
+        """
+        images = []
+        for start in range(0, len(inputs), self.batch_size):
+            for sample in inputs[start:start + self.batch_size]:
+                latents_input = torch.cat([sample["latents"]]).to(self.device)
+                # Drawn manually because the default SDXLPipeline picks a random seed every time.
+                seed = random.randint(0, 2**31 - 1)
+                # The refiner is not used: its steps and aesthetic scores are all 0.
+                result = self.mgx.run(
+                    prompt=sample["caption"],
+                    negative_prompt=self.negative_prompt,
+                    steps=self.steps,
+                    seed=seed,
+                    scale=self.guidance,
+                    refiner_steps=0,
+                    refiner_aesthetic_score=0,
+                    refiner_negative_aesthetic_score=0,
+                    verbose=False,
+                    prompt_tokens=None,
+                    device=self.device,
+                    latents_in=latents_input,
+                )
+                #! COCO needs this to be 3-dimensions
+                new_res = (result / 2 + 0.5).clamp(0, 1)
+                images.extend(new_res)
+        return images
\ No newline at end of file
diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py
index 36e2b80090..f2af4d75c7 100644
--- a/text_to_image/backend_pytorch.py
+++ b/text_to_image/backend_pytorch.py
@@ -17,9 +17,9 @@ def __init__(
         model_id="xl",
         guidance=8,
         steps=20,
-        batch_size=1,
+        batch_size=2,
         device="cuda",
-        precision="fp32",
+        precision="fp16",
         negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude",
     ):
         super(BackendPytorch, self).__init__()
@@ -57,39 +57,41 @@ def image_format(self):
         return "NCHW"
 
     def load(self):
-        if self.model_path is None:
-            log.warning(
-                "Model path not provided, running with default hugging face weights\n"
-                "This may not be valid for official submissions"
-            )
-            self.scheduler = EulerDiscreteScheduler.from_pretrained(
-                self.model_id, subfolder="scheduler"
-            )
-            self.pipe = StableDiffusionXLPipeline.from_pretrained(
-                self.model_id,
-                scheduler=self.scheduler,
-                safety_checker=None,
-                add_watermarker=False,
-                variant="fp16" if (self.dtype == torch.float16) else None,
-                torch_dtype=self.dtype,
-            )
+        # if self.model_path is None:
+        #     log.warning(
+        #         "Model path not provided, running with default hugging face weights\n"
+        #         "This may not be valid for official submissions"
+        #     )
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            self.model_id, subfolder="scheduler"
+        )
+        self.pipe = StableDiffusionXLPipeline.from_pretrained(
+            self.model_id,
+            scheduler=self.scheduler,
+            safety_checker=None,
+            add_watermarker=False,
+            # variant="fp16" if (self.dtype == torch.float16) else None,
+            variant="fp16" ,
+            torch_dtype=self.dtype,
+        )
             # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True)
-        else:
-            self.scheduler = EulerDiscreteScheduler.from_pretrained(
-                os.path.join(self.model_path, "checkpoint_scheduler"),
-                subfolder="scheduler",
-            )
-            self.pipe = StableDiffusionXLPipeline.from_pretrained(
-                os.path.join(self.model_path, "checkpoint_pipe"),
-                scheduler=self.scheduler,
-                safety_checker=None,
-                add_watermarker=False,
-                torch_dtype=self.dtype,
-            )
+        # else:
+        #     self.scheduler = EulerDiscreteScheduler.from_pretrained(
+        #         os.path.join(self.model_path, "checkpoint_scheduler"),
+        #         subfolder="scheduler",
+        #     )
+        #     self.pipe = StableDiffusionXLPipeline.from_pretrained(
+        #         os.path.join(self.model_path, "checkpoint_pipe"),
+        #         scheduler=self.scheduler,
+        #         safety_checker=None,
+        #         add_watermarker=False,
+        #         variant="fp16" if (self.dtype == torch.float16) else None,
+        #         torch_dtype=self.dtype,
+        #     )
             # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True)
 
         self.pipe.to(self.device)
-        # self.pipe.set_progress_bar_config(disable=True)
+        #self.pipe.set_progress_bar_config(disable=True)
 
         self.negative_prompt_tokens = self.pipe.tokenizer(
             self.convert_prompt(self.negative_prompt, self.pipe.tokenizer),
@@ -210,15 +212,13 @@ def encode_tokens(
                     text_input_ids.to(device), output_hidden_states=True
                 )
 
-                # We are only ALWAYS interested in the pooled output of the
-                # final text encoder
+                # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
                 if clip_skip is None:
                     prompt_embeds = prompt_embeds.hidden_states[-2]
                 else:
                     # "2" because SDXL always indexes from the penultimate layer.
-                    prompt_embeds = prompt_embeds.hidden_states[-(
-                        clip_skip + 2)]
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
 
@@ -234,8 +234,7 @@ def encode_tokens(
             and zero_out_negative_prompt
         ):
             negative_prompt_embeds = torch.zeros_like(prompt_embeds)
-            negative_pooled_prompt_embeds = torch.zeros_like(
-                pooled_prompt_embeds)
+            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
         elif do_classifier_free_guidance and negative_prompt_embeds is None:
             negative_prompt = negative_prompt or ""
             negative_prompt_2 = negative_prompt_2 or negative_prompt
@@ -262,35 +261,30 @@ def encode_tokens(
                     uncond_input.to(device),
                     output_hidden_states=True,
                 )
-                # We are only ALWAYS interested in the pooled output of the
-                # final text encoder
+                # We are only ALWAYS interested in the pooled output of the final text encoder
                 negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                 negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
 
                 negative_prompt_embeds_list.append(negative_prompt_embeds)
 
-            negative_prompt_embeds = torch.concat(
-                negative_prompt_embeds_list, dim=-1)
+            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
 
         if pipe.text_encoder_2 is not None:
             prompt_embeds = prompt_embeds.to(
                 dtype=pipe.text_encoder_2.dtype, device=device
             )
         else:
-            prompt_embeds = prompt_embeds.to(
-                dtype=pipe.unet.dtype, device=device)
+            prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device=device)
 
         bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps
-        # friendly method
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(
             bs_embed * num_images_per_prompt, seq_len, -1
         )
 
         if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per
-            # prompt, using mps friendly method
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
 
             if pipe.text_encoder_2 is not None:
@@ -322,7 +316,7 @@ def encode_tokens(
             pooled_prompt_embeds,
             negative_pooled_prompt_embeds,
         )
-
+    
     def prepare_inputs(self, inputs, i):
         if self.batch_size == 1:
             return self.encode_tokens(
@@ -337,7 +331,7 @@ def prepare_inputs(self, inputs, i):
             negative_prompt_embeds = []
             pooled_prompt_embeds = []
             negative_pooled_prompt_embeds = []
-            for prompt in inputs[i: min(i + self.batch_size, len(inputs))]:
+            for prompt in inputs[i:min(i+self.batch_size, len(inputs))]:
                 assert isinstance(prompt, dict)
                 text_input = prompt["input_tokens"]
                 text_input_2 = prompt["input_tokens_2"]
@@ -358,26 +352,19 @@ def prepare_inputs(self, inputs, i):
                 pooled_prompt_embeds.append(p_p_e)
                 negative_pooled_prompt_embeds.append(n_p_p_e)
 
+
             prompt_embeds = torch.cat(prompt_embeds)
             negative_prompt_embeds = torch.cat(negative_prompt_embeds)
             pooled_prompt_embeds = torch.cat(pooled_prompt_embeds)
-            negative_pooled_prompt_embeds = torch.cat(
-                negative_pooled_prompt_embeds)
-            return (
-                prompt_embeds,
-                negative_prompt_embeds,
-                pooled_prompt_embeds,
-                negative_pooled_prompt_embeds,
-            )
+            negative_pooled_prompt_embeds = torch.cat(negative_pooled_prompt_embeds)
+            return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
     def predict(self, inputs):
         images = []
         with torch.no_grad():
             for i in range(0, len(inputs), self.batch_size):
-                latents_input = [
-                    inputs[idx]["latents"]
-                    for idx in range(i, min(i + self.batch_size, len(inputs)))
-                ]
+                print (f'self.steps BEFORE pipe: {self.steps}')
+                latents_input = [inputs[idx]["latents"] for idx in range(i, min(i+self.batch_size, len(inputs)))]
                 latents_input = torch.cat(latents_input).to(self.device)
                 (
                     prompt_embeds,
@@ -392,8 +379,11 @@ def predict(self, inputs):
                     negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                     guidance_scale=self.guidance,
                     num_inference_steps=self.steps,
+                    # num_inference_steps=20,
                     output_type="pt",
                     latents=latents_input,
                 ).images
+                print (f'self.steps AFTER pipe: {self.steps}')
                 images.extend(generated)
         return images
+
diff --git a/text_to_image/coco.py b/text_to_image/coco.py
index e9499b0e6c..92773ed9ca 100644
--- a/text_to_image/coco.py
+++ b/text_to_image/coco.py
@@ -35,22 +35,27 @@ def __init__(
         latent_dtype=torch.float32,
         latent_device="cuda",
         latent_framework="torch",
+        pipe_type=None,
         **kwargs,
     ):
         super().__init__()
-        self.captions_df = pd.read_csv(
-            f"{data_path}/captions/captions.tsv", sep="\t")
+        self.captions_df = pd.read_csv(f"{data_path}/captions/captions.tsv", sep="\t")
         self.image_size = image_size
         self.preprocessed_dir = os.path.abspath(f"{data_path}/preprocessed/")
         self.img_dir = os.path.abspath(f"{data_path}/validation/data/")
         self.name = name
+        
+        self.pipe_tokenizer = pipe_tokenizer
+        self.pipe_tokenizer_2 = pipe_tokenizer_2
 
         # Preprocess prompts
         self.captions_df["input_tokens"] = self.captions_df["caption"].apply(
-            lambda x: self.preprocess(x, pipe_tokenizer)
+            # lambda x: self.preprocess(x, pipe_tokenizer)
+            lambda x: x
         )
         self.captions_df["input_tokens_2"] = self.captions_df["caption"].apply(
-            lambda x: self.preprocess(x, pipe_tokenizer_2)
+            # lambda x: self.preprocess(x, pipe_tokenizer_2)
+            lambda x: x
         )
         self.latent_dtype = latent_dtype
         self.latent_device = latent_device if torch.cuda.is_available() else "cpu"
@@ -66,6 +71,8 @@ def __init__(
                 .to(latent_dtype)
                 .to(latent_device)
             )
+        
+        self.pipe_type = pipe_type
 
     def preprocess(self, prompt, tokenizer):
         converted_prompt = self.convert_prompt(prompt, tokenizer)
@@ -113,14 +120,34 @@ def convert_prompt(self, prompt, tokenizer):
     def get_item(self, id):
         return dict(self.captions_df.loc[id], latents=self.latents)
 
+    #! Yalu Ouyang [Nov 14 2024] Overrides parent Dataset class, default behavior is same though
+    def get_samples(self, id_list):
+        """Return ``(data, images)`` for the sample ids in ``id_list``.
+
+        ``data`` holds one dict per id, copied from ``self.items_inmemory``:
+        ``caption`` and ``latents`` when ``self.pipe_type`` is ``"migraphx"``,
+        otherwise ``input_tokens``, ``input_tokens_2`` and ``latents``.
+        ``images`` holds the ``file_name`` entry of each id, in the same order.
+        """
+        #! Yalu Ouyang [Nov 14 2024] MGX backend just needs text prompt
+        if self.pipe_type == "migraphx":
+            wanted = ("caption", "latents")
+        else:
+            wanted = ("input_tokens", "input_tokens_2", "latents")
+
+        data = []
+        images = []
+        for sample_id in id_list:
+            record = self.items_inmemory[sample_id]
+            data.append({key: record[key] for key in wanted})
+            images.append(record["file_name"])
+        return data, images
+
     def get_item_count(self):
         return len(self.captions_df)
 
     def get_img(self, id):
-        img = Image.open(
-            self.img_dir +
-            "/" +
-            self.captions_df.loc[id]["file_name"])
+        img = Image.open(self.img_dir + "/" + self.captions_df.loc[id]["file_name"])
         return self.image_to_tensor(img)
 
     def get_imgs(self, id_list):
@@ -141,11 +168,7 @@ def get_item_loc(self, id):
 
 class PostProcessCoco:
     def __init__(
-        self,
-        device="cpu",
-        dtype="uint8",
-        statistics_path=os.path.join(
-            os.path.dirname(__file__), "tools", "val2014.npz"),
+        self, device="cpu", dtype="uint8", statistics_path=os.path.join(os.path.dirname(__file__), "tools", "val2014.npz")
     ):
         self.results = []
         self.good = 0
@@ -167,33 +190,27 @@ def add_results(self, results):
     def __call__(self, results, ids, expected=None, result_dict=None):
         self.content_ids.extend(ids)
         return [
-            (t.cpu().permute(1, 2, 0).float().numpy() * 255)
-            .round()
-            .astype(self.numpy_dtype)
+            (t.cpu().permute(1, 2, 0).float().numpy() * 255).round().astype(self.numpy_dtype)
             for t in results
         ]
-
+    
     def save_images(self, ids, ds):
         info = []
         idx = {}
-        for i, image_id in enumerate(self.content_ids):
-            if image_id in ids:
-                idx[image_id] = i
+        for i, id in enumerate(self.content_ids):
+            if id in ids:
+                idx[id] = i
         if not os.path.exists("images/"):
             os.makedirs("images/", exist_ok=True)
-        for image_id in ids:
-            if not idx.get(image_id):
-                print(
-                    f"image id {image_id} is missing in the results. Hence not saved.")
-                continue
-            caption = ds.get_caption(image_id)
-            generated = Image.fromarray(self.results[idx[image_id]])
-            image_path_tmp = f"images/{self.content_ids[idx[image_id]]}.png"
+        for id in ids:
+            caption = ds.get_caption(id)
+            generated = Image.fromarray(self.results[idx[id]])
+            image_path_tmp = f"images/{self.content_ids[idx[id]]}.png"
             generated.save(image_path_tmp)
-            info.append((self.content_ids[idx[image_id]], caption))
+            info.append((self.content_ids[idx[id]], caption))
         with open("images/captions.txt", "w+") as f:
-            for image_id, caption in info:
-                f.write(f"{image_id}  {caption}\n")
+            for id, caption in info:
+                f.write(f"{id}  {caption}\n")
 
     def start(self):
         self.results = []
@@ -209,10 +226,7 @@ def finalize(self, result_dict, ds=None, output_dir=None):
                 100 * clip.get_clip_score(caption, generated).item()
             )
 
-        fid_score = compute_fid(
-            self.results,
-            self.statistics_path,
-            self.device)
+        fid_score = compute_fid(self.results, self.statistics_path, self.device)
         result_dict["FID_SCORE"] = fid_score
         result_dict["CLIP_SCORE"] = np.mean(self.clip_scores)
 
diff --git a/text_to_image/main.py b/text_to_image/main.py
index 6aa7c15e75..3d6f7d1d0a 100644
--- a/text_to_image/main.py
+++ b/text_to_image/main.py
@@ -24,6 +24,11 @@
 import dataset
 import coco
 
+# import torchvision.transforms as T
+# transform_im = T.ToPILImage()
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("main")
 
@@ -61,6 +66,12 @@
         "backend": "pytorch-dist",
         "model-name": "stable-diffusion-xl",
     },
+    # ? Yalu Ouyang modification: Oct 16 2024
+    "stable-diffusion-xl-mgx": {
+        "dataset": "coco-1024",
+        "backend": "migraphx",
+        "model-name": "stable-diffusion-xl",
+    },
 }
 
 SCENARIO_MAP = {
@@ -73,22 +84,20 @@
 
 def get_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset", 
+                        default="coco-1024",
+                        choices=SUPPORTED_DATASETS.keys(), help="dataset")
+    parser.add_argument("--dataset-path", 
+                        default="coco2014",help="path to the dataset")
     parser.add_argument(
-        "--dataset",
-        choices=SUPPORTED_DATASETS.keys(),
-        help="dataset")
-    parser.add_argument(
-        "--dataset-path",
-        required=True,
-        help="path to the dataset")
-    parser.add_argument(
-        "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles"
+        "--profile", 
+        default="stable-diffusion-xl-mgx",
+        choices=SUPPORTED_PROFILES.keys(), help="standard profiles"
     )
     parser.add_argument(
         "--scenario",
-        default="SingleStream",
-        help="mlperf benchmark scenario, one of " +
-        str(list(SCENARIO_MAP.keys())),
+        default="Offline",
+        help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())),
     )
     parser.add_argument(
         "--max-batchsize",
@@ -97,24 +106,23 @@ def get_args():
         help="max batch size in a single inference",
     )
     parser.add_argument("--threads", default=1, type=int, help="threads")
-    parser.add_argument(
-        "--accuracy",
-        action="store_true",
-        help="enable accuracy pass")
+    parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass")
     parser.add_argument(
         "--find-peak-performance",
         action="store_true",
         help="enable finding peak performance pass",
     )
-    parser.add_argument("--backend", help="Name of the backend")
+    parser.add_argument("--backend", default='migraphx', help="Name of the backend")
     parser.add_argument("--model-name", help="Name of the model")
     parser.add_argument("--output", default="output", help="test results")
     parser.add_argument("--qps", type=int, help="target qps")
-    parser.add_argument("--model-path", help="Path to model weights")
+    parser.add_argument("--model-path", 
+        default="/work1/zixian/youyang1/models/sdxl-1.0-base",
+        help="Path to model weights")
 
     parser.add_argument(
         "--dtype",
-        default="fp32",
+        default="fp16",
         choices=["fp32", "fp16", "bf16"],
         help="dtype of the model",
     )
@@ -131,6 +139,10 @@ def get_args():
         help="framework to load the latents",
     )
 
+    # file providing the MLPerf rules-compliant LoadGen parameters
+    parser.add_argument(
+        "--mlperf_conf", default="mlperf.conf", help="mlperf rules config"
+    )
     # file for user LoadGen settings such as target QPS
     parser.add_argument(
         "--user_conf",
@@ -141,24 +153,21 @@ def get_args():
     parser.add_argument(
         "--audit_conf", default="audit.config", help="config for LoadGen audit settings"
     )
+    parser.add_argument(
+        "--gpu-num", default=4, type=int, help="number of gpus to use"
+    )
     # arguments to save images
     # pass this argument for official submission
     # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images")
     # do not modify this argument for official submission
-    parser.add_argument(
-        "--ids-path", help="Path to caption ids", default="tools/sample_ids.txt"
-    )
+    parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt")
 
-    # below will override mlperf rules compliant settings - don't use for
-    # official submission
+    # below will override mlperf rules compliant settings - don't use for official submission
     parser.add_argument("--time", type=int, help="time to scan in seconds")
     parser.add_argument("--count", type=int, help="dataset items to use")
     parser.add_argument("--debug", action="store_true", help="debug")
     parser.add_argument(
-        "--performance-sample-count",
-        type=int,
-        help="performance sample count",
-        default=5000,
+        "--performance-sample-count", type=int, help="performance sample count", default=1000
     )
     parser.add_argument(
         "--max-latency", type=float, help="mlperf max latency in pct tile"
@@ -193,6 +202,12 @@ def get_backend(backend, **kwargs):
         from backend_pytorch import BackendPytorch
 
         backend = BackendPytorch(**kwargs)
+    
+    # MIGraphX backend (added by Yalu Ouyang, Nov 5 2024)
+    elif backend == "migraphx":
+        from backend_migraphx import BackendMIGraphX
+        
+        backend = BackendMIGraphX(**kwargs)
 
     elif backend == "debug":
         from backend_debug import BackendDebug
@@ -239,15 +254,17 @@ def run_one_item(self, qitem: Item):
         processed_results = []
         try:
             results = self.model.predict(qitem.inputs)
+            # log.info("[Line 254] runs fine after results")
             processed_results = self.post_process(
                 results, qitem.content_id, qitem.inputs, self.result_dict
             )
+            # log.info("[Line 258] runs fine after processed_results")
             if self.take_accuracy:
                 self.post_process.add_results(processed_results)
             self.result_timing.append(time.time() - qitem.start)
         except Exception as ex:  # pylint: disable=broad-except
             src = [self.ds.get_item_loc(i) for i in qitem.content_id]
-            log.error("thread: failed on contentid=%s, %s", src, ex)
+            log.error("[Line 262] thread: failed on contentid=%s, %s", src, ex)
             # since post_process will not run, fake empty responses
             processed_results = [[]] * len(qitem.query_id)
         finally:
@@ -256,6 +273,7 @@ def run_one_item(self, qitem: Item):
             for idx, query_id in enumerate(qitem.query_id):
                 response_array = array.array(
                     "B", np.array(processed_results[idx], np.uint8).tobytes()
+                    # "B", np.array(processed_results[idx], np.uint64).tobytes()
                 )
                 response_array_refs.append(response_array)
                 bi = response_array.buffer_info()
@@ -271,9 +289,9 @@ def enqueue(self, query_samples):
         else:
             bs = self.max_batchsize
             for i in range(0, len(idx), bs):
-                data, label = self.ds.get_samples(idx[i: i + bs])
+                data, label = self.ds.get_samples(idx[i : i + bs])
                 self.run_one_item(
-                    Item(query_id[i: i + bs], idx[i: i + bs], data, label)
+                    Item(query_id[i : i + bs], idx[i : i + bs], data, label)
                 )
 
     def finish(self):
@@ -288,9 +306,7 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
         self.result_dict = {}
 
         for _ in range(self.threads):
-            worker = threading.Thread(
-                target=self.handle_tasks, args=(
-                    self.tasks,))
+            worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,))
             worker.daemon = True
             self.workers.append(worker)
             worker.start()
@@ -298,13 +314,17 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
     def handle_tasks(self, tasks_queue):
         """Worker thread."""
         while True:
+            # Fetch the next item; presumably a blocking queue.Queue-style get() — TODO confirm.
             qitem = tasks_queue.get()
+            # A None item is the exit sentinel; anything else is handed to run_one_item.
             if qitem is None:
                 # None in the queue indicates the parent want us to exit
                 tasks_queue.task_done()
                 break
             self.run_one_item(qitem)
+            # Acknowledge the item only after run_one_item has returned.
             tasks_queue.task_done()
+            # task_done() is called exactly once per get(), including for the sentinel above.
 
     def enqueue(self, query_samples):
         idx = [q.index for q in query_samples]
@@ -333,13 +353,36 @@ def main():
     log.info(args)
 
     # find backend
-    backend = get_backend(
-        args.backend,
-        precision=args.dtype,
-        device=args.device,
-        model_path=args.model_path,
-        batch_size=args.max_batchsize,
-    )
+    
+    # backend = get_backend(
+    #     args.backend,
+    #     precision=args.dtype,
+    #     device=args.device,
+    #     model_path=args.model_path,
+    #     batch_size=args.max_batchsize
+    # )
+    # Zixian: Oct 21: create a list of backends for multi-gpu
+    # backends = [get_backend(
+    #                 args.backend,
+    #                 precision=args.dtype,
+    #                 device=f'cuda:{i}',
+    #                 model_path=args.model_path,
+    #                 batch_size=args.max_batchsize
+    #             ) 
+    #             for i in [0, 1, 2, 3]]
+    
+    # Device index is i // (gpu_num // 4): gpu_num=8 puts two backends on each of
+    # cuda:0..3. max(1, ...) keeps gpu_num < 4 from raising ZeroDivisionError.
+    backends = [
+        get_backend(args.backend, precision=args.dtype,
+                    device=f'cuda:{i // max(1, args.gpu_num // 4)}',
+                    model_path=args.model_path, batch_size=args.max_batchsize)
+        for i in range(args.gpu_num)
+    ]
+    
+    log.info(f"Zixian: Returned from get_backends")
+    
+    
     if args.dtype == "fp16":
         dtype = torch.float16
     elif args.dtype == "bf16":
@@ -355,7 +398,11 @@ def main():
         count_override = True
 
     # load model to backend
-    model = backend.load()
+    # model = backend.load()
+    log.info(f"Zixian: entering backend.load")
+    # Zixian: Oct 21: create a list of models corresponding to each backend 
+    models = [backend.load() for backend in backends]
+    log.info(f"Zixian: loaded models from all backend")
 
     # dataset to use
     dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
@@ -365,28 +412,38 @@ def main():
         pre_process=pre_proc,
         count=count,
         threads=args.threads,
-        pipe_tokenizer=model.pipe.tokenizer,
-        pipe_tokenizer_2=model.pipe.tokenizer_2,
+        # pipe_tokenizer=model.pipe.tokenizer,
+        # pipe_tokenizer_2=model.pipe.tokenizer_2,
+        pipe_tokenizer=models[0].pipe.tokenizer,
+        pipe_tokenizer_2=models[0].pipe.tokenizer_2,
         latent_dtype=dtype,
         latent_device=args.device,
         latent_framework=args.latent_framework,
+        pipe_type=args.backend,
         **kwargs,
     )
     final_results = {
-        "runtime": model.name(),
-        "version": model.version(),
+        # "runtime": model.name(),
+        # "version": model.version(),
+        "runtime": models[0].name(),
+        "version": models[0].version(),
         "time": int(time.time()),
         "args": vars(args),
         "cmdline": str(args),
     }
 
+    mlperf_conf = os.path.abspath(args.mlperf_conf)
+    if not os.path.exists(mlperf_conf):
+        log.error("{} not found".format(mlperf_conf))
+        sys.exit(1)
+
     user_conf = os.path.abspath(args.user_conf)
     if not os.path.exists(user_conf):
         log.error("{} not found".format(user_conf))
         sys.exit(1)
 
     audit_config = os.path.abspath(args.audit_conf)
-
+    
     if args.accuracy:
         ids_path = os.path.abspath(args.ids_path)
         with open(ids_path) as f:
@@ -405,16 +462,30 @@ def main():
     # warmup
     syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit"
     latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device)
-    warmup_samples = [
-        {
-            "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
-            "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
-            "latents": latents_pt,
-        }
-        for _ in range(args.max_batchsize)
-    ]
-    for i in range(5):
-        _ = backend.predict(warmup_samples)
+    # warmup_samples = [
+    #     {
+    #         "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
+    #         "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
+    #         "latents": latents_pt,
+    #     }
+    #     for _ in range(args.max_batchsize)
+    # ]
+    warmup_samples_gpus = [
+        [
+            {
+                "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer),
+                "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2),
+                "caption": syntetic_str,
+                "latents": latents_pt
+            }
+            for _ in range(int(args.max_batchsize))
+        ]
+        for model in models]
+    
+    # Zixian: Oct 21: warm up each backend 
+    for idx, backend in enumerate (backends): 
+        for i in range(1):
+            _ = backend.predict(warmup_samples_gpus[idx])
 
     scenario = SCENARIO_MAP[args.scenario]
     runner_map = {
@@ -423,12 +494,52 @@ def main():
         lg.TestScenario.Server: QueueRunner,
         lg.TestScenario.Offline: QueueRunner,
     }
-    runner = runner_map[scenario](
-        model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
-    )
-
+    
+    # Zixian: Oct 21: create one runner per model
+    # runner = runner_map[scenario](
+    #     model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
+    # )
+    runners = [runner_map[scenario](
+                                model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
+                            )
+                for model in models]
+
+    # def issue_queries(query_samples):
+    #     runner.enqueue(query_samples)
     def issue_queries(query_samples):
-        runner.enqueue(query_samples)
+        """LoadGen callback: split the query across all runners and enqueue in parallel.
+
+        The samples are cut into len(runners) contiguous slices of near-equal size
+        (slices may be empty when there are fewer samples than runners) and each
+        slice is passed to its runner's enqueue() on a separate pool thread.
+        A failing enqueue() is logged instead of raised, so the remaining
+        runners' results are still collected.
+        """
+        seg_len = len(query_samples) / len(runners)
+        log.info("issue_queries: %d samples, %d runners", len(query_samples), len(runners))
+
+        # Rounded float boundaries: slice k ends exactly where slice k + 1 starts.
+        splitted_query_samples = [
+            query_samples[int(round(seg_len * k)): int(round(seg_len * (k + 1)))]
+            for k in range(len(runners))
+        ]
+
+        with ThreadPoolExecutor(max_workers=len(runners)) as executor:
+            # Remember which runner each future belongs to for the report below.
+            futures = {
+                executor.submit(runner.enqueue, queries): runner
+                for runner, queries in zip(runners, splitted_query_samples)
+            }
+
+            # Wait for every enqueue() call and log its outcome.
+            for future in as_completed(futures):
+                runner = futures[future]
+                try:
+                    future.result()
+                except Exception as exc:  # pylint: disable=broad-except
+                    log.error("Runner %s generated an exception: %s", runner, exc)
+                else:
+                    log.info("Runner %s enqueued successfully.", runner)
 
     def flush_queries():
         pass
@@ -441,8 +552,7 @@ def flush_queries():
     log_settings.log_output = log_output_settings
 
     settings = lg.TestSettings()
-    # mlperf.conf is automatically loaded by the loadgen
-    # settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
+    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
     settings.FromConfig(user_conf, args.model_name, args.scenario)
     if os.path.exists(audit_config):
         settings.FromConfig(audit_config, args.model_name, args.scenario)
@@ -471,8 +581,7 @@ def flush_queries():
         settings.multi_stream_samples_per_query = args.samples_per_query
     if args.max_latency:
         settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
-        settings.multi_stream_expected_latency_ns = int(
-            args.max_latency * NANO_SEC)
+        settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC)
 
     performance_sample_count = (
         args.performance_sample_count
@@ -480,22 +589,47 @@ def flush_queries():
         else min(count, 500)
     )
     sut = lg.ConstructSUT(issue_queries, flush_queries)
+    #! [Yalu Ouyang] count here affects how many items to run (even for accuracy)
     qsl = lg.ConstructQSL(
         count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples
     )
 
     log.info("starting {}".format(scenario))
     result_dict = {"scenario": str(scenario)}
-    runner.start_run(result_dict, args.accuracy)
+    for runner in runners: 
+        runner.start_run(result_dict, args.accuracy)
+    
+    # with ThreadPoolExecutor(max_workers=len(runners)) as executor:
+    #         # Map each runner to its respective sublist
+    #         futures = {
+    #             executor.submit(runner.finish(), (result_dict, args.accuracy)): runner 
+    #             for runner in runners 
+    #         }
+        
 
     lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config)
+    
+    log.info("Loadgen finished tests")
 
     if args.accuracy:
         post_proc.finalize(result_dict, ds, output_dir=args.output)
         final_results["accuracy_results"] = result_dict
         post_proc.save_images(saved_images_ids, ds)
 
-    runner.finish()
+    log.info("After processing accuracy")
+
+    for runner in runners: 
+        runner.finish()
+        
+    log.info("After runner.finish()") 
+    # with ThreadPoolExecutor(max_workers=len(runners)) as executor:
+    #         # Map each runner to its respective sublist
+    #         futures = {
+    #             executor.submit(runner.finish()): runner 
+    #             for runner in runners 
+    #         }
+        
+        
     lg.DestroyQSL(qsl)
     lg.DestroySUT(sut)
 
diff --git a/text_to_image/mlperf.conf b/text_to_image/mlperf.conf
new file mode 100644
index 0000000000..0cea5351e1
--- /dev/null
+++ b/text_to_image/mlperf.conf
@@ -0,0 +1,99 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model may be '*' as a wildcard, in which case the value applies to all models.
+# All times are in milliseconds.
+
+# Set performance_sample_count for each model.
+# User can optionally set this to higher values in user.conf.
+resnet50.*.performance_sample_count_override = 1024
+ssd-mobilenet.*.performance_sample_count_override = 256
+retinanet.*.performance_sample_count_override = 64
+bert.*.performance_sample_count_override = 10833
+dlrm.*.performance_sample_count_override = 204800
+dlrm-v2.*.performance_sample_count_override = 204800
+rnnt.*.performance_sample_count_override = 2513
+gptj.*.performance_sample_count_override = 13368
+llama2-70b.*.performance_sample_count_override = 24576
+stable-diffusion-xl.*.performance_sample_count_override = 5000
+# Set to 0 to use the entire sample set as the performance sample set.
+3d-unet.*.performance_sample_count_override = 0
+
+# Set seeds. The seeds will be distributed two weeks before the submission.
+*.*.qsl_rng_seed = 3066443479025735752
+*.*.sample_index_rng_seed = 10688027786191513374
+*.*.schedule_rng_seed = 14962580496156340209
+# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
+*.*.test05_qsl_rng_seed = 16799458546791641818
+*.*.test05_sample_index_rng_seed = 5453809927556429288
+*.*.test05_schedule_rng_seed = 5435552105434836064
+
+
+*.SingleStream.target_latency_percentile = 90
+*.SingleStream.min_duration = 600000
+
+*.MultiStream.target_latency_percentile = 99
+*.MultiStream.samples_per_query = 8
+*.MultiStream.min_duration = 600000
+*.MultiStream.min_query_count = 662
+retinanet.MultiStream.target_latency = 528
+
+# 3D-UNet uses equal issue mode because it has non-uniform inputs
+3d-unet.*.sample_concatenate_permutation = 1
+
+# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenarios
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
+mixtral-8x7b.*.sample_concatenate_permutation = 1
+
+*.Server.target_latency = 10
+*.Server.target_latency_percentile = 99
+*.Server.target_duration = 0
+*.Server.min_duration = 600000
+resnet50.Server.target_latency = 15
+retinanet.Server.target_latency = 100
+bert.Server.target_latency = 130
+dlrm.Server.target_latency = 60
+dlrm-v2.Server.target_latency = 60
+rnnt.Server.target_latency = 1000
+gptj.Server.target_latency = 20000
+stable-diffusion-xl.Server.target_latency = 20000
+# The llama2-70b and mixtral-8x7b benchmarks measure token latencies
+llama2-70b.*.use_token_latencies = 1
+mixtral-8x7b.*.use_token_latencies = 1
+# gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
+# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7b benchmarks, therefore target_latency = 0
+llama2-70b.Server.target_latency = 0
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
+
+mixtral-8x7b.Server.target_latency = 0
+mixtral-8x7b.Server.ttft_latency = 2000
+mixtral-8x7b.Server.tpot_latency = 200
+
+*.Offline.target_latency_percentile = 90
+*.Offline.min_duration = 600000
+
+# In the Offline scenario, we always have one query, but LoadGen maps this to
+# min_sample_count internally. If the dataset size is larger than 24576, we
+# limit min_query_count to 24576; otherwise we use the dataset size as the
+# limit.
+
+resnet50.Offline.min_query_count = 24576
+retinanet.Offline.min_query_count = 24576
+dlrm-v2.Offline.min_query_count = 24576
+bert.Offline.min_query_count = 10833
+gptj.Offline.min_query_count = 13368
+rnnt.Offline.min_query_count = 2513
+3d-unet.Offline.min_query_count = 43
+stable-diffusion-xl.Offline.min_query_count = 4000
+llama2-70b.Offline.min_query_count = 24576
+mixtral-8x7b.Offline.min_query_count = 15000
+
+# These fields should be defined and overridden by user.conf.
+*.SingleStream.target_latency = 10
+*.MultiStream.target_latency = 80
+*.Server.target_qps = 1.0
+*.Offline.target_qps = 1.0
+
diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py
index a1678c79d0..a737c1f16c 100644
--- a/tools/submission/preprocess_submission.py
+++ b/tools/submission/preprocess_submission.py
@@ -84,7 +84,6 @@ def delete_empty_dirs(src):
     """
     if not os.path.isdir(src):
         return False
-
     if all([delete_empty_dirs(os.path.join(src, file))
            for file in os.listdir(src)]):
         log.info("Removing empty dir: (%s)", src)
@@ -532,9 +531,11 @@ def main():
     if not args.nodelete_empty_dirs:
         delete_empty_dirs(os.path.join(src_dir))
 
+    run_dir = os.getcwd()
     os.chdir(src_dir)
 
     infer_scenario_results(args, config)
+    os.chdir(run_dir)
 
     if not args.nodelete_empty_dirs:
         delete_empty_dirs(os.path.join(src_dir))
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index a31a6fede3..deff9eb8c4 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -188,6 +188,7 @@
             "sample_index_rng_seed": 198141574272810017,
             "schedule_rng_seed": 7575108116881280410,
         },
+        # not required for v5.0+
         "test05_seeds": {
             # TODO: Update random seeds
             "qsl_rng_seed": 2376919268182438552,
@@ -2880,7 +2881,7 @@ def check_compliance_dir(
     compliance_perf_pass = True
     compliance_perf_dir_pass = True
     compliance_acc_pass = True
-    test_list = ["TEST01", "TEST04", "TEST05"]
+    test_list = ["TEST01", "TEST04"]
 
     if model in [
         "bert-99",
@@ -2899,15 +2900,15 @@ def check_compliance_dir(
     ]:
         test_list.remove("TEST04")
 
-    if model in [
+    if config.version in ["v4.0", "v4.1"] and model not in [
         "gptj-99",
         "gptj-99.9",
         "llama2-70b-99",
         "llama2-70b-99.9",
         "stable-diffusion-xl",
         "mixtral-8x7b",
-    ] or config.version not in ["v4.0", "v4.1"]:
-        test_list.remove("TEST05")
+    ]:
+        test_list.append("TEST05")
 
     if model in [
         "gptj-99",