From d8f0d7e580c60f733a7c3fdf7e2cc7c457a24899 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 31 Oct 2024 14:55:24 +0000 Subject: [PATCH 01/15] Update generate_final_report.py --- tools/submission/generate_final_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index ba2c368cdf..79d9fe0767 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -79,7 +79,7 @@ def main(): df["p#"] = df.apply(lambda x: int(x["host_processors_per_node"]), axis=1) # details url - base_url = f"https://github.com/mlcommons/{args.repository}/tree/main" + base_url = f"https://github.com/{args.repository_owner}/{args.repository}/tree/{args.repository_branch}" df["Details"] = df.apply( lambda x: '=HYPERLINK("{}","details")'.format( "/".join( From 6b1a0f87f46288d7b4b487f89e18f3151422694c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 11:54:48 +0000 Subject: [PATCH 02/15] Fix sdxl (#1911) * Fix typo in fid_score.py, fail_safe for SDXL short runs * [Automated Commit] Format Codebase * Fix typo in fid_score.py, fail_safe for SDXL short runs * Fix dlrmv2 reference implementation | Update run_local.sh --- recommendation/dlrm_v2/pytorch/run_local.sh | 4 +++- text_to_image/coco.py | 24 ++++++++++++--------- text_to_image/tools/fid/fid_score.py | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/recommendation/dlrm_v2/pytorch/run_local.sh b/recommendation/dlrm_v2/pytorch/run_local.sh index 0d054c6c45..3bc8ec667c 100755 --- a/recommendation/dlrm_v2/pytorch/run_local.sh +++ b/recommendation/dlrm_v2/pytorch/run_local.sh @@ -2,7 +2,9 @@ source ./run_common.sh -common_opt="--mlperf_conf ../../../mlperf.conf" +#mlperf.conf is now automatically loaded by loadgen +#common_opt="--mlperf_conf ../../../mlperf.conf" + OUTPUT_DIR=`pwd`/output/$name if [ ! -d $OUTPUT_DIR ]; then mkdir -p $OUTPUT_DIR diff --git a/text_to_image/coco.py b/text_to_image/coco.py index cb3956a014..e9499b0e6c 100644 --- a/text_to_image/coco.py +++ b/text_to_image/coco.py @@ -176,20 +176,24 @@ def __call__(self, results, ids, expected=None, result_dict=None): def save_images(self, ids, ds): info = [] idx = {} - for i, id in enumerate(self.content_ids): - if id in ids: - idx[id] = i + for i, image_id in enumerate(self.content_ids): + if image_id in ids: + idx[image_id] = i if not os.path.exists("images/"): os.makedirs("images/", exist_ok=True) - for id in ids: - caption = ds.get_caption(id) - generated = Image.fromarray(self.results[idx[id]]) - image_path_tmp = f"images/{self.content_ids[idx[id]]}.png" + for image_id in ids: + if not idx.get(image_id): + print( + f"image id {image_id} is missing in the results. 
Hence not saved.") + continue + caption = ds.get_caption(image_id) + generated = Image.fromarray(self.results[idx[image_id]]) + image_path_tmp = f"images/{self.content_ids[idx[image_id]]}.png" generated.save(image_path_tmp) - info.append((self.content_ids[idx[id]], caption)) + info.append((self.content_ids[idx[image_id]], caption)) with open("images/captions.txt", "w+") as f: - for id, caption in info: - f.write(f"{id} {caption}\n") + for image_id, caption in info: + f.write(f"{image_id} {caption}\n") def start(self): self.results = [] diff --git a/text_to_image/tools/fid/fid_score.py b/text_to_image/tools/fid/fid_score.py index febc12ff5d..8e486c8b7a 100644 --- a/text_to_image/tools/fid/fid_score.py +++ b/text_to_image/tools/fid/fid_score.py @@ -44,7 +44,7 @@ import pathlib import os import sys -sys.path.insert("..", 0) +sys.path.insert(0, "..") from inception import InceptionV3 # noqa: E402 From a4ba51fb2244f2efc703c341b13411676297e299 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 20:07:07 +0530 Subject: [PATCH 03/15] Fixes for filtering invalid results --- tools/submission/preprocess_submission.py | 28 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 1e26b81ca4..9d44b91d63 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -141,6 +141,22 @@ def change_folder_name_in_path(path, old_folder_name, new_folder_name): new_path = os.path.join(*path_parts) return new_path +def clean_model_dir(model_results_dir): + model_measurements_dir = change_folder_name_in_path(model_results_dir, "results", "measurements") + model_compliance_dir = change_folder_name_in_path(model_results_dir, "results", "compliance") + + print(f"rmtree {model_results_dir}") + shutil.rmtree(model_results_dir) + shutil.rmtree(model_measurements_dir) + shutil.rmtree(model_compliance_dir) + sut_results_dir = os.path.dirname(model_results_dir) + if not os.listdir(sut_results_dir): + #clean sut dir + sut = os.path.basename(sut_results_dir) + print(f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") + shutil.rmtree(sut_results_dir) + shutil.rmtree(os.path.dirname(model_measurements_dir)) + shutil.rmtree(os.path.dirname(model_compliance_dir)) def clean_invalid_results(args, log_path, config, system_desc, system_json, model, mlperf_model, division, system_id_json, is_closed_or_network): @@ -176,6 +192,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, except Exception as e: log.warning(e) perf_is_valid = False + compliance_is_valid = False if perf_is_valid: power_path = os.path.join(scenario_path, "performance", "power") has_power = os.path.exists(power_path) @@ -260,9 +277,11 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, # if only accuracy or compliance failed, result is valid # for open if not perf_is_valid: - shutil.rmtree(scenario_path) log.warning( f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} and open divisions. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. 
Removing it...") + shutil.rmtree(scenario_path) + scenario_measurements_path = change_folder_name_in_path(scenario_path, "results", "measurements") + shutil.rmtree(scenario_measurements_path) if not os.path.exists(target_results_path): shutil.copytree( model_results_path, target_results_path) @@ -288,9 +307,7 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, log.warning(f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} division. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Compliance: {compliance_is_valid}. Moving other scenario results of {model} to open...") else: log.warning(f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} division. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. Removing all dependent scenario results...") - shutil.rmtree(model_results_path) - shutil.rmtree(model_measurements_path) - shutil.rmtree(model_compliance_path) + clean_model_dir(model_results_path) else: # delete this result # delete other scenario results too shutil.rmtree(scenario_path) @@ -517,6 +534,9 @@ def main(): infer_scenario_results(args, config) + if not args.nodelete_empty_dirs: + delete_empty_dirs(os.path.join(src_dir)) + return 0 From 451b310ef42a28e015ce5abc4e43ba6033ff8d4a Mon Sep 17 00:00:00 2001 From: arjunsuresh Date: Thu, 7 Nov 2024 14:44:42 +0000 Subject: [PATCH 04/15] [Automated Commit] Format Codebase --- tools/submission/preprocess_submission.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 9d44b91d63..7803cf5684 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -141,9 +141,12 @@ def change_folder_name_in_path(path, old_folder_name, new_folder_name): new_path = os.path.join(*path_parts) return new_path + def clean_model_dir(model_results_dir): - model_measurements_dir = change_folder_name_in_path(model_results_dir, "results", "measurements") - model_compliance_dir = change_folder_name_in_path(model_results_dir, "results", "compliance") + model_measurements_dir = change_folder_name_in_path( + model_results_dir, "results", "measurements") + model_compliance_dir = change_folder_name_in_path( + model_results_dir, "results", "compliance") print(f"rmtree {model_results_dir}") shutil.rmtree(model_results_dir) @@ -151,13 +154,15 @@ def clean_model_dir(model_results_dir): shutil.rmtree(model_compliance_dir) sut_results_dir = os.path.dirname(model_results_dir) if not os.listdir(sut_results_dir): - #clean sut dir + # clean sut dir sut = os.path.basename(sut_results_dir) - print(f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") + print( + f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") shutil.rmtree(sut_results_dir) shutil.rmtree(os.path.dirname(model_measurements_dir)) shutil.rmtree(os.path.dirname(model_compliance_dir)) + def clean_invalid_results(args, log_path, config, system_desc, system_json, model, mlperf_model, division, system_id_json, is_closed_or_network): # cleanup invalid results @@ -280,7 +285,8 @@ def clean_invalid_results(args, log_path, config, system_desc, system_json, log.warning( f"{scenario} scenario result is invalid for {system_desc}: {model} in {division} and open divisions. Accuracy: {accuracy_is_valid}, Performance: {perf_is_valid}. 
Removing it...") shutil.rmtree(scenario_path) - scenario_measurements_path = change_folder_name_in_path(scenario_path, "results", "measurements") + scenario_measurements_path = change_folder_name_in_path( + scenario_path, "results", "measurements") shutil.rmtree(scenario_measurements_path) if not os.path.exists(target_results_path): shutil.copytree( From 4c109ea8b5b17d0c422d4b8a08a55070142c68ae Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 14:58:42 +0000 Subject: [PATCH 05/15] Update preprocess_submission.py --- tools/submission/preprocess_submission.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 7803cf5684..a1678c79d0 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -2,10 +2,6 @@ Tool to infer scenario results and cleanup submission tree """ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import argparse import logging import os @@ -156,7 +152,7 @@ def clean_model_dir(model_results_dir): if not os.listdir(sut_results_dir): # clean sut dir sut = os.path.basename(sut_results_dir) - print( + log.info( f"No benchmark results remaining for {sut}. rmtree {sut_results_dir}") shutil.rmtree(sut_results_dir) shutil.rmtree(os.path.dirname(model_measurements_dir)) From 40c1fe0c28364b243b5944b3569000611ddf2b7d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 21:20:52 +0530 Subject: [PATCH 06/15] Added an option to pass in sample_ids.txt for SDXL accuracy check --- text_to_image/tools/accuracy_coco.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 2d7c36506d..8740ee1726 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -51,6 +51,10 @@ def get_args(): required=False, help="path to dump 10 stable diffusion xl compliance images", ) + #Do not use for official MLPerf inference submissions as only the default one is valid + parser.add_argument( + "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')" + ) parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"]) parser.add_argument( "--low_memory", @@ -97,8 +101,9 @@ def main(): os.makedirs(args.compliance_images_path) dump_compliance_images = True compliance_images_idx_list = [] + sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(os.path.dirname(__file__), "sample_ids.txt") with open( - os.path.join(os.path.dirname(__file__), "sample_ids.txt"), "r" + os.path.join(sample_ids_file_path, "r" ) as compliance_id_file: for line in compliance_id_file: idx = int(line.strip()) From 89a2ffe257bc8c4c0d8e81cb5c1fec4e15080b2a Mon Sep 17 00:00:00 2001 From: arjunsuresh Date: Thu, 7 Nov 2024 15:51:36 +0000 Subject: [PATCH 07/15] [Automated Commit] Format Codebase --- text_to_image/tools/accuracy_coco.py | 88 ++++++++++++++-------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 8740ee1726..bc3f87d04b 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -51,7 +51,8 @@ def get_args(): required=False, help="path to dump 10 stable diffusion xl compliance images", ) - #Do not use for 
official MLPerf inference submissions as only the default one is valid + # Do not use for official MLPerf inference submissions as only the default + # one is valid parser.add_argument( "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')" ) @@ -101,12 +102,13 @@ def main(): os.makedirs(args.compliance_images_path) dump_compliance_images = True compliance_images_idx_list = [] - sample_ids_file_path = args.ids_path if args.ids_path else os.path.join(os.path.dirname(__file__), "sample_ids.txt") + sample_ids_file_path = args.ids_path if args.ids_path else os.path.join( + os.path.dirname(__file__), "sample_ids.txt") with open( os.path.join(sample_ids_file_path, "r" ) as compliance_id_file: for line in compliance_id_file: - idx = int(line.strip()) + idx=int(line.strip()) compliance_images_idx_list.append(idx) # Dump caption.txt with open( @@ -153,28 +155,28 @@ def compute_accuracy( statistics_path, ): # Load torchmetrics modules - clip = CLIPEncoder(device=device) - clip_scores = [] - seen = set() - result_list = [] - result_dict = {} + clip=CLIPEncoder(device=device) + clip_scores=[] + seen=set() + result_list=[] + result_dict={} # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results = json.load(f) + results=json.load(f) for j in tqdm(results): - idx = j["qsl_idx"] + idx=j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) result_list.append(generated_img) - generated_img = Image.fromarray(generated_img) + generated_img=Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -185,16 +187,16 @@ def compute_accuracy( # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device) # Load Ground Truth - caption = df_captions.iloc[idx]["caption"] + caption=df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip.get_clip_score( caption, generated_img).item()) - fid_score = compute_fid(result_list, statistics_path, device) + fid_score=compute_fid(result_list, statistics_path, device) - result_dict["FID_SCORE"] = fid_score - result_dict["CLIP_SCORE"] = np.mean(clip_scores) + result_dict["FID_SCORE"]=fid_score + result_dict["CLIP_SCORE"]=np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: @@ -216,43 +218,43 @@ def compute_accuracy_low_memory( ): if num_workers is None: try: - num_cpus = len(os.sched_getaffinity(0)) + num_cpus=len(os.sched_getaffinity(0)) except AttributeError: # os.sched_getaffinity is not available under Windows, use # os.cpu_count instead (which may not return the *available* number # of CPUs). 
- num_cpus = os.cpu_count() + num_cpus=os.cpu_count() - num_workers = min(num_cpus, 8) if num_cpus is not None else 0 + num_workers=min(num_cpus, 8) if num_cpus is not None else 0 else: - num_workers = num_workers + num_workers=num_workers # Load torchmetrics modules - block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] - inception_model = InceptionV3([block_idx]).to(device) - clip_model = CLIPEncoder(device=device) + block_idx=InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] + inception_model=InceptionV3([block_idx]).to(device) + clip_model=CLIPEncoder(device=device) - clip_scores = [] - seen = set() - result_batch = [] - result_dict = {} - activations = np.empty((0, inception_dims)) + clip_scores=[] + seen=set() + result_batch=[] + result_dict={} + activations=np.empty((0, inception_dims)) # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results = ijson.items(f, "item") + results=ijson.items(f, "item") for j in tqdm(results): - idx = j["qsl_idx"] + idx=j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) - generated_img = Image.fromarray(generated_img) + generated_img=Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -262,7 +264,7 @@ def compute_accuracy_low_memory( f"{idx}.png")) # Load Ground Truth - caption = df_captions.iloc[idx]["caption"] + caption=df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip_model.get_clip_score(caption, generated_img).item() ) @@ -270,7 +272,7 @@ def compute_accuracy_low_memory( result_batch.append(generated_img.convert("RGB")) if len(result_batch) == batch_size: - act = get_activations( + act=get_activations( result_batch, inception_model, batch_size, @@ -278,12 +280,12 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations = np.append(activations, act, axis=0) + activations=np.append(activations, act, axis=0) result_batch.clear() # Remaining data for last batch if len(result_batch) > 0: - act = get_activations( + act=get_activations( result_batch, inception_model, len(result_batch), @@ -291,9 +293,9 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations = np.append(activations, act, axis=0) + activations=np.append(activations, act, axis=0) - m1, s1 = compute_statistics_of_path( + m1, s1=compute_statistics_of_path( statistics_path, inception_model, batch_size, @@ -304,13 +306,13 @@ def compute_accuracy_low_memory( None, ) - m2 = np.mean(activations, axis=0) - s2 = np.cov(activations, rowvar=False) + m2=np.mean(activations, axis=0) + s2=np.cov(activations, rowvar=False) - fid_score = calculate_frechet_distance(m1, s1, m2, s2) + fid_score=calculate_frechet_distance(m1, s1, m2, s2) - result_dict["FID_SCORE"] = fid_score - result_dict["CLIP_SCORE"] = np.mean(clip_scores) + result_dict["FID_SCORE"]=fid_score + result_dict["CLIP_SCORE"]=np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: From 69ffdc0aa783f9127af612a7de57c6329703c1dc Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 7 Nov 2024 20:19:13 +0000 Subject: [PATCH 08/15] Update accuracy_coco.py --- text_to_image/tools/accuracy_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index bc3f87d04b..0d0c015607 100644 --- 
a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -105,7 +105,7 @@ def main(): sample_ids_file_path = args.ids_path if args.ids_path else os.path.join( os.path.dirname(__file__), "sample_ids.txt") with open( - os.path.join(sample_ids_file_path, "r" + os.path.join(sample_ids_file_path, "r") ) as compliance_id_file: for line in compliance_id_file: idx=int(line.strip()) From d1d642e06f91e5b8f56088f8d1a4b127a65d962c Mon Sep 17 00:00:00 2001 From: arjunsuresh Date: Thu, 7 Nov 2024 20:19:47 +0000 Subject: [PATCH 09/15] [Automated Commit] Format Codebase --- text_to_image/tools/accuracy_coco.py | 82 ++++++++++++++-------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 0d0c015607..d73325897b 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -108,7 +108,7 @@ def main(): os.path.join(sample_ids_file_path, "r") ) as compliance_id_file: for line in compliance_id_file: - idx=int(line.strip()) + idx = int(line.strip()) compliance_images_idx_list.append(idx) # Dump caption.txt with open( @@ -155,28 +155,28 @@ def compute_accuracy( statistics_path, ): # Load torchmetrics modules - clip=CLIPEncoder(device=device) - clip_scores=[] - seen=set() - result_list=[] - result_dict={} + clip = CLIPEncoder(device=device) + clip_scores = [] + seen = set() + result_list = [] + result_dict = {} # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results=json.load(f) + results = json.load(f) for j in tqdm(results): - idx=j["qsl_idx"] + idx = j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) result_list.append(generated_img) - generated_img=Image.fromarray(generated_img) + generated_img = Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -187,16 +187,16 @@ def compute_accuracy( # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device) # Load Ground Truth - caption=df_captions.iloc[idx]["caption"] + caption = df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip.get_clip_score( caption, generated_img).item()) - fid_score=compute_fid(result_list, statistics_path, device) + fid_score = compute_fid(result_list, statistics_path, device) - result_dict["FID_SCORE"]=fid_score - result_dict["CLIP_SCORE"]=np.mean(clip_scores) + result_dict["FID_SCORE"] = fid_score + result_dict["CLIP_SCORE"] = np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: @@ -218,43 +218,43 @@ def compute_accuracy_low_memory( ): if num_workers is None: try: - num_cpus=len(os.sched_getaffinity(0)) + num_cpus = len(os.sched_getaffinity(0)) except AttributeError: # os.sched_getaffinity is not available under Windows, use # os.cpu_count instead (which may not return the *available* number # of CPUs). 
- num_cpus=os.cpu_count() + num_cpus = os.cpu_count() - num_workers=min(num_cpus, 8) if num_cpus is not None else 0 + num_workers = min(num_cpus, 8) if num_cpus is not None else 0 else: - num_workers=num_workers + num_workers = num_workers # Load torchmetrics modules - block_idx=InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] - inception_model=InceptionV3([block_idx]).to(device) - clip_model=CLIPEncoder(device=device) + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[inception_dims] + inception_model = InceptionV3([block_idx]).to(device) + clip_model = CLIPEncoder(device=device) - clip_scores=[] - seen=set() - result_batch=[] - result_dict={} - activations=np.empty((0, inception_dims)) + clip_scores = [] + seen = set() + result_batch = [] + result_dict = {} + activations = np.empty((0, inception_dims)) # Load model outputs with open(mlperf_accuracy_file, "r") as f: - results=ijson.items(f, "item") + results = ijson.items(f, "item") for j in tqdm(results): - idx=j["qsl_idx"] + idx = j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img=np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + generated_img = np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( 1024, 1024, 3 ) - generated_img=Image.fromarray(generated_img) + generated_img = Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: @@ -264,7 +264,7 @@ def compute_accuracy_low_memory( f"{idx}.png")) # Load Ground Truth - caption=df_captions.iloc[idx]["caption"] + caption = df_captions.iloc[idx]["caption"] clip_scores.append( 100 * clip_model.get_clip_score(caption, generated_img).item() ) @@ -272,7 +272,7 @@ def compute_accuracy_low_memory( result_batch.append(generated_img.convert("RGB")) if len(result_batch) == batch_size: - act=get_activations( + act = get_activations( result_batch, inception_model, batch_size, @@ -280,12 +280,12 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations=np.append(activations, act, axis=0) + activations = np.append(activations, act, axis=0) result_batch.clear() # Remaining data for last batch if len(result_batch) > 0: - act=get_activations( + act = get_activations( result_batch, inception_model, len(result_batch), @@ -293,9 +293,9 @@ def compute_accuracy_low_memory( device, num_workers, ) - activations=np.append(activations, act, axis=0) + activations = np.append(activations, act, axis=0) - m1, s1=compute_statistics_of_path( + m1, s1 = compute_statistics_of_path( statistics_path, inception_model, batch_size, @@ -306,13 +306,13 @@ def compute_accuracy_low_memory( None, ) - m2=np.mean(activations, axis=0) - s2=np.cov(activations, rowvar=False) + m2 = np.mean(activations, axis=0) + s2 = np.cov(activations, rowvar=False) - fid_score=calculate_frechet_distance(m1, s1, m2, s2) + fid_score = calculate_frechet_distance(m1, s1, m2, s2) - result_dict["FID_SCORE"]=fid_score - result_dict["CLIP_SCORE"]=np.mean(clip_scores) + result_dict["FID_SCORE"] = fid_score + result_dict["CLIP_SCORE"] = np.mean(clip_scores) print(f"Accuracy Results: {result_dict}") with open(output_file, "w") as fp: From 8d3b8ab09ac392b5a8656ad07d37fb8d7942595b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 8 Nov 2024 03:44:42 +0530 Subject: [PATCH 10/15] Fix typo --- text_to_image/tools/accuracy_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index d73325897b..42ef8efe34 100644 --- 
a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -105,7 +105,7 @@ def main(): sample_ids_file_path = args.ids_path if args.ids_path else os.path.join( os.path.dirname(__file__), "sample_ids.txt") with open( - os.path.join(sample_ids_file_path, "r") + sample_ids_file_path, "r" ) as compliance_id_file: for line in compliance_id_file: idx = int(line.strip()) From b09b1efef4e5225d33618432cf71550ac135f501 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 8 Nov 2024 15:47:17 +0530 Subject: [PATCH 11/15] Not use default for sample_ids.txt --- text_to_image/tools/accuracy_coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index 42ef8efe34..b5f1be3783 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -54,7 +54,7 @@ def get_args(): # Do not use for official MLPerf inference submissions as only the default # one is valid parser.add_argument( - "--ids-path", help="Path to 10 caption ids to dump as compliance images", default="os.path.join(os.path.dirname(__file__), 'sample_ids.txt')" + "--ids-path", help="Path to 10 caption ids to dump as compliance images" ) parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"]) parser.add_argument( From df5049d4dbec41862fef6dd7edf9fb064a779bd6 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 14 Nov 2024 23:12:35 +0000 Subject: [PATCH 12/15] Update requirements.txt (#1907) Updating the pip packages --- text_to_image/requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/text_to_image/requirements.txt b/text_to_image/requirements.txt index a0a8507731..857de950e8 100644 --- a/text_to_image/requirements.txt +++ b/text_to_image/requirements.txt @@ -1,8 +1,8 @@ -diffusers==0.21.2 -transformers==4.33.2 -accelerate==0.23.0 -open-clip-torch==2.7.0 -opencv-python==4.8.1.78 +diffusers==0.30.3 +transformers==4.45.2 +accelerate==1.0.1 +open-clip-torch==2.26.1 +opencv-python==4.10.0.84 pycocotools==2.0.7 -torchmetrics[image]==1.2.0 -scipy==1.9.1 +torchmetrics[image]==1.4.3 +scipy==1.10.1 From a7e8c8ad2766e3fb64a31eb42c8cde724f7b055d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 15 Nov 2024 16:46:53 +0530 Subject: [PATCH 13/15] Fix preprocess_sudbmission for a bug --- tools/submission/preprocess_submission.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index a1678c79d0..ec3aa1f7af 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -84,7 +84,6 @@ def delete_empty_dirs(src): """ if not os.path.isdir(src): return False - if all([delete_empty_dirs(os.path.join(src, file)) for file in os.listdir(src)]): log.info("Removing empty dir: (%s)", src) @@ -532,13 +531,16 @@ def main(): if not args.nodelete_empty_dirs: delete_empty_dirs(os.path.join(src_dir)) + run_dir = os.getcwd() os.chdir(src_dir) infer_scenario_results(args, config) + os.chdir(run_dir) if not args.nodelete_empty_dirs: delete_empty_dirs(os.path.join(src_dir)) + return 0 From 8915a90ea0fed700afbffbc75908cd2fbf103104 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 16 Nov 2024 22:04:18 +0000 Subject: [PATCH 14/15] Update submission_checker.py | Removed TEST05 --- tools/submission/submission_checker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 5f2e272673..deff9eb8c4 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -188,6 +188,7 @@ "sample_index_rng_seed": 198141574272810017, "schedule_rng_seed": 7575108116881280410, }, + # not required for v5.0+ "test05_seeds": { # TODO: Update random seeds "qsl_rng_seed": 2376919268182438552, @@ -2880,7 +2881,7 @@ def check_compliance_dir( compliance_perf_pass = True compliance_perf_dir_pass = True compliance_acc_pass = True - test_list = ["TEST01", "TEST04", "TEST05"] + test_list = ["TEST01", "TEST04"] if model in [ "bert-99", @@ -2899,7 +2900,7 @@ def check_compliance_dir( ]: test_list.remove("TEST04") - if model in [ + if config.version in ["v4.0", "v4.1"] and model not in [ "gptj-99", "gptj-99.9", "llama2-70b-99", @@ -2907,7 +2908,7 @@ def check_compliance_dir( "stable-diffusion-xl", "mixtral-8x7b", ]: - test_list.remove("TEST05") + test_list.append("TEST05") if model in [ "gptj-99", From 941c0c484f4395e9b63611fbddafbde001a9de39 Mon Sep 17 00:00:00 2001 From: zixianwang2022 Date: Sat, 16 Nov 2024 21:18:39 -0600 Subject: [PATCH 15/15] move changes to fork 4 pr --- text_to_image/backend_pytorch.py | 112 +++++++------- text_to_image/main.py | 241 ++++++++++++++++++++++--------- text_to_image/mlperf.conf | 99 +++++++++++++ 3 files changed, 326 insertions(+), 126 deletions(-) create mode 100644 text_to_image/mlperf.conf diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py index 36e2b80090..f2af4d75c7 100644 --- a/text_to_image/backend_pytorch.py +++ b/text_to_image/backend_pytorch.py @@ -17,9 +17,9 @@ def __init__( model_id="xl", guidance=8, steps=20, - batch_size=1, + batch_size=2, device="cuda", - precision="fp32", + precision="fp16", negative_prompt="normal quality, low quality, worst quality, low res, blurry, nsfw, nude", ): super(BackendPytorch, self).__init__() @@ -57,39 +57,41 @@ def image_format(self): return "NCHW" def load(self): - if self.model_path is None: - log.warning( - "Model path not provided, running with default hugging face weights\n" - "This may not be valid for official submissions" - ) - self.scheduler = EulerDiscreteScheduler.from_pretrained( - self.model_id, subfolder="scheduler" - ) - self.pipe = StableDiffusionXLPipeline.from_pretrained( - self.model_id, - scheduler=self.scheduler, - safety_checker=None, - add_watermarker=False, - variant="fp16" if (self.dtype == torch.float16) else None, - torch_dtype=self.dtype, - ) + # if self.model_path is None: + # log.warning( + # "Model path not provided, running with default hugging face weights\n" + # "This may not be valid for official submissions" + # ) + self.scheduler = EulerDiscreteScheduler.from_pretrained( + self.model_id, subfolder="scheduler" + ) + self.pipe = StableDiffusionXLPipeline.from_pretrained( + self.model_id, + scheduler=self.scheduler, + safety_checker=None, + add_watermarker=False, + # variant="fp16" if (self.dtype == torch.float16) else None, + variant="fp16" , + torch_dtype=self.dtype, + ) # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True) - else: - self.scheduler = EulerDiscreteScheduler.from_pretrained( - os.path.join(self.model_path, "checkpoint_scheduler"), - subfolder="scheduler", - ) - self.pipe = StableDiffusionXLPipeline.from_pretrained( - os.path.join(self.model_path, "checkpoint_pipe"), - scheduler=self.scheduler, - safety_checker=None, - add_watermarker=False, - torch_dtype=self.dtype, - ) + 
# else: + # self.scheduler = EulerDiscreteScheduler.from_pretrained( + # os.path.join(self.model_path, "checkpoint_scheduler"), + # subfolder="scheduler", + # ) + # self.pipe = StableDiffusionXLPipeline.from_pretrained( + # os.path.join(self.model_path, "checkpoint_pipe"), + # scheduler=self.scheduler, + # safety_checker=None, + # add_watermarker=False, + # variant="fp16" if (self.dtype == torch.float16) else None, + # torch_dtype=self.dtype, + # ) # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True) self.pipe.to(self.device) - # self.pipe.set_progress_bar_config(disable=True) + #self.pipe.set_progress_bar_config(disable=True) self.negative_prompt_tokens = self.pipe.tokenizer( self.convert_prompt(self.negative_prompt, self.pipe.tokenizer), @@ -210,15 +212,13 @@ def encode_tokens( text_input_ids.to(device), output_hidden_states=True ) - # We are only ALWAYS interested in the pooled output of the - # final text encoder + # We are only ALWAYS interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds[0] if clip_skip is None: prompt_embeds = prompt_embeds.hidden_states[-2] else: # "2" because SDXL always indexes from the penultimate layer. - prompt_embeds = prompt_embeds.hidden_states[-( - clip_skip + 2)] + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] prompt_embeds_list.append(prompt_embeds) @@ -234,8 +234,7 @@ def encode_tokens( and zero_out_negative_prompt ): negative_prompt_embeds = torch.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = torch.zeros_like( - pooled_prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) elif do_classifier_free_guidance and negative_prompt_embeds is None: negative_prompt = negative_prompt or "" negative_prompt_2 = negative_prompt_2 or negative_prompt @@ -262,35 +261,30 @@ def encode_tokens( uncond_input.to(device), output_hidden_states=True, ) - # We are only ALWAYS interested in the pooled output of the - # final text encoder + # We are only ALWAYS interested in the pooled output of the final text encoder negative_pooled_prompt_embeds = negative_prompt_embeds[0] negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = torch.concat( - negative_prompt_embeds_list, dim=-1) + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) if pipe.text_encoder_2 is not None: prompt_embeds = prompt_embeds.to( dtype=pipe.text_encoder_2.dtype, device=device ) else: - prompt_embeds = prompt_embeds.to( - dtype=pipe.unet.dtype, device=device) + prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps - # friendly method + # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view( bs_embed * num_images_per_prompt, seq_len, -1 ) if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per - # prompt, using mps friendly method + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] if pipe.text_encoder_2 is not None: @@ -322,7 +316,7 @@ def encode_tokens( pooled_prompt_embeds, negative_pooled_prompt_embeds, ) - + def prepare_inputs(self, inputs, i): if self.batch_size == 
1: return self.encode_tokens( @@ -337,7 +331,7 @@ def prepare_inputs(self, inputs, i): negative_prompt_embeds = [] pooled_prompt_embeds = [] negative_pooled_prompt_embeds = [] - for prompt in inputs[i: min(i + self.batch_size, len(inputs))]: + for prompt in inputs[i:min(i+self.batch_size, len(inputs))]: assert isinstance(prompt, dict) text_input = prompt["input_tokens"] text_input_2 = prompt["input_tokens_2"] @@ -358,26 +352,19 @@ def prepare_inputs(self, inputs, i): pooled_prompt_embeds.append(p_p_e) negative_pooled_prompt_embeds.append(n_p_p_e) + prompt_embeds = torch.cat(prompt_embeds) negative_prompt_embeds = torch.cat(negative_prompt_embeds) pooled_prompt_embeds = torch.cat(pooled_prompt_embeds) - negative_pooled_prompt_embeds = torch.cat( - negative_pooled_prompt_embeds) - return ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) + negative_pooled_prompt_embeds = torch.cat(negative_pooled_prompt_embeds) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds def predict(self, inputs): images = [] with torch.no_grad(): for i in range(0, len(inputs), self.batch_size): - latents_input = [ - inputs[idx]["latents"] - for idx in range(i, min(i + self.batch_size, len(inputs))) - ] + print (f'self.steps BEFORE pipe: {self.steps}') + latents_input = [inputs[idx]["latents"] for idx in range(i, min(i+self.batch_size, len(inputs)))] latents_input = torch.cat(latents_input).to(self.device) ( prompt_embeds, @@ -392,8 +379,11 @@ def predict(self, inputs): negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, guidance_scale=self.guidance, num_inference_steps=self.steps, + # num_inference_steps=20, output_type="pt", latents=latents_input, ).images + print (f'self.steps AFTER pipe: {self.steps}') images.extend(generated) return images + diff --git a/text_to_image/main.py b/text_to_image/main.py index 6aa7c15e75..7d4da2a0ba 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -24,6 +24,8 @@ import dataset import coco +from concurrent.futures import ThreadPoolExecutor, as_completed + logging.basicConfig(level=logging.INFO) log = logging.getLogger("main") @@ -73,34 +75,24 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--dataset", - choices=SUPPORTED_DATASETS.keys(), - help="dataset") - parser.add_argument( - "--dataset-path", - required=True, - help="path to the dataset") + parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") + parser.add_argument("--dataset-path", required=True, help="path to the dataset") parser.add_argument( "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + - str(list(SCENARIO_MAP.keys())), + help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--max-batchsize", type=int, - default=1, + default=2, help="max batch size in a single inference", ) parser.add_argument("--threads", default=1, type=int, help="threads") - parser.add_argument( - "--accuracy", - action="store_true", - help="enable accuracy pass") + parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") parser.add_argument( "--find-peak-performance", action="store_true", @@ -111,10 +103,11 @@ def get_args(): parser.add_argument("--output", default="output", help="test results") parser.add_argument("--qps", type=int, help="target qps") 
parser.add_argument("--model-path", help="Path to model weights") + parser.add_argument("--gpu-num", type=int, default=4, help="Number of gpus to run inference") parser.add_argument( "--dtype", - default="fp32", + default="fp16", choices=["fp32", "fp16", "bf16"], help="dtype of the model", ) @@ -131,6 +124,10 @@ def get_args(): help="framework to load the latents", ) + # file to use mlperf rules compliant parameters + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) # file for user LoadGen settings such as target QPS parser.add_argument( "--user_conf", @@ -145,20 +142,14 @@ def get_args(): # pass this argument for official submission # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images") # do not modify this argument for official submission - parser.add_argument( - "--ids-path", help="Path to caption ids", default="tools/sample_ids.txt" - ) + parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt") - # below will override mlperf rules compliant settings - don't use for - # official submission + # below will override mlperf rules compliant settings - don't use for official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") parser.add_argument( - "--performance-sample-count", - type=int, - help="performance sample count", - default=5000, + "--performance-sample-count", type=int, help="performance sample count", default=5000 ) parser.add_argument( "--max-latency", type=float, help="mlperf max latency in pct tile" @@ -271,9 +262,9 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i: i + bs]) + data, label = self.ds.get_samples(idx[i : i + bs]) self.run_one_item( - Item(query_id[i: i + bs], idx[i: i + bs], data, label) + Item(query_id[i : i + bs], idx[i : i + bs], data, label) ) def finish(self): @@ -288,9 +279,7 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread( - target=self.handle_tasks, args=( - self.tasks,)) + worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -333,13 +322,29 @@ def main(): log.info(args) # find backend - backend = get_backend( - args.backend, - precision=args.dtype, - device=args.device, - model_path=args.model_path, - batch_size=args.max_batchsize, - ) + + # backend = get_backend( + # args.backend, + # precision=args.dtype, + # device=args.device, + # model_path=args.model_path, + # batch_size=args.max_batchsize + # ) + # Zixian: Oct 21: create a list of backends for multi-gpu + + # Zixian: Nov 13: Force batchsize=2 since command line doesn't work + args.max_batchsize = 5 + + backends = [get_backend( + args.backend, + precision=args.dtype, + device=f'cuda:{i}', + model_path=args.model_path, + batch_size=args.max_batchsize + ) + for i in np.arange (args.gpu_num)] + + if args.dtype == "fp16": dtype = torch.float16 elif args.dtype == "bf16": @@ -355,7 +360,9 @@ def main(): count_override = True # load model to backend - model = backend.load() + # model = backend.load() + # Zixian: Oct 21: create a list of models corresponding to each backend + models = [backend.load() for backend in backends] # 
dataset to use dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] @@ -365,28 +372,37 @@ def main(): pre_process=pre_proc, count=count, threads=args.threads, - pipe_tokenizer=model.pipe.tokenizer, - pipe_tokenizer_2=model.pipe.tokenizer_2, + # pipe_tokenizer=model.pipe.tokenizer, + # pipe_tokenizer_2=model.pipe.tokenizer_2, + pipe_tokenizer=models[0].pipe.tokenizer, + pipe_tokenizer_2=models[0].pipe.tokenizer_2, latent_dtype=dtype, latent_device=args.device, latent_framework=args.latent_framework, **kwargs, ) final_results = { - "runtime": model.name(), - "version": model.version(), + # "runtime": model.name(), + # "version": model.version(), + "runtime": models[0].name(), + "version": models[0].version(), "time": int(time.time()), "args": vars(args), "cmdline": str(args), } + mlperf_conf = os.path.abspath(args.mlperf_conf) + if not os.path.exists(mlperf_conf): + log.error("{} not found".format(mlperf_conf)) + sys.exit(1) + user_conf = os.path.abspath(args.user_conf) if not os.path.exists(user_conf): log.error("{} not found".format(user_conf)) sys.exit(1) audit_config = os.path.abspath(args.audit_conf) - + if args.accuracy: ids_path = os.path.abspath(args.ids_path) with open(ids_path) as f: @@ -405,16 +421,47 @@ def main(): # warmup syntetic_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" latents_pt = torch.rand(ds.latents.shape, dtype=dtype).to(args.device) - warmup_samples = [ - { - "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), - "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), - "latents": latents_pt, - } - for _ in range(args.max_batchsize) - ] - for i in range(5): - _ = backend.predict(warmup_samples) + # warmup_samples = [ + # { + # "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), + # "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), + # "latents": latents_pt, + # } + # for _ in range(args.max_batchsize) + # ] + warmup_samples_gpus = [ + [ + { + "input_tokens": ds.preprocess(syntetic_str, model.pipe.tokenizer), + "input_tokens_2": ds.preprocess(syntetic_str, model.pipe.tokenizer_2), + "latents": latents_pt, + } + for _ in range(int(args.max_batchsize)) + ] + for model in models] * 3 # 3 times warmup samples + + # Zixian: Oct 21: warm up each backend + # for idx, backend in enumerate (backends): + # for i in range(1): + # _ = backend.predict(warmup_samples_gpus[idx]) + + print (f'Start distributed warmup') + with ThreadPoolExecutor(max_workers=len(backends)) as executor: + # Map each runner to its respective sublist + futures = { + executor.submit(backend.predict, queries): backend + for backend, queries in zip(backends, warmup_samples_gpus) + } + + # Optionally process the results + for future in as_completed(futures): + backend = futures[future] + try: + result = future.result() + print(f'Warmup backend {backend} enqueued successfully.') + except Exception as exc: + print(f'Warmup backend {backend} generated an exception: {exc}') + scenario = SCENARIO_MAP[args.scenario] runner_map = { @@ -423,12 +470,54 @@ def main(): lg.TestScenario.Server: QueueRunner, lg.TestScenario.Offline: QueueRunner, } - runner = runner_map[scenario]( - model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize - ) - + + # Zixian: Oct 21: create a list of runner + # runner = runner_map[scenario]( + # model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + # ) + runners = [runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, 
max_batchsize=args.max_batchsize + ) + for model in models] + + # def issue_queries(query_samples): + # runner.enqueue(query_samples) def issue_queries(query_samples): - runner.enqueue(query_samples) + print (f'\n\n len (query_samples): {len (query_samples)} \n\n') + + query_samples_len = len (query_samples) + query_samples_seg_len = query_samples_len / len (runners) + splitted_query_samples = [] + + + for idx in range (len (runners)): + log.info (f'\n\n\n') + log.info (f'idx: {idx}') + log.info (f'query_samples_len: {query_samples_len}') + log.info (f'idx: {idx}') + # if idx == len (runners) -1: + # splitted_query_samples.append (query_samples[idx*query_samples_seg_len:]) + # else: + # splitted_query_samples.append (query_samples[idx*query_samples_seg_len : (idx+1)*query_samples_seg_len]) + splitted_query_samples.append (query_samples [int(round(query_samples_seg_len * idx)): int(round(query_samples_seg_len * (idx + 1)))]) + + # splitted_query_samples = [query_samples[int(round(avg * i)): int(round(avg * (i + 1)))] for i in range(b)] + + with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # Map each runner to its respective sublist + futures = { + executor.submit(runner.enqueue, queries): runner + for runner, queries in zip(runners, splitted_query_samples) + } + + # Optionally process the results + for future in as_completed(futures): + runner = futures[future] + try: + result = future.result() + print(f'Runner {runner} enqueued successfully.') + except Exception as exc: + print(f'Runner {runner} generated an exception: {exc}') def flush_queries(): pass @@ -441,8 +530,7 @@ def flush_queries(): log_settings.log_output = log_output_settings settings = lg.TestSettings() - # mlperf.conf is automatically loaded by the loadgen - # settings.FromConfig(mlperf_conf, args.model_name, args.scenario) + settings.FromConfig(mlperf_conf, args.model_name, args.scenario) settings.FromConfig(user_conf, args.model_name, args.scenario) if os.path.exists(audit_config): settings.FromConfig(audit_config, args.model_name, args.scenario) @@ -458,6 +546,8 @@ def flush_queries(): settings.min_duration_ms = args.time * MILLI_SEC settings.max_duration_ms = args.time * MILLI_SEC + # Zixian: Nov8: manually setting args.qps to 1 + # args.qps=1.0 if args.qps: qps = float(args.qps) settings.server_target_qps = qps @@ -471,14 +561,15 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int( - args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) performance_sample_count = ( args.performance_sample_count if args.performance_sample_count else min(count, 500) ) + + # count = 200 sut = lg.ConstructSUT(issue_queries, flush_queries) qsl = lg.ConstructQSL( count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples @@ -486,19 +577,39 @@ def flush_queries(): log.info("starting {}".format(scenario)) result_dict = {"scenario": str(scenario)} - runner.start_run(result_dict, args.accuracy) + for runner in runners: + runner.start_run(result_dict, args.accuracy) + + # with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # # Map each runner to its respective sublist + # futures = { + # executor.submit(runner.finish(), (result_dict, args.accuracy)): runner + # for runner in runners + # } + lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config) if 
args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) final_results["accuracy_results"] = result_dict - post_proc.save_images(saved_images_ids, ds) + # post_proc.save_images(saved_images_ids, ds) + - runner.finish() lg.DestroyQSL(qsl) lg.DestroySUT(sut) + # for runner in runners: + # runner.finish() + with ThreadPoolExecutor(max_workers=len(runners)) as executor: + # Map each runner to its respective sublist + futures = { + executor.submit(runner.finish()): runner + for runner in runners + } + + + # # write final results # diff --git a/text_to_image/mlperf.conf b/text_to_image/mlperf.conf new file mode 100644 index 0000000000..0cea5351e1 --- /dev/null +++ b/text_to_image/mlperf.conf @@ -0,0 +1,99 @@ +# The format of this config file is 'key = value'. +# The key has the format 'model.scenario.key'. Value is mostly int64_t. +# Model maybe '*' as wildcard. In that case the value applies to all models. +# All times are in milli seconds + +# Set performance_sample_count for each model. +# User can optionally set this to higher values in user.conf. +resnet50.*.performance_sample_count_override = 1024 +ssd-mobilenet.*.performance_sample_count_override = 256 +retinanet.*.performance_sample_count_override = 64 +bert.*.performance_sample_count_override = 10833 +dlrm.*.performance_sample_count_override = 204800 +dlrm-v2.*.performance_sample_count_override = 204800 +rnnt.*.performance_sample_count_override = 2513 +gptj.*.performance_sample_count_override = 13368 +llama2-70b.*.performance_sample_count_override = 24576 +stable-diffusion-xl.*.performance_sample_count_override = 5000 +# set to 0 to let entire sample set to be performance sample +3d-unet.*.performance_sample_count_override = 0 + +# Set seeds. The seeds will be distributed two weeks before the submission. +*.*.qsl_rng_seed = 3066443479025735752 +*.*.sample_index_rng_seed = 10688027786191513374 +*.*.schedule_rng_seed = 14962580496156340209 +# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission. 
+*.*.test05_qsl_rng_seed = 16799458546791641818 +*.*.test05_sample_index_rng_seed = 5453809927556429288 +*.*.test05_schedule_rng_seed = 5435552105434836064 + + +*.SingleStream.target_latency_percentile = 90 +*.SingleStream.min_duration = 600000 + +*.MultiStream.target_latency_percentile = 99 +*.MultiStream.samples_per_query = 8 +*.MultiStream.min_duration = 600000 +*.MultiStream.min_query_count = 662 +retinanet.MultiStream.target_latency = 528 + +# 3D-UNet uses equal issue mode because it has non-uniform inputs +3d-unet.*.sample_concatenate_permutation = 1 + +# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario +gptj.*.sample_concatenate_permutation = 1 +llama2-70b.*.sample_concatenate_permutation = 1 +mixtral-8x7b.*.sample_concatenate_permutation = 1 + +*.Server.target_latency = 10 +*.Server.target_latency_percentile = 99 +*.Server.target_duration = 0 +*.Server.min_duration = 600000 +resnet50.Server.target_latency = 15 +retinanet.Server.target_latency = 100 +bert.Server.target_latency = 130 +dlrm.Server.target_latency = 60 +dlrm-v2.Server.target_latency = 60 +rnnt.Server.target_latency = 1000 +gptj.Server.target_latency = 20000 +stable-diffusion-xl.Server.target_latency = 20000 +# Llama2-70b benchmarks measures token latencies +llama2-70b.*.use_token_latencies = 1 +mixtral-8x7b.*.use_token_latencies = 1 +# gptj benchmark infers token latencies +gptj.*.infer_token_latencies = 1 +gptj.*.token_latency_scaling_factor = 69 +# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0 +llama2-70b.Server.target_latency = 0 +llama2-70b.Server.ttft_latency = 2000 +llama2-70b.Server.tpot_latency = 200 + +mixtral-8x7b.Server.target_latency = 0 +mixtral-8x7b.Server.ttft_latency = 2000 +mixtral-8x7b.Server.tpot_latency = 200 + +*.Offline.target_latency_percentile = 90 +*.Offline.min_duration = 600000 + +# In Offline scenario, we always have one query. But LoadGen maps this to +# min_sample_count internally in Offline scenario. If the dataset size is larger +# than 24576 we limit the min_query_count to 24576 and otherwise we use +# the dataset size as the limit + +resnet50.Offline.min_query_count = 24576 +retinanet.Offline.min_query_count = 24576 +dlrm-v2.Offline.min_query_count = 24576 +bert.Offline.min_query_count = 10833 +gptj.Offline.min_query_count = 13368 +rnnt.Offline.min_query_count = 2513 +3d-unet.Offline.min_query_count = 43 +stable-diffusion-xl.Offline.min_query_count = 4000 +llama2-70b.Offline.min_query_count = 24576 +mixtral-8x7b.Offline.min_query_count = 15000 + +# These fields should be defined and overridden by user.conf. +*.SingleStream.target_latency = 10 +*.MultiStream.target_latency = 80 +*.Server.target_qps = 1.0 +*.Offline.target_qps = 1.0 +
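
A minimal sketch, not part of the patch series above, of the per-GPU fan-out that the final patch adds to text_to_image/main.py: issue_queries() cuts the LoadGen query_samples into contiguous, near-equal slices and enqueues one slice per runner on its own worker thread. The names runners and query_samples below stand in for the per-GPU QueueRunner list and the LoadGen samples from main.py; the helper functions themselves are illustrative restatements, not code from the patch.

from concurrent.futures import ThreadPoolExecutor, as_completed


def split_evenly(items, n_parts):
    # Contiguous slices whose lengths differ by at most one, using the same
    # int(round(seg * i)) boundaries as issue_queries() in the patch.
    seg = len(items) / n_parts
    return [items[int(round(seg * i)): int(round(seg * (i + 1)))]
            for i in range(n_parts)]


def dispatch(runners, query_samples):
    # Fan the slices out to the runners, one worker thread per GPU backend,
    # and re-raise any enqueue exception via future.result().
    slices = split_evenly(query_samples, len(runners))
    with ThreadPoolExecutor(max_workers=len(runners)) as pool:
        futures = {pool.submit(runner.enqueue, qs): runner
                   for runner, qs in zip(runners, slices)}
        for future in as_completed(futures):
            future.result()

Because the slices are contiguous, each runner still sees its sample indices in dataset order, so the existing per-runner batching by max_batchsize applies unchanged.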