From dce2c0aa9a56d9f9618567c7df162c702c98067d Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 23 May 2023 12:17:45 -0600 Subject: [PATCH 1/9] Generate HTML report summarizing file usage from .darshan log: DataFrame with 'glob_filename' and 'glob_count' columns. --- git_project/glob_feature/glob_feature.py | 66 ++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 git_project/glob_feature/glob_feature.py diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py new file mode 100644 index 000000000..d84cb65a5 --- /dev/null +++ b/git_project/glob_feature/glob_feature.py @@ -0,0 +1,66 @@ +# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. +# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts +# Command to run python glob_feature.py -p path/to/log/file.darshan + + +import argparse +import pandas as pd +import difflib +import darshan + + +def path_grouper(): + matcher = difflib.SequenceMatcher() + def group_paths(paths): + if not matcher.a: + matcher.set_seq1(paths) + return paths + else: + matcher.set_seq2(paths) + matchings = matcher.get_matching_blocks() + if any(size > 25 for _, _, size in matchings): # change size to bigger number for more precise paths + return matcher.a + else: + matcher.set_seq1(paths) + return paths + + return group_paths + + +def regex_df_condenser(df, paths): + path_grouper_func = path_grouper() + df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) + df = df.groupby("filename_glob").size().reset_index(name="glob_count") + + return df + + +def main(log_path): + report = darshan.DarshanReport(log_path) + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + df = df[df["filename_glob"].str.contains(r"/.*")] + df["glob_count"] = 1 + df = regex_df_condenser(df, df["filename_glob"]) + df.sort_values(by="glob_count", inplace=True, ascending=False) + + + style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) + style.hide(axis="index") + style.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + html = style.to_html() + +# can change name of the output html report here + with open("name_record_table.html", "w") as html_file: + html_file.write(html) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--log-path', type=str) + args = parser.parse_args() + main(log_path=args.log_path) + From 8277396081f43a2025219460fd48dff9e04ddd2c Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 30 May 2023 15:37:38 -0600 Subject: [PATCH 2/9] updated glob_feature.py which creates dataframe of glob_filename and glob_count --- git_project/glob_feature/glob_feature.py | 65 +++++++++++++++++++----- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py index d84cb65a5..ac81c3218 100644 --- a/git_project/glob_feature/glob_feature.py +++ b/git_project/glob_feature/glob_feature.py @@ -7,9 +7,11 @@ import pandas as pd import difflib import darshan +import re +import os -def path_grouper(): +def make_path_grouper(): matcher = difflib.SequenceMatcher() def group_paths(paths): if not 
matcher.a: @@ -17,32 +19,68 @@ def group_paths(paths): return paths else: matcher.set_seq2(paths) - matchings = matcher.get_matching_blocks() - if any(size > 25 for _, _, size in matchings): # change size to bigger number for more precise paths + similarity_ratio = matcher.ratio() + if similarity_ratio >= 0.8: return matcher.a else: matcher.set_seq1(paths) return paths - return group_paths def regex_df_condenser(df, paths): - path_grouper_func = path_grouper() + path_grouper_func = make_path_grouper() df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) + print("Paths after grouping:") + print(df["filename_glob"]) + df = df.groupby("filename_glob").size().reset_index(name="glob_count") + df = df.sort_values(by="glob_count", ascending=False) + + print("Paths after grouping and counting:") + print(df) + + + def find_common_prefix(paths): + # Sort the paths in lexicographical order + sorted_paths = sorted(paths) + + # Find the common prefix + common_prefix = os.path.commonprefix(sorted_paths) + + # Trim the common prefix to the last path separator + last_separator = common_prefix.rfind(os.path.sep) + common_prefix = common_prefix[:last_separator+1] if last_separator >= 0 else common_prefix + + return common_prefix + + + for group in df["filename_glob"].unique(): + group_df = df[df["filename_glob"] == group] + common_path = find_common_prefix(group_df["filename_glob"]) + df.loc[df["filename_glob"] == group, "filename_glob"] = common_path + + print("Paths after modifying filename_glob:") + print(df) + + df["regex"] = df.apply(lambda row: re.escape(row["filename_glob"]) + r".*", axis=1) + print("Paths after applying regex:") + print(df) + return df -def main(log_path): + +def main(log_path, output_path): report = darshan.DarshanReport(log_path) + + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + df = df[df["filename_glob"].str.contains(r"/.*")] df["glob_count"] = 1 df = regex_df_condenser(df, df["filename_glob"]) - df.sort_values(by="glob_count", inplace=True, ascending=False) - style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) style.hide(axis="index") @@ -53,14 +91,15 @@ def main(log_path): ]) html = style.to_html() -# can change name of the output html report here - with open("name_record_table.html", "w") as html_file: + # can change name of the output html report here + with open("name_record_glob.html", "w") as html_file: html_file.write(html) - +# go back to hdf5_diagonal dxt if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-p', '--log-path', type=str) + parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") + parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") args = parser.parse_args() - main(log_path=args.log_path) + main(log_path=args.log_path , output_path=args.output_path) From 397780a7254307c09e3a161ac91d5854abe9e15e Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Wed, 31 May 2023 08:59:44 -0600 Subject: [PATCH 3/9] WIP: this script creates a condensed dataframe of glob_filename and glob_count from a log .darshan file --- git_project/glob_feature/glob_feature.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py index ac81c3218..9febba5dd 100644 --- a/git_project/glob_feature/glob_feature.py +++ b/git_project/glob_feature/glob_feature.py @@ -30,17 +30,13 @@ def 
group_paths(paths): def regex_df_condenser(df, paths): path_grouper_func = make_path_grouper() + df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) - print("Paths after grouping:") - print(df["filename_glob"]) df = df.groupby("filename_glob").size().reset_index(name="glob_count") df = df.sort_values(by="glob_count", ascending=False) - print("Paths after grouping and counting:") - print(df) - def find_common_prefix(paths): # Sort the paths in lexicographical order @@ -61,12 +57,8 @@ def find_common_prefix(paths): common_path = find_common_prefix(group_df["filename_glob"]) df.loc[df["filename_glob"] == group, "filename_glob"] = common_path - print("Paths after modifying filename_glob:") - print(df) - df["regex"] = df.apply(lambda row: re.escape(row["filename_glob"]) + r".*", axis=1) - print("Paths after applying regex:") - print(df) + df["filename_glob"] = df.apply(lambda row: (row["filename_glob"]) + r".*", axis=1) return df @@ -92,10 +84,10 @@ def main(log_path, output_path): html = style.to_html() # can change name of the output html report here - with open("name_record_glob.html", "w") as html_file: + with open("name_record_glob_hd5f.html", "w") as html_file: html_file.write(html) -# go back to hdf5_diagonal dxt + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") From edc12c383ec01cfb9d68ee4040089ee3e4a8952c Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 6 Jun 2023 10:27:54 -0600 Subject: [PATCH 4/9] Refactored glob_feature.py script and improved data frame creation for e3sm_io_heatmap_only.darshan log file. Also, relocated the script to a more suitable spot among other Python files in the project. --- .../darshan/glob_feature/glob_feature.py | 89 +++++++++++++++++ git_project/glob_feature/glob_feature.py | 97 ------------------- 2 files changed, 89 insertions(+), 97 deletions(-) create mode 100644 darshan-util/pydarshan/darshan/glob_feature/glob_feature.py delete mode 100644 git_project/glob_feature/glob_feature.py diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py new file mode 100644 index 000000000..4a29895fe --- /dev/null +++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py @@ -0,0 +1,89 @@ +# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. 
+# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts +# Command to run python glob_feature.py -p path/to/log/file.darshan + + +import argparse +import pandas as pd +import difflib +import darshan +import re +import os + + +def generalize_filename_glob(df): + paths = df["filename_glob"].tolist() + grouped_paths = [] + + for i in range(len(paths)): + if not grouped_paths: + grouped_paths.append((paths[i],)) + else: + is_grouped = False + for j, group in enumerate(grouped_paths): + matcher = difflib.SequenceMatcher(None, paths[i], group[0]) + similarity_ratio = matcher.ratio() + if similarity_ratio >= 0.8: + grouped_paths[j] = group + (paths[i],) + is_grouped = True + break + if not is_grouped: + grouped_paths.append((paths[i],)) + + print("grouped paths list is", grouped_paths) + + new_paths = [] + for group in grouped_paths: + if len(group) > 1: + common_prefix = os.path.commonprefix(group) + pattern = r"({}.*)\d(.*)".format(common_prefix) + modified_path = re.sub(pattern, r"\1\\d\2", group[0]) + new_paths.append((modified_path, len(group))) + else: + new_paths.append((group[0], 1)) + + new_paths = [path for path in new_paths if path[0]] + + if len(new_paths) > len(df): + new_paths = new_paths[:len(df)] + + print("new paths are", new_paths) + return new_paths + + +def main(log_path, output_path): + + report = darshan.DarshanReport(log_path) + + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + + df = df[df["filename_glob"].str.contains(r"/.*")] + + df.reset_index(drop=True, inplace=True) # Reset the index + + + new_paths = generalize_filename_glob(df) + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) + df = df.reset_index(drop=True) + df = df.sort_values(by="glob_count", ascending=False) + + style = df.style.background_gradient(axis=0, cmap="viridis") + style.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + + style = style.hide_index() + html = style.render() + + with open(output_path, "w") as html_file: + html_file.write(html) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") + parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") + args = parser.parse_args() + main(log_path=args.log_path, output_path=args.output_path) diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py deleted file mode 100644 index 9febba5dd..000000000 --- a/git_project/glob_feature/glob_feature.py +++ /dev/null @@ -1,97 +0,0 @@ -# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. 
-# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts -# Command to run python glob_feature.py -p path/to/log/file.darshan - - -import argparse -import pandas as pd -import difflib -import darshan -import re -import os - - -def make_path_grouper(): - matcher = difflib.SequenceMatcher() - def group_paths(paths): - if not matcher.a: - matcher.set_seq1(paths) - return paths - else: - matcher.set_seq2(paths) - similarity_ratio = matcher.ratio() - if similarity_ratio >= 0.8: - return matcher.a - else: - matcher.set_seq1(paths) - return paths - return group_paths - - -def regex_df_condenser(df, paths): - path_grouper_func = make_path_grouper() - - df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) - - df = df.groupby("filename_glob").size().reset_index(name="glob_count") - - df = df.sort_values(by="glob_count", ascending=False) - - - def find_common_prefix(paths): - # Sort the paths in lexicographical order - sorted_paths = sorted(paths) - - # Find the common prefix - common_prefix = os.path.commonprefix(sorted_paths) - - # Trim the common prefix to the last path separator - last_separator = common_prefix.rfind(os.path.sep) - common_prefix = common_prefix[:last_separator+1] if last_separator >= 0 else common_prefix - - return common_prefix - - - for group in df["filename_glob"].unique(): - group_df = df[df["filename_glob"] == group] - common_path = find_common_prefix(group_df["filename_glob"]) - df.loc[df["filename_glob"] == group, "filename_glob"] = common_path - - - df["filename_glob"] = df.apply(lambda row: (row["filename_glob"]) + r".*", axis=1) - - return df - - - -def main(log_path, output_path): - report = darshan.DarshanReport(log_path) - - - df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) - - df = df[df["filename_glob"].str.contains(r"/.*")] - df["glob_count"] = 1 - df = regex_df_condenser(df, df["filename_glob"]) - - style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) - style.hide(axis="index") - style.set_table_styles([ - {"selector": "", "props": [("border", "1px solid grey")]}, - {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - ]) - html = style.to_html() - - # can change name of the output html report here - with open("name_record_glob_hd5f.html", "w") as html_file: - html_file.write(html) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") - parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") - args = parser.parse_args() - main(log_path=args.log_path , output_path=args.output_path) - From 9b757d51f4ddc65feb0ae80f4cde1c94489d73e8 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Wed, 14 Jun 2023 15:01:29 -0600 Subject: [PATCH 5/9] Rearranged glob_feature.py and added test_glob_feature.py to the test directory. 
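For review purposes, here is a standalone sketch (invented paths, not taken from any log) of the two steps generalize_filename_glob chains together: difflib similarity grouping, then regex generalization of a group. Note that common_prefix is interpolated into the pattern without re.escape(), so regex metacharacters such as "." in real paths act as wildcards:

    import difflib
    import os
    import re

    paths = ["/e3sm/can_I_out_h0.nc", "/e3sm/can_I_out_h1.nc"]
    # step 1: the pairwise similarity ratio decides membership in a group
    ratio = difflib.SequenceMatcher(None, paths[0], paths[1]).ratio()
    print(ratio >= 0.8)                     # True -> the two paths are grouped

    # step 2: the group's first member is generalized around its digits
    common_prefix = os.path.commonprefix(paths)
    pattern = r"({}.*)\d(.*)".format(common_prefix)
    print(re.sub(pattern, r"\1\\d\2", paths[0]))
    # -> /e3sm/can_I_out_h\d.nc  (the greedy .* means only the last digit
    #    in the path is rewritten, not every digit)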
--- .../darshan/tests/test_glob_feature.py | 49 ++++++++++ darshan-util/pydarshan/glob_feature.py | 98 +++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 darshan-util/pydarshan/darshan/tests/test_glob_feature.py create mode 100644 darshan-util/pydarshan/glob_feature.py diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py new file mode 100644 index 000000000..8a779857b --- /dev/null +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -0,0 +1,49 @@ +import os +import darshan +from darshan.log_utils import get_log_path +import pandas as pd +print(pd.__version__) +from pandas.testing import assert_frame_equal +import pytest +import re +print(sys.path) # Print sys.path again +import glob_feature + +print("hello") +@pytest.mark.parametrize("log_name, expected_df", [ + # grow this with more logs... + ("e3sm_io_heatmap_only.darshan", + pd.DataFrame({"filename_glob": + # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d" + ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc", + "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], + "glob_count": [2, 1]})), +]) + +def test_glob_tables(tmpdir, log_name, expected_df): + print("Current working directory:", os.getcwd()) + + # test the glob table HTML outputs for various + # log files in the logs repo (and new log files + # that you creatively design yourself) + log_path = get_log_path(log_name) + print("log path is", log_path) + with tmpdir.as_cwd(): + cwd = os.getcwd() + # TODO: you shouldn't have a hardcoded HTML filename + # like this... + outfile = os.path.join(cwd, "name_record_glob_hd5f.html") + glob_feature.main(log_path, outfile) + actual_table = pd.read_html(outfile)[0] + actual_table.drop("Unnamed: 0", axis=1, inplace=True) # Drop the "Unnamed: 0" column + print("actual table is", actual_table) + print("expected_df is", expected_df) + print("pandas version is", pd.__version__) + print("log path is", log_path) + # Compare the two DataFrames + diff = actual_table['filename_glob'].compare(expected_df['filename_glob']) + # Print the differences + print(diff) + assert_frame_equal(actual_table, expected_df) + + diff --git a/darshan-util/pydarshan/glob_feature.py b/darshan-util/pydarshan/glob_feature.py new file mode 100644 index 000000000..8b1cb1455 --- /dev/null +++ b/darshan-util/pydarshan/glob_feature.py @@ -0,0 +1,98 @@ +# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read b$ +# It uses sequence matching and grouping techniques to group similar file paths together and genera$ +# Command to run python glob_feature.py -p path/to/log/file.darshan + + +import argparse +import pandas as pd +import difflib +import darshan +import re +import os + + +def generalize_filename_glob(df): + paths = df["filename_glob"].tolist() + grouped_paths = [] + + for i in range(len(paths)): + if not grouped_paths: + grouped_paths.append((paths[i],)) + else: + is_grouped = False + for j, group in enumerate(grouped_paths): + matcher = difflib.SequenceMatcher(None, paths[i], group[0]) + similarity_ratio = matcher.ratio() + if similarity_ratio >= 0.8: + grouped_paths[j] = group + (paths[i],) + is_grouped = True + break + if not is_grouped: + grouped_paths.append((paths[i],)) + + print("grouped paths list is", grouped_paths) + + new_paths = [] + for group in grouped_paths: + if len(group) > 1: + common_prefix = os.path.commonprefix(group) + pattern = r"({}.*)\d(.*)".format(common_prefix) + 
modified_path = re.sub(pattern, r"\1\\d\2", group[0]) + new_paths.append((modified_path, len(group))) + else: + new_paths.append((group[0], 1)) + + new_paths = [path for path in new_paths if path[0]] + + if len(new_paths) > len(df): + new_paths = new_paths[:len(df)] + + print("new paths are", new_paths) + return new_paths + + + + +def main(log_path, output_path): + + report = darshan.DarshanReport(log_path) + + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + + df = df[df["filename_glob"].str.contains(r"/.*")] + + df.reset_index(drop=True, inplace=True) # Reset the index + + + new_paths = generalize_filename_glob(df) + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) + df = df.reset_index(drop=True) + df = df.sort_values(by="glob_count", ascending=False) + + + style = df.style.background_gradient(axis=0, cmap="viridis") + style.set_properties(subset=["glob_count"], **{"text-align": "right"}) + + style.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + + ]) + + # html = style.render() # use this when running python glob_feature.py -p /path/to/logfile + + html = style.to_html() #use when running pytest + + + with open(output_path, "w") as html_file: + html_file.write(html) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") + parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") + args = parser.parse_args() + main(log_path=args.log_path, output_path=args.output_path) + From a5df39450827d56cb49100f80743efcae16f4934 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Thu, 15 Jun 2023 15:34:14 -0600 Subject: [PATCH 6/9] Remove glob_feature.py from wrong location --- darshan-util/pydarshan/glob_feature.py | 98 -------------------------- 1 file changed, 98 deletions(-) delete mode 100644 darshan-util/pydarshan/glob_feature.py diff --git a/darshan-util/pydarshan/glob_feature.py b/darshan-util/pydarshan/glob_feature.py deleted file mode 100644 index 8b1cb1455..000000000 --- a/darshan-util/pydarshan/glob_feature.py +++ /dev/null @@ -1,98 +0,0 @@ -# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read b$ -# It uses sequence matching and grouping techniques to group similar file paths together and genera$ -# Command to run python glob_feature.py -p path/to/log/file.darshan - - -import argparse -import pandas as pd -import difflib -import darshan -import re -import os - - -def generalize_filename_glob(df): - paths = df["filename_glob"].tolist() - grouped_paths = [] - - for i in range(len(paths)): - if not grouped_paths: - grouped_paths.append((paths[i],)) - else: - is_grouped = False - for j, group in enumerate(grouped_paths): - matcher = difflib.SequenceMatcher(None, paths[i], group[0]) - similarity_ratio = matcher.ratio() - if similarity_ratio >= 0.8: - grouped_paths[j] = group + (paths[i],) - is_grouped = True - break - if not is_grouped: - grouped_paths.append((paths[i],)) - - print("grouped paths list is", grouped_paths) - - new_paths = [] - for group in grouped_paths: - if len(group) > 1: - common_prefix = os.path.commonprefix(group) - pattern = r"({}.*)\d(.*)".format(common_prefix) - modified_path = re.sub(pattern, r"\1\\d\2", group[0]) - new_paths.append((modified_path, len(group))) - else: - 
new_paths.append((group[0], 1)) - - new_paths = [path for path in new_paths if path[0]] - - if len(new_paths) > len(df): - new_paths = new_paths[:len(df)] - - print("new paths are", new_paths) - return new_paths - - - - -def main(log_path, output_path): - - report = darshan.DarshanReport(log_path) - - df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) - - df = df[df["filename_glob"].str.contains(r"/.*")] - - df.reset_index(drop=True, inplace=True) # Reset the index - - - new_paths = generalize_filename_glob(df) - df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.reset_index(drop=True) - df = df.sort_values(by="glob_count", ascending=False) - - - style = df.style.background_gradient(axis=0, cmap="viridis") - style.set_properties(subset=["glob_count"], **{"text-align": "right"}) - - style.set_table_styles([ - {"selector": "", "props": [("border", "1px solid grey")]}, - {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - - ]) - - # html = style.render() # use this when running python glob_feature.py -p /path/to/logfile - - html = style.to_html() #use when running pytest - - - with open(output_path, "w") as html_file: - html_file.write(html) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") - parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") - args = parser.parse_args() - main(log_path=args.log_path, output_path=args.output_path) - From 452568bf902c26aa968fab43a60e7647a9f50280 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Mon, 19 Jun 2023 14:10:59 -0600 Subject: [PATCH 7/9] Fixed styling of glob_feature.py and added [.*] grouping feature. Added __init__.py to glob_feature. Fixed errors in test_glob_feature.py. --- .../darshan/glob_feature/__init__.py | 5 +++ .../darshan/glob_feature/glob_feature.py | 32 +++++++++++-------- .../darshan/tests/test_glob_feature.py | 20 +++++------- 3 files changed, 32 insertions(+), 25 deletions(-) create mode 100644 darshan-util/pydarshan/darshan/glob_feature/__init__.py diff --git a/darshan-util/pydarshan/darshan/glob_feature/__init__.py b/darshan-util/pydarshan/darshan/glob_feature/__init__.py new file mode 100644 index 000000000..060f8bf81 --- /dev/null +++ b/darshan-util/pydarshan/darshan/glob_feature/__init__.py @@ -0,0 +1,5 @@ +""" +Creates a DataFrame with two columns ("glob_filename" and "glob_count") +based on the files read by a .darshan file. 
+""" + diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py index 4a29895fe..fe13e225d 100644 --- a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py +++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py @@ -32,13 +32,19 @@ def generalize_filename_glob(df): print("grouped paths list is", grouped_paths) + new_paths = [] for group in grouped_paths: if len(group) > 1: - common_prefix = os.path.commonprefix(group) - pattern = r"({}.*)\d(.*)".format(common_prefix) - modified_path = re.sub(pattern, r"\1\\d\2", group[0]) - new_paths.append((modified_path, len(group))) + merged_path = "" + max_length = max(len(path) for path in group) + for i in range(max_length): + chars = set(path[i] if len(path) > i else "" for path in group) + if len(chars) == 1: + merged_path += chars.pop() + else: + merged_path += "[.*]" + new_paths.append((merged_path, len(group))) else: new_paths.append((group[0], 1)) @@ -59,23 +65,23 @@ def main(log_path, output_path): df = df[df["filename_glob"].str.contains(r"/.*")] - df.reset_index(drop=True, inplace=True) # Reset the index - - new_paths = generalize_filename_glob(df) + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.reset_index(drop=True) + df = df.reset_index(drop=True) df = df.sort_values(by="glob_count", ascending=False) - style = df.style.background_gradient(axis=0, cmap="viridis") + style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) + style = style.set_properties(subset=["glob_count"], **{"text-align": "right"}) + style.hide(axis="index") style.set_table_styles([ {"selector": "", "props": [("border", "1px solid grey")]}, {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - ]) + {"selector": "th", "props": [("border", "1px solid grey")]}, + + ]) - style = style.hide_index() - html = style.render() + html = style.to_html() with open(output_path, "w") as html_file: html_file.write(html) diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py index 8a779857b..b8072bc9b 100644 --- a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -1,22 +1,21 @@ +import sys import os import darshan from darshan.log_utils import get_log_path import pandas as pd -print(pd.__version__) from pandas.testing import assert_frame_equal import pytest -import re -print(sys.path) # Print sys.path again -import glob_feature +import re +print(sys.path) +from darshan.glob_feature import glob_feature + -print("hello") @pytest.mark.parametrize("log_name, expected_df", [ # grow this with more logs... ("e3sm_io_heatmap_only.darshan", pd.DataFrame({"filename_glob": - # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d" - ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc", - "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], + ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), ]) @@ -30,12 +29,9 @@ def test_glob_tables(tmpdir, log_name, expected_df): print("log path is", log_path) with tmpdir.as_cwd(): cwd = os.getcwd() - # TODO: you shouldn't have a hardcoded HTML filename - # like this... 
-        outfile = os.path.join(cwd, "name_record_glob_hd5f.html")
+        outfile = os.path.join(cwd, "output.html")
         glob_feature.main(log_path, outfile)
         actual_table = pd.read_html(outfile)[0]
-        actual_table.drop("Unnamed: 0", axis=1, inplace=True) # Drop the "Unnamed: 0" column
         print("actual table is", actual_table)
         print("expected_df is", expected_df)
         print("pandas version is", pd.__version__)
         print("log path is", log_path)

From 26c2572f00899ada961f74e8e5bc9da082ec922e Mon Sep 17 00:00:00 2001
From: yariseidenbenz
Date: Mon, 24 Jul 2023 14:39:41 -0600
Subject: [PATCH 8/9] Instead of using difflib to group files together,
 glob_feature.py now uses agglomerative hierarchical clustering for grouping.
 The test_glob_feature.py suite was expanded to cover more log files, and the
 dependencies for these scripts were added to main_ci.yml.

---
 .github/workflows/main_ci.yml                 |   2 +-
 .../darshan/glob_feature/glob_feature.py      | 163 ++++++----
 .../darshan/tests/test_glob_feature.py        | 278 +++++++++++++++++-
 3 files changed, 372 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml
index 4de03b7f6..c2e6ca351 100644
--- a/.github/workflows/main_ci.yml
+++ b/.github/workflows/main_ci.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize "mypy<1.0.0"
+          python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize Jinja2 bz2file pandas scikit-learn numpy "mypy<1.0.0"
       - if: ${{matrix.platform == 'macos-latest'}}
         name: Install MacOS deps
         run: |
diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
index fe13e225d..ff0eacb02 100644
--- a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
+++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
@@ -1,95 +1,136 @@
 # Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file.
-# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts
-# Command to run python glob_feature.py -p path/to/log/file.darshan
-
+# The script utilizes agglomerative hierarchical clustering to effectively group similar file paths together, based on their characteristics.
+# It then displays a dataframe where one file represents a group and uses [.*] to show where filepaths within a group differ
+# The result of this process is an HTML report that provides a comprehensive overview of the grouped paths and their respective counts.
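Before the code itself, a self-contained sketch (toy paths, not from any log) of the pipeline this version adopts: TF-IDF turns each path into a feature vector, AgglomerativeClustering groups the vectors, and the silhouette score picks the cluster count, using the same sqrt(n) upper bound as the script:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import silhouette_score

    paths = [
        "/proj/run/out_h0.nc", "/proj/run/out_h1.nc",
        "/proj/inputs/i_case.nc", "/proj/inputs/j_case.nc",
        "/proj/timing/model_0000", "/proj/timing/model_0001",
    ]
    X = TfidfVectorizer().fit_transform(paths)

    scores = []
    for k in range(2, int(np.sqrt(len(paths))) + 1):
        labels = AgglomerativeClustering(n_clusters=k).fit_predict(X.toarray())
        scores.append(silhouette_score(X, labels))
    optimal_k = int(np.argmax(scores)) + 2      # the candidate range starts at k=2
    clusters = AgglomerativeClustering(n_clusters=optimal_k).fit_predict(X.toarray())
    print(optimal_k, clusters)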
+# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file import argparse import pandas as pd -import difflib import darshan -import re +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.cluster import AgglomerativeClustering +from sklearn.metrics import silhouette_score +import numpy as np import os -def generalize_filename_glob(df): - paths = df["filename_glob"].tolist() - grouped_paths = [] - - for i in range(len(paths)): - if not grouped_paths: - grouped_paths.append((paths[i],)) - else: - is_grouped = False - for j, group in enumerate(grouped_paths): - matcher = difflib.SequenceMatcher(None, paths[i], group[0]) - similarity_ratio = matcher.ratio() - if similarity_ratio >= 0.8: - grouped_paths[j] = group + (paths[i],) - is_grouped = True - break - if not is_grouped: - grouped_paths.append((paths[i],)) - - print("grouped paths list is", grouped_paths) - - - new_paths = [] - for group in grouped_paths: - if len(group) > 1: - merged_path = "" - max_length = max(len(path) for path in group) - for i in range(max_length): - chars = set(path[i] if len(path) > i else "" for path in group) - if len(chars) == 1: - merged_path += chars.pop() - else: - merged_path += "[.*]" - new_paths.append((merged_path, len(group))) - else: - new_paths.append((group[0], 1)) - - new_paths = [path for path in new_paths if path[0]] - - if len(new_paths) > len(df): - new_paths = new_paths[:len(df)] - - print("new paths are", new_paths) - return new_paths - - def main(log_path, output_path): report = darshan.DarshanReport(log_path) - df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) - df = df[df["filename_glob"].str.contains(r"/.*")] - new_paths = generalize_filename_glob(df) - + num_files = len(df) + optimal_k = 2 # Initialize optimal_k to 2 + if num_files == 1: + print("Only one file detected.") + optimal_k = 1 + # Process and save results for the single file + grouped_paths = {0: [df["filename_glob"].iloc[0]]} + new_paths = [(path, 1) for _, paths in grouped_paths.items() for path in paths] + + print("grouped_paths", grouped_paths) + + else: + + # Convert strings to feature vectors + vectorizer = TfidfVectorizer() + X = vectorizer.fit_transform(df["filename_glob"]) + print("X is:", X) + + # Determine the maximum number of clusters dynamically + max_clusters = int(np.sqrt(len(df))) + + silhouette_scores = [] + for k in range(2, max_clusters + 1): + print("max clusters is", max_clusters) + # Perform clustering + clustering = AgglomerativeClustering(n_clusters=k) + clusters = clustering.fit_predict(X.toarray()) + + # Calculate the silhouette score + score = silhouette_score(X, clusters) + print("clusters are:", clusters) + + silhouette_scores.append(score) + + # Find the optimal number of clusters based on the silhouette scores + optimal_k = np.argmax(silhouette_scores) + 2 # Add 2 because range starts from 2 + + print("Optimal number of clusters:", optimal_k) + + # Perform clustering with the optimal number of clusters + clustering = AgglomerativeClustering(n_clusters=optimal_k) + clusters = clustering.fit_predict(X.toarray()) + print("clusters are", clusters) + grouped_paths = {} + for i, cluster_label in enumerate(clusters): + if cluster_label not in grouped_paths: + grouped_paths[cluster_label] = [] + grouped_paths[cluster_label].append(df["filename_glob"].iloc[i]) + + new_paths = [] + for _, group in grouped_paths.items(): + if len(group) > 1: + merged_path = "" + max_length = max(len(path) for path in group) + 
differing_chars_encountered = False + common_extension = None + + + for i in range(max_length): + chars = set(path[i] if len(path) > i else "" for path in group) + if len(chars) == 1: + merged_path += chars.pop() + differing_chars_encountered = True + else: + if differing_chars_encountered: + merged_path += "[.*]" + differing_chars_encountered = False + + # Checks if all paths have the same file extension + extensions = [os.path.splitext(path)[1] for path in group] + common_extension = None + if len(set(extensions)) == 1: + common_extension = extensions[0] + + # Append the common extension if it exists and it's not already in the merged_path + if common_extension and common_extension not in merged_path: + merged_path += common_extension + + new_paths.append((merged_path, len(group))) + else: + new_paths.append((group[0], 1)) + + + # Save the results to an output file df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.reset_index(drop=True) - df = df.sort_values(by="glob_count", ascending=False) + df = df.sort_values(by="glob_count", ascending=False) + print("df is", df) style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) style = style.set_properties(subset=["glob_count"], **{"text-align": "right"}) style.hide(axis="index") style.set_table_styles([ {"selector": "", "props": [("border", "1px solid grey")]}, {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]}, - - ]) + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) html = style.to_html() with open(output_path, "w") as html_file: html_file.write(html) + total_count = df["glob_count"].sum() + print("Total glob_count:", total_count) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") - parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") + parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") args = parser.parse_args() main(log_path=args.log_path, output_path=args.output_path) + + diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py index b8072bc9b..803b5c41c 100644 --- a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -1,3 +1,6 @@ +# Note: Some tests may currently fail, as this script is still under active development. +# The log files here are from the the darshan-logs repository + import sys import os import darshan @@ -9,7 +12,6 @@ print(sys.path) from darshan.glob_feature import glob_feature - @pytest.mark.parametrize("log_name, expected_df", [ # grow this with more logs... 
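# Each expected_df below mirrors the HTML report the script emits for the
# named log; the test regenerates the report, reads it back with
# pd.read_html(), and compares the two frames with assert_frame_equal.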
("e3sm_io_heatmap_only.darshan", @@ -17,14 +19,267 @@ ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), + + ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", + pd.DataFrame({"filename_glob": + ["/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*]", + "/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/[.*].nc", + "/projects/ccsm/inputdata/atm/cam/physprops/[.*].nc", + "/projects/ccsm/inputdata/atm/cam/[.*].nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.[.*]00[.*]", + "/projects/ccsm/inputdata/lnd/clm2/[.*].nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/timing/[.*]i[.*]", + "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*].log.170927-064246", + "/projects/ccsm/inputdata/atm/waccm/[.*].nc", + "/projects/ccsm/inputdata/[.*]n.[.*].[.*]1[.*]0[.*].nc"], #Note: for this set of grouped paths it might be more benifical to display the individual filepaths + "glob_count": [22, 18, 14, 13, 10, 6, 6, 5, 3, 3]})), + + ("darshan-apmpi-2nodes-64mpi.darshan", + pd.DataFrame({"filename_glob": + ["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/[.*]n[.*]"], + "glob_count": [2]})), + + ("mpi-io-test.darshan", + pd.DataFrame({"filename_glob": + ["/global/cscratch1/sd/ssnyder/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("e3sm_io_heatmap_and_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], + "glob_count": [2, 1]})), + + + ("hdf5_diagonal_write_1_byte_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 20, 10]})), + + + ("hdf5_diagonal_write_bytes_range_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 20, 10]})), + + ("hdf5_diagonal_write_half_flush_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 20, 10]})), + + ("hdf5_diagonal_write_half_ranks_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 15, 10]})), + + ("hdf5_file_opens_only.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + 
"/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_[.*].h5"], + "glob_count": [175, 85, 54, 3 ]})), + + + + ("treddy_h5d_no_h5f.darshan", + pd.DataFrame({"filename_glob": + ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_[.*].pyc", + "/home/treddy/rough_work/darshan/issue_709/rank_[.*].h5[.*]"], + "glob_count": [15, 6]})), + + + ("imbalanced-io.darshan", + pd.DataFrame({"filename_glob": + ["/lus/theta-fs0/[.*]", + "//3926523774", + "//1958007717", + "//946917208", + "//3186458368", + "//604249092", + "//2324418701", + "//2142813647", + "//3149983296", + "//1895353925", + "//425392719", + "//1053204904", + "//2446001947"], + "glob_count": [1015, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})), + + + ("shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/shane/software/ior/build/testFile[.*]"], + "glob_count": [2]})), + + ("shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/shane/software/ior/build/testFile[.*]"], + "glob_count": [2]})), + + ("partial_data_stdio.darshan", + pd.DataFrame({"filename_glob": + ["/home/carns/working/dbg/darshan-examples/foo[.*]", + "/home/carns/working/dbg/darshan-examples/test.out"], + "glob_count": [1021, 1]})), + + +# This log file contains files that are only numeric. +# I commented them all out because I am unsure if we even want to include these files and if so do we want to group them together +# ("nonmpi_dxt_anonymized.darshan", +# pd.DataFrame({"filename_glob": +# ["//2585653418", +# "//3392535749", +# "//1750113851", +# "//68752815", +# "//155559223", +# "//1093384412", +# "//3046746762", +# "//2617286315", +# "//826480344", +# "//1571032323", +# "//4226169779", +# "//2418046705", +# "//2010395326", +# "//1767127016", +# "//4075905285", +# "//1067575933", +# "//3616928368", +# "//983841409", +# "//513688402", +# "//4287455549", +# "//2136275236", +# "//3097647757", +# "//236164485", +# "//1437530161", +# "//2689488546", +# "//4192870826", +# "//309267665", +# "//780646879", +# "//499632015", +# "//2507343021", +# "//2695660354", +# "//3091680351", +# "//3164053573", +# "//930552855", +# "//1137823565", +# "//2598810996", +# "//2330561107", +# "//2564488601", +# "//317014058", +# "//3342706664", +# "//2160565458", +# "//2907700500", +# "//2116489843", +# "//135439080", +# "//3098064231", +# "//2967008390", +# "//3067634051", +# "//1734260232", +# "//3120506952", +# "//642754434", +# "//463702723", +# "//1896899807", +# "//4260655471", +# "//827646422", +# "//942747095", +# "//432306240", +# "//583215908", +# "//1673153855", +# "//3192604617", +# "//3225174794", +# "//2990589364", +# "//37712466", +# "//2173526570", +# "//1117575673", +# "//3916290828", +# "//430181069", +# "//3645159644", +# "//529183092", +# "//3225006356", +# "//63288926", +# "//798211322", +# "//2256136699", +# "//4004231621", +# "//2379710227", +# "//3211841059", +# "//3397061505", +# "//416688243", +# "//1456531123"], +# "glob_count": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})), + + + ("partial_data_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/home/carns/working/dbg/darshan-examples/test.out"], + "glob_count": [1]})), + + + ("partial_data_stdio.darshan", + 
pd.DataFrame({"filename_glob": + ["/home/carns/working/dbg/darshan-examples/foo[.*]", + "/home/carns/working/dbg/darshan-examples/test.out"], + "glob_count": [1021 ,1]})), + + + ("mpi-io-test-ppc64-3.0.0.darshan", + pd.DataFrame({"filename_glob": + ["/gpfs/mira-fs0/projects/SSSPPg/snyder/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("mpi-io-test-x86_64-3.0.0.darshan", + pd.DataFrame({"filename_glob": + ["/tmp/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("mpi-io-test-x86_64-3.4.0-pre1.darshan", + pd.DataFrame({"filename_glob": + ["/tmp/test/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + + ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan", + pd.DataFrame({"filename_glob": + ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_[.*]_write_1_bytes"], + "glob_count": [32]})), + + +# This log file contains no data + ("treddy_runtime_heatmap_inactive_ranks.darshan", + pd.DataFrame({"filename_glob": + [], + "glob_count": []})), + + + ("skew-app.darshan", + pd.DataFrame({"filename_glob": + ["/lus/theta-fs0/2934391481"], + "glob_count": [1]})), + + ("skew-autobench-ior.darshan", + pd.DataFrame({"filename_glob": + ["//1968299212", + "//4207382746"], + "glob_count": [1, 1]})), + + + ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/laytonjb/PROJECTS/DARSHAN/TEST/jeff.txt"], + "glob_count": [1]})), ]) + def test_glob_tables(tmpdir, log_name, expected_df): print("Current working directory:", os.getcwd()) - - # test the glob table HTML outputs for various - # log files in the logs repo (and new log files - # that you creatively design yourself) log_path = get_log_path(log_name) print("log path is", log_path) with tmpdir.as_cwd(): @@ -32,14 +287,19 @@ def test_glob_tables(tmpdir, log_name, expected_df): outfile = os.path.join(cwd, "output.html") glob_feature.main(log_path, outfile) actual_table = pd.read_html(outfile)[0] - print("actual table is", actual_table) - print("expected_df is", expected_df) - print("pandas version is", pd.__version__) print("log path is", log_path) + print("Shape of actual table:", actual_table.shape) + print("Shape of expected_df:", expected_df.shape) + + # Print the contents of the DataFrames + print("Actual DataFrame:") + print(actual_table) + print("Expected DataFrame:") + print(expected_df) + # Compare the two DataFrames diff = actual_table['filename_glob'].compare(expected_df['filename_glob']) # Print the differences print(diff) assert_frame_equal(actual_table, expected_df) - From cd9d522a11ee80635868359292ca5134c52e6540 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 8 Aug 2023 17:09:37 -0600 Subject: [PATCH 9/9] The glob_feature.py now groups files based on agglomerative hierarchical clustering and common file extension. It also has a verbose option (-v) which displays all the files within the respective groups. 
I added some modifications for the ideal values in the test_glob_feature.py script --- .../darshan/glob_feature/glob_feature.py | 123 +++++++-- .../darshan/tests/test_glob_feature.py | 247 +++++------------- 2 files changed, 164 insertions(+), 206 deletions(-) diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py index ff0eacb02..c31e7f760 100644 --- a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py +++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py @@ -3,6 +3,8 @@ # It then displays a dataframe where one file represents a group and uses [.*] to show where filepaths within a group differ # The result of this process is an HTML report that provides a comprehensive overview of the grouped paths and their respective counts. # Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file +# Command to run with verbose: verbose will display all the files under the representing file +# python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file -v import argparse import pandas as pd @@ -14,7 +16,7 @@ import os -def main(log_path, output_path): +def main(log_path, output_path, verbose): report = darshan.DarshanReport(log_path) df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) @@ -38,7 +40,7 @@ def main(log_path, output_path): X = vectorizer.fit_transform(df["filename_glob"]) print("X is:", X) - # Determine the maximum number of clusters dynamically + # Determine the maximum number of clusters dynamically max_clusters = int(np.sqrt(len(df))) silhouette_scores = [] @@ -51,9 +53,10 @@ def main(log_path, output_path): # Calculate the silhouette score score = silhouette_score(X, clusters) print("clusters are:", clusters) - silhouette_scores.append(score) + + # Find the optimal number of clusters based on the silhouette scores optimal_k = np.argmax(silhouette_scores) + 2 # Add 2 because range starts from 2 @@ -69,6 +72,16 @@ def main(log_path, output_path): grouped_paths[cluster_label] = [] grouped_paths[cluster_label].append(df["filename_glob"].iloc[i]) + # Group paths based on file extensions + grouped_by_extension = {} + for cluster_label, paths in grouped_paths.items(): + grouped_by_extension[cluster_label] = {} + for path in paths: + file_extension = os.path.splitext(path)[1] + if file_extension not in grouped_by_extension[cluster_label]: + grouped_by_extension[cluster_label][file_extension] = [] + grouped_by_extension[cluster_label][file_extension].append(path) + new_paths = [] for _, group in grouped_paths.items(): if len(group) > 1: @@ -77,7 +90,6 @@ def main(log_path, output_path): differing_chars_encountered = False common_extension = None - for i in range(max_length): chars = set(path[i] if len(path) > i else "" for path in group) if len(chars) == 1: @@ -85,15 +97,17 @@ def main(log_path, output_path): differing_chars_encountered = True else: if differing_chars_encountered: - merged_path += "[.*]" + merged_path += "(.*)" differing_chars_encountered = False + break - # Checks if all paths have the same file extension + # Check if all paths have the same file extension extensions = [os.path.splitext(path)[1] for path in group] common_extension = None if len(set(extensions)) == 1: common_extension = extensions[0] + # Append the common extension if it exists and it's not already in the merged_path if common_extension and common_extension not in merged_path: merged_path += common_extension @@ -103,34 +117,95 
@@ def main(log_path, output_path): new_paths.append((group[0], 1)) - # Save the results to an output file - df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.sort_values(by="glob_count", ascending=False) - print("df is", df) - style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) - style = style.set_properties(subset=["glob_count"], **{"text-align": "right"}) - style.hide(axis="index") - style.set_table_styles([ - {"selector": "", "props": [("border", "1px solid grey")]}, - {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - ]) + if verbose: + new_paths_verbose = [] + + # Sort grouped_paths based on the size of each group (in descending order) + sorted_groups = sorted(grouped_paths.items(), key=lambda x: len(x[1]), reverse=True) + + for cluster_label, paths in sorted_groups: + + if len(paths) > 1: + merged_path = "" + max_length = max(len(path) for path in paths) + differing_chars_encountered = False + common_extension = None + + + for i in range(max_length): + chars = set(path[i] if len(path) > i else "" for path in paths) + if len(chars) == 1: + merged_path += chars.pop() + differing_chars_encountered = True + else: + if differing_chars_encountered: + merged_path += "(.*)" + differing_chars_encountered = False + break + + # Check if all paths have the same file extension + extensions = [os.path.splitext(path)[1] for path in paths] + common_extension = None + if len(set(extensions)) == 1: + common_extension = extensions[0] + + # Append the merged path if it's not already in the new_paths_verbose list + if merged_path and (merged_path, len(paths)) not in new_paths_verbose: + new_paths_verbose.append((merged_path, len(paths))) + + # Append the individual paths beneath the merged path + new_paths_verbose.extend([(f" {path}", 1) for path in paths]) + else: + new_paths_verbose.append((group[0], 1)) + + + df_verbose = pd.DataFrame(new_paths_verbose, columns=["filename_glob", "glob_count"]) + print(df_verbose.to_string(index=False)) + + + # Display or save the DataFrame using pandas styler + if verbose: + df_verbose = pd.DataFrame(new_paths_verbose, columns=["filename_glob", "glob_count"]) + styled_html = df_verbose.style.background_gradient(axis=0, cmap="viridis", gmap=df_verbose["glob_count"]) + styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"}) + styled_html.hide(axis="index") + styled_html.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + html = styled_html.to_html() + + with open(output_path, "w") as html_file: + html_file.write(html) + + else: + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) + df = df.sort_values(by="glob_count", ascending=False) - html = style.to_html() + styled_html = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) + styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"}) + styled_html.hide(axis="index") + styled_html.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + html = styled_html.to_html() - with open(output_path, "w") as html_file: - html_file.write(html) + with 
open(output_path, "w") as html_file: + html_file.write(html) - total_count = df["glob_count"].sum() - print("Total glob_count:", total_count) + print("Styled results saved to:", output_path) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") + parser.add_argument('-v', '--verbose', action='store_true', help="Display verbose output") args = parser.parse_args() - main(log_path=args.log_path, output_path=args.output_path) + main(log_path=args.log_path, output_path=args.output_path, verbose=args.verbose) diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py index 803b5c41c..4bd44e00f 100644 --- a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -1,6 +1,5 @@ # Note: Some tests may currently fail, as this script is still under active development. # The log files here are from the the darshan-logs repository - import sys import os import darshan @@ -13,223 +12,133 @@ from darshan.glob_feature import glob_feature @pytest.mark.parametrize("log_name, expected_df", [ - # grow this with more logs... ("e3sm_io_heatmap_only.darshan", pd.DataFrame({"filename_glob": - ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc", "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), + ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", pd.DataFrame({"filename_glob": - ["/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*]", - "/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/[.*].nc", - "/projects/ccsm/inputdata/atm/cam/physprops/[.*].nc", - "/projects/ccsm/inputdata/atm/cam/[.*].nc", - "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.[.*]00[.*]", - "/projects/ccsm/inputdata/lnd/clm2/[.*].nc", - "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/timing/[.*]i[.*]", - "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*].log.170927-064246", - "/projects/ccsm/inputdata/atm/waccm/[.*].nc", - "/projects/ccsm/inputdata/[.*]n.[.*].[.*]1[.*]0[.*].nc"], #Note: for this set of grouped paths it might be more benifical to display the individual filepaths - "glob_count": [22, 18, 14, 13, 10, 6, 6, 5, 3, 3]})), + ["/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/(.*).nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)", + "/projects/ccsm/inputdata/atm/cam/physprops/(.*).nc", + "/projects/ccsm/inputdata/atm/cam/(.*).nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).nml", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.c(.*).nc", + "/projects/ccsm/inputdata/lnd/clm2/(.*).nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)", + "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).170927-064246", + "/projects/ccsm/inputdata/atm/waccm/(.*).nc", + "/projects/ccsm/inputdata/(.*).nc"], + "glob_count": [18, 14, 14, 13, 9, 9, 6, 6, 5, 3, 3]})), + ("darshan-apmpi-2nodes-64mpi.darshan", pd.DataFrame({"filename_glob": - 
["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/[.*]n[.*]"], + ["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/(.*)"], "glob_count": [2]})), + ("mpi-io-test.darshan", pd.DataFrame({"filename_glob": ["/global/cscratch1/sd/ssnyder/tmp/mpi-io-test.tmp.dat"], "glob_count": [1]})), + ("e3sm_io_heatmap_and_dxt.darshan", pd.DataFrame({"filename_glob": - ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc", "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), ("hdf5_diagonal_write_1_byte_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 20, 10]})), - + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_diagonal_write_bytes_range_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 20, 10]})), + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_diagonal_write_half_flush_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 20, 10]})), + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_diagonal_write_half_ranks_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 15, 10]})), + 
["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_file_opens_only.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_[.*].h5"], - "glob_count": [175, 85, 54, 3 ]})), - + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy/(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/__pycache__/(.*).pyc", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/lib-dynload/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/json/(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).pyc", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/importlib/(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/ctypes", + "/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_(.*).h5"], + "glob_count": [140, 62, 47, 37, 22, 17, 15, 8, 6, 6, 4, 4, 3]})), ("treddy_h5d_no_h5f.darshan", pd.DataFrame({"filename_glob": - ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_[.*].pyc", - "/home/treddy/rough_work/darshan/issue_709/rank_[.*].h5[.*]"], + ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_(.*).pyc", + "/home/treddy/rough_work/darshan/issue_709/rank_(.*)"], "glob_count": [15, 6]})), - ("imbalanced-io.darshan", - pd.DataFrame({"filename_glob": - ["/lus/theta-fs0/[.*]", - "//3926523774", - "//1958007717", - "//946917208", - "//3186458368", - "//604249092", - "//2324418701", - "//2142813647", - "//3149983296", - "//1895353925", - "//425392719", - "//1053204904", - "//2446001947"], - "glob_count": [1015, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})), - - ("shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan", pd.DataFrame({"filename_glob": - ["/home/shane/software/ior/build/testFile[.*]"], + ["/home/shane/software/ior/build/testFile(.*)"], "glob_count": [2]})), ("shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan", pd.DataFrame({"filename_glob": - ["/home/shane/software/ior/build/testFile[.*]"], + ["/home/shane/software/ior/build/testFile(.*)"], "glob_count": [2]})), + ("partial_data_stdio.darshan", pd.DataFrame({"filename_glob": - ["/home/carns/working/dbg/darshan-examples/foo[.*]", + ["/home/carns/working/dbg/darshan-examples/foo(.*)", "/home/carns/working/dbg/darshan-examples/test.out"], "glob_count": [1021, 1]})), -# This log file contains 
-# I commented them all out because I am unsure if we even want to include these files and if so do we want to group them together
-# ("nonmpi_dxt_anonymized.darshan",
-# pd.DataFrame({"filename_glob":
-# ["//2585653418",
-# "//3392535749",
-# "//1750113851",
-# "//68752815",
-# "//155559223",
-# "//1093384412",
-# "//3046746762",
-# "//2617286315",
-# "//826480344",
-# "//1571032323",
-# "//4226169779",
-# "//2418046705",
-# "//2010395326",
-# "//1767127016",
-# "//4075905285",
-# "//1067575933",
-# "//3616928368",
-# "//983841409",
-# "//513688402",
-# "//4287455549",
-# "//2136275236",
-# "//3097647757",
-# "//236164485",
-# "//1437530161",
-# "//2689488546",
-# "//4192870826",
-# "//309267665",
-# "//780646879",
-# "//499632015",
-# "//2507343021",
-# "//2695660354",
-# "//3091680351",
-# "//3164053573",
-# "//930552855",
-# "//1137823565",
-# "//2598810996",
-# "//2330561107",
-# "//2564488601",
-# "//317014058",
-# "//3342706664",
-# "//2160565458",
-# "//2907700500",
-# "//2116489843",
-# "//135439080",
-# "//3098064231",
-# "//2967008390",
-# "//3067634051",
-# "//1734260232",
-# "//3120506952",
-# "//642754434",
-# "//463702723",
-# "//1896899807",
-# "//4260655471",
-# "//827646422",
-# "//942747095",
-# "//432306240",
-# "//583215908",
-# "//1673153855",
-# "//3192604617",
-# "//3225174794",
-# "//2990589364",
-# "//37712466",
-# "//2173526570",
-# "//1117575673",
-# "//3916290828",
-# "//430181069",
-# "//3645159644",
-# "//529183092",
-# "//3225006356",
-# "//63288926",
-# "//798211322",
-# "//2256136699",
-# "//4004231621",
-# "//2379710227",
-# "//3211841059",
-# "//3397061505",
-# "//416688243",
-# "//1456531123"],
-# "glob_count": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})),
-
-
     ("partial_data_dxt.darshan",
     pd.DataFrame({"filename_glob":
     ["/home/carns/working/dbg/darshan-examples/test.out"],
     "glob_count": [1]})),
-    ("partial_data_stdio.darshan",
-    pd.DataFrame({"filename_glob":
-    ["/home/carns/working/dbg/darshan-examples/foo[.*]",
-    "/home/carns/working/dbg/darshan-examples/test.out"],
-    "glob_count": [1021 ,1]})),
-
-
     ("mpi-io-test-ppc64-3.0.0.darshan",
     pd.DataFrame({"filename_glob":
     ["/gpfs/mira-fs0/projects/SSSPPg/snyder/tmp/mpi-io-test.tmp.dat"],
@@ -248,29 +157,10 @@
     ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
     pd.DataFrame({"filename_glob":
-    ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_[.*]_write_1_bytes"],
+    ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_(.*)_write_1_bytes"],
     "glob_count": [32]})),
-# This log file contains no data
-    ("treddy_runtime_heatmap_inactive_ranks.darshan",
-    pd.DataFrame({"filename_glob":
-    [],
-    "glob_count": []})),
-
-
-    ("skew-app.darshan",
-    pd.DataFrame({"filename_glob":
-    ["/lus/theta-fs0/2934391481"],
-    "glob_count": [1]})),
-
-    ("skew-autobench-ior.darshan",
-    pd.DataFrame({"filename_glob":
-    ["//1968299212",
-    "//4207382746"],
-    "glob_count": [1, 1]})),
-
-
     ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     pd.DataFrame({"filename_glob":
     ["/home/laytonjb/PROJECTS/DARSHAN/TEST/jeff.txt"],
     "glob_count": [1]})),
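The assertions below round-trip the report through pandas: the styled table is written to HTML, then read back and compared. A minimal sketch of that round trip (assumed behavior; the one-row table is made up):

    import pandas as pd
    from pandas.testing import assert_frame_equal

    expected = pd.DataFrame({"filename_glob": ["/tmp/foo(.*)"],
                             "glob_count": [2]})
    styler = expected.style
    styler.hide(axis="index")  # no index column, so only the two data columns appear
    with open("roundtrip.html", "w") as f:
        f.write(styler.to_html())

    actual = pd.read_html("roundtrip.html")[0]  # first <table> in the file
    assert_frame_equal(actual, expected)

Because the index is hidden when the report is written, read_html reconstructs the frame with a fresh RangeIndex and an integer dtype for glob_count, which is what allows the direct assert_frame_equal comparison.
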
@@ -285,11 +175,9 @@ def test_glob_tables(tmpdir, log_name, expected_df):
     with tmpdir.as_cwd():
         cwd = os.getcwd()
         outfile = os.path.join(cwd, "output.html")
-        glob_feature.main(log_path, outfile)
+        glob_feature.main(log_path, outfile, verbose=False)
         actual_table = pd.read_html(outfile)[0]
         print("log path is", log_path)
-        print("Shape of actual table:", actual_table.shape)
-        print("Shape of expected_df:", expected_df.shape)
 
         # Print the contents of the DataFrames
         print("Actual DataFrame:")
@@ -297,9 +185,4 @@ def test_glob_tables(tmpdir, log_name, expected_df):
         print("Expected DataFrame:")
         print(expected_df)
 
-        # Compare the two DataFrames
-        diff = actual_table['filename_glob'].compare(expected_df['filename_glob'])
-        # Print the differences
-        print(diff)
         assert_frame_equal(actual_table, expected_df)
-
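With this patch applied, the feature can be exercised either from the command line (as documented in the script header) or programmatically, mirroring the call the test makes; the .darshan path below is a placeholder:

    from darshan.glob_feature import glob_feature

    glob_feature.main(log_path="path/to/log/file.darshan",
                      output_path="report.html",
                      verbose=False)

Passing verbose=True (or -v on the command line) additionally prints each group's individual member paths beneath its merged glob.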