From dce2c0aa9a56d9f9618567c7df162c702c98067d Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 23 May 2023 12:17:45 -0600 Subject: [PATCH 1/9] Generate HTML report summarizing file usage from .darshan log: DataFrame with 'glob_filename' and 'glob_count' columns. --- git_project/glob_feature/glob_feature.py | 66 ++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 git_project/glob_feature/glob_feature.py diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py new file mode 100644 index 000000000..d84cb65a5 --- /dev/null +++ b/git_project/glob_feature/glob_feature.py @@ -0,0 +1,66 @@ +# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. +# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts +# Command to run python glob_feature.py -p path/to/log/file.darshan + + +import argparse +import pandas as pd +import difflib +import darshan + + +def path_grouper(): + matcher = difflib.SequenceMatcher() + def group_paths(paths): + if not matcher.a: + matcher.set_seq1(paths) + return paths + else: + matcher.set_seq2(paths) + matchings = matcher.get_matching_blocks() + if any(size > 25 for _, _, size in matchings): # change size to bigger number for more precise paths + return matcher.a + else: + matcher.set_seq1(paths) + return paths + + return group_paths + + +def regex_df_condenser(df, paths): + path_grouper_func = path_grouper() + df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) + df = df.groupby("filename_glob").size().reset_index(name="glob_count") + + return df + + +def main(log_path): + report = darshan.DarshanReport(log_path) + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + df = df[df["filename_glob"].str.contains(r"/.*")] + df["glob_count"] = 1 + df = regex_df_condenser(df, df["filename_glob"]) + df.sort_values(by="glob_count", inplace=True, ascending=False) + + + style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) + style.hide(axis="index") + style.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + html = style.to_html() + +# can change name of the output html report here + with open("name_record_table.html", "w") as html_file: + html_file.write(html) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--log-path', type=str) + args = parser.parse_args() + main(log_path=args.log_path) + From 8277396081f43a2025219460fd48dff9e04ddd2c Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 30 May 2023 15:37:38 -0600 Subject: [PATCH 2/9] updated glob_feature.py which creates dataframe of glob_filename and glob_count --- git_project/glob_feature/glob_feature.py | 65 +++++++++++++++++++----- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py index d84cb65a5..ac81c3218 100644 --- a/git_project/glob_feature/glob_feature.py +++ b/git_project/glob_feature/glob_feature.py @@ -7,9 +7,11 @@ import pandas as pd import difflib import darshan +import re +import os -def path_grouper(): +def make_path_grouper(): matcher = difflib.SequenceMatcher() def group_paths(paths): if not 
matcher.a: @@ -17,32 +19,68 @@ def group_paths(paths): return paths else: matcher.set_seq2(paths) - matchings = matcher.get_matching_blocks() - if any(size > 25 for _, _, size in matchings): # change size to bigger number for more precise paths + similarity_ratio = matcher.ratio() + if similarity_ratio >= 0.8: return matcher.a else: matcher.set_seq1(paths) return paths - return group_paths def regex_df_condenser(df, paths): - path_grouper_func = path_grouper() + path_grouper_func = make_path_grouper() df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) + print("Paths after grouping:") + print(df["filename_glob"]) + df = df.groupby("filename_glob").size().reset_index(name="glob_count") + df = df.sort_values(by="glob_count", ascending=False) + + print("Paths after grouping and counting:") + print(df) + + + def find_common_prefix(paths): + # Sort the paths in lexicographical order + sorted_paths = sorted(paths) + + # Find the common prefix + common_prefix = os.path.commonprefix(sorted_paths) + + # Trim the common prefix to the last path separator + last_separator = common_prefix.rfind(os.path.sep) + common_prefix = common_prefix[:last_separator+1] if last_separator >= 0 else common_prefix + + return common_prefix + + + for group in df["filename_glob"].unique(): + group_df = df[df["filename_glob"] == group] + common_path = find_common_prefix(group_df["filename_glob"]) + df.loc[df["filename_glob"] == group, "filename_glob"] = common_path + + print("Paths after modifying filename_glob:") + print(df) + + df["regex"] = df.apply(lambda row: re.escape(row["filename_glob"]) + r".*", axis=1) + print("Paths after applying regex:") + print(df) + return df -def main(log_path): + +def main(log_path, output_path): report = darshan.DarshanReport(log_path) + + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + df = df[df["filename_glob"].str.contains(r"/.*")] df["glob_count"] = 1 df = regex_df_condenser(df, df["filename_glob"]) - df.sort_values(by="glob_count", inplace=True, ascending=False) - style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) style.hide(axis="index") @@ -53,14 +91,15 @@ def main(log_path): ]) html = style.to_html() -# can change name of the output html report here - with open("name_record_table.html", "w") as html_file: + # can change name of the output html report here + with open("name_record_glob.html", "w") as html_file: html_file.write(html) - +# go back to hdf5_diagonal dxt if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-p', '--log-path', type=str) + parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") + parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") args = parser.parse_args() - main(log_path=args.log_path) + main(log_path=args.log_path , output_path=args.output_path) From 397780a7254307c09e3a161ac91d5854abe9e15e Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Wed, 31 May 2023 08:59:44 -0600 Subject: [PATCH 3/9] WIP: this script creates a condensed dataframe of glob_filename and glob_count from a log .darshan file --- git_project/glob_feature/glob_feature.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py index ac81c3218..9febba5dd 100644 --- a/git_project/glob_feature/glob_feature.py +++ b/git_project/glob_feature/glob_feature.py @@ -30,17 +30,13 @@ def 
group_paths(paths): def regex_df_condenser(df, paths): path_grouper_func = make_path_grouper() + df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) - print("Paths after grouping:") - print(df["filename_glob"]) df = df.groupby("filename_glob").size().reset_index(name="glob_count") df = df.sort_values(by="glob_count", ascending=False) - print("Paths after grouping and counting:") - print(df) - def find_common_prefix(paths): # Sort the paths in lexicographical order @@ -61,12 +57,8 @@ def find_common_prefix(paths): common_path = find_common_prefix(group_df["filename_glob"]) df.loc[df["filename_glob"] == group, "filename_glob"] = common_path - print("Paths after modifying filename_glob:") - print(df) - df["regex"] = df.apply(lambda row: re.escape(row["filename_glob"]) + r".*", axis=1) - print("Paths after applying regex:") - print(df) + df["filename_glob"] = df.apply(lambda row: (row["filename_glob"]) + r".*", axis=1) return df @@ -92,10 +84,10 @@ def main(log_path, output_path): html = style.to_html() # can change name of the output html report here - with open("name_record_glob.html", "w") as html_file: + with open("name_record_glob_hd5f.html", "w") as html_file: html_file.write(html) -# go back to hdf5_diagonal dxt + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") From edc12c383ec01cfb9d68ee4040089ee3e4a8952c Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 6 Jun 2023 10:27:54 -0600 Subject: [PATCH 4/9] Refactored glob_feature.py script and improved data frame creation for e3sm_io_heatmap_only.darshan log file. Also, relocated the script to a more suitable spot among other Python files in the project. --- .../darshan/glob_feature/glob_feature.py | 89 +++++++++++++++++ git_project/glob_feature/glob_feature.py | 97 ------------------- 2 files changed, 89 insertions(+), 97 deletions(-) create mode 100644 darshan-util/pydarshan/darshan/glob_feature/glob_feature.py delete mode 100644 git_project/glob_feature/glob_feature.py diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py new file mode 100644 index 000000000..4a29895fe --- /dev/null +++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py @@ -0,0 +1,89 @@ +# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. 
+# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts +# Command to run python glob_feature.py -p path/to/log/file.darshan + + +import argparse +import pandas as pd +import difflib +import darshan +import re +import os + + +def generalize_filename_glob(df): + paths = df["filename_glob"].tolist() + grouped_paths = [] + + for i in range(len(paths)): + if not grouped_paths: + grouped_paths.append((paths[i],)) + else: + is_grouped = False + for j, group in enumerate(grouped_paths): + matcher = difflib.SequenceMatcher(None, paths[i], group[0]) + similarity_ratio = matcher.ratio() + if similarity_ratio >= 0.8: + grouped_paths[j] = group + (paths[i],) + is_grouped = True + break + if not is_grouped: + grouped_paths.append((paths[i],)) + + print("grouped paths list is", grouped_paths) + + new_paths = [] + for group in grouped_paths: + if len(group) > 1: + common_prefix = os.path.commonprefix(group) + pattern = r"({}.*)\d(.*)".format(common_prefix) + modified_path = re.sub(pattern, r"\1\\d\2", group[0]) + new_paths.append((modified_path, len(group))) + else: + new_paths.append((group[0], 1)) + + new_paths = [path for path in new_paths if path[0]] + + if len(new_paths) > len(df): + new_paths = new_paths[:len(df)] + + print("new paths are", new_paths) + return new_paths + + +def main(log_path, output_path): + + report = darshan.DarshanReport(log_path) + + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + + df = df[df["filename_glob"].str.contains(r"/.*")] + + df.reset_index(drop=True, inplace=True) # Reset the index + + + new_paths = generalize_filename_glob(df) + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) + df = df.reset_index(drop=True) + df = df.sort_values(by="glob_count", ascending=False) + + style = df.style.background_gradient(axis=0, cmap="viridis") + style.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + + style = style.hide_index() + html = style.render() + + with open(output_path, "w") as html_file: + html_file.write(html) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") + parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") + args = parser.parse_args() + main(log_path=args.log_path, output_path=args.output_path) diff --git a/git_project/glob_feature/glob_feature.py b/git_project/glob_feature/glob_feature.py deleted file mode 100644 index 9febba5dd..000000000 --- a/git_project/glob_feature/glob_feature.py +++ /dev/null @@ -1,97 +0,0 @@ -# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file. 
-# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts -# Command to run python glob_feature.py -p path/to/log/file.darshan - - -import argparse -import pandas as pd -import difflib -import darshan -import re -import os - - -def make_path_grouper(): - matcher = difflib.SequenceMatcher() - def group_paths(paths): - if not matcher.a: - matcher.set_seq1(paths) - return paths - else: - matcher.set_seq2(paths) - similarity_ratio = matcher.ratio() - if similarity_ratio >= 0.8: - return matcher.a - else: - matcher.set_seq1(paths) - return paths - return group_paths - - -def regex_df_condenser(df, paths): - path_grouper_func = make_path_grouper() - - df["filename_glob"] = df["filename_glob"].apply(path_grouper_func) - - df = df.groupby("filename_glob").size().reset_index(name="glob_count") - - df = df.sort_values(by="glob_count", ascending=False) - - - def find_common_prefix(paths): - # Sort the paths in lexicographical order - sorted_paths = sorted(paths) - - # Find the common prefix - common_prefix = os.path.commonprefix(sorted_paths) - - # Trim the common prefix to the last path separator - last_separator = common_prefix.rfind(os.path.sep) - common_prefix = common_prefix[:last_separator+1] if last_separator >= 0 else common_prefix - - return common_prefix - - - for group in df["filename_glob"].unique(): - group_df = df[df["filename_glob"] == group] - common_path = find_common_prefix(group_df["filename_glob"]) - df.loc[df["filename_glob"] == group, "filename_glob"] = common_path - - - df["filename_glob"] = df.apply(lambda row: (row["filename_glob"]) + r".*", axis=1) - - return df - - - -def main(log_path, output_path): - report = darshan.DarshanReport(log_path) - - - df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) - - df = df[df["filename_glob"].str.contains(r"/.*")] - df["glob_count"] = 1 - df = regex_df_condenser(df, df["filename_glob"]) - - style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) - style.hide(axis="index") - style.set_table_styles([ - {"selector": "", "props": [("border", "1px solid grey")]}, - {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - ]) - html = style.to_html() - - # can change name of the output html report here - with open("name_record_glob_hd5f.html", "w") as html_file: - html_file.write(html) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") - parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") - args = parser.parse_args() - main(log_path=args.log_path , output_path=args.output_path) - From 9b757d51f4ddc65feb0ae80f4cde1c94489d73e8 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Wed, 14 Jun 2023 15:01:29 -0600 Subject: [PATCH 5/9] Rearranged glob_feature.py and added test_glob_feature.py to the test directory. 
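For review purposes, here is a standalone sketch (invented paths, not taken from any log) of the two steps generalize_filename_glob chains together: difflib similarity grouping, then regex generalization of a group. Note that common_prefix is interpolated into the pattern without re.escape(), so regex metacharacters such as "." in real paths act as wildcards:

    import difflib
    import os
    import re

    paths = ["/e3sm/can_I_out_h0.nc", "/e3sm/can_I_out_h1.nc"]
    # step 1: the pairwise similarity ratio decides membership in a group
    ratio = difflib.SequenceMatcher(None, paths[0], paths[1]).ratio()
    print(ratio >= 0.8)                     # True -> the two paths are grouped

    # step 2: the group's first member is generalized around its digits
    common_prefix = os.path.commonprefix(paths)
    pattern = r"({}.*)\d(.*)".format(common_prefix)
    print(re.sub(pattern, r"\1\\d\2", paths[0]))
    # -> /e3sm/can_I_out_h\d.nc  (the greedy .* means only the last digit
    #    in the path is rewritten, not every digit)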
--- .../darshan/tests/test_glob_feature.py | 49 ++++++++++ darshan-util/pydarshan/glob_feature.py | 98 +++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 darshan-util/pydarshan/darshan/tests/test_glob_feature.py create mode 100644 darshan-util/pydarshan/glob_feature.py diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py new file mode 100644 index 000000000..8a779857b --- /dev/null +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -0,0 +1,49 @@ +import os +import darshan +from darshan.log_utils import get_log_path +import pandas as pd +print(pd.__version__) +from pandas.testing import assert_frame_equal +import pytest +import re +print(sys.path) # Print sys.path again +import glob_feature + +print("hello") +@pytest.mark.parametrize("log_name, expected_df", [ + # grow this with more logs... + ("e3sm_io_heatmap_only.darshan", + pd.DataFrame({"filename_glob": + # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d" + ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc", + "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], + "glob_count": [2, 1]})), +]) + +def test_glob_tables(tmpdir, log_name, expected_df): + print("Current working directory:", os.getcwd()) + + # test the glob table HTML outputs for various + # log files in the logs repo (and new log files + # that you creatively design yourself) + log_path = get_log_path(log_name) + print("log path is", log_path) + with tmpdir.as_cwd(): + cwd = os.getcwd() + # TODO: you shouldn't have a hardcoded HTML filename + # like this... + outfile = os.path.join(cwd, "name_record_glob_hd5f.html") + glob_feature.main(log_path, outfile) + actual_table = pd.read_html(outfile)[0] + actual_table.drop("Unnamed: 0", axis=1, inplace=True) # Drop the "Unnamed: 0" column + print("actual table is", actual_table) + print("expected_df is", expected_df) + print("pandas version is", pd.__version__) + print("log path is", log_path) + # Compare the two DataFrames + diff = actual_table['filename_glob'].compare(expected_df['filename_glob']) + # Print the differences + print(diff) + assert_frame_equal(actual_table, expected_df) + + diff --git a/darshan-util/pydarshan/glob_feature.py b/darshan-util/pydarshan/glob_feature.py new file mode 100644 index 000000000..8b1cb1455 --- /dev/null +++ b/darshan-util/pydarshan/glob_feature.py @@ -0,0 +1,98 @@ +# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read b$ +# It uses sequence matching and grouping techniques to group similar file paths together and genera$ +# Command to run python glob_feature.py -p path/to/log/file.darshan + + +import argparse +import pandas as pd +import difflib +import darshan +import re +import os + + +def generalize_filename_glob(df): + paths = df["filename_glob"].tolist() + grouped_paths = [] + + for i in range(len(paths)): + if not grouped_paths: + grouped_paths.append((paths[i],)) + else: + is_grouped = False + for j, group in enumerate(grouped_paths): + matcher = difflib.SequenceMatcher(None, paths[i], group[0]) + similarity_ratio = matcher.ratio() + if similarity_ratio >= 0.8: + grouped_paths[j] = group + (paths[i],) + is_grouped = True + break + if not is_grouped: + grouped_paths.append((paths[i],)) + + print("grouped paths list is", grouped_paths) + + new_paths = [] + for group in grouped_paths: + if len(group) > 1: + common_prefix = os.path.commonprefix(group) + pattern = r"({}.*)\d(.*)".format(common_prefix) + 
modified_path = re.sub(pattern, r"\1\\d\2", group[0]) + new_paths.append((modified_path, len(group))) + else: + new_paths.append((group[0], 1)) + + new_paths = [path for path in new_paths if path[0]] + + if len(new_paths) > len(df): + new_paths = new_paths[:len(df)] + + print("new paths are", new_paths) + return new_paths + + + + +def main(log_path, output_path): + + report = darshan.DarshanReport(log_path) + + df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) + + df = df[df["filename_glob"].str.contains(r"/.*")] + + df.reset_index(drop=True, inplace=True) # Reset the index + + + new_paths = generalize_filename_glob(df) + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) + df = df.reset_index(drop=True) + df = df.sort_values(by="glob_count", ascending=False) + + + style = df.style.background_gradient(axis=0, cmap="viridis") + style.set_properties(subset=["glob_count"], **{"text-align": "right"}) + + style.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + + ]) + + # html = style.render() # use this when running python glob_feature.py -p /path/to/logfile + + html = style.to_html() #use when running pytest + + + with open(output_path, "w") as html_file: + html_file.write(html) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") + parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") + args = parser.parse_args() + main(log_path=args.log_path, output_path=args.output_path) + From a5df39450827d56cb49100f80743efcae16f4934 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Thu, 15 Jun 2023 15:34:14 -0600 Subject: [PATCH 6/9] Remove glob_feature.py from wrong location --- darshan-util/pydarshan/glob_feature.py | 98 -------------------------- 1 file changed, 98 deletions(-) delete mode 100644 darshan-util/pydarshan/glob_feature.py diff --git a/darshan-util/pydarshan/glob_feature.py b/darshan-util/pydarshan/glob_feature.py deleted file mode 100644 index 8b1cb1455..000000000 --- a/darshan-util/pydarshan/glob_feature.py +++ /dev/null @@ -1,98 +0,0 @@ -# Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read b$ -# It uses sequence matching and grouping techniques to group similar file paths together and genera$ -# Command to run python glob_feature.py -p path/to/log/file.darshan - - -import argparse -import pandas as pd -import difflib -import darshan -import re -import os - - -def generalize_filename_glob(df): - paths = df["filename_glob"].tolist() - grouped_paths = [] - - for i in range(len(paths)): - if not grouped_paths: - grouped_paths.append((paths[i],)) - else: - is_grouped = False - for j, group in enumerate(grouped_paths): - matcher = difflib.SequenceMatcher(None, paths[i], group[0]) - similarity_ratio = matcher.ratio() - if similarity_ratio >= 0.8: - grouped_paths[j] = group + (paths[i],) - is_grouped = True - break - if not is_grouped: - grouped_paths.append((paths[i],)) - - print("grouped paths list is", grouped_paths) - - new_paths = [] - for group in grouped_paths: - if len(group) > 1: - common_prefix = os.path.commonprefix(group) - pattern = r"({}.*)\d(.*)".format(common_prefix) - modified_path = re.sub(pattern, r"\1\\d\2", group[0]) - new_paths.append((modified_path, len(group))) - else: - 
new_paths.append((group[0], 1)) - - new_paths = [path for path in new_paths if path[0]] - - if len(new_paths) > len(df): - new_paths = new_paths[:len(df)] - - print("new paths are", new_paths) - return new_paths - - - - -def main(log_path, output_path): - - report = darshan.DarshanReport(log_path) - - df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) - - df = df[df["filename_glob"].str.contains(r"/.*")] - - df.reset_index(drop=True, inplace=True) # Reset the index - - - new_paths = generalize_filename_glob(df) - df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.reset_index(drop=True) - df = df.sort_values(by="glob_count", ascending=False) - - - style = df.style.background_gradient(axis=0, cmap="viridis") - style.set_properties(subset=["glob_count"], **{"text-align": "right"}) - - style.set_table_styles([ - {"selector": "", "props": [("border", "1px solid grey")]}, - {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - - ]) - - # html = style.render() # use this when running python glob_feature.py -p /path/to/logfile - - html = style.to_html() #use when running pytest - - - with open(output_path, "w") as html_file: - html_file.write(html) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") - parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") - args = parser.parse_args() - main(log_path=args.log_path, output_path=args.output_path) - From 452568bf902c26aa968fab43a60e7647a9f50280 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Mon, 19 Jun 2023 14:10:59 -0600 Subject: [PATCH 7/9] Fixed styling of glob_feature.py and added [.*] grouping feature. Added __init__.py to glob_feature. Fixed errors in test_glob_feature.py. --- .../darshan/glob_feature/__init__.py | 5 +++ .../darshan/glob_feature/glob_feature.py | 32 +++++++++++-------- .../darshan/tests/test_glob_feature.py | 20 +++++------- 3 files changed, 32 insertions(+), 25 deletions(-) create mode 100644 darshan-util/pydarshan/darshan/glob_feature/__init__.py diff --git a/darshan-util/pydarshan/darshan/glob_feature/__init__.py b/darshan-util/pydarshan/darshan/glob_feature/__init__.py new file mode 100644 index 000000000..060f8bf81 --- /dev/null +++ b/darshan-util/pydarshan/darshan/glob_feature/__init__.py @@ -0,0 +1,5 @@ +""" +Creates a DataFrame with two columns ("glob_filename" and "glob_count") +based on the files read by a .darshan file. 
+""" + diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py index 4a29895fe..fe13e225d 100644 --- a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py +++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py @@ -32,13 +32,19 @@ def generalize_filename_glob(df): print("grouped paths list is", grouped_paths) + new_paths = [] for group in grouped_paths: if len(group) > 1: - common_prefix = os.path.commonprefix(group) - pattern = r"({}.*)\d(.*)".format(common_prefix) - modified_path = re.sub(pattern, r"\1\\d\2", group[0]) - new_paths.append((modified_path, len(group))) + merged_path = "" + max_length = max(len(path) for path in group) + for i in range(max_length): + chars = set(path[i] if len(path) > i else "" for path in group) + if len(chars) == 1: + merged_path += chars.pop() + else: + merged_path += "[.*]" + new_paths.append((merged_path, len(group))) else: new_paths.append((group[0], 1)) @@ -59,23 +65,23 @@ def main(log_path, output_path): df = df[df["filename_glob"].str.contains(r"/.*")] - df.reset_index(drop=True, inplace=True) # Reset the index - - new_paths = generalize_filename_glob(df) + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.reset_index(drop=True) + df = df.reset_index(drop=True) df = df.sort_values(by="glob_count", ascending=False) - style = df.style.background_gradient(axis=0, cmap="viridis") + style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) + style = style.set_properties(subset=["glob_count"], **{"text-align": "right"}) + style.hide(axis="index") style.set_table_styles([ {"selector": "", "props": [("border", "1px solid grey")]}, {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - ]) + {"selector": "th", "props": [("border", "1px solid grey")]}, + + ]) - style = style.hide_index() - html = style.render() + html = style.to_html() with open(output_path, "w") as html_file: html_file.write(html) diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py index 8a779857b..b8072bc9b 100644 --- a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -1,22 +1,21 @@ +import sys import os import darshan from darshan.log_utils import get_log_path import pandas as pd -print(pd.__version__) from pandas.testing import assert_frame_equal import pytest -import re -print(sys.path) # Print sys.path again -import glob_feature +import re +print(sys.path) +from darshan.glob_feature import glob_feature + -print("hello") @pytest.mark.parametrize("log_name, expected_df", [ # grow this with more logs... ("e3sm_io_heatmap_only.darshan", pd.DataFrame({"filename_glob": - # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d" - ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc", - "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], + ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), ]) @@ -30,12 +29,9 @@ def test_glob_tables(tmpdir, log_name, expected_df): print("log path is", log_path) with tmpdir.as_cwd(): cwd = os.getcwd() - # TODO: you shouldn't have a hardcoded HTML filename - # like this... 
-        outfile = os.path.join(cwd, "name_record_glob_hd5f.html")
+        outfile = os.path.join(cwd, "output.html")
         glob_feature.main(log_path, outfile)
         actual_table = pd.read_html(outfile)[0]
-        actual_table.drop("Unnamed: 0", axis=1, inplace=True) # Drop the "Unnamed: 0" column
         print("actual table is", actual_table)
         print("expected_df is", expected_df)
         print("pandas version is", pd.__version__)
         print("log path is", log_path)

From 26c2572f00899ada961f74e8e5bc9da082ec922e Mon Sep 17 00:00:00 2001
From: yariseidenbenz
Date: Mon, 24 Jul 2023 14:39:41 -0600
Subject: [PATCH 8/9] Instead of using difflib to group files together,
 glob_feature.py now uses agglomerative hierarchical clustering for grouping.
 The test_glob_feature.py suite was expanded to cover more log files, and the
 dependencies for these scripts were added to main_ci.yml.

---
 .github/workflows/main_ci.yml                 |   2 +-
 .../darshan/glob_feature/glob_feature.py      | 163 ++++++----
 .../darshan/tests/test_glob_feature.py        | 278 +++++++++++++++++-
 3 files changed, 372 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml
index 4de03b7f6..c2e6ca351 100644
--- a/.github/workflows/main_ci.yml
+++ b/.github/workflows/main_ci.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize "mypy<1.0.0"
+          python -m pip install --upgrade pytest pyflakes asv pytest-cov lxml matplotlib packaging humanize Jinja2 bz2file pandas scikit-learn numpy "mypy<1.0.0"
       - if: ${{matrix.platform == 'macos-latest'}}
         name: Install MacOS deps
         run: |
diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
index fe13e225d..ff0eacb02 100644
--- a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
+++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py
@@ -1,95 +1,136 @@
 # Creates a DataFrame with two columns ("glob_filename" and "glob_count") based on the files read by a .darshan file.
-# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts
-# Command to run python glob_feature.py -p path/to/log/file.darshan
-
+# The script utilizes agglomerative hierarchical clustering to effectively group similar file paths together, based on their characteristics.
+# It then displays a dataframe where one file represents a group and uses [.*] to show where filepaths within a group differ
+# The result of this process is an HTML report that provides a comprehensive overview of the grouped paths and their respective counts.
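Before the code itself, a self-contained sketch (toy paths, not from any log) of the pipeline this version adopts: TF-IDF turns each path into a feature vector, AgglomerativeClustering groups the vectors, and the silhouette score picks the cluster count, using the same sqrt(n) upper bound as the script:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import silhouette_score

    paths = [
        "/proj/run/out_h0.nc", "/proj/run/out_h1.nc",
        "/proj/inputs/i_case.nc", "/proj/inputs/j_case.nc",
        "/proj/timing/model_0000", "/proj/timing/model_0001",
    ]
    X = TfidfVectorizer().fit_transform(paths)

    scores = []
    for k in range(2, int(np.sqrt(len(paths))) + 1):
        labels = AgglomerativeClustering(n_clusters=k).fit_predict(X.toarray())
        scores.append(silhouette_score(X, labels))
    optimal_k = int(np.argmax(scores)) + 2      # the candidate range starts at k=2
    clusters = AgglomerativeClustering(n_clusters=optimal_k).fit_predict(X.toarray())
    print(optimal_k, clusters)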
+# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file import argparse import pandas as pd -import difflib import darshan -import re +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.cluster import AgglomerativeClustering +from sklearn.metrics import silhouette_score +import numpy as np import os -def generalize_filename_glob(df): - paths = df["filename_glob"].tolist() - grouped_paths = [] - - for i in range(len(paths)): - if not grouped_paths: - grouped_paths.append((paths[i],)) - else: - is_grouped = False - for j, group in enumerate(grouped_paths): - matcher = difflib.SequenceMatcher(None, paths[i], group[0]) - similarity_ratio = matcher.ratio() - if similarity_ratio >= 0.8: - grouped_paths[j] = group + (paths[i],) - is_grouped = True - break - if not is_grouped: - grouped_paths.append((paths[i],)) - - print("grouped paths list is", grouped_paths) - - - new_paths = [] - for group in grouped_paths: - if len(group) > 1: - merged_path = "" - max_length = max(len(path) for path in group) - for i in range(max_length): - chars = set(path[i] if len(path) > i else "" for path in group) - if len(chars) == 1: - merged_path += chars.pop() - else: - merged_path += "[.*]" - new_paths.append((merged_path, len(group))) - else: - new_paths.append((group[0], 1)) - - new_paths = [path for path in new_paths if path[0]] - - if len(new_paths) > len(df): - new_paths = new_paths[:len(df)] - - print("new paths are", new_paths) - return new_paths - - def main(log_path, output_path): report = darshan.DarshanReport(log_path) - df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) - df = df[df["filename_glob"].str.contains(r"/.*")] - new_paths = generalize_filename_glob(df) - + num_files = len(df) + optimal_k = 2 # Initialize optimal_k to 2 + if num_files == 1: + print("Only one file detected.") + optimal_k = 1 + # Process and save results for the single file + grouped_paths = {0: [df["filename_glob"].iloc[0]]} + new_paths = [(path, 1) for _, paths in grouped_paths.items() for path in paths] + + print("grouped_paths", grouped_paths) + + else: + + # Convert strings to feature vectors + vectorizer = TfidfVectorizer() + X = vectorizer.fit_transform(df["filename_glob"]) + print("X is:", X) + + # Determine the maximum number of clusters dynamically + max_clusters = int(np.sqrt(len(df))) + + silhouette_scores = [] + for k in range(2, max_clusters + 1): + print("max clusters is", max_clusters) + # Perform clustering + clustering = AgglomerativeClustering(n_clusters=k) + clusters = clustering.fit_predict(X.toarray()) + + # Calculate the silhouette score + score = silhouette_score(X, clusters) + print("clusters are:", clusters) + + silhouette_scores.append(score) + + # Find the optimal number of clusters based on the silhouette scores + optimal_k = np.argmax(silhouette_scores) + 2 # Add 2 because range starts from 2 + + print("Optimal number of clusters:", optimal_k) + + # Perform clustering with the optimal number of clusters + clustering = AgglomerativeClustering(n_clusters=optimal_k) + clusters = clustering.fit_predict(X.toarray()) + print("clusters are", clusters) + grouped_paths = {} + for i, cluster_label in enumerate(clusters): + if cluster_label not in grouped_paths: + grouped_paths[cluster_label] = [] + grouped_paths[cluster_label].append(df["filename_glob"].iloc[i]) + + new_paths = [] + for _, group in grouped_paths.items(): + if len(group) > 1: + merged_path = "" + max_length = max(len(path) for path in group) + 
differing_chars_encountered = False + common_extension = None + + + for i in range(max_length): + chars = set(path[i] if len(path) > i else "" for path in group) + if len(chars) == 1: + merged_path += chars.pop() + differing_chars_encountered = True + else: + if differing_chars_encountered: + merged_path += "[.*]" + differing_chars_encountered = False + + # Checks if all paths have the same file extension + extensions = [os.path.splitext(path)[1] for path in group] + common_extension = None + if len(set(extensions)) == 1: + common_extension = extensions[0] + + # Append the common extension if it exists and it's not already in the merged_path + if common_extension and common_extension not in merged_path: + merged_path += common_extension + + new_paths.append((merged_path, len(group))) + else: + new_paths.append((group[0], 1)) + + + # Save the results to an output file df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.reset_index(drop=True) - df = df.sort_values(by="glob_count", ascending=False) + df = df.sort_values(by="glob_count", ascending=False) + print("df is", df) style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) style = style.set_properties(subset=["glob_count"], **{"text-align": "right"}) style.hide(axis="index") style.set_table_styles([ {"selector": "", "props": [("border", "1px solid grey")]}, {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]}, - - ]) + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) html = style.to_html() with open(output_path, "w") as html_file: html_file.write(html) + total_count = df["glob_count"].sum() + print("Total glob_count:", total_count) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") - parser.add_argument('-o', '--output_path', type=str, help="Path to the output file") + parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") args = parser.parse_args() main(log_path=args.log_path, output_path=args.output_path) + + diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py index b8072bc9b..803b5c41c 100644 --- a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -1,3 +1,6 @@ +# Note: Some tests may currently fail, as this script is still under active development. +# The log files here are from the the darshan-logs repository + import sys import os import darshan @@ -9,7 +12,6 @@ print(sys.path) from darshan.glob_feature import glob_feature - @pytest.mark.parametrize("log_name, expected_df", [ # grow this with more logs... 
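# Each expected_df below mirrors the HTML report the script emits for the
# named log; the test regenerates the report, reads it back with
# pd.read_html(), and compares the two frames with assert_frame_equal.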
("e3sm_io_heatmap_only.darshan", @@ -17,14 +19,267 @@ ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), + + ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", + pd.DataFrame({"filename_glob": + ["/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*]", + "/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/[.*].nc", + "/projects/ccsm/inputdata/atm/cam/physprops/[.*].nc", + "/projects/ccsm/inputdata/atm/cam/[.*].nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.[.*]00[.*]", + "/projects/ccsm/inputdata/lnd/clm2/[.*].nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/timing/[.*]i[.*]", + "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*].log.170927-064246", + "/projects/ccsm/inputdata/atm/waccm/[.*].nc", + "/projects/ccsm/inputdata/[.*]n.[.*].[.*]1[.*]0[.*].nc"], #Note: for this set of grouped paths it might be more benifical to display the individual filepaths + "glob_count": [22, 18, 14, 13, 10, 6, 6, 5, 3, 3]})), + + ("darshan-apmpi-2nodes-64mpi.darshan", + pd.DataFrame({"filename_glob": + ["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/[.*]n[.*]"], + "glob_count": [2]})), + + ("mpi-io-test.darshan", + pd.DataFrame({"filename_glob": + ["/global/cscratch1/sd/ssnyder/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("e3sm_io_heatmap_and_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], + "glob_count": [2, 1]})), + + + ("hdf5_diagonal_write_1_byte_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 20, 10]})), + + + ("hdf5_diagonal_write_bytes_range_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 20, 10]})), + + ("hdf5_diagonal_write_half_flush_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 20, 10]})), + + ("hdf5_diagonal_write_half_ranks_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], + "glob_count": [54, 15, 10]})), + + ("hdf5_file_opens_only.darshan", + pd.DataFrame({"filename_glob": + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", + 
"/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_[.*].h5"], + "glob_count": [175, 85, 54, 3 ]})), + + + + ("treddy_h5d_no_h5f.darshan", + pd.DataFrame({"filename_glob": + ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_[.*].pyc", + "/home/treddy/rough_work/darshan/issue_709/rank_[.*].h5[.*]"], + "glob_count": [15, 6]})), + + + ("imbalanced-io.darshan", + pd.DataFrame({"filename_glob": + ["/lus/theta-fs0/[.*]", + "//3926523774", + "//1958007717", + "//946917208", + "//3186458368", + "//604249092", + "//2324418701", + "//2142813647", + "//3149983296", + "//1895353925", + "//425392719", + "//1053204904", + "//2446001947"], + "glob_count": [1015, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})), + + + ("shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/shane/software/ior/build/testFile[.*]"], + "glob_count": [2]})), + + ("shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/shane/software/ior/build/testFile[.*]"], + "glob_count": [2]})), + + ("partial_data_stdio.darshan", + pd.DataFrame({"filename_glob": + ["/home/carns/working/dbg/darshan-examples/foo[.*]", + "/home/carns/working/dbg/darshan-examples/test.out"], + "glob_count": [1021, 1]})), + + +# This log file contains files that are only numeric. +# I commented them all out because I am unsure if we even want to include these files and if so do we want to group them together +# ("nonmpi_dxt_anonymized.darshan", +# pd.DataFrame({"filename_glob": +# ["//2585653418", +# "//3392535749", +# "//1750113851", +# "//68752815", +# "//155559223", +# "//1093384412", +# "//3046746762", +# "//2617286315", +# "//826480344", +# "//1571032323", +# "//4226169779", +# "//2418046705", +# "//2010395326", +# "//1767127016", +# "//4075905285", +# "//1067575933", +# "//3616928368", +# "//983841409", +# "//513688402", +# "//4287455549", +# "//2136275236", +# "//3097647757", +# "//236164485", +# "//1437530161", +# "//2689488546", +# "//4192870826", +# "//309267665", +# "//780646879", +# "//499632015", +# "//2507343021", +# "//2695660354", +# "//3091680351", +# "//3164053573", +# "//930552855", +# "//1137823565", +# "//2598810996", +# "//2330561107", +# "//2564488601", +# "//317014058", +# "//3342706664", +# "//2160565458", +# "//2907700500", +# "//2116489843", +# "//135439080", +# "//3098064231", +# "//2967008390", +# "//3067634051", +# "//1734260232", +# "//3120506952", +# "//642754434", +# "//463702723", +# "//1896899807", +# "//4260655471", +# "//827646422", +# "//942747095", +# "//432306240", +# "//583215908", +# "//1673153855", +# "//3192604617", +# "//3225174794", +# "//2990589364", +# "//37712466", +# "//2173526570", +# "//1117575673", +# "//3916290828", +# "//430181069", +# "//3645159644", +# "//529183092", +# "//3225006356", +# "//63288926", +# "//798211322", +# "//2256136699", +# "//4004231621", +# "//2379710227", +# "//3211841059", +# "//3397061505", +# "//416688243", +# "//1456531123"], +# "glob_count": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})), + + + ("partial_data_dxt.darshan", + pd.DataFrame({"filename_glob": + ["/home/carns/working/dbg/darshan-examples/test.out"], + "glob_count": [1]})), + + + ("partial_data_stdio.darshan", + 
pd.DataFrame({"filename_glob": + ["/home/carns/working/dbg/darshan-examples/foo[.*]", + "/home/carns/working/dbg/darshan-examples/test.out"], + "glob_count": [1021 ,1]})), + + + ("mpi-io-test-ppc64-3.0.0.darshan", + pd.DataFrame({"filename_glob": + ["/gpfs/mira-fs0/projects/SSSPPg/snyder/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("mpi-io-test-x86_64-3.0.0.darshan", + pd.DataFrame({"filename_glob": + ["/tmp/tmp/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + ("mpi-io-test-x86_64-3.4.0-pre1.darshan", + pd.DataFrame({"filename_glob": + ["/tmp/test/mpi-io-test.tmp.dat"], + "glob_count": [1]})), + + + ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan", + pd.DataFrame({"filename_glob": + ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_[.*]_write_1_bytes"], + "glob_count": [32]})), + + +# This log file contains no data + ("treddy_runtime_heatmap_inactive_ranks.darshan", + pd.DataFrame({"filename_glob": + [], + "glob_count": []})), + + + ("skew-app.darshan", + pd.DataFrame({"filename_glob": + ["/lus/theta-fs0/2934391481"], + "glob_count": [1]})), + + ("skew-autobench-ior.darshan", + pd.DataFrame({"filename_glob": + ["//1968299212", + "//4207382746"], + "glob_count": [1, 1]})), + + + ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan", + pd.DataFrame({"filename_glob": + ["/home/laytonjb/PROJECTS/DARSHAN/TEST/jeff.txt"], + "glob_count": [1]})), ]) + def test_glob_tables(tmpdir, log_name, expected_df): print("Current working directory:", os.getcwd()) - - # test the glob table HTML outputs for various - # log files in the logs repo (and new log files - # that you creatively design yourself) log_path = get_log_path(log_name) print("log path is", log_path) with tmpdir.as_cwd(): @@ -32,14 +287,19 @@ def test_glob_tables(tmpdir, log_name, expected_df): outfile = os.path.join(cwd, "output.html") glob_feature.main(log_path, outfile) actual_table = pd.read_html(outfile)[0] - print("actual table is", actual_table) - print("expected_df is", expected_df) - print("pandas version is", pd.__version__) print("log path is", log_path) + print("Shape of actual table:", actual_table.shape) + print("Shape of expected_df:", expected_df.shape) + + # Print the contents of the DataFrames + print("Actual DataFrame:") + print(actual_table) + print("Expected DataFrame:") + print(expected_df) + # Compare the two DataFrames diff = actual_table['filename_glob'].compare(expected_df['filename_glob']) # Print the differences print(diff) assert_frame_equal(actual_table, expected_df) - From cd9d522a11ee80635868359292ca5134c52e6540 Mon Sep 17 00:00:00 2001 From: yariseidenbenz Date: Tue, 8 Aug 2023 17:09:37 -0600 Subject: [PATCH 9/9] The glob_feature.py now groups files based on agglomerative hierarchical clustering and common file extension. It also has a verbose option (-v) which displays all the files within the respective groups. 
I added some modifications for the ideal values in the test_glob_feature.py script --- .../darshan/glob_feature/glob_feature.py | 123 +++++++-- .../darshan/tests/test_glob_feature.py | 247 +++++------------- 2 files changed, 164 insertions(+), 206 deletions(-) diff --git a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py index ff0eacb02..c31e7f760 100644 --- a/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py +++ b/darshan-util/pydarshan/darshan/glob_feature/glob_feature.py @@ -3,6 +3,8 @@ # It then displays a dataframe where one file represents a group and uses [.*] to show where filepaths within a group differ # The result of this process is an HTML report that provides a comprehensive overview of the grouped paths and their respective counts. # Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file +# Command to run with verbose: verbose will display all the files under the representing file +# python glob_feature.py -p path/to/log/file.darshan -o path/to/output_file -v import argparse import pandas as pd @@ -14,7 +16,7 @@ import os -def main(log_path, output_path): +def main(log_path, output_path, verbose): report = darshan.DarshanReport(log_path) df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"]) @@ -38,7 +40,7 @@ def main(log_path, output_path): X = vectorizer.fit_transform(df["filename_glob"]) print("X is:", X) - # Determine the maximum number of clusters dynamically + # Determine the maximum number of clusters dynamically max_clusters = int(np.sqrt(len(df))) silhouette_scores = [] @@ -51,9 +53,10 @@ def main(log_path, output_path): # Calculate the silhouette score score = silhouette_score(X, clusters) print("clusters are:", clusters) - silhouette_scores.append(score) + + # Find the optimal number of clusters based on the silhouette scores optimal_k = np.argmax(silhouette_scores) + 2 # Add 2 because range starts from 2 @@ -69,6 +72,16 @@ def main(log_path, output_path): grouped_paths[cluster_label] = [] grouped_paths[cluster_label].append(df["filename_glob"].iloc[i]) + # Group paths based on file extensions + grouped_by_extension = {} + for cluster_label, paths in grouped_paths.items(): + grouped_by_extension[cluster_label] = {} + for path in paths: + file_extension = os.path.splitext(path)[1] + if file_extension not in grouped_by_extension[cluster_label]: + grouped_by_extension[cluster_label][file_extension] = [] + grouped_by_extension[cluster_label][file_extension].append(path) + new_paths = [] for _, group in grouped_paths.items(): if len(group) > 1: @@ -77,7 +90,6 @@ def main(log_path, output_path): differing_chars_encountered = False common_extension = None - for i in range(max_length): chars = set(path[i] if len(path) > i else "" for path in group) if len(chars) == 1: @@ -85,15 +97,17 @@ def main(log_path, output_path): differing_chars_encountered = True else: if differing_chars_encountered: - merged_path += "[.*]" + merged_path += "(.*)" differing_chars_encountered = False + break - # Checks if all paths have the same file extension + # Check if all paths have the same file extension extensions = [os.path.splitext(path)[1] for path in group] common_extension = None if len(set(extensions)) == 1: common_extension = extensions[0] + # Append the common extension if it exists and it's not already in the merged_path if common_extension and common_extension not in merged_path: merged_path += common_extension @@ -103,34 +117,95 
@@ def main(log_path, output_path): new_paths.append((group[0], 1)) - # Save the results to an output file - df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) - df = df.sort_values(by="glob_count", ascending=False) - print("df is", df) - style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) - style = style.set_properties(subset=["glob_count"], **{"text-align": "right"}) - style.hide(axis="index") - style.set_table_styles([ - {"selector": "", "props": [("border", "1px solid grey")]}, - {"selector": "tbody td", "props": [("border", "1px solid grey")]}, - {"selector": "th", "props": [("border", "1px solid grey")]} - ]) + if verbose: + new_paths_verbose = [] + + # Sort grouped_paths based on the size of each group (in descending order) + sorted_groups = sorted(grouped_paths.items(), key=lambda x: len(x[1]), reverse=True) + + for cluster_label, paths in sorted_groups: + + if len(paths) > 1: + merged_path = "" + max_length = max(len(path) for path in paths) + differing_chars_encountered = False + common_extension = None + + + for i in range(max_length): + chars = set(path[i] if len(path) > i else "" for path in paths) + if len(chars) == 1: + merged_path += chars.pop() + differing_chars_encountered = True + else: + if differing_chars_encountered: + merged_path += "(.*)" + differing_chars_encountered = False + break + + # Check if all paths have the same file extension + extensions = [os.path.splitext(path)[1] for path in paths] + common_extension = None + if len(set(extensions)) == 1: + common_extension = extensions[0] + + # Append the merged path if it's not already in the new_paths_verbose list + if merged_path and (merged_path, len(paths)) not in new_paths_verbose: + new_paths_verbose.append((merged_path, len(paths))) + + # Append the individual paths beneath the merged path + new_paths_verbose.extend([(f" {path}", 1) for path in paths]) + else: + new_paths_verbose.append((group[0], 1)) + + + df_verbose = pd.DataFrame(new_paths_verbose, columns=["filename_glob", "glob_count"]) + print(df_verbose.to_string(index=False)) + + + # Display or save the DataFrame using pandas styler + if verbose: + df_verbose = pd.DataFrame(new_paths_verbose, columns=["filename_glob", "glob_count"]) + styled_html = df_verbose.style.background_gradient(axis=0, cmap="viridis", gmap=df_verbose["glob_count"]) + styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"}) + styled_html.hide(axis="index") + styled_html.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + html = styled_html.to_html() + + with open(output_path, "w") as html_file: + html_file.write(html) + + else: + df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"]) + df = df.sort_values(by="glob_count", ascending=False) - html = style.to_html() + styled_html = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"]) + styled_html = styled_html.set_properties(subset=["glob_count"], **{"text-align": "right"}) + styled_html.hide(axis="index") + styled_html.set_table_styles([ + {"selector": "", "props": [("border", "1px solid grey")]}, + {"selector": "tbody td", "props": [("border", "1px solid grey")]}, + {"selector": "th", "props": [("border", "1px solid grey")]} + ]) + html = styled_html.to_html() - with open(output_path, "w") as html_file: - html_file.write(html) + with 
open(output_path, "w") as html_file: + html_file.write(html) - total_count = df["glob_count"].sum() - print("Total glob_count:", total_count) + print("Styled results saved to:", output_path) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-p', '--log-path', type=str, help="Path to the log file") parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file") + parser.add_argument('-v', '--verbose', action='store_true', help="Display verbose output") args = parser.parse_args() - main(log_path=args.log_path, output_path=args.output_path) + main(log_path=args.log_path, output_path=args.output_path, verbose=args.verbose) diff --git a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py index 803b5c41c..4bd44e00f 100644 --- a/darshan-util/pydarshan/darshan/tests/test_glob_feature.py +++ b/darshan-util/pydarshan/darshan/tests/test_glob_feature.py @@ -1,6 +1,5 @@ # Note: Some tests may currently fail, as this script is still under active development. # The log files here are from the the darshan-logs repository - import sys import os import darshan @@ -13,223 +12,133 @@ from darshan.glob_feature import glob_feature @pytest.mark.parametrize("log_name, expected_df", [ - # grow this with more logs... ("e3sm_io_heatmap_only.darshan", pd.DataFrame({"filename_glob": - ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc", "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), + ("snyder_acme.exe_id1253318_9-27-24239-1515303144625770178_2.darshan", pd.DataFrame({"filename_glob": - ["/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*]", - "/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/[.*].nc", - "/projects/ccsm/inputdata/atm/cam/physprops/[.*].nc", - "/projects/ccsm/inputdata/atm/cam/[.*].nc", - "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.[.*]00[.*]", - "/projects/ccsm/inputdata/lnd/clm2/[.*].nc", - "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/timing/[.*]i[.*]", - "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/[.*].log.170927-064246", - "/projects/ccsm/inputdata/atm/waccm/[.*].nc", - "/projects/ccsm/inputdata/[.*]n.[.*].[.*]1[.*]0[.*].nc"], #Note: for this set of grouped paths it might be more benifical to display the individual filepaths - "glob_count": [22, 18, 14, 13, 10, 6, 6, 5, 3, 3]})), + ["/projects/ccsm/inputdata/atm/cam/chem/trop_mozart_aero/emis/(.*).nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)", + "/projects/ccsm/inputdata/atm/cam/physprops/(.*).nc", + "/projects/ccsm/inputdata/atm/cam/(.*).nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).nml", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/test_F_case_cetus_dxt.c(.*).nc", + "/projects/ccsm/inputdata/lnd/clm2/(.*).nc", + "/gpfs/mira-fs1/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*)", + "/projects/radix-io/snyder/acme/test_F_case_cetus_dxt/run/(.*).170927-064246", + "/projects/ccsm/inputdata/atm/waccm/(.*).nc", + "/projects/ccsm/inputdata/(.*).nc"], + "glob_count": [18, 14, 14, 13, 9, 9, 6, 6, 5, 3, 3]})), + ("darshan-apmpi-2nodes-64mpi.darshan", pd.DataFrame({"filename_glob": - 
["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/[.*]n[.*]"], + ["/lus/theta-fs0/projects/Performance/chunduri/MILC/milctestv2-papi-reorder-darshan/MILC_2_526820_2021-06-14-15:58:47/(.*)"], "glob_count": [2]})), + ("mpi-io-test.darshan", pd.DataFrame({"filename_glob": ["/global/cscratch1/sd/ssnyder/tmp/mpi-io-test.tmp.dat"], "glob_count": [1]})), + ("e3sm_io_heatmap_and_dxt.darshan", pd.DataFrame({"filename_glob": - ["/projects/radix-io/snyder/e3sm/can_I_out_h[.*].nc", + ["/projects/radix-io/snyder/e3sm/can_I_out_h(.*).nc", "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"], "glob_count": [2, 1]})), ("hdf5_diagonal_write_1_byte_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 20, 10]})), - + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_diagonal_write_bytes_range_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 20, 10]})), + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_diagonal_write_half_flush_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 20, 10]})), + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_diagonal_write_half_ranks_dxt.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_[.*].h5[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]"], - "glob_count": [54, 15, 10]})), + 
["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*).h5", + "/yellow/users/nawtrey/projects/hdf5_testing/test_files_write_1_bytes/test_(.*)"], + "glob_count": [24, 20, 20, 10, 10]})), ("hdf5_file_opens_only.darshan", pd.DataFrame({"filename_glob": - ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/[.*]", - "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py/[.*]", - "/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_[.*].h5"], - "glob_count": [175, 85, 54, 3 ]})), - + ["/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/numpy/(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/__pycache__/(.*).pyc", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/h5py(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/lib-dynload/(.*).so", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/(.*).pyc", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/json/(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).py", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/site-packages/(.*).pyc", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/importlib/(.*)", + "/users/nawtrey/.conda/envs/pydarshan_hdf5_py38/lib/python3.8/ctypes", + "/yellow/users/nawtrey/projects/hdf5_testing/test_h5f_only_(.*).h5"], + "glob_count": [140, 62, 47, 37, 22, 17, 15, 8, 6, 6, 4, 4, 3]})), ("treddy_h5d_no_h5f.darshan", pd.DataFrame({"filename_glob": - ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_[.*].pyc", - "/home/treddy/rough_work/darshan/issue_709/rank_[.*].h5[.*]"], + ["/home/treddy/python_virtual_envs/python_310_darshan/lib/python3.10/site-packages/h5py/_(.*).pyc", + "/home/treddy/rough_work/darshan/issue_709/rank_(.*)"], "glob_count": [15, 6]})), - ("imbalanced-io.darshan", - pd.DataFrame({"filename_glob": - ["/lus/theta-fs0/[.*]", - "//3926523774", - "//1958007717", - "//946917208", - "//3186458368", - "//604249092", - "//2324418701", - "//2142813647", - "//3149983296", - "//1895353925", - "//425392719", - "//1053204904", - "//2446001947"], - "glob_count": [1015, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})), - - ("shane_ior-HDF5_id438090-438090_11-9-41522-17417065676046418211_1.darshan", pd.DataFrame({"filename_glob": - ["/home/shane/software/ior/build/testFile[.*]"], + ["/home/shane/software/ior/build/testFile(.*)"], "glob_count": [2]})), ("shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan", pd.DataFrame({"filename_glob": - ["/home/shane/software/ior/build/testFile[.*]"], + ["/home/shane/software/ior/build/testFile(.*)"], "glob_count": [2]})), + ("partial_data_stdio.darshan", pd.DataFrame({"filename_glob": - ["/home/carns/working/dbg/darshan-examples/foo[.*]", + ["/home/carns/working/dbg/darshan-examples/foo(.*)", "/home/carns/working/dbg/darshan-examples/test.out"], "glob_count": [1021, 1]})), -# This log file contains 
-# I commented them all out because I am unsure if we even want to include these files and if so do we want to group them together
-# ("nonmpi_dxt_anonymized.darshan",
-# pd.DataFrame({"filename_glob":
-# ["//2585653418",
-# "//3392535749",
-# "//1750113851",
-# "//68752815",
-# "//155559223",
-# "//1093384412",
-# "//3046746762",
-# "//2617286315",
-# "//826480344",
-# "//1571032323",
-# "//4226169779",
-# "//2418046705",
-# "//2010395326",
-# "//1767127016",
-# "//4075905285",
-# "//1067575933",
-# "//3616928368",
-# "//983841409",
-# "//513688402",
-# "//4287455549",
-# "//2136275236",
-# "//3097647757",
-# "//236164485",
-# "//1437530161",
-# "//2689488546",
-# "//4192870826",
-# "//309267665",
-# "//780646879",
-# "//499632015",
-# "//2507343021",
-# "//2695660354",
-# "//3091680351",
-# "//3164053573",
-# "//930552855",
-# "//1137823565",
-# "//2598810996",
-# "//2330561107",
-# "//2564488601",
-# "//317014058",
-# "//3342706664",
-# "//2160565458",
-# "//2907700500",
-# "//2116489843",
-# "//135439080",
-# "//3098064231",
-# "//2967008390",
-# "//3067634051",
-# "//1734260232",
-# "//3120506952",
-# "//642754434",
-# "//463702723",
-# "//1896899807",
-# "//4260655471",
-# "//827646422",
-# "//942747095",
-# "//432306240",
-# "//583215908",
-# "//1673153855",
-# "//3192604617",
-# "//3225174794",
-# "//2990589364",
-# "//37712466",
-# "//2173526570",
-# "//1117575673",
-# "//3916290828",
-# "//430181069",
-# "//3645159644",
-# "//529183092",
-# "//3225006356",
-# "//63288926",
-# "//798211322",
-# "//2256136699",
-# "//4004231621",
-# "//2379710227",
-# "//3211841059",
-# "//3397061505",
-# "//416688243",
-# "//1456531123"],
-# "glob_count": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})),
-
-
     ("partial_data_dxt.darshan",
     pd.DataFrame({"filename_glob":
     ["/home/carns/working/dbg/darshan-examples/test.out"],
     "glob_count": [1]})),
-    ("partial_data_stdio.darshan",
-    pd.DataFrame({"filename_glob":
-    ["/home/carns/working/dbg/darshan-examples/foo[.*]",
-    "/home/carns/working/dbg/darshan-examples/test.out"],
-    "glob_count": [1021 ,1]})),
-
-
     ("mpi-io-test-ppc64-3.0.0.darshan",
     pd.DataFrame({"filename_glob":
     ["/gpfs/mira-fs0/projects/SSSPPg/snyder/tmp/mpi-io-test.tmp.dat"],
@@ -248,29 +157,10 @@
     ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
     pd.DataFrame({"filename_glob":
-    ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_[.*]_write_1_bytes"],
+    ["/yellow/users/treddy/github_projects/heatmap_diagonal/rank_(.*)_write_1_bytes"],
     "glob_count": [32]})),
-# This log file contains no data
-    ("treddy_runtime_heatmap_inactive_ranks.darshan",
-    pd.DataFrame({"filename_glob":
-    [],
-    "glob_count": []})),
-
-
-    ("skew-app.darshan",
-    pd.DataFrame({"filename_glob":
-    ["/lus/theta-fs0/2934391481"],
-    "glob_count": [1]})),
-
-    ("skew-autobench-ior.darshan",
-    pd.DataFrame({"filename_glob":
-    ["//1968299212",
-    "//4207382746"],
-    "glob_count": [1, 1]})),
-
-
     ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     pd.DataFrame({"filename_glob":
     ["/home/laytonjb/PROJECTS/DARSHAN/TEST/jeff.txt"],
     "glob_count": [1]})),
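The assertions below round-trip the report through pandas: the styled table is written to HTML, then read back and compared. A minimal sketch of that round trip (assumed behavior; the one-row table is made up):

    import pandas as pd
    from pandas.testing import assert_frame_equal

    expected = pd.DataFrame({"filename_glob": ["/tmp/foo(.*)"],
                             "glob_count": [2]})
    styler = expected.style
    styler.hide(axis="index")  # no index column, so only the two data columns appear
    with open("roundtrip.html", "w") as f:
        f.write(styler.to_html())

    actual = pd.read_html("roundtrip.html")[0]  # first <table> in the file
    assert_frame_equal(actual, expected)

Because the index is hidden when the report is written, read_html reconstructs the frame with a fresh RangeIndex and an integer dtype for glob_count, which is what allows the direct assert_frame_equal comparison.
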
@@ -285,11 +175,9 @@ def test_glob_tables(tmpdir, log_name, expected_df):
     with tmpdir.as_cwd():
         cwd = os.getcwd()
         outfile = os.path.join(cwd, "output.html")
-        glob_feature.main(log_path, outfile)
+        glob_feature.main(log_path, outfile, verbose=False)
         actual_table = pd.read_html(outfile)[0]
         print("log path is", log_path)
-        print("Shape of actual table:", actual_table.shape)
-        print("Shape of expected_df:", expected_df.shape)
 
         # Print the contents of the DataFrames
         print("Actual DataFrame:")
@@ -297,9 +185,4 @@ def test_glob_tables(tmpdir, log_name, expected_df):
         print("Expected DataFrame:")
         print(expected_df)
 
-        # Compare the two DataFrames
-        diff = actual_table['filename_glob'].compare(expected_df['filename_glob'])
-        # Print the differences
-        print(diff)
         assert_frame_equal(actual_table, expected_df)
-
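With this patch applied, the feature can be exercised either from the command line (as documented in the script header) or programmatically, mirroring the call the test makes; the .darshan path below is a placeholder:

    from darshan.glob_feature import glob_feature

    glob_feature.main(log_path="path/to/log/file.darshan",
                      output_path="report.html",
                      verbose=False)

Passing verbose=True (or -v on the command line) additionally prints each group's individual member paths beneath its merged glob.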