Add region-count consistency check (warning) in load_fgwas_scores

JPatrickPett · JPatrickPett · commit 1493734660f0 · 2025-03-06T16:14:10.000Z
diff --git a/snp2cell/util.py b/snp2cell/util.py
@@ -340,6 +340,19 @@ def load_fgwas_scores(
     df = pd.read_csv(fgwas_output_path, sep="\t", header=None)
     df.columns = ["regionID", "SNP_BF", "SNP_rel_loc"]
 
+    # load region locations
+    log.info(f"loading region locations from '{region_loc_path}'")
+    region_info = pd.read_csv(region_loc_path, sep="\t")
+    region_info.index += 1
+
+    if region_info.shape[0] != df["regionID"].max():
+        log.warning(
+            f"largest region ID in fgwas output ({df['regionID'].max()})\n"
+            f"number of regions in region location file ({region_info.shape[0]})\n"
+            "are you sure the region location file corresponds to the fgwas output?"
+        )
+
+    # calculate regional Bayes factors
     log.info(f"calculating regional Bayes factors")
     region_groups = list(df.groupby("regionID"))
     with mp.Pool(num_cores) as pool:
@@ -353,8 +366,7 @@ def load_fgwas_scores(
     res = pd.concat(res)
 
     # add region information from region_loc_path
-    log.info(f"loading region locations from '{region_loc_path}'")
-    region_info = pd.read_csv(region_loc_path, sep="\t")
+    log.info(f"adding region information to scores")
     region_info["log_RBF"] = region_info.index.map(res)
     region_info["name"] = region_info.apply(
         lambda r: f"chr{int(r['hm_chr'])}:{int(r['hm_pos'])}-{int(r['hm_pos'])}", axis=1
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -73,9 +73,10 @@ def test_export_for_fgwas(snp2cell_instance, tmp_path):
 
 def test_load_fgwas_scores(snp2cell_instance, tmp_path):
     # Create a temporary fgwas output file with two rows.
-    fgwas_output_path = tmp_path / "fgwas_output.txt"
-    with open(fgwas_output_path, "w") as f:
-        f.write(f"0\t{np.log(2)}\t0\n1\t{np.log(3)}\t0\n")
+    fgwas_output_path = tmp_path / "fgwas_output.gz"
+    # TODO: Replace with a more realistic example.
+    df = pd.DataFrame([[1, np.log(2), 0], [2, np.log(3), 0]])
+    df.to_csv(fgwas_output_path, sep="\t", header=False, index=False)
 
     # Create a temporary region location file with header (as in export_for_fgwas).
     region_loc_path = tmp_path / "region_loc.txt"