Skip to content

Commit df29236

Browse files
committed
import fgwas scores using log scale
1 parent 231acd9 commit df29236

File tree

1 file changed

+11
-12
lines changed

1 file changed

+11
-12
lines changed

snp2cell/util.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import dill
1212
import networkx as nx
1313
import numpy as np
14+
from scipy.special import logsumexp
1415
import pandas as pd
1516
import pyranges
1617
import pybiomart
@@ -266,12 +267,12 @@ def export_for_fgwas(
266267

267268
def _calc_per_region_bf(region_id: str, region_df: pd.DataFrame) -> pd.Series:
268269
"""
269-
calculate regional Bayes factors per group (used in `load_fgwas_scores`)
270+
calculate regional log Bayes factors per group (used in `load_fgwas_scores`)
270271
"""
271-
w = np.exp(region_df["SNP_rel_loc"])
272-
bf = np.exp(region_df["SNP_BF"])
273-
rbf = (bf * w).sum() / w.sum()
274-
return pd.Series(rbf, index=[region_id])
272+
log_numerator = logsumexp(region_df["SNP_BF"] + region_df["SNP_rel_loc"])
273+
log_denominator = logsumexp(region_df["SNP_rel_loc"])
274+
log_bf = log_numerator - log_denominator
275+
return pd.Series(log_bf, index=[region_id])
275276

276277

277278
@add_logger()
@@ -322,7 +323,7 @@ def load_fgwas_scores(
322323
num_cores = snp2cell.NCPU
323324
log.info(f"using {num_cores} cores")
324325

325-
# load fgwas output
326+
# load fgwas output: SNP_BF (log BF), SNP_rel_loc (log weight including distance and LD score)
326327
df = pd.read_csv(fgwas_output_path, sep="\t", header=None)
327328
df.columns = ["regionID", "SNP_BF", "SNP_rel_loc"]
328329

@@ -332,20 +333,18 @@ def load_fgwas_scores(
332333

333334
# add region information from region_loc_path
334335
region_info = pd.read_csv(region_loc_path, sep="\t")
335-
region_info["RBF"] = region_info.index.map(res)
336+
region_info["log_RBF"] = region_info.index.map(res)
336337
region_info["name"] = region_info.apply(
337338
lambda r: f"chr{int(r['hm_chr'])}:{int(r['hm_pos'])}-{int(r['hm_pos'])}", axis=1
338339
)
339340
region_info["ID"] = region_info.index
340341

341-
region_info[["name", "ID", "hm_chr", "hm_pos", "RBF"]].to_csv(
342+
region_info[["name", "ID", "hm_chr", "hm_pos", "log_RBF"]].to_csv(
342343
rbf_table_path, sep="\t", index=False
343344
)
344345

345-
# calculate log RBF scores
346-
scores = (
347-
region_info.set_index("name")["RBF"].apply(np.log).sort_values(ascending=False)
348-
)
346+
# prepare log RBF scores
347+
scores = region_info.set_index("name")["log_RBF"].sort_values(ascending=False)
349348

350349
def rename(s):
351350
# expand region to full length

0 commit comments

Comments
 (0)