EliHei2 · pre-commit-ci · Oct 6, 2025 · Oct 6, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,15 +8,15 @@ minimum_pre_commit_version: 2.16.0
 ci:
   skip: []
 repos:
-  - repo: https://github.com/psf/black
-    rev: 24.10.0
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 25.9.0
     hooks:
       - id: black
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
   - repo: https://github.com/asottile/blacken-docs
-    rev: 1.19.1
+    rev: 1.20.0
     hooks:
       - id: blacken-docs
diff --git a/analysis_summary.html b/analysis_summary.html
diff --git a/scripts/0_data_creation_5k_nucleus.py b/scripts/0_data_creation_5k_nucleus.py
@@ -35,28 +35,23 @@
 """
 
 
-
 XENIUM_DATA_DIR = Path(
     "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
 )
 SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")
-SCRNASEQ_FILE = Path(
-    "data_tidy/Human_CRC/scRNAseq.h5ad"
-)
-CELLTYPE_COLUMN = "Level1" # change this to your column name
+SCRNASEQ_FILE = Path("data_tidy/Human_CRC/scRNAseq.h5ad")
+CELLTYPE_COLUMN = "Level1"  # change this to your column name
 scrnaseq = sc.read(SCRNASEQ_FILE)
 
 
-
 # subsample the scRNAseq if needed
 # sc.pp.subsample(scrnaseq, 0.1)
 # scrnaseq.var_names_make_unique()
 
 
 # Calculate gene-celltype embeddings from reference data
 gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
-    scrnaseq,
-    CELLTYPE_COLUMN
+    scrnaseq, CELLTYPE_COLUMN
 )
 
 # Initialize spatial transcriptomics sample object
@@ -65,7 +60,7 @@
     n_workers=4,
     sample_type="xenium",
     weights=gene_celltype_abundance_embedding,
-    scale_factor=1.
+    scale_factor=1.0,
 )
 
 
@@ -77,7 +72,7 @@
     dist_tx=5,  # Use calculated optimal search radius
     tile_size=10000,  # Tile size for processing
     # tile_height=50,
-    neg_sampling_ratio=10.,  # 5:1 negative:positive samples
+    neg_sampling_ratio=10.0,  # 10:1 negative:positive samples
     frac=1.0,  # Use all data
     val_prob=0.3,  # 30% validation set
     test_prob=0,  # No test set

diff --git a/scripts/1_train_5k.py b/scripts/1_train_5k.py
@@ -1,4 +1,5 @@
 from segger.training.segger_data_module import SeggerDataModule
+
 # from segger.prediction.predict import predict, load_model
 from segger.models.segger_model import Segger
 from segger.training.train import LitSegger
@@ -9,14 +10,15 @@
 from lightning.pytorch.plugins.environments import LightningEnvironment
 from matplotlib import pyplot as plt
 import seaborn as sns
+
 # import pandas as pd
 from segger.data.utils import calculate_gene_celltype_abundance_embedding
+
 # import scanpy as sc
 import os
 from lightning import LightningModule
 
 
-
 segger_data_dir = Path("data_tidy/pyg_datasets/human_CRC_seg_cells")
 models_dir = Path("./models/human_CRC_seg_cells")
 
@@ -43,14 +45,18 @@
 
 
 model = Segger(
-    num_tx_tokens= num_tx_tokens,
+    num_tx_tokens=num_tx_tokens,
     init_emb=8,
     hidden_channels=32,
     out_channels=16,
     heads=4,
     num_mid_layers=3,
 )
-model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
+model = to_hetero(
+    model,
+    (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
+    aggr="sum",
+)
 
 batch = dm.train[0]
 model.forward(batch.x_dict, batch.edge_index_dict)

diff --git a/scripts/2_predict_5k.py b/scripts/2_predict_5k.py
@@ -8,26 +8,27 @@
 import dask.dataframe as dd
 import pandas as pd
 from pathlib import Path
+
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 os.environ["CUPY_CACHE_DIR"] = "./.cupy"
 
 
-XENIUM_DATA_DIR = Path( #raw data dir
+XENIUM_DATA_DIR = Path(  # raw data dir
     "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
 )
-transcripts_file = (
-   XENIUM_DATA_DIR / "transcripts.parquet"
-)
+transcripts_file = XENIUM_DATA_DIR / "transcripts.parquet"
 
-SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei") # preprocessed data dir
+SEGGER_DATA_DIR = Path(
+    "data_tidy/pyg_datasets/human_CRC_seg_nuclei"
+)  # preprocessed data dir
 
 
 seg_tag = "human_CRC_seg_nuclei"
 model_version = 0
-models_dir = Path("./models") / seg_tag #trained model dir
+models_dir = Path("./models") / seg_tag  # trained model dir
 
 
-output_dir = Path( #output dir
+output_dir = Path(  # output dir
     "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_nuclei"
 )
 
@@ -58,10 +59,10 @@
     min_transcripts=5,
     score_cut=0.5,
     cell_id_col="segger_cell_id",
-    save_transcripts= True,
-    save_anndata= True,
-    save_cell_masks= False,  # Placeholder for future implementation
-    use_cc=False, # if one wants fragments (groups of similar transcripts not attached to any nuclei)
+    save_transcripts=True,
+    save_anndata=True,
+    save_cell_masks=False,  # Placeholder for future implementation
+    use_cc=False,  # set to True if one wants fragments (groups of similar transcripts not attached to any nuclei)
     knn_method="kd_tree",
     verbose=True,
     gpu_ids=["0"],

diff --git a/scripts/create_data_cosmx.py b/scripts/create_data_cosmx.py
@@ -69,7 +69,7 @@
 )
 
 
-cells = list(set(transcript_counts.index) &  set(nucleus_polygons.index))
+cells = list(set(transcript_counts.index) & set(nucleus_polygons.index))
 nucleus_polygons = nucleus_polygons[cells]
 transcript_counts = transcript_counts[cells]
 

diff --git a/scripts/create_data_fast_sample.py b/scripts/create_data_fast_sample.py
@@ -42,17 +42,14 @@
     "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
 )
 SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")
-SCRNASEQ_FILE = Path(
-    "data_tidy/Human_CRC/scRNAseq.h5ad"
-)
+SCRNASEQ_FILE = Path("data_tidy/Human_CRC/scRNAseq.h5ad")
 CELLTYPE_COLUMN = "Level1"
 scrnaseq = sc.read(SCRNASEQ_FILE)
 sc.pp.subsample(scrnaseq, 0.1)
 scrnaseq.var_names_make_unique()
 # Calculate gene-celltype embeddings from reference data
 gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
-    scrnaseq,
-    CELLTYPE_COLUMN
+    scrnaseq, CELLTYPE_COLUMN
 )
 
 # Initialize spatial transcriptomics sample object
@@ -61,7 +58,7 @@
     n_workers=4,
     sample_type="xenium",
     # scale_factor=0.8,
-    weights=gene_celltype_abundance_embedding
+    weights=gene_celltype_abundance_embedding,
 )
 
 # # Load and filter datas

diff --git a/scripts/create_data_merscope.py b/scripts/create_data_merscope.py
@@ -38,8 +38,8 @@
 # CELLTYPE_COLUMN = 'celltype_minor'
 
 
-MERSCOPE_DATA_DIR = Path('data_raw/merscope/processed/')
-SEGGER_DATA_DIR = Path('data_tidy/pyg_datasets/merscope_liver')
+MERSCOPE_DATA_DIR = Path("data_raw/merscope/processed/")
+SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/merscope_liver")
 # SCRNASEQ_FILE = Path('/omics/groups/OE0606/internal/mimmo/MERSCOPE/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad')
 # CELLTYPE_COLUMN = 'annot_v1'
 
@@ -80,4 +80,4 @@
     frac=1.0,  # Use all data
     val_prob=0.3,  # 30% validation set
     test_prob=0,  # No test set
-)
+)
diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py
@@ -17,12 +17,10 @@
 import dask.dataframe as dd
 
 
-
 seg_tag = "human_CRC_seg_cells"
 model_version = 0
 
 
-
 XENIUM_DATA_DIR = Path(
     "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
 )
@@ -32,9 +30,7 @@
 benchmarks_dir = Path(
     "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_cells"
 )
-transcripts_file = (
-   "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real/transcripts.parquet"
-)
+transcripts_file = "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real/transcripts.parquet"
 # Initialize the Lightning data module
 dm = SeggerDataModule(
     data_dir=SEGGER_DATA_DIR,

diff --git a/scripts/train_cosmx.py b/scripts/train_cosmx.py
@@ -1,4 +1,5 @@
 from segger.training.segger_data_module import SeggerDataModule
+
 # from segger.prediction.predict import predict, load_model
 from segger.models.segger_model import Segger
 from segger.training.train import LitSegger
@@ -9,14 +10,15 @@
 from lightning.pytorch.plugins.environments import LightningEnvironment
 from matplotlib import pyplot as plt
 import seaborn as sns
+
 # import pandas as pd
 from segger.data.utils import calculate_gene_celltype_abundance_embedding
+
 # import scanpy as sc
 import os
 from lightning import LightningModule
 
 
-
 segger_data_dir = Path("data_tidy/pyg_datasets/cosmx_pancreas_degbugged")
 models_dir = Path("./models/cosmx_pancreas")
 
@@ -50,7 +52,11 @@
     heads=4,
     num_mid_layers=3,
 )
-model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
+model = to_hetero(
+    model,
+    (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
+    aggr="sum",
+)
 
 batch = dm.train[0]
 model.forward(batch.x_dict, batch.edge_index_dict)

diff --git a/scripts/train_mimmo_batch.py b/scripts/train_mimmo_batch.py
@@ -1,4 +1,5 @@
 from segger.training.segger_data_module import SeggerDataModule
+
 # from segger.prediction.predict import predict, load_model
 from segger.models.segger_model import Segger
 from segger.training.train import LitSegger
@@ -9,16 +10,21 @@
 from lightning.pytorch.plugins.environments import LightningEnvironment
 from matplotlib import pyplot as plt
 import seaborn as sns
+
 # import pandas as pd
 from segger.data.utils import calculate_gene_celltype_abundance_embedding
+
 # import scanpy as sc
 import os
 from lightning import LightningModule
 
 
-
-segger_data_dir = Path("data_tidy/pyg_datasets/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740")
-models_dir = Path("./models/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740")
+segger_data_dir = Path(
+    "data_tidy/pyg_datasets/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740"
+)
+models_dir = Path(
+    "./models/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740"
+)
 
 # Base directory to store Pytorch Lightning models
 # models_dir = Path('models')
@@ -44,14 +50,18 @@
 
 model = Segger(
     # is_token_based=is_token_based,
-    num_tx_tokens= num_tx_tokens,
+    num_tx_tokens=num_tx_tokens,
     init_emb=8,
     hidden_channels=32,
     out_channels=16,
     heads=4,
     num_mid_layers=3,
 )
-model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
+model = to_hetero(
+    model,
+    (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
+    aggr="sum",
+)
 
 batch = dm.train[0]
 model.forward(batch.x_dict, batch.edge_index_dict)

diff --git a/scripts/train_model.py b/scripts/train_model.py
@@ -8,7 +8,7 @@
 from lightning import Trainer
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--data_dir', type=Path, required=True)
+parser.add_argument("--data_dir", type=Path, required=True)
 args = parser.parse_args()
 
 segger_data_dir = args.data_dir
@@ -32,7 +32,11 @@
     heads=4,
     num_mid_layers=2,
 )
-model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
+model = to_hetero(
+    model,
+    (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
+    aggr="sum",
+)
 
 ls = LitSegger(model=model)
 
@@ -46,4 +50,4 @@
     logger=CSVLogger(models_dir),
 )
 
-trainer.fit(ls, datamodule=dm)
+trainer.fit(ls, datamodule=dm)