Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ minimum_pre_commit_version: 2.16.0
ci:
skip: []
repos:
- repo: https://github.com/psf/black
rev: 24.10.0
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 25.9.0
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v4.0.0-alpha.8
hooks:
- id: prettier
- repo: https://github.com/asottile/blacken-docs
rev: 1.19.1
rev: 1.20.0
hooks:
- id: blacken-docs
307,588 changes: 307,335 additions & 253 deletions analysis_summary.html

Large diffs are not rendered by default.

15 changes: 5 additions & 10 deletions scripts/0_data_creation_5k_nucleus.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,28 +35,23 @@
"""



XENIUM_DATA_DIR = Path(
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
)
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")
SCRNASEQ_FILE = Path(
"data_tidy/Human_CRC/scRNAseq.h5ad"
)
CELLTYPE_COLUMN = "Level1" # change this to your column name
SCRNASEQ_FILE = Path("data_tidy/Human_CRC/scRNAseq.h5ad")
CELLTYPE_COLUMN = "Level1" # change this to your column name
scrnaseq = sc.read(SCRNASEQ_FILE)



# subsample the scRNAseq if needed
# sc.pp.subsample(scrnaseq, 0.1)
# scrnaseq.var_names_make_unique()


# Calculate gene-celltype embeddings from reference data
gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
scrnaseq,
CELLTYPE_COLUMN
scrnaseq, CELLTYPE_COLUMN
)

# Initialize spatial transcriptomics sample object
Expand All @@ -65,7 +60,7 @@
n_workers=4,
sample_type="xenium",
weights=gene_celltype_abundance_embedding,
scale_factor=1.
scale_factor=1.0,
)


Expand All @@ -77,7 +72,7 @@
dist_tx=5, # Use calculated optimal search radius
tile_size=10000, # Tile size for processing
# tile_height=50,
neg_sampling_ratio=10., # 5:1 negative:positive samples
neg_sampling_ratio=10.0, # 5:1 negative:positive samples
frac=1.0, # Use all data
val_prob=0.3, # 30% validation set
test_prob=0, # No test set
Expand Down
12 changes: 9 additions & 3 deletions scripts/1_train_5k.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from segger.training.segger_data_module import SeggerDataModule

# from segger.prediction.predict import predict, load_model
from segger.models.segger_model import Segger
from segger.training.train import LitSegger
Expand All @@ -9,14 +10,15 @@
from lightning.pytorch.plugins.environments import LightningEnvironment
from matplotlib import pyplot as plt
import seaborn as sns

# import pandas as pd
from segger.data.utils import calculate_gene_celltype_abundance_embedding

# import scanpy as sc
import os
from lightning import LightningModule



segger_data_dir = Path("data_tidy/pyg_datasets/human_CRC_seg_cells")
models_dir = Path("./models/human_CRC_seg_cells")

Expand All @@ -43,14 +45,18 @@


model = Segger(
num_tx_tokens= num_tx_tokens,
num_tx_tokens=num_tx_tokens,
init_emb=8,
hidden_channels=32,
out_channels=16,
heads=4,
num_mid_layers=3,
)
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
model = to_hetero(
model,
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
aggr="sum",
)

batch = dm.train[0]
model.forward(batch.x_dict, batch.edge_index_dict)
Expand Down
23 changes: 12 additions & 11 deletions scripts/2_predict_5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,27 @@
import dask.dataframe as dd
import pandas as pd
from pathlib import Path

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUPY_CACHE_DIR"] = "./.cupy"


XENIUM_DATA_DIR = Path( #raw data dir
XENIUM_DATA_DIR = Path( # raw data dir
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
)
transcripts_file = (
XENIUM_DATA_DIR / "transcripts.parquet"
)
transcripts_file = XENIUM_DATA_DIR / "transcripts.parquet"

SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei") # preprocessed data dir
SEGGER_DATA_DIR = Path(
"data_tidy/pyg_datasets/human_CRC_seg_nuclei"
) # preprocessed data dir


seg_tag = "human_CRC_seg_nuclei"
model_version = 0
models_dir = Path("./models") / seg_tag #trained model dir
models_dir = Path("./models") / seg_tag # trained model dir


output_dir = Path( #output dir
output_dir = Path( # output dir
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_nuclei"
)

Expand Down Expand Up @@ -58,10 +59,10 @@
min_transcripts=5,
score_cut=0.5,
cell_id_col="segger_cell_id",
save_transcripts= True,
save_anndata= True,
save_cell_masks= False, # Placeholder for future implementation
use_cc=False, # if one wants fragments (groups of similar transcripts not attached to any nuclei)
save_transcripts=True,
save_anndata=True,
save_cell_masks=False, # Placeholder for future implementation
use_cc=False, # if one wants fragments (groups of similar transcripts not attached to any nuclei)
knn_method="kd_tree",
verbose=True,
gpu_ids=["0"],
Expand Down
2 changes: 1 addition & 1 deletion scripts/create_data_cosmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
)


cells = list(set(transcript_counts.index) & set(nucleus_polygons.index))
cells = list(set(transcript_counts.index) & set(nucleus_polygons.index))
nucleus_polygons = nucleus_polygons[cells]
transcript_counts = transcript_counts[cells]

Expand Down
9 changes: 3 additions & 6 deletions scripts/create_data_fast_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,14 @@
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
)
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")
SCRNASEQ_FILE = Path(
"data_tidy/Human_CRC/scRNAseq.h5ad"
)
SCRNASEQ_FILE = Path("data_tidy/Human_CRC/scRNAseq.h5ad")
CELLTYPE_COLUMN = "Level1"
scrnaseq = sc.read(SCRNASEQ_FILE)
sc.pp.subsample(scrnaseq, 0.1)
scrnaseq.var_names_make_unique()
# Calculate gene-celltype embeddings from reference data
gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
scrnaseq,
CELLTYPE_COLUMN
scrnaseq, CELLTYPE_COLUMN
)

# Initialize spatial transcriptomics sample object
Expand All @@ -61,7 +58,7 @@
n_workers=4,
sample_type="xenium",
# scale_factor=0.8,
weights=gene_celltype_abundance_embedding
weights=gene_celltype_abundance_embedding,
)

# # Load and filter datas
Expand Down
6 changes: 3 additions & 3 deletions scripts/create_data_merscope.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@
# CELLTYPE_COLUMN = 'celltype_minor'


MERSCOPE_DATA_DIR = Path('data_raw/merscope/processed/')
SEGGER_DATA_DIR = Path('data_tidy/pyg_datasets/merscope_liver')
MERSCOPE_DATA_DIR = Path("data_raw/merscope/processed/")
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/merscope_liver")
# SCRNASEQ_FILE = Path('/omics/groups/OE0606/internal/mimmo/MERSCOPE/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad')
# CELLTYPE_COLUMN = 'annot_v1'

Expand Down Expand Up @@ -80,4 +80,4 @@
frac=1.0, # Use all data
val_prob=0.3, # 30% validation set
test_prob=0, # No test set
)
)
6 changes: 1 addition & 5 deletions scripts/predict_model_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@
import dask.dataframe as dd



seg_tag = "human_CRC_seg_cells"
model_version = 0



XENIUM_DATA_DIR = Path(
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
)
Expand All @@ -32,9 +30,7 @@
benchmarks_dir = Path(
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_cells"
)
transcripts_file = (
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real/transcripts.parquet"
)
transcripts_file = "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real/transcripts.parquet"
# Initialize the Lightning data module
dm = SeggerDataModule(
data_dir=SEGGER_DATA_DIR,
Expand Down
10 changes: 8 additions & 2 deletions scripts/train_cosmx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from segger.training.segger_data_module import SeggerDataModule

# from segger.prediction.predict import predict, load_model
from segger.models.segger_model import Segger
from segger.training.train import LitSegger
Expand All @@ -9,14 +10,15 @@
from lightning.pytorch.plugins.environments import LightningEnvironment
from matplotlib import pyplot as plt
import seaborn as sns

# import pandas as pd
from segger.data.utils import calculate_gene_celltype_abundance_embedding

# import scanpy as sc
import os
from lightning import LightningModule



segger_data_dir = Path("data_tidy/pyg_datasets/cosmx_pancreas_degbugged")
models_dir = Path("./models/cosmx_pancreas")

Expand Down Expand Up @@ -50,7 +52,11 @@
heads=4,
num_mid_layers=3,
)
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
model = to_hetero(
model,
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
aggr="sum",
)

batch = dm.train[0]
model.forward(batch.x_dict, batch.edge_index_dict)
Expand Down
20 changes: 15 additions & 5 deletions scripts/train_mimmo_batch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from segger.training.segger_data_module import SeggerDataModule

# from segger.prediction.predict import predict, load_model
from segger.models.segger_model import Segger
from segger.training.train import LitSegger
Expand All @@ -9,16 +10,21 @@
from lightning.pytorch.plugins.environments import LightningEnvironment
from matplotlib import pyplot as plt
import seaborn as sns

# import pandas as pd
from segger.data.utils import calculate_gene_celltype_abundance_embedding

# import scanpy as sc
import os
from lightning import LightningModule



segger_data_dir = Path("data_tidy/pyg_datasets/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740")
models_dir = Path("./models/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740")
segger_data_dir = Path(
"data_tidy/pyg_datasets/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740"
)
models_dir = Path(
"./models/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740"
)

# Base directory to store Pytorch Lightning models
# models_dir = Path('models')
Expand All @@ -44,14 +50,18 @@

model = Segger(
# is_token_based=is_token_based,
num_tx_tokens= num_tx_tokens,
num_tx_tokens=num_tx_tokens,
init_emb=8,
hidden_channels=32,
out_channels=16,
heads=4,
num_mid_layers=3,
)
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
model = to_hetero(
model,
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
aggr="sum",
)

batch = dm.train[0]
model.forward(batch.x_dict, batch.edge_index_dict)
Expand Down
10 changes: 7 additions & 3 deletions scripts/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from lightning import Trainer

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=Path, required=True)
parser.add_argument("--data_dir", type=Path, required=True)
args = parser.parse_args()

segger_data_dir = args.data_dir
Expand All @@ -32,7 +32,11 @@
heads=4,
num_mid_layers=2,
)
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
model = to_hetero(
model,
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
aggr="sum",
)

ls = LitSegger(model=model)

Expand All @@ -46,4 +50,4 @@
logger=CSVLogger(models_dir),
)

trainer.fit(ls, datamodule=dm)
trainer.fit(ls, datamodule=dm)
Loading