diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml index 243e782..ddfac57 100644 --- a/.github/actions/nf-test/action.yml +++ b/.github/actions/nf-test/action.yml @@ -59,8 +59,6 @@ runs: - name: Run nf-test shell: bash env: - NFT_DIFF: ${{ env.NFT_DIFF }} - NFT_DIFF_ARGS: ${{ env.NFT_DIFF_ARGS }} NFT_WORKDIR: ${{ env.NFT_WORKDIR }} run: | nf-test test \ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index e936511..1330101 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -24,7 +24,7 @@ jobs: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required + # You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters with: diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml index f03aea0..325bc15 100644 --- a/.github/workflows/nf-test.yml +++ b/.github/workflows/nf-test.yml @@ -99,8 +99,6 @@ jobs: - name: Run nf-test uses: ./.github/actions/nf-test env: - NFT_DIFF: ${{ env.NFT_DIFF }} - NFT_DIFF_ARGS: ${{ env.NFT_DIFF_ARGS }} NFT_WORKDIR: ${{ env.NFT_WORKDIR }} with: profile: ${{ matrix.profile }} diff --git a/.github/workflows/template-version-comment.yml b/.github/workflows/template-version-comment.yml index beb5c77..c75629b 100644 --- a/.github/workflows/template-version-comment.yml +++ b/.github/workflows/template-version-comment.yml @@ -2,7 +2,8 @@ name: nf-core template version comment # This workflow is triggered on PRs to check if the pipeline template version matches the latest nf-core version. # It posts a comment to the PR, even if it comes from a fork. 
-on: pull_request_target +on: + pull_request: jobs: template_version: diff --git a/.gitignore b/.gitignore index a42ce01..14290da 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ testing/ testing* *.pyc null/ +.idea/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b69c26..59b1667 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v1.1.0 - [date] +- Preprint is out now! Linking it in the documentation. +- Added support for AWS: changed the structure of load response and parameter check to conform more to Nextflow + best practices. +- Simplified visualization: multiple short processes were creating overhang → more efficient in one process. +- Fixed errors that arose from the latest drevalpy version. +- Added authors and licenses to the python scripts. +- Moved all publishDir directives to modules.config. +- Fixed drevalpy versions in conda and docker to 1.3.4: now supporting Python 3.13 +- Added no_hyperparameter_tuning option for quick runs without hyperparameter tuning. +- Flag final_model_on_full data: if True, a final/production model is saved in the results directory. If hyperparameter_tuning is true, the final model is tuned, too. The model can later be loaded using the implemented load functions of the drevalpy models. + +## v1.0.0 - [date] + Initial release of nf-core/drugresponseeval, created with the [nf-core](https://nf-co.re/) template. ### `Added` +- Updated to the new template +- Added tests that run with docker, singularity, apptainer, and conda +- Added the docker container and the conda env.yml in the nextflow.config. We just need one container for all + processes as this pipeline automates the PyPI package drevalpy. +- Added usage and output documentation. 
+- Added CurveCurator to preprocess curves of custom datasets + ### `Fixed` +- Fixed linting issues +- Fixed bugs with path_data: can now be handled as absolute and relative paths + ### `Dependencies` ### `Deprecated` diff --git a/CITATIONS.md b/CITATIONS.md index dd7e54a..8e6f818 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,5 +1,9 @@ # nf-core/drugresponseeval: Citations +## [DrugResponseEval](https://github.com/nf-core/drugresponseeval/) + +> Bernett, J., Iversen, P., Picciani, M., Wilhelm, M., Baum, K., & List, M. **From Hype to Health Check: Critical Evaluation of Drug Response Prediction Models with DrEval.** > [10.1101/2025.05.26.655288](https://doi.org/10.1101/2025.05.26.655288) > _bioRxiv_, 2025-05. + ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) > Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. @@ -10,6 +14,30 @@ ## Pipeline tools +- [DrEvalPy](https://github.com/daisybio/drevalpy): The pipeline mostly automates the individual steps of the DrEvalPy PyPI package. + + > Bernett, J, Iversen, P, Picciani, M, Wilhelm, M, Baum, K, List, M. Will be published soon. + +- [CurveCurator](https://www.nature.com/articles/s41467-023-43696-z): For custom curve fitting on custom datasets. We also used it to re-process the response curves of GDSC1, GDSC2, CCLE, and CTRP. + + > Bayer, F.P., Gander, M., Kuster, B., The, M. CurveCurator: a recalibrated F-statistic to assess, classify, and explore significance of dose–response curves. Nature Communications. 2023 Nov;14(7902). + +- [DIPK](https://doi.org/10.1093/bib/bbae153): Implemented model in the pipeline. + + > Li P, Jiang Z, Liu T, Liu X, Qiao H, Yao X. Improving drug response prediction via integrating gene relationships with deep learning. Briefings in Bioinformatics. 2024 May;25(3):bbae153. 
+ +- [MOLI](https://doi.org/10.1093/bioinformatics/btz318): Implemented model in the pipeline. + + > Sharifi-Noghabi H, Zolotareva O, Collins CC, Ester M. MOLI: multi-omics late integration with deep neural networks for drug response prediction. Bioinformatics. 2019 Jul;35(14):i501-9. + +- [SRMF](https://doi.org/10.1186/s12885-017-3500-5): Implemented model in the pipeline. + + > Wang L, Li X, Zhang L, Gao Q. Improved anticancer drug response prediction in cell lines using matrix factorization with similarity regularization. BMC cancer. 2017 Dec;17:1-2. + +- [SuperFELT](https://doi.org/10.1186/s12859-021-04146-z): Implemented model in the pipeline. + + > Park S, Soh J, Lee H. Super. FELT: supervised feature extraction learning using triplet loss for drug response prediction with multi-omics data. BMC bioinformatics. 2021 May 25;22(1):269. + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index 1e2bf69..6ad4b24 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ [![GitHub Actions CI Status](https://github.com/nf-core/drugresponseeval/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/drugresponseeval/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions Linting Status](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml)[![AWS 
CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/drugresponseeval/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.14779984-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.14779984) + [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) [![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) @@ -20,48 +21,43 @@ ## Introduction -**nf-core/drugresponseeval** is a bioinformatics pipeline that ... - - - - - +# ![drevalpy_summary](assets/dreval_summary.svg) + +**DrEval** is a bioinformatics framework that includes a PyPI package (drevalpy) and a Nextflow +pipeline (this repo). DrEval ensures that evaluations are statistically sound, biologically +meaningful, and reproducible. DrEval simplifies the implementation of drug response prediction +models, allowing researchers to focus on advancing their modeling innovations by automating +standardized evaluation protocols and preprocessing workflows. With DrEval, hyperparameter +tuning is fair and consistent. With its flexible model interface, DrEval supports any model type, +ranging from statistical models to complex neural networks. By contributing your model to the +DrEval catalog, you can increase your work's exposure, reusability, and transferability. + +1. The response data is loaded +2. All models are trained and evaluated in a cross-validation setting +3. For each CV split, the best hyperparameters are determined using a grid search per model +4. The model is trained on the full training set (train & validation) with the best + hyperparameters to predict the test set +5. If randomization tests are enabled, the model is trained on the full training set with the best + hyperparameters to predict the randomized test set +6. 
If robustness tests are enabled, the model is trained N times on the full training set with the + best hyperparameters +7. Plots are created summarizing the results + +For baseline models, no randomization or robustness tests are performed. ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - - Now, you can run the pipeline using: - - ```bash nextflow run nf-core/drugresponseeval \ -profile \ - --input samplesheet.csv \ - --outdir + --models \ + --baselines \ + --dataset_name ``` > [!WARNING] @@ -77,24 +73,36 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/drugresponseeval was originally written by Judith Bernett. +nf-core/drugresponseeval was originally written by Judith Bernett (TUM) and Pascal Iversen (FU +Berlin). We thank the following people for their extensive assistance in the development of this pipeline: - - ## Contributions and Support +Contributors to nf-core/drugresponseeval and the drevalpy PyPI package: + +- [Judith Bernett](https://github.com/JudithBernett) (TUM) +- [Pascal Iversen](https://github.com/PascalIversen) (FU Berlin) +- [Mario Picciani](https://github.com/picciama) (TUM) + If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on the [Slack `#drugresponseeval` channel](https://nfcore.slack.com/channels/drugresponseeval) (you can join with [this invite](https://nf-co.re/join/slack)). 
## Citations - - +If you use nf-core/drugresponseeval for your analysis, please cite it using the following doi: [10.5281/zenodo.14779984](https://doi.org/10.5281/zenodo.14779984) + +> Our corresponding publication is at doi [10.1101/2025.05.26.655288](https://doi.org/10.1101/2025.05.26.655288) +> +> Bernett, J., Iversen, P., Picciani, M., Wilhelm, M., Baum, K., & List, M. **From Hype to Health Check: Critical Evaluation of Drug Response Prediction Models with DrEval.** +> +> _bioRxiv_, 2025-05. + +The underlying data is available at doi: [10.5281/zenodo.12633909](https://doi.org/10.5281/zenodo.12633909). - +The underlying Python package is drevalpy, available on [PyPI](https://pypi.org/project/drevalpy/) as a standalone package, for which we also have an extensive [ReadTheDocs Documentation](https://drevalpy.readthedocs.io/en/latest/). An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/NO_FILE b/assets/NO_FILE new file mode 100644 index 0000000..e69de29 diff --git a/assets/dreval_summary.svg b/assets/dreval_summary.svg new file mode 100644 index 0000000..0c52631 --- /dev/null +++ b/assets/dreval_summary.svg @@ -0,0 +1,49 @@ +Commonly used response datasets316 506 (269 024)545886CTRPv2Curves Unique treatments Unique cell lines 234 437 (197 324) 354 (180) 243 (24)CTRPv111 670 (4611)24 (23) 503 (54)CCLE60 758 (35 682)378 (266)970 (394)GDSC1395 025287 (204)969 (393)GDSC2In brackets: not occurring in CTRPv2LDO generalizationLCO generalizationSimple models to avoid overfitting Unbiased evaluationTrue responsePredicted responseGlobal Pearson: 0.90Avg. 
Pearson per drug: 0.01Corrected metricsNaive baselinesFair hyperparameter tuning of all baseline & competitor modelsAblation studiesMore consistent data & metricsOriginal metricCurveCurator metricAUClnIC50pEC50Response agreement across datasetsApplication-aware splitsLeave-Pairs-Out (LPO): Missing value imputationLeave-Cell-Lines-Out (LCO): Personalized medicineLeave-Tissue-Out (LTO):Drug repurposingLeave-Drugs-Out (LDO): Drug designCell linesDrugsk foldsTestValidationLoad responsePreprocess response with CurveCuratorRandomization testsRobustness testsPredict CV test sets (+ cross-study datasets) with optimal hyperparametersInput optionsdrevalpyCross-validation with inner hold-outEvaluationVisualizationHTML reportCritical Difference Diagram: MSEOverall Friedman-Chi2 p-value: 2.26e-19ModelsSRMFDIPKMOLIRSuperFELTRBaselinesNaive PredictorNaive Cell Line Mean PredictorNaive Drug Mean PredictorNaive Mean Effects PredictorElasticNetGradient BoostingRandom ForestMulti-OMICs RFSVRSimple Neural NetworkMulti-OMICs Neural NetworkPackage versionsAvailable as packagePreprocessing scriptsDocumentationCode availableAvailable as pipelineEasily extendableFAIReR1 class ProteomicsRandomForest (RandomForest):2 cell_line_views = ["proteomics"]34 @classmethod5 def get_model_name(cls) -> str:6 return "ProteomicsRandomForest"7 8 def load_cell_line_features(9 self,10 data_path: str,11 dataset_name: str12 ) -> FeatureDataset:13 return load_and_select_gene_features(14 feature_type="proteomics",15 gene_list=None,16 data_path=data_path,17 dataset_name=dataset_name18 )Morgan fingerprintsMolGNet embeddingsRNAseq gene expessionMicroarray gene expressionCopy number variationRRBS methylationBeadChip methylationMutationsDIA protein expressionBIONIC gene embeddingsVariable input features diff --git a/bin/collect_results.py b/bin/collect_results.py new file mode 100755 index 0000000..2e9e1f9 --- /dev/null +++ b/bin/collect_results.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +# Written by Judith 
def get_parser():
    """Build the command-line parser of the result collection script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--outfiles`` (required, one
        or more paths) and ``--path_data`` (defaults to ``"data"``).
    """
    parser = argparse.ArgumentParser(description="Collect results and write to single files.")
    parser.add_argument("--outfiles", type=str, nargs="+", required=True, help="Output files.")
    parser.add_argument("--path_data", type=str, default="data", help="Data directory path")
    return parser


def parse_results(outfiles):
    """Sort result paths into four groups by the file-name pattern they contain.

    Args:
        outfiles: iterable of path strings.

    Returns:
        tuple of four lists, in this order: overall evaluation results,
        per-drug results, per-cell-line results, true-vs-predicted files.
        Paths matching none of the patterns are dropped.
    """
    # get all files with the pattern f'{model_name}_evaluation_results.csv' from outfiles
    result_files = [file for file in outfiles if "evaluation_results.csv" in file]
    # get all files with the pattern f'{model_name}_evaluation_results_per_drug.csv' from outfiles
    result_per_drug_files = [file for file in outfiles if "evaluation_results_per_drug.csv" in file]
    # get all files with the pattern f'{model_name}_evaluation_results_per_cl.csv' from outfiles
    result_per_cl_files = [file for file in outfiles if "evaluation_results_per_cl.csv" in file]
    # get all files with the pattern f'{model_name}_true_vs_pred.csv' from outfiles
    t_vs_pred_files = [file for file in outfiles if "true_vs_pred.csv" in file]
    return result_files, result_per_drug_files, result_per_cl_files, t_vs_pred_files


def collapse_file(files):
    """Read several CSV files and stack them into one data frame.

    Each file is read with its first column as index. All frames are
    concatenated in a single ``pd.concat`` call; concatenating inside the loop
    would copy the accumulated frame once per file.

    Args:
        files: list of CSV paths (may be empty).

    Returns:
        pandas.DataFrame with the rows of all files in input order, or ``None``
        when ``files`` is empty. If a ``drug`` column exists it is cast to str.
    """
    frames = [pd.read_csv(file, index_col=0) for file in files]
    if not frames:
        return None
    out_df = pd.concat(frames)
    if "drug" in out_df.columns:
        out_df["drug"] = out_df["drug"].astype(str)
    return out_df


if __name__ == "__main__":
    args = get_parser().parse_args()
    # parse the results from outfiles.outfiles
    outfiles = args.outfiles
    path_data = pathlib.Path(args.path_data)
    eval_result_files, eval_result_per_drug_files, eval_result_per_cl_files, true_vs_pred_files = parse_results(outfiles)

    # collapse the results into single dataframes
    eval_results = collapse_file(eval_result_files)
    eval_results_per_drug = collapse_file(eval_result_per_drug_files)
    eval_results_per_cell_line = collapse_file(eval_result_per_cl_files)
    t_vs_p = collapse_file(true_vs_pred_files)

    # prepare the results through introducing new columns algorithm, rand_setting, LPO_LCO_LDO, split, CV_split
    eval_results, eval_results_per_drug, eval_results_per_cell_line, t_vs_p = prep_results(
        eval_results=eval_results,
        eval_results_per_drug=eval_results_per_drug,
        eval_results_per_cell_line=eval_results_per_cell_line,
        t_vs_p=t_vs_p,
        path_data=path_data
    )

    # save the results to csv files; path_out="" is passed through unchanged
    write_results(
        path_out="",
        eval_results=eval_results,
        eval_results_per_drug=eval_results_per_drug,
        eval_results_per_cl=eval_results_per_cell_line,
        t_vs_p=t_vs_p,
    )
def get_parser():
    """Build the command-line parser for consolidating SingleDrugModel results.

    Returns:
        argparse.ArgumentParser: parser with the required options ``--run_id``,
        ``--test_mode``, ``--model_name``, ``--outdir_path``, ``--n_cv_splits``,
        ``--randomization_modes``, ``--n_trials_robustness`` and the optional
        multi-valued ``--cross_study_datasets``.
    """
    parser = argparse.ArgumentParser(description="Consolidate results for SingleDrugModels")
    parser.add_argument('--run_id', type=str, required=True, help="Run ID")
    parser.add_argument("--test_mode", type=str, required=True, help="Test mode (LPO, LCO, LDO)")
    parser.add_argument("--model_name", type=str, required=True, help="All Model names")
    parser.add_argument("--outdir_path", type=str, required=True, help="Output directory path")
    parser.add_argument("--n_cv_splits", type=int, required=True, help="Number of CV splits")
    parser.add_argument("--cross_study_datasets", type=str, nargs="+", help="All cross-study datasets")
    parser.add_argument("--randomization_modes", type=str, required=True, help="All randomizations")
    parser.add_argument("--n_trials_robustness", type=int, required=True, help="Number of trials")
    return parser


def parse_randomization_modes(raw):
    """Turn the textual list of randomization modes into a Python list.

    Accepts the bracketed form ``"[A, B]"`` as well as un-bracketed input and
    comma separators with or without a following space. The earlier
    ``split('[')[1]`` chain raised IndexError when the brackets were missing.

    Args:
        raw: string such as ``"[None]"`` or ``"[SVRC, SVCD]"``.

    Returns:
        list of mode names, or ``None`` when the input is ``"[None]"``,
        ``"None"`` or contains no mode at all.
    """
    cleaned = raw.strip()
    if cleaned.startswith("[") and cleaned.endswith("]"):
        cleaned = cleaned[1:-1]
    modes = [mode.strip() for mode in cleaned.split(",") if mode.strip()]
    if not modes or modes == ["None"]:
        return None
    return modes


def main():
    """Parse the command line and consolidate the predictions of one model.

    The results directory is ``<outdir_path>/<run_id>/<test_mode>``; the
    consolidated output is written with ``out_path=""``.
    """
    parser = get_parser()
    args = parser.parse_args()
    results_path = os.path.join(
        args.outdir_path,
        args.run_id,
        args.test_mode,
    )
    randomizations = parse_randomization_modes(args.randomization_modes)
    model = MODEL_FACTORY[args.model_name]
    # --cross_study_datasets is optional; pass an empty list when it is absent
    cross_study_datasets = args.cross_study_datasets if args.cross_study_datasets is not None else []
    consolidate_single_drug_model_predictions(
        models=[model],
        n_cv_splits=args.n_cv_splits,
        results_path=results_path,
        cross_study_datasets=cross_study_datasets,
        randomization_mode=randomizations,
        n_trials_robustness=args.n_trials_robustness,
        out_path=""
    )


if __name__ == "__main__":
    main()
def get_parser():
    """Build the command-line parser of the CV splitting script.

    Returns:
        argparse.ArgumentParser: parser with ``--response`` (required),
        ``--n_cv_splits`` (required int) and ``--test_mode`` (default ``"LPO"``).
    """
    parser = argparse.ArgumentParser(description="Split data into CV splits")
    parser.add_argument("--response", type=str, required=True, help="Path to response data")
    parser.add_argument("--n_cv_splits", type=int, required=True, help="Number of CV splits")
    parser.add_argument("--test_mode", type=str, default="LPO", help="Test mode (LPO, LCO, LDO)")
    return parser


def main():
    """Load the pickled response data, split it and pickle every CV split.

    Each split is written to ``split_<index>.pkl`` in the current directory.
    """
    parser = get_parser()
    args = parser.parse_args()
    # NOTE(review): pickle executes code on load; only use files produced by this pipeline.
    # The context manager closes the handle, which pickle.load(open(...)) did not.
    with open(args.response, "rb") as response_file:
        response_data = pickle.load(response_file)
    response_data.remove_nan_responses()
    response_data.split_dataset(
        n_cv_splits=args.n_cv_splits,
        mode=args.test_mode,
        split_validation=True,
        split_early_stopping=True,
        validation_ratio=0.1,
        random_state=42,
    )
    for split_index, split in enumerate(response_data.cv_splits):
        with open(f"split_{split_index}.pkl", "wb") as f:
            pickle.dump(split, f)


if __name__ == "__main__":
    main()
    sys.exit(0)
def get_parser():
    """Build the command-line parser of the hyperparameter selection script.

    Returns:
        argparse.ArgumentParser: parser with ``--model_name``, ``--split_id``,
        ``--hpam_yamls`` (one or more paths), ``--pred_datas`` (one or more
        paths) and ``--metric``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate the predictions of all hyperparameter combinations and write out the best one"
    )
    parser.add_argument("--model_name", type=str, help="model name")
    parser.add_argument("--split_id", type=str, help="split id")
    parser.add_argument("--hpam_yamls", nargs="+", help="paths to hpam yamls")
    parser.add_argument("--pred_datas", nargs="+", help="paths to pred datas")
    parser.add_argument("--metric", type=str, help="metric")
    return parser


def best_metric(metric, current_metric, best_metric):
    """Tell whether ``current_metric`` improves on ``best_metric``.

    Args:
        metric: metric name; decides whether lower or higher is better.
        current_metric: value of the candidate.
        best_metric: best value seen so far.

    Returns:
        bool: True for a strictly smaller value of a minimization metric or a
        strictly larger value of a maximization metric, False otherwise.

    Raises:
        ValueError: if ``metric`` is in neither metric collection.
    """
    if metric in MINIMIZATION_METRICS:
        return current_metric < best_metric
    if metric in MAXIMIZATION_METRICS:
        return current_metric > best_metric
    raise ValueError(f"Metric {metric} not recognized.")


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    best_hpam_combi = None
    best_result = None
    # pred_datas[i] and hpam_yamls[i] belong together; indexing (rather than zip)
    # keeps the IndexError when fewer yaml files than prediction files are given
    for i, pred_path in enumerate(args.pred_datas):
        # NOTE(review): pickle executes code on load; only use files produced by this pipeline
        with open(pred_path, "rb") as pred_file:
            pred_data = pickle.load(pred_file)
        with open(args.hpam_yamls[i], "r") as yaml_file:
            hpam_combi = yaml.load(yaml_file, Loader=yaml.FullLoader)
        results = evaluate(pred_data, args.metric)
        # the first candidate is always accepted, later ones only if strictly better
        if best_result is None or best_metric(args.metric, results[args.metric], best_result):
            best_result = results[args.metric]
            best_hpam_combi = hpam_combi
    final_result = {
        f"{args.model_name}_{args.split_id}": {"best_hpam_combi": best_hpam_combi, "best_result": best_result}
    }
    with open(f"best_hpam_combi_{args.split_id}.yaml", "w") as yaml_file:
        yaml.dump(final_result, yaml_file, default_flow_style=False)
def get_parser():
    """Create the argument parser for evaluating the final model's predictions.

    Returns:
        argparse.ArgumentParser: parser with ``--test_mode`` (default ``"LPO"``)
        and the required ``--model_name`` and ``--pred_file``.
    """
    cli = argparse.ArgumentParser(description="Evaluate the predictions from the final model.")
    cli.add_argument("--test_mode", type=str, default="LPO", help="Test mode (LPO, LCO, LDO, LTO).")
    cli.add_argument("--model_name", type=str, required=True, help="Model name.")
    cli.add_argument("--pred_file", type=str, required=True, help="Path to predictions.")
    return cli


def write_results(overall_eval, evaluation_results_per_drug, evaluation_results_per_cl, true_vs_pred, model_name):
    """Write the evaluation tables as CSV files into the current directory.

    The overall table and the true-vs-predicted table are always written; the
    per-drug and per-cell-line tables are skipped when they are ``None``.
    Every file name is prefixed with ``model_name``.
    """
    targets = [
        (overall_eval, f"{model_name}_evaluation_results.csv", True),
        (evaluation_results_per_drug, f"{model_name}_evaluation_results_per_drug.csv", False),
        (evaluation_results_per_cl, f"{model_name}_evaluation_results_per_cl.csv", False),
        (true_vs_pred, f"{model_name}_true_vs_pred.csv", True),
    ]
    for frame, csv_name, mandatory in targets:
        if mandatory or frame is not None:
            frame.to_csv(csv_name)


if __name__ == "__main__":
    cli_args = get_parser().parse_args()
    overall, per_drug, per_cell_line, true_vs_predicted, evaluated_name = evaluate_file(
        test_mode=cli_args.test_mode, model_name=cli_args.model_name, pred_file=cli_args.pred_file
    )
    write_results(overall, per_drug, per_cell_line, true_vs_predicted, evaluated_name)
def get_parser():
    """Build the command-line parser of the final-model split script.

    Returns:
        argparse.ArgumentParser: parser with the required ``--response``,
        ``--model_name``, ``--path_data`` and ``--test_mode`` (default ``"LPO"``).
    """
    parser = argparse.ArgumentParser(
        description="Train a final model on the full dataset for future predictions."
    )
    parser.add_argument("--response", type=str, required=True, help="Drug response data, pickled (output of load_response).")
    parser.add_argument("--model_name", type=str, required=True, help="Model name.")
    parser.add_argument("--path_data", type=str, required=True, help="Path to data.")
    parser.add_argument("--test_mode", type=str, default="LPO", help="Test mode (LPO, LCO, LTO, LDO).")
    return parser


if __name__ == "__main__":
    arg_parser = get_parser()
    args = arg_parser.parse_args()

    model_class = MODEL_FACTORY[args.model_name]
    model = model_class()

    # NOTE(review): pickle executes code on load; only use files produced by this pipeline.
    # The context manager closes the handle, which pickle.load(open(...)) did not.
    with open(args.response, "rb") as response_file:
        response_data = pickle.load(response_file)
    response_data.remove_nan_responses()

    # restrict the responses to the cell lines / drugs the model has features for
    cl_features = model.load_cell_line_features(data_path=args.path_data, dataset_name=response_data.dataset_name)
    drug_features = model.load_drug_features(data_path=args.path_data, dataset_name=response_data.dataset_name)
    cell_lines_to_keep = cl_features.identifiers
    # drug features may be None; in that case no drug filter is passed
    drugs_to_keep = drug_features.identifiers if drug_features is not None else None
    response_data.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep)

    train_dataset, validation_dataset = make_train_val_split(response_data, test_mode=args.test_mode, val_ratio=0.1)

    if model_class.early_stopping:
        validation_dataset, early_stopping_dataset = _split_early_stopping_data(validation_dataset, args.test_mode)
    else:
        early_stopping_dataset = None

    # save; early_stopping_dataset.pkl is written even when it only holds None
    with open('training_dataset.pkl', 'wb') as f:
        pickle.dump(train_dataset, f)
    with open('validation_dataset.pkl', 'wb') as f:
        pickle.dump(validation_dataset, f)
    with open('early_stopping_dataset.pkl', 'wb') as f:
        pickle.dump(early_stopping_dataset, f)
def get_parser():
    """Create the argument parser for splitting a model's hyperparameter set.

    Returns:
        argparse.ArgumentParser: parser with ``--model_name`` and the boolean
        flag ``--hyperparameter_tuning`` (False unless given).
    """
    cli = argparse.ArgumentParser(
        description="Take model name, get hyperparameters, and split into single hyperparameter combinations"
    )
    cli.add_argument("--model_name", type=str, help="model name")
    cli.add_argument("--hyperparameter_tuning", action="store_true", default=False,
                     help="if set, hyperparameter tuning is performed, otherwise only the first combination is used")
    return cli


if __name__ == "__main__":
    cli_args = get_parser().parse_args()
    requested = cli_args.model_name
    if requested in MULTI_DRUG_MODEL_FACTORY:
        model_name = requested
    else:
        # otherwise only the part before the first "." is used as the model name
        model_name = str(requested).split(".")[0]
        assert model_name in SINGLE_DRUG_MODEL_FACTORY, (f"{model_name} neither in "
                                                         f"SINGLE_DRUG_MODEL_FACTORY nor in "
                                                         f"MULTI_DRUG_MODEL_FACTORY.")
    combinations = MODEL_FACTORY[model_name].get_hyperparameter_set()
    if not cli_args.hyperparameter_tuning:
        # without tuning, only the first combination is written
        combinations = [combinations[0]]
    # one yaml file per combination: hpam_0.yaml, hpam_1.yaml, ...
    for position, combination in enumerate(combinations):
        with open(f"hpam_{position}.yaml", "w") as yaml_file:
            yaml.dump(combination, yaml_file, default_flow_style=False)
def get_parser():
    """Create the argument parser for loading a drug response dataset.

    Returns:
        argparse.ArgumentParser: parser with ``--response_dataset`` (default
        ``"data"``), the flags ``--cross_study_dataset`` and ``--no_refitting``
        and ``--measure`` (default ``"LN_IC50"``).
    """
    cli = argparse.ArgumentParser(description="Load data for drug response prediction.")
    cli.add_argument("--response_dataset", type=str, default="data", help="Path to the drug response file.")
    cli.add_argument(
        "--cross_study_dataset",
        action="store_true",
        default=False,
        help="Whether to load cross-study datasets.",
    )
    cli.add_argument(
        "--no_refitting",
        action="store_true",
        default=False,
        help="If the CurveCurated measures should not be used.",
    )
    cli.add_argument(
        "--measure",
        type=str,
        default="LN_IC50",
        help="Name of the column in the dataset containing the drug response measures."
    )
    return cli


def main(args):
    """Load one response dataset from CSV and pickle it.

    The dataset name is the stem of ``args.response_dataset``; the CSV is read
    from ``<dataset_name>.csv`` relative to the current directory. The result
    goes to ``cross_study_<dataset_name>.pkl`` when ``args.cross_study_dataset``
    is set, otherwise to ``response_dataset.pkl``.
    """
    dataset_name = pathlib.Path(args.response_dataset).stem
    csv_path = pathlib.Path(f"{dataset_name}.csv")
    if dataset_name not in AVAILABLE_DATASETS:
        # custom dataset: pass the tissue column only if the CSV header has it
        header = pd.read_csv(csv_path, nrows=1).columns
        tissue_column = TISSUE_IDENTIFIER if TISSUE_IDENTIFIER in header else None
        response_data = DrugResponseDataset.from_csv(
            input_file=csv_path, dataset_name=dataset_name, measure=args.measure, tissue_column=tissue_column
        )
    else:
        # known dataset: pubchem_id is read as string
        table = pd.read_csv(csv_path, dtype={"pubchem_id": str})
        response_data = DrugResponseDataset(
            response=table[args.measure].values,
            cell_line_ids=table[CELL_LINE_IDENTIFIER].values,
            drug_ids=table[DRUG_IDENTIFIER].values,
            tissues=table[TISSUE_IDENTIFIER].values,
            dataset_name=dataset_name,
        )
    if args.cross_study_dataset:
        outfile = f"cross_study_{dataset_name}.pkl"
    else:
        outfile = "response_dataset.pkl"
    # Pickle the object to a file
    with open(outfile, "wb") as handle:
        pickle.dump(response_data, handle)


if __name__ == "__main__":
    main(get_parser().parse_args())
def get_parser():
    """Build the command-line parser of the model-list script.

    Returns:
        argparse.ArgumentParser: parser with the required ``--models``,
        ``--data`` and ``--file_name`` options.
    """
    # the description used to be a copy of the cv_split one ("Split data into CV splits")
    parser = argparse.ArgumentParser(description="Write the list of models to run to a text file")
    parser.add_argument("--models", type=str, required=True, help="List of models")
    parser.add_argument("--data", type=str, required=True, help="Path to response data")
    parser.add_argument("--file_name", type=str, required=True, help="Name of the file")
    return parser


def main():
    """Expand the requested models for the given response data and list them.

    ``--models`` is a textual list such as ``"[A, B]"``. The output file
    ``<file_name>_<dataset_name>.txt`` gets one ``value,key`` line per entry of
    the mapping returned by ``make_model_list``.
    """
    parser = get_parser()
    args = parser.parse_args()
    # drop the brackets, then split on commas with or without a following space
    raw_models = args.models.replace("[", "").replace("]", "")
    model_names = [name.strip() for name in raw_models.split(",") if name.strip()]
    # NOTE(review): pickle executes code on load; only use files produced by this pipeline.
    # The context manager closes the handle, which pickle.load(open(...)) did not.
    with open(args.data, "rb") as data_file:
        response_data = pickle.load(data_file)
    dataset_name = response_data.dataset_name
    models = [MODEL_FACTORY[model] for model in model_names]
    all_models = make_model_list(models, response_data)
    with open(f'{args.file_name}_{dataset_name}.txt', 'w', encoding='utf-8') as f:
        for model, model_class in all_models.items():
            f.write(f"{model_class},{model}\n")


if __name__ == "__main__":
    main()
+ +from drevalpy.datasets.curvecurator import postprocess +import argparse + + +def get_parser(): + parser = argparse.ArgumentParser(description="Postprocess CurveCurator viability data.") + parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name.") + return parser + + +def main(args): + postprocess(output_folder='./', dataset_name=args.dataset_name) + + +if __name__ == "__main__": + arg_parser = get_parser() + args = arg_parser.parse_args() + main(args) diff --git a/bin/preprocess_raw_viability.py b/bin/preprocess_raw_viability.py new file mode 100755 index 0000000..b0e1804 --- /dev/null +++ b/bin/preprocess_raw_viability.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +# Written by Mario Picciani with subsequent reworking by Judith Bernett. Released under the MIT License. + +from drevalpy.datasets.curvecurator import preprocess +from pathlib import Path +import argparse + + +def get_parser(): + parser = argparse.ArgumentParser(description="Preprocess CurveCurator viability data.") + parser.add_argument("--path_data", type=str, default="", help="Path to base folder containing datasets.") + parser.add_argument("--dataset_name", type=str, required=True, help="Dataset name.") + parser.add_argument("--cores", type=int, default=0, help="The number of cores used for CurveCurator fitting.") + return parser + + +def main(args): + input_file = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv" + preprocess( + input_file=input_file, + output_dir=args.dataset_name, + dataset_name=args.dataset_name, + cores=args.cores + ) + + +if __name__ == "__main__": + arg_parser = get_parser() + args = arg_parser.parse_args() + main(args) diff --git a/bin/randomization_split.py b/bin/randomization_split.py new file mode 100755 index 0000000..d4b5c7a --- /dev/null +++ b/bin/randomization_split.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +# Written by Judith Bernett and released under the MIT License. 
+ +import argparse +import yaml + +from drevalpy.models import MODEL_FACTORY +from drevalpy.experiment import get_randomization_test_views + + +def get_parser(): + parser = argparse.ArgumentParser(description="Create randomization test views.") + parser.add_argument("--model_name", type=str, required=True, help="Name of the model to use.") + parser.add_argument("--randomization_mode", type=str, required=True, help="Randomization mode to use.") + return parser + + +def main(args): + model_class = MODEL_FACTORY[args.model_name] + model = model_class() + + randomization_test_views = get_randomization_test_views(model=model, randomization_mode=[args.randomization_mode]) + for test_name, views in randomization_test_views.items(): + for view in views: + rand_dict = {"test_name": test_name, "view": view} + with open(f'randomization_test_view_{test_name}.yaml', "w") as f: + yaml.dump(rand_dict, f) + + +if __name__ == "__main__": + arg_parser = get_parser() + all_args = arg_parser.parse_args() + main(all_args) diff --git a/bin/train_and_predict_cv.py b/bin/train_and_predict_cv.py new file mode 100755 index 0000000..d2218c6 --- /dev/null +++ b/bin/train_and_predict_cv.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +# Written by Judith Bernett and released under the MIT License. 
+ +import argparse +import sys +import pickle +import yaml + + +from drevalpy.models import MODEL_FACTORY +from drevalpy.experiment import train_and_predict, get_model_name_and_drug_id, get_datasets_from_cv_split +from drevalpy.utils import get_response_transformation + + +def get_parser(): + parser = argparse.ArgumentParser(description="Train and predict using a drug response prediction model.") + parser.add_argument("--model_name", type=str, help="model to evaluate or list of models to compare") + parser.add_argument("--path_data", type=str, default="data", help="Data directory path") + parser.add_argument("--test_mode", type=str, default="LPO", help="Test mode (LPO, LCO, LDO)") + parser.add_argument("--hyperparameters", type=str, help="hyperparameters for the model") + parser.add_argument("--cv_data", type=str, help="path to the cv data split") + parser.add_argument("--response_transformation", type=str, help="response transformation to apply to the dataset") + parser.add_argument("--model_checkpoint_dir", type=str, default="TEMPORARY", help="model checkpoint directory, if not provided: temporary directory is used") + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + model_name, drug_id = get_model_name_and_drug_id(args.model_name) + + model_class = MODEL_FACTORY[model_name] + split = pickle.load(open(args.cv_data, "rb")) + + train_dataset, validation_dataset, es_dataset, test_dataset = get_datasets_from_cv_split( + split, model_class, model_name, drug_id) + + response_transform = get_response_transformation(args.response_transformation) + hpams = yaml.load(open(args.hyperparameters, "r"), Loader=yaml.FullLoader) + model = model_class() + validation_dataset = train_and_predict( + model=model, + hpams=hpams, + path_data=args.path_data, + train_dataset=train_dataset, + prediction_dataset=validation_dataset, + early_stopping_dataset=es_dataset, + response_transformation=response_transform, + 
model_checkpoint_dir=args.model_checkpoint_dir + ) + with open(f"prediction_dataset_{model_name}_{str(args.cv_data).split('.pkl')[0]}_" + f"{str(args.hyperparameters).split('.yaml')[0]}.pkl", + "wb") as f: + pickle.dump(validation_dataset, f) + + +if __name__ == "__main__": + main() + sys.exit(0) diff --git a/bin/train_and_predict_final.py b/bin/train_and_predict_final.py new file mode 100755 index 0000000..6e3e58b --- /dev/null +++ b/bin/train_and_predict_final.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python + +# Written by Judith Bernett and released under the MIT License. + +import os +import json +import sys +import argparse +import pickle +from typing import Dict, Optional +import yaml +from sklearn.base import TransformerMixin + +from drevalpy.datasets.dataset import DrugResponseDataset +from drevalpy.models.drp_model import DRPModel +from drevalpy.models import MODEL_FACTORY +from drevalpy.experiment import (get_model_name_and_drug_id, + get_datasets_from_cv_split, + generate_data_saving_path, + train_and_predict, + randomize_train_predict, + robustness_train_predict, + cross_study_prediction) +from drevalpy.utils import get_response_transformation + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Train and predict: either full mode, randomization mode, " "or robustness mode." 
+ ) + parser.add_argument("--mode", type=str, default="full", help="Mode: full, randomization, or robustness.") + parser.add_argument("--model_name", type=str, required=True, help="Model name.") + parser.add_argument("--split_id", type=str, required=True, help="Split id.") + parser.add_argument("--split_dataset_path", type=str, required=True, help="Path to split dataset.") + parser.add_argument("--hyperparameters_path", type=str, required=True, help="Path to hyperparameters.") + parser.add_argument("--response_transformation", type=str, default="None", help="Response transformation.") + parser.add_argument("--test_mode", type=str, default="LPO", help="Test mode (LPO, LCO, LDO).") + parser.add_argument("--path_data", type=str, required=True, help="Path to data.") + parser.add_argument("--randomization_views_path", type=str, default=None, help="Path to randomization views.") + parser.add_argument( + "--randomization_type", + type=str, + default="permutation", + help="Randomization type (permutation, invariant).", + ) + parser.add_argument("--robustness_trial", type=int, help="Robustness trial index.") + parser.add_argument("--cross_study_datasets", nargs="+", help="Path to cross study datasets.") + parser.add_argument("--model_checkpoint_dir", type=str, default="TEMPORARY", help="model checkpoint directory, if not provided: temporary directory is used") + + return parser + + +def prep_data(arguments): + model_name, drug_id = get_model_name_and_drug_id(arguments.model_name) + model_class = MODEL_FACTORY[model_name] + model = model_class() + + split = pickle.load(open(arguments.split_dataset_path, "rb")) + train_dataset, validation_dataset, es_dataset, test_dataset = get_datasets_from_cv_split( + split, model_class, model_name, drug_id) + + if model_class.early_stopping: + validation_dataset = split["validation_es"] + es_dataset = split["early_stopping"] + else: + es_dataset = None + + train_dataset.add_rows(validation_dataset) + train_dataset.shuffle(random_state=42) 
+
+    with open(arguments.hyperparameters_path, "r") as f:
+        best_hpam_dict = yaml.safe_load(f)
+    best_hpams = best_hpam_dict[f"{arguments.model_name}_{arguments.split_id}"]["best_hpam_combi"]
+
+    response_transform = get_response_transformation(arguments.response_transformation)
+    return model, drug_id, best_hpams, train_dataset, test_dataset, es_dataset, response_transform
+
+
+def compute_randomization(
+    randomization_test_view: Dict[str, str],  # read via the keys "test_name" and "view"
+    model: DRPModel,
+    hpam_set: Dict,
+    path_data: str,
+    train_dataset: DrugResponseDataset,
+    test_dataset: DrugResponseDataset,
+    early_stopping_dataset: Optional[DrugResponseDataset],
+    split_id: str,
+    randomization_type: str = "permutation",
+    response_transformation: Optional[TransformerMixin] = None,  # annotation, not a default value
+    randomization_test_path: str = "",
+    model_checkpoint_dir: str = "TEMPORARY",
+) -> None:
+    """Run one randomization test for one split by delegating to randomize_train_predict.
+    Passes '<randomization_test_path>/randomization_<test_name>_<split_id>.csv' as its file."""
+    test_name = randomization_test_view["test_name"]
+    randomization_test_file = os.path.join(randomization_test_path, f'randomization_{test_name}_{split_id}.csv')
+    randomize_train_predict(
+        view=randomization_test_view["view"],
+        test_name=test_name,
+        randomization_type=randomization_type,
+        randomization_test_file=randomization_test_file,
+        model=model,
+        hpam_set=hpam_set,
+        path_data=path_data,
+        train_dataset=train_dataset,
+        test_dataset=test_dataset,
+        early_stopping_dataset=early_stopping_dataset,
+        response_transformation=response_transformation,
+        model_checkpoint_dir=model_checkpoint_dir
+    )
+
+
+def compute_robustness(
+    model: DRPModel,
+    hpam_set: Dict,
+    path_data: str,
+    train_dataset: DrugResponseDataset,
+    test_dataset: DrugResponseDataset,
+    early_stopping_dataset: Optional[DrugResponseDataset],
+    split_id: str,
+    trial: int,
+    response_transformation: Optional[TransformerMixin] = None,  # annotation, not a default value
+    rob_path: str = "",
+    model_checkpoint_dir: str = "TEMPORARY",
+) -> None:
+    """Run robustness trial number `trial` for one split via robustness_train_predict.
+
+    Passes '<rob_path>/robustness_<trial>_<split_id>.csv' as the trial file."""
+    robustness_test_file = os.path.join(rob_path, f"robustness_{trial}_{split_id}.csv")
+    robustness_train_predict(
+        trial=trial,
+
trial_file=robustness_test_file, + train_dataset=train_dataset, + test_dataset=test_dataset, + early_stopping_dataset=early_stopping_dataset, + model=model, + hpam_set=hpam_set, + path_data=path_data, + response_transformation=response_transformation, + model_checkpoint_dir=model_checkpoint_dir + ) + + +def compute_cross( + cross_study_dataset, + model, + test_mode, + train_dataset, + path_data, + early_stopping_dataset, + response_transformation, + path_out, + split_index, + single_drug_id +): + split_index = split_index.split("split_")[1] + cross_study_dataset = pickle.load(open(cross_study_dataset, "rb")) + cross_study_dataset.remove_nan_responses() + cross_study_prediction( + dataset=cross_study_dataset, + model=model, + test_mode=test_mode, + train_dataset=train_dataset, + path_data=path_data, + early_stopping_dataset=( + early_stopping_dataset if model.early_stopping else None + ), + response_transformation=response_transformation, + path_out=path_out, + split_index=split_index, + single_drug_id=single_drug_id + ) + + +if __name__ == "__main__": + arg_parser = get_parser() + args = arg_parser.parse_args() + selected_model, drug_id, hpam_combi, train_set, test_set, es_set, transformation = prep_data( + args) + + if args.mode == "full": + predictions_path = generate_data_saving_path( + model_name=selected_model.get_model_name(), + drug_id=drug_id, + result_path='', + suffix='predictions', + ) + hpam_path = generate_data_saving_path( + model_name=selected_model.get_model_name(), + drug_id=drug_id, + result_path='', + suffix='best_hpams', + ) + hpam_path = os.path.join(hpam_path, f"best_hpams_{args.split_id}.json") + # save the best hyperparameters as json + with open( + hpam_path, + "w", + encoding="utf-8", + ) as f: + json.dump(hpam_combi, f) + + test_set = train_and_predict( + model=selected_model, + hpams=hpam_combi, + path_data=args.path_data, + train_dataset=train_set, + prediction_dataset=test_set, + early_stopping_dataset=es_set, + 
response_transformation=transformation,
+            model_checkpoint_dir=args.model_checkpoint_dir
+        )
+        prediction_dataset = os.path.join(predictions_path, f"predictions_{args.split_id}.csv")
+        test_set.to_csv(prediction_dataset)
+        # --cross_study_datasets uses nargs="+" without a default, so argparse leaves it as None
+        # when the flag is omitted; treat that as "no cross-study datasets" instead of raising
+        # TypeError after the predictions have already been written.
+        for ds in args.cross_study_datasets or []:
+            if ds == "NONE.csv":
+                continue
+            compute_cross(
+                cross_study_dataset=ds,
+                model=selected_model,
+                test_mode=args.test_mode,
+                train_dataset=train_set,
+                path_data=args.path_data,
+                early_stopping_dataset=es_set,
+                response_transformation=transformation,
+                path_out=os.path.dirname(predictions_path),
+                split_index=args.split_id,
+                single_drug_id=drug_id
+            )
+    elif args.mode == "randomization":
+        with open(args.randomization_views_path, "r") as f:
+            rand_test_view = yaml.safe_load(f)
+        rand_path = generate_data_saving_path(
+            model_name=selected_model.get_model_name(),
+            drug_id=drug_id,
+            result_path='',
+            suffix='randomization',
+        )
+        compute_randomization(
+            randomization_test_view=rand_test_view,
+            model=selected_model,
+            hpam_set=hpam_combi,
+            path_data=args.path_data,
+            train_dataset=train_set,
+            test_dataset=test_set,
+            early_stopping_dataset=es_set,
+            split_id=args.split_id,
+            randomization_type=args.randomization_type,
+            response_transformation=transformation,
+            randomization_test_path=rand_path,
+            model_checkpoint_dir=args.model_checkpoint_dir
+
+        )
+    elif args.mode == "robustness":
+        rob_path = generate_data_saving_path(
+            model_name=selected_model.get_model_name(),
+            drug_id=drug_id,
+            result_path='',
+            suffix='robustness',
+        )
+        compute_robustness(
+            model=selected_model,
+            hpam_set=hpam_combi,
+            path_data=args.path_data,
+            train_dataset=train_set,
+            test_dataset=test_set,
+            early_stopping_dataset=es_set,
+            split_id=args.split_id,
+            trial=args.robustness_trial,
+            response_transformation=transformation,
+            rob_path=rob_path,
+            model_checkpoint_dir=args.model_checkpoint_dir
+        )
+    else:
+        raise ValueError(f"Invalid mode: {args.mode}.
Choose full, randomization, or robustness.") + + sys.exit(0) diff --git a/bin/train_final_model.py b/bin/train_final_model.py new file mode 100755 index 0000000..03dd34b --- /dev/null +++ b/bin/train_final_model.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +# Written by Judith Bernett and released under the MIT License. + +import argparse +import pickle +import yaml +import os + +from drevalpy.experiment import generate_data_saving_path, get_model_name_and_drug_id +from drevalpy.models import MODEL_FACTORY +from drevalpy.utils import get_response_transformation + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Train a final model on the full dataset for future predictions." + ) + parser.add_argument("--train_data", type=str, required=True, help="Train data, pickled (output of final split).") + parser.add_argument("--val_data", type=str, required=True, help="Validation data, pickled (output of final split).") + parser.add_argument("--early_stop_data", type=str, required=True, + help="Early stopping data, pickled (output of final split).") + parser.add_argument("--response_transformation", type=str, default="None", help="Response transformation.") + parser.add_argument("--model_name", type=str, required=True, help="Model name.") + parser.add_argument("--path_data", type=str, required=True, help="Path to data.") + parser.add_argument("--model_checkpoint_dir", type=str, default="TEMPORARY", help="model checkpoint directory, if not provided: temporary directory is used") + parser.add_argument("--best_hpam_combi", type=str, required=True, help="Best hyperparameter combination file, yaml format.") + return parser + + +if __name__ == "__main__": + arg_parser = get_parser() + args = arg_parser.parse_args() + + model_name, drug_id = get_model_name_and_drug_id(args.model_name) + + final_model_path = generate_data_saving_path( + model_name=model_name, + drug_id=drug_id, + result_path="", + suffix="final_model" + ) + response_transform = 
get_response_transformation(args.response_transformation) + train_dataset = pickle.load(open(args.train_data, "rb")) + validation_dataset = pickle.load(open(args.val_data, "rb")) + es_dataset = pickle.load(open(args.early_stop_data, "rb")) + train_dataset.add_rows(validation_dataset) + train_dataset.shuffle(random_state=42) + if response_transform: + train_dataset.fit_transform(response_transform) + if es_dataset is not None: + es_dataset.transform(response_transform) + + best_hpam_combi = yaml.load(open(args.best_hpam_combi, "r"), Loader=yaml.FullLoader)[f'{model_name}_final']['best_hpam_combi'] + model = MODEL_FACTORY[model_name]() + cl_features = model.load_cell_line_features(data_path=args.path_data, dataset_name=train_dataset.dataset_name) + drug_features = model.load_drug_features(data_path=args.path_data, dataset_name=train_dataset.dataset_name) + model.build_model(hyperparameters=best_hpam_combi) + model.train( + output=train_dataset, + output_earlystopping=es_dataset, + cell_line_input=cl_features, + drug_input=drug_features, + model_checkpoint_dir=args.model_checkpoint_dir, + ) + os.makedirs(final_model_path, exist_ok=True) + model.save(final_model_path) diff --git a/bin/tune_final_model.py b/bin/tune_final_model.py new file mode 100755 index 0000000..6dd9c68 --- /dev/null +++ b/bin/tune_final_model.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +# Written by Judith Bernett and released under the MIT License. + +import argparse +import pickle +import yaml + +from drevalpy.experiment import get_model_name_and_drug_id, train_and_predict +from drevalpy.models import MODEL_FACTORY +from drevalpy.utils import get_response_transformation + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Train a final model on the full dataset for future predictions." 
+ ) + parser.add_argument("--train_data", type=str, required=True, help="Train dataset, pickled output of final_split.py.") + parser.add_argument("--val_data", type=str, required=True, help="Validation dataset, pickled output of final_split.py.") + parser.add_argument("--early_stopping_data", type=str, required=True, help="Early stopping dataset, pickled output of final_split.py.") + parser.add_argument("--model_name", type=str, required=True, help="Model name.") + parser.add_argument("--hpam_combi", type=str, required=True, help="Hyperparameter combination file, yaml format.") + parser.add_argument("--response_transformation", type=str, default="None", help="Response transformation.") + parser.add_argument("--path_data", type=str, required=True, help="Path to data.") + parser.add_argument("--model_checkpoint_dir", type=str, default="TEMPORARY", help="model checkpoint directory, if not provided: temporary directory is used") + return parser + + +if __name__ == "__main__": + arg_parser = get_parser() + args = arg_parser.parse_args() + + train_dataset = pickle.load(open(args.train_data, "rb")) + validation_dataset = pickle.load(open(args.val_data, "rb")) + early_stopping_dataset = pickle.load(open(args.early_stopping_data, "rb")) + response_transform = get_response_transformation(args.response_transformation) + + model_name, drug_id = get_model_name_and_drug_id(args.model_name) + model_class = MODEL_FACTORY[model_name] + hpams = yaml.load(open(args.hpam_combi, "r"), Loader=yaml.FullLoader) + model = model_class() + + validation_dataset = train_and_predict( + model=model, + hpams=hpams, + path_data=args.path_data, + train_dataset=train_dataset, + prediction_dataset=validation_dataset, + early_stopping_dataset=early_stopping_dataset, + response_transformation=response_transform, + model_checkpoint_dir=args.model_checkpoint_dir, + ) + with open(f"final_prediction_dataset_{model_name}_" + f"{str(args.hpam_combi).split('.yaml')[0]}.pkl", + "wb") as f: + 
pickle.dump(validation_dataset, f) diff --git a/bin/visualize_results.py b/bin/visualize_results.py new file mode 100755 index 0000000..cd92e1c --- /dev/null +++ b/bin/visualize_results.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python + +# Written by Judith Bernett and released under the MIT License. + +import os +import pathlib +import argparse +import pandas as pd + +from drevalpy.visualization.utils import create_output_directories, draw_test_mode_plots, draw_algorithm_plots, create_html, create_index_html + + +def get_parser(): + parser = argparse.ArgumentParser(description="Write individual LCO/LTO/LDO/LPO html files.") + parser.add_argument("--test_modes", type=str, nargs="+", required=True, help="LPO, LDO, LCO, or LTO.") + parser.add_argument("--eval_results", type=str, required=True, help="Path to the evaluation results.") + parser.add_argument("--eval_results_per_drug", type=str, required=True, help="Path to the evaluation results per drug.") + parser.add_argument("--eval_results_per_cl", type=str, required=True, help="Path to the evaluation results per cell line.") + parser.add_argument("--true_vs_predicted", type=str, required=True, help="Path to the true vs predicted results.") + parser.add_argument("--path_data", type=str, required=True, help="Path to the data.") + return parser + + +if __name__ == "__main__": + args = get_parser().parse_args() + result_path = pathlib.Path(".") + outdir_name = "report" + create_output_directories(result_path=result_path, custom_id=outdir_name) + test_modes = args.test_modes + + ev_res = pd.read_csv(args.eval_results, index_col=0) + if args.eval_results_per_drug == "NO_FILE": + ev_res_per_drug = None + else: + ev_res_per_drug = pd.read_csv(args.eval_results_per_drug, index_col=0) + if args.eval_results_per_cl == "NO_FILE": + ev_res_per_cl = None + else: + ev_res_per_cl = pd.read_csv(args.eval_results_per_cl, index_col=0) + t_vs_p = pd.read_csv(args.true_vs_predicted, index_col=0) + + for test_mode in test_modes: + 
unique_algos = draw_test_mode_plots( + test_mode=test_mode, + ev_res=ev_res, + ev_res_per_drug=ev_res_per_drug, + ev_res_per_cell_line=ev_res_per_cl, + custom_id=outdir_name, + path_data=args.path_data, + result_path=result_path, + ) + # draw figures for each algorithm with all randomizations etc + unique_algos = set(unique_algos) - { + "NaiveMeanEffectsPredictor", + "NaivePredictor", + "NaiveCellLineMeansPredictor", + "NaiveDrugMeanPredictor", + } + for algorithm in unique_algos: + draw_algorithm_plots( + model=algorithm, + ev_res=ev_res, + ev_res_per_drug=ev_res_per_drug, + ev_res_per_cell_line=ev_res_per_cl, + t_vs_p=t_vs_p, + test_mode=test_mode, + custom_id=outdir_name, + result_path=result_path, + ) + # get all html files from {result_path}/{run_id} + all_files: list[str] = [] + for _, _, files in os.walk(f"{result_path}/{outdir_name}"): # type: ignore[assignment] + for file in files: + if file.endswith("json") or ( + file.endswith(".html") and file not in ["index.html", "LPO.html", "LCO.html", "LDO.html"] + ): + all_files.append(file) + # PIPELINE: WRITE_HTML + create_html( + run_id=outdir_name, + test_mode=test_mode, + files=all_files, + prefix_results=f"{result_path}/{outdir_name}", + ) + # PIPELINE: WRITE_INDEX + create_index_html( + custom_id=outdir_name, + test_modes=test_modes, + prefix_results=f"{result_path}/{outdir_name}", + ) diff --git a/conf/base.config b/conf/base.config index d45c05e..d400450 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { 1 * task.attempt } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } @@ -24,7 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } @@ -62,4 +60,9 @@ process { withLabel: process_gpu { ext.use_gpu = { workflow.profile.contains('gpu') } } + withLabel:high_cpu_low_mem { + cpus = { 32 * task.attempt } + memory = { 16.GB * task.attempt } + time = { 6.h * task.attempt } + } } diff --git a/conf/modules.config b/conf/modules.config index e27fd28..2e18c68 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,9 +13,245 @@ process { publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/${params.run_id}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: 'UNZIP_RESPONSE' { + publishDir = [ + path: { params.path_data }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'UNZIP_CS_RESPONSE' { + publishDir = [ + path: { params.path_data }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'LOAD_RESPONSE' { + publishDir = [ + path: params.path_data, + mode: params.publish_dir_mode, + saveAs: { filename -> (filename != 'versions.yml' && !filename.endsWith('pkl')) ? filename : null } + ] + } + + withName: 'LOAD_CS_RESPONSE' { + publishDir = [ + path: params.path_data, + mode: params.publish_dir_mode, + saveAs: { filename -> (filename != 'versions.yml' && !filename.endsWith('pkl')) ? 
filename : null } + ] + } + + withName: 'PREPROCESS_RAW_VIABILITY' { + publishDir = [ + path: { params.path_data }, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'FIT_CURVES' { + publishDir = [ + path: { params.path_data }, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'CV_SPLIT' { + publishDir = [ + path: { params.path_data }, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'MAKE_MODELS' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'MAKE_BASELINES' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'HPAM_SPLIT' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + + withName: 'TRAIN_AND_PREDICT_CV' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + + ext.use_gpu = { [ + 'SimpleNeuralNetwork', + 'MultiOmicsNeuralNetwork', + 'MOLIR', + 'SuperFELTR', + 'DIPK' + ].contains( model_name.split( '\\.' )[0] ) + } + } + + withName: 'EVALUATE_FIND_MAX' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'PREDICT_FULL' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}/${test_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + ext.use_gpu = { [ + 'SimpleNeuralNetwork', + 'MultiOmicsNeuralNetwork', + 'MOLIR', + 'SuperFELTR' + ].contains( model_name.split( '\\.' 
)[0] ) + } + } + + withName: 'RANDOMIZATION_SPLIT' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'RANDOMIZATION_TEST' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}/${test_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + ext.use_gpu = { [ + 'SimpleNeuralNetwork', + 'MultiOmicsNeuralNetwork', + 'MOLIR', + 'SuperFELTR', + 'DIPK' + ].contains( model_name.split( '\\.' )[0] ) + } + } + + withName: 'ROBUSTNESS_TEST' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}/${test_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + ext.use_gpu = { [ + 'SimpleNeuralNetwork', + 'MultiOmicsNeuralNetwork', + 'MOLIR', + 'SuperFELTR', + 'DIPK' + ].contains( model_name.split( '\\.' )[0] ) + } + } + + withName: 'FINAL_SPLIT' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'TUNE_FINAL_MODEL' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'TRAIN_FINAL_MODEL' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}/${test_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + ext.use_gpu = { [ + 'SimpleNeuralNetwork', + 'MultiOmicsNeuralNetwork', + 'MOLIR', + 'SuperFELTR' + ].contains( model_name.split( '\\.' )[0] ) + } + } + + withName: 'EVALUATE_FINAL' { + publishDir = [ + path: params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> null } + ] + } + + withName: 'CONSOLIDATE_RESULTS' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}/${test_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'COLLECT_RESULTS' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + + withName: 'POSTPROCESS_CURVECURATOR_DATA' { + publishDir = [ + path: { "${params.path_data}/${dataset_name}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'VISUALIZE_RESULTS' { + publishDir = [ + path: { "${params.outdir}/${params.run_id}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } diff --git a/conf/test.config b/conf/test.config index cb07603..7f33ebf 100644 --- a/conf/test.config +++ b/conf/test.config @@ -13,7 +13,7 @@ process { resourceLimits = [ cpus: 4, - memory: '15.GB', + memory: '3.GB', time: '1.h' ] } @@ -23,7 +23,10 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + //input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + + run_id = 'test_run' + dataset_name = 'TOYv1' + cross_study_datasets = 'TOYv2' + n_cv_splits = 2 } diff --git a/conf/test_full.config b/conf/test_full.config index e5a3d3b..f9e33fb 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,11 +14,11 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Fasta references - fasta = params.pipelines_testdata_base_path + 'viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz' + run_id = 'full_test_run' + dataset_name = 'TOYv1' + cross_study_datasets = 'TOYv2' + n_cv_splits = 5 + models = 'ElasticNet' + randomization_mode = 'SVRC' + n_trials_robustness = 2 } diff --git a/docs/images/drugresponseeval_icon.png b/docs/images/drugresponseeval_icon.png new file mode 100644 index 0000000..5f832e9 Binary files /dev/null and b/docs/images/drugresponseeval_icon.png differ diff --git a/docs/images/nf-core-drugresponseeval_logo_light.png b/docs/images/nf-core-drugresponseeval_logo_light.png index 05adcbd..46ba878 100644 Binary files a/docs/images/nf-core-drugresponseeval_logo_light.png and b/docs/images/nf-core-drugresponseeval_logo_light.png differ diff --git a/docs/output.md b/docs/output.md index 82fad2f..53050be 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,15 +4,245 @@ This document describes the output produced by the pipeline. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - +The directories listed below will be created in the results directory after the pipeline has finished. +All paths are relative to the top-level results directory. ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +1. `PREPROCESS_CUSTOM` subworkflow: This subworkflow is only triggered if there is a custom dataset and if in the corresponding folder, there is a file named `[dataset_name]_raw.csv`. 
If this is the case, CurveCurator is run on the raw data. + - [Preprocess raw viability](#preprocess-raw-viability): The raw viability data is put in a format suitable for CurveCurator. + - [Fit curves](#fit-curves): Curves are fitted using CurveCurator. + - [Postprocess CurveCurator data](#postprocess-curvecurator-data): The individual curves.tsv files are collected and one output file is written. +2. `RUN_CV` subworkflow: Finds the optimal hyperparameters for each model in a cross-validation setting. + - [Load response](#load-response): The response data is loaded. + - [CV split](#cv-split): The response data is split into cross-validation folds. + - [Make model channel](#make-model-channel): From the input baseline and model names, channels are created. This + step is necessary because for the Single-Drug Models, one model has to be created per drug. + - [HPAM split](#hyperparameter-split): One YAML file is created per model and hyperparameter combination to be + tested. + - [Train and predict CV](#train-and-predict-cv): All models are trained and evaluated in a cross-validation setting. + - [Evaluate and find max](#evaluate-and-find-max): For each CV split, the best hyperparameters are determined + using a grid search per model +3. `MODEL_TESTING` subworkflow: The best hyperparameters are used to train the models on the full training set + and predict the test set. Optionally, randomization and robustness tests are performed. + - [Predict full](#predict-full): The model is trained on the full training set (train & validation) with the best + hyperparameters to predict the test set. + - [Randomization split](#randomization-split): Makes a channel per randomization to be tested. + - [Randomization test](#randomization-test): If randomization tests are enabled, the model is trained on the full + training set with the best hyperparameters to predict the randomized test set.
+ - [Robustness test](#robustness-test): If robustness tests are enabled, the model is trained N times on the full + training set with the best hyperparameters + - If `--final_model_on_full_data` is set: the model is trained on the full dataset to produce a production model. If `--no_hyperparameter_tuning` is **not** set, the model will be tuned on the full dataset, too. The model will be saved in the results directory. + - [FINAL_SPLIT](#final-split): For each model class, the full dataset is split into training, validation, and potentially early stopping sets. This is done to ensure per model and not overall to retain the maximum amount of data for training (because the data is filtered according to cell line / drug feature availability). + - [TUNE_FINAL_MODEL](#tune-final-model): The final model is tuned on the full dataset. + - [EVALUATE_FIND_MAX_FINAL](#evaluate-and-find-max-final): The best hyperparameters for the final model are determined on the validation dataset. + - [TRAIN_FINAL_MODEL](#train-final-model): The final model is trained on the full dataset (train+validation) with the best hyperparameters. The model is saved in the results directory. + - [Consolidate results](#consolidate-results): The results of the model testing are consolidated into a single + table for each model. + - [Evaluate final](#evaluate-final): The performance of the models is calculated on the test set results. + - [Collect results](#collect-results): The results of the evaluation metrics per model are collected into four + overview tables. +4. `VISUALIZATION` subworkflow: Plots are created summarizing the results. +5. [Pipeline information](#pipeline-information): Report metrics generated during the workflow execution + +### Subworkflow `PREPROCESS_CUSTOM` + +This process is only triggered if there is a custom dataset and if in the corresponding folder, there is a file named `[dataset_name]_raw.csv`. 
+ +#### Preprocess raw viability + +The file is processed to be in a format suitable for CurveCurator. One process will be started per dosage. + +
+Output files + +- "${dataset_name}/\*/config.toml": Configuration files for CurveCurator. Each subdirectory corresponds to a different dosage. +- "${dataset_name}/\*/curvecurator_input.tsv": Input file for CurveCurator. Each subdirectory corresponds to a different dosage. + +
+ +#### Fit curves + +CurveCurator is run on the input files to fit the curves. + +
+Output files +- "curves.tsv": The fitted curves. These will be collected and postprocessed in the next step. +- "mad.txt": Other output - Median absolute deviation analysis is performed to detect problematic experiments, the results are stored in this file. +- "dashboard.html" - A dashboard with an overview of the fitted curves. +- "curveCurator.log" - Log file of the CurveCurator run. +
+ +#### Postprocess CurveCurator data + +The individual curves.tsv files are collected and one output file is written to `path_data/dataset_name/dataset_name.csv`. +This file contains the new adjusted measures; available are pEC50 and AUC (now internally renamed as pEC50_curvecurator, AUC_curvecurator). + +
+Output files +- "dataset_name.csv": The postprocessed data; exported to the path_data folder. +
+ +### Subworkflow `RUN_CV` + +#### Load response + +The response data is loaded into the pipeline. If the data does not lie in `--path_data` it is downloaded from Zenodo +(`--zenodo_link`) and exported to `--path_data`. If it is downloaded, it is additionally unzipped by the UNZIP module. +This step is necessary to provide the pipeline with the response data that will be used to train and evaluate the models. + +
+Output files + +- Folder `path_data/dataset_name`: Everything required for the models to run is saved into this folder. + +
+ +#### CV split + +The response data is split into as many cross-validation folds as specified over the `--n_cv_splits` parameter. +The data is split into training, validation, and test sets for each fold. For models using early stopping, the early +stopping dataset is split from the validation set. This ensures that all models are trained and evaluated on the +same dataset. + +#### Make model channel + +From the input baseline and model names, channels are created. This step is necessary because for the +Single-Drug Models, one model has to be created per drug. The model name then becomes the name of the model and the +drug, separated by a dot, e.g., `MOLIR.Drug1`. All of these models should be able to be trained in parallel +which is why they should be individual elements in the channel. + +#### Hyperparameter split + +One YAML file is created per model and hyperparameter combination to be tested. This ensures that all hyperparameter combinations +can be tested in parallel. + +#### Train and predict CV + +A model is trained in the specified test mode, on the specific cross-validation split with the specified +hyperparameter combination. + +As soon as the GPU support is available, the training and prediction will be done on the GPU for the models +SimpleNeuralNetwork, MultiOmicsNeuralNetwork, MOLIR, SuperFELTR, and DIPK. + +#### Evaluate and find max + +Over all hyperparameter combinations, the best hyperparameters for a specific cross-validation split are determined. +The best hyperparameters are determined based on the optimization metric specified via `--optim_metric`. + +### Subworkflow `MODEL_TESTING` + +#### Predict full + +The model is trained on the full training set (train & validation) per split with the best hyperparameters to predict +the test set of the CV split. If specified via `--cross_study_datasets`, the cross-study datasets are also +predicted. + +
+Output files + +- `**predictions*.csv`: CSV file with the predicted response values for the test set. +- `**cross_study/cross_study*.csv`: CSV file with the predicted response values for the cross-study datasets. +- `**best_hpams*.json`: JSON file with the best hyperparameters for the model. + +
+ +#### Randomization split + +Takes the `--randomization_mode` as input and creates a channel for each randomization to be tested. This ensures that +all randomizations can be tested in parallel. + +#### Randomization test + +Trains the model on the randomized training + validation set with the best hyperparameters to predict the +unperturbed test set of the specified CV split. How the data is randomized is determined by the +`--randomization_type`. + +As soon as GPU support is available, the training and prediction will be done on the GPU for +the models SimpleNeuralNetwork, MultiOmicsNeuralNetwork, MOLIR, SuperFELTR, and DIPK. + +
+Output files + +- `**randomization*.csv`: CSV file with the predicted response values for the randomization test. + +
+ +#### Robustness test + +Trains the model `--n_trials_robustness` times on the full training set with the best hyperparameters to predict the test set of the +specific CV split. + +As soon as GPU support is available, the training and prediction will be done on the GPU for the models +SimpleNeuralNetwork, MultiOmicsNeuralNetwork, MOLIR, SuperFELTR, and DIPK. + +
+Output files + +- `**robustness*.csv`: CSV file with the predicted response values for the robustness test. + +
+ +#### Consolidate results + +For Single-Drug Models, the results of the model testing are consolidated such that their results look like the +results of the Multi-Drug Models. + +
+Output files + +- `**predictions*.csv`: CSV file with the consolidated predicted response values for the test set. +- `**cross_study/cross_study*.csv`: CSV file with the consolidated predicted response values for the cross-study + datasets. +- `**randomization*.csv`: CSV file with the consolidated predicted response values for the randomization test. +- `**robustness*.csv`: CSV file with the consolidated predicted response values for the robustness test. + +
+ +#### Evaluate final + +Calculates various performance metrics on the given test set results, including RMSE, MSE, MAE, R^2, Pearson +Correlation, Spearman Correlation, Kendall Correlation, and Partial Correlation. + +#### Collect results + +Collapses the results from above into four overview tables: `evaluation_results.csv`, `evaluation_results_per_drug. +csv`, `evaluation_results_per_cell_line.csv`, and `true_vs_pred.csv`. + +
+Output files + +- `evaluation_results.csv`: Overall performance metrics. One value per model per CV fold and setting (LPO/LCO/LDO, + full predictions, randomizations, robustness, cross-study predictions). +- `evaluation_results_per_drug.csv`: Performance metrics calculated per drug. +- `evaluation_results_per_cell_line.csv`: Performance metrics calculated per cell line. +- `true_vs_pred.csv`: true vs predicted values for each model. + +
+ +### Subworkflow `VISUALIZATION` + +All plots are created in the `visualization` subworkflow. They are saved in the results/report directory. + +
+Output files + +- `critical_difference*.svg`: The critical difference plot measures whether a model is significantly better than another model measured over its + average rank over all CV folds. +- `critical_difference*.html`: The corresponding p-values in a table. +- `violin*.html`: The violin shows the distribution of the performance metrics over the CV folds. This plot is rendered overall for + all real predictions and once per algorithm to compare the real predictions against, e.g., the randomization results. +- `heatmap*.html`: The heatmap shows the average performance of the models over the CV folds. +- `comp_scatter*.html`: Renders a plot in which the per-drug/per-cell line performances between y_true and y_predicted are compared between + different models. +- `regression_lines*.html`: Plots in which the y_true and y_predicted values are compared between different models (not rendered for Naive Predictors). +- `table*.html`: Saves the cross-study performance metrics of the models in an html table. +- `{LPO,LCO,LTO,LDO}.html`: Creates a summary HTML file per setting (LPO/LCO/LTO/LDO) that contains all the plots and tables. +- `index.html`: HTML file that links to all the HTML files. +- `*.png`: Some png files for the logo, etc. +
### Pipeline information @@ -27,4 +257,4 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d -[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times, and resource usage. diff --git a/docs/usage.md b/docs/usage.md index 117f717..c216bef 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,72 +6,88 @@ ## Introduction - +DrugResponseEval is a workflow designed to ensure that drug response prediction models are evaluated in a consistent and +reproducible manner. We offer four settings: + +- **Leave-Pair-Out (LPO)**: Random pairs of cell lines and drugs are left out for testing but both the drug and the + cell line might already be present in the training set. This is the **easiest setting** for your model but also the + most uninformative one. The only application scenario for this setting is when you want to test whether your model + can **complete the missing values in the training set**. +- **Leave-Cell-Line-Out (LCO)**: Random cell lines are left out for testing but the drugs might already be present in + the training set. This setting is **more challenging** than LPO but still relatively easy. The application scenario + for this setting is when you want to test whether your model can **predict the response of a new cell line**. This + is very relevant for **personalized medicine**.
+- **Leave-Tissue-Out (LTO)**: Random tissues of origin are left out for testing but the drugs and cell lines might already be + present in the training set. This setting is **more challenging** than LCO because for LCO, very similar cell lines might + end up in the test dataset. Because it can still leverage drug means, it is still relatively easy, though. The application + scenario for this setting is when you want to test whether your model can **predict the response of a new tissue**. + This is very relevant for **drug repurposing**. +- **Leave-Drug-Out (LDO)**: Random drugs are left out for testing but the cell lines might already be present in the + training set. This setting is the **most challenging** one. The application scenario for this setting is when you + want to test whether your model can **predict the response of a new drug**. This is very relevant for **drug + development**. + +An underlying issue is that drugs have a rather unique IC50/EC50 range. That means that by just predicting the mean response +that a drug has in the training set (aggregated over all cell lines), you can already achieve a rather good +prediction. This is why we also offer the possibility to compare your model to a **NaivePredictor** that predicts +the mean response of all drugs in the training set. We also offer four more advanced naive predictors: +**NaiveCellLineMeanPredictor**, **NaiveTissueMeanPredictor**, **NaiveDrugMeanPredictor**, and **NaiveMeanEffectsPredictor**. +The NaiveCellLineMeanPredictor predicts the mean response of a cell line in the training set, the NaiveTissueMeanPredictor +the mean response of a tissue of origin in the training set, the NaiveDrugMeanPredictor +predicts the mean response of a drug in the training set. The NaiveMeanEffectsPredictor combines both sources of variation +and predicts responses as the sum of the overall mean (NaivePredictor) + cell line + drug-specific means. 
+**The NaiveMeanEffectsPredictor is always run.** + +Furthermore, we offer a variety of more advanced **baseline models** and some **state-of-the-art models** to compare +your model against. Similarly, we provide commonly used datasets to evaluate your model on (GDSC1, GDSC2, CCLE, +CTRPv1, CTRPv2). You can also provide your **own dataset or your own model by contributing to our PyPI package +[drevalpy](https://github.com/daisybio/drevalpy.git)**. Before contributing, you can pull our respective repositories. +More information can be found in the [drevalpy readthedocs](https://drevalpy.readthedocs.io/en/latest/). + +We first identify the best hyperparameters for all models and baselines in a cross-validation setting. Then, we +train the models on the whole training set and evaluate them on the test set. Furthermore, we offer randomization +and robustness tests. -## Samplesheet input +## Running the pipeline -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +The typical command for running the pipeline is as follows: ```bash ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis.
Below is an example for the same sample sequenced across 3 lanes: - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +nextflow run nf-core/drugresponseeval \ + -profile \ + --run_id myRun \ + --test_mode \ + --models \ + --baselines \ + --dataset_name \ + --path_data ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). 
| -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +This will launch the pipeline with the `docker/singularity/.../institute` configuration profile. See below for more information about profiles. -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +In your `outdir`, a folder named `myRun` will be created containing the results of the pipeline run. -## Running the pipeline +The `test_mode` parameter specifies the evaluation setting, e.g., `--test_mode LCO`. -The typical command for running the pipeline is as follows: +The `models` and `baselines` parameters are lists of models and baselines to be evaluated, e.g., +`--models ElasticNet,RandomForest --baselines NaivePredictor,NaiveCellLineMeanPredictor,NaiveDrugMeanPredictor`. -```bash -nextflow run nf-core/drugresponseeval --input ./samplesheet.csv --outdir ./results -profile docker -``` +The `dataset_name` parameter specifies the dataset to be used for evaluation, e.g., `--dataset_name CTRPv2`. -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +If you do not want to re-download the data every time you run the pipeline, you can specify the path to the data with +the `path_data` parameter, e.g., `--path_data /path/to/data`. Note that the pipeline will create the following files in your working directory: ```bash work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) + # Finished results in specified location (defined with --outdir), defaults to 'results' .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. 
``` -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. +If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, +you can specify these in a params file. Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. @@ -87,13 +103,182 @@ nextflow run nf-core/drugresponseeval -profile docker -params-file params.yaml with: ```yaml title="params.yaml" -input: './samplesheet.csv' -outdir: './results/' +models: 'ElasticNet' +baselines: 'NaivePredictor,NaiveCellLineMeanPredictor,NaiveDrugMeanPredictor' +dataset_name: 'GDSC2' +path_data: '/path/to/data' <...> ``` You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Available Models + +**Single-Drug Models** fit one model for each drug in the training set. They also cannot generalize to new drugs, +hence those models cannot be used in the LDO setting. **Multi-Drug Models** fit one model for all drugs in the training +set. They can be used in all four settings.
+ +The following models are available: + +| Model Name | Baseline / Published Model | Multi-Drug Model / Single-Drug Model | Description | +| -------------------------------- | -------------------------- | ------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| NaivePredictor | Baseline Method | Multi-Drug Model | Most simple method. Predicts the mean response of all drugs in the training set. | +| NaiveCellLineMeanPredictor | Baseline Method | Multi-Drug Model | Predicts the mean response of a cell line in the training set. | +| NaiveDrugMeanPredictor | Baseline Method | Multi-Drug Model | Predicts the mean response of a drug in the training set. | +| NaiveTissueMeanPredictor | Baseline Method | Multi-Drug Model | Predicts the mean response of a tissue in the training set. | +| NaiveMeanEffectsPredictor | Baseline Method | Multi-Drug Model | Predicts the drug- and cell-line specific mean effects. | +| ElasticNet | Baseline Method | Multi-Drug Model | Fits an [Sklearn Elastic Net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html), [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html), or [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) model on gene expression data and drug fingerprints (concatenated input matrix). 
| +| ProteomicsElasticNet | Baseline Method | Multi-Drug Model | Fits an [Sklearn Elastic Net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html), [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html), or [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) model on proteomics data and drug fingerprints (concatenated input matrix). | +| SingleDrugElasticNet | Baseline Method | Single-Drug Model | Fits an ElasticNet model on gene expression data for each drug separately. | +| SingleDrugProteomicsElasticNet | Baseline Method | Single-Drug Model | Fits an ElasticNet model on proteomics data for each drug separately. | +| GradientBoosting | Baseline Method | Multi-Drug Model | Fits an [Sklearn Gradient Boosting Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) gene expression data and drug fingerprints. | +| RandomForest | Baseline Method | Multi-Drug Model | Fits an [Sklearn Random Forest Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) on gene expression data and drug fingerprints. | +| MultiOmicsRandomForest | Baseline Method | Multi-Drug Model | Fits an [Sklearn Random Forest Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) on gene expression, methylation, mutation, copy number variation data, and drug fingerprints (concatenated matrix). The dimensionality of the methylation data is reduced with a PCA to the first 100 components before it is fed to the model. | +| ProteomicsRandomForest | Baseline Method | Multi-Drug Model | Fits Random Forest on proteomics data and drug fingerprints. 
| +| SingleDrugRandomForest | Baseline Method | Single-Drug Model | Fits an [Sklearn Random Forest Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) on gene expression data for each drug separately. | +| SingleDrugProteomicsRandomForest | Baseline Method | Single-Drug Model | Fits an [Sklearn Random Forest Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) on proteomics data for each drug separately. | +| SVR | Baseline Method | Multi-Drug Model | Fits an [Sklearn Support Vector Regressor](https://scikit-learn.org/1.5/modules/generated/sklearn.svm.SVR.html) gene expression data and drug fingerprints. | +| SimpleNeuralNetwork | Baseline Method | Multi-Drug Model | Fits a simple feedforward neural network (implemented with [Pytorch Lightning](https://lightning.ai/docs/pytorch/stable/)) on gene expression and drug fingerprints (concatenated input) with 3 layers of varying dimensions and Dropout layers. | +| MultiOmicsNeuralNetwork | Baseline Method | Multi-Drug Model | Fits a simple feedforward neural network (implemented with [Pytorch Lightning](https://lightning.ai/docs/pytorch/stable/)) on gene expression, methylation, mutation, copy number variation data, and drug fingerprints (concatenated input) with 3 layers of varying dimensions and Dropout layers. The dimensionality of the methylation data is reduced with a PCA to the first 100 components before it is fed to the model. | +| SRMF | Published Model | Multi-Drug Model | [Similarity Regularization Matrix Factorization](https://doi.org/10.1186/s12885-017-3500-5) model by Wang et al. on gene expression data and drug fingerprints. Re-implemented Matlab code into Python. The basic idea is represent each drug and each cell line by their respective similarities to all other drugs/cell lines. Those similarities are mapped into a shared latent low-dimensional space from which responses are predicted. 
| +| MOLIR | Published Model | Single-Drug Model | Regression extension of [MOLI: multi-omics late integration deep neural network.](https://doi.org/10.1093/bioinformatics/btz318) by Sharifi-Noghabi et al. Takes somatic mutation, copy number variation and gene expression data as input. MOLI reduces the dimensionality of each omics type with a hidden layer, concatenates them into one representation and optimizes this representation via a combined cost function consisting of a triplet loss and a binary cross-entropy loss. We implemented a regression adaption with MSE loss and an adapted triplet loss for regression. | +| SuperFELTR | Published Model | Single-Drug Model | Regression extension of [SuperFELT: supervised feature extraction learning using triplet loss for drug response](https://doi.org/10.1186/s12859-021-04146-z) by Park et al. Very similar to MOLI(R). In MOLI(R), encoders and the classifier were trained jointly. Super.FELT(R) trains them independently. MOLI(R) was trained without feature selection (except for the Variance Threshold on the gene expression). Super.FELT(R) uses feature selection for all omics data. | +| DIPK | Published Model | Multi-Drug Model | [Deep neural network Integrating Prior Knowledge](https://doi.org/10.1093/bib/bbae153) from Li et al. Uses gene interaction relationships (encoded by a graph auto-encoder), gene expression profiles (encoded by a denoising auto-encoder), and molecular topologies (encoded by MolGNet). Those features are integrated using multi-head attention layers. | + +### Custom models + +If you want to use your own model, you must contribute it to drevalpy. Please follow the following steps: + +1. Fork the [drevalpy repository](https://github.com/daisybio/drevalpy) +2. Create a mamba environment: `mamba create -n drevalpy python=3.13` +3. Install the dependencies: + - Run: `pip install poetry` + - Then run: `poetry install` +4. 
Implement your model (for more information on that, check the [ReadTheDocs](https://drevalpy.readthedocs.io/en/latest/runyourmodel.html)) +5. Test your model with the tests in `tests/`. Also, implement your own tests. +6. (You can then open a PR to the main repository for contributing your model) +7. Install drevalpy into your environment: `pip install -e .` +8. From your environment, try to run the pipeline: `nextflow run nf-core/drugresponseeval -r dev -profile test` +9. If everything works, try running your model: `nextflow run nf-core/drugresponseeval -r dev --models --dataset_name ` + +### Saving a production model + +If you want to save a production model, you can set the `--final_model_on_full_data` flag. This will save the model trained on the full dataset in the results directory. +The model can later be loaded using the implemented load functions of the drevalpy models. +Here is an example of how to load a GradientBoosting model that was saved in the `results` directory: + +```python +from drevalpy.models import MODEL_FACTORY + +model_class = MODEL_FACTORY["GradientBoosting"] +# provide the path to the final_model directory +gb_model = model_class.load('results/test_run/LCO/GradientBoosting/final_model/') +``` + +You can then investigate the sklearn HistGradientBoostingRegressor model saved in `gb_model.model`. +You can then either use `drevalpy` functions to predict responses for new data or use the model directly with `sklearn` functions. + +With `drevalpy`: + +```python +from drevalpy.datasets.dataset import DrugResponseDataset +# first load the new data which must have the 'measure' column and the cell line and drug identifiers ('cell_line_name', 'pubchem_id'). +# The tissue column is optional. 
+new_dataset = DrugResponseDataset.from_csv(input_file='path/to/new_data.csv', dataset_name='my_new_data', + measure='LN_IC50', tissue_column='tissue') +# In the path_to_features directory, we expect a directory called like the dataset_name (here my_new_data), which contains the cell line and drug features. +path_to_features = 'path/to/cell_line_and_drug_features/' +cl_features = gb_model.load_cell_line_features(data_path=path_to_features, dataset_name='my_new_data') +drug_features = gb_model.load_drug_features(data_path=path_to_features, dataset_name='my_new_data') +# Now we have to filter the dataset to only contain the cell lines and drugs that are in the features. +cell_lines_to_keep = cl_features.identifiers if cl_features is not None else None +drugs_to_keep = drug_features.identifiers if drug_features is not None else None +new_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) +# Now we can predict the responses for the new data. +new_dataset._predictions = gb_model.predict( + cell_line_ids=new_dataset.cell_line_ids, + drug_ids=new_dataset.drug_ids, + cell_line_input=cl_features, + drug_input=drug_features, +) +# This will create a csv with 'cell_line_name', 'pubchem_id', 'response', 'predictions', 'tissue' (if provided) columns. +new_dataset.to_csv('path/to/predictions.csv') +``` + +### Available Datasets + +The following datasets are available and can be supplied via `--dataset_name`: + +| Dataset Name | Number of DRP curves | Number of drugs | Number of Cell Lines | Description | +| ------------ | -------------------- | --------------- | -------------------- | ------------------------------------------------------------------------------------------------ | +| CTRPv1 | 60,758 | 354 | 243 | The Cancer Therapeutics Response Portal (CTRP) dataset version 1. | +| CTRPv2 | 395,025 | 546 | 886 | The Cancer Therapeutics Response Portal (CTRP) dataset version 2. 
| +| CCLE | 11,670 | 24 | 503 | The Cancer Cell Line Encyclopedia (CCLE) dataset. | +| GDSC1 | 316,506 | 378 | 970 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1. | +| GDSC2 | 234,437 | 287 | 969 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2. | +| TOYv1 | 2,711 | 36 | 90 | A toy dataset for testing purposes subsetted from CTRPv2. | +| TOYv2 | 2,784 | 36 | 90 | A second toy dataset for cross study testing purposes. 80 cell lines and 32 drugs overlap TOYv1. | + +Our pipeline also supports cross-study prediction, i.e., training on one dataset and testing on another (or multiple +others) to assess the generalization of the model. This dataset name can be supplied via `--cross_study_datasets`. + +The drug response measure that you want to use as the target variable can be specified via the `--measure` parameter. +Available measures are `["AUC", "pEC50", "EC50", "IC50"]`. + +We have re-fitted all the curves in the available datasets with CurveCurator to ensure that the data is processed +well. By default, we use those measures. If you do not want to use those measures, enable the `--no_refitting` flag. + +#### Custom datasets + +You can also provide your own custom dataset via the `--dataset_name` parameter by specifying a name that is not in the list of the available datasets. +This can be prefit data (not recommended for comparability reasons) or raw viability data that is automatically fit +with the exact same procedure that was used to refit the available datasets in the previous section. + +Raw viability data + +We expect a csv-formatted file in the location `<path_data>/<dataset_name>/<dataset_name>_raw.csv` +(corresponding to the `--path_data` and `--dataset_name` options), which contains the raw viability data in long format +with the columns `["dose", "response", "sample", "drug"]` and an optional "replicate" column. +If replicates are provided, the procedure will fit one curve per sample / drug pair using all replicates. 
+ +The pipeline then fits the curves using CurveCurator and saves the processed file to `<path_data>/<dataset_name>/<dataset_name>.csv` +For individual results, look in the work directories. + +Prefit viability data + +We expect a csv-formatted file in the location `<path_data>/<dataset_name>/<dataset_name>.csv` +(corresponding to the `--path_data` and `--dataset_name` options), with at least the columns `["cell_line_id", "drug_id", "<measure>"]` +where `<measure>` is replaced with the name of the measure you provide (`["AUC", "pEC50", "EC50", "IC50"]`). +It is required that you use measure names that are also working with the available datasets if you use the `--cross_study_datasets` option. + +### Available Randomization Tests + +We have several randomization modes and types available. + +The modes are supplied via `--randomization_mode` and the types via `--randomization_type`: + +- **SVCC: Single View Constant for Cell Lines:** A single cell line view (e.g., gene expression) is held unperturbed + while the others are randomized. +- **SVCD: Single View Constant for Drugs:** A single drug view (e.g., drug fingerprints) is held unperturbed while the + others are randomized. +- **SVRC: Single View Random for Cell Lines:** A single cell line view (e.g., gene expression) is randomized while the + others are held unperturbed. +- **SVRD: Single View Random for Drugs:** A single drug view (e.g., drug fingerprints) is randomized while the others + are held unperturbed. + +Currently, we support two ways of randomizing the data. The default is permutation. + +- **Permutation**: Permutes the features over the instances, keeping the distribution of the features the same but + dissolving the relationship to the target. +- **Invariant**: The randomization is done in a way that a key characteristic of the feature is preserved. In case + of matrices, this is the mean and standard deviation of the feature view for this instance, for networks it is the + degree distribution. 
+ +### Robustness Tests + +The robustness test is a test where the model is trained with varying seeds. This is done multiple times to see how +stable the model is. Via `--n_trials_robustness`, you can specify the number of trials for the robustness tests. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/main.nf b/main.nf index 0a5fffc..e9a405a 100644 --- a/main.nf +++ b/main.nf @@ -28,18 +28,27 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_drug // WORKFLOW: Run main analysis pipeline depending on type of input // workflow NFCORE_DRUGRESPONSEEVAL { - take: - samplesheet // channel: samplesheet read in from --input + models // channel: [ string(models) ] + baselines // channel: [ string(baselines) ] + work_path // channel: path to the data channel.fromPath(params.path_data) main: // // WORKFLOW: Run pipeline // + ch_versions = Channel.empty() DRUGRESPONSEEVAL ( - samplesheet + models, + baselines, + work_path ) + + ch_versions = ch_versions.mix(DRUGRESPONSEEVAL.out.versions) + + emit: + versions = ch_versions } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,14 +68,19 @@ workflow { params.monochrome_logs, args, params.outdir, - params.input + // pipeline-specific input + params.models, + params.baselines, + params.path_data ) // // WORKFLOW: Run main workflow // NFCORE_DRUGRESPONSEEVAL ( - PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.models, + PIPELINE_INITIALISATION.out.baselines, + PIPELINE_INITIALISATION.out.work_path, ) // // SUBWORKFLOW: Run completion tasks diff --git 
a/modules.json b/modules.json index 4430ebe..1f7d288 100644 --- a/modules.json +++ b/modules.json @@ -4,18 +4,24 @@ "repos": { "https://github.com/nf-core/modules.git": { "modules": { - "nf-core": {} + "nf-core": { + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "installed_by": ["modules"] + } + } }, "subworkflows": { "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { diff --git a/modules/local/collect_results/env.yml b/modules/local/collect_results/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/collect_results/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/collect_results/main.nf b/modules/local/collect_results/main.nf new file mode 100644 index 0000000..58b0dcf --- /dev/null +++ b/modules/local/collect_results/main.nf @@ -0,0 +1,31 @@ +process COLLECT_RESULTS { + label 'process_medium' + + conda "${moduleDir}/env.yml" + + input: + path(outfiles) + path(path_data) + + output: + path('evaluation_results.csv'), emit: evaluation_results + path('evaluation_results_per_drug.csv'), emit: evaluation_results_per_drug, optional: true + path('evaluation_results_per_cl.csv'), emit: evaluation_results_per_cl, optional: true + path('true_vs_pred.csv'), emit: true_vs_pred + path("versions.yml"), emit: versions + + script: + """ + collect_results.py \\ + --outfiles $outfiles \\ + --path_data $path_data + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 
python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/consolidate_results/env.yml b/modules/local/consolidate_results/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/consolidate_results/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/consolidate_results/main.nf b/modules/local/consolidate_results/main.nf new file mode 100644 index 0000000..8f14aa9 --- /dev/null +++ b/modules/local/consolidate_results/main.nf @@ -0,0 +1,37 @@ +process CONSOLIDATE_RESULTS { + tag "$model_name" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(test_mode), val(model_name) + val(rand_modes) + val(nr_files) + + output: + tuple val(test_mode), val(model_name), path('**split*.csv'), emit: ch_vis, optional: true + path("versions.yml"), emit: versions + + script: + def outdirPath = new File(params.outdir).getAbsolutePath() + """ + consolidate_results.py \\ + --run_id ${params.run_id} \\ + --test_mode ${test_mode} \\ + --model_name "${model_name}" \\ + --outdir_path ${outdirPath} \\ + --n_cv_splits ${params.n_cv_splits} \\ + ${params.cross_study_datasets != '' ? 
'--cross_study_datasets ' + params + .cross_study_datasets.replace(',', ' ') : ''} \\ + --randomization_modes "${rand_modes}"\\ + --n_trials_robustness ${params.n_trials_robustness} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/cv_split/env.yml b/modules/local/cv_split/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/cv_split/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/cv_split/main.nf b/modules/local/cv_split/main.nf new file mode 100644 index 0000000..5c269f7 --- /dev/null +++ b/modules/local/cv_split/main.nf @@ -0,0 +1,32 @@ +process CV_SPLIT { + tag "$test_mode" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(test_mode), path(response) + val n_cv_splits + + output: + tuple val(test_mode), path("split*.pkl") , emit: response_cv_splits + path("versions.yml"), emit: versions + + + script: + """ + cv_split.py \\ + --response $response \\ + --n_cv_splits $n_cv_splits \\ + --test_mode $test_mode + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/evaluate_final/env.yml b/modules/local/evaluate_final/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/evaluate_final/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - 
python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/evaluate_final/main.nf b/modules/local/evaluate_final/main.nf new file mode 100644 index 0000000..408b527 --- /dev/null +++ b/modules/local/evaluate_final/main.nf @@ -0,0 +1,33 @@ +process EVALUATE_FINAL { + tag "${test_mode}_${model_name}_${pred_file}" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(test_mode), val(model_name), path(pred_file) + + output: + path('*.csv'), emit: ch_individual_results, optional: true + path("versions.yml"), emit: versions + + + script: + """ + evaluate_final.py \\ + --test_mode $test_mode \\ + --model_name "${model_name}" \\ + --pred_file $pred_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + scipy: \$(python -c "import scipy; print(scipy.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/evaluate_find_max/env.yml b/modules/local/evaluate_find_max/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/evaluate_find_max/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/evaluate_find_max/main.nf b/modules/local/evaluate_find_max/main.nf new file mode 100644 index 0000000..f641fe4 --- /dev/null +++ b/modules/local/evaluate_find_max/main.nf @@ -0,0 +1,36 @@ +process EVALUATE_FIND_MAX { + tag "${test_mode}_${model_name}_${split_id}" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(model_name), val(test_mode), val(split_id), path(hpam_yamls), path(pred_datas) + val metric + + output: + 
tuple val(model_name), val(split_id), val(test_mode), path('best_hpam_combi_*.yaml'), emit: best_combis + path("versions.yml"), emit: versions + + script: + """ + evaluate_and_find_max.py \\ + --model_name "${model_name}" \\ + --split_id $split_id \\ + --hpam_yamls $hpam_yamls \\ + --pred_datas $pred_datas \\ + --metric $metric + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + scipy: \$(python -c "import scipy; print(scipy.__version__)") + yaml: \$(python -c "import yaml; print(yaml.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/final_split/env.yml b/modules/local/final_split/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/final_split/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/final_split/main.nf b/modules/local/final_split/main.nf new file mode 100644 index 0000000..35c16fd --- /dev/null +++ b/modules/local/final_split/main.nf @@ -0,0 +1,36 @@ +process FINAL_SPLIT { + tag { "${model_name}_${test_mode}_gpu:${task.ext.use_gpu}" } + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(model_name), path(response), val(test_mode), path(path_data) + + + output: + tuple val(model_name), path("training_dataset.pkl"), path("validation_dataset.pkl"), path("early_stopping_dataset.pkl"), emit: final_datasets + path("versions.yml"), emit: versions + + script: + """ + final_split.py \\ + --response $response \\ + --model_name "${model_name}" \\ + --path_data $path_data \\ + --test_mode $test_mode + + + cat <<-END_VERSIONS > 
versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)") + torch: \$(python -c "import torch; print(torch.__version__)" | sed 's/+.*//') + platform: \$(python -c "import platform; print(platform.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/fit_curves/env.yml b/modules/local/fit_curves/env.yml new file mode 100644 index 0000000..e777796 --- /dev/null +++ b/modules/local/fit_curves/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - curve_curator==0.6.0 diff --git a/modules/local/fit_curves/main.nf b/modules/local/fit_curves/main.nf new file mode 100644 index 0000000..15e40a7 --- /dev/null +++ b/modules/local/fit_curves/main.nf @@ -0,0 +1,25 @@ +process FIT_CURVES { + tag "$dir_name" + label 'high_cpu_low_mem' + + conda "${moduleDir}/env.yml" + + input: + val dataset_name + tuple val(dir_name), path(toml), path(curvecurator_input) + + output: + path("curves.tsv"), emit: path_to_curvecurator_out + tuple path("mad.txt"), path("dashboard.html"), path("curveCurator.log") // other output + path("versions.yml"), emit: versions + + script: + """ + CurveCurator ${toml} --mad + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + curve_curator: \$(python -c "import curve_curator; print(curve_curator.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/hpam_split/env.yml b/modules/local/hpam_split/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/hpam_split/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + 
- conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/hpam_split/main.nf b/modules/local/hpam_split/main.nf new file mode 100644 index 0000000..0c636c6 --- /dev/null +++ b/modules/local/hpam_split/main.nf @@ -0,0 +1,31 @@ +// Runs hpam_split.py for one model and emits the resulting *.yaml files plus versions.yml. +process HPAM_SPLIT { + tag "$model_name" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + val model_name + val no_hyperparameter_tuning + + output: + tuple val(model_name), path("*.yaml") , emit: hpam_combi + path("versions.yml"), emit: versions + + + script: + """ + hpam_split.py \\ + --model_name "${model_name}" \\ + ${no_hyperparameter_tuning ? '' : '--hyperparameter_tuning'} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + yaml: \$(python -c "import yaml; print(yaml.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/load_response/env.yml b/modules/local/load_response/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/load_response/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/load_response/main.nf b/modules/local/load_response/main.nf new file mode 100644 index 0000000..6523a4f --- /dev/null +++ b/modules/local/load_response/main.nf @@ -0,0 +1,33 @@ +process LOAD_RESPONSE { + tag "${response.baseName}" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(measure), path(response) + val no_refitting + val cross_study_dataset + + output: + path 'response_dataset.pkl', emit: response_dataset, optional: true + path 'cross_study_*.pkl', emit: cross_study_datasets, optional: true + path("versions.yml"), emit: versions + + script: + """ + load_response.py \\ + --response_dataset ${response} \\ + --measure ${measure} \\ + ${no_refitting ? 
'--no_refitting' : ''} \\ + ${cross_study_dataset ? '--cross_study_dataset' : ''} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/make_model_channel/env.yml b/modules/local/make_model_channel/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/make_model_channel/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/make_model_channel/main.nf b/modules/local/make_model_channel/main.nf new file mode 100644 index 0000000..d8a6fc1 --- /dev/null +++ b/modules/local/make_model_channel/main.nf @@ -0,0 +1,35 @@ +process MAKE_MODEL_CHANNEL { + tag "Make model channel" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(models), path(response_data) + val(name) + + output: + path '{models,baselines}*.txt', emit: all_models + path("versions.yml"), emit: versions + + script: + """ + make_model_channel.py \\ + --models "${models}" \\ + --data ${response_data} \\ + --file_name ${name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)") + torch: \$(python -c "import torch; print(torch.__version__)" | sed 's/+.*//') + platform: \$(python -c "import platform; print(platform.__version__)") + END_VERSIONS + """ + +} diff 
--git a/modules/local/postprocess_curvecurator_output/env.yml b/modules/local/postprocess_curvecurator_output/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/postprocess_curvecurator_output/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/postprocess_curvecurator_output/main.nf b/modules/local/postprocess_curvecurator_output/main.nf new file mode 100644 index 0000000..dc5e77f --- /dev/null +++ b/modules/local/postprocess_curvecurator_output/main.nf @@ -0,0 +1,26 @@ +process POSTPROCESS_CURVECURATOR_DATA { + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + val dataset_name + path(curve_data, stageAs: "?/*") + val measure + + output: + path "${dataset_name}.csv", emit: path_to_dataset + val "${measure}" + "_curvecurator", emit: measure + path("versions.yml"), emit: versions + + script: + """ + postprocess_curvecurator_output.py --dataset_name ${dataset_name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/predict_full/env.yml b/modules/local/predict_full/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/predict_full/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/predict_full/main.nf b/modules/local/predict_full/main.nf new file mode 100644 index 0000000..2748ce0 --- /dev/null +++ b/modules/local/predict_full/main.nf @@ -0,0 +1,46 @@ +process PREDICT_FULL { + tag { "${test_mode}_${model_name}_${split_id}_gpu:${task.ext.use_gpu}" } + label 'process_high' + label 'process_gpu' + + conda "${moduleDir}/env.yml" + + input: + tuple 
path(cross_study_datasets), val(model_name), val(test_mode), val(split_id), path(split_dataset), path(hpam_combi), path(path_data) + val(response_transformation) + val(model_checkpoint_dir) + + output: + tuple val(test_mode), val(model_name), path('**predictions*.csv'), emit: ch_vis + tuple val(test_mode), val(model_name), path('**cross_study/cross_study*.csv'), emit: ch_cross, optional: true + path('**best_hpams*.json'), emit: ch_hpams + path("versions.yml"), emit: versions + + script: + """ + train_and_predict_final.py \\ + --mode full \\ + --model_name "${model_name}" \\ + --split_id $split_id \\ + --split_dataset_path $split_dataset \\ + --hyperparameters_path $hpam_combi \\ + --response_transformation $response_transformation \\ + --test_mode $test_mode \\ + --path_data $path_data \\ + --cross_study_datasets $cross_study_datasets \\ + --model_checkpoint_dir $model_checkpoint_dir \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)") + torch: \$(python -c "import torch; print(torch.__version__)" | sed 's/+.*//') + platform: \$(python -c "import platform; print(platform.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/preprocess_raw_viability/env.yml b/modules/local/preprocess_raw_viability/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/preprocess_raw_viability/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/preprocess_raw_viability/main.nf 
b/modules/local/preprocess_raw_viability/main.nf new file mode 100644 index 0000000..51b6c43 --- /dev/null +++ b/modules/local/preprocess_raw_viability/main.nf @@ -0,0 +1,25 @@ +process PREPROCESS_RAW_VIABILITY { + label 'process_low' + + conda "${moduleDir}/env.yml" + + input: + val(dataset_name) + path(work_path) + + output: + path "${dataset_name}/*/config.toml", emit: path_to_toml + path "${dataset_name}/*/curvecurator_input.tsv", emit: curvecurator_input + path("versions.yml"), emit: versions + + script: + """ + preprocess_raw_viability.py --path_data ${work_path} --dataset_name ${dataset_name} --cores ${task.cpus} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/randomization_split/env.yml b/modules/local/randomization_split/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/randomization_split/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/randomization_split/main.nf b/modules/local/randomization_split/main.nf new file mode 100644 index 0000000..b671175 --- /dev/null +++ b/modules/local/randomization_split/main.nf @@ -0,0 +1,32 @@ +process RANDOMIZATION_SPLIT { + tag "${model_name}_${randomization_mode}" + label 'process_single' + + conda "${moduleDir}/env.yml" + + input: + tuple val(model_name), val(randomization_mode) + + output: + tuple val(model_name), path('randomization_test_view*.yaml'), emit: randomization_test_views + path("versions.yml"), emit: versions + + script: + """ + randomization_split.py --model_name "${model_name}" --randomization_mode ${randomization_mode} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c 
"import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)") + torch: \$(python -c "import torch; print(torch.__version__)") + platform: \$(python -c "import platform; print(platform.__version__)") + yaml: \$(python -c "import yaml; print(yaml.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/randomization_test/env.yml b/modules/local/randomization_test/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/randomization_test/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/randomization_test/main.nf b/modules/local/randomization_test/main.nf new file mode 100644 index 0000000..35d7ed8 --- /dev/null +++ b/modules/local/randomization_test/main.nf @@ -0,0 +1,46 @@ +process RANDOMIZATION_TEST { + tag { "${test_mode}_${model_name}_${randomization_type}_gpu:${task.ext.use_gpu}" } + label 'process_high' + label 'process_gpu' + + conda "${moduleDir}/env.yml" + + input: + tuple val(model_name), val(test_mode), val(split_id), path(split_dataset), path(best_hpams), path(randomization_views), path(path_data) + val(randomization_type) + val(response_transformation) + val model_checkpoint_dir + + output: + tuple val(test_mode), val(model_name), path('**randomization*.csv'), emit: ch_vis + path("versions.yml"), emit: versions + + script: + """ + train_and_predict_final.py \\ + --mode randomization \\ + --model_name "${model_name}" \\ + --split_id $split_id \\ + --split_dataset_path $split_dataset \\ + --hyperparameters_path $best_hpams \\ + --response_transformation $response_transformation \\ + --test_mode $test_mode \\ 
+ --path_data $path_data \\ + --randomization_views_path $randomization_views \\ + --randomization_type $randomization_type \\ + --model_checkpoint_dir $model_checkpoint_dir \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)") + torch: \$(python -c "import torch; print(torch.__version__)") + platform: \$(python -c "import platform; print(platform.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/robustness_test/env.yml b/modules/local/robustness_test/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/robustness_test/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/robustness_test/main.nf b/modules/local/robustness_test/main.nf new file mode 100644 index 0000000..7b2b7a3 --- /dev/null +++ b/modules/local/robustness_test/main.nf @@ -0,0 +1,45 @@ +process ROBUSTNESS_TEST { + tag { "${model_name}_${robustness_iteration}_gpu:${task.ext.use_gpu}" } + label 'process_high' + label 'process_gpu' + + conda "${moduleDir}/env.yml" + + input: + tuple val(model_name), val(test_mode), val(split_id), path(split_dataset), path(best_hpams), val(robustness_iteration), path(path_data) + val(randomization_type) + val(response_transformation) + val model_checkpoint_dir + + output: + tuple val(test_mode), val(model_name), path('**robustness*.csv'), emit: ch_vis + path("versions.yml"), emit: versions + + script: + """ + train_and_predict_final.py \\ + --mode robustness \\ + 
--model_name "${model_name}" \\ + --split_id $split_id \\ + --split_dataset_path $split_dataset \\ + --hyperparameters_path $best_hpams \\ + --response_transformation $response_transformation \\ + --test_mode $test_mode \\ + --path_data $path_data \\ + --robustness_trial $robustness_iteration \\ + --model_checkpoint_dir $model_checkpoint_dir \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)") + sklearn: \$(python -c "import sklearn; print(sklearn.__version__)") + numpy: \$(python -c "import numpy; print(numpy.__version__)") + pandas: \$(python -c "import pandas; print(pandas.__version__)") + pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)") + torch: \$(python -c "import torch; print(torch.__version__)") + platform: \$(python -c "import platform; print(platform.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/train_and_predict_cv/env.yml b/modules/local/train_and_predict_cv/env.yml new file mode 100644 index 0000000..c85aab7 --- /dev/null +++ b/modules/local/train_and_predict_cv/env.yml @@ -0,0 +1,8 @@ +name: nf-core-drugresponseeval +channels: + - conda-forge +dependencies: + - python=3.13 + - pip + - pip: + - drevalpy==1.3.4 diff --git a/modules/local/train_and_predict_cv/main.nf b/modules/local/train_and_predict_cv/main.nf new file mode 100644 index 0000000..bc711ec --- /dev/null +++ b/modules/local/train_and_predict_cv/main.nf @@ -0,0 +1,41 @@ +process TRAIN_AND_PREDICT_CV { + tag { "${model_name}_${test_mode}_gpu:${task.ext.use_gpu}" } + label 'process_high' + label 'process_gpu' + + conda "${moduleDir}/env.yml" + + input: + tuple val(model_name), val(test_mode), path(cv_data), path(hyperparameters), path(path_data) + val response_transformation + val model_checkpoint_dir + + + output: + tuple val(model_name), val(test_mode), val(cv_data.baseName), 
// TRAIN_AND_PREDICT_CV
//
// Runs `train_and_predict_cv.py` for one (model, test mode, CV split,
// hyperparameter file) combination.
//
// Inputs
//   1. tuple: model_name, test_mode, cv_data (staged path),
//      hyperparameters (staged path), path_data (staged path)
//   2. response_transformation - forwarded as --response_transformation
//   3. model_checkpoint_dir    - forwarded as --model_checkpoint_dir
//
// Outputs
//   pred_data : tuple(model_name, test_mode, base name of cv_data,
//               the hyperparameters file, files matching 'prediction_dataset_*.pkl')
//   versions  : versions.yml with the package versions queried inside the task
process TRAIN_AND_PREDICT_CV {
    // task.ext.use_gpu is only used for labelling the task here.
    tag { "${model_name}_${test_mode}_gpu:${task.ext.use_gpu}" }
    label 'process_high'
    label 'process_gpu'

    conda "${moduleDir}/env.yml"

    input:
    tuple val(model_name), val(test_mode), path(cv_data), path(hyperparameters), path(path_data)
    val response_transformation
    val model_checkpoint_dir


    output:
    // cv_data.baseName is emitted as a value next to the prediction files.
    tuple val(model_name), val(test_mode), val(cv_data.baseName), path(hyperparameters), path("prediction_dataset_*.pkl"), emit: pred_data
    path("versions.yml"), emit: versions

    script:
    // The torch entry drops everything from the first '+' in the version string.
    """
    train_and_predict_cv.py \\
        --model_name "${model_name}" \\
        --path_data $path_data \\
        --test_mode $test_mode \\
        --hyperparameters $hyperparameters \\
        --cv_data $cv_data \\
        --response_transformation $response_transformation \\
        --model_checkpoint_dir $model_checkpoint_dir

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)")
        sklearn: \$(python -c "import sklearn; print(sklearn.__version__)")
        numpy: \$(python -c "import numpy; print(numpy.__version__)")
        pandas: \$(python -c "import pandas; print(pandas.__version__)")
        pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)")
        torch: \$(python -c "import torch; print(torch.__version__)" | sed 's/+.*//')
        platform: \$(python -c "import platform; print(platform.__version__)")
    END_VERSIONS
    """
}
// TRAIN_FINAL_MODEL
//
// Runs `train_final_model.py` with the selected hyperparameter combination and
// collects whatever the script writes below a `final_model/` directory.
//
// Inputs
//   1. tuple: model_name, test_mode, best_hpam_combi (staged path),
//      train_data, val_data, early_stop_data (staged paths), path_data (staged path)
//   2. response_transformation - forwarded as --response_transformation
//   3. model_checkpoint_dir    - forwarded as --model_checkpoint_dir
//
// Outputs
//   final_model : files matching '**final_model/*'
//   versions    : versions.yml with the package versions queried inside the task
process TRAIN_FINAL_MODEL {
    // test_mode is used for the tag only; it is not passed to the script.
    tag { "${model_name}_${test_mode}_gpu:${task.ext.use_gpu}" }
    label 'process_high'
    label 'process_gpu'

    conda "${moduleDir}/env.yml"

    input:
    tuple val(model_name), val(test_mode), path(best_hpam_combi), path(train_data), path(val_data), path(early_stop_data), path(path_data)
    val response_transformation
    val model_checkpoint_dir


    output:
    path("**final_model/*"), emit: final_model
    path("versions.yml"), emit: versions

    script:
    // The torch entry drops everything from the first '+' in the version string.
    """
    train_final_model.py \\
        --train_data $train_data \\
        --val_data $val_data \\
        --early_stop_data $early_stop_data \\
        --response_transformation "${response_transformation}" \\
        --model_name "${model_name}" \\
        --path_data $path_data \\
        --model_checkpoint_dir $model_checkpoint_dir \\
        --best_hpam_combi $best_hpam_combi


    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)")
        sklearn: \$(python -c "import sklearn; print(sklearn.__version__)")
        numpy: \$(python -c "import numpy; print(numpy.__version__)")
        pandas: \$(python -c "import pandas; print(pandas.__version__)")
        pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)")
        torch: \$(python -c "import torch; print(torch.__version__)" | sed 's/+.*//')
        platform: \$(python -c "import platform; print(platform.__version__)")
    END_VERSIONS
    """
}
// TUNE_FINAL_MODEL
//
// Runs `tune_final_model.py` for one hyperparameter combination of the final
// model and emits the resulting prediction pickle(s).
//
// Inputs
//   1. tuple: model_name, train_ds, val_ds, early_stop_ds (staged paths),
//      test_mode, path_data (staged path), hpam_combi (staged path)
//   2. response_transformation - forwarded as --response_transformation
//   3. model_checkpoint_dir    - forwarded as --model_checkpoint_dir
//   4. metric                  - declared but not referenced by the script below.
//                                NOTE(review): confirm whether tune_final_model.py
//                                should receive it or the input is vestigial.
//
// Outputs
//   final_prediction : tuple(model_name, test_mode, the literal "final",
//                      hpam_combi, files matching 'final_prediction_dataset_*.pkl')
//   versions         : versions.yml with the package versions queried inside the task
process TUNE_FINAL_MODEL {
    // test_mode is used for the tag and the output tuple; it is not passed to the script.
    tag { "${model_name}_${test_mode}_gpu:${task.ext.use_gpu}" }
    label 'process_high'
    label 'process_gpu'

    conda "${moduleDir}/env.yml"

    input:
    tuple val(model_name), path(train_ds), path(val_ds), path(early_stop_ds), val(test_mode), path(path_data), path(hpam_combi)
    val response_transformation
    val model_checkpoint_dir
    val metric


    output:
    tuple val(model_name), val(test_mode), val("final"), path(hpam_combi), path("final_prediction_dataset_*.pkl"), emit: final_prediction
    path("versions.yml"), emit: versions

    script:
    // The torch entry drops everything from the first '+' in the version string.
    """
    tune_final_model.py \\
        --train_data $train_ds \\
        --val_data $val_ds \\
        --early_stopping_data $early_stop_ds \\
        --model_name "${model_name}" \\
        --hpam_combi $hpam_combi \\
        --response_transformation $response_transformation \\
        --path_data $path_data \\
        --model_checkpoint_dir $model_checkpoint_dir


    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)")
        sklearn: \$(python -c "import sklearn; print(sklearn.__version__)")
        numpy: \$(python -c "import numpy; print(numpy.__version__)")
        pandas: \$(python -c "import pandas; print(pandas.__version__)")
        pytorch_lightning: \$(python -c "import pytorch_lightning; print(pytorch_lightning.__version__)")
        torch: \$(python -c "import torch; print(torch.__version__)" | sed 's/+.*//')
        platform: \$(python -c "import platform; print(platform.__version__)")
    END_VERSIONS
    """
}
// UNZIP
//
// Extracts a zip archive into the task directory.
//
// Input
//   tuple: dataset_name, file (staged archive)
//
// Outputs
//   unzipped_archive : tuple(dataset_name, directory named after the archive's
//                      base name, '<baseName>/<baseName>.csv' inside it)
//   versions         : versions.yml with the unzip version
//
// NOTE(review): the output declaration assumes the archive unpacks into a
// top-level directory named like the archive and containing a CSV of the same
// name - confirm against the published archives.
process UNZIP {
    tag "${dataset_name}"
    label 'process_single'

    conda "${moduleDir}/env.yml"

    input:
    tuple val(dataset_name), path(file)

    output:
    tuple val(dataset_name), path("${file.baseName}/"), path("${file.baseName}/${file.baseName}.csv"), emit: unzipped_archive
    path("versions.yml"), emit: versions

    script:
    // The archive name is quoted so names containing spaces survive the shell.
    // The version is read from the first line of `unzip -v` instead of being
    // hard-coded, so it reflects the binary that actually ran. sed consumes the
    // whole output, which avoids a SIGPIPE under `pipefail`.
    """
    unzip "${file}"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        unzip: \$(unzip -v 2>&1 | sed -n '1s/^UnZip \\([0-9.]*\\).*/\\1/p')
    END_VERSIONS
    """

}

// VISUALIZE_RESULTS
//
// Runs `visualize_results.py` on the collected evaluation tables and emits
// everything the script writes into `report/`.
//
// Inputs
//   1. tuple of staged paths: eval_results, eval_results_per_drug,
//      eval_results_per_cl, true_vs_predicted
//   2. path_data - passed as a plain value (not staged) to --path_data
//
// Outputs
//   html_out : files matching 'report/*'
//   versions : versions.yml with the package versions queried inside the task
process VISUALIZE_RESULTS {
    label 'process_medium'

    conda "${moduleDir}/env.yml"

    input:
    tuple path(eval_results), path(eval_results_per_drug), path(eval_results_per_cl), path(true_vs_predicted)
    val(path_data)

    output:
    path('report/*'), emit: html_out
    path("versions.yml"), emit: versions


    script:
    // params.test_mode is a comma-separated list; the commas are replaced by
    // spaces so each mode becomes its own argument to --test_modes.
    """
    visualize_results.py \\
        --test_modes ${params.test_mode.replace(',', ' ')} \\
        --eval_results $eval_results \\
        --eval_results_per_drug $eval_results_per_drug \\
        --eval_results_per_cl $eval_results_per_cl \\
        --true_vs_predicted $true_vs_predicted \\
        --path_data $path_data

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
        drevalpy: \$(python -c "import drevalpy; print(drevalpy.__version__)")
        pandas: \$(python -c "import pandas; print(pandas.__version__)")
        matplotlib: \$(python -c "import matplotlib; print(matplotlib.__version__)")
        plotly: \$(python -c "import plotly; print(plotly.__version__)")
        scikit_posthocs: \$(python -c "import scikit_posthocs; print(scikit_posthocs.__version__)")
        scipy: \$(python -c "import scipy; print(scipy.__version__)")
        sklearn: \$(python -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """

}
// Message attached to the assertion in the script block below.
def deprecation_message = """
WARNING: This module has been deprecated.

Reason:
This module is no longer recommended for use, as it is replaced by the function softwareVersionsToYAML
in the utils_nfcore_pipeline subworkflow that is included in the nf-core template.

"""
// CUSTOM_DUMPSOFTWAREVERSIONS
//
// Renders the `dumpsoftwareversions.py` template against a collected versions
// file and emits the merged YAML, the MultiQC YAML and this module's own
// versions.yml.
process CUSTOM_DUMPSOFTWAREVERSIONS {
    label 'process_single'

    // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container
    conda "${moduleDir}/environment.yml"
    // Singularity image is used only when the engine is singularity and
    // task.ext.singularity_pull_docker_container is not set; otherwise the
    // biocontainers image name is used.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/multiqc:1.27--pyhdfd78af_0' :
        'biocontainers/multiqc:1.27--pyhdfd78af_0' }"

    input:
    path versions

    output:
    path "software_versions.yml"    , emit: yml
    path "software_versions_mqc.yml", emit: mqc_yml
    path "versions.yml"             , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // The condition is the constant `true`, so this assertion never fails; it
    // only carries the deprecation text.
    assert true: deprecation_message
    // NOTE(review): `args` is defined here; whether the template reads it is not
    // visible from the Python code in this chunk.
    def args = task.ext.args ?: ''
    template 'dumpsoftwareversions.py'
}
b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 0000000..b83b32c --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + +import platform +from textwrap import dedent + +import yaml + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + + + + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + + + + + """ + ) + ) + html.append("") + html.append("
Process Name Software Version
{process if (i == 0) else ''}{tool}{version}
") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 0000000..b1e1630 --- /dev/null +++ 
// nf-test spec for CUSTOM_DUMPSOFTWAREVERSIONS: feeds two small hand-written
// versions documents into the process and snapshots its outputs.
nextflow_process {

    name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS"
    script "../main.nf"
    process "CUSTOM_DUMPSOFTWAREVERSIONS"
    tag "modules"
    tag "modules_nfcore"
    tag "custom"
    tag "dumpsoftwareversions"
    tag "custom/dumpsoftwareversions"

    test("Should run without failures") {
        when {
            process {
                // Two YAML snippets are merged into a single file by collectFile()
                // and passed as the only process input.
                """
                def tool1_version = '''
                TOOL1:
                    tool1: 0.11.9
                '''.stripIndent()

                def tool2_version = '''
                TOOL2:
                    tool2: 1.9
                '''.stripIndent()

                input[0] = Channel.of(tool1_version, tool2_version).collectFile()
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                // Only the first 11 lines of the MultiQC YAML and the first 8
                // lines of the merged YAML are snapshotted.
                // NOTE(review): presumably the remaining lines hold run-specific
                // workflow values - confirm before widening the ranges.
                { assert snapshot(
                    process.out.versions,
                    file(process.out.mqc_yml[0]).readLines()[0..10],
                    file(process.out.yml[0]).readLines()[0..7]
                ).match()
                }
            )
        }
    }
}
---------------------------------------------------------------------------------------- */ +process.container = 'ghcr.io/daisybio/drevalpy:v1.3.4' + // Global default params, used in configs params { - - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null + // For this pipeline + run_id = 'my_run' + models = 'NaiveDrugMeanPredictor' + baselines = 'NaiveMeanEffectsPredictor' + test_mode = 'LCO' + randomization_mode = 'None' + randomization_type = 'permutation' + n_trials_robustness = 0 + dataset_name = 'CTRPv2' + cross_study_datasets = '' + no_refitting = false + optim_metric = 'RMSE' + n_cv_splits = 10 + response_transformation = 'None' + path_data = 'data' + model_checkpoint_dir = 'TEMPORARY' + measure = 'LN_IC50' + zenodo_link = 'https://zenodo.org/records/15533857/files/' + no_hyperparameter_tuning = false + final_model_on_full_data = false // Boilerplate options - outdir = null + outdir = 'results' publish_dir_mode = 'copy' email = null email_on_fail = null @@ -165,8 +183,7 @@ includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !pa // Load nf-core/drugresponseeval custom profiles from different institutions. -// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs -// includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/pipeline/drugresponseeval.config" : "/dev/null" +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/pipeline/drugresponseeval.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled @@ -224,20 +241,36 @@ manifest { name = 'nf-core/drugresponseeval' author = """Judith Bernett""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ - // TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0 [ name: 'Judith Bernett', - affiliation: '', - email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + affiliation: 'Data Science in Systems Biology, TUM School of Life Sciences, Technical University of Munich, Germany.', + email: 'judith.bernett@tum.de', + github: 'JudithBernett', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0001-5812-8013' + ], + [ + name: 'Pascal Iversen', + affiliation: 'Freie Universität Berlin, Department of Mathematics and Computer Science, Germany. 
Hasso-Plattner-Institute, Digital Engineering Faculty, University of Potsdam, Germany.', + email: 'pascal.iversen@fu-berlin.de', + github: 'PascalIversen', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0001-9877-4300' + ], + [ + name: 'Mario Picciani', + affiliation: 'Computational Mass Spectrometry, TUM School of Life Sciences, Technical University of Munich, Germany.', + email: 'mario.picciani@tum.de', + github: 'picciama', + contribution: ['contributor'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0003-0428-1703' ], ] homePage = 'https://github.com/nf-core/drugresponseeval' + icon = './docs/images/drugresponseeval_icon.png' description = """This pipeline evaluates drug response models in various settings on a variety of datasets.""" mainScript = 'main.nf' - defaultBranch = 'master' + defaultBranch = 'main' nextflowVersion = '!>=24.04.2' version = '1.1.0' doi = '' diff --git a/nextflow_schema.json b/nextflow_schema.json index e52ec18..e50fc4a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,29 +5,52 @@ "description": "This pipeline evaluates drug response models in various settings on a variety of datasets.", "type": "object", "$defs": { + "model_options": { + "title": "Model options", + "type": "object", + "fa_icon": "fa-solid fa-laptop-code", + "description": "Define the models and baselines to be tested.", + "required": ["models", "baselines"], + "properties": { + "models": { + "type": "string", + "description": "Model to be tested.", + "help_text": "Model to be tested. See the documentation for a list of pre-implemented models. Can be multiple models separated by ','.", + "default": "NaiveDrugMeanPredictor" + }, + "baselines": { + "type": "string", + "description": "Baselines to be tested.", + "help_text": "Baselines to be tested. See documentation of a list of available models. 
For baselines, randomization and robustness tests are not run. The NaiveMeanEffectsPredictor will always be included.", + "default": "NaiveMeanEffectsPredictor" + } + } + }, "input_output_options": { "title": "Input/output options", "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["run_id", "dataset_name", "outdir"], "properties": { - "input": { + "run_id": { + "type": "string", + "description": "Run name for the pipeline. The subdirectory in results will be named like this.", + "help_text": "You will need to set a run identifier for the pipeline. This is used to create a unique output directory for each run.", + "default": "my_run" + }, + "dataset_name": { "type": "string", - "format": "file-path", - "exists": true, - "schema": "assets/schema_input.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/drugresponseeval/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "description": "Name of the dataset. Pre-supplied datasets are CTRPv2, CTRPv1, CCLE, GDSC1, GDSC2, TOYv1, TOYv2.", + "help_text": "Name of the dataset used for the pipeline. 
This can be either one of the provided datasets ('GDSC1', 'GDSC2', 'CCLE', 'CTRPv1', 'CTRPv2', 'TOYv1', 'TOYv2'), in which case the dataset with the fitted curves is downloaded, or a custom dataset name, pointing either to raw viability measurements for automatic curve fitting, or pre-fit data (see no_refitting option; not recommended for dataset comparability reasons due to potential differences in fitting procedures).", + "default": "CTRPv2" }, "outdir": { "type": "string", "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "description": "The output directory where the results will be saved. Default is results/", + "fa_icon": "fas fa-folder-open", + "default": "results" }, "email": { "type": "string", @@ -38,6 +61,149 @@ } } }, + "mode_options": { + "title": "Mode options (LPO/LCO/LTO/LDO)", + "type": "object", + "fa_icon": "fa-solid fa-table-cells-column-lock", + "description": "Define the mode in which the pipeline will be run.", + "required": ["test_mode"], + "properties": { + "test_mode": { + "type": "string", + "description": "Run the pipeline in test mode LPO (Leave-random-Pairs-Out), LCO (Leave-Cell-line-Out), LTO (Leave-Tissue-Out), or LDO (Leave-Drug-Out).", + "default": "LCO", + "help_text": "Which tests to run (LPO=Leave-random-Pairs-Out, LCO=Leave-Cell-line-Out, LTO=Leave-Tissue-Out, LDO=Leave-Drug-Out). Can be a list of test runs e.g. 'LPO,LCO,LTO,LDO' to run all tests. Default is LCO.", + "pattern": "^((LPO|LCO|LTO|LDO)?,?)*(?//_raw.csv (also see 'path_data' option) containing the raw response data. We fit the curves by default with CurveCurator to provide fair comparison to our other available datasets. The fitted data will then be stored at <path_data>/<dataset_name>/<dataset_name>.csv. 
If you want to disable the curve fitting, set this flag.", + "default": false + }, + "optim_metric": { + "type": "string", + "description": "Optimization metric for the pipeline.", + "default": "RMSE", + "help_text": "Optimization metric for the pipeline. All models will minimize (MSE, RMSE, MAE)/maximize (R^2, Pearson, Spearman, Kendall) this metric calculated on the validation set. Default is RMSE.", + "enum": ["RMSE", "MSE", "MAE", "R^2", "Pearson", "Spearman", "Kendall"] + }, + "n_cv_splits": { + "type": "integer", + "default": 10, + "description": "Number of cross-validation splits.", + "help_text": "Number of cross-validation splits. Default is 10.", + "minimum": 2 + }, + "response_transformation": { + "type": "string", + "description": "Response transformation", + "help_text": "Transformation to apply to the response variable. Possible values: None, standard, minmax, robust.", + "default": "None", + "enum": ["None", "standard", "minmax", "robust"] + }, + "model_checkpoint_dir": { + "type": "string", + "description": "Model checkpoint directory", + "default": "TEMPORARY", + "help_text": "Directory to save model checkpoints." + }, + "no_hyperparameter_tuning": { + "type": "boolean", + "description": "Disable hyperparameter tuning.", + "default": false, + "help_text": "Set this flag to disable hyperparameter tuning. If set, the pipeline will not perform hyperparameter tuning and will use the default parameters for the models (intended for quick runs or debugging)." + }, + "final_model_on_full_data": { + "type": "boolean", + "description": "Train final model on full data.", + "default": false, + "help_text": "If True, saves a final model, trained/tuned on the union of all folds after CV. This is useful if you want to use the model for predictions on new data after running the pipeline." 
+ } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -159,9 +325,27 @@ } }, "allOf": [ + { + "$ref": "#/$defs/model_options" + }, { "$ref": "#/$defs/input_output_options" }, + { + "$ref": "#/$defs/mode_options" + }, + { + "$ref": "#/$defs/randomization_options" + }, + { + "$ref": "#/$defs/data_options" + }, + { + "$ref": "#/$defs/robustness_options" + }, + { + "$ref": "#/$defs/additional_options" + }, { "$ref": "#/$defs/institutional_config_options" }, diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index be8b57f..2c65148 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "Stable", "datePublished": "2025-06-03T11:01:28+00:00", - "description": "

\n \n \n \"nf-core/drugresponseeval\"\n \n

\n\n[![GitHub Actions CI Status](https://github.com/nf-core/drugresponseeval/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/drugresponseeval/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/drugresponseeval)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23drugresponseeval-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/drugresponseeval)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/drugresponseeval** is a bioinformatics pipeline that ...\n\n\n\n\n\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\n\nNow, you can run the pipeline using:\n\n\n\n```bash\nnextflow run nf-core/drugresponseeval \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/drugresponseeval/usage) and the [parameter documentation](https://nf-co.re/drugresponseeval/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/drugresponseeval/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/drugresponseeval/output).\n\n## Credits\n\nnf-core/drugresponseeval was originally written by Judith Bernett.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#drugresponseeval` channel](https://nfcore.slack.com/channels/drugresponseeval) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/drugresponseeval\"\n \n

\n\n[![GitHub Actions CI Status](https://github.com/nf-core/drugresponseeval/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/drugresponseeval/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/drugresponseeval/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.14779984-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.14779984)\n\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/drugresponseeval)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23drugresponseeval-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/drugresponseeval)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n# ![drevalpy_summary](assets/dreval_summary.svg)\n\n**DrEval** is a bioinformatics framework that includes a PyPI package (drevalpy) and a Nextflow\npipeline (this repo). DrEval ensures that evaluations are statistically sound, biologically\nmeaningful, and reproducible. DrEval simplifies the implementation of drug response prediction\nmodels, allowing researchers to focus on advancing their modeling innovations by automating\nstandardized evaluation protocols and preprocessing workflows. With DrEval, hyperparameter\ntuning is fair and consistent. With its flexible model interface, DrEval supports any model type,\nranging from statistical models to complex neural networks. By contributing your model to the\nDrEval catalog, you can increase your work's exposure, reusability, and transferability.\n\n1. The response data is loaded\n2. All models are trained and evaluated in a cross-validation setting\n3. For each CV split, the best hyperparameters are determined using a grid search per model\n4. The model is trained on the full training set (train & validation) with the best\n hyperparameters to predict the test set\n5. If randomization tests are enabled, the model is trained on the full training set with the best\n hyperparameters to predict the randomized test set\n6. If robustness tests are enabled, the model is trained N times on the full training set with the\n best hyperparameters\n7. 
Plots are created summarizing the results\n\nFor baseline models, no randomization or robustness tests are performed.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/drugresponseeval \\\n -profile \\\n --models \\\n --baselines \\\n --dataset_name \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/drugresponseeval/usage) and the [parameter documentation](https://nf-co.re/drugresponseeval/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/drugresponseeval/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/drugresponseeval/output).\n\n## Credits\n\nnf-core/drugresponseeval was originally written by Judith Bernett (TUM) and Pascal Iversen (FU\nBerlin).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n## Contributions and Support\n\nContributors to nf-core/drugresponseeval and the drevalpy PyPI package:\n\n- [Judith Bernett](https://github.com/JudithBernett) (TUM)\n- [Pascal Iversen](https://github.com/PascalIversen) (FU Berlin)\n- [Mario 
Picciani](https://github.com/picciama) (TUM)\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#drugresponseeval` channel](https://nfcore.slack.com/channels/drugresponseeval) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/drugresponseeval for your analysis, please cite it using the following doi: [10.5281/zenodo.14779984](https://doi.org/10.5281/zenodo.14779984)\n\n> Our corresponding publication is at doi [10.1101/2025.05.26.655288](https://doi.org/10.1101/2025.05.26.655288)\n>\n> Bernett, J., Iversen, P., Picciani, M., Wilhelm, M., Baum, K., & List, M. **From Hype to Health Check: Critical Evaluation of Drug Response Prediction Models with DrEval.**\n>\n> _bioRxiv_, 2025-05.\n\nThe underlying data is available at doi: [10.5281/zenodo.12633909](https://doi.org/10.5281/zenodo.12633909).\n\nThe underlying Python package is drevalpy, available on [PyPI](https://pypi.org/project/drevalpy/) as a standalone package, for which we also have an extensive [ReadTheDocs Documentation](https://drevalpy.readthedocs.io/en/latest/).\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -31,6 +31,9 @@ { "@id": "assets/" }, + { + "@id": "bin/" + }, { "@id": "conf/" }, @@ -40,6 +43,15 @@ { "@id": "docs/images/" }, + { + "@id": "modules/" + }, + { + "@id": "modules/local/" + }, + { + "@id": "modules/nf-core/" + }, { "@id": "workflows/" }, @@ -115,7 +127,11 @@ }, { "@id": "main.nf", - "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], "creator": [ { "@id": "https://orcid.org/0000-0001-5812-8013" @@ -141,21 +157,30 @@ "robustness-assessment", "training" ], - "license": ["MIT"], + "license": [ + "MIT" + ], "maintainer": [ { "@id": "https://orcid.org/0000-0001-5812-8013" } ], - "name": ["nf-core/drugresponseeval"], + "name": [ + "nf-core/drugresponseeval" + ], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, "sdPublisher": { "@id": "https://nf-co.re/" }, - "url": ["https://github.com/nf-core/drugresponseeval", "https://nf-co.re/drugresponseeval/1.1.0/"], - "version": ["1.1.0"] + "url": [ + "https://github.com/nf-core/drugresponseeval", + "https://nf-co.re/drugresponseeval/1.1.0/" + ], + "version": [ + "1.1.0" + ] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -205,6 +230,11 @@ "@type": "Dataset", "description": "Additional files" }, + { + "@id": "bin/", + "@type": "Dataset", + "description": "Scripts that must be callable from a pipeline process" + }, { "@id": "conf/", "@type": "Dataset", @@ -220,6 +250,21 @@ "@type": "Dataset", "description": "Images for the documentation files" }, + { + "@id": "modules/", + "@type": "Dataset", + "description": "Modules used by the pipeline" + }, + { + "@id": "modules/local/", + "@type": "Dataset", + "description": "Pipeline-specific modules" + }, + { + "@id": "modules/nf-core/", + "@type": "Dataset", + "description": "nf-core modules" + }, { 
"@id": "workflows/", "@type": "Dataset", @@ -308,4 +353,4 @@ "name": "Judith Bernett" } ] -} +} \ No newline at end of file diff --git a/subworkflows/local/model_testing/main.nf b/subworkflows/local/model_testing/main.nf new file mode 100644 index 0000000..6e579cf --- /dev/null +++ b/subworkflows/local/model_testing/main.nf @@ -0,0 +1,208 @@ +include { PREDICT_FULL } from '../../../modules/local/predict_full' +include { RANDOMIZATION_SPLIT } from '../../../modules/local/randomization_split' +include { RANDOMIZATION_TEST } from '../../../modules/local/randomization_test' +include { ROBUSTNESS_TEST } from '../../../modules/local/robustness_test' +include { FINAL_SPLIT } from '../../../modules/local/final_split' +include { TUNE_FINAL_MODEL } from '../../../modules/local/tune_final_model' +include { EVALUATE_FIND_MAX as EVALUATE_FIND_MAX_FINAL } from '../../../modules/local/evaluate_find_max' +include { TRAIN_FINAL_MODEL } from '../../../modules/local/train_final_model' +include { CONSOLIDATE_RESULTS } from '../../../modules/local/consolidate_results' +include { EVALUATE_FINAL } from '../../../modules/local/evaluate_final' +include { COLLECT_RESULTS } from '../../../modules/local/collect_results' +include { VISUALIZE_RESULTS } from '../../../modules/local/visualize_results' + + +workflow MODEL_TESTING { + take: + ch_models_baselines // from input [model_class, model_name] + best_hpam_per_split // from RUN_CV: [split_id, test_mode, split_dataset, model_name, best_hpam_combi_X.yaml] + randomizations // from input + response_dataset // from LOAD_RESPONSE + cross_study_datasets // from LOAD_RESPONSE + ch_models // from RUN_CV [model_class, model_name] + work_path // from input + test_modes // e.g., ['LPO', 'LCO'] + ch_hpam_combis // from RUN_CV [model_name, hpam_X.yaml] + + main: + ch_versions = Channel.empty() + if (params.cross_study_datasets == '') { + cross_study_datasets = channel.fromPath(['./NONE.csv']) + } + ch_tmp = best_hpam_per_split.map{ + split_id, test_mode, 
path_to_split, model_name, path_to_hpams -> + return [model_name, test_mode, split_id, path_to_split, path_to_hpams] + } + ch_tmp2 = cross_study_datasets + .collect() + .map{it -> [it]} + // [[cross_study_datasets], model, test_mode, split_id, split_dataset, best_hpam_combi_X.yaml, path/to/data] + ch_predict_final = ch_tmp2.combine(ch_tmp).combine(work_path) + + PREDICT_FULL ( + ch_predict_final, + params.response_transformation, + params.model_checkpoint_dir + ) + ch_versions = ch_versions.mix(PREDICT_FULL.out.versions) + ch_vis = PREDICT_FULL.out.ch_vis.concat(PREDICT_FULL.out.ch_cross) + + if (params.randomization_mode != 'None') { + ch_randomization = channel.from(randomizations) + // randomizations only for models, not for baselines + ch_models_rand = ch_models + .map{it -> it[0]} + .unique() + .combine(ch_randomization) + RANDOMIZATION_SPLIT ( + ch_models_rand + ) + ch_versions = ch_versions.mix(RANDOMIZATION_SPLIT.out.versions) + ch_rand_views = ch_models + .combine(RANDOMIZATION_SPLIT.out.randomization_test_views.transpose(), by: 0) + .map{ model_class, model_name, rand_file -> [model_name, rand_file] } + + ch_best_hpams_per_split_rand = best_hpam_per_split.map { + split_id, test_mode, path_to_split, model_name, path_to_hpams -> + return [model_name, test_mode, split_id, path_to_split, path_to_hpams] + } + // [model_name, test_mode, split_id, split_dataset, best_hpam_combi_X.yaml, + // randomization_views] + ch_randomization = ch_best_hpams_per_split_rand + .combine(ch_rand_views, by: 0) + .combine(work_path) + + RANDOMIZATION_TEST ( + ch_randomization, + params.randomization_type, + params.response_transformation, + params.model_checkpoint_dir + ) + ch_versions = ch_versions.mix(RANDOMIZATION_TEST.out.versions) + ch_vis = ch_vis.concat(RANDOMIZATION_TEST.out.ch_vis) + } + + if (params.n_trials_robustness > 0) { + ch_trials_robustness = Channel.from(1..params.n_trials_robustness) + ch_trials_robustness = ch_models + .map{it -> it[1]} + 
.combine(ch_trials_robustness) + + ch_best_hpams_per_split_rob = best_hpam_per_split.map { + split_id, test_mode, path_to_split, model_name, path_to_hpams -> + return [model_name, test_mode, split_id, path_to_split, path_to_hpams] + } + + // [model_name, test_mode, split_id, split_dataset, best_hpam_combi_X.yaml, + // robustness_iteration] + ch_robustness = ch_best_hpams_per_split_rob.combine(ch_trials_robustness, by: 0).combine(work_path) + ROBUSTNESS_TEST ( + ch_robustness, + params.randomization_type, + params.response_transformation, + params.model_checkpoint_dir + ) + ch_versions = ch_versions.mix(ROBUSTNESS_TEST.out.versions) + ch_vis = ch_vis.concat(ROBUSTNESS_TEST.out.ch_vis) + } + + if (params.final_model_on_full_data) { + // we only do this for models, not for baselines + ch_test_modes = channel.from(test_modes) + ch_final_split = ch_models + .map{it -> it[0]} + .unique() + .combine(response_dataset) + .combine(ch_test_modes) + .combine(work_path) + + FINAL_SPLIT( + ch_final_split + ) + ch_versions = ch_versions.mix(FINAL_SPLIT.out.versions) + + ch_tune_final_model = ch_models + .combine(FINAL_SPLIT.out.final_datasets, by: 0) + .map { model_class, model_name, train_ds, val_ds, es_ds -> + [model_name, train_ds, val_ds, es_ds] } + .combine(ch_test_modes) + .combine(work_path) + .combine(ch_hpam_combis, by: 0) + + TUNE_FINAL_MODEL( + ch_tune_final_model, + params.response_transformation, + params.model_checkpoint_dir, + params.optim_metric + ) + ch_versions = ch_versions.mix(TUNE_FINAL_MODEL.out.versions) + ch_combined_hpams = TUNE_FINAL_MODEL.out.final_prediction.groupTuple(by: [0,1,2]) + + EVALUATE_FIND_MAX_FINAL( + ch_combined_hpams, + params.optim_metric + ) + ch_versions = ch_versions.mix(EVALUATE_FIND_MAX_FINAL.out.versions) + ch_final_model = EVALUATE_FIND_MAX_FINAL.out.best_combis + .map{ model_name, final_constant, test_mode, best_hpam_combi -> + [model_name, test_mode, best_hpam_combi] } + .combine(FINAL_SPLIT.out.final_datasets, by: 0) + 
.combine(work_path) + TRAIN_FINAL_MODEL ( + ch_final_model, + params.response_transformation, + params.model_checkpoint_dir + ) + ch_versions = ch_versions.mix(TRAIN_FINAL_MODEL.out.versions) + } + + ch_consolidate = ch_vis + .map{ test_mode, model, pred_file -> [test_mode, model.split("\\.")[0]] } + .unique() + + CONSOLIDATE_RESULTS ( + ch_consolidate, + randomizations, + ch_vis.count() // wait for ch_vis to finish + ) + ch_versions = ch_versions.mix(CONSOLIDATE_RESULTS.out.versions) + ch_consolidate = CONSOLIDATE_RESULTS.out.ch_vis.transpose() + // filter out SingleDrugModels that have been consolidated + ch_vis = ch_vis + .concat(ch_consolidate) + .transpose() + .map{ test_mode, model, pred_file -> [model, test_mode, pred_file] } + .combine(ch_models_baselines, by: 0) + .map{ model, test_mode, pred_file -> [test_mode, model, pred_file] } + + EVALUATE_FINAL ( + ch_vis + ) + ch_versions = ch_versions.mix(EVALUATE_FINAL.out.versions) + + ch_collapse = EVALUATE_FINAL.out.ch_individual_results.collect() + + COLLECT_RESULTS ( + ch_collapse, + work_path + ) + ch_versions = ch_versions.mix(COLLECT_RESULTS.out.versions) + + // evaluation_results_per_cl and evaluation_results_per_drug are optional + evaluation_results_per_drug = COLLECT_RESULTS.out.evaluation_results_per_drug.ifEmpty(file("${projectDir}/assets/NO_FILE", checkIfExists: true)) + evaluation_results_per_cl = COLLECT_RESULTS.out.evaluation_results_per_cl.ifEmpty(file("${projectDir}/assets/NO_FILE", checkIfExists: true)) + ch_input_vis = COLLECT_RESULTS.out.evaluation_results.concat( + evaluation_results_per_drug, + evaluation_results_per_cl, + COLLECT_RESULTS.out.true_vs_pred + ).collect() + + VISUALIZE_RESULTS( + ch_input_vis, + work_path + ) + ch_versions = ch_versions.mix(VISUALIZE_RESULTS.out.versions) + + emit: + versions = ch_versions +} diff --git a/subworkflows/local/preprocess_custom/main.nf b/subworkflows/local/preprocess_custom/main.nf new file mode 100644 index 0000000..c9beea6 --- /dev/null +++ 
b/subworkflows/local/preprocess_custom/main.nf @@ -0,0 +1,57 @@ +include { FIT_CURVES } from '../../../modules/local/fit_curves' +include { PREPROCESS_RAW_VIABILITY } from '../../../modules/local/preprocess_raw_viability' +include { POSTPROCESS_CURVECURATOR_DATA } from '../../../modules/local/postprocess_curvecurator_output' + +workflow PREPROCESS_CUSTOM { + take: + work_path + dataset_name + measure + + main: + ch_versions = Channel.empty() + def preimplemented_datasets = ['GDSC1', 'GDSC2', 'CCLE', 'CTRPv1', 'CTRPv2', 'TOYv1', 'TOYv2'] + if(!params.no_refitting){ + def raw_file = file("${params.path_data}/${dataset_name}/${dataset_name}_raw.csv") // file() instead of java.io.File so that non-local data directories resolve too + // refit with CurveCurator or use measures refitted with CurveCurator + if (dataset_name in preimplemented_datasets) { + // the dataset was already fit, use the pre-fitted curves and derived measure + ch_measure = channel.of("${measure}" + "_curvecurator") + } else { + log.info "Using a custom dataset: ${dataset_name}. If you want to use a pre-fitted dataset, please use one of the following: ${preimplemented_datasets.join(', ')}." + // the dataset is not pre-fitted, we need to refit it + if(!raw_file.exists()){ + throw new Exception("Raw data file not found: ${raw_file}. You want to refit a custom dataset with CurveCurator which requires raw viability data to be located at ${raw_file} but the file does not exist. 
Please provide the raw data in the correct format or set `no_refitting` to true in your parameters.") + }else{ + PREPROCESS_RAW_VIABILITY(dataset_name, work_path) + ch_versions = ch_versions.mix(PREPROCESS_RAW_VIABILITY.out.versions) + ch_toml_files = PREPROCESS_RAW_VIABILITY.out.path_to_toml + .flatten() + .map { file -> [file.parent.name, file] } + ch_curvecurator_input = PREPROCESS_RAW_VIABILITY.out.curvecurator_input + .flatten() + .map { file -> [file.parent.name, file] } + // [dose_dir_name, config.toml, curvecurator_input.tsv] + ch_fit_curves = ch_toml_files.combine(ch_curvecurator_input, by: 0) + FIT_CURVES(dataset_name, ch_fit_curves) + ch_versions = ch_versions.mix(FIT_CURVES.out.versions) + ch_curves = FIT_CURVES.out.path_to_curvecurator_out.collect() + POSTPROCESS_CURVECURATOR_DATA(dataset_name, ch_curves, measure) + ch_versions = ch_versions.mix(POSTPROCESS_CURVECURATOR_DATA.out.versions) + ch_measure = POSTPROCESS_CURVECURATOR_DATA.out.measure + } + } + }else{ + log.warn "You have set `no_refitting` to true. We discourage this option for comparability to our pre-supplied datasets. If you want to use a custom dataset, please ensure it is processed in the correct format." + def processed_file = file("${params.path_data}/${dataset_name}/${dataset_name}.csv") // file() instead of java.io.File, matching the cross-study lookup in RUN_CV + if(dataset_name !in preimplemented_datasets){ + if (!processed_file.exists()){ + throw new Exception("Processed data file not found: ${processed_file}. You want to use a custom dataset but the file does not exist. 
Please provide the processed data in the correct format or set `no_refitting` to false in your parameters.") + } + } + ch_measure = measure + } + emit: + measure = ch_measure + versions = ch_versions +} diff --git a/subworkflows/local/run_cv/main.nf b/subworkflows/local/run_cv/main.nf new file mode 100644 index 0000000..2f9d0d8 --- /dev/null +++ b/subworkflows/local/run_cv/main.nf @@ -0,0 +1,166 @@ +include { UNZIP as UNZIP_RESPONSE } from '../../../modules/local/unzip' +include { UNZIP as UNZIP_CS_RESPONSE } from '../../../modules/local/unzip' +include { LOAD_RESPONSE as LOAD_RESPONSE } from '../../../modules/local/load_response' +include { LOAD_RESPONSE as LOAD_CS_RESPONSE } from '../../../modules/local/load_response' +include { MAKE_MODEL_CHANNEL as MAKE_MODELS } from '../../../modules/local/make_model_channel' +include { MAKE_MODEL_CHANNEL as MAKE_BASELINES } from '../../../modules/local/make_model_channel' +include { CV_SPLIT } from '../../../modules/local/cv_split' +include { HPAM_SPLIT } from '../../../modules/local/hpam_split' +include { TRAIN_AND_PREDICT_CV } from '../../../modules/local/train_and_predict_cv' +include { EVALUATE_FIND_MAX } from '../../../modules/local/evaluate_find_max' + +workflow RUN_CV { + take: + test_modes // LPO,LDO,LCO, LTO + ch_models // channel of model names for full testing + ch_baselines // channel of model names for comparison + work_path // path to data + measure + + main: + ch_versions = Channel.empty() + def response_path = file("${params.path_data}/${params.dataset_name}/${params.dataset_name}.csv") // same file()-based existence check as the cross-study datasets below + if (!response_path.exists()) { + log.info "Downloading response dataset ${params.dataset_name} from Zenodo: ${params.zenodo_link}${params.dataset_name}.zip" + ch_unzip = channel + .fromPath("${params.zenodo_link}${params.dataset_name}.zip") + .map { file -> [params.dataset_name, file] } + UNZIP_RESPONSE(ch_unzip) + ch_versions = 
ch_versions.mix(UNZIP_RESPONSE.out.versions) + ch_response = 
UNZIP_RESPONSE.out.unzipped_archive + .map { dataset_name, path_to_dir, response_file -> + file(response_file, checkIfExists: true) + } + }else{ + log.info "Using existing response dataset ${params.dataset_name} from ${response_path}" + ch_response = channel.fromPath(response_path, checkIfExists: true) + } + + if (params.cross_study_datasets != '') { + def cross_study_datasets = params.cross_study_datasets.split(',') + log.info "Using cross-study datasets: ${cross_study_datasets.join(', ')}" + // iterate over cross-study datasets and load them + ch_all_cs = channel + .of(cross_study_datasets) + .map { dataset_name -> [dataset_name, file("${params.path_data}/${dataset_name}/${dataset_name}.csv")]} + ch_cs_cached = ch_all_cs + .filter { dataset_name, dataset_path -> + dataset_path.exists() + } + .map { dataset_name, dataset_path -> + file("${dataset_path}", checkIfExists: true) + } + + ch_cs_to_be_loaded = ch_all_cs + .filter { dataset_name, dataset_path -> + !dataset_path.exists() + } + .map { dataset_name, dataset_path -> + [dataset_name, "${params.zenodo_link}${dataset_path.baseName}.zip"] + } + UNZIP_CS_RESPONSE(ch_cs_to_be_loaded) + ch_versions = ch_versions.mix(UNZIP_CS_RESPONSE.out.versions) + ch_cs_loaded = UNZIP_CS_RESPONSE.out.unzipped_archive + .map { dataset_name, path_to_dir, response_file -> + file(response_file, checkIfExists: true) + } + ch_cross_study_datasets = ch_cs_cached.concat(ch_cs_loaded) + } else { + ch_cross_study_datasets = channel.empty() + } + ch_response = measure.combine(ch_response) + ch_cross_study_datasets = measure.combine(ch_cross_study_datasets) + LOAD_RESPONSE(ch_response, params.no_refitting, false) + ch_versions = ch_versions.mix(LOAD_RESPONSE.out.versions) + LOAD_CS_RESPONSE(ch_cross_study_datasets, params.no_refitting, true) + ch_versions = ch_versions.mix(LOAD_CS_RESPONSE.out.versions) + + + ch_test_modes = channel.from(test_modes) + ch_data = ch_test_modes.combine(LOAD_RESPONSE.out.response_dataset) + + CV_SPLIT ( + 
ch_data, + params.n_cv_splits + ) + ch_versions = ch_versions.mix(CV_SPLIT.out.versions) + // [test_mode, [split_1.pkl, split_2.pkl, ..., split_n.pkl]] + ch_cv_splits = CV_SPLIT.out.response_cv_splits + + ch_models_baselines = ch_models.concat(ch_baselines) + ch_input_models = ch_models + .collect() + .map { models -> [models] } + .combine(LOAD_RESPONSE.out.response_dataset) + ch_input_baselines = ch_baselines + .collect() + .map { models -> [models] } + .combine(LOAD_RESPONSE.out.response_dataset) + + MAKE_MODELS ( + ch_input_models, + "models" + ) + ch_versions = ch_versions.mix(MAKE_MODELS.out.versions) + + MAKE_BASELINES ( + ch_input_baselines, + "baselines" + ) + ch_versions = ch_versions.mix(MAKE_BASELINES.out.versions) + + ch_models_expanded = MAKE_MODELS.out.all_models + .splitCsv(strip: true) + ch_baselines_expanded = MAKE_BASELINES.out.all_models + .splitCsv(strip: true) + ch_models_baselines_expanded = ch_models_expanded.concat(ch_baselines_expanded) + + HPAM_SPLIT ( + ch_models_baselines, + params.no_hyperparameter_tuning + ) + ch_versions = ch_versions.mix(HPAM_SPLIT.out.versions) + // [model_name, [hpam_0.yaml, hpam_1.yaml, ..., hpam_n.yaml]] + ch_hpam_combis = ch_models_baselines_expanded + .combine(HPAM_SPLIT.out.hpam_combi, by: 0) + .map { model_class, model_name, hpam_combis -> [model_name, hpam_combis] } + + // [model_name, hpam_X.yaml] + ch_hpam_combis = ch_hpam_combis.transpose() + + // [model_name, test_mode, split_X.pkl] + ch_model_cv = ch_models_baselines_expanded + .combine(ch_cv_splits.transpose()) + .map { model_class, model_name, test_mode, split -> [model_name, test_mode, split] } + // [model_name, test_mode, split_X.pkl, hpam_X.yaml, path/to/data] + ch_test_combis = ch_model_cv.combine(ch_hpam_combis, by: 0) + ch_test_combis = ch_test_combis.combine(work_path) + + TRAIN_AND_PREDICT_CV(ch_test_combis, params.response_transformation, params.model_checkpoint_dir) + ch_versions = ch_versions.mix(TRAIN_AND_PREDICT_CV.out.versions) + + // 
[model_name, test_mode, split_id, + // [hpam_0.yaml, hpam_1.yaml, ..., hpam_n.yaml], + // [prediction_dataset_0.pkl, ..., prediction_dataset_n.pkl] ] + ch_combined_hpams = TRAIN_AND_PREDICT_CV.out.pred_data.groupTuple(by: [0,1,2]) + + EVALUATE_FIND_MAX ( + ch_combined_hpams, + params.optim_metric + ) + ch_versions = ch_versions.mix(EVALUATE_FIND_MAX.out.versions) + + // [split_id, test_mode, split_dataset, model_name, best_hpam_combi_X.yaml] + ch_best_hpams_per_split = ch_cv_splits + .map { test_mode, it -> [it, it.baseName, test_mode]} + .transpose() + .combine(EVALUATE_FIND_MAX.out.best_combis, by: [1, 2]) + + emit: + best_hpam_per_split = ch_best_hpams_per_split + response_dataset = LOAD_RESPONSE.out.response_dataset.collect() + cross_study_datasets = LOAD_CS_RESPONSE.out.cross_study_datasets.collect() + ch_models = MAKE_MODELS.out.all_models.splitCsv(strip: true) + ch_hpam_combis = ch_hpam_combis + versions = ch_versions +} diff --git a/subworkflows/local/utils_nfcore_drugresponseeval_pipeline/main.nf b/subworkflows/local/utils_nfcore_drugresponseeval_pipeline/main.nf index 7188074..a36e2db 100644 --- a/subworkflows/local/utils_nfcore_drugresponseeval_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_drugresponseeval_pipeline/main.nf @@ -26,12 +26,15 @@ include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipelin workflow PIPELINE_INITIALISATION { take: - version // boolean: Display version and exit - validate_params // boolean: Boolean whether to validate parameters against the schema at runtime - monochrome_logs // boolean: Do not use coloured log outputs - nextflow_cli_args // array: List of positional nextflow CLI args - outdir // string: The output directory where the results will be saved - input // string: Path to input samplesheet + version // boolean: Display version and exit + validate_params // boolean: Boolean whether to validate parameters against the schema at runtime + monochrome_logs // boolean: Do not use coloured log 
outputs + nextflow_cli_args // array: List of positional nextflow CLI args + outdir // string: The output directory where the results will be saved + // pipeline-specific input + models // string: Comma-separated list of models to run + baselines // string: Comma-separated list of baseline models to run + path_data // string: Path to the data directory containing the input data main: @@ -64,32 +67,62 @@ workflow PIPELINE_INITIALISATION { ) // - // Create channel from input file provided through params.input + // Custom tests // - Channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) - } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] - } - .set { ch_samplesheet } + // it is possible to supply a custom model name, but write a warning + valid_model_names = [ + 'NaivePredictor', + 'NaiveDrugMeanPredictor', + 'NaiveCellLineMeanPredictor', + 'NaiveMeanEffectsPredictor', + 'NaiveTissueMeanPredictor', + 'ElasticNet', + 'RandomForest', + 'SVR', + 'SimpleNeuralNetwork', + 'MultiOmicsNeuralNetwork', + 'MultiOmicsRandomForest', + 'GradientBoosting', + 'SRMF', + 'DIPK', + 'ProteomicsRandomForest', + 'ProteomicsElasticNet', + 'SingleDrugRandomForest', + 'MOLIR', + 'SuperFELTR', + 'SingleDrugElasticNet', + 'SingleDrugProteomicsElasticNet', + 'SingleDrugProteomicsRandomForest', + ] + ch_models = channel.from(models.split(',').collect { it.trim() }) + def baseline_list = baselines.split(",").collect { it.trim() }.findAll { it } // split into a list, tolerating spaces and empty entries (same trimming as for models) + // always include NaiveMeanEffectsPredictor: append it (with a warning) when the user did not list it + if 
(!baseline_list.contains("NaiveMeanEffectsPredictor")) { + baseline_list = baseline_list + "NaiveMeanEffectsPredictor" + log.warn "NaiveMeanEffectsPredictor baseline model was not 
specified, adding it to the list of baselines." + } + ch_baselines = channel + .from(baseline_list) // emit one item per baseline from the parsed list, so the auto-added baseline is included and each name is validated on its own + .map { baseline -> + if(!valid_model_names.contains(baseline)){ + error("Invalid baseline model specified: ${baseline}. If you use a custom model, please specify it under --models. For baselines, please use one of the following: ${valid_model_names.join(', ')}") + } else { + baseline + } + } + + new_models = ch_models + .filter { model -> !valid_model_names.contains(model) } + new_models.view { model -> log.warn "You have specified a model not pre-implemented by us: ${model}. If it is your own model in your own fork of drevalpy and you are working in a custom environment, all good :) If not, here is the list of pre-implemented models: ${valid_model_names.join(', ')}" } + + work_path = channel.fromPath(path_data) emit: - samplesheet = ch_samplesheet - versions = ch_versions + models = ch_models + baselines = ch_baselines + work_path = work_path + versions = ch_versions } /* @@ -162,11 +195,12 @@ def validateInputSamplesheet(input) { // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. + // Optionally add in-text citation tools to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", + "DrEvalPy (Bernett, Iversen et al. 2025)", "." ].join(' ').trim() @@ -174,10 +208,11 @@ def toolCitationText() { } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. + // Optionally add bibliographic entries to this list. // Can use ternary operators to dynamically construct based conditions, e.g. 
params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ + "
  • Bernett, Iversen et al. (2025). From Hype to Health Check: Critical Evaluation of Drug Response Prediction Models with DrEval. bioRxiv, 2025-05.
  • " ].join(' ').trim() return reference_text @@ -204,12 +239,11 @@ def methodsDescriptionText(mqc_methods_yaml) { meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " // Tool references - meta["tool_citations"] = "" - meta["tool_bibliography"] = "" + //meta["tool_citations"] = "" + //meta["tool_bibliography"] = "" - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - // meta["tool_bibliography"] = toolBibliographyText() + meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + meta["tool_bibliography"] = toolBibliographyText() def methods_text = mqc_methods_yaml.text diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f847611..0000000 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c..0000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/tests/.nftignore b/tests/.nftignore index 73eb92f..65aa545 100644 --- a/tests/.nftignore +++ b/tests/.nftignore @@ -1,2 +1,13 @@ .DS_Store pipeline_info/*.{html,json,txt,yml} +test_run/evaluation*.{csv} +test_run/true_vs_pred.csv +test_run/LCO/NaiveDrugMeanPredictor/predictions/predictions_split*.csv +test_run/LCO/NaiveMeanEffectsPredictor/predictions/predictions_split*.csv +test_run/LCO/NaiveDrugMeanPredictor/cross_study/cross_study_TOYv2_split*.csv +test_run/LCO/NaiveMeanEffectsPredictor/cross_study/cross_study_TOYv2_split*.csv +test_run/report/comp_scatter/*.html 
+test_run/report/heatmaps/*.html +test_run/report/html_tables/*.html +test_run/report/violin_plots/*.html +test_run/report/heatmaps/*.html diff --git a/tests/default.nf.test b/tests/default.nf.test index eefd068..d90e530 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -23,7 +23,7 @@ nextflow_pipeline { // Number of successful tasks workflow.trace.succeeded().size(), // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions - removeNextflowVersion("$outputDir/pipeline_info/nf_core_drugresponseeval_software_mqc_versions.yml"), + removeNextflowVersion("$outputDir/pipeline_info/nf_core_drugresponseeval_software_versions.yml"), // All stable path name, with a relative path stable_name, // All files with stable contents diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..6792612 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,179 @@ +{ + "-profile test": { + "content": [ + 33, + { + "COLLECT_RESULTS": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0" + }, + "CONSOLIDATE_RESULTS": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0" + }, + "CV_SPLIT": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "sklearn": "1.7.0", + "numpy": "2.3.1" + }, + "EVALUATE_FINAL": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0", + "sklearn": "1.7.0", + "numpy": "2.3.1", + "scipy": "1.15.3" + }, + "EVALUATE_FIND_MAX": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0", + "sklearn": "1.7.0", + "numpy": "2.3.1", + "scipy": "1.15.3", + "yaml": "6.0.2" + }, + "HPAM_SPLIT": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "yaml": "6.0.2" + }, + "LOAD_CS_RESPONSE": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0" + }, + "LOAD_RESPONSE": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0" + }, + "MAKE_BASELINES": { + "python": "3.13.5", + 
"drevalpy": "1.3.4", + "sklearn": "1.7.0", + "numpy": "2.3.1", + "pandas": "2.3.0", + "pytorch_lightning": "2.5.2", + "torch": "2.7.1", + "platform": "1.0.8" + }, + "MAKE_MODELS": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "sklearn": "1.7.0", + "numpy": "2.3.1", + "pandas": "2.3.0", + "pytorch_lightning": "2.5.2", + "torch": "2.7.1", + "platform": "1.0.8" + }, + "PREDICT_FULL": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "sklearn": "1.7.0", + "numpy": "2.3.1", + "pandas": "2.3.0", + "pytorch_lightning": "2.5.2", + "torch": "2.7.1", + "platform": "1.0.8" + }, + "TRAIN_AND_PREDICT_CV": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "sklearn": "1.7.0", + "numpy": "2.3.1", + "pandas": "2.3.0", + "pytorch_lightning": "2.5.2", + "torch": "2.7.1", + "platform": "1.0.8" + }, + "UNZIP_CS_RESPONSE": { + "unzip": 6.0 + }, + "UNZIP_RESPONSE": { + "unzip": 6.0 + }, + "VISUALIZE_RESULTS": { + "python": "3.13.5", + "drevalpy": "1.3.4", + "pandas": "2.3.0", + "matplotlib": "3.10.3", + "plotly": "6.1.2", + "scikit_posthocs": "0.11.4", + "scipy": "1.15.3", + "sklearn": "1.7.0" + }, + "Workflow": { + "nf-core/drugresponseeval": "v1.1.0" + } + }, + [ + "pipeline_info", + "pipeline_info/nf_core_drugresponseeval_software_versions.yml", + "test_run", + "test_run/LCO", + "test_run/LCO/NaiveDrugMeanPredictor", + "test_run/LCO/NaiveDrugMeanPredictor/best_hpams", + "test_run/LCO/NaiveDrugMeanPredictor/best_hpams/best_hpams_split_0.json", + "test_run/LCO/NaiveDrugMeanPredictor/best_hpams/best_hpams_split_1.json", + "test_run/LCO/NaiveDrugMeanPredictor/cross_study", + "test_run/LCO/NaiveDrugMeanPredictor/cross_study/cross_study_TOYv2_split_0.csv", + "test_run/LCO/NaiveDrugMeanPredictor/cross_study/cross_study_TOYv2_split_1.csv", + "test_run/LCO/NaiveDrugMeanPredictor/predictions", + "test_run/LCO/NaiveDrugMeanPredictor/predictions/predictions_split_0.csv", + "test_run/LCO/NaiveDrugMeanPredictor/predictions/predictions_split_1.csv", + "test_run/LCO/NaiveMeanEffectsPredictor", + 
"test_run/LCO/NaiveMeanEffectsPredictor/best_hpams", + "test_run/LCO/NaiveMeanEffectsPredictor/best_hpams/best_hpams_split_0.json", + "test_run/LCO/NaiveMeanEffectsPredictor/best_hpams/best_hpams_split_1.json", + "test_run/LCO/NaiveMeanEffectsPredictor/cross_study", + "test_run/LCO/NaiveMeanEffectsPredictor/cross_study/cross_study_TOYv2_split_0.csv", + "test_run/LCO/NaiveMeanEffectsPredictor/cross_study/cross_study_TOYv2_split_1.csv", + "test_run/LCO/NaiveMeanEffectsPredictor/predictions", + "test_run/LCO/NaiveMeanEffectsPredictor/predictions/predictions_split_0.csv", + "test_run/LCO/NaiveMeanEffectsPredictor/predictions/predictions_split_1.csv", + "test_run/evaluation_results.csv", + "test_run/evaluation_results_per_cl.csv", + "test_run/report", + "test_run/report/LCO.html", + "test_run/report/LCO.png", + "test_run/report/comp_scatter", + "test_run/report/comp_scatter/comp_scatter_cell_line_name_LCO.html", + "test_run/report/critical_difference_plots", + "test_run/report/favicon.png", + "test_run/report/heatmaps", + "test_run/report/heatmaps/heatmap_algorithms_LCO.html", + "test_run/report/heatmaps/heatmap_algorithms_LCO_normalized.html", + "test_run/report/html_tables", + "test_run/report/html_tables/table_cross_study_TOYv2_LCO.html", + "test_run/report/index.html", + "test_run/report/nf-core-drugresponseeval_logo_light.png", + "test_run/report/regression_plots", + "test_run/report/violin_plots", + "test_run/report/violin_plots/violin_algorithms_LCO.html", + "test_run/report/violin_plots/violin_algorithms_LCO_normalized.html", + "test_run/true_vs_pred.csv" + ], + [ + "best_hpams_split_0.json:md5,99914b932bd37a50b983c5e7c90ae93b", + "best_hpams_split_1.json:md5,99914b932bd37a50b983c5e7c90ae93b", + "best_hpams_split_0.json:md5,99914b932bd37a50b983c5e7c90ae93b", + "best_hpams_split_1.json:md5,99914b932bd37a50b983c5e7c90ae93b", + "LCO.html:md5,39657391f49643c51c6423805458ab76", + "LCO.png:md5,e8bd8858f8149d610c63e8f3cc36577e", + 
"favicon.png:md5,8da229af56874ca6750c5402f0b76e2a", + "index.html:md5,b0bd552d41394f52069aa4bbd1bc9af1", + "nf-core-drugresponseeval_logo_light.png:md5,4f55fb24b445a68f35c12a84e83f99c7" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.3" + }, + "timestamp": "2025-06-23T13:39:22.624" + } +} \ No newline at end of file diff --git a/workflows/drugresponseeval.nf b/workflows/drugresponseeval.nf index 72b7614..3537738 100644 --- a/workflows/drugresponseeval.nf +++ b/workflows/drugresponseeval.nf @@ -7,20 +7,64 @@ include { paramsSummaryMap } from 'plugin/nf-schema' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_drugresponseeval_pipeline' +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// + +include { PREPROCESS_CUSTOM } from '../subworkflows/local/preprocess_custom' +include { RUN_CV } from '../subworkflows/local/run_cv' +include { MODEL_TESTING } from '../subworkflows/local/model_testing' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +def test_modes = params.test_mode.split(",") +def randomizations = params.randomization_mode.split(",") + workflow DRUGRESPONSEEVAL { take: - ch_samplesheet // channel: samplesheet read in from --input - main: + models // channel: [ string(models) ] + baselines // channel: [ string(baselines) ] + work_path // channel: path to the data channel.fromPath(params.path_data) + main: ch_versions = Channel.empty() + ch_models_baselines = models.concat(baselines) + + PREPROCESS_CUSTOM ( + work_path, + params.dataset_name, + params.measure + ) + ch_versions = ch_versions.mix(PREPROCESS_CUSTOM.out.versions) + + RUN_CV ( + test_modes, + models, + baselines, + work_path, + PREPROCESS_CUSTOM.out.measure + ) + ch_versions = 
ch_versions.mix(RUN_CV.out.versions) + + MODEL_TESTING ( + ch_models_baselines, + RUN_CV.out.best_hpam_per_split, + randomizations, + RUN_CV.out.response_dataset, + RUN_CV.out.cross_study_datasets, + RUN_CV.out.ch_models, + work_path, + test_modes, + RUN_CV.out.ch_hpam_combis + ) + ch_versions = ch_versions.mix(MODEL_TESTING.out.versions) + // // Collate and save software versions // @@ -32,7 +76,6 @@ workflow DRUGRESPONSEEVAL { newLine: true ).set { ch_collated_versions } - emit: versions = ch_versions // channel: [ path(versions.yml) ]
    Process Name \\", + " \\ Software Version
    CUSTOM_DUMPSOFTWAREVERSIONSpython3.13.1
    yaml6.0.2
    TOOL1tool10.11.9
    TOOL2tool21.9
    WorkflowNextflow