diff --git a/README.md b/README.md index 08f6d23f..2c8c88d5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,41 @@ +### Change log [2025-07-30 18:04:17] +1. Item Updated: `model_server_tester` (from version: `1.1.0` to `1.1.0`) +2. Item Updated: `aggregate` (from version: `1.3.0` to `1.3.0`) +3. Item Updated: `translate` (from version: `0.2.0` to `0.2.0`) +4. Item Updated: `v2_model_server` (from version: `1.2.0` to `1.2.0`) +5. Item Updated: `gen_class_data` (from version: `1.3.0` to `1.3.0`) +6. Item Updated: `auto_trainer` (from version: `1.7.0` to `1.7.0`) +7. Item Updated: `silero_vad` (from version: `1.4.0` to `1.4.0`) +8. Item Updated: `text_to_audio_generator` (from version: `1.3.0` to `1.3.0`) +9. Item Updated: `describe` (from version: `1.3.0` to `1.3.0`) +10. Item Updated: `transcribe` (from version: `1.2.0` to `1.2.0`) +11. Item Updated: `pyannote_audio` (from version: `1.3.0` to `1.3.0`) +12. Item Updated: `test_classifier` (from version: `1.1.0` to `1.1.0`) +13. Item Updated: `feature_selection` (from version: `1.6.0` to `1.6.0`) +14. Item Updated: `tf2_serving` (from version: `1.1.0` to `1.1.0`) +15. Item Updated: `azureml_serving` (from version: `1.1.0` to `1.1.0`) +16. Item Updated: `sklearn_classifier` (from version: `1.1.1` to `1.1.1`) +17. Item Updated: `azureml_utils` (from version: `1.4.0` to `1.4.0`) +18. Item Updated: `describe_dask` (from version: `1.1.0` to `1.1.0`) +19. Item Updated: `mlflow_utils` (from version: `1.1.0` to `1.1.0`) +20. Item Updated: `github_utils` (from version: `1.1.0` to `1.1.0`) +21. Item Updated: `v2_model_tester` (from version: `1.1.0` to `1.1.0`) +22. Item Updated: `open_archive` (from version: `1.2.0` to `1.2.0`) +23. Item Updated: `describe_spark` (from version: `1.1.0` to `1.1.0`) +24. Item Updated: `sklearn_classifier_dask` (from version: `1.1.1` to `1.1.1`) +25. Item Updated: `batch_inference_v2` (from version: `2.6.0` to `2.6.0`) +26. Item Updated: `arc_to_parquet` (from version: `1.5.0` to `1.5.0`) +27. 
Item Updated: `send_email` (from version: `1.2.0` to `1.2.0`) +28. Item Updated: `structured_data_generator` (from version: `1.6.0` to `1.6.0`) +29. Item Updated: `question_answering` (from version: `0.5.0` to `0.5.0`) +30. Item Updated: `hugging_face_serving` (from version: `1.1.0` to `1.1.0`) +31. Item Updated: `noise_reduction` (from version: `1.1.0` to `1.1.0`) +32. Item Updated: `pii_recognizer` (from version: `0.4.0` to `0.4.0`) +33. Item Updated: `onnx_utils` (from version: `1.3.0` to `1.3.0`) +34. Item Updated: `batch_inference` (from version: `1.8.0` to `1.8.0`) +35. Item Updated: `load_dataset` (from version: `1.2.0` to `1.2.0`) +36. Item Updated: `model_server` (from version: `1.1.0` to `1.1.0`) + ### Change log [2025-07-27 06:51:56] 1. Item Updated: `model_server_tester` (from version: `1.1.0` to `1.1.0`) 2. Item Updated: `aggregate` (from version: `1.3.0` to `1.3.0`) diff --git a/catalog.json b/catalog.json index dcaef5e3..d52626bc 100644 --- a/catalog.json +++ b/catalog.json @@ -1 +1 @@ -{"functions": {"development": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": 
{"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", 
"framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, 
"model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.1.0": {"apiVersion": 
"v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": 
"feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": 
"job", "requirements": []}, "url": "", "version": "1.1.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.4", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": 
"aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": 
"2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": 
"", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": 
"mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", 
"labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": 
"", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "1.3.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "open_archive": {"latest": {"apiVersion": 
"v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": 
{"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", 
"name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test 
v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", 
"image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": 
"yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": 
{"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", 
"example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3"}, "1.4.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", 
"labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": 
"auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", 
"model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4"}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", 
"doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": 
"", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks 
(SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": 
"infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": 
"hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.2.0": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", 
"handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", 
"version": "0.5.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given 
data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.5.0": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false}, 
"0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", 
"example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", 
"hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": 
"", "version": "2.0.0"}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.5.0": {"apiVersion": "v1", 
"categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.6.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": 
"batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true}, "0.2.0": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": 
{"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0"}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": 
"", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", 
"hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", 
"generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", 
"labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pyTorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", 
"handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", 
"torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0"}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], 
"description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", 
"hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}}}, "master": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", 
"tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": 
"2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", 
"requirements": []}, "url": "", "version": "0.9.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": 
"", "version": "1.6.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.4", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", 
"doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": 
"mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": 
"aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", 
"platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": 
"", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", 
"handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": 
"job", "requirements": []}, "url": "", "version": "0.10.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", 
"labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": 
["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": 
"0.9.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to 
github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "0.8.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": 
"v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX 
intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils 
functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": 
"2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", 
"platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", 
"doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": 
"job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", 
"model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the 
common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", 
"handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML 
frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", 
"requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", 
"requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": 
"GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": 
"transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", 
"handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", 
"flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "2.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference 
(also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": 
{"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks 
(SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": 
{"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": 
[], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": 
{"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": 
"2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": 
"text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different 
speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", 
"example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": 
{"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": 
"noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}}}}} \ No newline at end of file +{"functions": {"development": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", 
"requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", 
"generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": 
"load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": 
"model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": 
"feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", 
"kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": 
"Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.4", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": 
"aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to 
specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.7.1", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": 
"3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], 
"description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": 
{"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": 
"describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", 
"example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": 
"mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", 
"hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", 
"categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", 
"version": "0.8.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open 
and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", 
"kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": 
"", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", 
"name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", 
"platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, 
"build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends 
git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "0.9.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.3"}, "1.4.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": 
["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": 
"azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", 
"image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "1.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "1.0.1"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": 
"Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.0.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.4"}, "1.0.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": 
"auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.2"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.10.1": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": 
[]}, "url": "", "version": "1.5.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": 
"batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": 
"guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as 
prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.1.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", 
"requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", 
"requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.2.0": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": 
["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", 
"generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.5.0": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0"}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": 
"pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", 
"name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": 
{"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "2.3.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks 
(SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.3.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": 
{"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while 
performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.6.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": 
"batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true}, "0.2.0": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", 
"image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", 
"requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": 
"structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0"}, "1.3.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.1"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": 
{"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", 
"platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", 
"name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": 
"silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pyTorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}}, 
"pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": 
"pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0"}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", 
"labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", 
"platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}}}, "master": {"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": 
"tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], 
"description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0"}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", 
"model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": 
"model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features 
through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": 
"orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": 
"feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1"}, "1.5.0": {"apiVersion": "v1", "categories": 
["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.4", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling 
aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": 
"analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": 
"yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0"}, 
"0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", 
"handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": 
["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": 
"gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", 
"example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], 
"description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": 
"v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", 
"hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": 
{"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1"}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0"}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": 
{"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], 
"customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1"}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0"}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0"}}, "onnx_utils": {"latest": 
{"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0"}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": 
["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML 
platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0"}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install 
pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false}, "1.4.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, 
"allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost 
and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0"}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3"}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2"}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", 
"handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0"}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0"}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5"}, "1.5.0": {"apiVersion": "v1", "categories": 
["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0"}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0"}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": 
"Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": 
false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": 
false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0"}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch 
inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": 
"mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.1.0": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0"}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": 
"question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0"}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0"}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1"}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, 
"url": "", "version": "0.4.0"}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0"}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0"}, "0.5.0": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0"}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": 
"transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0"}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false}, "1.2.0": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": 
"", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0"}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", 
"handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", 
"flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1"}, "0.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", 
"https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference 
(also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0"}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": 
{"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0"}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0"}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0"}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks 
(SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0"}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0"}, "2.6.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": 
false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0"}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0"}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": 
{"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true}, "0.2.0": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": 
"translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.6.0": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0"}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": 
"yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, 
"icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0"}, "1.4.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", 
"kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0"}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0"}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": 
"1.2.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0"}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0"}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], 
"description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0"}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0"}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": 
"noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0"}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0"}}}}} \ No newline at end of file diff --git a/functions/master/_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css b/functions/master/_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css new file mode 100644 index 00000000..14edf629 --- /dev/null +++ b/functions/master/_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css @@ -0,0 +1,2474 @@ +/* Variables */ +:root { + /* + Following palettes are generated by using https://m2.material.io/design/color/the-color-system.html#tools-for-picking-colors + - neutral palette with #fcfcfc and danger palette with #ffdddd as base colors. + 50 means lightest, 900 means darkest; less used intermediate shades are omitted + but can be added when needed by accessing full palette from the above link. 
+ */ + --mystnb-neutral-palette-50: #fcfcfc; + --mystnb-neutral-palette-100: #f7f7f7; + --mystnb-neutral-palette-400: #cccccc; + --mystnb-neutral-palette-500: #afafaf; + --mystnb-neutral-palette-800: #505050; + --mystnb-neutral-palette-900: #2d2d2d; + + --mystnb-danger-palette-50: #ffdddd; + --mystnb-danger-palette-100: #f5acad; + --mystnb-danger-palette-400: #c42029; + --mystnb-danger-palette-500: #b40008; + --mystnb-danger-palette-800: #850010; + --mystnb-danger-palette-900: #680010; + + /* MyST-NB specific variables; colors should be logically picked from palettes */ + --mystnb-source-bg-color: var(--mystnb-neutral-palette-100); + --mystnb-stdout-bg-color: var(--mystnb-neutral-palette-50); + --mystnb-stderr-bg-color: var(--mystnb-danger-palette-50); + --mystnb-traceback-bg-color: var(--mystnb-neutral-palette-50); + --mystnb-source-border-color: var(--mystnb-neutral-palette-400); + --mystnb-source-margin-color: green; + --mystnb-stdout-border-color: var(--mystnb-neutral-palette-100); + --mystnb-stderr-border-color: var(--mystnb-neutral-palette-100); + --mystnb-traceback-border-color: var(--mystnb-danger-palette-100); + --mystnb-hide-prompt-opacity: 70%; + --mystnb-source-border-radius: .4em; + --mystnb-source-border-width: 1px; + --mystnb-scrollbar-width: 0.3rem; + --mystnb-scrollbar-height: 0.3rem; + --mystnb-scrollbar-thumb-color: var(--mystnb-neutral-palette-400); + --mystnb-scrollbar-thumb-hover-color: var(--mystnb-neutral-palette-500); + --mystnb-scrollbar-thumb-border-radius: 0.25rem; +} + +/* Override colors in dark theme */ +html[data-theme="dark"] { + --mystnb-source-bg-color: var(--mystnb-neutral-palette-800); + --mystnb-stdout-bg-color: var(--mystnb-neutral-palette-900); + --mystnb-stderr-bg-color: var(--mystnb-danger-palette-900); + --mystnb-traceback-bg-color: var(--mystnb-neutral-palette-900); + --mystnb-source-border-color: var(--mystnb-neutral-palette-500); + --mystnb-stdout-border-color: var(--mystnb-neutral-palette-800); + 
--mystnb-stderr-border-color: var(--mystnb-neutral-palette-800); + --mystnb-traceback-border-color: var(--mystnb-danger-palette-800); + --mystnb-scrollbar-thumb-color: var(--mystnb-neutral-palette-500); + --mystnb-scrollbar-thumb-hover-color: var(--mystnb-neutral-palette-400); +} + + +/* Whole cell */ +div.container.cell { + padding-left: 0; + margin-bottom: 1em; +} + +/* Removing all background formatting so we can control at the div level */ +.cell_input div.highlight, +.cell_output pre, +.cell_input pre, +.cell_output .output { + border: none; + box-shadow: none; +} + +.cell_output .output pre, +.cell_input pre { + margin: 0px; +} + +/* Input cells */ +div.cell > div.cell_input { + padding-left: 0em; + padding-right: 0em; + border: var(--mystnb-source-border-width) var(--mystnb-source-border-color) solid; + background-color: var(--mystnb-source-bg-color); + border-left-color: var(--mystnb-source-margin-color); + border-left-width: medium; + border-radius: var(--mystnb-source-border-radius); +} + +div.cell_input>div, +div.cell_output div.output>div.highlight { + margin: 0em !important; + border: none !important; +} + +/* All cell outputs */ +.cell_output { + padding-left: 1em; + padding-right: 0em; + margin-top: 1em; +} + +/* Text outputs from cells */ +.cell_output .output.text_plain, +.cell_output .output.traceback, +.cell_output .output.stream, +.cell_output .output.stderr { + margin-top: 1em; + margin-bottom: 0em; + box-shadow: none; +} + +.cell_output .output.text_plain:not(:has(.highlight)), +.cell_output .output.stream:not(:has(.highlight)) { + /* plain (or stream of) output, not containing a pygments-highlighted block */ + background: var(--mystnb-stdout-bg-color); + border: 1px solid var(--mystnb-stdout-border-color); +} + +.cell_output .output.stderr { + background: var(--mystnb-stderr-bg-color); + border: 1px solid var(--mystnb-stderr-border-color); +} + +.cell_output .output.traceback { + background: var(--mystnb-traceback-bg-color); + border: 1px 
solid var(--mystnb-traceback-border-color); +} + +/* --- Collapsible cell content --- */ + +/* +encourage summary container to blend in with its parent. +p.admonition-title should hold the title styles. +*/ +div.cell details.hide summary { + border-left: unset; + padding: inherit; + margin: inherit; + background-color: inherit; +} + +/* Neighboring input/output elements - spacing, borders */ +div.cell details.hide.above-input + details.below-input, +div.cell div.cell_input + details.below-input +{ + margin-top: 0; +} + +div.cell details.hide.above-input:has(+ details.below-input), +div.cell div.cell_input:has(+ details.below-input) +{ + margin-bottom: 0; +} + +div.cell:has(> *:nth-child(2)) div.cell_input:first-child, +div.cell:has(> *:nth-child(2)) details:first-child +{ + border-bottom-left-radius: 0; + border-bottom-right-radius: 0; +} + +div.cell:has(> *:nth-child(2)) div.cell_input:last-child, +div.cell:has(> *:nth-child(2)) details:last-child +{ + border-top-left-radius: 0; + border-top-right-radius: 0; +} + +/* intra-label styles for collapsibles */ +div.cell.container details.hide.above-input>summary, +div.cell.container details.hide.below-input>summary, +div.cell.container details.hide.above-output>summary +{ + display: block; + border-left: none; +} + +div.cell details.hide>summary>p.admonition-title { + display: list-item; + margin-bottom: 0; +} + +div.cell details.hide:not([open]) { + padding-bottom: 0; +} + +div.cell details.hide[open]>summary>p.collapsed { + display: none; +} + +div.cell details.hide:not([open])>summary>p.expanded { + display: none; +} + +@keyframes collapsed-fade-in { + 0% { + opacity: 0; + } + + 100% { + opacity: 1; + } +} +div.cell details.hide[open]>summary~* { + -moz-animation: collapsed-fade-in 0.3s ease-in-out; + -webkit-animation: collapsed-fade-in 0.3s ease-in-out; + animation: collapsed-fade-in 0.3s ease-in-out; +} + +/* Clear conflicting styles for details and admonitions set by some themes */ +div.cell details.admonition 
summary::before { + content: unset; +} + +/* Math align to the left */ +.cell_output .MathJax_Display { + text-align: left !important; +} + +/* Pandas tables. Pulled from the Jupyter / nbsphinx CSS */ +div.cell_output table { + border: none; + border-collapse: collapse; + border-spacing: 0; + color: black; + font-size: 1em; + table-layout: fixed; +} + +div.cell_output thead { + border-bottom: 1px solid black; + vertical-align: bottom; +} + +div.cell_output tr, +div.cell_output th, +div.cell_output td { + text-align: right; + vertical-align: middle; + padding: 0.5em 0.5em; + line-height: normal; + white-space: normal; + max-width: none; + border: none; +} + +div.cell_output th { + font-weight: bold; +} + +div.cell_output tbody tr:nth-child(odd) { + background: #f5f5f5; +} + +div.cell_output tbody tr:hover { + background: rgba(66, 165, 245, 0.2); +} + +/** source code line numbers **/ +span.linenos { + opacity: 0.5; +} + +/* Inline text from `paste` operation */ + +span.pasted-text { + font-weight: bold; +} + +span.pasted-inline img { + max-height: 2em; +} + +tbody span.pasted-inline img { + max-height: none; +} + + +/* Adding scroll bars if tags: output_scroll, scroll-output, and scroll-input + * On screens, we want to scroll, but on print show all + * + * It was before in https://github.com/executablebooks/sphinx-book-theme/blob/eb1b6baf098b27605e8f2b7b2979b17ebf1b9540/src/sphinx_book_theme/assets/styles/extensions/_myst-nb.scss +*/ +div.cell:is( + .tag_output_scroll, + .tag_scroll-output, + .config_scroll_outputs + ) + div.cell_output, +div.cell.tag_scroll-input div.cell_input { + max-height: 24em; + overflow-y: auto; + max-width: 100%; + overflow-x: auto; +} + +div.cell.config_scroll_outputs div.cell_output:has(img) { + /* If the output cell has image(s), allow it to take 90% of viewport height + but still bounded between 24em and 60em */ + max-height: clamp(24em, 90vh, 60em); +} + +/* Custom scrollbars */ +div.cell:is( + .tag_output_scroll, + .tag_scroll-output, 
+ .config_scroll_outputs + ) + div.cell_output::-webkit-scrollbar, +div.cell.tag_scroll-input div.cell_input::-webkit-scrollbar { + width: var(--mystnb-scrollbar-width); + height: var(--mystnb-scrollbar-height); +} + +div.cell:is( + .tag_output_scroll, + .tag_scroll-output, + .config_scroll_outputs + ) + div.cell_output::-webkit-scrollbar-thumb, +div.cell.tag_scroll-input div.cell_input::-webkit-scrollbar-thumb { + background: var(--mystnb-scrollbar-thumb-color); + border-radius: var(--mystnb-scrollbar-thumb-border-radius); +} + +div.cell:is( + .tag_output_scroll, + .tag_scroll-output, + .config_scroll_outputs + ) + div.cell_output::-webkit-scrollbar-thumb:hover, +div.cell.tag_scroll-input div.cell_input::-webkit-scrollbar-thumb:hover { + background: var(--mystnb-scrollbar-thumb-hover-color); +} + +/* In print mode, unset scroll styles */ +@media print { + div.cell:is( + .tag_output_scroll, + .tag_scroll-output, + .config_scroll_outputs + ) + div.cell_output, + div.cell.tag_scroll-input div.cell_input { + max-height: unset; + overflow-y: visible; + max-width: unset; + overflow-x: visible; + } +} + +/* Font colors for translated ANSI escape sequences +Color values are copied from Jupyter Notebook +https://github.com/jupyter/notebook/blob/52581f8eda9b319eb0390ac77fe5903c38f81e3e/notebook/static/notebook/less/ansicolors.less#L14-L21 +Background colors from +https://nbsphinx.readthedocs.io/en/latest/code-cells.html#ANSI-Colors +*/ +div.highlight .-Color-Bold { + font-weight: bold; +} + +div.highlight .-Color[class*=-Black] { + color: #3E424D +} + +div.highlight .-Color[class*=-Red] { + color: #E75C58 +} + +div.highlight .-Color[class*=-Green] { + color: #00A250 +} + +div.highlight .-Color[class*=-Yellow] { + color: #DDB62B +} + +div.highlight .-Color[class*=-Blue] { + color: #208FFB +} + +div.highlight .-Color[class*=-Magenta] { + color: #D160C4 +} + +div.highlight .-Color[class*=-Cyan] { + color: #60C6C8 +} + +div.highlight .-Color[class*=-White] { + color: #C5C1B4 +} 
+ +div.highlight .-Color[class*=-BGBlack] { + background-color: #3E424D +} + +div.highlight .-Color[class*=-BGRed] { + background-color: #E75C58 +} + +div.highlight .-Color[class*=-BGGreen] { + background-color: #00A250 +} + +div.highlight .-Color[class*=-BGYellow] { + background-color: #DDB62B +} + +div.highlight .-Color[class*=-BGBlue] { + background-color: #208FFB +} + +div.highlight .-Color[class*=-BGMagenta] { + background-color: #D160C4 +} + +div.highlight .-Color[class*=-BGCyan] { + background-color: #60C6C8 +} + +div.highlight .-Color[class*=-BGWhite] { + background-color: #C5C1B4 +} + +/* Font colors for 8-bit ANSI */ + +div.highlight .-Color[class*=-C0] { + color: #000000 +} + +div.highlight .-Color[class*=-BGC0] { + background-color: #000000 +} + +div.highlight .-Color[class*=-C1] { + color: #800000 +} + +div.highlight .-Color[class*=-BGC1] { + background-color: #800000 +} + +div.highlight .-Color[class*=-C2] { + color: #008000 +} + +div.highlight .-Color[class*=-BGC2] { + background-color: #008000 +} + +div.highlight .-Color[class*=-C3] { + color: #808000 +} + +div.highlight .-Color[class*=-BGC3] { + background-color: #808000 +} + +div.highlight .-Color[class*=-C4] { + color: #000080 +} + +div.highlight .-Color[class*=-BGC4] { + background-color: #000080 +} + +div.highlight .-Color[class*=-C5] { + color: #800080 +} + +div.highlight .-Color[class*=-BGC5] { + background-color: #800080 +} + +div.highlight .-Color[class*=-C6] { + color: #008080 +} + +div.highlight .-Color[class*=-BGC6] { + background-color: #008080 +} + +div.highlight .-Color[class*=-C7] { + color: #C0C0C0 +} + +div.highlight .-Color[class*=-BGC7] { + background-color: #C0C0C0 +} + +div.highlight .-Color[class*=-C8] { + color: #808080 +} + +div.highlight .-Color[class*=-BGC8] { + background-color: #808080 +} + +div.highlight .-Color[class*=-C9] { + color: #FF0000 +} + +div.highlight .-Color[class*=-BGC9] { + background-color: #FF0000 +} + +div.highlight .-Color[class*=-C10] { + color: 
#00FF00 +} + +div.highlight .-Color[class*=-BGC10] { + background-color: #00FF00 +} + +div.highlight .-Color[class*=-C11] { + color: #FFFF00 +} + +div.highlight .-Color[class*=-BGC11] { + background-color: #FFFF00 +} + +div.highlight .-Color[class*=-C12] { + color: #0000FF +} + +div.highlight .-Color[class*=-BGC12] { + background-color: #0000FF +} + +div.highlight .-Color[class*=-C13] { + color: #FF00FF +} + +div.highlight .-Color[class*=-BGC13] { + background-color: #FF00FF +} + +div.highlight .-Color[class*=-C14] { + color: #00FFFF +} + +div.highlight .-Color[class*=-BGC14] { + background-color: #00FFFF +} + +div.highlight .-Color[class*=-C15] { + color: #FFFFFF +} + +div.highlight .-Color[class*=-BGC15] { + background-color: #FFFFFF +} + +div.highlight .-Color[class*=-C16] { + color: #000000 +} + +div.highlight .-Color[class*=-BGC16] { + background-color: #000000 +} + +div.highlight .-Color[class*=-C17] { + color: #00005F +} + +div.highlight .-Color[class*=-BGC17] { + background-color: #00005F +} + +div.highlight .-Color[class*=-C18] { + color: #000087 +} + +div.highlight .-Color[class*=-BGC18] { + background-color: #000087 +} + +div.highlight .-Color[class*=-C19] { + color: #0000AF +} + +div.highlight .-Color[class*=-BGC19] { + background-color: #0000AF +} + +div.highlight .-Color[class*=-C20] { + color: #0000D7 +} + +div.highlight .-Color[class*=-BGC20] { + background-color: #0000D7 +} + +div.highlight .-Color[class*=-C21] { + color: #0000FF +} + +div.highlight .-Color[class*=-BGC21] { + background-color: #0000FF +} + +div.highlight .-Color[class*=-C22] { + color: #005F00 +} + +div.highlight .-Color[class*=-BGC22] { + background-color: #005F00 +} + +div.highlight .-Color[class*=-C23] { + color: #005F5F +} + +div.highlight .-Color[class*=-BGC23] { + background-color: #005F5F +} + +div.highlight .-Color[class*=-C24] { + color: #005F87 +} + +div.highlight .-Color[class*=-BGC24] { + background-color: #005F87 +} + +div.highlight .-Color[class*=-C25] { + color: 
#005FAF +} + +div.highlight .-Color[class*=-BGC25] { + background-color: #005FAF +} + +div.highlight .-Color[class*=-C26] { + color: #005FD7 +} + +div.highlight .-Color[class*=-BGC26] { + background-color: #005FD7 +} + +div.highlight .-Color[class*=-C27] { + color: #005FFF +} + +div.highlight .-Color[class*=-BGC27] { + background-color: #005FFF +} + +div.highlight .-Color[class*=-C28] { + color: #008700 +} + +div.highlight .-Color[class*=-BGC28] { + background-color: #008700 +} + +div.highlight .-Color[class*=-C29] { + color: #00875F +} + +div.highlight .-Color[class*=-BGC29] { + background-color: #00875F +} + +div.highlight .-Color[class*=-C30] { + color: #008787 +} + +div.highlight .-Color[class*=-BGC30] { + background-color: #008787 +} + +div.highlight .-Color[class*=-C31] { + color: #0087AF +} + +div.highlight .-Color[class*=-BGC31] { + background-color: #0087AF +} + +div.highlight .-Color[class*=-C32] { + color: #0087D7 +} + +div.highlight .-Color[class*=-BGC32] { + background-color: #0087D7 +} + +div.highlight .-Color[class*=-C33] { + color: #0087FF +} + +div.highlight .-Color[class*=-BGC33] { + background-color: #0087FF +} + +div.highlight .-Color[class*=-C34] { + color: #00AF00 +} + +div.highlight .-Color[class*=-BGC34] { + background-color: #00AF00 +} + +div.highlight .-Color[class*=-C35] { + color: #00AF5F +} + +div.highlight .-Color[class*=-BGC35] { + background-color: #00AF5F +} + +div.highlight .-Color[class*=-C36] { + color: #00AF87 +} + +div.highlight .-Color[class*=-BGC36] { + background-color: #00AF87 +} + +div.highlight .-Color[class*=-C37] { + color: #00AFAF +} + +div.highlight .-Color[class*=-BGC37] { + background-color: #00AFAF +} + +div.highlight .-Color[class*=-C38] { + color: #00AFD7 +} + +div.highlight .-Color[class*=-BGC38] { + background-color: #00AFD7 +} + +div.highlight .-Color[class*=-C39] { + color: #00AFFF +} + +div.highlight .-Color[class*=-BGC39] { + background-color: #00AFFF +} + +div.highlight .-Color[class*=-C40] { + color: 
#00D700 +} + +div.highlight .-Color[class*=-BGC40] { + background-color: #00D700 +} + +div.highlight .-Color[class*=-C41] { + color: #00D75F +} + +div.highlight .-Color[class*=-BGC41] { + background-color: #00D75F +} + +div.highlight .-Color[class*=-C42] { + color: #00D787 +} + +div.highlight .-Color[class*=-BGC42] { + background-color: #00D787 +} + +div.highlight .-Color[class*=-C43] { + color: #00D7AF +} + +div.highlight .-Color[class*=-BGC43] { + background-color: #00D7AF +} + +div.highlight .-Color[class*=-C44] { + color: #00D7D7 +} + +div.highlight .-Color[class*=-BGC44] { + background-color: #00D7D7 +} + +div.highlight .-Color[class*=-C45] { + color: #00D7FF +} + +div.highlight .-Color[class*=-BGC45] { + background-color: #00D7FF +} + +div.highlight .-Color[class*=-C46] { + color: #00FF00 +} + +div.highlight .-Color[class*=-BGC46] { + background-color: #00FF00 +} + +div.highlight .-Color[class*=-C47] { + color: #00FF5F +} + +div.highlight .-Color[class*=-BGC47] { + background-color: #00FF5F +} + +div.highlight .-Color[class*=-C48] { + color: #00FF87 +} + +div.highlight .-Color[class*=-BGC48] { + background-color: #00FF87 +} + +div.highlight .-Color[class*=-C49] { + color: #00FFAF +} + +div.highlight .-Color[class*=-BGC49] { + background-color: #00FFAF +} + +div.highlight .-Color[class*=-C50] { + color: #00FFD7 +} + +div.highlight .-Color[class*=-BGC50] { + background-color: #00FFD7 +} + +div.highlight .-Color[class*=-C51] { + color: #00FFFF +} + +div.highlight .-Color[class*=-BGC51] { + background-color: #00FFFF +} + +div.highlight .-Color[class*=-C52] { + color: #5F0000 +} + +div.highlight .-Color[class*=-BGC52] { + background-color: #5F0000 +} + +div.highlight .-Color[class*=-C53] { + color: #5F005F +} + +div.highlight .-Color[class*=-BGC53] { + background-color: #5F005F +} + +div.highlight .-Color[class*=-C54] { + color: #5F0087 +} + +div.highlight .-Color[class*=-BGC54] { + background-color: #5F0087 +} + +div.highlight .-Color[class*=-C55] { + color: 
#5F00AF +} + +div.highlight .-Color[class*=-BGC55] { + background-color: #5F00AF +} + +div.highlight .-Color[class*=-C56] { + color: #5F00D7 +} + +div.highlight .-Color[class*=-BGC56] { + background-color: #5F00D7 +} + +div.highlight .-Color[class*=-C57] { + color: #5F00FF +} + +div.highlight .-Color[class*=-BGC57] { + background-color: #5F00FF +} + +div.highlight .-Color[class*=-C58] { + color: #5F5F00 +} + +div.highlight .-Color[class*=-BGC58] { + background-color: #5F5F00 +} + +div.highlight .-Color[class*=-C59] { + color: #5F5F5F +} + +div.highlight .-Color[class*=-BGC59] { + background-color: #5F5F5F +} + +div.highlight .-Color[class*=-C60] { + color: #5F5F87 +} + +div.highlight .-Color[class*=-BGC60] { + background-color: #5F5F87 +} + +div.highlight .-Color[class*=-C61] { + color: #5F5FAF +} + +div.highlight .-Color[class*=-BGC61] { + background-color: #5F5FAF +} + +div.highlight .-Color[class*=-C62] { + color: #5F5FD7 +} + +div.highlight .-Color[class*=-BGC62] { + background-color: #5F5FD7 +} + +div.highlight .-Color[class*=-C63] { + color: #5F5FFF +} + +div.highlight .-Color[class*=-BGC63] { + background-color: #5F5FFF +} + +div.highlight .-Color[class*=-C64] { + color: #5F8700 +} + +div.highlight .-Color[class*=-BGC64] { + background-color: #5F8700 +} + +div.highlight .-Color[class*=-C65] { + color: #5F875F +} + +div.highlight .-Color[class*=-BGC65] { + background-color: #5F875F +} + +div.highlight .-Color[class*=-C66] { + color: #5F8787 +} + +div.highlight .-Color[class*=-BGC66] { + background-color: #5F8787 +} + +div.highlight .-Color[class*=-C67] { + color: #5F87AF +} + +div.highlight .-Color[class*=-BGC67] { + background-color: #5F87AF +} + +div.highlight .-Color[class*=-C68] { + color: #5F87D7 +} + +div.highlight .-Color[class*=-BGC68] { + background-color: #5F87D7 +} + +div.highlight .-Color[class*=-C69] { + color: #5F87FF +} + +div.highlight .-Color[class*=-BGC69] { + background-color: #5F87FF +} + +div.highlight .-Color[class*=-C70] { + color: 
#5FAF00 +} + +div.highlight .-Color[class*=-BGC70] { + background-color: #5FAF00 +} + +div.highlight .-Color[class*=-C71] { + color: #5FAF5F +} + +div.highlight .-Color[class*=-BGC71] { + background-color: #5FAF5F +} + +div.highlight .-Color[class*=-C72] { + color: #5FAF87 +} + +div.highlight .-Color[class*=-BGC72] { + background-color: #5FAF87 +} + +div.highlight .-Color[class*=-C73] { + color: #5FAFAF +} + +div.highlight .-Color[class*=-BGC73] { + background-color: #5FAFAF +} + +div.highlight .-Color[class*=-C74] { + color: #5FAFD7 +} + +div.highlight .-Color[class*=-BGC74] { + background-color: #5FAFD7 +} + +div.highlight .-Color[class*=-C75] { + color: #5FAFFF +} + +div.highlight .-Color[class*=-BGC75] { + background-color: #5FAFFF +} + +div.highlight .-Color[class*=-C76] { + color: #5FD700 +} + +div.highlight .-Color[class*=-BGC76] { + background-color: #5FD700 +} + +div.highlight .-Color[class*=-C77] { + color: #5FD75F +} + +div.highlight .-Color[class*=-BGC77] { + background-color: #5FD75F +} + +div.highlight .-Color[class*=-C78] { + color: #5FD787 +} + +div.highlight .-Color[class*=-BGC78] { + background-color: #5FD787 +} + +div.highlight .-Color[class*=-C79] { + color: #5FD7AF +} + +div.highlight .-Color[class*=-BGC79] { + background-color: #5FD7AF +} + +div.highlight .-Color[class*=-C80] { + color: #5FD7D7 +} + +div.highlight .-Color[class*=-BGC80] { + background-color: #5FD7D7 +} + +div.highlight .-Color[class*=-C81] { + color: #5FD7FF +} + +div.highlight .-Color[class*=-BGC81] { + background-color: #5FD7FF +} + +div.highlight .-Color[class*=-C82] { + color: #5FFF00 +} + +div.highlight .-Color[class*=-BGC82] { + background-color: #5FFF00 +} + +div.highlight .-Color[class*=-C83] { + color: #5FFF5F +} + +div.highlight .-Color[class*=-BGC83] { + background-color: #5FFF5F +} + +div.highlight .-Color[class*=-C84] { + color: #5FFF87 +} + +div.highlight .-Color[class*=-BGC84] { + background-color: #5FFF87 +} + +div.highlight .-Color[class*=-C85] { + color: 
#5FFFAF +} + +div.highlight .-Color[class*=-BGC85] { + background-color: #5FFFAF +} + +div.highlight .-Color[class*=-C86] { + color: #5FFFD7 +} + +div.highlight .-Color[class*=-BGC86] { + background-color: #5FFFD7 +} + +div.highlight .-Color[class*=-C87] { + color: #5FFFFF +} + +div.highlight .-Color[class*=-BGC87] { + background-color: #5FFFFF +} + +div.highlight .-Color[class*=-C88] { + color: #870000 +} + +div.highlight .-Color[class*=-BGC88] { + background-color: #870000 +} + +div.highlight .-Color[class*=-C89] { + color: #87005F +} + +div.highlight .-Color[class*=-BGC89] { + background-color: #87005F +} + +div.highlight .-Color[class*=-C90] { + color: #870087 +} + +div.highlight .-Color[class*=-BGC90] { + background-color: #870087 +} + +div.highlight .-Color[class*=-C91] { + color: #8700AF +} + +div.highlight .-Color[class*=-BGC91] { + background-color: #8700AF +} + +div.highlight .-Color[class*=-C92] { + color: #8700D7 +} + +div.highlight .-Color[class*=-BGC92] { + background-color: #8700D7 +} + +div.highlight .-Color[class*=-C93] { + color: #8700FF +} + +div.highlight .-Color[class*=-BGC93] { + background-color: #8700FF +} + +div.highlight .-Color[class*=-C94] { + color: #875F00 +} + +div.highlight .-Color[class*=-BGC94] { + background-color: #875F00 +} + +div.highlight .-Color[class*=-C95] { + color: #875F5F +} + +div.highlight .-Color[class*=-BGC95] { + background-color: #875F5F +} + +div.highlight .-Color[class*=-C96] { + color: #875F87 +} + +div.highlight .-Color[class*=-BGC96] { + background-color: #875F87 +} + +div.highlight .-Color[class*=-C97] { + color: #875FAF +} + +div.highlight .-Color[class*=-BGC97] { + background-color: #875FAF +} + +div.highlight .-Color[class*=-C98] { + color: #875FD7 +} + +div.highlight .-Color[class*=-BGC98] { + background-color: #875FD7 +} + +div.highlight .-Color[class*=-C99] { + color: #875FFF +} + +div.highlight .-Color[class*=-BGC99] { + background-color: #875FFF +} + +div.highlight .-Color[class*=-C100] { + color: 
#878700 +} + +div.highlight .-Color[class*=-BGC100] { + background-color: #878700 +} + +div.highlight .-Color[class*=-C101] { + color: #87875F +} + +div.highlight .-Color[class*=-BGC101] { + background-color: #87875F +} + +div.highlight .-Color[class*=-C102] { + color: #878787 +} + +div.highlight .-Color[class*=-BGC102] { + background-color: #878787 +} + +div.highlight .-Color[class*=-C103] { + color: #8787AF +} + +div.highlight .-Color[class*=-BGC103] { + background-color: #8787AF +} + +div.highlight .-Color[class*=-C104] { + color: #8787D7 +} + +div.highlight .-Color[class*=-BGC104] { + background-color: #8787D7 +} + +div.highlight .-Color[class*=-C105] { + color: #8787FF +} + +div.highlight .-Color[class*=-BGC105] { + background-color: #8787FF +} + +div.highlight .-Color[class*=-C106] { + color: #87AF00 +} + +div.highlight .-Color[class*=-BGC106] { + background-color: #87AF00 +} + +div.highlight .-Color[class*=-C107] { + color: #87AF5F +} + +div.highlight .-Color[class*=-BGC107] { + background-color: #87AF5F +} + +div.highlight .-Color[class*=-C108] { + color: #87AF87 +} + +div.highlight .-Color[class*=-BGC108] { + background-color: #87AF87 +} + +div.highlight .-Color[class*=-C109] { + color: #87AFAF +} + +div.highlight .-Color[class*=-BGC109] { + background-color: #87AFAF +} + +div.highlight .-Color[class*=-C110] { + color: #87AFD7 +} + +div.highlight .-Color[class*=-BGC110] { + background-color: #87AFD7 +} + +div.highlight .-Color[class*=-C111] { + color: #87AFFF +} + +div.highlight .-Color[class*=-BGC111] { + background-color: #87AFFF +} + +div.highlight .-Color[class*=-C112] { + color: #87D700 +} + +div.highlight .-Color[class*=-BGC112] { + background-color: #87D700 +} + +div.highlight .-Color[class*=-C113] { + color: #87D75F +} + +div.highlight .-Color[class*=-BGC113] { + background-color: #87D75F +} + +div.highlight .-Color[class*=-C114] { + color: #87D787 +} + +div.highlight .-Color[class*=-BGC114] { + background-color: #87D787 +} + +div.highlight 
.-Color[class*=-C115] { + color: #87D7AF +} + +div.highlight .-Color[class*=-BGC115] { + background-color: #87D7AF +} + +div.highlight .-Color[class*=-C116] { + color: #87D7D7 +} + +div.highlight .-Color[class*=-BGC116] { + background-color: #87D7D7 +} + +div.highlight .-Color[class*=-C117] { + color: #87D7FF +} + +div.highlight .-Color[class*=-BGC117] { + background-color: #87D7FF +} + +div.highlight .-Color[class*=-C118] { + color: #87FF00 +} + +div.highlight .-Color[class*=-BGC118] { + background-color: #87FF00 +} + +div.highlight .-Color[class*=-C119] { + color: #87FF5F +} + +div.highlight .-Color[class*=-BGC119] { + background-color: #87FF5F +} + +div.highlight .-Color[class*=-C120] { + color: #87FF87 +} + +div.highlight .-Color[class*=-BGC120] { + background-color: #87FF87 +} + +div.highlight .-Color[class*=-C121] { + color: #87FFAF +} + +div.highlight .-Color[class*=-BGC121] { + background-color: #87FFAF +} + +div.highlight .-Color[class*=-C122] { + color: #87FFD7 +} + +div.highlight .-Color[class*=-BGC122] { + background-color: #87FFD7 +} + +div.highlight .-Color[class*=-C123] { + color: #87FFFF +} + +div.highlight .-Color[class*=-BGC123] { + background-color: #87FFFF +} + +div.highlight .-Color[class*=-C124] { + color: #AF0000 +} + +div.highlight .-Color[class*=-BGC124] { + background-color: #AF0000 +} + +div.highlight .-Color[class*=-C125] { + color: #AF005F +} + +div.highlight .-Color[class*=-BGC125] { + background-color: #AF005F +} + +div.highlight .-Color[class*=-C126] { + color: #AF0087 +} + +div.highlight .-Color[class*=-BGC126] { + background-color: #AF0087 +} + +div.highlight .-Color[class*=-C127] { + color: #AF00AF +} + +div.highlight .-Color[class*=-BGC127] { + background-color: #AF00AF +} + +div.highlight .-Color[class*=-C128] { + color: #AF00D7 +} + +div.highlight .-Color[class*=-BGC128] { + background-color: #AF00D7 +} + +div.highlight .-Color[class*=-C129] { + color: #AF00FF +} + +div.highlight .-Color[class*=-BGC129] { + background-color: 
#AF00FF +} + +div.highlight .-Color[class*=-C130] { + color: #AF5F00 +} + +div.highlight .-Color[class*=-BGC130] { + background-color: #AF5F00 +} + +div.highlight .-Color[class*=-C131] { + color: #AF5F5F +} + +div.highlight .-Color[class*=-BGC131] { + background-color: #AF5F5F +} + +div.highlight .-Color[class*=-C132] { + color: #AF5F87 +} + +div.highlight .-Color[class*=-BGC132] { + background-color: #AF5F87 +} + +div.highlight .-Color[class*=-C133] { + color: #AF5FAF +} + +div.highlight .-Color[class*=-BGC133] { + background-color: #AF5FAF +} + +div.highlight .-Color[class*=-C134] { + color: #AF5FD7 +} + +div.highlight .-Color[class*=-BGC134] { + background-color: #AF5FD7 +} + +div.highlight .-Color[class*=-C135] { + color: #AF5FFF +} + +div.highlight .-Color[class*=-BGC135] { + background-color: #AF5FFF +} + +div.highlight .-Color[class*=-C136] { + color: #AF8700 +} + +div.highlight .-Color[class*=-BGC136] { + background-color: #AF8700 +} + +div.highlight .-Color[class*=-C137] { + color: #AF875F +} + +div.highlight .-Color[class*=-BGC137] { + background-color: #AF875F +} + +div.highlight .-Color[class*=-C138] { + color: #AF8787 +} + +div.highlight .-Color[class*=-BGC138] { + background-color: #AF8787 +} + +div.highlight .-Color[class*=-C139] { + color: #AF87AF +} + +div.highlight .-Color[class*=-BGC139] { + background-color: #AF87AF +} + +div.highlight .-Color[class*=-C140] { + color: #AF87D7 +} + +div.highlight .-Color[class*=-BGC140] { + background-color: #AF87D7 +} + +div.highlight .-Color[class*=-C141] { + color: #AF87FF +} + +div.highlight .-Color[class*=-BGC141] { + background-color: #AF87FF +} + +div.highlight .-Color[class*=-C142] { + color: #AFAF00 +} + +div.highlight .-Color[class*=-BGC142] { + background-color: #AFAF00 +} + +div.highlight .-Color[class*=-C143] { + color: #AFAF5F +} + +div.highlight .-Color[class*=-BGC143] { + background-color: #AFAF5F +} + +div.highlight .-Color[class*=-C144] { + color: #AFAF87 +} + +div.highlight 
.-Color[class*=-BGC144] { + background-color: #AFAF87 +} + +div.highlight .-Color[class*=-C145] { + color: #AFAFAF +} + +div.highlight .-Color[class*=-BGC145] { + background-color: #AFAFAF +} + +div.highlight .-Color[class*=-C146] { + color: #AFAFD7 +} + +div.highlight .-Color[class*=-BGC146] { + background-color: #AFAFD7 +} + +div.highlight .-Color[class*=-C147] { + color: #AFAFFF +} + +div.highlight .-Color[class*=-BGC147] { + background-color: #AFAFFF +} + +div.highlight .-Color[class*=-C148] { + color: #AFD700 +} + +div.highlight .-Color[class*=-BGC148] { + background-color: #AFD700 +} + +div.highlight .-Color[class*=-C149] { + color: #AFD75F +} + +div.highlight .-Color[class*=-BGC149] { + background-color: #AFD75F +} + +div.highlight .-Color[class*=-C150] { + color: #AFD787 +} + +div.highlight .-Color[class*=-BGC150] { + background-color: #AFD787 +} + +div.highlight .-Color[class*=-C151] { + color: #AFD7AF +} + +div.highlight .-Color[class*=-BGC151] { + background-color: #AFD7AF +} + +div.highlight .-Color[class*=-C152] { + color: #AFD7D7 +} + +div.highlight .-Color[class*=-BGC152] { + background-color: #AFD7D7 +} + +div.highlight .-Color[class*=-C153] { + color: #AFD7FF +} + +div.highlight .-Color[class*=-BGC153] { + background-color: #AFD7FF +} + +div.highlight .-Color[class*=-C154] { + color: #AFFF00 +} + +div.highlight .-Color[class*=-BGC154] { + background-color: #AFFF00 +} + +div.highlight .-Color[class*=-C155] { + color: #AFFF5F +} + +div.highlight .-Color[class*=-BGC155] { + background-color: #AFFF5F +} + +div.highlight .-Color[class*=-C156] { + color: #AFFF87 +} + +div.highlight .-Color[class*=-BGC156] { + background-color: #AFFF87 +} + +div.highlight .-Color[class*=-C157] { + color: #AFFFAF +} + +div.highlight .-Color[class*=-BGC157] { + background-color: #AFFFAF +} + +div.highlight .-Color[class*=-C158] { + color: #AFFFD7 +} + +div.highlight .-Color[class*=-BGC158] { + background-color: #AFFFD7 +} + +div.highlight .-Color[class*=-C159] { + color: 
#AFFFFF +} + +div.highlight .-Color[class*=-BGC159] { + background-color: #AFFFFF +} + +div.highlight .-Color[class*=-C160] { + color: #D70000 +} + +div.highlight .-Color[class*=-BGC160] { + background-color: #D70000 +} + +div.highlight .-Color[class*=-C161] { + color: #D7005F +} + +div.highlight .-Color[class*=-BGC161] { + background-color: #D7005F +} + +div.highlight .-Color[class*=-C162] { + color: #D70087 +} + +div.highlight .-Color[class*=-BGC162] { + background-color: #D70087 +} + +div.highlight .-Color[class*=-C163] { + color: #D700AF +} + +div.highlight .-Color[class*=-BGC163] { + background-color: #D700AF +} + +div.highlight .-Color[class*=-C164] { + color: #D700D7 +} + +div.highlight .-Color[class*=-BGC164] { + background-color: #D700D7 +} + +div.highlight .-Color[class*=-C165] { + color: #D700FF +} + +div.highlight .-Color[class*=-BGC165] { + background-color: #D700FF +} + +div.highlight .-Color[class*=-C166] { + color: #D75F00 +} + +div.highlight .-Color[class*=-BGC166] { + background-color: #D75F00 +} + +div.highlight .-Color[class*=-C167] { + color: #D75F5F +} + +div.highlight .-Color[class*=-BGC167] { + background-color: #D75F5F +} + +div.highlight .-Color[class*=-C168] { + color: #D75F87 +} + +div.highlight .-Color[class*=-BGC168] { + background-color: #D75F87 +} + +div.highlight .-Color[class*=-C169] { + color: #D75FAF +} + +div.highlight .-Color[class*=-BGC169] { + background-color: #D75FAF +} + +div.highlight .-Color[class*=-C170] { + color: #D75FD7 +} + +div.highlight .-Color[class*=-BGC170] { + background-color: #D75FD7 +} + +div.highlight .-Color[class*=-C171] { + color: #D75FFF +} + +div.highlight .-Color[class*=-BGC171] { + background-color: #D75FFF +} + +div.highlight .-Color[class*=-C172] { + color: #D78700 +} + +div.highlight .-Color[class*=-BGC172] { + background-color: #D78700 +} + +div.highlight .-Color[class*=-C173] { + color: #D7875F +} + +div.highlight .-Color[class*=-BGC173] { + background-color: #D7875F +} + +div.highlight 
.-Color[class*=-C174] { + color: #D78787 +} + +div.highlight .-Color[class*=-BGC174] { + background-color: #D78787 +} + +div.highlight .-Color[class*=-C175] { + color: #D787AF +} + +div.highlight .-Color[class*=-BGC175] { + background-color: #D787AF +} + +div.highlight .-Color[class*=-C176] { + color: #D787D7 +} + +div.highlight .-Color[class*=-BGC176] { + background-color: #D787D7 +} + +div.highlight .-Color[class*=-C177] { + color: #D787FF +} + +div.highlight .-Color[class*=-BGC177] { + background-color: #D787FF +} + +div.highlight .-Color[class*=-C178] { + color: #D7AF00 +} + +div.highlight .-Color[class*=-BGC178] { + background-color: #D7AF00 +} + +div.highlight .-Color[class*=-C179] { + color: #D7AF5F +} + +div.highlight .-Color[class*=-BGC179] { + background-color: #D7AF5F +} + +div.highlight .-Color[class*=-C180] { + color: #D7AF87 +} + +div.highlight .-Color[class*=-BGC180] { + background-color: #D7AF87 +} + +div.highlight .-Color[class*=-C181] { + color: #D7AFAF +} + +div.highlight .-Color[class*=-BGC181] { + background-color: #D7AFAF +} + +div.highlight .-Color[class*=-C182] { + color: #D7AFD7 +} + +div.highlight .-Color[class*=-BGC182] { + background-color: #D7AFD7 +} + +div.highlight .-Color[class*=-C183] { + color: #D7AFFF +} + +div.highlight .-Color[class*=-BGC183] { + background-color: #D7AFFF +} + +div.highlight .-Color[class*=-C184] { + color: #D7D700 +} + +div.highlight .-Color[class*=-BGC184] { + background-color: #D7D700 +} + +div.highlight .-Color[class*=-C185] { + color: #D7D75F +} + +div.highlight .-Color[class*=-BGC185] { + background-color: #D7D75F +} + +div.highlight .-Color[class*=-C186] { + color: #D7D787 +} + +div.highlight .-Color[class*=-BGC186] { + background-color: #D7D787 +} + +div.highlight .-Color[class*=-C187] { + color: #D7D7AF +} + +div.highlight .-Color[class*=-BGC187] { + background-color: #D7D7AF +} + +div.highlight .-Color[class*=-C188] { + color: #D7D7D7 +} + +div.highlight .-Color[class*=-BGC188] { + background-color: 
#D7D7D7 +} + +div.highlight .-Color[class*=-C189] { + color: #D7D7FF +} + +div.highlight .-Color[class*=-BGC189] { + background-color: #D7D7FF +} + +div.highlight .-Color[class*=-C190] { + color: #D7FF00 +} + +div.highlight .-Color[class*=-BGC190] { + background-color: #D7FF00 +} + +div.highlight .-Color[class*=-C191] { + color: #D7FF5F +} + +div.highlight .-Color[class*=-BGC191] { + background-color: #D7FF5F +} + +div.highlight .-Color[class*=-C192] { + color: #D7FF87 +} + +div.highlight .-Color[class*=-BGC192] { + background-color: #D7FF87 +} + +div.highlight .-Color[class*=-C193] { + color: #D7FFAF +} + +div.highlight .-Color[class*=-BGC193] { + background-color: #D7FFAF +} + +div.highlight .-Color[class*=-C194] { + color: #D7FFD7 +} + +div.highlight .-Color[class*=-BGC194] { + background-color: #D7FFD7 +} + +div.highlight .-Color[class*=-C195] { + color: #D7FFFF +} + +div.highlight .-Color[class*=-BGC195] { + background-color: #D7FFFF +} + +div.highlight .-Color[class*=-C196] { + color: #FF0000 +} + +div.highlight .-Color[class*=-BGC196] { + background-color: #FF0000 +} + +div.highlight .-Color[class*=-C197] { + color: #FF005F +} + +div.highlight .-Color[class*=-BGC197] { + background-color: #FF005F +} + +div.highlight .-Color[class*=-C198] { + color: #FF0087 +} + +div.highlight .-Color[class*=-BGC198] { + background-color: #FF0087 +} + +div.highlight .-Color[class*=-C199] { + color: #FF00AF +} + +div.highlight .-Color[class*=-BGC199] { + background-color: #FF00AF +} + +div.highlight .-Color[class*=-C200] { + color: #FF00D7 +} + +div.highlight .-Color[class*=-BGC200] { + background-color: #FF00D7 +} + +div.highlight .-Color[class*=-C201] { + color: #FF00FF +} + +div.highlight .-Color[class*=-BGC201] { + background-color: #FF00FF +} + +div.highlight .-Color[class*=-C202] { + color: #FF5F00 +} + +div.highlight .-Color[class*=-BGC202] { + background-color: #FF5F00 +} + +div.highlight .-Color[class*=-C203] { + color: #FF5F5F +} + +div.highlight 
.-Color[class*=-BGC203] { + background-color: #FF5F5F +} + +div.highlight .-Color[class*=-C204] { + color: #FF5F87 +} + +div.highlight .-Color[class*=-BGC204] { + background-color: #FF5F87 +} + +div.highlight .-Color[class*=-C205] { + color: #FF5FAF +} + +div.highlight .-Color[class*=-BGC205] { + background-color: #FF5FAF +} + +div.highlight .-Color[class*=-C206] { + color: #FF5FD7 +} + +div.highlight .-Color[class*=-BGC206] { + background-color: #FF5FD7 +} + +div.highlight .-Color[class*=-C207] { + color: #FF5FFF +} + +div.highlight .-Color[class*=-BGC207] { + background-color: #FF5FFF +} + +div.highlight .-Color[class*=-C208] { + color: #FF8700 +} + +div.highlight .-Color[class*=-BGC208] { + background-color: #FF8700 +} + +div.highlight .-Color[class*=-C209] { + color: #FF875F +} + +div.highlight .-Color[class*=-BGC209] { + background-color: #FF875F +} + +div.highlight .-Color[class*=-C210] { + color: #FF8787 +} + +div.highlight .-Color[class*=-BGC210] { + background-color: #FF8787 +} + +div.highlight .-Color[class*=-C211] { + color: #FF87AF +} + +div.highlight .-Color[class*=-BGC211] { + background-color: #FF87AF +} + +div.highlight .-Color[class*=-C212] { + color: #FF87D7 +} + +div.highlight .-Color[class*=-BGC212] { + background-color: #FF87D7 +} + +div.highlight .-Color[class*=-C213] { + color: #FF87FF +} + +div.highlight .-Color[class*=-BGC213] { + background-color: #FF87FF +} + +div.highlight .-Color[class*=-C214] { + color: #FFAF00 +} + +div.highlight .-Color[class*=-BGC214] { + background-color: #FFAF00 +} + +div.highlight .-Color[class*=-C215] { + color: #FFAF5F +} + +div.highlight .-Color[class*=-BGC215] { + background-color: #FFAF5F +} + +div.highlight .-Color[class*=-C216] { + color: #FFAF87 +} + +div.highlight .-Color[class*=-BGC216] { + background-color: #FFAF87 +} + +div.highlight .-Color[class*=-C217] { + color: #FFAFAF +} + +div.highlight .-Color[class*=-BGC217] { + background-color: #FFAFAF +} + +div.highlight .-Color[class*=-C218] { + color: 
#FFAFD7 +} + +div.highlight .-Color[class*=-BGC218] { + background-color: #FFAFD7 +} + +div.highlight .-Color[class*=-C219] { + color: #FFAFFF +} + +div.highlight .-Color[class*=-BGC219] { + background-color: #FFAFFF +} + +div.highlight .-Color[class*=-C220] { + color: #FFD700 +} + +div.highlight .-Color[class*=-BGC220] { + background-color: #FFD700 +} + +div.highlight .-Color[class*=-C221] { + color: #FFD75F +} + +div.highlight .-Color[class*=-BGC221] { + background-color: #FFD75F +} + +div.highlight .-Color[class*=-C222] { + color: #FFD787 +} + +div.highlight .-Color[class*=-BGC222] { + background-color: #FFD787 +} + +div.highlight .-Color[class*=-C223] { + color: #FFD7AF +} + +div.highlight .-Color[class*=-BGC223] { + background-color: #FFD7AF +} + +div.highlight .-Color[class*=-C224] { + color: #FFD7D7 +} + +div.highlight .-Color[class*=-BGC224] { + background-color: #FFD7D7 +} + +div.highlight .-Color[class*=-C225] { + color: #FFD7FF +} + +div.highlight .-Color[class*=-BGC225] { + background-color: #FFD7FF +} + +div.highlight .-Color[class*=-C226] { + color: #FFFF00 +} + +div.highlight .-Color[class*=-BGC226] { + background-color: #FFFF00 +} + +div.highlight .-Color[class*=-C227] { + color: #FFFF5F +} + +div.highlight .-Color[class*=-BGC227] { + background-color: #FFFF5F +} + +div.highlight .-Color[class*=-C228] { + color: #FFFF87 +} + +div.highlight .-Color[class*=-BGC228] { + background-color: #FFFF87 +} + +div.highlight .-Color[class*=-C229] { + color: #FFFFAF +} + +div.highlight .-Color[class*=-BGC229] { + background-color: #FFFFAF +} + +div.highlight .-Color[class*=-C230] { + color: #FFFFD7 +} + +div.highlight .-Color[class*=-BGC230] { + background-color: #FFFFD7 +} + +div.highlight .-Color[class*=-C231] { + color: #FFFFFF +} + +div.highlight .-Color[class*=-BGC231] { + background-color: #FFFFFF +} + +div.highlight .-Color[class*=-C232] { + color: #080808 +} + +div.highlight .-Color[class*=-BGC232] { + background-color: #080808 +} + +div.highlight 
.-Color[class*=-C233] { + color: #121212 +} + +div.highlight .-Color[class*=-BGC233] { + background-color: #121212 +} + +div.highlight .-Color[class*=-C234] { + color: #1C1C1C +} + +div.highlight .-Color[class*=-BGC234] { + background-color: #1C1C1C +} + +div.highlight .-Color[class*=-C235] { + color: #262626 +} + +div.highlight .-Color[class*=-BGC235] { + background-color: #262626 +} + +div.highlight .-Color[class*=-C236] { + color: #303030 +} + +div.highlight .-Color[class*=-BGC236] { + background-color: #303030 +} + +div.highlight .-Color[class*=-C237] { + color: #3A3A3A +} + +div.highlight .-Color[class*=-BGC237] { + background-color: #3A3A3A +} + +div.highlight .-Color[class*=-C238] { + color: #444444 +} + +div.highlight .-Color[class*=-BGC238] { + background-color: #444444 +} + +div.highlight .-Color[class*=-C239] { + color: #4E4E4E +} + +div.highlight .-Color[class*=-BGC239] { + background-color: #4E4E4E +} + +div.highlight .-Color[class*=-C240] { + color: #585858 +} + +div.highlight .-Color[class*=-BGC240] { + background-color: #585858 +} + +div.highlight .-Color[class*=-C241] { + color: #626262 +} + +div.highlight .-Color[class*=-BGC241] { + background-color: #626262 +} + +div.highlight .-Color[class*=-C242] { + color: #6C6C6C +} + +div.highlight .-Color[class*=-BGC242] { + background-color: #6C6C6C +} + +div.highlight .-Color[class*=-C243] { + color: #767676 +} + +div.highlight .-Color[class*=-BGC243] { + background-color: #767676 +} + +div.highlight .-Color[class*=-C244] { + color: #808080 +} + +div.highlight .-Color[class*=-BGC244] { + background-color: #808080 +} + +div.highlight .-Color[class*=-C245] { + color: #8A8A8A +} + +div.highlight .-Color[class*=-BGC245] { + background-color: #8A8A8A +} + +div.highlight .-Color[class*=-C246] { + color: #949494 +} + +div.highlight .-Color[class*=-BGC246] { + background-color: #949494 +} + +div.highlight .-Color[class*=-C247] { + color: #9E9E9E +} + +div.highlight .-Color[class*=-BGC247] { + background-color: 
#9E9E9E +} + +div.highlight .-Color[class*=-C248] { + color: #A8A8A8 +} + +div.highlight .-Color[class*=-BGC248] { + background-color: #A8A8A8 +} + +div.highlight .-Color[class*=-C249] { + color: #B2B2B2 +} + +div.highlight .-Color[class*=-BGC249] { + background-color: #B2B2B2 +} + +div.highlight .-Color[class*=-C250] { + color: #BCBCBC +} + +div.highlight .-Color[class*=-BGC250] { + background-color: #BCBCBC +} + +div.highlight .-Color[class*=-C251] { + color: #C6C6C6 +} + +div.highlight .-Color[class*=-BGC251] { + background-color: #C6C6C6 +} + +div.highlight .-Color[class*=-C252] { + color: #D0D0D0 +} + +div.highlight .-Color[class*=-BGC252] { + background-color: #D0D0D0 +} + +div.highlight .-Color[class*=-C253] { + color: #DADADA +} + +div.highlight .-Color[class*=-BGC253] { + background-color: #DADADA +} + +div.highlight .-Color[class*=-C254] { + color: #E4E4E4 +} + +div.highlight .-Color[class*=-BGC254] { + background-color: #E4E4E4 +} + +div.highlight .-Color[class*=-C255] { + color: #EEEEEE +} + +div.highlight .-Color[class*=-BGC255] { + background-color: #EEEEEE +} diff --git a/functions/master/aggregate/1.3.0/static/aggregate.html b/functions/master/aggregate/1.3.0/static/aggregate.html index dd189991..94dd4dee 100644 --- a/functions/master/aggregate/1.3.0/static/aggregate.html +++ b/functions/master/aggregate/1.3.0/static/aggregate.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/aggregate/1.3.0/static/documentation.html b/functions/master/aggregate/1.3.0/static/documentation.html index 6f0be75b..5aac022e 100644 --- a/functions/master/aggregate/1.3.0/static/documentation.html +++ b/functions/master/aggregate/1.3.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/aggregate/1.3.0/static/example.html b/functions/master/aggregate/1.3.0/static/example.html index 0f230b87..d1ba3b94 100644 --- a/functions/master/aggregate/1.3.0/static/example.html +++ b/functions/master/aggregate/1.3.0/static/example.html @@ -20,7 +20,7 @@ - 
+ diff --git a/functions/master/aggregate/latest/static/aggregate.html b/functions/master/aggregate/latest/static/aggregate.html index dd189991..94dd4dee 100644 --- a/functions/master/aggregate/latest/static/aggregate.html +++ b/functions/master/aggregate/latest/static/aggregate.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/aggregate/latest/static/documentation.html b/functions/master/aggregate/latest/static/documentation.html index 6f0be75b..5aac022e 100644 --- a/functions/master/aggregate/latest/static/documentation.html +++ b/functions/master/aggregate/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/aggregate/latest/static/example.html b/functions/master/aggregate/latest/static/example.html index 0f230b87..d1ba3b94 100644 --- a/functions/master/aggregate/latest/static/example.html +++ b/functions/master/aggregate/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/arc_to_parquet/1.5.0/src/README.md b/functions/master/arc_to_parquet/1.5.0/src/README.md new file mode 100644 index 00000000..568ea4e4 --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/src/README.md @@ -0,0 +1,28 @@ +## arc_to_parquet + +Retrieve a remote archive and save locally as a parquet file, [source](arc_to_parquet.py) + +Usage example: + +```python +import mlrun, os +mlrun.mlconf.dbpath = 'http://mlrun-api:8080' +mlrun.mlconf.hub_url = '/User/functions/{name}/function.yaml' + +# load arc_to_parquet function from Github +func = mlrun.import_function("hub://arc_to_parquet").apply(mlrun.mount_v3io()) + +# create and run the task +archive = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" + +arc_to_parq_task = mlrun.NewTask(name='tasks - acquire remote', + params={'archive_url': archive, + 'key' : 'HIGGS'}) +# run +run = func.run(arc_to_parq_task, artifact_path='/User/artifacts') +``` + +Output: + +``` +``` \ No newline at end of file diff --git 
a/functions/master/arc_to_parquet/1.5.0/src/arc_to_parquet.ipynb b/functions/master/arc_to_parquet/1.5.0/src/arc_to_parquet.ipynb new file mode 100644 index 00000000..59d2cefb --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/src/arc_to_parquet.ipynb @@ -0,0 +1,834 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Archive to parquet function Example\n", + "> the arc_to_parquet function is typically for large files, the function accept an input of archive and stores the data into a file system.\n", + "in the example we will use arc_to_parquet function to unarchive the higgs-sample data-file stored on s3,\n", + "and will store it on the local file system in parquet format , " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# upload environment variables from env file if exists\n", + "import os,mlrun\n", + " \n", + "# Specify path\n", + "path = \"/tmp/examples_ci.env\"\n", + " \n", + "if os.path.exists(path):\n", + " env_dict = mlrun.set_env_from_file(path, return_dict=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-12-25 11:14:04,646 [info] loaded project arch-to-parquet-example from MLRun DB\n" + ] + } + ], + "source": [ + "# create the new project\n", + "project_name = 'arch-to-parquet-example'\n", + "\n", + "# Initialize the MLRun project object\n", + "project = mlrun.get_or_create_project(project_name, context=\"./\", user_project=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# import packages\n", + "import mlrun\n", + "from mlrun import import_function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# declare the dataset\n", + "DATA_URL = 
\"https://s3.wasabisys.com/iguazio/data/market-palce/arc_to_parquet/higgs-sample.csv.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# import the function\n", + "arc_to_parquet_function = import_function(\"hub://arc_to_parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-12-25 11:14:05,030 [warning] it is recommended to use k8s secret (specify secret_name), specifying the aws_access_key/aws_secret_key directly is unsafe\n", + "> 2022-12-25 11:14:05,046 [info] starting run arc-to-parquet-arc_to_parquet uid=cb1962a5333f4f9f9c16faabfd1e94c1 DB=http://mlrun-api:8080\n", + "> 2022-12-25 11:14:05,203 [info] Job is running in the background, pod: arc-to-parquet-arc-to-parquet-8kz4b\n", + "> 2022-12-25 11:14:44,126 [info] downloading https://s3.wasabisys.com/iguazio/data/market-palce/arc_to_parquet/higgs-sample.csv.gz to local temp file\n", + "> 2022-12-25 11:14:44,793 [info] destination file does not exist, downloading\n", + "> 2022-12-25 11:14:45,143 [info] To track results use the CLI: {'info_cmd': 'mlrun get run cb1962a5333f4f9f9c16faabfd1e94c1 -p arch-to-parquet-example-jovyan', 'logs_cmd': 'mlrun logs cb1962a5333f4f9f9c16faabfd1e94c1 -p arch-to-parquet-example-jovyan'}\n", + "> 2022-12-25 11:14:45,144 [info] run executed, status=completed\n", + "final state: completed\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
arch-to-parquet-example-jovyan0Dec 25 11:14:44completedarc-to-parquet-arc_to_parquet
kind=job
owner=jovyan
mlrun/client_version=1.2.1-rc7
host=arc-to-parquet-arc-to-parquet-8kz4b
archive_url
key=higgs-sample
higgs-sample
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-12-25 11:14:47,549 [info] run executed, status=completed\n" + ] + } + ], + "source": [ + "# run the function\n", + "arc_to_parquet_run = arc_to_parquet_function.run(params={\"key\": \"higgs-sample\"},\n", + " handler=\"arc_to_parquet\",\n", + " inputs={\"archive_url\": DATA_URL}\n", + " )\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Show the results" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 01.000000000000000000e+008.692932128906250000e-01-6.350818276405334473e-012.256902605295181274e-013.274700641632080078e-01-6.899932026863098145e-017.542022466659545898e-01-2.485731393098831177e-01-1.092063903808593750e+00...-1.045456994324922562e-02-4.576716944575309753e-023.101961374282836914e+001.353760004043579102e+009.795631170272827148e-019.780761599540710449e-019.200048446655273438e-017.216574549674987793e-019.887509346008300781e-018.766783475875854492e-01
001.00.9075420.3291470.3594121.497970-0.3130101.095531-0.557525-1.588230...-1.138930-0.0008190.0000000.3022200.8330480.9857000.9780980.7797320.9923560.798343
111.00.7988351.470639-1.6359750.4537730.4256291.1048751.2823221.381664...1.1288480.9004610.0000000.9097531.1083300.9856920.9513310.8032520.8659240.780118
220.01.344385-0.8766260.9359131.9920500.8824541.786066-1.646778-0.942383...-0.678379-1.3603560.0000000.9466521.0287040.9986560.7282810.8692001.0267360.957904
331.01.1050090.3213561.5224010.882808-1.2053490.681466-1.070464-0.921871...-0.3735660.1130410.0000000.7558561.3610570.9866100.8380851.1332950.8722450.808487
440.01.595839-0.6078110.0070751.818450-0.1119060.847550-0.5664371.581239...-0.654227-1.2743453.1019610.8237610.9381910.9717580.7891760.4305530.9613570.957818
..................................................................
95951.00.7087940.8502210.6723540.948589-1.1377551.2409110.4168611.581794...1.461144-0.7588320.0000000.9716620.8563501.1340240.9499691.5948261.0486550.922793
96960.01.1350220.285319-1.1094111.088544-0.8962611.1031340.1267240.964220...-1.183070-0.9563801.5509810.8831620.9257140.9865751.0577850.5996320.8871970.970676
97971.01.1240420.3544700.0398121.1324991.6203060.9559211.3754040.415942...-0.1753541.5619160.0000000.8515531.2510611.5463950.7434750.1385500.7176250.746045
98981.00.341495-1.223359-1.3729710.9936660.6919381.0861870.318829-1.185753...1.3054060.4260110.0000001.4295100.9751000.9880901.2573371.3532081.0404130.962988
99990.01.217926-0.307828-1.6015731.532369-1.0068240.555781-0.0594390.819528...-1.4878830.8111200.0000000.6272980.8121120.9893710.7044440.5734870.7088750.764996
\n", + "

100 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 1.000000000000000000e+00 8.692932128906250000e-01 \\\n", + "0 0 1.0 0.907542 \n", + "1 1 1.0 0.798835 \n", + "2 2 0.0 1.344385 \n", + "3 3 1.0 1.105009 \n", + "4 4 0.0 1.595839 \n", + ".. ... ... ... \n", + "95 95 1.0 0.708794 \n", + "96 96 0.0 1.135022 \n", + "97 97 1.0 1.124042 \n", + "98 98 1.0 0.341495 \n", + "99 99 0.0 1.217926 \n", + "\n", + " -6.350818276405334473e-01 2.256902605295181274e-01 \\\n", + "0 0.329147 0.359412 \n", + "1 1.470639 -1.635975 \n", + "2 -0.876626 0.935913 \n", + "3 0.321356 1.522401 \n", + "4 -0.607811 0.007075 \n", + ".. ... ... \n", + "95 0.850221 0.672354 \n", + "96 0.285319 -1.109411 \n", + "97 0.354470 0.039812 \n", + "98 -1.223359 -1.372971 \n", + "99 -0.307828 -1.601573 \n", + "\n", + " 3.274700641632080078e-01 -6.899932026863098145e-01 \\\n", + "0 1.497970 -0.313010 \n", + "1 0.453773 0.425629 \n", + "2 1.992050 0.882454 \n", + "3 0.882808 -1.205349 \n", + "4 1.818450 -0.111906 \n", + ".. ... ... \n", + "95 0.948589 -1.137755 \n", + "96 1.088544 -0.896261 \n", + "97 1.132499 1.620306 \n", + "98 0.993666 0.691938 \n", + "99 1.532369 -1.006824 \n", + "\n", + " 7.542022466659545898e-01 -2.485731393098831177e-01 \\\n", + "0 1.095531 -0.557525 \n", + "1 1.104875 1.282322 \n", + "2 1.786066 -1.646778 \n", + "3 0.681466 -1.070464 \n", + "4 0.847550 -0.566437 \n", + ".. ... ... \n", + "95 1.240911 0.416861 \n", + "96 1.103134 0.126724 \n", + "97 0.955921 1.375404 \n", + "98 1.086187 0.318829 \n", + "99 0.555781 -0.059439 \n", + "\n", + " -1.092063903808593750e+00 ... -1.045456994324922562e-02 \\\n", + "0 -1.588230 ... -1.138930 \n", + "1 1.381664 ... 1.128848 \n", + "2 -0.942383 ... -0.678379 \n", + "3 -0.921871 ... -0.373566 \n", + "4 1.581239 ... -0.654227 \n", + ".. ... ... ... \n", + "95 1.581794 ... 1.461144 \n", + "96 0.964220 ... -1.183070 \n", + "97 0.415942 ... -0.175354 \n", + "98 -1.185753 ... 1.305406 \n", + "99 0.819528 ... 
-1.487883 \n", + "\n", + " -4.576716944575309753e-02 3.101961374282836914e+00 \\\n", + "0 -0.000819 0.000000 \n", + "1 0.900461 0.000000 \n", + "2 -1.360356 0.000000 \n", + "3 0.113041 0.000000 \n", + "4 -1.274345 3.101961 \n", + ".. ... ... \n", + "95 -0.758832 0.000000 \n", + "96 -0.956380 1.550981 \n", + "97 1.561916 0.000000 \n", + "98 0.426011 0.000000 \n", + "99 0.811120 0.000000 \n", + "\n", + " 1.353760004043579102e+00 9.795631170272827148e-01 \\\n", + "0 0.302220 0.833048 \n", + "1 0.909753 1.108330 \n", + "2 0.946652 1.028704 \n", + "3 0.755856 1.361057 \n", + "4 0.823761 0.938191 \n", + ".. ... ... \n", + "95 0.971662 0.856350 \n", + "96 0.883162 0.925714 \n", + "97 0.851553 1.251061 \n", + "98 1.429510 0.975100 \n", + "99 0.627298 0.812112 \n", + "\n", + " 9.780761599540710449e-01 9.200048446655273438e-01 \\\n", + "0 0.985700 0.978098 \n", + "1 0.985692 0.951331 \n", + "2 0.998656 0.728281 \n", + "3 0.986610 0.838085 \n", + "4 0.971758 0.789176 \n", + ".. ... ... \n", + "95 1.134024 0.949969 \n", + "96 0.986575 1.057785 \n", + "97 1.546395 0.743475 \n", + "98 0.988090 1.257337 \n", + "99 0.989371 0.704444 \n", + "\n", + " 7.216574549674987793e-01 9.887509346008300781e-01 \\\n", + "0 0.779732 0.992356 \n", + "1 0.803252 0.865924 \n", + "2 0.869200 1.026736 \n", + "3 1.133295 0.872245 \n", + "4 0.430553 0.961357 \n", + ".. ... ... \n", + "95 1.594826 1.048655 \n", + "96 0.599632 0.887197 \n", + "97 0.138550 0.717625 \n", + "98 1.353208 1.040413 \n", + "99 0.573487 0.708875 \n", + "\n", + " 8.766783475875854492e-01 \n", + "0 0.798343 \n", + "1 0.780118 \n", + "2 0.957904 \n", + "3 0.808487 \n", + "4 0.957818 \n", + ".. ... 
\n", + "95 0.922793 \n", + "96 0.970676 \n", + "97 0.746045 \n", + "98 0.962988 \n", + "99 0.764996 \n", + "\n", + "[100 rows x 30 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "arc_to_parquet_run.artifact('higgs-sample').show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/functions/master/arc_to_parquet/1.5.0/src/arc_to_parquet.py b/functions/master/arc_to_parquet/1.5.0/src/arc_to_parquet.py new file mode 100644 index 00000000..d9275b7c --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/src/arc_to_parquet.py @@ -0,0 +1,134 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import pandas as pd +import pyarrow.parquet as pq +import pyarrow as pa +import numpy as np + + +from mlrun.execution import MLClientCtx +from mlrun.datastore import DataItem + +from typing import List +import os + + + +def _chunk_readwrite( + archive_url, + dest_path, + chunksize, + header, + encoding, + dtype, + dataset +): + """stream read and write archives + + pandas reads and parquet writes + + notes + ----- + * dest_path can be either a file.parquet, or in hte case of partitioned parquet + it will be only the destination folder of the parquet partition files + """ + pqwriter = None + header = [] + for i, df in enumerate(pd.read_csv(archive_url, chunksize=chunksize, + names=header, encoding=encoding, + dtype=dtype)): + table = pa.Table.from_pandas(df) + if i == 0: + if dataset: + header = np.copy(table.schema) + else: + pqwriter = pq.ParquetWriter(dest_path, table.schema) + if dataset: + pq.write_to_dataset(table, root_path=dest_path, partition_cols=partition_cols) + else: + pqwriter.write_table(table) + if pqwriter: + pqwriter.close() + + return header + + +def arc_to_parquet( + context: MLClientCtx, + archive_url: DataItem, + header: List[str] = [None], + chunksize: int = 0, + dtype=None, + encoding: str = "latin-1", + key: str = "data", + dataset: str = "None", + part_cols=[], + file_ext: str = "parquet", + index: bool = False, + refresh_data: bool = False, + stats: bool = False +) -> None: + """Open a file/object archive and save as a parquet file or dataset + + Notes + ----- + * this function is typically for large files, please be sure to check all settings + * partitioning requires precise specification of column types. 
+ * the archive_url can be any file readable by pandas read_csv, which includes tar files + * if the `dataset` parameter is not empty, then a partitioned dataset will be created + instead of a single file in the folder `dataset` + * if a key exists already then it will not be re-acquired unless the `refresh_data` param + is set to `True`. This is in case the original file is corrupt, or a refresh is + required. + + :param context: the function context + :param archive_url: MLRun data input (DataItem object) + :param chunksize: (0) when > 0, row size (chunk) to retrieve + per iteration + :param dtype destination data type of specified columns + :param encoding ("latin-8") file encoding + :param key: key in artifact store (when log_data=True) + :param dataset: (None) if not None then "target_path/dataset" + is folder for partitioned files + :param part_cols: ([]) list of partitioning columns + :param file_ext: (parquet) csv/parquet file extension + :param index: (False) pandas save index option + :param refresh_data: (False) overwrite existing data at that location + :param stats: (None) calculate table stats when logging artifact + """ + base_path = context.artifact_path + os.makedirs(base_path, exist_ok=True) + + archive_url = archive_url.local() + + if dataset is not None: + dest_path = os.path.join(base_path, dataset) + exists = os.path.isdir(dest_path) + else: + dest_path = os.path.join(base_path, key + f".{file_ext}") + exists = os.path.isfile(dest_path) + + if not exists: + context.logger.info("destination file does not exist, downloading") + if chunksize > 0: + header = _chunk_readwrite(archive_url, dest_path, chunksize, + encoding, dtype, dataset) + context.log_dataset(key=key, stats=stats, format='parquet', + target_path=dest_path) + else: + df = pd.read_csv(archive_url) + context.log_dataset(key, df=df, format=file_ext, index=index) + else: + context.logger.info("destination file already exists, nothing done") \ No newline at end of file diff --git 
a/functions/master/arc_to_parquet/1.5.0/src/function.yaml b/functions/master/arc_to_parquet/1.5.0/src/function.yaml new file mode 100644 index 00000000..d10e841c --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/src/function.yaml @@ -0,0 +1,100 @@ +verbose: false +metadata: + tag: '' + name: arc-to-parquet + categories: + - utils +kind: job +spec: + command: '' + default_handler: arc_to_parquet + entry_points: + arc_to_parquet: + has_varargs: false + parameters: + - name: context + type: MLClientCtx + doc: the function context + - name: archive_url + type: DataItem + doc: MLRun data input (DataItem object) + - name: header + type: List[str] + default: + - null + - name: chunksize + type: int + doc: (0) when > 0, row size (chunk) to retrieve per iteration + default: 0 + - name: dtype + default: null + - name: encoding + type: str + default: latin-1 + - name: key + type: str + doc: key in artifact store (when log_data=True) + default: data + - name: dataset + type: str + doc: (None) if not None then "target_path/dataset" is folder for partitioned + files + default: None + - name: part_cols + doc: ([]) list of partitioning columns + default: [] + - name: file_ext + type: str + doc: (parquet) csv/parquet file extension + default: parquet + - name: index + type: bool + doc: (False) pandas save index option + default: false + - name: refresh_data + type: bool + doc: (False) overwrite existing data at that location + default: false + - name: stats + type: bool + doc: (None) calculate table stats when logging artifact + default: false + lineno: 68 + outputs: + - type: None + name: arc_to_parquet + has_kwargs: false + doc: 'Open a file/object archive and save as a parquet file or dataset + + + Notes + + ----- + + * this function is typically for large files, please be sure to check all + settings + + * partitioning requires precise specification of column types. 
+ + * the archive_url can be any file readable by pandas read_csv, which includes + tar files + + * if the `dataset` parameter is not empty, then a partitioned dataset will + be created + + instead of a single file in the folder `dataset` + + * if a key exists already then it will not be re-acquired unless the `refresh_data` + param + + is set to `True`. This is in case the original file is corrupt, or a refresh + is + + required.' + build: + functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmltcG9ydCBweWFycm93IGFzIHBhCmltcG9ydCBudW1weSBhcyBucAoKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmltcG9ydCBvcwoKCgpkZWYgX2NodW5rX3JlYWR3cml0ZSgKICAgICAgICBhcmNoaXZlX3VybCwKICAgICAgICBkZXN0X3BhdGgsCiAgICAgICAgY2h1bmtzaXplLAogICAgICAgIGhlYWRlciwKICAgICAgICBlbmNvZGluZywKICAgICAgICBkdHlwZSwKICAgICAgICBkYXRhc2V0Cik6CiAgICAiIiJzdHJlYW0gcmVhZCBhbmQgd3JpdGUgYXJjaGl2ZXMKCiAgICBwYW5kYXMgcmVhZHMgYW5kIHBhcnF1ZXQgd3JpdGVzCgogICAgbm90ZXMKICAgIC0tLS0tCiAgICAqIGRlc3RfcGF0aCBjYW4gYmUgZWl0aGVyIGEgZmlsZS5wYXJxdWV0LCBvciBpbiBodGUgY2FzZSBvZiBwYXJ0aXRpb25lZCBwYXJxdWV0CiAgICAgIGl0IHdpbGwgYmUgb25seSB0aGUgZGVzdGluYXRpb24gZm9sZ
GVyIG9mIHRoZSBwYXJxdWV0IHBhcnRpdGlvbiBmaWxlcwogICAgIiIiCiAgICBwcXdyaXRlciA9IE5vbmUKICAgIGhlYWRlciA9IFtdCiAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKHBkLnJlYWRfY3N2KGFyY2hpdmVfdXJsLCBjaHVua3NpemU9Y2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuYW1lcz1oZWFkZXIsIGVuY29kaW5nPWVuY29kaW5nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdHlwZT1kdHlwZSkpOgogICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgaWYgaSA9PSAwOgogICAgICAgICAgICBpZiBkYXRhc2V0OgogICAgICAgICAgICAgICAgaGVhZGVyID0gbnAuY29weSh0YWJsZS5zY2hlbWEpCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwcXdyaXRlciA9IHBxLlBhcnF1ZXRXcml0ZXIoZGVzdF9wYXRoLCB0YWJsZS5zY2hlbWEpCiAgICAgICAgaWYgZGF0YXNldDoKICAgICAgICAgICAgcHEud3JpdGVfdG9fZGF0YXNldCh0YWJsZSwgcm9vdF9wYXRoPWRlc3RfcGF0aCwgcGFydGl0aW9uX2NvbHM9cGFydGl0aW9uX2NvbHMpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCiAgICBpZiBwcXdyaXRlcjoKICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgcmV0dXJuIGhlYWRlcgoKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgaGVhZGVyOiBMaXN0W3N0cl0gPSBbTm9uZV0sCiAgICAgICAgY2h1bmtzaXplOiBpbnQgPSAwLAogICAgICAgIGR0eXBlPU5vbmUsCiAgICAgICAgZW5jb2Rpbmc6IHN0ciA9ICJsYXRpbi0xIiwKICAgICAgICBrZXk6IHN0ciA9ICJkYXRhIiwKICAgICAgICBkYXRhc2V0OiBzdHIgPSAiTm9uZSIsCiAgICAgICAgcGFydF9jb2xzPVtdLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgaW5kZXg6IGJvb2wgPSBGYWxzZSwKICAgICAgICByZWZyZXNoX2RhdGE6IGJvb2wgPSBGYWxzZSwKICAgICAgICBzdGF0czogYm9vbCA9IEZhbHNlCikgLT4gTm9uZToKICAgICIiIk9wZW4gYSBmaWxlL29iamVjdCBhcmNoaXZlIGFuZCBzYXZlIGFzIGEgcGFycXVldCBmaWxlIG9yIGRhdGFzZXQKCiAgICBOb3RlcwogICAgLS0tLS0KICAgICogdGhpcyBmdW5jdGlvbiBpcyB0eXBpY2FsbHkgZm9yIGxhcmdlIGZpbGVzLCBwbGVhc2UgYmUgc3VyZSB0byBjaGVjayBhbGwgc2V0dGluZ3MKICAgICogcGFydGl0aW9uaW5nIHJlcXVpcmVzIHByZWNpc2Ugc3BlY2lmaWNhdGlvbiBvZiBjb2x1bW4gdHlwZXMuCiAgICAqIHRoZSBhcmNoaXZlX3VybCBjYW4gYmUgYW55IGZpbGUgcmVhZGFibGUgYnkgcGFuZGFzIHJlYWRfY3N2LCB3aGljaCBpbmNsdWRlcyB0YXIgZmlsZXMKICAgICogaWYgdGhlIGBkYXRhc
2V0YCBwYXJhbWV0ZXIgaXMgbm90IGVtcHR5LCB0aGVuIGEgcGFydGl0aW9uZWQgZGF0YXNldCB3aWxsIGJlIGNyZWF0ZWQKICAgIGluc3RlYWQgb2YgYSBzaW5nbGUgZmlsZSBpbiB0aGUgZm9sZGVyIGBkYXRhc2V0YAogICAgKiBpZiBhIGtleSBleGlzdHMgYWxyZWFkeSB0aGVuIGl0IHdpbGwgbm90IGJlIHJlLWFjcXVpcmVkIHVubGVzcyB0aGUgYHJlZnJlc2hfZGF0YWAgcGFyYW0KICAgIGlzIHNldCB0byBgVHJ1ZWAuICBUaGlzIGlzIGluIGNhc2UgdGhlIG9yaWdpbmFsIGZpbGUgaXMgY29ycnVwdCwgb3IgYSByZWZyZXNoIGlzCiAgICByZXF1aXJlZC4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gYXJjaGl2ZV91cmw6ICAgIE1MUnVuIGRhdGEgaW5wdXQgKERhdGFJdGVtIG9iamVjdCkKICAgIDpwYXJhbSBjaHVua3NpemU6ICAgICAgKDApIHdoZW4gPiAwLCByb3cgc2l6ZSAoY2h1bmspIHRvIHJldHJpZXZlCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHBlciBpdGVyYXRpb24KICAgIDpwYXJhbSBkdHlwZSAgICAgICAgICAgZGVzdGluYXRpb24gZGF0YSB0eXBlIG9mIHNwZWNpZmllZCBjb2x1bW5zCiAgICA6cGFyYW0gZW5jb2RpbmcgICAgICAgICgibGF0aW4tOCIpIGZpbGUgZW5jb2RpbmcKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgICAga2V5IGluIGFydGlmYWN0IHN0b3JlICh3aGVuIGxvZ19kYXRhPVRydWUpCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgIChOb25lKSBpZiBub3QgTm9uZSB0aGVuICJ0YXJnZXRfcGF0aC9kYXRhc2V0IgogICAgICAgICAgICAgICAgICAgICAgICAgICBpcyBmb2xkZXIgZm9yIHBhcnRpdGlvbmVkIGZpbGVzCiAgICA6cGFyYW0gcGFydF9jb2xzOiAgICAgIChbXSkgbGlzdCBvZiBwYXJ0aXRpb25pbmcgY29sdW1ucwogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAocGFycXVldCkgY3N2L3BhcnF1ZXQgZmlsZSBleHRlbnNpb24KICAgIDpwYXJhbSBpbmRleDogICAgICAgICAgKEZhbHNlKSBwYW5kYXMgc2F2ZSBpbmRleCBvcHRpb24KICAgIDpwYXJhbSByZWZyZXNoX2RhdGE6ICAgKEZhbHNlKSBvdmVyd3JpdGUgZXhpc3RpbmcgZGF0YSBhdCB0aGF0IGxvY2F0aW9uCiAgICA6cGFyYW0gc3RhdHM6ICAgICAgICAgIChOb25lKSBjYWxjdWxhdGUgdGFibGUgc3RhdHMgd2hlbiBsb2dnaW5nIGFydGlmYWN0CiAgICAiIiIKICAgIGJhc2VfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3RfcGF0aAogICAgb3MubWFrZWRpcnMoYmFzZV9wYXRoLCBleGlzdF9vaz1UcnVlKQoKICAgIGFyY2hpdmVfdXJsID0gYXJjaGl2ZV91cmwubG9jYWwoKQoKICAgIGlmIGRhdGFzZXQgaXMgbm90IE5vbmU6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwgZGF0YXNldCkKICAgICAgICBleGlzdHMgPSBvcy5wYXRoLmlzZGlyKGRlc3RfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc
2VfcGF0aCwga2V5ICsgZiIue2ZpbGVfZXh0fSIpCiAgICAgICAgZXhpc3RzID0gb3MucGF0aC5pc2ZpbGUoZGVzdF9wYXRoKQoKICAgIGlmIG5vdCBleGlzdHM6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiZGVzdGluYXRpb24gZmlsZSBkb2VzIG5vdCBleGlzdCwgZG93bmxvYWRpbmciKQogICAgICAgIGlmIGNodW5rc2l6ZSA+IDA6CiAgICAgICAgICAgIGhlYWRlciA9IF9jaHVua19yZWFkd3JpdGUoYXJjaGl2ZV91cmwsIGRlc3RfcGF0aCwgY2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVuY29kaW5nLCBkdHlwZSwgZGF0YXNldCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXk9a2V5LCBzdGF0cz1zdGF0cywgZm9ybWF0PSdwYXJxdWV0JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0YXJnZXRfcGF0aD1kZXN0X3BhdGgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgZGYgPSBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXksIGRmPWRmLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PWluZGV4KQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJkZXN0aW5hdGlvbiBmaWxlIGFscmVhZHkgZXhpc3RzLCBub3RoaW5nIGRvbmUiKQ== + code_origin: '' + origin_filename: '' + description: retrieve remote archive, open and save as parquet + disable_auto_mount: false + image: mlrun/mlrun diff --git a/functions/master/arc_to_parquet/1.5.0/src/item.yaml b/functions/master/arc_to_parquet/1.5.0/src/item.yaml new file mode 100644 index 00000000..4bc2634c --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/src/item.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +categories: +- utils +description: retrieve remote archive, open and save as parquet +doc: '' +example: arc_to_parquet.ipynb +generationDate: 2022-08-28:17-25 +hidden: false +icon: '' +labels: + author: avi +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: arc-to-parquet +platformVersion: 3.5.4 +spec: + filename: arc_to_parquet.py + handler: arc_to_parquet + image: mlrun/mlrun + kind: job + requirements: [] +url: '' +version: 1.5.0 diff --git a/functions/master/arc_to_parquet/1.5.0/src/requirements.txt b/functions/master/arc_to_parquet/1.5.0/src/requirements.txt new file mode 100644 index 00000000..97eeefad --- /dev/null +++ 
b/functions/master/arc_to_parquet/1.5.0/src/requirements.txt @@ -0,0 +1,2 @@ +pyarrow +pandas \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/src/test_arc_to_parquet.py b/functions/master/arc_to_parquet/1.5.0/src/test_arc_to_parquet.py new file mode 100644 index 00000000..f0299f57 --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/src/test_arc_to_parquet.py @@ -0,0 +1,43 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from mlrun import code_to_function, import_function + +DATA_URL = "https://s3.wasabisys.com/iguazio/data/market-palce/arc_to_parquet/higgs-sample.csv.gz" + +def test_run_arc_to_parquet(): + fn = code_to_function(name='test_arc_to_parquet', + filename="arc_to_parquet.py", + handler="arc_to_parquet", + kind="local", + ) + run = fn.run(params={"key": "higgs-sample"}, + handler="arc_to_parquet", + inputs={"archive_url": DATA_URL}, + artifact_path='artifacts', + local=False) + + assert(run.outputs['higgs-sample']) + +def test_run_local_arc_to_parquet(): + import os + os.getcwd() + fn = import_function("function.yaml") + run = fn.run(params={"key": "higgs-sample"}, + handler="arc_to_parquet", + inputs={"archive_url": DATA_URL}, + artifact_path=os.getcwd()+'/artifacts', + local=True) + + assert(run.outputs['higgs-sample']) \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/static/arc_to_parquet.html b/functions/master/arc_to_parquet/1.5.0/static/arc_to_parquet.html 
new file mode 100644 index 00000000..3598982d --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/static/arc_to_parquet.html @@ -0,0 +1,309 @@ + + + + + + + +arc_to_parquet.arc_to_parquet + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+

+ +
+
+
+
+
+ +
+

Source code for arc_to_parquet.arc_to_parquet

+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import pandas as pd
+import pyarrow.parquet as pq
+import pyarrow as pa
+import numpy as np
+
+
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+
+from typing import List
+import os
+
+
+
+def _chunk_readwrite(
+        archive_url,
+        dest_path,
+        chunksize,
+        header,
+        encoding,
+        dtype,
+        dataset
+):
+    """stream read and write archives
+
+    pandas reads and parquet writes
+
+    notes
+    -----
+    * dest_path can be either a file.parquet, or in hte case of partitioned parquet
+      it will be only the destination folder of the parquet partition files
+    """
+    pqwriter = None
+    header = []
+    for i, df in enumerate(pd.read_csv(archive_url, chunksize=chunksize,
+                                       names=header, encoding=encoding,
+                                       dtype=dtype)):
+        table = pa.Table.from_pandas(df)
+        if i == 0:
+            if dataset:
+                header = np.copy(table.schema)
+            else:
+                pqwriter = pq.ParquetWriter(dest_path, table.schema)
+        if dataset:
+            pq.write_to_dataset(table, root_path=dest_path, partition_cols=partition_cols)
+        else:
+            pqwriter.write_table(table)
+    if pqwriter:
+        pqwriter.close()
+
+    return header
+
+
+
+[docs] +def arc_to_parquet( + context: MLClientCtx, + archive_url: DataItem, + header: List[str] = [None], + chunksize: int = 0, + dtype=None, + encoding: str = "latin-1", + key: str = "data", + dataset: str = "None", + part_cols=[], + file_ext: str = "parquet", + index: bool = False, + refresh_data: bool = False, + stats: bool = False +) -> None: + """Open a file/object archive and save as a parquet file or dataset + + Notes + ----- + * this function is typically for large files, please be sure to check all settings + * partitioning requires precise specification of column types. + * the archive_url can be any file readable by pandas read_csv, which includes tar files + * if the `dataset` parameter is not empty, then a partitioned dataset will be created + instead of a single file in the folder `dataset` + * if a key exists already then it will not be re-acquired unless the `refresh_data` param + is set to `True`. This is in case the original file is corrupt, or a refresh is + required. 
+ + :param context: the function context + :param archive_url: MLRun data input (DataItem object) + :param chunksize: (0) when > 0, row size (chunk) to retrieve + per iteration + :param dtype destination data type of specified columns + :param encoding ("latin-8") file encoding + :param key: key in artifact store (when log_data=True) + :param dataset: (None) if not None then "target_path/dataset" + is folder for partitioned files + :param part_cols: ([]) list of partitioning columns + :param file_ext: (parquet) csv/parquet file extension + :param index: (False) pandas save index option + :param refresh_data: (False) overwrite existing data at that location + :param stats: (None) calculate table stats when logging artifact + """ + base_path = context.artifact_path + os.makedirs(base_path, exist_ok=True) + + archive_url = archive_url.local() + + if dataset is not None: + dest_path = os.path.join(base_path, dataset) + exists = os.path.isdir(dest_path) + else: + dest_path = os.path.join(base_path, key + f".{file_ext}") + exists = os.path.isfile(dest_path) + + if not exists: + context.logger.info("destination file does not exist, downloading") + if chunksize > 0: + header = _chunk_readwrite(archive_url, dest_path, chunksize, + encoding, dtype, dataset) + context.log_dataset(key=key, stats=stats, format='parquet', + target_path=dest_path) + else: + df = pd.read_csv(archive_url) + context.log_dataset(key, df=df, format=file_ext, index=index) + else: + context.logger.info("destination file already exists, nothing done")
+ +
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/static/documentation.html b/functions/master/arc_to_parquet/1.5.0/static/documentation.html new file mode 100644 index 00000000..6f217a17 --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/static/documentation.html @@ -0,0 +1,280 @@ + + + + + + + +arc_to_parquet package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+
+

arc_to_parquet package

+ +
+ +
+
+ +
+
+

arc_to_parquet package#

+
+

Submodules#

+
+
+

arc_to_parquet.arc_to_parquet module#

+
+
+arc_to_parquet.arc_to_parquet.arc_to_parquet(context: MLClientCtx, archive_url: DataItem, header: List[str] = [None], chunksize: int = 0, dtype=None, encoding: str = 'latin-1', key: str = 'data', dataset: str = 'None', part_cols=[], file_ext: str = 'parquet', index: bool = False, refresh_data: bool = False, stats: bool = False) None[source]#
+

Open a file/object archive and save as a parquet file or dataset

+

Notes

+
    +
  • this function is typically for large files, please be sure to check all settings

  • +
  • partitioning requires precise specification of column types.

  • +
  • the archive_url can be any file readable by pandas read_csv, which includes tar files

  • +
  • if the dataset parameter is not empty, then a partitioned dataset will be created

  • +
+

instead of a single file in the folder dataset +* if a key exists already then it will not be re-acquired unless the refresh_data param +is set to True. This is in case the original file is corrupt, or a refresh is +required.

+
+
Parameters:
+
    +
  • context – the function context

  • +
  • archive_url – MLRun data input (DataItem object)

  • +
  • chunksize – (0) when > 0, row size (chunk) to retrieve +per iteration

  • +
+
+
+

:param dtype: destination data type of specified columns +:param encoding: (“latin-1”) file encoding +:param key: key in artifact store (when log_data=True) +:param dataset: (None) if not None then “target_path/dataset”

+
+

is folder for partitioned files

+
+
+
Parameters:
+
    +
  • part_cols – ([]) list of partitioning columns

  • +
  • file_ext – (parquet) csv/parquet file extension

  • +
  • index – (False) pandas save index option

  • +
  • refresh_data – (False) overwrite existing data at that location

  • +
  • stats – (None) calculate table stats when logging artifact

  • +
+
+
+
+
+
+

Module contents#

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/static/example.html b/functions/master/arc_to_parquet/1.5.0/static/example.html new file mode 100644 index 00000000..3f87a94c --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/static/example.html @@ -0,0 +1,818 @@ + + + + + + + +Archive to parquet function Example + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+
+

Archive to parquet function Example

+ +
+
+
+

Contents

+
+ +
+
+
+ +
+
+

Archive to parquet function Example#

+
+

The arc_to_parquet function is typically used for large files: it accepts an archive as input and stores the data in a file system. +In this example we use the arc_to_parquet function to unarchive the higgs-sample data file stored on S3 +and store it on the local file system in Parquet format.

+
+
+
+
# upload environment variables from env file if exists
+import os,mlrun
+   
+# Specify path
+path = "/tmp/examples_ci.env"
+   
+if os.path.exists(path):
+    env_dict = mlrun.set_env_from_file(path, return_dict=True)
+
+
+
+
+
+
+
# create the new project
+project_name = 'arch-to-parquet-example'
+
+# Initialize the MLRun project object
+project = mlrun.get_or_create_project(project_name, context="./", user_project=True)
+
+
+
+
+
> 2022-12-25 11:14:04,646 [info] loaded project arch-to-parquet-example from MLRun DB
+
+
+
+
+
+
+
# import packages
+import mlrun
+from mlrun import import_function
+
+
+
+
+
+
+
# declare the dataset
+DATA_URL = "https://s3.wasabisys.com/iguazio/data/market-palce/arc_to_parquet/higgs-sample.csv.gz"
+
+
+
+
+
+
+
# import the function
+arc_to_parquet_function = import_function("hub://arc_to_parquet")
+
+
+
+
+
+
+
# run the function
+arc_to_parquet_run = arc_to_parquet_function.run(params={"key": "higgs-sample"},
+           handler="arc_to_parquet",
+           inputs={"archive_url": DATA_URL}
+           )
+    
+
+
+
+
+
> 2022-12-25 11:14:05,030 [warning] it is recommended to use k8s secret (specify secret_name), specifying the aws_access_key/aws_secret_key directly is unsafe
+> 2022-12-25 11:14:05,046 [info] starting run arc-to-parquet-arc_to_parquet uid=cb1962a5333f4f9f9c16faabfd1e94c1 DB=http://mlrun-api:8080
+> 2022-12-25 11:14:05,203 [info] Job is running in the background, pod: arc-to-parquet-arc-to-parquet-8kz4b
+> 2022-12-25 11:14:44,126 [info] downloading https://s3.wasabisys.com/iguazio/data/market-palce/arc_to_parquet/higgs-sample.csv.gz to local temp file
+> 2022-12-25 11:14:44,793 [info] destination file does not exist, downloading
+> 2022-12-25 11:14:45,143 [info] To track results use the CLI: {'info_cmd': 'mlrun get run cb1962a5333f4f9f9c16faabfd1e94c1 -p arch-to-parquet-example-jovyan', 'logs_cmd': 'mlrun logs cb1962a5333f4f9f9c16faabfd1e94c1 -p arch-to-parquet-example-jovyan'}
+> 2022-12-25 11:14:45,144 [info] run executed, status=completed
+final state: completed
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
arch-to-parquet-example-jovyan0Dec 25 11:14:44completedarc-to-parquet-arc_to_parquet
kind=job
owner=jovyan
mlrun/client_version=1.2.1-rc7
host=arc-to-parquet-arc-to-parquet-8kz4b
archive_url
key=higgs-sample
higgs-sample
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-12-25 11:14:47,549 [info] run executed, status=completed
+
+
+
+
+
+

Show the results#

+
+
+
arc_to_parquet_run.artifact('higgs-sample').show()
+
+
+
+
+

Unnamed: 01.000000000000000000e+008.692932128906250000e-01-6.350818276405334473e-012.256902605295181274e-013.274700641632080078e-01-6.899932026863098145e-017.542022466659545898e-01-2.485731393098831177e-01-1.092063903808593750e+00...-1.045456994324922562e-02-4.576716944575309753e-023.101961374282836914e+001.353760004043579102e+009.795631170272827148e-019.780761599540710449e-019.200048446655273438e-017.216574549674987793e-019.887509346008300781e-018.766783475875854492e-01
001.00.9075420.3291470.3594121.497970-0.3130101.095531-0.557525-1.588230...-1.138930-0.0008190.0000000.3022200.8330480.9857000.9780980.7797320.9923560.798343
111.00.7988351.470639-1.6359750.4537730.4256291.1048751.2823221.381664...1.1288480.9004610.0000000.9097531.1083300.9856920.9513310.8032520.8659240.780118
220.01.344385-0.8766260.9359131.9920500.8824541.786066-1.646778-0.942383...-0.678379-1.3603560.0000000.9466521.0287040.9986560.7282810.8692001.0267360.957904
331.01.1050090.3213561.5224010.882808-1.2053490.681466-1.070464-0.921871...-0.3735660.1130410.0000000.7558561.3610570.9866100.8380851.1332950.8722450.808487
440.01.595839-0.6078110.0070751.818450-0.1119060.847550-0.5664371.581239...-0.654227-1.2743453.1019610.8237610.9381910.9717580.7891760.4305530.9613570.957818
..................................................................
95951.00.7087940.8502210.6723540.948589-1.1377551.2409110.4168611.581794...1.461144-0.7588320.0000000.9716620.8563501.1340240.9499691.5948261.0486550.922793
96960.01.1350220.285319-1.1094111.088544-0.8962611.1031340.1267240.964220...-1.183070-0.9563801.5509810.8831620.9257140.9865751.0577850.5996320.8871970.970676
97971.01.1240420.3544700.0398121.1324991.6203060.9559211.3754040.415942...-0.1753541.5619160.0000000.8515531.2510611.5463950.7434750.1385500.7176250.746045
98981.00.341495-1.223359-1.3729710.9936660.6919381.0861870.318829-1.185753...1.3054060.4260110.0000001.4295100.9751000.9880901.2573371.3532081.0404130.962988
99990.01.217926-0.307828-1.6015731.532369-1.0068240.555781-0.0594390.819528...-1.4878830.8111200.0000000.6272980.8121120.9893710.7044440.5734870.7088750.764996
+

100 rows × 30 columns

+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/static/function.html b/functions/master/arc_to_parquet/1.5.0/static/function.html new file mode 100644 index 00000000..998b172d --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/static/function.html @@ -0,0 +1,135 @@ + + + + + + + + + + + Source + + + + +
+        
+verbose: false
+metadata:
+  tag: ''
+  name: arc-to-parquet
+  categories:
+  - utils
+kind: job
+spec:
+  command: ''
+  default_handler: arc_to_parquet
+  entry_points:
+    arc_to_parquet:
+      has_varargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: the function context
+      - name: archive_url
+        type: DataItem
+        doc: MLRun data input (DataItem object)
+      - name: header
+        type: List[str]
+        default:
+        - null
+      - name: chunksize
+        type: int
+        doc: (0) when > 0, row size (chunk) to retrieve per iteration
+        default: 0
+      - name: dtype
+        default: null
+      - name: encoding
+        type: str
+        default: latin-1
+      - name: key
+        type: str
+        doc: key in artifact store (when log_data=True)
+        default: data
+      - name: dataset
+        type: str
+        doc: (None) if not None then "target_path/dataset" is folder for partitioned
+          files
+        default: None
+      - name: part_cols
+        doc: ([]) list of partitioning columns
+        default: []
+      - name: file_ext
+        type: str
+        doc: (parquet) csv/parquet file extension
+        default: parquet
+      - name: index
+        type: bool
+        doc: (False) pandas save index option
+        default: false
+      - name: refresh_data
+        type: bool
+        doc: (False) overwrite existing data at that location
+        default: false
+      - name: stats
+        type: bool
+        doc: (None) calculate table stats when logging artifact
+        default: false
+      lineno: 68
+      outputs:
+      - type: None
+      name: arc_to_parquet
+      has_kwargs: false
+      doc: 'Open a file/object archive and save as a parquet file or dataset
+
+
+        Notes
+
+        -----
+
+        * this function is typically for large files, please be sure to check all
+        settings
+
+        * partitioning requires precise specification of column types.
+
+        * the archive_url can be any file readable by pandas read_csv, which includes
+        tar files
+
+        * if the `dataset` parameter is not empty, then a partitioned dataset will
+        be created
+
+        instead of a single file in the folder `dataset`
+
+        * if a key exists already then it will not be re-acquired unless the `refresh_data`
+        param
+
+        is set to `True`.  This is in case the original file is corrupt, or a refresh
+        is
+
+        required.'
+  build:
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmltcG9ydCBweWFycm93IGFzIHBhCmltcG9ydCBudW1weSBhcyBucAoKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmltcG9ydCBvcwoKCgpkZWYgX2NodW5rX3JlYWR3cml0ZSgKICAgICAgICBhcmNoaXZlX3VybCwKICAgICAgICBkZXN0X3BhdGgsCiAgICAgICAgY2h1bmtzaXplLAogICAgICAgIGhlYWRlciwKICAgICAgICBlbmNvZGluZywKICAgICAgICBkdHlwZSwKICAgICAgICBkYXRhc2V0Cik6CiAgICAiIiJzdHJlYW0gcmVhZCBhbmQgd3JpdGUgYXJjaGl2ZXMKCiAgICBwYW5kYXMgcmVhZHMgYW5kIHBhcnF1ZXQgd3JpdGVzCgogICAgbm90ZXMKICAgIC0tLS0tCiAgICAqIGRlc3RfcGF0aCBjYW4gYmUgZWl0aGVyIGEgZmlsZS5wYXJxdWV0LCBvciBpbiBodGUgY2FzZSBvZiBwYXJ0aXRpb25lZCBwYXJxdWV0CiAgICAgIGl0IHdpbGwgYmUgb25seSB0aGUgZGVzdGluYXRpb24gZm9sZGVyIG9mIHRoZSBwYXJxdWV0IHBhcnRpdGlvbiBmaWxlcwogICAgIiIiCiAgICBwcXdyaXRlciA9IE5vbmUKICAgIGhlYWRlciA9IFtdCiAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKHBkLnJlYWRfY3N2KGFyY2hpdmVfdXJsLCBjaHVua3NpemU9Y2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuYW1lcz1oZWFkZXIsIGVuY29kaW5nPWVuY29kaW5nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdHlwZT1kdHlwZSkpOgogICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgaWYgaSA
9PSAwOgogICAgICAgICAgICBpZiBkYXRhc2V0OgogICAgICAgICAgICAgICAgaGVhZGVyID0gbnAuY29weSh0YWJsZS5zY2hlbWEpCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwcXdyaXRlciA9IHBxLlBhcnF1ZXRXcml0ZXIoZGVzdF9wYXRoLCB0YWJsZS5zY2hlbWEpCiAgICAgICAgaWYgZGF0YXNldDoKICAgICAgICAgICAgcHEud3JpdGVfdG9fZGF0YXNldCh0YWJsZSwgcm9vdF9wYXRoPWRlc3RfcGF0aCwgcGFydGl0aW9uX2NvbHM9cGFydGl0aW9uX2NvbHMpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCiAgICBpZiBwcXdyaXRlcjoKICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgcmV0dXJuIGhlYWRlcgoKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgaGVhZGVyOiBMaXN0W3N0cl0gPSBbTm9uZV0sCiAgICAgICAgY2h1bmtzaXplOiBpbnQgPSAwLAogICAgICAgIGR0eXBlPU5vbmUsCiAgICAgICAgZW5jb2Rpbmc6IHN0ciA9ICJsYXRpbi0xIiwKICAgICAgICBrZXk6IHN0ciA9ICJkYXRhIiwKICAgICAgICBkYXRhc2V0OiBzdHIgPSAiTm9uZSIsCiAgICAgICAgcGFydF9jb2xzPVtdLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgaW5kZXg6IGJvb2wgPSBGYWxzZSwKICAgICAgICByZWZyZXNoX2RhdGE6IGJvb2wgPSBGYWxzZSwKICAgICAgICBzdGF0czogYm9vbCA9IEZhbHNlCikgLT4gTm9uZToKICAgICIiIk9wZW4gYSBmaWxlL29iamVjdCBhcmNoaXZlIGFuZCBzYXZlIGFzIGEgcGFycXVldCBmaWxlIG9yIGRhdGFzZXQKCiAgICBOb3RlcwogICAgLS0tLS0KICAgICogdGhpcyBmdW5jdGlvbiBpcyB0eXBpY2FsbHkgZm9yIGxhcmdlIGZpbGVzLCBwbGVhc2UgYmUgc3VyZSB0byBjaGVjayBhbGwgc2V0dGluZ3MKICAgICogcGFydGl0aW9uaW5nIHJlcXVpcmVzIHByZWNpc2Ugc3BlY2lmaWNhdGlvbiBvZiBjb2x1bW4gdHlwZXMuCiAgICAqIHRoZSBhcmNoaXZlX3VybCBjYW4gYmUgYW55IGZpbGUgcmVhZGFibGUgYnkgcGFuZGFzIHJlYWRfY3N2LCB3aGljaCBpbmNsdWRlcyB0YXIgZmlsZXMKICAgICogaWYgdGhlIGBkYXRhc2V0YCBwYXJhbWV0ZXIgaXMgbm90IGVtcHR5LCB0aGVuIGEgcGFydGl0aW9uZWQgZGF0YXNldCB3aWxsIGJlIGNyZWF0ZWQKICAgIGluc3RlYWQgb2YgYSBzaW5nbGUgZmlsZSBpbiB0aGUgZm9sZGVyIGBkYXRhc2V0YAogICAgKiBpZiBhIGtleSBleGlzdHMgYWxyZWFkeSB0aGVuIGl0IHdpbGwgbm90IGJlIHJlLWFjcXVpcmVkIHVubGVzcyB0aGUgYHJlZnJlc2hfZGF0YWAgcGFyYW0KICAgIGlzIHNldCB0byBgVHJ1ZWAuICBUaGlzIGlzIGluIGNhc2UgdGhlIG9yaWdpbmFsIGZpbGUgaXMgY29ycnVwdCwgb3IgYSByZWZyZXNoIGlzCiAgICByZXF1aXJlZC4KCiAgICA6cGFyYW0gY29
udGV4dDogICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gYXJjaGl2ZV91cmw6ICAgIE1MUnVuIGRhdGEgaW5wdXQgKERhdGFJdGVtIG9iamVjdCkKICAgIDpwYXJhbSBjaHVua3NpemU6ICAgICAgKDApIHdoZW4gPiAwLCByb3cgc2l6ZSAoY2h1bmspIHRvIHJldHJpZXZlCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHBlciBpdGVyYXRpb24KICAgIDpwYXJhbSBkdHlwZSAgICAgICAgICAgZGVzdGluYXRpb24gZGF0YSB0eXBlIG9mIHNwZWNpZmllZCBjb2x1bW5zCiAgICA6cGFyYW0gZW5jb2RpbmcgICAgICAgICgibGF0aW4tOCIpIGZpbGUgZW5jb2RpbmcKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgICAga2V5IGluIGFydGlmYWN0IHN0b3JlICh3aGVuIGxvZ19kYXRhPVRydWUpCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgIChOb25lKSBpZiBub3QgTm9uZSB0aGVuICJ0YXJnZXRfcGF0aC9kYXRhc2V0IgogICAgICAgICAgICAgICAgICAgICAgICAgICBpcyBmb2xkZXIgZm9yIHBhcnRpdGlvbmVkIGZpbGVzCiAgICA6cGFyYW0gcGFydF9jb2xzOiAgICAgIChbXSkgbGlzdCBvZiBwYXJ0aXRpb25pbmcgY29sdW1ucwogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAocGFycXVldCkgY3N2L3BhcnF1ZXQgZmlsZSBleHRlbnNpb24KICAgIDpwYXJhbSBpbmRleDogICAgICAgICAgKEZhbHNlKSBwYW5kYXMgc2F2ZSBpbmRleCBvcHRpb24KICAgIDpwYXJhbSByZWZyZXNoX2RhdGE6ICAgKEZhbHNlKSBvdmVyd3JpdGUgZXhpc3RpbmcgZGF0YSBhdCB0aGF0IGxvY2F0aW9uCiAgICA6cGFyYW0gc3RhdHM6ICAgICAgICAgIChOb25lKSBjYWxjdWxhdGUgdGFibGUgc3RhdHMgd2hlbiBsb2dnaW5nIGFydGlmYWN0CiAgICAiIiIKICAgIGJhc2VfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3RfcGF0aAogICAgb3MubWFrZWRpcnMoYmFzZV9wYXRoLCBleGlzdF9vaz1UcnVlKQoKICAgIGFyY2hpdmVfdXJsID0gYXJjaGl2ZV91cmwubG9jYWwoKQoKICAgIGlmIGRhdGFzZXQgaXMgbm90IE5vbmU6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwgZGF0YXNldCkKICAgICAgICBleGlzdHMgPSBvcy5wYXRoLmlzZGlyKGRlc3RfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwga2V5ICsgZiIue2ZpbGVfZXh0fSIpCiAgICAgICAgZXhpc3RzID0gb3MucGF0aC5pc2ZpbGUoZGVzdF9wYXRoKQoKICAgIGlmIG5vdCBleGlzdHM6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiZGVzdGluYXRpb24gZmlsZSBkb2VzIG5vdCBleGlzdCwgZG93bmxvYWRpbmciKQogICAgICAgIGlmIGNodW5rc2l6ZSA+IDA6CiAgICAgICAgICAgIGhlYWRlciA9IF9jaHVua19yZWFkd3JpdGUoYXJjaGl2ZV91cmwsIGRlc3RfcGF0aCwgY2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVuY29kaW5nLCBkdHlwZSwgZGF0YXNldCk
KICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXk9a2V5LCBzdGF0cz1zdGF0cywgZm9ybWF0PSdwYXJxdWV0JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0YXJnZXRfcGF0aD1kZXN0X3BhdGgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgZGYgPSBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXksIGRmPWRmLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PWluZGV4KQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJkZXN0aW5hdGlvbiBmaWxlIGFscmVhZHkgZXhpc3RzLCBub3RoaW5nIGRvbmUiKQ==
+    code_origin: ''
+    origin_filename: ''
+  description: retrieve remote archive, open and save as parquet
+  disable_auto_mount: false
+  image: mlrun/mlrun
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/static/item.html b/functions/master/arc_to_parquet/1.5.0/static/item.html new file mode 100644 index 00000000..f5e22a54 --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/static/item.html @@ -0,0 +1,59 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- utils
+description: retrieve remote archive, open and save as parquet
+doc: ''
+example: arc_to_parquet.ipynb
+generationDate: 2022-08-28:17-25
+hidden: false
+icon: ''
+labels:
+  author: avi
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.7.0
+name: arc-to-parquet
+platformVersion: 3.5.4
+spec:
+  filename: arc_to_parquet.py
+  handler: arc_to_parquet
+  image: mlrun/mlrun
+  kind: job
+  requirements: []
+url: ''
+version: 1.5.0
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/arc_to_parquet/1.5.0/static/source.html b/functions/master/arc_to_parquet/1.5.0/static/source.html new file mode 100644 index 00000000..bc20fefd --- /dev/null +++ b/functions/master/arc_to_parquet/1.5.0/static/source.html @@ -0,0 +1,168 @@ + + + + + + + + + + + Source + + + + +
+        
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import pandas as pd
+import pyarrow.parquet as pq
+import pyarrow as pa
+import numpy as np
+
+
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+
+from typing import List
+import os
+
+
+
+def _chunk_readwrite(
+        archive_url,
+        dest_path,
+        chunksize,
+        header,
+        encoding,
+        dtype,
+        dataset
+):
+    """stream read and write archives
+
+    pandas reads and parquet writes
+
+    notes
+    -----
+    * dest_path can be either a file.parquet, or in hte case of partitioned parquet
+      it will be only the destination folder of the parquet partition files
+    """
+    pqwriter = None
+    header = []
+    for i, df in enumerate(pd.read_csv(archive_url, chunksize=chunksize,
+                                       names=header, encoding=encoding,
+                                       dtype=dtype)):
+        table = pa.Table.from_pandas(df)
+        if i == 0:
+            if dataset:
+                header = np.copy(table.schema)
+            else:
+                pqwriter = pq.ParquetWriter(dest_path, table.schema)
+        if dataset:
+            pq.write_to_dataset(table, root_path=dest_path, partition_cols=partition_cols)
+        else:
+            pqwriter.write_table(table)
+    if pqwriter:
+        pqwriter.close()
+
+    return header
+
+
+def arc_to_parquet(
+        context: MLClientCtx,
+        archive_url: DataItem,
+        header: List[str] = [None],
+        chunksize: int = 0,
+        dtype=None,
+        encoding: str = "latin-1",
+        key: str = "data",
+        dataset: str = "None",
+        part_cols=[],
+        file_ext: str = "parquet",
+        index: bool = False,
+        refresh_data: bool = False,
+        stats: bool = False
+) -> None:
+    """Open a file/object archive and save as a parquet file or dataset
+
+    Notes
+    -----
+    * this function is typically for large files, please be sure to check all settings
+    * partitioning requires precise specification of column types.
+    * the archive_url can be any file readable by pandas read_csv, which includes tar files
+    * if the `dataset` parameter is not empty, then a partitioned dataset will be created
+    instead of a single file in the folder `dataset`
+    * if a key exists already then it will not be re-acquired unless the `refresh_data` param
+    is set to `True`.  This is in case the original file is corrupt, or a refresh is
+    required.
+
+    :param context:        the function context
+    :param archive_url:    MLRun data input (DataItem object)
+    :param chunksize:      (0) when > 0, row size (chunk) to retrieve
+                           per iteration
+    :param dtype           destination data type of specified columns
+    :param encoding        ("latin-8") file encoding
+    :param key:            key in artifact store (when log_data=True)
+    :param dataset:        (None) if not None then "target_path/dataset"
+                           is folder for partitioned files
+    :param part_cols:      ([]) list of partitioning columns
+    :param file_ext:       (parquet) csv/parquet file extension
+    :param index:          (False) pandas save index option
+    :param refresh_data:   (False) overwrite existing data at that location
+    :param stats:          (None) calculate table stats when logging artifact
+    """
+    base_path = context.artifact_path
+    os.makedirs(base_path, exist_ok=True)
+
+    archive_url = archive_url.local()
+
+    if dataset is not None:
+        dest_path = os.path.join(base_path, dataset)
+        exists = os.path.isdir(dest_path)
+    else:
+        dest_path = os.path.join(base_path, key + f".{file_ext}")
+        exists = os.path.isfile(dest_path)
+
+    if not exists:
+        context.logger.info("destination file does not exist, downloading")
+        if chunksize > 0:
+            header = _chunk_readwrite(archive_url, dest_path, chunksize,
+                                      encoding, dtype, dataset)
+            context.log_dataset(key=key, stats=stats, format='parquet',
+                                target_path=dest_path)
+        else:
+            df = pd.read_csv(archive_url)
+            context.log_dataset(key, df=df, format=file_ext, index=index)
+    else:
+        context.logger.info("destination file already exists, nothing done")
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/arc_to_parquet/latest/src/function.yaml b/functions/master/arc_to_parquet/latest/src/function.yaml index f76d0494..d10e841c 100644 --- a/functions/master/arc_to_parquet/latest/src/function.yaml +++ b/functions/master/arc_to_parquet/latest/src/function.yaml @@ -1,62 +1,23 @@ -kind: job +verbose: false metadata: - name: arc-to-parquet tag: '' - hash: 959e5c3513bb7568402b6ce4023f4615e224b566 - project: '' - labels: - author: avi + name: arc-to-parquet categories: - - etl + - utils +kind: job spec: command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmltcG9ydCBweWFycm93IGFzIHBhCmltcG9ydCBudW1weSBhcyBucAoKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmltcG9ydCBvcwoKCgpkZWYgX2NodW5rX3JlYWR3cml0ZSgKICAgICAgICBhcmNoaXZlX3VybCwKICAgICAgICBkZXN0X3BhdGgsCiAgICAgICAgY2h1bmtzaXplLAogICAgICAgIGhlYWRlciwKICAgICAgICBlbmNvZGluZywKICAgICAgICBkdHlwZSwKICAgICAgICBkYXRhc2V0Cik6CiAgICAiIiJzdHJlYW0gcmVhZCBhbmQgd3JpdGUgYXJjaGl2ZXMKCiAgICBwYW5kYXMgcmVhZHMgYW5kIHBhcnF1ZXQgd3JpdGVzCgogICAgbm90ZXMKICAgIC0tLS0
tCiAgICAqIGRlc3RfcGF0aCBjYW4gYmUgZWl0aGVyIGEgZmlsZS5wYXJxdWV0LCBvciBpbiBodGUgY2FzZSBvZiBwYXJ0aXRpb25lZCBwYXJxdWV0CiAgICAgIGl0IHdpbGwgYmUgb25seSB0aGUgZGVzdGluYXRpb24gZm9sZGVyIG9mIHRoZSBwYXJxdWV0IHBhcnRpdGlvbiBmaWxlcwogICAgIiIiCiAgICBwcXdyaXRlciA9IE5vbmUKICAgIGhlYWRlciA9IFtdCiAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKHBkLnJlYWRfY3N2KGFyY2hpdmVfdXJsLCBjaHVua3NpemU9Y2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuYW1lcz1oZWFkZXIsIGVuY29kaW5nPWVuY29kaW5nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdHlwZT1kdHlwZSkpOgogICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgaWYgaSA9PSAwOgogICAgICAgICAgICBpZiBkYXRhc2V0OgogICAgICAgICAgICAgICAgaGVhZGVyID0gbnAuY29weSh0YWJsZS5zY2hlbWEpCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwcXdyaXRlciA9IHBxLlBhcnF1ZXRXcml0ZXIoZGVzdF9wYXRoLCB0YWJsZS5zY2hlbWEpCiAgICAgICAgaWYgZGF0YXNldDoKICAgICAgICAgICAgcHEud3JpdGVfdG9fZGF0YXNldCh0YWJsZSwgcm9vdF9wYXRoPWRlc3RfcGF0aCwgcGFydGl0aW9uX2NvbHM9cGFydGl0aW9uX2NvbHMpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCiAgICBpZiBwcXdyaXRlcjoKICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgcmV0dXJuIGhlYWRlcgoKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgaGVhZGVyOiBMaXN0W3N0cl0gPSBbTm9uZV0sCiAgICAgICAgY2h1bmtzaXplOiBpbnQgPSAwLAogICAgICAgIGR0eXBlPU5vbmUsCiAgICAgICAgZW5jb2Rpbmc6IHN0ciA9ICJsYXRpbi0xIiwKICAgICAgICBrZXk6IHN0ciA9ICJkYXRhIiwKICAgICAgICBkYXRhc2V0OiBzdHIgPSAiTm9uZSIsCiAgICAgICAgcGFydF9jb2xzPVtdLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgaW5kZXg6IGJvb2wgPSBGYWxzZSwKICAgICAgICByZWZyZXNoX2RhdGE6IGJvb2wgPSBGYWxzZSwKICAgICAgICBzdGF0czogYm9vbCA9IEZhbHNlCikgLT4gTm9uZToKICAgICIiIk9wZW4gYSBmaWxlL29iamVjdCBhcmNoaXZlIGFuZCBzYXZlIGFzIGEgcGFycXVldCBmaWxlIG9yIGRhdGFzZXQKCiAgICBOb3RlcwogICAgLS0tLS0KICAgICogdGhpcyBmdW5jdGlvbiBpcyB0eXBpY2FsbHkgZm9yIGxhcmdlIGZpbGVzLCBwbGVhc2UgYmUgc3VyZSB0byBjaGVjayBhbGwgc2V0dGluZ3MKICAgICogcGFydGl0aW9uaW5nIHJlcXVpcmVzIHByZWNpc2Ugc3BlY2lmaWNhdGlvbiB
vZiBjb2x1bW4gdHlwZXMuCiAgICAqIHRoZSBhcmNoaXZlX3VybCBjYW4gYmUgYW55IGZpbGUgcmVhZGFibGUgYnkgcGFuZGFzIHJlYWRfY3N2LCB3aGljaCBpbmNsdWRlcyB0YXIgZmlsZXMKICAgICogaWYgdGhlIGBkYXRhc2V0YCBwYXJhbWV0ZXIgaXMgbm90IGVtcHR5LCB0aGVuIGEgcGFydGl0aW9uZWQgZGF0YXNldCB3aWxsIGJlIGNyZWF0ZWQKICAgIGluc3RlYWQgb2YgYSBzaW5nbGUgZmlsZSBpbiB0aGUgZm9sZGVyIGBkYXRhc2V0YAogICAgKiBpZiBhIGtleSBleGlzdHMgYWxyZWFkeSB0aGVuIGl0IHdpbGwgbm90IGJlIHJlLWFjcXVpcmVkIHVubGVzcyB0aGUgYHJlZnJlc2hfZGF0YWAgcGFyYW0KICAgIGlzIHNldCB0byBgVHJ1ZWAuICBUaGlzIGlzIGluIGNhc2UgdGhlIG9yaWdpbmFsIGZpbGUgaXMgY29ycnVwdCwgb3IgYSByZWZyZXNoIGlzCiAgICByZXF1aXJlZC4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gYXJjaGl2ZV91cmw6ICAgIE1MUnVuIGRhdGEgaW5wdXQgKERhdGFJdGVtIG9iamVjdCkKICAgIDpwYXJhbSBjaHVua3NpemU6ICAgICAgKDApIHdoZW4gPiAwLCByb3cgc2l6ZSAoY2h1bmspIHRvIHJldHJpZXZlCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHBlciBpdGVyYXRpb24KICAgIDpwYXJhbSBkdHlwZSAgICAgICAgICAgZGVzdGluYXRpb24gZGF0YSB0eXBlIG9mIHNwZWNpZmllZCBjb2x1bW5zCiAgICA6cGFyYW0gZW5jb2RpbmcgICAgICAgICgibGF0aW4tOCIpIGZpbGUgZW5jb2RpbmcKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgICAga2V5IGluIGFydGlmYWN0IHN0b3JlICh3aGVuIGxvZ19kYXRhPVRydWUpCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgIChOb25lKSBpZiBub3QgTm9uZSB0aGVuICJ0YXJnZXRfcGF0aC9kYXRhc2V0IgogICAgICAgICAgICAgICAgICAgICAgICAgICBpcyBmb2xkZXIgZm9yIHBhcnRpdGlvbmVkIGZpbGVzCiAgICA6cGFyYW0gcGFydF9jb2xzOiAgICAgIChbXSkgbGlzdCBvZiBwYXJ0aXRpb25pbmcgY29sdW1ucwogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAocGFycXVldCkgY3N2L3BhcnF1ZXQgZmlsZSBleHRlbnNpb24KICAgIDpwYXJhbSBpbmRleDogICAgICAgICAgKEZhbHNlKSBwYW5kYXMgc2F2ZSBpbmRleCBvcHRpb24KICAgIDpwYXJhbSByZWZyZXNoX2RhdGE6ICAgKEZhbHNlKSBvdmVyd3JpdGUgZXhpc3RpbmcgZGF0YSBhdCB0aGF0IGxvY2F0aW9uCiAgICA6cGFyYW0gc3RhdHM6ICAgICAgICAgIChOb25lKSBjYWxjdWxhdGUgdGFibGUgc3RhdHMgd2hlbiBsb2dnaW5nIGFydGlmYWN0CiAgICAiIiIKICAgIGJhc2VfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3RfcGF0aAogICAgb3MubWFrZWRpcnMoYmFzZV9wYXRoLCBleGlzdF9vaz1UcnVlKQoKICAgIGFyY2hpdmVfdXJsID0gYXJjaGl2ZV91cmwubG9jYWwoKQoKICAgIGlmIGRhdGFzZXQgaXMgbm90IE5vbmU6CiAgICAgICAgZGVzdF9
wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwgZGF0YXNldCkKICAgICAgICBleGlzdHMgPSBvcy5wYXRoLmlzZGlyKGRlc3RfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwga2V5ICsgZiIue2ZpbGVfZXh0fSIpCiAgICAgICAgZXhpc3RzID0gb3MucGF0aC5pc2ZpbGUoZGVzdF9wYXRoKQoKICAgIGlmIG5vdCBleGlzdHM6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiZGVzdGluYXRpb24gZmlsZSBkb2VzIG5vdCBleGlzdCwgZG93bmxvYWRpbmciKQogICAgICAgIGlmIGNodW5rc2l6ZSA+IDA6CiAgICAgICAgICAgIGhlYWRlciA9IF9jaHVua19yZWFkd3JpdGUoYXJjaGl2ZV91cmwsIGRlc3RfcGF0aCwgY2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVuY29kaW5nLCBkdHlwZSwgZGF0YXNldCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXk9a2V5LCBzdGF0cz1zdGF0cywgZm9ybWF0PSdwYXJxdWV0JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0YXJnZXRfcGF0aD1kZXN0X3BhdGgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgZGYgPSBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXksIGRmPWRmLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PWluZGV4KQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJkZXN0aW5hdGlvbiBmaWxlIGFscmVhZHkgZXhpc3RzLCBub3RoaW5nIGRvbmUiKQ== - commands: [] - code_origin: http://github.com/aviaIguazio/functions.git#b32ae36ee9e5fb7a3b0affa8c15046aae9df7d24:/Users/Avi_Asulin/PycharmProjects/functions/arc_to_parquet/arc_to_parquet.py - origin_filename: /Users/Avi_Asulin/PycharmProjects/functions/arc_to_parquet/arc_to_parquet.py - requirements: [] + default_handler: arc_to_parquet entry_points: arc_to_parquet: - name: arc_to_parquet - doc: 'Open a file/object archive and save as a parquet file or dataset - - - Notes - - ----- - - * this function is typically for large files, please be sure to check all - settings - - * partitioning requires precise specification of column types. 
- - * the archive_url can be any file readable by pandas read_csv, which includes - tar files - - * if the `dataset` parameter is not empty, then a partitioned dataset will - be created - - instead of a single file in the folder `dataset` - - * if a key exists already then it will not be re-acquired unless the `refresh_data` - param - - is set to `True`. This is in case the original file is corrupt, or a refresh - is - - required.' + has_varargs: false parameters: - name: context type: MLClientCtx doc: the function context - default: '' - name: archive_url type: DataItem doc: MLRun data input (DataItem object) - default: '' - name: header type: List[str] default: @@ -98,17 +59,42 @@ spec: type: bool doc: (None) calculate table stats when logging artifact default: false - outputs: - - default: '' lineno: 68 + outputs: + - type: None + name: arc_to_parquet + has_kwargs: false + doc: 'Open a file/object archive and save as a parquet file or dataset + + + Notes + + ----- + + * this function is typically for large files, please be sure to check all + settings + + * partitioning requires precise specification of column types. + + * the archive_url can be any file readable by pandas read_csv, which includes + tar files + + * if the `dataset` parameter is not empty, then a partitioned dataset will + be created + + instead of a single file in the folder `dataset` + + * if a key exists already then it will not be re-acquired unless the `refresh_data` + param + + is set to `True`. This is in case the original file is corrupt, or a refresh + is + + required.' 
+ build: + functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmltcG9ydCBweWFycm93IGFzIHBhCmltcG9ydCBudW1weSBhcyBucAoKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmltcG9ydCBvcwoKCgpkZWYgX2NodW5rX3JlYWR3cml0ZSgKICAgICAgICBhcmNoaXZlX3VybCwKICAgICAgICBkZXN0X3BhdGgsCiAgICAgICAgY2h1bmtzaXplLAogICAgICAgIGhlYWRlciwKICAgICAgICBlbmNvZGluZywKICAgICAgICBkdHlwZSwKICAgICAgICBkYXRhc2V0Cik6CiAgICAiIiJzdHJlYW0gcmVhZCBhbmQgd3JpdGUgYXJjaGl2ZXMKCiAgICBwYW5kYXMgcmVhZHMgYW5kIHBhcnF1ZXQgd3JpdGVzCgogICAgbm90ZXMKICAgIC0tLS0tCiAgICAqIGRlc3RfcGF0aCBjYW4gYmUgZWl0aGVyIGEgZmlsZS5wYXJxdWV0LCBvciBpbiBodGUgY2FzZSBvZiBwYXJ0aXRpb25lZCBwYXJxdWV0CiAgICAgIGl0IHdpbGwgYmUgb25seSB0aGUgZGVzdGluYXRpb24gZm9sZGVyIG9mIHRoZSBwYXJxdWV0IHBhcnRpdGlvbiBmaWxlcwogICAgIiIiCiAgICBwcXdyaXRlciA9IE5vbmUKICAgIGhlYWRlciA9IFtdCiAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKHBkLnJlYWRfY3N2KGFyY2hpdmVfdXJsLCBjaHVua3NpemU9Y2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuYW1lcz1oZWFkZXIsIGVuY29kaW5nPWVuY29kaW5nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdHlwZT1kdHlwZSkpOgogICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAga
WYgaSA9PSAwOgogICAgICAgICAgICBpZiBkYXRhc2V0OgogICAgICAgICAgICAgICAgaGVhZGVyID0gbnAuY29weSh0YWJsZS5zY2hlbWEpCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwcXdyaXRlciA9IHBxLlBhcnF1ZXRXcml0ZXIoZGVzdF9wYXRoLCB0YWJsZS5zY2hlbWEpCiAgICAgICAgaWYgZGF0YXNldDoKICAgICAgICAgICAgcHEud3JpdGVfdG9fZGF0YXNldCh0YWJsZSwgcm9vdF9wYXRoPWRlc3RfcGF0aCwgcGFydGl0aW9uX2NvbHM9cGFydGl0aW9uX2NvbHMpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCiAgICBpZiBwcXdyaXRlcjoKICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgcmV0dXJuIGhlYWRlcgoKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgaGVhZGVyOiBMaXN0W3N0cl0gPSBbTm9uZV0sCiAgICAgICAgY2h1bmtzaXplOiBpbnQgPSAwLAogICAgICAgIGR0eXBlPU5vbmUsCiAgICAgICAgZW5jb2Rpbmc6IHN0ciA9ICJsYXRpbi0xIiwKICAgICAgICBrZXk6IHN0ciA9ICJkYXRhIiwKICAgICAgICBkYXRhc2V0OiBzdHIgPSAiTm9uZSIsCiAgICAgICAgcGFydF9jb2xzPVtdLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgaW5kZXg6IGJvb2wgPSBGYWxzZSwKICAgICAgICByZWZyZXNoX2RhdGE6IGJvb2wgPSBGYWxzZSwKICAgICAgICBzdGF0czogYm9vbCA9IEZhbHNlCikgLT4gTm9uZToKICAgICIiIk9wZW4gYSBmaWxlL29iamVjdCBhcmNoaXZlIGFuZCBzYXZlIGFzIGEgcGFycXVldCBmaWxlIG9yIGRhdGFzZXQKCiAgICBOb3RlcwogICAgLS0tLS0KICAgICogdGhpcyBmdW5jdGlvbiBpcyB0eXBpY2FsbHkgZm9yIGxhcmdlIGZpbGVzLCBwbGVhc2UgYmUgc3VyZSB0byBjaGVjayBhbGwgc2V0dGluZ3MKICAgICogcGFydGl0aW9uaW5nIHJlcXVpcmVzIHByZWNpc2Ugc3BlY2lmaWNhdGlvbiBvZiBjb2x1bW4gdHlwZXMuCiAgICAqIHRoZSBhcmNoaXZlX3VybCBjYW4gYmUgYW55IGZpbGUgcmVhZGFibGUgYnkgcGFuZGFzIHJlYWRfY3N2LCB3aGljaCBpbmNsdWRlcyB0YXIgZmlsZXMKICAgICogaWYgdGhlIGBkYXRhc2V0YCBwYXJhbWV0ZXIgaXMgbm90IGVtcHR5LCB0aGVuIGEgcGFydGl0aW9uZWQgZGF0YXNldCB3aWxsIGJlIGNyZWF0ZWQKICAgIGluc3RlYWQgb2YgYSBzaW5nbGUgZmlsZSBpbiB0aGUgZm9sZGVyIGBkYXRhc2V0YAogICAgKiBpZiBhIGtleSBleGlzdHMgYWxyZWFkeSB0aGVuIGl0IHdpbGwgbm90IGJlIHJlLWFjcXVpcmVkIHVubGVzcyB0aGUgYHJlZnJlc2hfZGF0YWAgcGFyYW0KICAgIGlzIHNldCB0byBgVHJ1ZWAuICBUaGlzIGlzIGluIGNhc2UgdGhlIG9yaWdpbmFsIGZpbGUgaXMgY29ycnVwdCwgb3IgYSByZWZyZXNoIGlzCiAgICByZXF1aXJlZC4KCiAgICA6cGFyY
W0gY29udGV4dDogICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gYXJjaGl2ZV91cmw6ICAgIE1MUnVuIGRhdGEgaW5wdXQgKERhdGFJdGVtIG9iamVjdCkKICAgIDpwYXJhbSBjaHVua3NpemU6ICAgICAgKDApIHdoZW4gPiAwLCByb3cgc2l6ZSAoY2h1bmspIHRvIHJldHJpZXZlCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHBlciBpdGVyYXRpb24KICAgIDpwYXJhbSBkdHlwZSAgICAgICAgICAgZGVzdGluYXRpb24gZGF0YSB0eXBlIG9mIHNwZWNpZmllZCBjb2x1bW5zCiAgICA6cGFyYW0gZW5jb2RpbmcgICAgICAgICgibGF0aW4tOCIpIGZpbGUgZW5jb2RpbmcKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgICAga2V5IGluIGFydGlmYWN0IHN0b3JlICh3aGVuIGxvZ19kYXRhPVRydWUpCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgIChOb25lKSBpZiBub3QgTm9uZSB0aGVuICJ0YXJnZXRfcGF0aC9kYXRhc2V0IgogICAgICAgICAgICAgICAgICAgICAgICAgICBpcyBmb2xkZXIgZm9yIHBhcnRpdGlvbmVkIGZpbGVzCiAgICA6cGFyYW0gcGFydF9jb2xzOiAgICAgIChbXSkgbGlzdCBvZiBwYXJ0aXRpb25pbmcgY29sdW1ucwogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAocGFycXVldCkgY3N2L3BhcnF1ZXQgZmlsZSBleHRlbnNpb24KICAgIDpwYXJhbSBpbmRleDogICAgICAgICAgKEZhbHNlKSBwYW5kYXMgc2F2ZSBpbmRleCBvcHRpb24KICAgIDpwYXJhbSByZWZyZXNoX2RhdGE6ICAgKEZhbHNlKSBvdmVyd3JpdGUgZXhpc3RpbmcgZGF0YSBhdCB0aGF0IGxvY2F0aW9uCiAgICA6cGFyYW0gc3RhdHM6ICAgICAgICAgIChOb25lKSBjYWxjdWxhdGUgdGFibGUgc3RhdHMgd2hlbiBsb2dnaW5nIGFydGlmYWN0CiAgICAiIiIKICAgIGJhc2VfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3RfcGF0aAogICAgb3MubWFrZWRpcnMoYmFzZV9wYXRoLCBleGlzdF9vaz1UcnVlKQoKICAgIGFyY2hpdmVfdXJsID0gYXJjaGl2ZV91cmwubG9jYWwoKQoKICAgIGlmIGRhdGFzZXQgaXMgbm90IE5vbmU6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwgZGF0YXNldCkKICAgICAgICBleGlzdHMgPSBvcy5wYXRoLmlzZGlyKGRlc3RfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwga2V5ICsgZiIue2ZpbGVfZXh0fSIpCiAgICAgICAgZXhpc3RzID0gb3MucGF0aC5pc2ZpbGUoZGVzdF9wYXRoKQoKICAgIGlmIG5vdCBleGlzdHM6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiZGVzdGluYXRpb24gZmlsZSBkb2VzIG5vdCBleGlzdCwgZG93bmxvYWRpbmciKQogICAgICAgIGlmIGNodW5rc2l6ZSA+IDA6CiAgICAgICAgICAgIGhlYWRlciA9IF9jaHVua19yZWFkd3JpdGUoYXJjaGl2ZV91cmwsIGRlc3RfcGF0aCwgY2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVuY29kaW5nLCBkdHlwZSwgZGF0Y
XNldCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXk9a2V5LCBzdGF0cz1zdGF0cywgZm9ybWF0PSdwYXJxdWV0JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0YXJnZXRfcGF0aD1kZXN0X3BhdGgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgZGYgPSBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXksIGRmPWRmLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PWluZGV4KQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJkZXN0aW5hdGlvbiBmaWxlIGFscmVhZHkgZXhpc3RzLCBub3RoaW5nIGRvbmUiKQ== + code_origin: '' + origin_filename: '' description: retrieve remote archive, open and save as parquet - default_handler: arc_to_parquet disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false + image: mlrun/mlrun diff --git a/functions/master/arc_to_parquet/latest/src/item.yaml b/functions/master/arc_to_parquet/latest/src/item.yaml index e08535f9..4bc2634c 100644 --- a/functions/master/arc_to_parquet/latest/src/item.yaml +++ b/functions/master/arc_to_parquet/latest/src/item.yaml @@ -1,6 +1,6 @@ apiVersion: v1 categories: -- etl +- utils description: retrieve remote archive, open and save as parquet doc: '' example: arc_to_parquet.ipynb @@ -11,7 +11,7 @@ labels: author: avi maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.1 +mlrunVersion: 1.7.0 name: arc-to-parquet platformVersion: 3.5.4 spec: @@ -21,4 +21,4 @@ spec: kind: job requirements: [] url: '' -version: 1.4.1 +version: 1.5.0 diff --git a/functions/master/arc_to_parquet/latest/static/arc_to_parquet.html b/functions/master/arc_to_parquet/latest/static/arc_to_parquet.html index 40fcaf4b..3598982d 100644 --- a/functions/master/arc_to_parquet/latest/static/arc_to_parquet.html +++ b/functions/master/arc_to_parquet/latest/static/arc_to_parquet.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/arc_to_parquet/latest/static/documentation.html b/functions/master/arc_to_parquet/latest/static/documentation.html index 
31c44a7a..6f217a17 100644 --- a/functions/master/arc_to_parquet/latest/static/documentation.html +++ b/functions/master/arc_to_parquet/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/arc_to_parquet/latest/static/example.html b/functions/master/arc_to_parquet/latest/static/example.html index d0a60d36..3f87a94c 100644 --- a/functions/master/arc_to_parquet/latest/static/example.html +++ b/functions/master/arc_to_parquet/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/arc_to_parquet/latest/static/function.html b/functions/master/arc_to_parquet/latest/static/function.html index 690f9b29..998b172d 100644 --- a/functions/master/arc_to_parquet/latest/static/function.html +++ b/functions/master/arc_to_parquet/latest/static/function.html @@ -28,65 +28,26 @@
         
-kind: job
+verbose: false
 metadata:
-  name: arc-to-parquet
   tag: ''
-  hash: 959e5c3513bb7568402b6ce4023f4615e224b566
-  project: ''
-  labels:
-    author: avi
+  name: arc-to-parquet
   categories:
-  - etl
+  - utils
+kind: job
 spec:
   command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmltcG9ydCBweWFycm93IGFzIHBhCmltcG9ydCBudW1weSBhcyBucAoKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmltcG9ydCBvcwoKCgpkZWYgX2NodW5rX3JlYWR3cml0ZSgKICAgICAgICBhcmNoaXZlX3VybCwKICAgICAgICBkZXN0X3BhdGgsCiAgICAgICAgY2h1bmtzaXplLAogICAgICAgIGhlYWRlciwKICAgICAgICBlbmNvZGluZywKICAgICAgICBkdHlwZSwKICAgICAgICBkYXRhc2V0Cik6CiAgICAiIiJzdHJlYW0gcmVhZCBhbmQgd3JpdGUgYXJjaGl2ZXMKCiAgICBwYW5kYXMgcmVhZHMgYW5kIHBhcnF1ZXQgd3JpdGVzCgogICAgbm90ZXMKICAgIC0tLS0tCiAgICAqIGRlc3RfcGF0aCBjYW4gYmUgZWl0aGVyIGEgZmlsZS5wYXJxdWV0LCBvciBpbiBodGUgY2FzZSBvZiBwYXJ0aXRpb25lZCBwYXJxdWV0CiAgICAgIGl0IHdpbGwgYmUgb25seSB0aGUgZGVzdGluYXRpb24gZm9sZGVyIG9mIHRoZSBwYXJxdWV0IHBhcnRpdGlvbiBmaWxlcwogICAgIiIiCiAgICBwcXdyaXRlciA9IE5vbmUKICAgIGhlYWRlciA9IFtdCiAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKHBkLnJlYWRfY3N2KGFyY2hpdmVfdXJsLCBjaHVua3NpemU9Y2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuYW1lcz1oZWFkZXIsIGVuY29kaW5nPWVuY29kaW5nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdHlwZT1kdHlwZSkpOgogICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgaWYgaSA
9PSAwOgogICAgICAgICAgICBpZiBkYXRhc2V0OgogICAgICAgICAgICAgICAgaGVhZGVyID0gbnAuY29weSh0YWJsZS5zY2hlbWEpCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwcXdyaXRlciA9IHBxLlBhcnF1ZXRXcml0ZXIoZGVzdF9wYXRoLCB0YWJsZS5zY2hlbWEpCiAgICAgICAgaWYgZGF0YXNldDoKICAgICAgICAgICAgcHEud3JpdGVfdG9fZGF0YXNldCh0YWJsZSwgcm9vdF9wYXRoPWRlc3RfcGF0aCwgcGFydGl0aW9uX2NvbHM9cGFydGl0aW9uX2NvbHMpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCiAgICBpZiBwcXdyaXRlcjoKICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgcmV0dXJuIGhlYWRlcgoKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgaGVhZGVyOiBMaXN0W3N0cl0gPSBbTm9uZV0sCiAgICAgICAgY2h1bmtzaXplOiBpbnQgPSAwLAogICAgICAgIGR0eXBlPU5vbmUsCiAgICAgICAgZW5jb2Rpbmc6IHN0ciA9ICJsYXRpbi0xIiwKICAgICAgICBrZXk6IHN0ciA9ICJkYXRhIiwKICAgICAgICBkYXRhc2V0OiBzdHIgPSAiTm9uZSIsCiAgICAgICAgcGFydF9jb2xzPVtdLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgaW5kZXg6IGJvb2wgPSBGYWxzZSwKICAgICAgICByZWZyZXNoX2RhdGE6IGJvb2wgPSBGYWxzZSwKICAgICAgICBzdGF0czogYm9vbCA9IEZhbHNlCikgLT4gTm9uZToKICAgICIiIk9wZW4gYSBmaWxlL29iamVjdCBhcmNoaXZlIGFuZCBzYXZlIGFzIGEgcGFycXVldCBmaWxlIG9yIGRhdGFzZXQKCiAgICBOb3RlcwogICAgLS0tLS0KICAgICogdGhpcyBmdW5jdGlvbiBpcyB0eXBpY2FsbHkgZm9yIGxhcmdlIGZpbGVzLCBwbGVhc2UgYmUgc3VyZSB0byBjaGVjayBhbGwgc2V0dGluZ3MKICAgICogcGFydGl0aW9uaW5nIHJlcXVpcmVzIHByZWNpc2Ugc3BlY2lmaWNhdGlvbiBvZiBjb2x1bW4gdHlwZXMuCiAgICAqIHRoZSBhcmNoaXZlX3VybCBjYW4gYmUgYW55IGZpbGUgcmVhZGFibGUgYnkgcGFuZGFzIHJlYWRfY3N2LCB3aGljaCBpbmNsdWRlcyB0YXIgZmlsZXMKICAgICogaWYgdGhlIGBkYXRhc2V0YCBwYXJhbWV0ZXIgaXMgbm90IGVtcHR5LCB0aGVuIGEgcGFydGl0aW9uZWQgZGF0YXNldCB3aWxsIGJlIGNyZWF0ZWQKICAgIGluc3RlYWQgb2YgYSBzaW5nbGUgZmlsZSBpbiB0aGUgZm9sZGVyIGBkYXRhc2V0YAogICAgKiBpZiBhIGtleSBleGlzdHMgYWxyZWFkeSB0aGVuIGl0IHdpbGwgbm90IGJlIHJlLWFjcXVpcmVkIHVubGVzcyB0aGUgYHJlZnJlc2hfZGF0YWAgcGFyYW0KICAgIGlzIHNldCB0byBgVHJ1ZWAuICBUaGlzIGlzIGluIGNhc2UgdGhlIG9yaWdpbmFsIGZpbGUgaXMgY29ycnVwdCwgb3IgYSByZWZyZXNoIGlzCiAgICByZXF1aXJlZC4KCiAgICA6cGFyYW0gY29
udGV4dDogICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gYXJjaGl2ZV91cmw6ICAgIE1MUnVuIGRhdGEgaW5wdXQgKERhdGFJdGVtIG9iamVjdCkKICAgIDpwYXJhbSBjaHVua3NpemU6ICAgICAgKDApIHdoZW4gPiAwLCByb3cgc2l6ZSAoY2h1bmspIHRvIHJldHJpZXZlCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHBlciBpdGVyYXRpb24KICAgIDpwYXJhbSBkdHlwZSAgICAgICAgICAgZGVzdGluYXRpb24gZGF0YSB0eXBlIG9mIHNwZWNpZmllZCBjb2x1bW5zCiAgICA6cGFyYW0gZW5jb2RpbmcgICAgICAgICgibGF0aW4tOCIpIGZpbGUgZW5jb2RpbmcKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgICAga2V5IGluIGFydGlmYWN0IHN0b3JlICh3aGVuIGxvZ19kYXRhPVRydWUpCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgIChOb25lKSBpZiBub3QgTm9uZSB0aGVuICJ0YXJnZXRfcGF0aC9kYXRhc2V0IgogICAgICAgICAgICAgICAgICAgICAgICAgICBpcyBmb2xkZXIgZm9yIHBhcnRpdGlvbmVkIGZpbGVzCiAgICA6cGFyYW0gcGFydF9jb2xzOiAgICAgIChbXSkgbGlzdCBvZiBwYXJ0aXRpb25pbmcgY29sdW1ucwogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAocGFycXVldCkgY3N2L3BhcnF1ZXQgZmlsZSBleHRlbnNpb24KICAgIDpwYXJhbSBpbmRleDogICAgICAgICAgKEZhbHNlKSBwYW5kYXMgc2F2ZSBpbmRleCBvcHRpb24KICAgIDpwYXJhbSByZWZyZXNoX2RhdGE6ICAgKEZhbHNlKSBvdmVyd3JpdGUgZXhpc3RpbmcgZGF0YSBhdCB0aGF0IGxvY2F0aW9uCiAgICA6cGFyYW0gc3RhdHM6ICAgICAgICAgIChOb25lKSBjYWxjdWxhdGUgdGFibGUgc3RhdHMgd2hlbiBsb2dnaW5nIGFydGlmYWN0CiAgICAiIiIKICAgIGJhc2VfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3RfcGF0aAogICAgb3MubWFrZWRpcnMoYmFzZV9wYXRoLCBleGlzdF9vaz1UcnVlKQoKICAgIGFyY2hpdmVfdXJsID0gYXJjaGl2ZV91cmwubG9jYWwoKQoKICAgIGlmIGRhdGFzZXQgaXMgbm90IE5vbmU6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwgZGF0YXNldCkKICAgICAgICBleGlzdHMgPSBvcy5wYXRoLmlzZGlyKGRlc3RfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwga2V5ICsgZiIue2ZpbGVfZXh0fSIpCiAgICAgICAgZXhpc3RzID0gb3MucGF0aC5pc2ZpbGUoZGVzdF9wYXRoKQoKICAgIGlmIG5vdCBleGlzdHM6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiZGVzdGluYXRpb24gZmlsZSBkb2VzIG5vdCBleGlzdCwgZG93bmxvYWRpbmciKQogICAgICAgIGlmIGNodW5rc2l6ZSA+IDA6CiAgICAgICAgICAgIGhlYWRlciA9IF9jaHVua19yZWFkd3JpdGUoYXJjaGl2ZV91cmwsIGRlc3RfcGF0aCwgY2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVuY29kaW5nLCBkdHlwZSwgZGF0YXNldCk
KICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXk9a2V5LCBzdGF0cz1zdGF0cywgZm9ybWF0PSdwYXJxdWV0JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0YXJnZXRfcGF0aD1kZXN0X3BhdGgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgZGYgPSBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXksIGRmPWRmLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PWluZGV4KQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJkZXN0aW5hdGlvbiBmaWxlIGFscmVhZHkgZXhpc3RzLCBub3RoaW5nIGRvbmUiKQ==
-    commands: []
-    code_origin: http://github.com/aviaIguazio/functions.git#b32ae36ee9e5fb7a3b0affa8c15046aae9df7d24:/Users/Avi_Asulin/PycharmProjects/functions/arc_to_parquet/arc_to_parquet.py
-    origin_filename: /Users/Avi_Asulin/PycharmProjects/functions/arc_to_parquet/arc_to_parquet.py
-    requirements: []
+  default_handler: arc_to_parquet
   entry_points:
     arc_to_parquet:
-      name: arc_to_parquet
-      doc: 'Open a file/object archive and save as a parquet file or dataset
-
-
-        Notes
-
-        -----
-
-        * this function is typically for large files, please be sure to check all
-        settings
-
-        * partitioning requires precise specification of column types.
-
-        * the archive_url can be any file readable by pandas read_csv, which includes
-        tar files
-
-        * if the `dataset` parameter is not empty, then a partitioned dataset will
-        be created
-
-        instead of a single file in the folder `dataset`
-
-        * if a key exists already then it will not be re-acquired unless the `refresh_data`
-        param
-
-        is set to `True`.  This is in case the original file is corrupt, or a refresh
-        is
-
-        required.'
+      has_varargs: false
       parameters:
       - name: context
         type: MLClientCtx
         doc: the function context
-        default: ''
       - name: archive_url
         type: DataItem
         doc: MLRun data input (DataItem object)
-        default: ''
       - name: header
         type: List[str]
         default:
@@ -128,20 +89,45 @@
         type: bool
         doc: (None) calculate table stats when logging artifact
         default: false
-      outputs:
-      - default: ''
       lineno: 68
+      outputs:
+      - type: None
+      name: arc_to_parquet
+      has_kwargs: false
+      doc: 'Open a file/object archive and save as a parquet file or dataset
+
+
+        Notes
+
+        -----
+
+        * this function is typically for large files, please be sure to check all
+        settings
+
+        * partitioning requires precise specification of column types.
+
+        * the archive_url can be any file readable by pandas read_csv, which includes
+        tar files
+
+        * if the `dataset` parameter is not empty, then a partitioned dataset will
+        be created
+
+        instead of a single file in the folder `dataset`
+
+        * if a key exists already then it will not be re-acquired unless the `refresh_data`
+        param
+
+        is set to `True`.  This is in case the original file is corrupt, or a refresh
+        is
+
+        required.'
+  build:
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmltcG9ydCBweWFycm93IGFzIHBhCmltcG9ydCBudW1weSBhcyBucAoKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmZyb20gdHlwaW5nIGltcG9ydCBMaXN0CmltcG9ydCBvcwoKCgpkZWYgX2NodW5rX3JlYWR3cml0ZSgKICAgICAgICBhcmNoaXZlX3VybCwKICAgICAgICBkZXN0X3BhdGgsCiAgICAgICAgY2h1bmtzaXplLAogICAgICAgIGhlYWRlciwKICAgICAgICBlbmNvZGluZywKICAgICAgICBkdHlwZSwKICAgICAgICBkYXRhc2V0Cik6CiAgICAiIiJzdHJlYW0gcmVhZCBhbmQgd3JpdGUgYXJjaGl2ZXMKCiAgICBwYW5kYXMgcmVhZHMgYW5kIHBhcnF1ZXQgd3JpdGVzCgogICAgbm90ZXMKICAgIC0tLS0tCiAgICAqIGRlc3RfcGF0aCBjYW4gYmUgZWl0aGVyIGEgZmlsZS5wYXJxdWV0LCBvciBpbiBodGUgY2FzZSBvZiBwYXJ0aXRpb25lZCBwYXJxdWV0CiAgICAgIGl0IHdpbGwgYmUgb25seSB0aGUgZGVzdGluYXRpb24gZm9sZGVyIG9mIHRoZSBwYXJxdWV0IHBhcnRpdGlvbiBmaWxlcwogICAgIiIiCiAgICBwcXdyaXRlciA9IE5vbmUKICAgIGhlYWRlciA9IFtdCiAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKHBkLnJlYWRfY3N2KGFyY2hpdmVfdXJsLCBjaHVua3NpemU9Y2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBuYW1lcz1oZWFkZXIsIGVuY29kaW5nPWVuY29kaW5nLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdHlwZT1kdHlwZSkpOgogICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgaWYgaSA
9PSAwOgogICAgICAgICAgICBpZiBkYXRhc2V0OgogICAgICAgICAgICAgICAgaGVhZGVyID0gbnAuY29weSh0YWJsZS5zY2hlbWEpCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBwcXdyaXRlciA9IHBxLlBhcnF1ZXRXcml0ZXIoZGVzdF9wYXRoLCB0YWJsZS5zY2hlbWEpCiAgICAgICAgaWYgZGF0YXNldDoKICAgICAgICAgICAgcHEud3JpdGVfdG9fZGF0YXNldCh0YWJsZSwgcm9vdF9wYXRoPWRlc3RfcGF0aCwgcGFydGl0aW9uX2NvbHM9cGFydGl0aW9uX2NvbHMpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCiAgICBpZiBwcXdyaXRlcjoKICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgcmV0dXJuIGhlYWRlcgoKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgaGVhZGVyOiBMaXN0W3N0cl0gPSBbTm9uZV0sCiAgICAgICAgY2h1bmtzaXplOiBpbnQgPSAwLAogICAgICAgIGR0eXBlPU5vbmUsCiAgICAgICAgZW5jb2Rpbmc6IHN0ciA9ICJsYXRpbi0xIiwKICAgICAgICBrZXk6IHN0ciA9ICJkYXRhIiwKICAgICAgICBkYXRhc2V0OiBzdHIgPSAiTm9uZSIsCiAgICAgICAgcGFydF9jb2xzPVtdLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgaW5kZXg6IGJvb2wgPSBGYWxzZSwKICAgICAgICByZWZyZXNoX2RhdGE6IGJvb2wgPSBGYWxzZSwKICAgICAgICBzdGF0czogYm9vbCA9IEZhbHNlCikgLT4gTm9uZToKICAgICIiIk9wZW4gYSBmaWxlL29iamVjdCBhcmNoaXZlIGFuZCBzYXZlIGFzIGEgcGFycXVldCBmaWxlIG9yIGRhdGFzZXQKCiAgICBOb3RlcwogICAgLS0tLS0KICAgICogdGhpcyBmdW5jdGlvbiBpcyB0eXBpY2FsbHkgZm9yIGxhcmdlIGZpbGVzLCBwbGVhc2UgYmUgc3VyZSB0byBjaGVjayBhbGwgc2V0dGluZ3MKICAgICogcGFydGl0aW9uaW5nIHJlcXVpcmVzIHByZWNpc2Ugc3BlY2lmaWNhdGlvbiBvZiBjb2x1bW4gdHlwZXMuCiAgICAqIHRoZSBhcmNoaXZlX3VybCBjYW4gYmUgYW55IGZpbGUgcmVhZGFibGUgYnkgcGFuZGFzIHJlYWRfY3N2LCB3aGljaCBpbmNsdWRlcyB0YXIgZmlsZXMKICAgICogaWYgdGhlIGBkYXRhc2V0YCBwYXJhbWV0ZXIgaXMgbm90IGVtcHR5LCB0aGVuIGEgcGFydGl0aW9uZWQgZGF0YXNldCB3aWxsIGJlIGNyZWF0ZWQKICAgIGluc3RlYWQgb2YgYSBzaW5nbGUgZmlsZSBpbiB0aGUgZm9sZGVyIGBkYXRhc2V0YAogICAgKiBpZiBhIGtleSBleGlzdHMgYWxyZWFkeSB0aGVuIGl0IHdpbGwgbm90IGJlIHJlLWFjcXVpcmVkIHVubGVzcyB0aGUgYHJlZnJlc2hfZGF0YWAgcGFyYW0KICAgIGlzIHNldCB0byBgVHJ1ZWAuICBUaGlzIGlzIGluIGNhc2UgdGhlIG9yaWdpbmFsIGZpbGUgaXMgY29ycnVwdCwgb3IgYSByZWZyZXNoIGlzCiAgICByZXF1aXJlZC4KCiAgICA6cGFyYW0gY29
udGV4dDogICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gYXJjaGl2ZV91cmw6ICAgIE1MUnVuIGRhdGEgaW5wdXQgKERhdGFJdGVtIG9iamVjdCkKICAgIDpwYXJhbSBjaHVua3NpemU6ICAgICAgKDApIHdoZW4gPiAwLCByb3cgc2l6ZSAoY2h1bmspIHRvIHJldHJpZXZlCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHBlciBpdGVyYXRpb24KICAgIDpwYXJhbSBkdHlwZSAgICAgICAgICAgZGVzdGluYXRpb24gZGF0YSB0eXBlIG9mIHNwZWNpZmllZCBjb2x1bW5zCiAgICA6cGFyYW0gZW5jb2RpbmcgICAgICAgICgibGF0aW4tOCIpIGZpbGUgZW5jb2RpbmcKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgICAga2V5IGluIGFydGlmYWN0IHN0b3JlICh3aGVuIGxvZ19kYXRhPVRydWUpCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgIChOb25lKSBpZiBub3QgTm9uZSB0aGVuICJ0YXJnZXRfcGF0aC9kYXRhc2V0IgogICAgICAgICAgICAgICAgICAgICAgICAgICBpcyBmb2xkZXIgZm9yIHBhcnRpdGlvbmVkIGZpbGVzCiAgICA6cGFyYW0gcGFydF9jb2xzOiAgICAgIChbXSkgbGlzdCBvZiBwYXJ0aXRpb25pbmcgY29sdW1ucwogICAgOnBhcmFtIGZpbGVfZXh0OiAgICAgICAocGFycXVldCkgY3N2L3BhcnF1ZXQgZmlsZSBleHRlbnNpb24KICAgIDpwYXJhbSBpbmRleDogICAgICAgICAgKEZhbHNlKSBwYW5kYXMgc2F2ZSBpbmRleCBvcHRpb24KICAgIDpwYXJhbSByZWZyZXNoX2RhdGE6ICAgKEZhbHNlKSBvdmVyd3JpdGUgZXhpc3RpbmcgZGF0YSBhdCB0aGF0IGxvY2F0aW9uCiAgICA6cGFyYW0gc3RhdHM6ICAgICAgICAgIChOb25lKSBjYWxjdWxhdGUgdGFibGUgc3RhdHMgd2hlbiBsb2dnaW5nIGFydGlmYWN0CiAgICAiIiIKICAgIGJhc2VfcGF0aCA9IGNvbnRleHQuYXJ0aWZhY3RfcGF0aAogICAgb3MubWFrZWRpcnMoYmFzZV9wYXRoLCBleGlzdF9vaz1UcnVlKQoKICAgIGFyY2hpdmVfdXJsID0gYXJjaGl2ZV91cmwubG9jYWwoKQoKICAgIGlmIGRhdGFzZXQgaXMgbm90IE5vbmU6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwgZGF0YXNldCkKICAgICAgICBleGlzdHMgPSBvcy5wYXRoLmlzZGlyKGRlc3RfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKGJhc2VfcGF0aCwga2V5ICsgZiIue2ZpbGVfZXh0fSIpCiAgICAgICAgZXhpc3RzID0gb3MucGF0aC5pc2ZpbGUoZGVzdF9wYXRoKQoKICAgIGlmIG5vdCBleGlzdHM6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiZGVzdGluYXRpb24gZmlsZSBkb2VzIG5vdCBleGlzdCwgZG93bmxvYWRpbmciKQogICAgICAgIGlmIGNodW5rc2l6ZSA+IDA6CiAgICAgICAgICAgIGhlYWRlciA9IF9jaHVua19yZWFkd3JpdGUoYXJjaGl2ZV91cmwsIGRlc3RfcGF0aCwgY2h1bmtzaXplLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVuY29kaW5nLCBkdHlwZSwgZGF0YXNldCk
KICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXk9a2V5LCBzdGF0cz1zdGF0cywgZm9ybWF0PSdwYXJxdWV0JywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0YXJnZXRfcGF0aD1kZXN0X3BhdGgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgZGYgPSBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCkKICAgICAgICAgICAgY29udGV4dC5sb2dfZGF0YXNldChrZXksIGRmPWRmLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PWluZGV4KQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJkZXN0aW5hdGlvbiBmaWxlIGFscmVhZHkgZXhpc3RzLCBub3RoaW5nIGRvbmUiKQ==
+    code_origin: ''
+    origin_filename: ''
   description: retrieve remote archive, open and save as parquet
-  default_handler: arc_to_parquet
   disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
+  image: mlrun/mlrun
 
         
     
diff --git a/functions/master/arc_to_parquet/latest/static/item.html b/functions/master/arc_to_parquet/latest/static/item.html index 3d2efcf4..f5e22a54 100644 --- a/functions/master/arc_to_parquet/latest/static/item.html +++ b/functions/master/arc_to_parquet/latest/static/item.html @@ -30,7 +30,7 @@ apiVersion: v1 categories: -- etl +- utils description: retrieve remote archive, open and save as parquet doc: '' example: arc_to_parquet.ipynb @@ -41,7 +41,7 @@ author: avi maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.1 +mlrunVersion: 1.7.0 name: arc-to-parquet platformVersion: 3.5.4 spec: @@ -51,7 +51,7 @@ kind: job requirements: [] url: '' -version: 1.4.1 +version: 1.5.0 diff --git a/functions/master/auto_trainer/1.7.0/static/auto_trainer.html b/functions/master/auto_trainer/1.7.0/static/auto_trainer.html index cb1160d7..795a4a23 100644 --- a/functions/master/auto_trainer/1.7.0/static/auto_trainer.html +++ b/functions/master/auto_trainer/1.7.0/static/auto_trainer.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/auto_trainer/1.7.0/static/documentation.html b/functions/master/auto_trainer/1.7.0/static/documentation.html index 2e4a160d..b0bdf6b4 100644 --- a/functions/master/auto_trainer/1.7.0/static/documentation.html +++ b/functions/master/auto_trainer/1.7.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/auto_trainer/1.7.0/static/example.html b/functions/master/auto_trainer/1.7.0/static/example.html index 46a819e9..2c4fdabf 100644 --- a/functions/master/auto_trainer/1.7.0/static/example.html +++ b/functions/master/auto_trainer/1.7.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/auto_trainer/latest/static/auto_trainer.html b/functions/master/auto_trainer/latest/static/auto_trainer.html index cb1160d7..795a4a23 100644 --- a/functions/master/auto_trainer/latest/static/auto_trainer.html +++ b/functions/master/auto_trainer/latest/static/auto_trainer.html @@ -20,7 +20,7 @@ - + diff --git 
a/functions/master/auto_trainer/latest/static/documentation.html b/functions/master/auto_trainer/latest/static/documentation.html index 2e4a160d..b0bdf6b4 100644 --- a/functions/master/auto_trainer/latest/static/documentation.html +++ b/functions/master/auto_trainer/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/auto_trainer/latest/static/example.html b/functions/master/auto_trainer/latest/static/example.html index 46a819e9..2c4fdabf 100644 --- a/functions/master/auto_trainer/latest/static/example.html +++ b/functions/master/auto_trainer/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_serving/1.1.0/src/function.yaml b/functions/master/azureml_serving/1.1.0/src/function.yaml index c558e625..26229e70 100644 --- a/functions/master/azureml_serving/1.1.0/src/function.yaml +++ b/functions/master/azureml_serving/1.1.0/src/function.yaml @@ -48,4 +48,4 @@ spec: secret_sources: [] affinity: null tolerations: null -verbose: false +verbose: false \ No newline at end of file diff --git a/functions/master/azureml_serving/1.1.0/src/item.yaml b/functions/master/azureml_serving/1.1.0/src/item.yaml index 84fadd55..d20e636b 100644 --- a/functions/master/azureml_serving/1.1.0/src/item.yaml +++ b/functions/master/azureml_serving/1.1.0/src/item.yaml @@ -24,4 +24,4 @@ spec: requirements: - azureml-automl-runtime~=1.38.1 url: '' -version: 1.1.0 +version: 1.1.0 \ No newline at end of file diff --git a/functions/master/azureml_serving/1.1.0/static/documentation.html b/functions/master/azureml_serving/1.1.0/static/documentation.html index c5194e21..92c7c89d 100644 --- a/functions/master/azureml_serving/1.1.0/static/documentation.html +++ b/functions/master/azureml_serving/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_serving/1.1.0/static/example.html b/functions/master/azureml_serving/1.1.0/static/example.html index ebc576a1..52c450d9 100644 --- 
a/functions/master/azureml_serving/1.1.0/static/example.html +++ b/functions/master/azureml_serving/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_serving/1.1.0/static/function.html b/functions/master/azureml_serving/1.1.0/static/function.html index a792e369..a2b3e82d 100644 --- a/functions/master/azureml_serving/1.1.0/static/function.html +++ b/functions/master/azureml_serving/1.1.0/static/function.html @@ -79,7 +79,6 @@ affinity: null tolerations: null verbose: false - diff --git a/functions/master/azureml_serving/1.1.0/static/item.html b/functions/master/azureml_serving/1.1.0/static/item.html index 062811e0..6e8b05b3 100644 --- a/functions/master/azureml_serving/1.1.0/static/item.html +++ b/functions/master/azureml_serving/1.1.0/static/item.html @@ -55,7 +55,6 @@ - azureml-automl-runtime~=1.38.1 url: '' version: 1.1.0 - diff --git a/functions/master/azureml_serving/latest/src/function.yaml b/functions/master/azureml_serving/latest/src/function.yaml index c558e625..26229e70 100644 --- a/functions/master/azureml_serving/latest/src/function.yaml +++ b/functions/master/azureml_serving/latest/src/function.yaml @@ -48,4 +48,4 @@ spec: secret_sources: [] affinity: null tolerations: null -verbose: false +verbose: false \ No newline at end of file diff --git a/functions/master/azureml_serving/latest/src/item.yaml b/functions/master/azureml_serving/latest/src/item.yaml index 84fadd55..d20e636b 100644 --- a/functions/master/azureml_serving/latest/src/item.yaml +++ b/functions/master/azureml_serving/latest/src/item.yaml @@ -24,4 +24,4 @@ spec: requirements: - azureml-automl-runtime~=1.38.1 url: '' -version: 1.1.0 +version: 1.1.0 \ No newline at end of file diff --git a/functions/master/azureml_serving/latest/static/documentation.html b/functions/master/azureml_serving/latest/static/documentation.html index c5194e21..92c7c89d 100644 --- a/functions/master/azureml_serving/latest/static/documentation.html +++ 
b/functions/master/azureml_serving/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_serving/latest/static/example.html b/functions/master/azureml_serving/latest/static/example.html index ebc576a1..52c450d9 100644 --- a/functions/master/azureml_serving/latest/static/example.html +++ b/functions/master/azureml_serving/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_serving/latest/static/function.html b/functions/master/azureml_serving/latest/static/function.html index a792e369..a2b3e82d 100644 --- a/functions/master/azureml_serving/latest/static/function.html +++ b/functions/master/azureml_serving/latest/static/function.html @@ -79,7 +79,6 @@ affinity: null tolerations: null verbose: false - diff --git a/functions/master/azureml_serving/latest/static/item.html b/functions/master/azureml_serving/latest/static/item.html index 062811e0..6e8b05b3 100644 --- a/functions/master/azureml_serving/latest/static/item.html +++ b/functions/master/azureml_serving/latest/static/item.html @@ -55,7 +55,6 @@ - azureml-automl-runtime~=1.38.1 url: '' version: 1.1.0 - diff --git a/functions/master/azureml_utils/1.4.0/src/azureml_utils.ipynb b/functions/master/azureml_utils/1.4.0/src/azureml_utils.ipynb new file mode 100644 index 00000000..3ab6f766 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/src/azureml_utils.ipynb @@ -0,0 +1,1899 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# AzureML AutoML Demo\n", + "MLRun function for using Azure AutoML, Including the following handlers:\n", + "1. `init_experiment` - Initialize workspace and experiment in Azure ML.\n", + "2. `init_compute` - Initialize Azure ML compute target to run experiment.\n", + "3. 
`register_dataset` - Register dataset object (can be also an Iguazio FeatureVector) in Azure ML.\n", + "4. `download_model` - Download trained model from Azure ML to local filesystem.\n", + "5. `upload_model` - Upload pre-trained model from local filesystem to Azure ML.\n", + "6. `submit_training_job` - Submit training job to Azure AutoML and download trained model when completed.\n", + "7. `automl_train` - Whole training flow for Azure AutoML:\n", + " - Initializing workspace and experiment in Azure ML\n", + " - Registers dataset/feature vector,\n", + " - submits training job\n", + " - downloads trained model" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Setup MLRun Project\n", + "\n", + "Creating MLRun project" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-02-02 18:28:06,840 [warning] Failed resolving version info. Ignoring and using defaults\n", + "> 2022-02-02 18:28:11,379 [warning] Server or client version is unstable. Assuming compatible: {'server_version': '0.0.0+unstable', 'client_version': '0.0.0+unstable'}\n" + ] + } + ], + "source": [ + "import mlrun" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-02-02 18:28:11,423 [info] loaded project azureml from MLRun DB\n" + ] + } + ], + "source": [ + "# Initialize the MLRun project object\n", + "project = mlrun.get_or_create_project('azureml', context=\"./\", user_project=True)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 2. 
Preparing Dataset (Iris)\n", + "\n", + "- Preparing training URI for the MLRun function" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)label
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", + "
" + ], + "text/plain": [ + " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " label \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DATA_URL = \"https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv\"\n", + "\n", + "mlrun.get_dataitem(DATA_URL).as_df().head()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 3. Submit Azure AutoML Training Job" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### Submit Azure Secrets\n", + "For more information about working with secrets see: [MLRun docs: Working with secrets](https://docs.mlrun.org/en/latest/secrets.html)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [], + "source": [ + "project.set_secrets(file_path=\"env\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### Import `azureml_utils` from marketplace" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "azureml_fn = mlrun.import_function('hub://azureml_utils')\n", + "azureml_fn.deploy()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### Automl configuration & run parameters\n", + "\n", + "- The `automl_settings` object is the setup for Azure AutoML. 
It holds the `task` type, number of models to train - `iterations`, the desired metric - `primary metric`, the allowed types of models `allowed_models` and more.\n", + "\n", + "- The `params` are the parameters for the MLRun function, such as experiment (`experiment_name`) and cpu cluster (`cpu_cluster_name`) names in AzureML, dataset properties for registration, target label for training - `label_column_name`, number of models to download `save_n_models` and more." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "label_column_name = 'label' # target label\n", + "\n", + "# Configure automl settings:\n", + "automl_settings = {\n", + " \"task\": 'classification',\n", + " \"debug_log\": 'automl_errors.log',\n", + "# \"experiment_exit_score\" : 0.9,\n", + " \"enable_early_stopping\": False,\n", + " \"allowed_models\": ['LogisticRegression', 'SGD', 'SVM'],\n", + " \"iterations\": 5,\n", + " \"iteration_timeout_minutes\": 2,\n", + " \"max_concurrent_iterations\": 2,\n", + " \"max_cores_per_iteration\": -1,\n", + " \"n_cross_validations\": 5,\n", + " \"primary_metric\": 'accuracy',\n", + " \"featurization\": 'off',\n", + " \"model_explainability\": False,\n", + " \"enable_voting_ensemble\": False,\n", + " \"enable_stack_ensemble\": False\n", + " }\n", + "\n", + "# Setting params to azure_run function:\n", + "params = {\n", + " \"experiment_name\": 'azure-automl-test',\n", + " \"cpu_cluster_name\": 'azureml-cpu',\n", + " \"dataset_name\": 'iris',\n", + " \"dataset_description\": 'iris training data',\n", + " \"label_column_name\": label_column_name,\n", + " \"create_new_version\": True,\n", + " \"register_model_name\": \"iris-model\",\n", + " \"save_n_models\": 3,\n", + " \"automl_settings\": automl_settings\n", + "}" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": 
[ + "### Run Azure AutoML train:\n", + "\n", + "This MLRun function will perform the following:\n", + "- Initialize workspace and experiment in your AzureML\n", + "- Register the dataset/feature vector to Iguazio and to AzureML.\n", + "- Submit the training job to AzureML and print the live training results fro each model\n", + "- Generate the top trained models." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-02-02 18:28:11,740 [info] Function is not deployed and auto_build flag is set, starting deploy...\n", + "> 2022-02-02 18:28:11,932 [info] Started building image: .mlrun/func-azureml-yonatan-azureml-utils:latest\n", + "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest python:3.7.9-slim \n", + "\u001B[36mINFO\u001B[0m[0000] Retrieving image python:3.7.9-slim from registry index.docker.io \n", + "\u001B[36mINFO\u001B[0m[0000] Built cross stage deps: map[] \n", + "\u001B[36mINFO\u001B[0m[0000] Retrieving image manifest python:3.7.9-slim \n", + "\u001B[36mINFO\u001B[0m[0000] Returning cached image manifest \n", + "\u001B[36mINFO\u001B[0m[0000] Executing 0 build triggers \n", + "\u001B[36mINFO\u001B[0m[0000] Unpacking rootfs as cmd RUN python -m pip install pip==21.2.4 requires it. \n", + "\u001B[36mINFO\u001B[0m[0002] RUN python -m pip install pip==21.2.4 \n", + "\u001B[36mINFO\u001B[0m[0002] Taking snapshot of full filesystem... 
\n", + "\u001B[36mINFO\u001B[0m[0003] cmd: /bin/sh \n", + "\u001B[36mINFO\u001B[0m[0003] args: [-c python -m pip install pip==21.2.4] \n", + "\u001B[36mINFO\u001B[0m[0003] Running: [/bin/sh -c python -m pip install pip==21.2.4] \n", + "Collecting pip==21.2.4\n", + " Downloading pip-21.2.4-py3-none-any.whl (1.6 MB)\n", + "Installing collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 21.0.1\n", + " Uninstalling pip-21.0.1:\n", + " Successfully uninstalled pip-21.0.1\n", + "Successfully installed pip-21.2.4\n", + "\u001B[36mINFO\u001B[0m[0006] Taking snapshot of full filesystem... \n", + "\u001B[36mINFO\u001B[0m[0006] RUN apt-get update && apt-get install -y --no-install-recommends git \n", + "\u001B[36mINFO\u001B[0m[0006] cmd: /bin/sh \n", + "\u001B[36mINFO\u001B[0m[0006] args: [-c apt-get update && apt-get install -y --no-install-recommends git] \n", + "\u001B[36mINFO\u001B[0m[0006] Running: [/bin/sh -c apt-get update && apt-get install -y --no-install-recommends git] \n", + "Get:1 http://security.debian.org/debian-security buster/updates InRelease [65.4 kB]\n", + "Get:2 http://deb.debian.org/debian buster InRelease [122 kB]\n", + "Get:3 http://deb.debian.org/debian buster-updates InRelease [51.9 kB]\n", + "Get:4 http://security.debian.org/debian-security buster/updates/main amd64 Packages [314 kB]\n", + "Get:5 http://deb.debian.org/debian buster/main amd64 Packages [7906 kB]\n", + "Get:6 http://deb.debian.org/debian buster-updates/main amd64 Packages [8792 B]\n", + "Fetched 8468 kB in 2s (5347 kB/s)\n", + "Reading package lists...\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "The following additional packages will be installed:\n", + " git-man libcurl3-gnutls liberror-perl libgdbm-compat4 libgssapi-krb5-2\n", + " libk5crypto3 libkeyutils1 libkrb5-3 libkrb5support0 libldap-2.4-2\n", + " libldap-common libnghttp2-14 libpcre2-8-0 libperl5.28 libpsl5 
librtmp1\n", + " libsasl2-2 libsasl2-modules-db libssh2-1 perl perl-modules-5.28\n", + "Suggested packages:\n", + " gettext-base git-daemon-run | git-daemon-sysvinit git-doc git-el git-email\n", + " git-gui gitk gitweb git-cvs git-mediawiki git-svn krb5-doc krb5-user\n", + " sensible-utils perl-doc libterm-readline-gnu-perl\n", + " | libterm-readline-perl-perl make libb-debug-perl liblocale-codes-perl\n", + "Recommended packages:\n", + " patch less ssh-client krb5-locales publicsuffix libsasl2-modules\n", + "The following NEW packages will be installed:\n", + " git git-man libcurl3-gnutls liberror-perl libgdbm-compat4 libgssapi-krb5-2\n", + " libk5crypto3 libkeyutils1 libkrb5-3 libkrb5support0 libldap-2.4-2\n", + " libldap-common libnghttp2-14 libpcre2-8-0 libperl5.28 libpsl5 librtmp1\n", + " libsasl2-2 libsasl2-modules-db libssh2-1 perl perl-modules-5.28\n", + "0 upgraded, 22 newly installed, 0 to remove and 16 not upgraded.\n", + "Need to get 16.4 MB of archives.\n", + "After this operation, 90.1 MB of additional disk space will be used.\n", + "Get:1 http://deb.debian.org/debian buster/main amd64 perl-modules-5.28 all 5.28.1-6+deb10u1 [2873 kB]\n", + "Get:2 http://deb.debian.org/debian buster/main amd64 libgdbm-compat4 amd64 1.18.1-4 [44.1 kB]\n", + "Get:3 http://deb.debian.org/debian buster/main amd64 libperl5.28 amd64 5.28.1-6+deb10u1 [3894 kB]\n", + "Get:4 http://deb.debian.org/debian buster/main amd64 perl amd64 5.28.1-6+deb10u1 [204 kB]\n", + "Get:5 http://deb.debian.org/debian buster/main amd64 libkeyutils1 amd64 1.6-6 [15.0 kB]\n", + "Get:6 http://deb.debian.org/debian buster/main amd64 libkrb5support0 amd64 1.17-3+deb10u3 [65.8 kB]\n", + "Get:7 http://deb.debian.org/debian buster/main amd64 libk5crypto3 amd64 1.17-3+deb10u3 [122 kB]\n", + "Get:8 http://deb.debian.org/debian buster/main amd64 libkrb5-3 amd64 1.17-3+deb10u3 [370 kB]\n", + "Get:9 http://deb.debian.org/debian buster/main amd64 libgssapi-krb5-2 amd64 1.17-3+deb10u3 [158 kB]\n", + "Get:10 
http://deb.debian.org/debian buster/main amd64 libsasl2-modules-db amd64 2.1.27+dfsg-1+deb10u1 [69.1 kB]\n", + "Get:11 http://deb.debian.org/debian buster/main amd64 libsasl2-2 amd64 2.1.27+dfsg-1+deb10u1 [106 kB]\n", + "Get:12 http://deb.debian.org/debian buster/main amd64 libldap-common all 2.4.47+dfsg-3+deb10u6 [90.0 kB]\n", + "Get:13 http://deb.debian.org/debian buster/main amd64 libldap-2.4-2 amd64 2.4.47+dfsg-3+deb10u6 [224 kB]\n", + "Get:14 http://deb.debian.org/debian buster/main amd64 libnghttp2-14 amd64 1.36.0-2+deb10u1 [85.0 kB]\n", + "Get:15 http://deb.debian.org/debian buster/main amd64 libpsl5 amd64 0.20.2-2 [53.7 kB]\n", + "Get:16 http://deb.debian.org/debian buster/main amd64 librtmp1 amd64 2.4+20151223.gitfa8646d.1-2 [60.5 kB]\n", + "Get:17 http://deb.debian.org/debian buster/main amd64 libssh2-1 amd64 1.8.0-2.1 [140 kB]\n", + "Get:18 http://deb.debian.org/debian buster/main amd64 libcurl3-gnutls amd64 7.64.0-4+deb10u2 [330 kB]\n", + "Get:19 http://deb.debian.org/debian buster/main amd64 libpcre2-8-0 amd64 10.32-5 [213 kB]\n", + "Get:20 http://deb.debian.org/debian buster/main amd64 liberror-perl all 0.17027-2 [30.9 kB]\n", + "Get:21 http://deb.debian.org/debian buster/main amd64 git-man all 1:2.20.1-2+deb10u3 [1620 kB]\n", + "Get:22 http://deb.debian.org/debian buster/main amd64 git amd64 1:2.20.1-2+deb10u3 [5633 kB]\n", + "debconf: delaying package configuration, since apt-utils is not installed\n", + "Fetched 16.4 MB in 0s (62.3 MB/s)\n", + "Selecting previously unselected package perl-modules-5.28.\n", + "(Reading database ... 
6840 files and directories currently installed.)\n", + "Preparing to unpack .../00-perl-modules-5.28_5.28.1-6+deb10u1_all.deb ...\n", + "Unpacking perl-modules-5.28 (5.28.1-6+deb10u1) ...\n", + "Selecting previously unselected package libgdbm-compat4:amd64.\n", + "Preparing to unpack .../01-libgdbm-compat4_1.18.1-4_amd64.deb ...\n", + "Unpacking libgdbm-compat4:amd64 (1.18.1-4) ...\n", + "Selecting previously unselected package libperl5.28:amd64.\n", + "Preparing to unpack .../02-libperl5.28_5.28.1-6+deb10u1_amd64.deb ...\n", + "Unpacking libperl5.28:amd64 (5.28.1-6+deb10u1) ...\n", + "Selecting previously unselected package perl.\n", + "Preparing to unpack .../03-perl_5.28.1-6+deb10u1_amd64.deb ...\n", + "Unpacking perl (5.28.1-6+deb10u1) ...\n", + "Selecting previously unselected package libkeyutils1:amd64.\n", + "Preparing to unpack .../04-libkeyutils1_1.6-6_amd64.deb ...\n", + "Unpacking libkeyutils1:amd64 (1.6-6) ...\n", + "Selecting previously unselected package libkrb5support0:amd64.\n", + "Preparing to unpack .../05-libkrb5support0_1.17-3+deb10u3_amd64.deb ...\n", + "Unpacking libkrb5support0:amd64 (1.17-3+deb10u3) ...\n", + "Selecting previously unselected package libk5crypto3:amd64.\n", + "Preparing to unpack .../06-libk5crypto3_1.17-3+deb10u3_amd64.deb ...\n", + "Unpacking libk5crypto3:amd64 (1.17-3+deb10u3) ...\n", + "Selecting previously unselected package libkrb5-3:amd64.\n", + "Preparing to unpack .../07-libkrb5-3_1.17-3+deb10u3_amd64.deb ...\n", + "Unpacking libkrb5-3:amd64 (1.17-3+deb10u3) ...\n", + "Selecting previously unselected package libgssapi-krb5-2:amd64.\n", + "Preparing to unpack .../08-libgssapi-krb5-2_1.17-3+deb10u3_amd64.deb ...\n", + "Unpacking libgssapi-krb5-2:amd64 (1.17-3+deb10u3) ...\n", + "Selecting previously unselected package libsasl2-modules-db:amd64.\n", + "Preparing to unpack .../09-libsasl2-modules-db_2.1.27+dfsg-1+deb10u1_amd64.deb ...\n", + "Unpacking libsasl2-modules-db:amd64 (2.1.27+dfsg-1+deb10u1) ...\n", + "Selecting 
previously unselected package libsasl2-2:amd64.\n", + "Preparing to unpack .../10-libsasl2-2_2.1.27+dfsg-1+deb10u1_amd64.deb ...\n", + "Unpacking libsasl2-2:amd64 (2.1.27+dfsg-1+deb10u1) ...\n", + "Selecting previously unselected package libldap-common.\n", + "Preparing to unpack .../11-libldap-common_2.4.47+dfsg-3+deb10u6_all.deb ...\n", + "Unpacking libldap-common (2.4.47+dfsg-3+deb10u6) ...\n", + "Selecting previously unselected package libldap-2.4-2:amd64.\n", + "Preparing to unpack .../12-libldap-2.4-2_2.4.47+dfsg-3+deb10u6_amd64.deb ...\n", + "Unpacking libldap-2.4-2:amd64 (2.4.47+dfsg-3+deb10u6) ...\n", + "Selecting previously unselected package libnghttp2-14:amd64.\n", + "Preparing to unpack .../13-libnghttp2-14_1.36.0-2+deb10u1_amd64.deb ...\n", + "Unpacking libnghttp2-14:amd64 (1.36.0-2+deb10u1) ...\n", + "Selecting previously unselected package libpsl5:amd64.\n", + "Preparing to unpack .../14-libpsl5_0.20.2-2_amd64.deb ...\n", + "Unpacking libpsl5:amd64 (0.20.2-2) ...\n", + "Selecting previously unselected package librtmp1:amd64.\n", + "Preparing to unpack .../15-librtmp1_2.4+20151223.gitfa8646d.1-2_amd64.deb ...\n", + "Unpacking librtmp1:amd64 (2.4+20151223.gitfa8646d.1-2) ...\n", + "Selecting previously unselected package libssh2-1:amd64.\n", + "Preparing to unpack .../16-libssh2-1_1.8.0-2.1_amd64.deb ...\n", + "Unpacking libssh2-1:amd64 (1.8.0-2.1) ...\n", + "Selecting previously unselected package libcurl3-gnutls:amd64.\n", + "Preparing to unpack .../17-libcurl3-gnutls_7.64.0-4+deb10u2_amd64.deb ...\n", + "Unpacking libcurl3-gnutls:amd64 (7.64.0-4+deb10u2) ...\n", + "Selecting previously unselected package libpcre2-8-0:amd64.\n", + "Preparing to unpack .../18-libpcre2-8-0_10.32-5_amd64.deb ...\n", + "Unpacking libpcre2-8-0:amd64 (10.32-5) ...\n", + "Selecting previously unselected package liberror-perl.\n", + "Preparing to unpack .../19-liberror-perl_0.17027-2_all.deb ...\n", + "Unpacking liberror-perl (0.17027-2) ...\n", + "Selecting previously 
unselected package git-man.\n", + "Preparing to unpack .../20-git-man_1%3a2.20.1-2+deb10u3_all.deb ...\n", + "Unpacking git-man (1:2.20.1-2+deb10u3) ...\n", + "Selecting previously unselected package git.\n", + "Preparing to unpack .../21-git_1%3a2.20.1-2+deb10u3_amd64.deb ...\n", + "Unpacking git (1:2.20.1-2+deb10u3) ...\n", + "Setting up perl-modules-5.28 (5.28.1-6+deb10u1) ...\n", + "Setting up libkeyutils1:amd64 (1.6-6) ...\n", + "Setting up libpsl5:amd64 (0.20.2-2) ...\n", + "Setting up libnghttp2-14:amd64 (1.36.0-2+deb10u1) ...\n", + "Setting up libldap-common (2.4.47+dfsg-3+deb10u6) ...\n", + "Setting up libkrb5support0:amd64 (1.17-3+deb10u3) ...\n", + "Setting up libsasl2-modules-db:amd64 (2.1.27+dfsg-1+deb10u1) ...\n", + "Setting up librtmp1:amd64 (2.4+20151223.gitfa8646d.1-2) ...\n", + "Setting up libgdbm-compat4:amd64 (1.18.1-4) ...\n", + "Setting up libpcre2-8-0:amd64 (10.32-5) ...\n", + "Setting up libk5crypto3:amd64 (1.17-3+deb10u3) ...\n", + "Setting up libsasl2-2:amd64 (2.1.27+dfsg-1+deb10u1) ...\n", + "Setting up libperl5.28:amd64 (5.28.1-6+deb10u1) ...\n", + "Setting up git-man (1:2.20.1-2+deb10u3) ...\n", + "Setting up libssh2-1:amd64 (1.8.0-2.1) ...\n", + "Setting up libkrb5-3:amd64 (1.17-3+deb10u3) ...\n", + "Setting up libldap-2.4-2:amd64 (2.4.47+dfsg-3+deb10u6) ...\n", + "Setting up perl (5.28.1-6+deb10u1) ...\n", + "Setting up libgssapi-krb5-2:amd64 (1.17-3+deb10u3) ...\n", + "Setting up libcurl3-gnutls:amd64 (7.64.0-4+deb10u2) ...\n", + "Setting up liberror-perl (0.17027-2) ...\n", + "Setting up git (1:2.20.1-2+deb10u3) ...\n", + "Processing triggers for libc-bin (2.28-10) ...\n", + "\u001B[36mINFO\u001B[0m[0012] Taking snapshot of full filesystem... 
\n", + "\u001B[36mINFO\u001B[0m[0015] RUN python -m pip install azureml-core==1.33.0 azureml-train-automl-client==1.33.0 \n", + "\u001B[36mINFO\u001B[0m[0015] cmd: /bin/sh \n", + "\u001B[36mINFO\u001B[0m[0015] args: [-c python -m pip install azureml-core==1.33.0 azureml-train-automl-client==1.33.0] \n", + "\u001B[36mINFO\u001B[0m[0015] Running: [/bin/sh -c python -m pip install azureml-core==1.33.0 azureml-train-automl-client==1.33.0] \n", + "Collecting azureml-core==1.33.0\n", + " Downloading azureml_core-1.33.0-py3-none-any.whl (2.2 MB)\n", + "Collecting azureml-train-automl-client==1.33.0\n", + " Downloading azureml_train_automl_client-1.33.0-py3-none-any.whl (128 kB)\n", + "Collecting pytz\n", + " Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)\n", + "Collecting urllib3<=1.26.5,>=1.23\n", + " Downloading urllib3-1.26.5-py2.py3-none-any.whl (138 kB)\n", + "Collecting pathspec<1.0.0\n", + " Downloading pathspec-0.9.0-py2.py3-none-any.whl (31 kB)\n", + "Collecting azure-mgmt-containerregistry>=2.0.0\n", + " Downloading azure_mgmt_containerregistry-9.0.0-py3-none-any.whl (937 kB)\n", + "Collecting contextlib2<1.0.0\n", + " Downloading contextlib2-0.6.0.post1-py2.py3-none-any.whl (9.8 kB)\n", + "Collecting SecretStorage<4.0.0\n", + " Downloading SecretStorage-3.3.1-py3-none-any.whl (15 kB)\n", + "Collecting backports.tempfile\n", + " Downloading backports.tempfile-1.0-py2.py3-none-any.whl (4.4 kB)\n", + "Collecting docker<5.0.0\n", + " Downloading docker-4.4.4-py2.py3-none-any.whl (147 kB)\n", + "Collecting msrestazure<=0.6.4,>=0.4.33\n", + " Downloading msrestazure-0.6.4-py2.py3-none-any.whl (40 kB)\n", + "Collecting cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*,<4.0.0\n", + " Downloading cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl (3.0 MB)\n", + "Collecting msrest<1.0.0,>=0.5.1\n", + " Downloading msrest-0.6.21-py2.py3-none-any.whl (85 kB)\n", + "Collecting requests<3.0.0,>=2.19.1\n", + " Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)\n", + 
"Collecting adal<=1.2.7,>=1.2.0\n", + " Downloading adal-1.2.7-py2.py3-none-any.whl (55 kB)\n", + "Collecting jsonpickle<3.0.0\n", + " Downloading jsonpickle-2.1.0-py2.py3-none-any.whl (38 kB)\n", + "Collecting azure-common<2.0.0,>=1.1.12\n", + " Downloading azure_common-1.1.27-py2.py3-none-any.whl (12 kB)\n", + "Collecting python-dateutil<3.0.0,>=2.7.3\n", + " Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n", + "Collecting PyJWT<3.0.0\n", + " Downloading PyJWT-2.3.0-py3-none-any.whl (16 kB)\n", + "Collecting azure-mgmt-authorization<1.0.0,>=0.40.0\n", + " Downloading azure_mgmt_authorization-0.61.0-py2.py3-none-any.whl (94 kB)\n", + "Collecting pyopenssl<21.0.0\n", + " Downloading pyOpenSSL-20.0.1-py2.py3-none-any.whl (54 kB)\n", + "Collecting jmespath<1.0.0\n", + " Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n", + "Collecting ndg-httpsclient<=0.5.1\n", + " Downloading ndg_httpsclient-0.5.1-py3-none-any.whl (34 kB)\n", + "Collecting azure-mgmt-storage<16.0.0,>=1.5.0\n", + " Downloading azure_mgmt_storage-11.2.0-py2.py3-none-any.whl (547 kB)\n", + "Collecting azure-mgmt-keyvault<10.0.0,>=0.40.0\n", + " Downloading azure_mgmt_keyvault-9.3.0-py2.py3-none-any.whl (412 kB)\n", + "Collecting ruamel.yaml<0.17.5,>=0.15.35\n", + " Downloading ruamel.yaml-0.17.4-py3-none-any.whl (101 kB)\n", + "Collecting azure-graphrbac<1.0.0,>=0.40.0\n", + " Downloading azure_graphrbac-0.61.1-py2.py3-none-any.whl (141 kB)\n", + "Collecting azure-mgmt-resource<15.0.0,>=1.2.1\n", + " Downloading azure_mgmt_resource-13.0.0-py2.py3-none-any.whl (1.3 MB)\n", + "Collecting azureml-dataset-runtime~=1.33.0\n", + " Downloading azureml_dataset_runtime-1.33.0-py3-none-any.whl (3.5 kB)\n", + "Collecting azureml-telemetry~=1.33.0\n", + " Downloading azureml_telemetry-1.33.0-py3-none-any.whl (30 kB)\n", + "Collecting azureml-automl-core~=1.33.0\n", + " Downloading azureml_automl_core-1.33.1-py3-none-any.whl (214 kB)\n", + "Collecting azure-mgmt-core<2.0.0,>=1.3.0\n", + " 
Downloading azure_mgmt_core-1.3.0-py2.py3-none-any.whl (25 kB)\n", + "Collecting azure-core<2.0.0,>=1.15.0\n", + " Downloading azure_core-1.21.1-py2.py3-none-any.whl (178 kB)\n", + "Collecting six>=1.11.0\n", + " Downloading six-1.16.0-py2.py3-none-any.whl (11 kB)\n", + "Collecting pyarrow<4.0.0,>=0.17.0\n", + " Downloading pyarrow-3.0.0-cp37-cp37m-manylinux2014_x86_64.whl (20.7 MB)\n", + "Collecting azureml-dataprep<2.21.0a,>=2.20.0a\n", + " Downloading azureml_dataprep-2.20.1-py3-none-any.whl (39.4 MB)\n", + "Collecting numpy!=1.19.3\n", + " Downloading numpy-1.21.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)\n", + "Collecting azure-identity<1.5.0,>=1.2.0\n", + " Downloading azure_identity-1.4.1-py2.py3-none-any.whl (86 kB)\n", + "Collecting dotnetcore2<3.0.0,>=2.1.14\n", + " Downloading dotnetcore2-2.1.23-py3-none-manylinux1_x86_64.whl (29.3 MB)\n", + "Collecting azureml-dataprep-native<39.0.0,>=38.0.0\n", + " Downloading azureml_dataprep_native-38.0.0-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)\n", + "Collecting azureml-dataprep-rslex<1.19.0a,>=1.18.0dev0\n", + " Downloading azureml_dataprep_rslex-1.18.2-cp37-cp37m-manylinux1_x86_64.whl (10.4 MB)\n", + "Collecting cloudpickle<2.0.0,>=1.1.0\n", + " Downloading cloudpickle-1.6.0-py3-none-any.whl (23 kB)\n", + "Collecting msal-extensions~=0.2.2\n", + " Downloading msal_extensions-0.2.2-py2.py3-none-any.whl (15 kB)\n", + "Collecting msal<2.0.0,>=1.3.0\n", + " Downloading msal-1.16.0-py2.py3-none-any.whl (78 kB)\n", + "Collecting applicationinsights\n", + " Downloading applicationinsights-0.11.10-py2.py3-none-any.whl (55 kB)\n", + "Collecting cffi>=1.12\n", + " Downloading cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (427 kB)\n", + "Collecting pycparser\n", + " Downloading pycparser-2.21-py2.py3-none-any.whl (118 kB)\n", + "Collecting websocket-client>=0.32.0\n", + " Downloading websocket_client-1.2.3-py3-none-any.whl (53 kB)\n", + "Collecting distro>=1.2.0\n", + " 
Downloading distro-1.6.0-py2.py3-none-any.whl (19 kB)\n", + "Collecting importlib-metadata\n", + " Downloading importlib_metadata-4.10.1-py3-none-any.whl (17 kB)\n", + "Collecting portalocker~=1.0\n", + " Downloading portalocker-1.7.1-py2.py3-none-any.whl (10 kB)\n", + "Collecting requests-oauthlib>=0.5.0\n", + " Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n", + "Collecting isodate>=0.6.0\n", + " Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n", + "Collecting certifi>=2017.4.17\n", + " Downloading certifi-2021.10.8-py2.py3-none-any.whl (149 kB)\n", + "Collecting pyasn1>=0.1.1\n", + " Downloading pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n", + "Collecting charset-normalizer~=2.0.0\n", + " Downloading charset_normalizer-2.0.11-py3-none-any.whl (39 kB)\n", + "Collecting idna<4,>=2.5\n", + " Downloading idna-3.3-py3-none-any.whl (61 kB)\n", + "Collecting oauthlib>=3.0.0\n", + " Downloading oauthlib-3.2.0-py3-none-any.whl (151 kB)\n", + "Collecting ruamel.yaml.clib>=0.1.2\n", + " Downloading ruamel.yaml.clib-0.2.6-cp37-cp37m-manylinux1_x86_64.whl (546 kB)\n", + "Collecting jeepney>=0.6\n", + " Downloading jeepney-0.7.1-py3-none-any.whl (54 kB)\n", + "Collecting backports.weakref\n", + " Downloading backports.weakref-1.0.post1-py2.py3-none-any.whl (5.2 kB)\n", + "Collecting zipp>=0.5\n", + " Downloading zipp-3.7.0-py3-none-any.whl (5.3 kB)\n", + "Collecting typing-extensions>=3.6.4\n", + " Downloading typing_extensions-4.0.1-py3-none-any.whl (22 kB)\n", + "Installing collected packages: pycparser, urllib3, idna, charset-normalizer, cffi, certifi, six, requests, PyJWT, oauthlib, cryptography, requests-oauthlib, python-dateutil, isodate, zipp, typing-extensions, portalocker, msrest, msal, azure-core, adal, websocket-client, ruamel.yaml.clib, pyopenssl, pyasn1, msrestazure, msal-extensions, jeepney, importlib-metadata, distro, backports.weakref, azure-mgmt-core, azure-common, SecretStorage, ruamel.yaml, pytz, pathspec, numpy, ndg-httpsclient, 
jsonpickle, jmespath, dotnetcore2, docker, contextlib2, cloudpickle, backports.tempfile, azureml-dataprep-rslex, azureml-dataprep-native, azure-mgmt-storage, azure-mgmt-resource, azure-mgmt-keyvault, azure-mgmt-containerregistry, azure-mgmt-authorization, azure-identity, azure-graphrbac, pyarrow, azureml-dataprep, azureml-core, applicationinsights, azureml-telemetry, azureml-dataset-runtime, azureml-automl-core, azureml-train-automl-client\n", + "Successfully installed PyJWT-2.3.0 SecretStorage-3.3.1 adal-1.2.7 applicationinsights-0.11.10 azure-common-1.1.27 azure-core-1.21.1 azure-graphrbac-0.61.1 azure-identity-1.4.1 azure-mgmt-authorization-0.61.0 azure-mgmt-containerregistry-9.0.0 azure-mgmt-core-1.3.0 azure-mgmt-keyvault-9.3.0 azure-mgmt-resource-13.0.0 azure-mgmt-storage-11.2.0 azureml-automl-core-1.33.1 azureml-core-1.33.0 azureml-dataprep-2.20.1 azureml-dataprep-native-38.0.0 azureml-dataprep-rslex-1.18.2 azureml-dataset-runtime-1.33.0 azureml-telemetry-1.33.0 azureml-train-automl-client-1.33.0 backports.tempfile-1.0 backports.weakref-1.0.post1 certifi-2021.10.8 cffi-1.15.0 charset-normalizer-2.0.11 cloudpickle-1.6.0 contextlib2-0.6.0.post1 cryptography-3.4.8 distro-1.6.0 docker-4.4.4 dotnetcore2-2.1.23 idna-3.3 importlib-metadata-4.10.1 isodate-0.6.1 jeepney-0.7.1 jmespath-0.10.0 jsonpickle-2.1.0 msal-1.16.0 msal-extensions-0.2.2 msrest-0.6.21 msrestazure-0.6.4 ndg-httpsclient-0.5.1 numpy-1.21.5 oauthlib-3.2.0 pathspec-0.9.0 portalocker-1.7.1 pyarrow-3.0.0 pyasn1-0.4.8 pycparser-2.21 pyopenssl-20.0.1 python-dateutil-2.8.2 pytz-2021.3 requests-2.27.1 requests-oauthlib-1.3.1 ruamel.yaml-0.17.4 ruamel.yaml.clib-0.2.6 six-1.16.0 typing-extensions-4.0.1 urllib3-1.26.5 websocket-client-1.2.3 zipp-3.7.0\n", + "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", + "WARNING: You are using pip version 21.2.4; however, version 22.0.2 is available.\n", + "You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\n", + "\u001B[36mINFO\u001B[0m[0040] Taking snapshot of full filesystem... \n", + "\u001B[36mINFO\u001B[0m[0059] RUN python -m pip install \"mlrun[complete] @ git+https://github.com/mlrun/mlrun@development\" \n", + "\u001B[36mINFO\u001B[0m[0059] cmd: /bin/sh \n", + "\u001B[36mINFO\u001B[0m[0059] args: [-c python -m pip install \"mlrun[complete] @ git+https://github.com/mlrun/mlrun@development\"] \n", + "\u001B[36mINFO\u001B[0m[0059] Running: [/bin/sh -c python -m pip install \"mlrun[complete] @ git+https://github.com/mlrun/mlrun@development\"] \n", + "Collecting mlrun[complete]@ git+https://github.com/mlrun/mlrun@development\n", + " Cloning https://github.com/mlrun/mlrun (to revision development) to /tmp/pip-install-yegk19ip/mlrun_b9fdf6ee5c8c4e2d9de3a37d0807b6c5\n", + " Running command git clone -q https://github.com/mlrun/mlrun /tmp/pip-install-yegk19ip/mlrun_b9fdf6ee5c8c4e2d9de3a37d0807b6c5\n", + " Resolved https://github.com/mlrun/mlrun to commit 832a07b11f3198b844d30b4a80db12a45c6e8948\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Getting requirements to build wheel: started\n", + " Getting requirements to build wheel: finished with status 'done'\n", + " Preparing wheel metadata: started\n", + " Preparing wheel metadata: finished with status 'done'\n", + "Collecting pymysql~=1.0\n", + " Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)\n", + "Requirement already satisfied: pyarrow<6,>=1 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.0.0)\n", + "Collecting fsspec~=2021.8.1\n", + " Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)\n", 
+ "Collecting v3io~=0.5.13\n", + " Downloading v3io-0.5.15-py3-none-any.whl (49 kB)\n", + "Collecting ipykernel~=5.0\n", + " Downloading ipykernel-5.5.6-py3-none-any.whl (121 kB)\n", + "Collecting click~=7.0\n", + " Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)\n", + "Collecting sqlalchemy~=1.3\n", + " Downloading SQLAlchemy-1.4.31-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)\n", + "Collecting dask~=2021.11.2\n", + " Downloading dask-2021.11.2-py3-none-any.whl (1.0 MB)\n", + "Collecting distributed~=2021.11.2\n", + " Downloading distributed-2021.11.2-py3-none-any.whl (802 kB)\n", + "Collecting chardet<4.0,>=3.0.2\n", + " Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)\n", + "Collecting nuclio-jupyter~=0.8.22\n", + " Downloading nuclio_jupyter-0.8.22-py3-none-any.whl (49 kB)\n", + "Collecting pydantic~=1.5\n", + " Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)\n", + "Collecting v3iofs~=0.1.7\n", + " Downloading v3iofs-0.1.10-py3-none-any.whl (13 kB)\n", + "Collecting kfp~=1.8.0\n", + " Downloading kfp-1.8.11.tar.gz (298 kB)\n", + "Collecting pandas~=1.2\n", + " Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)\n", + "Collecting pyyaml~=5.1\n", + " Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)\n", + "Collecting orjson<3.4,>=3\n", + " Downloading orjson-3.3.1-cp37-cp37m-manylinux2014_x86_64.whl (208 kB)\n", + "Collecting inflection~=0.5.0\n", + " Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)\n", + "Collecting aiohttp~=3.8\n", + " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", + "Collecting humanfriendly~=8.2\n", + " Downloading humanfriendly-8.2-py2.py3-none-any.whl (86 kB)\n", + "Collecting ipython~=7.0\n", + " Downloading ipython-7.31.1-py3-none-any.whl (792 kB)\n", + "Collecting 
fastapi~=0.67.0\n", + " Downloading fastapi-0.67.0-py3-none-any.whl (51 kB)\n", + "Requirement already satisfied: requests~=2.22 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.27.1)\n", + "Collecting tabulate~=0.8.6\n", + " Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)\n", + "Collecting nest-asyncio~=1.0\n", + " Downloading nest_asyncio-1.5.4-py3-none-any.whl (5.1 kB)\n", + "Collecting storey~=0.10.4\n", + " Downloading storey-0.10.4-py3-none-any.whl (116 kB)\n", + "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.26.5)\n", + "Collecting GitPython~=3.0\n", + " Downloading GitPython-3.1.26-py3-none-any.whl (180 kB)\n", + "Collecting mergedeep~=1.3\n", + " Downloading mergedeep-1.3.4-py3-none-any.whl (6.4 kB)\n", + "Collecting semver~=2.13\n", + " Downloading semver-2.13.0-py2.py3-none-any.whl (12 kB)\n", + "Collecting cryptography<3.4,~=3.0\n", + " Downloading cryptography-3.3.2-cp36-abi3-manylinux2010_x86_64.whl (2.6 MB)\n", + "Collecting v3io-frames~=0.10.2\n", + " Downloading v3io_frames-0.10.2-py3-none-any.whl (35 kB)\n", + "Collecting deepdiff~=5.0\n", + " Downloading deepdiff-5.7.0-py3-none-any.whl (68 kB)\n", + "Requirement already satisfied: numpy<1.22.0,>=1.16.5 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.21.5)\n", + "Collecting python-dotenv~=0.17.0\n", + " Downloading python_dotenv-0.17.1-py2.py3-none-any.whl (18 kB)\n", + "Collecting alembic<1.6.0,~=1.4\n", + " Downloading alembic-1.5.8-py2.py3-none-any.whl (159 kB)\n", + "Collecting kubernetes~=12.0\n", + " Downloading kubernetes-12.0.1-py2.py3-none-any.whl (1.7 MB)\n", + "Collecting google-auth<2.0dev,>=1.25.0\n", + " Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)\n", + "Collecting azure-identity~=1.5\n", + " Downloading 
azure_identity-1.7.1-py2.py3-none-any.whl (129 kB)\n", + "Collecting aiobotocore~=1.4.0\n", + " Downloading aiobotocore-1.4.2.tar.gz (52 kB)\n", + "Collecting boto3<1.17.107,~=1.9\n", + " Downloading boto3-1.17.106-py2.py3-none-any.whl (131 kB)\n", + "Collecting azure-keyvault-secrets~=4.2\n", + " Downloading azure_keyvault_secrets-4.3.0-py2.py3-none-any.whl (233 kB)\n", + "Collecting s3fs~=2021.8.1\n", + " Downloading s3fs-2021.8.1-py3-none-any.whl (26 kB)\n", + "Collecting plotly~=5.4\n", + " Downloading plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)\n", + "Collecting botocore<1.20.107,>=1.20.106\n", + " Downloading botocore-1.20.106-py2.py3-none-any.whl (7.7 MB)\n", + "Collecting adlfs~=2021.8.1\n", + " Downloading adlfs-2021.8.2.tar.gz (38 kB)\n", + "Collecting gcsfs~=2021.8.1\n", + " Downloading gcsfs-2021.8.1-py2.py3-none-any.whl (23 kB)\n", + "Collecting azure-storage-blob~=12.0\n", + " Downloading azure_storage_blob-12.9.0-py2.py3-none-any.whl (356 kB)\n", + "Requirement already satisfied: azure-core>=1.7.0 in /usr/local/lib/python3.7/site-packages (from adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.21.1)\n", + "Collecting azure-datalake-store<0.1,>=0.0.46\n", + " Downloading azure_datalake_store-0.0.52-py2.py3-none-any.whl (61 kB)\n", + "Collecting wrapt>=1.10.10\n", + " Downloading wrapt-1.13.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (79 kB)\n", + "Collecting aioitertools>=0.5.1\n", + " Downloading aioitertools-0.8.0-py3-none-any.whl (21 kB)\n", + "Collecting frozenlist>=1.1.1\n", + " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + "Collecting yarl<2.0,>=1.0\n", + " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", + "Collecting multidict<7.0,>=4.5\n", + " Downloading 
multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", + "Collecting aiosignal>=1.1.2\n", + " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", + "Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/site-packages (from aiohttp~=3.8->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (4.0.1)\n", + "Collecting async-timeout<5.0,>=4.0.0a3\n", + " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", + "Collecting asynctest==0.13.0\n", + " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", + "Collecting attrs>=17.3.0\n", + " Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/site-packages (from aiohttp~=3.8->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.0.11)\n", + "Collecting Mako\n", + " Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)\n", + "Collecting python-editor>=0.3\n", + " Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from alembic<1.6.0,~=1.4->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.8.2)\n", + "Requirement already satisfied: six>=1.11.0 in /usr/local/lib/python3.7/site-packages (from azure-core>=1.7.0->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.16.0)\n", + "Requirement already satisfied: cffi in /usr/local/lib/python3.7/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.15.0)\n", + "Requirement already satisfied: adal>=0.4.2 in /usr/local/lib/python3.7/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.2.7)\n", + "Requirement already satisfied: PyJWT<3,>=1.0.0 in 
/usr/local/lib/python3.7/site-packages (from adal>=0.4.2->azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.3.0)\n", + "Requirement already satisfied: msal<2.0.0,>=1.12.0 in /usr/local/lib/python3.7/site-packages (from azure-identity~=1.5->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.16.0)\n", + "Collecting msal-extensions~=0.3.0\n", + " Downloading msal_extensions-0.3.1-py2.py3-none-any.whl (18 kB)\n", + "Requirement already satisfied: azure-common~=1.1 in /usr/local/lib/python3.7/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.1.27)\n", + "Requirement already satisfied: msrest>=0.6.21 in /usr/local/lib/python3.7/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.6.21)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/site-packages (from boto3<1.17.107,~=1.9->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.10.0)\n", + "Collecting s3transfer<0.5.0,>=0.4.0\n", + " Downloading s3transfer-0.4.2-py2.py3-none-any.whl (79 kB)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.21)\n", + "Collecting partd>=0.3.10\n", + " Downloading partd-1.2.0-py3-none-any.whl (19 kB)\n", + "Collecting toolz>=0.8.2\n", + " Downloading toolz-0.11.2-py3-none-any.whl (55 kB)\n", + "Requirement already satisfied: cloudpickle>=1.1.1 in /usr/local/lib/python3.7/site-packages (from dask~=2021.11.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.6.0)\n", + "Collecting packaging>=20.0\n", + " Downloading packaging-21.3-py3-none-any.whl (40 kB)\n", + "Collecting ordered-set==4.0.2\n", + " Downloading ordered-set-4.0.2.tar.gz (10 
kB)\n", + "Collecting tornado>=5\n", + " Downloading tornado-6.1-cp37-cp37m-manylinux2010_x86_64.whl (428 kB)\n", + "Collecting msgpack>=0.6.0\n", + " Downloading msgpack-1.0.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)\n", + "Collecting zict>=0.1.3\n", + " Downloading zict-2.0.0-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/site-packages (from distributed~=2021.11.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (53.0.0)\n", + "Collecting psutil>=5.0\n", + " Downloading psutil-5.9.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)\n", + "Collecting jinja2\n", + " Downloading Jinja2-3.0.3-py3-none-any.whl (133 kB)\n", + "Collecting tblib>=1.6.0\n", + " Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)\n", + "Collecting sortedcontainers!=2.0.0,!=2.0.1\n", + " Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n", + "Collecting starlette==0.14.2\n", + " Downloading starlette-0.14.2-py3-none-any.whl (60 kB)\n", + "Collecting decorator\n", + " Downloading decorator-5.1.1-py3-none-any.whl (9.1 kB)\n", + "Collecting google-auth-oauthlib\n", + " Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)\n", + "Collecting gitdb<5,>=4.0.1\n", + " Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)\n", + "Collecting smmap<6,>=3.0.1\n", + " Downloading smmap-5.0.0-py3-none-any.whl (24 kB)\n", + "Collecting pyasn1-modules>=0.2.1\n", + " Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)\n", + "Collecting rsa<5,>=3.1.4\n", + " Downloading rsa-4.8-py3-none-any.whl (39 kB)\n", + "Collecting cachetools<5.0,>=2.0.0\n", + " Downloading cachetools-4.2.4-py3-none-any.whl (10 kB)\n", + "Collecting jupyter-client\n", + " Downloading jupyter_client-7.1.2-py3-none-any.whl (130 kB)\n", + "Collecting ipython-genutils\n", + " Downloading ipython_genutils-0.2.0-py2.py3-none-any.whl (26 kB)\n", + 
"Collecting traitlets>=4.1.0\n", + " Downloading traitlets-5.1.1-py3-none-any.whl (102 kB)\n", + "Collecting prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0\n", + " Downloading prompt_toolkit-3.0.26-py3-none-any.whl (375 kB)\n", + "Collecting jedi>=0.16\n", + " Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)\n", + "Collecting matplotlib-inline\n", + " Downloading matplotlib_inline-0.1.3-py3-none-any.whl (8.2 kB)\n", + "Collecting pickleshare\n", + " Downloading pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)\n", + "Collecting pygments\n", + " Downloading Pygments-2.11.2-py3-none-any.whl (1.1 MB)\n", + "Collecting backcall\n", + " Downloading backcall-0.2.0-py2.py3-none-any.whl (11 kB)\n", + "Collecting pexpect>4.3\n", + " Downloading pexpect-4.8.0-py2.py3-none-any.whl (59 kB)\n", + "Collecting parso<0.9.0,>=0.8.0\n", + " Downloading parso-0.8.3-py2.py3-none-any.whl (100 kB)\n", + "Collecting absl-py<2,>=0.9\n", + " Downloading absl_py-1.0.0-py3-none-any.whl (126 kB)\n", + "Collecting google-cloud-storage<2,>=1.20.0\n", + " Downloading google_cloud_storage-1.44.0-py2.py3-none-any.whl (106 kB)\n", + "Collecting google-api-python-client<2,>=1.7.8\n", + " Downloading google_api_python_client-1.12.10-py2.py3-none-any.whl (61 kB)\n", + "Collecting requests-toolbelt<1,>=0.8.0\n", + " Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)\n", + "Collecting cloudpickle>=1.1.1\n", + " Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)\n", + "Collecting kfp-server-api<2.0.0,>=1.1.2\n", + " Downloading kfp-server-api-1.7.1.tar.gz (52 kB)\n", + "Collecting jsonschema<4,>=3.0.1\n", + " Downloading jsonschema-3.2.0-py2.py3-none-any.whl (56 kB)\n", + "Collecting Deprecated<2,>=1.2.7\n", + " Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)\n", + "Collecting strip-hints<1,>=0.1.8\n", + " Downloading strip-hints-0.1.10.tar.gz (29 kB)\n", + "Collecting docstring-parser<1,>=0.7.3\n", + " Downloading docstring_parser-0.13.tar.gz (23 kB)\n", + " Installing 
build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Getting requirements to build wheel: started\n", + " Getting requirements to build wheel: finished with status 'done'\n", + " Preparing wheel metadata: started\n", + " Preparing wheel metadata: finished with status 'done'\n", + "Collecting kfp-pipeline-spec<0.2.0,>=0.1.13\n", + " Downloading kfp_pipeline_spec-0.1.13-py3-none-any.whl (18 kB)\n", + "Collecting fire<1,>=0.3.1\n", + " Downloading fire-0.4.0.tar.gz (87 kB)\n", + "Collecting protobuf<4,>=3.13.0\n", + " Downloading protobuf-3.19.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n", + "Collecting uritemplate<4,>=3.0.1\n", + " Downloading uritemplate-3.0.1-py2.py3-none-any.whl (15 kB)\n", + "Collecting typer<1.0,>=0.3.2\n", + " Downloading typer-0.4.0-py3-none-any.whl (27 kB)\n", + "Collecting typing-extensions>=3.7.4\n", + " Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)\n", + "Collecting termcolor\n", + " Downloading termcolor-1.1.0.tar.gz (3.9 kB)\n", + "Collecting httplib2<1dev,>=0.15.0\n", + " Downloading httplib2-0.20.2-py3-none-any.whl (96 kB)\n", + "Collecting google-api-core<3dev,>=1.21.0\n", + " Downloading google_api_core-2.4.0-py2.py3-none-any.whl (111 kB)\n", + "Collecting google-auth-httplib2>=0.0.3\n", + " Downloading google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)\n", + "Collecting googleapis-common-protos<2.0dev,>=1.52.0\n", + " Downloading googleapis_common_protos-1.54.0-py2.py3-none-any.whl (207 kB)\n", + "Collecting google-cloud-core<3.0dev,>=1.6.0\n", + " Downloading google_cloud_core-2.2.2-py2.py3-none-any.whl (29 kB)\n", + "Collecting google-resumable-media<3.0dev,>=1.3.0\n", + " Downloading google_resumable_media-2.1.0-py2.py3-none-any.whl (75 kB)\n", + "Collecting google-crc32c<2.0dev,>=1.0\n", + " Downloading google_crc32c-1.3.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38 kB)\n", + "Collecting 
pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2\n", + " Downloading pyparsing-3.0.7-py3-none-any.whl (98 kB)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/site-packages (from jsonschema<4,>=3.0.1->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (4.10.1)\n", + "Collecting pyrsistent>=0.14.0\n", + " Downloading pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (117 kB)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.7/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2021.10.8)\n", + "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.7/site-packages (from kubernetes~=12.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.3.1)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.7/site-packages (from kubernetes~=12.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.2.3)\n", + "Requirement already satisfied: portalocker<3,>=1.0 in /usr/local/lib/python3.7/site-packages (from msal-extensions~=0.3.0->azure-identity~=1.5->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.7.1)\n", + "Requirement already satisfied: isodate>=0.6.0 in /usr/local/lib/python3.7/site-packages (from msrest>=0.6.21->azure-keyvault-secrets~=4.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.6.1)\n", + "Collecting notebook>=5.2.0\n", + " Downloading notebook-6.4.8-py3-none-any.whl (9.9 MB)\n", + "Collecting nbconvert>=5.4\n", + " Downloading nbconvert-6.4.1-py3-none-any.whl (557 kB)\n", + "Collecting nbformat>=4.4\n", + " Downloading nbformat-5.1.3-py3-none-any.whl (178 kB)\n", + "Collecting mistune<2,>=0.8.1\n", + " Downloading mistune-0.8.4-py2.py3-none-any.whl (16 kB)\n", + "Collecting defusedxml\n", + " Downloading 
defusedxml-0.7.1-py2.py3-none-any.whl (25 kB)\n", + "Collecting nbclient<0.6.0,>=0.5.0\n", + " Downloading nbclient-0.5.10-py3-none-any.whl (69 kB)\n", + "Collecting jupyter-core\n", + " Downloading jupyter_core-4.9.1-py3-none-any.whl (86 kB)\n", + "Collecting testpath\n", + " Downloading testpath-0.5.0-py3-none-any.whl (84 kB)\n", + "Collecting pandocfilters>=1.4.1\n", + " Downloading pandocfilters-1.5.0-py2.py3-none-any.whl (8.7 kB)\n", + "Collecting bleach\n", + " Downloading bleach-4.1.0-py2.py3-none-any.whl (157 kB)\n", + "Collecting jupyterlab-pygments\n", + " Downloading jupyterlab_pygments-0.1.2-py2.py3-none-any.whl (4.6 kB)\n", + "Collecting entrypoints>=0.2.2\n", + " Downloading entrypoints-0.3-py2.py3-none-any.whl (11 kB)\n", + "Collecting MarkupSafe>=2.0\n", + " Downloading MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)\n", + "Collecting pyzmq>=13\n", + " Downloading pyzmq-22.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.1 MB)\n", + "Collecting terminado>=0.8.3\n", + " Downloading terminado-0.13.1-py3-none-any.whl (14 kB)\n", + "Collecting argon2-cffi\n", + " Downloading argon2_cffi-21.3.0-py3-none-any.whl (14 kB)\n", + "Collecting Send2Trash>=1.8.0\n", + " Downloading Send2Trash-1.8.0-py3-none-any.whl (18 kB)\n", + "Collecting prometheus-client\n", + " Downloading prometheus_client-0.13.1-py3-none-any.whl (57 kB)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/site-packages (from pandas~=1.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2021.3)\n", + "Collecting locket\n", + " Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)\n", + "Collecting ptyprocess>=0.5\n", + " Downloading ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB)\n", + "Collecting tenacity>=6.2.0\n", + " Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)\n", + "Collecting wcwidth\n", + " Downloading wcwidth-0.2.5-py2.py3-none-any.whl (30 kB)\n", 
+ "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.4.8)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/site-packages (from requests~=2.22->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.3)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/site-packages (from requests-oauthlib->kubernetes~=12.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.2.0)\n", + "Collecting greenlet!=0.4.17\n", + " Downloading greenlet-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (150 kB)\n", + "Collecting grpcio-tools<1.42,>1.34.0\n", + " Downloading grpcio_tools-1.41.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)\n", + "Collecting grpcio<1.42,>1.34.0\n", + " Downloading grpcio-1.41.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)\n", + "Requirement already satisfied: wheel in /usr/local/lib/python3.7/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.36.2)\n", + "Collecting ujson>=3.0.0\n", + " Downloading ujson-5.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43 kB)\n", + "Collecting future>=0.18.2\n", + " Downloading future-0.18.2.tar.gz (829 kB)\n", + "Collecting heapdict\n", + " Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)\n", + "Collecting argon2-cffi-bindings\n", + " Downloading argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (86 kB)\n", + "Collecting webencodings\n", + " Downloading webencodings-0.5.1-py2.py3-none-any.whl (11 kB)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/site-packages (from importlib-metadata->jsonschema<4,>=3.0.1->kfp~=1.8.0->mlrun[complete]@ 
git+https://github.com/mlrun/mlrun@development) (3.7.0)\n", + "Building wheels for collected packages: adlfs, aiobotocore, ordered-set, kfp, docstring-parser, fire, kfp-server-api, strip-hints, future, mlrun, termcolor\n", + " Building wheel for adlfs (setup.py): started\n", + " Building wheel for adlfs (setup.py): finished with status 'done'\n", + " Created wheel for adlfs: filename=adlfs-2021.8.2-py3-none-any.whl size=21466 sha256=d3076fc23e1d05f49958ba450b9913a80c23c755d3347327d1d2150656fa3185\n", + " Stored in directory: /root/.cache/pip/wheels/0d/88/1d/e06072abb7fb4d59b5cf94e194e53017dfa2dc47af4dec88b7\n", + " Building wheel for aiobotocore (setup.py): started\n", + " Building wheel for aiobotocore (setup.py): finished with status 'done'\n", + " Created wheel for aiobotocore: filename=aiobotocore-1.4.2-py3-none-any.whl size=49910 sha256=d630bfe25d72229a76e207cb2de8dd29839368673c27edc12229b314660a7e69\n", + " Stored in directory: /root/.cache/pip/wheels/33/e7/d9/b297a9aa9c43d56bc2463e6e2771655ff638f30b30f0b61fcb\n", + " Building wheel for ordered-set (setup.py): started\n", + " Building wheel for ordered-set (setup.py): finished with status 'done'\n", + " Created wheel for ordered-set: filename=ordered_set-4.0.2-py2.py3-none-any.whl size=8210 sha256=d0a6fcb9c69107866ca516cb90c3a6a1c0dd00c2f55e981d69b476e92fe85c0a\n", + " Stored in directory: /root/.cache/pip/wheels/73/2b/f6/26e9f84153c25050fe7c09e88f8e32a6be3c7034a38c418319\n", + " Building wheel for kfp (setup.py): started\n", + " Building wheel for kfp (setup.py): finished with status 'done'\n", + " Created wheel for kfp: filename=kfp-1.8.11-py3-none-any.whl size=414450 sha256=62bc86dbc4fbb6d431756182a297f87d5d9f08edfb8c2bab347b81fe0654cad3\n", + " Stored in directory: /root/.cache/pip/wheels/85/1e/ee/a14b49663bddf9e72d1c269cbe53970167bfabb53cadbbea3a\n", + " Building wheel for docstring-parser (PEP 517): started\n", + " Building wheel for docstring-parser (PEP 517): finished with status 'done'\n", + " 
Created wheel for docstring-parser: filename=docstring_parser-0.13-py3-none-any.whl size=31866 sha256=6ab7172ddcc24d27d93d31af6438a00053464e76585907654ec90dfb3ecf1886\n", + " Stored in directory: /root/.cache/pip/wheels/bd/88/3c/d1aa049309f7945178cac9fbe6561a86424f432da57c18ca0f\n", + " Building wheel for fire (setup.py): started\n", + " Building wheel for fire (setup.py): finished with status 'done'\n", + " Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115928 sha256=6704c4bed4908d06fbe2e61f97718607c4ef9dee5a573d09acf9882e6e620757\n", + " Stored in directory: /root/.cache/pip/wheels/8a/67/fb/2e8a12fa16661b9d5af1f654bd199366799740a85c64981226\n", + " Building wheel for kfp-server-api (setup.py): started\n", + " Building wheel for kfp-server-api (setup.py): finished with status 'done'\n", + " Created wheel for kfp-server-api: filename=kfp_server_api-1.7.1-py3-none-any.whl size=92618 sha256=b156a487ea471572b7c0d0dc85826bb5f6554ce5d097c3d16cdd75e36a93100a\n", + " Stored in directory: /root/.cache/pip/wheels/68/3f/d5/734c0278dd6c8969cef359edcf059505a61452c5eb0e2760e1\n", + " Building wheel for strip-hints (setup.py): started\n", + " Building wheel for strip-hints (setup.py): finished with status 'done'\n", + " Created wheel for strip-hints: filename=strip_hints-0.1.10-py2.py3-none-any.whl size=22279 sha256=ebdf8455bb18636c3ffe99dc1ffe7262298094e000e46a15df3ef1b809e3770f\n", + " Stored in directory: /root/.cache/pip/wheels/5e/14/c3/6e44e9b2545f2d570b03f5b6d38c00b7534aa8abb376978363\n", + " Building wheel for future (setup.py): started\n", + " Building wheel for future (setup.py): finished with status 'done'\n", + " Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491059 sha256=b71c260b8cae9faa06e701eb03743481d68f427d7ed0886bddf8eec6fab17927\n", + " Stored in directory: /root/.cache/pip/wheels/56/b0/fe/4410d17b32f1f0c3cf54cdfb2bc04d7b4b8f4ae377e2229ba0\n", + " Building wheel for mlrun (PEP 517): started\n", + " Building wheel 
for mlrun (PEP 517): finished with status 'done'\n", + " Created wheel for mlrun: filename=mlrun-0.0.0+unstable-py3-none-any.whl size=799835 sha256=a37564dbf60ba19531146c0404fa02216cea4492894cbafd9738bb199ce45775\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-xz3ex0pi/wheels/cd/42/82/13965317128ea26acc3fb21b24cc254077454998599db6f161\n", + " Building wheel for termcolor (setup.py): started\n", + " Building wheel for termcolor (setup.py): finished with status 'done'\n", + " Created wheel for termcolor: filename=termcolor-1.1.0-py3-none-any.whl size=4829 sha256=09ed568b0b6ea586b107bc3decf5a736c95331052246548b5a560c69a88e9414\n", + " Stored in directory: /root/.cache/pip/wheels/3f/e3/ec/8a8336ff196023622fbcb36de0c5a5c218cbb24111d1d4c7f2\n", + "Successfully built adlfs aiobotocore ordered-set kfp docstring-parser fire kfp-server-api strip-hints future mlrun termcolor\n", + "Installing collected packages: typing-extensions, traitlets, pyrsistent, attrs, wcwidth, tornado, rsa, pyzmq, pyparsing, pyasn1-modules, ptyprocess, protobuf, parso, nest-asyncio, jupyter-core, jsonschema, ipython-genutils, entrypoints, cachetools, webencodings, pygments, prompt-toolkit, pickleshare, pexpect, packaging, nbformat, matplotlib-inline, MarkupSafe, jupyter-client, jedi, googleapis-common-protos, google-auth, decorator, cryptography, backcall, ujson, toolz, testpath, pandocfilters, nbclient, multidict, mistune, locket, jupyterlab-pygments, jinja2, ipython, httplib2, grpcio, google-crc32c, google-api-core, future, frozenlist, defusedxml, botocore, bleach, argon2-cffi-bindings, yarl, wrapt, v3io, uritemplate, terminado, termcolor, smmap, Send2Trash, s3transfer, pyyaml, prometheus-client, partd, pandas, nbconvert, ipykernel, heapdict, grpcio-tools, greenlet, google-resumable-media, google-cloud-core, google-auth-httplib2, fsspec, cloudpickle, click, asynctest, async-timeout, argon2-cffi, aiosignal, zict, v3iofs, v3io-frames, typer, tblib, tabulate, strip-hints, starlette, 
sqlalchemy, sortedcontainers, requests-toolbelt, python-editor, pydantic, psutil, ordered-set, notebook, msgpack, msal-extensions, Mako, kubernetes, kfp-server-api, kfp-pipeline-spec, google-cloud-storage, google-api-python-client, gitdb, fire, docstring-parser, Deprecated, dask, boto3, aioitertools, aiohttp, absl-py, tenacity, storey, semver, python-dotenv, pymysql, orjson, nuclio-jupyter, mergedeep, kfp, inflection, humanfriendly, google-auth-oauthlib, GitPython, fastapi, distributed, deepdiff, chardet, azure-storage-blob, azure-identity, azure-datalake-store, alembic, aiobotocore, s3fs, plotly, mlrun, gcsfs, azure-keyvault-secrets, adlfs\n", + " Attempting uninstall: typing-extensions\n", + " Found existing installation: typing-extensions 4.0.1\n", + " Uninstalling typing-extensions-4.0.1:\n", + " Successfully uninstalled typing-extensions-4.0.1\n", + " Attempting uninstall: cryptography\n", + " Found existing installation: cryptography 3.4.8\n", + " Uninstalling cryptography-3.4.8:\n", + " Successfully uninstalled cryptography-3.4.8\n", + " Attempting uninstall: cloudpickle\n", + " Found existing installation: cloudpickle 1.6.0\n", + " Uninstalling cloudpickle-1.6.0:\n", + " Successfully uninstalled cloudpickle-1.6.0\n", + " Attempting uninstall: msal-extensions\n", + " Found existing installation: msal-extensions 0.2.2\n", + " Uninstalling msal-extensions-0.2.2:\n", + " Successfully uninstalled msal-extensions-0.2.2\n", + " Attempting uninstall: azure-identity\n", + " Found existing installation: azure-identity 1.4.1\n", + " Uninstalling azure-identity-1.4.1:\n", + " Successfully uninstalled azure-identity-1.4.1\n", + "ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "azureml-dataprep 2.20.1 requires azure-identity<1.5.0,>=1.2.0, but you have azure-identity 1.7.1 which is incompatible.\n", + "azureml-dataprep 2.20.1 requires cloudpickle<2.0.0,>=1.1.0, but you have cloudpickle 2.0.0 which is incompatible.\n", + "Successfully installed Deprecated-1.2.13 GitPython-3.1.26 Mako-1.1.6 MarkupSafe-2.0.1 Send2Trash-1.8.0 absl-py-1.0.0 adlfs-2021.8.2 aiobotocore-1.4.2 aiohttp-3.8.1 aioitertools-0.8.0 aiosignal-1.2.0 alembic-1.5.8 argon2-cffi-21.3.0 argon2-cffi-bindings-21.2.0 async-timeout-4.0.2 asynctest-0.13.0 attrs-21.4.0 azure-datalake-store-0.0.52 azure-identity-1.7.1 azure-keyvault-secrets-4.3.0 azure-storage-blob-12.9.0 backcall-0.2.0 bleach-4.1.0 boto3-1.17.106 botocore-1.20.106 cachetools-4.2.4 chardet-3.0.4 click-7.1.2 cloudpickle-2.0.0 cryptography-3.3.2 dask-2021.11.2 decorator-5.1.1 deepdiff-5.7.0 defusedxml-0.7.1 distributed-2021.11.2 docstring-parser-0.13 entrypoints-0.3 fastapi-0.67.0 fire-0.4.0 frozenlist-1.3.0 fsspec-2021.8.1 future-0.18.2 gcsfs-2021.8.1 gitdb-4.0.9 google-api-core-2.4.0 google-api-python-client-1.12.10 google-auth-1.35.0 google-auth-httplib2-0.1.0 google-auth-oauthlib-0.4.6 google-cloud-core-2.2.2 google-cloud-storage-1.44.0 google-crc32c-1.3.0 google-resumable-media-2.1.0 googleapis-common-protos-1.54.0 greenlet-1.1.2 grpcio-1.41.1 grpcio-tools-1.41.1 heapdict-1.0.1 httplib2-0.20.2 humanfriendly-8.2 inflection-0.5.1 ipykernel-5.5.6 ipython-7.31.1 ipython-genutils-0.2.0 jedi-0.18.1 jinja2-3.0.3 jsonschema-3.2.0 jupyter-client-7.1.2 jupyter-core-4.9.1 jupyterlab-pygments-0.1.2 kfp-1.8.11 kfp-pipeline-spec-0.1.13 kfp-server-api-1.7.1 kubernetes-12.0.1 locket-0.2.1 matplotlib-inline-0.1.3 mergedeep-1.3.4 mistune-0.8.4 mlrun-0.0.0+unstable msal-extensions-0.3.1 msgpack-1.0.3 multidict-6.0.2 nbclient-0.5.10 nbconvert-6.4.1 nbformat-5.1.3 nest-asyncio-1.5.4 notebook-6.4.8 nuclio-jupyter-0.8.22 ordered-set-4.0.2 orjson-3.3.1 
packaging-21.3 pandas-1.3.5 pandocfilters-1.5.0 parso-0.8.3 partd-1.2.0 pexpect-4.8.0 pickleshare-0.7.5 plotly-5.5.0 prometheus-client-0.13.1 prompt-toolkit-3.0.26 protobuf-3.19.4 psutil-5.9.0 ptyprocess-0.7.0 pyasn1-modules-0.2.8 pydantic-1.9.0 pygments-2.11.2 pymysql-1.0.2 pyparsing-3.0.7 pyrsistent-0.18.1 python-dotenv-0.17.1 python-editor-1.0.4 pyyaml-5.4.1 pyzmq-22.3.0 requests-toolbelt-0.9.1 rsa-4.8 s3fs-2021.8.1 s3transfer-0.4.2 semver-2.13.0 smmap-5.0.0 sortedcontainers-2.4.0 sqlalchemy-1.4.31 starlette-0.14.2 storey-0.10.4 strip-hints-0.1.10 tabulate-0.8.9 tblib-1.7.0 tenacity-8.0.1 termcolor-1.1.0 terminado-0.13.1 testpath-0.5.0 toolz-0.11.2 tornado-6.1 traitlets-5.1.1 typer-0.4.0 typing-extensions-3.10.0.2 ujson-5.1.0 uritemplate-3.0.1 v3io-0.5.15 v3io-frames-0.10.2 v3iofs-0.1.10 wcwidth-0.2.5 webencodings-0.5.1 wrapt-1.13.3 yarl-1.7.2 zict-2.0.0\n", + "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", + "WARNING: You are using pip version 21.2.4; however, version 22.0.2 is available.\n", + "You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\n", + "\u001B[36mINFO\u001B[0m[0128] Taking snapshot of full filesystem... \n", + "\u001B[36mINFO\u001B[0m[0148] Pushing image to docker-registry.default-tenant.app.yh41.iguazio-cd1.com/mlrun/func-azureml-yonatan-azureml-utils:latest \n", + "\u001B[36mINFO\u001B[0m[0153] Pushed image to 1 destinations \n", + "> 2022-02-02 18:30:56,789 [info] starting run azureml-utils-train uid=48dbbe26a2a34b5baaec5ca8aba3de5e DB=http://mlrun-api:8080\n", + "> 2022-02-02 18:30:56,988 [info] Job is running in the background, pod: azureml-utils-train-7pp86\n", + "> 2022-02-02 18:31:30,311 [warning] Failed resolving version info. 
Ignoring and using defaults\n", + "> 2022-02-02 18:31:32,893 [warning] Server or client version is unstable. Assuming compatible: {'server_version': '0.0.0+unstable', 'client_version': '0.0.0+unstable'}\n", + "Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cloudpickle 2.0.0 (/usr/local/lib/python3.7/site-packages), Requirement.parse('cloudpickle<2.0.0,>=1.1.0'), {'azureml-dataprep'}).\n", + "> 2022-02-02 18:31:34,680 [info] Loading AzureML Workspace\n", + "> 2022-02-02 18:31:36,956 [info] Initializing AzureML experiment azure-automl-test\n", + "> 2022-02-02 18:31:40,005 [info] Initializing AzureML compute target azureml-cpu\n", + "> 2022-02-02 18:31:40,206 [info] Found existing cluster, will use it.\n", + "Succeeded\n", + "AmlCompute wait for completion finished\n", + "\n", + "Minimum number of nodes requested have been provisioned\n", + "> 2022-02-02 18:31:40,322 [info] Connecting to AzureML experiment default datastore\n", + "> 2022-02-02 18:31:41,624 [info] Retrieving feature vector and uploading to Azure blob storage: az://azureml-blobstore-27f8977b-4946-4ca0-bdc5-5a685d2fe8d7/iris.csv\n", + "> 2022-02-02 18:31:41,912 [info] Registering dataset iris in Azure ML\n", + "> 2022-02-02 18:31:41,912 [info] OpenSSL version must be 1.1. Overriding the OpenSSL version to 1.1\n", + "> 2022-02-02 18:31:49,558 [info] Setting up experiment parameters\n", + "> 2022-02-02 18:31:49,812 [info] Submitting and running experiment\n", + "Submitting remote run.\n", + "Parent Run ID: AutoML_35c51a81-98fd-44fb-aa23-3192c3aca08d\n", + "https://ml.azure.com/runs/AutoML_35c51a81-98fd-44fb-aa23-3192c3aca08d?wsid=/subscriptions/8d81bc0b-6abd-4395-be83-000251d9fdbe/resourcegroups/nick/workspaces/NickAzureML&tid=af053911-a8b7-450d-9f58-0c08567d4769\n", + "\n", + "Current status: ModelSelection. 
Beginning model selection.\n", + "\n", + "****************************************************************************************************\n", + "DATA GUARDRAILS: \n", + "\n", + "TYPE: Class balancing detection\n", + "STATUS: PASSED\n", + "DESCRIPTION: Your inputs were analyzed, and all classes are balanced in your training data.\n", + " Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData\n", + "\n", + "****************************************************************************************************\n", + "\n", + "****************************************************************************************************\n", + "ITERATION: The iteration being evaluated.\n", + "PIPELINE: A summary description of the pipeline being evaluated.\n", + "DURATION: Time taken for the current iteration.\n", + "METRIC: The result of computing score on the fitted pipeline.\n", + "BEST: The best observed score thus far.\n", + "****************************************************************************************************\n", + "\n", + " ITERATION PIPELINE DURATION METRIC BEST\n", + " 0 RobustScaler LogisticRegression 0:00:21 0.9667 0.9667\n", + " 1 StandardScalerWrapper SVM 0:00:18 0.9533 0.9667\n", + " 2 StandardScalerWrapper LogisticRegression 0:00:22 0.9733 0.9733\n", + " 3 MaxAbsScaler LogisticRegression 0:00:22 0.9533 0.9733\n", + " 4 MaxAbsScaler LogisticRegression 0:00:21 0.9733 0.9733\n", + "\n", + "********************************************************************************************\n", + "\n", + "> 2022-02-02 18:38:28,144 [info] Registering model\n", + "> 2022-02-02 18:38:29,495 [info] Registered model with name 'iris-model', id 'iris-model:178', version '178'\n", + "> 2022-02-02 18:38:29,495 [info] Downloading model iris-model:178\n", + "> 2022-02-02 18:38:34,083 [info] Logging model_1_standardscaler_logisticregression model to MLRun\n", + "> 2022-02-02 18:38:34,621 [info] Registering model\n", + "> 2022-02-02 18:38:35,519 [info] 
Registered model with name 'iris-model', id 'iris-model:179', version '179'\n", + "> 2022-02-02 18:38:35,519 [info] Downloading model iris-model:179\n", + "> 2022-02-02 18:38:39,972 [info] Logging model_2_maxabsscaler_logisticregression model to MLRun\n", + "> 2022-02-02 18:38:40,317 [info] Registering model\n", + "> 2022-02-02 18:38:41,087 [info] Registered model with name 'iris-model', id 'iris-model:180', version '180'\n", + "> 2022-02-02 18:38:41,087 [info] Downloading model iris-model:180\n", + "> 2022-02-02 18:38:46,299 [info] Logging model_3_robustscaler_logisticregression model to MLRun\n", + "> 2022-02-02 18:38:47,615 [info] run executed, status=completed\n", + "final state: completed\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
azureml-yonatan0Feb 02 18:31:32completedazureml-utils-train
v3io_user=yonatan
kind=job
owner=yonatan
host=azureml-utils-train-7pp86
dataset
experiment_name=azure-automl-test
cpu_cluster_name=azureml-cpu
dataset_name=iris
dataset_description=iris training data
label_column_name=label
create_new_version=True
register_model_name=iris-model
save_n_models=3
automl_settings={'task': 'classification', 'debug_log': 'automl_errors.log', 'enable_early_stopping': False, 'allowed_models': ['LogisticRegression', 'SGD', 'SVM'], 'iterations': 5, 'iteration_timeout_minutes': 2, 'max_concurrent_iterations': 2, 'max_cores_per_iteration': -1, 'n_cross_validations': 5, 'primary_metric': 'accuracy', 'featurization': 'off', 'model_explainability': False, 'enable_voting_ensemble': False, 'enable_stack_ensemble': False}
dataset_blob_path=az://azureml-blobstore-27f8977b-4946-4ca0-bdc5-5a685d2fe8d7/iris.csv
best_iteration=1
auc_macro=0.9973298059964726
auc_micro=0.9979999999999999
norm_macro_recall=0.9594444444444443
balanced_accuracy=0.9729629629629629
f1_score_macro=0.9721779225097302
weighted_accuracy=0.9739694654594448
average_precision_score_weighted=0.9953861693861693
f1_score_weighted=0.9730901151988108
precision_score_micro=0.9733333333333334
matthews_correlation=0.9613232982405628
recall_score_macro=0.9729629629629629
precision_score_weighted=0.9767380952380952
recall_score_micro=0.9733333333333334
precision_score_macro=0.9754761904761905
average_precision_score_macro=0.9954059829059829
accuracy=0.9733333333333334
auc_weighted=0.9972857142857142
recall_score_weighted=0.9733333333333334
f1_score_micro=0.9733333333333334
average_precision_score_micro=0.9962096994520057
log_loss=0.07548089806904337
model
iteration_results
parallel_coordinates
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-02-02 18:38:48,608 [info] run executed, status=completed\n" + ] + } + ], + "source": [ + "azureml_run = azureml_fn.run(\n", + " handler=\"train\",\n", + " inputs={\"dataset\": DATA_URL},\n", + " params=params,\n", + ")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "View the run result: (more details in the UI)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateiterparam.data_trans_class_nameparam.data_trans_moduleparam.data_trans_spec_classparam.train_class_nameparam.train_moduleparam.train_param_kwargs_Cparam.train_param_kwargs_class_weightparam.train_spec_class...output.precision_score_weightedoutput.recall_score_microoutput.precision_score_macrooutput.average_precision_score_macrooutput.accuracyoutput.auc_weightedoutput.recall_score_weightedoutput.f1_score_microoutput.average_precision_score_microoutput.log_loss
0completed1StandardScalersklearn.preprocessingpreprocLogisticRegressionsklearn.linear_model16.768329NaNsklearn...0.9767380.9733330.9754760.9954060.9733330.9972860.9733330.9733330.9962100.075481
1completed2MaxAbsScalersklearn.preprocessingpreprocLogisticRegressionsklearn.linear_model719.685673NaNsklearn...0.9767380.9733330.9754760.9952270.9733330.9972860.9733330.9733330.9962110.072493
2completed3RobustScalersklearn.preprocessingpreprocLogisticRegressionsklearn.linear_model1048.113134balancedsklearn...0.9683590.9666670.9664100.9942170.9666670.9965980.9666670.9666670.9955950.086160
\n", + "

3 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " state iter param.data_trans_class_name param.data_trans_module \\\n", + "0 completed 1 StandardScaler sklearn.preprocessing \n", + "1 completed 2 MaxAbsScaler sklearn.preprocessing \n", + "2 completed 3 RobustScaler sklearn.preprocessing \n", + "\n", + " param.data_trans_spec_class param.train_class_name param.train_module \\\n", + "0 preproc LogisticRegression sklearn.linear_model \n", + "1 preproc LogisticRegression sklearn.linear_model \n", + "2 preproc LogisticRegression sklearn.linear_model \n", + "\n", + " param.train_param_kwargs_C param.train_param_kwargs_class_weight \\\n", + "0 16.768329 NaN \n", + "1 719.685673 NaN \n", + "2 1048.113134 balanced \n", + "\n", + " param.train_spec_class ... output.precision_score_weighted \\\n", + "0 sklearn ... 0.976738 \n", + "1 sklearn ... 0.976738 \n", + "2 sklearn ... 0.968359 \n", + "\n", + " output.recall_score_micro output.precision_score_macro \\\n", + "0 0.973333 0.975476 \n", + "1 0.973333 0.975476 \n", + "2 0.966667 0.966410 \n", + "\n", + " output.average_precision_score_macro output.accuracy output.auc_weighted \\\n", + "0 0.995406 0.973333 0.997286 \n", + "1 0.995227 0.973333 0.997286 \n", + "2 0.994217 0.966667 0.996598 \n", + "\n", + " output.recall_score_weighted output.f1_score_micro \\\n", + "0 0.973333 0.973333 \n", + "1 0.973333 0.973333 \n", + "2 0.966667 0.966667 \n", + "\n", + " output.average_precision_score_micro output.log_loss \n", + "0 0.996210 0.075481 \n", + "1 0.996211 0.072493 \n", + "2 0.995595 0.086160 \n", + "\n", + "[3 rows x 31 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "azureml_run.artifact('iteration_results').show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 4. 
Deploy Model Serving" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_start->\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "LogisticRegression0\n", + "\n", + "LogisticRegression0\n", + "\n", + "\n", + "\n", + "->LogisticRegression0\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "LogisticRegression1\n", + "\n", + "LogisticRegression1\n", + "\n", + "\n", + "\n", + "->LogisticRegression1\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "LogisticRegression2\n", + "\n", + "LogisticRegression2\n", + "\n", + "\n", + "\n", + "->LogisticRegression2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "LogisticRegression3\n", + "\n", + "LogisticRegression3\n", + "\n", + "\n", + "\n", + "->LogisticRegression3\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "LogisticRegression4\n", + "\n", + "LogisticRegression4\n", + "\n", + "\n", + "\n", + "->LogisticRegression4\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Importing serving function from marketplace:\n", + "serving_fn = mlrun.new_function(\"serving\", kind=\"serving\", image=\"yhaviv/mlrun:dev\")\n", + "serving_fn.with_code(body=\" \")\n", + "serving_fn.with_requirements(\"./requirements.txt\")\n", + "\n", + "# Set the real-time pipeline topology\n", + "serving_fn.set_topology(\n", + " 'router',\n", + " 'mlrun.serving.routers.VotingEnsemble'\n", + ")\n", + "\n", + "# Add the trained models:\n", + "artifacts = mlrun.get_run_db().list_artifacts(project=project.name)\n", + "models = {f\"{model['algorithm']}{i}\" :f\"{model['db_key']}#{model['iter']}\"\n", + " for i, 
model in enumerate(artifacts) if model[\"kind\"]==\"model\"}\n", + "\n", + "for name, path in models.items():\n", + " serving_fn.add_model(\n", + " name,\n", + " class_name=\"mlrun.frameworks.sklearn.PickleModelServer\",\n", + " model_path=project.get_artifact_uri(path))\n", + "\n", + "serving_fn.spec.graph.plot()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Building and Deploying the Serving Function" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-02-02 18:38:48,785 [info] Starting remote function deploy\n" + ] + } + ], + "source": [ + "function_address = serving_fn.deploy()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 5. 
Using the Live Model-Serving Function" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print (f'The address for the function is {function_address} \\n')\n", + "\n", + "!curl $function_address" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Data for testing:\n", + "source_df = mlrun.get_dataitem(DATA_URL).as_df()\n", + "test_vector = source_df.sample(5).drop('label', axis=1).values.tolist()\n", + "test_vector" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "After deploying the serving function with the required model we can make prediction:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "serving_fn.invoke(f'/v2/models/infer', {\"inputs\": test_vector})" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## 6. 
Clean up\n", + "\n", + "For cleaning up AzureML resources see:\n", + "https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-auto-train-models#clean-up-resources" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/src/azureml_utils.py b/functions/master/azureml_utils/1.4.0/src/azureml_utils.py new file mode 100644 index 00000000..041af2b8 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/src/azureml_utils.py @@ -0,0 +1,568 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +import json +import logging +from typing import Tuple, List + +from mlrun import MLClientCtx, DataItem, get_dataitem +import mlrun.feature_store as f_store +import mlrun.datastore +import mlrun.utils +from mlrun.datastore.targets import ParquetTarget + +from azureml.core.authentication import ServicePrincipalAuthentication +from azureml.core.workspace import Workspace +from azureml.core.experiment import Experiment +from azureml.core.dataset import Dataset +from azureml.core.model import Model +from azureml.core.compute import ComputeTarget, AmlCompute +from azureml.core.compute_target import ComputeTargetException +from azureml.core.script_run import ScriptRun + +from azureml.train.automl import AutoMLConfig +from azureml.train.automl.run import AutoMLRun + + +def _env_or_secret(context, key): + if key in os.environ: + return os.environ[key] + return context.get_secret(key) + + +def _load_workspace(context: MLClientCtx) -> Workspace: + """ + Loading AzureML Workspace with Azure secrets. + + :param context: MLRun context. 
+ :returns: AzureML Workspace + """ + + if hasattr(context, "_azure_workspace"): + return context._azure_workspace + + context.logger.info("Loading AzureML Workspace") + # Azure service authentication: + service_authentication = ServicePrincipalAuthentication( + tenant_id=_env_or_secret(context, "AZURE_TENANT_ID"), + service_principal_id=_env_or_secret(context, "AZURE_SERVICE_PRINCIPAL_ID"), + service_principal_password=_env_or_secret( + context, "AZURE_SERVICE_PRINCIPAL_PASSWORD" + ), + ) + + # Loading Azure workspace: + workspace = Workspace( + subscription_id=_env_or_secret(context, "AZURE_SUBSCRIPTION_ID"), + resource_group=_env_or_secret(context, "AZURE_RESOURCE_GROUP"), + workspace_name=_env_or_secret(context, "AZURE_WORKSPACE_NAME"), + auth=service_authentication, + ) + + context._azure_workspace = workspace + return workspace + + +def _init_experiment( + context: MLClientCtx, experiment_name: str +) -> Tuple[Workspace, Experiment]: + """ + Initialize workspace and experiment in Azure ML. Uses Service + Principal authentication via environment variables. + + :param context: MLRun context. + :param experiment_name: Name of experiment to create in Azure ML. + :returns: Azure ML Workspace and Experiment. + """ + + # Initialize experiment via Service Principal Authentication: + # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication#use-service-principal-authentication + + workspace = _load_workspace(context) + + context.logger.info(f"Initializing AzureML experiment {experiment_name}") + # Creating experiment: + experiment = Experiment(workspace, experiment_name) + + return workspace, experiment + + +def init_compute( + context: MLClientCtx, + cpu_cluster_name: str, + vm_size: str = "STANDARD_D2_V2", + max_nodes: int = 1, +) -> ComputeTarget: + """ + Initialize Azure ML compute target to run experiment. Checks for + existing compute target and creates new if does not exist. + + :param context: MLRun context. 
+ :param cpu_cluster_name: Name of Azure ML compute target. Created if does not exist. + :param vm_size: Azure machine type for compute target. + :param max_nodes: Maximum number of concurrent compute targets. + :returns: Azure ML Compute Target. + """ + + workspace = _load_workspace(context) + context.logger.info(f"Initializing AzureML compute target {cpu_cluster_name}") + + # Verify that cluster does not exist already: + try: + compute_target = ComputeTarget(workspace=workspace, name=cpu_cluster_name) + context.logger.info("Found existing cluster, will use it.") + except ComputeTargetException: + compute_config = AmlCompute.provisioning_configuration( + vm_size=vm_size, max_nodes=max_nodes + ) + compute_target = ComputeTarget.create( + workspace, cpu_cluster_name, compute_config + ) + + compute_target.wait_for_completion(show_output=True) + return compute_target + + +def register_dataset( + context: MLClientCtx, + dataset_name: str, + dataset_description: str, + data: DataItem, + create_new_version: bool = False, +): + """ + Register dataset object (can be also an Iguazio FeatureVector) in Azure ML. + Uploads parquet file to Azure blob storage and registers + that file as a dataset in Azure ML. + + :param context: MLRun context. + :param dataset_name: Name of Azure dataset to register. + :param dataset_description: Description of Azure dataset to register. + :param data: MLRun FeatureVector or dataset object to upload. + :param create_new_version: Register Azure dataset as new version. Must be used when + modifying dataset schema. 
+ """ + + # test for Azure storage connection environment variable or secret: + assert _env_or_secret( + context, "AZURE_STORAGE_CONNECTION_STRING" + ), "AZURE_STORAGE_CONNECTION_STRING secret not set" + + # Connect to AzureML experiment and datastore: + context.logger.info("Connecting to AzureML experiment default datastore") + + workspace = _load_workspace(context) + datastore = workspace.get_default_datastore() + + # Azure blob path (default datastore for workspace): + blob_path = f"az://{datastore.container_name}/{dataset_name}" + + store_uri_prefix, _ = mlrun.datastore.parse_store_uri(data.artifact_url) + feature_vector_case = mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix + # Retrieve data source as dataframe: + if feature_vector_case: + # FeatureVector case: + context.logger.info( + f"Retrieving feature vector and uploading to Azure blob storage: {blob_path}" + ) + f_store.get_offline_features(data.meta.uri, target=ParquetTarget(path=blob_path)) + else: + blob_path += data.suffix + # DataItem case: + context.logger.info( + f"Retrieving feature vector and uploading to Azure blob storage: {blob_path}" + ) + data_in_bytes = data.get() + get_dataitem(blob_path).put(data_in_bytes) + + # Register dataset in AzureML: + context.logger.info(f"Registering dataset {dataset_name} in Azure ML") + if data.suffix == ".parquet" or feature_vector_case: + dataset = Dataset.Tabular.from_parquet_files( + path=(datastore, f"{dataset_name}.parquet"), validate=False + ) + else: + context.logger.info( + f"OpenSSL version must be 1.1. 
Overriding the OpenSSL version to 1.1" + ) + # OpenSSL version must be 1.1 + os.environ["CLR_OPENSSL_VERSION_OVERRIDE"] = "1.1" + dataset = Dataset.Tabular.from_delimited_files( + path=(datastore, f"{dataset_name}{data.suffix}"), validate=False + ) + + dataset.register( + workspace=workspace, + name=dataset_name, + description=dataset_description, + create_new_version=create_new_version, + ) + + # Output registered dataset name in Azure: + context.log_result("dataset_blob_path", blob_path) + + +def download_model( + context: MLClientCtx, + model_name: str, + model_version: int, + target_dir: str = ".", +) -> None: + """ + Download trained model from Azure ML to local filesystem. + + :param context: MLRun context. + :param model_name: Name of trained and registered model. + :param model_version: Version of model to download. + :param target_dir: Target directory to download model. + """ + # Loading workspace if not provided: + workspace = _load_workspace(context) + context.logger.info(f"Downloading model {model_name}:{model_version}") + model = Model(workspace, model_name, version=model_version) + model.download(target_dir=target_dir, exist_ok=True) + + +def upload_model( + context: MLClientCtx, + model_name: str, + model_path: str, + model_description: str = None, + model_tags: dict = None, +) -> None: + """ + Upload pre-trained model from local filesystem to Azure ML. + :param context: MLRun context. + :param model_name: Name of trained and registered model. + :param model_path: Path to file on local filesystem. + :param model_description: Description of models. + :param model_tags: KV pairs of model tags. 
+ """ + # Loading workspace if not provided: + workspace = _load_workspace(context) + + context.logger.info(f"Upload model {model_name} from {model_path}") + Model.register( + workspace=workspace, + model_path=model_path, + model_name=model_name, + description=model_description, + tags=model_tags, + ) + + +def _get_top_n_runs( + remote_run: AutoMLRun, n: int = 5, primary_metric: str = "accuracy" +) -> List[ScriptRun]: + """ + Get top N complete runs from experiment sorted by primary metric. + + :param remote_run: Azure ML Run. + :param n: Number of top runs to return. + :param primary_metric: Metric to sort by. + + :returns: List of top N runs sorted by primary metric. + """ + # Collect all models: + complete_runs = [ + run + for run in remote_run.get_children(status="Completed") + if not any(s in run.id for s in ["setup", "worker"]) + ] + + # Checking that the required number of runs are done: + if len(complete_runs) < n: + raise ValueError(f"Expected {n} runs but only received {len(complete_runs)}") + + # Sorting by the primary metric: + sorted_runs = sorted( + complete_runs, key=lambda run: run.get_metrics()[primary_metric], reverse=True + ) + return sorted_runs[:n] + + +def _get_model_hp( + run: ScriptRun, +) -> dict: + """ + Get hyper-parameters of trained AzureML model. + Combine the hyper-parameters of the data transformation and training to a dictionary. + The prefix of the dictionary keys corresponds to 'data transformation' and 'training'. + + :param run: Run object of AzureML trained model. + + :returns: A dictionary as described in the docstring. 
+ """ + + spec_field = "pipeline_spec" + if spec_field not in run.properties: + return {} + spec_string = run.properties[spec_field] + spec_dict = json.loads(spec_string) + + if "objects" not in spec_dict: + # No hyper-params + return {} + hp_dicts = spec_dict["objects"] + # after training there are two hyper-parameters dicts inside the run object: + assert ( + len(hp_dicts) == 2 + ), "after training there are two hyper-parameters dicts inside the run object" + result_dict = {} + dict_keys = [ + ["data_trans_class_name", "data_trans_module", "data_trans_spec_class"], + [ + "train_class_name", + "train_module", + "train_param_kwargs_C", + "train_param_kwargs_class_weight", + "train_spec_class", + ], + ] + + # creating hyper-params dict with key prefixes for each part: + kwargs_prefix = "param_kwargs" + for d, name, keys in zip(hp_dicts, ["data_trans", "train"], dict_keys): + for key in keys: + + if kwargs_prefix in key: + result_dict[key] = d[kwargs_prefix][ + key.replace(f"{name}_{kwargs_prefix}_", "") + ] + else: + result_dict[key] = d[key.replace(f"{name}_", "")] + if not result_dict[key]: + result_dict[key] = "" + + return result_dict + + +def submit_training_job( + context: MLClientCtx, + experiment: Experiment, + compute_target: ComputeTarget, + register_model_name: str, + registered_dataset_name: str, + automl_settings: dict, + training_set: DataItem, + label_column_name: str = '', + save_n_models: int = 3, + show_output: bool = True, +) -> None: + """ + Submit training job to Azure AutoML and download trained model + when completed. Uses previously registered dataset for training. + + :param context: MLRun context. + :param experiment: Azure experiment. + :param compute_target: Azure compute target. + :param register_model_name: Name of model to register in Azure. + :param registered_dataset_name: Name of dataset registered in Azure ML. + :param label_column_name: Name of target column in dataset. 
+ :param automl_settings: JSON string of all Azure AutoML settings. + :param training_set: Training set to log with model. For model + monitoring integration. + :param show_output: Displaying Azure logs. + :param save_n_models: How many of the top performing models to log. + """ + # Loading workspace if not provided: + workspace = _load_workspace(context) + + # Setup experiment: + context.logger.info("Setting up experiment parameters") + dataset = Dataset.get_by_name(workspace, name=registered_dataset_name) + + # Get training set to log with model: + feature_vector = None + store_uri_prefix, _ = mlrun.datastore.parse_store_uri(training_set.artifact_url) + if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix: + feature_vector = training_set.meta.uri + label_column_name = label_column_name or training_set.meta.status.label_column + context.logger.info(f'label column name: {label_column_name}') + training_set = f_store.get_offline_features(feature_vector).to_dataframe() + else: + training_set = training_set.as_df() + + automl_config = AutoMLConfig( + compute_target=compute_target, + training_data=dataset, + verbosity=logging.INFO, + label_column_name=label_column_name, + **automl_settings, + ) + + # Run experiment on AzureML: + context.logger.info("Submitting and running experiment") + remote_run = experiment.submit(automl_config) + remote_run.wait_for_completion(show_output=show_output) + if show_output: + # Azure log ending row: + print(f"\n{'*' * 92}\n") + # Get top N runs to log: + top_runs = _get_top_n_runs( + remote_run=remote_run, + n=save_n_models, + primary_metric=automl_settings["primary_metric"], + ) + + # Register, download, and log models: + for i, run in enumerate(top_runs): + # Register model: + context.logger.info("Registering model") + model = run.register_model( + model_name=register_model_name, model_path="outputs/model.pkl" + ) + context.logger.info( + f"Registered model with name '{model.name}', id '{model.id}', version '{model.version}'" 
+ ) + + # Download model locally: + download_model( + context=context, + model_name=register_model_name, + model_version=model.version, + target_dir=f"./{model.version}", + ) + + metrics = {k.lower(): val for k, val in run.get_metrics().items()} + del metrics["confusion_matrix"] + del metrics["accuracy_table"] + + # Collect model hyper-parameters: + model_hp_dict = _get_model_hp(run) + with context.get_child_context(**model_hp_dict) as child: + model_key = f"model_{i + 1}_{model_hp_dict['data_trans_class_name'].lower()}_{model_hp_dict['train_class_name'].lower()}" + # Log model: + context.logger.info( + f"Logging {model_key} model to MLRun" + ) + child.log_results(metrics) + child.log_model( + "model", + db_key=model_key, + artifact_path=context.artifact_subpath("models"), + metrics=metrics, + model_file=f"{model.version}/model.pkl", + training_set=training_set, + label_column=label_column_name, + feature_vector=feature_vector, + framework="AzureML", + algorithm=model_hp_dict.get("train_class_name"), + ) + if i == 0: + # This also logs the model: + child.mark_as_best() + + +def train( + # MlRun + context: MLClientCtx, + dataset: DataItem, + # Init experiment and compute + experiment_name: str = "", + cpu_cluster_name: str = "", + vm_size: str = "STANDARD_D2_V2", + max_nodes: int = 1, + # Register dataset + dataset_name: str = "", + dataset_description: str = "", + create_new_version: bool = False, + label_column_name: str = "", + # Submit training job + register_model_name: str = "", + save_n_models: int = 1, + log_azure: bool = True, + automl_settings: str = None, +) -> None: + """ + Whole training flow for Azure AutoML. Registers dataset/feature vector, + submits training job to Azure AutoML, and downloads trained model + when completed. + + :param context: MLRun context. + + :param dataset: MLRun FeatureVector or dataset URI to upload. Will drop + index before uploading when it is a FeatureVector. 
+ + :param experiment_name: Name of experiment to create in Azure ML. + :param cpu_cluster_name: Name of Azure ML compute target. Created if does not exist. + :param vm_size: Azure machine type for compute target. + :param max_nodes: Maximum number of concurrent compute targets. + + :param dataset_name: Name of Azure dataset to register. + :param dataset_description: Description of Azure dataset to register. + + :param create_new_version: Register Azure dataset as new version. Must be used when + modifying dataset schema. + :param label_column_name: Target column in dataset. + + :param register_model_name: Name of model to register in Azure. + :param save_n_models: How many of the top performing models to log. + :param log_azure: Displaying Azure logs. + :param automl_settings: JSON string of all Azure AutoML settings. + """ + if not automl_settings: + automl_settings = { + "task": "classification", + "debug_log": "automl_errors.log", + # "experiment_exit_score": 0.9, + "enable_early_stopping": False, + "allowed_models": ["LogisticRegression", "SGD", "SVM"], + "iterations": 3, + "iteration_timeout_minutes": 2, + "max_concurrent_iterations": 2, + "max_cores_per_iteration": -1, + "n_cross_validations": 5, + "primary_metric": "accuracy", + "featurization": "off", + "model_explainability": False, + "enable_voting_ensemble": False, + "enable_stack_ensemble": False, + } + + # Init experiment and compute + workspace, experiment = _init_experiment( + context=context, experiment_name=experiment_name + ) + + compute_target = init_compute( + context=context, + cpu_cluster_name=cpu_cluster_name, + vm_size=vm_size, + max_nodes=max_nodes, + ) + + # Register dataset + register_dataset( + context=context, + dataset_name=dataset_name, + dataset_description=dataset_description, + data=dataset, + create_new_version=create_new_version, + ) + + # Submit training job + submit_training_job( + context, + experiment=experiment, + compute_target=compute_target, + 
register_model_name=register_model_name, + registered_dataset_name=dataset_name, + label_column_name=label_column_name, + automl_settings=automl_settings, + training_set=dataset, + show_output=log_azure, + save_n_models=save_n_models, + ) diff --git a/functions/master/azureml_utils/1.4.0/src/function.yaml b/functions/master/azureml_utils/1.4.0/src/function.yaml new file mode 100644 index 00000000..a6348996 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/src/function.yaml @@ -0,0 +1,247 @@ +verbose: false +spec: + command: '' + build: + auto_build: true + code_origin: '' + with_mlrun: true + requirements: + - azureml-core==1.54.0.post1 + - azureml-train-automl-client==1.54.0.post1 + - plotly~=5.4 + functionSourceCode:  + commands: + - apt-get update && apt-get install -y --no-install-recommends git + - apt install -y liblttng-ust0 + base_image: python:3.9-bullseye + origin_filename: '' + default_handler: train + allow_empty_resources: true + disable_auto_mount: false + image: '' + entry_points: + init_compute: + doc: 'Initialize Azure ML compute target to run experiment. Checks for + + existing compute target and creates new if does not exist.' + name: init_compute + lineno: 102 + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. + - name: cpu_cluster_name + type: str + doc: Name of Azure ML compute target. Created if does not exist. + - name: vm_size + type: str + doc: Azure machine type for compute target. + default: STANDARD_D2_V2 + - name: max_nodes + type: int + doc: Maximum number of concurrent compute targets. + default: 1 + outputs: + - doc: Azure ML Compute Target. + type: ComputeTarget + has_varargs: false + register_dataset: + doc: 'Register dataset object (can be also an Iguazio FeatureVector) in Azure + ML. + + Uploads parquet file to Azure blob storage and registers + + that file as a dataset in Azure ML.' 
+ name: register_dataset + lineno: 138 + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. + - name: dataset_name + type: str + doc: Name of Azure dataset to register. + - name: dataset_description + type: str + doc: Description of Azure dataset to register. + - name: data + type: DataItem + doc: MLRun FeatureVector or dataset object to upload. + - name: create_new_version + type: bool + doc: Register Azure dataset as new version. Must be used when modifying dataset + schema. + default: false + has_varargs: false + download_model: + doc: Download trained model from Azure ML to local filesystem. + name: download_model + lineno: 217 + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. + - name: model_name + type: str + doc: Name of trained and registered model. + - name: model_version + type: int + doc: Version of model to download. + - name: target_dir + type: str + doc: Target directory to download model. + default: . + outputs: + - type: None + has_varargs: false + upload_model: + doc: Upload pre-trained model from local filesystem to Azure ML. + name: upload_model + lineno: 238 + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. + - name: model_name + type: str + doc: Name of trained and registered model. + - name: model_path + type: str + doc: Path to file on local filesystem. + - name: model_description + type: str + doc: Description of models. + default: null + - name: model_tags + type: dict + doc: KV pairs of model tags. + default: null + outputs: + - type: None + has_varargs: false + submit_training_job: + doc: 'Submit training job to Azure AutoML and download trained model + + when completed. Uses previously registered dataset for training.' + name: submit_training_job + lineno: 352 + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. 
+ - name: experiment + type: Experiment + doc: Azure experiment. + - name: compute_target + type: ComputeTarget + doc: Azure compute target. + - name: register_model_name + type: str + doc: Name of model to register in Azure. + - name: registered_dataset_name + type: str + doc: Name of dataset registered in Azure ML. + - name: automl_settings + type: dict + doc: JSON string of all Azure AutoML settings. + - name: training_set + type: DataItem + doc: Training set to log with model. For model monitoring integration. + - name: label_column_name + type: str + doc: Name of target column in dataset. + default: '' + - name: save_n_models + type: int + doc: How many of the top performing models to log. + default: 3 + - name: show_output + type: bool + doc: Displaying Azure logs. + default: true + outputs: + - type: None + has_varargs: false + train: + doc: 'Whole training flow for Azure AutoML. Registers dataset/feature vector, + + submits training job to Azure AutoML, and downloads trained model + + when completed.' + name: train + lineno: 469 + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. + - name: dataset + type: DataItem + doc: MLRun FeatureVector or dataset URI to upload. Will drop index before + uploading when it is a FeatureVector. + - name: experiment_name + type: str + doc: Name of experiment to create in Azure ML. + default: '' + - name: cpu_cluster_name + type: str + doc: Name of Azure ML compute target. Created if does not exist. + default: '' + - name: vm_size + type: str + doc: Azure machine type for compute target. + default: STANDARD_D2_V2 + - name: max_nodes + type: int + doc: Maximum number of concurrent compute targets. + default: 1 + - name: dataset_name + type: str + doc: Name of Azure dataset to register. + default: '' + - name: dataset_description + type: str + doc: Description of Azure dataset to register. 
+ default: '' + - name: create_new_version + type: bool + doc: Register Azure dataset as new version. Must be used when modifying dataset + schema. + default: false + - name: label_column_name + type: str + doc: Target column in dataset. + default: '' + - name: register_model_name + type: str + doc: Name of model to register in Azure. + default: '' + - name: save_n_models + type: int + doc: How many of the top performing models to log. + default: 1 + - name: log_azure + type: bool + doc: Displaying Azure logs. + default: true + - name: automl_settings + type: str + doc: JSON string of all Azure AutoML settings. + default: null + outputs: + - type: None + has_varargs: false + description: Azure AutoML integration in MLRun, including utils functions for training + models on Azure AutoML platform. +kind: job +metadata: + categories: + - model-serving + - utils + tag: '' + name: azureml-utils diff --git a/functions/master/azureml_utils/1.4.0/src/item.yaml b/functions/master/azureml_utils/1.4.0/src/item.yaml new file mode 100644 index 00000000..0b4d5e49 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/src/item.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +categories: +- model-serving +- utils +description: Azure AutoML integration in MLRun, including utils functions for training + models on Azure AutoML platform.
+doc: '' +example: azureml_utils.ipynb +generationDate: 2022-08-28:17-25 +hidden: false +icon: '' +labels: + author: yonish +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: azureml_utils +platformVersion: 3.5.3 +spec: + extra_spec: + allow_empty_resources: true + build: + auto_build: true + commands: + - apt-get update && apt-get install -y --no-install-recommends git + - apt install -y liblttng-ust0 + with_mlrun: true + filename: azureml_utils.py + handler: train + image: python:3.9-bullseye + kind: job + requirements: + - azureml-core==1.54.0.post1 + - azureml-train-automl-client==1.54.0.post1 + - plotly~=5.4 +url: '' +version: 1.4.0 +test_valid: True diff --git a/functions/master/azureml_utils/1.4.0/src/requirements.txt b/functions/master/azureml_utils/1.4.0/src/requirements.txt new file mode 100644 index 00000000..b5486614 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/src/requirements.txt @@ -0,0 +1,3 @@ +azureml-core==1.54.0.post1 +azureml-train-automl-client==1.54.0.post1 +plotly~=5.4 \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/src/test_azureml_utils.py b/functions/master/azureml_utils/1.4.0/src/test_azureml_utils.py new file mode 100644 index 00000000..d6ef80d1 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/src/test_azureml_utils.py @@ -0,0 +1,128 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +import tempfile +import shutil +import pytest + +import mlrun +from mlrun import import_function + +EXPERIMENT_NAME = "azure-automl-test" +PROJECT_NAME = "azure-automl-project" + +DATA_URL = "https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv" + +SECRETS_REQUIRED_FIELDS = [ + "AZURE_TENANT_ID", + "AZURE_SERVICE_PRINCIPAL_ID", + "AZURE_SERVICE_PRINCIPAL_PASSWORD", + "AZURE_SUBSCRIPTION_ID", + "AZURE_RESOURCE_GROUP", + "AZURE_WORKSPACE_NAME", + "AZURE_STORAGE_CONNECTION_STRING", +] + + +def _validate_environment_variables() -> bool: + environment_keys = os.environ.keys() + return all(key in environment_keys for key in SECRETS_REQUIRED_FIELDS) + + +def _get_secrets_spec(): + return mlrun.new_task().with_secrets( + "env", + ",".join(SECRETS_REQUIRED_FIELDS), + ) + + +def _set_environment(): + artifact_path = tempfile.TemporaryDirectory().name + os.makedirs(artifact_path) + return artifact_path + + +def _cleanup_environment(artifact_path: str): + """ + Cleanup the test environment, deleting files and artifacts created during the test. + + :param artifact_path: The artifact path to delete. + """ + # Clean the local directory: + for test_output in [ + *os.listdir(artifact_path), + "schedules", + "runs", + "artifacts", + "functions", + ]: + test_output_path = os.path.abspath(f"./{test_output}") + if os.path.exists(test_output_path): + if os.path.isdir(test_output_path): + shutil.rmtree(test_output_path) + else: + os.remove(test_output_path) + + # Clean the artifacts' directory: + shutil.rmtree(artifact_path) + + +@pytest.mark.skipif( + condition=not _validate_environment_variables(), + reason="AzureML secrets should be provided as environment variables", +) +def test_train(): + """ + Test the 'train' handler with iris dataset. 
+ """ + test_pass = False + + # Setting secrets: + secrets_spec = _get_secrets_spec() + + # Setting environment: + artifact_path = _set_environment() + azure_automl_fn = import_function("function.yaml") + model_paths, save_n_models = [], 2 + + try: + azureml_run = azure_automl_fn.run( + runspec=secrets_spec, + handler="train", + params={ + "experiment_name": EXPERIMENT_NAME, + "cpu_cluster_name": "azureml-cpu", + "dataset_name": "iris-test", + "log_azure": False, + "dataset_description": "iris training data", + "label_column_name": "label", + "create_new_version": True, + "register_model_name": "iris-model", + "save_n_models": save_n_models, + }, + inputs={"dataset": DATA_URL}, + artifact_path=artifact_path, + local=True, + ) + # Get trained models: + num_saved_models = len(azureml_run.status.iterations) - 1 # The first one in the list is the 'columns' + test_pass = num_saved_models == save_n_models + + except Exception as exception: + print(f"- The test failed - raised the following error:\n- {exception}") + + _cleanup_environment(artifact_path) + + assert test_pass, f'Created {len(model_paths)} models instead of {save_n_models}' diff --git a/functions/master/azureml_utils/1.4.0/static/azureml_utils.html b/functions/master/azureml_utils/1.4.0/static/azureml_utils.html new file mode 100644 index 00000000..9a6dc0d3 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/static/azureml_utils.html @@ -0,0 +1,758 @@ + + + + + + + +azureml_utils.azureml_utils + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+

+ +
+
+
+
+
+ +
+

Source code for azureml_utils.azureml_utils

+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import json
+import logging
+from typing import Tuple, List
+
+from mlrun import MLClientCtx, DataItem, get_dataitem
+import mlrun.feature_store as f_store
+import mlrun.datastore
+import mlrun.utils
+from mlrun.datastore.targets import ParquetTarget
+
+from azureml.core.authentication import ServicePrincipalAuthentication
+from azureml.core.workspace import Workspace
+from azureml.core.experiment import Experiment
+from azureml.core.dataset import Dataset
+from azureml.core.model import Model
+from azureml.core.compute import ComputeTarget, AmlCompute
+from azureml.core.compute_target import ComputeTargetException
+from azureml.core.script_run import ScriptRun
+
+from azureml.train.automl import AutoMLConfig
+from azureml.train.automl.run import AutoMLRun
+
+
+def _env_or_secret(context, key):
+    """
+    Resolve a configuration value by name, preferring the process environment.
+
+    :param context: Object whose ``get_secret(key)`` is called as the fallback.
+    :param key:     Name of the environment variable / secret to look up.
+    :returns:       ``os.environ[key]`` when the variable is set (even if it is
+                    empty), otherwise the result of ``context.get_secret(key)``.
+    """
+    # Membership test (not truthiness) so an empty environment value still wins:
+    return os.environ[key] if key in os.environ else context.get_secret(key)
+
+
+def _load_workspace(context: MLClientCtx) -> Workspace:
+    """
+    Return the AzureML Workspace for this context, building it on first use.
+
+    The workspace is stored on the context as ``_azure_workspace``; when that
+    attribute is already present it is returned as-is and nothing else runs.
+    All credentials and workspace coordinates are resolved through
+    ``_env_or_secret``.
+
+    :param context: MLRun context.
+    :returns:       AzureML Workspace
+    """
+
+    # Reuse the workspace a previous call attached to this context:
+    if hasattr(context, "_azure_workspace"):
+        return context._azure_workspace
+
+    context.logger.info("Loading AzureML Workspace")
+
+    # Service principal credentials:
+    tenant = _env_or_secret(context, "AZURE_TENANT_ID")
+    principal_id = _env_or_secret(context, "AZURE_SERVICE_PRINCIPAL_ID")
+    principal_password = _env_or_secret(context, "AZURE_SERVICE_PRINCIPAL_PASSWORD")
+    service_authentication = ServicePrincipalAuthentication(
+        tenant_id=tenant,
+        service_principal_id=principal_id,
+        service_principal_password=principal_password,
+    )
+
+    # Workspace coordinates:
+    subscription = _env_or_secret(context, "AZURE_SUBSCRIPTION_ID")
+    group = _env_or_secret(context, "AZURE_RESOURCE_GROUP")
+    name = _env_or_secret(context, "AZURE_WORKSPACE_NAME")
+    workspace = Workspace(
+        subscription_id=subscription,
+        resource_group=group,
+        workspace_name=name,
+        auth=service_authentication,
+    )
+
+    # Cache on the context for subsequent calls:
+    context._azure_workspace = workspace
+    return workspace
+
+
+def _init_experiment(
+    context: MLClientCtx, experiment_name: str
+) -> Tuple[Workspace, Experiment]:
+    """
+    Load the AzureML workspace and open the named experiment in it.
+
+    The workspace is obtained from ``_load_workspace``, which authenticates
+    with a Service Principal:
+    https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication#use-service-principal-authentication
+
+    :param context:         MLRun context.
+    :param experiment_name: Name of experiment to create in Azure ML.
+    :returns:               Azure ML Workspace and Experiment.
+    """
+
+    azure_workspace = _load_workspace(context)
+
+    context.logger.info(f"Initializing AzureML experiment {experiment_name}")
+
+    # The experiment is constructed only after the log line above is emitted:
+    return azure_workspace, Experiment(azure_workspace, experiment_name)
+
+
+
def init_compute(
    context: MLClientCtx,
    cpu_cluster_name: str,
    vm_size: str = "STANDARD_D2_V2",
    max_nodes: int = 1,
) -> ComputeTarget:
    """
    Get the named Azure ML compute target, provisioning it when it is missing.

    An existing target with this name is reused as is; otherwise an ``AmlCompute``
    cluster is created from ``vm_size`` and ``max_nodes``. In both cases the
    function waits for the target to complete before returning it.

    :param context:          MLRun context.
    :param cpu_cluster_name: Name of Azure ML compute target. Created if does not exist.
    :param vm_size:          Azure machine type for compute target.
    :param max_nodes:        Maximum number of concurrent compute targets.
    :returns:                Azure ML Compute Target.
    """
    azure_workspace = _load_workspace(context)
    context.logger.info(f"Initializing AzureML compute target {cpu_cluster_name}")

    try:
        # The lookup raises ComputeTargetException when the cluster is not found:
        cluster = ComputeTarget(workspace=azure_workspace, name=cpu_cluster_name)
        context.logger.info("Found existing cluster, will use it.")
    except ComputeTargetException:
        cluster = ComputeTarget.create(
            azure_workspace,
            cpu_cluster_name,
            AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=max_nodes),
        )

    cluster.wait_for_completion(show_output=True)
    return cluster
+ + + +
def register_dataset(
    context: MLClientCtx,
    dataset_name: str,
    dataset_description: str,
    data: DataItem,
    create_new_version: bool = False,
):
    """
    Register dataset object (can be also an Iguazio FeatureVector) in Azure ML.
    Uploads the data to the workspace's default Azure blob datastore and registers
    that file as a tabular dataset in Azure ML. The uploaded blob path is logged
    as the ``dataset_blob_path`` result.

    :param context:             MLRun context.
    :param dataset_name:        Name of Azure dataset to register.
    :param dataset_description: Description of Azure dataset to register.
    :param data:                MLRun FeatureVector or dataset object to upload.
    :param create_new_version:  Register Azure dataset as new version. Must be used when
                                modifying dataset schema.

    :raises AssertionError: If AZURE_STORAGE_CONNECTION_STRING is set neither as an
                            environment variable nor as a secret.
    """

    # test for Azure storage connection environment variable or secret:
    assert _env_or_secret(
        context, "AZURE_STORAGE_CONNECTION_STRING"
    ), "AZURE_STORAGE_CONNECTION_STRING secret not set"

    # Connect to AzureML experiment and datastore:
    context.logger.info("Connecting to AzureML experiment default datastore")

    workspace = _load_workspace(context)
    datastore = workspace.get_default_datastore()

    # Azure blob path (default datastore for workspace):
    blob_path = f"az://{datastore.container_name}/{dataset_name}"

    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(data.artifact_url)
    feature_vector_case = mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix

    if feature_vector_case:
        # FeatureVector case - the offline features are written straight to the blob:
        # NOTE(review): blob_path has no ".parquet" suffix here, while the registration
        # below points at f"{dataset_name}.parquet" - confirm the target writes there.
        context.logger.info(
            f"Retrieving feature vector and uploading to Azure blob storage: {blob_path}"
        )
        f_store.get_offline_features(data.meta.uri, target=ParquetTarget(path=blob_path))
    else:
        # DataItem case - copy the raw bytes, keeping the original file suffix:
        blob_path += data.suffix
        context.logger.info(
            f"Retrieving data item and uploading to Azure blob storage: {blob_path}"
        )
        data_in_bytes = data.get()
        get_dataitem(blob_path).put(data_in_bytes)

    # Register dataset in AzureML:
    context.logger.info(f"Registering dataset {dataset_name} in Azure ML")
    if data.suffix == ".parquet" or feature_vector_case:
        dataset = Dataset.Tabular.from_parquet_files(
            path=(datastore, f"{dataset_name}.parquet"), validate=False
        )
    else:
        context.logger.info(
            "OpenSSL version must be 1.1. Overriding the OpenSSL version to 1.1"
        )
        # OpenSSL version must be 1.1
        os.environ["CLR_OPENSSL_VERSION_OVERRIDE"] = "1.1"
        dataset = Dataset.Tabular.from_delimited_files(
            path=(datastore, f"{dataset_name}{data.suffix}"), validate=False
        )

    dataset.register(
        workspace=workspace,
        name=dataset_name,
        description=dataset_description,
        create_new_version=create_new_version,
    )

    # Output registered dataset name in Azure:
    context.log_result("dataset_blob_path", blob_path)
+ + + +
def download_model(
    context: MLClientCtx,
    model_name: str,
    model_version: int,
    target_dir: str = ".",
) -> None:
    """
    Fetch a registered model from Azure ML into a local directory.

    :param context:       MLRun context.
    :param model_name:    Name of trained and registered model.
    :param model_version: Version of model to download.
    :param target_dir:    Target directory to download model.
    """
    azure_workspace = _load_workspace(context)
    context.logger.info(f"Downloading model {model_name}:{model_version}")

    # Look the model up by name and version, then pull its files locally:
    registered = Model(azure_workspace, model_name, version=model_version)
    registered.download(target_dir=target_dir, exist_ok=True)
+ + + +
def upload_model(
    context: MLClientCtx,
    model_name: str,
    model_path: str,
    model_description: str = None,
    model_tags: dict = None,
) -> None:
    """
    Upload pre-trained model from local filesystem to Azure ML.

    :param context:           MLRun context.
    :param model_name:        Name of trained and registered model.
    :param model_path:        Path to file on local filesystem.
    :param model_description: Description of models.
    :param model_tags:        KV pairs of model tags.
    """
    workspace = _load_workspace(context)

    context.logger.info(f"Upload model {model_name} from {model_path}")
    Model.register(
        workspace=workspace,
        model_path=model_path,
        model_name=model_name,
        description=model_description,
        tags=model_tags,
    )
+ + + +def _get_top_n_runs( + remote_run: AutoMLRun, n: int = 5, primary_metric: str = "accuracy" +) -> List[ScriptRun]: + """ + Get top N complete runs from experiment sorted by primary metric. + + :param remote_run: Azure ML Run. + :param n: Number of top runs to return. + :param primary_metric: Metric to sort by. + + :returns: List of top N runs sorted by primary metric. + """ + # Collect all models: + complete_runs = [ + run + for run in remote_run.get_children(status="Completed") + if not any(s in run.id for s in ["setup", "worker"]) + ] + + # Checking that the required number of runs are done: + if len(complete_runs) < n: + raise ValueError(f"Expected {n} runs but only received {len(complete_runs)}") + + # Sorting by the primary metric: + sorted_runs = sorted( + complete_runs, key=lambda run: run.get_metrics()[primary_metric], reverse=True + ) + return sorted_runs[:n] + + +def _get_model_hp( + run: ScriptRun, +) -> dict: + """ + Get hyper-parameters of trained AzureML model. + Combine the hyper-parameters of the data transformation and training to a dictionary. + The prefix of the dictionary keys corresponds to 'data transformation' and 'training'. + + :param run: Run object of AzureML trained model. + + :returns: A dictionary as described in the docstring. 
+ """ + + spec_field = "pipeline_spec" + if spec_field not in run.properties: + return {} + spec_string = run.properties[spec_field] + spec_dict = json.loads(spec_string) + + if "objects" not in spec_dict: + # No hyper-params + return {} + hp_dicts = spec_dict["objects"] + # after training there are two hyper-parameters dicts inside the run object: + assert ( + len(hp_dicts) == 2 + ), "after training there are two hyper-parameters dicts inside the run object" + result_dict = {} + dict_keys = [ + ["data_trans_class_name", "data_trans_module", "data_trans_spec_class"], + [ + "train_class_name", + "train_module", + "train_param_kwargs_C", + "train_param_kwargs_class_weight", + "train_spec_class", + ], + ] + + # creating hyper-params dict with key prefixes for each part: + kwargs_prefix = "param_kwargs" + for d, name, keys in zip(hp_dicts, ["data_trans", "train"], dict_keys): + for key in keys: + + if kwargs_prefix in key: + result_dict[key] = d[kwargs_prefix][ + key.replace(f"{name}_{kwargs_prefix}_", "") + ] + else: + result_dict[key] = d[key.replace(f"{name}_", "")] + if not result_dict[key]: + result_dict[key] = "" + + return result_dict + + +
def submit_training_job(
    context: MLClientCtx,
    experiment: Experiment,
    compute_target: ComputeTarget,
    register_model_name: str,
    registered_dataset_name: str,
    automl_settings: dict,
    training_set: DataItem,
    label_column_name: str = '',
    save_n_models: int = 3,
    show_output: bool = True,
) -> None:
    """
    Submit training job to Azure AutoML and download trained model
    when completed. Uses previously registered dataset for training.

    Each of the top ``save_n_models`` runs is registered in Azure, downloaded to
    ``./<model version>`` and logged to MLRun in a child context; the first
    (best) one is marked as best.

    :param context:                 MLRun context.
    :param experiment:              Azure experiment.
    :param compute_target:          Azure compute target.
    :param register_model_name:     Name of model to register in Azure.
    :param registered_dataset_name: Name of dataset registered in Azure ML.
    :param label_column_name:       Name of target column in dataset. When empty and
                                    ``training_set`` is a FeatureVector, the vector's
                                    label column is used.
    :param automl_settings:         Dictionary of all Azure AutoML settings. Must
                                    contain the "primary_metric" key.
    :param training_set:            Training set to log with model. For model
                                    monitoring integration.
    :param show_output:             Displaying Azure logs.
    :param save_n_models:           How many of the top performing models to log.
    """
    workspace = _load_workspace(context)

    # Setup experiment:
    context.logger.info("Setting up experiment parameters")
    dataset = Dataset.get_by_name(workspace, name=registered_dataset_name)

    # Get training set to log with model:
    feature_vector = None
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(training_set.artifact_url)
    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
        feature_vector = training_set.meta.uri
        label_column_name = label_column_name or training_set.meta.status.label_column
        context.logger.info(f'label column name: {label_column_name}')
        training_set = f_store.get_offline_features(feature_vector).to_dataframe()
    else:
        training_set = training_set.as_df()

    automl_config = AutoMLConfig(
        compute_target=compute_target,
        training_data=dataset,
        verbosity=logging.INFO,
        label_column_name=label_column_name,
        **automl_settings,
    )

    # Run experiment on AzureML:
    context.logger.info("Submitting and running experiment")
    remote_run = experiment.submit(automl_config)
    remote_run.wait_for_completion(show_output=show_output)
    if show_output:
        # Azure log ending row:
        print(f"\n{'*' * 92}\n")

    # Get top N runs to log:
    top_runs = _get_top_n_runs(
        remote_run=remote_run,
        n=save_n_models,
        primary_metric=automl_settings["primary_metric"],
    )

    # Register, download, and log models:
    for i, run in enumerate(top_runs):
        # Register model:
        context.logger.info("Registering model")
        model = run.register_model(
            model_name=register_model_name, model_path="outputs/model.pkl"
        )
        context.logger.info(
            f"Registered model with name '{model.name}', id '{model.id}', version '{model.version}'"
        )

        # Download model locally:
        download_model(
            context=context,
            model_name=register_model_name,
            model_version=model.version,
            target_dir=f"./{model.version}",
        )

        metrics = {k.lower(): val for k, val in run.get_metrics().items()}
        # These two entries are not logged; a run may not report them at all,
        # so drop them without failing when they are absent:
        metrics.pop("confusion_matrix", None)
        metrics.pop("accuracy_table", None)

        # Collect model hyper-parameters (may be empty when the run has no spec):
        model_hp_dict = _get_model_hp(run)
        with context.get_child_context(**model_hp_dict) as child:
            data_trans_name = model_hp_dict.get("data_trans_class_name", "").lower()
            train_name = model_hp_dict.get("train_class_name", "").lower()
            model_key = f"model_{i + 1}_{data_trans_name}_{train_name}"
            # Log model:
            context.logger.info(
                f"Logging {model_key} model to MLRun"
            )
            child.log_results(metrics)
            child.log_model(
                "model",
                db_key=model_key,
                artifact_path=context.artifact_subpath("models"),
                metrics=metrics,
                model_file=f"{model.version}/model.pkl",
                training_set=training_set,
                label_column=label_column_name,
                feature_vector=feature_vector,
                framework="AzureML",
                algorithm=model_hp_dict.get("train_class_name"),
            )
            if i == 0:
                # This also logs the model:
                child.mark_as_best()
+ + + +
def train(
    # MlRun
    context: MLClientCtx,
    dataset: DataItem,
    # Init experiment and compute
    experiment_name: str = "",
    cpu_cluster_name: str = "",
    vm_size: str = "STANDARD_D2_V2",
    max_nodes: int = 1,
    # Register dataset
    dataset_name: str = "",
    dataset_description: str = "",
    create_new_version: bool = False,
    label_column_name: str = "",
    # Submit training job
    register_model_name: str = "",
    save_n_models: int = 1,
    log_azure: bool = True,
    automl_settings: str = None,
) -> None:
    """
    Whole training flow for Azure AutoML. Registers dataset/feature vector,
    submits training job to Azure AutoML, and downloads trained model
    when completed.

    :param context:             MLRun context.

    :param dataset:             MLRun FeatureVector or dataset URI to upload. Will drop
                                index before uploading when it is a FeatureVector.

    :param experiment_name:     Name of experiment to create in Azure ML.
    :param cpu_cluster_name:    Name of Azure ML compute target. Created if does not exist.
    :param vm_size:             Azure machine type for compute target.
    :param max_nodes:           Maximum number of concurrent compute targets.

    :param dataset_name:        Name of Azure dataset to register.
    :param dataset_description: Description of Azure dataset to register.

    :param create_new_version:  Register Azure dataset as new version. Must be used when
                                modifying dataset schema.
    :param label_column_name:   Target column in dataset.

    :param register_model_name: Name of model to register in Azure.
    :param save_n_models:       How many of the top performing models to log.
    :param log_azure:           Displaying Azure logs.
    :param automl_settings:     All Azure AutoML settings, either as a dictionary or as
                                a JSON string of one. When empty, a default
                                classification setup is used.
    """
    if not automl_settings:
        automl_settings = {
            "task": "classification",
            "debug_log": "automl_errors.log",
            # "experiment_exit_score": 0.9,
            "enable_early_stopping": False,
            "allowed_models": ["LogisticRegression", "SGD", "SVM"],
            "iterations": 3,
            "iteration_timeout_minutes": 2,
            "max_concurrent_iterations": 2,
            "max_cores_per_iteration": -1,
            "n_cross_validations": 5,
            "primary_metric": "accuracy",
            "featurization": "off",
            "model_explainability": False,
            "enable_voting_ensemble": False,
            "enable_stack_ensemble": False,
        }
    elif isinstance(automl_settings, str):
        # The settings are unpacked as keyword arguments further down, so a JSON
        # string has to be decoded into a dictionary first:
        import json

        automl_settings = json.loads(automl_settings)

    # Init experiment and compute (the workspace itself is not needed here):
    _, experiment = _init_experiment(
        context=context, experiment_name=experiment_name
    )

    compute_target = init_compute(
        context=context,
        cpu_cluster_name=cpu_cluster_name,
        vm_size=vm_size,
        max_nodes=max_nodes,
    )

    # Register dataset
    register_dataset(
        context=context,
        dataset_name=dataset_name,
        dataset_description=dataset_description,
        data=dataset,
        create_new_version=create_new_version,
    )

    # Submit training job
    submit_training_job(
        context,
        experiment=experiment,
        compute_target=compute_target,
        register_model_name=register_model_name,
        registered_dataset_name=dataset_name,
        label_column_name=label_column_name,
        automl_settings=automl_settings,
        training_set=dataset,
        show_output=log_azure,
        save_n_models=save_n_models,
    )
+ +
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/static/documentation.html b/functions/master/azureml_utils/1.4.0/static/documentation.html new file mode 100644 index 00000000..ae451127 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/static/documentation.html @@ -0,0 +1,361 @@ + + + + + + + +azureml_utils package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

azureml_utils package#

+
+

Submodules#

+
+
+

azureml_utils.azureml_utils module#

+
+
+azureml_utils.azureml_utils.download_model(context: MLClientCtx, model_name: str, model_version: int, target_dir: str = '.') None[source]#
+

Download trained model from Azure ML to local filesystem.

+
+
Parameters:
+
    +
  • context – MLRun context.

  • +
  • model_name – Name of trained and registered model.

  • +
  • model_version – Version of model to download.

  • +
  • target_dir – Target directory to download model.

  • +
+
+
+
+
+
+azureml_utils.azureml_utils.init_compute(context: MLClientCtx, cpu_cluster_name: str, vm_size: str = 'STANDARD_D2_V2', max_nodes: int = 1) azureml.core.compute.ComputeTarget[source]#
+

Initialize Azure ML compute target to run experiment. Checks for +existing compute target and creates new if does not exist.

+
+
Parameters:
+
    +
  • context – MLRun context.

  • +
  • cpu_cluster_name – Name of Azure ML compute target. Created if does not exist.

  • +
  • vm_size – Azure machine type for compute target.

  • +
  • max_nodes – Maximum number of concurrent compute targets.

  • +
+
+
Returns:
+

Azure ML Compute Target.

+
+
+
+
+
+azureml_utils.azureml_utils.register_dataset(context: MLClientCtx, dataset_name: str, dataset_description: str, data: DataItem, create_new_version: bool = False)[source]#
+

Register dataset object (can be also an Iguazio FeatureVector) in Azure ML. +Uploads parquet file to Azure blob storage and registers +that file as a dataset in Azure ML.

+
+
Parameters:
+
    +
  • context – MLRun context.

  • +
  • dataset_name – Name of Azure dataset to register.

  • +
  • dataset_description – Description of Azure dataset to register.

  • +
  • data – MLRun FeatureVector or dataset object to upload.

  • +
  • create_new_version – Register Azure dataset as new version. Must be used when +modifying dataset schema.

  • +
+
+
+
+
+
+azureml_utils.azureml_utils.submit_training_job(context: MLClientCtx, experiment: azureml.core.experiment.Experiment, compute_target: azureml.core.compute.ComputeTarget, register_model_name: str, registered_dataset_name: str, automl_settings: dict, training_set: DataItem, label_column_name: str = '', save_n_models: int = 3, show_output: bool = True) None[source]#
+

Submit training job to Azure AutoML and download trained model +when completed. Uses previously registered dataset for training.

+
+
Parameters:
+
    +
  • context – MLRun context.

  • +
  • experiment – Azure experiment.

  • +
  • compute_target – Azure compute target.

  • +
  • register_model_name – Name of model to register in Azure.

  • +
  • registered_dataset_name – Name of dataset registered in Azure ML.

  • +
  • label_column_name – Name of target column in dataset.

  • +
  • automl_settings – Dictionary of all Azure AutoML settings.

  • +
  • training_set – Training set to log with model. For model +monitoring integration.

  • +
  • show_output – Displaying Azure logs.

  • +
  • save_n_models – How many of the top performing models to log.

  • +
+
+
+
+
+
+azureml_utils.azureml_utils.train(context: MLClientCtx, dataset: DataItem, experiment_name: str = '', cpu_cluster_name: str = '', vm_size: str = 'STANDARD_D2_V2', max_nodes: int = 1, dataset_name: str = '', dataset_description: str = '', create_new_version: bool = False, label_column_name: str = '', register_model_name: str = '', save_n_models: int = 1, log_azure: bool = True, automl_settings: str | None = None) None[source]#
+

Whole training flow for Azure AutoML. Registers dataset/feature vector, +submits training job to Azure AutoML, and downloads trained model +when completed.

+
+
Parameters:
+
    +
  • context – MLRun context.

  • +
  • dataset – MLRun FeatureVector or dataset URI to upload. Will drop +index before uploading when it is a FeatureVector.

  • +
  • experiment_name – Name of experiment to create in Azure ML.

  • +
  • cpu_cluster_name – Name of Azure ML compute target. Created if does not exist.

  • +
  • vm_size – Azure machine type for compute target.

  • +
  • max_nodes – Maximum number of concurrent compute targets.

  • +
  • dataset_name – Name of Azure dataset to register.

  • +
  • dataset_description – Description of Azure dataset to register.

  • +
  • create_new_version – Register Azure dataset as new version. Must be used when +modifying dataset schema.

  • +
  • label_column_name – Target column in dataset.

  • +
  • register_model_name – Name of model to register in Azure.

  • +
  • save_n_models – How many of the top performing models to log.

  • +
  • log_azure – Displaying Azure logs.

  • +
  • automl_settings – JSON string of all Azure AutoML settings.

  • +
+
+
+
+
+
+azureml_utils.azureml_utils.upload_model(context: MLClientCtx, model_name: str, model_path: str, model_description: str | None = None, model_tags: dict | None = None) None[source]#
+

Upload pre-trained model from local filesystem to Azure ML.

Parameters:
  • context – MLRun context.
  • model_name – Name of trained and registered model.
  • model_path – Path to file on local filesystem.
  • model_description – Description of models.
  • model_tags – KV pairs of model tags.

+
+
+
+

Module contents#

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/static/example.html b/functions/master/azureml_utils/1.4.0/static/example.html new file mode 100644 index 00000000..13e6a910 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/static/example.html @@ -0,0 +1,1661 @@ + + + + + + + +AzureML AutoML Demo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

AzureML AutoML Demo#

+

MLRun function for using Azure AutoML, including the following handlers:

+
    +
  1. init_experiment - Initialize workspace and experiment in Azure ML.

  2. +
  3. init_compute - Initialize Azure ML compute target to run experiment.

  4. +
  5. register_dataset - Register dataset object (can be also an Iguazio FeatureVector) in Azure ML.

  6. +
  7. download_model - Download trained model from Azure ML to local filesystem.

  8. +
  9. upload_model - Upload pre-trained model from local filesystem to Azure ML.

  10. +
  11. submit_training_job - Submit training job to Azure AutoML and download trained model when completed.

  12. +
  13. train - Whole training flow for Azure AutoML: +- Initializes workspace and experiment in Azure ML +- Registers dataset/feature vector +- Submits training job +- Downloads trained model

  14. +
+
+

1. Setup MLRun Project#

+

Creating MLRun project

+
+
+
import mlrun
+
+
+
+
+
> 2022-02-02 18:28:06,840 [warning] Failed resolving version info. Ignoring and using defaults
+> 2022-02-02 18:28:11,379 [warning] Server or client version is unstable. Assuming compatible: {'server_version': '0.0.0+unstable', 'client_version': '0.0.0+unstable'}
+
+
+
+
+
+
+
# Initialize the MLRun project object
+project = mlrun.get_or_create_project('azureml', context="./", user_project=True)
+
+
+
+
+
> 2022-02-02 18:28:11,423 [info] loaded project azureml from MLRun DB
+
+
+
+
+
+
+

2. Preparing Dataset (Iris)#

+
    +
  • Preparing training URI for the MLRun function

  • +
+
+
+
DATA_URL = "https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv"
+
+mlrun.get_dataitem(DATA_URL).as_df().head()
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)label
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
+
+
+
+
+

3. Submit Azure AutoML Training Job#

+
+

Submit Azure Secrets#

+

For more information about working with secrets see: MLRun docs: Working with secrets

+
+
+
project.set_secrets(file_path="env")
+
+
+
+
+
+
+

Import azureml_utils from marketplace#

+
+
+
azureml_fn = mlrun.import_function('hub://azureml_utils')
+azureml_fn.deploy()
+
+
+
+
+
+
+

Automl configuration & run parameters#

+
    +
  • The automl_settings object is the setup for Azure AutoML. It holds the task type, the number of models to train (iterations), the desired metric (primary_metric), the allowed types of models (allowed_models), and more.

  • +
  • The params are the parameters for the MLRun function, such as the experiment name (experiment_name) and CPU cluster name (cpu_cluster_name) in AzureML, the dataset properties for registration, the target label for training (label_column_name), the number of models to download (save_n_models), and more.

  • +
+
+
+
label_column_name = 'label' # target label
+
+# Configure automl settings:
+automl_settings = {
+            "task": 'classification',
+            "debug_log": 'automl_errors.log',
+#             "experiment_exit_score" : 0.9,
+            "enable_early_stopping": False,
+            "allowed_models": ['LogisticRegression', 'SGD', 'SVM'],
+            "iterations": 5,
+            "iteration_timeout_minutes": 2,
+            "max_concurrent_iterations": 2,
+            "max_cores_per_iteration": -1,
+            "n_cross_validations": 5,
+            "primary_metric": 'accuracy',
+            "featurization": 'off',
+            "model_explainability": False,
+            "enable_voting_ensemble": False,
+            "enable_stack_ensemble": False
+        }
+
+# Setting params to azure_run function:
+params = {
+    "experiment_name": 'azure-automl-test',
+    "cpu_cluster_name": 'azureml-cpu',
+    "dataset_name": 'iris',
+    "dataset_description": 'iris training data',
+    "label_column_name": label_column_name,
+    "create_new_version": True,
+    "register_model_name": "iris-model",
+    "save_n_models": 3,
+    "automl_settings": automl_settings
+}
+
+
+
+
+
+
+

Run Azure AutoML train:#

+

This MLRun function will perform the following:

+
    +
  • Initialize workspace and experiment in your AzureML

  • +
  • Register the dataset/feature vector to Iguazio and to AzureML.

  • +
  • Submit the training job to AzureML and print the live training results for each model

  • +
  • Generate the top trained models.

  • +
+
+
+
azureml_run = azureml_fn.run(
+    handler="train",
+    inputs={"dataset": DATA_URL},
+    params=params,
+)
+
+
+
+
+
> 2022-02-02 18:28:11,740 [info] Function is not deployed and auto_build flag is set, starting deploy...
+> 2022-02-02 18:28:11,932 [info] Started building image: .mlrun/func-azureml-yonatan-azureml-utils:latest
+INFO[0000] Retrieving image manifest python:3.7.9-slim  
+INFO[0000] Retrieving image python:3.7.9-slim from registry index.docker.io 
+INFO[0000] Built cross stage deps: map[]                
+INFO[0000] Retrieving image manifest python:3.7.9-slim  
+INFO[0000] Returning cached image manifest              
+INFO[0000] Executing 0 build triggers                   
+INFO[0000] Unpacking rootfs as cmd RUN python -m pip install pip==21.2.4 requires it. 
+INFO[0002] RUN python -m pip install pip==21.2.4        
+INFO[0002] Taking snapshot of full filesystem...        
+INFO[0003] cmd: /bin/sh                                 
+INFO[0003] args: [-c python -m pip install pip==21.2.4] 
+INFO[0003] Running: [/bin/sh -c python -m pip install pip==21.2.4] 
+Collecting pip==21.2.4
+  Downloading pip-21.2.4-py3-none-any.whl (1.6 MB)
+Installing collected packages: pip
+  Attempting uninstall: pip
+    Found existing installation: pip 21.0.1
+    Uninstalling pip-21.0.1:
+      Successfully uninstalled pip-21.0.1
+Successfully installed pip-21.2.4
+INFO[0006] Taking snapshot of full filesystem...        
+INFO[0006] RUN apt-get update && apt-get install -y --no-install-recommends git 
+INFO[0006] cmd: /bin/sh                                 
+INFO[0006] args: [-c apt-get update && apt-get install -y --no-install-recommends git] 
+INFO[0006] Running: [/bin/sh -c apt-get update && apt-get install -y --no-install-recommends git] 
+Get:1 http://security.debian.org/debian-security buster/updates InRelease [65.4 kB]
+Get:2 http://deb.debian.org/debian buster InRelease [122 kB]
+Get:3 http://deb.debian.org/debian buster-updates InRelease [51.9 kB]
+Get:4 http://security.debian.org/debian-security buster/updates/main amd64 Packages [314 kB]
+Get:5 http://deb.debian.org/debian buster/main amd64 Packages [7906 kB]
+Get:6 http://deb.debian.org/debian buster-updates/main amd64 Packages [8792 B]
+Fetched 8468 kB in 2s (5347 kB/s)
+Reading package lists...
+Reading package lists...
+Building dependency tree...
+Reading state information...
+The following additional packages will be installed:
+  git-man libcurl3-gnutls liberror-perl libgdbm-compat4 libgssapi-krb5-2
+  libk5crypto3 libkeyutils1 libkrb5-3 libkrb5support0 libldap-2.4-2
+  libldap-common libnghttp2-14 libpcre2-8-0 libperl5.28 libpsl5 librtmp1
+  libsasl2-2 libsasl2-modules-db libssh2-1 perl perl-modules-5.28
+Suggested packages:
+  gettext-base git-daemon-run | git-daemon-sysvinit git-doc git-el git-email
+  git-gui gitk gitweb git-cvs git-mediawiki git-svn krb5-doc krb5-user
+  sensible-utils perl-doc libterm-readline-gnu-perl
+  | libterm-readline-perl-perl make libb-debug-perl liblocale-codes-perl
+Recommended packages:
+  patch less ssh-client krb5-locales publicsuffix libsasl2-modules
+The following NEW packages will be installed:
+  git git-man libcurl3-gnutls liberror-perl libgdbm-compat4 libgssapi-krb5-2
+  libk5crypto3 libkeyutils1 libkrb5-3 libkrb5support0 libldap-2.4-2
+  libldap-common libnghttp2-14 libpcre2-8-0 libperl5.28 libpsl5 librtmp1
+  libsasl2-2 libsasl2-modules-db libssh2-1 perl perl-modules-5.28
+0 upgraded, 22 newly installed, 0 to remove and 16 not upgraded.
+Need to get 16.4 MB of archives.
+After this operation, 90.1 MB of additional disk space will be used.
+Get:1 http://deb.debian.org/debian buster/main amd64 perl-modules-5.28 all 5.28.1-6+deb10u1 [2873 kB]
+Get:2 http://deb.debian.org/debian buster/main amd64 libgdbm-compat4 amd64 1.18.1-4 [44.1 kB]
+Get:3 http://deb.debian.org/debian buster/main amd64 libperl5.28 amd64 5.28.1-6+deb10u1 [3894 kB]
+Get:4 http://deb.debian.org/debian buster/main amd64 perl amd64 5.28.1-6+deb10u1 [204 kB]
+Get:5 http://deb.debian.org/debian buster/main amd64 libkeyutils1 amd64 1.6-6 [15.0 kB]
+Get:6 http://deb.debian.org/debian buster/main amd64 libkrb5support0 amd64 1.17-3+deb10u3 [65.8 kB]
+Get:7 http://deb.debian.org/debian buster/main amd64 libk5crypto3 amd64 1.17-3+deb10u3 [122 kB]
+Get:8 http://deb.debian.org/debian buster/main amd64 libkrb5-3 amd64 1.17-3+deb10u3 [370 kB]
+Get:9 http://deb.debian.org/debian buster/main amd64 libgssapi-krb5-2 amd64 1.17-3+deb10u3 [158 kB]
+Get:10 http://deb.debian.org/debian buster/main amd64 libsasl2-modules-db amd64 2.1.27+dfsg-1+deb10u1 [69.1 kB]
+Get:11 http://deb.debian.org/debian buster/main amd64 libsasl2-2 amd64 2.1.27+dfsg-1+deb10u1 [106 kB]
+Get:12 http://deb.debian.org/debian buster/main amd64 libldap-common all 2.4.47+dfsg-3+deb10u6 [90.0 kB]
+Get:13 http://deb.debian.org/debian buster/main amd64 libldap-2.4-2 amd64 2.4.47+dfsg-3+deb10u6 [224 kB]
+Get:14 http://deb.debian.org/debian buster/main amd64 libnghttp2-14 amd64 1.36.0-2+deb10u1 [85.0 kB]
+Get:15 http://deb.debian.org/debian buster/main amd64 libpsl5 amd64 0.20.2-2 [53.7 kB]
+Get:16 http://deb.debian.org/debian buster/main amd64 librtmp1 amd64 2.4+20151223.gitfa8646d.1-2 [60.5 kB]
+Get:17 http://deb.debian.org/debian buster/main amd64 libssh2-1 amd64 1.8.0-2.1 [140 kB]
+Get:18 http://deb.debian.org/debian buster/main amd64 libcurl3-gnutls amd64 7.64.0-4+deb10u2 [330 kB]
+Get:19 http://deb.debian.org/debian buster/main amd64 libpcre2-8-0 amd64 10.32-5 [213 kB]
+Get:20 http://deb.debian.org/debian buster/main amd64 liberror-perl all 0.17027-2 [30.9 kB]
+Get:21 http://deb.debian.org/debian buster/main amd64 git-man all 1:2.20.1-2+deb10u3 [1620 kB]
+Get:22 http://deb.debian.org/debian buster/main amd64 git amd64 1:2.20.1-2+deb10u3 [5633 kB]
+debconf: delaying package configuration, since apt-utils is not installed
+Fetched 16.4 MB in 0s (62.3 MB/s)
+Selecting previously unselected package perl-modules-5.28.
+(Reading database ... 6840 files and directories currently installed.)
+Preparing to unpack .../00-perl-modules-5.28_5.28.1-6+deb10u1_all.deb ...
+Unpacking perl-modules-5.28 (5.28.1-6+deb10u1) ...
+Selecting previously unselected package libgdbm-compat4:amd64.
+Preparing to unpack .../01-libgdbm-compat4_1.18.1-4_amd64.deb ...
+Unpacking libgdbm-compat4:amd64 (1.18.1-4) ...
+Selecting previously unselected package libperl5.28:amd64.
+Preparing to unpack .../02-libperl5.28_5.28.1-6+deb10u1_amd64.deb ...
+Unpacking libperl5.28:amd64 (5.28.1-6+deb10u1) ...
+Selecting previously unselected package perl.
+Preparing to unpack .../03-perl_5.28.1-6+deb10u1_amd64.deb ...
+Unpacking perl (5.28.1-6+deb10u1) ...
+Selecting previously unselected package libkeyutils1:amd64.
+Preparing to unpack .../04-libkeyutils1_1.6-6_amd64.deb ...
+Unpacking libkeyutils1:amd64 (1.6-6) ...
+Selecting previously unselected package libkrb5support0:amd64.
+Preparing to unpack .../05-libkrb5support0_1.17-3+deb10u3_amd64.deb ...
+Unpacking libkrb5support0:amd64 (1.17-3+deb10u3) ...
+Selecting previously unselected package libk5crypto3:amd64.
+Preparing to unpack .../06-libk5crypto3_1.17-3+deb10u3_amd64.deb ...
+Unpacking libk5crypto3:amd64 (1.17-3+deb10u3) ...
+Selecting previously unselected package libkrb5-3:amd64.
+Preparing to unpack .../07-libkrb5-3_1.17-3+deb10u3_amd64.deb ...
+Unpacking libkrb5-3:amd64 (1.17-3+deb10u3) ...
+Selecting previously unselected package libgssapi-krb5-2:amd64.
+Preparing to unpack .../08-libgssapi-krb5-2_1.17-3+deb10u3_amd64.deb ...
+Unpacking libgssapi-krb5-2:amd64 (1.17-3+deb10u3) ...
+Selecting previously unselected package libsasl2-modules-db:amd64.
+Preparing to unpack .../09-libsasl2-modules-db_2.1.27+dfsg-1+deb10u1_amd64.deb ...
+Unpacking libsasl2-modules-db:amd64 (2.1.27+dfsg-1+deb10u1) ...
+Selecting previously unselected package libsasl2-2:amd64.
+Preparing to unpack .../10-libsasl2-2_2.1.27+dfsg-1+deb10u1_amd64.deb ...
+Unpacking libsasl2-2:amd64 (2.1.27+dfsg-1+deb10u1) ...
+Selecting previously unselected package libldap-common.
+Preparing to unpack .../11-libldap-common_2.4.47+dfsg-3+deb10u6_all.deb ...
+Unpacking libldap-common (2.4.47+dfsg-3+deb10u6) ...
+Selecting previously unselected package libldap-2.4-2:amd64.
+Preparing to unpack .../12-libldap-2.4-2_2.4.47+dfsg-3+deb10u6_amd64.deb ...
+Unpacking libldap-2.4-2:amd64 (2.4.47+dfsg-3+deb10u6) ...
+Selecting previously unselected package libnghttp2-14:amd64.
+Preparing to unpack .../13-libnghttp2-14_1.36.0-2+deb10u1_amd64.deb ...
+Unpacking libnghttp2-14:amd64 (1.36.0-2+deb10u1) ...
+Selecting previously unselected package libpsl5:amd64.
+Preparing to unpack .../14-libpsl5_0.20.2-2_amd64.deb ...
+Unpacking libpsl5:amd64 (0.20.2-2) ...
+Selecting previously unselected package librtmp1:amd64.
+Preparing to unpack .../15-librtmp1_2.4+20151223.gitfa8646d.1-2_amd64.deb ...
+Unpacking librtmp1:amd64 (2.4+20151223.gitfa8646d.1-2) ...
+Selecting previously unselected package libssh2-1:amd64.
+Preparing to unpack .../16-libssh2-1_1.8.0-2.1_amd64.deb ...
+Unpacking libssh2-1:amd64 (1.8.0-2.1) ...
+Selecting previously unselected package libcurl3-gnutls:amd64.
+Preparing to unpack .../17-libcurl3-gnutls_7.64.0-4+deb10u2_amd64.deb ...
+Unpacking libcurl3-gnutls:amd64 (7.64.0-4+deb10u2) ...
+Selecting previously unselected package libpcre2-8-0:amd64.
+Preparing to unpack .../18-libpcre2-8-0_10.32-5_amd64.deb ...
+Unpacking libpcre2-8-0:amd64 (10.32-5) ...
+Selecting previously unselected package liberror-perl.
+Preparing to unpack .../19-liberror-perl_0.17027-2_all.deb ...
+Unpacking liberror-perl (0.17027-2) ...
+Selecting previously unselected package git-man.
+Preparing to unpack .../20-git-man_1%3a2.20.1-2+deb10u3_all.deb ...
+Unpacking git-man (1:2.20.1-2+deb10u3) ...
+Selecting previously unselected package git.
+Preparing to unpack .../21-git_1%3a2.20.1-2+deb10u3_amd64.deb ...
+Unpacking git (1:2.20.1-2+deb10u3) ...
+Setting up perl-modules-5.28 (5.28.1-6+deb10u1) ...
+Setting up libkeyutils1:amd64 (1.6-6) ...
+Setting up libpsl5:amd64 (0.20.2-2) ...
+Setting up libnghttp2-14:amd64 (1.36.0-2+deb10u1) ...
+Setting up libldap-common (2.4.47+dfsg-3+deb10u6) ...
+Setting up libkrb5support0:amd64 (1.17-3+deb10u3) ...
+Setting up libsasl2-modules-db:amd64 (2.1.27+dfsg-1+deb10u1) ...
+Setting up librtmp1:amd64 (2.4+20151223.gitfa8646d.1-2) ...
+Setting up libgdbm-compat4:amd64 (1.18.1-4) ...
+Setting up libpcre2-8-0:amd64 (10.32-5) ...
+Setting up libk5crypto3:amd64 (1.17-3+deb10u3) ...
+Setting up libsasl2-2:amd64 (2.1.27+dfsg-1+deb10u1) ...
+Setting up libperl5.28:amd64 (5.28.1-6+deb10u1) ...
+Setting up git-man (1:2.20.1-2+deb10u3) ...
+Setting up libssh2-1:amd64 (1.8.0-2.1) ...
+Setting up libkrb5-3:amd64 (1.17-3+deb10u3) ...
+Setting up libldap-2.4-2:amd64 (2.4.47+dfsg-3+deb10u6) ...
+Setting up perl (5.28.1-6+deb10u1) ...
+Setting up libgssapi-krb5-2:amd64 (1.17-3+deb10u3) ...
+Setting up libcurl3-gnutls:amd64 (7.64.0-4+deb10u2) ...
+Setting up liberror-perl (0.17027-2) ...
+Setting up git (1:2.20.1-2+deb10u3) ...
+Processing triggers for libc-bin (2.28-10) ...
+INFO[0012] Taking snapshot of full filesystem...        
+INFO[0015] RUN python -m pip install azureml-core==1.33.0 azureml-train-automl-client==1.33.0 
+INFO[0015] cmd: /bin/sh                                 
+INFO[0015] args: [-c python -m pip install azureml-core==1.33.0 azureml-train-automl-client==1.33.0] 
+INFO[0015] Running: [/bin/sh -c python -m pip install azureml-core==1.33.0 azureml-train-automl-client==1.33.0] 
+Collecting azureml-core==1.33.0
+  Downloading azureml_core-1.33.0-py3-none-any.whl (2.2 MB)
+Collecting azureml-train-automl-client==1.33.0
+  Downloading azureml_train_automl_client-1.33.0-py3-none-any.whl (128 kB)
+Collecting pytz
+  Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)
+Collecting urllib3<=1.26.5,>=1.23
+  Downloading urllib3-1.26.5-py2.py3-none-any.whl (138 kB)
+Collecting pathspec<1.0.0
+  Downloading pathspec-0.9.0-py2.py3-none-any.whl (31 kB)
+Collecting azure-mgmt-containerregistry>=2.0.0
+  Downloading azure_mgmt_containerregistry-9.0.0-py3-none-any.whl (937 kB)
+Collecting contextlib2<1.0.0
+  Downloading contextlib2-0.6.0.post1-py2.py3-none-any.whl (9.8 kB)
+Collecting SecretStorage<4.0.0
+  Downloading SecretStorage-3.3.1-py3-none-any.whl (15 kB)
+Collecting backports.tempfile
+  Downloading backports.tempfile-1.0-py2.py3-none-any.whl (4.4 kB)
+Collecting docker<5.0.0
+  Downloading docker-4.4.4-py2.py3-none-any.whl (147 kB)
+Collecting msrestazure<=0.6.4,>=0.4.33
+  Downloading msrestazure-0.6.4-py2.py3-none-any.whl (40 kB)
+Collecting cryptography!=1.9,!=2.0.*,!=2.1.*,!=2.2.*,<4.0.0
+  Downloading cryptography-3.4.8-cp36-abi3-manylinux_2_24_x86_64.whl (3.0 MB)
+Collecting msrest<1.0.0,>=0.5.1
+  Downloading msrest-0.6.21-py2.py3-none-any.whl (85 kB)
+Collecting requests<3.0.0,>=2.19.1
+  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
+Collecting adal<=1.2.7,>=1.2.0
+  Downloading adal-1.2.7-py2.py3-none-any.whl (55 kB)
+Collecting jsonpickle<3.0.0
+  Downloading jsonpickle-2.1.0-py2.py3-none-any.whl (38 kB)
+Collecting azure-common<2.0.0,>=1.1.12
+  Downloading azure_common-1.1.27-py2.py3-none-any.whl (12 kB)
+Collecting python-dateutil<3.0.0,>=2.7.3
+  Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
+Collecting PyJWT<3.0.0
+  Downloading PyJWT-2.3.0-py3-none-any.whl (16 kB)
+Collecting azure-mgmt-authorization<1.0.0,>=0.40.0
+  Downloading azure_mgmt_authorization-0.61.0-py2.py3-none-any.whl (94 kB)
+Collecting pyopenssl<21.0.0
+  Downloading pyOpenSSL-20.0.1-py2.py3-none-any.whl (54 kB)
+Collecting jmespath<1.0.0
+  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
+Collecting ndg-httpsclient<=0.5.1
+  Downloading ndg_httpsclient-0.5.1-py3-none-any.whl (34 kB)
+Collecting azure-mgmt-storage<16.0.0,>=1.5.0
+  Downloading azure_mgmt_storage-11.2.0-py2.py3-none-any.whl (547 kB)
+Collecting azure-mgmt-keyvault<10.0.0,>=0.40.0
+  Downloading azure_mgmt_keyvault-9.3.0-py2.py3-none-any.whl (412 kB)
+Collecting ruamel.yaml<0.17.5,>=0.15.35
+  Downloading ruamel.yaml-0.17.4-py3-none-any.whl (101 kB)
+Collecting azure-graphrbac<1.0.0,>=0.40.0
+  Downloading azure_graphrbac-0.61.1-py2.py3-none-any.whl (141 kB)
+Collecting azure-mgmt-resource<15.0.0,>=1.2.1
+  Downloading azure_mgmt_resource-13.0.0-py2.py3-none-any.whl (1.3 MB)
+Collecting azureml-dataset-runtime~=1.33.0
+  Downloading azureml_dataset_runtime-1.33.0-py3-none-any.whl (3.5 kB)
+Collecting azureml-telemetry~=1.33.0
+  Downloading azureml_telemetry-1.33.0-py3-none-any.whl (30 kB)
+Collecting azureml-automl-core~=1.33.0
+  Downloading azureml_automl_core-1.33.1-py3-none-any.whl (214 kB)
+Collecting azure-mgmt-core<2.0.0,>=1.3.0
+  Downloading azure_mgmt_core-1.3.0-py2.py3-none-any.whl (25 kB)
+Collecting azure-core<2.0.0,>=1.15.0
+  Downloading azure_core-1.21.1-py2.py3-none-any.whl (178 kB)
+Collecting six>=1.11.0
+  Downloading six-1.16.0-py2.py3-none-any.whl (11 kB)
+Collecting pyarrow<4.0.0,>=0.17.0
+  Downloading pyarrow-3.0.0-cp37-cp37m-manylinux2014_x86_64.whl (20.7 MB)
+Collecting azureml-dataprep<2.21.0a,>=2.20.0a
+  Downloading azureml_dataprep-2.20.1-py3-none-any.whl (39.4 MB)
+Collecting numpy!=1.19.3
+  Downloading numpy-1.21.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
+Collecting azure-identity<1.5.0,>=1.2.0
+  Downloading azure_identity-1.4.1-py2.py3-none-any.whl (86 kB)
+Collecting dotnetcore2<3.0.0,>=2.1.14
+  Downloading dotnetcore2-2.1.23-py3-none-manylinux1_x86_64.whl (29.3 MB)
+Collecting azureml-dataprep-native<39.0.0,>=38.0.0
+  Downloading azureml_dataprep_native-38.0.0-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
+Collecting azureml-dataprep-rslex<1.19.0a,>=1.18.0dev0
+  Downloading azureml_dataprep_rslex-1.18.2-cp37-cp37m-manylinux1_x86_64.whl (10.4 MB)
+Collecting cloudpickle<2.0.0,>=1.1.0
+  Downloading cloudpickle-1.6.0-py3-none-any.whl (23 kB)
+Collecting msal-extensions~=0.2.2
+  Downloading msal_extensions-0.2.2-py2.py3-none-any.whl (15 kB)
+Collecting msal<2.0.0,>=1.3.0
+  Downloading msal-1.16.0-py2.py3-none-any.whl (78 kB)
+Collecting applicationinsights
+  Downloading applicationinsights-0.11.10-py2.py3-none-any.whl (55 kB)
+Collecting cffi>=1.12
+  Downloading cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (427 kB)
+Collecting pycparser
+  Downloading pycparser-2.21-py2.py3-none-any.whl (118 kB)
+Collecting websocket-client>=0.32.0
+  Downloading websocket_client-1.2.3-py3-none-any.whl (53 kB)
+Collecting distro>=1.2.0
+  Downloading distro-1.6.0-py2.py3-none-any.whl (19 kB)
+Collecting importlib-metadata
+  Downloading importlib_metadata-4.10.1-py3-none-any.whl (17 kB)
+Collecting portalocker~=1.0
+  Downloading portalocker-1.7.1-py2.py3-none-any.whl (10 kB)
+Collecting requests-oauthlib>=0.5.0
+  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
+Collecting isodate>=0.6.0
+  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
+Collecting certifi>=2017.4.17
+  Downloading certifi-2021.10.8-py2.py3-none-any.whl (149 kB)
+Collecting pyasn1>=0.1.1
+  Downloading pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
+Collecting charset-normalizer~=2.0.0
+  Downloading charset_normalizer-2.0.11-py3-none-any.whl (39 kB)
+Collecting idna<4,>=2.5
+  Downloading idna-3.3-py3-none-any.whl (61 kB)
+Collecting oauthlib>=3.0.0
+  Downloading oauthlib-3.2.0-py3-none-any.whl (151 kB)
+Collecting ruamel.yaml.clib>=0.1.2
+  Downloading ruamel.yaml.clib-0.2.6-cp37-cp37m-manylinux1_x86_64.whl (546 kB)
+Collecting jeepney>=0.6
+  Downloading jeepney-0.7.1-py3-none-any.whl (54 kB)
+Collecting backports.weakref
+  Downloading backports.weakref-1.0.post1-py2.py3-none-any.whl (5.2 kB)
+Collecting zipp>=0.5
+  Downloading zipp-3.7.0-py3-none-any.whl (5.3 kB)
+Collecting typing-extensions>=3.6.4
+  Downloading typing_extensions-4.0.1-py3-none-any.whl (22 kB)
+Installing collected packages: pycparser, urllib3, idna, charset-normalizer, cffi, certifi, six, requests, PyJWT, oauthlib, cryptography, requests-oauthlib, python-dateutil, isodate, zipp, typing-extensions, portalocker, msrest, msal, azure-core, adal, websocket-client, ruamel.yaml.clib, pyopenssl, pyasn1, msrestazure, msal-extensions, jeepney, importlib-metadata, distro, backports.weakref, azure-mgmt-core, azure-common, SecretStorage, ruamel.yaml, pytz, pathspec, numpy, ndg-httpsclient, jsonpickle, jmespath, dotnetcore2, docker, contextlib2, cloudpickle, backports.tempfile, azureml-dataprep-rslex, azureml-dataprep-native, azure-mgmt-storage, azure-mgmt-resource, azure-mgmt-keyvault, azure-mgmt-containerregistry, azure-mgmt-authorization, azure-identity, azure-graphrbac, pyarrow, azureml-dataprep, azureml-core, applicationinsights, azureml-telemetry, azureml-dataset-runtime, azureml-automl-core, azureml-train-automl-client
+Successfully installed PyJWT-2.3.0 SecretStorage-3.3.1 adal-1.2.7 applicationinsights-0.11.10 azure-common-1.1.27 azure-core-1.21.1 azure-graphrbac-0.61.1 azure-identity-1.4.1 azure-mgmt-authorization-0.61.0 azure-mgmt-containerregistry-9.0.0 azure-mgmt-core-1.3.0 azure-mgmt-keyvault-9.3.0 azure-mgmt-resource-13.0.0 azure-mgmt-storage-11.2.0 azureml-automl-core-1.33.1 azureml-core-1.33.0 azureml-dataprep-2.20.1 azureml-dataprep-native-38.0.0 azureml-dataprep-rslex-1.18.2 azureml-dataset-runtime-1.33.0 azureml-telemetry-1.33.0 azureml-train-automl-client-1.33.0 backports.tempfile-1.0 backports.weakref-1.0.post1 certifi-2021.10.8 cffi-1.15.0 charset-normalizer-2.0.11 cloudpickle-1.6.0 contextlib2-0.6.0.post1 cryptography-3.4.8 distro-1.6.0 docker-4.4.4 dotnetcore2-2.1.23 idna-3.3 importlib-metadata-4.10.1 isodate-0.6.1 jeepney-0.7.1 jmespath-0.10.0 jsonpickle-2.1.0 msal-1.16.0 msal-extensions-0.2.2 msrest-0.6.21 msrestazure-0.6.4 ndg-httpsclient-0.5.1 numpy-1.21.5 oauthlib-3.2.0 pathspec-0.9.0 portalocker-1.7.1 pyarrow-3.0.0 pyasn1-0.4.8 pycparser-2.21 pyopenssl-20.0.1 python-dateutil-2.8.2 pytz-2021.3 requests-2.27.1 requests-oauthlib-1.3.1 ruamel.yaml-0.17.4 ruamel.yaml.clib-0.2.6 six-1.16.0 typing-extensions-4.0.1 urllib3-1.26.5 websocket-client-1.2.3 zipp-3.7.0
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+WARNING: You are using pip version 21.2.4; however, version 22.0.2 is available.
+You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
+INFO[0040] Taking snapshot of full filesystem...        
+INFO[0059] RUN python -m pip install "mlrun[complete] @ git+https://github.com/mlrun/mlrun@development" 
+INFO[0059] cmd: /bin/sh                                 
+INFO[0059] args: [-c python -m pip install "mlrun[complete] @ git+https://github.com/mlrun/mlrun@development"] 
+INFO[0059] Running: [/bin/sh -c python -m pip install "mlrun[complete] @ git+https://github.com/mlrun/mlrun@development"] 
+Collecting mlrun[complete]@ git+https://github.com/mlrun/mlrun@development
+  Cloning https://github.com/mlrun/mlrun (to revision development) to /tmp/pip-install-yegk19ip/mlrun_b9fdf6ee5c8c4e2d9de3a37d0807b6c5
+  Running command git clone -q https://github.com/mlrun/mlrun /tmp/pip-install-yegk19ip/mlrun_b9fdf6ee5c8c4e2d9de3a37d0807b6c5
+  Resolved https://github.com/mlrun/mlrun to commit 832a07b11f3198b844d30b4a80db12a45c6e8948
+  Installing build dependencies: started
+  Installing build dependencies: finished with status 'done'
+  Getting requirements to build wheel: started
+  Getting requirements to build wheel: finished with status 'done'
+    Preparing wheel metadata: started
+    Preparing wheel metadata: finished with status 'done'
+Collecting pymysql~=1.0
+  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
+Requirement already satisfied: pyarrow<6,>=1 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.0.0)
+Collecting fsspec~=2021.8.1
+  Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)
+Collecting v3io~=0.5.13
+  Downloading v3io-0.5.15-py3-none-any.whl (49 kB)
+Collecting ipykernel~=5.0
+  Downloading ipykernel-5.5.6-py3-none-any.whl (121 kB)
+Collecting click~=7.0
+  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
+Collecting sqlalchemy~=1.3
+  Downloading SQLAlchemy-1.4.31-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
+Collecting dask~=2021.11.2
+  Downloading dask-2021.11.2-py3-none-any.whl (1.0 MB)
+Collecting distributed~=2021.11.2
+  Downloading distributed-2021.11.2-py3-none-any.whl (802 kB)
+Collecting chardet<4.0,>=3.0.2
+  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
+Collecting nuclio-jupyter~=0.8.22
+  Downloading nuclio_jupyter-0.8.22-py3-none-any.whl (49 kB)
+Collecting pydantic~=1.5
+  Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
+Collecting v3iofs~=0.1.7
+  Downloading v3iofs-0.1.10-py3-none-any.whl (13 kB)
+Collecting kfp~=1.8.0
+  Downloading kfp-1.8.11.tar.gz (298 kB)
+Collecting pandas~=1.2
+  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
+Collecting pyyaml~=5.1
+  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
+Collecting orjson<3.4,>=3
+  Downloading orjson-3.3.1-cp37-cp37m-manylinux2014_x86_64.whl (208 kB)
+Collecting inflection~=0.5.0
+  Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
+Collecting aiohttp~=3.8
+  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
+Collecting humanfriendly~=8.2
+  Downloading humanfriendly-8.2-py2.py3-none-any.whl (86 kB)
+Collecting ipython~=7.0
+  Downloading ipython-7.31.1-py3-none-any.whl (792 kB)
+Collecting fastapi~=0.67.0
+  Downloading fastapi-0.67.0-py3-none-any.whl (51 kB)
+Requirement already satisfied: requests~=2.22 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.27.1)
+Collecting tabulate~=0.8.6
+  Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)
+Collecting nest-asyncio~=1.0
+  Downloading nest_asyncio-1.5.4-py3-none-any.whl (5.1 kB)
+Collecting storey~=0.10.4
+  Downloading storey-0.10.4-py3-none-any.whl (116 kB)
+Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.26.5)
+Collecting GitPython~=3.0
+  Downloading GitPython-3.1.26-py3-none-any.whl (180 kB)
+Collecting mergedeep~=1.3
+  Downloading mergedeep-1.3.4-py3-none-any.whl (6.4 kB)
+Collecting semver~=2.13
+  Downloading semver-2.13.0-py2.py3-none-any.whl (12 kB)
+Collecting cryptography<3.4,~=3.0
+  Downloading cryptography-3.3.2-cp36-abi3-manylinux2010_x86_64.whl (2.6 MB)
+Collecting v3io-frames~=0.10.2
+  Downloading v3io_frames-0.10.2-py3-none-any.whl (35 kB)
+Collecting deepdiff~=5.0
+  Downloading deepdiff-5.7.0-py3-none-any.whl (68 kB)
+Requirement already satisfied: numpy<1.22.0,>=1.16.5 in /usr/local/lib/python3.7/site-packages (from mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.21.5)
+Collecting python-dotenv~=0.17.0
+  Downloading python_dotenv-0.17.1-py2.py3-none-any.whl (18 kB)
+Collecting alembic<1.6.0,~=1.4
+  Downloading alembic-1.5.8-py2.py3-none-any.whl (159 kB)
+Collecting kubernetes~=12.0
+  Downloading kubernetes-12.0.1-py2.py3-none-any.whl (1.7 MB)
+Collecting google-auth<2.0dev,>=1.25.0
+  Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
+Collecting azure-identity~=1.5
+  Downloading azure_identity-1.7.1-py2.py3-none-any.whl (129 kB)
+Collecting aiobotocore~=1.4.0
+  Downloading aiobotocore-1.4.2.tar.gz (52 kB)
+Collecting boto3<1.17.107,~=1.9
+  Downloading boto3-1.17.106-py2.py3-none-any.whl (131 kB)
+Collecting azure-keyvault-secrets~=4.2
+  Downloading azure_keyvault_secrets-4.3.0-py2.py3-none-any.whl (233 kB)
+Collecting s3fs~=2021.8.1
+  Downloading s3fs-2021.8.1-py3-none-any.whl (26 kB)
+Collecting plotly~=5.4
+  Downloading plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)
+Collecting botocore<1.20.107,>=1.20.106
+  Downloading botocore-1.20.106-py2.py3-none-any.whl (7.7 MB)
+Collecting adlfs~=2021.8.1
+  Downloading adlfs-2021.8.2.tar.gz (38 kB)
+Collecting gcsfs~=2021.8.1
+  Downloading gcsfs-2021.8.1-py2.py3-none-any.whl (23 kB)
+Collecting azure-storage-blob~=12.0
+  Downloading azure_storage_blob-12.9.0-py2.py3-none-any.whl (356 kB)
+Requirement already satisfied: azure-core>=1.7.0 in /usr/local/lib/python3.7/site-packages (from adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.21.1)
+Collecting azure-datalake-store<0.1,>=0.0.46
+  Downloading azure_datalake_store-0.0.52-py2.py3-none-any.whl (61 kB)
+Collecting wrapt>=1.10.10
+  Downloading wrapt-1.13.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (79 kB)
+Collecting aioitertools>=0.5.1
+  Downloading aioitertools-0.8.0-py3-none-any.whl (21 kB)
+Collecting frozenlist>=1.1.1
+  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
+Collecting yarl<2.0,>=1.0
+  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)
+Collecting multidict<7.0,>=4.5
+  Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)
+Collecting aiosignal>=1.1.2
+  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
+Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/site-packages (from aiohttp~=3.8->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (4.0.1)
+Collecting async-timeout<5.0,>=4.0.0a3
+  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
+Collecting asynctest==0.13.0
+  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
+Collecting attrs>=17.3.0
+  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
+Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/site-packages (from aiohttp~=3.8->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.0.11)
+Collecting Mako
+  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
+Collecting python-editor>=0.3
+  Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)
+Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/site-packages (from alembic<1.6.0,~=1.4->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.8.2)
+Requirement already satisfied: six>=1.11.0 in /usr/local/lib/python3.7/site-packages (from azure-core>=1.7.0->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.16.0)
+Requirement already satisfied: cffi in /usr/local/lib/python3.7/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.15.0)
+Requirement already satisfied: adal>=0.4.2 in /usr/local/lib/python3.7/site-packages (from azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.2.7)
+Requirement already satisfied: PyJWT<3,>=1.0.0 in /usr/local/lib/python3.7/site-packages (from adal>=0.4.2->azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.3.0)
+Requirement already satisfied: msal<2.0.0,>=1.12.0 in /usr/local/lib/python3.7/site-packages (from azure-identity~=1.5->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.16.0)
+Collecting msal-extensions~=0.3.0
+  Downloading msal_extensions-0.3.1-py2.py3-none-any.whl (18 kB)
+Requirement already satisfied: azure-common~=1.1 in /usr/local/lib/python3.7/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.1.27)
+Requirement already satisfied: msrest>=0.6.21 in /usr/local/lib/python3.7/site-packages (from azure-keyvault-secrets~=4.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.6.21)
+Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/site-packages (from boto3<1.17.107,~=1.9->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.10.0)
+Collecting s3transfer<0.5.0,>=0.4.0
+  Downloading s3transfer-0.4.2-py2.py3-none-any.whl (79 kB)
+Requirement already satisfied: pycparser in /usr/local/lib/python3.7/site-packages (from cffi->azure-datalake-store<0.1,>=0.0.46->adlfs~=2021.8.1->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2.21)
+Collecting partd>=0.3.10
+  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
+Collecting toolz>=0.8.2
+  Downloading toolz-0.11.2-py3-none-any.whl (55 kB)
+Requirement already satisfied: cloudpickle>=1.1.1 in /usr/local/lib/python3.7/site-packages (from dask~=2021.11.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.6.0)
+Collecting packaging>=20.0
+  Downloading packaging-21.3-py3-none-any.whl (40 kB)
+Collecting ordered-set==4.0.2
+  Downloading ordered-set-4.0.2.tar.gz (10 kB)
+Collecting tornado>=5
+  Downloading tornado-6.1-cp37-cp37m-manylinux2010_x86_64.whl (428 kB)
+Collecting msgpack>=0.6.0
+  Downloading msgpack-1.0.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)
+Collecting zict>=0.1.3
+  Downloading zict-2.0.0-py3-none-any.whl (10 kB)
+Requirement already satisfied: setuptools in /usr/local/lib/python3.7/site-packages (from distributed~=2021.11.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (53.0.0)
+Collecting psutil>=5.0
+  Downloading psutil-5.9.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
+Collecting jinja2
+  Downloading Jinja2-3.0.3-py3-none-any.whl (133 kB)
+Collecting tblib>=1.6.0
+  Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)
+Collecting sortedcontainers!=2.0.0,!=2.0.1
+  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
+Collecting starlette==0.14.2
+  Downloading starlette-0.14.2-py3-none-any.whl (60 kB)
+Collecting decorator
+  Downloading decorator-5.1.1-py3-none-any.whl (9.1 kB)
+Collecting google-auth-oauthlib
+  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
+Collecting gitdb<5,>=4.0.1
+  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
+Collecting smmap<6,>=3.0.1
+  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
+Collecting pyasn1-modules>=0.2.1
+  Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
+Collecting rsa<5,>=3.1.4
+  Downloading rsa-4.8-py3-none-any.whl (39 kB)
+Collecting cachetools<5.0,>=2.0.0
+  Downloading cachetools-4.2.4-py3-none-any.whl (10 kB)
+Collecting jupyter-client
+  Downloading jupyter_client-7.1.2-py3-none-any.whl (130 kB)
+Collecting ipython-genutils
+  Downloading ipython_genutils-0.2.0-py2.py3-none-any.whl (26 kB)
+Collecting traitlets>=4.1.0
+  Downloading traitlets-5.1.1-py3-none-any.whl (102 kB)
+Collecting prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0
+  Downloading prompt_toolkit-3.0.26-py3-none-any.whl (375 kB)
+Collecting jedi>=0.16
+  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
+Collecting matplotlib-inline
+  Downloading matplotlib_inline-0.1.3-py3-none-any.whl (8.2 kB)
+Collecting pickleshare
+  Downloading pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)
+Collecting pygments
+  Downloading Pygments-2.11.2-py3-none-any.whl (1.1 MB)
+Collecting backcall
+  Downloading backcall-0.2.0-py2.py3-none-any.whl (11 kB)
+Collecting pexpect>4.3
+  Downloading pexpect-4.8.0-py2.py3-none-any.whl (59 kB)
+Collecting parso<0.9.0,>=0.8.0
+  Downloading parso-0.8.3-py2.py3-none-any.whl (100 kB)
+Collecting absl-py<2,>=0.9
+  Downloading absl_py-1.0.0-py3-none-any.whl (126 kB)
+Collecting google-cloud-storage<2,>=1.20.0
+  Downloading google_cloud_storage-1.44.0-py2.py3-none-any.whl (106 kB)
+Collecting google-api-python-client<2,>=1.7.8
+  Downloading google_api_python_client-1.12.10-py2.py3-none-any.whl (61 kB)
+Collecting requests-toolbelt<1,>=0.8.0
+  Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)
+Collecting cloudpickle>=1.1.1
+  Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
+Collecting kfp-server-api<2.0.0,>=1.1.2
+  Downloading kfp-server-api-1.7.1.tar.gz (52 kB)
+Collecting jsonschema<4,>=3.0.1
+  Downloading jsonschema-3.2.0-py2.py3-none-any.whl (56 kB)
+Collecting Deprecated<2,>=1.2.7
+  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
+Collecting strip-hints<1,>=0.1.8
+  Downloading strip-hints-0.1.10.tar.gz (29 kB)
+Collecting docstring-parser<1,>=0.7.3
+  Downloading docstring_parser-0.13.tar.gz (23 kB)
+  Installing build dependencies: started
+  Installing build dependencies: finished with status 'done'
+  Getting requirements to build wheel: started
+  Getting requirements to build wheel: finished with status 'done'
+    Preparing wheel metadata: started
+    Preparing wheel metadata: finished with status 'done'
+Collecting kfp-pipeline-spec<0.2.0,>=0.1.13
+  Downloading kfp_pipeline_spec-0.1.13-py3-none-any.whl (18 kB)
+Collecting fire<1,>=0.3.1
+  Downloading fire-0.4.0.tar.gz (87 kB)
+Collecting protobuf<4,>=3.13.0
+  Downloading protobuf-3.19.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
+Collecting uritemplate<4,>=3.0.1
+  Downloading uritemplate-3.0.1-py2.py3-none-any.whl (15 kB)
+Collecting typer<1.0,>=0.3.2
+  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
+Collecting typing-extensions>=3.7.4
+  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
+Collecting termcolor
+  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
+Collecting httplib2<1dev,>=0.15.0
+  Downloading httplib2-0.20.2-py3-none-any.whl (96 kB)
+Collecting google-api-core<3dev,>=1.21.0
+  Downloading google_api_core-2.4.0-py2.py3-none-any.whl (111 kB)
+Collecting google-auth-httplib2>=0.0.3
+  Downloading google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)
+Collecting googleapis-common-protos<2.0dev,>=1.52.0
+  Downloading googleapis_common_protos-1.54.0-py2.py3-none-any.whl (207 kB)
+Collecting google-cloud-core<3.0dev,>=1.6.0
+  Downloading google_cloud_core-2.2.2-py2.py3-none-any.whl (29 kB)
+Collecting google-resumable-media<3.0dev,>=1.3.0
+  Downloading google_resumable_media-2.1.0-py2.py3-none-any.whl (75 kB)
+Collecting google-crc32c<2.0dev,>=1.0
+  Downloading google_crc32c-1.3.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38 kB)
+Collecting pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2
+  Downloading pyparsing-3.0.7-py3-none-any.whl (98 kB)
+Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/site-packages (from jsonschema<4,>=3.0.1->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (4.10.1)
+Collecting pyrsistent>=0.14.0
+  Downloading pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (117 kB)
+Requirement already satisfied: certifi in /usr/local/lib/python3.7/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2021.10.8)
+Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.7/site-packages (from kubernetes~=12.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.3.1)
+Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.7/site-packages (from kubernetes~=12.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.2.3)
+Requirement already satisfied: portalocker<3,>=1.0 in /usr/local/lib/python3.7/site-packages (from msal-extensions~=0.3.0->azure-identity~=1.5->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (1.7.1)
+Requirement already satisfied: isodate>=0.6.0 in /usr/local/lib/python3.7/site-packages (from msrest>=0.6.21->azure-keyvault-secrets~=4.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.6.1)
+Collecting notebook>=5.2.0
+  Downloading notebook-6.4.8-py3-none-any.whl (9.9 MB)
+Collecting nbconvert>=5.4
+  Downloading nbconvert-6.4.1-py3-none-any.whl (557 kB)
+Collecting nbformat>=4.4
+  Downloading nbformat-5.1.3-py3-none-any.whl (178 kB)
+Collecting mistune<2,>=0.8.1
+  Downloading mistune-0.8.4-py2.py3-none-any.whl (16 kB)
+Collecting defusedxml
+  Downloading defusedxml-0.7.1-py2.py3-none-any.whl (25 kB)
+Collecting nbclient<0.6.0,>=0.5.0
+  Downloading nbclient-0.5.10-py3-none-any.whl (69 kB)
+Collecting jupyter-core
+  Downloading jupyter_core-4.9.1-py3-none-any.whl (86 kB)
+Collecting testpath
+  Downloading testpath-0.5.0-py3-none-any.whl (84 kB)
+Collecting pandocfilters>=1.4.1
+  Downloading pandocfilters-1.5.0-py2.py3-none-any.whl (8.7 kB)
+Collecting bleach
+  Downloading bleach-4.1.0-py2.py3-none-any.whl (157 kB)
+Collecting jupyterlab-pygments
+  Downloading jupyterlab_pygments-0.1.2-py2.py3-none-any.whl (4.6 kB)
+Collecting entrypoints>=0.2.2
+  Downloading entrypoints-0.3-py2.py3-none-any.whl (11 kB)
+Collecting MarkupSafe>=2.0
+  Downloading MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)
+Collecting pyzmq>=13
+  Downloading pyzmq-22.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.1 MB)
+Collecting terminado>=0.8.3
+  Downloading terminado-0.13.1-py3-none-any.whl (14 kB)
+Collecting argon2-cffi
+  Downloading argon2_cffi-21.3.0-py3-none-any.whl (14 kB)
+Collecting Send2Trash>=1.8.0
+  Downloading Send2Trash-1.8.0-py3-none-any.whl (18 kB)
+Collecting prometheus-client
+  Downloading prometheus_client-0.13.1-py3-none-any.whl (57 kB)
+Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/site-packages (from pandas~=1.2->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (2021.3)
+Collecting locket
+  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
+Collecting ptyprocess>=0.5
+  Downloading ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB)
+Collecting tenacity>=6.2.0
+  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
+Collecting wcwidth
+  Downloading wcwidth-0.2.5-py2.py3-none-any.whl (30 kB)
+Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.4.8)
+Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/site-packages (from requests~=2.22->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.3)
+Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/site-packages (from requests-oauthlib->kubernetes~=12.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.2.0)
+Collecting greenlet!=0.4.17
+  Downloading greenlet-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (150 kB)
+Collecting grpcio-tools<1.42,>1.34.0
+  Downloading grpcio_tools-1.41.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
+Collecting grpcio<1.42,>1.34.0
+  Downloading grpcio-1.41.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
+Requirement already satisfied: wheel in /usr/local/lib/python3.7/site-packages (from strip-hints<1,>=0.1.8->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (0.36.2)
+Collecting ujson>=3.0.0
+  Downloading ujson-5.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43 kB)
+Collecting future>=0.18.2
+  Downloading future-0.18.2.tar.gz (829 kB)
+Collecting heapdict
+  Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)
+Collecting argon2-cffi-bindings
+  Downloading argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (86 kB)
+Collecting webencodings
+  Downloading webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
+Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/site-packages (from importlib-metadata->jsonschema<4,>=3.0.1->kfp~=1.8.0->mlrun[complete]@ git+https://github.com/mlrun/mlrun@development) (3.7.0)
+Building wheels for collected packages: adlfs, aiobotocore, ordered-set, kfp, docstring-parser, fire, kfp-server-api, strip-hints, future, mlrun, termcolor
+  Building wheel for adlfs (setup.py): started
+  Building wheel for adlfs (setup.py): finished with status 'done'
+  Created wheel for adlfs: filename=adlfs-2021.8.2-py3-none-any.whl size=21466 sha256=d3076fc23e1d05f49958ba450b9913a80c23c755d3347327d1d2150656fa3185
+  Stored in directory: /root/.cache/pip/wheels/0d/88/1d/e06072abb7fb4d59b5cf94e194e53017dfa2dc47af4dec88b7
+  Building wheel for aiobotocore (setup.py): started
+  Building wheel for aiobotocore (setup.py): finished with status 'done'
+  Created wheel for aiobotocore: filename=aiobotocore-1.4.2-py3-none-any.whl size=49910 sha256=d630bfe25d72229a76e207cb2de8dd29839368673c27edc12229b314660a7e69
+  Stored in directory: /root/.cache/pip/wheels/33/e7/d9/b297a9aa9c43d56bc2463e6e2771655ff638f30b30f0b61fcb
+  Building wheel for ordered-set (setup.py): started
+  Building wheel for ordered-set (setup.py): finished with status 'done'
+  Created wheel for ordered-set: filename=ordered_set-4.0.2-py2.py3-none-any.whl size=8210 sha256=d0a6fcb9c69107866ca516cb90c3a6a1c0dd00c2f55e981d69b476e92fe85c0a
+  Stored in directory: /root/.cache/pip/wheels/73/2b/f6/26e9f84153c25050fe7c09e88f8e32a6be3c7034a38c418319
+  Building wheel for kfp (setup.py): started
+  Building wheel for kfp (setup.py): finished with status 'done'
+  Created wheel for kfp: filename=kfp-1.8.11-py3-none-any.whl size=414450 sha256=62bc86dbc4fbb6d431756182a297f87d5d9f08edfb8c2bab347b81fe0654cad3
+  Stored in directory: /root/.cache/pip/wheels/85/1e/ee/a14b49663bddf9e72d1c269cbe53970167bfabb53cadbbea3a
+  Building wheel for docstring-parser (PEP 517): started
+  Building wheel for docstring-parser (PEP 517): finished with status 'done'
+  Created wheel for docstring-parser: filename=docstring_parser-0.13-py3-none-any.whl size=31866 sha256=6ab7172ddcc24d27d93d31af6438a00053464e76585907654ec90dfb3ecf1886
+  Stored in directory: /root/.cache/pip/wheels/bd/88/3c/d1aa049309f7945178cac9fbe6561a86424f432da57c18ca0f
+  Building wheel for fire (setup.py): started
+  Building wheel for fire (setup.py): finished with status 'done'
+  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115928 sha256=6704c4bed4908d06fbe2e61f97718607c4ef9dee5a573d09acf9882e6e620757
+  Stored in directory: /root/.cache/pip/wheels/8a/67/fb/2e8a12fa16661b9d5af1f654bd199366799740a85c64981226
+  Building wheel for kfp-server-api (setup.py): started
+  Building wheel for kfp-server-api (setup.py): finished with status 'done'
+  Created wheel for kfp-server-api: filename=kfp_server_api-1.7.1-py3-none-any.whl size=92618 sha256=b156a487ea471572b7c0d0dc85826bb5f6554ce5d097c3d16cdd75e36a93100a
+  Stored in directory: /root/.cache/pip/wheels/68/3f/d5/734c0278dd6c8969cef359edcf059505a61452c5eb0e2760e1
+  Building wheel for strip-hints (setup.py): started
+  Building wheel for strip-hints (setup.py): finished with status 'done'
+  Created wheel for strip-hints: filename=strip_hints-0.1.10-py2.py3-none-any.whl size=22279 sha256=ebdf8455bb18636c3ffe99dc1ffe7262298094e000e46a15df3ef1b809e3770f
+  Stored in directory: /root/.cache/pip/wheels/5e/14/c3/6e44e9b2545f2d570b03f5b6d38c00b7534aa8abb376978363
+  Building wheel for future (setup.py): started
+  Building wheel for future (setup.py): finished with status 'done'
+  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491059 sha256=b71c260b8cae9faa06e701eb03743481d68f427d7ed0886bddf8eec6fab17927
+  Stored in directory: /root/.cache/pip/wheels/56/b0/fe/4410d17b32f1f0c3cf54cdfb2bc04d7b4b8f4ae377e2229ba0
+  Building wheel for mlrun (PEP 517): started
+  Building wheel for mlrun (PEP 517): finished with status 'done'
+  Created wheel for mlrun: filename=mlrun-0.0.0+unstable-py3-none-any.whl size=799835 sha256=a37564dbf60ba19531146c0404fa02216cea4492894cbafd9738bb199ce45775
+  Stored in directory: /tmp/pip-ephem-wheel-cache-xz3ex0pi/wheels/cd/42/82/13965317128ea26acc3fb21b24cc254077454998599db6f161
+  Building wheel for termcolor (setup.py): started
+  Building wheel for termcolor (setup.py): finished with status 'done'
+  Created wheel for termcolor: filename=termcolor-1.1.0-py3-none-any.whl size=4829 sha256=09ed568b0b6ea586b107bc3decf5a736c95331052246548b5a560c69a88e9414
+  Stored in directory: /root/.cache/pip/wheels/3f/e3/ec/8a8336ff196023622fbcb36de0c5a5c218cbb24111d1d4c7f2
+Successfully built adlfs aiobotocore ordered-set kfp docstring-parser fire kfp-server-api strip-hints future mlrun termcolor
+Installing collected packages: typing-extensions, traitlets, pyrsistent, attrs, wcwidth, tornado, rsa, pyzmq, pyparsing, pyasn1-modules, ptyprocess, protobuf, parso, nest-asyncio, jupyter-core, jsonschema, ipython-genutils, entrypoints, cachetools, webencodings, pygments, prompt-toolkit, pickleshare, pexpect, packaging, nbformat, matplotlib-inline, MarkupSafe, jupyter-client, jedi, googleapis-common-protos, google-auth, decorator, cryptography, backcall, ujson, toolz, testpath, pandocfilters, nbclient, multidict, mistune, locket, jupyterlab-pygments, jinja2, ipython, httplib2, grpcio, google-crc32c, google-api-core, future, frozenlist, defusedxml, botocore, bleach, argon2-cffi-bindings, yarl, wrapt, v3io, uritemplate, terminado, termcolor, smmap, Send2Trash, s3transfer, pyyaml, prometheus-client, partd, pandas, nbconvert, ipykernel, heapdict, grpcio-tools, greenlet, google-resumable-media, google-cloud-core, google-auth-httplib2, fsspec, cloudpickle, click, asynctest, async-timeout, argon2-cffi, aiosignal, zict, v3iofs, v3io-frames, typer, tblib, tabulate, strip-hints, starlette, sqlalchemy, sortedcontainers, requests-toolbelt, python-editor, pydantic, psutil, ordered-set, notebook, msgpack, msal-extensions, Mako, kubernetes, kfp-server-api, kfp-pipeline-spec, google-cloud-storage, google-api-python-client, gitdb, fire, docstring-parser, Deprecated, dask, boto3, aioitertools, aiohttp, absl-py, tenacity, storey, semver, python-dotenv, pymysql, orjson, nuclio-jupyter, mergedeep, kfp, inflection, humanfriendly, google-auth-oauthlib, GitPython, fastapi, distributed, deepdiff, chardet, azure-storage-blob, azure-identity, azure-datalake-store, alembic, aiobotocore, s3fs, plotly, mlrun, gcsfs, azure-keyvault-secrets, adlfs
+  Attempting uninstall: typing-extensions
+    Found existing installation: typing-extensions 4.0.1
+    Uninstalling typing-extensions-4.0.1:
+      Successfully uninstalled typing-extensions-4.0.1
+  Attempting uninstall: cryptography
+    Found existing installation: cryptography 3.4.8
+    Uninstalling cryptography-3.4.8:
+      Successfully uninstalled cryptography-3.4.8
+  Attempting uninstall: cloudpickle
+    Found existing installation: cloudpickle 1.6.0
+    Uninstalling cloudpickle-1.6.0:
+      Successfully uninstalled cloudpickle-1.6.0
+  Attempting uninstall: msal-extensions
+    Found existing installation: msal-extensions 0.2.2
+    Uninstalling msal-extensions-0.2.2:
+      Successfully uninstalled msal-extensions-0.2.2
+  Attempting uninstall: azure-identity
+    Found existing installation: azure-identity 1.4.1
+    Uninstalling azure-identity-1.4.1:
+      Successfully uninstalled azure-identity-1.4.1
+ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
+azureml-dataprep 2.20.1 requires azure-identity<1.5.0,>=1.2.0, but you have azure-identity 1.7.1 which is incompatible.
+azureml-dataprep 2.20.1 requires cloudpickle<2.0.0,>=1.1.0, but you have cloudpickle 2.0.0 which is incompatible.
+Successfully installed Deprecated-1.2.13 GitPython-3.1.26 Mako-1.1.6 MarkupSafe-2.0.1 Send2Trash-1.8.0 absl-py-1.0.0 adlfs-2021.8.2 aiobotocore-1.4.2 aiohttp-3.8.1 aioitertools-0.8.0 aiosignal-1.2.0 alembic-1.5.8 argon2-cffi-21.3.0 argon2-cffi-bindings-21.2.0 async-timeout-4.0.2 asynctest-0.13.0 attrs-21.4.0 azure-datalake-store-0.0.52 azure-identity-1.7.1 azure-keyvault-secrets-4.3.0 azure-storage-blob-12.9.0 backcall-0.2.0 bleach-4.1.0 boto3-1.17.106 botocore-1.20.106 cachetools-4.2.4 chardet-3.0.4 click-7.1.2 cloudpickle-2.0.0 cryptography-3.3.2 dask-2021.11.2 decorator-5.1.1 deepdiff-5.7.0 defusedxml-0.7.1 distributed-2021.11.2 docstring-parser-0.13 entrypoints-0.3 fastapi-0.67.0 fire-0.4.0 frozenlist-1.3.0 fsspec-2021.8.1 future-0.18.2 gcsfs-2021.8.1 gitdb-4.0.9 google-api-core-2.4.0 google-api-python-client-1.12.10 google-auth-1.35.0 google-auth-httplib2-0.1.0 google-auth-oauthlib-0.4.6 google-cloud-core-2.2.2 google-cloud-storage-1.44.0 google-crc32c-1.3.0 google-resumable-media-2.1.0 googleapis-common-protos-1.54.0 greenlet-1.1.2 grpcio-1.41.1 grpcio-tools-1.41.1 heapdict-1.0.1 httplib2-0.20.2 humanfriendly-8.2 inflection-0.5.1 ipykernel-5.5.6 ipython-7.31.1 ipython-genutils-0.2.0 jedi-0.18.1 jinja2-3.0.3 jsonschema-3.2.0 jupyter-client-7.1.2 jupyter-core-4.9.1 jupyterlab-pygments-0.1.2 kfp-1.8.11 kfp-pipeline-spec-0.1.13 kfp-server-api-1.7.1 kubernetes-12.0.1 locket-0.2.1 matplotlib-inline-0.1.3 mergedeep-1.3.4 mistune-0.8.4 mlrun-0.0.0+unstable msal-extensions-0.3.1 msgpack-1.0.3 multidict-6.0.2 nbclient-0.5.10 nbconvert-6.4.1 nbformat-5.1.3 nest-asyncio-1.5.4 notebook-6.4.8 nuclio-jupyter-0.8.22 ordered-set-4.0.2 orjson-3.3.1 packaging-21.3 pandas-1.3.5 pandocfilters-1.5.0 parso-0.8.3 partd-1.2.0 pexpect-4.8.0 pickleshare-0.7.5 plotly-5.5.0 prometheus-client-0.13.1 prompt-toolkit-3.0.26 protobuf-3.19.4 psutil-5.9.0 ptyprocess-0.7.0 pyasn1-modules-0.2.8 pydantic-1.9.0 pygments-2.11.2 pymysql-1.0.2 pyparsing-3.0.7 pyrsistent-0.18.1 python-dotenv-0.17.1 
python-editor-1.0.4 pyyaml-5.4.1 pyzmq-22.3.0 requests-toolbelt-0.9.1 rsa-4.8 s3fs-2021.8.1 s3transfer-0.4.2 semver-2.13.0 smmap-5.0.0 sortedcontainers-2.4.0 sqlalchemy-1.4.31 starlette-0.14.2 storey-0.10.4 strip-hints-0.1.10 tabulate-0.8.9 tblib-1.7.0 tenacity-8.0.1 termcolor-1.1.0 terminado-0.13.1 testpath-0.5.0 toolz-0.11.2 tornado-6.1 traitlets-5.1.1 typer-0.4.0 typing-extensions-3.10.0.2 ujson-5.1.0 uritemplate-3.0.1 v3io-0.5.15 v3io-frames-0.10.2 v3iofs-0.1.10 wcwidth-0.2.5 webencodings-0.5.1 wrapt-1.13.3 yarl-1.7.2 zict-2.0.0
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+WARNING: You are using pip version 21.2.4; however, version 22.0.2 is available.
+You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
+INFO[0128] Taking snapshot of full filesystem...        
+INFO[0148] Pushing image to docker-registry.default-tenant.app.yh41.iguazio-cd1.com/mlrun/func-azureml-yonatan-azureml-utils:latest 
+INFO[0153] Pushed image to 1 destinations               
+> 2022-02-02 18:30:56,789 [info] starting run azureml-utils-train uid=48dbbe26a2a34b5baaec5ca8aba3de5e DB=http://mlrun-api:8080
+> 2022-02-02 18:30:56,988 [info] Job is running in the background, pod: azureml-utils-train-7pp86
+> 2022-02-02 18:31:30,311 [warning] Failed resolving version info. Ignoring and using defaults
+> 2022-02-02 18:31:32,893 [warning] Server or client version is unstable. Assuming compatible: {'server_version': '0.0.0+unstable', 'client_version': '0.0.0+unstable'}
+Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cloudpickle 2.0.0 (/usr/local/lib/python3.7/site-packages), Requirement.parse('cloudpickle<2.0.0,>=1.1.0'), {'azureml-dataprep'}).
+> 2022-02-02 18:31:34,680 [info] Loading AzureML Workspace
+> 2022-02-02 18:31:36,956 [info] Initializing AzureML experiment azure-automl-test
+> 2022-02-02 18:31:40,005 [info] Initializing AzureML compute target azureml-cpu
+> 2022-02-02 18:31:40,206 [info] Found existing cluster, will use it.
+Succeeded
+AmlCompute wait for completion finished
+
+Minimum number of nodes requested have been provisioned
+> 2022-02-02 18:31:40,322 [info] Connecting to AzureML experiment default datastore
+> 2022-02-02 18:31:41,624 [info] Retrieving feature vector and uploading to Azure blob storage: az://azureml-blobstore-27f8977b-4946-4ca0-bdc5-5a685d2fe8d7/iris.csv
+> 2022-02-02 18:31:41,912 [info] Registering dataset iris in Azure ML
+> 2022-02-02 18:31:41,912 [info] OpenSSL version must be 1.1. Overriding the OpenSSL version to 1.1
+> 2022-02-02 18:31:49,558 [info] Setting up experiment parameters
+> 2022-02-02 18:31:49,812 [info] Submitting and running experiment
+Submitting remote run.
+Parent Run ID: AutoML_35c51a81-98fd-44fb-aa23-3192c3aca08d
+https://ml.azure.com/runs/AutoML_35c51a81-98fd-44fb-aa23-3192c3aca08d?wsid=/subscriptions/8d81bc0b-6abd-4395-be83-000251d9fdbe/resourcegroups/nick/workspaces/NickAzureML&tid=af053911-a8b7-450d-9f58-0c08567d4769
+
+Current status: ModelSelection. Beginning model selection.
+
+****************************************************************************************************
+DATA GUARDRAILS: 
+
+TYPE:         Class balancing detection
+STATUS:       PASSED
+DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
+              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
+
+****************************************************************************************************
+
+****************************************************************************************************
+ITERATION: The iteration being evaluated.
+PIPELINE: A summary description of the pipeline being evaluated.
+DURATION: Time taken for the current iteration.
+METRIC: The result of computing score on the fitted pipeline.
+BEST: The best observed score thus far.
+****************************************************************************************************
+
+ ITERATION   PIPELINE                                       DURATION      METRIC      BEST
+         0   RobustScaler LogisticRegression                0:00:21       0.9667    0.9667
+         1   StandardScalerWrapper SVM                      0:00:18       0.9533    0.9667
+         2   StandardScalerWrapper LogisticRegression       0:00:22       0.9733    0.9733
+         3   MaxAbsScaler LogisticRegression                0:00:22       0.9533    0.9733
+         4   MaxAbsScaler LogisticRegression                0:00:21       0.9733    0.9733
+
+********************************************************************************************
+
+> 2022-02-02 18:38:28,144 [info] Registering model
+> 2022-02-02 18:38:29,495 [info] Registered model with name 'iris-model', id 'iris-model:178', version '178'
+> 2022-02-02 18:38:29,495 [info] Downloading model iris-model:178
+> 2022-02-02 18:38:34,083 [info] Logging model_1_standardscaler_logisticregression model to MLRun
+> 2022-02-02 18:38:34,621 [info] Registering model
+> 2022-02-02 18:38:35,519 [info] Registered model with name 'iris-model', id 'iris-model:179', version '179'
+> 2022-02-02 18:38:35,519 [info] Downloading model iris-model:179
+> 2022-02-02 18:38:39,972 [info] Logging model_2_maxabsscaler_logisticregression model to MLRun
+> 2022-02-02 18:38:40,317 [info] Registering model
+> 2022-02-02 18:38:41,087 [info] Registered model with name 'iris-model', id 'iris-model:180', version '180'
+> 2022-02-02 18:38:41,087 [info] Downloading model iris-model:180
+> 2022-02-02 18:38:46,299 [info] Logging model_3_robustscaler_logisticregression model to MLRun
+> 2022-02-02 18:38:47,615 [info] run executed, status=completed
+final state: completed
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
azureml-yonatan0Feb 02 18:31:32completedazureml-utils-train
v3io_user=yonatan
kind=job
owner=yonatan
host=azureml-utils-train-7pp86
dataset
experiment_name=azure-automl-test
cpu_cluster_name=azureml-cpu
dataset_name=iris
dataset_description=iris training data
label_column_name=label
create_new_version=True
register_model_name=iris-model
save_n_models=3
automl_settings={'task': 'classification', 'debug_log': 'automl_errors.log', 'enable_early_stopping': False, 'allowed_models': ['LogisticRegression', 'SGD', 'SVM'], 'iterations': 5, 'iteration_timeout_minutes': 2, 'max_concurrent_iterations': 2, 'max_cores_per_iteration': -1, 'n_cross_validations': 5, 'primary_metric': 'accuracy', 'featurization': 'off', 'model_explainability': False, 'enable_voting_ensemble': False, 'enable_stack_ensemble': False}
dataset_blob_path=az://azureml-blobstore-27f8977b-4946-4ca0-bdc5-5a685d2fe8d7/iris.csv
best_iteration=1
auc_macro=0.9973298059964726
auc_micro=0.9979999999999999
norm_macro_recall=0.9594444444444443
balanced_accuracy=0.9729629629629629
f1_score_macro=0.9721779225097302
weighted_accuracy=0.9739694654594448
average_precision_score_weighted=0.9953861693861693
f1_score_weighted=0.9730901151988108
precision_score_micro=0.9733333333333334
matthews_correlation=0.9613232982405628
recall_score_macro=0.9729629629629629
precision_score_weighted=0.9767380952380952
recall_score_micro=0.9733333333333334
precision_score_macro=0.9754761904761905
average_precision_score_macro=0.9954059829059829
accuracy=0.9733333333333334
auc_weighted=0.9972857142857142
recall_score_weighted=0.9733333333333334
f1_score_micro=0.9733333333333334
average_precision_score_micro=0.9962096994520057
log_loss=0.07548089806904337
model
iteration_results
parallel_coordinates
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-02-02 18:38:48,608 [info] run executed, status=completed
+
+
+
+
+

View the run result: (more details in the UI)

+
+
+
azureml_run.artifact('iteration_results').show()
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
stateiterparam.data_trans_class_nameparam.data_trans_moduleparam.data_trans_spec_classparam.train_class_nameparam.train_moduleparam.train_param_kwargs_Cparam.train_param_kwargs_class_weightparam.train_spec_class...output.precision_score_weightedoutput.recall_score_microoutput.precision_score_macrooutput.average_precision_score_macrooutput.accuracyoutput.auc_weightedoutput.recall_score_weightedoutput.f1_score_microoutput.average_precision_score_microoutput.log_loss
0completed1StandardScalersklearn.preprocessingpreprocLogisticRegressionsklearn.linear_model16.768329NaNsklearn...0.9767380.9733330.9754760.9954060.9733330.9972860.9733330.9733330.9962100.075481
1completed2MaxAbsScalersklearn.preprocessingpreprocLogisticRegressionsklearn.linear_model719.685673NaNsklearn...0.9767380.9733330.9754760.9952270.9733330.9972860.9733330.9733330.9962110.072493
2completed3RobustScalersklearn.preprocessingpreprocLogisticRegressionsklearn.linear_model1048.113134balancedsklearn...0.9683590.9666670.9664100.9942170.9666670.9965980.9666670.9666670.9955950.086160
+

3 rows × 31 columns

+
+
+
+
+
+

4. Deploy Model Serving#

+
+
+
# Creating a new serving function:
+serving_fn = mlrun.new_function("serving", kind="serving", image="yhaviv/mlrun:dev")
+serving_fn.with_code(body=" ")
+serving_fn.with_requirements("./requirements.txt")
+
+# Set the real-time pipeline topology
+serving_fn.set_topology(
+    'router',
+    'mlrun.serving.routers.VotingEnsemble'
+)
+
+# Add the trained models:
+artifacts = mlrun.get_run_db().list_artifacts(project=project.name)
+models = {f"{model['algorithm']}{i}" :f"{model['db_key']}#{model['iter']}"
+          for i, model in enumerate(artifacts) if model["kind"]=="model"}
+
+for name, path in models.items():
+    serving_fn.add_model(
+        name,
+        class_name="mlrun.frameworks.sklearn.PickleModelServer",
+        model_path=project.get_artifact_uri(path))
+
+serving_fn.spec.graph.plot()
+
+
+
+
+_images/c0f81135bd698ac922e9ba61cadf04c6e39974dbd5ac9a1cf873894bf193d023.svg +
+
+

Building and Deploying the Serving Function

+
+
+
function_address = serving_fn.deploy()
+
+
+
+
+
> 2022-02-02 18:38:48,785 [info] Starting remote function deploy
+
+
+
+
+
+
+

5. Using the Live Model-Serving Function#

+
+
+
print (f'The address for the function is {function_address} \n')
+
+!curl $function_address
+
+
+
+
+
+
+
# Data for testing:
+source_df = mlrun.get_dataitem(DATA_URL).as_df()
+test_vector = source_df.sample(5).drop('label', axis=1).values.tolist()
+test_vector
+
+
+
+
+

After deploying the serving function with the required model, we can make predictions:

+
+
+
serving_fn.invoke(f'/v2/models/infer', {"inputs": test_vector})
+
+
+
+
+
+
+

6. Clean up#

+

To clean up AzureML resources, see: +https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-auto-train-models#clean-up-resources

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/static/function.html b/functions/master/azureml_utils/1.4.0/static/function.html new file mode 100644 index 00000000..3aaa82f5 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/static/function.html @@ -0,0 +1,282 @@ + + + + + + + + + + + Source + + + + +
+        
+verbose: false
+spec:
+  command: ''
+  build:
+    auto_build: true
+    code_origin: ''
+    with_mlrun: true
+    requirements:
+    - azureml-core==1.54.0.post1
+    - azureml-train-automl-client==1.54.0.post1
+    - plotly~=5.4
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IG9zCmltcG9ydCBqc29uCmltcG9ydCBsb2dnaW5nCmZyb20gdHlwaW5nIGltcG9ydCBUdXBsZSwgTGlzdAoKZnJvbSBtbHJ1biBpbXBvcnQgTUxDbGllbnRDdHgsIERhdGFJdGVtLCBnZXRfZGF0YWl0ZW0KaW1wb3J0IG1scnVuLmZlYXR1cmVfc3RvcmUgYXMgZl9zdG9yZQppbXBvcnQgbWxydW4uZGF0YXN0b3JlCmltcG9ydCBtbHJ1bi51dGlscwpmcm9tIG1scnVuLmRhdGFzdG9yZS50YXJnZXRzIGltcG9ydCBQYXJxdWV0VGFyZ2V0Cgpmcm9tIGF6dXJlbWwuY29yZS5hdXRoZW50aWNhdGlvbiBpbXBvcnQgU2VydmljZVByaW5jaXBhbEF1dGhlbnRpY2F0aW9uCmZyb20gYXp1cmVtbC5jb3JlLndvcmtzcGFjZSBpbXBvcnQgV29ya3NwYWNlCmZyb20gYXp1cmVtbC5jb3JlLmV4cGVyaW1lbnQgaW1wb3J0IEV4cGVyaW1lbnQKZnJvbSBhenVyZW1sLmNvcmUuZGF0YXNldCBpbXBvcnQgRGF0YXNldApmcm9tIGF6dXJlbWwuY29yZS5tb2RlbCBpbXBvcnQgTW9kZWwKZnJvbSBhenVyZW1sLmNvcmUuY29tcHV0ZSBpbXBvcnQgQ29tcHV0ZVRhcmdldCwgQW1sQ29tcHV0ZQpmcm9tIGF6dXJlbWwuY29yZS5jb21wdXRlX3RhcmdldCBpbXBvcnQgQ29tcHV0ZVRhcmdldEV4Y2VwdGlvbgpmcm9tIGF6dXJlbWwuY29yZS5zY3JpcHRfcnVuIGltcG9ydCBTY3JpcHRSdW4KCmZyb20gYXp1cmVtbC50cmFpbi5hdXRvbWwgaW1wb3J0IEF1dG9NTENvbmZpZwpmcm9tIGF6dXJlbWwudHJhaW4uYXV0b21sLnJ1biBpbXBvcnQgQXV0b01MUnVuCgoKZGVmIF9lbnZfb3Jfc2VjcmV0KGNvbnRleHQsIGtleSk6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbjoKICAgICAgICByZXR1cm4gb3MuZW52aXJvbltrZXldCiAgICByZXR1cm4gY29udGV4dC5nZXRfc2VjcmV0KGtleSkKCgpkZWYgX2xvYWRfd29ya3NwYWNlKGN
vbnRleHQ6IE1MQ2xpZW50Q3R4KSAtPiBXb3Jrc3BhY2U6CiAgICAiIiIKICAgIExvYWRpbmcgQXp1cmVNTCBXb3Jrc3BhY2Ugd2l0aCBBenVyZSBzZWNyZXRzLgoKICAgIDpwYXJhbSBjb250ZXh0OiBNTFJ1biBjb250ZXh0LgogICAgOnJldHVybnM6ICAgICAgIEF6dXJlTUwgV29ya3NwYWNlCiAgICAiIiIKCiAgICBpZiBoYXNhdHRyKGNvbnRleHQsICJfYXp1cmVfd29ya3NwYWNlIik6CiAgICAgICAgcmV0dXJuIGNvbnRleHQuX2F6dXJlX3dvcmtzcGFjZQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkxvYWRpbmcgQXp1cmVNTCBXb3Jrc3BhY2UiKQogICAgIyBBenVyZSBzZXJ2aWNlIGF1dGhlbnRpY2F0aW9uOgogICAgc2VydmljZV9hdXRoZW50aWNhdGlvbiA9IFNlcnZpY2VQcmluY2lwYWxBdXRoZW50aWNhdGlvbigKICAgICAgICB0ZW5hbnRfaWQ9X2Vudl9vcl9zZWNyZXQoY29udGV4dCwgIkFaVVJFX1RFTkFOVF9JRCIpLAogICAgICAgIHNlcnZpY2VfcHJpbmNpcGFsX2lkPV9lbnZfb3Jfc2VjcmV0KGNvbnRleHQsICJBWlVSRV9TRVJWSUNFX1BSSU5DSVBBTF9JRCIpLAogICAgICAgIHNlcnZpY2VfcHJpbmNpcGFsX3Bhc3N3b3JkPV9lbnZfb3Jfc2VjcmV0KAogICAgICAgICAgICBjb250ZXh0LCAiQVpVUkVfU0VSVklDRV9QUklOQ0lQQUxfUEFTU1dPUkQiCiAgICAgICAgKSwKICAgICkKCiAgICAjIExvYWRpbmcgQXp1cmUgd29ya3NwYWNlOgogICAgd29ya3NwYWNlID0gV29ya3NwYWNlKAogICAgICAgIHN1YnNjcmlwdGlvbl9pZD1fZW52X29yX3NlY3JldChjb250ZXh0LCAiQVpVUkVfU1VCU0NSSVBUSU9OX0lEIiksCiAgICAgICAgcmVzb3VyY2VfZ3JvdXA9X2Vudl9vcl9zZWNyZXQoY29udGV4dCwgIkFaVVJFX1JFU09VUkNFX0dST1VQIiksCiAgICAgICAgd29ya3NwYWNlX25hbWU9X2Vudl9vcl9zZWNyZXQoY29udGV4dCwgIkFaVVJFX1dPUktTUEFDRV9OQU1FIiksCiAgICAgICAgYXV0aD1zZXJ2aWNlX2F1dGhlbnRpY2F0aW9uLAogICAgKQoKICAgIGNvbnRleHQuX2F6dXJlX3dvcmtzcGFjZSA9IHdvcmtzcGFjZQogICAgcmV0dXJuIHdvcmtzcGFjZQoKCmRlZiBfaW5pdF9leHBlcmltZW50KAogICAgY29udGV4dDogTUxDbGllbnRDdHgsIGV4cGVyaW1lbnRfbmFtZTogc3RyCikgLT4gVHVwbGVbV29ya3NwYWNlLCBFeHBlcmltZW50XToKICAgICIiIgogICAgSW5pdGlhbGl6ZSB3b3Jrc3BhY2UgYW5kIGV4cGVyaW1lbnQgaW4gQXp1cmUgTUwuIFVzZXMgU2VydmljZQogICAgUHJpbmNpcGFsIGF1dGhlbnRpY2F0aW9uIHZpYSBlbnZpcm9ubWVudCB2YXJpYWJsZXMuCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgTUxSdW4gY29udGV4dC4KICAgIDpwYXJhbSBleHBlcmltZW50X25hbWU6IE5hbWUgb2YgZXhwZXJpbWVudCB0byBjcmVhdGUgaW4gQXp1cmUgTUwuCiAgICA6cmV0dXJuczogICAgICAgICAgICAgICBBenVyZSBNTCBXb3Jrc3BhY2UgYW5kIEV4cGVyaW1lbnQuCiAgICAiIiIKCiAgICAjIEluaXRpYWx
pemUgZXhwZXJpbWVudCB2aWEgU2VydmljZSBQcmluY2lwYWwgQXV0aGVudGljYXRpb246CiAgICAjIGh0dHBzOi8vZG9jcy5taWNyb3NvZnQuY29tL2VuLXVzL2F6dXJlL21hY2hpbmUtbGVhcm5pbmcvaG93LXRvLXNldHVwLWF1dGhlbnRpY2F0aW9uI3VzZS1zZXJ2aWNlLXByaW5jaXBhbC1hdXRoZW50aWNhdGlvbgoKICAgIHdvcmtzcGFjZSA9IF9sb2FkX3dvcmtzcGFjZShjb250ZXh0KQoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJJbml0aWFsaXppbmcgQXp1cmVNTCBleHBlcmltZW50IHtleHBlcmltZW50X25hbWV9IikKICAgICMgQ3JlYXRpbmcgZXhwZXJpbWVudDoKICAgIGV4cGVyaW1lbnQgPSBFeHBlcmltZW50KHdvcmtzcGFjZSwgZXhwZXJpbWVudF9uYW1lKQoKICAgIHJldHVybiB3b3Jrc3BhY2UsIGV4cGVyaW1lbnQKCgpkZWYgaW5pdF9jb21wdXRlKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBjcHVfY2x1c3Rlcl9uYW1lOiBzdHIsCiAgICB2bV9zaXplOiBzdHIgPSAiU1RBTkRBUkRfRDJfVjIiLAogICAgbWF4X25vZGVzOiBpbnQgPSAxLAopIC0+IENvbXB1dGVUYXJnZXQ6CiAgICAiIiIKICAgIEluaXRpYWxpemUgQXp1cmUgTUwgY29tcHV0ZSB0YXJnZXQgdG8gcnVuIGV4cGVyaW1lbnQuIENoZWNrcyBmb3IKICAgIGV4aXN0aW5nIGNvbXB1dGUgdGFyZ2V0IGFuZCBjcmVhdGVzIG5ldyBpZiBkb2VzIG5vdCBleGlzdC4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICAgTUxSdW4gY29udGV4dC4KICAgIDpwYXJhbSBjcHVfY2x1c3Rlcl9uYW1lOiBOYW1lIG9mIEF6dXJlIE1MIGNvbXB1dGUgdGFyZ2V0LiBDcmVhdGVkIGlmIGRvZXMgbm90IGV4aXN0LgogICAgOnBhcmFtIHZtX3NpemU6ICAgICAgICAgIEF6dXJlIG1hY2hpbmUgdHlwZSBmb3IgY29tcHV0ZSB0YXJnZXQuCiAgICA6cGFyYW0gbWF4X25vZGVzOiAgICAgICAgTWF4aW11bSBudW1iZXIgb2YgY29uY3VycmVudCBjb21wdXRlIHRhcmdldHMuCiAgICA6cmV0dXJuczogICAgICAgICAgICAgICAgQXp1cmUgTUwgQ29tcHV0ZSBUYXJnZXQuCiAgICAiIiIKCiAgICB3b3Jrc3BhY2UgPSBfbG9hZF93b3Jrc3BhY2UoY29udGV4dCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJJbml0aWFsaXppbmcgQXp1cmVNTCBjb21wdXRlIHRhcmdldCB7Y3B1X2NsdXN0ZXJfbmFtZX0iKQoKICAgICMgVmVyaWZ5IHRoYXQgY2x1c3RlciBkb2VzIG5vdCBleGlzdCBhbHJlYWR5OgogICAgdHJ5OgogICAgICAgIGNvbXB1dGVfdGFyZ2V0ID0gQ29tcHV0ZVRhcmdldCh3b3Jrc3BhY2U9d29ya3NwYWNlLCBuYW1lPWNwdV9jbHVzdGVyX25hbWUpCiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygiRm91bmQgZXhpc3RpbmcgY2x1c3Rlciwgd2lsbCB1c2UgaXQuIikKICAgIGV4Y2VwdCBDb21wdXRlVGFyZ2V0RXhjZXB0aW9uOgogICAgICAgIGNvbXB1dGVfY29uZmlnID0gQW1sQ29tcHV0ZS5wcm92aXNpb25pbmdfY29uZmlndXJhdGlvbigKICAgICAgICAgICAgdm1
fc2l6ZT12bV9zaXplLCBtYXhfbm9kZXM9bWF4X25vZGVzCiAgICAgICAgKQogICAgICAgIGNvbXB1dGVfdGFyZ2V0ID0gQ29tcHV0ZVRhcmdldC5jcmVhdGUoCiAgICAgICAgICAgIHdvcmtzcGFjZSwgY3B1X2NsdXN0ZXJfbmFtZSwgY29tcHV0ZV9jb25maWcKICAgICAgICApCgogICAgY29tcHV0ZV90YXJnZXQud2FpdF9mb3JfY29tcGxldGlvbihzaG93X291dHB1dD1UcnVlKQogICAgcmV0dXJuIGNvbXB1dGVfdGFyZ2V0CgoKZGVmIHJlZ2lzdGVyX2RhdGFzZXQoCiAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgIGRhdGFzZXRfbmFtZTogc3RyLAogICAgZGF0YXNldF9kZXNjcmlwdGlvbjogc3RyLAogICAgZGF0YTogRGF0YUl0ZW0sCiAgICBjcmVhdGVfbmV3X3ZlcnNpb246IGJvb2wgPSBGYWxzZSwKKToKICAgICIiIgogICAgUmVnaXN0ZXIgZGF0YXNldCBvYmplY3QgKGNhbiBiZSBhbHNvIGFuIElndWF6aW8gRmVhdHVyZVZlY3RvcikgaW4gQXp1cmUgTUwuCiAgICBVcGxvYWRzIHBhcnF1ZXQgZmlsZSB0byBBenVyZSBibG9iIHN0b3JhZ2UgYW5kIHJlZ2lzdGVycwogICAgdGhhdCBmaWxlIGFzIGEgZGF0YXNldCBpbiBBenVyZSBNTC4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmFtIGRhdGFzZXRfbmFtZTogICAgICAgICAgTmFtZSBvZiBBenVyZSBkYXRhc2V0IHRvIHJlZ2lzdGVyLgogICAgOnBhcmFtIGRhdGFzZXRfZGVzY3JpcHRpb246ICAgRGVzY3JpcHRpb24gb2YgQXp1cmUgZGF0YXNldCB0byByZWdpc3Rlci4KICAgIDpwYXJhbSBkYXRhOiAgICAgICAgICAgICAgICAgIE1MUnVuIEZlYXR1cmVWZWN0b3Igb3IgZGF0YXNldCBvYmplY3QgdG8gdXBsb2FkLgogICAgOnBhcmFtIGNyZWF0ZV9uZXdfdmVyc2lvbjogICAgUmVnaXN0ZXIgQXp1cmUgZGF0YXNldCBhcyBuZXcgdmVyc2lvbi4gTXVzdCBiZSB1c2VkIHdoZW4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG1vZGlmeWluZyBkYXRhc2V0IHNjaGVtYS4KICAgICIiIgoKICAgICMgdGVzdCBmb3IgQXp1cmUgc3RvcmFnZSBjb25uZWN0aW9uIGVudmlyb25tZW50IHZhcmlhYmxlIG9yIHNlY3JldDoKICAgIGFzc2VydCBfZW52X29yX3NlY3JldCgKICAgICAgICBjb250ZXh0LCAiQVpVUkVfU1RPUkFHRV9DT05ORUNUSU9OX1NUUklORyIKICAgICksICJBWlVSRV9TVE9SQUdFX0NPTk5FQ1RJT05fU1RSSU5HIHNlY3JldCBub3Qgc2V0IgoKICAgICMgQ29ubmVjdCB0byBBenVyZU1MIGV4cGVyaW1lbnQgYW5kIGRhdGFzdG9yZToKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIkNvbm5lY3RpbmcgdG8gQXp1cmVNTCBleHBlcmltZW50IGRlZmF1bHQgZGF0YXN0b3JlIikKCiAgICB3b3Jrc3BhY2UgPSBfbG9hZF93b3Jrc3BhY2UoY29udGV4dCkKICAgIGRhdGFzdG9yZSA9IHdvcmtzcGFjZS5nZXRfZGVmYXVsdF9kYXRhc3RvcmUoKQoKICAgICMgQXp1cmUgYmxvYiBwYXRoIChkZWZhdWx0IGRhdGFzdG9
yZSBmb3Igd29ya3NwYWNlKToKICAgIGJsb2JfcGF0aCA9IGYiYXo6Ly97ZGF0YXN0b3JlLmNvbnRhaW5lcl9uYW1lfS97ZGF0YXNldF9uYW1lfSIKCiAgICBzdG9yZV91cmlfcHJlZml4LCBfID0gbWxydW4uZGF0YXN0b3JlLnBhcnNlX3N0b3JlX3VyaShkYXRhLmFydGlmYWN0X3VybCkKICAgIGZlYXR1cmVfdmVjdG9yX2Nhc2UgPSBtbHJ1bi51dGlscy5TdG9yZVByZWZpeC5GZWF0dXJlVmVjdG9yID09IHN0b3JlX3VyaV9wcmVmaXgKICAgICMgUmV0cmlldmUgZGF0YSBzb3VyY2UgYXMgZGF0YWZyYW1lOgogICAgaWYgZmVhdHVyZV92ZWN0b3JfY2FzZToKICAgICAgICAjIEZlYXR1cmVWZWN0b3IgY2FzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgICAgICBmIlJldHJpZXZpbmcgZmVhdHVyZSB2ZWN0b3IgYW5kIHVwbG9hZGluZyB0byBBenVyZSBibG9iIHN0b3JhZ2U6IHtibG9iX3BhdGh9IgogICAgICAgICkKICAgICAgICBmX3N0b3JlLmdldF9vZmZsaW5lX2ZlYXR1cmVzKGRhdGEubWV0YS51cmksIHRhcmdldD1QYXJxdWV0VGFyZ2V0KHBhdGg9YmxvYl9wYXRoKSkKICAgIGVsc2U6CiAgICAgICAgYmxvYl9wYXRoICs9IGRhdGEuc3VmZml4CiAgICAgICAgIyBEYXRhSXRlbSBjYXNlOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgIGYiUmV0cmlldmluZyBmZWF0dXJlIHZlY3RvciBhbmQgdXBsb2FkaW5nIHRvIEF6dXJlIGJsb2Igc3RvcmFnZToge2Jsb2JfcGF0aH0iCiAgICAgICAgKQogICAgICAgIGRhdGFfaW5fYnl0ZXMgPSBkYXRhLmdldCgpCiAgICAgICAgZ2V0X2RhdGFpdGVtKGJsb2JfcGF0aCkucHV0KGRhdGFfaW5fYnl0ZXMpCgogICAgIyBSZWdpc3RlciBkYXRhc2V0IGluIEF6dXJlTUw6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYiUmVnaXN0ZXJpbmcgZGF0YXNldCB7ZGF0YXNldF9uYW1lfSBpbiBBenVyZSBNTCIpCiAgICBpZiBkYXRhLnN1ZmZpeCA9PSAiLnBhcnF1ZXQiIG9yIGZlYXR1cmVfdmVjdG9yX2Nhc2U6CiAgICAgICAgZGF0YXNldCA9IERhdGFzZXQuVGFidWxhci5mcm9tX3BhcnF1ZXRfZmlsZXMoCiAgICAgICAgICAgIHBhdGg9KGRhdGFzdG9yZSwgZiJ7ZGF0YXNldF9uYW1lfS5wYXJxdWV0IiksIHZhbGlkYXRlPUZhbHNlCiAgICAgICAgKQogICAgZWxzZToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKAogICAgICAgICAgICBmIk9wZW5TU0wgdmVyc2lvbiBtdXN0IGJlIDEuMS4gT3ZlcnJpZGluZyB0aGUgT3BlblNTTCB2ZXJzaW9uIHRvIDEuMSIKICAgICAgICApCiAgICAgICAgIyBPcGVuU1NMIHZlcnNpb24gbXVzdCBiZSAxLjEKICAgICAgICBvcy5lbnZpcm9uWyJDTFJfT1BFTlNTTF9WRVJTSU9OX09WRVJSSURFIl0gPSAiMS4xIgogICAgICAgIGRhdGFzZXQgPSBEYXRhc2V0LlRhYnVsYXIuZnJvbV9kZWxpbWl0ZWRfZmlsZXMoCiAgICAgICAgICAgIHBhdGg9KGRhdGFzdG9yZSwgZiJ7ZGF0YXNldF9uYW1lfXtkYXRhLnN1ZmZpeH0iKSwgdmF
saWRhdGU9RmFsc2UKICAgICAgICApCgogICAgZGF0YXNldC5yZWdpc3RlcigKICAgICAgICB3b3Jrc3BhY2U9d29ya3NwYWNlLAogICAgICAgIG5hbWU9ZGF0YXNldF9uYW1lLAogICAgICAgIGRlc2NyaXB0aW9uPWRhdGFzZXRfZGVzY3JpcHRpb24sCiAgICAgICAgY3JlYXRlX25ld192ZXJzaW9uPWNyZWF0ZV9uZXdfdmVyc2lvbiwKICAgICkKCiAgICAjIE91dHB1dCByZWdpc3RlcmVkIGRhdGFzZXQgbmFtZSBpbiBBenVyZToKICAgIGNvbnRleHQubG9nX3Jlc3VsdCgiZGF0YXNldF9ibG9iX3BhdGgiLCBibG9iX3BhdGgpCgoKZGVmIGRvd25sb2FkX21vZGVsKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICBtb2RlbF9uYW1lOiBzdHIsCiAgICBtb2RlbF92ZXJzaW9uOiBpbnQsCiAgICB0YXJnZXRfZGlyOiBzdHIgPSAiLiIsCikgLT4gTm9uZToKICAgICIiIgogICAgRG93bmxvYWQgdHJhaW5lZCBtb2RlbCBmcm9tIEF6dXJlIE1MIHRvIGxvY2FsIGZpbGVzeXN0ZW0uCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgIE1MUnVuIGNvbnRleHQuCiAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgTmFtZSBvZiB0cmFpbmVkIGFuZCByZWdpc3RlcmVkIG1vZGVsLgogICAgOnBhcmFtIG1vZGVsX3ZlcnNpb246IFZlcnNpb24gb2YgbW9kZWwgdG8gZG93bmxvYWQuCiAgICA6cGFyYW0gdGFyZ2V0X2RpcjogICAgVGFyZ2V0IGRpcmVjdG9yeSB0byBkb3dubG9hZCBtb2RlbC4KICAgICIiIgogICAgIyBMb2FkaW5nIHdvcmtzcGFjZSBpZiBub3QgcHJvdmlkZWQ6CiAgICB3b3Jrc3BhY2UgPSBfbG9hZF93b3Jrc3BhY2UoY29udGV4dCkKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJEb3dubG9hZGluZyBtb2RlbCB7bW9kZWxfbmFtZX06e21vZGVsX3ZlcnNpb259IikKICAgIG1vZGVsID0gTW9kZWwod29ya3NwYWNlLCBtb2RlbF9uYW1lLCB2ZXJzaW9uPW1vZGVsX3ZlcnNpb24pCiAgICBtb2RlbC5kb3dubG9hZCh0YXJnZXRfZGlyPXRhcmdldF9kaXIsIGV4aXN0X29rPVRydWUpCgoKZGVmIHVwbG9hZF9tb2RlbCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgbW9kZWxfbmFtZTogc3RyLAogICAgbW9kZWxfcGF0aDogc3RyLAogICAgbW9kZWxfZGVzY3JpcHRpb246IHN0ciA9IE5vbmUsCiAgICBtb2RlbF90YWdzOiBkaWN0ID0gTm9uZSwKKSAtPiBOb25lOgogICAgIiIiCiAgICBVcGxvYWQgcHJlLXRyYWluZWQgbW9kZWwgZnJvbSBsb2NhbCBmaWxlc3lzdGVtIHRvIEF6dXJlIE1MLgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmFtIG1vZGVsX25hbWU6ICAgICAgICBOYW1lIG9mIHRyYWluZWQgYW5kIHJlZ2lzdGVyZWQgbW9kZWwuCiAgICA6cGFyYW0gbW9kZWxfcGF0aDogICAgICAgIFBhdGggdG8gZmlsZSBvbiBsb2NhbCBmaWxlc3lzdGVtLgogICAgOnBhcmFtIG1vZGVsX2Rlc2NyaXB0aW9uOiBEZXNjcmlwdGlvbiBvZiBtb2RlbHMuCiAgICA6cGFyYW0gbW9kZWxfdGFnczogICA
gICAgIEtWIHBhaXJzIG9mIG1vZGVsIHRhZ3MuCiAgICAiIiIKICAgICMgTG9hZGluZyB3b3Jrc3BhY2UgaWYgbm90IHByb3ZpZGVkOgogICAgd29ya3NwYWNlID0gX2xvYWRfd29ya3NwYWNlKGNvbnRleHQpCgogICAgY29udGV4dC5sb2dnZXIuaW5mbyhmIlVwbG9hZCBtb2RlbCB7bW9kZWxfbmFtZX0gZnJvbSB7bW9kZWxfcGF0aH0iKQogICAgTW9kZWwucmVnaXN0ZXIoCiAgICAgICAgd29ya3NwYWNlPXdvcmtzcGFjZSwKICAgICAgICBtb2RlbF9wYXRoPW1vZGVsX3BhdGgsCiAgICAgICAgbW9kZWxfbmFtZT1tb2RlbF9uYW1lLAogICAgICAgIGRlc2NyaXB0aW9uPW1vZGVsX2Rlc2NyaXB0aW9uLAogICAgICAgIHRhZ3M9bW9kZWxfdGFncywKICAgICkKCgpkZWYgX2dldF90b3Bfbl9ydW5zKAogICAgcmVtb3RlX3J1bjogQXV0b01MUnVuLCBuOiBpbnQgPSA1LCBwcmltYXJ5X21ldHJpYzogc3RyID0gImFjY3VyYWN5IgopIC0+IExpc3RbU2NyaXB0UnVuXToKICAgICIiIgogICAgR2V0IHRvcCBOIGNvbXBsZXRlIHJ1bnMgZnJvbSBleHBlcmltZW50IHNvcnRlZCBieSBwcmltYXJ5IG1ldHJpYy4KCiAgICA6cGFyYW0gcmVtb3RlX3J1bjogICAgIEF6dXJlIE1MIFJ1bi4KICAgIDpwYXJhbSBuOiAgICAgICAgICAgICAgTnVtYmVyIG9mIHRvcCBydW5zIHRvIHJldHVybi4KICAgIDpwYXJhbSBwcmltYXJ5X21ldHJpYzogTWV0cmljIHRvIHNvcnQgYnkuCgogICAgOnJldHVybnM6ICAgICAgICAgICAgICBMaXN0IG9mIHRvcCBOIHJ1bnMgc29ydGVkIGJ5IHByaW1hcnkgbWV0cmljLgogICAgIiIiCiAgICAjIENvbGxlY3QgYWxsIG1vZGVsczoKICAgIGNvbXBsZXRlX3J1bnMgPSBbCiAgICAgICAgcnVuCiAgICAgICAgZm9yIHJ1biBpbiByZW1vdGVfcnVuLmdldF9jaGlsZHJlbihzdGF0dXM9IkNvbXBsZXRlZCIpCiAgICAgICAgaWYgbm90IGFueShzIGluIHJ1bi5pZCBmb3IgcyBpbiBbInNldHVwIiwgIndvcmtlciJdKQogICAgXQoKICAgICMgQ2hlY2tpbmcgdGhhdCB0aGUgcmVxdWlyZWQgbnVtYmVyIG9mIHJ1bnMgYXJlIGRvbmU6CiAgICBpZiBsZW4oY29tcGxldGVfcnVucykgPCBuOgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJFeHBlY3RlZCB7bn0gcnVucyBidXQgb25seSByZWNlaXZlZCB7bGVuKGNvbXBsZXRlX3J1bnMpfSIpCgogICAgIyBTb3J0aW5nIGJ5IHRoZSBwcmltYXJ5IG1ldHJpYzoKICAgIHNvcnRlZF9ydW5zID0gc29ydGVkKAogICAgICAgIGNvbXBsZXRlX3J1bnMsIGtleT1sYW1iZGEgcnVuOiBydW4uZ2V0X21ldHJpY3MoKVtwcmltYXJ5X21ldHJpY10sIHJldmVyc2U9VHJ1ZQogICAgKQogICAgcmV0dXJuIHNvcnRlZF9ydW5zWzpuXQoKCmRlZiBfZ2V0X21vZGVsX2hwKAogICAgcnVuOiBTY3JpcHRSdW4sCikgLT4gZGljdDoKICAgICIiIgogICAgR2V0IGh5cGVyLXBhcmFtZXRlcnMgb2YgdHJhaW5lZCBBenVyZU1MIG1vZGVsLgogICAgQ29tYmluZSB0aGUgaHlwZXItcGFyYW1ldGVycyBvZiB0aGUgZGF0YSB
0cmFuc2Zvcm1hdGlvbiBhbmQgdHJhaW5pbmcgdG8gYSBkaWN0aW9uYXJ5LgogICAgVGhlIHByZWZpeCBvZiB0aGUgZGljdGlvbmFyeSBrZXlzIGNvcnJlc3BvbmRzIHRvICdkYXRhIHRyYW5zZm9ybWF0aW9uJyBhbmQgJ3RyYWluaW5nJy4KCiAgICA6cGFyYW0gcnVuOiBSdW4gb2JqZWN0IG9mIEF6dXJlTUwgdHJhaW5lZCBtb2RlbC4KCiAgICA6cmV0dXJuczogICAgQSBkaWN0aW9uYXJ5IGFzIGRlc2NyaWJlZCBpbiB0aGUgZG9jc3RyaW5nLgogICAgIiIiCgogICAgc3BlY19maWVsZCA9ICJwaXBlbGluZV9zcGVjIgogICAgaWYgc3BlY19maWVsZCBub3QgaW4gcnVuLnByb3BlcnRpZXM6CiAgICAgICAgcmV0dXJuIHt9CiAgICBzcGVjX3N0cmluZyA9IHJ1bi5wcm9wZXJ0aWVzW3NwZWNfZmllbGRdCiAgICBzcGVjX2RpY3QgPSBqc29uLmxvYWRzKHNwZWNfc3RyaW5nKQoKICAgIGlmICJvYmplY3RzIiBub3QgaW4gc3BlY19kaWN0OgogICAgICAgICMgTm8gaHlwZXItcGFyYW1zCiAgICAgICAgcmV0dXJuIHt9CiAgICBocF9kaWN0cyA9IHNwZWNfZGljdFsib2JqZWN0cyJdCiAgICAjIGFmdGVyIHRyYWluaW5nIHRoZXJlIGFyZSB0d28gaHlwZXItcGFyYW1ldGVycyBkaWN0cyBpbnNpZGUgdGhlIHJ1biBvYmplY3Q6CiAgICBhc3NlcnQgKAogICAgICAgIGxlbihocF9kaWN0cykgPT0gMgogICAgKSwgImFmdGVyIHRyYWluaW5nIHRoZXJlIGFyZSB0d28gaHlwZXItcGFyYW1ldGVycyBkaWN0cyBpbnNpZGUgdGhlIHJ1biBvYmplY3QiCiAgICByZXN1bHRfZGljdCA9IHt9CiAgICBkaWN0X2tleXMgPSBbCiAgICAgICAgWyJkYXRhX3RyYW5zX2NsYXNzX25hbWUiLCAiZGF0YV90cmFuc19tb2R1bGUiLCAiZGF0YV90cmFuc19zcGVjX2NsYXNzIl0sCiAgICAgICAgWwogICAgICAgICAgICAidHJhaW5fY2xhc3NfbmFtZSIsCiAgICAgICAgICAgICJ0cmFpbl9tb2R1bGUiLAogICAgICAgICAgICAidHJhaW5fcGFyYW1fa3dhcmdzX0MiLAogICAgICAgICAgICAidHJhaW5fcGFyYW1fa3dhcmdzX2NsYXNzX3dlaWdodCIsCiAgICAgICAgICAgICJ0cmFpbl9zcGVjX2NsYXNzIiwKICAgICAgICBdLAogICAgXQoKICAgICMgY3JlYXRpbmcgaHlwZXItcGFyYW1zIGRpY3Qgd2l0aCBrZXkgcHJlZml4ZXMgZm9yIGVhY2ggcGFydDoKICAgIGt3YXJnc19wcmVmaXggPSAicGFyYW1fa3dhcmdzIgogICAgZm9yIGQsIG5hbWUsIGtleXMgaW4gemlwKGhwX2RpY3RzLCBbImRhdGFfdHJhbnMiLCAidHJhaW4iXSwgZGljdF9rZXlzKToKICAgICAgICBmb3Iga2V5IGluIGtleXM6CgogICAgICAgICAgICBpZiBrd2FyZ3NfcHJlZml4IGluIGtleToKICAgICAgICAgICAgICAgIHJlc3VsdF9kaWN0W2tleV0gPSBkW2t3YXJnc19wcmVmaXhdWwogICAgICAgICAgICAgICAgICAgIGtleS5yZXBsYWNlKGYie25hbWV9X3trd2FyZ3NfcHJlZml4fV8iLCAiIikKICAgICAgICAgICAgICAgIF0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdF9kaWN
0W2tleV0gPSBkW2tleS5yZXBsYWNlKGYie25hbWV9XyIsICIiKV0KICAgICAgICAgICAgaWYgbm90IHJlc3VsdF9kaWN0W2tleV06CiAgICAgICAgICAgICAgICByZXN1bHRfZGljdFtrZXldID0gIiIKCiAgICByZXR1cm4gcmVzdWx0X2RpY3QKCgpkZWYgc3VibWl0X3RyYWluaW5nX2pvYigKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZXhwZXJpbWVudDogRXhwZXJpbWVudCwKICAgIGNvbXB1dGVfdGFyZ2V0OiBDb21wdXRlVGFyZ2V0LAogICAgcmVnaXN0ZXJfbW9kZWxfbmFtZTogc3RyLAogICAgcmVnaXN0ZXJlZF9kYXRhc2V0X25hbWU6IHN0ciwKICAgIGF1dG9tbF9zZXR0aW5nczogZGljdCwKICAgIHRyYWluaW5nX3NldDogRGF0YUl0ZW0sCiAgICBsYWJlbF9jb2x1bW5fbmFtZTogc3RyID0gJycsCiAgICBzYXZlX25fbW9kZWxzOiBpbnQgPSAzLAogICAgc2hvd19vdXRwdXQ6IGJvb2wgPSBUcnVlLAopIC0+IE5vbmU6CiAgICAiIiIKICAgIFN1Ym1pdCB0cmFpbmluZyBqb2IgdG8gQXp1cmUgQXV0b01MIGFuZCBkb3dubG9hZCB0cmFpbmVkIG1vZGVsCiAgICB3aGVuIGNvbXBsZXRlZC4gVXNlcyBwcmV2aW91c2x5IHJlZ2lzdGVyZWQgZGF0YXNldCBmb3IgdHJhaW5pbmcuCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgICAgICAgICBNTFJ1biBjb250ZXh0LgogICAgOnBhcmFtIGV4cGVyaW1lbnQ6ICAgICAgICAgICAgICBBenVyZSBleHBlcmltZW50LgogICAgOnBhcmFtIGNvbXB1dGVfdGFyZ2V0OiAgICAgICAgICBBenVyZSBjb21wdXRlIHRhcmdldC4KICAgIDpwYXJhbSByZWdpc3Rlcl9tb2RlbF9uYW1lOiAgICAgTmFtZSBvZiBtb2RlbCB0byByZWdpc3RlciBpbiBBenVyZS4KICAgIDpwYXJhbSByZWdpc3RlcmVkX2RhdGFzZXRfbmFtZTogTmFtZSBvZiBkYXRhc2V0IHJlZ2lzdGVyZWQgaW4gQXp1cmUgTUwuCiAgICA6cGFyYW0gbGFiZWxfY29sdW1uX25hbWU6ICAgICAgIE5hbWUgb2YgdGFyZ2V0IGNvbHVtbiBpbiBkYXRhc2V0LgogICAgOnBhcmFtIGF1dG9tbF9zZXR0aW5nczogICAgICAgICBKU09OIHN0cmluZyBvZiBhbGwgQXp1cmUgQXV0b01MIHNldHRpbmdzLgogICAgOnBhcmFtIHRyYWluaW5nX3NldDogICAgICAgICAgICBUcmFpbmluZyBzZXQgdG8gbG9nIHdpdGggbW9kZWwuIEZvciBtb2RlbAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBtb25pdG9yaW5nIGludGVncmF0aW9uLgogICAgOnBhcmFtIHNob3dfb3V0cHV0OiAgICAgICAgICAgICBEaXNwbGF5aW5nIEF6dXJlIGxvZ3MuCiAgICA6cGFyYW0gc2F2ZV9uX21vZGVsczogICAgICAgICAgIEhvdyBtYW55IG9mIHRoZSB0b3AgcGVyZm9ybWluZyBtb2RlbHMgdG8gbG9nLgogICAgIiIiCiAgICAjIExvYWRpbmcgd29ya3NwYWNlIGlmIG5vdCBwcm92aWRlZDoKICAgIHdvcmtzcGFjZSA9IF9sb2FkX3dvcmtzcGFjZShjb250ZXh0KQoKICAgICMgU2V0dXAgZXhwZXJpbWVudDoKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oIlN
ldHRpbmcgdXAgZXhwZXJpbWVudCBwYXJhbWV0ZXJzIikKICAgIGRhdGFzZXQgPSBEYXRhc2V0LmdldF9ieV9uYW1lKHdvcmtzcGFjZSwgbmFtZT1yZWdpc3RlcmVkX2RhdGFzZXRfbmFtZSkKCiAgICAjIEdldCB0cmFpbmluZyBzZXQgdG8gbG9nIHdpdGggbW9kZWw6CiAgICBmZWF0dXJlX3ZlY3RvciA9IE5vbmUKICAgIHN0b3JlX3VyaV9wcmVmaXgsIF8gPSBtbHJ1bi5kYXRhc3RvcmUucGFyc2Vfc3RvcmVfdXJpKHRyYWluaW5nX3NldC5hcnRpZmFjdF91cmwpCiAgICBpZiBtbHJ1bi51dGlscy5TdG9yZVByZWZpeC5GZWF0dXJlVmVjdG9yID09IHN0b3JlX3VyaV9wcmVmaXg6CiAgICAgICAgZmVhdHVyZV92ZWN0b3IgPSB0cmFpbmluZ19zZXQubWV0YS51cmkKICAgICAgICBsYWJlbF9jb2x1bW5fbmFtZSA9IGxhYmVsX2NvbHVtbl9uYW1lIG9yIHRyYWluaW5nX3NldC5tZXRhLnN0YXR1cy5sYWJlbF9jb2x1bW4KICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnbGFiZWwgY29sdW1uIG5hbWU6IHtsYWJlbF9jb2x1bW5fbmFtZX0nKQogICAgICAgIHRyYWluaW5nX3NldCA9IGZfc3RvcmUuZ2V0X29mZmxpbmVfZmVhdHVyZXMoZmVhdHVyZV92ZWN0b3IpLnRvX2RhdGFmcmFtZSgpCiAgICBlbHNlOgogICAgICAgIHRyYWluaW5nX3NldCA9IHRyYWluaW5nX3NldC5hc19kZigpCgogICAgYXV0b21sX2NvbmZpZyA9IEF1dG9NTENvbmZpZygKICAgICAgICBjb21wdXRlX3RhcmdldD1jb21wdXRlX3RhcmdldCwKICAgICAgICB0cmFpbmluZ19kYXRhPWRhdGFzZXQsCiAgICAgICAgdmVyYm9zaXR5PWxvZ2dpbmcuSU5GTywKICAgICAgICBsYWJlbF9jb2x1bW5fbmFtZT1sYWJlbF9jb2x1bW5fbmFtZSwKICAgICAgICAqKmF1dG9tbF9zZXR0aW5ncywKICAgICkKCiAgICAjIFJ1biBleHBlcmltZW50IG9uIEF6dXJlTUw6CiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJTdWJtaXR0aW5nIGFuZCBydW5uaW5nIGV4cGVyaW1lbnQiKQogICAgcmVtb3RlX3J1biA9IGV4cGVyaW1lbnQuc3VibWl0KGF1dG9tbF9jb25maWcpCiAgICByZW1vdGVfcnVuLndhaXRfZm9yX2NvbXBsZXRpb24oc2hvd19vdXRwdXQ9c2hvd19vdXRwdXQpCiAgICBpZiBzaG93X291dHB1dDoKICAgICAgICAjIEF6dXJlIGxvZyBlbmRpbmcgcm93OgogICAgICAgIHByaW50KGYiXG57JyonICogOTJ9XG4iKQogICAgIyBHZXQgdG9wIE4gcnVucyB0byBsb2c6CiAgICB0b3BfcnVucyA9IF9nZXRfdG9wX25fcnVucygKICAgICAgICByZW1vdGVfcnVuPXJlbW90ZV9ydW4sCiAgICAgICAgbj1zYXZlX25fbW9kZWxzLAogICAgICAgIHByaW1hcnlfbWV0cmljPWF1dG9tbF9zZXR0aW5nc1sicHJpbWFyeV9tZXRyaWMiXSwKICAgICkKCiAgICAjIFJlZ2lzdGVyLCBkb3dubG9hZCwgYW5kIGxvZyBtb2RlbHM6CiAgICBmb3IgaSwgcnVuIGluIGVudW1lcmF0ZSh0b3BfcnVucyk6CiAgICAgICAgIyBSZWdpc3RlciBtb2RlbDoKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJSZWdpc3R
lcmluZyBtb2RlbCIpCiAgICAgICAgbW9kZWwgPSBydW4ucmVnaXN0ZXJfbW9kZWwoCiAgICAgICAgICAgIG1vZGVsX25hbWU9cmVnaXN0ZXJfbW9kZWxfbmFtZSwgbW9kZWxfcGF0aD0ib3V0cHV0cy9tb2RlbC5wa2wiCiAgICAgICAgKQogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgIGYiUmVnaXN0ZXJlZCBtb2RlbCB3aXRoIG5hbWUgJ3ttb2RlbC5uYW1lfScsIGlkICd7bW9kZWwuaWR9JywgdmVyc2lvbiAne21vZGVsLnZlcnNpb259JyIKICAgICAgICApCgogICAgICAgICMgRG93bmxvYWQgbW9kZWwgbG9jYWxseToKICAgICAgICBkb3dubG9hZF9tb2RlbCgKICAgICAgICAgICAgY29udGV4dD1jb250ZXh0LAogICAgICAgICAgICBtb2RlbF9uYW1lPXJlZ2lzdGVyX21vZGVsX25hbWUsCiAgICAgICAgICAgIG1vZGVsX3ZlcnNpb249bW9kZWwudmVyc2lvbiwKICAgICAgICAgICAgdGFyZ2V0X2Rpcj1mIi4ve21vZGVsLnZlcnNpb259IiwKICAgICAgICApCgogICAgICAgIG1ldHJpY3MgPSB7ay5sb3dlcigpOiB2YWwgZm9yIGssIHZhbCBpbiBydW4uZ2V0X21ldHJpY3MoKS5pdGVtcygpfQogICAgICAgIGRlbCBtZXRyaWNzWyJjb25mdXNpb25fbWF0cml4Il0KICAgICAgICBkZWwgbWV0cmljc1siYWNjdXJhY3lfdGFibGUiXQoKICAgICAgICAjIENvbGxlY3QgbW9kZWwgaHlwZXItcGFyYW1ldGVyczoKICAgICAgICBtb2RlbF9ocF9kaWN0ID0gX2dldF9tb2RlbF9ocChydW4pCiAgICAgICAgd2l0aCBjb250ZXh0LmdldF9jaGlsZF9jb250ZXh0KCoqbW9kZWxfaHBfZGljdCkgYXMgY2hpbGQ6CiAgICAgICAgICAgIG1vZGVsX2tleSA9IGYibW9kZWxfe2kgKyAxfV97bW9kZWxfaHBfZGljdFsnZGF0YV90cmFuc19jbGFzc19uYW1lJ10ubG93ZXIoKX1fe21vZGVsX2hwX2RpY3RbJ3RyYWluX2NsYXNzX25hbWUnXS5sb3dlcigpfSIKICAgICAgICAgICAgIyBMb2cgbW9kZWw6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICBmIkxvZ2dpbmcge21vZGVsX2tleX0gbW9kZWwgdG8gTUxSdW4iCiAgICAgICAgICAgICkKICAgICAgICAgICAgY2hpbGQubG9nX3Jlc3VsdHMobWV0cmljcykKICAgICAgICAgICAgY2hpbGQubG9nX21vZGVsKAogICAgICAgICAgICAgICAgIm1vZGVsIiwKICAgICAgICAgICAgICAgIGRiX2tleT1tb2RlbF9rZXksCiAgICAgICAgICAgICAgICBhcnRpZmFjdF9wYXRoPWNvbnRleHQuYXJ0aWZhY3Rfc3VicGF0aCgibW9kZWxzIiksCiAgICAgICAgICAgICAgICBtZXRyaWNzPW1ldHJpY3MsCiAgICAgICAgICAgICAgICBtb2RlbF9maWxlPWYie21vZGVsLnZlcnNpb259L21vZGVsLnBrbCIsCiAgICAgICAgICAgICAgICB0cmFpbmluZ19zZXQ9dHJhaW5pbmdfc2V0LAogICAgICAgICAgICAgICAgbGFiZWxfY29sdW1uPWxhYmVsX2NvbHVtbl9uYW1lLAogICAgICAgICAgICAgICAgZmVhdHVyZV92ZWN0b3I9ZmVhdHVyZV92ZWN0b3IsCiAgICAgICAgICA
gICAgICBmcmFtZXdvcms9IkF6dXJlTUwiLAogICAgICAgICAgICAgICAgYWxnb3JpdGhtPW1vZGVsX2hwX2RpY3QuZ2V0KCJ0cmFpbl9jbGFzc19uYW1lIiksCiAgICAgICAgICAgICkKICAgICAgICAgICAgaWYgaSA9PSAwOgogICAgICAgICAgICAgICAgIyBUaGlzIGFsc28gbG9ncyB0aGUgbW9kZWw6CiAgICAgICAgICAgICAgICBjaGlsZC5tYXJrX2FzX2Jlc3QoKQoKCmRlZiB0cmFpbigKICAgICMgTWxSdW4KICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgZGF0YXNldDogRGF0YUl0ZW0sCiAgICAjIEluaXQgZXhwZXJpbWVudCBhbmQgY29tcHV0ZQogICAgZXhwZXJpbWVudF9uYW1lOiBzdHIgPSAiIiwKICAgIGNwdV9jbHVzdGVyX25hbWU6IHN0ciA9ICIiLAogICAgdm1fc2l6ZTogc3RyID0gIlNUQU5EQVJEX0QyX1YyIiwKICAgIG1heF9ub2RlczogaW50ID0gMSwKICAgICMgUmVnaXN0ZXIgZGF0YXNldAogICAgZGF0YXNldF9uYW1lOiBzdHIgPSAiIiwKICAgIGRhdGFzZXRfZGVzY3JpcHRpb246IHN0ciA9ICIiLAogICAgY3JlYXRlX25ld192ZXJzaW9uOiBib29sID0gRmFsc2UsCiAgICBsYWJlbF9jb2x1bW5fbmFtZTogc3RyID0gIiIsCiAgICAjIFN1Ym1pdCB0cmFpbmluZyBqb2IKICAgIHJlZ2lzdGVyX21vZGVsX25hbWU6IHN0ciA9ICIiLAogICAgc2F2ZV9uX21vZGVsczogaW50ID0gMSwKICAgIGxvZ19henVyZTogYm9vbCA9IFRydWUsCiAgICBhdXRvbWxfc2V0dGluZ3M6IHN0ciA9IE5vbmUsCikgLT4gTm9uZToKICAgICIiIgogICAgV2hvbGUgdHJhaW5pbmcgZmxvdyBmb3IgQXp1cmUgQXV0b01MLiBSZWdpc3RlcnMgZGF0YXNldC9mZWF0dXJlIHZlY3RvciwKICAgIHN1Ym1pdHMgdHJhaW5pbmcgam9iIHRvIEF6dXJlIEF1dG9NTCwgYW5kIGRvd25sb2FkcyB0cmFpbmVkIG1vZGVsCiAgICB3aGVuIGNvbXBsZXRlZC4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICAgICAgTUxSdW4gY29udGV4dC4KCiAgICA6cGFyYW0gZGF0YXNldDogICAgICAgICAgICAgTUxSdW4gRmVhdHVyZVZlY3RvciBvciBkYXRhc2V0IFVSSSB0byB1cGxvYWQuIFdpbGwgZHJvcAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGluZGV4IGJlZm9yZSB1cGxvYWRpbmcgd2hlbiBpdCBpcyBhIEZlYXR1cmVWZWN0b3IuCgogICAgOnBhcmFtIGV4cGVyaW1lbnRfbmFtZTogICAgIE5hbWUgb2YgZXhwZXJpbWVudCB0byBjcmVhdGUgaW4gQXp1cmUgTUwuCiAgICA6cGFyYW0gY3B1X2NsdXN0ZXJfbmFtZTogICAgTmFtZSBvZiBBenVyZSBNTCBjb21wdXRlIHRhcmdldC4gQ3JlYXRlZCBpZiBkb2VzIG5vdCBleGlzdC4KICAgIDpwYXJhbSB2bV9zaXplOiAgICAgICAgICAgICBBenVyZSBtYWNoaW5lIHR5cGUgZm9yIGNvbXB1dGUgdGFyZ2V0LgogICAgOnBhcmFtIG1heF9ub2RlczogICAgICAgICAgIE1heGltdW0gbnVtYmVyIG9mIGNvbmN1cnJlbnQgY29tcHV0ZSB0YXJnZXRzLgoKICAgIDpwYXJhbSBkYXRhc2V0X25hbWU6ICAgICA
gICBOYW1lIG9mIEF6dXJlIGRhdGFzZXQgdG8gcmVnaXN0ZXIuCiAgICA6cGFyYW0gZGF0YXNldF9kZXNjcmlwdGlvbjogRGVzY3JpcHRpb24gb2YgQXp1cmUgZGF0YXNldCB0byByZWdpc3Rlci4KCiAgICA6cGFyYW0gY3JlYXRlX25ld192ZXJzaW9uOiAgUmVnaXN0ZXIgQXp1cmUgZGF0YXNldCBhcyBuZXcgdmVyc2lvbi4gTXVzdCBiZSB1c2VkIHdoZW4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBtb2RpZnlpbmcgZGF0YXNldCBzY2hlbWEuCiAgICA6cGFyYW0gbGFiZWxfY29sdW1uX25hbWU6ICAgVGFyZ2V0IGNvbHVtbiBpbiBkYXRhc2V0LgoKICAgIDpwYXJhbSByZWdpc3Rlcl9tb2RlbF9uYW1lOiBOYW1lIG9mIG1vZGVsIHRvIHJlZ2lzdGVyIGluIEF6dXJlLgogICAgOnBhcmFtIHNhdmVfbl9tb2RlbHM6ICAgICAgIEhvdyBtYW55IG9mIHRoZSB0b3AgcGVyZm9ybWluZyBtb2RlbHMgdG8gbG9nLgogICAgOnBhcmFtIGxvZ19henVyZTogICAgICAgICAgIERpc3BsYXlpbmcgQXp1cmUgbG9ncy4KICAgIDpwYXJhbSBhdXRvbWxfc2V0dGluZ3M6ICAgICBKU09OIHN0cmluZyBvZiBhbGwgQXp1cmUgQXV0b01MIHNldHRpbmdzLgogICAgIiIiCiAgICBpZiBub3QgYXV0b21sX3NldHRpbmdzOgogICAgICAgIGF1dG9tbF9zZXR0aW5ncyA9IHsKICAgICAgICAgICAgInRhc2siOiAiY2xhc3NpZmljYXRpb24iLAogICAgICAgICAgICAiZGVidWdfbG9nIjogImF1dG9tbF9lcnJvcnMubG9nIiwKICAgICAgICAgICAgIyAiZXhwZXJpbWVudF9leGl0X3Njb3JlIjogMC45LAogICAgICAgICAgICAiZW5hYmxlX2Vhcmx5X3N0b3BwaW5nIjogRmFsc2UsCiAgICAgICAgICAgICJhbGxvd2VkX21vZGVscyI6IFsiTG9naXN0aWNSZWdyZXNzaW9uIiwgIlNHRCIsICJTVk0iXSwKICAgICAgICAgICAgIml0ZXJhdGlvbnMiOiAzLAogICAgICAgICAgICAiaXRlcmF0aW9uX3RpbWVvdXRfbWludXRlcyI6IDIsCiAgICAgICAgICAgICJtYXhfY29uY3VycmVudF9pdGVyYXRpb25zIjogMiwKICAgICAgICAgICAgIm1heF9jb3Jlc19wZXJfaXRlcmF0aW9uIjogLTEsCiAgICAgICAgICAgICJuX2Nyb3NzX3ZhbGlkYXRpb25zIjogNSwKICAgICAgICAgICAgInByaW1hcnlfbWV0cmljIjogImFjY3VyYWN5IiwKICAgICAgICAgICAgImZlYXR1cml6YXRpb24iOiAib2ZmIiwKICAgICAgICAgICAgIm1vZGVsX2V4cGxhaW5hYmlsaXR5IjogRmFsc2UsCiAgICAgICAgICAgICJlbmFibGVfdm90aW5nX2Vuc2VtYmxlIjogRmFsc2UsCiAgICAgICAgICAgICJlbmFibGVfc3RhY2tfZW5zZW1ibGUiOiBGYWxzZSwKICAgICAgICB9CgogICAgIyBJbml0IGV4cGVyaW1lbnQgYW5kIGNvbXB1dGUKICAgIHdvcmtzcGFjZSwgZXhwZXJpbWVudCA9IF9pbml0X2V4cGVyaW1lbnQoCiAgICAgICAgY29udGV4dD1jb250ZXh0LCBleHBlcmltZW50X25hbWU9ZXhwZXJpbWVudF9uYW1lCiAgICApCgogICAgY29tcHV0ZV90YXJnZXQgPSBpbml0X2NvbXB1dGUoCiAgICA
gICAgY29udGV4dD1jb250ZXh0LAogICAgICAgIGNwdV9jbHVzdGVyX25hbWU9Y3B1X2NsdXN0ZXJfbmFtZSwKICAgICAgICB2bV9zaXplPXZtX3NpemUsCiAgICAgICAgbWF4X25vZGVzPW1heF9ub2RlcywKICAgICkKCiAgICAjIFJlZ2lzdGVyIGRhdGFzZXQKICAgIHJlZ2lzdGVyX2RhdGFzZXQoCiAgICAgICAgY29udGV4dD1jb250ZXh0LAogICAgICAgIGRhdGFzZXRfbmFtZT1kYXRhc2V0X25hbWUsCiAgICAgICAgZGF0YXNldF9kZXNjcmlwdGlvbj1kYXRhc2V0X2Rlc2NyaXB0aW9uLAogICAgICAgIGRhdGE9ZGF0YXNldCwKICAgICAgICBjcmVhdGVfbmV3X3ZlcnNpb249Y3JlYXRlX25ld192ZXJzaW9uLAogICAgKQoKICAgICMgU3VibWl0IHRyYWluaW5nIGpvYgogICAgc3VibWl0X3RyYWluaW5nX2pvYigKICAgICAgICBjb250ZXh0LAogICAgICAgIGV4cGVyaW1lbnQ9ZXhwZXJpbWVudCwKICAgICAgICBjb21wdXRlX3RhcmdldD1jb21wdXRlX3RhcmdldCwKICAgICAgICByZWdpc3Rlcl9tb2RlbF9uYW1lPXJlZ2lzdGVyX21vZGVsX25hbWUsCiAgICAgICAgcmVnaXN0ZXJlZF9kYXRhc2V0X25hbWU9ZGF0YXNldF9uYW1lLAogICAgICAgIGxhYmVsX2NvbHVtbl9uYW1lPWxhYmVsX2NvbHVtbl9uYW1lLAogICAgICAgIGF1dG9tbF9zZXR0aW5ncz1hdXRvbWxfc2V0dGluZ3MsCiAgICAgICAgdHJhaW5pbmdfc2V0PWRhdGFzZXQsCiAgICAgICAgc2hvd19vdXRwdXQ9bG9nX2F6dXJlLAogICAgICAgIHNhdmVfbl9tb2RlbHM9c2F2ZV9uX21vZGVscywKICAgICkK
+    commands:
+    - apt-get update && apt-get install -y --no-install-recommends git
+    - apt install -y liblttng-ust0
+    base_image: python:3.9-bullseye
+    origin_filename: ''
+  default_handler: train
+  allow_empty_resources: true
+  disable_auto_mount: false
+  image: ''
+  entry_points:
+    init_compute:
+      doc: 'Initialize Azure ML compute target to run experiment. Checks for
+
+        existing compute target and creates new if does not exist.'
+      name: init_compute
+      lineno: 102
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: cpu_cluster_name
+        type: str
+        doc: Name of Azure ML compute target. Created if does not exist.
+      - name: vm_size
+        type: str
+        doc: Azure machine type for compute target.
+        default: STANDARD_D2_V2
+      - name: max_nodes
+        type: int
+        doc: Maximum number of concurrent compute targets.
+        default: 1
+      outputs:
+      - doc: Azure ML Compute Target.
+        type: ComputeTarget
+      has_varargs: false
+    register_dataset:
+      doc: 'Register dataset object (can be also an Iguazio FeatureVector) in Azure
+        ML.
+
+        Uploads parquet file to Azure blob storage and registers
+
+        that file as a dataset in Azure ML.'
+      name: register_dataset
+      lineno: 138
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: dataset_name
+        type: str
+        doc: Name of Azure dataset to register.
+      - name: dataset_description
+        type: str
+        doc: Description of Azure dataset to register.
+      - name: data
+        type: DataItem
+        doc: MLRun FeatureVector or dataset object to upload.
+      - name: create_new_version
+        type: bool
+        doc: Register Azure dataset as new version. Must be used when modifying dataset
+          schema.
+        default: false
+      has_varargs: false
+    download_model:
+      doc: Download trained model from Azure ML to local filesystem.
+      name: download_model
+      lineno: 217
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: model_name
+        type: str
+        doc: Name of trained and registered model.
+      - name: model_version
+        type: int
+        doc: Version of model to download.
+      - name: target_dir
+        type: str
+        doc: Target directory to download model.
+        default: .
+      outputs:
+      - type: None
+      has_varargs: false
+    upload_model:
+      doc: Upload pre-trained model from local filesystem to Azure ML.
+      name: upload_model
+      lineno: 238
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: model_name
+        type: str
+        doc: Name of trained and registered model.
+      - name: model_path
+        type: str
+        doc: Path to file on local filesystem.
+      - name: model_description
+        type: str
+        doc: Description of models.
+        default: null
+      - name: model_tags
+        type: dict
+        doc: KV pairs of model tags.
+        default: null
+      outputs:
+      - type: None
+      has_varargs: false
+    submit_training_job:
+      doc: 'Submit training job to Azure AutoML and download trained model
+
+        when completed. Uses previously registered dataset for training.'
+      name: submit_training_job
+      lineno: 352
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: experiment
+        type: Experiment
+        doc: Azure experiment.
+      - name: compute_target
+        type: ComputeTarget
+        doc: Azure compute target.
+      - name: register_model_name
+        type: str
+        doc: Name of model to register in Azure.
+      - name: registered_dataset_name
+        type: str
+        doc: Name of dataset registered in Azure ML.
+      - name: automl_settings
+        type: dict
+        doc: JSON string of all Azure AutoML settings.
+      - name: training_set
+        type: DataItem
+        doc: Training set to log with model. For model monitoring integration.
+      - name: label_column_name
+        type: str
+        doc: Name of target column in dataset.
+        default: ''
+      - name: save_n_models
+        type: int
+        doc: How many of the top performing models to log.
+        default: 3
+      - name: show_output
+        type: bool
+        doc: Displaying Azure logs.
+        default: true
+      outputs:
+      - type: None
+      has_varargs: false
+    train:
+      doc: 'Whole training flow for Azure AutoML. Registers dataset/feature vector,
+
+        submits training job to Azure AutoML, and downloads trained model
+
+        when completed.'
+      name: train
+      lineno: 469
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: dataset
+        type: DataItem
+        doc: MLRun FeatureVector or dataset URI to upload. Will drop index before
+          uploading when it is a FeatureVector.
+      - name: experiment_name
+        type: str
+        doc: Name of experiment to create in Azure ML.
+        default: ''
+      - name: cpu_cluster_name
+        type: str
+        doc: Name of Azure ML compute target. Created if does not exist.
+        default: ''
+      - name: vm_size
+        type: str
+        doc: Azure machine type for compute target.
+        default: STANDARD_D2_V2
+      - name: max_nodes
+        type: int
+        doc: Maximum number of concurrent compute targets.
+        default: 1
+      - name: dataset_name
+        type: str
+        doc: Name of Azure dataset to register.
+        default: ''
+      - name: dataset_description
+        type: str
+        doc: Description of Azure dataset to register.
+        default: ''
+      - name: create_new_version
+        type: bool
+        doc: Register Azure dataset as new version. Must be used when modifying dataset
+          schema.
+        default: false
+      - name: label_column_name
+        type: str
+        doc: Target column in dataset.
+        default: ''
+      - name: register_model_name
+        type: str
+        doc: Name of model to register in Azure.
+        default: ''
+      - name: save_n_models
+        type: int
+        doc: How many of the top performing models to log.
+        default: 1
+      - name: log_azure
+        type: bool
+        doc: Displaying Azure logs.
+        default: true
+      - name: automl_settings
+        type: str
+        doc: JSON string of all Azure AutoML settings.
+        default: null
+      outputs:
+      - type: None
+      has_varargs: false
+  description: Azure AutoML integration in MLRun, including utils functions for training
+    models on Azure AutoML platfrom.
+kind: job
+metadata:
+  categories:
+  - model-serving
+  - utils
+  tag: ''
+  name: azureml-utils
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/static/item.html b/functions/master/azureml_utils/1.4.0/static/item.html new file mode 100644 index 00000000..60bd9b2e --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/static/item.html @@ -0,0 +1,73 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- model-serving
+- utils
+description: Azure AutoML integration in MLRun, including utils functions for training
+  models on Azure AutoML platfrom.
+doc: ''
+example: azureml_utils.ipynb
+generationDate: 2022-08-28:17-25
+hidden: false
+icon: ''
+labels:
+  author: yonish
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.7.0
+name: azureml_utils
+platformVersion: 3.5.3
+spec:
+  extra_spec:
+    allow_empty_resources: true
+    build:
+      auto_build: true
+      commands:
+      - apt-get update && apt-get install -y --no-install-recommends git
+      - apt install -y liblttng-ust0
+      with_mlrun: true
+  filename: azureml_utils.py
+  handler: train
+  image: python:3.9-bullseye
+  kind: job
+  requirements:
+  - azureml-core==1.54.0.post1
+  - azureml-train-automl-client==1.54.0.post1
+  - plotly~=5.4
+url: ''
+version: 1.4.0
+test_valid: True
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/azureml_utils/1.4.0/static/source.html b/functions/master/azureml_utils/1.4.0/static/source.html new file mode 100644 index 00000000..9c61cff8 --- /dev/null +++ b/functions/master/azureml_utils/1.4.0/static/source.html @@ -0,0 +1,603 @@ + + + + + + + + + + + Source + + + + +
+        
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import json
+import logging
+from typing import Tuple, List
+
+from mlrun import MLClientCtx, DataItem, get_dataitem
+import mlrun.feature_store as f_store
+import mlrun.datastore
+import mlrun.utils
+from mlrun.datastore.targets import ParquetTarget
+
+from azureml.core.authentication import ServicePrincipalAuthentication
+from azureml.core.workspace import Workspace
+from azureml.core.experiment import Experiment
+from azureml.core.dataset import Dataset
+from azureml.core.model import Model
+from azureml.core.compute import ComputeTarget, AmlCompute
+from azureml.core.compute_target import ComputeTargetException
+from azureml.core.script_run import ScriptRun
+
+from azureml.train.automl import AutoMLConfig
+from azureml.train.automl.run import AutoMLRun
+
+
+def _env_or_secret(context, key):
+    if key in os.environ:
+        return os.environ[key]
+    return context.get_secret(key)
+
+
+def _load_workspace(context: MLClientCtx) -> Workspace:
+    """
+    Loading AzureML Workspace with Azure secrets.
+
+    :param context: MLRun context.
+    :returns:       AzureML Workspace
+    """
+
+    if hasattr(context, "_azure_workspace"):
+        return context._azure_workspace
+
+    context.logger.info("Loading AzureML Workspace")
+    # Azure service authentication:
+    service_authentication = ServicePrincipalAuthentication(
+        tenant_id=_env_or_secret(context, "AZURE_TENANT_ID"),
+        service_principal_id=_env_or_secret(context, "AZURE_SERVICE_PRINCIPAL_ID"),
+        service_principal_password=_env_or_secret(
+            context, "AZURE_SERVICE_PRINCIPAL_PASSWORD"
+        ),
+    )
+
+    # Loading Azure workspace:
+    workspace = Workspace(
+        subscription_id=_env_or_secret(context, "AZURE_SUBSCRIPTION_ID"),
+        resource_group=_env_or_secret(context, "AZURE_RESOURCE_GROUP"),
+        workspace_name=_env_or_secret(context, "AZURE_WORKSPACE_NAME"),
+        auth=service_authentication,
+    )
+
+    context._azure_workspace = workspace
+    return workspace
+
+
+def _init_experiment(
+    context: MLClientCtx, experiment_name: str
+) -> Tuple[Workspace, Experiment]:
+    """
+    Initialize workspace and experiment in Azure ML. Uses Service
+    Principal authentication via environment variables.
+
+    :param context:         MLRun context.
+    :param experiment_name: Name of experiment to create in Azure ML.
+    :returns:               Azure ML Workspace and Experiment.
+    """
+
+    # Initialize experiment via Service Principal Authentication:
+    # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication#use-service-principal-authentication
+
+    workspace = _load_workspace(context)
+
+    context.logger.info(f"Initializing AzureML experiment {experiment_name}")
+    # Creating experiment:
+    experiment = Experiment(workspace, experiment_name)
+
+    return workspace, experiment
+
+
+def init_compute(
+    context: MLClientCtx,
+    cpu_cluster_name: str,
+    vm_size: str = "STANDARD_D2_V2",
+    max_nodes: int = 1,
+) -> ComputeTarget:
+    """
+    Initialize Azure ML compute target to run experiment. Reuses an existing
+    compute target with the given name, or creates a new one if none is found.
+
+    :param context:          MLRun context.
+    :param cpu_cluster_name: Name of Azure ML compute target. Created if does not exist.
+    :param vm_size:          Azure machine type for compute target (used only on creation).
+    :param max_nodes:        Maximum number of nodes of a newly created compute target.
+    :returns:                Azure ML Compute Target.
+    """
+
+    workspace = _load_workspace(context)
+    context.logger.info(f"Initializing AzureML compute target {cpu_cluster_name}")
+
+    # Look up the cluster by name; a ComputeTargetException triggers its creation:
+    try:
+        compute_target = ComputeTarget(workspace=workspace, name=cpu_cluster_name)
+        context.logger.info("Found existing cluster, will use it.")
+    except ComputeTargetException:
+        compute_config = AmlCompute.provisioning_configuration(
+            vm_size=vm_size, max_nodes=max_nodes
+        )
+        compute_target = ComputeTarget.create(
+            workspace, cpu_cluster_name, compute_config
+        )
+    # Wait on the compute target in both cases (found or newly created):
+    compute_target.wait_for_completion(show_output=True)
+    return compute_target
+
+
+def register_dataset(
+    context: MLClientCtx,
+    dataset_name: str,
+    dataset_description: str,
+    data: DataItem,
+    create_new_version: bool = False,
+):
+    """
+    Register dataset object (can be also an Iguazio FeatureVector) in Azure ML.
+    Uploads the data to the workspace's default Azure blob datastore and
+    registers the uploaded file as a tabular dataset in Azure ML.
+
+    :param context:               MLRun context.
+    :param dataset_name:          Name of Azure dataset to register.
+    :param dataset_description:   Description of Azure dataset to register.
+    :param data:                  MLRun FeatureVector or dataset object to upload.
+    :param create_new_version:    Register Azure dataset as new version. Must be used when
+                                  modifying dataset schema.
+    """
+
+    # test for Azure storage connection environment variable or secret:
+    assert _env_or_secret(
+        context, "AZURE_STORAGE_CONNECTION_STRING"
+    ), "AZURE_STORAGE_CONNECTION_STRING secret not set"
+
+    # Connect to AzureML experiment and datastore:
+    context.logger.info("Connecting to AzureML experiment default datastore")
+
+    workspace = _load_workspace(context)
+    datastore = workspace.get_default_datastore()
+
+    # Azure blob path (default datastore for workspace):
+    blob_path = f"az://{datastore.container_name}/{dataset_name}"
+
+    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(data.artifact_url)
+    feature_vector_case = mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix
+    # Upload the data source to the Azure blob path:
+    if feature_vector_case:
+        # FeatureVector case:
+        context.logger.info(
+            f"Retrieving feature vector and uploading to Azure blob storage: {blob_path}"
+        )
+        f_store.get_offline_features(data.meta.uri, target=ParquetTarget(path=blob_path))
+    else:
+        blob_path += data.suffix
+        # DataItem case:
+        context.logger.info(
+            f"Retrieving data item and uploading to Azure blob storage: {blob_path}"
+        )
+        data_in_bytes = data.get()
+        get_dataitem(blob_path).put(data_in_bytes)
+    # NOTE(review): the feature vector target path has no ".parquet" suffix, yet one is registered below - confirm.
+    # Register dataset in AzureML:
+    context.logger.info(f"Registering dataset {dataset_name} in Azure ML")
+    if data.suffix == ".parquet" or feature_vector_case:
+        dataset = Dataset.Tabular.from_parquet_files(
+            path=(datastore, f"{dataset_name}.parquet"), validate=False
+        )
+    else:
+        context.logger.info(
+            "OpenSSL version must be 1.1. Overriding the OpenSSL version to 1.1"
+        )
+        # OpenSSL version must be 1.1
+        os.environ["CLR_OPENSSL_VERSION_OVERRIDE"] = "1.1"
+        dataset = Dataset.Tabular.from_delimited_files(
+            path=(datastore, f"{dataset_name}{data.suffix}"), validate=False
+        )
+
+    dataset.register(
+        workspace=workspace,
+        name=dataset_name,
+        description=dataset_description,
+        create_new_version=create_new_version,
+    )
+
+    # Log the Azure blob path the data was uploaded to as a run result:
+    context.log_result("dataset_blob_path", blob_path)
+
+
+def download_model(
+    context: MLClientCtx,
+    model_name: str,
+    model_version: int,
+    target_dir: str = ".",
+) -> None:
+    """
+    Download trained model from Azure ML to local filesystem.
+
+    :param context:       MLRun context.
+    :param model_name:    Name of trained and registered model.
+    :param model_version: Version of model to download.
+    :param target_dir:    Target directory to download the model into.
+    """
+    # Load the Azure ML workspace the model is looked up in:
+    workspace = _load_workspace(context)
+    context.logger.info(f"Downloading model {model_name}:{model_version}")
+    model = Model(workspace, model_name, version=model_version)
+    model.download(target_dir=target_dir, exist_ok=True)
+
+
+def upload_model(
+    context: MLClientCtx,
+    model_name: str,
+    model_path: str,
+    model_description: str = None,
+    model_tags: dict = None,
+) -> None:
+    """
+    Register a pre-trained model from the local filesystem in Azure ML.
+    :param context:           MLRun context.
+    :param model_name:        Name to register the model under.
+    :param model_path:        Path to file on local filesystem.
+    :param model_description: Description of the model.
+    :param model_tags:        KV pairs of model tags.
+    """
+    # Load the Azure ML workspace the model is registered in:
+    workspace = _load_workspace(context)
+
+    context.logger.info(f"Upload model {model_name} from {model_path}")
+    Model.register(
+        workspace=workspace,
+        model_path=model_path,
+        model_name=model_name,
+        description=model_description,
+        tags=model_tags,
+    )
+
+
+def _get_top_n_runs(
+    remote_run: AutoMLRun, n: int = 5, primary_metric: str = "accuracy"
+) -> List[ScriptRun]:
+    """
+    Get top N complete runs from experiment sorted by primary metric.
+
+    :param remote_run:     Azure ML Run.
+    :param n:              Number of top runs to return.
+    :param primary_metric: Metric to sort by.
+    :raises ValueError:    If fewer than `n` completed runs are available.
+    :returns:              List of top N runs sorted by primary metric.
+    """
+    # Collect completed child runs, skipping those whose id contains "setup" or "worker":
+    complete_runs = [
+        run
+        for run in remote_run.get_children(status="Completed")
+        if not any(s in run.id for s in ["setup", "worker"])
+    ]
+
+    # Checking that the required number of runs are done:
+    if len(complete_runs) < n:
+        raise ValueError(f"Expected {n} runs but only received {len(complete_runs)}")
+    # NOTE(review): descending order assumes a higher metric value is better - confirm for loss-like metrics.
+    # Sorting by the primary metric, highest value first:
+    sorted_runs = sorted(
+        complete_runs, key=lambda run: run.get_metrics()[primary_metric], reverse=True
+    )
+    return sorted_runs[:n]
+
+
+def _get_model_hp(
+    run: ScriptRun,
+) -> dict:
+    """
+    Get hyper-parameters of trained AzureML model.
+    Combine the hyper-parameters of the data transformation and training to a dictionary.
+    The prefix of the dictionary keys corresponds to 'data transformation' and 'training'.
+
+    :param run: Run object of AzureML trained model.
+
+    :returns:    Flat dict keyed by 'data_trans_*' / 'train_*'; empty if the run has no pipeline spec objects.
+    """
+
+    spec_field = "pipeline_spec"
+    if spec_field not in run.properties:
+        return {}
+    spec_string = run.properties[spec_field]
+    spec_dict = json.loads(spec_string)
+
+    if "objects" not in spec_dict:
+        # No hyper-params
+        return {}
+    hp_dicts = spec_dict["objects"]
+    # after training there are two hyper-parameters dicts inside the run object:
+    assert (
+        len(hp_dicts) == 2
+    ), "after training there are two hyper-parameters dicts inside the run object"
+    result_dict = {}
+    dict_keys = [
+        ["data_trans_class_name", "data_trans_module", "data_trans_spec_class"],
+        [
+            "train_class_name",
+            "train_module",
+            "train_param_kwargs_C",
+            "train_param_kwargs_class_weight",
+            "train_spec_class",
+        ],
+    ]
+
+    # creating hyper-params dict with key prefixes for each part:
+    kwargs_prefix = "param_kwargs"
+    for d, name, keys in zip(hp_dicts, ["data_trans", "train"], dict_keys):
+        for key in keys:
+            # Entries missing from the spec (e.g. a model without a "C" kwarg) resolve to None here:
+            if kwargs_prefix in key:
+                result_dict[key] = d.get(kwargs_prefix, {}).get(
+                    key.replace(f"{name}_{kwargs_prefix}_", "")
+                )
+            else:
+                result_dict[key] = d.get(key.replace(f"{name}_", ""))
+            if not result_dict[key]:
+                result_dict[key] = ""
+
+    return result_dict
+
+
+def submit_training_job(
+    context: MLClientCtx,
+    experiment: Experiment,
+    compute_target: ComputeTarget,
+    register_model_name: str,
+    registered_dataset_name: str,
+    automl_settings: dict,
+    training_set: DataItem,
+    label_column_name: str = '',
+    save_n_models: int = 3,
+    show_output: bool = True,
+) -> None:
+    """
+    Submit training job to Azure AutoML and download trained model
+    when completed. Uses previously registered dataset for training.
+
+    :param context:                 MLRun context.
+    :param experiment:              Azure experiment.
+    :param compute_target:          Azure compute target.
+    :param register_model_name:     Name of model to register in Azure.
+    :param registered_dataset_name: Name of dataset registered in Azure ML.
+    :param label_column_name:       Name of target column in dataset.
+    :param automl_settings:         Dictionary of Azure AutoML settings. Must include 'primary_metric'.
+    :param training_set:            Training set to log with model. For model
+                                    monitoring integration.
+    :param show_output:             Displaying Azure logs.
+    :param save_n_models:           How many of the top performing models to log.
+    """
+    # Load the Azure ML workspace holding the registered dataset:
+    workspace = _load_workspace(context)
+
+    # Setup experiment:
+    context.logger.info("Setting up experiment parameters")
+    dataset = Dataset.get_by_name(workspace, name=registered_dataset_name)
+
+    # Get training set to log with model:
+    feature_vector = None
+    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(training_set.artifact_url)
+    if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
+        feature_vector = training_set.meta.uri
+        label_column_name = label_column_name or training_set.meta.status.label_column
+        context.logger.info(f'label column name: {label_column_name}')
+        training_set = f_store.get_offline_features(feature_vector).to_dataframe()
+    else:
+        training_set = training_set.as_df()
+
+    automl_config = AutoMLConfig(
+        compute_target=compute_target,
+        training_data=dataset,
+        verbosity=logging.INFO,
+        label_column_name=label_column_name,
+        **automl_settings,
+    )
+
+    # Run experiment on AzureML:
+    context.logger.info("Submitting and running experiment")
+    remote_run = experiment.submit(automl_config)
+    remote_run.wait_for_completion(show_output=show_output)
+    if show_output:
+        # Azure log ending row:
+        print(f"\n{'*' * 92}\n")
+    # Get top N runs to log:
+    top_runs = _get_top_n_runs(
+        remote_run=remote_run,
+        n=save_n_models,
+        primary_metric=automl_settings["primary_metric"],
+    )
+
+    # Register, download, and log models:
+    for i, run in enumerate(top_runs):
+        # Register model:
+        context.logger.info("Registering model")
+        model = run.register_model(
+            model_name=register_model_name, model_path="outputs/model.pkl"
+        )
+        context.logger.info(
+            f"Registered model with name '{model.name}', id '{model.id}', version '{model.version}'"
+        )
+
+        # Download model locally:
+        download_model(
+            context=context,
+            model_name=register_model_name,
+            model_version=model.version,
+            target_dir=f"./{model.version}",
+        )
+        # Collect metrics; the two table-like entries are dropped only if the run reported them:
+        metrics = {k.lower(): val for k, val in run.get_metrics().items()}
+        metrics.pop("confusion_matrix", None)
+        metrics.pop("accuracy_table", None)
+
+        # Collect model hyper-parameters:
+        model_hp_dict = _get_model_hp(run)
+        with context.get_child_context(**model_hp_dict) as child:
+            model_key = f"model_{i + 1}_{model_hp_dict.get('data_trans_class_name', '').lower()}_{model_hp_dict.get('train_class_name', '').lower()}"
+            # Log model:
+            context.logger.info(
+                f"Logging {model_key} model to MLRun"
+            )
+            child.log_results(metrics)
+            child.log_model(
+                "model",
+                db_key=model_key,
+                artifact_path=context.artifact_subpath("models"),
+                metrics=metrics,
+                model_file=f"{model.version}/model.pkl",
+                training_set=training_set,
+                label_column=label_column_name,
+                feature_vector=feature_vector,
+                framework="AzureML",
+                algorithm=model_hp_dict.get("train_class_name"),
+            )
+            if i == 0:
+                # top_runs is sorted best-first, so the first child is marked as the best iteration:
+                child.mark_as_best()
+
+
+def train(
+    # MlRun
+    context: MLClientCtx,
+    dataset: DataItem,
+    # Init experiment and compute
+    experiment_name: str = "",
+    cpu_cluster_name: str = "",
+    vm_size: str = "STANDARD_D2_V2",
+    max_nodes: int = 1,
+    # Register dataset
+    dataset_name: str = "",
+    dataset_description: str = "",
+    create_new_version: bool = False,
+    label_column_name: str = "",
+    # Submit training job
+    register_model_name: str = "",
+    save_n_models: int = 1,
+    log_azure: bool = True,
+    automl_settings: str = None,
+) -> None:
+    """
+    Whole training flow for Azure AutoML. Registers dataset/feature vector,
+    submits training job to Azure AutoML, and downloads trained model
+    when completed.
+
+    :param context:             MLRun context.
+    :param dataset:             MLRun FeatureVector or dataset URI to upload. Will drop
+                                index before uploading when it is a FeatureVector.
+    :param experiment_name:     Name of experiment to create in Azure ML.
+    :param cpu_cluster_name:    Name of Azure ML compute target. Created if does not exist.
+    :param vm_size:             Azure machine type for compute target.
+    :param max_nodes:           Maximum number of nodes of a newly created compute target.
+
+    :param dataset_name:        Name of Azure dataset to register.
+    :param dataset_description: Description of Azure dataset to register.
+    :param create_new_version:  Register Azure dataset as new version. Must be used when
+                                modifying dataset schema.
+    :param label_column_name:   Target column in dataset.
+
+    :param register_model_name: Name of model to register in Azure.
+    :param save_n_models:       How many of the top performing models to log.
+    :param log_azure:           Displaying Azure logs.
+    :param automl_settings:     JSON string (or dict) of Azure AutoML settings. Defaults are used if empty.
+    """
+    # Settings documented as a JSON string must be decoded before they are unpacked as kwargs:
+    if isinstance(automl_settings, str) and automl_settings:
+        automl_settings = json.loads(automl_settings)
+    if not automl_settings:
+        automl_settings = {
+            "task": "classification",
+            "debug_log": "automl_errors.log",
+            # "experiment_exit_score": 0.9,
+            "enable_early_stopping": False,
+            "allowed_models": ["LogisticRegression", "SGD", "SVM"],
+            "iterations": 3,
+            "iteration_timeout_minutes": 2,
+            "max_concurrent_iterations": 2,
+            "max_cores_per_iteration": -1,
+            "n_cross_validations": 5,
+            "primary_metric": "accuracy",
+            "featurization": "off",
+            "model_explainability": False,
+            "enable_voting_ensemble": False,
+            "enable_stack_ensemble": False,
+        }
+
+    # Init experiment and compute
+    workspace, experiment = _init_experiment(
+        context=context, experiment_name=experiment_name
+    )
+
+    compute_target = init_compute(
+        context=context,
+        cpu_cluster_name=cpu_cluster_name,
+        vm_size=vm_size,
+        max_nodes=max_nodes,
+    )
+
+    # Register dataset
+    register_dataset(
+        context=context,
+        dataset_name=dataset_name,
+        dataset_description=dataset_description,
+        data=dataset,
+        create_new_version=create_new_version,
+    )
+
+    # Submit training job
+    submit_training_job(
+        context,
+        experiment=experiment,
+        compute_target=compute_target,
+        register_model_name=register_model_name,
+        registered_dataset_name=dataset_name,
+        label_column_name=label_column_name,
+        automl_settings=automl_settings,
+        training_set=dataset,
+        show_output=log_azure,
+        save_n_models=save_n_models,
+    )
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/azureml_utils/latest/src/function.yaml b/functions/master/azureml_utils/latest/src/function.yaml index f9c66bdf..a6348996 100644 --- a/functions/master/azureml_utils/latest/src/function.yaml +++ b/functions/master/azureml_utils/latest/src/function.yaml @@ -1,38 +1,32 @@ -kind: job -metadata: - name: azureml-utils - tag: '' - hash: b70ddba5204c2f52a9582abe363d9de4d5d94d52 - project: '' - labels: - author: yonish - categories: - - machine-learning - - model-training +verbose: false spec: command: '' - args: [] - image: '' build: - functionSourceCode:  - base_image: python:3.9-bullseye - commands: - - apt-get update && apt-get install -y --no-install-recommends git - - apt install -y liblttng-ust0 + auto_build: true code_origin: '' - origin_filename: '' with_mlrun: true - auto_build: true requirements: - azureml-core==1.54.0.post1 - azureml-train-automl-client==1.54.0.post1 - plotly~=5.4 + functionSourceCode:  + commands: + - apt-get update && apt-get install -y --no-install-recommends git + - apt install -y liblttng-ust0 + base_image: python:3.9-bullseye + origin_filename: '' + default_handler: train + allow_empty_resources: true + disable_auto_mount: false + image: '' entry_points: init_compute: - name: init_compute doc: 'Initialize Azure ML compute target to run experiment. Checks for existing compute target and creates new if does not exist.' + name: init_compute + lineno: 102 + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -51,16 +45,17 @@ spec: outputs: - doc: Azure ML Compute Target. type: ComputeTarget - default: '' - lineno: 102 + has_varargs: false register_dataset: - name: register_dataset doc: 'Register dataset object (can be also an Iguazio FeatureVector) in Azure ML. Uploads parquet file to Azure blob storage and registers that file as a dataset in Azure ML.' 
+ name: register_dataset + lineno: 138 + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -79,12 +74,12 @@ spec: doc: Register Azure dataset as new version. Must be used when modifying dataset schema. default: false - outputs: - - default: '' - lineno: 138 + has_varargs: false download_model: - name: download_model doc: Download trained model from Azure ML to local filesystem. + name: download_model + lineno: 217 + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -100,11 +95,13 @@ spec: doc: Target directory to download model. default: . outputs: - - default: '' - lineno: 217 + - type: None + has_varargs: false upload_model: - name: upload_model doc: Upload pre-trained model from local filesystem to Azure ML. + name: upload_model + lineno: 238 + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -124,13 +121,15 @@ spec: doc: KV pairs of model tags. default: null outputs: - - default: '' - lineno: 238 + - type: None + has_varargs: false submit_training_job: - name: submit_training_job doc: 'Submit training job to Azure AutoML and download trained model when completed. Uses previously registered dataset for training.' + name: submit_training_job + lineno: 352 + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -166,15 +165,17 @@ spec: doc: Displaying Azure logs. default: true outputs: - - default: '' - lineno: 352 + - type: None + has_varargs: false train: - name: train doc: 'Whole training flow for Azure AutoML. Registers dataset/feature vector, submits training job to Azure AutoML, and downloads trained model when completed.' + name: train + lineno: 469 + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -233,18 +234,14 @@ spec: doc: JSON string of all Azure AutoML settings. default: null outputs: - - default: '' - lineno: 469 + - type: None + has_varargs: false description: Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom. 
- default_handler: train - disable_auto_mount: false - allow_empty_resources: true - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false +kind: job +metadata: + categories: + - model-serving + - utils + tag: '' + name: azureml-utils diff --git a/functions/master/azureml_utils/latest/src/item.yaml b/functions/master/azureml_utils/latest/src/item.yaml index 04733b84..0b4d5e49 100644 --- a/functions/master/azureml_utils/latest/src/item.yaml +++ b/functions/master/azureml_utils/latest/src/item.yaml @@ -1,7 +1,7 @@ apiVersion: v1 categories: -- machine-learning -- model-training +- model-serving +- utils description: Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom. doc: '' @@ -13,7 +13,7 @@ labels: author: yonish maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: azureml_utils platformVersion: 3.5.3 spec: @@ -34,5 +34,5 @@ spec: - azureml-train-automl-client==1.54.0.post1 - plotly~=5.4 url: '' -version: 1.3.0 +version: 1.4.0 test_valid: True diff --git a/functions/master/azureml_utils/latest/static/azureml_utils.html b/functions/master/azureml_utils/latest/static/azureml_utils.html index 82356dfb..9a6dc0d3 100644 --- a/functions/master/azureml_utils/latest/static/azureml_utils.html +++ b/functions/master/azureml_utils/latest/static/azureml_utils.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_utils/latest/static/documentation.html b/functions/master/azureml_utils/latest/static/documentation.html index 89713ebc..ae451127 100644 --- a/functions/master/azureml_utils/latest/static/documentation.html +++ b/functions/master/azureml_utils/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_utils/latest/static/example.html b/functions/master/azureml_utils/latest/static/example.html index e008bec6..13e6a910 100644 --- 
a/functions/master/azureml_utils/latest/static/example.html +++ b/functions/master/azureml_utils/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/azureml_utils/latest/static/function.html b/functions/master/azureml_utils/latest/static/function.html index f2e3e083..3aaa82f5 100644 --- a/functions/master/azureml_utils/latest/static/function.html +++ b/functions/master/azureml_utils/latest/static/function.html @@ -28,41 +28,35 @@
         
-kind: job
-metadata:
-  name: azureml-utils
-  tag: ''
-  hash: b70ddba5204c2f52a9582abe363d9de4d5d94d52
-  project: ''
-  labels:
-    author: yonish
-  categories:
-  - machine-learning
-  - model-training
+verbose: false
 spec:
   command: ''
-  args: []
-  image: ''
   build:
-    functionSourceCode: 
-    base_image: python:3.9-bullseye
-    commands:
-    - apt-get update && apt-get install -y --no-install-recommends git
-    - apt install -y liblttng-ust0
+    auto_build: true
     code_origin: ''
-    origin_filename: ''
     with_mlrun: true
-    auto_build: true
     requirements:
     - azureml-core==1.54.0.post1
     - azureml-train-automl-client==1.54.0.post1
     - plotly~=5.4
+    functionSourceCode: 
+    commands:
+    - apt-get update && apt-get install -y --no-install-recommends git
+    - apt install -y liblttng-ust0
+    base_image: python:3.9-bullseye
+    origin_filename: ''
+  default_handler: train
+  allow_empty_resources: true
+  disable_auto_mount: false
+  image: ''
   entry_points:
     init_compute:
-      name: init_compute
       doc: 'Initialize Azure ML compute target to run experiment. Checks for
 
         existing compute target and creates new if does not exist.'
+      name: init_compute
+      lineno: 102
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
@@ -81,16 +75,17 @@
       outputs:
       - doc: Azure ML Compute Target.
         type: ComputeTarget
-        default: ''
-      lineno: 102
+      has_varargs: false
     register_dataset:
-      name: register_dataset
       doc: 'Register dataset object (can be also an Iguazio FeatureVector) in Azure
         ML.
 
         Uploads parquet file to Azure blob storage and registers
 
         that file as a dataset in Azure ML.'
+      name: register_dataset
+      lineno: 138
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
@@ -109,12 +104,12 @@
         doc: Register Azure dataset as new version. Must be used when modifying dataset
           schema.
         default: false
-      outputs:
-      - default: ''
-      lineno: 138
+      has_varargs: false
     download_model:
-      name: download_model
       doc: Download trained model from Azure ML to local filesystem.
+      name: download_model
+      lineno: 217
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
@@ -130,11 +125,13 @@
         doc: Target directory to download model.
         default: .
       outputs:
-      - default: ''
-      lineno: 217
+      - type: None
+      has_varargs: false
     upload_model:
-      name: upload_model
       doc: Upload pre-trained model from local filesystem to Azure ML.
+      name: upload_model
+      lineno: 238
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
@@ -154,13 +151,15 @@
         doc: KV pairs of model tags.
         default: null
       outputs:
-      - default: ''
-      lineno: 238
+      - type: None
+      has_varargs: false
     submit_training_job:
-      name: submit_training_job
       doc: 'Submit training job to Azure AutoML and download trained model
 
         when completed. Uses previously registered dataset for training.'
+      name: submit_training_job
+      lineno: 352
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
@@ -196,15 +195,17 @@
         doc: Displaying Azure logs.
         default: true
       outputs:
-      - default: ''
-      lineno: 352
+      - type: None
+      has_varargs: false
     train:
-      name: train
       doc: 'Whole training flow for Azure AutoML. Registers dataset/feature vector,
 
         submits training job to Azure AutoML, and downloads trained model
 
         when completed.'
+      name: train
+      lineno: 469
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
@@ -263,21 +264,17 @@
         doc: JSON string of all Azure AutoML settings.
         default: null
       outputs:
-      - default: ''
-      lineno: 469
+      - type: None
+      has_varargs: false
   description: Azure AutoML integration in MLRun, including utils functions for training
     models on Azure AutoML platfrom.
-  default_handler: train
-  disable_auto_mount: false
-  allow_empty_resources: true
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
+kind: job
+metadata:
+  categories:
+  - model-serving
+  - utils
+  tag: ''
+  name: azureml-utils
 
         
     
diff --git a/functions/master/azureml_utils/latest/static/item.html b/functions/master/azureml_utils/latest/static/item.html index 5eb0f63b..60bd9b2e 100644 --- a/functions/master/azureml_utils/latest/static/item.html +++ b/functions/master/azureml_utils/latest/static/item.html @@ -30,8 +30,8 @@ apiVersion: v1 categories: -- machine-learning -- model-training +- model-serving +- utils description: Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom. doc: '' @@ -43,7 +43,7 @@ author: yonish maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: azureml_utils platformVersion: 3.5.3 spec: @@ -64,7 +64,7 @@ - azureml-train-automl-client==1.54.0.post1 - plotly~=5.4 url: '' -version: 1.3.0 +version: 1.4.0 test_valid: True diff --git a/functions/master/batch_inference/1.8.0/src/batch_inference.ipynb b/functions/master/batch_inference/1.8.0/src/batch_inference.ipynb new file mode 100644 index 00000000..d949bf23 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/src/batch_inference.ipynb @@ -0,0 +1,1789 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Batch Inference\n", + "\n", + "A function for inferring given input through a given model while producing a **Result Set** and performing **Data Drift Analysis**.\n", + "\n", + "In this notebook we will go over the function's docs and outputs and see an end-to-end example of running it.\n", + "\n", + "1. [Documentation](#chapter1)\n", + "2. [Results Prediction](#chapter2)\n", + "3. [Data Drift Analysis](#chapter3)\n", + "4. [End-to-end Demo](#chapter4)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "## 1. Documentation\n", + "\n", + "Perform a prediction on a given dataset with the given model. 
Can perform drift analysis between the sample set statistics stored in the model to the current input data. The drift rule is the value per-feature mean of the TVD and Hellinger scores according to the thresholds configures here." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### 1.1. Parameters:\n", + "* **context**: `mlrun.MLClientCtx`\n", + "\n", + " An MLRun context.\n", + " \n", + "* **model**: `str`\n", + " \n", + " The model Store path, a logged model URI.\n", + " \n", + "* **dataset**: `Union[mlrun.DataItem, list, dict, pd.DataFrame, pd.Series, np.ndarray]`\n", + " \n", + " The dataset to infer through the model.\n", + " * Can be passed in `inputs` as either a Dataset artifact / Feature vector URI.\n", + " * Or, in `parameters` as a list, dictionary or numpy array.\n", + " \n", + " \n", + "* **drop_columns**: `Union[str, List[str], int, List[int]]` = `None`\n", + " \n", + " A string / integer or a list of strings / integers that represent the column names / indices to drop. When the dataset is a list or a numpy array this parameter must be represented by integers.\n", + " \n", + "* **label_columns**: `Union[str, List[str]]` = `None`\n", + " \n", + " The target label(s) of the column(s) in the dataset. These names will be used as the column names for the predictions. The label column can be accessed from the model object, or the feature vector provided if available. The default name is `\"predicted_label_i\"` for the `i` column.\n", + "\n", + "* **feature_columns**: `Union[str, List[str]]` = `None`\n", + " \n", + " List of feature columns that will be used to build the dataframe when dataset is\n", + " from type list or numpy array.\n", + "\n", + "* **log_result_set**: `str` = `True`\n", + " \n", + " Whether to log the result set - a DataFrame of the given inputs concatenated with the predictions. 
Defaulted to `True`.\n", + "\n", + "* **result_set_name**: `str` = `\"prediction\"`\n", + " \n", + " The db key to set name of the prediction result and the filename. Defaulted to `\"prediction\"`.\n", + "\n", + "* **batch_id**: `str` = `None`\n", + "\n", + " The ID of the given batch (inference dataset). If `None`, it will be generated. Will be logged as a result of the run.\n", + "\n", + "* **perform_drift_analysis**: `bool` = `None`\n", + " \n", + " Whether to perform drift analysis between the sample set of the model object to the dataset given. By default, `None`, which means it will perform drift analysis if the model has a sample set statistics.\n", + "\n", + "\n", + "* **sample_set**: `Union[mlrun.DataItem, list, dict, pd.DataFrame, pd.Series, np.ndarray]`\n", + " \n", + " A sample dataset to give to compare the inputs in the drift analysis. The default chosen sample set will always be the one who is set in the model artifact itself.\n", + " * Can be passed in `inputs` as either a Dataset artifact / Feature vector URI.\n", + " * Or, in `parameters` as a list, dictionary or numpy array.\n", + "\n", + "\n", + "* **drift_threshold**: `float` = `0.7`\n", + " \n", + " The threshold of which to mark drifts. Defaulted to 0.7.\n", + "\n", + "* **possible_drift_threshold**: `float` = `0.5`\n", + " \n", + " The threshold of which to mark possible drifts. Defaulted to 0.5.\n", + "\n", + "* **inf_capping**: `float` = `10.0`\n", + " \n", + " The value to set for when it reached infinity. Defaulted to 10.0.\n", + "\n", + "* **artifacts_tag**: `str` = `\"\"`\n", + " \n", + " Tag to use for all the artifacts resulted from the function. Defaulted to no tag." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### 1.2. 
Outputs\n", + "\n", + "The outputs are split to two actions the functions can perform:\n", + "* [**Results Prediction**](#chapter2) - Will log:\n", + " * A dataset artifact named by the `result_set_name` parameter.\n", + " * A `str` result named `\"batch_id\"` of the given / generated batch ID.\n", + "\n", + "* [**Data Drift Analysis**](#chapter3) - Will log:\n", + " * A `plotly` artifact named `\"data_drift_table\"` with a visualization of the drifts results and histograms.\n", + " * A json artifact named `\"features_drift_results\"` with all the features metric values.\n", + " * A `bool` result named `\"drift_status\"` of the overall drift status (`True` if there was a drift and `False` otherwise).\n", + " * A `float` result named `\"drift_score\"` of the overall drift metric score.\n", + "\n", + "For more details, see the next chapters." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "## 2. Results Prediction\n", + "\n", + "The result set is a concatenated dataset of the inputs ($X$) provided and the predictions ($Y$) yielded by the model, so it will be $X | Y$.\n", + "\n", + "For example, if the `dataset` given as inputs was:\n", + "\n", + "| x1 | x2 | x3 | x4 | x5 |\n", + "|-----|-----|-----|-----|-----|\n", + "| ... | ... | ... | ... | ... |\n", + "| ... | ... | ... | ... | ... |\n", + "| ... | ... | ... | ... | ... |\n", + "\n", + "And the outputs yielded by the model's prediction was:\n", + "\n", + "| y1 | y2 |\n", + "|-----|-----|\n", + "| ... | ... |\n", + "| ... | ... |\n", + "| ... | ... |\n", + "\n", + "Then the result set will be:\n", + "\n", + "| x1 | x2 | x3 | x4 | x5 | y1 | y2 |\n", + "|-----|-----|-----|-----|-----|-----|-----|\n", + "| ... | ... | ... | ... | ... | ... | ... |\n", + "| ... | ... | ... | ... | ... | ... | ... |\n", + "| ... | ... | ... | ... | ... | ... | ... 
|\n", + "\n", + "In case the parameter `log_result_set` is `True`, the outputs of the results prediction will be:\n", + "* The result set as described above.\n", + "* The batch ID result - `batch_id`: `str` - a hashing result that is given by the user or generated randomly in case it was not provided to represent the batch that was being inferred.\n", + "\n", + " ```python\n", + " {\n", + " \"batch_id\": \"884a0cb00d8ae16d132dd8259aac29aa78f50a9245d0e4bd58cfbf77\",\n", + " }\n", + " ```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "## 3. Data Drift Analysis\n", + "\n", + "The data drift analysis is done per feature using two distance measure metrics for probability distributions.\n", + "\n", + "Let us mark our sample set as $S$ and our inputs as $I$. We will look at one feature $x$ out of $n$ features. Assuming the histograms of feature $x$ is split into 20 bins: $b_1,b_2,...,b_{20}$, we will match the feature $x$ histogram of the inputs $I$ ($x_I$) into the same bins (meaning to $x_S$) and compare their distributions using:\n", + "\n", + "* Total Variance Distance: $TVD(x_S,x_I) = \\frac{1}{2}\\sum_{b_1}^{b_{20}} {|x_S - x_I|}$\n", + "* Hellinger Distance: $H(x_S,x_I) = \\sqrt{1-{\\sum_{b_1}^{b_{20}}\\sqrt{x_S \\cdot x_I}}}$\n", + "\n", + "Our **rule** then is calculating for each $x\\in S: \\frac{H(x_S,x_I)+TVD(x_S,x_I)}{2} < $ given thresholds.\n", + "\n", + "In case the parameter `perform_drift_analysis` is `True`, the outputs of the analysis will be:\n", + "* **Drift table plot** - The results are presented in a `plotly` table artifact named `\"drift_table_plot\"` that shows each feature's statistics and its TVD, Hellinger and KLD (Kullback–Leibler divergence) results as follows:\n", + "\n", + "| | Count | | Mean | | Std | | Min | | Max | | Tvd | Hellinger | Kld | Histograms |\n", + "| ------ | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | 
---------- | ---------- | ---------- | ---------- | --- | --------- | --- |------------|\n", + "| | **Sample** | **Input** | **Sample** | **Input** | **Sample** | **Input** | **Sample** | **Input** | **Sample** | **Input** | | | | |\n", + "| **x1** | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |\n", + "| **x2** | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |\n", + "| **x3** | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |\n", + "\n", + "* **Features drift results** - A rule metric per feature dictionary is saved in a json file named `\"features_drift_results\"` where each key is a feature and its value is the feature's metric value: `Dict[str, float]`\n", + "\n", + " ```python\n", + " {\n", + " \"x1\": 0.12,\n", + " \"x2\": 0.345,\n", + " \"x3\": 0.00678,\n", + " ...\n", + " }\n", + " ```\n", + "\n", + "* In addition, two results are being added to summarize the drift analysis:\n", + "\n", + " * `drift_status`: `bool` - A boolean value indicating whether a drift was found.\n", + " * `drift_metric`: `float` - The mean of all the features drift metric value (the rule above):\n", + " for $n$ features and metric rule $M(x_S,x_I)=\\frac{H(x_S,x_I)+TVD(x_S,x_I)}{2}$, `drift_metric` $=\\frac{1}{n}\\sum_{x\\in S}M(x_S,x_I)$\n", + "\n", + " ```python\n", + " {\n", + " \"drift_status\": True,\n", + " \"drift_metric\": 0.81234\n", + " }\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "## 4. End-to-end Demo\n", + "\n", + "We will see an end-to-end example that follows the steps below:\n", + "1. Generate data.\n", + "2. Train a model.\n", + "3. Infer data through the model using `batch_predict` and review the outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### 4.1. 
Code review\n", + "\n", + "We are using a very simple example of training a decision tree on a binary classification problem. For that we wrote two functions:\n", + "* `generate_data` - Generate a binary classification data. The data will be split into a *training set* and *data for prediction*. The data for prediction will be drifted in half of its features to showcase the plot later on.\n", + "* `train` - Train a decision tree classifier on a given data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# mlrun: start-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# upload environment variables from env file if exists\n", + "import os,mlrun\n", + "\n", + "# Specify path\n", + "path = \"/tmp/examples_ci.env\"\n", + "\n", + "if os.path.exists(path):\n", + " env_dict = mlrun.set_env_from_file(path, return_dict=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "import mlrun\n", + "from mlrun.frameworks.sklearn import apply_mlrun\n", + "\n", + "\n", + "@mlrun.handler(outputs=[\"training_set\", \"prediction_set\"])\n", + "def generate_data(n_samples: int = 5000, n_features: int = 20):\n", + " # Generate a classification data:\n", + " x, y = make_classification(\n", + " n_samples=n_samples, n_features=n_features, n_classes=2\n", + " )\n", + "\n", + " # Split the data into a 
training set and a prediction set:\n", + " x_train, x_prediction = x[: n_samples // 2], x[n_samples // 2 :]\n", + " y_train = y[: n_samples // 2]\n", + " \n", + " # Randomly drift some features:\n", + " x_prediction += (\n", + " np.random.uniform(low=2, high=4, size=x_train.shape) * \n", + " np.random.randint(low=0, high=2, size=x_train.shape[1], dtype=int)\n", + " )\n", + " \n", + " # Initialize dataframes:\n", + " features = [f\"feature_{i}\" for i in range(n_features)]\n", + " training_set = pd.DataFrame(data=x_train, columns=features)\n", + " training_set.insert(\n", + " loc=n_features, column=\"label\", value=y_train, allow_duplicates=True\n", + " )\n", + " prediction_set = pd.DataFrame(data=x_prediction, columns=features)\n", + "\n", + " return training_set, prediction_set\n", + "\n", + "\n", + "@mlrun.handler()\n", + "def train(training_set: pd.DataFrame):\n", + " # Get the data into x, y:\n", + " labels = pd.DataFrame(training_set[\"label\"])\n", + " training_set.drop(columns=[\"label\"], inplace=True)\n", + "\n", + " # Initialize a model:\n", + " model = DecisionTreeClassifier()\n", + "\n", + " # Apply MLRun:\n", + " apply_mlrun(model=model, model_name=\"model\")\n", + "\n", + " # Train:\n", + " model.fit(training_set, labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# mlrun: end-code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### 4.2. Run the Example with MLRun\n", + "\n", + "First, we will prepare our MLRun functions:\n", + "1. We will use `mlrun.code_to_function` to turn this demo notebook into an MLRun function we can run.\n", + "2. We will use `mlrun.import_function` to import the `batch_predict` function ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Create an MLRun function to run the notebook:\n", + "demo_function = mlrun.code_to_function(name=\"batch_inference_demo\", kind=\"job\")\n", + "\n", + "# Import the `batch_predict` function from the marketplace:\n", + "batch_inference_function = mlrun.import_function(\"hub://batch_inference\")\n", + "\n", + "# Set the desired artifact path:\n", + "artifact_path = \"./\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now, we will follow the demo steps as discussed above:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-13 09:54:59,693 [warning] artifact path is not defined or is local, artifacts will not be visible in the UI\n", + "> 2022-09-13 09:54:59,694 [info] starting run batch-predict-demo-generate_data uid=a5b1ca0a37d946e892b9305b9af833c3 DB=http://mlrun-api:8080\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Sep 13 09:54:59completedbatch-predict-demo-generate_data
v3io_user=guyl
kind=
owner=guyl
host=jupyter-guyl-66857b7999-ffvsx
training_set
prediction_set
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-13 09:55:06,462 [info] run executed, status=completed\n", + "> 2022-09-13 09:55:06,464 [warning] artifact path is not defined or is local, artifacts will not be visible in the UI\n", + "> 2022-09-13 09:55:06,464 [info] starting run batch-predict-demo-train uid=384b36e84c4e4f91900e49e1f24ff1a6 DB=http://mlrun-api:8080\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Sep 13 09:55:06completedbatch-predict-demo-train
v3io_user=guyl
kind=
owner=guyl
host=jupyter-guyl-66857b7999-ffvsx
training_set
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-13 09:55:07,367 [info] run executed, status=completed\n", + "> 2022-09-13 09:55:07,370 [warning] artifact path is not defined or is local, artifacts will not be visible in the UI\n", + "> 2022-09-13 09:55:07,370 [info] starting run batch-predict-predict uid=cf88e39d59704912a5ee41ceb539cd05 DB=http://mlrun-api:8080\n", + "> 2022-09-13 09:55:07,703 [info] Loading model...'\n", + "> 2022-09-13 09:55:07,753 [info] Calculating prediction...\n", + "> 2022-09-13 09:55:07,757 [info] Logging result set (x | prediction)...\n", + "> 2022-09-13 09:55:07,952 [info] Performing drift analysis...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "divide by zero encountered in log\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Sep 13 09:55:07completedbatch-predict-predict
v3io_user=guyl
kind=
owner=guyl
host=jupyter-guyl-66857b7999-ffvsx
dataset
model=store://artifacts/default/model:384b36e84c4e4f91900e49e1f24ff1a6
label_columns=label
drift_status=False
drift_metric=0.3880999515903545
prediction
drift_table_plot
features_drift_results
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2022-09-13 09:55:10,078 [info] run executed, status=completed\n" + ] + } + ], + "source": [ + "# 1. Generate data:\n", + "generate_data_run = demo_function.run(\n", + " handler=\"generate_data\",\n", + " artifact_path=artifact_path,\n", + " local=True,\n", + ")\n", + "\n", + "# 2. Train a model:\n", + "train_run = demo_function.run(\n", + " handler=\"train\",\n", + " artifact_path=artifact_path,\n", + " inputs={\"training_set\": generate_data_run.outputs[\"training_set\"]},\n", + " local=True,\n", + ")\n", + "\n", + "# 3. Perform batch prediction:\n", + "batch_inference_run = batch_inference_function.run(\n", + " handler=\"infer\",\n", + " artifact_path=artifact_path,\n", + " inputs={\"dataset\": generate_data_run.outputs[\"prediction_set\"]},\n", + " params={\n", + " \"model\": train_run.outputs[\"model\"],\n", + " \"label_columns\": \"label\",\n", + " },\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### 4.3. Review Outputs\n", + "\n", + "We will review the outputs as explained in the notebook above." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### 4.3.1. Results Prediction\n", + "\n", + "First we will showcase the **Result Set**. 
As we didn't send any name, it's default name will be `\"prediction\"`:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_0feature_1feature_2feature_3feature_4feature_5feature_6feature_7feature_8feature_9...feature_11feature_12feature_13feature_14feature_15feature_16feature_17feature_18feature_19label
06.3191113.2082100.793499-0.6132522.6347661.2083524.7187353.557495-2.1163110.370145...1.9629010.5813214.8445321.4087370.9649872.1114563.134610-1.7279370.3357410
13.1383781.6336610.1445570.6870032.4042791.4059903.2358921.8044262.0199801.719908...-1.4868090.6482283.0397971.8067661.2016922.4759741.4485590.9592661.1586511
22.3530264.6428790.9520970.6059774.051640-0.1575841.2187432.4647381.706084-0.250366...0.5599680.9793782.4117033.7468302.2521553.4061023.263166-0.236510-0.3131611
31.6172024.5683322.9379612.5011663.9525410.6717493.7745944.042543-2.173079-0.983443...-0.8390100.9536983.0335511.0068912.3985635.0473825.2912601.3055840.8439511
43.3442914.5383571.032059-0.0479313.1184380.4038124.4726151.840558-0.7147750.287726...1.598060-0.8055084.7420324.6087921.6177174.5148953.648923-1.3440240.6105340
..................................................................
24952.3193012.9969411.3379340.8056492.3036560.2030695.5755593.4377900.7097770.392013...-0.114619-1.4697974.5381261.2824985.6861332.8269732.445658-0.1457800.3378030
24962.9206782.1449832.153517-0.5272952.6120401.1137042.4387613.2844251.0938940.921599...-1.5868520.4098384.0947632.6366543.3334143.2511061.1329761.072658-1.2401861
24974.2566982.135673-0.1144910.3299803.935633-0.7779582.5436432.195111-0.926822-0.251254...-0.9528890.6878202.2680435.0774542.2482593.4697042.2629000.687038-0.6140661
24984.7380302.390842-0.9723291.4714612.904280-2.0790882.5706042.325262-1.602976-0.806244...0.5543990.0274934.1457283.7828024.2020063.2727090.867462-1.0200292.0133010
24992.0478312.1538130.3924840.2490103.8469100.3008463.0059972.799457-0.304962-0.990622...-0.2634730.1100912.9954112.5828434.5995353.2190911.592652-0.074851-0.6177691
\n", + "

2500 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 \\\n", + "0 6.319111 3.208210 0.793499 -0.613252 2.634766 1.208352 \n", + "1 3.138378 1.633661 0.144557 0.687003 2.404279 1.405990 \n", + "2 2.353026 4.642879 0.952097 0.605977 4.051640 -0.157584 \n", + "3 1.617202 4.568332 2.937961 2.501166 3.952541 0.671749 \n", + "4 3.344291 4.538357 1.032059 -0.047931 3.118438 0.403812 \n", + "... ... ... ... ... ... ... \n", + "2495 2.319301 2.996941 1.337934 0.805649 2.303656 0.203069 \n", + "2496 2.920678 2.144983 2.153517 -0.527295 2.612040 1.113704 \n", + "2497 4.256698 2.135673 -0.114491 0.329980 3.935633 -0.777958 \n", + "2498 4.738030 2.390842 -0.972329 1.471461 2.904280 -2.079088 \n", + "2499 2.047831 2.153813 0.392484 0.249010 3.846910 0.300846 \n", + "\n", + " feature_6 feature_7 feature_8 feature_9 ... feature_11 feature_12 \\\n", + "0 4.718735 3.557495 -2.116311 0.370145 ... 1.962901 0.581321 \n", + "1 3.235892 1.804426 2.019980 1.719908 ... -1.486809 0.648228 \n", + "2 1.218743 2.464738 1.706084 -0.250366 ... 0.559968 0.979378 \n", + "3 3.774594 4.042543 -2.173079 -0.983443 ... -0.839010 0.953698 \n", + "4 4.472615 1.840558 -0.714775 0.287726 ... 1.598060 -0.805508 \n", + "... ... ... ... ... ... ... ... \n", + "2495 5.575559 3.437790 0.709777 0.392013 ... -0.114619 -1.469797 \n", + "2496 2.438761 3.284425 1.093894 0.921599 ... -1.586852 0.409838 \n", + "2497 2.543643 2.195111 -0.926822 -0.251254 ... -0.952889 0.687820 \n", + "2498 2.570604 2.325262 -1.602976 -0.806244 ... 0.554399 0.027493 \n", + "2499 3.005997 2.799457 -0.304962 -0.990622 ... 
-0.263473 0.110091 \n", + "\n", + " feature_13 feature_14 feature_15 feature_16 feature_17 feature_18 \\\n", + "0 4.844532 1.408737 0.964987 2.111456 3.134610 -1.727937 \n", + "1 3.039797 1.806766 1.201692 2.475974 1.448559 0.959266 \n", + "2 2.411703 3.746830 2.252155 3.406102 3.263166 -0.236510 \n", + "3 3.033551 1.006891 2.398563 5.047382 5.291260 1.305584 \n", + "4 4.742032 4.608792 1.617717 4.514895 3.648923 -1.344024 \n", + "... ... ... ... ... ... ... \n", + "2495 4.538126 1.282498 5.686133 2.826973 2.445658 -0.145780 \n", + "2496 4.094763 2.636654 3.333414 3.251106 1.132976 1.072658 \n", + "2497 2.268043 5.077454 2.248259 3.469704 2.262900 0.687038 \n", + "2498 4.145728 3.782802 4.202006 3.272709 0.867462 -1.020029 \n", + "2499 2.995411 2.582843 4.599535 3.219091 1.592652 -0.074851 \n", + "\n", + " feature_19 label \n", + "0 0.335741 0 \n", + "1 1.158651 1 \n", + "2 -0.313161 1 \n", + "3 0.843951 1 \n", + "4 0.610534 0 \n", + "... ... ... \n", + "2495 0.337803 0 \n", + "2496 -1.240186 1 \n", + "2497 -0.614066 1 \n", + "2498 2.013301 0 \n", + "2499 -0.617769 1 \n", + "\n", + "[2500 rows x 21 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_inference_run.artifact(\"prediction\").as_df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### 4.3.2. Data Drift Analysis\n", + "\n", + "Second we will review the data drift table plot and the drift results:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "batch_inference_run.artifact(\"drift_table_plot\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'drift_status': False, 'drift_metric': 0.3880999515903545}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_inference_run.status.results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/functions/master/batch_inference/1.8.0/src/batch_inference.py b/functions/master/batch_inference/1.8.0/src/batch_inference.py new file mode 100644 index 00000000..844fdf39 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/src/batch_inference.py @@ -0,0 +1,445 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +import hashlib +import json +from datetime import datetime +from typing import Any, Dict, List, Tuple, Union +import semver + +import mlrun +if semver.compare(mlrun.__version__, "1.5.0") >= 0: + raise mlrun.errors.MLRunNotFoundError( + f"When using `mlrun` version >=1.5.0, please use " + f"batch inference `v2` function ('hub://batch_inference_v2')." + ) + +import mlrun.datastore +import mlrun.utils +import numpy as np +import pandas as pd +from mlrun import feature_store as fs +from mlrun.artifacts import Artifact +from mlrun.data_types.infer import InferOptions, get_df_stats +from mlrun.frameworks.auto_mlrun import AutoMLRun +from mlrun.model_monitoring.features_drift_table import FeaturesDriftTablePlot +from mlrun.model_monitoring.model_monitoring_batch import ( + VirtualDrift, + calculate_inputs_statistics, +) + +# A union of all supported dataset types: +DatasetType = Union[mlrun.DataItem, list, dict, pd.DataFrame, pd.Series, np.ndarray] + + +def _read_dataset_as_dataframe( + dataset: DatasetType, + feature_columns: Union[str, List[str]] = None, + label_columns: Union[str, List[str]] = None, + drop_columns: Union[str, List[str], int, List[int]] = None, +) -> Tuple[pd.DataFrame, List[str]]: + """ + Parse the given dataset into a DataFrame and drop the columns accordingly. In addition, the label columns will be + parsed and validated as well. + + :param dataset: A dataset that will be converted into a DataFrame. + Can be either a list of lists, dict, URI or a FeatureVector. + :param feature_columns: List of feature columns that will be used to build the dataframe when dataset is from + type list or numpy array. + :param label_columns: The target label(s) of the column(s) in the dataset. for Regression or + Classification tasks. + :param drop_columns: ``str`` / ``int`` or a list of ``str`` / ``int`` that represent the column names / indices + to drop. 
+ + :returns: A tuple of: + [0] = The parsed dataset as a DataFrame + [1] = Label columns. + + raises MLRunInvalidArgumentError: If the `drop_columns` are not matching the dataset or unsupported dataset type. + """ + # Turn the `drop labels` into a list if given: + if drop_columns is not None: + if not isinstance(drop_columns, list): + drop_columns = [drop_columns] + + # Check if the dataset is in fact a Feature Vector: + if isinstance(dataset, fs.FeatureVector): + # Try to get the label columns if not provided: + if label_columns is None: + label_columns = dataset.status.label_column + # Get the features and parse to DataFrame: + dataset = fs.get_offline_features( + dataset.uri, drop_columns=drop_columns + ).to_dataframe() + + elif isinstance(dataset, (list, np.ndarray)): + if not feature_columns: + raise mlrun.errors.MLRunInvalidArgumentError( + "Feature columns list must be provided when dataset input as from type list or numpy array" + ) + # Parse the list / numpy array into a DataFrame: + dataset = pd.DataFrame(dataset, columns=feature_columns) + # Validate the `drop_columns` is given as integers: + if drop_columns and not all(isinstance(col, int) for col in drop_columns): + raise mlrun.errors.MLRunInvalidArgumentError( + "`drop_columns` must be an integer / list of integers if provided as a list." + ) + elif isinstance(dataset, mlrun.DataItem): + # Turn the DataITem to DataFrame: + dataset = dataset.as_df() + else: + # Parse the object (should be a pd.DataFrame / pd.Series, dictionary) into a DataFrame: + try: + dataset = pd.DataFrame(dataset) + except ValueError as e: + raise mlrun.errors.MLRunInvalidArgumentError( + f"Could not parse the given dataset of type {type(dataset)} into a pandas DataFrame. 
" + f"Received the following error: {e}" + ) + # Drop columns if needed: + if drop_columns: + dataset.drop(drop_columns, axis=1, inplace=True) + + # Turn the `label_columns` into a list by default: + if label_columns is None: + label_columns = [] + elif isinstance(label_columns, (str, int)): + label_columns = [label_columns] + return dataset, label_columns + + +def _prepare_result_set( + x: pd.DataFrame, label_columns: List[str], y_pred: np.ndarray +) -> pd.DataFrame: + """ + Set default label column names and validate given names to prepare the result set - a concatenation of the inputs + (x) and the model predictions (y_pred). + + :param x: The inputs. + :param label_columns: A list of strings representing the target column names to add to the predictions. Default name + will be used in case the list is empty (predicted_label_{i}). + :param y_pred: The model predictions on the inputs. + + :returns: The result set. + + raises MLRunInvalidArgumentError: If the labels columns amount do not match the outputs or if one of the label + column already exists in the dataset. 
+ """ + # Prepare default target columns names if not provided: + prediction_columns_amount = 1 if len(y_pred.shape) == 1 else y_pred.shape[1] + if len(label_columns) == 0: + # Add default label column names: + if prediction_columns_amount == 1: + label_columns = ["predicted_label"] + else: + label_columns = [ + f"predicted_label_{i}" for i in range(prediction_columns_amount) + ] + + # Validate the label columns: + if prediction_columns_amount != len(label_columns): + # No equality between provided label column names and outputs amount: + raise mlrun.errors.MLRunInvalidArgumentError( + f"The number of predicted labels: {prediction_columns_amount} " + f"is not equal to the given label columns: {len(label_columns)}" + ) + common_labels = set(label_columns) & set(x.columns.tolist()) + if common_labels: + # Label column exist in the original inputs: + raise mlrun.errors.MLRunInvalidArgumentError( + f"The labels: {common_labels} are already existed in the given dataset." + ) + + return pd.concat( + [x, pd.DataFrame(y_pred, columns=label_columns, index=x.index)], axis=1 + ) + + +def _get_sample_set_statistics( + sample_set: DatasetType = None, model_artifact_feature_stats: dict = None +) -> dict: + """ + Get the sample set statistics either from the given sample set or the statistics logged with the model while + favoring the given sample set. + + :param sample_set: A sample dataset to give to compare the inputs in the drift analysis. + :param model_artifact_feature_stats: The `feature_stats` attribute in the spec of the model artifact, where the + original sample set statistics of the model was used. + + :returns: The sample set statistics. + + raises MLRunInvalidArgumentError: If no sample set or statistics were given. 
+ """ + # Check if a sample set was provided: + if sample_set is None: + # Check if the model was logged with a sample set: + if model_artifact_feature_stats is None: + raise mlrun.errors.MLRunInvalidArgumentError( + "Cannot perform drift analysis as there is no sample set to compare to. The model artifact was not " + "logged with a sample set and `sample_set` was not provided to the function." + ) + # Return the statistics logged with the model: + return model_artifact_feature_stats + + # Turn the DataItem to DataFrame: + if isinstance(sample_set, mlrun.DataItem): + sample_set, _ = _read_dataset_as_dataframe(dataset=sample_set) + + # Return the sample set statistics: + return get_df_stats(df=sample_set, options=InferOptions.Histogram) + + +def _get_drift_result( + tvd: float, + hellinger: float, + threshold: float, +) -> Tuple[bool, float]: + """ + Calculate the drift result by the following equation: (tvd + hellinger) / 2 + + :param tvd: The feature's TVD value. + :param hellinger: The feature's Hellinger value. + :param threshold: The threshold from which the value is considered a drift. + + :returns: A tuple of: + [0] = Boolean value as the drift status. + [1] = The result. + """ + result = (tvd + hellinger) / 2 + if result >= threshold: + return True, result + return False, result + + +def _perform_drift_analysis( + sample_set_statistics: dict, + inputs: pd.DataFrame, + drift_threshold: float, + possible_drift_threshold: float, + inf_capping: float, +) -> Tuple[Artifact, Artifact, dict]: + """ + Perform drift analysis, producing the drift table artifact for logging post prediction. + + :param sample_set_statistics: The statistics of the sample set logged along a model. + :param inputs: Input dataset to perform the drift calculation on. + :param drift_threshold: The threshold of which to mark drifts. + :param possible_drift_threshold: The threshold of which to mark possible drifts. + :param inf_capping: The value to set for when it reached infinity. 
+ + :returns: A tuple of + [0] = An MLRun artifact holding the HTML code of the drift table plot. + [1] = An MLRun artifact holding the metric per feature dictionary. + [2] = Results to log the final analysis outcome. + """ + # Calculate the input's statistics: + inputs_statistics = calculate_inputs_statistics( + sample_set_statistics=sample_set_statistics, + inputs=inputs, + ) + + # Calculate drift: + virtual_drift = VirtualDrift(inf_capping=inf_capping) + metrics = virtual_drift.compute_drift_from_histograms( + feature_stats=sample_set_statistics, + current_stats=inputs_statistics, + ) + drift_results = virtual_drift.check_for_drift_per_feature( + metrics_results_dictionary=metrics, + possible_drift_threshold=possible_drift_threshold, + drift_detected_threshold=drift_threshold, + ) + + # Validate all feature columns named the same between the inputs and sample sets: + sample_features = set( + [ + feature_name + for feature_name, feature_statistics in sample_set_statistics.items() + if isinstance(feature_statistics, dict) + ] + ) + input_features = set(inputs.columns) + if len(sample_features & input_features) != len(input_features): + raise mlrun.errors.MLRunInvalidArgumentError( + f"Not all feature names were matching between the inputs and the sample set provided: " + f"{input_features - sample_features | sample_features - input_features}" + ) + + # Plot: + html_plot = FeaturesDriftTablePlot().produce( + features=list(input_features), + sample_set_statistics=sample_set_statistics, + inputs_statistics=inputs_statistics, + metrics=metrics, + drift_results=drift_results, + ) + + # Prepare metrics per feature dictionary: + metrics_per_feature = { + feature: _get_drift_result( + tvd=metric_dictionary["tvd"], + hellinger=metric_dictionary["hellinger"], + threshold=drift_threshold, + )[1] + for feature, metric_dictionary in metrics.items() + if isinstance(metric_dictionary, dict) + } + + # Calculate the final analysis result: + drift_status, drift_metric = 
_get_drift_result( + tvd=metrics["tvd_mean"], + hellinger=metrics["hellinger_mean"], + threshold=drift_threshold, + ) + + return ( + Artifact(body=html_plot, format="html", key="drift_table_plot"), + Artifact( + body=json.dumps(metrics_per_feature), + format="json", + key="features_drift_results", + ), + {"drift_status": drift_status, "drift_metric": drift_metric}, + ) + + +def infer( + context: mlrun.MLClientCtx, + model: str, + dataset: DatasetType, + drop_columns: Union[str, List[str], int, List[int]] = None, + label_columns: Union[str, List[str]] = None, + feature_columns: Union[str, List[str]] = None, + log_result_set: bool = True, + result_set_name: str = "prediction", + batch_id: str = None, + perform_drift_analysis: bool = None, + sample_set: DatasetType = None, + drift_threshold: float = 0.7, + possible_drift_threshold: float = 0.5, + inf_capping: float = 10.0, + artifacts_tag: str = "", + **predict_kwargs: Dict[str, Any], +): + """ + Perform a prediction on a given dataset with the given model. Can perform drift analysis between the sample set + statistics stored in the model to the current input data. The drift rule is the value per-feature mean of the TVD + and Hellinger scores according to the thresholds configures here. + + :param context: MLRun context. + :param model: The model Store path. + :param dataset: The dataset to infer through the model. Can be passed in `inputs` as either a + Dataset artifact / Feature vector URI. Or, in `parameters` as a list, dictionary or + numpy array. + :param drop_columns: A string / integer or a list of strings / integers that represent the column names + / indices to drop. When the dataset is a list or a numpy array this parameter must + be represented by integers. + :param label_columns: The target label(s) of the column(s) in the dataset for Regression or + Classification tasks. The label column can be accessed from the model object, or + the feature vector provided if available. 
+ :param feature_columns: List of feature columns that will be used to build the dataframe when dataset is + from type list or numpy array. + :param log_result_set: Whether to log the result set - a DataFrame of the given inputs concatenated with + the predictions. Defaulted to True. + :param result_set_name: The db key to set name of the prediction result and the filename. Defaulted to + 'prediction'. + :param batch_id: The ID of the given batch (inference dataset). If `None`, it will be generated. + Will be logged as a result of the run. + :param perform_drift_analysis: Whether to perform drift analysis between the sample set of the model object to the + dataset given. By default, None, which means it will perform drift analysis if the + model has a sample set statistics. Perform drift analysis will produce a data drift + table artifact. + :param sample_set: A sample dataset to give to compare the inputs in the drift analysis. The default + chosen sample set will always be the one who is set in the model artifact itself. + :param drift_threshold: The threshold of which to mark drifts. Defaulted to 0.7. + :param possible_drift_threshold: The threshold of which to mark possible drifts. Defaulted to 0.5. + :param inf_capping: The value to set for when it reached infinity. Defaulted to 10.0. + :param artifacts_tag: Tag to use for all the artifacts resulted from the function. 
+ """ + # Loading the model: + context.logger.info(f"Loading model...") + model_handler = AutoMLRun.load_model(model_path=model, context=context) + if label_columns is None: + label_columns = [ + output.name for output in model_handler._model_artifact.spec.outputs + ] + + if feature_columns is None: + feature_columns = [ + input.name for input in model_handler._model_artifact.spec.inputs + ] + + # Get dataset by object, URL or by FeatureVector: + context.logger.info(f"Loading data...") + x, label_columns = _read_dataset_as_dataframe( + dataset=dataset, + feature_columns=feature_columns, + label_columns=label_columns, + drop_columns=drop_columns, + ) + + # Predict: + context.logger.info(f"Calculating prediction...") + y_pred = model_handler.model.predict(x, **predict_kwargs) + + # Prepare the result set: + result_set = _prepare_result_set(x=x, label_columns=label_columns, y_pred=y_pred) + + # Check for logging the result set: + if log_result_set: + # Log the result set: + context.logger.info(f"Logging result set (x | prediction)...") + context.log_dataset( + key=result_set_name, + df=result_set, + db_key=result_set_name, + tag=artifacts_tag, + ) + # Log the batch ID: + if batch_id is None: + batch_id = hashlib.sha224(str(datetime.now()).encode()).hexdigest() + context.log_result( + key="batch_id", + value=batch_id, + ) + + # Check for performing drift analysis: + if ( + perform_drift_analysis is None + and model_handler._model_artifact.spec.feature_stats is not None + ): + perform_drift_analysis = True + if perform_drift_analysis: + context.logger.info("Performing drift analysis...") + # Get the sample set statistics (either from the sample set or from the statistics logged with the model): + sample_set_statistics = _get_sample_set_statistics( + sample_set=sample_set, + model_artifact_feature_stats=model_handler._model_artifact.spec.feature_stats, + ) + # Produce the artifact: + ( + drift_table_plot, + metric_per_feature_dict, + analysis_results, + ) = 
_perform_drift_analysis( + sample_set_statistics=sample_set_statistics, + inputs=result_set, + drift_threshold=drift_threshold, + possible_drift_threshold=possible_drift_threshold, + inf_capping=inf_capping, + ) + # Log the artifact and results: + context.log_artifact(drift_table_plot, tag=artifacts_tag) + context.log_artifact(metric_per_feature_dict, tag=artifacts_tag) + context.log_results(results=analysis_results) diff --git a/functions/master/batch_inference/1.8.0/src/function.yaml b/functions/master/batch_inference/1.8.0/src/function.yaml new file mode 100644 index 00000000..74b672d4 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/src/function.yaml @@ -0,0 +1,107 @@ +kind: job +verbose: false +metadata: + name: batch-inference + tag: '' + categories: + - model-serving +spec: + image: mlrun/ml-models + entry_points: + infer: + name: infer + doc: 'Perform a prediction on a given dataset with the given model. Can perform + drift analysis between the sample set + + statistics stored in the model to the current input data. The drift rule is + the value per-feature mean of the TVD + + and Hellinger scores according to the thresholds configures here.' + parameters: + - name: context + type: MLClientCtx + doc: MLRun context. + - name: model + type: str + doc: The model Store path. + - name: dataset + type: DatasetType + doc: The dataset to infer through the model. Can be passed in `inputs` as + either a Dataset artifact / Feature vector URI. Or, in `parameters` as a + list, dictionary or numpy array. + - name: drop_columns + type: Union[str, List[str], int, List[int]] + doc: A string / integer or a list of strings / integers that represent the + column names / indices to drop. When the dataset is a list or a numpy array + this parameter must be represented by integers. + default: null + - name: label_columns + type: Union[str, List[str]] + doc: The target label(s) of the column(s) in the dataset for Regression or + Classification tasks. 
The label column can be accessed from the model object, + or the feature vector provided if available. + default: null + - name: feature_columns + type: Union[str, List[str]] + doc: List of feature columns that will be used to build the dataframe when + dataset is from type list or numpy array. + default: null + - name: log_result_set + type: bool + doc: Whether to log the result set - a DataFrame of the given inputs concatenated + with the predictions. Defaulted to True. + default: true + - name: result_set_name + type: str + doc: The db key to set name of the prediction result and the filename. Defaulted + to 'prediction'. + default: prediction + - name: batch_id + type: str + doc: The ID of the given batch (inference dataset). If `None`, it will be + generated. Will be logged as a result of the run. + default: null + - name: perform_drift_analysis + type: bool + doc: Whether to perform drift analysis between the sample set of the model + object to the dataset given. By default, None, which means it will perform + drift analysis if the model has a sample set statistics. Perform drift analysis + will produce a data drift table artifact. + default: null + - name: sample_set + type: DatasetType + doc: A sample dataset to give to compare the inputs in the drift analysis. + The default chosen sample set will always be the one who is set in the model + artifact itself. + default: null + - name: drift_threshold + type: float + doc: The threshold of which to mark drifts. Defaulted to 0.7. + default: 0.7 + - name: possible_drift_threshold + type: float + doc: The threshold of which to mark possible drifts. Defaulted to 0.5. + default: 0.5 + - name: inf_capping + type: float + doc: The value to set for when it reached infinity. Defaulted to 10.0. + default: 10.0 + - name: artifacts_tag + type: str + doc: Tag to use for all the artifacts resulted from the function. 
+ default: '' + lineno: 317 + has_kwargs: true + has_varargs: false + allow_empty_resources: true + default_handler: infer + command: '' + build: + functionSourceCode:  + origin_filename: '' + auto_build: false + code_origin: '' + with_mlrun: false + disable_auto_mount: false + description: Batch inference (also knows as prediction) for the common ML frameworks + (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. diff --git a/functions/master/batch_inference/1.8.0/src/item.yaml b/functions/master/batch_inference/1.8.0/src/item.yaml new file mode 100644 index 00000000..16a56cfe --- /dev/null +++ b/functions/master/batch_inference/1.8.0/src/item.yaml @@ -0,0 +1,31 @@ +apiVersion: v1 +categories: +- model-serving +description: Batch inference (also knows as prediction) for the common ML frameworks + (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. +doc: '' +example: batch_inference.ipynb +generationDate: 2022-08-28:17-25 +hidden: false +icon: '' +labels: + author: guyl +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: batch_inference +platformVersion: 3.5.0 +spec: + extra_spec: + allow_empty_resources: true + build: + auto_build: false + with_mlrun: false + filename: batch_inference.py + handler: infer + image: mlrun/ml-models + kind: job + requirements: +url: '' +version: 1.8.0 + diff --git a/functions/master/batch_inference/1.8.0/src/requirements.txt b/functions/master/batch_inference/1.8.0/src/requirements.txt new file mode 100644 index 00000000..c120cd84 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/src/requirements.txt @@ -0,0 +1,4 @@ +numpy +pandas +scikit-learn +plotly \ No newline at end of file diff --git a/functions/master/batch_inference/1.8.0/src/test_batch_inference.py b/functions/master/batch_inference/1.8.0/src/test_batch_inference.py new file mode 100644 index 00000000..d18d27a9 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/src/test_batch_inference.py @@ -0,0 
+1,141 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import os + +import mlrun +import mlrun.common.schemas +import numpy as np +import pandas as pd +import pytest +from mlrun.frameworks.sklearn import apply_mlrun +from sklearn.datasets import make_classification +from sklearn.tree import DecisionTreeClassifier + +REQUIRED_ENV_VARS = [ + "MLRUN_DBPATH", + "V3IO_USERNAME", + "V3IO_API", + "V3IO_ACCESS_KEY", +] + + +def _validate_environment_variables() -> bool: + """ + Checks that all required Environment variables are set. 
+ """ + environment_keys = os.environ.keys() + return all(key in environment_keys for key in REQUIRED_ENV_VARS) + + +@mlrun.handler(outputs=["training_set", "prediction_set"]) +def generate_data(n_samples: int = 5000, n_features: int = 20, n_classes: int = 2): + # Generate a classification data: + x, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=2) + + # Split the data into a training set and a prediction set: + x_train, x_prediction = x[: n_samples // 2], x[n_samples // 2 :] + y_train = y[: n_samples // 2] + + # Randomly drift some features: + x_prediction += np.random.uniform( + low=2, high=4, size=x_train.shape + ) * np.random.randint(low=0, high=2, size=x_train.shape[1], dtype=int) + + # Initialize dataframes: + features = [f"feature_{i}" for i in range(n_features)] + training_set = pd.DataFrame(data=x_train, columns=features) + training_set.insert( + loc=n_features, column="target_label", value=y_train, allow_duplicates=True + ) + prediction_set = pd.DataFrame(data=x_prediction, columns=features) + + return training_set, prediction_set + + +@mlrun.handler() +def train(training_set: pd.DataFrame): + # Get the data into x, y: + labels = pd.DataFrame(training_set["target_label"]) + training_set.drop(columns=["target_label"], inplace=True) + + # Initialize a model: + model = DecisionTreeClassifier() + + # Apply MLRun: + apply_mlrun(model=model, model_name="model") + + # Train: + model.fit(training_set, labels) + + +@pytest.mark.skipif( + condition=not _validate_environment_variables(), + reason="Project's environment variables are not set", +) +def test_batch_predict(): + + project = mlrun.get_or_create_project( + "batch-infer-v9-test", context="./", user_project=True + ) + + # Configure test: + n_samples = 5000 + n_features = 20 + + # Create the function and run: + test_function = mlrun.code_to_function(filename=__file__, kind="job") + generate_data_run = test_function.run( + handler="generate_data", + params={"n_samples": 
n_samples, "n_features": n_features}, + local=True, + ) + train_run = test_function.run( + handler="train", + inputs={"training_set": generate_data_run.outputs["training_set"]}, + local=True, + ) + + batch_predict_function = mlrun.import_function("function.yaml") + batch_predict_run = batch_predict_function.run( + handler="infer", + inputs={"dataset": generate_data_run.outputs["prediction_set"]}, + params={ + "model": train_run.outputs["model"], + "result_set_name": "result_set", + }, + ) + + # Check the result set: + result_set = batch_predict_run.artifact("result_set").as_df() + assert result_set.shape == (n_samples // 2, n_features + 1) + assert "target_label" in result_set.columns + assert "batch_id" in batch_predict_run.status.results + + # Check drift table artifact url + assert ( + batch_predict_run.artifact("drift_table_plot").artifact_url + == batch_predict_run.outputs["drift_table_plot"] + ) + + # Check the features drift results json: + drift_results_file = batch_predict_run.artifact("features_drift_results").local() + with open(drift_results_file, "r") as json_file: + drift_results = json.load(json_file) + assert len(drift_results) == n_features + 1 + + # Check the final analysis logged results: + assert "drift_status" in batch_predict_run.status.results + assert "drift_metric" in batch_predict_run.status.results diff --git a/functions/master/batch_inference/1.8.0/static/documentation.html b/functions/master/batch_inference/1.8.0/static/documentation.html new file mode 100644 index 00000000..e6e1e6d5 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/static/documentation.html @@ -0,0 +1,230 @@ + + + + + + + +batch_inference package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+
+

batch_inference package

+ +
+ +
+
+ +
+
+

batch_inference package#

+
+

Submodules#

+
+
+

batch_inference.batch_inference module#

+
+
+

Module contents#

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/batch_inference/1.8.0/static/example.html b/functions/master/batch_inference/1.8.0/static/example.html new file mode 100644 index 00000000..bbd82747 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/static/example.html @@ -0,0 +1,1762 @@ + + + + + + + +Batch Inference + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

Batch Inference#

+

A function for inferring given input through a given model while producing a Result Set and performing Data Drift Analysis.

+

In this notebook we will go over the function’s docs and outputs and see an end-to-end example of running it.

+
    +
  1. Documentation

  2. +
  3. Results Prediction

  4. +
  5. Data Drift Analysis

  6. +
  7. End-to-end Demo

  8. +
+

+
+

1. Documentation#

+

Perform a prediction on a given dataset with the given model. Can perform drift analysis between the sample set statistics stored in the model and the current input data. The drift rule is the per-feature mean of the TVD and Hellinger scores, evaluated against the thresholds configured here.

+
+

1.1. Parameters:#

+
    +
  • context: mlrun.MLClientCtx

    +

    An MLRun context.

    +
  • +
  • model: str

    +

    The model Store path, a logged model URI.

    +
  • +
  • dataset: Union[mlrun.DataItem, list, dict, pd.DataFrame, pd.Series, np.ndarray]

    +

    The dataset to infer through the model.

    +
      +
    • Can be passed in inputs as either a Dataset artifact / Feature vector URI.

    • +
    • Or, in parameters as a list, dictionary or numpy array.

    • +
    +
  • +
  • drop_columns: Union[str, List[str], int, List[int]] = None

    +

    A string / integer or a list of strings / integers that represent the column names / indices to drop. When the dataset is a list or a numpy array this parameter must be represented by integers.

    +
  • +
  • label_columns: Union[str, List[str]] = None

    +

    The target label(s) of the column(s) in the dataset. These names will be used as the column names for the predictions. The label column can be accessed from the model object, or the feature vector provided if available. The default name is "predicted_label_i" for the i column.

    +
  • +
  • feature_columns: Union[str, List[str]] = None

    +

    List of feature columns that will be used to build the dataframe when dataset is +from type list or numpy array.

    +
  • +
  • log_result_set: bool = True

    +

    Whether to log the result set - a DataFrame of the given inputs concatenated with the predictions. Defaulted to True.

    +
  • +
  • result_set_name: str = "prediction"

    +

    The db key to set name of the prediction result and the filename. Defaulted to "prediction".

    +
  • +
  • batch_id: str = None

    +

    The ID of the given batch (inference dataset). If None, it will be generated. Will be logged as a result of the run.

    +
  • +
  • perform_drift_analysis: bool = None

    +

    Whether to perform drift analysis between the sample set of the model object and the given dataset. By default, None, which means it will perform drift analysis if the model has sample set statistics.

    +
  • +
  • sample_set: Union[mlrun.DataItem, list, dict, pd.DataFrame, pd.Series, np.ndarray]

    +

    A sample dataset to compare the inputs against in the drift analysis. If it is not provided, the sample set statistics logged with the model artifact itself are used.

    +
      +
    • Can be passed in inputs as either a Dataset artifact / Feature vector URI.

    • +
    • Or, in parameters as a list, dictionary or numpy array.

    • +
    +
  • +
  • drift_threshold: float = 0.7

    +

    The threshold of which to mark drifts. Defaulted to 0.7.

    +
  • +
  • possible_drift_threshold: float = 0.5

    +

    The threshold of which to mark possible drifts. Defaulted to 0.5.

    +
  • +
  • inf_capping: float = 10.0

    +

    The value to set for when it reached infinity. Defaulted to 10.0.

    +
  • +
  • artifacts_tag: str = ""

    +

    Tag to use for all the artifacts resulted from the function. Defaulted to no tag.

    +
  • +
+
+
+

1.2. Outputs#

+

The outputs are split into two actions the function can perform:

+
    +
  • Results Prediction - Will log:

    +
      +
    • A dataset artifact named by the result_set_name parameter.

    • +
    • A str result named "batch_id" of the given / generated batch ID.

    • +
    +
  • +
  • Data Drift Analysis - Will log:

    +
      +
    • A plotly artifact named "drift_table_plot" with a visualization of the drift results and histograms.

    • +
    • A json artifact named "features_drift_results" with all the features metric values.

    • +
    • A bool result named "drift_status" of the overall drift status (True if there was a drift and False otherwise).

    • +
    • A float result named "drift_metric" of the overall drift metric score.

    • +
    +
  • +
+

For more details, see the next chapters.

+

+
+
+
+

2. Results Prediction#

+

The result set is a concatenated dataset of the inputs ($X$) provided and the predictions ($Y$) yielded by the model, so it will be $X | Y$.

+

For example, if the dataset given as inputs was:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +

x1

x2

x3

x4

x5

+
+

And the outputs yielded by the model’s prediction were:

+
+ + + + + + + + + + + + + + + + +

y1

y2

+
+

Then the result set will be:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

x1

x2

x3

x4

x5

y1

y2

+
+

In case the parameter log_result_set is True, the outputs of the results prediction will be:

+
    +
  • The result set as described above.

  • +
  • The batch ID result - batch_id: str - a hashing result that is given by the user or generated randomly in case it was not provided to represent the batch that was being inferred.

    +
    {
    +    "batch_id": "884a0cb00d8ae16d132dd8259aac29aa78f50a9245d0e4bd58cfbf77",
    +}
    +
    +
    +
  • +
+

+
+
+

3. Data Drift Analysis#

+

The data drift analysis is done per feature using two distance measure metrics for probability distributions.

+

Let us mark our sample set as $S$ and our inputs as $I$. We will look at one feature $x$ out of $n$ features. Assuming the histogram of feature $x$ is split into 20 bins: $b_1,b_2,\ldots,b_{20}$, we will match the feature $x$ histogram of the inputs $I$ ($x_I$) into the same bins (meaning to $x_S$) and compare their distributions using:

+
    +
  • Total Variation Distance: $TVD(x_S,x_I) = \frac{1}{2}\sum_{b_1}^{b_{20}} {|x_S - x_I|}$

  • +
  • Hellinger Distance: $H(x_S,x_I) = \sqrt{1-{\sum_{b_1}^{b_{20}}\sqrt{x_S \cdot x_I}}}$

  • +
+

For each feature $x\in S$, our rule calculates $\frac{H(x_S,x_I)+TVD(x_S,x_I)}{2}$ and compares it against the given thresholds.

+

In case the parameter perform_drift_analysis is True, the outputs of the analysis will be:

+
    +
  • Drift table plot - The results are presented in a plotly table artifact named "drift_table_plot" that shows each feature’s statistics and its TVD, Hellinger and KLD (Kullback–Leibler divergence) results as follows:

  • +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Count

Mean

Std

Min

Max

Tvd

Hellinger

Kld

Histograms

Sample

Input

Sample

Input

Sample

Input

Sample

Input

Sample

Input

x1

x2

x3

+
+
    +
  • Features drift results - A rule metric per feature dictionary is saved in a json file named "features_drift_results" where each key is a feature and its value is the feature’s metric value: Dict[str, float]

    +
    {
    +    "x1": 0.12,
    +    "x2": 0.345,
    +    "x3": 0.00678,
    +    ...
    +}
    +
    +
    +
  • +
  • In addition, two results are being added to summarize the drift analysis:

    +
      +
    • drift_status: bool - A boolean value indicating whether a drift was found.

    • +
    • drift_metric: float - The mean of all the features drift metric value (the rule above): +for $n$ features and metric rule $M(x_S,x_I)=\frac{H(x_S,x_I)+TVD(x_S,x_I)}{2}$, drift_metric $=\frac{1}{n}\sum_{x\in S}M(x_S,x_I)$

    • +
    +
    {
    +    "drift_status": True,
    +    "drift_metric": 0.81234
    +}
    +
    +
    +
  • +
+

+
+
+

4. End-to-end Demo#

+

We will see an end-to-end example that follows the steps below:

+
    +
  1. Generate data.

  2. +
  3. Train a model.

  4. +
  5. Infer data through the model using batch_inference and review the outputs.

  6. +
+
+

4.1. Code review#

+

We are using a very simple example of training a decision tree on a binary classification problem. For that we wrote two functions:

+
    +
  • generate_data - Generate binary classification data. The data will be split into a training set and data for prediction. The data for prediction will be drifted in half of its features to showcase the plot later on.

  • +
  • train - Train a decision tree classifier on a given data.

  • +
+
+
+
# mlrun: start-code
+
+
+
+
+
+
+
# upload environment variables from env file if exists
+import os,mlrun
+
+# Specify path
+path = "/tmp/examples_ci.env"
+
+if os.path.exists(path):
+    env_dict = mlrun.set_env_from_file(path, return_dict=True)
+
+
+
+
+
+
+
import numpy as np
+import pandas as pd
+
+from sklearn.datasets import make_classification
+from sklearn.tree import DecisionTreeClassifier
+
+import mlrun
+from mlrun.frameworks.sklearn import apply_mlrun
+
+
+@mlrun.handler(outputs=["training_set", "prediction_set"])
+def generate_data(n_samples: int = 5000, n_features: int = 20):
+    # Generate a classification data:
+    x, y = make_classification(
+        n_samples=n_samples, n_features=n_features, n_classes=2
+    )
+
+    # Split the data into a training set and a prediction set:
+    x_train, x_prediction = x[: n_samples // 2], x[n_samples // 2 :]
+    y_train = y[: n_samples // 2]
+    
+    # Randomly drift some features:
+    x_prediction += (
+        np.random.uniform(low=2, high=4, size=x_train.shape) * 
+        np.random.randint(low=0, high=2, size=x_train.shape[1], dtype=int)
+    )
+    
+    # Initialize dataframes:
+    features = [f"feature_{i}" for i in range(n_features)]
+    training_set = pd.DataFrame(data=x_train, columns=features)
+    training_set.insert(
+        loc=n_features, column="label", value=y_train, allow_duplicates=True
+    )
+    prediction_set = pd.DataFrame(data=x_prediction, columns=features)
+
+    return training_set, prediction_set
+
+
+@mlrun.handler()
+def train(training_set: pd.DataFrame):
+    # Get the data into x, y:
+    labels = pd.DataFrame(training_set["label"])
+    training_set.drop(columns=["label"], inplace=True)
+
+    # Initialize a model:
+    model = DecisionTreeClassifier()
+
+    # Apply MLRun:
+    apply_mlrun(model=model, model_name="model")
+
+    # Train:
+    model.fit(training_set, labels)
+
+
+
+
+
+
+
# mlrun: end-code
+
+
+
+
+
+
+

4.2. Run the Example with MLRun#

+

First, we will prepare our MLRun functions:

+
    +
  1. We will use mlrun.code_to_function to turn this demo notebook into an MLRun function we can run.

  2. +
  3. We will use mlrun.import_function to import the batch_inference function.

  4. +
+
+
+
# Create an MLRun function to run the notebook:
+demo_function = mlrun.code_to_function(name="batch_inference_demo", kind="job")
+
+# Import the `batch_predict` function from the marketplace:
+batch_inference_function = mlrun.import_function("hub://batch_inference")
+
+# Set the desired artifact path:
+artifact_path = "./"
+
+
+
+
+

Now, we will follow the demo steps as discussed above:

+
+
+
# 1. Generate data:
+generate_data_run = demo_function.run(
+    handler="generate_data",
+    artifact_path=artifact_path,
+    local=True,
+)
+
+# 2. Train a model:
+train_run = demo_function.run(
+    handler="train",
+    artifact_path=artifact_path,
+    inputs={"training_set": generate_data_run.outputs["training_set"]},
+    local=True,
+)
+
+# 3. Perform batch prediction:
+batch_inference_run = batch_inference_function.run(
+    handler="infer",
+    artifact_path=artifact_path,
+    inputs={"dataset": generate_data_run.outputs["prediction_set"]},
+    params={
+        "model": train_run.outputs["model"],
+        "label_columns": "label",
+    },
+    local=True,
+)
+
+
+
+
+
> 2022-09-13 09:54:59,693 [warning] artifact path is not defined or is local, artifacts will not be visible in the UI
+> 2022-09-13 09:54:59,694 [info] starting run batch-predict-demo-generate_data uid=a5b1ca0a37d946e892b9305b9af833c3 DB=http://mlrun-api:8080
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Sep 13 09:54:59completedbatch-predict-demo-generate_data
v3io_user=guyl
kind=
owner=guyl
host=jupyter-guyl-66857b7999-ffvsx
training_set
prediction_set
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-09-13 09:55:06,462 [info] run executed, status=completed
+> 2022-09-13 09:55:06,464 [warning] artifact path is not defined or is local, artifacts will not be visible in the UI
+> 2022-09-13 09:55:06,464 [info] starting run batch-predict-demo-train uid=384b36e84c4e4f91900e49e1f24ff1a6 DB=http://mlrun-api:8080
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Sep 13 09:55:06completedbatch-predict-demo-train
v3io_user=guyl
kind=
owner=guyl
host=jupyter-guyl-66857b7999-ffvsx
training_set
model
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-09-13 09:55:07,367 [info] run executed, status=completed
+> 2022-09-13 09:55:07,370 [warning] artifact path is not defined or is local, artifacts will not be visible in the UI
+> 2022-09-13 09:55:07,370 [info] starting run batch-predict-predict uid=cf88e39d59704912a5ee41ceb539cd05 DB=http://mlrun-api:8080
+> 2022-09-13 09:55:07,703 [info] Loading model...'
+> 2022-09-13 09:55:07,753 [info] Calculating prediction...
+> 2022-09-13 09:55:07,757 [info] Logging result set (x | prediction)...
+> 2022-09-13 09:55:07,952 [info] Performing drift analysis...
+
+
+
divide by zero encountered in log
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Sep 13 09:55:07completedbatch-predict-predict
v3io_user=guyl
kind=
owner=guyl
host=jupyter-guyl-66857b7999-ffvsx
dataset
model=store://artifacts/default/model:384b36e84c4e4f91900e49e1f24ff1a6
label_columns=label
drift_status=False
drift_metric=0.3880999515903545
prediction
drift_table_plot
features_drift_results
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2022-09-13 09:55:10,078 [info] run executed, status=completed
+
+
+
+
+
+
+

4.3. Review Outputs#

+

We will review the outputs as explained in the notebook above.

+
+

4.3.1. Results Prediction#

+

First we will showcase the Result Set. As we didn’t send any name, its default name will be "prediction":

+
+
+
batch_inference_run.artifact("prediction").as_df()
+
+
+
+
+

feature_0feature_1feature_2feature_3feature_4feature_5feature_6feature_7feature_8feature_9...feature_11feature_12feature_13feature_14feature_15feature_16feature_17feature_18feature_19label
06.3191113.2082100.793499-0.6132522.6347661.2083524.7187353.557495-2.1163110.370145...1.9629010.5813214.8445321.4087370.9649872.1114563.134610-1.7279370.3357410
13.1383781.6336610.1445570.6870032.4042791.4059903.2358921.8044262.0199801.719908...-1.4868090.6482283.0397971.8067661.2016922.4759741.4485590.9592661.1586511
22.3530264.6428790.9520970.6059774.051640-0.1575841.2187432.4647381.706084-0.250366...0.5599680.9793782.4117033.7468302.2521553.4061023.263166-0.236510-0.3131611
31.6172024.5683322.9379612.5011663.9525410.6717493.7745944.042543-2.173079-0.983443...-0.8390100.9536983.0335511.0068912.3985635.0473825.2912601.3055840.8439511
43.3442914.5383571.032059-0.0479313.1184380.4038124.4726151.840558-0.7147750.287726...1.598060-0.8055084.7420324.6087921.6177174.5148953.648923-1.3440240.6105340
..................................................................
24952.3193012.9969411.3379340.8056492.3036560.2030695.5755593.4377900.7097770.392013...-0.114619-1.4697974.5381261.2824985.6861332.8269732.445658-0.1457800.3378030
24962.9206782.1449832.153517-0.5272952.6120401.1137042.4387613.2844251.0938940.921599...-1.5868520.4098384.0947632.6366543.3334143.2511061.1329761.072658-1.2401861
24974.2566982.135673-0.1144910.3299803.935633-0.7779582.5436432.195111-0.926822-0.251254...-0.9528890.6878202.2680435.0774542.2482593.4697042.2629000.687038-0.6140661
24984.7380302.390842-0.9723291.4714612.904280-2.0790882.5706042.325262-1.602976-0.806244...0.5543990.0274934.1457283.7828024.2020063.2727090.867462-1.0200292.0133010
24992.0478312.1538130.3924840.2490103.8469100.3008463.0059972.799457-0.304962-0.990622...-0.2634730.1100912.9954112.5828434.5995353.2190911.592652-0.074851-0.6177691
+

2500 rows × 21 columns

+
+
+
+
+

4.3.2. Data Drift Analysis#

+

Second we will review the data drift table plot and the drift results:

+
+
+
batch_inference_run.artifact("drift_table_plot").show()
+
+
+
+
+
+ + +
+
+ +
+
+
+
+
batch_inference_run.status.results
+
+
+
+
+
{'drift_status': False, 'drift_metric': 0.3880999515903545}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/batch_inference/1.8.0/static/function.html b/functions/master/batch_inference/1.8.0/static/function.html new file mode 100644 index 00000000..82e1d610 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/static/function.html @@ -0,0 +1,142 @@ + + + + + + + + + + + Source + + + + +
+        
+kind: job
+verbose: false
+metadata:
+  name: batch-inference
+  tag: ''
+  categories:
+  - model-serving
+spec:
+  image: mlrun/ml-models
+  entry_points:
+    infer:
+      name: infer
+      doc: 'Perform a prediction on a given dataset with the given model. Can perform
+        drift analysis between the sample set
+
+        statistics stored in the model to the current input data. The drift rule is
+        the value per-feature mean of the TVD
+
+        and Hellinger scores according to the thresholds configures here.'
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: MLRun context.
+      - name: model
+        type: str
+        doc: The model Store path.
+      - name: dataset
+        type: DatasetType
+        doc: The dataset to infer through the model. Can be passed in `inputs` as
+          either a Dataset artifact / Feature vector URI. Or, in `parameters` as a
+          list, dictionary or numpy array.
+      - name: drop_columns
+        type: Union[str, List[str], int, List[int]]
+        doc: A string / integer or a list of strings / integers that represent the
+          column names / indices to drop. When the dataset is a list or a numpy array
+          this parameter must be represented by integers.
+        default: null
+      - name: label_columns
+        type: Union[str, List[str]]
+        doc: The target label(s) of the column(s) in the dataset for Regression or
+          Classification tasks. The label column can be accessed from the model object,
+          or the feature vector provided if available.
+        default: null
+      - name: feature_columns
+        type: Union[str, List[str]]
+        doc: List of feature columns that will be used to build the dataframe when
+          dataset is from type list or numpy array.
+        default: null
+      - name: log_result_set
+        type: bool
+        doc: Whether to log the result set - a DataFrame of the given inputs concatenated
+          with the predictions. Defaulted to True.
+        default: true
+      - name: result_set_name
+        type: str
+        doc: The db key to set name of the prediction result and the filename. Defaulted
+          to 'prediction'.
+        default: prediction
+      - name: batch_id
+        type: str
+        doc: The ID of the given batch (inference dataset). If `None`, it will be
+          generated. Will be logged as a result of the run.
+        default: null
+      - name: perform_drift_analysis
+        type: bool
+        doc: Whether to perform drift analysis between the sample set of the model
+          object to the dataset given. By default, None, which means it will perform
+          drift analysis if the model has a sample set statistics. Perform drift analysis
+          will produce a data drift table artifact.
+        default: null
+      - name: sample_set
+        type: DatasetType
+        doc: A sample dataset to give to compare the inputs in the drift analysis.
+          The default chosen sample set will always be the one who is set in the model
+          artifact itself.
+        default: null
+      - name: drift_threshold
+        type: float
+        doc: The threshold of which to mark drifts. Defaulted to 0.7.
+        default: 0.7
+      - name: possible_drift_threshold
+        type: float
+        doc: The threshold of which to mark possible drifts. Defaulted to 0.5.
+        default: 0.5
+      - name: inf_capping
+        type: float
+        doc: The value to set for when it reached infinity. Defaulted to 10.0.
+        default: 10.0
+      - name: artifacts_tag
+        type: str
+        doc: Tag to use for all the artifacts resulted from the function.
+        default: ''
+      lineno: 317
+      has_kwargs: true
+      has_varargs: false
+  allow_empty_resources: true
+  default_handler: infer
+  command: ''
+  build:
+    functionSourceCode: 
+    origin_filename: ''
+    auto_build: false
+    code_origin: ''
+    with_mlrun: false
+  disable_auto_mount: false
+  description: Batch inference (also known as prediction) for the common ML frameworks
+    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/batch_inference/1.8.0/static/item.html b/functions/master/batch_inference/1.8.0/static/item.html new file mode 100644 index 00000000..812ce885 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/static/item.html @@ -0,0 +1,66 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- model-serving
+description: Batch inference (also known as prediction) for the common ML frameworks
+  (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
+doc: ''
+example: batch_inference.ipynb
+generationDate: 2022-08-28:17-25
+hidden: false
+icon: ''
+labels:
+  author: guyl
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.7.0
+name: batch_inference
+platformVersion: 3.5.0
+spec:
+  extra_spec:
+    allow_empty_resources: true
+    build:
+      auto_build: false
+      with_mlrun: false
+  filename: batch_inference.py
+  handler: infer
+  image: mlrun/ml-models
+  kind: job
+  requirements:
+url: ''
+version: 1.8.0
+
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/batch_inference/1.8.0/static/source.html b/functions/master/batch_inference/1.8.0/static/source.html new file mode 100644 index 00000000..f0aad0a5 --- /dev/null +++ b/functions/master/batch_inference/1.8.0/static/source.html @@ -0,0 +1,480 @@ + + + + + + + + + + + Source + + + + +
+        
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import hashlib
+import json
+from datetime import datetime
+from typing import Any, Dict, List, Tuple, Union
+import semver
+
+import mlrun
+if semver.compare(mlrun.__version__, "1.5.0") >= 0:
+    raise mlrun.errors.MLRunNotFoundError(
+        f"When using `mlrun` version >=1.5.0, please use "
+        f"batch inference `v2` function ('hub://batch_inference_v2')."
+    )
+
+import mlrun.datastore
+import mlrun.utils
+import numpy as np
+import pandas as pd
+from mlrun import feature_store as fs
+from mlrun.artifacts import Artifact
+from mlrun.data_types.infer import InferOptions, get_df_stats
+from mlrun.frameworks.auto_mlrun import AutoMLRun
+from mlrun.model_monitoring.features_drift_table import FeaturesDriftTablePlot
+from mlrun.model_monitoring.model_monitoring_batch import (
+    VirtualDrift,
+    calculate_inputs_statistics,
+)
+
+# A union of all supported dataset types:
+DatasetType = Union[mlrun.DataItem, list, dict, pd.DataFrame, pd.Series, np.ndarray]
+
+
+def _read_dataset_as_dataframe(
+    dataset: DatasetType,
+    feature_columns: Union[str, List[str]] = None,
+    label_columns: Union[str, List[str]] = None,
+    drop_columns: Union[str, List[str], int, List[int]] = None,
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Parse the given dataset into a DataFrame and drop the columns accordingly. In addition, the label columns will be
+    parsed and validated as well.
+
+    :param dataset:         A dataset that will be converted into a DataFrame.
+                            Can be either a list of lists, dict, URI or a FeatureVector.
+    :param feature_columns: List of feature columns that will be used to build the dataframe when dataset is from
+                            type list or numpy array.
+    :param label_columns:   The target label(s) of the column(s) in the dataset. for Regression or
+                            Classification tasks.
+    :param drop_columns:    ``str`` / ``int`` or a list of ``str`` / ``int`` that represent the column names / indices
+                            to drop.
+
+    :returns: A tuple of:
+              [0] = The parsed dataset as a DataFrame
+              [1] = Label columns.
+
+    raises MLRunInvalidArgumentError: If the `drop_columns` are not matching the dataset or unsupported dataset type.
+    """
+    # Turn the `drop labels` into a list if given:
+    if drop_columns is not None:
+        if not isinstance(drop_columns, list):
+            drop_columns = [drop_columns]
+
+    # Check if the dataset is in fact a Feature Vector:
+    if isinstance(dataset, fs.FeatureVector):
+        # Try to get the label columns if not provided:
+        if label_columns is None:
+            label_columns = dataset.status.label_column
+        # Get the features and parse to DataFrame:
+        dataset = fs.get_offline_features(
+            dataset.uri, drop_columns=drop_columns
+        ).to_dataframe()
+
+    elif isinstance(dataset, (list, np.ndarray)):
+        if not feature_columns:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Feature columns list must be provided when dataset input as from type list or numpy array"
+            )
+        # Parse the list / numpy array into a DataFrame:
+        dataset = pd.DataFrame(dataset, columns=feature_columns)
+        # Validate the `drop_columns` is given as integers:
+        if drop_columns and not all(isinstance(col, int) for col in drop_columns):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "`drop_columns` must be an integer / list of integers if provided as a list."
+            )
+    elif isinstance(dataset, mlrun.DataItem):
+        # Turn the DataITem to DataFrame:
+        dataset = dataset.as_df()
+    else:
+        # Parse the object (should be a pd.DataFrame / pd.Series, dictionary) into a DataFrame:
+        try:
+            dataset = pd.DataFrame(dataset)
+        except ValueError as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Could not parse the given dataset of type {type(dataset)} into a pandas DataFrame. "
+                f"Received the following error: {e}"
+            )
+    # Drop columns if needed:
+    if drop_columns:
+        dataset.drop(drop_columns, axis=1, inplace=True)
+
+    # Turn the `label_columns` into a list by default:
+    if label_columns is None:
+        label_columns = []
+    elif isinstance(label_columns, (str, int)):
+        label_columns = [label_columns]
+    return dataset, label_columns
+
+
+def _prepare_result_set(
+    x: pd.DataFrame, label_columns: List[str], y_pred: np.ndarray
+) -> pd.DataFrame:
+    """
+    Set default label column names and validate given names to prepare the result set - a concatenation of the inputs
+    (x) and the model predictions (y_pred).
+
+    :param x:             The inputs.
+    :param label_columns: A list of strings representing the target column names to add to the predictions. Default name
+                          will be used in case the list is empty (predicted_label_{i}).
+    :param y_pred:        The model predictions on the inputs.
+
+    :returns: The result set.
+
+    raises MLRunInvalidArgumentError: If the labels columns amount do not match the outputs or if one of the label
+                                       column already exists in the dataset.
+    """
+    # Prepare default target columns names if not provided:
+    prediction_columns_amount = 1 if len(y_pred.shape) == 1 else y_pred.shape[1]
+    if len(label_columns) == 0:
+        # Add default label column names:
+        if prediction_columns_amount == 1:
+            label_columns = ["predicted_label"]
+        else:
+            label_columns = [
+                f"predicted_label_{i}" for i in range(prediction_columns_amount)
+            ]
+
+    # Validate the label columns:
+    if prediction_columns_amount != len(label_columns):
+        # No equality between provided label column names and outputs amount:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"The number of predicted labels: {prediction_columns_amount} "
+            f"is not equal to the given label columns: {len(label_columns)}"
+        )
+    common_labels = set(label_columns) & set(x.columns.tolist())
+    if common_labels:
+        # Label column exist in the original inputs:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"The labels: {common_labels} are already existed in the given dataset."
+        )
+
+    return pd.concat(
+        [x, pd.DataFrame(y_pred, columns=label_columns, index=x.index)], axis=1
+    )
+
+
+def _get_sample_set_statistics(
+    sample_set: DatasetType = None, model_artifact_feature_stats: dict = None
+) -> dict:
+    """
+    Get the sample set statistics either from the given sample set or the statistics logged with the model while
+    favoring the given sample set.
+
+    :param sample_set:                   A sample dataset to give to compare the inputs in the drift analysis.
+    :param model_artifact_feature_stats: The `feature_stats` attribute in the spec of the model artifact, where the
+                                         original sample set statistics of the model was used.
+
+    :returns: The sample set statistics.
+
+    raises MLRunInvalidArgumentError: If no sample set or statistics were given.
+    """
+    # Check if a sample set was provided:
+    if sample_set is None:
+        # Check if the model was logged with a sample set:
+        if model_artifact_feature_stats is None:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Cannot perform drift analysis as there is no sample set to compare to. The model artifact was not "
+                "logged with a sample set and `sample_set` was not provided to the function."
+            )
+        # Return the statistics logged with the model:
+        return model_artifact_feature_stats
+
+    # Turn the DataItem to DataFrame:
+    if isinstance(sample_set, mlrun.DataItem):
+        sample_set, _ = _read_dataset_as_dataframe(dataset=sample_set)
+
+    # Return the sample set statistics:
+    return get_df_stats(df=sample_set, options=InferOptions.Histogram)
+
+
+def _get_drift_result(
+    tvd: float,
+    hellinger: float,
+    threshold: float,
+) -> Tuple[bool, float]:
+    """
+    Calculate the drift result by the following equation: (tvd + hellinger) / 2
+
+    :param tvd:       The feature's TVD value.
+    :param hellinger: The feature's Hellinger value.
+    :param threshold: The threshold from which the value is considered a drift.
+
+    :returns: A tuple of:
+              [0] = Boolean value as the drift status.
+              [1] = The result.
+    """
+    result = (tvd + hellinger) / 2
+    if result >= threshold:
+        return True, result
+    return False, result
+
+
+def _perform_drift_analysis(
+    sample_set_statistics: dict,
+    inputs: pd.DataFrame,
+    drift_threshold: float,
+    possible_drift_threshold: float,
+    inf_capping: float,
+) -> Tuple[Artifact, Artifact, dict]:
+    """
+    Perform drift analysis, producing the drift table artifact for logging post prediction.
+
+    :param sample_set_statistics:    The statistics of the sample set logged along a model.
+    :param inputs:                   Input dataset to perform the drift calculation on.
+    :param drift_threshold:          The threshold of which to mark drifts.
+    :param possible_drift_threshold: The threshold of which to mark possible drifts.
+    :param inf_capping:              The value to set for when it reached infinity.
+
+    :returns: A tuple of
+              [0] = An MLRun artifact holding the HTML code of the drift table plot.
+              [1] = An MLRun artifact holding the metric per feature dictionary.
+              [2] = Results to log the final analysis outcome.
+    """
+    # Calculate the input's statistics:
+    inputs_statistics = calculate_inputs_statistics(
+        sample_set_statistics=sample_set_statistics,
+        inputs=inputs,
+    )
+
+    # Calculate drift:
+    virtual_drift = VirtualDrift(inf_capping=inf_capping)
+    metrics = virtual_drift.compute_drift_from_histograms(
+        feature_stats=sample_set_statistics,
+        current_stats=inputs_statistics,
+    )
+    drift_results = virtual_drift.check_for_drift_per_feature(
+        metrics_results_dictionary=metrics,
+        possible_drift_threshold=possible_drift_threshold,
+        drift_detected_threshold=drift_threshold,
+    )
+
+    # Validate all feature columns named the same between the inputs and sample sets:
+    sample_features = set(
+        [
+            feature_name
+            for feature_name, feature_statistics in sample_set_statistics.items()
+            if isinstance(feature_statistics, dict)
+        ]
+    )
+    input_features = set(inputs.columns)
+    if len(sample_features & input_features) != len(input_features):
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Not all feature names were matching between the inputs and the sample set provided: "
+            f"{input_features - sample_features | sample_features - input_features}"
+        )
+
+    # Plot:
+    html_plot = FeaturesDriftTablePlot().produce(
+        features=list(input_features),
+        sample_set_statistics=sample_set_statistics,
+        inputs_statistics=inputs_statistics,
+        metrics=metrics,
+        drift_results=drift_results,
+    )
+
+    # Prepare metrics per feature dictionary:
+    metrics_per_feature = {
+        feature: _get_drift_result(
+            tvd=metric_dictionary["tvd"],
+            hellinger=metric_dictionary["hellinger"],
+            threshold=drift_threshold,
+        )[1]
+        for feature, metric_dictionary in metrics.items()
+        if isinstance(metric_dictionary, dict)
+    }
+
+    # Calculate the final analysis result:
+    drift_status, drift_metric = _get_drift_result(
+        tvd=metrics["tvd_mean"],
+        hellinger=metrics["hellinger_mean"],
+        threshold=drift_threshold,
+    )
+
+    return (
+        Artifact(body=html_plot, format="html", key="drift_table_plot"),
+        Artifact(
+            body=json.dumps(metrics_per_feature),
+            format="json",
+            key="features_drift_results",
+        ),
+        {"drift_status": drift_status, "drift_metric": drift_metric},
+    )
+
+
+def infer(
+    context: mlrun.MLClientCtx,
+    model: str,
+    dataset: DatasetType,
+    drop_columns: Union[str, List[str], int, List[int]] = None,
+    label_columns: Union[str, List[str]] = None,
+    feature_columns: Union[str, List[str]] = None,
+    log_result_set: bool = True,
+    result_set_name: str = "prediction",
+    batch_id: str = None,
+    perform_drift_analysis: bool = None,
+    sample_set: DatasetType = None,
+    drift_threshold: float = 0.7,
+    possible_drift_threshold: float = 0.5,
+    inf_capping: float = 10.0,
+    artifacts_tag: str = "",
+    **predict_kwargs: Dict[str, Any],
+):
+    """
+    Perform a prediction on a given dataset with the given model. Can perform drift analysis between the sample set
+    statistics stored in the model and the current input data. The drift rule is the per-feature mean of the TVD and
+    Hellinger scores according to the thresholds configured here.
+
+    :param context:                  MLRun context.
+    :param model:                    The model Store path.
+    :param dataset:                  The dataset to infer through the model. Can be passed in `inputs` as either a
+                                     Dataset artifact / Feature vector URI. Or, in `parameters` as a list, dictionary or
+                                     numpy array.
+    :param drop_columns:             A string / integer or a list of strings / integers that represent the column names
+                                     / indices to drop. When the dataset is a list or a numpy array this parameter must
+                                     be represented by integers.
+    :param label_columns:            The target label(s) of the column(s) in the dataset for Regression or
+                                     Classification tasks. The label column can be accessed from the model object, or
+                                     the feature vector provided if available.
+    :param feature_columns:          List of feature columns that will be used to build the dataframe when the dataset
+                                     is of type list or numpy array.
+    :param log_result_set:           Whether to log the result set - a DataFrame of the given inputs concatenated with
+                                     the predictions. Defaulted to True.
+    :param result_set_name:          The db key to set as the name of the prediction result and the filename. Defaulted
+                                     to 'prediction'.
+    :param batch_id:                 The ID of the given batch (inference dataset). If `None`, it will be generated.
+                                     Generated / logged as a result of the run only when `log_result_set` is True.
+    :param perform_drift_analysis:   Whether to perform drift analysis between the sample set of the model object and
+                                     the dataset given. By default, None, which means it will perform drift analysis if
+                                     the model has sample set statistics. Drift analysis yields a drift table artifact.
+    :param sample_set:               A sample dataset to compare the inputs against in the drift analysis. The default
+                                     chosen sample set will always be the one that is set in the model artifact itself.
+    :param drift_threshold:          The threshold of which to mark drifts. Defaulted to 0.7.
+    :param possible_drift_threshold: The threshold of which to mark possible drifts. Defaulted to 0.5.
+    :param inf_capping:              The value to set for when it reached infinity. Defaulted to 10.0.
+    :param artifacts_tag:            Tag to use for all the artifacts resulting from the function.
+    :param predict_kwargs:           Additional keyword arguments forwarded to the model's `predict` call.
+    """
+    # Load the model (label / feature columns default to the model artifact's outputs / inputs):
+    context.logger.info(f"Loading model...")
+    model_handler = AutoMLRun.load_model(model_path=model, context=context)
+    if label_columns is None:
+        label_columns = [
+            output.name for output in model_handler._model_artifact.spec.outputs
+        ]
+
+    if feature_columns is None:
+        feature_columns = [
+            input.name for input in model_handler._model_artifact.spec.inputs
+        ]
+
+    # Get dataset by object, URL or by FeatureVector:
+    context.logger.info(f"Loading data...")
+    x, label_columns = _read_dataset_as_dataframe(
+        dataset=dataset,
+        feature_columns=feature_columns,
+        label_columns=label_columns,
+        drop_columns=drop_columns,
+    )
+
+    # Predict (extra keyword arguments are forwarded to the model's `predict`):
+    context.logger.info(f"Calculating prediction...")
+    y_pred = model_handler.model.predict(x, **predict_kwargs)
+
+    # Prepare the result set:
+    result_set = _prepare_result_set(x=x, label_columns=label_columns, y_pred=y_pred)
+
+    # Check for logging the result set:
+    if log_result_set:
+        # Log the result set:
+        context.logger.info(f"Logging result set (x | prediction)...")
+        context.log_dataset(
+            key=result_set_name,
+            df=result_set,
+            db_key=result_set_name,
+            tag=artifacts_tag,
+        )
+        # Log the batch ID (a SHA-224 hash of the current timestamp when none was given):
+        if batch_id is None:
+            batch_id = hashlib.sha224(str(datetime.now()).encode()).hexdigest()
+        context.log_result(
+            key="batch_id",
+            value=batch_id,
+        )
+
+    # Check for performing drift analysis (on by default when the model has feature statistics):
+    if (
+        perform_drift_analysis is None
+        and model_handler._model_artifact.spec.feature_stats is not None
+    ):
+        perform_drift_analysis = True
+    if perform_drift_analysis:
+        context.logger.info("Performing drift analysis...")
+        # Get the sample set statistics (either from the sample set or from the statistics logged with the model):
+        sample_set_statistics = _get_sample_set_statistics(
+            sample_set=sample_set,
+            model_artifact_feature_stats=model_handler._model_artifact.spec.feature_stats,
+        )
+        # Produce the artifacts and the analysis results:
+        (
+            drift_table_plot,
+            metric_per_feature_dict,
+            analysis_results,
+        ) = _perform_drift_analysis(
+            sample_set_statistics=sample_set_statistics,
+            inputs=result_set,
+            drift_threshold=drift_threshold,
+            possible_drift_threshold=possible_drift_threshold,
+            inf_capping=inf_capping,
+        )
+        # Log the artifacts and results:
+        context.log_artifact(drift_table_plot, tag=artifacts_tag)
+        context.log_artifact(metric_per_feature_dict, tag=artifacts_tag)
+        context.log_results(results=analysis_results)
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/batch_inference/latest/src/function.yaml b/functions/master/batch_inference/latest/src/function.yaml index cdee5641..74b672d4 100644 --- a/functions/master/batch_inference/latest/src/function.yaml +++ b/functions/master/batch_inference/latest/src/function.yaml @@ -1,25 +1,12 @@ kind: job +verbose: false metadata: name: batch-inference tag: '' - hash: c7b8439a70292e916788a04dce35e57e00b1a41c - project: '' - labels: - author: guyl categories: - - utils + - model-serving spec: - command: '' - args: [] image: mlrun/ml-models - build: - functionSourceCode:  - commands: [] - code_origin: '' - origin_filename: '' - with_mlrun: false - auto_build: false - requirements: [] entry_points: infer: name: infer @@ -103,19 +90,18 @@ spec: type: str doc: Tag to use for all the artifacts resulted from the function. default: '' - outputs: - - default: '' lineno: 317 - description: Batch inference (also knows as prediction) for the common ML frameworks - (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. + has_kwargs: true + has_varargs: false + allow_empty_resources: true default_handler: infer + command: '' + build: + functionSourceCode:  + origin_filename: '' + auto_build: false + code_origin: '' + with_mlrun: false disable_auto_mount: false - allow_empty_resources: true - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false + description: Batch inference (also knows as prediction) for the common ML frameworks + (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. 
diff --git a/functions/master/batch_inference/latest/src/item.yaml b/functions/master/batch_inference/latest/src/item.yaml index 125fb525..16a56cfe 100644 --- a/functions/master/batch_inference/latest/src/item.yaml +++ b/functions/master/batch_inference/latest/src/item.yaml @@ -1,6 +1,6 @@ apiVersion: v1 categories: -- utils +- model-serving description: Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. doc: '' @@ -12,7 +12,7 @@ labels: author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.1 +mlrunVersion: 1.7.0 name: batch_inference platformVersion: 3.5.0 spec: @@ -27,5 +27,5 @@ spec: kind: job requirements: url: '' -version: 1.7.0 +version: 1.8.0 diff --git a/functions/master/batch_inference/latest/static/documentation.html b/functions/master/batch_inference/latest/static/documentation.html index d37b20b7..e6e1e6d5 100644 --- a/functions/master/batch_inference/latest/static/documentation.html +++ b/functions/master/batch_inference/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/batch_inference/latest/static/example.html b/functions/master/batch_inference/latest/static/example.html index 63c52c56..bbd82747 100644 --- a/functions/master/batch_inference/latest/static/example.html +++ b/functions/master/batch_inference/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/batch_inference/latest/static/function.html b/functions/master/batch_inference/latest/static/function.html index afa44256..82e1d610 100644 --- a/functions/master/batch_inference/latest/static/function.html +++ b/functions/master/batch_inference/latest/static/function.html @@ -29,27 +29,14 @@
         
 kind: job
+verbose: false
 metadata:
   name: batch-inference
   tag: ''
-  hash: c7b8439a70292e916788a04dce35e57e00b1a41c
-  project: ''
-  labels:
-    author: guyl
   categories:
-  - utils
+  - model-serving
 spec:
-  command: ''
-  args: []
   image: mlrun/ml-models
-  build:
-    functionSourceCode: 
-    commands: []
-    code_origin: ''
-    origin_filename: ''
-    with_mlrun: false
-    auto_build: false
-    requirements: []
   entry_points:
     infer:
       name: infer
@@ -133,22 +120,21 @@
         type: str
         doc: Tag to use for all the artifacts resulted from the function.
         default: ''
-      outputs:
-      - default: ''
       lineno: 317
-  description: Batch inference (also knows as prediction) for the common ML frameworks
-    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
+      has_kwargs: true
+      has_varargs: false
+  allow_empty_resources: true
   default_handler: infer
+  command: ''
+  build:
+    functionSourceCode: 
+    origin_filename: ''
+    auto_build: false
+    code_origin: ''
+    with_mlrun: false
   disable_auto_mount: false
-  allow_empty_resources: true
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
-verbose: false
+  description: Batch inference (also knows as prediction) for the common ML frameworks
+    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 
         
     
diff --git a/functions/master/batch_inference/latest/static/item.html b/functions/master/batch_inference/latest/static/item.html index a37f5b3f..812ce885 100644 --- a/functions/master/batch_inference/latest/static/item.html +++ b/functions/master/batch_inference/latest/static/item.html @@ -30,7 +30,7 @@ apiVersion: v1 categories: -- utils +- model-serving description: Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. doc: '' @@ -42,7 +42,7 @@ author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.1 +mlrunVersion: 1.7.0 name: batch_inference platformVersion: 3.5.0 spec: @@ -57,7 +57,7 @@ kind: job requirements: url: '' -version: 1.7.0 +version: 1.8.0 diff --git a/functions/master/batch_inference_v2/2.6.0/src/function.yaml b/functions/master/batch_inference_v2/2.6.0/src/function.yaml index e0a9310c..014cb216 100644 --- a/functions/master/batch_inference_v2/2.6.0/src/function.yaml +++ b/functions/master/batch_inference_v2/2.6.0/src/function.yaml @@ -1,19 +1,10 @@ +verbose: false spec: - image: mlrun/mlrun default_handler: infer - command: '' - allow_empty_resources: true - description: Batch inference (also knows as prediction) for the common ML frameworks - (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. - disable_auto_mount: false - build: - with_mlrun: false - functionSourceCode:  - code_origin: '' - auto_build: false - origin_filename: '' entry_points: infer: + lineno: 102 + name: infer parameters: - name: context type: MLClientCtx @@ -111,8 +102,7 @@ spec: doc: The threshold of which to mark possible drifts. Defaulted to 0.5. default: null has_kwargs: true - lineno: 102 - name: infer + has_varargs: false doc: 'Perform a prediction on the provided dataset using the specified model. Ensure that the model has already been logged under the current project. 
@@ -133,13 +123,21 @@ spec: At the moment, this function is supported for `mlrun>=1.5.0` versions.' - has_varargs: false -verbose: false + command: '' + build: + with_mlrun: false + code_origin: '' + origin_filename: '' + auto_build: false + functionSourceCode:  + allow_empty_resources: true + disable_auto_mount: false + image: mlrun/mlrun + description: Batch inference (also knows as prediction) for the common ML frameworks + (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. metadata: - name: batch-inference-v2 tag: '' categories: - - utils - - data-analysis - - monitoring + - model-serving + name: batch-inference-v2 kind: job diff --git a/functions/master/batch_inference_v2/2.6.0/src/item.yaml b/functions/master/batch_inference_v2/2.6.0/src/item.yaml index e995c770..775579b9 100644 --- a/functions/master/batch_inference_v2/2.6.0/src/item.yaml +++ b/functions/master/batch_inference_v2/2.6.0/src/item.yaml @@ -1,8 +1,6 @@ apiVersion: v1 categories: -- utils -- data-analysis -- monitoring +- model-serving description: Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis. 
doc: '' diff --git a/functions/master/batch_inference_v2/2.6.0/static/batch_inference_v2.html b/functions/master/batch_inference_v2/2.6.0/static/batch_inference_v2.html index 9c41518e..85861922 100644 --- a/functions/master/batch_inference_v2/2.6.0/static/batch_inference_v2.html +++ b/functions/master/batch_inference_v2/2.6.0/static/batch_inference_v2.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/batch_inference_v2/2.6.0/static/documentation.html b/functions/master/batch_inference_v2/2.6.0/static/documentation.html index b45d98d7..bb677bf7 100644 --- a/functions/master/batch_inference_v2/2.6.0/static/documentation.html +++ b/functions/master/batch_inference_v2/2.6.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/batch_inference_v2/2.6.0/static/example.html b/functions/master/batch_inference_v2/2.6.0/static/example.html index 5fb0f011..c3bc49bd 100644 --- a/functions/master/batch_inference_v2/2.6.0/static/example.html +++ b/functions/master/batch_inference_v2/2.6.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/batch_inference_v2/2.6.0/static/function.html b/functions/master/batch_inference_v2/2.6.0/static/function.html index 94730d3a..39926fe5 100644 --- a/functions/master/batch_inference_v2/2.6.0/static/function.html +++ b/functions/master/batch_inference_v2/2.6.0/static/function.html @@ -28,22 +28,13 @@
         
+verbose: false
 spec:
-  image: mlrun/mlrun
   default_handler: infer
-  command: ''
-  allow_empty_resources: true
-  description: Batch inference (also knows as prediction) for the common ML frameworks
-    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
-  disable_auto_mount: false
-  build:
-    with_mlrun: false
-    functionSourceCode: 
-    code_origin: ''
-    auto_build: false
-    origin_filename: ''
   entry_points:
     infer:
+      lineno: 102
+      name: infer
       parameters:
       - name: context
         type: MLClientCtx
@@ -141,8 +132,7 @@
         doc: The threshold of which to mark possible drifts. Defaulted to 0.5.
         default: null
       has_kwargs: true
-      lineno: 102
-      name: infer
+      has_varargs: false
       doc: 'Perform a prediction on the provided dataset using the specified model.
 
         Ensure that the model has already been logged under the current project.
@@ -163,15 +153,23 @@
 
 
         At the moment, this function is supported for `mlrun>=1.5.0` versions.'
-      has_varargs: false
-verbose: false
+  command: ''
+  build:
+    with_mlrun: false
+    code_origin: ''
+    origin_filename: ''
+    auto_build: false
+    functionSourceCode: 
+  allow_empty_resources: true
+  disable_auto_mount: false
+  image: mlrun/mlrun
+  description: Batch inference (also knows as prediction) for the common ML frameworks
+    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 metadata:
-  name: batch-inference-v2
   tag: ''
   categories:
-  - utils
-  - data-analysis
-  - monitoring
+  - model-serving
+  name: batch-inference-v2
 kind: job
 
         
diff --git a/functions/master/batch_inference_v2/2.6.0/static/item.html b/functions/master/batch_inference_v2/2.6.0/static/item.html
index 19b6d7ed..b73b3ff0 100644
--- a/functions/master/batch_inference_v2/2.6.0/static/item.html
+++ b/functions/master/batch_inference_v2/2.6.0/static/item.html
@@ -30,9 +30,7 @@
         
 apiVersion: v1
 categories:
-- utils
-- data-analysis
-- monitoring
+- model-serving
 description: Batch inference (also knows as prediction) for the common ML frameworks
   (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 doc: ''
diff --git a/functions/master/batch_inference_v2/latest/src/function.yaml b/functions/master/batch_inference_v2/latest/src/function.yaml
index e0a9310c..014cb216 100644
--- a/functions/master/batch_inference_v2/latest/src/function.yaml
+++ b/functions/master/batch_inference_v2/latest/src/function.yaml
@@ -1,19 +1,10 @@
+verbose: false
 spec:
-  image: mlrun/mlrun
   default_handler: infer
-  command: ''
-  allow_empty_resources: true
-  description: Batch inference (also knows as prediction) for the common ML frameworks
-    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
-  disable_auto_mount: false
-  build:
-    with_mlrun: false
-    functionSourceCode: 
-    code_origin: ''
-    auto_build: false
-    origin_filename: ''
   entry_points:
     infer:
+      lineno: 102
+      name: infer
       parameters:
       - name: context
         type: MLClientCtx
@@ -111,8 +102,7 @@ spec:
         doc: The threshold of which to mark possible drifts. Defaulted to 0.5.
         default: null
       has_kwargs: true
-      lineno: 102
-      name: infer
+      has_varargs: false
       doc: 'Perform a prediction on the provided dataset using the specified model.
 
         Ensure that the model has already been logged under the current project.
@@ -133,13 +123,21 @@ spec:
 
 
         At the moment, this function is supported for `mlrun>=1.5.0` versions.'
-      has_varargs: false
-verbose: false
+  command: ''
+  build:
+    with_mlrun: false
+    code_origin: ''
+    origin_filename: ''
+    auto_build: false
+    functionSourceCode: 
+  allow_empty_resources: true
+  disable_auto_mount: false
+  image: mlrun/mlrun
+  description: Batch inference (also knows as prediction) for the common ML frameworks
+    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 metadata:
-  name: batch-inference-v2
   tag: ''
   categories:
-  - utils
-  - data-analysis
-  - monitoring
+  - model-serving
+  name: batch-inference-v2
 kind: job
diff --git a/functions/master/batch_inference_v2/latest/src/item.yaml b/functions/master/batch_inference_v2/latest/src/item.yaml
index e995c770..775579b9 100644
--- a/functions/master/batch_inference_v2/latest/src/item.yaml
+++ b/functions/master/batch_inference_v2/latest/src/item.yaml
@@ -1,8 +1,6 @@
 apiVersion: v1
 categories:
-- utils
-- data-analysis
-- monitoring
+- model-serving
 description: Batch inference (also knows as prediction) for the common ML frameworks
   (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 doc: ''
diff --git a/functions/master/batch_inference_v2/latest/static/batch_inference_v2.html b/functions/master/batch_inference_v2/latest/static/batch_inference_v2.html
index 9c41518e..85861922 100644
--- a/functions/master/batch_inference_v2/latest/static/batch_inference_v2.html
+++ b/functions/master/batch_inference_v2/latest/static/batch_inference_v2.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/batch_inference_v2/latest/static/documentation.html b/functions/master/batch_inference_v2/latest/static/documentation.html
index b45d98d7..bb677bf7 100644
--- a/functions/master/batch_inference_v2/latest/static/documentation.html
+++ b/functions/master/batch_inference_v2/latest/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/batch_inference_v2/latest/static/example.html b/functions/master/batch_inference_v2/latest/static/example.html
index 5fb0f011..c3bc49bd 100644
--- a/functions/master/batch_inference_v2/latest/static/example.html
+++ b/functions/master/batch_inference_v2/latest/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/batch_inference_v2/latest/static/function.html b/functions/master/batch_inference_v2/latest/static/function.html
index 94730d3a..39926fe5 100644
--- a/functions/master/batch_inference_v2/latest/static/function.html
+++ b/functions/master/batch_inference_v2/latest/static/function.html
@@ -28,22 +28,13 @@
 
     
         
+verbose: false
 spec:
-  image: mlrun/mlrun
   default_handler: infer
-  command: ''
-  allow_empty_resources: true
-  description: Batch inference (also knows as prediction) for the common ML frameworks
-    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
-  disable_auto_mount: false
-  build:
-    with_mlrun: false
-    functionSourceCode: 
-    code_origin: ''
-    auto_build: false
-    origin_filename: ''
   entry_points:
     infer:
+      lineno: 102
+      name: infer
       parameters:
       - name: context
         type: MLClientCtx
@@ -141,8 +132,7 @@
         doc: The threshold of which to mark possible drifts. Defaulted to 0.5.
         default: null
       has_kwargs: true
-      lineno: 102
-      name: infer
+      has_varargs: false
       doc: 'Perform a prediction on the provided dataset using the specified model.
 
         Ensure that the model has already been logged under the current project.
@@ -163,15 +153,23 @@
 
 
         At the moment, this function is supported for `mlrun>=1.5.0` versions.'
-      has_varargs: false
-verbose: false
+  command: ''
+  build:
+    with_mlrun: false
+    code_origin: ''
+    origin_filename: ''
+    auto_build: false
+    functionSourceCode: 
+  allow_empty_resources: true
+  disable_auto_mount: false
+  image: mlrun/mlrun
+  description: Batch inference (also knows as prediction) for the common ML frameworks
+    (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 metadata:
-  name: batch-inference-v2
   tag: ''
   categories:
-  - utils
-  - data-analysis
-  - monitoring
+  - model-serving
+  name: batch-inference-v2
 kind: job
 
         
diff --git a/functions/master/batch_inference_v2/latest/static/item.html b/functions/master/batch_inference_v2/latest/static/item.html
index 19b6d7ed..b73b3ff0 100644
--- a/functions/master/batch_inference_v2/latest/static/item.html
+++ b/functions/master/batch_inference_v2/latest/static/item.html
@@ -30,9 +30,7 @@
         
 apiVersion: v1
 categories:
-- utils
-- data-analysis
-- monitoring
+- model-serving
 description: Batch inference (also knows as prediction) for the common ML frameworks
   (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.
 doc: ''
diff --git a/functions/master/catalog.json b/functions/master/catalog.json
index 38d7b0a5..2ad2c475 100644
--- a/functions/master/catalog.json
+++ b/functions/master/catalog.json
@@ -1 +1 @@
-{"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", 
"requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, 
"0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.4", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": 
"feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", 
"example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": 
"src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", 
"doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/github_utils.ipynb", "source": 
"src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", 
"handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": 
"0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": 
"load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": 
"auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and 
LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/auto_trainer.ipynb", 
"source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": 
"handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model 
server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", "platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", 
"platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", 
"generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", 
"categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/open_archive.ipynb", 
"source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], 
"description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": 
["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/gen_class_data.ipynb", "source": 
"src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": 
"python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install 
pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": 
"azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", 
"version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", 
"version": "0.0.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", 
"requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": 
"arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", 
"platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": 
"mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference.ipynb", "source": 
"src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference 
(also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["huggingface", "genai", "model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", 
"hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": 
"question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on 
a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": 
["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": 
"transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", 
"source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", 
"https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", 
"presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.1.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": 
"", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", 
"spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": 
false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": 
"job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": 
"src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0", 
"assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", 
"data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, 
"maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", 
"torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": 
["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": 
{"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech 
diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": 
["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/mlflow_utils.ipynb", "source": "src/mlflow_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, 
and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/mlflow_utils.ipynb", "source": "src/mlflow_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", 
"noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}}
\ No newline at end of file
+{"tf2_serving": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "tf2-serving", "platformVersion": "3.5.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", 
"requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "tf2-serving", "platformVersion": "", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.9.1", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "tf2 image classification server", "doc": "", "example": "tf2_serving.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "0.8.0", "name": "tf2-serving", "platformVersion": "3.2.0", "spec": {"filename": "tf2_serving.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": ["requests", "pillow", "tensorflow>=2.1"]}, "url": "", "version": "0.8.0", "assets": {"example": "src/tf2_serving.ipynb", "source": "src/tf2_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "feature_selection": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": 
{"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "feature-selection", "platformVersion": "2.10.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": 
"feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.9.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc40", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model 
filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "feature-selection", "platformVersion": "3.2.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection/feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": 
{"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "feature-selection", "platformVersion": "3.5.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.1", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Select features through multiple Statistical and Model filters", "doc": "", "example": "feature_selection.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "orz"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.4", "name": "feature-selection", "platformVersion": "3.6.0", "spec": {"filename": "feature_selection.py", "handler": "feature_selection", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/feature_selection.ipynb", "source": "src/feature_selection.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": 
"sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server", "platformVersion": "3.5.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": 
"generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server", "platformVersion": "", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server", "platformVersion": "3.2.0", "spec": {"filename": "model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "nuclio:serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server.ipynb", "source": "src/model_server.py", "function": "src/function.yaml", 
"docs": "static/documentation.html"}}}, "describe": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.2": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-26:10-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.2", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "describe", "platformVersion": "3.5.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", 
"docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.5.4", "name": "describe", "platformVersion": "2.10.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-04-07:14-20", "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Iguazio"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe", "platformVersion": "3.2.0", "spec": {"filename": "describe.py", "handler": "summarize", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "describe and visualizes dataset stats", "doc": "", "example": "describe.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Davids"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "describe", "platformVersion": "3.5.3", "spec": {"filename": "describe.py", "handler": "analyze", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/describe.ipynb", "source": "src/describe.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}}, "github_utils": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "github-utils", "platformVersion": "3.5.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/github_utils.ipynb", 
"source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "github-utils", "platformVersion": "", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "add comments to github pull request", "doc": "", "example": "github_utils.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "github-utils", "platformVersion": "3.2.0", "spec": {"filename": "github_utils.py", "handler": "run_summary_comment", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/github_utils.ipynb", "source": "src/github_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "aggregate": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", 
"version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-05-19:22-31", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "aggregate", "platformVersion": "3.0.0", "spec": {"filename": "aggregate.py", "handler": 
"aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "aggregate", "platformVersion": "3.5.2", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "aggregate", "platformVersion": "3.2.0", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Rolling aggregation over Metrics and Lables according to specifications", "doc": "", "example": "aggregate.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avia"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.4.1", "name": "aggregate", "platformVersion": "3.5.4", "spec": {"filename": "aggregate.py", "handler": "aggregate", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/aggregate.ipynb", "source": "src/aggregate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "load_dataset": {"latest": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "load-dataset", "platformVersion": "3.5.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": 
"load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "load-dataset", "platformVersion": "", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.0", "name": "load-dataset", "platformVersion": "3.5.5", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": 
{"apiVersion": "v1", "categories": ["data-preparation"], "description": "load a toy dataset from scikit-learn", "doc": "README.md", "example": "load_dataset.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yjb", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "load-dataset", "platformVersion": "3.2.0", "spec": {"filename": "load_dataset.py", "handler": "load_dataset", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/load_dataset.ipynb", "source": "src/load_dataset.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "auto_trainer": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.6": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": 
"train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.6", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.3": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.3", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.2": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", 
"example": "auto_trainer.ipynb", "generationDate": "2022-02-06:10-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.10.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "", "kind": "job", "requirements": []}, "url": "", "version": "0.10.2", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.6.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.7.0", "assets": {"example": "src/auto_trainer.ipynb", "source": 
"src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.7": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.7", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], 
"marketplaceType": "", "mlrunVersion": "1.3.0", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-04-26:10-43", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.0.0", "name": "auto_trainer", "platformVersion": "", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.5", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Automatic train, evaluate and predict functions for the ML frameworks - Scikit-Learn, XGBoost and LightGBM.", "doc": "", "example": "auto_trainer.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "auto_trainer", "platformVersion": "3.5.0", "spec": {"filename": "auto_trainer.py", "handler": "train", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/auto_trainer.ipynb", "source": "src/auto_trainer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_server": {"latest": {"apiVersion": "v1", "categories": 
["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": 
"ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-server", "platformVersion": "", "spec": {"filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": [], "customFields": {"default_class": "ClassifierModel"}}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-server", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic 
sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-serving", "machine-learning"], "description": "generic sklearn model server", "doc": "", "example": "v2_model_server.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh", "framework": "sklearn"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-server", "platformVersion": "3.2.0", "spec": {"customFields": {"default_class": "ClassifierModel"}, "filename": "v2_model_server.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_server.ipynb", "source": "src/v2_model_server.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "model_server_tester": {"latest": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", 
"requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "model-server-tester", "platformVersion": "3.5.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "model-server-tester", 
"platformVersion": "", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.0.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["monitoring", "model-serving"], "description": "test model servers", "doc": "", "example": "model_server_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "model-server-tester", "platformVersion": "3.2.0", "spec": {"filename": "model_server_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/model_server_tester.ipynb", "source": "src/model_server_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "v2_model_tester": {"latest": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": 
"2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "v2-model-tester", "platformVersion": "3.5.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": 
["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "v2-model-tester", "platformVersion": "", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["model-testing", "machine-learning"], "description": "test v2 model servers", "doc": "", "example": "v2_model_tester.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "v2-model-tester", "platformVersion": "3.2.0", "spec": {"filename": "v2_model_tester.py", "handler": "model_server_tester", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/v2_model_tester.ipynb", "source": "src/v2_model_tester.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "open_archive": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", 
"function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Open a file/object archive into a target directory", "doc": "", "example": "open_archive.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yaronh"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0-rc50", "name": "open-archive", "platformVersion": "3.5.0", "spec": {"filename": "open_archive.py", "handler": "open_archive", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/open_archive.ipynb", "source": "src/open_archive.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "onnx_utils": {"latest": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["utils", "deep-learning"], "description": "ONNX 
intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun.", "doc": "", "example": "onnx_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.2", "name": "onnx_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "onnx_utils.py", "handler": "to_onnx", "image": "mlrun/mlrun", "kind": "job", "requirements": ["tqdm~=4.67.1", "tensorflow~=2.19.0", "tf_keras~=2.19.0", "torch~=2.6.0", "torchvision~=0.21.0", "onnx~=1.17.0", "onnxruntime~=1.19.2", "onnxoptimizer~=0.3.13", "onnxmltools~=1.13.0", "tf2onnx~=1.16.1", "plotly~=5.4.0"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/onnx_utils.ipynb", "source": "src/onnx_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "gen_class_data": {"latest": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.10.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": 
"", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.10.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "gen_class_data", "platformVersion": "3.5.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create 
a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.6.2", "name": "gen_class_data", "platformVersion": "3.0.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-preparation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "gen_class_data", "platformVersion": "3.2.0", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation"], "description": "Create a binary classification sample dataset and save.", "doc": "", "example": "gen_class_data.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "Daniel"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "gen_class_data", "platformVersion": "3.5.3", "spec": {"filename": "gen_class_data.py", "handler": "gen_class_data", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.3.0", "assets": {"example": "src/gen_class_data.ipynb", "source": "src/gen_class_data.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.4": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": 
"Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0", "plotly~=5.4"]}, "url": "", "version": "0.9.4", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", 
"docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-11-13:00-15", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "commands": null, "image": "", "kind": "job", "requirements": ["azureml-core==1.33.0", "azureml-train-automl-client==1.33.0"]}, "url": "", "version": "0.9.0", "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_utils", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["python -m pip install pip==22.1.2", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "1.2.0", "test_valid": false, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", 
"categories": ["model-serving", "utils"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.4.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.5": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2021-04-20:15-18", "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "azureml_utils", "platformVersion": "", "spec": {"filename": "azureml_utils.py", "handler": "train", "extra_spec": {"build": {"commands": ["python -m pip install pip==21.2.4", "apt-get update && apt-get install -y --no-install-recommends git"], "with_mlrun": true, "auto_build": true}, "allow_empty_resources": true}, "image": "python:3.7.9-slim", "kind": "job", "requirements": ["azureml-core==1.40.0", "azureml-train-automl-client==1.40.0", "plotly~=5.4"]}, "url": "", "version": "0.9.5", "assets": {"example": 
"src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-training"], "description": "Azure AutoML integration in MLRun, including utils functions for training models on Azure AutoML platfrom.", "doc": "", "example": "azureml_utils.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "azureml_utils", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "commands": ["apt-get update && apt-get install -y --no-install-recommends git", "apt install -y liblttng-ust0"], "with_mlrun": true}}, "filename": "azureml_utils.py", "handler": "train", "image": "python:3.9-bullseye", "kind": "job", "requirements": ["azureml-core==1.54.0.post1", "azureml-train-automl-client==1.54.0.post1", "plotly~=5.4"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/azureml_utils.ipynb", "source": "src/azureml_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "describe_spark": {"latest": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": 
["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "describe-spark", "platformVersion": "3.5.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-05-19:22-41", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "describe-spark", "platformVersion": "", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.1": {"apiVersion": "v1", "categories": 
["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe_spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.9.1", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe_spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["data-analysis"], "description": "", "doc": "", "example": "describe_spark.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "describe-spark", "platformVersion": "3.2.0", "spec": {"filename": "describe-spark.py", "handler": "describe_spark", "image": "iguazio/shell:3.0_b5565_20201026062233_wsdf", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/describe_spark.ipynb", "source": "src/describe-spark.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "send_email": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": 
"v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "send-email", "platformVersion": "3.5.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "1.1.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.9.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.9.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-05-19:23-13", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "", "name": "send-email", "platformVersion": "", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.0.1", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", 
"categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "send-email", "platformVersion": "3.5.3", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.2.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.8.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Send Email messages through SMTP server", "doc": "", "example": "send_email.ipynb", "generationDate": "2021-11-18:12-28", "icon": "", "labels": {"author": "saarc"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "0.8.0", "name": "send-email", "platformVersion": "3.2.0", "spec": {"filename": "send_email.py", "handler": "send_email", "image": "mlrun/ml-models", "kind": "job", "requirements": []}, "url": "", "version": "0.8.0", "assets": {"example": "src/send_email.ipynb", "source": "src/send_email.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "arc_to_parquet": {"latest": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.4.1": {"apiVersion": "v1", "categories": ["etl"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.4.1", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "retrieve remote archive, open and save as parquet", "doc": "", "example": "arc_to_parquet.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "avi"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "arc-to-parquet", "platformVersion": "3.5.4", "spec": {"filename": "arc_to_parquet.py", "handler": "arc_to_parquet", "image": "mlrun/mlrun", "kind": "job", "requirements": []}, "url": "", "version": "1.5.0", "assets": {"example": "src/arc_to_parquet.ipynb", "source": "src/arc_to_parquet.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "azureml_serving": {"latest": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": 
"mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "model-serving"], "description": "AzureML serving function", "doc": "", "example": "azureml_serving.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "azureml_serving", "platformVersion": "3.5.0", "spec": {"customFields": {"default_class": "mlrun.frameworks.sklearn.PickleModelServer"}, "filename": "azureml_serving.py", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["azureml-automl-runtime~=1.38.1"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/azureml_serving.ipynb", "source": "src/azureml_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": 
"static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference ( also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.2.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks 
(SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.7.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.1", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.7.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, 
"icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.4.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": 
{"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": null}, "url": "", "version": "1.3.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.1": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": true, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", "requirements": ["scikit-learn", "plotly"]}, "url": "", "version": "1.1.1", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference.ipynb", "generationDate": "2022-08-28:17-25", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.2.0", "name": "batch_inference", "platformVersion": "3.5.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference.py", "handler": "infer", "image": "mlrun/ml-models", "kind": "job", 
"requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference.ipynb", "source": "src/batch_inference.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "hugging_face_serving": {"latest": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["genai", "model-serving"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.1.0", "test_valid": false, "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", 
"categories": ["model-serving", "machine-learning"], "description": "Generic Hugging Face model server.", "doc": "", "example": "hugging_face_serving.ipynb", "generationDate": "2022-09-05:17-00", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.1.0", "name": "hugging_face_serving", "platformVersion": "", "spec": {"customFields": {"default_class": "HuggingFaceModelServer"}, "filename": "hugging_face_serving.py", "handler": "handler", "image": "mlrun/ml-models", "kind": "serving", "requirements": ["transformers==4.21.3", "tensorflow==2.9.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/hugging_face_serving.ipynb", "source": "src/hugging_face_serving.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "question_answering": {"latest": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", 
"platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.2.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.1": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.3.1", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.4.0": {"apiVersion": "v1", "categories": ["genai", "huggingface", "machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.4.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", 
"categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.1.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": "mlrun/mlrun", "kind": "job", "requirements": "transformers torch tqdm"}, "url": "", "version": "0.3.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.5.0": {"apiVersion": "v1", "categories": ["genai"], "description": "GenAI approach of question answering on a given data", "doc": "", "example": "question_answering.ipynb", "generationDate": "2023-08-07:11-30", "hidden": false, "icon": "", "labels": {"author": "yonish"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "question_answering", "platformVersion": "3.5.0", "spec": {"filename": "question_answering.py", "handler": "answer_questions", "image": 
"mlrun/mlrun", "kind": "job", "requirements": ["transformers", "torch", "tqdm"]}, "url": "", "version": "0.5.0", "assets": {"example": "src/question_answering.ipynb", "source": "src/question_answering.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "transcribe": {"latest": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "genai", "huggingface", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": 
"transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": false, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["audio", "genai"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "tqdm", "torchaudio", "torch", "accelerate"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/transcribe.ipynb", 
"source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Transcribe audio files into text files", "doc": "", "example": "transcribe.ipynb", "generationDate": "2023-07-13:11-20", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "transcribe", "platformVersion": "3.5.3", "spec": {"filename": "transcribe.py", "handler": "transcribe", "image": "mlrun/mlrun", "kind": "job", "requirements": ["openai-whisper", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/transcribe.ipynb", "source": "src/transcribe.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pii_recognizer": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["machine-learning", 
"data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.2.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.0.1", "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "0.4.0": {"apiVersion": "v1", "categories": ["data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.4.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": 
"0.1.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.3.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "NLP"], "description": "This function is used to recognize PII in a directory of text files", "doc": "", "example": "pii_recognizer.ipynb", "generationDate": "2023-08-15:10-24", "hidden": false, "icon": "", "labels": {"author": "pgw"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.4.0", "name": "pii-recognizer", "platformVersion": "3.5.3", "spec": {"filename": "pii_recognizer.py", "handler": "recognize_pii", "image": "mlrun/mlrun", "kind": "job", "requirements": ["nltk", "pandas", "presidio-anonymizer", "presidio-analyzer", "torch", "flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653", "st-annotated-text", "https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl"]}, "url": "", "version": "0.3.0", "test_valid": false, "assets": {"example": "src/pii_recognizer.ipynb", "source": "src/pii_recognizer.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "batch_inference_v2": {"latest": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", 
"requirements": null}, "url": "", "version": "2.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.9.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc16", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.9.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.0.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.0.0", "assets": {"example": 
"src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.1.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.1.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "1.8.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc13", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.8.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.4.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.4.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.5.0": {"apiVersion": "v1", 
"categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.6.0": {"apiVersion": "v1", "categories": ["model-serving"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc51", "name": "batch_inference_v2", "platformVersion": "3.6.0", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.6.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "2.2.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for 
the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "2.2.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["utils", "data-analysis", "monitoring"], "description": "Batch inference (also knows as prediction) for the common ML frameworks (SciKit-Learn, XGBoost and LightGBM) while performing data drift analysis.", "doc": "", "example": "batch_inference_v2.ipynb", "generationDate": "2023-08-07:12-25", "hidden": false, "icon": "", "labels": {"author": "eyald"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.0-rc9", "name": "batch_inference_v2", "platformVersion": "3.5.3", "spec": {"extra_spec": {"allow_empty_resources": true, "build": {"auto_build": false, "with_mlrun": false}}, "filename": "batch_inference_v2.py", "handler": "infer", "image": "mlrun/mlrun", "kind": "job", "requirements": null}, "url": "", "version": "1.5.0", "assets": {"example": "src/batch_inference_v2.ipynb", "source": "src/batch_inference_v2.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "translate": {"latest": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, 
"icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.2.0": {"apiVersion": "v1", "categories": ["genai", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.2.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.1": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.1", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": 
"src/function.yaml", "docs": "static/documentation.html"}}, "0.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "huggingface", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.1.0", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "0.0.2": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "deep-learning", "NLP"], "description": "Translate text files from one language to another", "doc": "", "example": "translate.ipynb", "generationDate": "2023-12-05:17-20", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "translate", "platformVersion": "3.5.3", "spec": {"filename": "translate.py", "handler": "translate", "image": "mlrun/mlrun", "kind": "job", "requirements": ["transformers", "sentencepiece", "torch", "tqdm"]}, "url": "", "version": "0.0.2", "test_valid": true, "assets": {"example": "src/translate.ipynb", "source": "src/translate.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "structured_data_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", 
"labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": 
"job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.6.0": {"apiVersion": "v1", "categories": ["data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.6.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": 
["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "structured_data_generator", "platformVersion": "3.5.0", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.5.0": {"apiVersion": "v1", "categories": ["machine-learning", "data-preparation", "data-generation", "genai"], "description": "GenAI approach of generating structured data according to a given schema", "doc": "", "example": "structured_data_generator.ipynb", "generationDate": "2023-12-14:10-50", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.6.1", "name": "structured_data_generator", "platformVersion": "3.5.5", "spec": {"filename": "structured_data_generator.py", "handler": "generate_data", "image": "mlrun/mlrun", "kind": "job", "requirements": ["langchain", "tqdm"]}, "url": "", "version": "1.5.0", "assets": {"example": "src/structured_data_generator.ipynb", "source": "src/structured_data_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "text_to_audio_generator": {"latest": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": 
"yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.1.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning", "pytorch"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": 
["bark", "torchaudio"]}, "url": "", "version": "1.2.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["bark", "torchaudio"]}, "url": "", "version": "1.0.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["data-generation", "audio"], "description": "Generate audio file from text using different speakers", "doc": "", "example": "text_to_audio_generator.ipynb", "generationDate": "2023-12-03:15-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.1", "name": "text_to_audio_generator", "platformVersion": "3.5.3", "spec": {"filename": "text_to_audio_generator.py", "handler": "generate_multi_speakers_audio", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torchaudio", "pydub"]}, "url": "", "version": "1.3.0", "test_valid": true, "assets": {"example": "src/text_to_audio_generator.ipynb", "source": "src/text_to_audio_generator.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "silero_vad": {"latest": {"apiVersion": "v1", "categories": 
["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", 
"torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.4.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.4.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "pytorch", "audio"], "description": "Silero VAD (Voice Activity Detection) functions.", "doc": "", "example": "silero_vad.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "silero_vad", "platformVersion": "3.5.3", "spec": {"filename": "silero_vad.py", "handler": "detect_voice", "image": "mlrun/mlrun", "kind": "job", "requirements": ["torch", "torchaudio", "tqdm", "onnxruntime"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/silero_vad.ipynb", "source": "src/silero_vad.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "pyannote_audio": {"latest": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": 
{"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.2.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.2.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": 
"src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["deep-learning", "huggingface", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.5.2", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.3.0": {"apiVersion": "v1", "categories": ["deep-learning", "audio"], "description": "pyannote's speech diarization of audio files", "doc": "", "example": "pyannote_audio.ipynb", "generationDate": "2023-12-03:14-30", "hidden": false, "icon": "", "labels": {"author": "guyl"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0", "name": "pyannote-audio", "platformVersion": "3.5.3", "spec": {"filename": "pyannote_audio.py", "handler": "diarize", "image": "mlrun/mlrun-gpu", "kind": "job", "requirements": ["pyannote.audio", "pyannote.core", "torchaudio", "tqdm"]}, "url": "", "version": "1.3.0", "assets": {"example": "src/pyannote_audio.ipynb", "source": "src/pyannote_audio.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "mlflow_utils": {"latest": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", 
"mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/mlflow_utils.ipynb", "source": "src/mlflow_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["model-serving", "utils"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.8.0", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/mlflow_utils.ipynb", "source": "src/mlflow_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["genai", "model-serving", "machine-learning"], "description": "Mlflow model server, and additional utils.", "doc": "", "example": "mlflow_utils.ipynb", "generationDate": "2024-05-23:12-00", "hidden": false, "icon": "", "labels": {"author": "zeevr"}, "maintainers": [], "marketplaceType": "", "mlrunVersion": "1.7.0-rc17", "name": "mlflow_utils", "platformVersion": "", "spec": {"customFields": {"default_class": "MLFlowModelServer"}, "filename": "mlflow_utils.py", "handler": "handler", "image": "mlrun/mlrun", "kind": "serving", "requirements": ["mlflow==2.12.2", "lightgbm", "xgboost"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/mlflow_utils.ipynb", "source": 
"src/mlflow_utils.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}, "noise_reduction": {"latest": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.1.0": {"apiVersion": "v1", "categories": ["data-preparation", "audio"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.7.0", "name": "noise-reduction", "platformVersion": "3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.1.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}, "1.0.0": {"apiVersion": "v1", "categories": ["data-preparation", "machine-learning"], "description": "Reduce noise from audio files", "doc": "", "example": "noise_reduction.ipynb", "generationDate": "2024-03-04:17-30", "hidden": false, "icon": "", "labels": {"author": "yonatans"}, "maintainers": [], "mlrunVersion": "1.5.2", "name": "noise-reduction", "platformVersion": 
"3.5.3", "spec": {"filename": "noise_reduction.py", "handler": "reduce_noise", "image": "mlrun/mlrun", "kind": "job", "requirements": ["librosa", "noisereduce", "deepfilternet", "torchaudio>=2.1.2"]}, "url": "", "version": "1.0.0", "assets": {"example": "src/noise_reduction.ipynb", "source": "src/noise_reduction.py", "function": "src/function.yaml", "docs": "static/documentation.html"}}}}
\ No newline at end of file
diff --git a/functions/master/describe/1.3.0/static/describe.html b/functions/master/describe/1.3.0/static/describe.html
index 698c8923..16609921 100644
--- a/functions/master/describe/1.3.0/static/describe.html
+++ b/functions/master/describe/1.3.0/static/describe.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe/1.3.0/static/documentation.html b/functions/master/describe/1.3.0/static/documentation.html
index 21c81d34..5c15dcbd 100644
--- a/functions/master/describe/1.3.0/static/documentation.html
+++ b/functions/master/describe/1.3.0/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe/1.3.0/static/example.html b/functions/master/describe/1.3.0/static/example.html
index d2932314..a35cd80e 100644
--- a/functions/master/describe/1.3.0/static/example.html
+++ b/functions/master/describe/1.3.0/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe/latest/static/describe.html b/functions/master/describe/latest/static/describe.html
index 698c8923..16609921 100644
--- a/functions/master/describe/latest/static/describe.html
+++ b/functions/master/describe/latest/static/describe.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe/latest/static/documentation.html b/functions/master/describe/latest/static/documentation.html
index 21c81d34..5c15dcbd 100644
--- a/functions/master/describe/latest/static/documentation.html
+++ b/functions/master/describe/latest/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe/latest/static/example.html b/functions/master/describe/latest/static/example.html
index d2932314..a35cd80e 100644
--- a/functions/master/describe/latest/static/example.html
+++ b/functions/master/describe/latest/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_dask/1.1.0/static/describe_dask.html b/functions/master/describe_dask/1.1.0/static/describe_dask.html
index 7125e14e..b006612c 100644
--- a/functions/master/describe_dask/1.1.0/static/describe_dask.html
+++ b/functions/master/describe_dask/1.1.0/static/describe_dask.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_dask/1.1.0/static/documentation.html b/functions/master/describe_dask/1.1.0/static/documentation.html
index 972f2d66..1d8ae7b1 100644
--- a/functions/master/describe_dask/1.1.0/static/documentation.html
+++ b/functions/master/describe_dask/1.1.0/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_dask/1.1.0/static/example.html b/functions/master/describe_dask/1.1.0/static/example.html
index a6df31db..8a61fe6a 100644
--- a/functions/master/describe_dask/1.1.0/static/example.html
+++ b/functions/master/describe_dask/1.1.0/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_dask/latest/static/describe_dask.html b/functions/master/describe_dask/latest/static/describe_dask.html
index 7125e14e..b006612c 100644
--- a/functions/master/describe_dask/latest/static/describe_dask.html
+++ b/functions/master/describe_dask/latest/static/describe_dask.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_dask/latest/static/documentation.html b/functions/master/describe_dask/latest/static/documentation.html
index 972f2d66..1d8ae7b1 100644
--- a/functions/master/describe_dask/latest/static/documentation.html
+++ b/functions/master/describe_dask/latest/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_dask/latest/static/example.html b/functions/master/describe_dask/latest/static/example.html
index a6df31db..8a61fe6a 100644
--- a/functions/master/describe_dask/latest/static/example.html
+++ b/functions/master/describe_dask/latest/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_spark/1.1.0/static/documentation.html b/functions/master/describe_spark/1.1.0/static/documentation.html
index 606e76b2..734090e4 100644
--- a/functions/master/describe_spark/1.1.0/static/documentation.html
+++ b/functions/master/describe_spark/1.1.0/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_spark/1.1.0/static/example.html b/functions/master/describe_spark/1.1.0/static/example.html
index 7d07ac2f..7032fa9a 100644
--- a/functions/master/describe_spark/1.1.0/static/example.html
+++ b/functions/master/describe_spark/1.1.0/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_spark/latest/static/documentation.html b/functions/master/describe_spark/latest/static/documentation.html
index 606e76b2..734090e4 100644
--- a/functions/master/describe_spark/latest/static/documentation.html
+++ b/functions/master/describe_spark/latest/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/describe_spark/latest/static/example.html b/functions/master/describe_spark/latest/static/example.html
index 7d07ac2f..7032fa9a 100644
--- a/functions/master/describe_spark/latest/static/example.html
+++ b/functions/master/describe_spark/latest/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/feature_selection/1.6.0/static/documentation.html b/functions/master/feature_selection/1.6.0/static/documentation.html
index 65ce50d5..a136a710 100644
--- a/functions/master/feature_selection/1.6.0/static/documentation.html
+++ b/functions/master/feature_selection/1.6.0/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/feature_selection/1.6.0/static/example.html b/functions/master/feature_selection/1.6.0/static/example.html
index 3bbfa58c..e4c5804f 100644
--- a/functions/master/feature_selection/1.6.0/static/example.html
+++ b/functions/master/feature_selection/1.6.0/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/feature_selection/1.6.0/static/feature_selection.html b/functions/master/feature_selection/1.6.0/static/feature_selection.html
index d2696107..48afd201 100644
--- a/functions/master/feature_selection/1.6.0/static/feature_selection.html
+++ b/functions/master/feature_selection/1.6.0/static/feature_selection.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/feature_selection/latest/static/documentation.html b/functions/master/feature_selection/latest/static/documentation.html
index 65ce50d5..a136a710 100644
--- a/functions/master/feature_selection/latest/static/documentation.html
+++ b/functions/master/feature_selection/latest/static/documentation.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/feature_selection/latest/static/example.html b/functions/master/feature_selection/latest/static/example.html
index 3bbfa58c..e4c5804f 100644
--- a/functions/master/feature_selection/latest/static/example.html
+++ b/functions/master/feature_selection/latest/static/example.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/feature_selection/latest/static/feature_selection.html b/functions/master/feature_selection/latest/static/feature_selection.html
index d2696107..48afd201 100644
--- a/functions/master/feature_selection/latest/static/feature_selection.html
+++ b/functions/master/feature_selection/latest/static/feature_selection.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/gen_class_data/1.3.0/src/function.yaml b/functions/master/gen_class_data/1.3.0/src/function.yaml
new file mode 100644
index 00000000..1769bec0
--- /dev/null
+++ b/functions/master/gen_class_data/1.3.0/src/function.yaml
@@ -0,0 +1,72 @@
+metadata:
+  categories:
+  - data-generation
+  tag: ''
+  name: gen-class-data
+spec:
+  description: Create a binary classification sample dataset and save.
+  default_handler: gen_class_data
+  entry_points:
+    gen_class_data:
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: function context
+      - name: n_samples
+        type: int
+        doc: number of rows/samples
+      - name: m_features
+        type: int
+        doc: number of cols/features
+      - name: k_classes
+        type: int
+        doc: number of classes
+      - name: header
+        type: Optional[List[str]]
+        doc: header for features array
+      - name: label_column
+        type: Optional[str]
+        doc: column name of ground-truth series
+        default: labels
+      - name: weight
+        type: float
+        doc: fraction of samples with the negative label (ground-truth=0)
+        default: 0.5
+      - name: random_state
+        type: int
+        doc: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
+        default: 1
+      - name: key
+        type: str
+        doc: key of data in artifact store
+        default: classifier-data
+      - name: file_ext
+        type: str
+        doc: file format used when logging the dataset, e.g. parquet or csv
+        default: parquet
+      - name: sk_params
+        doc: additional parameters for `sklearn.datasets.make_classification`
+        default: {}
+      lineno: 22
+      doc: 'Create a binary classification sample dataset and save.
+
+        If no filename is given it will default to:
+
+        "simdata-{n_samples}X{m_features}.parquet".
+
+
+        Additional scikit-learn parameters can be set using **sk_params, please see
+        https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
+        for more details.'
+      has_varargs: false
+      name: gen_class_data
+  command: ''
+  disable_auto_mount: false
+  image: mlrun/mlrun
+  build:
+    origin_filename: ''
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICA
gOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo=
+    code_origin: ''
+kind: job
+verbose: false
diff --git a/functions/master/gen_class_data/1.3.0/src/gen_class_data.ipynb b/functions/master/gen_class_data/1.3.0/src/gen_class_data.ipynb
new file mode 100644
index 00000000..5335e646
--- /dev/null
+++ b/functions/master/gen_class_data/1.3.0/src/gen_class_data.ipynb
@@ -0,0 +1,685 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generate classification data\n",
+    "\n",
+    "Use this function to generate sample data sets. It wraps scikit-learn's **[make_classification](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html#sklearn-datasets-make-classification)**.  See the link for a description of all parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# nuclio: ignore\n",
+    "import nuclio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import pyarrow as pa\n",
+    "import pyarrow.parquet as pq\n",
+    "from typing import Optional, List, Any\n",
+    "from sklearn.datasets import make_classification\n",
+    "\n",
+    "from mlrun.execution import MLClientCtx\n",
+    "\n",
+    "def gen_class_data(\n",
+    "    context: MLClientCtx,\n",
+    "    n_samples: int,\n",
+    "    m_features: int,\n",
+    "    k_classes: int,\n",
+    "    header: Optional[List[str]],\n",
+    "    label_column: Optional[str] = \"labels\",\n",
+    "    weight: float = 0.5,\n",
+    "    random_state: int = 1,\n",
+    "    key: str = \"classifier-data\", \n",
+    "    file_ext: str = \"parquet\",\n",
+    "    sk_params = {}\n",
+    "):\n",
+    "    \"\"\"Create a binary classification sample dataset and save.\n",
+    "    If no filename is given it will default to:\n",
+    "    \"simdata-{n_samples}X{m_features}.parquet\".\n",
+    "    \n",
+    "    Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.\n",
+    "    \n",
+    "    :param context:       function context\n",
+    "    :param n_samples:     number of rows/samples\n",
+    "    :param m_features:    number of cols/features\n",
+    "    :param k_classes:     number of classes\n",
+    "    :param header:        header for features array\n",
+    "    :param label_column:  column name of ground-truth series\n",
+    "    :param weight:        fraction of sample negative value (ground-truth=0)\n",
+    "    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)\n",
+    "    :param key:           key of data in artifact store\n",
+    "    :param file_ext:      (pqt) extension for parquet file\n",
+    "    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`\n",
+    "    \"\"\"\n",
+    "    features, labels = make_classification(\n",
+    "        n_samples=n_samples,\n",
+    "        n_features=m_features,\n",
+    "        weights=weight,\n",
+    "        n_classes=k_classes,\n",
+    "        random_state=random_state, \n",
+    "        **sk_params)\n",
+    "\n",
+    "    # make dataframes, add column names, concatenate (X, y)\n",
+    "    X = pd.DataFrame(features)\n",
+    "    if not header:\n",
+    "        X.columns = [\"feat_\" + str(x) for x in range(m_features)]\n",
+    "    else:\n",
+    "        X.columns = header\n",
+    "\n",
+    "    y = pd.DataFrame(labels, columns=[label_column])\n",
+    "    data = pd.concat([X, y], axis=1)\n",
+    "    \n",
+    "    context.log_dataset(key, df=data, format=file_ext, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# nuclio: end-code"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### save"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[mlrun] 2020-06-14 10:37:07,647 function spec saved to path: function.yaml\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from mlrun import code_to_function\n",
+    "from mlrun.platforms.other import auto_mount\n",
+    "\n",
+    "gpus = False\n",
+    "\n",
+    "fn_params = {\n",
+    "    \"name\"        : \"gen_class_data\",\n",
+    "    \"handler\"     : \"gen_class_data\",\n",
+    "    \"kind\"        : \"job\",\n",
+    "    \"image\"       : \"mlrun/ml-models\" if not gpus else \"mlrun/ml-models-gpu\",\n",
+    "    \"description\" : \"simulate classification data using scikit-learn\",\n",
+    "    \"categories\"  : [\"simulators\", \"ml\"],\n",
+    "    \"labels\"      : {\"author\": \"yjb\", 'framework': 'sklearn'},\n",
+    "}\n",
+    "\n",
+    "fn = code_to_function(**fn_params)\n",
+    "\n",
+    "fn.export(\"function.yaml\")\n",
+    "fn.apply(auto_mount())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### test function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlrun import NewTask, mlconf\n",
+    "\n",
+    "task_params = {\n",
+    "    \"name\":        \"tasks generate classification data\", \n",
+    "    \"params\" : {\n",
+    "        \"n_samples\"   : 10_000,\n",
+    "        \"m_features\"  : 5,\n",
+    "        \"k_classes\"   : 2,\n",
+    "        \"weight\"      : [0.5, 0.5],\n",
+    "        \"sk_params\"   : {\"n_informative\": 2},\n",
+    "        \"file_ext\"    : \"csv\"}}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### local"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[mlrun] 2020-06-14 10:33:01,963 starting run tasks generate classification data uid=1d7c5af7e4b04bd98755c87842455105  -> http://mlrun-api:8080\n",
+      "[mlrun] 2020-06-14 10:33:02,156 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 14 10:33:01completedtasks generate classification data
v3io_user=admin
kind=handler
owner=admin
host=jupyter-7b44c8d958-kklf7
n_samples=10000
m_features=5
k_classes=2
weight=[0.5, 0.5]
sk_params={'n_informative': 2}
file_ext=csv
classifier-data
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 1d7c5af7e4b04bd98755c87842455105 --project default , !mlrun logs 1d7c5af7e4b04bd98755c87842455105 --project default\n", + "[mlrun] 2020-06-14 10:33:02,198 run executed, status=completed\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from mlrun import run_local\n", + "run_local(NewTask(**task_params), handler=gen_class_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remote" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[mlrun] 2020-06-14 10:33:02,619 starting run tasks generate classification data uid=8f2102b308f446f28242c03ac1a835a7 -> http://mlrun-api:8080\n", + "[mlrun] 2020-06-14 10:33:02,723 Job is running in the background, pod: tasks-generate-classification-data-wjdsf\n", + "[mlrun] 2020-06-14 10:33:08,285 starting local run: main.py # gen_class_data\n", + "[mlrun] 2020-06-14 10:33:08,806 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y\n", + "\n", + "[mlrun] 2020-06-14 10:33:08,823 run executed, status=completed\n", + "final state: succeeded\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 14 10:33:08completedtasks generate classification data
v3io_user=admin
kind=job
owner=admin
host=tasks-generate-classification-data-wjdsf
n_samples=10000
m_features=5
k_classes=2
weight=[0.5, 0.5]
sk_params={'n_informative': 2}
file_ext=csv
classifier-data
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 8f2102b308f446f28242c03ac1a835a7 --project default , !mlrun logs 8f2102b308f446f28242c03ac1a835a7 --project default\n", + "[mlrun] 2020-06-14 10:33:11,884 run executed, status=completed\n" + ] + } + ], + "source": [ + "run = fn.run(NewTask(**task_params), artifact_path=mlconf.artifact_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/functions/master/gen_class_data/1.3.0/src/gen_class_data.py b/functions/master/gen_class_data/1.3.0/src/gen_class_data.py new file mode 100644 index 00000000..2e5ab107 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/src/gen_class_data.py @@ -0,0 +1,71 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import pandas as pd +from typing import Optional, List +from sklearn.datasets import make_classification + +from mlrun.execution import MLClientCtx + + +def gen_class_data( + context: MLClientCtx, + n_samples: int, + m_features: int, + k_classes: int, + header: Optional[List[str]], + label_column: Optional[str] = "labels", + weight: float = 0.5, + random_state: int = 1, + key: str = "classifier-data", + file_ext: str = "parquet", + sk_params={} +): + """Create a binary classification sample dataset and save. + If no filename is given it will default to: + "simdata-{n_samples}X{m_features}.parquet". + + Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details. + + :param context: function context + :param n_samples: number of rows/samples + :param m_features: number of cols/features + :param k_classes: number of classes + :param header: header for features array + :param label_column: column name of ground-truth series + :param weight: fraction of sample negative value (ground-truth=0) + :param random_state: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state) + :param key: key of data in artifact store + :param file_ext: (pqt) extension for parquet file + :param sk_params: additional parameters for `sklearn.datasets.make_classification` + """ + features, labels = make_classification( + n_samples=n_samples, + n_features=m_features, + weights=weight, + n_classes=k_classes, + random_state=random_state, + **sk_params) + + # make dataframes, add column names, concatenate (X, y) + X = pd.DataFrame(features) + if not header: + X.columns = ["feat_" + str(x) for x in range(m_features)] + else: + X.columns = header + + y = pd.DataFrame(labels, columns=[label_column]) + data = pd.concat([X, y], axis=1) + + context.log_dataset(key, df=data, format=file_ext, index=False) diff --git 
a/functions/master/gen_class_data/1.3.0/src/item.yaml b/functions/master/gen_class_data/1.3.0/src/item.yaml new file mode 100644 index 00000000..a6dd94b6 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/src/item.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +categories: +- data-generation +description: Create a binary classification sample dataset and save. +doc: '' +example: gen_class_data.ipynb +generationDate: 2022-08-28:17-25 +hidden: false +icon: '' +labels: + author: Daniel +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: gen_class_data +platformVersion: 3.5.3 +spec: + filename: gen_class_data.py + handler: gen_class_data + image: mlrun/mlrun + kind: job + requirements: [] +url: '' +version: 1.3.0 diff --git a/functions/master/gen_class_data/1.3.0/src/requirements.txt b/functions/master/gen_class_data/1.3.0/src/requirements.txt new file mode 100644 index 00000000..d7dbe376 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/src/requirements.txt @@ -0,0 +1,2 @@ +pandas +scikit-learn==1.0.2 \ No newline at end of file diff --git a/functions/master/gen_class_data/1.3.0/src/test_gen_class_data.py b/functions/master/gen_class_data/1.3.0/src/test_gen_class_data.py new file mode 100644 index 00000000..e06eeb16 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/src/test_gen_class_data.py @@ -0,0 +1,39 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from mlrun import code_to_function +import os + + +def test_gen_class_data(): + fn = code_to_function( + name='test_gen_class_data', + filename="gen_class_data.py", + handler="gen_class_data", + kind="job", + ) + + run = fn.run( + params={ + "n_samples": 10_000, + "m_features": 5, + "k_classes": 2, + "header": None, + "weight": [0.5, 0.5], + "sk_params": {"n_informative": 2}, + "file_ext": "csv"}, + local=True, + artifact_path="./artifacts", + ) + assert os.path.isfile(run.status.artifacts[0]['spec']['target_path']), 'dataset is not available' diff --git a/functions/master/gen_class_data/1.3.0/static/documentation.html b/functions/master/gen_class_data/1.3.0/static/documentation.html new file mode 100644 index 00000000..7125e01f --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/static/documentation.html @@ -0,0 +1,261 @@ + + + + + + + +gen_class_data package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+
+

gen_class_data package

+ +
+ +
+
+ +
+
+

gen_class_data package#

+
+

Submodules#

+
+
+

gen_class_data.gen_class_data module#

+
+
+gen_class_data.gen_class_data.gen_class_data(context: MLClientCtx, n_samples: int, m_features: int, k_classes: int, header: List[str] | None, label_column: str | None = 'labels', weight: float = 0.5, random_state: int = 1, key: str = 'classifier-data', file_ext: str = 'parquet', sk_params={})[source]#
+

Create a binary classification sample dataset and save. +If no filename is given it will default to: +“simdata-{n_samples}X{m_features}.parquet”.

+

Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.

+
+
Parameters:
+
    +
  • context – function context

  • +
  • n_samples – number of rows/samples

  • +
  • m_features – number of cols/features

  • +
  • k_classes – number of classes

  • +
  • header – header for features array

  • +
  • label_column – column name of ground-truth series

  • +
  • weight – fraction of sample negative value (ground-truth=0)

  • +
  • random_state – rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)

  • +
  • key – key of data in artifact store

  • +
  • file_ext – (pqt) extension for parquet file

  • +
  • sk_params – additional parameters for sklearn.datasets.make_classification

  • +
+
+
+
+
+
+

Module contents#

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/gen_class_data/1.3.0/static/example.html b/functions/master/gen_class_data/1.3.0/static/example.html new file mode 100644 index 00000000..853b6a66 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/static/example.html @@ -0,0 +1,791 @@ + + + + + + + +Generate classification data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+
+

Generate classification data

+ +
+
+
+

Contents

+
+ +
+
+
+ +
+
+

Generate classification data#

+

Use this function to generate sample data sets, wraps scikit-learn’s make_classification. See the link for a description of all parameters.

+
+
+
# nuclio: ignore
+import nuclio
+
+
+
+
+
+
+
import os
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from typing import Optional, List, Any
+from sklearn.datasets import make_classification
+
+from mlrun.execution import MLClientCtx
+
+def gen_class_data(
+    context: MLClientCtx,
+    n_samples: int,
+    m_features: int,
+    k_classes: int,
+    header: Optional[List[str]],
+    label_column: Optional[str] = "labels",
+    weight: float = 0.5,
+    random_state: int = 1,
+    key: str = "classifier-data", 
+    file_ext: str = "parquet",
+    sk_params = {}
+):
+    """Create a binary classification sample dataset and save.
+    If no filename is given it will default to:
+    "simdata-{n_samples}X{m_features}.parquet".
+    
+    Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.
+    
+    :param context:       function context
+    :param n_samples:     number of rows/samples
+    :param m_features:    number of cols/features
+    :param k_classes:     number of classes
+    :param header:        header for features array
+    :param label_column:  column name of ground-truth series
+    :param weight:        fraction of sample negative value (ground-truth=0)
+    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
+    :param key:           key of data in artifact store
+    :param file_ext:      (pqt) extension for parquet file
+    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
+    """
+    features, labels = make_classification(
+        n_samples=n_samples,
+        n_features=m_features,
+        weights=weight,
+        n_classes=k_classes,
+        random_state=random_state, 
+        **sk_params)
+
+    # make dataframes, add column names, concatenate (X, y)
+    X = pd.DataFrame(features)
+    if not header:
+        X.columns = ["feat_" + str(x) for x in range(m_features)]
+    else:
+        X.columns = header
+
+    y = pd.DataFrame(labels, columns=[label_column])
+    data = pd.concat([X, y], axis=1)
+    
+    context.log_dataset(key, df=data, format=file_ext, index=False)
+
+
+
+
+
+
+
# nuclio: end-code
+
+
+
+
+
+

save#

+
+
+
from mlrun import code_to_function
+from mlrun.platforms.other import auto_mount
+
+gpus = False
+
+fn_params = {
+    "name"        : "gen_class_data",
+    "handler"     : "gen_class_data",
+    "kind"        : "job",
+    "image"       : "mlrun/ml-models" if not gpus else "mlrun/ml-models-gpu",
+    "description" : "simulate classification data using scikit-learn",
+    "categories"  : ["simulators", "ml"],
+    "labels"      : {"author": "yjb", 'framework': 'sklearn'},
+}
+
+fn = code_to_function(**fn_params)
+
+fn.export("function.yaml")
+fn.apply(auto_mount())
+
+
+
+
+
[mlrun] 2020-06-14 10:37:07,647 function spec saved to path: function.yaml
+
+
+
<mlrun.runtimes.kubejob.KubejobRuntime at 0x7faf4a975eb8>
+
+
+
+
+
+
+

test function#

+
+
+
from mlrun import NewTask, mlconf
+
+task_params = {
+    "name":        "tasks generate classification data", 
+    "params" : {
+        "n_samples"   : 10_000,
+        "m_features"  : 5,
+        "k_classes"   : 2,
+        "weight"      : [0.5, 0.5],
+        "sk_params"   : {"n_informative": 2},
+        "file_ext"    : "csv"}}
+
+
+
+
+
+
+

local#

+
+
+
from mlrun import run_local
+run_local(NewTask(**task_params), handler=gen_class_data)
+
+
+
+
+
[mlrun] 2020-06-14 10:33:01,963 starting run tasks generate classification data uid=1d7c5af7e4b04bd98755c87842455105  -> http://mlrun-api:8080
+[mlrun] 2020-06-14 10:33:02,156 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 14 10:33:01completedtasks generate classification data
v3io_user=admin
kind=handler
owner=admin
host=jupyter-7b44c8d958-kklf7
n_samples=10000
m_features=5
k_classes=2
weight=[0.5, 0.5]
sk_params={'n_informative': 2}
file_ext=csv
classifier-data
+
+ +
+
to track results use .show() or .logs() or in CLI: 
+!mlrun get run 1d7c5af7e4b04bd98755c87842455105 --project default , !mlrun logs 1d7c5af7e4b04bd98755c87842455105 --project default
+[mlrun] 2020-06-14 10:33:02,198 run executed, status=completed
+
+
+
<mlrun.model.RunObject at 0x7fafa49fc160>
+
+
+
+
+
+
+

remote#

+
+
+
run = fn.run(NewTask(**task_params), artifact_path=mlconf.artifact_path)
+
+
+
+
+
[mlrun] 2020-06-14 10:33:02,619 starting run tasks generate classification data uid=8f2102b308f446f28242c03ac1a835a7  -> http://mlrun-api:8080
+[mlrun] 2020-06-14 10:33:02,723 Job is running in the background, pod: tasks-generate-classification-data-wjdsf
+[mlrun] 2020-06-14 10:33:08,285 starting local run: main.py # gen_class_data
+[mlrun] 2020-06-14 10:33:08,806 log artifact classifier-data at /User/artifacts/classifier-data.csv, size: 998700, db: Y
+
+[mlrun] 2020-06-14 10:33:08,823 run executed, status=completed
+final state: succeeded
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
default0Jun 14 10:33:08completedtasks generate classification data
v3io_user=admin
kind=job
owner=admin
host=tasks-generate-classification-data-wjdsf
n_samples=10000
m_features=5
k_classes=2
weight=[0.5, 0.5]
sk_params={'n_informative': 2}
file_ext=csv
classifier-data
+
+ +
+
to track results use .show() or .logs() or in CLI: 
+!mlrun get run 8f2102b308f446f28242c03ac1a835a7 --project default , !mlrun logs 8f2102b308f446f28242c03ac1a835a7 --project default
+[mlrun] 2020-06-14 10:33:11,884 run executed, status=completed
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/gen_class_data/1.3.0/static/function.html b/functions/master/gen_class_data/1.3.0/static/function.html new file mode 100644 index 00000000..f4605fe7 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/static/function.html @@ -0,0 +1,107 @@ + + + + + + + + + + + Source + + + + +
+        
+metadata:
+  categories:
+  - data-generation
+  tag: ''
+  name: gen-class-data
+spec:
+  description: Create a binary classification sample dataset and save.
+  default_handler: gen_class_data
+  entry_points:
+    gen_class_data:
+      has_kwargs: false
+      parameters:
+      - name: context
+        type: MLClientCtx
+        doc: function context
+      - name: n_samples
+        type: int
+        doc: number of rows/samples
+      - name: m_features
+        type: int
+        doc: number of cols/features
+      - name: k_classes
+        type: int
+        doc: number of classes
+      - name: header
+        type: Optional[List[str]]
+        doc: header for features array
+      - name: label_column
+        type: Optional[str]
+        doc: column name of ground-truth series
+        default: labels
+      - name: weight
+        type: float
+        doc: fraction of sample negative value (ground-truth=0)
+        default: 0.5
+      - name: random_state
+        type: int
+        doc: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
+        default: 1
+      - name: key
+        type: str
+        doc: key of data in artifact store
+        default: classifier-data
+      - name: file_ext
+        type: str
+        doc: (pqt) extension for parquet file
+        default: parquet
+      - name: sk_params
+        doc: additional parameters for `sklearn.datasets.make_classification`
+        default: {}
+      lineno: 22
+      doc: 'Create a binary classification sample dataset and save.
+
+        If no filename is given it will default to:
+
+        "simdata-{n_samples}X{m_features}.parquet".
+
+
+        Additional scikit-learn parameters can be set using **sk_params, please see
+        https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
+        for more details.'
+      has_varargs: false
+      name: gen_class_data
+  command: ''
+  disable_auto_mount: false
+  image: mlrun/mlrun
+  build:
+    origin_filename: ''
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICA
gOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo=
+    code_origin: ''
+kind: job
+verbose: false
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/gen_class_data/1.3.0/static/gen_class_data.html b/functions/master/gen_class_data/1.3.0/static/gen_class_data.html new file mode 100644 index 00000000..8641ff05 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/static/gen_class_data.html @@ -0,0 +1,246 @@ + + + + + + + +gen_class_data.gen_class_data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+

+ +
+
+
+
+
+ +
+

Source code for gen_class_data.gen_class_data

+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import pandas as pd
+from typing import Optional, List
+from sklearn.datasets import make_classification
+
+from mlrun.execution import MLClientCtx
+
+
+
+[docs] +def gen_class_data( + context: MLClientCtx, + n_samples: int, + m_features: int, + k_classes: int, + header: Optional[List[str]], + label_column: Optional[str] = "labels", + weight: float = 0.5, + random_state: int = 1, + key: str = "classifier-data", + file_ext: str = "parquet", + sk_params={} +): + """Create a binary classification sample dataset and save. + If no filename is given it will default to: + "simdata-{n_samples}X{m_features}.parquet". + + Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details. + + :param context: function context + :param n_samples: number of rows/samples + :param m_features: number of cols/features + :param k_classes: number of classes + :param header: header for features array + :param label_column: column name of ground-truth series + :param weight: fraction of sample negative value (ground-truth=0) + :param random_state: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state) + :param key: key of data in artifact store + :param file_ext: (pqt) extension for parquet file + :param sk_params: additional parameters for `sklearn.datasets.make_classification` + """ + features, labels = make_classification( + n_samples=n_samples, + n_features=m_features, + weights=weight, + n_classes=k_classes, + random_state=random_state, + **sk_params) + + # make dataframes, add column names, concatenate (X, y) + X = pd.DataFrame(features) + if not header: + X.columns = ["feat_" + str(x) for x in range(m_features)] + else: + X.columns = header + + y = pd.DataFrame(labels, columns=[label_column]) + data = pd.concat([X, y], axis=1) + + context.log_dataset(key, df=data, format=file_ext, index=False)
+ +
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/gen_class_data/1.3.0/static/item.html b/functions/master/gen_class_data/1.3.0/static/item.html new file mode 100644 index 00000000..b1943f72 --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/static/item.html @@ -0,0 +1,59 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- data-generation
+description: Create a binary classification sample dataset and save.
+doc: ''
+example: gen_class_data.ipynb
+generationDate: 2022-08-28:17-25
+hidden: false
+icon: ''
+labels:
+  author: Daniel
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.7.0
+name: gen_class_data
+platformVersion: 3.5.3
+spec:
+  filename: gen_class_data.py
+  handler: gen_class_data
+  image: mlrun/mlrun
+  kind: job
+  requirements: []
+url: ''
+version: 1.3.0
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/gen_class_data/1.3.0/static/source.html b/functions/master/gen_class_data/1.3.0/static/source.html new file mode 100644 index 00000000..8c733cda --- /dev/null +++ b/functions/master/gen_class_data/1.3.0/static/source.html @@ -0,0 +1,106 @@ + + + + + + + + + + + Source + + + + +
+        
+# Copyright 2019 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import pandas as pd
+from typing import Optional, List
+from sklearn.datasets import make_classification
+
+from mlrun.execution import MLClientCtx
+
+
+def gen_class_data(
+        context: MLClientCtx,
+        n_samples: int,
+        m_features: int,
+        k_classes: int,
+        header: Optional[List[str]],
+        label_column: Optional[str] = "labels",
+        weight: float = 0.5,
+        random_state: int = 1,
+        key: str = "classifier-data",
+        file_ext: str = "parquet",
+        sk_params={}
+):
+    """Create a binary classification sample dataset and save.
+    If no filename is given it will default to:
+    "simdata-{n_samples}X{m_features}.parquet".
+
+    Additional scikit-learn parameters can be set using **sk_params, please see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html for more details.
+
+    :param context:       function context
+    :param n_samples:     number of rows/samples
+    :param m_features:    number of cols/features
+    :param k_classes:     number of classes
+    :param header:        header for features array
+    :param label_column:  column name of ground-truth series
+    :param weight:        fraction of sample negative value (ground-truth=0)
+    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
+    :param key:           key of data in artifact store
+    :param file_ext:      (pqt) extension for parquet file
+    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
+    """
+    features, labels = make_classification(
+        n_samples=n_samples,
+        n_features=m_features,
+        weights=weight,
+        n_classes=k_classes,
+        random_state=random_state,
+        **sk_params)
+
+    # make dataframes, add column names, concatenate (X, y)
+    X = pd.DataFrame(features)
+    if not header:
+        X.columns = ["feat_" + str(x) for x in range(m_features)]
+    else:
+        X.columns = header
+
+    y = pd.DataFrame(labels, columns=[label_column])
+    data = pd.concat([X, y], axis=1)
+
+    context.log_dataset(key, df=data, format=file_ext, index=False)
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/gen_class_data/latest/src/function.yaml b/functions/master/gen_class_data/latest/src/function.yaml index 4249bed5..1769bec0 100644 --- a/functions/master/gen_class_data/latest/src/function.yaml +++ b/functions/master/gen_class_data/latest/src/function.yaml @@ -1,57 +1,30 @@ -kind: job metadata: - name: gen-class-data - tag: '' - hash: 7759c5db6fd6a66e91351a10862cf5c09e2b59b3 - project: '' - labels: - author: Daniel categories: - - data-preparation + - data-generation + tag: '' + name: gen-class-data spec: - command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2
lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYX
R1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo= - commands: [] - code_origin: http://github.com/aviaIguazio/functions.git#be04dfbae37aa7c2260ca800ffe248b38e34ebfc:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/gen_class_data/gen_class_data.py - origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/gen_class_data/gen_class_data.py - requirements: [] + description: Create a binary classification sample dataset and save. + default_handler: gen_class_data entry_points: gen_class_data: - name: gen_class_data - doc: 'Create a binary classification sample dataset and save. - - If no filename is given it will default to: - - "simdata-{n_samples}X{m_features}.parquet". - - - Additional scikit-learn parameters can be set using **sk_params, please see - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html - for more details.' + has_kwargs: false parameters: - name: context type: MLClientCtx doc: function context - default: '' - name: n_samples type: int doc: number of rows/samples - default: '' - name: m_features type: int doc: number of cols/features - default: '' - name: k_classes type: int doc: number of classes - default: '' - name: header type: Optional[List[str]] doc: header for features array - default: '' - name: label_column type: Optional[str] doc: column name of ground-truth series @@ -75,17 +48,25 @@ spec: - name: sk_params doc: additional parameters for `sklearn.datasets.make_classification` default: {} - outputs: - - default: '' lineno: 22 - description: Create a binary classification sample dataset and save. - default_handler: gen_class_data + doc: 'Create a binary classification sample dataset and save. 
+ + If no filename is given it will default to: + + "simdata-{n_samples}X{m_features}.parquet". + + + Additional scikit-learn parameters can be set using **sk_params, please see + https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html + for more details.' + has_varargs: false + name: gen_class_data + command: '' disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} + image: mlrun/mlrun + build: + origin_filename: '' + functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGF
ycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGV
hZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo= + code_origin: '' +kind: job verbose: false diff --git a/functions/master/gen_class_data/latest/src/item.yaml b/functions/master/gen_class_data/latest/src/item.yaml index a965c0ab..a6dd94b6 100644 --- a/functions/master/gen_class_data/latest/src/item.yaml +++ b/functions/master/gen_class_data/latest/src/item.yaml @@ -1,6 +1,6 @@ apiVersion: v1 categories: -- data-preparation +- data-generation description: Create a binary classification sample dataset and save. doc: '' example: gen_class_data.ipynb @@ -11,7 +11,7 @@ labels: author: Daniel maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.1 +mlrunVersion: 1.7.0 name: gen_class_data platformVersion: 3.5.3 spec: @@ -21,4 +21,4 @@ spec: kind: job requirements: [] url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/functions/master/gen_class_data/latest/static/documentation.html b/functions/master/gen_class_data/latest/static/documentation.html index 78d811f7..7125e01f 100644 --- a/functions/master/gen_class_data/latest/static/documentation.html +++ b/functions/master/gen_class_data/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/gen_class_data/latest/static/example.html b/functions/master/gen_class_data/latest/static/example.html index 0910cc44..853b6a66 100644 --- a/functions/master/gen_class_data/latest/static/example.html +++ b/functions/master/gen_class_data/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/gen_class_data/latest/static/function.html b/functions/master/gen_class_data/latest/static/function.html index a5a6f8c5..f4605fe7 100644 --- a/functions/master/gen_class_data/latest/static/function.html +++ b/functions/master/gen_class_data/latest/static/function.html @@ -28,60 +28,33 @@
         
-kind: job
 metadata:
-  name: gen-class-data
-  tag: ''
-  hash: 7759c5db6fd6a66e91351a10862cf5c09e2b59b3
-  project: ''
-  labels:
-    author: Daniel
   categories:
-  - data-preparation
+  - data-generation
+  tag: ''
+  name: gen-class-data
 spec:
-  command: ''
-  args: []
-  image: mlrun/mlrun
-  build:
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICA
gOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo=
-    commands: []
-    code_origin: http://github.com/aviaIguazio/functions.git#be04dfbae37aa7c2260ca800ffe248b38e34ebfc:/Users/Avi_Asulin/PycharmProjects/mlrun/functions/gen_class_data/gen_class_data.py
-    origin_filename: /Users/Avi_Asulin/PycharmProjects/mlrun/functions/gen_class_data/gen_class_data.py
-    requirements: []
+  description: Create a binary classification sample dataset and save.
+  default_handler: gen_class_data
   entry_points:
     gen_class_data:
-      name: gen_class_data
-      doc: 'Create a binary classification sample dataset and save.
-
-        If no filename is given it will default to:
-
-        "simdata-{n_samples}X{m_features}.parquet".
-
-
-        Additional scikit-learn parameters can be set using **sk_params, please see
-        https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
-        for more details.'
+      has_kwargs: false
       parameters:
       - name: context
         type: MLClientCtx
         doc: function context
-        default: ''
       - name: n_samples
         type: int
         doc: number of rows/samples
-        default: ''
       - name: m_features
         type: int
         doc: number of cols/features
-        default: ''
       - name: k_classes
         type: int
         doc: number of classes
-        default: ''
       - name: header
         type: Optional[List[str]]
         doc: header for features array
-        default: ''
       - name: label_column
         type: Optional[str]
         doc: column name of ground-truth series
@@ -105,19 +78,27 @@
       - name: sk_params
         doc: additional parameters for `sklearn.datasets.make_classification`
         default: {}
-      outputs:
-      - default: ''
       lineno: 22
-  description: Create a binary classification sample dataset and save.
-  default_handler: gen_class_data
+      doc: 'Create a binary classification sample dataset and save.
+
+        If no filename is given it will default to:
+
+        "simdata-{n_samples}X{m_features}.parquet".
+
+
+        Additional scikit-learn parameters can be set using **sk_params, please see
+        https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
+        for more details.'
+      has_varargs: false
+      name: gen_class_data
+  command: ''
   disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
+  image: mlrun/mlrun
+  build:
+    origin_filename: ''
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICA
gOnBhcmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo=
+    code_origin: ''
+kind: job
 verbose: false
 
         
diff --git a/functions/master/gen_class_data/latest/static/gen_class_data.html b/functions/master/gen_class_data/latest/static/gen_class_data.html
index 83113ae2..8641ff05 100644
--- a/functions/master/gen_class_data/latest/static/gen_class_data.html
+++ b/functions/master/gen_class_data/latest/static/gen_class_data.html
@@ -20,7 +20,7 @@
 
 
 
-
+
 
 
 
diff --git a/functions/master/gen_class_data/latest/static/item.html b/functions/master/gen_class_data/latest/static/item.html
index 3bd8626a..b1943f72 100644
--- a/functions/master/gen_class_data/latest/static/item.html
+++ b/functions/master/gen_class_data/latest/static/item.html
@@ -30,7 +30,7 @@
         
 apiVersion: v1
 categories:
-- data-preparation
+- data-generation
 description: Create a binary classification sample dataset and save.
 doc: ''
 example: gen_class_data.ipynb
@@ -41,7 +41,7 @@
   author: Daniel
 maintainers: []
 marketplaceType: ''
-mlrunVersion: 1.4.1
+mlrunVersion: 1.7.0
 name: gen_class_data
 platformVersion: 3.5.3
 spec:
@@ -51,7 +51,7 @@
   kind: job
   requirements: []
 url: ''
-version: 1.2.0
+version: 1.3.0
 
         
     
diff --git a/functions/master/github_utils/1.1.0/static/documentation.html b/functions/master/github_utils/1.1.0/static/documentation.html index f7179e88..6881d547 100644 --- a/functions/master/github_utils/1.1.0/static/documentation.html +++ b/functions/master/github_utils/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/github_utils/1.1.0/static/example.html b/functions/master/github_utils/1.1.0/static/example.html index 627030e0..f20d1623 100644 --- a/functions/master/github_utils/1.1.0/static/example.html +++ b/functions/master/github_utils/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/github_utils/1.1.0/static/github_utils.html b/functions/master/github_utils/1.1.0/static/github_utils.html index e1287336..a33bbcc8 100644 --- a/functions/master/github_utils/1.1.0/static/github_utils.html +++ b/functions/master/github_utils/1.1.0/static/github_utils.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/github_utils/latest/static/documentation.html b/functions/master/github_utils/latest/static/documentation.html index f7179e88..6881d547 100644 --- a/functions/master/github_utils/latest/static/documentation.html +++ b/functions/master/github_utils/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/github_utils/latest/static/example.html b/functions/master/github_utils/latest/static/example.html index 627030e0..f20d1623 100644 --- a/functions/master/github_utils/latest/static/example.html +++ b/functions/master/github_utils/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/github_utils/latest/static/github_utils.html b/functions/master/github_utils/latest/static/github_utils.html index e1287336..a33bbcc8 100644 --- a/functions/master/github_utils/latest/static/github_utils.html +++ b/functions/master/github_utils/latest/static/github_utils.html @@ -20,7 +20,7 @@ - + diff --git 
a/functions/master/hugging_face_serving/1.1.0/src/function.yaml b/functions/master/hugging_face_serving/1.1.0/src/function.yaml index 764fc1cf..a628d7ab 100644 --- a/functions/master/hugging_face_serving/1.1.0/src/function.yaml +++ b/functions/master/hugging_face_serving/1.1.0/src/function.yaml @@ -1,46 +1,31 @@ -kind: serving metadata: name: hugging-face-serving - tag: '' - hash: 1a489a57da861f129eb26e933f34e58927e41195 - project: '' - labels: - author: yonish categories: - - huggingface - genai - model-serving - - machine-learning + tag: '' spec: - command: '' - args: [] + default_handler: '' + min_replicas: 1 + source: '' image: mlrun/ml-models build: functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5I
HRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgI
CAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgI
CAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlI
EV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK - commands: [] code_origin: '' origin_filename: '' requirements: - transformers==4.21.3 - tensorflow==2.9.2 - description: Generic Hugging Face model server. - default_handler: '' + function_kind: serving_v2 + default_class: HuggingFaceModelServer + base_image_pull: false + max_replicas: 4 + command: '' disable_auto_mount: false - clone_target_dir: '' + function_handler: hugging-face-serving-nuclio:handler + description: Generic Hugging Face model server. env: - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK value: enabled - priority_class_name: '' - preemption_mode: prevent - min_replicas: 1 - max_replicas: 4 - source: '' - function_kind: serving_v2 - function_handler: hugging_face_serving:handler - base_image_pull: false - default_class: HuggingFaceModelServer - secret_sources: [] - affinity: null - tolerations: null - security_context: {} verbose: false +kind: serving diff --git a/functions/master/hugging_face_serving/1.1.0/src/item.yaml b/functions/master/hugging_face_serving/1.1.0/src/item.yaml index d1f78769..48b063e4 100644 --- a/functions/master/hugging_face_serving/1.1.0/src/item.yaml +++ b/functions/master/hugging_face_serving/1.1.0/src/item.yaml @@ -1,9 +1,7 @@ apiVersion: v1 categories: -- huggingface - genai - model-serving -- machine-learning description: Generic Hugging Face model server. 
doc: '' example: hugging_face_serving.ipynb diff --git a/functions/master/hugging_face_serving/1.1.0/static/documentation.html b/functions/master/hugging_face_serving/1.1.0/static/documentation.html index a95b606c..65c7aeb8 100644 --- a/functions/master/hugging_face_serving/1.1.0/static/documentation.html +++ b/functions/master/hugging_face_serving/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/hugging_face_serving/1.1.0/static/example.html b/functions/master/hugging_face_serving/1.1.0/static/example.html index bc408d54..20ac581a 100644 --- a/functions/master/hugging_face_serving/1.1.0/static/example.html +++ b/functions/master/hugging_face_serving/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/hugging_face_serving/1.1.0/static/function.html b/functions/master/hugging_face_serving/1.1.0/static/function.html index ef13ff66..c30517c4 100644 --- a/functions/master/hugging_face_serving/1.1.0/static/function.html +++ b/functions/master/hugging_face_serving/1.1.0/static/function.html @@ -28,52 +28,37 @@
         
-kind: serving
 metadata:
   name: hugging-face-serving
-  tag: ''
-  hash: 1a489a57da861f129eb26e933f34e58927e41195
-  project: ''
-  labels:
-    author: yonish
   categories:
-  - huggingface
   - genai
   - model-serving
-  - machine-learning
+  tag: ''
 spec:
-  command: ''
-  args: []
+  default_handler: ''
+  min_replicas: 1
+  source: ''
   image: mlrun/ml-models
   build:
     functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml
0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSB
pbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICA
gIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK
-    commands: []
     code_origin: ''
     origin_filename: ''
     requirements:
     - transformers==4.21.3
     - tensorflow==2.9.2
-  description: Generic Hugging Face model server.
-  default_handler: ''
+  function_kind: serving_v2
+  default_class: HuggingFaceModelServer
+  base_image_pull: false
+  max_replicas: 4
+  command: ''
   disable_auto_mount: false
-  clone_target_dir: ''
+  function_handler: hugging-face-serving-nuclio:handler
+  description: Generic Hugging Face model server.
   env:
   - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK
     value: enabled
-  priority_class_name: ''
-  preemption_mode: prevent
-  min_replicas: 1
-  max_replicas: 4
-  source: ''
-  function_kind: serving_v2
-  function_handler: hugging_face_serving:handler
-  base_image_pull: false
-  default_class: HuggingFaceModelServer
-  secret_sources: []
-  affinity: null
-  tolerations: null
-  security_context: {}
 verbose: false
+kind: serving
 
         
     
diff --git a/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html b/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html index b07151ed..848cc67e 100644 --- a/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html +++ b/functions/master/hugging_face_serving/1.1.0/static/hugging_face_serving.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/hugging_face_serving/1.1.0/static/item.html b/functions/master/hugging_face_serving/1.1.0/static/item.html index 62cc1b5b..784df913 100644 --- a/functions/master/hugging_face_serving/1.1.0/static/item.html +++ b/functions/master/hugging_face_serving/1.1.0/static/item.html @@ -30,10 +30,8 @@ apiVersion: v1 categories: -- huggingface - genai - model-serving -- machine-learning description: Generic Hugging Face model server. doc: '' example: hugging_face_serving.ipynb diff --git a/functions/master/hugging_face_serving/latest/src/function.yaml b/functions/master/hugging_face_serving/latest/src/function.yaml index 764fc1cf..a628d7ab 100644 --- a/functions/master/hugging_face_serving/latest/src/function.yaml +++ b/functions/master/hugging_face_serving/latest/src/function.yaml @@ -1,46 +1,31 @@ -kind: serving metadata: name: hugging-face-serving - tag: '' - hash: 1a489a57da861f129eb26e933f34e58927e41195 - project: '' - labels: - author: yonish categories: - - huggingface - genai - model-serving - - machine-learning + tag: '' spec: - command: '' - args: [] + default_handler: '' + min_replicas: 1 + source: '' image: mlrun/ml-models build: functionSourceCode: 
IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml0aWFsaXplIGEgc2VydmluZyBj
bGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSBpbnN0YWxsZWQsIHdpbGwgZGVm
YXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3Io
IlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK - commands: [] code_origin: '' origin_filename: '' requirements: - transformers==4.21.3 - tensorflow==2.9.2 - description: Generic Hugging Face model server. 
- default_handler: '' + function_kind: serving_v2 + default_class: HuggingFaceModelServer + base_image_pull: false + max_replicas: 4 + command: '' disable_auto_mount: false - clone_target_dir: '' + function_handler: hugging-face-serving-nuclio:handler + description: Generic Hugging Face model server. env: - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK value: enabled - priority_class_name: '' - preemption_mode: prevent - min_replicas: 1 - max_replicas: 4 - source: '' - function_kind: serving_v2 - function_handler: hugging_face_serving:handler - base_image_pull: false - default_class: HuggingFaceModelServer - secret_sources: [] - affinity: null - tolerations: null - security_context: {} verbose: false +kind: serving diff --git a/functions/master/hugging_face_serving/latest/src/item.yaml b/functions/master/hugging_face_serving/latest/src/item.yaml index d1f78769..48b063e4 100644 --- a/functions/master/hugging_face_serving/latest/src/item.yaml +++ b/functions/master/hugging_face_serving/latest/src/item.yaml @@ -1,9 +1,7 @@ apiVersion: v1 categories: -- huggingface - genai - model-serving -- machine-learning description: Generic Hugging Face model server. 
doc: '' example: hugging_face_serving.ipynb diff --git a/functions/master/hugging_face_serving/latest/static/documentation.html b/functions/master/hugging_face_serving/latest/static/documentation.html index a95b606c..65c7aeb8 100644 --- a/functions/master/hugging_face_serving/latest/static/documentation.html +++ b/functions/master/hugging_face_serving/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/hugging_face_serving/latest/static/example.html b/functions/master/hugging_face_serving/latest/static/example.html index bc408d54..20ac581a 100644 --- a/functions/master/hugging_face_serving/latest/static/example.html +++ b/functions/master/hugging_face_serving/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/hugging_face_serving/latest/static/function.html b/functions/master/hugging_face_serving/latest/static/function.html index ef13ff66..c30517c4 100644 --- a/functions/master/hugging_face_serving/latest/static/function.html +++ b/functions/master/hugging_face_serving/latest/static/function.html @@ -28,52 +28,37 @@
         
-kind: serving
 metadata:
   name: hugging-face-serving
-  tag: ''
-  hash: 1a489a57da861f129eb26e933f34e58927e41195
-  project: ''
-  labels:
-    author: yonish
   categories:
-  - huggingface
   - genai
   - model-serving
-  - machine-learning
+  tag: ''
 spec:
-  command: ''
-  args: []
+  default_handler: ''
+  min_replicas: 1
+  source: ''
   image: mlrun/ml-models
   build:
     functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmZyb20gYWJjIGltcG9ydCBBQkMKZnJvbSBpbXBvcnRsaWIgaW1wb3J0IGltcG9ydF9tb2R1bGUKZnJvbSB0eXBpbmcgaW1wb3J0IExpc3QKCmZyb20gdHJhbnNmb3JtZXJzIGltcG9ydCBwaXBlbGluZQoKaW1wb3J0IG1scnVuLnNlcnZpbmcKClBBQ0tBR0VfTU9EVUxFID0gInRyYW5zZm9ybWVycyIKU0VSSUFMSVpBQkxFX1RZUEVTID0gW2RpY3QsIGxpc3QsIHR1cGxlLCBzdHIsIGludCwgZmxvYXRdCgoKY2xhc3MgSHVnZ2luZ0ZhY2VNb2RlbFNlcnZlcihtbHJ1bi5zZXJ2aW5nLlYyTW9kZWxTZXJ2ZXIsIEFCQyk6CiAgICAiIiIKICAgIEh1Z2dpbmcgRmFjZSBNb2RlbCBzZXJ2aW5nIGNsYXNzLCBpbmhlcml0aW5nIHRoZSBWMk1vZGVsU2VydmVyIGNsYXNzIGZvciBiZWluZyBpbml0aWFsaXplZCBhdXRvbWF0aWNhbGx5IGJ5IHRoZQogICAgbW9kZWwgc2VydmVyIGFuZCBiZSBhYmxlIHRvIHJ1biBsb2NhbGx5IGFzIHBhcnQgb2YgYSBudWNsaW8gc2VydmVybGVzcyBmdW5jdGlvbiwgb3IgYXMgcGFydCBvZiBhIHJlYWwtdGltZSBwaXBlbGluZS4KICAgICIiIgoKICAgIGRlZiBfX2luaXRfXygKICAgICAgICBzZWxmLAogICAgICAgIGNvbnRleHQ6IG1scnVuLk1MQ2xpZW50Q3R4LAogICAgICAgIG5hbWU6IHN0ciwKICAgICAgICB0YXNrOiBzdHIsCiAgICAgICAgbW9kZWxfcGF0aDogc3RyID0gTm9uZSwKICAgICAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIG1vZGVsX2NsYXNzOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9uYW1lOiBzdHIgPSBOb25lLAogICAgICAgIHRva2VuaXplcl9jbGFzczogc3RyID0gTm9uZSwKICAgICAgICBmcmFtZXdvcms6IHN0ciA9IE5vbmUsCiAgICAgICAgKipjbGFzc19hcmdzLAogICAgKToKICAgICAgICAiIiIKICAgICAgICBJbml
0aWFsaXplIGEgc2VydmluZyBjbGFzcyBmb3IgYSBIdWdnaW5nIGZhY2UgbW9kZWwuCgogICAgICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIFRoZSBtbHJ1biBjb250ZXh0IHRvIHdvcmsgd2l0aAogICAgICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIFRoZSBuYW1lIG9mIHRoaXMgc2VydmVyIHRvIGJlIGluaXRpYWxpemVkCiAgICAgICAgOnBhcmFtIG1vZGVsX3BhdGg6ICAgICAgTm90IGluIHVzZS4gV2hlbiBhZGRpbmcgYSBtb2RlbCBwYXNzIGFueSBzdHJpbmcgdmFsdWUKICAgICAgICA6cGFyYW0gbW9kZWxfbmFtZTogICAgICBUaGUgbW9kZWwncyBuYW1lIGluIHRoZSBIdWdnaW5nIEZhY2UgaHViCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYG5scHRvd24vYmVydC1iYXNlLW11bHRpbGluZ3VhbC11bmNhc2VkLXNlbnRpbWVudGAKICAgICAgICA6cGFyYW0gbW9kZWxfY2xhc3M6ICAgICBUaGUgbW9kZWwncyBjbGFzcyB0eXBlIG9iamVjdCB3aGljaCBjYW4gYmUgcGFzc2VkIGFzIHRoZSBjbGFzcydzIG5hbWUgKHN0cmluZykuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTXVzdCBiZSBwcm92aWRlZCBhbmQgdG8gYmUgbWF0Y2hlZCB3aXRoIGBtb2RlbF9uYW1lYC4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBlLmcuLCBgQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbmAKICAgICAgICA6cGFyYW0gdG9rZW5pemVyX25hbWU6ICBUaGUgdG9rZW5pemVyJ3MgbmFtZSBpbiB0aGUgSHVnZ2luZyBGYWNlIGh1YgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGUuZy4sIGBubHB0b3duL2JlcnQtYmFzZS1tdWx0aWxpbmd1YWwtdW5jYXNlZC1zZW50aW1lbnRgCiAgICAgICAgOnBhcmFtIHRva2VuaXplcl9jbGFzczogVGhlIG1vZGVsJ3MgY2xhc3MgdHlwZSBvYmplY3Qgd2hpY2ggY2FuIGJlIHBhc3NlZCBhcyB0aGUgY2xhc3MncyBuYW1lIChzdHJpbmcpLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIE11c3QgYmUgcHJvdmlkZWQgYW5kIHRvIGJlIG1hdGNoZWQgd2l0aCBgbW9kZWxfbmFtZWAuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZS5nLiwgYEF1dG9Ub2tlbml6ZXJgCiAgICAgICAgOnBhcmFtIGZyYW1ld29yazogICAgICAgVGhlIGZyYW1ld29yayB0byB1c2UsIGVpdGhlciBgInB0ImAgZm9yIFB5VG9yY2ggb3IgYCJ0ZiJgIGZvciBUZW5zb3JGbG93LiBUaGUgc3BlY2lmaWVkCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZnJhbWV3b3JrIG11c3QgYmUgaW5zdGFsbGVkLgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIElmIG5vIGZyYW1ld29yayBpcyBzcGVjaWZpZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUgb25lIGN1cnJlbnRseSBpbnN0YWxsZWQuCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSWYgbm8gZnJhbWV3b3JrIGlzIHNwZWNpZmllZCBhbmQgYm90aCBmcmFtZXdvcmtzIGFyZSB
pbnN0YWxsZWQsIHdpbGwgZGVmYXVsdCB0byB0aGUKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcmFtZXdvcmsgb2YgdGhlIGBtb2RlbGAsIG9yIHRvIFB5VG9yY2ggaWYgbm8gbW9kZWwgaXMgcHJvdmlkZWQuCiAgICAgICAgOnBhcmFtIGNsYXNzX2FyZ3M6ICAgICAgLQogICAgICAgICIiIgogICAgICAgIHN1cGVyKEh1Z2dpbmdGYWNlTW9kZWxTZXJ2ZXIsIHNlbGYpLl9faW5pdF9fKAogICAgICAgICAgICBjb250ZXh0PWNvbnRleHQsCiAgICAgICAgICAgIG5hbWU9bmFtZSwKICAgICAgICAgICAgbW9kZWxfcGF0aD1tb2RlbF9wYXRoLAogICAgICAgICAgICAqKmNsYXNzX2FyZ3MsCiAgICAgICAgKQogICAgICAgIHNlbGYudGFzayA9IHRhc2sKICAgICAgICBzZWxmLm1vZGVsID0gTm9uZQogICAgICAgIHNlbGYudG9rZW5pemVyID0gTm9uZQogICAgICAgIHNlbGYubW9kZWxfbmFtZSA9IG1vZGVsX25hbWUKICAgICAgICBzZWxmLnRva2VuaXplcl9uYW1lID0gdG9rZW5pemVyX25hbWUKICAgICAgICBzZWxmLm1vZGVsX2NsYXNzID0gbW9kZWxfY2xhc3MKICAgICAgICBzZWxmLnRva2VuaXplcl9jbGFzcyA9IHRva2VuaXplcl9jbGFzcwogICAgICAgIHNlbGYuZnJhbWV3b3JrID0gZnJhbWV3b3JrCiAgICAgICAgc2VsZi5waXBlID0gTm9uZQoKICAgIGRlZiBsb2FkKHNlbGYpOgogICAgICAgICIiImxvYWQgYW5kIGluaXRpYWxpemUgdGhlIG1vZGVsIGFuZC9vciBvdGhlciBlbGVtZW50cyIiIgogICAgICAgIGlmIHNlbGYubW9kZWxfY2xhc3M6CiAgICAgICAgICAgIG1vZGVsX29iamVjdCA9IGdldGF0dHIoaW1wb3J0X21vZHVsZShQQUNLQUdFX01PRFVMRSksIHNlbGYubW9kZWxfY2xhc3MpCiAgICAgICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbF9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYubW9kZWxfbmFtZSkKICAgICAgICBpZiBzZWxmLnRva2VuaXplcl9jbGFzczoKICAgICAgICAgICAgdG9rZW5pemVyX29iamVjdCA9IGdldGF0dHIoCiAgICAgICAgICAgICAgICBpbXBvcnRfbW9kdWxlKFBBQ0tBR0VfTU9EVUxFKSwgc2VsZi50b2tlbml6ZXJfY2xhc3MKICAgICAgICAgICAgKQogICAgICAgICAgICBzZWxmLnRva2VuaXplciA9IHRva2VuaXplcl9vYmplY3QuZnJvbV9wcmV0cmFpbmVkKHNlbGYudG9rZW5pemVyX25hbWUpCiAgICAgICAgc2VsZi5waXBlID0gcGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9c2VsZi50YXNrLAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsIG9yIHNlbGYubW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPXNlbGYudG9rZW5pemVyLAogICAgICAgICAgICBmcmFtZXdvcms9c2VsZi5mcmFtZXdvcmssCiAgICAgICAgKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIGJvZHk6IGRpY3QpIC0+IExpc3Q6CiAgICAgICAgIiIiR2VuZXJhdGUgbW9kZWwgcHJlZGljdGlvbnMgZnJvbSBzYW1wbGUuIiIiCiAgICAgICAgaWYgc2VsZi5waXBlIGlzIE5vbmU6CiAgICAgICAgICA
gIHJhaXNlIFZhbHVlRXJyb3IoIlBsZWFzZSB1c2UgYC5sb2FkKClgIikKICAgICAgICB0cnk6CiAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoYm9keVsiaW5wdXRzIl1bMF0sIGRpY3QpOgogICAgICAgICAgICAgICAgcmVzdWx0ID0gW3NlbGYucGlwZSgqKl9pbnB1dCkgZm9yIF9pbnB1dCBpbiBib2R5WyJpbnB1dHMiXV0KICAgICAgICAgICAgZWxzZToKICAgICAgICAgICAgICAgIHJlc3VsdCA9IHNlbGYucGlwZShib2R5WyJpbnB1dHMiXSkKICAgICAgICAgICAgIyByZXBsYWNlIGxpc3Qgb2YgbGlzdHMgb2YgZGljdHMgaW50byBhIGxpc3Qgb2YgZGljdHM6CiAgICAgICAgICAgIGlmIGFsbChpc2luc3RhbmNlKHJlcywgbGlzdCkgZm9yIHJlcyBpbiByZXN1bHQpOgogICAgICAgICAgICAgICAgbmV3X3Jlc3VsdCA9IFtyZXNbMF0gZm9yIHJlcyBpbiByZXN1bHRdCiAgICAgICAgICAgICAgICByZXN1bHQgPSBuZXdfcmVzdWx0CgogICAgICAgICAgICBub25fc2VyaWFsaXphYmxlX3R5cGVzID0gW10KICAgICAgICAgICAgZm9yIHJlcyBpbiByZXN1bHQ6CiAgICAgICAgICAgICAgICBmb3Iga2V5LCB2YWwgaW4gcmVzLml0ZW1zKCk6CiAgICAgICAgICAgICAgICAgICAgaWYgdHlwZSh2YWwpIG5vdCBpbiBTRVJJQUxJWkFCTEVfVFlQRVM6CiAgICAgICAgICAgICAgICAgICAgICAgIG5vbl9zZXJpYWxpemFibGVfdHlwZXMuYXBwZW5kKHN0cih0eXBlKHZhbCkpKQogICAgICAgICAgICAgICAgICAgICAgICByZXNba2V5XSA9IHN0cih2YWwpCiAgICAgICAgICAgIGlmIG5vbl9zZXJpYWxpemFibGVfdHlwZXM6CiAgICAgICAgICAgICAgICBzZWxmLmNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJOb24tc2VyaWFsaXphYmxlIHR5cGVzOiB7bm9uX3NlcmlhbGl6YWJsZV90eXBlc30gd2VyZSBjYXN0ZWQgdG8gc3RyaW5ncyIKICAgICAgICAgICAgICAgICkKICAgICAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgICAgIHJhaXNlIEV4Y2VwdGlvbigiRmFpbGVkIHRvIHByZWRpY3QgJXMiICUgZSkKICAgICAgICByZXR1cm4gcmVzdWx0Cgpmcm9tIG1scnVuLnJ1bnRpbWVzIGltcG9ydCBudWNsaW9faW5pdF9ob29rCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBudWNsaW9faW5pdF9ob29rKGNvbnRleHQsIGdsb2JhbHMoKSwgJ3NlcnZpbmdfdjInKQoKZGVmIGhhbmRsZXIoY29udGV4dCwgZXZlbnQpOgogICAgcmV0dXJuIGNvbnRleHQubWxydW5faGFuZGxlcihjb250ZXh0LCBldmVudCkK
-    commands: []
     code_origin: ''
     origin_filename: ''
     requirements:
     - transformers==4.21.3
     - tensorflow==2.9.2
-  description: Generic Hugging Face model server.
-  default_handler: ''
+  function_kind: serving_v2
+  default_class: HuggingFaceModelServer
+  base_image_pull: false
+  max_replicas: 4
+  command: ''
   disable_auto_mount: false
-  clone_target_dir: ''
+  function_handler: hugging-face-serving-nuclio:handler
+  description: Generic Hugging Face model server.
   env:
   - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK
     value: enabled
-  priority_class_name: ''
-  preemption_mode: prevent
-  min_replicas: 1
-  max_replicas: 4
-  source: ''
-  function_kind: serving_v2
-  function_handler: hugging_face_serving:handler
-  base_image_pull: false
-  default_class: HuggingFaceModelServer
-  secret_sources: []
-  affinity: null
-  tolerations: null
-  security_context: {}
 verbose: false
+kind: serving
 
         
     
diff --git a/functions/master/hugging_face_serving/latest/static/hugging_face_serving.html b/functions/master/hugging_face_serving/latest/static/hugging_face_serving.html index b07151ed..848cc67e 100644 --- a/functions/master/hugging_face_serving/latest/static/hugging_face_serving.html +++ b/functions/master/hugging_face_serving/latest/static/hugging_face_serving.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/hugging_face_serving/latest/static/item.html b/functions/master/hugging_face_serving/latest/static/item.html index 62cc1b5b..784df913 100644 --- a/functions/master/hugging_face_serving/latest/static/item.html +++ b/functions/master/hugging_face_serving/latest/static/item.html @@ -30,10 +30,8 @@ apiVersion: v1 categories: -- huggingface - genai - model-serving -- machine-learning description: Generic Hugging Face model server. doc: '' example: hugging_face_serving.ipynb diff --git a/functions/master/load_dataset/1.2.0/static/documentation.html b/functions/master/load_dataset/1.2.0/static/documentation.html index ef5de639..06068936 100644 --- a/functions/master/load_dataset/1.2.0/static/documentation.html +++ b/functions/master/load_dataset/1.2.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/load_dataset/1.2.0/static/example.html b/functions/master/load_dataset/1.2.0/static/example.html index ffb7b6c8..63619eb0 100644 --- a/functions/master/load_dataset/1.2.0/static/example.html +++ b/functions/master/load_dataset/1.2.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/load_dataset/1.2.0/static/load_dataset.html b/functions/master/load_dataset/1.2.0/static/load_dataset.html index 5fe817e8..9bd07517 100644 --- a/functions/master/load_dataset/1.2.0/static/load_dataset.html +++ b/functions/master/load_dataset/1.2.0/static/load_dataset.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/load_dataset/latest/static/documentation.html b/functions/master/load_dataset/latest/static/documentation.html 
index ef5de639..06068936 100644 --- a/functions/master/load_dataset/latest/static/documentation.html +++ b/functions/master/load_dataset/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/load_dataset/latest/static/example.html b/functions/master/load_dataset/latest/static/example.html index ffb7b6c8..63619eb0 100644 --- a/functions/master/load_dataset/latest/static/example.html +++ b/functions/master/load_dataset/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/load_dataset/latest/static/load_dataset.html b/functions/master/load_dataset/latest/static/load_dataset.html index 5fe817e8..9bd07517 100644 --- a/functions/master/load_dataset/latest/static/load_dataset.html +++ b/functions/master/load_dataset/latest/static/load_dataset.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/mlflow_utils/1.1.0/src/function.yaml b/functions/master/mlflow_utils/1.1.0/src/function.yaml new file mode 100644 index 00000000..623f054f --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/src/function.yaml @@ -0,0 +1,32 @@ +verbose: false +spec: + command: '' + source: '' + default_class: MLFlowModelServer + function_kind: serving_v2 + build: + functionSourceCode: 
aW1wb3J0IHppcGZpbGUKZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdAppbXBvcnQgbWxmbG93CmZyb20gbWxydW4uc2VydmluZy52Ml9zZXJ2aW5nIGltcG9ydCBWMk1vZGVsU2VydmVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCgpjbGFzcyBNTEZsb3dNb2RlbFNlcnZlcihWMk1vZGVsU2VydmVyKToKICAgICIiIgogICAgTUxGbG93IHRyYWNrZXIgTW9kZWwgc2VydmluZyBjbGFzcywgaW5oZXJpdGluZyB0aGUgVjJNb2RlbFNlcnZlciBjbGFzcyBmb3IgYmVpbmcgaW5pdGlhbGl6ZWQgYXV0b21hdGljYWxseSBieSB0aGUgbW9kZWwKICAgIHNlcnZlciBhbmQgYmUgYWJsZSB0byBydW4gbG9jYWxseSBhcyBwYXJ0IG9mIGEgbnVjbGlvIHNlcnZlcmxlc3MgZnVuY3Rpb24sIG9yIGFzIHBhcnQgb2YgYSByZWFsLXRpbWUgcGlwZWxpbmUuCiAgICAiIiIKCiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiIKICAgICAgICBsb2FkcyBhIG1vZGVsIHRoYXQgd2FzIGxvZ2dlZCBieSB0aGUgTUxGbG93IHRyYWNrZXIgbW9kZWwKICAgICAgICAiIiIKICAgICAgICAjIFVuemlwIHRoZSBtb2RlbCBkaXIgYW5kIHRoZW4gdXNlIG1sZmxvdydzIGxvYWQgZnVuY3Rpb24KICAgICAgICBtb2RlbF9maWxlLCBfID0gc2VsZi5nZXRfbW9kZWwoIi56aXAiKQogICAgICAgIG1vZGVsX3BhdGhfdW56aXAgPSBtb2RlbF9maWxlLnJlcGxhY2UoIi56aXAiLCAiIikKCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUobW9kZWxfZmlsZSwgInIiKSBhcyB6aXBfcmVmOgogICAgICAgICAgICB6aXBfcmVmLmV4dHJhY3RhbGwobW9kZWxfcGF0aF91bnppcCkKCiAgICAgICAgc2VsZi5tb2RlbCA9IG1sZmxvdy5weWZ1bmMubG9hZF9tb2RlbChtb2RlbF9wYXRoX3VuemlwKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIHJlcXVlc3Q6IERpY3Rbc3RyLCBBbnldKSAtPiBsaXN0OgogICAgICAgICIiIgogICAgICAgIEluZmVyIHRoZSBpbnB1dHMgdGhyb3VnaCB0aGUgbW9kZWwuIFRoZSBpbmZlcnJlZCBkYXRhIHdpbGwKICAgICAgICBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleSBvZiB0aGUgcmVxdWVzdC4KCiAgICAgICAgOnBhcmFtIHJlcXVlc3Q6IFRoZSByZXF1ZXN0IHRvIHRoZSBtb2RlbCB1c2luZyB4Z2Jvb3N0J3MgcHJlZGljdC4KICAgICAgICAgICAgICAgIFRoZSBpbnB1dCB0byB0aGUgbW9kZWwgd2lsbCBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleS4KCiAgICAgICAgOnJldHVybjogVGhlIG1vZGVsJ3MgcHJlZGljdGlvbiBvbiB0aGUgZ2l2ZW4gaW5wdXQuCiAgICAgICAgIiIiCgogICAgICAgICMgR2V0IHRoZSBpbnB1dHMgYW5kIHNldCB0byBhY2NlcHRlZCB0eXBlOgogICAgICAgIGlucHV0cyA9IHBkLkRhdGFGcmFtZShyZXF1ZXN0WyJpbnB1dHMiXSkKCiAgICAgICAgIyBQcmVkaWN0IHVzaW5nIHRoZSBtb2RlbCdzIHByZWRpY3QgZnVuY3Rpb246CiAgICAgICAgcHJlZGljdGlvbnMgPSBzZWxmLm1vZGVsLnByZWRpY3QoaW5wdXRzKQoK
ICAgICAgICAjIFJldHVybiBhcyBsaXN0OgogICAgICAgIHJldHVybiBwcmVkaWN0aW9ucy50b2xpc3QoKQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== + requirements: + - mlflow==2.12.2 + - lightgbm + - xgboost + code_origin: '' + origin_filename: '' + image: mlrun/mlrun + base_image_pull: false + default_handler: '' + max_replicas: 4 + disable_auto_mount: false + min_replicas: 1 + description: Mlflow model server, and additional utils. + function_handler: mlflow-utils-nuclio:handler + env: + - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK + value: enabled +metadata: + categories: + - model-serving + - utils + name: mlflow-utils + tag: '' +kind: serving diff --git a/functions/master/mlflow_utils/1.1.0/src/item.yaml b/functions/master/mlflow_utils/1.1.0/src/item.yaml new file mode 100644 index 00000000..27e61ab4 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/src/item.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +categories: +- model-serving +- utils +description: Mlflow model server, and additional utils. 
+doc: '' +example: mlflow_utils.ipynb +generationDate: 2024-05-23:12-00 +hidden: false +icon: '' +labels: + author: zeevr +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.8.0 +name: mlflow_utils +platformVersion: '' +spec: + customFields: + default_class: MLFlowModelServer + filename: mlflow_utils.py + handler: handler + image: mlrun/mlrun + kind: serving + requirements: + - mlflow==2.12.2 + - lightgbm + - xgboost +url: '' +version: 1.1.0 diff --git a/functions/master/mlflow_utils/1.1.0/src/mlflow_utils.ipynb b/functions/master/mlflow_utils/1.1.0/src/mlflow_utils.ipynb new file mode 100644 index 00000000..165dafc6 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/src/mlflow_utils.ipynb @@ -0,0 +1,1353 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c478ebb2", + "metadata": {}, + "source": [ + "# MLflow tracker demo\n", + "\n", + "This demo demonstrates how to seamlessly integrate and transfer logs from MLflow to MLRun,
\n", + "creating a unified and powerful platform for your machine learning experiments.\n", + "\n", + "You can combine MLflow and MLRun for a comprehensive solution for managing, tracking, and deploying machine learning models. \n", + "\n", + "This notebook guides you through the process of:\n", + "\n", + "1. Setting up the integration between MLflow and MLRun.\n", + "2. Extracting data, metrics, and artifacts from MLflow experiments.\n", + "3. Creating MLRun artifacts and projects to organize and manage the transferred data.\n", + "4. Leveraging MLRun's capabilities for model deployment and data processing.\n", + "\n", + "By the end of this demo, you will have a understanding of how to establish a smooth flow of data between MLflow and MLRun.\n", + "\n", + "## MLRun installation and configuration\n", + "Before running this notebook make sure the mlrun package is installed (pip install mlrun) and that you have configured the access to MLRun service." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ab49e1f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Install MLRun and scikit-learn if not already installed. Run this only once. Restart the notebook after the install!\n", + "# %pip install mlrun scikit-learn~=1.3.0" + ] + }, + { + "cell_type": "markdown", + "id": "1770566a", + "metadata": {}, + "source": [ + "Then you can import the necessary packages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0d2dfd8b-65c4-417b-b66e-99f44b015ee7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "import mlrun\n", + "from mlrun.datastore.targets import ParquetTarget\n", + "import mlrun.feature_store as fstore" + ] + }, + { + "cell_type": "markdown", + "id": "7c4513d4", + "metadata": {}, + "source": [ + "Create a project for this demo:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "43ea863f-02d5-45f2-8143-306ce3bb6c58", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-27 15:34:40,940 [info] Project loaded successfully: {'project_name': 'mlflow-tracking-example-guy'}\n" + ] + } + ], + "source": [ + "# Create a project for this demo:\n", + "project = mlrun.get_or_create_project(name=\"mlflow-tracking-example\", context=\"./\")" + ] + }, + { + "cell_type": "markdown", + "id": "94413ee8", + "metadata": {}, + "source": [ + "Set all the necessary environment variables for the Databricks cluster:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "22f94f89-acce-442d-93ff-b2d08d3a35a4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DATABRICKS_HOST=\"add your host\"\n", + "DATABRICKS_TOKEN=\"add your token\"\n", + "DATABRICKS_CLUSTER_ID=\"add your cluster id\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7af310da-fd02-444e-8619-43ba6dcdb0a4", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"DATABRICKS_HOST\"] = DATABRICKS_HOST\n", + "os.environ[\"DATABRICKS_TOKEN\"] = DATABRICKS_TOKEN\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d98e823c-3a27-4532-9a2d-6398ea4e1778", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the Databricks environment variables\n", + "job_env = {\n", + " \"DATABRICKS_HOST\": DATABRICKS_HOST,\n", + " \"DATABRICKS_CLUSTER_ID\": 
DATABRICKS_CLUSTER_ID\n", + "}\n", + "secrets = {\"DATABRICKS_TOKEN\": DATABRICKS_TOKEN}\n", + "\n", + "# Set the secrets in the project\n", + "project.set_secrets(secrets)" + ] + }, + { + "cell_type": "markdown", + "id": "37d75366", + "metadata": {}, + "source": [ + "## Create a feature set and ingest data\n", + "\n", + "This is a short example of how to create a feature set about music preferences." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5701c04a-8442-4958-8f4c-265bf4c9b06a", + "metadata": {}, + "outputs": [], + "source": [ + "# create df\n", + "columns = [\"id\", \"name\", \"age\", \"gender\", \"favorite_music_type\"]\n", + "data = [\n", + " (1, \"Alice\", 20, \"f\", \"Pop\"),\n", + " (2, \"Bob\", 30, \"m\", \"Rock\"),\n", + " (3, \"Charlie\", 25, \"m\", \"Pop\"),\n", + " (4, \"David\", 40, \"m\", \"Classical\"),\n", + " (5, \"Eva\", 18, \"f\", \"Pop\"),\n", + " (6, \"Frank\", 32, \"m\", \"Rock\"),\n", + " (7, \"Grace\", 28, \"f\", \"Pop\"),\n", + " (8, \"Henry\", 45, \"m\", \"Classical\"),\n", + " (9, \"Ivy\", 22, \"f\", \"Pop\"),\n", + " (10, \"Jack\", 38, \"m\", \"Classical\"),\n", + " (11, \"Karen\", 27, \"f\", \"Pop\"),\n", + " (12, \"Liam\", 19, \"m\", \"Pop\"),\n", + " (13, \"Mia\", 27, \"f\", \"Rock\"),\n", + " (14, \"Nora\", 31, \"f\", \"Rock\"),\n", + " (15, \"Oliver\", 29, \"m\", \"Pop\"),\n", + " (16, \"Ben\", 38, \"m\", \"Pop\"),\n", + " (17, \"Alicia\", 20, \"f\", \"Pop\"),\n", + " (18, \"Bobby\", 30, \"m\", \"Rock\"),\n", + " (19, \"Charlien\", 22, \"f\", \"Pop\"),\n", + " (20, \"Davide\", 40, \"m\", \"Classical\"),\n", + " (21, \"Evans\", 19, \"m\", \"Pop\"),\n", + " (22, \"Franklin\", 34, \"m\", \"Rock\"),\n", + " (23, \"Grace\", 22, \"f\", \"Pop\"),\n", + " (24, \"Henrik\", 48, \"m\", \"Classical\"),\n", + " (25, \"eevee\", 29, \"f\", \"Pop\"),\n", + " (26, \"Jack\", 75, \"m\", \"Classical\"),\n", + " (27, \"Karen\", 26, \"f\", \"Pop\"),\n", + " (28, \"Lian\", 21, \"f\", \"Pop\"),\n", + " (29, \"kia\", 27, \"f\", 
\"Rock\"),\n", + " (30, \"Novak\", 30, \"m\", \"Rock\"),\n", + " (31, \"Olivia\", 29, \"f\", \"Pop\"),\n", + " (32, \"Benjamin\", 18, \"m\", \"Pop\")\n", + "]\n", + "df = pd.DataFrame(data, columns=columns)" + ] + }, + { + "cell_type": "markdown", + "id": "4b91576b", + "metadata": {}, + "source": [ + "Transfer the data to DataBricks." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8679b0bb-0da6-4c35-9345-6cf0e83e19b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'dbfs:///demos/mlrun_databricks_demo/1711553684480_33/music.parquet'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Where to save the data in DataBricks\n", + "target_path = f\"dbfs:///demos/mlrun_databricks_demo/music.parquet\"\n", + "output_path = f\"dbfs:///demos/mlrun_databricks_demo/music_output_new.parquet\"\n", + "\n", + "targets = [ParquetTarget(path=target_path)]\n", + "\n", + "# Create a feature set and ingest the data\n", + "fset = fstore.FeatureSet(name=\"music_fset\", entities=[fstore.Entity(\"name\")])\n", + "fstore.ingest(fset, df, targets=targets, overwrite=True)\n", + "\n", + "# Get the target path and check it\n", + "dbfs_data_path = fset.get_target_path()\n", + "dbfs_data_path" + ] + }, + { + "cell_type": "markdown", + "id": "fe173be8-18eb-40ec-9662-6639b0deaedb", + "metadata": {}, + "source": [ + "We can look and see how how our data is logged in the DataBricks cluster:\n", + "(only top 20 rows)" + ] + }, + { + "attachments": { + "f7ad0425-26fe-482c-b97c-c9493b05fbf2.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "c303d698-2f44-4f6f-8ce5-6a4f9f13534a", + "metadata": {}, + "source": [ + "![image.png](attachment:f7ad0425-26fe-482c-b97c-c9493b05fbf2.png)" + ] + }, + { + "cell_type": "markdown", + "id": "abd854e5", + "metadata": {}, + "source": [ + "## Create a data processing function\n", + "\n", + "The following code demonstrates how to create a simple 
data processing function using MLRun.
\n", + "The function will process the data and show some statistics.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4e759f9-7154-4397-8db3-93b808426bd1", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile process_data.py\n", + "\n", + "\n", + "# Here is an example of Spark processing.\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import avg, min, max\n", + "import pandas as pd\n", + "import json\n", + "import fsspec\n", + "\n", + "def process_data(data_path: str, data_output_path: str):\n", + " spark = SparkSession.builder.appName(\"MusicDemo\").getOrCreate()\n", + " spark_df = spark.read.parquet(data_path, header=True)\n", + " spark_df = spark_df.drop(\"name\", \"id\")\n", + " \n", + " music_stats = spark_df.groupBy(\"favorite_music_type\").agg(\n", + " avg(\"age\").alias(\"avg_age\"),\n", + " min(\"age\").alias(\"min_age\"),\n", + " max(\"age\").alias(\"max_age\")\n", + " )\n", + " music_stats.show()\n", + " pandas_df = spark_df.toPandas()\n", + " pandas_df.to_parquet(data_output_path)\n", + " # spark_df.write.mode(\"overwrite\").parquet(data_output_path)\n", + "\n", + " return {\"music_data\": data_output_path}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "13748b64-6a48-4500-a2a8-d9290dd082c5", + "metadata": {}, + "outputs": [], + "source": [ + "process_data_function = project.set_function(\n", + " func=\"./zeev-demos/mlflow-databricks/process_data.py\",\n", + " name=\"process-data\",\n", + " kind=\"databricks\",\n", + " image=\"mlrun/mlrun\",\n", + ")\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "2dbadf07-a32a-40da-b9bc-609070e4392d", + "metadata": {}, + "source": [ + "Set all parameters necessary for the function and run it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5642aa15-e8c0-4a72-a0a8-4cacd34fb63c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-27 15:34:45,422 [info] Storing function: {'name': 'process-data-process-data', 'uid': 'a9c770f8377046bda3061e61a5c015c2', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-03-27 15:34:45,675 [info] Job is running in the background, pod: process-data-process-data-89bhh\n", + "> 2024-03-27 15:34:49,272 [info] Running with an existing cluster: {'cluster_id': '0327-134616-43m7kfxk'}\n", + "> 2024-03-27 15:34:49,492 [info] Starting to poll: 493449112310004\n", + "> 2024-03-27 15:34:49,539 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.PENDING\n", + "> 2024-03-27 15:34:50,947 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.PENDING\n", + "> 2024-03-27 15:34:53,063 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.RUNNING\n", + "> 2024-03-27 15:34:56,737 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.RUNNING\n", + "> 2024-03-27 15:35:00,947 [info] Artifacts found. 
Run name: mlrun_task__15_34_48_703046\n", + "> 2024-03-27 15:35:01,881 [info] Job finished: https://dbc-94c947ab-feb9.cloud.databricks.com/?o=4658245941722457#job/499259196347814/run/493449112310004\n", + "> 2024-03-27 15:35:01,881 [info] Logs:\n", + "+-------------------+------------------+-------+-------+\n", + "|favorite_music_type| avg_age|min_age|max_age|\n", + "+-------------------+------------------+-------+-------+\n", + "| Rock| 30.125| 27| 34|\n", + "| Classical|47.666666666666664| 38| 75|\n", + "| Pop| 24.0| 18| 38|\n", + "+-------------------+------------------+-------+-------+\n", + "\n", + "2024-03-27 15:34:54,980 - mlrun_logger - INFO - successfully wrote artifact details to the artifact JSON file in DBFS - music_data : /dbfs/demos/mlrun_databricks_demo/music_output_new.parquet\n", + "> 2024-03-27 15:35:02,182 [info] To track results use the CLI: {'info_cmd': 'mlrun get run a9c770f8377046bda3061e61a5c015c2 -p mlflow-tracking-example-guy', 'logs_cmd': 'mlrun logs a9c770f8377046bda3061e61a5c015c2 -p mlflow-tracking-example-guy'}\n", + "> 2024-03-27 15:35:02,182 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.llm-dev.iguazio-cd1.com/mlprojects/mlflow-tracking-example-guy/jobs/monitor/a9c770f8377046bda3061e61a5c015c2/overview'}\n", + "> 2024-03-27 15:35:02,182 [info] Run execution finished: {'status': 'completed', 'name': 'process-data-process-data'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
mlflow-tracking-example-guy0Mar 27 15:34:48completedprocess-data-process-data
v3io_user=zeevr
kind=databricks
owner=zeevr
mlrun/client_version=1.6.1
mlrun/client_python_version=3.9.16
host=process-data-process-data-89bhh
task_parameters={'timeout_minutes': 15, 'spark_app_code': 'IAoKaW1wb3J0IG9zCmltcG9ydCBsb2dnaW5nCm1scnVuX2xvZ2dlciA9IGxvZ2dpbmcuZ2V0TG9nZ2VyKCdtbHJ1bl9sb2dnZXInKQptbHJ1bl9sb2dnZXIuc2V0TGV2ZWwobG9nZ2luZy5ERUJVRykKCm1scnVuX2NvbnNvbGVfaGFuZGxlciA9IGxvZ2dpbmcuU3RyZWFtSGFuZGxlcigpCm1scnVuX2NvbnNvbGVfaGFuZGxlci5zZXRMZXZlbChsb2dnaW5nLkRFQlVHKQptbHJ1bl9mb3JtYXR0ZXIgPSBsb2dnaW5nLkZvcm1hdHRlcignJShhc2N0aW1lKXMgLSAlKG5hbWUpcyAtICUobGV2ZWxuYW1lKXMgLSAlKG1lc3NhZ2UpcycpCm1scnVuX2NvbnNvbGVfaGFuZGxlci5zZXRGb3JtYXR0ZXIobWxydW5fZm9ybWF0dGVyKQptbHJ1bl9sb2dnZXIuYWRkSGFuZGxlcihtbHJ1bl9jb25zb2xlX2hhbmRsZXIpCgptbHJ1bl9kZWZhdWx0X2FydGlmYWN0X3RlbXBsYXRlID0gJ21scnVuX3JldHVybl92YWx1ZV8nCm1scnVuX2FydGlmYWN0X2luZGV4ID0gMAoKCmRlZiBtbHJ1bl9sb2dfYXJ0aWZhY3QobmFtZT0nJywgcGF0aD0nJyk6CiAgICBnbG9iYWwgbWxydW5fYXJ0aWZhY3RfaW5kZXgKICAgIG1scnVuX2FydGlmYWN0X2luZGV4Kz0xICAjICBieSBob3cgbWFueSBhcnRpZmFjdHMgd2UgdHJpZWQgdG8gbG9nLCBub3QgaG93IG1hbnkgc3VjY2VlZC4KICAgIGlmIG5hbWUgaXMgTm9uZSBvciBuYW1lID09ICcnOgogICAgICAgIG5hbWUgPSBmJ3ttbHJ1bl9kZWZhdWx0X2FydGlmYWN0X3RlbXBsYXRlfXttbHJ1bl9hcnRpZmFjdF9pbmRleH0nCiAgICBpZiBub3QgcGF0aDoKICAgICAgICBtbHJ1bl9sb2dnZXIuZXJyb3IoZidwYXRoIHJlcXVpcmVkIGZvciBsb2dnaW5nIGFuIG1scnVuIGFydGlmYWN0IC0ge25hbWV9IDoge3BhdGh9JykKICAgICAgICByZXR1cm4KICAgIGlmIG5vdCBpc2luc3RhbmNlKG5hbWUsIHN0cikgb3Igbm90IGlzaW5zdGFuY2UocGF0aCwgc3RyKToKICAgICAgICBtbHJ1bl9sb2dnZXIuZXJyb3IoZiduYW1lIGFuZCBwYXRoIG11c3QgYmUgaW4gc3RyaW5nIHR5cGUgZm9yIGxvZ2dpbmcgYW4gbWxydW4gYXJ0aWZhY3QgLSB7bmFtZX0gOiB7cGF0aH0nKQogICAgICAgIHJldHVybgogICAgaWYgbm90IHBhdGguc3RhcnRzd2l0aCgnL2RiZnMnKSBhbmQgbm90IHBhdGguc3RhcnRzd2l0aCgnZGJmczovJyk6CiAgICAgICAgbWxydW5fbG9nZ2VyLmVycm9yKGYncGF0aCBmb3IgYW4gbWxydW4gYXJ0aWZhY3QgbXVzdCBzdGFydCB3aXRoIC9kYmZzIG9yIGRiZnM6LyAtIHtuYW1lfSA6IHtwYXRofScpCiAgICAgICAgcmV0dXJuCiAgICBtbHJ1bl9hcnRpZmFjdHNfcGF0aCA9ICcvZGJmcy9tbHJ1bl9kYXRhYnJpY2tzX3J1bnRpbWUvYXJ0aWZhY3RzX2RpY3Rpb25hcmllcy9tbHJ1bl9hcnRpZmFjdF9hOWM3NzBmODM3NzA0NmJkYTMwNjFlNjFhNWMwMTVjMi5qc29uJwogICAgdHJ5OgogICAgICAgIG5ld19kYXRhID0ge25hbWU6cGF0aH0KICAgICAgI
CBpZiBvcy5wYXRoLmV4aXN0cyhtbHJ1bl9hcnRpZmFjdHNfcGF0aCk6CiAgICAgICAgICAgIHdpdGggb3BlbihtbHJ1bl9hcnRpZmFjdHNfcGF0aCwgJ3IrJykgYXMganNvbl9maWxlOgogICAgICAgICAgICAgICAgZXhpc3RpbmdfZGF0YSA9IGpzb24ubG9hZChqc29uX2ZpbGUpCiAgICAgICAgICAgICAgICBleGlzdGluZ19kYXRhLnVwZGF0ZShuZXdfZGF0YSkKICAgICAgICAgICAgICAgIGpzb25fZmlsZS5zZWVrKDApCiAgICAgICAgICAgICAgICBqc29uLmR1bXAoZXhpc3RpbmdfZGF0YSwganNvbl9maWxlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHBhcmVudF9kaXIgPSBvcy5wYXRoLmRpcm5hbWUobWxydW5fYXJ0aWZhY3RzX3BhdGgpCiAgICAgICAgICAgIGlmIHBhcmVudF9kaXIgIT0gJy9kYmZzJzoKICAgICAgICAgICAgICAgIG9zLm1ha2VkaXJzKHBhcmVudF9kaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgICAgIHdpdGggb3BlbihtbHJ1bl9hcnRpZmFjdHNfcGF0aCwgJ3cnKSBhcyBqc29uX2ZpbGU6CiAgICAgICAgICAgICAgICBqc29uLmR1bXAobmV3X2RhdGEsIGpzb25fZmlsZSkKICAgICAgICBzdWNjZXNzX2xvZyA9IGYnc3VjY2Vzc2Z1bGx5IHdyb3RlIGFydGlmYWN0IGRldGFpbHMgdG8gdGhlIGFydGlmYWN0IEpTT04gZmlsZSBpbiBEQkZTIC0ge25hbWV9IDoge3BhdGh9JwogICAgICAgIG1scnVuX2xvZ2dlci5pbmZvKHN1Y2Nlc3NfbG9nKQogICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyB1bmtub3duX2V4Y2VwdGlvbjoKICAgICAgICBtbHJ1bl9sb2dnZXIuZXJyb3IoZidsb2cgbWxydW4gYXJ0aWZhY3QgZmFpbGVkIC0ge25hbWV9IDoge3BhdGh9LiBlcnJvcjoge3Vua25vd25fZXhjZXB0aW9ufScpCgoKCgppbXBvcnQgYXJncGFyc2UKaW1wb3J0IGpzb24KcGFyc2VyID0gYXJncGFyc2UuQXJndW1lbnRQYXJzZXIoKQpwYXJzZXIuYWRkX2FyZ3VtZW50KCdoYW5kbGVyX2FyZ3VtZW50cycpCmhhbmRsZXJfYXJndW1lbnRzID0gcGFyc2VyLnBhcnNlX2FyZ3MoKS5oYW5kbGVyX2FyZ3VtZW50cwpoYW5kbGVyX2FyZ3VtZW50cyA9IGpzb24ubG9hZHMoaGFuZGxlcl9hcmd1bWVudHMpCgoKZnJvbSBweXNwYXJrLnNxbCBpbXBvcnQgU3BhcmtTZXNzaW9uCmZyb20gcHlzcGFyay5zcWwuZnVuY3Rpb25zIGltcG9ydCBhdmcsIG1pbiwgbWF4CmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IGpzb24KaW1wb3J0IGZzc3BlYwoKZGVmIHByb2Nlc3NfZGF0YShkYXRhX3BhdGg6IHN0ciwgZGF0YV9vdXRwdXRfcGF0aDogc3RyKToKICAgIHNwYXJrID0gU3BhcmtTZXNzaW9uLmJ1aWxkZXIuYXBwTmFtZSgnTXVzaWNEZW1vJykuZ2V0T3JDcmVhdGUoKQogICAgc3BhcmtfZGYgPSBzcGFyay5yZWFkLnBhcnF1ZXQoZGF0YV9wYXRoLCBoZWFkZXI9VHJ1ZSkKICAgIHNwYXJrX2RmID0gc3BhcmtfZGYuZHJvcCgnbmFtZScsICdpZCcpCiAgICBtdXNpY19zdGF0cyA9IHNwYXJrX2RmLmdyb3VwQnkoJ2Zhdm9yaXRlX211c2ljX3R5c
GUnKS5hZ2coYXZnKCdhZ2UnKS5hbGlhcygnYXZnX2FnZScpLCBtaW4oJ2FnZScpLmFsaWFzKCdtaW5fYWdlJyksIG1heCgnYWdlJykuYWxpYXMoJ21heF9hZ2UnKSkKICAgIG11c2ljX3N0YXRzLnNob3coKQogICAgcGFuZGFzX2RmID0gc3BhcmtfZGYudG9QYW5kYXMoKQogICAgcGFuZGFzX2RmLnRvX3BhcnF1ZXQoZGF0YV9vdXRwdXRfcGF0aCkKICAgIHJldHVybiB7J211c2ljX2RhdGEnOiBkYXRhX291dHB1dF9wYXRofQpyZXN1bHQgPSBwcm9jZXNzX2RhdGEoKipoYW5kbGVyX2FyZ3VtZW50cykKCgppZiByZXN1bHQ6CiAgICBpZiBpc2luc3RhbmNlKHJlc3VsdCwgZGljdCk6CiAgICAgICAgZm9yIGtleSwgcGF0aCBpbiByZXN1bHQuaXRlbXMoKToKICAgICAgICAgICAgbWxydW5fbG9nX2FydGlmYWN0KG5hbWU9a2V5LCBwYXRoPXBhdGgpCiAgICBlbGlmIGlzaW5zdGFuY2UocmVzdWx0LCAobGlzdCwgdHVwbGUsIHNldCkpOgogICAgICAgIGZvciBhcnRpZmFjdF9wYXRoIGluIHJlc3VsdDoKICAgICAgICAgICAgbWxydW5fbG9nX2FydGlmYWN0KHBhdGg9YXJ0aWZhY3RfcGF0aCkKICAgIGVsaWYgaXNpbnN0YW5jZShyZXN1bHQsIHN0cik6CiAgICAgICAgbWxydW5fbG9nX2FydGlmYWN0KHBhdGg9cmVzdWx0KQogICAgZWxzZToKICAgICAgICBtbHJ1bl9sb2dnZXIud2FybmluZyhmJ2NhbiBub3QgbG9nIGFydGlmYWN0cyB3aXRoIHRoZSByZXN1bHQgb2YgaGFuZGxlciBmdW5jdGlvbiAtIHJlc3VsdCBpbiB1bnN1cHBvcnRlZCB0eXBlLiB7dHlwZShyZXN1bHQpfScpCg==', 'original_handler': 'process_data', 'artifact_json_path': '/mlrun_databricks_runtime/artifacts_dictionaries/mlrun_artifact_a9c770f8377046bda3061e61a5c015c2.json'}
data_path=dbfs:///demos/mlrun_databricks_demo/1711553684480_33/music.parquet
data_output_path=/dbfs/demos/mlrun_databricks_demo/music_output_new.parquet
music_data
databricks_run_metadata
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-27 15:35:07,910 [info] Run execution finished: {'status': 'completed', 'name': 'process-data-process-data'}\n" + ] + } + ], + "source": [ + "for name, val in job_env.items():\n", + " process_data_function.spec.env.append({\"name\": name, \"value\": val})\n", + "params = {\n", + " \"task_parameters\": {\"timeout_minutes\": 15},\n", + " \"data_path\": dbfs_data_path,\n", + " \"data_output_path\": output_path.replace(\"dbfs://\", \"/dbfs\"),\n", + "}\n", + "run = process_data_function.run(\n", + " handler=\"process_data\",\n", + " params=params,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9a8db175-51f4-4218-afd1-752cc0e65216", + "metadata": { + "tags": [] + }, + "source": [ + "## Create an MLflow Xgboost function\n", + "\n", + "The following code demonstrates how to create a simple Xgboost model using MLflow and log the results.
\n", + "MLflow will log the model, parameters, metrics, and artifacts, and MLRun will track the run and collect the data." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "44a1e133-954d-47a3-9b0f-6e181fe12ea7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting training.py\n" + ] + } + ], + "source": [ + "%%writefile training.py\n", + "\n", + "import mlflow\n", + "import mlflow.xgboost\n", + "import xgboost as xgb\n", + "from mlflow import log_metric\n", + "from sklearn import datasets\n", + "from sklearn.metrics import accuracy_score, log_loss\n", + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "\n", + "def example_xgb_run(df: str):\n", + " df = pd.read_parquet(df)\n", + " \n", + " df = df.replace([\"f\", \"m\"], [0, 1])\n", + " df = df.replace([\"Pop\", \"Rock\", \"Classical\"], [0, 1, 2])\n", + " \n", + " # Prepare, train, and test data\n", + " y = df.pop('favorite_music_type')\n", + " X = df\n", + "\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + " )\n", + "\n", + " # Enable auto logging\n", + " mlflow.xgboost.autolog()\n", + "\n", + " dtrain = xgb.DMatrix(X_train, label=y_train)\n", + " dtest = xgb.DMatrix(X_test, label=y_test)\n", + "\n", + " with mlflow.start_run():\n", + " # Train model\n", + " params = {\n", + " \"objective\": \"multi:softprob\",\n", + " \"num_class\": 3,\n", + " \"learning_rate\": 0.3,\n", + " \"eval_metric\": \"mlogloss\",\n", + " \"colsample_bytree\": 1.0,\n", + " \"subsample\": 1.0,\n", + " \"seed\": 42,\n", + " }\n", + " model = xgb.train(params, dtrain, evals=[(dtrain, \"train\")])\n", + " \n", + " # Evaluate model\n", + " y_proba = model.predict(dtest)\n", + " y_pred = y_proba.argmax(axis=1)\n", + " loss = log_loss(y_test, y_proba)\n", + " acc = accuracy_score(y_test, y_pred)\n", + " \n", + " # Log metrics by hand\n", + " 
 mlflow.log_metrics({\"log_loss\": loss, \"accuracy\": acc})" + ] + }, + { + "cell_type": "markdown", + "id": "1cf984c9-78a9-443f-9465-111263101dcd", + "metadata": {}, + "source": [ + "## Log the data from MLflow in MLRun " + ] + }, + { + "cell_type": "markdown", + "id": "365e4b39-9f39-40ae-aac4-7c4f42bce9bd", + "metadata": {}, + "source": [ + "### Change the MLRun configuration to use the tracker\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0b194d04-e08f-4161-a65b-4f18d10fdbf0", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "\n", + "mlrun.mlconf.external_platform_tracking.enabled = True" + ] + }, + { + "cell_type": "markdown", + "id": "b16bb4db-8a2a-4453-a42e-0e8e74ab8f53", + "metadata": {}, + "source": [ + "These are the three options to run tracking:\n", + "- Set: `mlrun.mlconf.external_platform_tracking.mlflow.match_experiment_to_runtime` to True. This determines the run id and is the safest method\n", + "- Set the experiment name at: `mlflow.environment_variables.MLFLOW_EXPERIMENT_NAME.set`. This determines the experiment mlrun will track and find the run added to it.\n", + "- Just run it, mlrun will look across all experiments and search for the added run; this is not recommended."
+ ] + }, + { + "cell_type": "markdown", + "id": "8b7bc72a-bd1b-408a-afa8-e474d91c4a20", + "metadata": {}, + "source": [ + "### Create the mlrun function" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3382b909-a8dc-41a3-afb1-b64df9bb7318", + "metadata": {}, + "outputs": [], + "source": [ + "# Use the first run option from above\n", + "mlrun.mlconf.external_platform_tracking.mlflow.match_experiment_to_runtime = True\n", + "\n", + "# Create a MLRun function using the example train file (all the functions must be located in it):\n", + "training_func = project.set_function(\n", + " func=\"training.py\",\n", + " name=\"example-xgb-run\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "91597f57-364d-4d2a-b926-97b9d8afc81b", + "metadata": {}, + "source": [ + "### Run the function\n", + "\n", + "Run the function using MLRun. This will log the data from MLflow in MLRun.
\n", + "After running the function, you can look at the UI and see that all metrics and parameters are logged in MLRun." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5a726ca8-8057-41ed-be4e-35e5e0582de9", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun.feature_store as fstore\n", + "\n", + "feature_set = fstore.get_feature_set(\"music_fset\", \"mlflow-tracking-example\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4de1229a-cc59-4846-8473-3178e682efa6", + "metadata": {}, + "outputs": [], + "source": [ + "df = feature_set.to_dataframe()\n", + "df = df.drop(['id'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8249a933-031c-4f2e-88c2-161dd4cfb7ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# df = project.list_().to_objects()[0].to_dataitem().as_df()\n", + "df_path = \"./music.parquet\"\n", + "df.to_parquet(df_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "8ba452dd-1756-4bfb-af64-d741e234dba3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-27 15:37:22,829 [info] Storing function: {'name': 'example-xgb-run-example-xgb-run', 'uid': '6ff324dd21d64b6290d45a001957dda2', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-03-27 15:37:22,912 [warning] `mlconf.external_platform_tracking.mlflow.match_experiment_to_runtime` is set to True but the MLFlow experiment name environment variable ('MLFLOW_EXPERIMENT_NAME') is set for using the name: 'example-xgb-run-example-xgb-run'. 
This name will be overriden with MLRun's runtime name as set in the MLRun configuration: 'example-xgb-run-example-xgb-run'.\n", + "[0]\ttrain-mlogloss:0.82467\n", + "[1]\ttrain-mlogloss:0.64706\n", + "[2]\ttrain-mlogloss:0.52480\n", + "[3]\ttrain-mlogloss:0.43768\n", + "[4]\ttrain-mlogloss:0.37410\n", + "[5]\ttrain-mlogloss:0.32686\n", + "[6]\ttrain-mlogloss:0.29057\n", + "[7]\ttrain-mlogloss:0.26192\n", + "[8]\ttrain-mlogloss:0.23885\n", + "[9]\ttrain-mlogloss:0.22004\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/03/27 15:37:23 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlflow/types/utils.py:393: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", + "2024/03/27 15:37:23 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/xgboost/core.py:160: UserWarning: [15:37:23] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.\"\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
mlflow-tracking-example-guy0Mar 27 15:37:22completedexample-xgb-run-example-xgb-run
v3io_user=zeevr
kind=local
owner=zeevr
host=jupyter-zeevr-9f4ffb7bb-8c4mf
mlflow-user=iguazio
mlflow-run-name=stately-cow-437
mlflow-run-id=f66d6149d54c4958a2485c941d86a538
mlflow-experiment-id=608717337209571124
df
colsample_bytree=1.0
custom_metric=None
early_stopping_rounds=None
eval_metric=mlogloss
learning_rate=0.3
maximize=None
num_boost_round=10
num_class=3
objective=multi:softprob
seed=42
subsample=1.0
verbose_eval=True
accuracy=0.7142857142857143
log_loss=0.9622776094122579
train-mlogloss=0.2200447738170624
feature_importance_weight_json
feature_importance_weight_png
model
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-27 15:37:31,415 [info] Run execution finished: {'status': 'completed', 'name': 'example-xgb-run-example-xgb-run'}\n" + ] + } + ], + "source": [ + "# Run the example code using mlrun\n", + "train_run = training_func.run(\n", + " local=True,\n", + " handler=\"example_xgb_run\",\n", + " inputs={\"df\": df_path},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "655d5c46-2c0a-46f2-bbec-a58853260476", + "metadata": {}, + "source": [ + "### Examine the results\n", + "\n", + "You can examine the results using the UI or by looking at the outputs of the run.
\n", + "The outputs include the model, the metrics, and the artifacts, and are completely independent of MLflow." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d23beb02-e455-48dc-9d9f-9e3d4549ec71", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'accuracy': 0.7142857142857143,\n", + " 'log_loss': 0.9622776094122579,\n", + " 'train-mlogloss': 0.2200447738170624,\n", + " 'feature_importance_weight_json': 'store://artifacts/mlflow-tracking-example-guy/example-xgb-run-example-xgb-run_feature_importance_weight_json@6ff324dd21d64b6290d45a001957dda2',\n", + " 'feature_importance_weight_png': 'store://artifacts/mlflow-tracking-example-guy/example-xgb-run-example-xgb-run_feature_importance_weight_png@6ff324dd21d64b6290d45a001957dda2',\n", + " 'model': 'store://artifacts/mlflow-tracking-example-guy/example-xgb-run-example-xgb-run_model@6ff324dd21d64b6290d45a001957dda2'}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_run.outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "b05f4c2a-5f2d-4d7c-9c21-39c0a949cfc3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'accuracy': 0.7142857142857143,\n", + " 'log_loss': 0.9622776094122579,\n", + " 'train-mlogloss': 0.2200447738170624}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_run.status.results" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "925b3445-18b4-4497-9783-52b4cd069401", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcwAAAFZCAYAAAAVcB92AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVY0lEQVR4nO3debRsdXmn8efL5IBMAUKY5DqAiC1TR8UWBY3aGuzWXp0gCUFITCNqSExruzRtEofWoFnRGGyTEAfoaIiIkaB2KyTIjRhbBplEQAVBZkSmCwI28PYfex8pDnd4L9x7qrjn+ax1FrV37VP7V79DnefuXXWqUlVIkqSVW2/aA5Ak6dHAYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJa1VSV6X5M/X8G1elGT/5rZXJHnxatz2kUne/3DHpnWXwdRUjL/E7kpyx8TXdmvgNtu/GB+pJO9M8qmF2t/KJDksyRnTHsd8STYC3gH86Zq83ap6RlWd/khvJ8n+Sa6et/pvgIOT/PwjvX2tWwympuk/VNUTJr6uneZgkmwwzf0/XDM+7lcCl1TVNdMeSFdV3Q38H+A10x6LZovB1ExJslmSjye5Lsk1Sf5HkvXH656S5LQkP05yU5JPJ9l8vO5vgScCXxiPVt+6vKOHyaPQ8QjxxCSfSnI7cNjK9t8YeyV5Q5LvJVmW5D3jmP81ye1JThiPuH52ZJPkD8b7ckWSg+fNw/9K8qMkVyZ5R5L1xusOS/L1JB9K8mPgM8BfAc8d7/ut43YHJDl33PdVSd45cftLxvEemuSH4xj++8T1649ju2y8L+ck2XG8btckpya5OcmlSQ5cybS8HFg6cbvHJXnzeHn7cQxvnPj53jxxP1+R5Lwkt45zuPsKfo6PG2/3liQXjz/7+UeNeya5IMltST6T5LFJNmYI43bLOctxOnDAKn7kWmQMpmbNscC9wFOBvYCXAr89XhfgT4DtgKcDOwLvBKiqQ4Af8sBR6wea+3slcCKwOfDpVey/498D/xbYB3grcAzwG+NY/w3waxPb/gKwFbA9cChwTJKnjdcdDWwGPBnYj+Fo5zcnvvc5wOXANuPtHwF8Y7zvm4/b3Dl+3+YMv/xfn+RV88a7L/A04JeAP0ry9HH9fx3H+svApsBvAT8ZI3Mq8HfAzwMHAR9NstsK5uOZwKUTy0uB/cfL+4334QUTy1+rqvuT7AV8AngdsCXw18DJSR6znH38MbCEYa5eMs7HfAcCLwOeBOwOHFZVdzIE/drlnOW4GNhjBfdJi5TB1DSdNB493JrkpCTbMPyCflNV3VlVNwIfYvilTFV9v6pOrap7qupHwAcZfsk+Et+oqpOq6n6GMKxw/00fqKrbq+oi4NvAKVV1eVXdxnA0s9e87f9wvD9LgS8BB45HtAcBb6+qZVV1BfBnwCET33dtVR1dVfdW1V3LG0hVnV5VF1bV/VV1AXA8D52vd1XVXVV1PnA+D0Tit4F3VNWlNTi/qn4MvAK4oqo+Oe77XOBzwK+uYD42B5ZNLC8F9h2PIl8AfAB43njdfjxwNHo48NdV9c2quq+qjgPuYfiHyHwHAu+rqluq6mrgL5azzV9U1bVVdTPwBWDPFYx3zjKGf7BIPzPLz31o3feqqvqnuYUkzwY2BK5LMrd6PeCq8fptgA8Dzwc2Ga+75RGO4aqJyzutbP9NN0xcvms5y78wsXzLeJQz50qGo+etxnFcOe+67Vcw7uVK8hzgKIYj242AxwCfnbfZ9ROXfwI8Yby8I3DZcm52J+A5c6d9RxsAf7uCYdzC8LMCoKouS3InQ7CeD7wHeO14ZL0fD8RuJ+DQJEdO3NZGDPMz33Y8eD6WNzfz7+eqXmC2CXDbKrbRIuMRpmbJVQxHEVtV1ebj16ZV9Yzx+vcBBTyzqjZlOPWWie+vebd3J/D4uYXxyG3redtMfs+q9r+mbTGe4pzzROBa4Cbg/zFEY/K
6yRfOzL+v85dhOG16MrBjVW3G8DxnlrPd8lwFPGUF65dOzM/m46nM16/gdi4Adpm3binwK8BG44uBljKckt4COG9iP++dt5/HV9Xxy9nHdcAOE8s7du7gaHnzBsMp//NX43a0CBhMzYyqug44BfizJJsmWW98IcjcacRNgDuA25JsD/y3eTdxA8PzWHO+Czx2fPHLhgx/3rC858C6+18b3pVkoyTPZzjd+dmqug84AXhvkk2S7MTwnOLK/oTlBmCHuRcVjTYBbq6qu8ej919fjXF9DHhPkp0z2D3JlsAXgV2SHJJkw/HrWRPPfc73v3noaeClwO8A/zIunz4unzHedxj+tOOIJM8Z97/x+HPchIc6AXh7ki3G/y9+ZzXu5w3Alknmn37dj+EUuvQzBlOz5jUMp96+w3A670Rg2/G6dwF7M5wq+xLwD/O+90+Ad4zPib5lfN7wDQy//K9hOOKc/+rJ1dn/mnb9uI9rGV5wdERVXTJedyTDeC8HzmA4WvzESm7rNOAi4PokN43r3gC8O8ky4I8YwtL1wXH7U4DbgY8Dj6uqZQwvhDpoHPf1wPtZ8T9EvgDsmgf/je1ShpjPBfMMhjMBc8tU1dnAfwE+wjBH3wcOW8E+3s3wc/0B8E8MP7N7OndynO/jgcvH/2+2S/JYhueyj+vchhaPVK3ojISktSXDu9R8qqp2WMWmj3pJDgd2q6o3LdD+Xg8cVFUP68zA+LzpjlX11jU7Mj3a+aIfSWtVVR2zNm8/ybYMp+K/AewMvJnhyPRhqaqj19DQtI4xmJIe7TZi+DvNJwG3An8PfHSaA9K6yVOykiQ1+KIfSZIaZu6U7FZbbVVLliyZ9jAkSYvEOeecc1NVzf8b7YeYuWAuWbKEs88+e9rDkCQtEkmuXPVWnpKVJKnFYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqmLn3kr3wmttY8rYvTXsYkqQZdcVRB0xlvx5hSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNRhMSZIaDKYkSQ0GU5KkBoMpSVKDwZQkqcFgSpLUYDAlSWowmJIkNax2MJOclOScJBclOXxc99ok301yZpK/SfKRcf3WST6X5Kzx63lr+g5IkrQQNngY3/NbVXVzkscBZyX5EvCHwN7AMuA04Pxx2w8DH6qqM5I8EfgK8PQ1MG5JkhbUwwnm7yb5T+PlHYFDgKVVdTNAks8Cu4zXvxjYLcnc926a5AlVdcfkDY5HqocDrL/p1g9
jSJIkrV2rFcwk+zNE8LlV9ZMkpwOXsOKjxvWAfarq7pXdblUdAxwD8Jhtd67VGZMkSQthdZ/D3Ay4ZYzlrsA+wMbAfkm2SLIB8J8ntj8FOHJuIcmej3C8kiRNxeoG88vABkkuBo4C/i9wDfA+4Ezg68AVwG3j9r8L/GKSC5J8BzhiTQxakqSFtlqnZKvqHuDl89cnObuqjhmPMD8PnDRufxPw6jUwTkmSpmpN/R3mO5OcB3wb+AFjMCVJWlc8nFfJPkRVvWVN3I4kSbPKd/qRJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpIYNpj2A+Z65/WacfdQB0x6GJEkP4hGmJEkNBlOSpAaDKUlSg8GUJKnBYEqS1GAwJUlqMJiSJDUYTEmSGgymJEkNBlOSpAaDKUlSQ6pq2mN4kCTLgEunPY4ZsBVw07QHMQOcB+dgjvMwcB7W/BzsVFVbr2qjmXvzdeDSqvrFaQ9i2pKc7Tw4D+AczHEeBs7D9ObAU7KSJDUYTEmSGmYxmMdMewAzwnkYOA/OwRznYeA8TGkOZu5FP5IkzaJZPMKUJGnmGExJkhpmKphJXpbk0iTfT/K2aY9noST5RJIbk3x7Yt3PJTk1yffG/24xzTGubUl2TPLVJN9JclGS3xvXL7Z5eGySM5OcP87Du8b1T0ryzfGx8ZkkG017rGtbkvWTnJvki+PyYpyDK5JcmOS8JGeP6xbVYwIgyeZJTkxySZKLkzx3GvMwM8FMsj7wP4GXA7sBv5Zkt+mOasEcC7xs3rq3Af9cVTsD/zwur8vuBd5cVbsB+wBvHH/+i20e7gFeVFV7AHsCL0uyD/B+4ENV9VTgFuC10xvigvk94OKJ5cU4BwAvrKo9J/7ucLE9JgA+DHy5qnYF9mD4/2LB52Fmggk8G/h+VV1eVT8F/h545ZTHtCCq6l+Am+etfiVw3Hj5OOBVCzmmhVZV11XVt8bLyxgeENuz+OahquqOcXHD8auAFwEnjuvX+XlIsgNwAPCxcTkssjlYiUX1mEiyGfAC4OMAVfXTqrqVKczDLAVze+CqieWrx3WL1TZVdd14+Xpgm2kOZiElWQLsBXyTRTgP46nI84AbgVOBy4Bbq+recZPF8Nj4c+CtwP3j8pYsvjmA4R9LpyQ5J8nh47rF9ph4EvAj4JPjKfqPJdmYKczDLAVTK1DD3/4sir//SfIE4HPAm6rq9snrFss8VNV9VbUnsAPDmZddpzuihZXkFcCNVXXOtMcyA/atqr0Znqp
6Y5IXTF65SB4TGwB7A39ZVXsBdzLv9OtCzcMsBfMaYMeJ5R3GdYvVDUm2BRj/e+OUx7PWJdmQIZafrqp/GFcvunmYM552+irwXGDzJHPv/byuPzaeB/zHJFcwPDXzIobnsBbTHABQVdeM/70R+DzDP6AW22PiauDqqvrmuHwiQ0AXfB5mKZhnATuPr4TbCDgIOHnKY5qmk4FDx8uHAv84xbGsdeNzVB8HLq6qD05ctdjmYeskm4+XHwe8hOH53K8CvzJutk7PQ1W9vap2qKolDL8HTquqg1lEcwCQZOMkm8xdBl4KfJtF9pioquuBq5I8bVz1S8B3mMI8zNQ7/ST5ZYbnLtYHPlFV753uiBZGkuOB/Rk+suYG4I+Bk4ATgCcCVwIHVtX8FwatM5LsC3wNuJAHnrf6A4bnMRfTPOzO8AKG9Rn+QXtCVb07yZMZjrZ+DjgX+I2qumd6I10YSfYH3lJVr1hsczDe38+PixsAf1dV702yJYvoMQGQZE+GF4BtBFwO/Cbj44MFnIeZCqYkSbNqlk7JSpI0swymJEkNBlOSpAaDKUlSg8GUJKnBYEprUZI7Vr3VGt3fkiS/vpD7lBYLgymtI8Z3wVkCGExpLTCY0gJIsn+SpUn+McnlSY5KcvD42ZcXJnnKuN2xSf4qydlJvju+r+rc52R+ctz23CQvHNcfluTkJKcxfMTRUcDzx89P/P3xiPNrSb41fv27ifGcPvEZg58e322JJM9K8q8ZPpPzzCSbjG8I/6dJzkpyQZLXTWUipSnaYNWbSFpD9gCezvBRbpcDH6uqZ2f4sOwjgTeN2y1heM/QpwBfTfJU4I0M7zH9zCS7MnyCxS7j9nsDu1fVzZPvjAOQ5PHAS6rq7iQ7A8cDc5+ruBfwDOBa4OvA85KcCXwGeHVVnZVkU+Auhs+evK2qnpXkMcDXk5xSVT9Y89MkzSaDKS2cs+Y+jijJZcAp4/oLgRdObHdCVd0PfC/J5QyfVrIvcDRAVV2S5EpgLpinruQtwTYEPjK+tdh9E98DcGZVXT2O5zyGUN8GXFdVZ437un28/qXA7knm3st1M2BnwGBq0TCY0sKZfN/T+yeW7+fBj8X571e5qvevvHMl1/0+w/sT78HwFMzdKxjPfaz890GAI6vqK6sYi7TO8jlMafb8apL1xuc1nwxcyvDG9AcDjKdinziun28ZsMnE8mYMR4z3A4cwvKn7ylwKbJvkWeO+NhlfTPQV4PXjR7CRZJfxEzSkRcMjTGn2/BA4E9gUOGJ8/vGjwF8muRC4Fzisqu4ZX6cz6QLgviTnA8cCHwU+l+Q1wJdZ+dEoVfXTJK8Gjh4/Xuwu4MUMnxSxBPjW+OKgHwGvWgP3VXrU8NNKpBmS5Fjgi1V14rTHIunBPCUrSVKDR5iSJDV4hClJUoPBlCSpwWBKktRgMCVJajCYkiQ1GExJkhoMpiRJDQZTkqQGgylJUsP/BySEjToO/wa1AAAAAElFTkSuQmCC", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train_run.artifact(\"feature_importance_weight_png\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "227c4358-4c34-4d1c-acb4-e37ca110b8bf", + "metadata": {}, + "source": [ + "### You can also examine the results using the UI" + ] + }, + { + "cell_type": "markdown", + "id": "dde00fd1-a1f0-4c56-80c2-c5d36a9062a1", + "metadata": {}, + "source": [ + "Look at collected artifacts: " + 
] + }, + { + "attachments": { + "95b9b198-55c9-4a67-b0bf-103c9ae0272e.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "8cda6c13-7fee-4284-aacf-81a506a426da", + "metadata": {}, + "source": [ + "![image.png](attachment:95b9b198-55c9-4a67-b0bf-103c9ae0272e.png)" + ] + }, + { + "cell_type": "markdown", + "id": "e1525230-e10c-4f48-b951-bc73642bb3e4", + "metadata": {}, + "source": [ + "And at results:" + ] + }, + { + "attachments": { + "66422f79-9b46-4e07-9796-c1b350c26c9c.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "217279f8-6af1-4209-b0ec-3d3d829ceed9", + "metadata": {}, + "source": [ + "![image.png](attachment:66422f79-9b46-4e07-9796-c1b350c26c9c.png)" + ] + }, + { + "cell_type": "markdown", + "id": "844edc05-0b6a-4e84-9213-1d3cbf6f833e", + "metadata": {}, + "source": [ + "## Use the function for model serving" + ] + }, + { + "cell_type": "markdown", + "id": "40182a6f-fc46-4a33-a7f5-7ee8ee171966", + "metadata": {}, + "source": [ + "### Create the server and serving function\n", + "\n", + "Create a serving function that uses the model from the previous run and serves it using MLRun.
\n", + "We will create a mock server to test the model in a local environment." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "f5fe910b-e177-4af7-84de-41a571d1774c", + "metadata": {}, + "outputs": [], + "source": [ + "serving_func = project.set_function(\n", + " func=\"function.yaml\",\n", + " name=\"example-xgb-server\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "ddbfd48f-a90e-4fe6-9caa-ddffeacf63d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Add the model\n", + "serving_func.add_model(\n", + " \"mlflow_xgb_model\",\n", + " class_name=\"MLFlowModelServer\",\n", + " model_path=train_run.outputs[\"model\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2298d111-2f53-4b84-be9e-e4e8a228dcc4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-27 15:37:31,627 [info] model mlflow_xgb_model was loaded\n", + "> 2024-03-27 15:37:31,628 [info] Loaded ['mlflow_xgb_model']\n" + ] + } + ], + "source": [ + "# Create a mock server\n", + "server = serving_func.to_mock_server()" + ] + }, + { + "cell_type": "markdown", + "id": "f54d7c06-4972-4881-9bc9-fba7db0adbe4", + "metadata": {}, + "source": [ + "### Test the model " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "4f256490-f225-4bd6-ac8a-5fc12a0f335d", + "metadata": {}, + "outputs": [], + "source": [ + "# An example taken randomly \n", + "result = server.test(\"/v2/models/mlflow_xgb_model/predict\", {\"inputs\":[{\"age\": 20, \"gender\": 0}]})" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "47839f4b-bb2d-4341-99c5-e34fa31270c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '43a61d06f2694fa695bdd6561b487131',\n", + " 'model_name': 'mlflow_xgb_model',\n", + " 
 'outputs': [[0.9242361187934875, 0.0418272465467453, 0.033936627209186554]]}" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Look at the result, it shows the probability of the given example to be each of the \n", + "# music types (Pop, Rock, Classical) featured in the dataset\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "d4fc6c73-0963-4814-bd5f-2d27b464823e", + "metadata": {}, + "source": [ + "We predicted that a 20 year old female would like pop!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/mlflow_utils/1.1.0/src/mlflow_utils.py b/functions/master/mlflow_utils/1.1.0/src/mlflow_utils.py new file mode 100644 index 00000000..fb6124be --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/src/mlflow_utils.py @@ -0,0 +1,45 @@ +import zipfile +from typing import Any, Dict +import mlflow +from mlrun.serving.v2_serving import V2ModelServer +import pandas as pd + + +class MLFlowModelServer(V2ModelServer): + """ + MLFlow tracker Model serving class, inheriting the V2ModelServer class for being initialized automatically by the model + server and be able to run locally as part of a nuclio serverless function, or as part of a real-time pipeline.
+ """ + + def load(self): + """ + loads a model that was logged by the MLFlow tracker model + """ + # Unzip the model dir and then use mlflow's load function + model_file, _ = self.get_model(".zip") + model_path_unzip = model_file.replace(".zip", "") + + with zipfile.ZipFile(model_file, "r") as zip_ref: + zip_ref.extractall(model_path_unzip) + + self.model = mlflow.pyfunc.load_model(model_path_unzip) + + def predict(self, request: Dict[str, Any]) -> list: + """ + Infer the inputs through the model. The inferred data will + be read from the "inputs" key of the request. + + :param request: The request to the model using xgboost's predict. + The input to the model will be read from the "inputs" key. + + :return: The model's prediction on the given input. + """ + + # Get the inputs and set to accepted type: + inputs = pd.DataFrame(request["inputs"]) + + # Predict using the model's predict function: + predictions = self.model.predict(inputs) + + # Return as list: + return predictions.tolist() diff --git a/functions/master/mlflow_utils/1.1.0/src/requirements.txt b/functions/master/mlflow_utils/1.1.0/src/requirements.txt new file mode 100644 index 00000000..2a40b1a8 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/src/requirements.txt @@ -0,0 +1,3 @@ +mlflow==2.20.2 +lightgbm +xgboost diff --git a/functions/master/mlflow_utils/1.1.0/src/test_mlflow_utils.py b/functions/master/mlflow_utils/1.1.0/src/test_mlflow_utils.py new file mode 100644 index 00000000..70d6ce03 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/src/test_mlflow_utils.py @@ -0,0 +1,179 @@ +# Copyright 2018 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import tempfile + +import lightgbm as lgb +import mlflow +import mlflow.environment_variables +import mlflow.xgboost +import pytest +import xgboost as xgb +from sklearn import datasets +from sklearn.metrics import accuracy_score, log_loss +from sklearn.model_selection import train_test_split + +import os +# os.environ["MLRUN_IGNORE_ENV_FILE"] = "True" #TODO remove before push + +import mlrun +import mlrun.launcher.local +# Important: +# unlike mlconf which resets back to default after each test run, the mlflow configurations +# and env vars don't, so at the end of each test we need to redo anything we set in that test. +# what we cover in these tests: logging "regular" runs with, experiment name, run id and context +# name (last two using mlconf), failing run mid-way, and a run with no handler. +# we also test here importing of runs, artifacts and models from a previous run. 
+ +# simple mlflow example of lgb logging +def lgb_run(): + # prepare train and test data + iris = datasets.load_iris() + X = iris.data + y = iris.target + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + # enable auto logging + mlflow.lightgbm.autolog() + + train_set = lgb.Dataset(X_train, label=y_train) + + with mlflow.start_run(): + # train model + params = { + "objective": "multiclass", + "num_class": 3, + "learning_rate": 0.1, + "metric": "multi_logloss", + "colsample_bytree": 1.0, + "subsample": 1.0, + "seed": 42, + } + # model and training data are being logged automatically + model = lgb.train( + params, + train_set, + num_boost_round=10, + valid_sets=[train_set], + valid_names=["train"], + ) + + # evaluate model + y_proba = model.predict(X_test) + y_pred = y_proba.argmax(axis=1) + loss = log_loss(y_test, y_proba) + acc = accuracy_score(y_test, y_pred) + + # log metrics + mlflow.log_metrics({"log_loss": loss, "accuracy": acc}) + + +# simple mlflow example of xgb logging +def xgb_run(): + # prepare train and test data + iris = datasets.load_iris() + x = iris.data + y = iris.target + x_train, x_test, y_train, y_test = train_test_split( + x, y, test_size=0.2, random_state=42 + ) + + # enable auto logging + mlflow.xgboost.autolog() + + dtrain = xgb.DMatrix(x_train, label=y_train) + dtest = xgb.DMatrix(x_test, label=y_test) + + with mlflow.start_run(): + # train model + params = { + "objective": "multi:softprob", + "num_class": 3, + "learning_rate": 0.3, + "eval_metric": "mlogloss", + "colsample_bytree": 1.0, + "subsample": 1.0, + "seed": 42, + } + # model and training data are being logged automatically + model = xgb.train(params, dtrain, evals=[(dtrain, "train")]) + # evaluate model + y_proba = model.predict(dtest) + y_pred = y_proba.argmax(axis=1) + loss = log_loss(y_test, y_proba) + acc = accuracy_score(y_test, y_pred) + # log metrics + mlflow.log_metrics({"log_loss": loss, "accuracy": acc}) + + 
+@pytest.mark.parametrize("handler", ["xgb_run", "lgb_run"]) +def test_track_run_with_experiment_name(handler): + """ + This test is for tracking a run logged by mlflow into mlrun while it's running using the experiment name. + first activate the tracking option in mlconf, then we name the mlflow experiment, + then we run some code that is being logged by mlflow using mlrun, + and finally compare the mlrun we tracked with the original mlflow run using the validate func + """ + # Enable general tracking + mlrun.mlconf.external_platform_tracking.enabled = True + # Set the mlflow experiment name + mlflow.environment_variables.MLFLOW_EXPERIMENT_NAME.set(f"{handler}_test_track") + with tempfile.TemporaryDirectory() as test_directory: + mlflow.set_tracking_uri(test_directory) # Tell mlflow where to save logged data + + # Create a project for this tester: + project = mlrun.get_or_create_project(name="default", context=test_directory) + + # Create a MLRun function using the tester source file (all the functions must be located in it): + func = project.set_function( + func=__file__, + name=f"{handler}-test", + kind="job", + image="mlrun/mlrun", + requirements=["mlflow"], + ) + # mlflow creates a dir to log the run, this makes it in the tmpdir we create + trainer_run = func.run( + local=True, + handler=handler, + artifact_path=test_directory, + ) + + serving_func = project.set_function( + func=os.path.abspath("function.yaml"), + name=f"{handler}-server", + ) + model_name = f"{handler}-model" + # Add the model + upper_handler = handler.replace("_", "-") + model_path = test_directory + f"/{upper_handler}-test-{upper_handler}/0/model/" + serving_func.add_model( + model_name, + class_name="MLFlowModelServer", + model_path=model_path, + ) + + # Create a mock server + server = serving_func.to_mock_server() + + # An example taken randomly + result = server.test(f"/v2/models/{model_name}/predict", {"inputs": [[5.1, 3.5, 1.4, 0.2]]}) + print(result) + assert result + # unset mlflow 
experiment name to default + mlflow.environment_variables.MLFLOW_EXPERIMENT_NAME.unset() + + diff --git a/functions/master/mlflow_utils/1.1.0/static/documentation.html b/functions/master/mlflow_utils/1.1.0/static/documentation.html new file mode 100644 index 00000000..ff93711e --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/static/documentation.html @@ -0,0 +1,271 @@ + + + + + + + +mlflow_utils package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

mlflow_utils package#

+
+

Submodules#

+
+
+

mlflow_utils.mlflow_utils module#

+
+
+class mlflow_utils.mlflow_utils.MLFlowModelServer(context=None, name: str | None = None, model_path: str | None = None, model=None, protocol=None, input_path: str | None = None, result_path: str | None = None, shard_by_endpoint: bool | None = None, **kwargs)[source]#
+

Bases: V2ModelServer

+

MLFlow tracker Model serving class, inheriting the V2ModelServer class for being initialized automatically by the model +server and be able to run locally as part of a nuclio serverless function, or as part of a real-time pipeline.

+
+
+load()[source]#
+

loads a model that was logged by the MLFlow tracker model

+
+
+
+predict(request: Dict[str, Any]) list[source]#
+

Infer the inputs through the model. The inferred data will +be read from the “inputs” key of the request.

+
+
Parameters:
+

request – The request to the model using xgboost’s predict. +The input to the model will be read from the “inputs” key.

+
+
Returns:
+

The model’s prediction on the given input.

+
+
+
+
+
+
+

Module contents#

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/mlflow_utils/1.1.0/static/example.html b/functions/master/mlflow_utils/1.1.0/static/example.html new file mode 100644 index 00000000..72f59a9b --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/static/example.html @@ -0,0 +1,1194 @@ + + + + + + + +MLflow tracker demo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

MLflow tracker demo#

+

This demo demonstrates how to seamlessly integrate and transfer logs from MLflow to MLRun, +creating a unified and powerful platform for your machine learning experiments.

+

You can combine MLflow and MLRun for a comprehensive solution for managing, tracking, and deploying machine learning models.

+

This notebook guides you through the process of:

+
    +
  1. Setting up the integration between MLflow and MLRun.

  2. +
  3. Extracting data, metrics, and artifacts from MLflow experiments.

  4. +
  5. Creating MLRun artifacts and projects to organize and manage the transferred data.

  6. +
  7. Leveraging MLRun’s capabilities for model deployment and data processing.

  8. +
+

By the end of this demo, you will have an understanding of how to establish a smooth flow of data between MLflow and MLRun.

+
+

MLRun installation and configuration#

+

Before running this notebook, make sure the mlrun package is installed (pip install mlrun) and that you have configured access to the MLRun service.

+
+
+
# Install MLRun and scikit-learn if not already installed. Run this only once. Restart the notebook after the install!
+# %pip install mlrun scikit-learn~=1.3.0
+
+
+
+
+

Then you can import the necessary packages.

+
+
+
import pandas as pd
+import os
+import mlrun
+from mlrun.datastore.targets import ParquetTarget
+import mlrun.feature_store as fstore
+
+
+
+
+

Create a project for this demo:

+
+
+
# Create a project for this demo:
+project = mlrun.get_or_create_project(name="mlflow-tracking-example", context="./")
+
+
+
+
+
> 2024-03-27 15:34:40,940 [info] Project loaded successfully: {'project_name': 'mlflow-tracking-example-guy'}
+
+
+
+
+

Set all the necessary environment variables for the Databricks cluster:

+
+
+
DATABRICKS_HOST="add your host"
+DATABRICKS_TOKEN="add your token"
+DATABRICKS_CLUSTER_ID="add your cluster id"
+
+
+
+
+
+
+
os.environ["DATABRICKS_HOST"] = DATABRICKS_HOST
+os.environ["DATABRICKS_TOKEN"] = DATABRICKS_TOKEN
+
+
+
+
+
+
+
# Set the Databricks environment variables
+job_env = {
+    "DATABRICKS_HOST": DATABRICKS_HOST,
+    "DATABRICKS_CLUSTER_ID": DATABRICKS_CLUSTER_ID
+}
+secrets = {"DATABRICKS_TOKEN": DATABRICKS_TOKEN}
+
+# Set the secrets in the project
+project.set_secrets(secrets)
+
+
+
+
+
+
+

Create a feature set and ingest data#

+

This is a short example of how to create a feature set about music preferences.

+
+
+
# create df
+columns = ["id", "name", "age", "gender", "favorite_music_type"]
+data = [
+    (1, "Alice", 20, "f", "Pop"),
+    (2, "Bob", 30, "m", "Rock"),
+    (3, "Charlie", 25, "m", "Pop"),
+    (4, "David", 40, "m", "Classical"),
+    (5, "Eva", 18, "f", "Pop"),
+    (6, "Frank", 32, "m", "Rock"),
+    (7, "Grace", 28, "f", "Pop"),
+    (8, "Henry", 45, "m", "Classical"),
+    (9, "Ivy", 22, "f", "Pop"),
+    (10, "Jack", 38, "m", "Classical"),
+    (11, "Karen", 27, "f", "Pop"),
+    (12, "Liam", 19, "m", "Pop"),
+    (13, "Mia", 27, "f", "Rock"),
+    (14, "Nora", 31, "f", "Rock"),
+    (15, "Oliver", 29, "m", "Pop"),
+    (16, "Ben", 38, "m", "Pop"),
+    (17, "Alicia", 20, "f", "Pop"),
+    (18, "Bobby", 30, "m", "Rock"),
+    (19, "Charlien", 22, "f", "Pop"),
+    (20, "Davide", 40, "m", "Classical"),
+    (21, "Evans", 19, "m", "Pop"),
+    (22, "Franklin", 34, "m", "Rock"),
+    (23, "Grace", 22, "f", "Pop"),
+    (24, "Henrik", 48, "m", "Classical"),
+    (25, "eevee", 29, "f", "Pop"),
+    (26, "Jack", 75, "m", "Classical"),
+    (27, "Karen", 26, "f", "Pop"),
+    (28, "Lian", 21, "f", "Pop"),
+    (29, "kia", 27, "f", "Rock"),
+    (30, "Novak", 30, "m", "Rock"),
+    (31, "Olivia", 29, "f", "Pop"),
+    (32, "Benjamin", 18, "m", "Pop")
+]
+df = pd.DataFrame(data, columns=columns)
+
+
+
+
+

Transfer the data to DataBricks.

+
+
+
# Where to save the data in DataBricks
+target_path = f"dbfs:///demos/mlrun_databricks_demo/music.parquet"
+output_path = f"dbfs:///demos/mlrun_databricks_demo/music_output_new.parquet"
+
+targets = [ParquetTarget(path=target_path)]
+
+# Create a feature set and ingest the data
+fset = fstore.FeatureSet(name="music_fset", entities=[fstore.Entity("name")])
+fstore.ingest(fset, df, targets=targets, overwrite=True)
+
+# Get the target path and check it
+dbfs_data_path = fset.get_target_path()
+dbfs_data_path
+
+
+
+
+
'dbfs:///demos/mlrun_databricks_demo/1711553684480_33/music.parquet'
+
+
+
+
+

We can look and see how our data is logged in the DataBricks cluster: +(only top 20 rows)

+

image.png

+
+
+

Create a data processing function#

+

The following code demonstrates how to create a simple data processing function using MLRun. +The function will process the data and show some statistics.

+
+
+
%%writefile process_data.py
+
+
+#  Here is an example of Spark processing.
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import avg, min, max
+import pandas as pd
+import json
+import fsspec
+
+def process_data(data_path: str, data_output_path: str):
+    spark = SparkSession.builder.appName("MusicDemo").getOrCreate()
+    spark_df = spark.read.parquet(data_path, header=True)
+    spark_df = spark_df.drop("name", "id")
+    
+    music_stats = spark_df.groupBy("favorite_music_type").agg(
+        avg("age").alias("avg_age"),
+        min("age").alias("min_age"),
+        max("age").alias("max_age")
+    )
+    music_stats.show()
+    pandas_df = spark_df.toPandas()
+    pandas_df.to_parquet(data_output_path)
+    # spark_df.write.mode("overwrite").parquet(data_output_path)
+
+    return {"music_data": data_output_path}
+
+
+
+
+
+
+
process_data_function = project.set_function(
+    func="./zeev-demos/mlflow-databricks/process_data.py",
+    name="process-data",
+    kind="databricks",
+    image="mlrun/mlrun",
+)
+                                
+
+
+
+
+

Set all parameters necessary for the function and run it.

+
+
+
for name, val in job_env.items():
+    process_data_function.spec.env.append({"name": name, "value": val})
+params = {
+    "task_parameters": {"timeout_minutes": 15},
+    "data_path": dbfs_data_path,
+    "data_output_path": output_path.replace("dbfs://", "/dbfs"),
+}
+run = process_data_function.run(
+    handler="process_data",
+    params=params,
+)
+
+
+
+
+
> 2024-03-27 15:34:45,422 [info] Storing function: {'name': 'process-data-process-data', 'uid': 'a9c770f8377046bda3061e61a5c015c2', 'db': 'http://mlrun-api:8080'}
+> 2024-03-27 15:34:45,675 [info] Job is running in the background, pod: process-data-process-data-89bhh
+> 2024-03-27 15:34:49,272 [info] Running with an existing cluster: {'cluster_id': '0327-134616-43m7kfxk'}
+> 2024-03-27 15:34:49,492 [info] Starting to poll: 493449112310004
+> 2024-03-27 15:34:49,539 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.PENDING
+> 2024-03-27 15:34:50,947 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.PENDING
+> 2024-03-27 15:34:53,063 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.RUNNING
+> 2024-03-27 15:34:56,737 [info] Workflow intermediate status: mlrun_task__15_34_48_703046: RunLifeCycleState.RUNNING
+> 2024-03-27 15:35:00,947 [info] Artifacts found. Run name: mlrun_task__15_34_48_703046
+> 2024-03-27 15:35:01,881 [info] Job finished: https://dbc-94c947ab-feb9.cloud.databricks.com/?o=4658245941722457#job/499259196347814/run/493449112310004
+> 2024-03-27 15:35:01,881 [info] Logs:
++-------------------+------------------+-------+-------+
+|favorite_music_type|           avg_age|min_age|max_age|
++-------------------+------------------+-------+-------+
+|               Rock|            30.125|     27|     34|
+|          Classical|47.666666666666664|     38|     75|
+|                Pop|              24.0|     18|     38|
++-------------------+------------------+-------+-------+
+
+2024-03-27 15:34:54,980 - mlrun_logger - INFO - successfully wrote artifact details to the artifact JSON file in DBFS - music_data : /dbfs/demos/mlrun_databricks_demo/music_output_new.parquet
+> 2024-03-27 15:35:02,182 [info] To track results use the CLI: {'info_cmd': 'mlrun get run a9c770f8377046bda3061e61a5c015c2 -p mlflow-tracking-example-guy', 'logs_cmd': 'mlrun logs a9c770f8377046bda3061e61a5c015c2 -p mlflow-tracking-example-guy'}
+> 2024-03-27 15:35:02,182 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.llm-dev.iguazio-cd1.com/mlprojects/mlflow-tracking-example-guy/jobs/monitor/a9c770f8377046bda3061e61a5c015c2/overview'}
+> 2024-03-27 15:35:02,182 [info] Run execution finished: {'status': 'completed', 'name': 'process-data-process-data'}
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
mlflow-tracking-example-guy0Mar 27 15:34:48completedprocess-data-process-data
v3io_user=zeevr
kind=databricks
owner=zeevr
mlrun/client_version=1.6.1
mlrun/client_python_version=3.9.16
host=process-data-process-data-89bhh
task_parameters={'timeout_minutes': 15, 'spark_app_code': 'IAoKaW1wb3J0IG9zCmltcG9ydCBsb2dnaW5nCm1scnVuX2xvZ2dlciA9IGxvZ2dpbmcuZ2V0TG9nZ2VyKCdtbHJ1bl9sb2dnZXInKQptbHJ1bl9sb2dnZXIuc2V0TGV2ZWwobG9nZ2luZy5ERUJVRykKCm1scnVuX2NvbnNvbGVfaGFuZGxlciA9IGxvZ2dpbmcuU3RyZWFtSGFuZGxlcigpCm1scnVuX2NvbnNvbGVfaGFuZGxlci5zZXRMZXZlbChsb2dnaW5nLkRFQlVHKQptbHJ1bl9mb3JtYXR0ZXIgPSBsb2dnaW5nLkZvcm1hdHRlcignJShhc2N0aW1lKXMgLSAlKG5hbWUpcyAtICUobGV2ZWxuYW1lKXMgLSAlKG1lc3NhZ2UpcycpCm1scnVuX2NvbnNvbGVfaGFuZGxlci5zZXRGb3JtYXR0ZXIobWxydW5fZm9ybWF0dGVyKQptbHJ1bl9sb2dnZXIuYWRkSGFuZGxlcihtbHJ1bl9jb25zb2xlX2hhbmRsZXIpCgptbHJ1bl9kZWZhdWx0X2FydGlmYWN0X3RlbXBsYXRlID0gJ21scnVuX3JldHVybl92YWx1ZV8nCm1scnVuX2FydGlmYWN0X2luZGV4ID0gMAoKCmRlZiBtbHJ1bl9sb2dfYXJ0aWZhY3QobmFtZT0nJywgcGF0aD0nJyk6CiAgICBnbG9iYWwgbWxydW5fYXJ0aWZhY3RfaW5kZXgKICAgIG1scnVuX2FydGlmYWN0X2luZGV4Kz0xICAjICBieSBob3cgbWFueSBhcnRpZmFjdHMgd2UgdHJpZWQgdG8gbG9nLCBub3QgaG93IG1hbnkgc3VjY2VlZC4KICAgIGlmIG5hbWUgaXMgTm9uZSBvciBuYW1lID09ICcnOgogICAgICAgIG5hbWUgPSBmJ3ttbHJ1bl9kZWZhdWx0X2FydGlmYWN0X3RlbXBsYXRlfXttbHJ1bl9hcnRpZmFjdF9pbmRleH0nCiAgICBpZiBub3QgcGF0aDoKICAgICAgICBtbHJ1bl9sb2dnZXIuZXJyb3IoZidwYXRoIHJlcXVpcmVkIGZvciBsb2dnaW5nIGFuIG1scnVuIGFydGlmYWN0IC0ge25hbWV9IDoge3BhdGh9JykKICAgICAgICByZXR1cm4KICAgIGlmIG5vdCBpc2luc3RhbmNlKG5hbWUsIHN0cikgb3Igbm90IGlzaW5zdGFuY2UocGF0aCwgc3RyKToKICAgICAgICBtbHJ1bl9sb2dnZXIuZXJyb3IoZiduYW1lIGFuZCBwYXRoIG11c3QgYmUgaW4gc3RyaW5nIHR5cGUgZm9yIGxvZ2dpbmcgYW4gbWxydW4gYXJ0aWZhY3QgLSB7bmFtZX0gOiB7cGF0aH0nKQogICAgICAgIHJldHVybgogICAgaWYgbm90IHBhdGguc3RhcnRzd2l0aCgnL2RiZnMnKSBhbmQgbm90IHBhdGguc3RhcnRzd2l0aCgnZGJmczovJyk6CiAgICAgICAgbWxydW5fbG9nZ2VyLmVycm9yKGYncGF0aCBmb3IgYW4gbWxydW4gYXJ0aWZhY3QgbXVzdCBzdGFydCB3aXRoIC9kYmZzIG9yIGRiZnM6LyAtIHtuYW1lfSA6IHtwYXRofScpCiAgICAgICAgcmV0dXJuCiAgICBtbHJ1bl9hcnRpZmFjdHNfcGF0aCA9ICcvZGJmcy9tbHJ1bl9kYXRhYnJpY2tzX3J1bnRpbWUvYXJ0aWZhY3RzX2RpY3Rpb25hcmllcy9tbHJ1bl9hcnRpZmFjdF9hOWM3NzBmODM3NzA0NmJkYTMwNjFlNjFhNWMwMTVjMi5qc29uJwogICAgdHJ5OgogICAgICAgIG5ld19kYXRhID0ge25hbWU6cGF0aH0KICAgICAgI
CBpZiBvcy5wYXRoLmV4aXN0cyhtbHJ1bl9hcnRpZmFjdHNfcGF0aCk6CiAgICAgICAgICAgIHdpdGggb3BlbihtbHJ1bl9hcnRpZmFjdHNfcGF0aCwgJ3IrJykgYXMganNvbl9maWxlOgogICAgICAgICAgICAgICAgZXhpc3RpbmdfZGF0YSA9IGpzb24ubG9hZChqc29uX2ZpbGUpCiAgICAgICAgICAgICAgICBleGlzdGluZ19kYXRhLnVwZGF0ZShuZXdfZGF0YSkKICAgICAgICAgICAgICAgIGpzb25fZmlsZS5zZWVrKDApCiAgICAgICAgICAgICAgICBqc29uLmR1bXAoZXhpc3RpbmdfZGF0YSwganNvbl9maWxlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHBhcmVudF9kaXIgPSBvcy5wYXRoLmRpcm5hbWUobWxydW5fYXJ0aWZhY3RzX3BhdGgpCiAgICAgICAgICAgIGlmIHBhcmVudF9kaXIgIT0gJy9kYmZzJzoKICAgICAgICAgICAgICAgIG9zLm1ha2VkaXJzKHBhcmVudF9kaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgICAgIHdpdGggb3BlbihtbHJ1bl9hcnRpZmFjdHNfcGF0aCwgJ3cnKSBhcyBqc29uX2ZpbGU6CiAgICAgICAgICAgICAgICBqc29uLmR1bXAobmV3X2RhdGEsIGpzb25fZmlsZSkKICAgICAgICBzdWNjZXNzX2xvZyA9IGYnc3VjY2Vzc2Z1bGx5IHdyb3RlIGFydGlmYWN0IGRldGFpbHMgdG8gdGhlIGFydGlmYWN0IEpTT04gZmlsZSBpbiBEQkZTIC0ge25hbWV9IDoge3BhdGh9JwogICAgICAgIG1scnVuX2xvZ2dlci5pbmZvKHN1Y2Nlc3NfbG9nKQogICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyB1bmtub3duX2V4Y2VwdGlvbjoKICAgICAgICBtbHJ1bl9sb2dnZXIuZXJyb3IoZidsb2cgbWxydW4gYXJ0aWZhY3QgZmFpbGVkIC0ge25hbWV9IDoge3BhdGh9LiBlcnJvcjoge3Vua25vd25fZXhjZXB0aW9ufScpCgoKCgppbXBvcnQgYXJncGFyc2UKaW1wb3J0IGpzb24KcGFyc2VyID0gYXJncGFyc2UuQXJndW1lbnRQYXJzZXIoKQpwYXJzZXIuYWRkX2FyZ3VtZW50KCdoYW5kbGVyX2FyZ3VtZW50cycpCmhhbmRsZXJfYXJndW1lbnRzID0gcGFyc2VyLnBhcnNlX2FyZ3MoKS5oYW5kbGVyX2FyZ3VtZW50cwpoYW5kbGVyX2FyZ3VtZW50cyA9IGpzb24ubG9hZHMoaGFuZGxlcl9hcmd1bWVudHMpCgoKZnJvbSBweXNwYXJrLnNxbCBpbXBvcnQgU3BhcmtTZXNzaW9uCmZyb20gcHlzcGFyay5zcWwuZnVuY3Rpb25zIGltcG9ydCBhdmcsIG1pbiwgbWF4CmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IGpzb24KaW1wb3J0IGZzc3BlYwoKZGVmIHByb2Nlc3NfZGF0YShkYXRhX3BhdGg6IHN0ciwgZGF0YV9vdXRwdXRfcGF0aDogc3RyKToKICAgIHNwYXJrID0gU3BhcmtTZXNzaW9uLmJ1aWxkZXIuYXBwTmFtZSgnTXVzaWNEZW1vJykuZ2V0T3JDcmVhdGUoKQogICAgc3BhcmtfZGYgPSBzcGFyay5yZWFkLnBhcnF1ZXQoZGF0YV9wYXRoLCBoZWFkZXI9VHJ1ZSkKICAgIHNwYXJrX2RmID0gc3BhcmtfZGYuZHJvcCgnbmFtZScsICdpZCcpCiAgICBtdXNpY19zdGF0cyA9IHNwYXJrX2RmLmdyb3VwQnkoJ2Zhdm9yaXRlX211c2ljX3R5c
GUnKS5hZ2coYXZnKCdhZ2UnKS5hbGlhcygnYXZnX2FnZScpLCBtaW4oJ2FnZScpLmFsaWFzKCdtaW5fYWdlJyksIG1heCgnYWdlJykuYWxpYXMoJ21heF9hZ2UnKSkKICAgIG11c2ljX3N0YXRzLnNob3coKQogICAgcGFuZGFzX2RmID0gc3BhcmtfZGYudG9QYW5kYXMoKQogICAgcGFuZGFzX2RmLnRvX3BhcnF1ZXQoZGF0YV9vdXRwdXRfcGF0aCkKICAgIHJldHVybiB7J211c2ljX2RhdGEnOiBkYXRhX291dHB1dF9wYXRofQpyZXN1bHQgPSBwcm9jZXNzX2RhdGEoKipoYW5kbGVyX2FyZ3VtZW50cykKCgppZiByZXN1bHQ6CiAgICBpZiBpc2luc3RhbmNlKHJlc3VsdCwgZGljdCk6CiAgICAgICAgZm9yIGtleSwgcGF0aCBpbiByZXN1bHQuaXRlbXMoKToKICAgICAgICAgICAgbWxydW5fbG9nX2FydGlmYWN0KG5hbWU9a2V5LCBwYXRoPXBhdGgpCiAgICBlbGlmIGlzaW5zdGFuY2UocmVzdWx0LCAobGlzdCwgdHVwbGUsIHNldCkpOgogICAgICAgIGZvciBhcnRpZmFjdF9wYXRoIGluIHJlc3VsdDoKICAgICAgICAgICAgbWxydW5fbG9nX2FydGlmYWN0KHBhdGg9YXJ0aWZhY3RfcGF0aCkKICAgIGVsaWYgaXNpbnN0YW5jZShyZXN1bHQsIHN0cik6CiAgICAgICAgbWxydW5fbG9nX2FydGlmYWN0KHBhdGg9cmVzdWx0KQogICAgZWxzZToKICAgICAgICBtbHJ1bl9sb2dnZXIud2FybmluZyhmJ2NhbiBub3QgbG9nIGFydGlmYWN0cyB3aXRoIHRoZSByZXN1bHQgb2YgaGFuZGxlciBmdW5jdGlvbiAtIHJlc3VsdCBpbiB1bnN1cHBvcnRlZCB0eXBlLiB7dHlwZShyZXN1bHQpfScpCg==', 'original_handler': 'process_data', 'artifact_json_path': '/mlrun_databricks_runtime/artifacts_dictionaries/mlrun_artifact_a9c770f8377046bda3061e61a5c015c2.json'}
data_path=dbfs:///demos/mlrun_databricks_demo/1711553684480_33/music.parquet
data_output_path=/dbfs/demos/mlrun_databricks_demo/music_output_new.parquet
music_data
databricks_run_metadata
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-27 15:35:07,910 [info] Run execution finished: {'status': 'completed', 'name': 'process-data-process-data'}
+
+
+
+
+
+
+

Create an MLflow Xgboost function#

+

The following code demonstrates how to create a simple Xgboost model using MLflow and log the results. +MLflow will log the model, parameters, metrics, and artifacts, and MLRun will track the run and collect the data.

+
+
+
%%writefile training.py
+
+import mlflow
+import mlflow.xgboost
+import xgboost as xgb
+from mlflow import log_metric
+from sklearn import datasets
+from sklearn.metrics import accuracy_score, log_loss
+from sklearn.model_selection import train_test_split
+import pandas as pd
+
+def example_xgb_run(df: str):
+    df = pd.read_parquet(df)
+    
+    df = df.replace(["f", "m"], [0, 1])
+    df = df.replace(["Pop", "Rock", "Classical"], [0, 1, 2])
+    
+    # Prepare, train, and test data
+    y = df.pop('favorite_music_type')
+    X = df
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # Enable auto logging
+    mlflow.xgboost.autolog()
+
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dtest = xgb.DMatrix(X_test, label=y_test)
+
+    with mlflow.start_run():
+        # Train model
+        params = {
+            "objective": "multi:softprob",
+            "num_class": 3,
+            "learning_rate": 0.3,
+            "eval_metric": "mlogloss",
+            "colsample_bytree": 1.0,
+            "subsample": 1.0,
+            "seed": 42,
+        }
+        model = xgb.train(params, dtrain, evals=[(dtrain, "train")])
+        
+        # Evaluate model
+        y_proba = model.predict(dtest)
+        y_pred = y_proba.argmax(axis=1)
+        loss = log_loss(y_test, y_proba)
+        acc = accuracy_score(y_test, y_pred)
+        
+        # Log metrics by hand
+        mlflow.log_metrics({"log_loss": loss, "accuracy": acc})
+
+
+
+
+
Overwriting training.py
+
+
+
+
+
+
+

Log the data from MLflow in MLRun#

+
+

Change the MLRun configuration to use the tracker#

+
+
+
import mlrun
+
+mlrun.mlconf.external_platform_tracking.enabled = True
+
+
+
+
+

These are the three options to run tracking:

+
    +
  • Set: mlrun.mlconf.external_platform_tracking.mlflow.match_experiment_to_runtime to True. This determines the run id and is the safest method

  • +
  • Set the experiment name at: mlflow.environment_variables.MLFLOW_EXPERIMENT_NAME.set. This determines the experiment mlrun will track and find the run added to it.

  • +
Just run it, mlrun will look across all experiments and search for the added run, this is not recommended.

  • +
+
+
+

Create the mlrun function#

+
+
+
# Use the first run option from above
+mlrun.mlconf.external_platform_tracking.mlflow.match_experiment_to_runtime = True
+
+# Create a MLRun function using the example train file (all the functions must be located in it):
+training_func = project.set_function(
+    func="training.py",
+    name="example-xgb-run",
+    kind="job",
+    image="mlrun/mlrun",
+)
+
+
+
+
+
+
+

Run the function#

+

Run the function using MLRun. This will log the data from MLflow in MLRun. +After running the function, you can look at the UI and see that all metrics and parameters are logged in MLRun.

+
+
+
import mlrun.feature_store as fstore
+
+feature_set = fstore.get_feature_set("music_fset", "mlflow-tracking-example")
+
+
+
+
+
+
+
df = feature_set.to_dataframe()
+df = df.drop(['id'], axis=1)
+
+
+
+
+
+
+
# df = project.list_().to_objects()[0].to_dataitem().as_df()
+df_path = "./music.parquet"
+df.to_parquet(df_path)
+
+
+
+
+
+
+
# Run the example code using mlrun
+train_run = training_func.run(
+    local=True,
+    handler="example_xgb_run",
+    inputs={"df": df_path},
+)
+
+
+
+
+
> 2024-03-27 15:37:22,829 [info] Storing function: {'name': 'example-xgb-run-example-xgb-run', 'uid': '6ff324dd21d64b6290d45a001957dda2', 'db': 'http://mlrun-api:8080'}
+> 2024-03-27 15:37:22,912 [warning] `mlconf.external_platform_tracking.mlflow.match_experiment_to_runtime` is set to True but the MLFlow experiment name environment variable ('MLFLOW_EXPERIMENT_NAME') is set for using the name: 'example-xgb-run-example-xgb-run'. This name will be overriden with MLRun's runtime name as set in the MLRun configuration: 'example-xgb-run-example-xgb-run'.
+[0]	train-mlogloss:0.82467
+[1]	train-mlogloss:0.64706
+[2]	train-mlogloss:0.52480
+[3]	train-mlogloss:0.43768
+[4]	train-mlogloss:0.37410
+[5]	train-mlogloss:0.32686
+[6]	train-mlogloss:0.29057
+[7]	train-mlogloss:0.26192
+[8]	train-mlogloss:0.23885
+[9]	train-mlogloss:0.22004
+
+
+
2024/03/27 15:37:23 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/mlflow/types/utils.py:393: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
+2024/03/27 15:37:23 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/User/.pythonlibs/mlrun-base/lib/python3.9/site-packages/xgboost/core.py:160: UserWarning: [15:37:23] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified."
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
mlflow-tracking-example-guy0Mar 27 15:37:22completedexample-xgb-run-example-xgb-run
v3io_user=zeevr
kind=local
owner=zeevr
host=jupyter-zeevr-9f4ffb7bb-8c4mf
mlflow-user=iguazio
mlflow-run-name=stately-cow-437
mlflow-run-id=f66d6149d54c4958a2485c941d86a538
mlflow-experiment-id=608717337209571124
df
colsample_bytree=1.0
custom_metric=None
early_stopping_rounds=None
eval_metric=mlogloss
learning_rate=0.3
maximize=None
num_boost_round=10
num_class=3
objective=multi:softprob
seed=42
subsample=1.0
verbose_eval=True
accuracy=0.7142857142857143
log_loss=0.9622776094122579
train-mlogloss=0.2200447738170624
feature_importance_weight_json
feature_importance_weight_png
model
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-27 15:37:31,415 [info] Run execution finished: {'status': 'completed', 'name': 'example-xgb-run-example-xgb-run'}
+
+
+
+
+
+
+

Examine the results#

+

You can examine the results using the UI or by looking at the outputs of the run. +The outputs include the model, the metrics, and the artifacts, and are completely independent of MLflow.

+
+
+
train_run.outputs
+
+
+
+
+
{'accuracy': 0.7142857142857143,
+ 'log_loss': 0.9622776094122579,
+ 'train-mlogloss': 0.2200447738170624,
+ 'feature_importance_weight_json': 'store://artifacts/mlflow-tracking-example-guy/example-xgb-run-example-xgb-run_feature_importance_weight_json@6ff324dd21d64b6290d45a001957dda2',
+ 'feature_importance_weight_png': 'store://artifacts/mlflow-tracking-example-guy/example-xgb-run-example-xgb-run_feature_importance_weight_png@6ff324dd21d64b6290d45a001957dda2',
+ 'model': 'store://artifacts/mlflow-tracking-example-guy/example-xgb-run-example-xgb-run_model@6ff324dd21d64b6290d45a001957dda2'}
+
+
+
+
+
+
+
train_run.status.results
+
+
+
+
+
{'accuracy': 0.7142857142857143,
+ 'log_loss': 0.9622776094122579,
+ 'train-mlogloss': 0.2200447738170624}
+
+
+
+
+
+
+
train_run.artifact("feature_importance_weight_png").show()
+
+
+
+
+_images/3edebfb8ca78ff860681bc18084495dd1fefd44d69ead3b64c7b3e0b0170adbb.png +
+
+
+
+

You can also examine the results using the UI#

+

Look at collected artifacts:

+

image.png

+

And at results:

+

image.png

+
+
+
+

Use the function for model serving#

+
+

Create the server and serving function#

+

Create a serving function that uses the model from the previous run and serves it using MLRun. +We will create a mock server to test the model in a local environment.

+
+
+
serving_func = project.set_function(
+    func="function.yaml",
+    name="example-xgb-server",
+)
+
+
+
+
+
+
+
# Add the model
+serving_func.add_model(
+    "mlflow_xgb_model",
+    class_name="MLFlowModelServer",
+    model_path=train_run.outputs["model"],
+)
+
+
+
+
+
<mlrun.serving.states.TaskStep at 0x7f77c3e4c9a0>
+
+
+
+
+
+
+
# Create a mock server
+server = serving_func.to_mock_server()
+
+
+
+
+
> 2024-03-27 15:37:31,627 [info] model mlflow_xgb_model was loaded
+> 2024-03-27 15:37:31,628 [info] Loaded ['mlflow_xgb_model']
+
+
+
+
+
+
+

Test the model#

+
+
+
# An example taken randomly  
+result = server.test("/v2/models/mlflow_xgb_model/predict", {"inputs":[{"age": 20, "gender": 0}]})
+
+
+
+
+
+
+
# Look at the result: it shows the probability of the given example belonging to each of the
+# classes featured in the dataset
+result
+
+
+
+
+
{'id': '43a61d06f2694fa695bdd6561b487131',
+ 'model_name': 'mlflow_xgb_model',
+ 'outputs': [[0.9242361187934875, 0.0418272465467453, 0.033936627209186554]]}
+
+
+
+
+

We predicted that a 20-year-old female would like pop!

+
+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/mlflow_utils/1.1.0/static/function.html b/functions/master/mlflow_utils/1.1.0/static/function.html new file mode 100644 index 00000000..d0b2dd51 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/static/function.html @@ -0,0 +1,67 @@ + + + + + + + + + + + Source + + + + +
+        
+verbose: false
+spec:
+  command: ''
+  source: ''
+  default_class: MLFlowModelServer
+  function_kind: serving_v2
+  build:
+    functionSourceCode: aW1wb3J0IHppcGZpbGUKZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdAppbXBvcnQgbWxmbG93CmZyb20gbWxydW4uc2VydmluZy52Ml9zZXJ2aW5nIGltcG9ydCBWMk1vZGVsU2VydmVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCgpjbGFzcyBNTEZsb3dNb2RlbFNlcnZlcihWMk1vZGVsU2VydmVyKToKICAgICIiIgogICAgTUxGbG93IHRyYWNrZXIgTW9kZWwgc2VydmluZyBjbGFzcywgaW5oZXJpdGluZyB0aGUgVjJNb2RlbFNlcnZlciBjbGFzcyBmb3IgYmVpbmcgaW5pdGlhbGl6ZWQgYXV0b21hdGljYWxseSBieSB0aGUgbW9kZWwKICAgIHNlcnZlciBhbmQgYmUgYWJsZSB0byBydW4gbG9jYWxseSBhcyBwYXJ0IG9mIGEgbnVjbGlvIHNlcnZlcmxlc3MgZnVuY3Rpb24sIG9yIGFzIHBhcnQgb2YgYSByZWFsLXRpbWUgcGlwZWxpbmUuCiAgICAiIiIKCiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiIKICAgICAgICBsb2FkcyBhIG1vZGVsIHRoYXQgd2FzIGxvZ2dlZCBieSB0aGUgTUxGbG93IHRyYWNrZXIgbW9kZWwKICAgICAgICAiIiIKICAgICAgICAjIFVuemlwIHRoZSBtb2RlbCBkaXIgYW5kIHRoZW4gdXNlIG1sZmxvdydzIGxvYWQgZnVuY3Rpb24KICAgICAgICBtb2RlbF9maWxlLCBfID0gc2VsZi5nZXRfbW9kZWwoIi56aXAiKQogICAgICAgIG1vZGVsX3BhdGhfdW56aXAgPSBtb2RlbF9maWxlLnJlcGxhY2UoIi56aXAiLCAiIikKCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUobW9kZWxfZmlsZSwgInIiKSBhcyB6aXBfcmVmOgogICAgICAgICAgICB6aXBfcmVmLmV4dHJhY3RhbGwobW9kZWxfcGF0aF91bnppcCkKCiAgICAgICAgc2VsZi5tb2RlbCA9IG1sZmxvdy5weWZ1bmMubG9hZF9tb2RlbChtb2RlbF9wYXRoX3VuemlwKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIHJlcXVlc3Q6IERpY3Rbc3RyLCBBbnldKSAtPiBsaXN0OgogICAgICAgICIiIgogICAgICAgIEluZmVyIHRoZSBpbnB1dHMgdGhyb3VnaCB0aGUgbW9kZWwuIFRoZSBpbmZlcnJlZCBkYXRhIHdpbGwKICAgICAgICBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleSBvZiB0aGUgcmVxdWVzdC4KCiAgICAgICAgOnBhcmFtIHJlcXVlc3Q6IFRoZSByZXF1ZXN0IHRvIHRoZSBtb2RlbCB1c2luZyB4Z2Jvb3N0J3MgcHJlZGljdC4KICAgICAgICAgICAgICAgIFRoZSBpbnB1dCB0byB0aGUgbW9kZWwgd2lsbCBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleS4KCiAgICAgICAgOnJldHVybjogVGhlIG1vZGVsJ3MgcHJlZGljdGlvbiBvbiB0aGUgZ2l2ZW4gaW5wdXQuCiAgICAgICAgIiIiCgogICAgICAgICMgR2V0IHRoZSBpbnB1dHMgYW5kIHNldCB0byBhY2NlcHRlZCB0eXBlOgogICAgICAgIGlucHV0cyA9IHBkLkRhdGFGcmFtZShyZXF1ZXN0WyJpbnB1dHMiXSkKCiAgICAgICAgIyBQcmVkaWN0IHVzaW5nIHRoZSBtb2RlbCdzIHByZWRpY3QgZnVuY3Rpb246CiAgICAgICAgcHJlZGljdGlvbnMgPSBzZWxmLm1vZGV
sLnByZWRpY3QoaW5wdXRzKQoKICAgICAgICAjIFJldHVybiBhcyBsaXN0OgogICAgICAgIHJldHVybiBwcmVkaWN0aW9ucy50b2xpc3QoKQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
+    requirements:
+    - mlflow==2.12.2
+    - lightgbm
+    - xgboost
+    code_origin: ''
+    origin_filename: ''
+  image: mlrun/mlrun
+  base_image_pull: false
+  default_handler: ''
+  max_replicas: 4
+  disable_auto_mount: false
+  min_replicas: 1
+  description: Mlflow model server, and additional utils.
+  function_handler: mlflow-utils-nuclio:handler
+  env:
+  - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK
+    value: enabled
+metadata:
+  categories:
+  - model-serving
+  - utils
+  name: mlflow-utils
+  tag: ''
+kind: serving
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/mlflow_utils/1.1.0/static/item.html b/functions/master/mlflow_utils/1.1.0/static/item.html new file mode 100644 index 00000000..d193c1fa --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/static/item.html @@ -0,0 +1,65 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+- model-serving
+- utils
+description: Mlflow model server, and additional utils.
+doc: ''
+example: mlflow_utils.ipynb
+generationDate: 2024-05-23:12-00
+hidden: false
+icon: ''
+labels:
+  author: zeevr
+maintainers: []
+marketplaceType: ''
+mlrunVersion: 1.8.0
+name: mlflow_utils
+platformVersion: ''
+spec:
+  customFields:
+    default_class: MLFlowModelServer
+  filename: mlflow_utils.py
+  handler: handler
+  image: mlrun/mlrun
+  kind: serving
+  requirements:
+  - mlflow==2.12.2
+  - lightgbm
+  - xgboost
+url: ''
+version: 1.1.0
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/mlflow_utils/1.1.0/static/mlflow_utils.html b/functions/master/mlflow_utils/1.1.0/static/mlflow_utils.html new file mode 100644 index 00000000..d3e1f984 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/static/mlflow_utils.html @@ -0,0 +1,226 @@ + + + + + + + +mlflow_utils.mlflow_utils + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+

+ +
+
+
+
+
+ +
+

Source code for mlflow_utils.mlflow_utils

+import zipfile
+from typing import Any, Dict
+import mlflow
+from mlrun.serving.v2_serving import V2ModelServer
+import pandas as pd
+
+
+
+[docs] +class MLFlowModelServer(V2ModelServer): + """ + MLFlow tracker Model serving class, inheriting the V2ModelServer class for being initialized automatically by the model + server and be able to run locally as part of a nuclio serverless function, or as part of a real-time pipeline. + """ + +
+[docs] + def load(self): + """ + loads a model that was logged by the MLFlow tracker model + """ + # Unzip the model dir and then use mlflow's load function + model_file, _ = self.get_model(".zip") + model_path_unzip = model_file.replace(".zip", "") + + with zipfile.ZipFile(model_file, "r") as zip_ref: + zip_ref.extractall(model_path_unzip) + + self.model = mlflow.pyfunc.load_model(model_path_unzip)
+ + +
+[docs] + def predict(self, request: Dict[str, Any]) -> list: + """ + Infer the inputs through the model. The inferred data will + be read from the "inputs" key of the request. + + :param request: The request to the model using xgboost's predict. + The input to the model will be read from the "inputs" key. + + :return: The model's prediction on the given input. + """ + + # Get the inputs and set to accepted type: + inputs = pd.DataFrame(request["inputs"]) + + # Predict using the model's predict function: + predictions = self.model.predict(inputs) + + # Return as list: + return predictions.tolist()
+
+ +
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/mlflow_utils/1.1.0/static/source.html b/functions/master/mlflow_utils/1.1.0/static/source.html new file mode 100644 index 00000000..fe11b2c1 --- /dev/null +++ b/functions/master/mlflow_utils/1.1.0/static/source.html @@ -0,0 +1,80 @@ + + + + + + + + + + + Source + + + + +
+        
+import zipfile
+from typing import Any, Dict
+import mlflow
+from mlrun.serving.v2_serving import V2ModelServer
+import pandas as pd
+
+
+class MLFlowModelServer(V2ModelServer):
+    """
+    MLFlow tracker Model serving class, inheriting the V2ModelServer class for being initialized automatically by the model
+    server and be able to run locally as part of a nuclio serverless function, or as part of a real-time pipeline.
+    """
+
+    def load(self):
+        """
+        loads a model that was logged by the MLFlow tracker model
+        """
+        # Unzip the model dir and then use mlflow's load function
+        model_file, _ = self.get_model(".zip")
+        model_path_unzip = model_file.replace(".zip", "")
+
+        with zipfile.ZipFile(model_file, "r") as zip_ref:
+            zip_ref.extractall(model_path_unzip)
+
+        self.model = mlflow.pyfunc.load_model(model_path_unzip)
+
+    def predict(self, request: Dict[str, Any]) -> list:
+        """
+        Infer the inputs through the model. The inferred data will
+        be read from the "inputs" key of the request.
+
+        :param request: The request to the model using xgboost's predict.
+                The input to the model will be read from the "inputs" key.
+
+        :return: The model's prediction on the given input.
+        """
+
+        # Get the inputs and set to accepted type:
+        inputs = pd.DataFrame(request["inputs"])
+
+        # Predict using the model's predict function:
+        predictions = self.model.predict(inputs)
+
+        # Return as list:
+        return predictions.tolist()
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/mlflow_utils/latest/src/function.yaml b/functions/master/mlflow_utils/latest/src/function.yaml index d2e2bffe..623f054f 100644 --- a/functions/master/mlflow_utils/latest/src/function.yaml +++ b/functions/master/mlflow_utils/latest/src/function.yaml @@ -1,31 +1,32 @@ -metadata: - name: mlflow-utils - categories: - - genai - - model-serving - - machine-learning - tag: '' +verbose: false spec: - default_handler: '' - image: mlrun/mlrun command: '' - base_image_pull: false + source: '' default_class: MLFlowModelServer - function_handler: mlflow-utils:handler - disable_auto_mount: false + function_kind: serving_v2 build: - origin_filename: '' - code_origin: '' + functionSourceCode: aW1wb3J0IHppcGZpbGUKZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdAppbXBvcnQgbWxmbG93CmZyb20gbWxydW4uc2VydmluZy52Ml9zZXJ2aW5nIGltcG9ydCBWMk1vZGVsU2VydmVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCgpjbGFzcyBNTEZsb3dNb2RlbFNlcnZlcihWMk1vZGVsU2VydmVyKToKICAgICIiIgogICAgTUxGbG93IHRyYWNrZXIgTW9kZWwgc2VydmluZyBjbGFzcywgaW5oZXJpdGluZyB0aGUgVjJNb2RlbFNlcnZlciBjbGFzcyBmb3IgYmVpbmcgaW5pdGlhbGl6ZWQgYXV0b21hdGljYWxseSBieSB0aGUgbW9kZWwKICAgIHNlcnZlciBhbmQgYmUgYWJsZSB0byBydW4gbG9jYWxseSBhcyBwYXJ0IG9mIGEgbnVjbGlvIHNlcnZlcmxlc3MgZnVuY3Rpb24sIG9yIGFzIHBhcnQgb2YgYSByZWFsLXRpbWUgcGlwZWxpbmUuCiAgICAiIiIKCiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiIKICAgICAgICBsb2FkcyBhIG1vZGVsIHRoYXQgd2FzIGxvZ2dlZCBieSB0aGUgTUxGbG93IHRyYWNrZXIgbW9kZWwKICAgICAgICAiIiIKICAgICAgICAjIFVuemlwIHRoZSBtb2RlbCBkaXIgYW5kIHRoZW4gdXNlIG1sZmxvdydzIGxvYWQgZnVuY3Rpb24KICAgICAgICBtb2RlbF9maWxlLCBfID0gc2VsZi5nZXRfbW9kZWwoIi56aXAiKQogICAgICAgIG1vZGVsX3BhdGhfdW56aXAgPSBtb2RlbF9maWxlLnJlcGxhY2UoIi56aXAiLCAiIikKCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUobW9kZWxfZmlsZSwgInIiKSBhcyB6aXBfcmVmOgogICAgICAgICAgICB6aXBfcmVmLmV4dHJhY3RhbGwobW9kZWxfcGF0aF91bnppcCkKCiAgICAgICAgc2VsZi5tb2RlbCA9IG1sZmxvdy5weWZ1bmMubG9hZF9tb2RlbChtb2RlbF9wYXRoX3VuemlwKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIHJlcXVlc3Q6IERpY3Rbc3RyLCBBbnldKSAtPiBsaXN0Ogog
ICAgICAgICIiIgogICAgICAgIEluZmVyIHRoZSBpbnB1dHMgdGhyb3VnaCB0aGUgbW9kZWwuIFRoZSBpbmZlcnJlZCBkYXRhIHdpbGwKICAgICAgICBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleSBvZiB0aGUgcmVxdWVzdC4KCiAgICAgICAgOnBhcmFtIHJlcXVlc3Q6IFRoZSByZXF1ZXN0IHRvIHRoZSBtb2RlbCB1c2luZyB4Z2Jvb3N0J3MgcHJlZGljdC4KICAgICAgICAgICAgICAgIFRoZSBpbnB1dCB0byB0aGUgbW9kZWwgd2lsbCBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleS4KCiAgICAgICAgOnJldHVybjogVGhlIG1vZGVsJ3MgcHJlZGljdGlvbiBvbiB0aGUgZ2l2ZW4gaW5wdXQuCiAgICAgICAgIiIiCgogICAgICAgICMgR2V0IHRoZSBpbnB1dHMgYW5kIHNldCB0byBhY2NlcHRlZCB0eXBlOgogICAgICAgIGlucHV0cyA9IHBkLkRhdGFGcmFtZShyZXF1ZXN0WyJpbnB1dHMiXSkKCiAgICAgICAgIyBQcmVkaWN0IHVzaW5nIHRoZSBtb2RlbCdzIHByZWRpY3QgZnVuY3Rpb246CiAgICAgICAgcHJlZGljdGlvbnMgPSBzZWxmLm1vZGVsLnByZWRpY3QoaW5wdXRzKQoKICAgICAgICAjIFJldHVybiBhcyBsaXN0OgogICAgICAgIHJldHVybiBwcmVkaWN0aW9ucy50b2xpc3QoKQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg== requirements: - mlflow==2.12.2 - functionSourceCode: 
aW1wb3J0IHppcGZpbGUKZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdAppbXBvcnQgbWxmbG93CmZyb20gbWxydW4uc2VydmluZy52Ml9zZXJ2aW5nIGltcG9ydCBWMk1vZGVsU2VydmVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCgpjbGFzcyBNTEZsb3dNb2RlbFNlcnZlcihWMk1vZGVsU2VydmVyKToKICAgICIiIgogICAgTUxGbG93IHRyYWNrZXIgTW9kZWwgc2VydmluZyBjbGFzcywgaW5oZXJpdGluZyB0aGUgVjJNb2RlbFNlcnZlciBjbGFzcyBmb3IgYmVpbmcgaW5pdGlhbGl6ZWQgYXV0b21hdGljYWxseSBieSB0aGUgbW9kZWwKICAgIHNlcnZlciBhbmQgYmUgYWJsZSB0byBydW4gbG9jYWxseSBhcyBwYXJ0IG9mIGEgbnVjbGlvIHNlcnZlcmxlc3MgZnVuY3Rpb24sIG9yIGFzIHBhcnQgb2YgYSByZWFsLXRpbWUgcGlwZWxpbmUuCiAgICAiIiIKCiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiIKICAgICAgICBsb2FkcyBhbiBtb2RlbCB0aGF0IHdhcyBsb2dnZWQgYnkgdGhlIE1MRmxvdyB0cmFja2VyIG1vZGVsCiAgICAgICAgIiIiCiAgICAgICAgIyBVbnppcCB0aGUgbW9kZWwgZGlyIGFuZCB0aGVuIHVzZSBtbGZsb3cncyBsb2FkIGZ1bmN0aW9uCiAgICAgICAgbW9kZWxfZmlsZSwgXyA9IHNlbGYuZ2V0X21vZGVsKCIuemlwIikKICAgICAgICBtb2RlbF9wYXRoX3VuemlwID0gbW9kZWxfZmlsZS5yZXBsYWNlKCIuemlwIiwgIiIpCgogICAgICAgIHdpdGggemlwZmlsZS5aaXBGaWxlKG1vZGVsX2ZpbGUsICJyIikgYXMgemlwX3JlZjoKICAgICAgICAgICAgemlwX3JlZi5leHRyYWN0YWxsKG1vZGVsX3BhdGhfdW56aXApCgogICAgICAgIHNlbGYubW9kZWwgPSBtbGZsb3cucHlmdW5jLmxvYWRfbW9kZWwobW9kZWxfcGF0aF91bnppcCkKCiAgICBkZWYgcHJlZGljdChzZWxmLCByZXF1ZXN0OiBEaWN0W3N0ciwgQW55XSkgLT4gbGlzdDoKICAgICAgICAiIiIKICAgICAgICBJbmZlciB0aGUgaW5wdXRzIHRocm91Z2ggdGhlIG1vZGVsLiBUaGUgaW5mZXJyZWQgZGF0YSB3aWxsCiAgICAgICAgYmUgcmVhZCBmcm9tIHRoZSAiaW5wdXRzIiBrZXkgb2YgdGhlIHJlcXVlc3QuCgogICAgICAgIDpwYXJhbSByZXF1ZXN0OiBUaGUgcmVxdWVzdCB0byB0aGUgbW9kZWwgdXNpbmcgeGdib29zdCdzIHByZWRpY3QuCiAgICAgICAgICAgICAgICBUaGUgaW5wdXQgdG8gdGhlIG1vZGVsIHdpbGwgYmUgcmVhZCBmcm9tIHRoZSAiaW5wdXRzIiBrZXkuCgogICAgICAgIDpyZXR1cm46IFRoZSBtb2RlbCdzIHByZWRpY3Rpb24gb24gdGhlIGdpdmVuIGlucHV0LgogICAgICAgICIiIgoKICAgICAgICAjIEdldCB0aGUgaW5wdXRzIGFuZCBzZXQgdG8gYWNjZXB0ZWQgdHlwZToKICAgICAgICBpbnB1dHMgPSBwZC5EYXRhRnJhbWUocmVxdWVzdFsiaW5wdXRzIl0pCgogICAgICAgICMgUHJlZGljdCB1c2luZyB0aGUgbW9kZWwncyBwcmVkaWN0IGZ1bmN0aW9uOgogICAgICAgIHByZWRpY3Rpb25zID0gc2VsZi5tb2RlbC5wcmVkaWN0KGlucHV0cykK
CiAgICAgICAgIyBSZXR1cm4gYXMgbGlzdDoKICAgICAgICByZXR1cm4gcHJlZGljdGlvbnMudG9saXN0KCkKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo= + - lightgbm + - xgboost + code_origin: '' + origin_filename: '' + image: mlrun/mlrun + base_image_pull: false + default_handler: '' + max_replicas: 4 + disable_auto_mount: false min_replicas: 1 description: Mlflow model server, and additional utils. - max_replicas: 4 - source: '' - function_kind: serving_v2 + function_handler: mlflow-utils-nuclio:handler env: - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK value: enabled -verbose: false +metadata: + categories: + - model-serving + - utils + name: mlflow-utils + tag: '' kind: serving diff --git a/functions/master/mlflow_utils/latest/src/item.yaml b/functions/master/mlflow_utils/latest/src/item.yaml index bda09c5b..27e61ab4 100644 --- a/functions/master/mlflow_utils/latest/src/item.yaml +++ b/functions/master/mlflow_utils/latest/src/item.yaml @@ -1,8 +1,7 @@ apiVersion: v1 categories: -- genai - model-serving -- machine-learning +- utils description: Mlflow model server, and additional utils. 
doc: '' example: mlflow_utils.ipynb @@ -13,7 +12,7 @@ labels: author: zeevr maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0-rc17 +mlrunVersion: 1.8.0 name: mlflow_utils platformVersion: '' spec: @@ -28,4 +27,4 @@ spec: - lightgbm - xgboost url: '' -version: 1.0.0 +version: 1.1.0 diff --git a/functions/master/mlflow_utils/latest/static/documentation.html b/functions/master/mlflow_utils/latest/static/documentation.html index 5cd55de2..ff93711e 100644 --- a/functions/master/mlflow_utils/latest/static/documentation.html +++ b/functions/master/mlflow_utils/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/mlflow_utils/latest/static/example.html b/functions/master/mlflow_utils/latest/static/example.html index 63974c6a..72f59a9b 100644 --- a/functions/master/mlflow_utils/latest/static/example.html +++ b/functions/master/mlflow_utils/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/mlflow_utils/latest/static/function.html b/functions/master/mlflow_utils/latest/static/function.html index cbc50484..d0b2dd51 100644 --- a/functions/master/mlflow_utils/latest/static/function.html +++ b/functions/master/mlflow_utils/latest/static/function.html @@ -28,36 +28,37 @@
         
-metadata:
-  name: mlflow-utils
-  categories:
-  - genai
-  - model-serving
-  - machine-learning
-  tag: ''
+verbose: false
 spec:
-  default_handler: ''
-  image: mlrun/mlrun
   command: ''
-  base_image_pull: false
+  source: ''
   default_class: MLFlowModelServer
-  function_handler: mlflow-utils:handler
-  disable_auto_mount: false
+  function_kind: serving_v2
   build:
-    origin_filename: ''
-    code_origin: ''
+    functionSourceCode: aW1wb3J0IHppcGZpbGUKZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdAppbXBvcnQgbWxmbG93CmZyb20gbWxydW4uc2VydmluZy52Ml9zZXJ2aW5nIGltcG9ydCBWMk1vZGVsU2VydmVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCgpjbGFzcyBNTEZsb3dNb2RlbFNlcnZlcihWMk1vZGVsU2VydmVyKToKICAgICIiIgogICAgTUxGbG93IHRyYWNrZXIgTW9kZWwgc2VydmluZyBjbGFzcywgaW5oZXJpdGluZyB0aGUgVjJNb2RlbFNlcnZlciBjbGFzcyBmb3IgYmVpbmcgaW5pdGlhbGl6ZWQgYXV0b21hdGljYWxseSBieSB0aGUgbW9kZWwKICAgIHNlcnZlciBhbmQgYmUgYWJsZSB0byBydW4gbG9jYWxseSBhcyBwYXJ0IG9mIGEgbnVjbGlvIHNlcnZlcmxlc3MgZnVuY3Rpb24sIG9yIGFzIHBhcnQgb2YgYSByZWFsLXRpbWUgcGlwZWxpbmUuCiAgICAiIiIKCiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiIKICAgICAgICBsb2FkcyBhIG1vZGVsIHRoYXQgd2FzIGxvZ2dlZCBieSB0aGUgTUxGbG93IHRyYWNrZXIgbW9kZWwKICAgICAgICAiIiIKICAgICAgICAjIFVuemlwIHRoZSBtb2RlbCBkaXIgYW5kIHRoZW4gdXNlIG1sZmxvdydzIGxvYWQgZnVuY3Rpb24KICAgICAgICBtb2RlbF9maWxlLCBfID0gc2VsZi5nZXRfbW9kZWwoIi56aXAiKQogICAgICAgIG1vZGVsX3BhdGhfdW56aXAgPSBtb2RlbF9maWxlLnJlcGxhY2UoIi56aXAiLCAiIikKCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUobW9kZWxfZmlsZSwgInIiKSBhcyB6aXBfcmVmOgogICAgICAgICAgICB6aXBfcmVmLmV4dHJhY3RhbGwobW9kZWxfcGF0aF91bnppcCkKCiAgICAgICAgc2VsZi5tb2RlbCA9IG1sZmxvdy5weWZ1bmMubG9hZF9tb2RlbChtb2RlbF9wYXRoX3VuemlwKQoKICAgIGRlZiBwcmVkaWN0KHNlbGYsIHJlcXVlc3Q6IERpY3Rbc3RyLCBBbnldKSAtPiBsaXN0OgogICAgICAgICIiIgogICAgICAgIEluZmVyIHRoZSBpbnB1dHMgdGhyb3VnaCB0aGUgbW9kZWwuIFRoZSBpbmZlcnJlZCBkYXRhIHdpbGwKICAgICAgICBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleSBvZiB0aGUgcmVxdWVzdC4KCiAgICAgICAgOnBhcmFtIHJlcXVlc3Q6IFRoZSByZXF1ZXN0IHRvIHRoZSBtb2RlbCB1c2luZyB4Z2Jvb3N0J3MgcHJlZGljdC4KICAgICAgICAgICAgICAgIFRoZSBpbnB1dCB0byB0aGUgbW9kZWwgd2lsbCBiZSByZWFkIGZyb20gdGhlICJpbnB1dHMiIGtleS4KCiAgICAgICAgOnJldHVybjogVGhlIG1vZGVsJ3MgcHJlZGljdGlvbiBvbiB0aGUgZ2l2ZW4gaW5wdXQuCiAgICAgICAgIiIiCgogICAgICAgICMgR2V0IHRoZSBpbnB1dHMgYW5kIHNldCB0byBhY2NlcHRlZCB0eXBlOgogICAgICAgIGlucHV0cyA9IHBkLkRhdGFGcmFtZShyZXF1ZXN0WyJpbnB1dHMiXSkKCiAgICAgICAgIyBQcmVkaWN0IHVzaW5nIHRoZSBtb2RlbCdzIHByZWRpY3QgZnVuY3Rpb246CiAgICAgICAgcHJlZGljdGlvbnMgPSBzZWxmLm1vZGV
sLnByZWRpY3QoaW5wdXRzKQoKICAgICAgICAjIFJldHVybiBhcyBsaXN0OgogICAgICAgIHJldHVybiBwcmVkaWN0aW9ucy50b2xpc3QoKQoKZnJvbSBtbHJ1bi5ydW50aW1lcyBpbXBvcnQgbnVjbGlvX2luaXRfaG9vawpkZWYgaW5pdF9jb250ZXh0KGNvbnRleHQpOgogICAgbnVjbGlvX2luaXRfaG9vayhjb250ZXh0LCBnbG9iYWxzKCksICdzZXJ2aW5nX3YyJykKCmRlZiBoYW5kbGVyKGNvbnRleHQsIGV2ZW50KToKICAgIHJldHVybiBjb250ZXh0Lm1scnVuX2hhbmRsZXIoY29udGV4dCwgZXZlbnQpCg==
     requirements:
     - mlflow==2.12.2
-    functionSourceCode: aW1wb3J0IHppcGZpbGUKZnJvbSB0eXBpbmcgaW1wb3J0IEFueSwgRGljdAppbXBvcnQgbWxmbG93CmZyb20gbWxydW4uc2VydmluZy52Ml9zZXJ2aW5nIGltcG9ydCBWMk1vZGVsU2VydmVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCgpjbGFzcyBNTEZsb3dNb2RlbFNlcnZlcihWMk1vZGVsU2VydmVyKToKICAgICIiIgogICAgTUxGbG93IHRyYWNrZXIgTW9kZWwgc2VydmluZyBjbGFzcywgaW5oZXJpdGluZyB0aGUgVjJNb2RlbFNlcnZlciBjbGFzcyBmb3IgYmVpbmcgaW5pdGlhbGl6ZWQgYXV0b21hdGljYWxseSBieSB0aGUgbW9kZWwKICAgIHNlcnZlciBhbmQgYmUgYWJsZSB0byBydW4gbG9jYWxseSBhcyBwYXJ0IG9mIGEgbnVjbGlvIHNlcnZlcmxlc3MgZnVuY3Rpb24sIG9yIGFzIHBhcnQgb2YgYSByZWFsLXRpbWUgcGlwZWxpbmUuCiAgICAiIiIKCiAgICBkZWYgbG9hZChzZWxmKToKICAgICAgICAiIiIKICAgICAgICBsb2FkcyBhbiBtb2RlbCB0aGF0IHdhcyBsb2dnZWQgYnkgdGhlIE1MRmxvdyB0cmFja2VyIG1vZGVsCiAgICAgICAgIiIiCiAgICAgICAgIyBVbnppcCB0aGUgbW9kZWwgZGlyIGFuZCB0aGVuIHVzZSBtbGZsb3cncyBsb2FkIGZ1bmN0aW9uCiAgICAgICAgbW9kZWxfZmlsZSwgXyA9IHNlbGYuZ2V0X21vZGVsKCIuemlwIikKICAgICAgICBtb2RlbF9wYXRoX3VuemlwID0gbW9kZWxfZmlsZS5yZXBsYWNlKCIuemlwIiwgIiIpCgogICAgICAgIHdpdGggemlwZmlsZS5aaXBGaWxlKG1vZGVsX2ZpbGUsICJyIikgYXMgemlwX3JlZjoKICAgICAgICAgICAgemlwX3JlZi5leHRyYWN0YWxsKG1vZGVsX3BhdGhfdW56aXApCgogICAgICAgIHNlbGYubW9kZWwgPSBtbGZsb3cucHlmdW5jLmxvYWRfbW9kZWwobW9kZWxfcGF0aF91bnppcCkKCiAgICBkZWYgcHJlZGljdChzZWxmLCByZXF1ZXN0OiBEaWN0W3N0ciwgQW55XSkgLT4gbGlzdDoKICAgICAgICAiIiIKICAgICAgICBJbmZlciB0aGUgaW5wdXRzIHRocm91Z2ggdGhlIG1vZGVsLiBUaGUgaW5mZXJyZWQgZGF0YSB3aWxsCiAgICAgICAgYmUgcmVhZCBmcm9tIHRoZSAiaW5wdXRzIiBrZXkgb2YgdGhlIHJlcXVlc3QuCgogICAgICAgIDpwYXJhbSByZXF1ZXN0OiBUaGUgcmVxdWVzdCB0byB0aGUgbW9kZWwgdXNpbmcgeGdib29zdCdzIHByZWRpY3QuCiAgICAgICAgICAgICAgICBUaGUgaW5wdXQgdG8gdGhlIG1vZGVsIHdpbGwgYmUgcmVhZCBmcm9tIHRoZSAiaW5wdXRzIiBrZXkuCgogICAgICAgIDpyZXR1cm46IFRoZSBtb2RlbCdzIHByZWRpY3Rpb24gb24gdGhlIGdpdmVuIGlucHV0LgogICAgICAgICIiIgoKICAgICAgICAjIEdldCB0aGUgaW5wdXRzIGFuZCBzZXQgdG8gYWNjZXB0ZWQgdHlwZToKICAgICAgICBpbnB1dHMgPSBwZC5EYXRhRnJhbWUocmVxdWVzdFsiaW5wdXRzIl0pCgogICAgICAgICMgUHJlZGljdCB1c2luZyB0aGUgbW9kZWwncyBwcmVkaWN0IGZ1bmN0aW9uOgogICAgICAgIHByZWRpY3Rpb25zID0gc2VsZi5tb2R
lbC5wcmVkaWN0KGlucHV0cykKCiAgICAgICAgIyBSZXR1cm4gYXMgbGlzdDoKICAgICAgICByZXR1cm4gcHJlZGljdGlvbnMudG9saXN0KCkKCmZyb20gbWxydW4ucnVudGltZXMgaW1wb3J0IG51Y2xpb19pbml0X2hvb2sKZGVmIGluaXRfY29udGV4dChjb250ZXh0KToKICAgIG51Y2xpb19pbml0X2hvb2soY29udGV4dCwgZ2xvYmFscygpLCAnc2VydmluZ192MicpCgpkZWYgaGFuZGxlcihjb250ZXh0LCBldmVudCk6CiAgICByZXR1cm4gY29udGV4dC5tbHJ1bl9oYW5kbGVyKGNvbnRleHQsIGV2ZW50KQo=
+    - lightgbm
+    - xgboost
+    code_origin: ''
+    origin_filename: ''
+  image: mlrun/mlrun
+  base_image_pull: false
+  default_handler: ''
+  max_replicas: 4
+  disable_auto_mount: false
   min_replicas: 1
   description: Mlflow model server, and additional utils.
-  max_replicas: 4
-  source: ''
-  function_kind: serving_v2
+  function_handler: mlflow-utils-nuclio:handler
   env:
   - name: MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK
     value: enabled
-verbose: false
+metadata:
+  categories:
+  - model-serving
+  - utils
+  name: mlflow-utils
+  tag: ''
 kind: serving
 
         
diff --git a/functions/master/mlflow_utils/latest/static/item.html b/functions/master/mlflow_utils/latest/static/item.html
index 27ea1430..d193c1fa 100644
--- a/functions/master/mlflow_utils/latest/static/item.html
+++ b/functions/master/mlflow_utils/latest/static/item.html
@@ -30,9 +30,8 @@
         
 apiVersion: v1
 categories:
-- genai
 - model-serving
-- machine-learning
+- utils
 description: Mlflow model server, and additional utils.
 doc: ''
 example: mlflow_utils.ipynb
@@ -43,7 +42,7 @@
   author: zeevr
 maintainers: []
 marketplaceType: ''
-mlrunVersion: 1.7.0-rc17
+mlrunVersion: 1.8.0
 name: mlflow_utils
 platformVersion: ''
 spec:
@@ -58,7 +57,7 @@
   - lightgbm
   - xgboost
 url: ''
-version: 1.0.0
+version: 1.1.0
 
         
     
diff --git a/functions/master/mlflow_utils/latest/static/mlflow_utils.html b/functions/master/mlflow_utils/latest/static/mlflow_utils.html index fb26fe56..d3e1f984 100644 --- a/functions/master/mlflow_utils/latest/static/mlflow_utils.html +++ b/functions/master/mlflow_utils/latest/static/mlflow_utils.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server/1.1.0/static/documentation.html b/functions/master/model_server/1.1.0/static/documentation.html index 10e23680..79a8b873 100644 --- a/functions/master/model_server/1.1.0/static/documentation.html +++ b/functions/master/model_server/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server/1.1.0/static/example.html b/functions/master/model_server/1.1.0/static/example.html index e5f17b08..c4e128a8 100644 --- a/functions/master/model_server/1.1.0/static/example.html +++ b/functions/master/model_server/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server/1.1.0/static/model_server.html b/functions/master/model_server/1.1.0/static/model_server.html index 88bfe16f..35b760c8 100644 --- a/functions/master/model_server/1.1.0/static/model_server.html +++ b/functions/master/model_server/1.1.0/static/model_server.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server/latest/static/documentation.html b/functions/master/model_server/latest/static/documentation.html index 10e23680..79a8b873 100644 --- a/functions/master/model_server/latest/static/documentation.html +++ b/functions/master/model_server/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server/latest/static/example.html b/functions/master/model_server/latest/static/example.html index e5f17b08..c4e128a8 100644 --- a/functions/master/model_server/latest/static/example.html +++ b/functions/master/model_server/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git 
a/functions/master/model_server/latest/static/model_server.html b/functions/master/model_server/latest/static/model_server.html index 88bfe16f..35b760c8 100644 --- a/functions/master/model_server/latest/static/model_server.html +++ b/functions/master/model_server/latest/static/model_server.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server_tester/1.1.0/static/documentation.html b/functions/master/model_server_tester/1.1.0/static/documentation.html index bea3c892..d3ddf180 100644 --- a/functions/master/model_server_tester/1.1.0/static/documentation.html +++ b/functions/master/model_server_tester/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server_tester/1.1.0/static/example.html b/functions/master/model_server_tester/1.1.0/static/example.html index 1cba1254..a3a90484 100644 --- a/functions/master/model_server_tester/1.1.0/static/example.html +++ b/functions/master/model_server_tester/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server_tester/1.1.0/static/model_server_tester.html b/functions/master/model_server_tester/1.1.0/static/model_server_tester.html index c3de7a49..ed2fa10f 100644 --- a/functions/master/model_server_tester/1.1.0/static/model_server_tester.html +++ b/functions/master/model_server_tester/1.1.0/static/model_server_tester.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server_tester/latest/static/documentation.html b/functions/master/model_server_tester/latest/static/documentation.html index bea3c892..d3ddf180 100644 --- a/functions/master/model_server_tester/latest/static/documentation.html +++ b/functions/master/model_server_tester/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server_tester/latest/static/example.html b/functions/master/model_server_tester/latest/static/example.html index 1cba1254..a3a90484 100644 --- a/functions/master/model_server_tester/latest/static/example.html +++ 
b/functions/master/model_server_tester/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/model_server_tester/latest/static/model_server_tester.html b/functions/master/model_server_tester/latest/static/model_server_tester.html index c3de7a49..ed2fa10f 100644 --- a/functions/master/model_server_tester/latest/static/model_server_tester.html +++ b/functions/master/model_server_tester/latest/static/model_server_tester.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/noise_reduction/1.1.0/src/data/test_data.mp3 b/functions/master/noise_reduction/1.1.0/src/data/test_data.mp3 new file mode 100644 index 00000000..a330f980 Binary files /dev/null and b/functions/master/noise_reduction/1.1.0/src/data/test_data.mp3 differ diff --git a/functions/master/noise_reduction/1.1.0/src/data/test_data.wav b/functions/master/noise_reduction/1.1.0/src/data/test_data.wav new file mode 100644 index 00000000..a3a993c2 Binary files /dev/null and b/functions/master/noise_reduction/1.1.0/src/data/test_data.wav differ diff --git a/functions/master/noise_reduction/1.1.0/src/function.yaml b/functions/master/noise_reduction/1.1.0/src/function.yaml new file mode 100644 index 00000000..d6d33b8d --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/src/function.yaml @@ -0,0 +1,179 @@ +spec: + entry_points: + reduce_noise: + has_kwargs: false + name: reduce_noise + has_varargs: false + doc: 'Reduce noise from audio file or directory containing audio files. + + The audio files must be in .wav format. + + The cleaned audio files will be saved in the target_directory. + + For information about the noise reduction algorithm see: + + https://github.com/timsainb/noisereduce + + Notice that the saved files are in wav format, even if the original files + are in other format.' 
+ parameters: + - name: audio_source + type: str + doc: path to audio file or directory containing audio files + - name: target_directory + type: str + doc: path to directory to save the cleaned audio files. + - name: sample_rate + type: int + doc: Number of samples in one second in the audio file. Pass `None` to keep + the original sample rate. + default: 16000 + - name: duration + type: int + doc: Duration of the audio file to clean in seconds. Pass `None` to keep the + original duration. + default: null + - name: channel + type: int + doc: Channel to clean. Pass the number of the channel to clean. To clean all + channels pass None. + default: null + - name: silence_threshold + type: float + doc: The threshold to remove silence from the audio, in dB. If None, no silence + removal is performed. + default: null + - name: use_multiprocessing + type: int + doc: Number of processes to use for cleaning the audio files. If 0, no multiprocessing + is used. + default: 0 + - name: verbose + type: bool + doc: Verbosity level. If True, display progress bar. + default: true + lineno: 388 + clean_audio: + has_kwargs: false + name: clean_audio + has_varargs: false + outputs: + - type: torch.Tensor + doc: '' + parameters: + - name: self + - name: data + type: Tensor + lineno: 276 + save_audio: + has_kwargs: false + name: save_audio + has_varargs: false + doc: '' + parameters: + - name: self + - name: audio + type: ndarray + - name: target_path + type: Path + lineno: 256 + load_audio: + has_kwargs: false + name: load_audio + has_varargs: false + outputs: + - type: torch.Tensor + doc: '' + parameters: + - name: self + - name: file + type: str + lineno: 268 + update_to_wav_suffix: + has_kwargs: false + name: update_to_wav_suffix + has_varargs: false + doc: '' + parameters: + - name: self + - name: audio_file + type: Path + lineno: 125 + remove_silence: + has_kwargs: false + name: remove_silence + has_varargs: false + outputs: + - doc: The audio without silence. 
+ doc: Remove silence sections from the audio. + parameters: + - name: self + - name: audio + type: ndarray + doc: The audio to remove silence from. + lineno: 134 + reduce_noise_dfn: + has_kwargs: true + name: reduce_noise_dfn + has_varargs: false + doc: 'Reduce noise from audio files using DeepFilterNet. + + For more information about the noise reduction algorithm see: + + https://github.com/Rikorose/DeepFilterNet + + Notice that the saved files are in wav format, even if the original files + are in other format.' + parameters: + - name: audio_source + type: str + doc: path to audio file or directory of audio files + - name: target_directory + type: str + doc: path to target directory to save cleaned audio files + - name: pad + type: bool + doc: whether to pad the audio file with zeros before cleaning + default: true + - name: atten_lim_db + type: int + doc: maximum attenuation in dB + default: null + - name: silence_threshold + type: float + doc: the threshold to remove silence from the audio, in dB. If None, no silence + removal is performed. + default: null + - name: use_multiprocessing + type: int + doc: Number of processes to use for cleaning the audio files. If 0, no multiprocessing + is used. + default: 0 + - name: verbose + type: bool + doc: verbosity level. If True, display progress bar and logs. 
+ default: true + lineno: 322 + build: + code_origin: '' + base_image: mlrun/mlrun + requirements: + - librosa + - noisereduce + - deepfilternet + - torchaudio>=2.1.2 + functionSourceCode:  + origin_filename: '' + description: Reduce noise from audio files + command: '' + image: '' + default_handler: reduce_noise + disable_auto_mount: false +metadata: + name: noise-reduction + tag: '' + categories: + - data-preparation + - audio +kind: job +verbose: false diff --git a/functions/master/noise_reduction/1.1.0/src/item.yaml b/functions/master/noise_reduction/1.1.0/src/item.yaml new file mode 100644 index 00000000..f748d558 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/src/item.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +categories: + - data-preparation + - audio +description: Reduce noise from audio files +doc: '' +example: noise_reduction.ipynb +generationDate: 2024-03-04:17-30 +hidden: false +icon: '' +labels: + author: yonatans +maintainers: [] +mlrunVersion: 1.7.0 +name: noise-reduction +platformVersion: 3.5.3 +spec: + filename: noise_reduction.py + handler: reduce_noise + image: mlrun/mlrun + kind: job + requirements: [ + librosa, + noisereduce, + deepfilternet, + torchaudio>=2.1.2, + ] +url: '' +version: 1.1.0 \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/src/noise_reduction.ipynb b/functions/master/noise_reduction/1.1.0/src/noise_reduction.ipynb new file mode 100644 index 00000000..e4fa0a53 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/src/noise_reduction.ipynb @@ -0,0 +1,942 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e0abc60-b718-4f45-a82a-0b8759f19d3f", + "metadata": {}, + "source": [ + "# Noise Reduction\n", + "\n", + "## Table of Contents\n", + "\n", + "1. [Introduction](#Introduction)\n", + "2. [Project Setup](#Setting-up-a-project)\n", + "3. [Noise Reduction Techniques](#Noise-Reduction-Techniques)\n", + " 1. [DeepFilterNet](#DeepFilterNet)\n", + " 2. 
[Spectral Gating](#SpectralGating)" + ] + }, + { + "cell_type": "markdown", + "id": "9af33629-965f-4f73-9e4a-89cc4c3dacf1", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "Noise reduction is a crucial signal processing technique used to enhance the quality of signals by minimizing unwanted or irrelevant noise. This technique finds applications in various fields such as audio processing, image processing, telecommunications, and more. The goal is to extract the useful information from a signal while suppressing undesirable background noise." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f9cd530d-36a7-47b1-96f8-498d338b3a1a", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "markdown", + "id": "c659289f-01f2-4e02-b843-b39cfc0c1d63", + "metadata": {}, + "source": [ + "## Setting up a project\n", + "\n", + "First of all we need to create a project with the `noise-reduction` function" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c4217272-85b8-4af7-afee-bc97c6c73bd9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 15:54:53,561 [info] Project loaded successfully: {'project_name': 'noise-reduction'}\n" + ] + } + ], + "source": [ + "# Creating a project\n", + "project = mlrun.get_or_create_project(\"noise-reduction\")\n", + "# Importing the function from hub\n", + "noise_reduction_function = project.set_function(\"hub://noise_reduction\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f7df4c3e-4e5b-47bd-a298-527d9c6fcb8f", + "metadata": {}, + "outputs": [], + "source": [ + "# Audio source can be either a single file or a directory of audio files\n", + "audio_source = \"data\"" + ] + }, + { + "cell_type": "markdown", + "id": "6c1c5109-6380-4364-b016-728523ed0ea1", + "metadata": {}, + "source": [ + "## Noise Reduction Techniques" + ] + }, + { + "attachments": { + 
"e48ce103-14f3-421d-82a4-823344895241.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "5c81ecee-851c-4ee8-ad3a-4d372a1bfd97", + "metadata": {}, + "source": [ + "\n", + "### 1. DeepFilterNet\n", + "![image.png](attachment:e48ce103-14f3-421d-82a4-823344895241.png)\n", + "\n", + "In order to use this technique, you simply need to use the `reduce_noise_dfn` handler.\n", + "\n", + "Reduce noise from audio files using DeepFilterNet. For more information about the noise reduction algorithm, see [DeepFilterNet GitHub](https://github.com/Rikorose/DeepFilterNet). Notice that the saved files are in wav format, even if the original files are in other formats.\n", + "\n", + "### Parameters:\n", + "\n", + "- `audio_source`: path to the audio file or directory of audio files\n", + "- `target_directory`: path to the target directory to save cleaned audio files\n", + "- `pad`: whether to pad the audio file with zeros before cleaning\n", + "- `atten_lim_db`: maximum attenuation in dB\n", + "- `silence_threshold`: the threshold to remove silence from the audio, in dB. If None, no silence removal is performed.\n", + "- `use_multiprocessing`: Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.\n", + "- `verbose`: verbosity level. If True, display progress bar and logs.\n", + "- `kwargs`: additional arguments to pass to `torchaudio.load()`. For more information, see [torchaudio.load()](https://pytorch.org/audio/stable/generated/torchaudio.load.html).\n", + "\n", + "\n", + "In the examples below, the function is running locally, for running remotely, it is required to build the function's image first (need to execute only once):\n", + "```python\n", + "noise_reduction_function.apply(mlrun.auto_mount()) # required for local files\n", + "project.build_function(\"noise-reduction\")\n", + "```\n", + "\n", + "#### 1.1. 
Example" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "16113524-8597-48d4-8172-76b897fee3f2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 15:54:56,999 [info] Storing function: {'name': 'noise-reduce-reduce-noise-dfn', 'uid': '9732dac831784a6a8b53acab5ff83a08', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-03-04 15:55:07,525 [info] logging run results to: http://mlrun-api:8080\n", + "> 2024-03-04 15:55:07,702 [info] Reducing noise from audio files.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Noise-reduction: 0%| | 0/2 [00:00 2024-03-04 15:55:08,437 [info] Loading DeepFilterNet2 model.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`torchaudio.backend.common.AudioMetaData` has been moved to `torchaudio.AudioMetaData`. Please update the import path.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-03-04 15:55:08 | INFO | DF | Running on torch 2.1.2+cu121\n", + "2024-03-04 15:55:08 | INFO | DF | Running on host jupyter-yoni-d56767c87-678n2\n", + "> 2024-03-04 15:55:08,464 [info] Loading DeepFilterNet2 model.\n", + "2024-03-04 15:55:08 | INFO | DF | Running on torch 2.1.2+cu121\n", + "2024-03-04 15:55:08 | INFO | DF | Running on host jupyter-yoni-d56767c87-678n2\n", + "2024-03-04 15:55:08 | INFO | DF | Loading model settings of DeepFilterNet3\n", + "2024-03-04 15:55:08 | INFO | DF | Using DeepFilterNet3 model at /igz/.cache/DeepFilterNet/DeepFilterNet3\n", + "2024-03-04 15:55:08 | INFO | DF | Initializing model `deepfilternet3`\n", + "2024-03-04 15:55:08 | INFO | DF | Loading model settings of DeepFilterNet3\n", + "2024-03-04 15:55:08 | INFO | DF | Using DeepFilterNet3 model at /igz/.cache/DeepFilterNet/DeepFilterNet3\n", + "2024-03-04 15:55:08 | INFO | DF | Initializing model `deepfilternet3`\n", + "2024-03-04 15:55:08 | INFO | DF | Found checkpoint 
/igz/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120\n", + "2024-03-04 15:55:08 | INFO | DF | Found checkpoint /igz/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120\n", + "2024-03-04 15:55:08 | INFO | DF | Running on device cpu\n", + "2024-03-04 15:55:08 | INFO | DF | Running on device cpu\n", + "2024-03-04 15:55:08 | INFO | DF | Model loaded\n", + "2024-03-04 15:55:08 | INFO | DF | Model loaded\n", + "> 2024-03-04 15:55:08,635 [info] Reducing noise from test_data.mp3.\n", + "> 2024-03-04 15:55:08,636 [info] Reducing noise from test_data.wav.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-03-04 15:55:08\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDF\u001b[0m | \u001b[33m\u001b[1mAudio sampling rate does not match model sampling rate (16000, 48000). Resampling...\u001b[0m\n", + "\"sinc_interpolation\" resampling method name is being deprecated and replaced by \"sinc_interp_hann\" in the next release. The default behavior remains unchanged.\n", + "The MPEG_LAYER_III subtype is unknown to TorchAudio. As a result, the bits_per_sample attribute will be set to 0. If you are seeing this warning, please report by opening an issue on github (after checking for existing/closed ones). You may otherwise ignore this warning.\n", + "\u001b[32m2024-03-04 15:55:08\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDF\u001b[0m | \u001b[33m\u001b[1mAudio sampling rate does not match model sampling rate (16000, 48000). Resampling...\u001b[0m\n", + "\"sinc_interpolation\" resampling method name is being deprecated and replaced by \"sinc_interp_hann\" in the next release. 
The default behavior remains unchanged.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 15:55:16,701 [info] Saved cleaned audio file to clean_data/test_data.wav.\n", + "> 2024-03-04 15:55:16,706 [info] Saved cleaned audio file to clean_data/test_data_mp3.wav.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Noise-reduction: 100%|██████████| 2/2 [00:09<00:00, 4.51s/file]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 15:55:16,791 [info] Summarizing the results.\n", + "> 2024-03-04 15:55:16,792 [info] Done (2/2)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
noise-reduction0Mar 04 15:54:57completednoise-reduce-reduce-noise-dfn
v3io_user=yonis
kind=local
owner=yonis
host=jupyter-yoni-d56767c87-678n2
audio_source
target_directory=./clean_data
use_multiprocessing=2
silence_threshold=50
atten_lim_db=10
successes
errors
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 15:55:17,976 [info] Run execution finished: {'status': 'completed', 'name': 'noise-reduce-reduce-noise-dfn'}\n" + ] + } + ], + "source": [ + "dfn_run = noise_reduction_function.run(\n", + " handler=\"reduce_noise_dfn\",\n", + " inputs={\"audio_source\": audio_source},\n", + " params={\n", + " \"target_directory\": \"./clean_data\",\n", + " \"use_multiprocessing\": 2,\n", + " \"silence_threshold\": 50,\n", + " \"atten_lim_db\": 10,\n", + " },\n", + " returns=[\"successes: file\", \"errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a71ba944-1fc2-48be-b789-d57c59201939", + "metadata": {}, + "source": [ + "### Looking at the result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "19b04cf6-5a4d-4d74-b66e-193540a900a1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "test_data.mp3": "clean_data/test_data_mp3.wav", + "test_data.wav": "clean_data/test_data.wav" + }, + "text/plain": [ + "" + ] + }, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/json": {}, + "text/plain": [ + "" + ] + }, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "dfn_run.artifact(\"successes\").show()\n", + "dfn_run.artifact(\"errors\").show()" + ] + }, + { + "attachments": { + "68c16acf-c28e-4bb8-a453-abbebc0137ce.png": { + "image/png": "" + } + }, + 
"cell_type": "markdown", + "id": "4576b576-4ac0-433a-9d1d-f39225a6648d", + "metadata": {}, + "source": [ + "\n", + "### 2. Spectral Gating\n", + "![image.png](attachment:68c16acf-c28e-4bb8-a453-abbebc0137ce.png)\n", + "\n", + "In order to use this technique, you simply need to use the `reduce_noise` handler.\n", + "\n", + "Spectral gating selectively filters signal frequencies based on amplitude, offering targeted noise reduction or feature enhancement in signal processing applications.\n", + "\n", + "Reduce noise from an audio file or directory containing audio files. The audio files must be in .wav format. The cleaned audio files will be saved in the target directory. For information about the noise reduction algorithm, see [noisereduce GitHub](https://github.com/timsainb/noisereduce). Notice that the saved files are in .wav format, even if the original files are in another format.\n", + "\n", + "### Parameters:\n", + "\n", + "- `audio_source`: path to the audio file or directory containing audio files\n", + "- `target_directory`: path to the directory to save the cleaned audio files.\n", + "- `sample_rate`: Number of samples in one second in the audio file. Pass `None` to keep the original sample rate.\n", + "- `duration`: Duration of the audio file to clean in seconds. Pass `None` to keep the original duration.\n", + "- `channel`: Channel to clean. Pass the number of the channel to clean. To clean all channels, pass `None`.\n", + "- `silence_threshold`: The threshold to remove silence from the audio, in dB. If `None`, no silence removal is performed.\n", + "- `use_multiprocessing`: Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.\n", + "- `verbose`: Verbosity level. If True, display a progress bar.\n", + "\n", + "#### 2.1. 
Example" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f10a5ecd-bf90-4650-a42e-d3fbfff78e52", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 16:07:39,378 [info] Storing function: {'name': 'noise-reduce-reduce-noise', 'uid': '6e6d6f7c3f8243b995dc1bbcf66f7544', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-03-04 16:07:39,541 [info] Reducing noise from audio files.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Noise-reduction: 0%| | 0/2 [00:00 2024-03-04 16:07:39,565 [info] Reducing noise from test_data.mp3.\n", + "> 2024-03-04 16:07:39,566 [info] Reducing noise from test_data.wav.\n", + "> 2024-03-04 16:07:46,174 [info] Saved cleaned audio file to clean_data/test_data.wav.\n", + "> 2024-03-04 16:07:46,175 [info] Saved cleaned audio file to clean_data/test_data_mp3.wav.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Noise-reduction: 100%|██████████| 2/2 [00:06<00:00, 3.31s/file]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 16:07:46,211 [info] Summarizing the results.\n", + "> 2024-03-04 16:07:46,212 [info] Done (2/2)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
noise-reduction0Mar 04 16:07:39completednoise-reduce-reduce-noise
v3io_user=yonis
kind=local
owner=yonis
host=jupyter-yoni-d56767c87-678n2
audio_source
target_directory=./clean_data
use_multiprocessing=2
silence_threshold=50
successes
errors
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-03-04 16:07:46,389 [info] Run execution finished: {'status': 'completed', 'name': 'noise-reduce-reduce-noise'}\n" + ] + } + ], + "source": [ + "noise_reduction_run = noise_reduction_function.run(\n", + " handler=\"reduce_noise\",\n", + " inputs={\"audio_source\": audio_source},\n", + " params={\n", + " \"target_directory\": \"./clean_data\",\n", + " \"use_multiprocessing\": 2,\n", + " \"silence_threshold\": 50,\n", + " },\n", + " local=True,\n", + " returns=[\"successes: file\", \"errors: file\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "699615d7-bba1-4147-ad3d-d295d794f866", + "metadata": {}, + "source": [ + "### Looking at the result" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "47c4f66a-d5d0-47e5-9842-abbe6653526b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "test_data.mp3": "clean_data/test_data_mp3.wav", + "test_data.wav": "clean_data/test_data.wav" + }, + "text/plain": [ + "" + ] + }, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "display_data" + }, + { + "data": { + "application/json": {}, + "text/plain": [ + "" + ] + }, + "metadata": { + "application/json": { + "expanded": false, + "root": "root" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "dfn_run.artifact(\"successes\").show()\n", + "dfn_run.artifact(\"errors\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "6eeae1bb-c714-491b-91dd-f22148cd0970", + "metadata": {}, + "source": [ + "The output 
of this function is the same as the first one." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/noise_reduction/1.1.0/src/noise_reduction.py b/functions/master/noise_reduction/1.1.0/src/noise_reduction.py new file mode 100644 index 00000000..f0fff550 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/src/noise_reduction.py @@ -0,0 +1,625 @@ +import logging +from abc import ABCMeta, abstractmethod +from multiprocessing import Process, Queue +from pathlib import Path +from typing import List, Tuple, Type, Union + +import librosa +import numpy as np +import torch +from scipy.io import wavfile +from tqdm import tqdm + +#: The value to send into multiprocessing queues to stop the process: +_MULTIPROCESSING_STOP_MARK = "STOP" + +# Get the global logger: +try: + import mlrun + + _LOGGER = mlrun.get_or_create_ctx("noise_reduce").logger +except ModuleNotFoundError: + _LOGGER = logging.getLogger() + + +class ReduceNoiseBase(metaclass=ABCMeta): + """ + Base class for noise reduction. + This class is aimed to be inherited by specific noise reduction algorithms. + You must implement the following methods: + - clean_audio: The method to clean the audio, where the noise reduction algorithm is implemented. + - save_audio: The method to save the audio to a file. + - load_audio: The method to load the audio from a file. + + After implementing the above methods, you can use the reduce_noise method to reduce noise from audio files. 
+ """ + def __init__( + self, + target_directory: Path, + verbose: bool = True, + silence_threshold: float = None, + ): + self.target_directory = Path(target_directory) + self.verbose = verbose + self.silence_threshold = silence_threshold + + def reduce_noise(self, audio_file: Path) -> Tuple[bool, Tuple[str, str]]: + """ + Reduce noise from the given audio file. + + :param audio_file: The audio file to reduce noise from. + + :returns: A tuple of: + - a boolean indicating whether an error occurred + - a tuple of: + - audio file name + - target path in case of success / error message in case of failure. + """ + try: + if self.verbose: + _LOGGER.info(f"Reducing noise from {audio_file.name}.") + + # Load audio data: + audio = self.load_audio(file=str(audio_file)) + + # Perform noise reduction: + reduced_noise = self.clean_audio(data=audio) + + # Remove silence from the audio if necessary: + reduced_noise = self.remove_silence(audio=reduced_noise) + + # Prepare target path: + target_path = self.update_to_wav_suffix(audio_file=audio_file) + + # Save file: + self.save_audio( + audio=reduced_noise, + target_path=target_path, + ) + + if self.verbose: + _LOGGER.info(f"Saved cleaned audio file to {target_path}.") + + return False, (audio_file.name, str(target_path)) + except Exception as exception: + if self.verbose: + _LOGGER.error(f"Failed to reduce noise from {audio_file.name}.") + _LOGGER.error(f"Error: {exception}") + # Collect the error: + return True, (audio_file.name, str(exception)) + + @abstractmethod + def clean_audio(self, data) -> Union[np.ndarray, torch.Tensor]: + """ + Clean the audio from noise. Here you should implement the noise reduction algorithm. + + :param data: The audio data to clean. + + :returns: The cleaned audio. + """ + pass + + @abstractmethod + def save_audio(self, audio: np.ndarray, target_path: Path): + """ + Save the audio to a file. + + :param audio: The audio to save. + :param target_path: The target path to save the audio to. 
+ """ + pass + + @abstractmethod + def load_audio(self, file: str) -> Tuple[Union[np.ndarray, torch.Tensor], int]: + """ + Load the audio from a file. + + :param file: The file to load the audio from. + + :returns: A tuple of: + - the audio data + - the sample rate + """ + pass + + def update_to_wav_suffix(self, audio_file: Path): + target_path = self.target_directory / audio_file.name + if target_path.suffix != ".wav": + old_suffix = target_path.suffix[1:] + target_path = target_path.with_stem(target_path.stem + f"_{old_suffix}") + return target_path.with_suffix(".wav") + else: + return target_path + + def remove_silence( + self, + audio: np.ndarray, + ): + """ + Remove silence sections from the audio. + + :param audio: The audio to remove silence from. + + :returns: The audio without silence. + """ + if self.silence_threshold is None: + return audio + + # Get the indices of the non-silent frames: + non_silent_indices = librosa.effects.split( + y=audio, + top_db=self.silence_threshold, + frame_length=2048, + hop_length=256, + ) + + # Get the non-silent audio: + non_silent_audio = np.concatenate( + [audio[:, start:end] for start, end in non_silent_indices], axis=1 + ) + + return non_silent_audio + + +class ReduceNoise(ReduceNoiseBase): + def __init__( + self, + target_directory: Path, + verbose: bool = True, + silence_threshold: float = None, + sample_rate: int = 16000, + duration: int = None, + channel: int = None, + ): + super().__init__(target_directory, verbose, silence_threshold) + self.sample_rate = sample_rate + self.duration = duration + self.channel = channel + + def save_audio(self, audio: np.ndarray, target_path: Path): + # If the audio has more than one channel, transpose it in order to save it: + if len(audio) > 1: + audio = audio.T + + wavfile.write( + filename=target_path, + rate=self.sample_rate, + data=audio, + ) + + def load_audio(self, file: str) -> np.ndarray: + data, sr = librosa.load( + path=file, + sr=self.sample_rate, + mono=False, # keep 
channels separate + duration=self.duration, + ) + # set sample rate: + self.sample_rate = int(sr) + + # convert to int with scaling for 16-bit integer + data *= 32767 / np.max(np.abs(data)) # re-scaling + data = data.astype(np.int16) # change data type + + # select channel + data_to_reduce = data[self.channel] if self.channel is not None else data + return data_to_reduce + + def clean_audio(self, data: np.ndarray) -> np.ndarray: + try: + import noisereduce + except ImportError as e: + raise ImportError("Please install noisereduce package") from e + + reduced_noise = noisereduce.reduce_noise(y=data, sr=self.sample_rate) + + # add channel back after noise reduction + if self.channel is not None: + # putting the channel back in the data + data[self.channel] = reduced_noise + # updating the data to save + reduced_noise = data + + return reduced_noise + + +class DFN(ReduceNoiseBase): + def __init__( + self, + target_directory: Path, + verbose: bool = True, + silence_threshold: float = None, + pad: bool = True, + atten_lim_db: int = None, + **kwargs, + ): + super().__init__(target_directory, verbose, silence_threshold) + self.pad = pad + self.atten_lim_db = atten_lim_db + self.kwargs = kwargs + + # import required packages + try: + from df.enhance import init_df + except ImportError as e: + raise ImportError("Please install deepfilternet packages") from e + + if self.verbose: + _LOGGER.info("Loading DeepFilterNet2 model.") + + # Load the model: + model, df_state, _ = init_df() + self.model = model + self.df_state = df_state + self.sample_rate = self.df_state.sr() + + def save_audio(self, audio: np.ndarray, target_path: Path): + try: + from df.enhance import save_audio + except ImportError as e: + raise ImportError("Please install deepfilternet package") from e + save_audio( + file=target_path.name, + audio=audio, + sr=self.sample_rate, + output_dir=str(self.target_directory), + ) + + def load_audio(self, file: str) -> torch.Tensor: + try: + from df.enhance import 
load_audio + except ImportError as e: + raise ImportError("Please install deepfilternet package") from e + audio, _ = load_audio(file=file, sr=self.sample_rate, **self.kwargs) + return audio + + def clean_audio(self, data: torch.Tensor) -> torch.Tensor: + try: + from df.enhance import enhance + except ImportError as e: + raise ImportError("Please install deepfilternet package") from e + return enhance( + model=self.model, + df_state=self.df_state, + audio=data, + pad=self.pad, + atten_lim_db=self.atten_lim_db, + ) + + +def _multiprocessing_complete_tasks( + noise_reduce_type: Type[ReduceNoiseBase], + noise_reduce_arguments: dict, + tasks_queue: Queue, + results_queue: Queue, +): + """ + Complete the tasks in the given queue and put the results in the given results queue. The function will stop when + the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param noise_reduce_type: The noise reduce type to use. + :param noise_reduce_arguments: The noisereduce initialization kwargs. + :param tasks_queue: A queue to get the tasks from. + :param results_queue: A queue to put the results in. + """ + # Initialize the reduce noise object + noise_reducer = noise_reduce_type(**noise_reduce_arguments) + + # Start listening to the tasks queue: + while True: + # Get the audio_file: + audio_file = tasks_queue.get() + if audio_file == _MULTIPROCESSING_STOP_MARK: + break + audio_file = Path(audio_file) + # Apply noise reduction and collect the result: + results_queue.put(noise_reducer.reduce_noise(audio_file=audio_file)) + + # Mark the end of the tasks: + results_queue.put(_MULTIPROCESSING_STOP_MARK) + + +def reduce_noise_dfn( + audio_source: str, + target_directory: str, + pad: bool = True, + atten_lim_db: int = None, + silence_threshold: float = None, + use_multiprocessing: int = 0, + verbose: bool = True, + **kwargs, +): + """ + Reduce noise from audio files using DeepFilterNet. 
+ For more information about the noise reduction algorithm see: + https://github.com/Rikorose/DeepFilterNet + Notice that the saved files are in wav format, even if the original files are in other format. + + :param audio_source: path to audio file or directory of audio files + :param target_directory: path to target directory to save cleaned audio files + :param pad: whether to pad the audio file with zeros before cleaning + :param atten_lim_db: maximum attenuation in dB + :param silence_threshold: the threshold to remove silence from the audio, in dB. If None, no silence removal is + performed. + :param use_multiprocessing: Number of processes to use for cleaning the audio files. + If 0, no multiprocessing is used. + :param verbose: verbosity level. If True, display progress bar and logs. + :param kwargs: additional arguments to pass to torchaudio.load(). For more information see: + https://pytorch.org/audio/stable/generated/torchaudio.load.html + """ + if verbose: + _LOGGER.info("Reducing noise from audio files.") + + # create target directory: + target_directory = _create_target_directory(target_directory) + + # get audio files: + audio_files = _get_audio_files(audio_source) + + noise_reduce_arguments = { + "target_directory": target_directory, + "pad": pad, + "atten_lim_db": atten_lim_db, + "silence_threshold": silence_threshold, + **kwargs, + } + + if use_multiprocessing: + results = _parallel_run( + noise_reduce_type=DFN, + noise_reduce_arguments=noise_reduce_arguments, + n_workers=use_multiprocessing, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + else: + results = _run( + noise_reduce_type=DFN, + noise_reduce_arguments=noise_reduce_arguments, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + + return _process_results(results, verbose) + + +def reduce_noise( + audio_source: str, + target_directory: str, + sample_rate: int = 16000, + duration: int = None, + channel: int = None, + 
silence_threshold: float = None, + use_multiprocessing: int = 0, + verbose: bool = True, +): + """ + Reduce noise from audio file or directory containing audio files. + The audio files must be in .wav format. + The cleaned audio files will be saved in the target_directory. + For information about the noise reduction algorithm see: + https://github.com/timsainb/noisereduce + Notice that the saved files are in wav format, even if the original files are in other format. + + :param audio_source: path to audio file or directory containing audio files + :param target_directory: path to directory to save the cleaned audio files. + :param sample_rate: Number of samples in one second in the audio file. + Pass `None` to keep the original sample rate. + :param duration: Duration of the audio file to clean in seconds. + Pass `None` to keep the original duration. + :param channel: Channel to clean. Pass the number of the channel to clean. + To clean all channels pass None. + :param silence_threshold: The threshold to remove silence from the audio, in dB. + If None, no silence removal is performed. + :param use_multiprocessing: Number of processes to use for cleaning the audio files. + If 0, no multiprocessing is used. + :param verbose: Verbosity level. If True, display progress bar. 
+ """ + if verbose: + _LOGGER.info("Reducing noise from audio files.") + + # create target directory: + target_directory = _create_target_directory(target_directory) + + # get audio files: + audio_files = _get_audio_files(audio_source) + + # Create the reduce noise object: + noise_reduce_arguments = { + "target_directory": target_directory, + "sample_rate": sample_rate, + "duration": duration, + "channel": channel, + "silence_threshold": silence_threshold, + } + + if use_multiprocessing: + results = _parallel_run( + noise_reduce_type=ReduceNoise, + noise_reduce_arguments=noise_reduce_arguments, + n_workers=use_multiprocessing, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + else: + results = _run( + noise_reduce_type=ReduceNoise, + noise_reduce_arguments=noise_reduce_arguments, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + + return _process_results(results, verbose) + + +def _create_target_directory(target_directory: str) -> str: + target_directory = Path(target_directory) + if not target_directory.exists(): + target_directory.mkdir(parents=True, exist_ok=True) + return str(target_directory) + + +def _get_audio_files(audio_source: str): + audio_source = Path(audio_source) + audio_files = [] + if audio_source.is_dir(): + audio_files = list(audio_source.glob("*.*")) + elif audio_source.is_file(): + audio_files.append(audio_source) + else: + raise ValueError( + f"audio_source must be a file or a directory, got {audio_source}" + ) + return audio_files + + +def _parallel_run( + noise_reduce_type: Type[ReduceNoiseBase], + noise_reduce_arguments: dict, + n_workers: int, + audio_files: List[Path], + description: str, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run multiple noise reduce workers with multiprocessing to complete the tasks that will be created on the provided + files using the given task creator. + + :param noise_reduce_type: The noise reduce type to use. 
+ :param n_workers: The number of workers to use. + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Check the number of workers: + if n_workers > len(audio_files): + _LOGGER.warning( + f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). " + f"Setting the number of workers to {len(audio_files)}." + ) + n_workers = len(audio_files) + + # Initialize the multiprocessing queues: + tasks_queue = Queue() + results_queue = Queue() + + # Initialize the multiprocessing processes: + task_completion_processes = [ + Process( + target=_multiprocessing_complete_tasks, + kwargs={ + "noise_reduce_type": noise_reduce_type, + "noise_reduce_arguments": noise_reduce_arguments, + "tasks_queue": tasks_queue, + "results_queue": results_queue, + }, + ) + for _ in range(n_workers) + ] + + # Start the multiprocessing processes: + for p in task_completion_processes: + p.start() + + # Put the tasks in the queue: + for audio_file in audio_files: + # tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple()) + tasks_queue.put(audio_file) + + # Put the stop marks in the queue: + for _ in range(n_workers): + tasks_queue.put(_MULTIPROCESSING_STOP_MARK) + + # Collect the results: + results = [] + stop_marks_counter = 0 + with tqdm( + desc=description, + unit="file", + total=len(audio_files), + disable=not verbose, + ) as progressbar: + while True: + # Get a result from the queue: + result: Tuple[bool, Tuple[str, str]] = results_queue.get() + if result == _MULTIPROCESSING_STOP_MARK: + stop_marks_counter += 1 + if stop_marks_counter == n_workers: + break + else: + # Collect the result: + results.append(result) + progressbar.update(1) + + # Wait for the processes to finish: + for p in task_completion_processes: + p.join() + + return results + + +def _run( + noise_reduce_type: Type[ReduceNoiseBase], + 
noise_reduce_arguments: dict, + audio_files: List[Path], + description: str, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run the noise reduce algorithm on the given audio files and collect the results. + + :param noise_reduce_type: The noise reduce type to use. + :param noise_reduce_arguments: The noisereduce initialization kwargs. + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Create the reduce noise object: + noise_reducer = noise_reduce_type(**noise_reduce_arguments) + + # Run the noise reduce algorithm on the audio files and collect the results: + results = [] + for audio_file in tqdm( + audio_files, + desc=description, + unit="file", + total=len(audio_files), + disable=not verbose, + ): + results.append(noise_reducer.reduce_noise(audio_file=audio_file)) + + return results + + +def _process_results( + results: List[Tuple[bool, Tuple[str, str]]], verbose: bool +) -> Tuple[dict, dict]: + """ + Process the results of the tasks. + + :param results: The results to process. + :param verbose: Verbosity. + + :returns: The processed results as a tuple of successes and errors. 
+ """ + if verbose: + _LOGGER.info("Summarizing the results.") + successes = {} + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes[result[0]] = result[1] + if verbose: + _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n") + + return successes, errors diff --git a/functions/master/noise_reduction/1.1.0/src/requirements.txt b/functions/master/noise_reduction/1.1.0/src/requirements.txt new file mode 100644 index 00000000..30934ad7 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/src/requirements.txt @@ -0,0 +1,5 @@ +tqdm +deepfilternet +librosa +noisereduce +torchaudio>=2.1.2 \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/src/test_noise_reduction.py b/functions/master/noise_reduction/1.1.0/src/test_noise_reduction.py new file mode 100644 index 00000000..a7737756 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/src/test_noise_reduction.py @@ -0,0 +1,75 @@ +import tempfile + +import mlrun +import pytest + + +@pytest.mark.parametrize( + "audio_source", + [ + "data/test_data.wav", + "data/test_data.mp3", + "data", + ], +) +def test_reduce_noise(audio_source): + # set up the project and function + artifact_path = tempfile.TemporaryDirectory().name + project = mlrun.new_project("noise-reduction") + noise_reduction_function = project.set_function( + func="function.yaml", + name="reduce_noise", + kind="job", + image="mlrun/mlrun", + ) + + # run the function + noise_reduction_run = noise_reduction_function.run( + handler="reduce_noise", + inputs={"audio_source": audio_source}, + params={ + "target_directory": artifact_path + "/data", + "sample_rate": None, + }, + local=True, + artifact_path=artifact_path, + returns=["successes: file", "errors: file"], + ) + + assert noise_reduction_run.outputs["successes"] + + +@pytest.mark.parametrize( + "audio_source", + [ + "data/test_data.wav", + "data/test_data.mp3", + "data", + ], +) +def 
test_reduce_noise_dfn(audio_source): + # set up the project and function + artifact_path = tempfile.TemporaryDirectory().name + project = mlrun.new_project("noise-reduction") + noise_reduction_function = project.set_function( + func="function.yaml", + name="reduce_noise", + kind="job", + image="mlrun/mlrun", + ) + + # run the function + noise_reduction_run = noise_reduction_function.run( + handler="reduce_noise_dfn", + inputs={"audio_source": audio_source}, + params={ + "target_directory": artifact_path + "/data", + "atten_lim_db": 50, + }, + local=True, + artifact_path=artifact_path, + returns=["successes: file", "errors: file"], + ) + + # assert that the function run completed successfully + assert noise_reduction_run.outputs["successes"] diff --git a/functions/master/noise_reduction/1.1.0/static/documentation.html b/functions/master/noise_reduction/1.1.0/static/documentation.html new file mode 100644 index 00000000..a772e518 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/static/documentation.html @@ -0,0 +1,518 @@ + + + + + + + +noise_reduction package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

noise_reduction package#

+
+

Submodules#

+
+
+

noise_reduction.noise_reduction module#

+
+
+class noise_reduction.noise_reduction.DFN(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None, pad: bool = True, atten_lim_db: int | None = None, **kwargs)[source]#
+

Bases: ReduceNoiseBase

+
+
+clean_audio(data: torch.Tensor) torch.Tensor[source]#
+

Clean the audio from noise. Here you should implement the noise reduction algorithm.

+
+
Parameters:
+

data – The audio data to clean.

+
+
Returns:
+

The cleaned audio.

+
+
+
+
+
+load_audio(file: str) torch.Tensor[source]#
+

Load the audio from a file.

+
+
Parameters:
+

file – The file to load the audio from.

+
+
Returns:
+

The loaded audio data as a torch.Tensor (the sample rate is not returned).

+
+
+
+
+
+save_audio(audio: ndarray, target_path: Path)[source]#
+

Save the audio to a file.

+
+
Parameters:
+
    +
  • audio – The audio to save.

  • +
  • target_path – The target path to save the audio to.

  • +
+
+
+
+
+
+
+class noise_reduction.noise_reduction.ReduceNoise(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None, sample_rate: int = 16000, duration: int | None = None, channel: int | None = None)[source]#
+

Bases: ReduceNoiseBase

+
+
+clean_audio(data: ndarray) ndarray[source]#
+

Clean the audio from noise. Here you should implement the noise reduction algorithm.

+
+
Parameters:
+

data – The audio data to clean.

+
+
Returns:
+

The cleaned audio.

+
+
+
+
+
+load_audio(file: str) ndarray[source]#
+

Load the audio from a file.

+
+
Parameters:
+

file – The file to load the audio from.

+
+
Returns:
+

The loaded audio data as an ndarray (the sample rate is stored on the instance, not returned).

+
+
+
+
+
+save_audio(audio: ndarray, target_path: Path)[source]#
+

Save the audio to a file.

+
+
Parameters:
+
    +
  • audio – The audio to save.

  • +
  • target_path – The target path to save the audio to.

  • +
+
+
+
+
+
+
+class noise_reduction.noise_reduction.ReduceNoiseBase(target_directory: Path, verbose: bool = True, silence_threshold: float | None = None)[source]#
+

Bases: object

+

Base class for noise reduction. +This class is intended to be subclassed by specific noise reduction algorithms. +You must implement the following methods: +- clean_audio: The method to clean the audio, where the noise reduction algorithm is implemented. +- save_audio: The method to save the audio to a file. +- load_audio: The method to load the audio from a file.

+

After implementing the above methods, you can use the reduce_noise method to reduce noise from audio files.

+
+
+abstract clean_audio(data) ndarray | torch.Tensor[source]#
+

Clean the audio from noise. Here you should implement the noise reduction algorithm.

+
+
Parameters:
+

data – The audio data to clean.

+
+
Returns:
+

The cleaned audio.

+
+
+
+
+
+abstract load_audio(file: str) Tuple[ndarray | torch.Tensor, int][source]#
+

Load the audio from a file.

+
+
Parameters:
+

file – The file to load the audio from.

+
+
Returns:
+

A tuple of: +- the audio data +- the sample rate

+
+
+
+
+
+reduce_noise(audio_file: Path) Tuple[bool, Tuple[str, str]][source]#
+

Reduce noise from the given audio file.

+
+
Parameters:
+

audio_file – The audio file to reduce noise from.

+
+
Returns:
+

A tuple of: +- a boolean indicating whether an error occurred +- a tuple of:

+
+
    +
  • audio file name

  • +
  • target path in case of success / error message in case of failure.

  • +
+
+

+
+
+
+
+
+remove_silence(audio: ndarray)[source]#
+

Remove silence sections from the audio.

+
+
Parameters:
+

audio – The audio to remove silence from.

+
+
Returns:
+

The audio without silence.

+
+
+
+
+
+abstract save_audio(audio: ndarray, target_path: Path)[source]#
+

Save the audio to a file.

+
+
Parameters:
+
    +
  • audio – The audio to save.

  • +
  • target_path – The target path to save the audio to.

  • +
+
+
+
+
+
+update_to_wav_suffix(audio_file: Path)[source]#
+
+
+
+
+noise_reduction.noise_reduction.reduce_noise(audio_source: str, target_directory: str, sample_rate: int = 16000, duration: int | None = None, channel: int | None = None, silence_threshold: float | None = None, use_multiprocessing: int = 0, verbose: bool = True)[source]#
+

Reduce noise from an audio file or a directory containing audio files. +The input files may be in formats other than .wav (for example .mp3). +The cleaned audio files will be saved in the target_directory. +For information about the noise reduction algorithm see: +timsainb/noisereduce +Notice that the saved files are always in wav format, even if the original files are in another format.

+
+
Parameters:
+
    +
  • audio_source – path to audio file or directory containing audio files

  • +
  • target_directory – path to directory to save the cleaned audio files.

  • +
  • sample_rate – Number of samples in one second in the audio file. +Pass None to keep the original sample rate.

  • +
  • duration – Duration of the audio file to clean in seconds. +Pass None to keep the original duration.

  • +
  • channel – Channel to clean. Pass the number of the channel to clean. +To clean all channels pass None.

  • +
  • silence_threshold – The threshold to remove silence from the audio, in dB. +If None, no silence removal is performed.

  • +
  • use_multiprocessing – Number of processes to use for cleaning the audio files. +If 0, no multiprocessing is used.

  • +
  • verbose – Verbosity level. If True, display progress bar.

  • +
+
+
+
+
+
+noise_reduction.noise_reduction.reduce_noise_dfn(audio_source: str, target_directory: str, pad: bool = True, atten_lim_db: int | None = None, silence_threshold: float | None = None, use_multiprocessing: int = 0, verbose: bool = True, **kwargs)[source]#
+

Reduce noise from audio files using DeepFilterNet. +For more information about the noise reduction algorithm see: +Rikorose/DeepFilterNet +Notice that the saved files are in wav format, even if the original files are in other format.

+
+
Parameters:
+
    +
  • audio_source – path to audio file or directory of audio files

  • +
  • target_directory – path to target directory to save cleaned audio files

  • +
  • pad – whether to pad the audio file with zeros before cleaning

  • +
  • atten_lim_db – maximum attenuation in dB

  • +
  • silence_threshold – the threshold to remove silence from the audio, in dB. If None, no silence removal is +performed.

  • +
  • use_multiprocessing – Number of processes to use for cleaning the audio files. +If 0, no multiprocessing is used.

  • +
  • verbose – verbosity level. If True, display progress bar and logs.

  • +
  • kwargs – additional arguments to pass to torchaudio.load(). For more information see: +https://pytorch.org/audio/stable/generated/torchaudio.load.html

  • +
+
+
+
+
+
+

Module contents#

+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/static/example.html b/functions/master/noise_reduction/1.1.0/static/example.html new file mode 100644 index 00000000..7ad47bf9 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/static/example.html @@ -0,0 +1,906 @@ + + + + + + + +Noise Reduction + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+
+ + +
+
+

Noise Reduction#

+
+

Table of Contents#

+
    +
  1. Introduction

  2. +
  3. Project Setup

  4. +
  5. Noise Reduction Techniques

    +
      +
    1. DeepFilterNet

    2. +
    3. Spectral Gating

    4. +
    +
  6. +
+
+
+

Introduction#

+

Noise reduction is a crucial signal processing technique used to enhance the quality of signals by minimizing unwanted or irrelevant noise. This technique finds applications in various fields such as audio processing, image processing, telecommunications, and more. The goal is to extract the useful information from a signal while suppressing undesirable background noise.

+
+
+
import mlrun
+
+
+
+
+
+
+

Setting up a project#

+

First of all, we need to create a project with the noise-reduction function.

+
+
+
# Creating a project
+project = mlrun.get_or_create_project("noise-reduction")
+# Importing the function from hub
+noise_reduction_function = project.set_function("hub://noise_reduction")
+
+
+
+
+
> 2024-03-04 15:54:53,561 [info] Project loaded successfully: {'project_name': 'noise-reduction'}
+
+
+
+
+
+
+
# Audio source can be either a single file or a directory of audio files
+audio_source = "data"
+
+
+
+
+
+
+

Noise Reduction Techniques#

+

+
+

1. DeepFilterNet#

+

image.png

+

In order to use this technique, you simply need to use the reduce_noise_dfn handler.

+

Reduce noise from audio files using DeepFilterNet. For more information about the noise reduction algorithm, see DeepFilterNet GitHub. Notice that the saved files are in wav format, even if the original files are in other formats.

+
+
+

Parameters:#

+
    +
  • audio_source: path to the audio file or directory of audio files

  • +
  • target_directory: path to the target directory to save cleaned audio files

  • +
  • pad: whether to pad the audio file with zeros before cleaning

  • +
  • atten_lim_db: maximum attenuation in dB

  • +
  • silence_threshold: the threshold to remove silence from the audio, in dB. If None, no silence removal is performed.

  • +
  • use_multiprocessing: Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.

  • +
  • verbose: verbosity level. If True, display progress bar and logs.

  • +
  • kwargs: additional arguments to pass to torchaudio.load(). For more information, see torchaudio.load().

  • +
+

In the examples below, the function runs locally. To run it remotely, you must first build the function’s image (this needs to be done only once):

+
noise_reduction_function.apply(mlrun.auto_mount()) # required for local files
+project.build_function("noise-reduction")
+
+
+
+

1.1. Example#

+
+
+
dfn_run = noise_reduction_function.run(
+    handler="reduce_noise_dfn",
+    inputs={"audio_source": audio_source},
+    params={
+        "target_directory": "./clean_data",
+        "use_multiprocessing": 2,
+        "silence_threshold": 50,
+        "atten_lim_db": 10,
+    },
+    returns=["successes: file", "errors: file"],
+    local=True,
+)
+
+
+
+
+
> 2024-03-04 15:54:56,999 [info] Storing function: {'name': 'noise-reduce-reduce-noise-dfn', 'uid': '9732dac831784a6a8b53acab5ff83a08', 'db': 'http://mlrun-api:8080'}
+> 2024-03-04 15:55:07,525 [info] logging run results to: http://mlrun-api:8080
+> 2024-03-04 15:55:07,702 [info] Reducing noise from audio files.
+
+
+
Noise-reduction:   0%|          | 0/2 [00:00<?, ?file/s]`torchaudio.backend.common.AudioMetaData` has been moved to `torchaudio.AudioMetaData`. Please update the import path.
+
+
+
> 2024-03-04 15:55:08,437 [info] Loading DeepFilterNet2 model.
+
+
+
`torchaudio.backend.common.AudioMetaData` has been moved to `torchaudio.AudioMetaData`. Please update the import path.
+
+
+
2024-03-04 15:55:08 | INFO     | DF | Running on torch 2.1.2+cu121
+2024-03-04 15:55:08 | INFO     | DF | Running on host jupyter-yoni-d56767c87-678n2
+> 2024-03-04 15:55:08,464 [info] Loading DeepFilterNet2 model.
+2024-03-04 15:55:08 | INFO     | DF | Running on torch 2.1.2+cu121
+2024-03-04 15:55:08 | INFO     | DF | Running on host jupyter-yoni-d56767c87-678n2
+2024-03-04 15:55:08 | INFO     | DF | Loading model settings of DeepFilterNet3
+2024-03-04 15:55:08 | INFO     | DF | Using DeepFilterNet3 model at /igz/.cache/DeepFilterNet/DeepFilterNet3
+2024-03-04 15:55:08 | INFO     | DF | Initializing model `deepfilternet3`
+2024-03-04 15:55:08 | INFO     | DF | Loading model settings of DeepFilterNet3
+2024-03-04 15:55:08 | INFO     | DF | Using DeepFilterNet3 model at /igz/.cache/DeepFilterNet/DeepFilterNet3
+2024-03-04 15:55:08 | INFO     | DF | Initializing model `deepfilternet3`
+2024-03-04 15:55:08 | INFO     | DF | Found checkpoint /igz/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120
+2024-03-04 15:55:08 | INFO     | DF | Found checkpoint /igz/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120
+2024-03-04 15:55:08 | INFO     | DF | Running on device cpu
+2024-03-04 15:55:08 | INFO     | DF | Running on device cpu
+2024-03-04 15:55:08 | INFO     | DF | Model loaded
+2024-03-04 15:55:08 | INFO     | DF | Model loaded
+> 2024-03-04 15:55:08,635 [info] Reducing noise from test_data.mp3.
+> 2024-03-04 15:55:08,636 [info] Reducing noise from test_data.wav.
+
+
+
2024-03-04 15:55:08 | WARNING  | DF | Audio sampling rate does not match model sampling rate (16000, 48000). Resampling...
+"sinc_interpolation" resampling method name is being deprecated and replaced by "sinc_interp_hann" in the next release. The default behavior remains unchanged.
+The MPEG_LAYER_III subtype is unknown to TorchAudio. As a result, the bits_per_sample attribute will be set to 0. If you are seeing this warning, please report by opening an issue on github (after checking for existing/closed ones). You may otherwise ignore this warning.
+2024-03-04 15:55:08 | WARNING  | DF | Audio sampling rate does not match model sampling rate (16000, 48000). Resampling...
+"sinc_interpolation" resampling method name is being deprecated and replaced by "sinc_interp_hann" in the next release. The default behavior remains unchanged.
+
+
+
> 2024-03-04 15:55:16,701 [info] Saved cleaned audio file to clean_data/test_data.wav.
+> 2024-03-04 15:55:16,706 [info] Saved cleaned audio file to clean_data/test_data_mp3.wav.
+
+
+
Noise-reduction: 100%|██████████| 2/2 [00:09<00:00,  4.51s/file]
+
+
+
> 2024-03-04 15:55:16,791 [info] Summarizing the results.
+> 2024-03-04 15:55:16,792 [info] Done (2/2)
+
+
+

+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
noise-reduction0Mar 04 15:54:57completednoise-reduce-reduce-noise-dfn
v3io_user=yonis
kind=local
owner=yonis
host=jupyter-yoni-d56767c87-678n2
audio_source
target_directory=./clean_data
use_multiprocessing=2
silence_threshold=50
atten_lim_db=10
successes
errors
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-04 15:55:17,976 [info] Run execution finished: {'status': 'completed', 'name': 'noise-reduce-reduce-noise-dfn'}
+
+
+
+
+
+
+
+

Looking at the result#

+
+
+
dfn_run.artifact("successes").show()
+dfn_run.artifact("errors").show()
+
+
+
+
+
<IPython.core.display.JSON object>
+
+
+
<IPython.core.display.JSON object>
+
+
+
+
+

+
+
+

2. Spectral Gating#

+

image.png

+

In order to use this technique, you simply need to use the reduce_noise handler.

+

Spectral gating selectively filters signal frequencies based on amplitude, offering targeted noise reduction or feature enhancement in signal processing applications.

+

Reduce noise from an audio file or directory containing audio files. The input files do not have to be in .wav format (the example below cleans both a .wav and an .mp3 file). The cleaned audio files will be saved in the target directory. For information about the noise reduction algorithm, see noisereduce GitHub. Notice that the saved files are in .wav format, even if the original files are in another format.

+
+
+

Parameters:#

+
    +
  • audio_source: path to the audio file or directory containing audio files

  • +
  • target_directory: path to the directory to save the cleaned audio files.

  • +
  • sample_rate: Number of samples in one second in the audio file. Pass None to keep the original sample rate.

  • +
  • duration: Duration of the audio file to clean in seconds. Pass None to keep the original duration.

  • +
  • channel: Channel to clean. Pass the number of the channel to clean. To clean all channels, pass None.

  • +
  • silence_threshold: The threshold to remove silence from the audio, in dB. If None, no silence removal is performed.

  • +
  • use_multiprocessing: Number of processes to use for cleaning the audio files. If 0, no multiprocessing is used.

  • +
  • verbose: Verbosity level. If True, display a progress bar.

  • +
+
+

2.1. Example#

+
+
+
noise_reduction_run = noise_reduction_function.run(
+    handler="reduce_noise",
+    inputs={"audio_source": audio_source},
+    params={
+        "target_directory": "./clean_data",
+        "use_multiprocessing": 2,
+        "silence_threshold": 50,
+    },
+    local=True,
+    returns=["successes: file", "errors: file"],
+)
+
+
+
+
+
> 2024-03-04 16:07:39,378 [info] Storing function: {'name': 'noise-reduce-reduce-noise', 'uid': '6e6d6f7c3f8243b995dc1bbcf66f7544', 'db': 'http://mlrun-api:8080'}
+> 2024-03-04 16:07:39,541 [info] Reducing noise from audio files.
+
+
+
Noise-reduction:   0%|          | 0/2 [00:00<?, ?file/s]
+
+
+
> 2024-03-04 16:07:39,565 [info] Reducing noise from test_data.mp3.
+> 2024-03-04 16:07:39,566 [info] Reducing noise from test_data.wav.
+> 2024-03-04 16:07:46,174 [info] Saved cleaned audio file to clean_data/test_data.wav.
+> 2024-03-04 16:07:46,175 [info] Saved cleaned audio file to clean_data/test_data_mp3.wav.
+
+
+
Noise-reduction: 100%|██████████| 2/2 [00:06<00:00,  3.31s/file]
+
+
+
> 2024-03-04 16:07:46,211 [info] Summarizing the results.
+> 2024-03-04 16:07:46,212 [info] Done (2/2)
+
+
+

+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
noise-reduction0Mar 04 16:07:39completednoise-reduce-reduce-noise
v3io_user=yonis
kind=local
owner=yonis
host=jupyter-yoni-d56767c87-678n2
audio_source
target_directory=./clean_data
use_multiprocessing=2
silence_threshold=50
successes
errors
+
+ +
+

+
+
+
> to track results use the .show() or .logs() methods or click here to open in UI
> 2024-03-04 16:07:46,389 [info] Run execution finished: {'status': 'completed', 'name': 'noise-reduce-reduce-noise'}
+
+
+
+
+
+
+
+

Looking at the result#

+
+
+
noise_reduction_run.artifact("successes").show()
+noise_reduction_run.artifact("errors").show()
+
+
+
+
+
<IPython.core.display.JSON object>
+
+
+
<IPython.core.display.JSON object>
+
+
+
+
+

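The output of this function has the same structure as that of the first one: a successes artifact that maps each audio file name to the path of its cleaned file, and an errors artifact that maps each failed audio file name to its error message.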

+
+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/static/function.html b/functions/master/noise_reduction/1.1.0/static/function.html new file mode 100644 index 00000000..8c061036 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/static/function.html @@ -0,0 +1,214 @@ + + + + + + + + + + + Source + + + + +
+        
+spec:
+  entry_points:
+    reduce_noise:
+      has_kwargs: false
+      name: reduce_noise
+      has_varargs: false
+      doc: 'Reduce noise from audio file or directory containing audio files.
+
+        The audio files must be in .wav format.
+
+        The cleaned audio files will be saved in the target_directory.
+
+        For information about the noise reduction algorithm see:
+
+        https://github.com/timsainb/noisereduce
+
+        Notice that the saved files are in wav format, even if the original files
+        are in other format.'
+      parameters:
+      - name: audio_source
+        type: str
+        doc: path to audio file or directory containing audio files
+      - name: target_directory
+        type: str
+        doc: path to directory to save the cleaned audio files.
+      - name: sample_rate
+        type: int
+        doc: Number of samples in one second in the audio file. Pass `None` to keep
+          the original sample rate.
+        default: 16000
+      - name: duration
+        type: int
+        doc: Duration of the audio file to clean in seconds. Pass `None` to keep the
+          original duration.
+        default: null
+      - name: channel
+        type: int
+        doc: Channel to clean. Pass the number of the channel to clean. To clean all
+          channels pass None.
+        default: null
+      - name: silence_threshold
+        type: float
+        doc: The threshold to remove silence from the audio, in dB. If None, no silence
+          removal is performed.
+        default: null
+      - name: use_multiprocessing
+        type: int
+        doc: Number of processes to use for cleaning the audio files. If 0, no multiprocessing
+          is used.
+        default: 0
+      - name: verbose
+        type: bool
+        doc: Verbosity level. If True, display progress bar.
+        default: true
+      lineno: 388
+    clean_audio:
+      has_kwargs: false
+      name: clean_audio
+      has_varargs: false
+      outputs:
+      - type: torch.Tensor
+      doc: ''
+      parameters:
+      - name: self
+      - name: data
+        type: Tensor
+      lineno: 276
+    save_audio:
+      has_kwargs: false
+      name: save_audio
+      has_varargs: false
+      doc: ''
+      parameters:
+      - name: self
+      - name: audio
+        type: ndarray
+      - name: target_path
+        type: Path
+      lineno: 256
+    load_audio:
+      has_kwargs: false
+      name: load_audio
+      has_varargs: false
+      outputs:
+      - type: torch.Tensor
+      doc: ''
+      parameters:
+      - name: self
+      - name: file
+        type: str
+      lineno: 268
+    update_to_wav_suffix:
+      has_kwargs: false
+      name: update_to_wav_suffix
+      has_varargs: false
+      doc: ''
+      parameters:
+      - name: self
+      - name: audio_file
+        type: Path
+      lineno: 125
+    remove_silence:
+      has_kwargs: false
+      name: remove_silence
+      has_varargs: false
+      outputs:
+      - doc: The audio without silence.
+      doc: Remove silence sections from the audio.
+      parameters:
+      - name: self
+      - name: audio
+        type: ndarray
+        doc: The audio to remove silence from.
+      lineno: 134
+    reduce_noise_dfn:
+      has_kwargs: true
+      name: reduce_noise_dfn
+      has_varargs: false
+      doc: 'Reduce noise from audio files using DeepFilterNet.
+
+        For more information about the noise reduction algorithm see:
+
+        https://github.com/Rikorose/DeepFilterNet
+
+        Notice that the saved files are in wav format, even if the original files
+        are in other format.'
+      parameters:
+      - name: audio_source
+        type: str
+        doc: path to audio file or directory of audio files
+      - name: target_directory
+        type: str
+        doc: path to target directory to save cleaned audio files
+      - name: pad
+        type: bool
+        doc: whether to pad the audio file with zeros before cleaning
+        default: true
+      - name: atten_lim_db
+        type: int
+        doc: maximum attenuation in dB
+        default: null
+      - name: silence_threshold
+        type: float
+        doc: the threshold to remove silence from the audio, in dB. If None, no silence
+          removal is performed.
+        default: null
+      - name: use_multiprocessing
+        type: int
+        doc: Number of processes to use for cleaning the audio files. If 0, no multiprocessing
+          is used.
+        default: 0
+      - name: verbose
+        type: bool
+        doc: verbosity level. If True, display progress bar and logs.
+        default: true
+      lineno: 322
+  build:
+    code_origin: ''
+    base_image: mlrun/mlrun
+    requirements:
+    - librosa
+    - noisereduce
+    - deepfilternet
+    - torchaudio>=2.1.2
+    functionSourceCode: 
+    origin_filename: ''
+  description: Reduce noise from audio files
+  command: ''
+  image: ''
+  default_handler: reduce_noise
+  disable_auto_mount: false
+metadata:
+  name: noise-reduction
+  tag: ''
+  categories:
+  - data-preparation
+  - audio
+kind: job
+verbose: false
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/static/item.html b/functions/master/noise_reduction/1.1.0/static/item.html new file mode 100644 index 00000000..63a2e019 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/static/item.html @@ -0,0 +1,63 @@ + + + + + + + + + + + Source + + + + +
+        
+apiVersion: v1
+categories:
+  - data-preparation
+  - audio
+description: Reduce noise from audio files
+doc: ''
+example: noise_reduction.ipynb
+generationDate: 2024-03-04:17-30
+hidden: false
+icon: ''
+labels:
+  author: yonatans
+maintainers: []
+mlrunVersion: 1.7.0
+name: noise-reduction
+platformVersion: 3.5.3
+spec:
+  filename: noise_reduction.py
+  handler: reduce_noise
+  image: mlrun/mlrun
+  kind: job
+  requirements: [
+    librosa,
+    noisereduce,
+    deepfilternet,
+    torchaudio>=2.1.2,
+  ]
+url: ''
+version: 1.1.0
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/static/noise_reduction.html b/functions/master/noise_reduction/1.1.0/static/noise_reduction.html new file mode 100644 index 00000000..cfbc45de --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/static/noise_reduction.html @@ -0,0 +1,848 @@ + + + + + + + +noise_reduction.noise_reduction + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+
+ +
+ +
+
+
+ + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+

+ +
+
+
+
+
+ +
+

Source code for noise_reduction.noise_reduction

+import logging
+from abc import ABCMeta, abstractmethod
+from multiprocessing import Process, Queue
+from pathlib import Path
+from typing import List, Tuple, Type, Union
+
+import librosa
+import numpy as np
+import torch
+from scipy.io import wavfile
+from tqdm import tqdm
+
+#: The value to send into multiprocessing queues to stop the process:
+_MULTIPROCESSING_STOP_MARK = "STOP"
+
+# Get the global logger:
+try:
+    import mlrun
+
+    _LOGGER = mlrun.get_or_create_ctx("noise_reduce").logger
+except ModuleNotFoundError:
+    _LOGGER = logging.getLogger()
+
+
+
+[docs] +class ReduceNoiseBase(metaclass=ABCMeta): + """ + Base class for noise reduction. + This class is aimed to be inherited by specific noise reduction algorithms. + You must implement the following methods: + - clean_audio: The method to clean the audio, where the noise reduction algorithm is implemented. + - save_audio: The method to save the audio to a file. + - load_audio: The method to load the audio from a file. + + After implementing the above methods, you can use the reduce_noise method to reduce noise from audio files. + """ + def __init__( + self, + target_directory: Path, + verbose: bool = True, + silence_threshold: float = None, + ): + self.target_directory = Path(target_directory) + self.verbose = verbose + self.silence_threshold = silence_threshold + +
+[docs] + def reduce_noise(self, audio_file: Path) -> Tuple[bool, Tuple[str, str]]: + """ + Reduce noise from the given audio file. + + :param audio_file: The audio file to reduce noise from. + + :returns: A tuple of: + - a boolean indicating whether an error occurred + - a tuple of: + - audio file name + - target path in case of success / error message in case of failure. + """ + try: + if self.verbose: + _LOGGER.info(f"Reducing noise from {audio_file.name}.") + + # Load audio data: + audio = self.load_audio(file=str(audio_file)) + + # Perform noise reduction: + reduced_noise = self.clean_audio(data=audio) + + # Remove silence from the audio if necessary: + reduced_noise = self.remove_silence(audio=reduced_noise) + + # Prepare target path: + target_path = self.update_to_wav_suffix(audio_file=audio_file) + + # Save file: + self.save_audio( + audio=reduced_noise, + target_path=target_path, + ) + + if self.verbose: + _LOGGER.info(f"Saved cleaned audio file to {target_path}.") + + return False, (audio_file.name, str(target_path)) + except Exception as exception: + if self.verbose: + _LOGGER.error(f"Failed to reduce noise from {audio_file.name}.") + _LOGGER.error(f"Error: {exception}") + # Collect the error: + return True, (audio_file.name, str(exception))
+ + +
+[docs] + @abstractmethod + def clean_audio(self, data) -> Union[np.ndarray, torch.Tensor]: + """ + Clean the audio from noise. Here you should implement the noise reduction algorithm. + + :param data: The audio data to clean. + + :returns: The cleaned audio. + """ + pass
+ + +
+[docs] + @abstractmethod + def save_audio(self, audio: np.ndarray, target_path: Path): + """ + Save the audio to a file. + + :param audio: The audio to save. + :param target_path: The target path to save the audio to. + """ + pass
+ + +
+[docs] + @abstractmethod + def load_audio(self, file: str) -> Tuple[Union[np.ndarray, torch.Tensor], int]: + """ + Load the audio from a file. + + :param file: The file to load the audio from. + + :returns: A tuple of: + - the audio data + - the sample rate + """ + pass
+ + +
+[docs] + def update_to_wav_suffix(self, audio_file: Path): + target_path = self.target_directory / audio_file.name + if target_path.suffix != ".wav": + old_suffix = target_path.suffix[1:] + target_path = target_path.with_stem(target_path.stem + f"_{old_suffix}") + return target_path.with_suffix(".wav") + else: + return target_path
+ + +
+[docs] + def remove_silence( + self, + audio: np.ndarray, + ): + """ + Remove silence sections from the audio. + + :param audio: The audio to remove silence from. + + :returns: The audio without silence. + """ + if self.silence_threshold is None: + return audio + + # Get the indices of the non-silent frames: + non_silent_indices = librosa.effects.split( + y=audio, + top_db=self.silence_threshold, + frame_length=2048, + hop_length=256, + ) + + # Get the non-silent audio: + non_silent_audio = np.concatenate( + [audio[:, start:end] for start, end in non_silent_indices], axis=1 + ) + + return non_silent_audio
+
+ + + +
+[docs] +class ReduceNoise(ReduceNoiseBase): + def __init__( + self, + target_directory: Path, + verbose: bool = True, + silence_threshold: float = None, + sample_rate: int = 16000, + duration: int = None, + channel: int = None, + ): + super().__init__(target_directory, verbose, silence_threshold) + self.sample_rate = sample_rate + self.duration = duration + self.channel = channel + +
+[docs] + def save_audio(self, audio: np.ndarray, target_path: Path): + # If the audio has more than one channel, transpose it in order to save it: + if len(audio) > 1: + audio = audio.T + + wavfile.write( + filename=target_path, + rate=self.sample_rate, + data=audio, + )
+ + +
+[docs] + def load_audio(self, file: str) -> np.ndarray: + data, sr = librosa.load( + path=file, + sr=self.sample_rate, + mono=False, # keep channels separate + duration=self.duration, + ) + # set sample rate: + self.sample_rate = int(sr) + + # convert to int with scaling for 16-bit integer + data *= 32767 / np.max(np.abs(data)) # re-scaling + data = data.astype(np.int16) # change data type + + # select channel + data_to_reduce = data[self.channel] if self.channel is not None else data + return data_to_reduce
+ + +
+[docs] + def clean_audio(self, data: np.ndarray) -> np.ndarray: + try: + import noisereduce + except ImportError as e: + raise ImportError("Please install noisereduce package") from e + + reduced_noise = noisereduce.reduce_noise(y=data, sr=self.sample_rate) + + # add channel back after noise reduction + if self.channel is not None: + # putting the channel back in the data + data[self.channel] = reduced_noise + # updating the data to save + reduced_noise = data + + return reduced_noise
+
+ + + +
+[docs] +class DFN(ReduceNoiseBase): + def __init__( + self, + target_directory: Path, + verbose: bool = True, + silence_threshold: float = None, + pad: bool = True, + atten_lim_db: int = None, + **kwargs, + ): + super().__init__(target_directory, verbose, silence_threshold) + self.pad = pad + self.atten_lim_db = atten_lim_db + self.kwargs = kwargs + + # import required packages + try: + from df.enhance import init_df + except ImportError as e: + raise ImportError("Please install deepfilternet packages") from e + + if self.verbose: + _LOGGER.info("Loading DeepFilterNet2 model.") + + # Load the model: + model, df_state, _ = init_df() + self.model = model + self.df_state = df_state + self.sample_rate = self.df_state.sr() + +
+[docs] + def save_audio(self, audio: np.ndarray, target_path: Path): + try: + from df.enhance import save_audio + except ImportError as e: + raise ImportError("Please install deepfilternet package") from e + save_audio( + file=target_path.name, + audio=audio, + sr=self.sample_rate, + output_dir=str(self.target_directory), + )
+ + +
+[docs] + def load_audio(self, file: str) -> torch.Tensor: + try: + from df.enhance import load_audio + except ImportError as e: + raise ImportError("Please install deepfilternet package") from e + audio, _ = load_audio(file=file, sr=self.sample_rate, **self.kwargs) + return audio
+ + +
+[docs] + def clean_audio(self, data: torch.Tensor) -> torch.Tensor: + try: + from df.enhance import enhance + except ImportError as e: + raise ImportError("Please install deepfilternet package") from e + return enhance( + model=self.model, + df_state=self.df_state, + audio=data, + pad=self.pad, + atten_lim_db=self.atten_lim_db, + )
+
+ + + +def _multiprocessing_complete_tasks( + noise_reduce_type: Type[ReduceNoiseBase], + noise_reduce_arguments: dict, + tasks_queue: Queue, + results_queue: Queue, +): + """ + Complete the tasks in the given queue and put the results in the given results queue. The function will stop when + the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param noise_reduce_type: The noise reduce type to use. + :param noise_reduce_arguments: The noisereduce initialization kwargs. + :param tasks_queue: A queue to get the tasks from. + :param results_queue: A queue to put the results in. + """ + # Initialize the reduce noise object + noise_reducer = noise_reduce_type(**noise_reduce_arguments) + + # Start listening to the tasks queue: + while True: + # Get the audio_file: + audio_file = tasks_queue.get() + if audio_file == _MULTIPROCESSING_STOP_MARK: + break + audio_file = Path(audio_file) + # Apply noise reduction and collect the result: + results_queue.put(noise_reducer.reduce_noise(audio_file=audio_file)) + + # Mark the end of the tasks: + results_queue.put(_MULTIPROCESSING_STOP_MARK) + + +
+[docs] +def reduce_noise_dfn( + audio_source: str, + target_directory: str, + pad: bool = True, + atten_lim_db: int = None, + silence_threshold: float = None, + use_multiprocessing: int = 0, + verbose: bool = True, + **kwargs, +): + """ + Reduce noise from audio files using DeepFilterNet. + For more information about the noise reduction algorithm see: + https://github.com/Rikorose/DeepFilterNet + Notice that the saved files are in wav format, even if the original files are in other format. + + :param audio_source: path to audio file or directory of audio files + :param target_directory: path to target directory to save cleaned audio files + :param pad: whether to pad the audio file with zeros before cleaning + :param atten_lim_db: maximum attenuation in dB + :param silence_threshold: the threshold to remove silence from the audio, in dB. If None, no silence removal is + performed. + :param use_multiprocessing: Number of processes to use for cleaning the audio files. + If 0, no multiprocessing is used. + :param verbose: verbosity level. If True, display progress bar and logs. + :param kwargs: additional arguments to pass to torchaudio.load(). 
For more information see: + https://pytorch.org/audio/stable/generated/torchaudio.load.html + """ + if verbose: + _LOGGER.info("Reducing noise from audio files.") + + # create target directory: + target_directory = _create_target_directory(target_directory) + + # get audio files: + audio_files = _get_audio_files(audio_source) + + noise_reduce_arguments = { + "target_directory": target_directory, + "pad": pad, + "atten_lim_db": atten_lim_db, + "silence_threshold": silence_threshold, + **kwargs, + } + + if use_multiprocessing: + results = _parallel_run( + noise_reduce_type=DFN, + noise_reduce_arguments=noise_reduce_arguments, + n_workers=use_multiprocessing, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + else: + results = _run( + noise_reduce_type=DFN, + noise_reduce_arguments=noise_reduce_arguments, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + + return _process_results(results, verbose)
+ + + +
+[docs] +def reduce_noise( + audio_source: str, + target_directory: str, + sample_rate: int = 16000, + duration: int = None, + channel: int = None, + silence_threshold: float = None, + use_multiprocessing: int = 0, + verbose: bool = True, +): + """ + Reduce noise from audio file or directory containing audio files. + The audio files must be in .wav format. + The cleaned audio files will be saved in the target_directory. + For information about the noise reduction algorithm see: + https://github.com/timsainb/noisereduce + Notice that the saved files are in wav format, even if the original files are in other format. + + :param audio_source: path to audio file or directory containing audio files + :param target_directory: path to directory to save the cleaned audio files. + :param sample_rate: Number of samples in one second in the audio file. + Pass `None` to keep the original sample rate. + :param duration: Duration of the audio file to clean in seconds. + Pass `None` to keep the original duration. + :param channel: Channel to clean. Pass the number of the channel to clean. + To clean all channels pass None. + :param silence_threshold: The threshold to remove silence from the audio, in dB. + If None, no silence removal is performed. + :param use_multiprocessing: Number of processes to use for cleaning the audio files. + If 0, no multiprocessing is used. + :param verbose: Verbosity level. If True, display progress bar. 
+ """ + if verbose: + _LOGGER.info("Reducing noise from audio files.") + + # create target directory: + target_directory = _create_target_directory(target_directory) + + # get audio files: + audio_files = _get_audio_files(audio_source) + + # Create the reduce noise object: + noise_reduce_arguments = { + "target_directory": target_directory, + "sample_rate": sample_rate, + "duration": duration, + "channel": channel, + "silence_threshold": silence_threshold, + } + + if use_multiprocessing: + results = _parallel_run( + noise_reduce_type=ReduceNoise, + noise_reduce_arguments=noise_reduce_arguments, + n_workers=use_multiprocessing, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + else: + results = _run( + noise_reduce_type=ReduceNoise, + noise_reduce_arguments=noise_reduce_arguments, + audio_files=audio_files, + description="Noise-reduction", + verbose=verbose, + ) + + return _process_results(results, verbose)
+ + + +def _create_target_directory(target_directory: str) -> str: + target_directory = Path(target_directory) + if not target_directory.exists(): + target_directory.mkdir(parents=True, exist_ok=True) + return str(target_directory) + + +def _get_audio_files(audio_source: str): + audio_source = Path(audio_source) + audio_files = [] + if audio_source.is_dir(): + audio_files = list(audio_source.glob("*.*")) + elif audio_source.is_file(): + audio_files.append(audio_source) + else: + raise ValueError( + f"audio_source must be a file or a directory, got {audio_source}" + ) + return audio_files + + +def _parallel_run( + noise_reduce_type: Type[ReduceNoiseBase], + noise_reduce_arguments: dict, + n_workers: int, + audio_files: List[Path], + description: str, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run multiple noise reduce workers with multiprocessing to complete the tasks that will be created on the provided + files using the given task creator. + + :param noise_reduce_type: The noise reduce type to use. + :param n_workers: The number of workers to use. + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Check the number of workers: + if n_workers > len(audio_files): + _LOGGER.warning( + f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). " + f"Setting the number of workers to {len(audio_files)}." 
+ ) + n_workers = len(audio_files) + + # Initialize the multiprocessing queues: + tasks_queue = Queue() + results_queue = Queue() + + # Initialize the multiprocessing processes: + task_completion_processes = [ + Process( + target=_multiprocessing_complete_tasks, + kwargs={ + "noise_reduce_type": noise_reduce_type, + "noise_reduce_arguments": noise_reduce_arguments, + "tasks_queue": tasks_queue, + "results_queue": results_queue, + }, + ) + for _ in range(n_workers) + ] + + # Start the multiprocessing processes: + for p in task_completion_processes: + p.start() + + # Put the tasks in the queue: + for audio_file in audio_files: + # tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple()) + tasks_queue.put(audio_file) + + # Put the stop marks in the queue: + for _ in range(n_workers): + tasks_queue.put(_MULTIPROCESSING_STOP_MARK) + + # Collect the results: + results = [] + stop_marks_counter = 0 + with tqdm( + desc=description, + unit="file", + total=len(audio_files), + disable=not verbose, + ) as progressbar: + while True: + # Get a result from the queue: + result: Tuple[bool, Tuple[str, str]] = results_queue.get() + if result == _MULTIPROCESSING_STOP_MARK: + stop_marks_counter += 1 + if stop_marks_counter == n_workers: + break + else: + # Collect the result: + results.append(result) + progressbar.update(1) + + # Wait for the processes to finish: + for p in task_completion_processes: + p.join() + + return results + + +def _run( + noise_reduce_type: Type[ReduceNoiseBase], + noise_reduce_arguments: dict, + audio_files: List[Path], + description: str, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run the noise reduce algorithm on the given audio files and collect the results. + + :param noise_reduce_type: The noise reduce type to use. + :param noise_reduce_arguments: The noisereduce initialization kwargs. + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. 
+ :param verbose: Verbosity. + + :returns: The collected results. + """ + # Create the reduce noise object: + noise_reducer = noise_reduce_type(**noise_reduce_arguments) + + # Run the noise reduce algorithm on the audio files and collect the results: + results = [] + for audio_file in tqdm( + audio_files, + desc=description, + unit="file", + total=len(audio_files), + disable=not verbose, + ): + results.append(noise_reducer.reduce_noise(audio_file=audio_file)) + + return results + + +def _process_results( + results: List[Tuple[bool, Tuple[str, str]]], verbose: bool +) -> Tuple[dict, dict]: + """ + Process the results of the tasks. + + :param results: The results to process. + :param verbose: Verbosity. + + :returns: The processed results as a tuple of successes and errors. + """ + if verbose: + _LOGGER.info("Summarizing the results.") + successes = {} + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes[result[0]] = result[1] + if verbose: + _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n") + + return successes, errors +
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + + +
+
+ + \ No newline at end of file diff --git a/functions/master/noise_reduction/1.1.0/static/source.html b/functions/master/noise_reduction/1.1.0/static/source.html new file mode 100644 index 00000000..d46b6ee1 --- /dev/null +++ b/functions/master/noise_reduction/1.1.0/static/source.html @@ -0,0 +1,660 @@ + + + + + + + + + + + Source + + + + +
+        
+import logging
+from abc import ABCMeta, abstractmethod
+from multiprocessing import Process, Queue
+from pathlib import Path
+from typing import List, Tuple, Type, Union
+
+import librosa
+import numpy as np
+import torch
+from scipy.io import wavfile
+from tqdm import tqdm
+
+#: The value to send into multiprocessing queues to stop the process:
+_MULTIPROCESSING_STOP_MARK = "STOP"
+
+# Get the global logger:
+try:
+    import mlrun
+
+    _LOGGER = mlrun.get_or_create_ctx("noise_reduce").logger
+except ModuleNotFoundError:
+    _LOGGER = logging.getLogger()
+
+
+class ReduceNoiseBase(metaclass=ABCMeta):
+    """
+    Base class for noise reduction.
+    This class is aimed to be inherited by specific noise reduction algorithms.
+    You must implement the following methods:
+    - clean_audio:  The method to clean the audio, where the noise reduction algorithm is implemented.
+    - save_audio:   The method to save the audio to a file.
+    - load_audio:   The method to load the audio from a file.
+
+    After implementing the above methods, you can use the reduce_noise method to reduce noise from audio files.
+    """
+    def __init__(
+        self,
+        target_directory: Path,
+        verbose: bool = True,
+        silence_threshold: float = None,
+    ):
+        self.target_directory = Path(target_directory)
+        self.verbose = verbose
+        self.silence_threshold = silence_threshold
+
+    def reduce_noise(self, audio_file: Path) -> Tuple[bool, Tuple[str, str]]:
+        """
+        Reduce noise from the given audio file.
+
+        :param audio_file:  The audio file to reduce noise from.
+
+        :returns: A tuple of:
+         - a boolean indicating whether an error occurred
+         - a tuple of:
+            - audio file name
+            - target path in case of success / error message in case of failure.
+        """
+        try:
+            if self.verbose:
+                _LOGGER.info(f"Reducing noise from {audio_file.name}.")
+
+            # Load audio data:
+            audio = self.load_audio(file=str(audio_file))
+
+            # Perform noise reduction:
+            reduced_noise = self.clean_audio(data=audio)
+
+            # Remove silence from the audio if necessary:
+            reduced_noise = self.remove_silence(audio=reduced_noise)
+
+            # Prepare target path:
+            target_path = self.update_to_wav_suffix(audio_file=audio_file)
+
+            # Save file:
+            self.save_audio(
+                audio=reduced_noise,
+                target_path=target_path,
+            )
+
+            if self.verbose:
+                _LOGGER.info(f"Saved cleaned audio file to {target_path}.")
+
+            return False, (audio_file.name, str(target_path))
+        except Exception as exception:
+            if self.verbose:
+                _LOGGER.error(f"Failed to reduce noise from {audio_file.name}.")
+                _LOGGER.error(f"Error: {exception}")
+            # Collect the error:
+            return True, (audio_file.name, str(exception))
+
+    @abstractmethod
+    def clean_audio(self, data) -> Union[np.ndarray, torch.Tensor]:
+        """
+        Clean the audio from noise. Here you should implement the noise reduction algorithm.
+
+        :param data:    The audio data to clean.
+
+        :returns: The cleaned audio.
+        """
+        pass
+
+    @abstractmethod
+    def save_audio(self, audio: np.ndarray, target_path: Path):
+        """
+        Save the audio to a file.
+
+        :param audio:       The audio to save.
+        :param target_path: The target path to save the audio to.
+        """
+        pass
+
+    @abstractmethod
+    def load_audio(self, file: str) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
+        """
+        Load the audio from a file.
+
+        :param file:    The file to load the audio from.
+
+        :returns: A tuple of:
+            - the audio data
+            - the sample rate
+        """
+        pass
+
+    def update_to_wav_suffix(self, audio_file: Path):
+        target_path = self.target_directory / audio_file.name
+        if target_path.suffix != ".wav":
+            old_suffix = target_path.suffix[1:]
+            target_path = target_path.with_stem(target_path.stem + f"_{old_suffix}")
+            return target_path.with_suffix(".wav")
+        else:
+            return target_path
+
+    def remove_silence(
+        self,
+        audio: np.ndarray,
+    ):
+        """
+        Remove silence sections from the audio.
+
+        :param audio:   The audio to remove silence from. The samples are expected on the last axis, so both a
+                        single channel array of shape (samples,) and a multi-channel array of shape
+                        (channels, samples) are supported.
+
+        :returns: The audio without silence. If `silence_threshold` is None, the audio is returned as is.
+        """
+        if self.silence_threshold is None:
+            return audio
+
+        # Get the (start, end) sample intervals of the non-silent sections:
+        non_silent_indices = librosa.effects.split(
+            y=audio,
+            top_db=self.silence_threshold,
+            frame_length=2048,
+            hop_length=256,
+        )
+
+        # Keep only the non-silent sections. Slicing the last axis with `...` works for mono (1D) audio as well,
+        # where the 2D-only form `audio[:, start:end]` raises an IndexError.
+        # NOTE: if no non-silent interval is found, `np.concatenate` raises a ValueError (empty sequence).
+        non_silent_audio = np.concatenate(
+            [audio[..., start:end] for start, end in non_silent_indices], axis=-1
+        )
+
+        return non_silent_audio
+
+
+class ReduceNoise(ReduceNoiseBase):
+    """
+    Noise reducer that cleans the audio with the `noisereduce` package. The audio is loaded with `librosa.load`,
+    converted to 16-bit integers and written with `wavfile.write`.
+    """
+
+    def __init__(
+        self,
+        target_directory: Path,
+        verbose: bool = True,
+        silence_threshold: float = None,
+        sample_rate: int = 16000,
+        duration: int = None,
+        channel: int = None,
+    ):
+        """
+        :param target_directory:  The directory to save the cleaned audio files to.
+        :param verbose:           Whether to log the progress.
+        :param silence_threshold: The threshold, in dB, for removing silence. None means no silence removal.
+        :param sample_rate:       The sample rate to load the audio with. None keeps the rate of each file.
+        :param duration:          The duration to load from each file (passed to `librosa.load`). None loads it all.
+        :param channel:           The index of the single channel to clean. None cleans all the channels.
+        """
+        super().__init__(target_directory, verbose, silence_threshold)
+        # The rate the user asked for is kept separately: `self.sample_rate` is overwritten with the actual rate of
+        # every loaded file, so reusing it as the request would force the first file's rate on all the next files
+        # when `sample_rate` is None.
+        self._requested_sample_rate = sample_rate
+        self.sample_rate = sample_rate
+        self.duration = duration
+        self.channel = channel
+
+    def save_audio(self, audio: np.ndarray, target_path: Path):
+        """
+        Save the audio as a wav file using the sample rate of the last loaded file.
+
+        :param audio:       The audio to save, of shape (samples,) or (channels, samples).
+        :param target_path: The target path to save the audio to.
+        """
+        # The audio is held as (channels, samples) and must be transposed in order to save it. Checking the number
+        # of dimensions (and not `len(audio)`) transposes a single channel 2D array of shape (1, samples) as well:
+        if audio.ndim > 1:
+            audio = audio.T
+
+        wavfile.write(
+            filename=target_path,
+            rate=self.sample_rate,
+            data=audio,
+        )
+
+    def load_audio(self, file: str) -> np.ndarray:
+        """
+        Load the audio from a file as 16-bit integers, keeping the channels separate.
+
+        :param file: The file to load the audio from.
+
+        :returns: The audio data with all of its channels. The channel selection is done in `clean_audio`, which
+                  needs the full data in order to put the cleaned channel back in place.
+        """
+        data, sr = librosa.load(
+            path=file,
+            sr=self._requested_sample_rate,
+            mono=False,  # keep channels separate
+            duration=self.duration,
+        )
+        # Remember the actual sample rate of this file for cleaning and saving it:
+        self.sample_rate = int(sr)
+
+        # Convert to 16-bit integers, scaling the peak to the int16 maximum. A silent (all zeros) or empty signal
+        # has no peak to scale by - dividing by it would fill the data with NaN values - so it is converted as is:
+        peak = np.max(np.abs(data)) if data.size else 0
+        if peak > 0:
+            data *= 32767 / peak  # re-scaling
+        return data.astype(np.int16)  # change data type
+
+    def clean_audio(self, data: np.ndarray) -> np.ndarray:
+        """
+        Reduce the noise of the audio. If a channel was configured and the audio is multi-channel, only that
+        channel is cleaned and the rest of the channels are returned untouched.
+
+        :param data: The audio data to clean, of shape (samples,) or (channels, samples).
+
+        :returns: The cleaned audio, in the same shape as the given data.
+
+        :raises ImportError: If the `noisereduce` package is not installed.
+        """
+        try:
+            import noisereduce
+        except ImportError as e:
+            raise ImportError("Please install noisereduce package") from e
+
+        # Clean everything when no channel was selected or when there is only a single (1D) channel to clean:
+        if self.channel is None or data.ndim == 1:
+            return noisereduce.reduce_noise(y=data, sr=self.sample_rate)
+
+        # Clean only the selected channel and put it back in a copy, so the given data is not modified:
+        cleaned = data.copy()
+        cleaned[self.channel] = noisereduce.reduce_noise(
+            y=data[self.channel], sr=self.sample_rate
+        )
+        return cleaned
+
+
+class DFN(ReduceNoiseBase):
+    """
+    Noise reducer that cleans the audio with DeepFilterNet, through the functions of `df.enhance`.
+    """
+
+    def __init__(
+        self,
+        target_directory: Path,
+        verbose: bool = True,
+        silence_threshold: float = None,
+        pad: bool = True,
+        atten_lim_db: int = None,
+        **kwargs,
+    ):
+        """
+        :param target_directory:  The directory to save the cleaned audio files to.
+        :param verbose:           Whether to log the progress.
+        :param silence_threshold: The threshold, in dB, for removing silence. None means no silence removal.
+        :param pad:               Passed as `pad` to `df.enhance.enhance`.
+        :param atten_lim_db:      Passed as `atten_lim_db` to `df.enhance.enhance`.
+        :param kwargs:            Additional keyword arguments to pass to `df.enhance.load_audio`.
+
+        :raises ImportError: If the `df` (deepfilternet) package is not installed.
+        """
+        super().__init__(target_directory, verbose, silence_threshold)
+        self.pad = pad
+        self.atten_lim_db = atten_lim_db
+        self.kwargs = kwargs
+
+        # The package is imported lazily, only when this reducer is used:
+        try:
+            from df.enhance import init_df
+        except ImportError as import_error:
+            raise ImportError("Please install deepfilternet packages") from import_error
+
+        if self.verbose:
+            _LOGGER.info("Loading DeepFilterNet2 model.")
+
+        # Load the model and its state (the third returned value is not needed). The sample rate is taken from the
+        # state:
+        self.model, self.df_state, _ = init_df()
+        self.sample_rate = self.df_state.sr()
+
+    def save_audio(self, audio: np.ndarray, target_path: Path):
+        """
+        Save the audio into the target directory, under the name of the given target path.
+
+        :param audio:       The audio to save.
+        :param target_path: The target path. Only its file name is used.
+        """
+        try:
+            from df.enhance import save_audio as df_save_audio
+        except ImportError as import_error:
+            raise ImportError("Please install deepfilternet package") from import_error
+        df_save_audio(
+            file=target_path.name,
+            audio=audio,
+            sr=self.sample_rate,
+            output_dir=str(self.target_directory),
+        )
+
+    def load_audio(self, file: str) -> torch.Tensor:
+        """
+        Load the audio from a file at the sample rate of the model.
+
+        :param file: The file to load the audio from.
+
+        :returns: The loaded audio (the second value `df.enhance.load_audio` returns is dropped).
+        """
+        try:
+            from df.enhance import load_audio as df_load_audio
+        except ImportError as import_error:
+            raise ImportError("Please install deepfilternet package") from import_error
+        loaded, _ = df_load_audio(file=file, sr=self.sample_rate, **self.kwargs)
+        return loaded
+
+    def clean_audio(self, data: torch.Tensor) -> torch.Tensor:
+        """
+        Reduce the noise of the audio using the loaded model.
+
+        :param data: The audio data to clean.
+
+        :returns: The result of `df.enhance.enhance` on the given audio.
+        """
+        try:
+            from df.enhance import enhance
+        except ImportError as import_error:
+            raise ImportError("Please install deepfilternet package") from import_error
+        enhanced = enhance(
+            model=self.model,
+            df_state=self.df_state,
+            audio=data,
+            pad=self.pad,
+            atten_lim_db=self.atten_lim_db,
+        )
+        return enhanced
+
+
+def _multiprocessing_complete_tasks(
+    noise_reduce_type: Type[ReduceNoiseBase],
+    noise_reduce_arguments: dict,
+    tasks_queue: Queue,
+    results_queue: Queue,
+):
+    """
+    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
+    the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process.
+
+    The stop mark is always put in the results queue when the function ends, even if it ends with an exception (for
+    example, when the noise reducer fails to initialize).
+
+    :param noise_reduce_type:       The noise reduce type to use.
+    :param noise_reduce_arguments:  The noisereduce initialization kwargs.
+    :param tasks_queue:             A queue to get the tasks from.
+    :param results_queue:           A queue to put the results in.
+    """
+    try:
+        # Initialize the reduce noise object
+        noise_reducer = noise_reduce_type(**noise_reduce_arguments)
+
+        # Start listening to the tasks queue:
+        while True:
+            # Get the audio_file:
+            audio_file = tasks_queue.get()
+            if audio_file == _MULTIPROCESSING_STOP_MARK:
+                break
+            audio_file = Path(audio_file)
+            # Apply noise reduction and collect the result:
+            results_queue.put(noise_reducer.reduce_noise(audio_file=audio_file))
+    finally:
+        # Mark the end of the tasks. It is done in a `finally` clause because `_parallel_run` waits for one stop
+        # mark per worker - a worker that dies without sending it leaves the parent process blocked forever:
+        results_queue.put(_MULTIPROCESSING_STOP_MARK)
+
+
+def reduce_noise_dfn(
+    audio_source: str,
+    target_directory: str,
+    pad: bool = True,
+    atten_lim_db: int = None,
+    silence_threshold: float = None,
+    use_multiprocessing: int = 0,
+    verbose: bool = True,
+    **kwargs,
+):
+    """
+    Reduce noise from audio files using DeepFilterNet.
+    For more information about the noise reduction algorithm see:
+    https://github.com/Rikorose/DeepFilterNet
+    Notice that the saved files are in wav format, even if the original files are in other format.
+
+    :param audio_source:        path to audio file or directory of audio files
+    :param target_directory:    path to target directory to save cleaned audio files
+    :param pad:                 whether to pad the audio file with zeros before cleaning
+    :param atten_lim_db:        maximum attenuation in dB
+    :param silence_threshold:   the threshold to remove silence from the audio, in dB. If None, no silence removal is
+                                performed.
+    :param use_multiprocessing: Number of processes to use for cleaning the audio files.
+                                If 0, no multiprocessing is used.
+    :param verbose:             verbosity level. If True, display progress bar and logs.
+    :param kwargs:              additional arguments to pass to torchaudio.load(). For more information see:
+                                https://pytorch.org/audio/stable/generated/torchaudio.load.html
+
+    :returns: A tuple of two dictionaries:
+              - the successes: file name -> path of the cleaned file
+              - the errors: file name -> error message
+    """
+    if verbose:
+        _LOGGER.info("Reducing noise from audio files.")
+
+    # create target directory:
+    target_directory = _create_target_directory(target_directory)
+
+    # get audio files:
+    audio_files = _get_audio_files(audio_source)
+
+    # The verbosity must be passed on to the noise reducer as well, otherwise it falls back to its default (True)
+    # and keeps logging every file even when `verbose` is False:
+    noise_reduce_arguments = {
+        "target_directory": target_directory,
+        "verbose": verbose,
+        "pad": pad,
+        "atten_lim_db": atten_lim_db,
+        "silence_threshold": silence_threshold,
+        **kwargs,
+    }
+
+    if use_multiprocessing:
+        results = _parallel_run(
+            noise_reduce_type=DFN,
+            noise_reduce_arguments=noise_reduce_arguments,
+            n_workers=use_multiprocessing,
+            audio_files=audio_files,
+            description="Noise-reduction",
+            verbose=verbose,
+        )
+    else:
+        results = _run(
+            noise_reduce_type=DFN,
+            noise_reduce_arguments=noise_reduce_arguments,
+            audio_files=audio_files,
+            description="Noise-reduction",
+            verbose=verbose,
+        )
+
+    return _process_results(results, verbose)
+
+
+def reduce_noise(
+    audio_source: str,
+    target_directory: str,
+    sample_rate: int = 16000,
+    duration: int = None,
+    channel: int = None,
+    silence_threshold: float = None,
+    use_multiprocessing: int = 0,
+    verbose: bool = True,
+):
+    """
+    Reduce noise from audio file or directory containing audio files.
+    The audio files are loaded with librosa.load, so any format it can read is accepted.
+    The cleaned audio files will be saved in the target_directory.
+    For information about the noise reduction algorithm see:
+    https://github.com/timsainb/noisereduce
+    Notice that the saved files are in wav format, even if the original files are in other format.
+
+    :param audio_source:        path to audio file or directory containing audio files
+    :param target_directory:    path to directory to save the cleaned audio files.
+    :param sample_rate:         Number of samples in one second in the audio file.
+                                Pass `None` to keep the original sample rate.
+    :param duration:            Duration of the audio file to clean in seconds.
+                                Pass `None` to keep the original duration.
+    :param channel:             Channel to clean. Pass the number of the channel to clean.
+                                To clean all channels pass None.
+    :param silence_threshold:   The threshold to remove silence from the audio, in dB.
+                                If None, no silence removal is performed.
+    :param use_multiprocessing: Number of processes to use for cleaning the audio files.
+                                If 0, no multiprocessing is used.
+    :param verbose:             Verbosity level. If True, display progress bar and logs.
+
+    :returns: A tuple of two dictionaries:
+              - the successes: file name -> path of the cleaned file
+              - the errors: file name -> error message
+    """
+    if verbose:
+        _LOGGER.info("Reducing noise from audio files.")
+
+    # create target directory:
+    target_directory = _create_target_directory(target_directory)
+
+    # get audio files:
+    audio_files = _get_audio_files(audio_source)
+
+    # Collect the initialization arguments of the reduce noise object. The verbosity must be passed on as well,
+    # otherwise the object falls back to its default (True) and keeps logging every file even when `verbose` is
+    # False:
+    noise_reduce_arguments = {
+        "target_directory": target_directory,
+        "verbose": verbose,
+        "sample_rate": sample_rate,
+        "duration": duration,
+        "channel": channel,
+        "silence_threshold": silence_threshold,
+    }
+
+    if use_multiprocessing:
+        results = _parallel_run(
+            noise_reduce_type=ReduceNoise,
+            noise_reduce_arguments=noise_reduce_arguments,
+            n_workers=use_multiprocessing,
+            audio_files=audio_files,
+            description="Noise-reduction",
+            verbose=verbose,
+        )
+    else:
+        results = _run(
+            noise_reduce_type=ReduceNoise,
+            noise_reduce_arguments=noise_reduce_arguments,
+            audio_files=audio_files,
+            description="Noise-reduction",
+            verbose=verbose,
+        )
+
+    return _process_results(results, verbose)
+
+
+def _create_target_directory(target_directory: str) -> str:
+    """
+    Make sure the target directory exists, creating it (including its missing parents) if nothing exists in its path.
+
+    :param target_directory: The path of the target directory.
+
+    :returns: The path of the target directory as a string.
+    """
+    directory = Path(target_directory)
+    if directory.exists():
+        return str(directory)
+    directory.mkdir(parents=True, exist_ok=True)
+    return str(directory)
+
+
+def _get_audio_files(audio_source: str) -> List[Path]:
+    """
+    Collect the audio files of the given source.
+
+    :param audio_source: A path to a single file, or to a directory to collect its files from. Only the files
+                         directly under the directory that have a dot in their name are collected.
+
+    :returns: The collected files. The files of a directory are returned sorted by path.
+
+    :raises ValueError: If the source is neither a file nor a directory.
+    """
+    audio_source = Path(audio_source)
+    if audio_source.is_dir():
+        # The "*.*" pattern matches sub-directories with a dot in their name as well, so they are filtered out.
+        # The result is sorted because the order `glob` yields the files in is arbitrary:
+        return sorted(path for path in audio_source.glob("*.*") if path.is_file())
+    if audio_source.is_file():
+        return [audio_source]
+    raise ValueError(
+        f"audio_source must be a file or a directory, got {audio_source}"
+    )
+
+
+def _parallel_run(
+    noise_reduce_type: Type[ReduceNoiseBase],
+    noise_reduce_arguments: dict,
+    n_workers: int,
+    audio_files: List[Path],
+    description: str,
+    verbose: bool,
+) -> List[Tuple[bool, Tuple[str, str]]]:
+    """
+    Run multiple noise reduce workers with multiprocessing, each one in its own process, to reduce the noise of the
+    provided files.
+
+    :param noise_reduce_type:       The noise reduce type to use.
+    :param noise_reduce_arguments:  The noisereduce initialization kwargs.
+    :param n_workers:               The number of workers to use.
+    :param audio_files:             The audio files to use.
+    :param description:             The description to use for the progress bar.
+    :param verbose:                 Verbosity.
+
+    :returns: The collected results.
+    """
+    # Without files there is nothing to run. Returning here is a must: the number of workers is limited to the
+    # number of files, and with zero workers no stop mark is ever sent, so collecting the results blocks forever:
+    if not audio_files:
+        return []
+
+    # Check the number of workers:
+    if n_workers > len(audio_files):
+        _LOGGER.warning(
+            f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). "
+            f"Setting the number of workers to {len(audio_files)}."
+        )
+        n_workers = len(audio_files)
+    elif n_workers < 1:
+        # A non-positive number of workers starts no process at all, which blocks forever as well:
+        n_workers = 1
+
+    # Initialize the multiprocessing queues:
+    tasks_queue = Queue()
+    results_queue = Queue()
+
+    # Initialize the multiprocessing processes:
+    task_completion_processes = [
+        Process(
+            target=_multiprocessing_complete_tasks,
+            kwargs={
+                "noise_reduce_type": noise_reduce_type,
+                "noise_reduce_arguments": noise_reduce_arguments,
+                "tasks_queue": tasks_queue,
+                "results_queue": results_queue,
+            },
+        )
+        for _ in range(n_workers)
+    ]
+
+    # Start the multiprocessing processes:
+    for p in task_completion_processes:
+        p.start()
+
+    # Put the tasks in the queue:
+    for audio_file in audio_files:
+        tasks_queue.put(audio_file)
+
+    # Put the stop marks in the queue (one per worker, so every worker gets to stop):
+    for _ in range(n_workers):
+        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
+
+    # Collect the results:
+    results = []
+    stop_marks_counter = 0
+    with tqdm(
+        desc=description,
+        unit="file",
+        total=len(audio_files),
+        disable=not verbose,
+    ) as progressbar:
+        while True:
+            # Get a result from the queue:
+            result: Tuple[bool, Tuple[str, str]] = results_queue.get()
+            if result == _MULTIPROCESSING_STOP_MARK:
+                # Every worker sends a stop mark when it is done, the collection ends when all of them did:
+                stop_marks_counter += 1
+                if stop_marks_counter == n_workers:
+                    break
+            else:
+                # Collect the result:
+                results.append(result)
+                progressbar.update(1)
+
+    # Wait for the processes to finish:
+    for p in task_completion_processes:
+        p.join()
+
+    return results
+
+
+def _run(
+    noise_reduce_type: Type[ReduceNoiseBase],
+    noise_reduce_arguments: dict,
+    audio_files: List[Path],
+    description: str,
+    verbose: bool,
+) -> List[Tuple[bool, Tuple[str, str]]]:
+    """
+    Reduce the noise of the given audio files one after the other, in the current process, and collect the results.
+
+    :param noise_reduce_type:       The noise reduce type to use.
+    :param noise_reduce_arguments:  The noisereduce initialization kwargs.
+    :param audio_files:             The audio files to use.
+    :param description:             The description to use for the progress bar.
+    :param verbose:                 Verbosity. If False, the progress bar is disabled.
+
+    :returns: The collected results, one per audio file, in the order of the given files.
+    """
+    # A single reduce noise object is created and used for all the files:
+    noise_reducer = noise_reduce_type(**noise_reduce_arguments)
+
+    # Wrap the files with a progress bar and collect the result of each one of them:
+    files_with_progress = tqdm(
+        audio_files,
+        desc=description,
+        unit="file",
+        total=len(audio_files),
+        disable=not verbose,
+    )
+    return [
+        noise_reducer.reduce_noise(audio_file=audio_file)
+        for audio_file in files_with_progress
+    ]
+
+
+def _process_results(
+    results: List[Tuple[bool, Tuple[str, str]]], verbose: bool
+) -> Tuple[dict, dict]:
+    """
+    Split the collected results into successes and errors.
+
+    :param results: The results to process. Each result is a tuple of an error flag and a pair of values - the first
+                    value is used as the key and the second one as the value in the matching dictionary.
+    :param verbose: Verbosity. If True, the summary is logged.
+
+    :returns: A tuple of the successes dictionary and the errors dictionary.
+    """
+    if verbose:
+        _LOGGER.info("Summarizing the results.")
+
+    successes, errors = {}, {}
+    for is_error, result in results:
+        # Choose the dictionary by the error flag and store the pair in it:
+        collected = errors if is_error else successes
+        collected[result[0]] = result[1]
+
+    if verbose:
+        _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n")
+
+    return successes, errors
+
+        
+    
+ + \ No newline at end of file diff --git a/functions/master/noise_reduction/latest/src/function.yaml b/functions/master/noise_reduction/latest/src/function.yaml index e03729a9..d6d33b8d 100644 --- a/functions/master/noise_reduction/latest/src/function.yaml +++ b/functions/master/noise_reduction/latest/src/function.yaml @@ -1,30 +1,9 @@ -kind: job -metadata: - name: noise-reduction - tag: '' - hash: cbf6498dca0358810ddaea3baa0e246b7874ea1d - project: '' - labels: - author: yonatans - categories: [] spec: - command: '' - args: [] - image: '' - build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' - origin_filename: '' - requirements: - - librosa - - noisereduce - - deepfilternet - - torchaudio>=2.1.2 entry_points: reduce_noise: + has_kwargs: false name: reduce_noise + has_varargs: false doc: 'Reduce noise from audio file or directory containing audio files. The audio files must be in .wav format. @@ -73,24 +52,23 @@ spec: type: bool doc: Verbosity level. If True, display progress bar. 
default: true - outputs: [] lineno: 388 - has_varargs: false - has_kwargs: false clean_audio: + has_kwargs: false name: clean_audio + has_varargs: false + outputs: + - type: torch.Tensor doc: '' parameters: - name: self - name: data type: Tensor - outputs: - - type: torch.Tensor lineno: 276 - has_varargs: false - has_kwargs: false save_audio: + has_kwargs: false name: save_audio + has_varargs: false doc: '' parameters: - name: self @@ -98,48 +76,46 @@ spec: type: ndarray - name: target_path type: Path - outputs: [] lineno: 256 - has_varargs: false - has_kwargs: false load_audio: + has_kwargs: false name: load_audio + has_varargs: false + outputs: + - type: torch.Tensor doc: '' parameters: - name: self - name: file type: str - outputs: - - type: torch.Tensor lineno: 268 - has_varargs: false - has_kwargs: false update_to_wav_suffix: + has_kwargs: false name: update_to_wav_suffix + has_varargs: false doc: '' parameters: - name: self - name: audio_file type: Path - outputs: [] lineno: 125 - has_varargs: false - has_kwargs: false remove_silence: + has_kwargs: false name: remove_silence + has_varargs: false + outputs: + - doc: The audio without silence. doc: Remove silence sections from the audio. parameters: - name: self - name: audio type: ndarray doc: The audio to remove silence from. - outputs: - - doc: The audio without silence. lineno: 134 - has_varargs: false - has_kwargs: false reduce_noise_dfn: + has_kwargs: true name: reduce_noise_dfn + has_varargs: false doc: 'Reduce noise from audio files using DeepFilterNet. For more information about the noise reduction algorithm see: @@ -177,18 +153,27 @@ spec: type: bool doc: verbosity level. If True, display progress bar and logs. 
default: true - outputs: [] lineno: 322 - has_varargs: false - has_kwargs: true + build: + code_origin: '' + base_image: mlrun/mlrun + requirements: + - librosa + - noisereduce + - deepfilternet + - torchaudio>=2.1.2 + functionSourceCode:  + origin_filename: '' description: Reduce noise from audio files + command: '' + image: '' default_handler: reduce_noise disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} +metadata: + name: noise-reduction + tag: '' + categories: + - data-preparation + - audio +kind: job verbose: false diff --git a/functions/master/noise_reduction/latest/src/item.yaml b/functions/master/noise_reduction/latest/src/item.yaml index 8ddc63f4..f748d558 100644 --- a/functions/master/noise_reduction/latest/src/item.yaml +++ b/functions/master/noise_reduction/latest/src/item.yaml @@ -1,7 +1,7 @@ apiVersion: v1 categories: - data-preparation - - machine-learning + - audio description: Reduce noise from audio files doc: '' example: noise_reduction.ipynb @@ -11,7 +11,7 @@ icon: '' labels: author: yonatans maintainers: [] -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: noise-reduction platformVersion: 3.5.3 spec: @@ -26,4 +26,4 @@ spec: torchaudio>=2.1.2, ] url: '' -version: 1.0.0 \ No newline at end of file +version: 1.1.0 \ No newline at end of file diff --git a/functions/master/noise_reduction/latest/static/documentation.html b/functions/master/noise_reduction/latest/static/documentation.html index 748a3e9b..a772e518 100644 --- a/functions/master/noise_reduction/latest/static/documentation.html +++ b/functions/master/noise_reduction/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/noise_reduction/latest/static/example.html b/functions/master/noise_reduction/latest/static/example.html index c3a04295..7ad47bf9 100644 --- a/functions/master/noise_reduction/latest/static/example.html +++ 
b/functions/master/noise_reduction/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/noise_reduction/latest/static/function.html b/functions/master/noise_reduction/latest/static/function.html index 14b402f5..8c061036 100644 --- a/functions/master/noise_reduction/latest/static/function.html +++ b/functions/master/noise_reduction/latest/static/function.html @@ -28,33 +28,12 @@
         
-kind: job
-metadata:
-  name: noise-reduction
-  tag: ''
-  hash: cbf6498dca0358810ddaea3baa0e246b7874ea1d
-  project: ''
-  labels:
-    author: yonatans
-  categories: []
 spec:
-  command: ''
-  args: []
-  image: ''
-  build:
-    functionSourceCode: 
-    base_image: mlrun/mlrun
-    commands: []
-    code_origin: ''
-    origin_filename: ''
-    requirements:
-    - librosa
-    - noisereduce
-    - deepfilternet
-    - torchaudio>=2.1.2
   entry_points:
     reduce_noise:
+      has_kwargs: false
       name: reduce_noise
+      has_varargs: false
       doc: 'Reduce noise from audio file or directory containing audio files.
 
         The audio files must be in .wav format.
@@ -103,24 +82,23 @@
         type: bool
         doc: Verbosity level. If True, display progress bar.
         default: true
-      outputs: []
       lineno: 388
-      has_varargs: false
-      has_kwargs: false
     clean_audio:
+      has_kwargs: false
       name: clean_audio
+      has_varargs: false
+      outputs:
+      - type: torch.Tensor
       doc: ''
       parameters:
       - name: self
       - name: data
         type: Tensor
-      outputs:
-      - type: torch.Tensor
       lineno: 276
-      has_varargs: false
-      has_kwargs: false
     save_audio:
+      has_kwargs: false
       name: save_audio
+      has_varargs: false
       doc: ''
       parameters:
       - name: self
@@ -128,48 +106,46 @@
         type: ndarray
       - name: target_path
         type: Path
-      outputs: []
       lineno: 256
-      has_varargs: false
-      has_kwargs: false
     load_audio:
+      has_kwargs: false
       name: load_audio
+      has_varargs: false
+      outputs:
+      - type: torch.Tensor
       doc: ''
       parameters:
       - name: self
       - name: file
         type: str
-      outputs:
-      - type: torch.Tensor
       lineno: 268
-      has_varargs: false
-      has_kwargs: false
     update_to_wav_suffix:
+      has_kwargs: false
       name: update_to_wav_suffix
+      has_varargs: false
       doc: ''
       parameters:
       - name: self
       - name: audio_file
         type: Path
-      outputs: []
       lineno: 125
-      has_varargs: false
-      has_kwargs: false
     remove_silence:
+      has_kwargs: false
       name: remove_silence
+      has_varargs: false
+      outputs:
+      - doc: The audio without silence.
       doc: Remove silence sections from the audio.
       parameters:
       - name: self
       - name: audio
         type: ndarray
         doc: The audio to remove silence from.
-      outputs:
-      - doc: The audio without silence.
       lineno: 134
-      has_varargs: false
-      has_kwargs: false
     reduce_noise_dfn:
+      has_kwargs: true
       name: reduce_noise_dfn
+      has_varargs: false
       doc: 'Reduce noise from audio files using DeepFilterNet.
 
         For more information about the noise reduction algorithm see:
@@ -207,20 +183,29 @@
         type: bool
         doc: verbosity level. If True, display progress bar and logs.
         default: true
-      outputs: []
       lineno: 322
-      has_varargs: false
-      has_kwargs: true
+  build:
+    code_origin: ''
+    base_image: mlrun/mlrun
+    requirements:
+    - librosa
+    - noisereduce
+    - deepfilternet
+    - torchaudio>=2.1.2
+    functionSourceCode: 
+    origin_filename: ''
   description: Reduce noise from audio files
+  command: ''
+  image: ''
   default_handler: reduce_noise
   disable_auto_mount: false
-  clone_target_dir: ''
-  env: []
-  priority_class_name: ''
-  preemption_mode: prevent
-  affinity: null
-  tolerations: null
-  security_context: {}
+metadata:
+  name: noise-reduction
+  tag: ''
+  categories:
+  - data-preparation
+  - audio
+kind: job
 verbose: false
 
         
diff --git a/functions/master/noise_reduction/latest/static/item.html b/functions/master/noise_reduction/latest/static/item.html
index df39e08a..63a2e019 100644
--- a/functions/master/noise_reduction/latest/static/item.html
+++ b/functions/master/noise_reduction/latest/static/item.html
@@ -31,7 +31,7 @@
 apiVersion: v1
 categories:
   - data-preparation
-  - machine-learning
+  - audio
 description: Reduce noise from audio files
 doc: ''
 example: noise_reduction.ipynb
@@ -41,7 +41,7 @@
 labels:
   author: yonatans
 maintainers: []
-mlrunVersion: 1.5.2
+mlrunVersion: 1.7.0
 name: noise-reduction
 platformVersion: 3.5.3
 spec:
@@ -56,7 +56,7 @@
     torchaudio>=2.1.2,
   ]
 url: ''
-version: 1.0.0
+version: 1.1.0
         
     
diff --git a/functions/master/noise_reduction/latest/static/noise_reduction.html b/functions/master/noise_reduction/latest/static/noise_reduction.html index dd131944..cfbc45de 100644 --- a/functions/master/noise_reduction/latest/static/noise_reduction.html +++ b/functions/master/noise_reduction/latest/static/noise_reduction.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/onnx_utils/1.3.0/src/function.yaml b/functions/master/onnx_utils/1.3.0/src/function.yaml index 67ebf0d5..af3d5082 100644 --- a/functions/master/onnx_utils/1.3.0/src/function.yaml +++ b/functions/master/onnx_utils/1.3.0/src/function.yaml @@ -1,17 +1,40 @@ +kind: job +metadata: + categories: + - utils + - deep-learning + name: onnx-utils + tag: '' +verbose: false spec: - allow_empty_resources: true + build: + code_origin: '' + base_image: mlrun/mlrun + origin_filename: '' + functionSourceCode:  + requirements: + - tqdm~=4.67.1 + - tensorflow~=2.19.0 + - tf_keras~=2.19.0 + - torch~=2.6.0 + - torchvision~=0.21.0 + - onnx~=1.17.0 + - onnxruntime~=1.19.2 + - onnxoptimizer~=0.3.13 + - onnxmltools~=1.13.0 + - tf2onnx~=1.16.1 + - plotly~=5.4.0 + with_mlrun: false + auto_build: true + disable_auto_mount: false description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. - default_handler: to_onnx - command: '' image: '' entry_points: tf_keras_to_onnx: - has_kwargs: false - lineno: 26 - name: tf_keras_to_onnx doc: Convert a TF.Keras model to an ONNX model and log it back to MLRun as a new model object. + name: tf_keras_to_onnx parameters: - name: model_handler doc: An initialized TFKerasModelHandler with a loaded model to convert to @@ -36,12 +59,12 @@ spec: will be tried to be read from the model artifact. Defaulted to None.' 
default: null has_varargs: false - pytorch_to_onnx: has_kwargs: false - lineno: 81 - name: pytorch_to_onnx + lineno: 26 + pytorch_to_onnx: doc: Convert a PyTorch model to an ONNX model and log it back to MLRun as a new model object. + name: pytorch_to_onnx parameters: - name: model_handler doc: An initialized PyTorchModelHandler with a loaded model to convert to @@ -94,11 +117,11 @@ spec: output layer. Defaulted to True. Will be ignored if 'dynamic_axes' is provided. default: true has_varargs: false - to_onnx: has_kwargs: false - lineno: 160 - name: to_onnx + lineno: 81 + to_onnx: doc: Convert the given model to an ONNX model. + name: to_onnx parameters: - name: context type: MLClientCtx @@ -128,11 +151,11 @@ spec: "help". default: null has_varargs: false - optimize: has_kwargs: false - lineno: 224 - name: optimize + lineno: 160 + optimize: doc: Optimize the given ONNX model. + name: optimize parameters: - name: context type: MLClientCtx @@ -159,30 +182,8 @@ spec: overridden. Defaulted to None. 
default: null has_varargs: false - build: - code_origin: '' - functionSourceCode:  - auto_build: true - base_image: mlrun/mlrun - with_mlrun: false - requirements: - - tqdm~=4.67.1 - - tensorflow~=2.19.0 - - tf_keras~=2.19.0 - - torch~=2.6.0 - - torchvision~=0.21.0 - - onnx~=1.17.0 - - onnxruntime~=1.19.2 - - onnxoptimizer~=0.3.13 - - onnxmltools~=1.13.0 - - tf2onnx~=1.16.1 - - plotly~=5.4.0 - origin_filename: '' - disable_auto_mount: false -metadata: - categories: - - utils - tag: '' - name: onnx-utils -verbose: false -kind: job + has_kwargs: false + lineno: 224 + default_handler: to_onnx + allow_empty_resources: true + command: '' diff --git a/functions/master/onnx_utils/1.3.0/src/item.yaml b/functions/master/onnx_utils/1.3.0/src/item.yaml index 4a9455d9..ba65ce91 100644 --- a/functions/master/onnx_utils/1.3.0/src/item.yaml +++ b/functions/master/onnx_utils/1.3.0/src/item.yaml @@ -1,6 +1,7 @@ apiVersion: v1 categories: - utils +- deep-learning description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. 
doc: '' diff --git a/functions/master/onnx_utils/1.3.0/static/documentation.html b/functions/master/onnx_utils/1.3.0/static/documentation.html index 5f1eec32..7505c831 100644 --- a/functions/master/onnx_utils/1.3.0/static/documentation.html +++ b/functions/master/onnx_utils/1.3.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/onnx_utils/1.3.0/static/example.html b/functions/master/onnx_utils/1.3.0/static/example.html index 7eff856a..3d4d124b 100644 --- a/functions/master/onnx_utils/1.3.0/static/example.html +++ b/functions/master/onnx_utils/1.3.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/onnx_utils/1.3.0/static/function.html b/functions/master/onnx_utils/1.3.0/static/function.html index 584bb31e..2d049e5f 100644 --- a/functions/master/onnx_utils/1.3.0/static/function.html +++ b/functions/master/onnx_utils/1.3.0/static/function.html @@ -28,20 +28,43 @@
         
+kind: job
+metadata:
+  categories:
+  - utils
+  - deep-learning
+  name: onnx-utils
+  tag: ''
+verbose: false
 spec:
-  allow_empty_resources: true
+  build:
+    code_origin: ''
+    base_image: mlrun/mlrun
+    origin_filename: ''
+    functionSourceCode: 
+    requirements:
+    - tqdm~=4.67.1
+    - tensorflow~=2.19.0
+    - tf_keras~=2.19.0
+    - torch~=2.6.0
+    - torchvision~=0.21.0
+    - onnx~=1.17.0
+    - onnxruntime~=1.19.2
+    - onnxoptimizer~=0.3.13
+    - onnxmltools~=1.13.0
+    - tf2onnx~=1.16.1
+    - plotly~=5.4.0
+    with_mlrun: false
+    auto_build: true
+  disable_auto_mount: false
   description: ONNX intigration in MLRun, some utils functions for the ONNX framework,
     optimizing and converting models from different framework to ONNX using MLRun.
-  default_handler: to_onnx
-  command: ''
   image: ''
   entry_points:
     tf_keras_to_onnx:
-      has_kwargs: false
-      lineno: 26
-      name: tf_keras_to_onnx
       doc: Convert a TF.Keras model to an ONNX model and log it back to MLRun as a
         new model object.
+      name: tf_keras_to_onnx
       parameters:
       - name: model_handler
         doc: An initialized TFKerasModelHandler with a loaded model to convert to
@@ -66,12 +89,12 @@
           will be tried to be read from the model artifact. Defaulted to None.'
         default: null
       has_varargs: false
-    pytorch_to_onnx:
       has_kwargs: false
-      lineno: 81
-      name: pytorch_to_onnx
+      lineno: 26
+    pytorch_to_onnx:
       doc: Convert a PyTorch model to an ONNX model and log it back to MLRun as a
         new model object.
+      name: pytorch_to_onnx
       parameters:
       - name: model_handler
         doc: An initialized PyTorchModelHandler with a loaded model to convert to
@@ -124,11 +147,11 @@
           output layer. Defaulted to True. Will be ignored if 'dynamic_axes' is provided.
         default: true
       has_varargs: false
-    to_onnx:
       has_kwargs: false
-      lineno: 160
-      name: to_onnx
+      lineno: 81
+    to_onnx:
       doc: Convert the given model to an ONNX model.
+      name: to_onnx
       parameters:
       - name: context
         type: MLClientCtx
@@ -158,11 +181,11 @@
           "help".
         default: null
       has_varargs: false
-    optimize:
       has_kwargs: false
-      lineno: 224
-      name: optimize
+      lineno: 160
+    optimize:
       doc: Optimize the given ONNX model.
+      name: optimize
       parameters:
       - name: context
         type: MLClientCtx
@@ -189,33 +212,11 @@
           overridden. Defaulted to None.
         default: null
       has_varargs: false
-  build:
-    code_origin: ''
-    functionSourceCode: 
-    auto_build: true
-    base_image: mlrun/mlrun
-    with_mlrun: false
-    requirements:
-    - tqdm~=4.67.1
-    - tensorflow~=2.19.0
-    - tf_keras~=2.19.0
-    - torch~=2.6.0
-    - torchvision~=0.21.0
-    - onnx~=1.17.0
-    - onnxruntime~=1.19.2
-    - onnxoptimizer~=0.3.13
-    - onnxmltools~=1.13.0
-    - tf2onnx~=1.16.1
-    - plotly~=5.4.0
-    origin_filename: ''
-  disable_auto_mount: false
-metadata:
-  categories:
-  - utils
-  tag: ''
-  name: onnx-utils
-verbose: false
-kind: job
+      has_kwargs: false
+      lineno: 224
+  default_handler: to_onnx
+  allow_empty_resources: true
+  command: ''
 
         
     
diff --git a/functions/master/onnx_utils/1.3.0/static/item.html b/functions/master/onnx_utils/1.3.0/static/item.html index ab6f22d2..b575e949 100644 --- a/functions/master/onnx_utils/1.3.0/static/item.html +++ b/functions/master/onnx_utils/1.3.0/static/item.html @@ -31,6 +31,7 @@ apiVersion: v1 categories: - utils +- deep-learning description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. doc: '' diff --git a/functions/master/onnx_utils/1.3.0/static/onnx_utils.html b/functions/master/onnx_utils/1.3.0/static/onnx_utils.html index c1964206..297beedb 100644 --- a/functions/master/onnx_utils/1.3.0/static/onnx_utils.html +++ b/functions/master/onnx_utils/1.3.0/static/onnx_utils.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/onnx_utils/latest/src/function.yaml b/functions/master/onnx_utils/latest/src/function.yaml index 67ebf0d5..af3d5082 100644 --- a/functions/master/onnx_utils/latest/src/function.yaml +++ b/functions/master/onnx_utils/latest/src/function.yaml @@ -1,17 +1,40 @@ +kind: job +metadata: + categories: + - utils + - deep-learning + name: onnx-utils + tag: '' +verbose: false spec: - allow_empty_resources: true + build: + code_origin: '' + base_image: mlrun/mlrun + origin_filename: '' + functionSourceCode:  + requirements: + - tqdm~=4.67.1 + - tensorflow~=2.19.0 + - tf_keras~=2.19.0 + - torch~=2.6.0 + - torchvision~=0.21.0 + - onnx~=1.17.0 + - onnxruntime~=1.19.2 + - onnxoptimizer~=0.3.13 + - onnxmltools~=1.13.0 + - tf2onnx~=1.16.1 + - plotly~=5.4.0 + with_mlrun: false + auto_build: true + disable_auto_mount: false description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. 
- default_handler: to_onnx - command: '' image: '' entry_points: tf_keras_to_onnx: - has_kwargs: false - lineno: 26 - name: tf_keras_to_onnx doc: Convert a TF.Keras model to an ONNX model and log it back to MLRun as a new model object. + name: tf_keras_to_onnx parameters: - name: model_handler doc: An initialized TFKerasModelHandler with a loaded model to convert to @@ -36,12 +59,12 @@ spec: will be tried to be read from the model artifact. Defaulted to None.' default: null has_varargs: false - pytorch_to_onnx: has_kwargs: false - lineno: 81 - name: pytorch_to_onnx + lineno: 26 + pytorch_to_onnx: doc: Convert a PyTorch model to an ONNX model and log it back to MLRun as a new model object. + name: pytorch_to_onnx parameters: - name: model_handler doc: An initialized PyTorchModelHandler with a loaded model to convert to @@ -94,11 +117,11 @@ spec: output layer. Defaulted to True. Will be ignored if 'dynamic_axes' is provided. default: true has_varargs: false - to_onnx: has_kwargs: false - lineno: 160 - name: to_onnx + lineno: 81 + to_onnx: doc: Convert the given model to an ONNX model. + name: to_onnx parameters: - name: context type: MLClientCtx @@ -128,11 +151,11 @@ spec: "help". default: null has_varargs: false - optimize: has_kwargs: false - lineno: 224 - name: optimize + lineno: 160 + optimize: doc: Optimize the given ONNX model. + name: optimize parameters: - name: context type: MLClientCtx @@ -159,30 +182,8 @@ spec: overridden. Defaulted to None. 
default: null has_varargs: false - build: - code_origin: '' - functionSourceCode:  - auto_build: true - base_image: mlrun/mlrun - with_mlrun: false - requirements: - - tqdm~=4.67.1 - - tensorflow~=2.19.0 - - tf_keras~=2.19.0 - - torch~=2.6.0 - - torchvision~=0.21.0 - - onnx~=1.17.0 - - onnxruntime~=1.19.2 - - onnxoptimizer~=0.3.13 - - onnxmltools~=1.13.0 - - tf2onnx~=1.16.1 - - plotly~=5.4.0 - origin_filename: '' - disable_auto_mount: false -metadata: - categories: - - utils - tag: '' - name: onnx-utils -verbose: false -kind: job + has_kwargs: false + lineno: 224 + default_handler: to_onnx + allow_empty_resources: true + command: '' diff --git a/functions/master/onnx_utils/latest/src/item.yaml b/functions/master/onnx_utils/latest/src/item.yaml index 4a9455d9..ba65ce91 100644 --- a/functions/master/onnx_utils/latest/src/item.yaml +++ b/functions/master/onnx_utils/latest/src/item.yaml @@ -1,6 +1,7 @@ apiVersion: v1 categories: - utils +- deep-learning description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. 
doc: '' diff --git a/functions/master/onnx_utils/latest/static/documentation.html b/functions/master/onnx_utils/latest/static/documentation.html index 5f1eec32..7505c831 100644 --- a/functions/master/onnx_utils/latest/static/documentation.html +++ b/functions/master/onnx_utils/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/onnx_utils/latest/static/example.html b/functions/master/onnx_utils/latest/static/example.html index 7eff856a..3d4d124b 100644 --- a/functions/master/onnx_utils/latest/static/example.html +++ b/functions/master/onnx_utils/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/onnx_utils/latest/static/function.html b/functions/master/onnx_utils/latest/static/function.html index 584bb31e..2d049e5f 100644 --- a/functions/master/onnx_utils/latest/static/function.html +++ b/functions/master/onnx_utils/latest/static/function.html @@ -28,20 +28,43 @@
         
+kind: job
+metadata:
+  categories:
+  - utils
+  - deep-learning
+  name: onnx-utils
+  tag: ''
+verbose: false
 spec:
-  allow_empty_resources: true
+  build:
+    code_origin: ''
+    base_image: mlrun/mlrun
+    origin_filename: ''
+    functionSourceCode: 
+    requirements:
+    - tqdm~=4.67.1
+    - tensorflow~=2.19.0
+    - tf_keras~=2.19.0
+    - torch~=2.6.0
+    - torchvision~=0.21.0
+    - onnx~=1.17.0
+    - onnxruntime~=1.19.2
+    - onnxoptimizer~=0.3.13
+    - onnxmltools~=1.13.0
+    - tf2onnx~=1.16.1
+    - plotly~=5.4.0
+    with_mlrun: false
+    auto_build: true
+  disable_auto_mount: false
   description: ONNX intigration in MLRun, some utils functions for the ONNX framework,
     optimizing and converting models from different framework to ONNX using MLRun.
-  default_handler: to_onnx
-  command: ''
   image: ''
   entry_points:
     tf_keras_to_onnx:
-      has_kwargs: false
-      lineno: 26
-      name: tf_keras_to_onnx
       doc: Convert a TF.Keras model to an ONNX model and log it back to MLRun as a
         new model object.
+      name: tf_keras_to_onnx
       parameters:
       - name: model_handler
         doc: An initialized TFKerasModelHandler with a loaded model to convert to
@@ -66,12 +89,12 @@
           will be tried to be read from the model artifact. Defaulted to None.'
         default: null
       has_varargs: false
-    pytorch_to_onnx:
       has_kwargs: false
-      lineno: 81
-      name: pytorch_to_onnx
+      lineno: 26
+    pytorch_to_onnx:
       doc: Convert a PyTorch model to an ONNX model and log it back to MLRun as a
         new model object.
+      name: pytorch_to_onnx
       parameters:
       - name: model_handler
         doc: An initialized PyTorchModelHandler with a loaded model to convert to
@@ -124,11 +147,11 @@
           output layer. Defaulted to True. Will be ignored if 'dynamic_axes' is provided.
         default: true
       has_varargs: false
-    to_onnx:
       has_kwargs: false
-      lineno: 160
-      name: to_onnx
+      lineno: 81
+    to_onnx:
       doc: Convert the given model to an ONNX model.
+      name: to_onnx
       parameters:
       - name: context
         type: MLClientCtx
@@ -158,11 +181,11 @@
           "help".
         default: null
       has_varargs: false
-    optimize:
       has_kwargs: false
-      lineno: 224
-      name: optimize
+      lineno: 160
+    optimize:
       doc: Optimize the given ONNX model.
+      name: optimize
       parameters:
       - name: context
         type: MLClientCtx
@@ -189,33 +212,11 @@
           overridden. Defaulted to None.
         default: null
       has_varargs: false
-  build:
-    code_origin: ''
-    functionSourceCode: 
-    auto_build: true
-    base_image: mlrun/mlrun
-    with_mlrun: false
-    requirements:
-    - tqdm~=4.67.1
-    - tensorflow~=2.19.0
-    - tf_keras~=2.19.0
-    - torch~=2.6.0
-    - torchvision~=0.21.0
-    - onnx~=1.17.0
-    - onnxruntime~=1.19.2
-    - onnxoptimizer~=0.3.13
-    - onnxmltools~=1.13.0
-    - tf2onnx~=1.16.1
-    - plotly~=5.4.0
-    origin_filename: ''
-  disable_auto_mount: false
-metadata:
-  categories:
-  - utils
-  tag: ''
-  name: onnx-utils
-verbose: false
-kind: job
+      has_kwargs: false
+      lineno: 224
+  default_handler: to_onnx
+  allow_empty_resources: true
+  command: ''
 
         
     
diff --git a/functions/master/onnx_utils/latest/static/item.html b/functions/master/onnx_utils/latest/static/item.html index ab6f22d2..b575e949 100644 --- a/functions/master/onnx_utils/latest/static/item.html +++ b/functions/master/onnx_utils/latest/static/item.html @@ -31,6 +31,7 @@ apiVersion: v1 categories: - utils +- deep-learning description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. doc: '' diff --git a/functions/master/onnx_utils/latest/static/onnx_utils.html b/functions/master/onnx_utils/latest/static/onnx_utils.html index c1964206..297beedb 100644 --- a/functions/master/onnx_utils/latest/static/onnx_utils.html +++ b/functions/master/onnx_utils/latest/static/onnx_utils.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/open_archive/1.2.0/src/function.yaml b/functions/master/open_archive/1.2.0/src/function.yaml index dee623a0..bf78b5fc 100644 --- a/functions/master/open_archive/1.2.0/src/function.yaml +++ b/functions/master/open_archive/1.2.0/src/function.yaml @@ -1,22 +1,20 @@ kind: job -metadata: - name: open-archive - categories: - - data-preparation - tag: '' verbose: false spec: + command: '' + disable_auto_mount: false + default_handler: open_archive build: + functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3ViZGlyID0gTm9uZQogICAgIyBX
aGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQogICAgICAgIGVsc2U6CiAgICAg
ICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWUgaW4gcmVmLm5hbWVsaXN0KCk6
CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA== code_origin: '' origin_filename: '' - functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3ViZGlyID0gTm9uZQogICAgIyBX
aGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQogICAgICAgIGVsc2U6CiAgICAg
ICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWUgaW4gcmVmLm5hbWVsaXN0KCk6
CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA== - default_handler: open_archive - command: '' - image: mlrun/mlrun description: Open a file/object archive into a target directory + image: mlrun/mlrun entry_points: open_archive: has_kwargs: false + lineno: 27 + name: open_archive parameters: - name: context type: MLClientCtx @@ -40,6 +38,8 @@ spec: doc: Open a file/object archive into a target directory. Currently, supports zip and tar.gz. 
has_varargs: false - name: open_archive - lineno: 27 - disable_auto_mount: false +metadata: + name: open-archive + categories: + - utils + tag: '' diff --git a/functions/master/open_archive/1.2.0/src/item.yaml b/functions/master/open_archive/1.2.0/src/item.yaml index 35b5e147..0a2f4516 100644 --- a/functions/master/open_archive/1.2.0/src/item.yaml +++ b/functions/master/open_archive/1.2.0/src/item.yaml @@ -1,6 +1,6 @@ apiVersion: v1 categories: -- data-preparation +- utils description: Open a file/object archive into a target directory doc: '' example: open_archive.ipynb diff --git a/functions/master/open_archive/1.2.0/static/documentation.html b/functions/master/open_archive/1.2.0/static/documentation.html index 8d47370a..316b0576 100644 --- a/functions/master/open_archive/1.2.0/static/documentation.html +++ b/functions/master/open_archive/1.2.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/open_archive/1.2.0/static/example.html b/functions/master/open_archive/1.2.0/static/example.html index c7b8d345..567eaf34 100644 --- a/functions/master/open_archive/1.2.0/static/example.html +++ b/functions/master/open_archive/1.2.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/open_archive/1.2.0/static/function.html b/functions/master/open_archive/1.2.0/static/function.html index e6348802..16dd7fa7 100644 --- a/functions/master/open_archive/1.2.0/static/function.html +++ b/functions/master/open_archive/1.2.0/static/function.html @@ -29,24 +29,22 @@
         
 kind: job
-metadata:
-  name: open-archive
-  categories:
-  - data-preparation
-  tag: ''
 verbose: false
 spec:
+  command: ''
+  disable_auto_mount: false
+  default_handler: open_archive
   build:
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3V
iZGlyID0gTm9uZQogICAgIyBXaGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQo
gICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWU
gaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA==
     code_origin: ''
     origin_filename: ''
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3V
iZGlyID0gTm9uZQogICAgIyBXaGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQo
gICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWU
gaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA==
-  default_handler: open_archive
-  command: ''
-  image: mlrun/mlrun
   description: Open a file/object archive into a target directory
+  image: mlrun/mlrun
   entry_points:
     open_archive:
       has_kwargs: false
+      lineno: 27
+      name: open_archive
       parameters:
       - name: context
         type: MLClientCtx
@@ -70,9 +68,11 @@
       doc: Open a file/object archive into a target directory. Currently, supports
         zip and tar.gz.
       has_varargs: false
-      name: open_archive
-      lineno: 27
-  disable_auto_mount: false
+metadata:
+  name: open-archive
+  categories:
+  - utils
+  tag: ''
 
         
     
diff --git a/functions/master/open_archive/1.2.0/static/item.html b/functions/master/open_archive/1.2.0/static/item.html index 0e81169e..fed83413 100644 --- a/functions/master/open_archive/1.2.0/static/item.html +++ b/functions/master/open_archive/1.2.0/static/item.html @@ -30,7 +30,7 @@ apiVersion: v1 categories: -- data-preparation +- utils description: Open a file/object archive into a target directory doc: '' example: open_archive.ipynb diff --git a/functions/master/open_archive/1.2.0/static/open_archive.html b/functions/master/open_archive/1.2.0/static/open_archive.html index ec8542bd..5aa03a6d 100644 --- a/functions/master/open_archive/1.2.0/static/open_archive.html +++ b/functions/master/open_archive/1.2.0/static/open_archive.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/open_archive/latest/src/function.yaml b/functions/master/open_archive/latest/src/function.yaml index dee623a0..bf78b5fc 100644 --- a/functions/master/open_archive/latest/src/function.yaml +++ b/functions/master/open_archive/latest/src/function.yaml @@ -1,22 +1,20 @@ kind: job -metadata: - name: open-archive - categories: - - data-preparation - tag: '' verbose: false spec: + command: '' + disable_auto_mount: false + default_handler: open_archive build: + functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3ViZGlyID0gTm9uZQogICAgIyBX
aGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQogICAgICAgIGVsc2U6CiAgICAg
ICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWUgaW4gcmVmLm5hbWVsaXN0KCk6
CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA== code_origin: '' origin_filename: '' - functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3ViZGlyID0gTm9uZQogICAgIyBX
aGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQogICAgICAgIGVsc2U6CiAgICAg
ICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWUgaW4gcmVmLm5hbWVsaXN0KCk6
CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA== - default_handler: open_archive - command: '' - image: mlrun/mlrun description: Open a file/object archive into a target directory + image: mlrun/mlrun entry_points: open_archive: has_kwargs: false + lineno: 27 + name: open_archive parameters: - name: context type: MLClientCtx @@ -40,6 +38,8 @@ spec: doc: Open a file/object archive into a target directory. Currently, supports zip and tar.gz. 
has_varargs: false - name: open_archive - lineno: 27 - disable_auto_mount: false +metadata: + name: open-archive + categories: + - utils + tag: '' diff --git a/functions/master/open_archive/latest/src/item.yaml b/functions/master/open_archive/latest/src/item.yaml index 35b5e147..0a2f4516 100644 --- a/functions/master/open_archive/latest/src/item.yaml +++ b/functions/master/open_archive/latest/src/item.yaml @@ -1,6 +1,6 @@ apiVersion: v1 categories: -- data-preparation +- utils description: Open a file/object archive into a target directory doc: '' example: open_archive.ipynb diff --git a/functions/master/open_archive/latest/static/documentation.html b/functions/master/open_archive/latest/static/documentation.html index 8d47370a..316b0576 100644 --- a/functions/master/open_archive/latest/static/documentation.html +++ b/functions/master/open_archive/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/open_archive/latest/static/example.html b/functions/master/open_archive/latest/static/example.html index c7b8d345..567eaf34 100644 --- a/functions/master/open_archive/latest/static/example.html +++ b/functions/master/open_archive/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/open_archive/latest/static/function.html b/functions/master/open_archive/latest/static/function.html index e6348802..16dd7fa7 100644 --- a/functions/master/open_archive/latest/static/function.html +++ b/functions/master/open_archive/latest/static/function.html @@ -29,24 +29,22 @@
         
 kind: job
-metadata:
-  name: open-archive
-  categories:
-  - data-preparation
-  tag: ''
 verbose: false
 spec:
+  command: ''
+  disable_auto_mount: false
+  default_handler: open_archive
   build:
+    functionSourceCode: IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3V
iZGlyID0gTm9uZQogICAgIyBXaGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQo
gICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWU
gaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA==
     code_origin: ''
     origin_filename: ''
-    functionSourceCode: IyBDb3B5cmlnaHQgMjAyNSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdGFyZmlsZQoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cy5iYXNlIGltcG9ydCBEaXJBcnRpZmFjdAoKZnJvbSB1cmxsaWIucGFyc2UgaW1wb3J0IHVybHBhcnNlCgoKZGVmIG9wZW5fYXJjaGl2ZSgKICAgICAgICBjb250ZXh0OiBNTENsaWVudEN0eCwKICAgICAgICBhcmNoaXZlX3VybDogRGF0YUl0ZW0sCiAgICAgICAgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLAogICAgICAgIGtleTogc3RyID0gImNvbnRlbnQiLAogICAgICAgIHRhcmdldF9wYXRoOiBzdHIgPSBOb25lLAopOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkuIEN1cnJlbnRseSwgc3VwcG9ydHMgemlwIGFuZCB0YXIuZ3ouCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgZnVuY3Rpb24gZXhlY3V0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBhcmNoaXZlX3VybDogIHVybCBvZiBhcmNoaXZlIGZpbGUKICAgIDpwYXJhbSBzdWJkaXI6ICAgICAgIHBhdGggd2l0aGluIGFydGlmYWN0IHN0b3JlIHdoZXJlIGV4dHJhY3RlZCBmaWxlcyBhcmUgc3RvcmVkLCBkZWZhdWx0IGlzICIvY29udGVudCIKICAgIDpwYXJhbSBrZXk6ICAgICAgICAgIGtleSBvZiBhcmNoaXZlIGNvbnRlbnRzIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICBmaWxlIHN5c3RlbSBwYXRoIHRvIHN0b3JlIGV4dHJhY3RlZCBmaWxlcwogICAgIiIiCgogICAgIyBSZXNvbHZlcyB0aGUgYXJjaGl2ZSBsb2NhbGx5CiAgICBhcmNoaXZlX3VybCA9IGFyY2hpdmVfdXJsLmxvY2FsKCkKICAgIHYzaW9fc3V
iZGlyID0gTm9uZQogICAgIyBXaGVuIGN1c3RvbSBhcnRpZmFjdCBwYXRoIGlzIGRlZmluZWQKICAgIGlmIG5vdCB0YXJnZXRfcGF0aCBhbmQgY29udGV4dC5hcnRpZmFjdF9wYXRoOgogICAgICAgIHBhcnNlZF9zdWJkaXIgPSB1cmxwYXJzZShjb250ZXh0LmFydGlmYWN0X3BhdGgpCiAgICAgICAgaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3MzJzoKICAgICAgICAgICAgc3ViZGlyID0gb3MucGF0aC5qb2luKGNvbnRleHQuYXJ0aWZhY3RfcGF0aCwgc3ViZGlyKQogICAgICAgIGVsaWYgcGFyc2VkX3N1YmRpci5zY2hlbWUgPT0gJ3YzaW8nOgogICAgICAgICAgICB2M2lvX3N1YmRpciA9IG9zLnBhdGguam9pbihjb250ZXh0LmFydGlmYWN0X3BhdGgsIHN1YmRpcikgICMgVXNpbmcgdjNpb19zdWJkaXIgZm9yIGxvZ2dpbmcKICAgICAgICAgICAgc3ViZGlyID0gJy92M2lvJyArIHBhcnNlZF9zdWJkaXIucGF0aCArICcvJyArIHN1YmRpcgogICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgdjNpbyBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidVbnJlY29nbml6YWJsZSBzY2hlbWUsIGV4dHJhY3RpbmcgdG8ge3N1YmRpcn0nKQoKICAgICMgV2hlbiB3b3JraW5nIG9uIENFLCB0YXJnZXQgcGF0aCBtaWdodCBiZSBvbiBzMwogICAgaWYgJ3MzJyBpbiAodGFyZ2V0X3BhdGggb3Igc3ViZGlyKToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnVXNpbmcgczMgc2NoZW1lLCBleHRyYWN0aW5nIHRvIHt0YXJnZXRfcGF0aCBvciBzdWJkaXJ9JykKCiAgICAgICAgaWYgYXJjaGl2ZV91cmwuZW5kc3dpdGgoImd6Iik6CiAgICAgICAgICAgIF9leHRyYWN0X2d6X2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQoKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoLCBpbl9zMz1UcnVlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCiAgICBlbHNlOgogICAgICAgIGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJneiIpOgogICAgICAgICAgICBfZXh0cmFjdF9nel9maWxlKGFyY2hpdmVfdXJsPWFyY2hpdmVfdXJsLCBzdWJkaXI9c3ViZGlyLCB0YXJnZXRfcGF0aD10YXJnZXRfcGF0aCkKICAgICAgICBlbGlmIGFyY2hpdmVfdXJsLmVuZHN3aXRoKCJ6aXAiKToKICAgICAgICAgICAgX2V4dHJhY3RfemlwX2ZpbGUoYXJjaGl2ZV91cmw9YXJjaGl2ZV91cmwsIHN1YmRpcj1zdWJkaXIsIHRhcmdldF9wYXRoPXRhcmdldF9wYXRoKQo
gICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJ1bnN1cHBvcnRlZCBhcmNoaXZlIHR5cGUgaW4ge2FyY2hpdmVfdXJsfSIpCgogICAgaWYgdjNpb19zdWJkaXI6CiAgICAgICAgc3ViZGlyID0gdjNpb19zdWJkaXIKCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYnTG9nZ2luZyBhcnRpZmFjdCB0byB7KHRhcmdldF9wYXRoIG9yIHN1YmRpcil9JykKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KERpckFydGlmYWN0KGtleT1rZXksIHRhcmdldF9wYXRoPSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpKSkKCgpkZWYgX2V4dHJhY3RfZ3pfZmlsZShhcmNoaXZlX3VybDogc3RyLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InJ8Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBtZW1iZXIgaW4gcmVmLmdldG1lbWJlcnMoKToKICAgICAgICAgICAgICAgIGRhdGEgPSByZWYuZXh0cmFjdGZpbGUobWVtYmVyPW1lbWJlcikucmVhZCgpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXttZW1iZXIubmFtZX0nKQogICAgZWxzZToKICAgICAgICBvcy5tYWtlZGlycyh0YXJnZXRfcGF0aCBvciBzdWJkaXIsIGV4aXN0X29rPVRydWUpCiAgICAgICAgd2l0aCB0YXJmaWxlLm9wZW4oYXJjaGl2ZV91cmwsIG1vZGU9InI6Z3oiKSBhcyByZWY6CiAgICAgICAgICAgIGZvciBlbnRyeSBpbiByZWY6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIHRoYXQgdGhlcmUgaXMgbm8gcGF0aCB0cmF2ZXJzYWwgaW4gdGhlIGFyY2hpdmUKICAgICAgICAgICAgICAgIGlmIG9zLnBhdGguaXNhYnMoZW50cnkubmFtZSkgb3IgIi4uIiBpbiBlbnRyeS5uYW1lOgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHRhciBhcmNoaXZlIGVudHJ5OiB7ZW50cnkubmFtZX0iKQoKICAgICAgICAgICAgICAgIHJlZi5leHRyYWN0KGVudHJ5LCB0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9leHRyYWN0X3ppcF9maWxlKGFyY2hpdmVfdXJsLCB0YXJnZXRfcGF0aDogc3RyID0gTm9uZSwgc3ViZGlyOiBzdHIgPSAiY29udGVudC8iLCBpbl9zMzogYm9vbCA9IEZhbHNlKToKICAgIGlmIGluX3MzOgogICAgICAgIGNsaWVudCA9IF9pbml0X2JvdG8zX2NsaWVudCgpCiAgICAgICAgd2l0aCB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICJyIikgYXMgcmVmOgogICAgICAgICAgICBmb3IgZmlsZW5hbWU
gaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBkYXRhID0gcmVmLnJlYWQoZmlsZW5hbWUpCiAgICAgICAgICAgICAgICBjbGllbnQucHV0X29iamVjdChCb2R5PWRhdGEsIEJ1Y2tldD11cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLm5ldGxvYywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEtleT1mJ3t1cmxwYXJzZSh0YXJnZXRfcGF0aCBvciBzdWJkaXIpLnBhdGhbMTpdfXtmaWxlbmFtZX0nKQogICAgZWxzZToKICAgICAgICB3aXRoIHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgInIiKSBhcyByZWY6CiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhhdCB0aGVyZSBpcyBubyBwYXRoIHRyYXZlcnNhbCBpbiB0aGUgYXJjaGl2ZQogICAgICAgICAgICBmb3IgZW50cnkgaW4gcmVmLm5hbWVsaXN0KCk6CiAgICAgICAgICAgICAgICBpZiBvcy5wYXRoLmlzYWJzKGVudHJ5KSBvciAiLi4iIGluIGVudHJ5OgogICAgICAgICAgICAgICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoZiJJbGxlZ2FsIHppcCBhcmNoaXZlIGVudHJ5OiB7ZW50cnl9IikKICAgICAgICAgICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGggb3Igc3ViZGlyLCBleGlzdF9vaz1UcnVlKQogICAgICAgICAgICByZWYuZXh0cmFjdGFsbCh0YXJnZXRfcGF0aCBvciBzdWJkaXIpCgoKZGVmIF9pbml0X2JvdG8zX2NsaWVudCgpOgogICAgaW1wb3J0IGJvdG8zCiAgICBpZiBvcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJyk6CiAgICAgICAgY2xpZW50ID0gYm90bzMuY2xpZW50KCdzMycsIGVuZHBvaW50X3VybD1vcy5lbnZpcm9uLmdldCgnUzNfRU5EUE9JTlRfVVJMJykpCiAgICBlbHNlOgogICAgICAgIGNsaWVudCA9IGJvdG8zLmNsaWVudCgnczMnKQogICAgcmV0dXJuIGNsaWVudA==
-  default_handler: open_archive
-  command: ''
-  image: mlrun/mlrun
   description: Open a file/object archive into a target directory
+  image: mlrun/mlrun
   entry_points:
     open_archive:
       has_kwargs: false
+      lineno: 27
+      name: open_archive
       parameters:
       - name: context
         type: MLClientCtx
@@ -70,9 +68,11 @@
       doc: Open a file/object archive into a target directory. Currently, supports
         zip and tar.gz.
       has_varargs: false
-      name: open_archive
-      lineno: 27
-  disable_auto_mount: false
+metadata:
+  name: open-archive
+  categories:
+  - utils
+  tag: ''
 
         
     
diff --git a/functions/master/open_archive/latest/static/item.html b/functions/master/open_archive/latest/static/item.html index 0e81169e..fed83413 100644 --- a/functions/master/open_archive/latest/static/item.html +++ b/functions/master/open_archive/latest/static/item.html @@ -30,7 +30,7 @@ apiVersion: v1 categories: -- data-preparation +- utils description: Open a file/object archive into a target directory doc: '' example: open_archive.ipynb diff --git a/functions/master/open_archive/latest/static/open_archive.html b/functions/master/open_archive/latest/static/open_archive.html index ec8542bd..5aa03a6d 100644 --- a/functions/master/open_archive/latest/static/open_archive.html +++ b/functions/master/open_archive/latest/static/open_archive.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/pii_recognizer/0.4.0/src/data/config.csv b/functions/master/pii_recognizer/0.4.0/src/data/config.csv new file mode 100644 index 00000000..fe2c350e --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/data/config.csv @@ -0,0 +1,3 @@ +input_file,output_file +data/pii.txt,data/pii_out.txt +data/letter.txt,data/letter_out.txt diff --git a/functions/master/pii_recognizer/0.4.0/src/data/letter.txt b/functions/master/pii_recognizer/0.4.0/src/data/letter.txt new file mode 100644 index 00000000..59d25e78 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/data/letter.txt @@ -0,0 +1,12 @@ +Dear Mr. John Doe, + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of Riviera. Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. 
Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. + +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. + +Best regards, + +Jane Smith +Customer Support Representative diff --git a/functions/master/pii_recognizer/0.4.0/src/data/output/letter_output.txt b/functions/master/pii_recognizer/0.4.0/src/data/output/letter_output.txt new file mode 100644 index 00000000..468533af --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/data/output/letter_output.txt @@ -0,0 +1,12 @@ +Dear Mr. , + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of . Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. + +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. 
+ +Best regards, + + + Support diff --git a/functions/master/pii_recognizer/0.4.0/src/data/output/pii_output.txt b/functions/master/pii_recognizer/0.4.0/src/data/output/pii_output.txt new file mode 100644 index 00000000..1160e497 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/data/output/pii_output.txt @@ -0,0 +1 @@ + is , connect him with or , he can pay you with \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/src/data/pii.txt b/functions/master/pii_recognizer/0.4.0/src/data/pii.txt new file mode 100644 index 00000000..8886cc08 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/data/pii.txt @@ -0,0 +1 @@ +John smith's ssn is 182838483, connect him with John_smith@gmail.com or 6288389029, he can pay you with 41482929939393 \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/src/function.yaml b/functions/master/pii_recognizer/0.4.0/src/function.yaml new file mode 100644 index 00000000..e7d6c124 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/function.yaml @@ -0,0 +1,115 @@ +verbose: false +spec: + default_handler: recognize_pii + entry_points: + analyze: + name: analyze + outputs: + - doc: The list of Presidio RecognizerResult constructed from the recognized + Flair detections. + type: List[pa.RecognizerResult] + has_kwargs: false + parameters: + - name: self + - name: text + type: str + doc: The text for analysis. + - name: entities + type: List[str] + doc: The list of entities to recognize. + - name: nlp_artifacts + type: pa.nlp_engine.NlpArtifacts + doc: Not used by this recognizer but needed for the interface. + default: null + lineno: 381 + doc: Analyze text and return the results. + has_varargs: false + recognize_pii: + name: recognize_pii + outputs: + - doc: 'A tuple of:' + type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, + dict]] + has_kwargs: false + parameters: + - name: context + type: MLClientCtx + doc: The MLRun context. 
this is needed for log the artifacts. + - name: input_path + type: Union[str, Path] + doc: The input path of the text files needs to be analyzed. + - name: html_key + type: str + doc: The html key for the artifact. + - name: score_threshold + type: float + doc: The score threshold to mark the recognition as trusted. + - name: output_directory + type: str + doc: The output directory path to store the anonymized text. + default: null + - name: entities + type: List[str] + doc: The list of entities to recognize. + default: null + - name: entity_operator_map + type: dict + doc: The map of entity to operator (mask, redact, replace, keep, hash, and + its params) + default: null + - name: model + type: str + doc: The model to use. Can be "spacy", "flair", "pattern" or "whole". + default: null + - name: generate_json + type: bool + doc: Whether to generate the json report of the explanation. + default: true + - name: generate_html + type: bool + doc: Whether to generate the html report of the explanation. + default: true + - name: is_full_text + type: bool + doc: Whether to return the full text or only the masked text. + default: true + - name: is_full_html + type: bool + doc: Whether to return the full html or just the annotated text + default: true + - name: is_full_report + type: bool + doc: Whether to return the full report or just the score and start, end index + default: true + lineno: 845 + doc: 'Walk through the input path, recognize PII in text and store the anonymized + text in the output path. + + Generate the html with different colors for each entity, json report of the + explanation.' 
+ has_varargs: false + build: + base_image: mlrun/mlrun + requirements: + - nltk + - pandas + - presidio-anonymizer + - presidio-analyzer + - torch + - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653 + - st-annotated-text + - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl + functionSourceCode:  + code_origin: '' + origin_filename: '' + description: This function is used to recognize PII in a directory of text files + image: '' + command: '' + disable_auto_mount: false +kind: job +metadata: + name: pii-recognizer + tag: '' + categories: + - data-preparation + - NLP diff --git a/functions/master/pii_recognizer/0.4.0/src/item.yaml b/functions/master/pii_recognizer/0.4.0/src/item.yaml new file mode 100644 index 00000000..8f3185b4 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/item.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +categories: + - data-preparation + - NLP +description: This function is used to recognize PII in a directory of text files +doc: '' +example: pii_recognizer.ipynb +generationDate: 2023-08-15:10-24 +hidden: false +icon: '' +labels: + author: pgw +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: pii-recognizer +platformVersion: 3.5.3 +spec: + filename: pii_recognizer.py + handler: recognize_pii + image: mlrun/mlrun + kind: job + requirements: + - nltk + - pandas + - presidio-anonymizer + - presidio-analyzer + - torch + - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653 + - st-annotated-text + - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl +url: '' +version: 0.4.0 +test_valid: False diff --git a/functions/master/pii_recognizer/0.4.0/src/pii_recognizer.ipynb b/functions/master/pii_recognizer/0.4.0/src/pii_recognizer.ipynb new file mode 100644 index 00000000..48d1100d --- /dev/null +++ 
b/functions/master/pii_recognizer/0.4.0/src/pii_recognizer.ipynb @@ -0,0 +1,2015 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7412335f", + "metadata": {}, + "source": [ + "# PII Recognizer\n", + "\n", + "A function to detect pii data and anonymize the pii entity in the text. \n", + "\n", + "In this notebook we will go over the function's docs and outputs and see an end-to-end example of running it.\n", + "\n", + "1. [Documentation](#chapter1)\n", + "2. [Results](#chapter2)\n", + "3. [End-to-end Demo](#chapter3)" + ] + }, + { + "cell_type": "markdown", + "id": "0bb6c621", + "metadata": {}, + "source": [ + "\n", + "## 1. Documentation\n", + "\n", + "The function receive a directory path with all the text files in it. It walk through the directory, get all the text file. Then it detect the pii entity inside of the text file, apply the operator on the entity. Generate the html file with all pii entity highlighted. Generate the json report has the explaination of the process.\n" + ] + }, + { + "cell_type": "markdown", + "id": "de1a1349", + "metadata": {}, + "source": [ + "### 1.1. Parameters:\n", + "* **context**: `mlrun.MLClientCtx`\n", + " \n", + " The MLRun context\n", + " \n", + "* **input_path**: `str`\n", + " \n", + " The input directory with all the text files\n", + " \n", + "* **output_path**: `str`\n", + " \n", + " The directory that is used to store the anonymized text files. it is also used for mlrun to log the artifact as zip file\n", + " \n", + "* **output_suffix**: `str`\n", + " \n", + " The suffix will added to the input file. for example if the input text file is pii.txt, if output_suffix is \"anonymized\", the output file would be pii_anonymized.txt\n", + " \n", + "* **html_key**: `str`\n", + " \n", + " The artifact name of the html file \n", + " \n", + "* **entities**: `List[str]`\n", + " \n", + " The list of the entities to recognize. Please make sure the model you choose can recognize the entities. 
\n", + "\n", + "* **entity_operator_map**: `dict`\n", + " A mapping from entity type to the operator applied to it. Supported operators: Keep, Mask, Replace, Redact, Hash\n", + " \n", + "
\n",
+    "     entity_operator_map = {\n",
+    "        \"PERSON\": (\"keep\", {}),\n",
+    "        \"EMAIL\": (\"mask\", {\"masking_char\": \"#\", \"chars_to_mask\": 5, \"from_end\": False}),\n",
+    "        \"PHONE\": (\"hash\", {}),\n",
+    "        \"LOCATION\": (\"redact\", {}),\n",
+    "        \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n",
+    "        }\n",
+    "     
\n", + " \n", + " In this example:\n", + "\n", + " - \"PERSON\" entities are kept as they are using the \"keep\" operator. \n", + " - \"EMAIL\" entities are masked with the \"#\" character, masking the first five characters. \n", + " - \"PHONE\" entities are replaced with their hashed value using the \"hash\" operator.\n", + " - \"LOCATION\" entities are completely removed using the \"redact\" operator.\n", + " - \"ORGANIZATION\" entities are replaced with the string \"Company XYZ\" using the \"replace\" operator.\n", + " \n", + "* **model**: `str`\n", + " \n", + " - \"whole\", \"spacy\", \"pattern\", \"flair\". The default is \"whole\".\n", + " \n", + " Each model can detect a specific set of entities. The \"whole\" model combines all three models and can detect all the entities listed below. \n", + " \n", + " \n", + " - \"spacy\" : [\"LOCATION\", \"PERSON\",\"NRP\",\"ORGANIZATION\",\"DATE_TIME\"]\n", + " \n", + " - \"pattern\": [\"CREDIT_CARD\", \"SSN\", \"PHONE\", \"EMAIL\"]\n", + " \n", + " - \"flair\": [ \"LOCATION\",\n", + " \"PERSON\",\n", + " \"NRP\",\n", + " \"GPE\",\n", + " \"ORGANIZATION\",\n", + " \"MAC_ADDRESS\",\n", + " \"US_BANK_NUMBER\",\n", + " \"IMEI\",\n", + " \"TITLE\",\n", + " \"LICENSE_PLATE\",\n", + " \"US_PASSPORT\",\n", + " \"CURRENCY\",\n", + " \"ROUTING_NUMBER\",\n", + " \"US_ITIN\",\n", + " \"US_BANK_NUMBER\",\n", + " \"US_DRIVER_LICENSE\",\n", + " \"AGE\",\n", + " \"PASSWORD\",\n", + " \"SWIFT_CODE\"\n", + " ]\n", + " \n", + "* **score_threshold**:\n", + " \n", + " Minimum confidence value, the default is 0 to align with presidio.AnalyzerEngine\n", + " \n", + "* **generate_json_rpt**:\n", + "\n", + " Whether to generate the json report of the explanation\n", + " \n", + "* **generate_html_rpt**:\n", + "\n", + " Whether to generate the html with highlighted pii entities or not\n", + " \n", + "* **is_full_text**:\n", + "\n", + " Whether to return the full text or just the sentences with pii entities.\n", + " \n", + "* 
**is_full_html**: `bool`\n", + " \n", + " Whether to return the full html or just the annotated html\n", + " \n", + "* **is_full_report**: `bool`\n", + " \n", + " Whether to return the full json report or just the score and start, end index\n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "83f616d2", + "metadata": {}, + "source": [ + "### 1.2. Outputs:\n", + "\n", + "This function returns the following outputs. \n", + "\n", + "* **output_path**: `str`\n", + " \n", + " The directory where all the anonymized text files are stored\n", + "\n", + "* **rpt_json**: `dict`\n", + "\n", + " A report dict explaining how the model detected each pii entity\n", + " \n", + "* **errors** : `dict`\n", + " A dict of the errors raised while processing the text files, if any\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "310de23a", + "metadata": {}, + "source": [ + "\n", + "## 2. Results\n", + "\n", + "The result of the function looks like the following: \n", + "\n", + "For example if the input string is \n", + "\n", + "`John Doe 's ssn is 182838483, connect john doe with john_doe@gmail.com or 6288389029, he can pay you with 41482929939393`\n", + "\n", + "The anonymized_text is \n", + "\n", + "`'s is , connect with or , he can pay you with `\n", + "\n", + "The html_str is\n", + "\n", + "

John Doe'sPERSON ssnORGANIZATION is 182838483SSN, connect me with john_doe@gmail.comPERSONjohn_doe@gmail.comEMAIL or 6288389029PHONE, he can pay you with 41482929939393CREDIT_CARD\n", + "

\n", + "\n", + "The json report that explains the output is\n", + "\n", + "```yaml\n", + "\n", + "[\n", + " {\n", + " \"entity_type\": \"PERSON\", # result of the labeling\n", + " \"start\": 0, # start position of the entity\n", + " \"end\": 9, # end position of the entity\n", + " \"score\": 0.99, # the confidence score of the model + context_improvement\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\", # which recognizer is used to recognize this entity\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 0.99, # The original confidence score from the pre-trained model\n", + " \"score\": 0.99, # the final score = original_score + score_context_improvement\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0, # The improvement from the context\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_5577088640\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " ....\n", + "]\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "ce2199fb", + "metadata": {}, + "source": [ + "\n", + "## 3. End-to-end Demo\n" + ] + }, + { + "cell_type": "markdown", + "id": "fc42debf-f363-48f9-9512-3951d352fb1d", + "metadata": {}, + "source": [ + "### 3.1. Recognition configurations \n", + " - model: which model you want to use.\n", + " - entities: What entities to recognize? 
\n", + " - score_threshold: From which score to mark the recogniztion as trusted?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "2a290d0f-15da-434d-b3fc-46ebb35be611", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:17:04,305 [info] Project loaded successfully: {'project_name': 'pii'}\n", + "> 2023-07-31 02:17:04,312 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}\n", + "> 2023-07-31 02:17:04,408 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}\n", + "> 2023-07-31 02:17:04,409 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '51b5ad8144004e52a1008c08850842c8', 'db': None}\n", + "2023-07-31 02:17:04,567 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin\n", + "2023-07-31 02:17:07,730 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP\n", + "Model loaded\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fd09fd6ee2844e13b5839e1fd20ef222", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00\n", + ".dictlist {\n", + " background-color: #4EC64B;\n", + " text-align: center;\n", + " margin: 4px;\n", + " border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;}\n", + ".artifact {\n", + " cursor: pointer;\n", + " background-color: #4EC64B;\n", + " text-align: left;\n", + " margin: 4px; border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;\n", + "}\n", + "div.block.hidden {\n", + " display: none;\n", + "}\n", + 
".clickable {\n", + " cursor: pointer;\n", + "}\n", + ".ellipsis {\n", + " display: inline-block;\n", + " max-width: 60px;\n", + " white-space: nowrap;\n", + " overflow: hidden;\n", + " text-overflow: ellipsis;\n", + "}\n", + ".master-wrapper {\n", + " display: flex;\n", + " flex-flow: row nowrap;\n", + " justify-content: flex-start;\n", + " align-items: stretch;\n", + "}\n", + ".master-tbl {\n", + " flex: 3\n", + "}\n", + ".master-wrapper > div {\n", + " margin: 4px;\n", + " padding: 10px;\n", + "}\n", + "iframe.fileview {\n", + " border: 0 none;\n", + " height: 100%;\n", + " width: 100%;\n", + " white-space: pre-wrap;\n", + "}\n", + ".pane-header-title {\n", + " width: 80%;\n", + " font-weight: 500;\n", + "}\n", + ".pane-header {\n", + " line-height: 1;\n", + " background-color: #4EC64B;\n", + " padding: 3px;\n", + "}\n", + ".pane-header .close {\n", + " font-size: 20px;\n", + " font-weight: 700;\n", + " float: right;\n", + " margin-top: -5px;\n", + "}\n", + ".master-wrapper .right-pane {\n", + " border: 1px inset silver;\n", + " width: 40%;\n", + " min-height: 300px;\n", + " flex: 3\n", + " min-width: 500px;\n", + "}\n", + ".master-wrapper * {\n", + " box-sizing: border-box;\n", + "}\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
pii0Jul 31 02:17:04completedpii-recognizer-recognize-pii
v3io_user=pengw
kind=
owner=pengw
host=jupyter-pengw-5f99fb678d-mnvxl
model=whole
input_path=./data/
output_path=./data/output1/
entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
output_suffix=output
html_key=highlighted
score_threshold=0.5
highlighted
output_path
rpt_json
errors
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:17:12,403 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "artifact_path = \"./\"\n", + "proj = mlrun.get_or_create_project(\"pii\", \"./\")\n", + "fn = mlrun.code_to_function(\n", + " project=\"pii\",\n", + " name=\"pii_recognizer\",\n", + " filename=\"pii_recognizer.py\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + " handler=\"recognize_pii\",\n", + " description=\"This function is used to recognize PII in a given text\",\n", + ")\n", + "run_obj = fn.run(\n", + " artifact_path = artifact_path,\n", + " params= {\n", + " 'model': \"whole\", \n", + " 'input_path': \"./data/\",\n", + " 'output_path': \"./data/output1/\",\n", + " \"entities\": ['PERSON', \"EMAIL\", \"PHONE\", \"LOCATION\", \"ORGANIZATION\"], # the entities that needs to recognize\n", + " \"output_suffix\": \"output\",\n", + " \"html_key\": \"highlighted\",\n", + " \"score_threshold\" : 0.5, # the score threshold to mark the recognition as trusted\n", + " },\n", + " returns = [\"output_path: path\", \"rpt_json: file\", \"errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "38e1a44b-e045-4c50-a40f-fbc7e77d6c6b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c23dc77030224dfc825d7da86c6c1220", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00,\n", + 
"\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of . Your flight tickets have been booked, and you will be departing on July 15th, 2023.\n", + "\n", + "Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.\n", + "\n", + "We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.\n", + "\n", + " is 182838483, connect him with or , he can pay you with 9393\n" + ] + } + ], + "source": [ + "#get the mlrun context\n", + "context = mlrun.get_or_create_ctx('pii_ctx1')\n", + "import pathlib\n", + "from tqdm.auto import tqdm\n", + "for i, txt_file in enumerate(\n", + " tqdm(\n", + " list(pathlib.Path(\"./data/output1/\").glob(\"*.txt\")),\n", + " desc=\"Processing files\",\n", + " unit=\"file\",\n", + " )\n", + " ):\n", + " # Load the str from the text file\n", + " text = txt_file.read_text()\n", + " print(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "72c31b9c-47cc-4e73-8c76-041f78cfd305", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Highlighted Pii Entities

Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023.\n", + "\n", + "Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.\n", + "\n", + "We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.\n", + "

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#check the highlighted html \n", + "html_output = context.get_cached_artifact(\"highlighted\")\n", + "html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode(\"utf-8\")\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "d4c7fb04-af53-4e63-8b0a-e14e1184f973", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"data/letter.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 9,\n", + " \"end\": 17,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"LOCATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as LOC by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair 
Analytics_139944219101936\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " }\n", + " ],\n", + " \"data/pii_data.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 0,\n", + " \"end\": 12,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 13,\n", + " \"end\": 16,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified 
as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139944219101744\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 53,\n", + " \"end\": 58,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944219101936\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 48,\n", + " \"end\": 52,\n", + " \"score\": 0.87,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 0.87,\n", + " \"score\": 0.87,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944219101936\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"EMAIL\",\n", + " \"start\": 48,\n", + " \"end\": 68,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": 
\"PatternRecognizer\",\n", + " \"pattern_name\": \"EMAIL\",\n", + " \"pattern\": \"\\\\S+@\\\\S+\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139944352474640\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 72,\n", + " \"end\": 82,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139944352476560\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 104,\n", + " \"end\": 114,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139944352476560\"\n", + " }\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], 
+ "source": [ + "#check the json report about the explanation.\n", + "rpt_output1 = context.get_cached_artifact(\"rpt_json\")\n", + "rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode(\"utf-8\")\n", + "import json\n", + "obj = json.loads(rpt_str1)\n", + " \n", + "# Pretty Print JSON\n", + "json_formatted_str1 = json.dumps(obj, indent=4)\n", + "print(json_formatted_str1)" + ] + }, + { + "cell_type": "markdown", + "id": "1182c119", + "metadata": {}, + "source": [ + "### 3.2. Masking configurations \n", + " - entity_operator_map: it defined what to do with recognized tokens? Mask them? mask them with what? remove them? replace them?\n", + "
    \n",
    +    "     entity_operator_map = {\n",
    +    "        \"PERSON\": (\"keep\", {}),\n",
    +    "        \"EMAIL\": (\"mask\", {\"masking_char\": \"😀\", \"chars_to_mask\": 5, \"from_end\": False}),\n",
    +    "        \"PHONE\": (\"hash\", {}),\n",
    +    "        \"LOCATION\": (\"redact\", {}),\n",
    +    "        \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n",
    +    "        }\n",
    +    "     
    " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "df325ea8-4b01-4485-b835-e0196ffe83d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:20:40,550 [info] Project loaded successfully: {'project_name': 'pii'}\n", + "> 2023-07-31 02:20:40,556 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}\n", + "> 2023-07-31 02:20:40,649 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}\n", + "> 2023-07-31 02:20:40,649 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '2b43f80c7ca44b43b229760bb55f814d', 'db': None}\n", + "2023-07-31 02:20:40,812 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin\n", + "2023-07-31 02:20:44,130 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP\n", + "Model loaded\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5ad56413aad64e59b177666ca0a89a01", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00\n", + ".dictlist {\n", + " background-color: #4EC64B;\n", + " text-align: center;\n", + " margin: 4px;\n", + " border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;}\n", + ".artifact {\n", + " cursor: pointer;\n", + " background-color: #4EC64B;\n", + " text-align: left;\n", + " margin: 4px; border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;\n", + "}\n", + "div.block.hidden {\n", + " display: none;\n", + "}\n", + ".clickable {\n", + " cursor: pointer;\n", + "}\n", + ".ellipsis {\n", + " 
display: inline-block;\n", + " max-width: 60px;\n", + " white-space: nowrap;\n", + " overflow: hidden;\n", + " text-overflow: ellipsis;\n", + "}\n", + ".master-wrapper {\n", + " display: flex;\n", + " flex-flow: row nowrap;\n", + " justify-content: flex-start;\n", + " align-items: stretch;\n", + "}\n", + ".master-tbl {\n", + " flex: 3\n", + "}\n", + ".master-wrapper > div {\n", + " margin: 4px;\n", + " padding: 10px;\n", + "}\n", + "iframe.fileview {\n", + " border: 0 none;\n", + " height: 100%;\n", + " width: 100%;\n", + " white-space: pre-wrap;\n", + "}\n", + ".pane-header-title {\n", + " width: 80%;\n", + " font-weight: 500;\n", + "}\n", + ".pane-header {\n", + " line-height: 1;\n", + " background-color: #4EC64B;\n", + " padding: 3px;\n", + "}\n", + ".pane-header .close {\n", + " font-size: 20px;\n", + " font-weight: 700;\n", + " float: right;\n", + " margin-top: -5px;\n", + "}\n", + ".master-wrapper .right-pane {\n", + " border: 1px inset silver;\n", + " width: 40%;\n", + " min-height: 300px;\n", + " flex: 3\n", + " min-width: 500px;\n", + "}\n", + ".master-wrapper * {\n", + " box-sizing: border-box;\n", + "}\n", + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:20:40completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output2/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    highlighted
    output_path
    rpt_json
    errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:20:48,903 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "artifact_path = \"./\"\n", + "proj = mlrun.get_or_create_project(\"pii\", \"./\")\n", + "fn = mlrun.code_to_function(\n", + " project=\"pii\",\n", + " name=\"pii_recognizer\",\n", + " filename=\"pii_recognizer.py\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + " handler=\"recognize_pii\",\n", + " description=\"This function is used to recognize PII in a given text\",\n", + ")\n", + "\n", + "entity_operator_map = {\n", + " \"PERSON\": (\"keep\", {}),\n", + " \"EMAIL\": (\"mask\", {\"masking_char\": \"😀\", \"chars_to_mask\" : 100, \"from_end\": False}),\n", + " \"PHONE\": (\"hash\", {}),\n", + " \"LOCATION\": (\"redact\", {}),\n", + " \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n", + " }\n", + "run_obj = fn.run(\n", + " artifact_path = artifact_path,\n", + " params= {\n", + " 'model': \"whole\", \n", + " 'input_path': \"./data/\",\n", + " 'output_path': \"./data/output2/\",\n", + " \"entities\": ['PERSON', \"EMAIL\", \"PHONE\", \"LOCATION\", \"ORGANIZATION\"],\n", + " \"output_suffix\": \"output\",\n", + " \"html_key\": \"highlighted\",\n", + " \"score_threshold\" : 0.5,\n", + " \"entity_operator_map\": entity_operator_map,\n", + " \n", + " },\n", + " returns = [\"output_path: path\", \"rpt_json: file\", \"errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 82, 
+ "id": "2583e72b-8dda-4469-8e2b-f492851015af", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "552ad96fd23e497ea6e547936c7853a0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023.\n", + "\n", + "Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.\n", + "\n", + "We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.\n", + "

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#check the highlighted html \n", + "html_output = context.get_cached_artifact(\"highlighted\")\n", + "html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode(\"utf-8\")\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "3a087fb1-dde7-4ba9-9f53-a10f9099c769", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"data/letter.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 9,\n", + " \"end\": 17,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"LOCATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as LOC by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + 
" \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139943499301312\"\n", + " }\n", + " }\n", + " ],\n", + " \"data/pii_data.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 0,\n", + " \"end\": 12,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139943499301312\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 13,\n", + " \"end\": 16,\n", + " \"score\": 1,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"CustomSpacyRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1,\n", + " \"score\": 1,\n", + " \"textual_explanation\": \"Identified as ORG by Spacy's Named Entity 
Recognition (Privy-trained)\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"CustomSpacyRecognizer\",\n", + " \"recognizer_identifier\": \"CustomSpacyRecognizer_139943499301312\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 53,\n", + " \"end\": 58,\n", + " \"score\": 1.0,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 1.0,\n", + " \"score\": 1.0,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 48,\n", + " \"end\": 52,\n", + " \"score\": 0.87,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"FlairRecognizer\",\n", + " \"pattern_name\": null,\n", + " \"pattern\": null,\n", + " \"original_score\": 0.87,\n", + " \"score\": 0.87,\n", + " \"textual_explanation\": \"Identified as PER by Flair's Named Entity Recognition\",\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_identifier\": \"Flair Analytics_139944345555488\",\n", + " \"recognizer_name\": \"Flair Analytics\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"EMAIL\",\n", + " \"start\": 48,\n", + " \"end\": 68,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " 
\"pattern_name\": \"EMAIL\",\n", + " \"pattern\": \"\\\\S+@\\\\S+\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139943864893792\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 72,\n", + " \"end\": 82,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139943864894128\"\n", + " }\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 104,\n", + " \"end\": 114,\n", + " \"score\": 0.5,\n", + " \"analysis_explanation\": {\n", + " \"recognizer\": \"PatternRecognizer\",\n", + " \"pattern_name\": \"PHONE\",\n", + " \"pattern\": \"\\\\(?\\\\d{3}\\\\)?[-.\\\\s]?\\\\d{3}[-.\\\\s]?\\\\d{4}\",\n", + " \"original_score\": 0.5,\n", + " \"score\": 0.5,\n", + " \"textual_explanation\": null,\n", + " \"score_context_improvement\": 0,\n", + " \"supportive_context_word\": \"\",\n", + " \"validation_result\": null\n", + " },\n", + " \"recognition_metadata\": {\n", + " \"recognizer_name\": \"PatternRecognizer\",\n", + " \"recognizer_identifier\": \"PatternRecognizer_139943864894128\"\n", + " }\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "#check the 
json report about the explanation.\n", + "rpt_output1 = context.get_cached_artifact(\"rpt_json\")\n", + "rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode(\"utf-8\")\n", + "import json\n", + "obj = json.loads(rpt_str1)\n", + " \n", + "# Pretty Print JSON\n", + "json_formatted_str1 = json.dumps(obj, indent=4)\n", + "print(json_formatted_str1)" + ] + }, + { + "cell_type": "markdown", + "id": "7c058fe3-000c-4566-a11e-80283426d945", + "metadata": {}, + "source": [ + "### 3.3 Output configurations \n", + " - is_full_text: whether produce full text or just the sentences have PII entities in it\n", + " - generate_html: whether to produce the html with highlighted pii entities\n", + " - generate_json: whether to proudce the json report with the explaination of the process\n", + " - is_full_html: whether produce full text with the pii entities highlighted or just sentences with pii entities.\n", + " - is_full_report: whether produce the json report with detailed information or just start, end index and scores." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "6a684769", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:22:57,789 [info] Project loaded successfully: {'project_name': 'pii'}\n", + "> 2023-07-31 02:22:57,799 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}\n", + "> 2023-07-31 02:22:57,891 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}\n", + "> 2023-07-31 02:22:57,892 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '3f6d701e423346b39026dc365698c15c', 'db': None}\n", + "2023-07-31 02:22:58,079 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin\n", + "2023-07-31 02:23:01,565 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP\n", + "Model loaded\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad05f59e8c604629a01f797dc84ec530", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00\n", + ".dictlist {\n", + " background-color: #4EC64B;\n", + " text-align: center;\n", + " margin: 4px;\n", + " border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;}\n", + ".artifact {\n", + " cursor: pointer;\n", + " background-color: #4EC64B;\n", + " text-align: left;\n", + " margin: 4px; border-radius: 3px; padding: 0px 3px 1px 3px; display: inline-block;\n", + "}\n", + "div.block.hidden {\n", + " display: none;\n", + "}\n", + ".clickable {\n", + " cursor: pointer;\n", + "}\n", + ".ellipsis {\n", + " display: inline-block;\n", + " 
max-width: 60px;\n", + " white-space: nowrap;\n", + " overflow: hidden;\n", + " text-overflow: ellipsis;\n", + "}\n", + ".master-wrapper {\n", + " display: flex;\n", + " flex-flow: row nowrap;\n", + " justify-content: flex-start;\n", + " align-items: stretch;\n", + "}\n", + ".master-tbl {\n", + " flex: 3\n", + "}\n", + ".master-wrapper > div {\n", + " margin: 4px;\n", + " padding: 10px;\n", + "}\n", + "iframe.fileview {\n", + " border: 0 none;\n", + " height: 100%;\n", + " width: 100%;\n", + " white-space: pre-wrap;\n", + "}\n", + ".pane-header-title {\n", + " width: 80%;\n", + " font-weight: 500;\n", + "}\n", + ".pane-header {\n", + " line-height: 1;\n", + " background-color: #4EC64B;\n", + " padding: 3px;\n", + "}\n", + ".pane-header .close {\n", + " font-size: 20px;\n", + " font-weight: 700;\n", + " float: right;\n", + " margin-top: -5px;\n", + "}\n", + ".master-wrapper .right-pane {\n", + " border: 1px inset silver;\n", + " width: 40%;\n", + " min-height: 300px;\n", + " flex: 3\n", + " min-width: 500px;\n", + "}\n", + ".master-wrapper * {\n", + " box-sizing: border-box;\n", + "}\n", + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:22:57completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output3/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    is_full_text=False
    is_full_html=False
    is_full_report=False
    highlighted
    output_path
    rpt_json
    errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-31 02:23:06,096 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}\n" + ] + } + ], + "source": [ + "import mlrun\n", + "artifact_path = \"./\"\n", + "proj = mlrun.get_or_create_project(\"pii\", \"./\")\n", + "fn = mlrun.code_to_function(\n", + " project=\"pii\",\n", + " name=\"pii_recognizer\",\n", + " filename=\"pii_recognizer.py\",\n", + " kind=\"job\",\n", + " image=\"mlrun/mlrun\",\n", + " handler=\"recognize_pii\",\n", + " description=\"This function is used to recognize PII in a given text\",\n", + ")\n", + "\n", + "entity_operator_map = {\n", + " \"PERSON\": (\"keep\", {}),\n", + " \"EMAIL\": (\"mask\", {\"masking_char\": \"😀\", \"chars_to_mask\" : 100, \"from_end\": False}),\n", + " \"PHONE\": (\"hash\", {}),\n", + " \"LOCATION\": (\"redact\", {}),\n", + " \"ORGANIZATION\": (\"replace\", {\"new_value\": \"Company XYZ\"})\n", + " }\n", + "run_obj = fn.run(\n", + " artifact_path = artifact_path,\n", + " params= {\n", + " 'model': \"whole\", \n", + " 'input_path': \"./data/\",\n", + " 'output_path': \"./data/output3/\",\n", + " \"entities\": ['PERSON', \"EMAIL\", \"PHONE\", \"LOCATION\", \"ORGANIZATION\"],\n", + " \"output_suffix\": \"output\",\n", + " \"html_key\": \"highlighted\",\n", + " \"score_threshold\" : 0.5,\n", + " \"entity_operator_map\": entity_operator_map,\n", + " \"is_full_text\": False,\n", + " \"is_full_html\": False,\n", + " \"is_full_report\": False,\n", + " },\n", + " returns = [\"output_path: path\", \"rpt_json: file\", \"errors: 
file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "0e10d8fa", + "metadata": {}, + "outputs": [], + "source": [ + "#get the mlrun context\n", + "context = mlrun.get_or_create_ctx('pii_ctx')\n", + "import pathlib\n", + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "fb303fef", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f478b2a3792e42beabad632b9523e169", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing files: 0%| | 0/2 [00:00Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON,\n", + "\n", + "We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON

  • " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#check the highlighted html \n", + "html_output = context.get_cached_artifact(\"highlighted\")\n", + "html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode(\"utf-8\")\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(html_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "26f9e706", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"data/letter.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 9,\n", + " \"end\": 17,\n", + " \"score\": 1\n", + " },\n", + " {\n", + " \"entity_type\": \"LOCATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1.0\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 248,\n", + " \"end\": 255,\n", + " \"score\": 1\n", + " }\n", + " ],\n", + " \"data/pii_data.txt\": [\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 0,\n", + " \"end\": 12,\n", + " \"score\": 1\n", + " },\n", + " {\n", + " \"entity_type\": \"ORGANIZATION\",\n", + " \"start\": 13,\n", + " \"end\": 16,\n", + " \"score\": 1\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 53,\n", + " \"end\": 58,\n", + " \"score\": 1.0\n", + " },\n", + " {\n", + " \"entity_type\": \"PERSON\",\n", + " \"start\": 48,\n", + " \"end\": 52,\n", + " \"score\": 0.87\n", + " },\n", + " {\n", + " \"entity_type\": \"EMAIL\",\n", + " \"start\": 48,\n", + " \"end\": 68,\n", + " \"score\": 0.5\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 72,\n", + " \"end\": 82,\n", + " \"score\": 0.5\n", + " },\n", + " {\n", + " \"entity_type\": \"PHONE\",\n", + " \"start\": 104,\n", + " \"end\": 114,\n", + " \"score\": 0.5\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "#check the json report about the 
explanation.\n", + "rpt_output = context.get_cached_artifact(\"rpt_json\")\n", + "rpt_str = mlrun.get_dataitem(rpt_output.get_target_path()).get().decode(\"utf-8\")\n", + "import json\n", + "obj = json.loads(rpt_str)\n", + " \n", + "# Pretty Print JSON\n", + "json_formatted_str = json.dumps(obj, indent=4)\n", + "print(json_formatted_str)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pii", + "language": "python", + "name": "conda-env-.conda-pii-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/pii_recognizer/0.4.0/src/pii_recognizer.py b/functions/master/pii_recognizer/0.4.0/src/pii_recognizer.py new file mode 100644 index 00000000..0acc55dc --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/pii_recognizer.py @@ -0,0 +1,951 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import pathlib +import tempfile +import warnings +from typing import List, Set, Tuple, Union + +import annotated_text.util as at_util +import mlrun +import nltk +import pandas as pd +import presidio_analyzer as pa +import presidio_anonymizer as pre_anoymizer +from presidio_anonymizer.entities import OperatorConfig +from tqdm import tqdm + +try: + import flair as fl +except ModuleNotFoundError: + print("Flair is not installed") + +# There is a conflict between Rust-based tokenizers' parallel processing +# and Python's fork operations during multiprocessing. To avoid this, we need +# the following two lines + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +warnings.filterwarnings("ignore") + +logger = logging.getLogger("pii-recognizer") + + +# Add the constant classes of Models and Entities to govern the whole package +class Models: + WHOLE = "whole" + PATTERN = "pattern" + SPACY = "spacy" + FLAIR = "flair" + + +class Entities: + CREDIT_CARD = "CREDIT_CARD" + SSN = "SSN" + PHONE = "PHONE" + EMAIL = "EMAIL" + LOCATION = "LOCATION" + PERSON = "PERSON" + NRP = "NRP" + ORGANIZATION = "ORGANIZATION" + DATE_TIME = "DATE_TIME" + GPE = ("GPE",) + MAC_ADDRESS = "MAC_ADDRESS" + US_BANK_NUMBER = "US_BANK_NUMBER" + IMEI = "IMEI" + TITLE = "TITLE" + LICENSE_PLATE = "LICENSE_PLATE" + US_PASSPORT = "US_PASSPORT" + CURRENCY = "CURRENCY" + ROUTING_NUMBER = "ROUTING_NUMBER" + US_ITIN = "US_ITIN" + US_BANK_NUMBER = "US_BANK_NUMBER" + US_DRIVER_LICENSE = "US_DRIVER_LICENSE" + AGE = "AGE" + PASSWORD = "PASSWORD" + SWIFT_CODE = "SWIFT_CODE" + + +class PatternRecognizerFactory: + """ + Factory for creating pattern recognizers, it can be extended in the future to + add more regex pattern for different entities. For the pattern recognizer to work, + we need construct a list of regex patterns for each entity. 
+ """ + + RECOGNIZABLE_ENTITIES = { + "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)], + "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)], + "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)], + "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)], + } + + # create a list of pattern recognizers + @classmethod + def _create_pattern_recognizer(cls): + """ + For each entity, create a list of patterns to recognize it + + :param cls: PatternRecognizerFactory class + + :returns: List of pattern recognizers + """ + + # Entities to recognize and their regex patterns + + return [ + pa.PatternRecognizer(supported_entity=entity, patterns=pattern) + for entity, pattern in cls.RECOGNIZABLE_ENTITIES.items() + ] + + +class CustomSpacyRecognizer(pa.LocalRecognizer): + """ + Custom Spacy Recognizer from Presidio Analyzer trained on Privy data. + The privy data is generated using this https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy + It can be used to recognize custom entities, Since we want to use Presidio's Registries to generate AnalyzerEngine, + it inherits from Presidio Analyzer's LocalRecognizer class. 
+ """ + + # Entities to recognize + + RECOGNIZABLE_ENTITIES = { + "LOCATION", + "PERSON", + "NRP", + "ORGANIZATION", + "DATE_TIME", + } + + # Default explanation for this recognizer + + _DEFAULT_EXPLANATION = ( + "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)" + ) + + # Label groups to check + + _DEFAULT_CHECK_LABEL_GROUPS = [ + ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}), + ({"PERSON"}, {"PER", "PERSON"}), + ({"NRP"}, {"NORP", "NRP"}), + ({"ORGANIZATION"}, {"ORG"}), + ({"DATE_TIME"}, {"DATE_TIME"}), + ] + + # pretrained model for this recognizer + + _DEFAULT_MODEL_LANGUAGES = { + "en": "beki/en_spacy_pii_distilbert", + } + + _DEFAULT_PRESIDIO_EQUIVALENCES = { + "PER": "PERSON", + "LOC": "LOCATION", + "ORG": "ORGANIZATION", + "NROP": "NRP", + "DATE_TIME": "DATE_TIME", + } + + def __init__( + self, + supported_language: str = "en", + supported_entities: List[str] = None, + check_label_groups: Tuple[Set, Set] = None, + context: List[str] = None, + ner_strength: float = 1, + ): + """ + Initialize Spacy Recognizer. + + :param supported_language: Language to use, default is English + :param supported_entities: Entities to use for recognition + :param check_label_groups: Label groups to check for the entities + :param context: Context to use if any + :param ner_strength: Default confidence for NER prediction + + :returns: SpacyRecognizer object + """ + + # Default confidence for NER prediction + self.ner_strength = ner_strength + + self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS + supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES + super().__init__( + supported_entities=supported_entities, + supported_language=supported_language, + ) + + # get the presidio explanation for the result + + def _build_spacy_explanation( + self, original_score: float, explanation: str + ) -> pa.AnalysisExplanation: + """ + Create explanation for why this result was detected. 
+ + :param original_score: Score given by this recognizer + :param explanation: Explanation string + + :returns: Presidio AnalysisExplanation object + """ + explanation = pa.AnalysisExplanation( + recognizer=self.__class__.__name__, + original_score=original_score, + textual_explanation=explanation, + ) + return explanation + + # main method for the recognizer + def analyze(self, text: str, entities: List[str], nlp_artifacts=None): # noqa D102 + """ + Analyze text using Spacy. + + :param text: Text to analyze + :param entities: Entities to analyze + :param nlp_artifacts: NLP artifacts to use + + :returns: List of Presidio RecognizerResult objects + """ + results = [] + if not nlp_artifacts: + logger.warning("Skipping SpaCy, nlp artifacts not provided...") + return results + + ner_entities = nlp_artifacts.entities + + # recognize the supported entities + for entity in entities: + if entity not in self.supported_entities: + continue + for ent in ner_entities: + if not self.__check_label(entity, ent.label_, self.check_label_groups): + continue + + # string of the explanation saying the entity is recognized by spacy + textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_) + explanation = self._build_spacy_explanation( + self.ner_strength, textual_explanation + ) + + # create the standard result with the entity, start, end, score, and explanation + spacy_result = pa.RecognizerResult( + entity_type=entity, + start=ent.start_char, + end=ent.end_char, + score=self.ner_strength, + analysis_explanation=explanation, + recognition_metadata={ + pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name + }, + ) + results.append(spacy_result) + + return results + + @staticmethod + def __check_label( + entity: str, label: str, check_label_groups: Tuple[Set, Set] + ) -> bool: + """ + Check if the label is in the label group. 
+ + :param entity: Entity to check + :param label: Label to check + :param check_label_groups: Label groups to check + + :returns: True if the label is in the label group, False otherwise + """ + return any( + entity in egrp and label in lgrp for egrp, lgrp in check_label_groups + ) + + +# Class to use Flair with Presidio as an external recognizer. +class FlairRecognizer(pa.EntityRecognizer): + """ + Wrapper for a flair model, if needed to be used within Presidio Analyzer. + This is to make sure the recognizer can be registered with Presidio registry. + """ + + RECOGNIZABLE_ENTITIES = { + "LOCATION", + "PERSON", + "NRP", + "GPE", + "ORGANIZATION", + "MAC_ADDRESS", + "US_BANK_NUMBER", + "IMEI", + "TITLE", + "LICENSE_PLATE", + "US_PASSPORT", + "CURRENCY", + "ROUTING_NUMBER", + "US_ITIN", + "US_BANK_NUMBER", + "US_DRIVER_LICENSE", + "AGE", + "PASSWORD", + "SWIFT_CODE", + } + + # This is used to construct the explanation for the result + + _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition" + + _DEFAULT_CHECK_LABEL_GROUPS = [ + ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}), + ({"PERSON"}, {"PER", "PERSON"}), + ({"NRP"}, {"NORP", "NRP"}), + ({"GPE"}, {"GPE"}), + ({"ORGANIZATION"}, {"ORG"}), + ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}), + ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}), + ({"IMEI"}, {"IMEI"}), + ({"TITLE"}, {"TITLE"}), + ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}), + ({"US_PASSPORT"}, {"US_PASSPORT"}), + ({"CURRENCY"}, {"CURRENCY"}), + ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}), + ({"AGE"}, {"AGE"}), + ({"CURRENCY"}, {"CURRENCY"}), + ({"SWIFT_CODE"}, {"SWIFT_CODE"}), + ({"US_ITIN"}, {"US_ITIN"}), + ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}), + ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}), + ] + + _DEFAULT_MODEL_LANGUAGES = { + "en": "beki/flair-pii-distilbert", + } + + _DEFAULT_PRESIDIO_EQUIVALENCES = { + "PER": "PERSON", + "LOC": "LOCATION", + "ORG": "ORGANIZATION", + "NROP": "NRP", + "URL": "URL", + "US_ITIN": "US_ITIN", + 
"US_PASSPORT": "US_PASSPORT", + "IBAN_CODE": "IBAN_CODE", + "IP_ADDRESS": "IP_ADDRESS", + "EMAIL_ADDRESS": "EMAIL", + "US_DRIVER_LICENSE": "US_DRIVER_LICENSE", + "US_BANK_NUMBER": "US_BANK_NUMBER", + } + + def __init__( + self, + supported_language: str = "en", + supported_entities: List[str] = None, + check_label_groups: Tuple[Set, Set] = None, + ): + """ + Initialize the FlairRecognizer. + + :param supported_language: Language to use + :param supported_entities: Entities to use + :param check_label_groups: Label groups to check + + :returns: FlairRecognizer object + + """ + self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS + + supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES + self.model = fl.models.SequenceTagger.load( + self._DEFAULT_MODEL_LANGUAGES.get(supported_language) + ) + + super().__init__( + supported_entities=supported_entities, + supported_language=supported_language, + name="Flair Analytics", + ) + + # main method for the recognizer + def analyze( + self, + text: str, + entities: List[str], + nlp_artifacts: pa.nlp_engine.NlpArtifacts = None, + ) -> List[pa.RecognizerResult]: + """ + Analyze text and return the results. + + :param text: The text for analysis. + :param entities: The list of entities to recognize. + :param nlp_artifacts: Not used by this recognizer but needed for the interface. + + :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections. + """ + + results = [] + + sentences = fl.data.Sentence(text) + self.model.predict(sentences) + + # If there are no specific list of entities, we will look for all of it. + if not entities: + entities = self.supported_entities + + # Go over the entities and check if they are in the supported entities list. + for entity in entities: + if entity not in self.supported_entities: + continue + + # Go over the sentences and check if the entity is in the sentence. 
+ for ent in sentences.get_spans("ner"): + if not self.__check_label( + entity, ent.labels[0].value, self.check_label_groups + ): + continue + + # If the entity is in the sentence, we will add it to the results. + textual_explanation = self._DEFAULT_EXPLANATION.format( + ent.labels[0].value + ) + + # Build the explanation for the result + explanation = self._build_flair_explanation( + round(ent.score, 2), textual_explanation + ) + + flair_result = self._convert_to_recognizer_result(ent, explanation) + + results.append(flair_result) + + return results + + def _convert_to_recognizer_result( + self, entity: fl.data.Span, explanation: str + ) -> pa.RecognizerResult: + """ + Convert Flair result to Presidio RecognizerResult. + + :param entity: Flair entity of Span + :param explanation: Presidio AnalysisExplanation + + :returns: Presidio RecognizerResult + """ + + # Convert the entity type to Presidio entity type + entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag) + + # Convert the score to Presidio score + flair_score = round(entity.score, 2) + + # Create the Presidio RecognizerResult from the Flair entity + flair_results = pa.RecognizerResult( + entity_type=entity_type, + start=entity.start_position, + end=entity.end_position, + score=flair_score, + analysis_explanation=explanation, + ) + + return flair_results + + def _build_flair_explanation( + self, original_score: float, explanation: str + ) -> pa.AnalysisExplanation: + """ + Create explanation for why this result was detected. 
+ + :param original_score: Score given by this recognizer + :param explanation: Explanation string + + :returns: Presidio AnalysisExplanation + """ + + # Create the Presidio AnalysisExplanation for the result + explanation = pa.AnalysisExplanation( + recognizer=self.__class__.__name__, + original_score=original_score, + textual_explanation=explanation, + ) + return explanation + + # sanity check of the entity and label before recognition + @staticmethod + def __check_label( + entity: str, label: str, check_label_groups: Tuple[Set, Set] + ) -> bool: + return any( + entity in egrp and label in lgrp for egrp, lgrp in check_label_groups + ) + + +# get the analyzer engine based on the model +def _get_analyzer_engine( + model: str = None, entities: List[str] = None +) -> pa.AnalyzerEngine: + """ + Return pa.AnalyzerEngine. + + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param entities: The list of entities to use. + + :returns: pa.AnalyzerEngine + """ + # recognizer registry that can store multiple recognizers + registry = pa.RecognizerRegistry() + if model == Models.SPACY: + # custom spacy recognizer + spacy_recognizer = CustomSpacyRecognizer() + # add the custom build spacy recognizer + registry.add_recognizer(spacy_recognizer) + elif model == Models.FLAIR: + # pre-trained flair recognizer + flair_recognizer = FlairRecognizer() + # add the custom build flair recognizer + registry.add_recognizer(flair_recognizer) + elif model == Models.PATTERN: + # add the pattern recognizer + pattern_recognizer_factory = PatternRecognizerFactory() + for recognizer in pattern_recognizer_factory._create_pattern_recognizer(): + registry.add_recognizer(recognizer) + elif model == Models.WHOLE: + spacy_recognizer = CustomSpacyRecognizer() + flair_recognizer = FlairRecognizer() + registry.add_recognizer(spacy_recognizer) + registry.add_recognizer(flair_recognizer) + # add the pattern recognizer + pattern_recognizer_factory = PatternRecognizerFactory() + 
for recognizer in pattern_recognizer_factory._create_pattern_recognizer(): + registry.add_recognizer(recognizer) + elif not model and entities: + if set(entities) & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES: + spacy_recognizer = CustomSpacyRecognizer() + registry.add_recognizer(spacy_recognizer) + if set(entities) & FlairRecognizer.RECOGNIZABLE_ENTITIES: + flair_recognizer = FlairRecognizer() + registry.add_recognizer(flair_recognizer) + # add the pattern recognizer + if set(entities) & (set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())): + pattern_recognizer_factory = PatternRecognizerFactory() + for recognizer in pattern_recognizer_factory._create_pattern_recognizer(): + registry.add_recognizer(recognizer) + else: + raise ValueError( + f"argument of model and entities can not be None at the same time" + ) + analyzer = pa.AnalyzerEngine( + registry=registry, + supported_languages=["en"], + ) + + supported_entities = analyzer.get_supported_entities() + + if entities and not all(item in supported_entities for item in entities): + not_supported_entities = [ + item for item in entities if item not in supported_entities + ] + raise ValueError( + f"The current model {model} doesn't support the following entities: {not_supported_entities}. " + f"Supported entities are: {supported_entities}" + ) + return analyzer + + +def _get_anonymizer_engine() -> pre_anoymizer.AnonymizerEngine: + """ + Return AnonymizerEngine. + + :returns: The AnonymizerEngine. + """ + return pre_anoymizer.AnonymizerEngine() + + +def _anonymize( + text: str, + analyze_results: List[pa.RecognizerResult], + entity_operator_map: dict = None, + is_full_text: bool = True, +) -> str: + """ + Anonymize identified input using Presidio Abonymizer. + + :param text: The text for analysis. + :param analyze_results: The list of Presidio RecognizerResult constructed from + :param entity_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params. 
+ :param is_full_text: Whether the text is full text or not. + + :returns: The anonymized text. + """ + if not text: + return "" + + anonymizer_engine = _get_anonymizer_engine() + if not entity_operator_map: + operators = None + else: + # Create OperatorConfig based on the entity_operator_map + operators = { + entity: OperatorConfig(operator_name, operator_params) + for entity, (operator_name, operator_params) in entity_operator_map.items() + } + + if is_full_text: + # Anonymize the entire text + return anonymizer_engine.anonymize( + text=text, analyzer_results=analyze_results, operators=operators + ).text + # Tokenize the text to sentences + sentences = nltk.sent_tokenize(text) + anonymized_sentences = [] + current_idx = 0 + + # Find the sentence that has pii entity + for sentence in sentences: + start_idx = current_idx + end_idx = start_idx + len(sentence) + + # Get the entities that are in the sentence, update hte start_idx and end_idx + sentence_results = [ + pa.RecognizerResult( + result.entity_type, + start=result.start - start_idx, + end=result.end - start_idx, + score=result.score, + ) + for result in analyze_results + if result.start >= start_idx and result.end <= end_idx + ] + + # If PII is detected + if sentence_results: + anonymized_sentence = anonymizer_engine.anonymize( + text=sentence, analyzer_results=sentence_results, operators=operators + ).text + anonymized_sentences.append(anonymized_sentence) + + current_idx = end_idx + + return " ".join(anonymized_sentences) + + +def _get_tokens( + text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True +) -> List[str]: + """ + Get the full tokens or only contains the entities that can form a sentence. + + :param text: The text for analysis. + :param analyze_results: The list of Presidio RecognizerResult constructed from + :param is_full: Whether return full tokens or just the tokens that only contains the entities that can form a sentence. + + :returns: The tokens. 
+ """ + + tokens = [] + # sort by start index + results = sorted(analyze_results, key=lambda x: x.start) + for i, res in enumerate(results): + if i == 0: + tokens.append(text[: res.start]) + + # append entity text and entity type + tokens.append((text[res.start : res.end], res.entity_type)) + + # if another entity coming i.e. we're not at the last results element, + # add text up to next entity + if i != len(results) - 1: + tokens.append(text[res.end : results[i + 1].start]) + # if no more entities coming, add all remaining text + else: + tokens.append(text[res.end :]) + + # get the tokens that only contains the entities that can form a sentence + part_annontated_tokens = [] + if not is_full: + last_end_sentence = 0 + for i, token in enumerate(tokens): + if any(item in token for item in [".", "!", "?"]) and any( + type(item) is tuple for item in tokens[last_end_sentence:i] + ): + part_annontated_tokens.append(tokens[last_end_sentence:i]) + last_end_sentence = i + return part_annontated_tokens + return tokens + + +def _annotate( + text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True +) -> List[str]: + """ + Annotate identified input using Presidio Anonymizer. + + :param text: The text for analysis. + :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The list of tokens with the identified entities. + + """ + return _get_tokens(text, st_analyze_results, is_full_html) + + +def _process( + text: str, + model: pa.AnalyzerEngine, + score_threshold: float, + entities: List[str] = None, + entities_operator_map: dict = None, + is_full_text: bool = True, +) -> Tuple[str, list]: + """ + Process the text of str using the model. 
+ + :param text: Text to process + :param model: Model to use for processing + :param entities: Entities to recognize + :param entities_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params. + :param score_threshold: The score threshold to use for recognition + :param is_full_text: Whether to return the full text or just the annotated text + + :returns: A tuple of: + + * the anonymized text + * the list of Presidio RecognizerResult constructed from analysis + """ + + # get the analyzer engine + analyzer = model + + # analyze the text that can be used for anonymization + results = analyzer.analyze( + text=text, + language="en", + entities=entities, + score_threshold=score_threshold, + return_decision_process=True, + ) + + # anonymize the text, replace the pii entities with the labels + anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text) + + return anonymized_text, results + + +def _get_single_html( + text: str, results: List[pa.RecognizerResult], is_full_html: bool = True +): + """ + Generate the html for a single txt file. + + :param text: The text for analysis. + :param results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The html string for a single txt file. + """ + # convert the results to tokens to generate the html + tokens = _annotate(text, results, is_full_html) + html = at_util.get_annotated_html(*tokens) + + # avoid the error during rendering of the \n in the html + backslash_char = "\\" + + html_str = f"

    {html.replace('{backslash_char}n', '
    ')}

    " + + return html_str + + +def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True): + """ + Generate the json for a single txt file. + + :param results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full json or not. + + :returns: The json string for a single txt file. + """ + # generate the stats report if needed + if not is_full_report: + stats = [] + # add the simplify stats logic here + for item in results: + item.analysis_explanation = None + stats.append(item) + else: + stats = results + + return stats + + +def _get_all_html( + txt_content: dict, + res_dict: dict, + is_full_html: bool = True, +): + """ + Generate the html for all txt files. + + :param txt_content: The dictionary of txt file name and content. + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The html string for all txt files. + + """ + # These are placeholder for the html string + html_index = "Highlighted Pii Entities

    Highlighted Pii Entities

      " + html_content = "" + for txt_file, results in res_dict.items(): + txt = txt_content[txt_file] + html_index += f"
    • {txt_file}
    • " + html_content += f"
    • {txt_file}

      {_get_single_html(txt, results, is_full_html)}

    • " + html_index += "
    " + html_res = f"{html_index}{html_content}" + + return html_res + + +def _get_all_rpt(res_dict: dict, is_full_report: bool = True): + """ + Generate the stats report for all txt files. + + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full report or not. + + :returns: The stats report for all txt files. + """ + # These are placeholder for the json report + stats_dict = {} + for txt_file, results in res_dict.items(): + new_stats = [] + for item in _get_single_json(results, is_full_report): + if is_full_report: + item.analysis_explanation = item.analysis_explanation.to_dict() + new_stats.append(item.to_dict()) + else: + tmp_dict = item.to_dict() + tmp_dict.pop("analysis_explanation") + tmp_dict.pop("recognition_metadata") + new_stats.append(tmp_dict) + stats_dict[txt_file] = new_stats + return stats_dict + + +def recognize_pii( + context: mlrun.MLClientCtx, + input_path: Union[str, pathlib.Path], + html_key: str, + score_threshold: float, + output_directory: str = None, + entities: List[ + str + ] = None, # List of entities to recognize, default is recognizing all + entity_operator_map: dict = None, + model: str = None, + generate_json: bool = True, + generate_html: bool = True, + is_full_text: bool = True, + is_full_html: bool = True, + is_full_report: bool = True, +) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]: + """ + Walk through the input path, recognize PII in text and store the anonymized text in the output path. + Generate the html with different colors for each entity, json report of the explanation. + + :param context: The MLRun context. this is needed for log the artifacts. + :param input_path: The input path of the text files needs to be analyzed. + :param html_key: The html key for the artifact. + :param score_threshold: The score threshold to mark the recognition as trusted. 
+ :param output_directory: The output directory path to store the anonymized text. + :param entities: The list of entities to recognize. + :param entity_operator_map: The map of entity to operator (mask, redact, replace, keep, hash, and its params) + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param generate_json: Whether to generate the json report of the explanation. + :param generate_html: Whether to generate the html report of the explanation. + :param is_full_text: Whether to return the full text or only the masked text. + :param is_full_html: Whether to return the full html or just the annotated text + :param is_full_report: Whether to return the full report or just the score and start, end index + + :returns: A tuple of: + + * Path to the output directory + * The json report of the explanation (if generate_json is True) + * A dictionary of errors files that were not processed + + """ + + # Set output directory + if output_directory is None: + output_directory = tempfile.mkdtemp() + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + if not output_directory.exists(): + output_directory.mkdir(parents=True, exist_ok=True) + + txt_files_directory = pathlib.Path(input_path) + successes = [] + errors = {} + + res_dict = {} + txt_content = {} + # Load the model: + analyzer = _get_analyzer_engine(model, entities) + logger.info("Model loaded") + # Go over the text files in the input path, analyze and anonymize them: + for txt_file in tqdm( + list(txt_files_directory.glob("*.txt")), + desc="Processing files", + unit="file", + ): + try: + # Load the str from the text file + text = txt_file.read_text() + txt_content[str(txt_file)] = text + # Process the text to recoginze the pii entities in it + anonymized_text, results = _process( + text=text, + model=analyzer, + entities=entities, + entities_operator_map=entity_operator_map, + score_threshold=score_threshold, + is_full_text=is_full_text, + ) + 
res_dict[str(txt_file)] = results + # Store the anonymized text in the output path + output_file = output_directory / f"{txt_file.stem}.txt" + output_file.parent.mkdir(parents=True, exist_ok=True) + with open(output_file, "w") as f: + f.write(anonymized_text) + successes.append([txt_file.name, output_file.name]) + except Exception as e: + errors[str(txt_file)] = str(e) + logger.error(f"Error processing {txt_file}: {e}") + + successes = pd.DataFrame( + successes, + columns=["original_file", "anonymized_file"], + ) + + if generate_html: + # Generate the html report + html_res = _get_all_html(txt_content, res_dict, is_full_html) + # Store the html report in the context + arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key) + context.log_artifact(arti_html) + if generate_json: + # Generate the json report + json_res = _get_all_rpt(res_dict, is_full_report) + return str(output_directory), successes, errors, json_res + return str(output_directory), successes, errors diff --git a/functions/master/pii_recognizer/0.4.0/src/requirements.txt b/functions/master/pii_recognizer/0.4.0/src/requirements.txt new file mode 100644 index 00000000..467565d4 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/requirements.txt @@ -0,0 +1,12 @@ +faker +nltk +pandas +streamlit +presidio-anonymizer +presidio-analyzer +torch +st-annotated-text +streamlit +git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653#egg=flair +st-annotated-text +https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl diff --git a/functions/master/pii_recognizer/0.4.0/src/test_pii_recognizer.py b/functions/master/pii_recognizer/0.4.0/src/test_pii_recognizer.py new file mode 100644 index 00000000..81a16611 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/src/test_pii_recognizer.py @@ -0,0 +1,251 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pytest +import random +from faker import Faker +import mlrun +from pii_recognizer import ( + _process, + _get_analyzer_engine, + _anonymize, + _annotate, + recognize_pii_parallel, +) + + +def generate_routing_number(): + prefix = random.randint(0, 99) + identifier = random.randint(0, 9999999) + identifier_str = str(identifier).zfill(7) + weighted_sum = ( + 3 * (int(str(prefix).zfill(2)[0])) + + 7 * (int(str(prefix).zfill(2)[1])) + + 1 * (int(identifier_str[0])) + + 3 * (int(identifier_str[1])) + + 7 * (int(identifier_str[2])) + + 1 * (int(identifier_str[3])) + + 3 * (int(identifier_str[4])) + + 7 * (int(identifier_str[5])) + + 1 * (int(identifier_str[6])) + ) + check_digit = (10 - (weighted_sum % 10)) % 10 + + routing_number = f"{prefix:02d}{identifier_str}{check_digit}" + + return routing_number + + +def generate_us_itin(): + area_number = random.randint(900, 999) + group_number = random.randint(70, 99) + serial_number = random.randint(0, 9999) + + formatted_itin = f"{area_number:03d}-{group_number:02d}-{serial_number:04d}" + return formatted_itin + + +@pytest.fixture(scope="function") +def fake_data(request): + params = request.param if hasattr(request, "param") else {} + fake = Faker("en_US") + data = { + "name": fake.name(), + "email": fake.email(), + "address": fake.address(), + "phone": fake.phone_number(), + "ssn": fake.ssn(), + "credit_card": fake.credit_card_number(), + "organization": fake.company(), + "location": fake.street_address(), + 
"date_time": fake.date(), + "mac_address": fake.mac_address(), + "us_bank_number": fake.bban(), + "imei": "".join(str(fake.random_int(0, 9)) for _ in range(14)), + "title": fake.job(), + "license_plate": fake.license_plate(), + "us_passport": fake.passport_number(), + "currency": fake.currency_code(), + "routing_number": generate_routing_number(), + "us_itin": generate_us_itin(), + "age": fake.random_int(1, 100), + "password": fake.password(), + "swift_code": fake.swift(), + } + + data.update(params) + + yield data + + +@pytest.mark.skip() +def test_pattern_process(fake_data): + ENTITIES = { + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + } + + analyzer = _get_analyzer_engine(model="pattern") + text = f"He can be reached at {fake_data['email']} or {fake_data['phone']}. His credit card number is {fake_data['credit_card']} and his SSN is {fake_data['ssn']}." + res, results = _process(text, analyzer, score_threshold=0.5) + + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_spacy_process(fake_data): + ENTITIES = { + "PERSON": "name", + "ORGANIZATION": "organization", + } + + analyzer = _get_analyzer_engine(model="spacy") + text = f"{fake_data['name']}'s employer is {fake_data['organization']}." 
+ res, results = _process(text, analyzer, score_threshold=0.5) + + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_flair_process(fake_data): + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + + analyzer = _get_analyzer_engine(model="flair") + text = " ".join( + [item + " is " + str(fake_data[item]) for item in ENTITIES.values()] + ) + res, results = _process(text, analyzer, score_threshold=0.5) + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_whole_process(fake_data): + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + text = " ".join( + [item + " is " + str(fake_data[item]) for item in ENTITIES.values()] + ) + analyzer = _get_analyzer_engine(model="whole") + res, results = _process(text, analyzer, score_threshold=0.5) + assert any(entity in res for entity in ENTITIES.keys()) + + +@pytest.mark.skip() +def test_only_entities(fake_data): + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + 
"US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + + text = " ".join( + [item + " is " + str(fake_data[item]) for item in ENTITIES.values()] + ) + analyzer = _get_analyzer_engine(entities=list(ENTITIES.keys())[:5]) + res, results = _process(text, analyzer, score_threshold=0.5) + assert any(entity in res for entity in ENTITIES.keys()) + + +def test_parallel(): + context = mlrun.get_or_create_ctx("test_parallel") + ENTITIES = { + "LOCATION": "location", + "PERSON": "name", + "ORGANIZATION": "organization", + "MAC_ADDRESS": "mac_address", + "US_BANK_NUMBER": "us_bank_number", + "IMEI": "imei", + "TITLE": "title", + "LICENSE_PLATE": "license_plate", + "US_PASSPORT": "us_passport", + "CURRENCY": "currency", + "ROUTING_NUMBER": "routing_number", + "US_ITIN": "us_itin", + "US_BANK_NUMBER": "us_bank_number", + "AGE": "age", + "CREDIT_CARD": "credit_card", + "SSN": "ssn", + "PHONE": "phone", + "EMAIL": "email", + "PASSWORD": "password", + "SWIFT_CODE": "swift_code", + } + json_res, erros = recognize_pii_parallel( + context=context, + config_input_output="data/config.csv", + score_threshold=0.5, + html_key="test_parallel", + entities=list(ENTITIES.keys()), + model="whole", + ) + + assert len(json_res) == 2 diff --git a/functions/master/pii_recognizer/0.4.0/static/documentation.html b/functions/master/pii_recognizer/0.4.0/static/documentation.html new file mode 100644 index 00000000..fbf1f167 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/static/documentation.html @@ -0,0 +1,552 @@ + + + + + + + +pii_recognizer package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    pii_recognizer package#

    +
    +

    Submodules#

    +
    +
    +

    pii_recognizer.pii_recognizer module#

    +
    +
    +class pii_recognizer.pii_recognizer.CustomSpacyRecognizer(*args: Any, **kwargs: Any)[source]#
    +

    Bases: LocalRecognizer

    +

    Custom Spacy Recognizer from Presidio Analyzer trained on Privy data. +The privy data is generated using this pixie-io/pixie +It can be used to recognize custom entities. Since we want to use Presidio’s Registries to generate AnalyzerEngine, +it inherits from Presidio Analyzer’s LocalRecognizer class.

    +
    +
    +RECOGNIZABLE_ENTITIES = {'DATE_TIME', 'LOCATION', 'NRP', 'ORGANIZATION', 'PERSON'}#
    +
    +
    +
    +analyze(text: str, entities: List[str], nlp_artifacts=None)[source]#
    +

    Analyze text using Spacy.

    +
    +
    Parameters:
    +
      +
    • text – Text to analyze

    • +
    • entities – Entities to analyze

    • +
    • nlp_artifacts – NLP artifacts to use

    • +
    +
    +
    Returns:
    +

    List of Presidio RecognizerResult objects

    +
    +
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.Entities[source]#
    +

    Bases: object

    +
    +
    +AGE = 'AGE'#
    +
    +
    +
    +CREDIT_CARD = 'CREDIT_CARD'#
    +
    +
    +
    +CURRENCY = 'CURRENCY'#
    +
    +
    +
    +DATE_TIME = 'DATE_TIME'#
    +
    +
    +
    +EMAIL = 'EMAIL'#
    +
    +
    +
    +GPE = ('GPE',)#
    +
    +
    +
    +IMEI = 'IMEI'#
    +
    +
    +
    +LICENSE_PLATE = 'LICENSE_PLATE'#
    +
    +
    +
    +LOCATION = 'LOCATION'#
    +
    +
    +
    +MAC_ADDRESS = 'MAC_ADDRESS'#
    +
    +
    +
    +NRP = 'NRP'#
    +
    +
    +
    +ORGANIZATION = 'ORGANIZATION'#
    +
    +
    +
    +PASSWORD = 'PASSWORD'#
    +
    +
    +
    +PERSON = 'PERSON'#
    +
    +
    +
    +PHONE = 'PHONE'#
    +
    +
    +
    +ROUTING_NUMBER = 'ROUTING_NUMBER'#
    +
    +
    +
    +SSN = 'SSN'#
    +
    +
    +
    +SWIFT_CODE = 'SWIFT_CODE'#
    +
    +
    +
    +TITLE = 'TITLE'#
    +
    +
    +
    +US_BANK_NUMBER = 'US_BANK_NUMBER'#
    +
    +
    +
    +US_DRIVER_LICENSE = 'US_DRIVER_LICENSE'#
    +
    +
    +
    +US_ITIN = 'US_ITIN'#
    +
    +
    +
    +US_PASSPORT = 'US_PASSPORT'#
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.FlairRecognizer(*args: Any, **kwargs: Any)[source]#
    +

    Bases: EntityRecognizer

    +

    Wrapper for a flair model, if needed to be used within Presidio Analyzer. +This is to make sure the recognizer can be registered with Presidio registry.

    +
    +
    +RECOGNIZABLE_ENTITIES = {'AGE', 'CURRENCY', 'GPE', 'IMEI', 'LICENSE_PLATE', 'LOCATION', 'MAC_ADDRESS', 'NRP', 'ORGANIZATION', 'PASSWORD', 'PERSON', 'ROUTING_NUMBER', 'SWIFT_CODE', 'TITLE', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT'}#
    +
    +
    +
    +analyze(text: str, entities: List[str], nlp_artifacts: presidio_analyzer.nlp_engine.NlpArtifacts | None = None) List[presidio_analyzer.RecognizerResult][source]#
    +

    Analyze text and return the results.

    +
    +
    Parameters:
    +
      +
    • text – The text for analysis.

    • +
    • entities – The list of entities to recognize.

    • +
    • nlp_artifacts – Not used by this recognizer but needed for the interface.

    • +
    +
    +
    Returns:
    +

    The list of Presidio RecognizerResult constructed from the recognized Flair detections.

    +
    +
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.Models[source]#
    +

    Bases: object

    +
    +
    +FLAIR = 'flair'#
    +
    +
    +
    +PATTERN = 'pattern'#
    +
    +
    +
    +SPACY = 'spacy'#
    +
    +
    +
    +WHOLE = 'whole'#
    +
    +
    +
    +
    +class pii_recognizer.pii_recognizer.PatternRecognizerFactory[source]#
    +

    Bases: object

    +

    Factory for creating pattern recognizers, it can be extended in the future to +add more regex pattern for different entities. For the pattern recognizer to work, +we need construct a list of regex patterns for each entity.

    +
    +
    +RECOGNIZABLE_ENTITIES = {'CREDIT_CARD': [presidio_analyzer.Pattern], 'EMAIL': [presidio_analyzer.Pattern], 'PHONE': [presidio_analyzer.Pattern], 'SSN': [presidio_analyzer.Pattern]}#
    +
    +
    +
    +
    +pii_recognizer.pii_recognizer.recognize_pii(context: MLClientCtx, input_path: str | Path, html_key: str, score_threshold: float, output_directory: str | None = None, entities: List[str] | None = None, entity_operator_map: dict | None = None, model: str | None = None, generate_json: bool = True, generate_html: bool = True, is_full_text: bool = True, is_full_html: bool = True, is_full_report: bool = True) Tuple[str, DataFrame, dict, dict] | Tuple[str, DataFrame, dict][source]#
    +

    Walk through the input path, recognize PII in text and store the anonymized text in the output path. +Generate the html with different colors for each entity, json report of the explanation.

    +
    +
    Parameters:
    +
      +
    • context – The MLRun context. this is needed for log the artifacts.

    • +
    • input_path – The input path of the text files needs to be analyzed.

    • +
    • html_key – The html key for the artifact.

    • +
    • score_threshold – The score threshold to mark the recognition as trusted.

    • +
    • output_directory – The output directory path to store the anonymized text.

    • +
    • entities – The list of entities to recognize.

    • +
    • entity_operator_map – The map of entity to operator (mask, redact, replace, keep, hash, and its params)

    • +
    • model – The model to use. Can be “spacy”, “flair”, “pattern” or “whole”.

    • +
    • generate_json – Whether to generate the json report of the explanation.

    • +
    • generate_html – Whether to generate the html report of the explanation.

    • +
    • is_full_text – Whether to return the full text or only the masked text.

    • +
    • is_full_html – Whether to return the full html or just the annotated text

    • +
    • is_full_report – Whether to return the full report or just the score and start, end index

    • +
    +
    +
    Returns:
    +

    A tuple of:

    +
      +
    • Path to the output directory

    • +
    • The json report of the explanation (if generate_json is True)

    • +
    • A dictionary of errors files that were not processed

    • +
    +

    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/static/example.html b/functions/master/pii_recognizer/0.4.0/static/example.html new file mode 100644 index 00000000..e403a056 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/static/example.html @@ -0,0 +1,1880 @@ + + + + + + + +PII Recognizer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    PII Recognizer#

    +

    A function to detect pii data and anonymize the pii entity in the text.

    +

    In this notebook we will go over the function’s docs and outputs and see an end-to-end example of running it.

    +
      +
    1. Documentation

    2. +
    3. Results

    4. +
    5. End-to-end Demo

    6. +
    +

    +
    +

    1. Documentation#

    +

    The function receives a directory path containing text files. It walks through the directory and collects all the text files, then detects the PII entities inside each text file and applies the configured operator to each entity. It generates an HTML file with all PII entities highlighted and a JSON report that explains the process.

    +
    +

    1.1. Parameters:#

    +
      +
    • context: mlrun.MLClientCtx

      +

      The MLRun context

      +
    • +
    • input_path: str

      +

      The input directory with all the text files

      +
    • +
    • output_path: str

      +

      The directory that is used to store the anonymized text files. It is also used by MLRun to log the artifact as a zip file.

      +
    • +
    • output_suffix: str

      +

      The suffix that will be added to the input file name. For example, if the input text file is pii.txt and output_suffix is “anonymized”, the output file would be pii_anonymized.txt

      +
    • +
    • html_key: str

      +

      The artifact name of the html file

      +
    • +
    • entities: List[str]

      +

      The list of the entities to recognize. Please make sure the model you choose can recognize the entities.

      +
    • +
    • entity_operator_map: dict +Different operators can be applied to different entities. Currently supported operators: Keep, Mask, Replace, Redact, Hash

      +
      +   entity_operator_map = {
      +      "PERSON": ("keep", {}),
      +      "EMAIL": ("mask", {"masking_char": "#", "chars_to_mask": 5, "from_end": False}),
      +      "PHONE": ("hash", {}),
      +      "LOCATION": ("redact", {}),
      +      "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
      +      }
      +   
      +

      In this example:

      +
        +
      • “PERSON” entities are kept as they are using the “keep” operator.

      • +
      • “EMAIL” entities are masked with the “#” character, masking the first five characters.

      • +
      • “PHONE” entities are replaced with their hashed value using the “hash” operator.

      • +
      • “LOCATION” entities are completely removed using the “redact” operator.

      • +
      • “ORGANIZATION” entities are replaced with the string “Company XYZ” using the “replace” operator.

      • +
      +
    • +
    • model: str

      +
        +
      • “whole”, “spacy”, “pattern”, “flair”. The default is “whole”.

      • +
      +

      Each model can detect a subset of the entities. The “whole” model combines all three models, so it can detect all the entities listed below.

      +
        +
      • “spacy” : [“LOCATION”, “PERSON”,”NRP”,”ORGANIZATION”,”DATE_TIME”]

      • +
      • “pattern”: [“CREDIT_CARD”, “SSN”, “PHONE”, “EMAIL”]

      • +
      • “flair”: [ “LOCATION”, +“PERSON”, +“NRP”, +“GPE”, +“ORGANIZATION”, +“MAC_ADDRESS”, +“US_BANK_NUMBER”, +“IMEI”, +“TITLE”, +“LICENSE_PLATE”, +“US_PASSPORT”, +“CURRENCY”, +“ROUTING_NUMBER”, +“US_ITIN”, +“US_BANK_NUMBER”, +“US_DRIVER_LICENSE”, +“AGE”, +“PASSWORD”, +“SWIFT_CODE” +]

      • +
      +
    • +
    • score_threshold:

      +

      Minimum confidence value, the default is 0 to align with presidio.AnalyzerEngine

      +
    • +
    • generate_json_rpt:

      +

      Whether to generate the json report of the explanation

      +
    • +
    • generate_html_rpt:

      +

      Whether to generate the html with highlighted pii entities or not

      +
    • +
    • is_full_text:

      +

      Whether to return the full text or just the sentences with pii entities.

      +
    • +
    • is_full_html: bool

      +

      Whether to return the full html or just the annotated html

      +
    • +
    • is_full_report: bool

      +

      Whether to return the full json report or just the score and start, end index

      +
    • +
    +
    +
    +

    1.2. Outputs:#

    +

    There are three outputs of this function.

    +
      +
    • output_path: str

      +

      The directory where all the anonymized text files are stored

      +
    • +
    • rpt_json: dict

      +

      A report dict explaining how the model detected each pii entity

      +
    • +
    • errors : dict +A dict of errors when processing the text files if any

    • +
    +

    +
    +
    +
    +

    2. Results#

    +

    The result of the function looks like the following:

    +

    For example if the input string is

    +

    John Doe 's ssn is 182838483, connect john doe with john_doe@gmail.com or 6288389029, he can pay you with 41482929939393

    +

    The anonymized_text is

    +

    <PERSON>'s <ORGANIZATION> is <SSN>, connect <PERSON> with <PERSON> <EMAIL> or <PHONE>, he can pay you with <CREDIT_CARD>

    +

    The html_str is

    +

    John Doe'sPERSON ssnORGANIZATION is 182838483SSN, connect me with john_doe@gmail.comPERSONjohn_doe@gmail.comEMAIL or 6288389029PHONE, he can pay you with 41482929939393CREDIT_CARD +

    +

    The json report that explain the output is

    +
    [
    +  {
    +    "entity_type": "PERSON", # result of the labeling
    +    "start": 0, # start positon of the entity
    +    "end": 9,  # end postion of the entity
    +    "score": 0.99, # the confident score of the model + context_improvement
    +    "analysis_explanation": {
    +      "recognizer": "FlairRecognizer", # which recognizer is used to recognize this entity
    +      "pattern_name": null,
    +      "pattern": null,
    +      "original_score": 0.99, # The original confident score from the pre-trained model
    +      "score": 0.99, # the final score = original_score + score_context_improvement
    +      "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +      "score_context_improvement": 0, # The improvement from the context
    +      "supportive_context_word": "",
    +      "validation_result": null
    +    },
    +    "recognition_metadata": {
    +      "recognizer_identifier": "Flair Analytics_5577088640",
    +      "recognizer_name": "Flair Analytics"
    +    }
    +  },
    +  ....
    +]
    +
    +
    +

    +
    +
    +

    3. End-to-end Demo#

    +
    +

    3.1. Recognition configurations#

    +
      +
    • model: which model you want to use.

    • +
    • entities: What entities to recognize?

    • +
    • score_threshold: From which score to mark the recognition as trusted?

    • +
    +
    +
    +
    import mlrun
    +artifact_path = "./"
    +proj = mlrun.get_or_create_project("pii", "./")
    +fn = mlrun.code_to_function(
    +    project="pii",
    +    name="pii_recognizer",
    +    filename="pii_recognizer.py",
    +    kind="job",
    +    image="mlrun/mlrun",
    +    handler="recognize_pii",
    +    description="This function is used to recognize PII in a given text",
    +)
    +run_obj = fn.run(
    +    artifact_path = artifact_path,
    +    params= {
    +        'model': "whole", 
    +        'input_path': "./data/",
    +        'output_path': "./data/output1/",
    +        "entities": ['PERSON', "EMAIL", "PHONE", "LOCATION", "ORGANIZATION"], # the entities that needs to recognize
    +        "output_suffix": "output",
    +        "html_key": "highlighted",
    +        "score_threshold" : 0.5, # the score threshold to mark the recognition as trusted
    +    },
    +    returns = ["output_path: path", "rpt_json: file", "errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-07-31 02:17:04,305 [info] Project loaded successfully: {'project_name': 'pii'}
    +> 2023-07-31 02:17:04,312 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}
    +> 2023-07-31 02:17:04,408 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}
    +> 2023-07-31 02:17:04,409 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '51b5ad8144004e52a1008c08850842c8', 'db': None}
    +2023-07-31 02:17:04,567 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin
    +2023-07-31 02:17:07,730 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP
    +Model loaded
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:17:04completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output1/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    highlighted
    output_path
    rpt_json
    errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-07-31 02:17:12,403 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}
    +
    +
    +
    +
    +
    +
    +
    #get the mlrun context
    +context = mlrun.get_or_create_ctx('pii_ctx1')
    +import pathlib
    +from tqdm.auto import tqdm
    +for i, txt_file in enumerate(
    +        tqdm(
    +            list(pathlib.Path("./data/output1/").glob("*.txt")),
    +            desc="Processing files",
    +            unit="file",
    +        )
    +    ):
    +            # Load the str from the text file
    +        text = txt_file.read_text()
    +        print(text)
    +
    +
    +
    +
    +
    Dear Mr. <PERSON>,
    +
    +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of <ORGANIZATION>. Your flight tickets have been booked, and you will be departing on July 15th, 2023.
    +
    +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.
    +
    +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.
    +
    +<PERSON> <ORGANIZATION> is 182838483, connect him with <EMAIL> or <PHONE>, he can pay you with <PHONE>9393
    +
    +
    +
    +
    +
    +
    +
    #check the highlighted html 
    +html_output = context.get_cached_artifact("highlighted")
    +html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode("utf-8")
    +from IPython.core.display import display, HTML
    +display(HTML(html_str))
    +
    +
    +
    +
    +
    Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON, + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. + +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. +

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • +
    +
    +
    +
    #check the json report about the explanation.
    +rpt_output1 = context.get_cached_artifact("rpt_json")
    +rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode("utf-8")
    +import json
    +obj = json.loads(rpt_str1)
    + 
    +# Pretty Print JSON
    +json_formatted_str1 = json.dumps(obj, indent=4)
    +print(json_formatted_str1)
    +
    +
    +
    +
    +
    {
    +    "data/letter.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 9,
    +            "end": 17,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        },
    +        {
    +            "entity_type": "LOCATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as LOC by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944219101936",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        }
    +    ],
    +    "data/pii_data.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 0,
    +            "end": 12,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 13,
    +            "end": 16,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139944219101744"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 53,
    +            "end": 58,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944219101936",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 48,
    +            "end": 52,
    +            "score": 0.87,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 0.87,
    +                "score": 0.87,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944219101936",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "EMAIL",
    +            "start": 48,
    +            "end": 68,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "EMAIL",
    +                "pattern": "\\S+@\\S+",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139944352474640"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 72,
    +            "end": 82,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139944352476560"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 104,
    +            "end": 114,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139944352476560"
    +            }
    +        }
    +    ]
    +}
    +
    +
    +
    +
    +
    +
    +

    3.2. Masking configurations#

    +
      +
    • entity_operator_map: defines what to do with each recognized entity type: keep it, mask it (and with which character), hash it, redact (remove) it, or replace it with a new value.

    • +
    +
    +     entity_operator_map = {
    +        "PERSON": ("keep", {}),
    +        "EMAIL": ("mask", {"masking_char": "😀", "chars_to_mask": 5, "from_end": False}),
    +        "PHONE": ("hash", {}),
    +        "LOCATION": ("redact", {}),
    +        "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
    +        }
    +     
    +
    +
    import mlrun
    +artifact_path = "./"
    +proj = mlrun.get_or_create_project("pii", "./")
    +fn = mlrun.code_to_function(
    +    project="pii",
    +    name="pii_recognizer",
    +    filename="pii_recognizer.py",
    +    kind="job",
    +    image="mlrun/mlrun",
    +    handler="recognize_pii",
    +    description="This function is used to recognize PII in a given text",
    +)
    +
    +entity_operator_map = {
    +        "PERSON": ("keep", {}),
    +        "EMAIL": ("mask", {"masking_char": "😀", "chars_to_mask" : 100, "from_end": False}),
    +        "PHONE": ("hash", {}),
    +        "LOCATION": ("redact", {}),
    +        "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
    +        }
    +run_obj = fn.run(
    +    artifact_path = artifact_path,
    +    params= {
    +        'model': "whole", 
    +        'input_path': "./data/",
    +        'output_path': "./data/output2/",
    +        "entities": ['PERSON', "EMAIL", "PHONE", "LOCATION", "ORGANIZATION"],
    +        "output_suffix": "output",
    +        "html_key": "highlighted",
    +        "score_threshold" : 0.5,
    +        "entity_operator_map": entity_operator_map,
    +        
    +    },
    +    returns = ["output_path: path", "rpt_json: file", "errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-07-31 02:20:40,550 [info] Project loaded successfully: {'project_name': 'pii'}
    +> 2023-07-31 02:20:40,556 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}
    +> 2023-07-31 02:20:40,649 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}
    +> 2023-07-31 02:20:40,649 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '2b43f80c7ca44b43b229760bb55f814d', 'db': None}
    +2023-07-31 02:20:40,812 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin
    +2023-07-31 02:20:44,130 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP
    +Model loaded
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:20:40completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output2/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    highlighted
    output_path
    rpt_json
    errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-07-31 02:20:48,903 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}
    +
    +
    +
    +
    +
    +
    +
    #get the mlrun context
    +context = mlrun.get_or_create_ctx('pii_ctx1')
    +import pathlib
    +from tqdm.auto import tqdm
    +for i, txt_file in enumerate(
    +        tqdm(
    +            list(pathlib.Path("./data/output2/").glob("*.txt")),
    +            desc="Processing files",
    +            unit="file",
    +        )
    +    ):
    +            # Load the str from the text file
    +        text = txt_file.read_text()
    +        print(text)
    +
    +
    +
    +
    +
    Dear Mr. John Doe,
    +
    +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of Company XYZ. Your flight tickets have been booked, and you will be departing on July 15th, 2023.
    +
    +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations.
    +
    +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team.
    +
    +John smith's Company XYZ is 182838483, connect him with 😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀 or 3990096a212e92850c3b3c8e57ab398252d482444a32def6b030cbac2d51efa3, he can pay you with a6983d9477e93eab115305afd124bd096699e6cb7d2ce72ec6e29a6378a4e8059393
    +
    +
    +
    +
    +
    +
    +
    #check the highlighted html 
    +html_output = context.get_cached_artifact("highlighted")
    +html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode("utf-8")
    +from IPython.core.display import display, HTML
    +display(HTML(html_str))
    +
    +
    +
    +
    +
    Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON, + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION. Your flight tickets have been booked, and you will be departing on July 15th, 2023. + +Please provide us with the necessary details to finalize your travel arrangements. We kindly request your full name, date of birth, passport number, and contact information. Rest assured that all provided information will be handled with utmost confidentiality and in compliance with data protection regulations. + +We look forward to creating unforgettable memories for you and your loved ones during your stay with us. If you have any questions or require further assistance, please don't hesitate to contact our customer support team. +

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON@gmail.com or 6288389029PHONE, he can pay you with 4148292993PHONE9393

  • +
    +
    +
    +
    #check the json report about the explanation.
    +rpt_output1 = context.get_cached_artifact("rpt_json")
    +rpt_str1 = mlrun.get_dataitem(rpt_output1.get_target_path()).get().decode("utf-8")
    +import json
    +obj = json.loads(rpt_str1)
    + 
    +# Pretty Print JSON
    +json_formatted_str1 = json.dumps(obj, indent=4)
    +print(json_formatted_str1)
    +
    +
    +
    +
    +
    {
    +    "data/letter.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 9,
    +            "end": 17,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "LOCATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as LOC by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139943499301312"
    +            }
    +        }
    +    ],
    +    "data/pii_data.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 0,
    +            "end": 12,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139943499301312"
    +            }
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 13,
    +            "end": 16,
    +            "score": 1,
    +            "analysis_explanation": {
    +                "recognizer": "CustomSpacyRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1,
    +                "score": 1,
    +                "textual_explanation": "Identified as ORG by Spacy's Named Entity Recognition (Privy-trained)",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "CustomSpacyRecognizer",
    +                "recognizer_identifier": "CustomSpacyRecognizer_139943499301312"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 53,
    +            "end": 58,
    +            "score": 1.0,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 1.0,
    +                "score": 1.0,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 48,
    +            "end": 52,
    +            "score": 0.87,
    +            "analysis_explanation": {
    +                "recognizer": "FlairRecognizer",
    +                "pattern_name": null,
    +                "pattern": null,
    +                "original_score": 0.87,
    +                "score": 0.87,
    +                "textual_explanation": "Identified as PER by Flair's Named Entity Recognition",
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_identifier": "Flair Analytics_139944345555488",
    +                "recognizer_name": "Flair Analytics"
    +            }
    +        },
    +        {
    +            "entity_type": "EMAIL",
    +            "start": 48,
    +            "end": 68,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "EMAIL",
    +                "pattern": "\\S+@\\S+",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139943864893792"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 72,
    +            "end": 82,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139943864894128"
    +            }
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 104,
    +            "end": 114,
    +            "score": 0.5,
    +            "analysis_explanation": {
    +                "recognizer": "PatternRecognizer",
    +                "pattern_name": "PHONE",
    +                "pattern": "\\(?\\d{3}\\)?[-.\\s]?\\d{3}[-.\\s]?\\d{4}",
    +                "original_score": 0.5,
    +                "score": 0.5,
    +                "textual_explanation": null,
    +                "score_context_improvement": 0,
    +                "supportive_context_word": "",
    +                "validation_result": null
    +            },
    +            "recognition_metadata": {
    +                "recognizer_name": "PatternRecognizer",
    +                "recognizer_identifier": "PatternRecognizer_139943864894128"
    +            }
    +        }
    +    ]
    +}
    +
    +
    +
    +
    +
    +
    +

    3.3 Output configurations#

    +
      +
    • is_full_text: whether to produce the full text or only the sentences that contain PII entities

    • +
    • generate_html: whether to produce the html with highlighted pii entities

    • +
    • generate_json: whether to produce the JSON report with the explanation of the process

    • +
    • is_full_html: whether to produce the full text with the PII entities highlighted or only the sentences that contain PII entities.

    • +
    • is_full_report: whether to produce the JSON report with detailed information or only the entity type, start and end indices, and score.

    • +
    +
    +
    +
    import mlrun
    +artifact_path = "./"
    +proj = mlrun.get_or_create_project("pii", "./")
    +fn = mlrun.code_to_function(
    +    project="pii",
    +    name="pii_recognizer",
    +    filename="pii_recognizer.py",
    +    kind="job",
    +    image="mlrun/mlrun",
    +    handler="recognize_pii",
    +    description="This function is used to recognize PII in a given text",
    +)
    +
    +entity_operator_map = {
    +        "PERSON": ("keep", {}),
    +        "EMAIL": ("mask", {"masking_char": "😀", "chars_to_mask" : 100, "from_end": False}),
    +        "PHONE": ("hash", {}),
    +        "LOCATION": ("redact", {}),
    +        "ORGANIZATION": ("replace", {"new_value": "Company XYZ"})
    +        }
    +run_obj = fn.run(
    +    artifact_path = artifact_path,
    +    params= {
    +        'model': "whole", 
    +        'input_path': "./data/",
    +        'output_path': "./data/output3/",
    +        "entities": ['PERSON', "EMAIL", "PHONE", "LOCATION", "ORGANIZATION"],
    +        "output_suffix": "output",
    +        "html_key": "highlighted",
    +        "score_threshold" : 0.5,
    +        "entity_operator_map": entity_operator_map,
    +        "is_full_text": False,
    +        "is_full_html": False,
    +        "is_full_report": False,
    +    },
    +    returns = ["output_path: path", "rpt_json: file", "errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-07-31 02:22:57,789 [info] Project loaded successfully: {'project_name': 'pii'}
    +> 2023-07-31 02:22:57,799 [warning] Failed to add git metadata, ignore if path is not part of a git repo.: {'path': './', 'error': '/User/pii_recognizer'}
    +> 2023-07-31 02:22:57,891 [warning] artifact/output path is not defined or is local and relative, artifacts will not be visible in the UI: {'output_path': './'}
    +> 2023-07-31 02:22:57,892 [info] Storing function: {'name': 'pii-recognizer-recognize-pii', 'uid': '3f6d701e423346b39026dc365698c15c', 'db': None}
    +2023-07-31 02:22:58,079 loading file /User/.flair/models/flair-pii-distilbert/models--beki--flair-pii-distilbert/snapshots/20fb59f1762edcf253bce67716a94a43cb075ae6/pytorch_model.bin
    +2023-07-31 02:23:01,565 SequenceTagger predicts: Dictionary with 21 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-DATE_TIME, B-DATE_TIME, E-DATE_TIME, I-DATE_TIME, S-ORG, B-ORG, E-ORG, I-ORG, S-NRP, B-NRP, E-NRP, I-NRP
    +Model loaded
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    pii0Jul 31 02:22:57completedpii-recognizer-recognize-pii
    v3io_user=pengw
    kind=
    owner=pengw
    host=jupyter-pengw-5f99fb678d-mnvxl
    model=whole
    input_path=./data/
    output_path=./data/output3/
    entities=['PERSON', 'EMAIL', 'PHONE', 'LOCATION', 'ORGANIZATION']
    output_suffix=output
    html_key=highlighted
    score_threshold=0.5
    entity_operator_map={'PERSON': ('keep', {}), 'EMAIL': ('mask', {'masking_char': '😀', 'chars_to_mask': 100, 'from_end': False, 'entity_type': 'EMAIL'}), 'PHONE': ('hash', {}), 'LOCATION': ('redact', {}), 'ORGANIZATION': ('replace', {'new_value': 'Company XYZ', 'entity_type': 'ORGANIZATION'})}
    is_full_text=False
    is_full_html=False
    is_full_report=False
    highlighted
    output_path
    rpt_json
    errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-07-31 02:23:06,096 [info] Run execution finished: {'status': 'completed', 'name': 'pii-recognizer-recognize-pii'}
    +
    +
    +
    +
    +
    +
    +
    #get the mlrun context
    +context = mlrun.get_or_create_ctx('pii_ctx')
    +import pathlib
    +from tqdm.auto import tqdm
    +
    +
    +
    +
    +
    +
    +
    for i, txt_file in enumerate(
    +        tqdm(
    +            list(pathlib.Path("./data/output3/").glob("*.txt")),
    +            desc="Processing files",
    +            unit="file",
    +        )
    +    ):
    +            # Load the str from the text file
    +        text = txt_file.read_text()
    +        print(text)
    +
    +
    +
    +
    +
    Dear Mr. John Doe,
    +
    +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway!
    +John smith's Company XYZ is 182838483, connect him with 😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀😀 or 3990096a212e92850c3b3c8e57ab398252d482444a32def6b030cbac2d51efa3, he can pay you with a6983d9477e93eab115305afd124bd096699e6cb7d2ce72ec6e29a6378a4e8059393
    +
    +
    +
    +
    +
    +
    +
    #check the highlighted html 
    +html_output = context.get_cached_artifact("highlighted")
    +html_str = mlrun.get_dataitem(html_output.get_target_path()).get().decode("utf-8")
    +from IPython.core.display import display, HTML
    +display(HTML(html_str))
    +
    +
    +
    +
    +
    Highlighted Pii Entities

    Highlighted Pii Entities

  • data/letter.txt

    Dear Mr. John DoePERSON, + +We are pleased to inform you that you have been selected as the winner of our exclusive vacation package giveaway! Congratulations! You, along with your family, will enjoy a luxurious stay at our resort in the beautiful city of RivieraLOCATIONRivieraORGANIZATION

  • data/pii_data.txt

    John smith'sPERSON ssnORGANIZATION is 182838483, connect him with JohnPERSONJohn_smith@gmail.comEMAILsmithPERSON

  • +
    +
    +
    +
    # Fetch the "rpt_json" artifact cached on the context and pretty-print it
    +rpt_output = context.get_cached_artifact("rpt_json")
    +rpt_str = mlrun.get_dataitem(rpt_output.get_target_path()).get().decode("utf-8")  # artifact content decoded as UTF-8
    +import json
    +obj = json.loads(rpt_str)
    + 
    +# Pretty-print the parsed report using a 4-space indent
    +json_formatted_str = json.dumps(obj, indent=4)
    +print(json_formatted_str)
    +
    +
    +
    +
    +
    {
    +    "data/letter.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 9,
    +            "end": 17,
    +            "score": 1
    +        },
    +        {
    +            "entity_type": "LOCATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1.0
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 248,
    +            "end": 255,
    +            "score": 1
    +        }
    +    ],
    +    "data/pii_data.txt": [
    +        {
    +            "entity_type": "PERSON",
    +            "start": 0,
    +            "end": 12,
    +            "score": 1
    +        },
    +        {
    +            "entity_type": "ORGANIZATION",
    +            "start": 13,
    +            "end": 16,
    +            "score": 1
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 53,
    +            "end": 58,
    +            "score": 1.0
    +        },
    +        {
    +            "entity_type": "PERSON",
    +            "start": 48,
    +            "end": 52,
    +            "score": 0.87
    +        },
    +        {
    +            "entity_type": "EMAIL",
    +            "start": 48,
    +            "end": 68,
    +            "score": 0.5
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 72,
    +            "end": 82,
    +            "score": 0.5
    +        },
    +        {
    +            "entity_type": "PHONE",
    +            "start": 104,
    +            "end": 114,
    +            "score": 0.5
    +        }
    +    ]
    +}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/static/function.html b/functions/master/pii_recognizer/0.4.0/static/function.html new file mode 100644 index 00000000..92eba7ca --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/static/function.html @@ -0,0 +1,150 @@ + + + + + + + + + + + Source + + + + +
    +        
    +verbose: false
    +spec:
    +  default_handler: recognize_pii
    +  entry_points:
    +    analyze:
    +      name: analyze
    +      outputs:
    +      - doc: The list of Presidio RecognizerResult constructed from the recognized
    +          Flair detections.
    +        type: List[pa.RecognizerResult]
    +      has_kwargs: false
    +      parameters:
    +      - name: self
    +      - name: text
    +        type: str
    +        doc: The text for analysis.
    +      - name: entities
    +        type: List[str]
    +        doc: The list of entities to recognize.
    +      - name: nlp_artifacts
    +        type: pa.nlp_engine.NlpArtifacts
    +        doc: Not used by this recognizer but needed for the interface.
    +        default: null
    +      lineno: 381
    +      doc: Analyze text and return the results.
    +      has_varargs: false
    +    recognize_pii:
    +      name: recognize_pii
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame,
    +          dict]]
    +      has_kwargs: false
    +      parameters:
    +      - name: context
    +        type: MLClientCtx
    +        doc: The MLRun context. this is needed for log the artifacts.
    +      - name: input_path
    +        type: Union[str, Path]
    +        doc: The input path of the text files needs to be analyzed.
    +      - name: html_key
    +        type: str
    +        doc: The html key for the artifact.
    +      - name: score_threshold
    +        type: float
    +        doc: The score threshold to mark the recognition as trusted.
    +      - name: output_directory
    +        type: str
    +        doc: The output directory path to store the anonymized text.
    +        default: null
    +      - name: entities
    +        type: List[str]
    +        doc: The list of entities to recognize.
    +        default: null
    +      - name: entity_operator_map
    +        type: dict
    +        doc: The map of entity to operator (mask, redact, replace, keep, hash, and
    +          its params)
    +        default: null
    +      - name: model
    +        type: str
    +        doc: The model to use. Can be "spacy", "flair", "pattern" or "whole".
    +        default: null
    +      - name: generate_json
    +        type: bool
    +        doc: Whether to generate the json report of the explanation.
    +        default: true
    +      - name: generate_html
    +        type: bool
    +        doc: Whether to generate the html report of the explanation.
    +        default: true
    +      - name: is_full_text
    +        type: bool
    +        doc: Whether to return the full text or only the masked text.
    +        default: true
    +      - name: is_full_html
    +        type: bool
    +        doc: Whether to return the full html or just the annotated text
    +        default: true
    +      - name: is_full_report
    +        type: bool
    +        doc: Whether to return the full report or just the score and start, end index
    +        default: true
    +      lineno: 845
    +      doc: 'Walk through the input path, recognize PII in text and store the anonymized
    +        text in the output path.
    +
    +        Generate the html with different colors for each entity, json report of the
    +        explanation.'
    +      has_varargs: false
    +  build:
    +    base_image: mlrun/mlrun
    +    requirements:
    +    - nltk
    +    - pandas
    +    - presidio-anonymizer
    +    - presidio-analyzer
    +    - torch
    +    - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653
    +    - st-annotated-text
    +    - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
    +    functionSourceCode: 
    +    code_origin: ''
    +    origin_filename: ''
    +  description: This function is used to recognize PII in a directory of text files
    +  image: ''
    +  command: ''
    +  disable_auto_mount: false
    +kind: job
    +metadata:
    +  name: pii-recognizer
    +  tag: ''
    +  categories:
    +  - data-preparation
    +  - NLP
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/static/item.html b/functions/master/pii_recognizer/0.4.0/static/item.html new file mode 100644 index 00000000..d16c5239 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/static/item.html @@ -0,0 +1,69 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories: 
    +  - data-preparation
    +  - NLP
    +description: This function is used to recognize PII in a directory of text files
    +doc: ''
    +example: pii_recognizer.ipynb
    +generationDate: 2023-08-15:10-24
    +hidden: false
    +icon: ''
    +labels:
    +  author: pgw
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.7.0
    +name: pii-recognizer
    +platformVersion: 3.5.3
    +spec:
    +  filename: pii_recognizer.py
    +  handler: recognize_pii
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +   - nltk
    +   - pandas
    +   - presidio-anonymizer
    +   - presidio-analyzer
    +   - torch
    +   - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653
    +   - st-annotated-text
    +   - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
    +url: ''
    +version: 0.4.0
    +test_valid: False
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/static/pii_recognizer.html b/functions/master/pii_recognizer/0.4.0/static/pii_recognizer.html new file mode 100644 index 00000000..d271919b --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/static/pii_recognizer.html @@ -0,0 +1,1147 @@ + + + + + + + +pii_recognizer.pii_recognizer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for pii_recognizer.pii_recognizer

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import os
    +import pathlib
    +import tempfile
    +import warnings
    +from typing import List, Set, Tuple, Union
    +
    +import annotated_text.util as at_util
    +import mlrun
    +import nltk
    +import pandas as pd
    +import presidio_analyzer as pa
    +import presidio_anonymizer as pre_anoymizer
    +from presidio_anonymizer.entities import OperatorConfig
    +from tqdm import tqdm
    +
    +try:  # flair is treated as an optional dependency
    +    import flair as fl
    +except ModuleNotFoundError:  # only a message is printed; the name `fl` stays undefined
    +    print("Flair is not installed")
    +
    +# There is a conflict between Rust-based tokenizers' parallel processing
    +# and Python's fork operations during multiprocessing. To avoid it, tokenizer
    +# parallelism is disabled through the environment variable set below.
    +
    +os.environ["TOKENIZERS_PARALLELISM"] = "false"
    +warnings.filterwarnings("ignore")  # suppresses every warning, not only tokenizer-related ones
    +
    +logger = logging.getLogger("pii-recognizer")  # module-level logger
    +
    +
    +# Add the constant classes of Models and Entities to govern the whole package
    +
    class Models:
        """Names of the recognizer back-ends accepted by the ``model`` argument."""

        # Single-engine back-ends
        FLAIR = "flair"
        SPACY = "spacy"
        PATTERN = "pattern"

        # Selects every back-end at once
        WHOLE = "whole"
    + + + +
    class Entities:
        """
        Names of the PII entity types used across the package.

        Every constant is a plain string equal to its own attribute name.
        """

        CREDIT_CARD = "CREDIT_CARD"
        SSN = "SSN"
        PHONE = "PHONE"
        EMAIL = "EMAIL"
        LOCATION = "LOCATION"
        PERSON = "PERSON"
        NRP = "NRP"
        ORGANIZATION = "ORGANIZATION"
        DATE_TIME = "DATE_TIME"
        # Was ("GPE",): a stray trailing comma turned the value into a one-element
        # tuple, so it never compared equal to the "GPE" entity name.
        GPE = "GPE"
        MAC_ADDRESS = "MAC_ADDRESS"
        # Previously declared twice with the same value; one declaration is enough.
        US_BANK_NUMBER = "US_BANK_NUMBER"
        IMEI = "IMEI"
        TITLE = "TITLE"
        LICENSE_PLATE = "LICENSE_PLATE"
        US_PASSPORT = "US_PASSPORT"
        CURRENCY = "CURRENCY"
        ROUTING_NUMBER = "ROUTING_NUMBER"
        US_ITIN = "US_ITIN"
        US_DRIVER_LICENSE = "US_DRIVER_LICENSE"
        AGE = "AGE"
        PASSWORD = "PASSWORD"
        SWIFT_CODE = "SWIFT_CODE"
    + + + +
    class PatternRecognizerFactory:
        """
        Builds Presidio pattern recognizers from a table of regular expressions.

        ``RECOGNIZABLE_ENTITIES`` maps an entity name to the list of ``pa.Pattern``
        objects used to detect it. Supporting another regex-detectable entity only
        requires adding an entry to that table.
        """

        # entity name -> list of patterns; 0.5 is the third argument given to every pa.Pattern
        RECOGNIZABLE_ENTITIES = {
            "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)],
            "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)],
            "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)],
            "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)],
        }

        @classmethod
        def _create_pattern_recognizer(cls):
            """
            Create one pattern recognizer per entry of ``RECOGNIZABLE_ENTITIES``.

            :param cls: PatternRecognizerFactory class

            :returns: List of pattern recognizers, in the table's iteration order
            """
            recognizers = []
            for entity_name, entity_patterns in cls.RECOGNIZABLE_ENTITIES.items():
                recognizer = pa.PatternRecognizer(
                    supported_entity=entity_name, patterns=entity_patterns
                )
                recognizers.append(recognizer)
            return recognizers
    + + + +
    class CustomSpacyRecognizer(pa.LocalRecognizer):
        """
        Custom Spacy Recognizer from Presidio Analyzer trained on Privy data.
        The privy data is generated using this https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy
        It can be used to recognize custom entities. Since we want to use Presidio's Registries to generate
        AnalyzerEngine, it inherits from Presidio Analyzer's LocalRecognizer class.
        """

        # Entities to recognize
        RECOGNIZABLE_ENTITIES = {
            "LOCATION",
            "PERSON",
            "NRP",
            "ORGANIZATION",
            "DATE_TIME",
        }

        # Default explanation for this recognizer
        _DEFAULT_EXPLANATION = (
            "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)"
        )

        # Label groups to check: (requested entity names, model labels accepted for them)
        _DEFAULT_CHECK_LABEL_GROUPS = [
            ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
            ({"PERSON"}, {"PER", "PERSON"}),
            ({"NRP"}, {"NORP", "NRP"}),
            ({"ORGANIZATION"}, {"ORG"}),
            ({"DATE_TIME"}, {"DATE_TIME"}),
        ]

        # pretrained model for this recognizer
        _DEFAULT_MODEL_LANGUAGES = {
            "en": "beki/en_spacy_pii_distilbert",
        }

        # model label -> entity name
        _DEFAULT_PRESIDIO_EQUIVALENCES = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
            # The label groups above use "NORP"; the table previously only had the
            # misspelled "NROP" key, which is kept for backward compatibility.
            "NORP": "NRP",
            "NROP": "NRP",
            "DATE_TIME": "DATE_TIME",
        }

        def __init__(
            self,
            supported_language: str = "en",
            supported_entities: List[str] = None,
            check_label_groups: List[Tuple[Set, Set]] = None,
            context: List[str] = None,
            ner_strength: float = 1,
        ):
            """
            Initialize Spacy Recognizer.

            :param supported_language: Language to use, default is English
            :param supported_entities: Entities to use for recognition, defaults to RECOGNIZABLE_ENTITIES
            :param check_label_groups: Label groups to check for the entities, a list of
                                       (entity names, model labels) pairs. Defaults to
                                       _DEFAULT_CHECK_LABEL_GROUPS
            :param context:            Context to use if any.
                                       NOTE(review): accepted but currently not forwarded to the base class
            :param ner_strength:       Default confidence for NER prediction

            :returns: SpacyRecognizer object
            """
            # Default confidence for NER prediction, used as the score of every result
            self.ner_strength = ner_strength

            self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
            supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
            super().__init__(
                supported_entities=supported_entities,
                supported_language=supported_language,
            )

        # get the presidio explanation for the result
        def _build_spacy_explanation(
            self, original_score: float, explanation: str
        ) -> pa.AnalysisExplanation:
            """
            Create explanation for why this result was detected.

            :param original_score: Score given by this recognizer
            :param explanation:    Explanation string

            :returns: Presidio AnalysisExplanation object
            """
            explanation = pa.AnalysisExplanation(
                recognizer=self.__class__.__name__,
                original_score=original_score,
                textual_explanation=explanation,
            )
            return explanation

        # main method for the recognizer
        def analyze(self, text: str, entities: List[str], nlp_artifacts=None):  # noqa D102
            """
            Analyze text using Spacy.

            :param text:          Text to analyze
            :param entities:      Entities to analyze, names not in the supported entities are skipped
            :param nlp_artifacts: NLP artifacts to use, their ``entities`` attribute is read.
                                  When missing, an empty list is returned

            :returns: List of Presidio RecognizerResult objects
            """
            results = []
            if not nlp_artifacts:
                logger.warning("Skipping SpaCy, nlp artifacts not provided...")
                return results

            ner_entities = nlp_artifacts.entities

            # recognize the supported entities
            for entity in entities:
                if entity not in self.supported_entities:
                    continue
                for ent in ner_entities:
                    if not self.__check_label(entity, ent.label_, self.check_label_groups):
                        continue

                    # string of the explanation saying the entity is recognized by spacy
                    textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_)
                    explanation = self._build_spacy_explanation(
                        self.ner_strength, textual_explanation
                    )

                    # create the standard result with the entity, start, end, score, and explanation
                    spacy_result = pa.RecognizerResult(
                        entity_type=entity,
                        start=ent.start_char,
                        end=ent.end_char,
                        score=self.ner_strength,
                        analysis_explanation=explanation,
                        recognition_metadata={
                            pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name
                        },
                    )
                    results.append(spacy_result)

            return results

        @staticmethod
        def __check_label(
            entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
        ) -> bool:
            """
            Check if the label is in the label group.

            :param entity:             Entity to check
            :param label:              Label to check
            :param check_label_groups: Label groups to check

            :returns: True if the label is in the label group, False otherwise
            """
            return any(
                entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
            )
    + + + +# Class to use Flair with Presidio as an external recognizer. +
    class FlairRecognizer(pa.EntityRecognizer):
        """
        Wrapper for a flair model, if needed to be used within Presidio Analyzer.
        This is to make sure the recognizer can be registered with Presidio registry.
        """

        RECOGNIZABLE_ENTITIES = {
            "LOCATION",
            "PERSON",
            "NRP",
            "GPE",
            "ORGANIZATION",
            "MAC_ADDRESS",
            "US_BANK_NUMBER",
            "IMEI",
            "TITLE",
            "LICENSE_PLATE",
            "US_PASSPORT",
            "CURRENCY",
            "ROUTING_NUMBER",
            "US_ITIN",
            "US_DRIVER_LICENSE",
            "AGE",
            "PASSWORD",
            "SWIFT_CODE",
        }

        # This is used to construct the explanation for the result
        _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

        # (requested entity names, model labels accepted for them)
        _DEFAULT_CHECK_LABEL_GROUPS = [
            ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
            ({"PERSON"}, {"PER", "PERSON"}),
            ({"NRP"}, {"NORP", "NRP"}),
            ({"GPE"}, {"GPE"}),
            ({"ORGANIZATION"}, {"ORG"}),
            ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}),
            ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
            ({"IMEI"}, {"IMEI"}),
            ({"TITLE"}, {"TITLE"}),
            ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}),
            ({"US_PASSPORT"}, {"US_PASSPORT"}),
            ({"CURRENCY"}, {"CURRENCY"}),
            ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}),
            ({"AGE"}, {"AGE"}),
            # PASSWORD is listed in RECOGNIZABLE_ENTITIES but had no label group, so it
            # could never be matched.
            # NOTE(review): assumes the model emits a "PASSWORD" label, like the other
            # identity-mapped entities - confirm against the model card.
            ({"PASSWORD"}, {"PASSWORD"}),
            ({"SWIFT_CODE"}, {"SWIFT_CODE"}),
            ({"US_ITIN"}, {"US_ITIN"}),
            ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}),
        ]

        _DEFAULT_MODEL_LANGUAGES = {
            "en": "beki/flair-pii-distilbert",
        }

        # model label -> entity name; labels missing from the table are used unchanged
        _DEFAULT_PRESIDIO_EQUIVALENCES = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
            # The label groups above use "NORP"; the table previously only had the
            # misspelled "NROP" key, which is kept for backward compatibility.
            "NORP": "NRP",
            "NROP": "NRP",
            "URL": "URL",
            "US_ITIN": "US_ITIN",
            "US_PASSPORT": "US_PASSPORT",
            "IBAN_CODE": "IBAN_CODE",
            "IP_ADDRESS": "IP_ADDRESS",
            "EMAIL_ADDRESS": "EMAIL",
            "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
            "US_BANK_NUMBER": "US_BANK_NUMBER",
        }

        def __init__(
            self,
            supported_language: str = "en",
            supported_entities: List[str] = None,
            check_label_groups: List[Tuple[Set, Set]] = None,
        ):
            """
            Initialize the FlairRecognizer and load the flair sequence tagger.

            :param supported_language: Language to use, it selects the model name from
                                       _DEFAULT_MODEL_LANGUAGES
            :param supported_entities: Entities to use, defaults to RECOGNIZABLE_ENTITIES
            :param check_label_groups: Label groups to check, a list of (entity names, model labels)
                                       pairs. Defaults to _DEFAULT_CHECK_LABEL_GROUPS

            :returns: FlairRecognizer object
            """
            self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS

            supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
            self.model = fl.models.SequenceTagger.load(
                self._DEFAULT_MODEL_LANGUAGES.get(supported_language)
            )

            super().__init__(
                supported_entities=supported_entities,
                supported_language=supported_language,
                name="Flair Analytics",
            )

        # main method for the recognizer
        def analyze(
            self,
            text: str,
            entities: List[str],
            nlp_artifacts: pa.nlp_engine.NlpArtifacts = None,
        ) -> List[pa.RecognizerResult]:
            """
            Analyze text and return the results.

            :param text:          The text for analysis.
            :param entities:      The list of entities to recognize. When empty, all the supported
                                  entities are looked for.
            :param nlp_artifacts: Not used by this recognizer but needed for the interface.

            :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections.
            """
            results = []

            sentences = fl.data.Sentence(text)
            self.model.predict(sentences)

            # If there are no specific list of entities, we will look for all of it.
            if not entities:
                entities = self.supported_entities

            # Go over the entities and check if they are in the supported entities list.
            for entity in entities:
                if entity not in self.supported_entities:
                    continue

                # Go over the predicted spans and keep the ones whose label belongs to the entity.
                for ent in sentences.get_spans("ner"):
                    if not self.__check_label(
                        entity, ent.labels[0].value, self.check_label_groups
                    ):
                        continue

                    textual_explanation = self._DEFAULT_EXPLANATION.format(
                        ent.labels[0].value
                    )

                    # Build the explanation for the result
                    explanation = self._build_flair_explanation(
                        round(ent.score, 2), textual_explanation
                    )

                    flair_result = self._convert_to_recognizer_result(ent, explanation)

                    results.append(flair_result)

            return results

        # The Span annotation is quoted so that defining the class does not evaluate
        # `fl` when the guarded flair import at the top of the module failed.
        def _convert_to_recognizer_result(
            self, entity: "fl.data.Span", explanation: str
        ) -> pa.RecognizerResult:
            """
            Convert Flair result to Presidio RecognizerResult.

            :param entity:      Flair entity of Span
            :param explanation: Presidio AnalysisExplanation

            :returns: Presidio RecognizerResult
            """
            # Convert the entity type to Presidio entity type (unknown tags are kept as is)
            entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)

            # Convert the score to Presidio score
            flair_score = round(entity.score, 2)

            # Create the Presidio RecognizerResult from the Flair entity
            flair_results = pa.RecognizerResult(
                entity_type=entity_type,
                start=entity.start_position,
                end=entity.end_position,
                score=flair_score,
                analysis_explanation=explanation,
            )

            return flair_results

        def _build_flair_explanation(
            self, original_score: float, explanation: str
        ) -> pa.AnalysisExplanation:
            """
            Create explanation for why this result was detected.

            :param original_score: Score given by this recognizer
            :param explanation:    Explanation string

            :returns: Presidio AnalysisExplanation
            """
            # Create the Presidio AnalysisExplanation for the result
            explanation = pa.AnalysisExplanation(
                recognizer=self.__class__.__name__,
                original_score=original_score,
                textual_explanation=explanation,
            )
            return explanation

        # sanity check of the entity and label before recognition
        @staticmethod
        def __check_label(
            entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
        ) -> bool:
            """
            Check if the label is in the label group of the entity.

            :param entity:             Entity to check
            :param label:              Label to check
            :param check_label_groups: Label groups to check

            :returns: True if some group contains both the entity and the label, False otherwise
            """
            return any(
                entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
            )
    # get the analyzer engine based on the model
    def _get_analyzer_engine(
        model: str = None, entities: List[str] = None
    ) -> pa.AnalyzerEngine:
        """
        Return pa.AnalyzerEngine.

        When `model` is given it selects the recognizers to register. Otherwise the
        recognizers are selected by the requested `entities`.

        :param model:    The model to use. Can be "spacy", "flair", "pattern" or "whole".
        :param entities: The list of entities to use.

        :returns: pa.AnalyzerEngine

        :raises ValueError: If the model name is unknown, if neither model nor entities is given,
                            or if some requested entities are not supported by the selected
                            recognizers.
        """
        known_models = (Models.SPACY, Models.FLAIR, Models.PATTERN, Models.WHOLE)

        # decide which recognizers are needed
        if model in known_models:
            use_spacy = model in (Models.SPACY, Models.WHOLE)
            use_flair = model in (Models.FLAIR, Models.WHOLE)
            use_pattern = model in (Models.PATTERN, Models.WHOLE)
        elif not model and entities:
            requested = set(entities)
            use_spacy = bool(requested & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES)
            use_flair = bool(requested & FlairRecognizer.RECOGNIZABLE_ENTITIES)
            use_pattern = bool(
                requested & set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())
            )
        elif model:
            # An unknown model name used to be reported with the "can not be None" message below
            raise ValueError(
                f"Unsupported model '{model}'. Supported models are: {list(known_models)}"
            )
        else:
            raise ValueError(
                "argument of model and entities can not be None at the same time"
            )

        # recognizer registry that can store multiple recognizers
        registry = pa.RecognizerRegistry()
        if use_spacy:
            # custom spacy recognizer
            registry.add_recognizer(CustomSpacyRecognizer())
        if use_flair:
            # pre-trained flair recognizer
            registry.add_recognizer(FlairRecognizer())
        if use_pattern:
            # add the pattern recognizers
            for recognizer in PatternRecognizerFactory._create_pattern_recognizer():
                registry.add_recognizer(recognizer)

        analyzer = pa.AnalyzerEngine(
            registry=registry,
            supported_languages=["en"],
        )

        supported_entities = analyzer.get_supported_entities()

        if entities and not all(item in supported_entities for item in entities):
            not_supported_entities = [
                item for item in entities if item not in supported_entities
            ]
            raise ValueError(
                f"The current model {model} doesn't support the following entities: {not_supported_entities}. "
                f"Supported entities are: {supported_entities}"
            )
        return analyzer


    def _get_anonymizer_engine() -> pre_anoymizer.AnonymizerEngine:
        """
        Return AnonymizerEngine.

        :returns: The AnonymizerEngine.
        """
        return pre_anoymizer.AnonymizerEngine()


    def _anonymize(
        text: str,
        analyze_results: List[pa.RecognizerResult],
        entity_operator_map: dict = None,
        is_full_text: bool = True,
    ) -> str:
        """
        Anonymize identified input using Presidio Anonymizer.

        :param text:                The text for analysis.
        :param analyze_results:     The list of Presidio RecognizerResult constructed from analysis.
        :param entity_operator_map: A dictionary that maps an entity to an
                                    (operator name, operator params) pair.
        :param is_full_text:        Whether to anonymize and return the whole text, or only the
                                    sentences in which an entity was found.

        :returns: The anonymized text. An empty string is returned for an empty text.
        """
        if not text:
            return ""

        anonymizer_engine = _get_anonymizer_engine()
        if not entity_operator_map:
            operators = None
        else:
            # Create OperatorConfig based on the entity_operator_map
            operators = {
                entity: OperatorConfig(operator_name, operator_params)
                for entity, (operator_name, operator_params) in entity_operator_map.items()
            }

        if is_full_text:
            # Anonymize the entire text
            return anonymizer_engine.anonymize(
                text=text, analyzer_results=analyze_results, operators=operators
            ).text

        # Tokenize the text to sentences
        sentences = nltk.sent_tokenize(text)
        anonymized_sentences = []
        search_from = 0

        # Find the sentences that have pii entities
        for sentence in sentences:
            # The sentences do not carry the whitespace that separates them in `text`, so
            # summing their lengths drifts away from the real offsets. Locate each sentence
            # in the original text instead.
            start_idx = text.find(sentence, search_from)
            if start_idx == -1:
                # fall back to assuming the sentence directly follows the previous one
                start_idx = search_from
            end_idx = start_idx + len(sentence)

            # Get the entities that are in the sentence, with offsets relative to the sentence
            sentence_results = [
                pa.RecognizerResult(
                    result.entity_type,
                    start=result.start - start_idx,
                    end=result.end - start_idx,
                    score=result.score,
                )
                for result in analyze_results
                if result.start >= start_idx and result.end <= end_idx
            ]

            # If PII is detected
            if sentence_results:
                anonymized_sentence = anonymizer_engine.anonymize(
                    text=sentence, analyzer_results=sentence_results, operators=operators
                ).text
                anonymized_sentences.append(anonymized_sentence)

            search_from = end_idx

        return " ".join(anonymized_sentences)


    def _get_tokens(
        text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True
    ) -> List[str]:
        """
        Get the full tokens or only contains the entities that can form a sentence.

        A token is either a plain string (text between entities) or an
        (entity text, entity type) tuple.

        :param text:            The text for analysis.
        :param analyze_results: The list of Presidio RecognizerResult constructed from analysis.
        :param is_full:         Whether return full tokens or just the tokens that only contains the
                                entities that can form a sentence.

        :returns: The tokens.
        """
        tokens = []
        # sort by start index
        results = sorted(analyze_results, key=lambda x: x.start)

        if not results and text:
            # nothing was recognized: keep the text as a single plain token instead of dropping it
            tokens.append(text)

        for i, res in enumerate(results):
            if i == 0:
                tokens.append(text[: res.start])

            # append entity text and entity type
            tokens.append((text[res.start : res.end], res.entity_type))

            # if another entity coming i.e. we're not at the last results element,
            # add text up to next entity
            if i != len(results) - 1:
                tokens.append(text[res.end : results[i + 1].start])
            # if no more entities coming, add all remaining text
            else:
                tokens.append(text[res.end :])

        if is_full:
            return tokens

        # get the tokens that only contains the entities that can form a sentence
        part_annontated_tokens = []
        last_end_sentence = 0
        for i, token in enumerate(tokens):
            # a token holding an end-of-sentence mark closes the current group, which is kept
            # only when it holds at least one entity tuple
            if any(item in token for item in [".", "!", "?"]) and any(
                type(item) is tuple for item in tokens[last_end_sentence:i]
            ):
                part_annontated_tokens.append(tokens[last_end_sentence:i])
                last_end_sentence = i
        return part_annontated_tokens


    def _annotate(
        text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True
    ) -> List[str]:
        """
        Annotate identified input using Presidio Anonymizer.

        :param text:               The text for analysis.
        :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis.
        :param is_full_html:       Whether generate full html or not.

        :returns: The list of tokens with the identified entities.
        """
        return _get_tokens(text, st_analyze_results, is_full_html)


    def _process(
        text: str,
        model: pa.AnalyzerEngine,
        score_threshold: float,
        entities: List[str] = None,
        entities_operator_map: dict = None,
        is_full_text: bool = True,
    ) -> Tuple[str, list]:
        """
        Process the text of str using the model.

        :param text:                  Text to process
        :param model:                 Analyzer engine to use for processing
        :param score_threshold:       The score threshold to use for recognition
        :param entities:              Entities to recognize
        :param entities_operator_map: A dictionary that maps an entity to an
                                      (operator name, operator params) pair.
        :param is_full_text:          Whether to return the full text or just the annotated text

        :returns: A tuple of:

                  * the anonymized text
                  * the list of Presidio RecognizerResult constructed from analysis
        """
        # get the analyzer engine
        analyzer = model

        # analyze the text that can be used for anonymization
        results = analyzer.analyze(
            text=text,
            language="en",
            entities=entities,
            score_threshold=score_threshold,
            return_decision_process=True,
        )

        # anonymize the text, replace the pii entities with the labels
        anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text)

        return anonymized_text, results


    def _get_single_html(
        text: str, results: List[pa.RecognizerResult], is_full_html: bool = True
    ):
        """
        Generate the html for a single txt file.

        :param text:         The text for analysis.
        :param results:      The list of Presidio RecognizerResult constructed from analysis.
        :param is_full_html: Whether generate full html or not.

        :returns: The html string for a single txt file.
        """
        # convert the results to tokens to generate the html
        tokens = _annotate(text, results, is_full_html)
        html = at_util.get_annotated_html(*tokens)

        # Render the line breaks of the text in the html. The replacement used to be written
        # as '{backslash_char}n' inside the f-string expression, which is a plain string
        # literal there, so nothing was ever replaced.
        html_with_breaks = html.replace("\n", "<br>")

        html_str = f"<p>{html_with_breaks}</p>"

        return html_str


    def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True):
        """
        Generate the json for a single txt file.

        Note that for a partial report the explanation of the given results is cleared in place.

        :param results:        The list of Presidio RecognizerResult constructed from analysis.
        :param is_full_report: Whether generate full json or not.

        :returns: The list of results to report for a single txt file.
        """
        # generate the stats report if needed
        if not is_full_report:
            stats = []
            # add the simplify stats logic here
            for item in results:
                item.analysis_explanation = None
                stats.append(item)
        else:
            stats = results

        return stats


    def _get_all_html(
        txt_content: dict,
        res_dict: dict,
        is_full_html: bool = True,
    ):
        """
        Generate the html for all txt files.

        :param txt_content:  The dictionary of txt file name and content.
        :param res_dict:     The dictionary of txt file name and the list of Presidio RecognizerResult
                             constructed from analysis.
        :param is_full_html: Whether generate full html or not.

        :returns: The html string for all txt files.
        """
        # These are placeholder for the html string
        html_index = "<html><head><title>Highlighted Pii Entities</title></head><body><h1>Highlighted Pii Entities</h1><ul>"
        html_content = ""
        for txt_file, results in res_dict.items():
            txt = txt_content[txt_file]
            html_index += f"<li><a href='#{txt_file}'>{txt_file}</a></li>"
            # the heading carries the id the index entry links to (it was missing before,
            # so the index links had no target)
            html_content += f"<li><h2 id='{txt_file}'>{txt_file}</h2><p>{_get_single_html(txt, results, is_full_html)}</p></li>"
        html_index += "</ul>"
        html_res = f"{html_index}{html_content}</body></html>"

        return html_res


    def _get_all_rpt(res_dict: dict, is_full_report: bool = True):
        """
        Generate the stats report for all txt files.

        :param res_dict:       The dictionary of txt file name and the list of Presidio RecognizerResult
                               constructed from analysis.
        :param is_full_report: Whether generate full report or not.

        :returns: The stats report for all txt files, a dictionary of txt file name to a list of
                  result dictionaries.
        """
        # These are placeholder for the json report
        stats_dict = {}
        for txt_file, results in res_dict.items():
            new_stats = []
            for item in _get_single_json(results, is_full_report):
                if is_full_report:
                    explanation = item.analysis_explanation
                    # The explanation may be missing, or may already be a dictionary if the
                    # same results are reported twice, so convert it only when possible.
                    if hasattr(explanation, "to_dict"):
                        item.analysis_explanation = explanation.to_dict()
                    new_stats.append(dict(item.to_dict()))
                else:
                    # Work on a copy. NOTE(review): to_dict() presumably returns the object's
                    # own attribute dictionary, in which case popping from it directly would
                    # remove the attributes from the result itself - confirm.
                    tmp_dict = dict(item.to_dict())
                    tmp_dict.pop("analysis_explanation", None)
                    tmp_dict.pop("recognition_metadata", None)
                    new_stats.append(tmp_dict)
            stats_dict[txt_file] = new_stats
        return stats_dict

    +[docs] +def recognize_pii( + context: mlrun.MLClientCtx, + input_path: Union[str, pathlib.Path], + html_key: str, + score_threshold: float, + output_directory: str = None, + entities: List[ + str + ] = None, # List of entities to recognize, default is recognizing all + entity_operator_map: dict = None, + model: str = None, + generate_json: bool = True, + generate_html: bool = True, + is_full_text: bool = True, + is_full_html: bool = True, + is_full_report: bool = True, +) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]: + """ + Walk through the input path, recognize PII in text and store the anonymized text in the output path. + Generate the html with different colors for each entity, json report of the explanation. + + :param context: The MLRun context. this is needed for log the artifacts. + :param input_path: The input path of the text files needs to be analyzed. + :param html_key: The html key for the artifact. + :param score_threshold: The score threshold to mark the recognition as trusted. + :param output_directory: The output directory path to store the anonymized text. + :param entities: The list of entities to recognize. + :param entity_operator_map: The map of entity to operator (mask, redact, replace, keep, hash, and its params) + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param generate_json: Whether to generate the json report of the explanation. + :param generate_html: Whether to generate the html report of the explanation. + :param is_full_text: Whether to return the full text or only the masked text. 
+ :param is_full_html: Whether to return the full html or just the annotated text + :param is_full_report: Whether to return the full report or just the score and start, end index + + :returns: A tuple of: + + * Path to the output directory + * The json report of the explanation (if generate_json is True) + * A dictionary of errors files that were not processed + + """ + + # Set output directory + if output_directory is None: + output_directory = tempfile.mkdtemp() + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + if not output_directory.exists(): + output_directory.mkdir(parents=True, exist_ok=True) + + txt_files_directory = pathlib.Path(input_path) + successes = [] + errors = {} + + res_dict = {} + txt_content = {} + # Load the model: + analyzer = _get_analyzer_engine(model, entities) + logger.info("Model loaded") + # Go over the text files in the input path, analyze and anonymize them: + for txt_file in tqdm( + list(txt_files_directory.glob("*.txt")), + desc="Processing files", + unit="file", + ): + try: + # Load the str from the text file + text = txt_file.read_text() + txt_content[str(txt_file)] = text + # Process the text to recoginze the pii entities in it + anonymized_text, results = _process( + text=text, + model=analyzer, + entities=entities, + entities_operator_map=entity_operator_map, + score_threshold=score_threshold, + is_full_text=is_full_text, + ) + res_dict[str(txt_file)] = results + # Store the anonymized text in the output path + output_file = output_directory / f"{txt_file.stem}.txt" + output_file.parent.mkdir(parents=True, exist_ok=True) + with open(output_file, "w") as f: + f.write(anonymized_text) + successes.append([txt_file.name, output_file.name]) + except Exception as e: + errors[str(txt_file)] = str(e) + logger.error(f"Error processing {txt_file}: {e}") + + successes = pd.DataFrame( + successes, + columns=["original_file", "anonymized_file"], + ) + + if generate_html: + # Generate the html report + 
html_res = _get_all_html(txt_content, res_dict, is_full_html) + # Store the html report in the context + arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key) + context.log_artifact(arti_html) + if generate_json: + # Generate the json report + json_res = _get_all_rpt(res_dict, is_full_report) + return str(output_directory), successes, errors, json_res + return str(output_directory), successes, errors
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/0.4.0/static/source.html b/functions/master/pii_recognizer/0.4.0/static/source.html new file mode 100644 index 00000000..dde9a412 --- /dev/null +++ b/functions/master/pii_recognizer/0.4.0/static/source.html @@ -0,0 +1,986 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import os
    +import pathlib
    +import tempfile
    +import warnings
    +from typing import List, Set, Tuple, Union
    +
    +import annotated_text.util as at_util
    +import mlrun
    +import nltk
    +import pandas as pd
    +import presidio_analyzer as pa
    +import presidio_anonymizer as pre_anoymizer
    +from presidio_anonymizer.entities import OperatorConfig
    +from tqdm import tqdm
    +
    +# Flair is an optional dependency: when it is missing this only prints a notice.
    +# NOTE(review): `fl` is then left undefined, so later references to it
    +# (FlairRecognizer uses `fl.`) would fail -- confirm this is acceptable.
    +try:
    +    import flair as fl
    +except ModuleNotFoundError:
    +    print("Flair is not installed")
    +
    +# There is a conflict between Rust-based tokenizers' parallel processing
    +# and Python's fork operations during multiprocessing. To avoid this, the
    +# tokenizers' parallelism is disabled through the environment variable below.
    +
    +os.environ["TOKENIZERS_PARALLELISM"] = "false"
    +# Suppress every warning from here on; the filter applies process-wide.
    +warnings.filterwarnings("ignore")
    +
    +# Logger used for this module's messages.
    +logger = logging.getLogger("pii-recognizer")
    +
    +
    +# Add the constant classes of Models and Entities to govern the whole package
    +class Models:
    +    WHOLE = "whole"
    +    PATTERN = "pattern"
    +    SPACY = "spacy"
    +    FLAIR = "flair"
    +
    +
    +class Entities:
    +    CREDIT_CARD = "CREDIT_CARD"
    +    SSN = "SSN"
    +    PHONE = "PHONE"
    +    EMAIL = "EMAIL"
    +    LOCATION = "LOCATION"
    +    PERSON = "PERSON"
    +    NRP = "NRP"
    +    ORGANIZATION = "ORGANIZATION"
    +    DATE_TIME = "DATE_TIME"
    +    GPE = ("GPE",)
    +    MAC_ADDRESS = "MAC_ADDRESS"
    +    US_BANK_NUMBER = "US_BANK_NUMBER"
    +    IMEI = "IMEI"
    +    TITLE = "TITLE"
    +    LICENSE_PLATE = "LICENSE_PLATE"
    +    US_PASSPORT = "US_PASSPORT"
    +    CURRENCY = "CURRENCY"
    +    ROUTING_NUMBER = "ROUTING_NUMBER"
    +    US_ITIN = "US_ITIN"
    +    US_BANK_NUMBER = "US_BANK_NUMBER"
    +    US_DRIVER_LICENSE = "US_DRIVER_LICENSE"
    +    AGE = "AGE"
    +    PASSWORD = "PASSWORD"
    +    SWIFT_CODE = "SWIFT_CODE"
    +
    +
    +class PatternRecognizerFactory:
    +    """
    +    Factory for creating pattern recognizers, it can be extended in the future to
    +    add more regex pattern for different entities. For the pattern recognizer to work,
    +    we need construct a list of regex patterns for each entity.
    +    """
    +
    +    RECOGNIZABLE_ENTITIES = {
    +        "CREDIT_CARD": [pa.Pattern("CREDIT_CARD", r"\b(?:\d[ -]*?){13,16}\b", 0.5)],
    +        "SSN": [pa.Pattern("SSN", r"\b\d{3}-?\d{2}-?\d{4}\b", 0.5)],
    +        "PHONE": [pa.Pattern("PHONE", r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", 0.5)],
    +        "EMAIL": [pa.Pattern("EMAIL", r"\S+@\S+", 0.5)],
    +    }
    +
    +    # create a list of pattern recognizers
    +    @classmethod
    +    def _create_pattern_recognizer(cls):
    +        """
    +        For each entity, create a list of patterns to recognize it
    +
    +        :param cls: PatternRecognizerFactory class
    +
    +        :returns: List of pattern recognizers
    +        """
    +
    +        # Entities to recognize and their regex patterns
    +
    +        return [
    +            pa.PatternRecognizer(supported_entity=entity, patterns=pattern)
    +            for entity, pattern in cls.RECOGNIZABLE_ENTITIES.items()
    +        ]
    +
    +
    +class CustomSpacyRecognizer(pa.LocalRecognizer):
    +    """
    +    Custom Spacy Recognizer from Presidio Analyzer trained on Privy data.
    +    The privy data is generated using this https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy
    +    It can be used to recognize custom entities, Since we want to use Presidio's Registries to generate AnalyzerEngine,
    +    it inherits from Presidio Analyzer's LocalRecognizer class.
    +    """
    +
    +    # Entities to recognize
    +
    +    RECOGNIZABLE_ENTITIES = {
    +        "LOCATION",
    +        "PERSON",
    +        "NRP",
    +        "ORGANIZATION",
    +        "DATE_TIME",
    +    }
    +
    +    # Default explanation for this recognizer
    +
    +    _DEFAULT_EXPLANATION = (
    +        "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)"
    +    )
    +
    +    # Label groups to check
    +
    +    _DEFAULT_CHECK_LABEL_GROUPS = [
    +        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
    +        ({"PERSON"}, {"PER", "PERSON"}),
    +        ({"NRP"}, {"NORP", "NRP"}),
    +        ({"ORGANIZATION"}, {"ORG"}),
    +        ({"DATE_TIME"}, {"DATE_TIME"}),
    +    ]
    +
    +    # pretrained model for this recognizer
    +
    +    _DEFAULT_MODEL_LANGUAGES = {
    +        "en": "beki/en_spacy_pii_distilbert",
    +    }
    +
    +    _DEFAULT_PRESIDIO_EQUIVALENCES = {
    +        "PER": "PERSON",
    +        "LOC": "LOCATION",
    +        "ORG": "ORGANIZATION",
    +        "NROP": "NRP",
    +        "DATE_TIME": "DATE_TIME",
    +    }
    +
    +    def __init__(
    +        self,
    +        supported_language: str = "en",
    +        supported_entities: List[str] = None,
    +        check_label_groups: Tuple[Set, Set] = None,
    +        context: List[str] = None,
    +        ner_strength: float = 1,
    +    ):
    +        """
    +        Initialize Spacy Recognizer.
    +
    +        :param supported_language: Language to use, default is English
    +        :param supported_entities: Entities to use for recognition
    +        :param check_label_groups: Label groups to check for the entities
    +        :param context:            Context to use if any
    +        :param ner_strength:       Default confidence for NER prediction
    +
    +        :returns: SpacyRecognizer object
    +        """
    +
    +        # Default confidence for NER prediction
    +        self.ner_strength = ner_strength
    +
    +        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
    +        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
    +        super().__init__(
    +            supported_entities=supported_entities,
    +            supported_language=supported_language,
    +        )
    +
    +    # get the presidio explanation for the result
    +
    +    def _build_spacy_explanation(
    +        self, original_score: float, explanation: str
    +    ) -> pa.AnalysisExplanation:
    +        """
    +        Create explanation for why this result was detected.
    +
    +        :param original_score: Score given by this recognizer
    +        :param explanation:    Explanation string
    +
    +        :returns: Presidio AnalysisExplanation object
    +        """
    +        explanation = pa.AnalysisExplanation(
    +            recognizer=self.__class__.__name__,
    +            original_score=original_score,
    +            textual_explanation=explanation,
    +        )
    +        return explanation
    +
    +    # main method for the recognizer
    +    def analyze(self, text: str, entities: List[str], nlp_artifacts=None):  # noqa D102
    +        """
    +        Analyze text using Spacy.
    +
    +        :param text:          Text to analyze
    +        :param entities:      Entities to analyze
    +        :param nlp_artifacts: NLP artifacts to use
    +
    +        :returns: List of Presidio RecognizerResult objects
    +        """
    +        results = []
    +        if not nlp_artifacts:
    +            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
    +            return results
    +
    +        ner_entities = nlp_artifacts.entities
    +
    +        # recognize the supported entities
    +        for entity in entities:
    +            if entity not in self.supported_entities:
    +                continue
    +            for ent in ner_entities:
    +                if not self.__check_label(entity, ent.label_, self.check_label_groups):
    +                    continue
    +
    +                # string of the explanation saying the entity is recognized by spacy
    +                textual_explanation = self._DEFAULT_EXPLANATION.format(ent.label_)
    +                explanation = self._build_spacy_explanation(
    +                    self.ner_strength, textual_explanation
    +                )
    +
    +                # create the standard result with the entity, start, end, score, and explanation
    +                spacy_result = pa.RecognizerResult(
    +                    entity_type=entity,
    +                    start=ent.start_char,
    +                    end=ent.end_char,
    +                    score=self.ner_strength,
    +                    analysis_explanation=explanation,
    +                    recognition_metadata={
    +                        pa.RecognizerResult.RECOGNIZER_NAME_KEY: self.name
    +                    },
    +                )
    +                results.append(spacy_result)
    +
    +        return results
    +
    +    @staticmethod
    +    def __check_label(
    +        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    +    ) -> bool:
    +        """
    +        Check if the label is in the label group.
    +
    +        :param entity:             Entity to check
    +        :param label:              Label to check
    +        :param check_label_groups: Label groups to check
    +
    +        :returns: True if the label is in the label group, False otherwise
    +        """
    +        return any(
    +            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
    +        )
    +
    +
    +# Class to use Flair with Presidio as an external recognizer.
    +class FlairRecognizer(pa.EntityRecognizer):
    +    """
    +    Wrapper for a flair model, if needed to be used within Presidio Analyzer.
    +    This is to make sure the recognizer can be registered with Presidio registry.
    +    """
    +
    +    RECOGNIZABLE_ENTITIES = {
    +        "LOCATION",
    +        "PERSON",
    +        "NRP",
    +        "GPE",
    +        "ORGANIZATION",
    +        "MAC_ADDRESS",
    +        "US_BANK_NUMBER",
    +        "IMEI",
    +        "TITLE",
    +        "LICENSE_PLATE",
    +        "US_PASSPORT",
    +        "CURRENCY",
    +        "ROUTING_NUMBER",
    +        "US_ITIN",
    +        "US_BANK_NUMBER",
    +        "US_DRIVER_LICENSE",
    +        "AGE",
    +        "PASSWORD",
    +        "SWIFT_CODE",
    +    }
    +
    +    # This is used to construct the explanation for the result
    +
    +    _DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
    +
    +    _DEFAULT_CHECK_LABEL_GROUPS = [
    +        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
    +        ({"PERSON"}, {"PER", "PERSON"}),
    +        ({"NRP"}, {"NORP", "NRP"}),
    +        ({"GPE"}, {"GPE"}),
    +        ({"ORGANIZATION"}, {"ORG"}),
    +        ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}),
    +        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
    +        ({"IMEI"}, {"IMEI"}),
    +        ({"TITLE"}, {"TITLE"}),
    +        ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}),
    +        ({"US_PASSPORT"}, {"US_PASSPORT"}),
    +        ({"CURRENCY"}, {"CURRENCY"}),
    +        ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}),
    +        ({"AGE"}, {"AGE"}),
    +        ({"CURRENCY"}, {"CURRENCY"}),
    +        ({"SWIFT_CODE"}, {"SWIFT_CODE"}),
    +        ({"US_ITIN"}, {"US_ITIN"}),
    +        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
    +        ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}),
    +    ]
    +
    +    _DEFAULT_MODEL_LANGUAGES = {
    +        "en": "beki/flair-pii-distilbert",
    +    }
    +
    +    _DEFAULT_PRESIDIO_EQUIVALENCES = {
    +        "PER": "PERSON",
    +        "LOC": "LOCATION",
    +        "ORG": "ORGANIZATION",
    +        "NROP": "NRP",
    +        "URL": "URL",
    +        "US_ITIN": "US_ITIN",
    +        "US_PASSPORT": "US_PASSPORT",
    +        "IBAN_CODE": "IBAN_CODE",
    +        "IP_ADDRESS": "IP_ADDRESS",
    +        "EMAIL_ADDRESS": "EMAIL",
    +        "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
    +        "US_BANK_NUMBER": "US_BANK_NUMBER",
    +    }
    +
    +    def __init__(
    +        self,
    +        supported_language: str = "en",
    +        supported_entities: List[str] = None,
    +        check_label_groups: Tuple[Set, Set] = None,
    +    ):
    +        """
    +        Initialize the FlairRecognizer.
    +
    +        :param supported_language: Language to use
    +        :param supported_entities: Entities to use
    +        :param check_label_groups: Label groups to check
    +
    +        :returns: FlairRecognizer object
    +
    +        """
    +        self.check_label_groups = check_label_groups or self._DEFAULT_CHECK_LABEL_GROUPS
    +
    +        supported_entities = supported_entities or self.RECOGNIZABLE_ENTITIES
    +        self.model = fl.models.SequenceTagger.load(
    +            self._DEFAULT_MODEL_LANGUAGES.get(supported_language)
    +        )
    +
    +        super().__init__(
    +            supported_entities=supported_entities,
    +            supported_language=supported_language,
    +            name="Flair Analytics",
    +        )
    +
    +    # main method for the recognizer
    +    def analyze(
    +        self,
    +        text: str,
    +        entities: List[str],
    +        nlp_artifacts: pa.nlp_engine.NlpArtifacts = None,
    +    ) -> List[pa.RecognizerResult]:
    +        """
    +        Analyze text and return the results.
    +
    +        :param text:          The text for analysis.
    +        :param entities:      The list of entities to recognize.
    +        :param nlp_artifacts: Not used by this recognizer but needed for the interface.
    +
    +        :returns: The list of Presidio RecognizerResult constructed from the recognized Flair detections.
    +        """
    +
    +        results = []
    +
    +        sentences = fl.data.Sentence(text)
    +        self.model.predict(sentences)
    +
    +        # If there are no specific list of entities, we will look for all of it.
    +        if not entities:
    +            entities = self.supported_entities
    +
    +        # Go over the entities and check if they are in the supported entities list.
    +        for entity in entities:
    +            if entity not in self.supported_entities:
    +                continue
    +
    +            # Go over the sentences and check if the entity is in the sentence.
    +            for ent in sentences.get_spans("ner"):
    +                if not self.__check_label(
    +                    entity, ent.labels[0].value, self.check_label_groups
    +                ):
    +                    continue
    +
    +                # If the entity is in the sentence, we will add it to the results.
    +                textual_explanation = self._DEFAULT_EXPLANATION.format(
    +                    ent.labels[0].value
    +                )
    +
    +                # Build the explanation for the result
    +                explanation = self._build_flair_explanation(
    +                    round(ent.score, 2), textual_explanation
    +                )
    +
    +                flair_result = self._convert_to_recognizer_result(ent, explanation)
    +
    +                results.append(flair_result)
    +
    +        return results
    +
    +    def _convert_to_recognizer_result(
    +        self, entity: fl.data.Span, explanation: str
    +    ) -> pa.RecognizerResult:
    +        """
    +        Convert Flair result to Presidio RecognizerResult.
    +
    +        :param entity:      Flair entity of Span
    +        :param explanation: Presidio AnalysisExplanation
    +
    +        :returns: Presidio RecognizerResult
    +        """
    +
    +        # Convert the entity type to Presidio entity type
    +        entity_type = self._DEFAULT_PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
    +
    +        # Convert the score to Presidio score
    +        flair_score = round(entity.score, 2)
    +
    +        # Create the Presidio RecognizerResult from the Flair entity
    +        flair_results = pa.RecognizerResult(
    +            entity_type=entity_type,
    +            start=entity.start_position,
    +            end=entity.end_position,
    +            score=flair_score,
    +            analysis_explanation=explanation,
    +        )
    +
    +        return flair_results
    +
    +    def _build_flair_explanation(
    +        self, original_score: float, explanation: str
    +    ) -> pa.AnalysisExplanation:
    +        """
    +        Create explanation for why this result was detected.
    +
    +        :param original_score: Score given by this recognizer
    +        :param explanation:    Explanation string
    +
    +        :returns: Presidio AnalysisExplanation
    +        """
    +
    +        # Create the Presidio AnalysisExplanation for the result
    +        explanation = pa.AnalysisExplanation(
    +            recognizer=self.__class__.__name__,
    +            original_score=original_score,
    +            textual_explanation=explanation,
    +        )
    +        return explanation
    +
    +    # sanity check of the entity and label before recognition
    +    @staticmethod
    +    def __check_label(
    +        entity: str, label: str, check_label_groups: Tuple[Set, Set]
    +    ) -> bool:
    +        return any(
    +            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
    +        )
    +
    +
    +# get the analyzer engine based on the model
    +def _get_analyzer_engine(
    +    model: str = None, entities: List[str] = None
    +) -> pa.AnalyzerEngine:
    +    """
    +    Return pa.AnalyzerEngine.
    +
    +    :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole".
    +    :param entities: The list of entities to use.
    +
    +    :returns: pa.AnalyzerEngine
    +    """
    +    # recognizer registry that can store multiple recognizers
    +    registry = pa.RecognizerRegistry()
    +    if model == Models.SPACY:
    +        # custom spacy recognizer
    +        spacy_recognizer = CustomSpacyRecognizer()
    +        # add the custom build spacy recognizer
    +        registry.add_recognizer(spacy_recognizer)
    +    elif model == Models.FLAIR:
    +        # pre-trained flair recognizer
    +        flair_recognizer = FlairRecognizer()
    +        # add the custom build flair recognizer
    +        registry.add_recognizer(flair_recognizer)
    +    elif model == Models.PATTERN:
    +        # add the pattern recognizer
    +        pattern_recognizer_factory = PatternRecognizerFactory()
    +        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
    +            registry.add_recognizer(recognizer)
    +    elif model == Models.WHOLE:
    +        spacy_recognizer = CustomSpacyRecognizer()
    +        flair_recognizer = FlairRecognizer()
    +        registry.add_recognizer(spacy_recognizer)
    +        registry.add_recognizer(flair_recognizer)
    +        # add the pattern recognizer
    +        pattern_recognizer_factory = PatternRecognizerFactory()
    +        for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
    +            registry.add_recognizer(recognizer)
    +    elif not model and entities:
    +        if set(entities) & CustomSpacyRecognizer.RECOGNIZABLE_ENTITIES:
    +            spacy_recognizer = CustomSpacyRecognizer()
    +            registry.add_recognizer(spacy_recognizer)
    +        if set(entities) & FlairRecognizer.RECOGNIZABLE_ENTITIES:
    +            flair_recognizer = FlairRecognizer()
    +            registry.add_recognizer(flair_recognizer)
    +        # add the pattern recognizer
    +        if set(entities) & (set(PatternRecognizerFactory.RECOGNIZABLE_ENTITIES.keys())):
    +            pattern_recognizer_factory = PatternRecognizerFactory()
    +            for recognizer in pattern_recognizer_factory._create_pattern_recognizer():
    +                registry.add_recognizer(recognizer)
    +    else:
    +        raise ValueError(
    +            f"argument of model and entities can not be None at the same time"
    +        )
    +    analyzer = pa.AnalyzerEngine(
    +        registry=registry,
    +        supported_languages=["en"],
    +    )
    +
    +    supported_entities = analyzer.get_supported_entities()
    +
    +    if entities and not all(item in supported_entities for item in entities):
    +        not_supported_entities = [
    +            item for item in entities if item not in supported_entities
    +        ]
    +        raise ValueError(
    +            f"The current model {model} doesn't support the following entities: {not_supported_entities}. "
    +            f"Supported entities are: {supported_entities}"
    +        )
    +    return analyzer
    +
    +
    +def _get_anonymizer_engine() -> pre_anoymizer.AnonymizerEngine:
    +    """
    +    Return AnonymizerEngine.
    +
    +    :returns: The AnonymizerEngine.
    +    """
    +    return pre_anoymizer.AnonymizerEngine()
    +
    +
    +def _anonymize(
    +    text: str,
    +    analyze_results: List[pa.RecognizerResult],
    +    entity_operator_map: dict = None,
    +    is_full_text: bool = True,
    +) -> str:
    +    """
    +    Anonymize identified input using Presidio Abonymizer.
    +
    +    :param text:                The text for analysis.
    +    :param analyze_results:     The list of Presidio RecognizerResult constructed from
    +    :param entity_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    +    :param is_full_text:        Whether the text is full text or not.
    +
    +    :returns: The anonymized text.
    +    """
    +    if not text:
    +        return ""
    +
    +    anonymizer_engine = _get_anonymizer_engine()
    +    if not entity_operator_map:
    +        operators = None
    +    else:
    +        # Create OperatorConfig based on the entity_operator_map
    +        operators = {
    +            entity: OperatorConfig(operator_name, operator_params)
    +            for entity, (operator_name, operator_params) in entity_operator_map.items()
    +        }
    +
    +    if is_full_text:
    +        # Anonymize the entire text
    +        return anonymizer_engine.anonymize(
    +            text=text, analyzer_results=analyze_results, operators=operators
    +        ).text
    +    # Tokenize the text to sentences
    +    sentences = nltk.sent_tokenize(text)
    +    anonymized_sentences = []
    +    current_idx = 0
    +
    +    # Find the sentence that has pii entity
    +    for sentence in sentences:
    +        start_idx = current_idx
    +        end_idx = start_idx + len(sentence)
    +
    +        # Get the entities that are in the sentence, update hte start_idx and end_idx
    +        sentence_results = [
    +            pa.RecognizerResult(
    +                result.entity_type,
    +                start=result.start - start_idx,
    +                end=result.end - start_idx,
    +                score=result.score,
    +            )
    +            for result in analyze_results
    +            if result.start >= start_idx and result.end <= end_idx
    +        ]
    +
    +        # If PII is detected
    +        if sentence_results:
    +            anonymized_sentence = anonymizer_engine.anonymize(
    +                text=sentence, analyzer_results=sentence_results, operators=operators
    +            ).text
    +            anonymized_sentences.append(anonymized_sentence)
    +
    +        current_idx = end_idx
    +
    +    return " ".join(anonymized_sentences)
    +
    +
    +def _get_tokens(
    +    text: str, analyze_results: List[pa.RecognizerResult], is_full: bool = True
    +) -> List[str]:
    +    """
    +    Get the full tokens or only contains the entities that can form a sentence.
    +
    +    :param text:            The text for analysis.
    +    :param analyze_results: The list of Presidio RecognizerResult constructed from
    +    :param is_full:         Whether return full tokens or just the tokens that only contains the entities that can form a sentence.
    +
    +    :returns: The tokens.
    +    """
    +
    +    tokens = []
    +    # sort by start index
    +    results = sorted(analyze_results, key=lambda x: x.start)
    +    for i, res in enumerate(results):
    +        if i == 0:
    +            tokens.append(text[: res.start])
    +
    +        # append entity text and entity type
    +        tokens.append((text[res.start : res.end], res.entity_type))
    +
    +        # if another entity coming i.e. we're not at the last results element,
    +        # add text up to next entity
    +        if i != len(results) - 1:
    +            tokens.append(text[res.end : results[i + 1].start])
    +        # if no more entities coming, add all remaining text
    +        else:
    +            tokens.append(text[res.end :])
    +
    +    # get the tokens that only contains the entities that can form a sentence
    +    part_annontated_tokens = []
    +    if not is_full:
    +        last_end_sentence = 0
    +        for i, token in enumerate(tokens):
    +            if any(item in token for item in [".", "!", "?"]) and any(
    +                type(item) is tuple for item in tokens[last_end_sentence:i]
    +            ):
    +                part_annontated_tokens.append(tokens[last_end_sentence:i])
    +                last_end_sentence = i
    +        return part_annontated_tokens
    +    return tokens
    +
    +
    +def _annotate(
    +    text: str, st_analyze_results: List[pa.RecognizerResult], is_full_html: bool = True
    +) -> List[str]:
    +    """
    +    Annotate identified input using Presidio Anonymizer.
    +
    +    :param text:               The text for analysis.
    +    :param st_analyze_results: The list of Presidio RecognizerResult constructed from analysis.
    +    :param is_full_html:       Whether generate full html or not.
    +
    +    :returns: The list of tokens with the identified entities.
    +
    +    """
    +    return _get_tokens(text, st_analyze_results, is_full_html)
    +
    +
    +def _process(
    +    text: str,
    +    model: pa.AnalyzerEngine,
    +    score_threshold: float,
    +    entities: List[str] = None,
    +    entities_operator_map: dict = None,
    +    is_full_text: bool = True,
    +) -> Tuple[str, list]:
    +    """
    +    Process the text of str using the model.
    +
    +    :param text:                  Text to process
    +    :param model:                 Model to use for processing
    +    :param entities:              Entities to recognize
    +    :param entities_operator_map: The entity_operator_map is a dictionary that maps entity to operator name and operator params.
    +    :param score_threshold:       The score threshold to use for recognition
    +    :param is_full_text:          Whether to return the full text or just the annotated text
    +
    +    :returns: A tuple of:
    +
    +              * the anonymized text
    +              * the list of Presidio RecognizerResult constructed from analysis
    +    """
    +
    +    # get the analyzer engine
    +    analyzer = model
    +
    +    # analyze the text that can be used for anonymization
    +    results = analyzer.analyze(
    +        text=text,
    +        language="en",
    +        entities=entities,
    +        score_threshold=score_threshold,
    +        return_decision_process=True,
    +    )
    +
    +    # anonymize the text, replace the pii entities with the labels
    +    anonymized_text = _anonymize(text, results, entities_operator_map, is_full_text)
    +
    +    return anonymized_text, results
    +
    +
    +def _get_single_html(
    +    text: str, results: List[pa.RecognizerResult], is_full_html: bool = True
    +):
    +    """
    +    Generate the html for a single txt file.
    +
    +    :param text:         The text for analysis.
    +    :param results:      The list of Presidio RecognizerResult constructed from analysis.
    +    :param is_full_html: Whether generate full html or not.
    +
    +    :returns: The html string for a single txt file.
    +    """
    +    # convert the results to tokens to generate the html
    +    tokens = _annotate(text, results, is_full_html)
    +    html = at_util.get_annotated_html(*tokens)
    +
    +    # avoid the error during rendering of the \n in the html
    +    backslash_char = "\\"
    +
    +    html_str = f"

    {html.replace('{backslash_char}n', '
    ')}

    " + + return html_str + + +def _get_single_json(results: List[pa.RecognizerResult], is_full_report: bool = True): + """ + Generate the json for a single txt file. + + :param results: The list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full json or not. + + :returns: The json string for a single txt file. + """ + # generate the stats report if needed + if not is_full_report: + stats = [] + # add the simplify stats logic here + for item in results: + item.analysis_explanation = None + stats.append(item) + else: + stats = results + + return stats + + +def _get_all_html( + txt_content: dict, + res_dict: dict, + is_full_html: bool = True, +): + """ + Generate the html for all txt files. + + :param txt_content: The dictionary of txt file name and content. + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_html: Whether generate full html or not. + + :returns: The html string for all txt files. + + """ + # These are placeholder for the html string + html_index = "Highlighted Pii Entities

    Highlighted Pii Entities

      " + html_content = "" + for txt_file, results in res_dict.items(): + txt = txt_content[txt_file] + html_index += f"
    • {txt_file}
    • " + html_content += f"
    • {txt_file}

      {_get_single_html(txt, results, is_full_html)}

    • " + html_index += "
    " + html_res = f"{html_index}{html_content}" + + return html_res + + +def _get_all_rpt(res_dict: dict, is_full_report: bool = True): + """ + Generate the stats report for all txt files. + + :param res_dict: The dictionary of txt file name and the list of Presidio RecognizerResult constructed from analysis. + :param is_full_report: Whether generate full report or not. + + :returns: The stats report for all txt files. + """ + # These are placeholder for the json report + stats_dict = {} + for txt_file, results in res_dict.items(): + new_stats = [] + for item in _get_single_json(results, is_full_report): + if is_full_report: + item.analysis_explanation = item.analysis_explanation.to_dict() + new_stats.append(item.to_dict()) + else: + tmp_dict = item.to_dict() + tmp_dict.pop("analysis_explanation") + tmp_dict.pop("recognition_metadata") + new_stats.append(tmp_dict) + stats_dict[txt_file] = new_stats + return stats_dict + + +def recognize_pii( + context: mlrun.MLClientCtx, + input_path: Union[str, pathlib.Path], + html_key: str, + score_threshold: float, + output_directory: str = None, + entities: List[ + str + ] = None, # List of entities to recognize, default is recognizing all + entity_operator_map: dict = None, + model: str = None, + generate_json: bool = True, + generate_html: bool = True, + is_full_text: bool = True, + is_full_html: bool = True, + is_full_report: bool = True, +) -> Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, dict]]: + """ + Walk through the input path, recognize PII in text and store the anonymized text in the output path. + Generate the html with different colors for each entity, json report of the explanation. + + :param context: The MLRun context. this is needed for log the artifacts. + :param input_path: The input path of the text files needs to be analyzed. + :param html_key: The html key for the artifact. + :param score_threshold: The score threshold to mark the recognition as trusted. 
+ :param output_directory: The output directory path to store the anonymized text. + :param entities: The list of entities to recognize. + :param entity_operator_map: The map of entity to operator (mask, redact, replace, keep, hash, and its params) + :param model: The model to use. Can be "spacy", "flair", "pattern" or "whole". + :param generate_json: Whether to generate the json report of the explanation. + :param generate_html: Whether to generate the html report of the explanation. + :param is_full_text: Whether to return the full text or only the masked text. + :param is_full_html: Whether to return the full html or just the annotated text + :param is_full_report: Whether to return the full report or just the score and start, end index + + :returns: A tuple of: + + * Path to the output directory + * The json report of the explanation (if generate_json is True) + * A dictionary of errors files that were not processed + + """ + + # Set output directory + if output_directory is None: + output_directory = tempfile.mkdtemp() + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + if not output_directory.exists(): + output_directory.mkdir(parents=True, exist_ok=True) + + txt_files_directory = pathlib.Path(input_path) + successes = [] + errors = {} + + res_dict = {} + txt_content = {} + # Load the model: + analyzer = _get_analyzer_engine(model, entities) + logger.info("Model loaded") + # Go over the text files in the input path, analyze and anonymize them: + for txt_file in tqdm( + list(txt_files_directory.glob("*.txt")), + desc="Processing files", + unit="file", + ): + try: + # Load the str from the text file + text = txt_file.read_text() + txt_content[str(txt_file)] = text + # Process the text to recoginze the pii entities in it + anonymized_text, results = _process( + text=text, + model=analyzer, + entities=entities, + entities_operator_map=entity_operator_map, + score_threshold=score_threshold, + is_full_text=is_full_text, + ) + 
res_dict[str(txt_file)] = results + # Store the anonymized text in the output path + output_file = output_directory / f"{txt_file.stem}.txt" + output_file.parent.mkdir(parents=True, exist_ok=True) + with open(output_file, "w") as f: + f.write(anonymized_text) + successes.append([txt_file.name, output_file.name]) + except Exception as e: + errors[str(txt_file)] = str(e) + logger.error(f"Error processing {txt_file}: {e}") + + successes = pd.DataFrame( + successes, + columns=["original_file", "anonymized_file"], + ) + + if generate_html: + # Generate the html report + html_res = _get_all_html(txt_content, res_dict, is_full_html) + # Store the html report in the context + arti_html = mlrun.artifacts.Artifact(body=html_res, format="html", key=html_key) + context.log_artifact(arti_html) + if generate_json: + # Generate the json report + json_res = _get_all_rpt(res_dict, is_full_report) + return str(output_directory), successes, errors, json_res + return str(output_directory), successes, errors + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pii_recognizer/latest/src/function.yaml b/functions/master/pii_recognizer/latest/src/function.yaml index 069fa1ff..e7d6c124 100644 --- a/functions/master/pii_recognizer/latest/src/function.yaml +++ b/functions/master/pii_recognizer/latest/src/function.yaml @@ -1,38 +1,14 @@ -kind: job -metadata: - name: pii-recognizer - tag: '' - hash: 818930645d33704e9cada919769ee9d93cbb9434 - project: '' - labels: - author: pgw - categories: - - machine-learning - - data-preparation - - NLP +verbose: false spec: - command: '' - args: [] - image: '' - build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' - origin_filename: '' - requirements: - - nltk - - pandas - - presidio-anonymizer - - presidio-analyzer - - torch - - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653 - - st-annotated-text - - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl + default_handler: recognize_pii entry_points: analyze: name: analyze - doc: Analyze text and return the results. + outputs: + - doc: The list of Presidio RecognizerResult constructed from the recognized + Flair detections. + type: List[pa.RecognizerResult] + has_kwargs: false parameters: - name: self - name: text @@ -45,20 +21,16 @@ spec: type: pa.nlp_engine.NlpArtifacts doc: Not used by this recognizer but needed for the interface. default: null - outputs: - - doc: The list of Presidio RecognizerResult constructed from the recognized - Flair detections. - type: List[pa.RecognizerResult] lineno: 381 + doc: Analyze text and return the results. has_varargs: false - has_kwargs: false recognize_pii: name: recognize_pii - doc: 'Walk through the input path, recognize PII in text and store the anonymized - text in the output path. - - Generate the html with different colors for each entity, json report of the - explanation.' 
+ outputs: + - doc: 'A tuple of:' + type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, + dict]] + has_kwargs: false parameters: - name: context type: MLClientCtx @@ -109,21 +81,35 @@ spec: type: bool doc: Whether to return the full report or just the score and start, end index default: true - outputs: - - doc: 'A tuple of:' - type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame, - dict]] lineno: 845 + doc: 'Walk through the input path, recognize PII in text and store the anonymized + text in the output path. + + Generate the html with different colors for each entity, json report of the + explanation.' has_varargs: false - has_kwargs: false + build: + base_image: mlrun/mlrun + requirements: + - nltk + - pandas + - presidio-anonymizer + - presidio-analyzer + - torch + - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653 + - st-annotated-text + - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl + functionSourceCode:  + code_origin: '' + origin_filename: '' description: This function is used to recognize PII in a directory of text files - default_handler: recognize_pii + image: '' + command: '' disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false +kind: job +metadata: + name: pii-recognizer + tag: '' + categories: + - data-preparation + - NLP diff --git a/functions/master/pii_recognizer/latest/src/item.yaml b/functions/master/pii_recognizer/latest/src/item.yaml index 41ead33b..8f3185b4 100644 --- a/functions/master/pii_recognizer/latest/src/item.yaml +++ b/functions/master/pii_recognizer/latest/src/item.yaml @@ -1,6 +1,5 @@ apiVersion: v1 categories: - - machine-learning - data-preparation - NLP description: This function is used to recognize PII in a directory of text files @@ -13,7 +12,7 @@ 
labels: author: pgw maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.0 +mlrunVersion: 1.7.0 name: pii-recognizer platformVersion: 3.5.3 spec: @@ -31,5 +30,5 @@ spec: - st-annotated-text - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl url: '' -version: 0.3.0 +version: 0.4.0 test_valid: False diff --git a/functions/master/pii_recognizer/latest/static/documentation.html b/functions/master/pii_recognizer/latest/static/documentation.html index fdad808c..fbf1f167 100644 --- a/functions/master/pii_recognizer/latest/static/documentation.html +++ b/functions/master/pii_recognizer/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/pii_recognizer/latest/static/example.html b/functions/master/pii_recognizer/latest/static/example.html index d5a156b8..e403a056 100644 --- a/functions/master/pii_recognizer/latest/static/example.html +++ b/functions/master/pii_recognizer/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/pii_recognizer/latest/static/function.html b/functions/master/pii_recognizer/latest/static/function.html index fe3339fa..92eba7ca 100644 --- a/functions/master/pii_recognizer/latest/static/function.html +++ b/functions/master/pii_recognizer/latest/static/function.html @@ -28,41 +28,17 @@
             
    -kind: job
    -metadata:
    -  name: pii-recognizer
    -  tag: ''
    -  hash: 818930645d33704e9cada919769ee9d93cbb9434
    -  project: ''
    -  labels:
    -    author: pgw
    -  categories:
    -  - machine-learning
    -  - data-preparation
    -  - NLP
    +verbose: false
     spec:
    -  command: ''
    -  args: []
    -  image: ''
    -  build:
    -    functionSourceCode: 
    -    base_image: mlrun/mlrun
    -    commands: []
    -    code_origin: ''
    -    origin_filename: ''
    -    requirements:
    -    - nltk
    -    - pandas
    -    - presidio-anonymizer
    -    - presidio-analyzer
    -    - torch
    -    - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653
    -    - st-annotated-text
    -    - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
    +  default_handler: recognize_pii
       entry_points:
         analyze:
           name: analyze
    -      doc: Analyze text and return the results.
    +      outputs:
    +      - doc: The list of Presidio RecognizerResult constructed from the recognized
    +          Flair detections.
    +        type: List[pa.RecognizerResult]
    +      has_kwargs: false
           parameters:
           - name: self
           - name: text
    @@ -75,20 +51,16 @@
             type: pa.nlp_engine.NlpArtifacts
             doc: Not used by this recognizer but needed for the interface.
             default: null
    -      outputs:
    -      - doc: The list of Presidio RecognizerResult constructed from the recognized
    -          Flair detections.
    -        type: List[pa.RecognizerResult]
           lineno: 381
    +      doc: Analyze text and return the results.
           has_varargs: false
    -      has_kwargs: false
         recognize_pii:
           name: recognize_pii
    -      doc: 'Walk through the input path, recognize PII in text and store the anonymized
    -        text in the output path.
    -
    -        Generate the html with different colors for each entity, json report of the
    -        explanation.'
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame,
    +          dict]]
    +      has_kwargs: false
           parameters:
           - name: context
             type: MLClientCtx
    @@ -139,24 +111,38 @@
             type: bool
             doc: Whether to return the full report or just the score and start, end index
             default: true
    -      outputs:
    -      - doc: 'A tuple of:'
    -        type: Union[Tuple[str, pd.DataFrame, dict, dict], Tuple[str, pd.DataFrame,
    -          dict]]
           lineno: 845
    +      doc: 'Walk through the input path, recognize PII in text and store the anonymized
    +        text in the output path.
    +
    +        Generate the html with different colors for each entity, json report of the
    +        explanation.'
           has_varargs: false
    -      has_kwargs: false
    +  build:
    +    base_image: mlrun/mlrun
    +    requirements:
    +    - nltk
    +    - pandas
    +    - presidio-anonymizer
    +    - presidio-analyzer
    +    - torch
    +    - flair@git+https://github.com/flairNLP/flair.git@d4ed67bf663e4066517f00397412510d90043653
    +    - st-annotated-text
    +    - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl
    +    functionSourceCode: 
    +    code_origin: ''
    +    origin_filename: ''
       description: This function is used to recognize PII in a directory of text files
    -  default_handler: recognize_pii
    +  image: ''
    +  command: ''
       disable_auto_mount: false
    -  clone_target_dir: ''
    -  env: []
    -  priority_class_name: ''
    -  preemption_mode: prevent
    -  affinity: null
    -  tolerations: null
    -  security_context: {}
    -verbose: false
    +kind: job
    +metadata:
    +  name: pii-recognizer
    +  tag: ''
    +  categories:
    +  - data-preparation
    +  - NLP
     
             
         
    diff --git a/functions/master/pii_recognizer/latest/static/item.html b/functions/master/pii_recognizer/latest/static/item.html index 1f2995a3..d16c5239 100644 --- a/functions/master/pii_recognizer/latest/static/item.html +++ b/functions/master/pii_recognizer/latest/static/item.html @@ -30,7 +30,6 @@ apiVersion: v1 categories: - - machine-learning - data-preparation - NLP description: This function is used to recognize PII in a directory of text files @@ -43,7 +42,7 @@ author: pgw maintainers: [] marketplaceType: '' -mlrunVersion: 1.4.0 +mlrunVersion: 1.7.0 name: pii-recognizer platformVersion: 3.5.3 spec: @@ -61,7 +60,7 @@ - st-annotated-text - https://huggingface.co/beki/en_spacy_pii_distilbert/resolve/main/en_spacy_pii_distilbert-any-py3-none-any.whl url: '' -version: 0.3.0 +version: 0.4.0 test_valid: False diff --git a/functions/master/pii_recognizer/latest/static/pii_recognizer.html b/functions/master/pii_recognizer/latest/static/pii_recognizer.html index 161e26b9..d271919b 100644 --- a/functions/master/pii_recognizer/latest/static/pii_recognizer.html +++ b/functions/master/pii_recognizer/latest/static/pii_recognizer.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/pyannote_audio/1.3.0/src/assets/test_data.wav b/functions/master/pyannote_audio/1.3.0/src/assets/test_data.wav new file mode 100644 index 00000000..a3a993c2 Binary files /dev/null and b/functions/master/pyannote_audio/1.3.0/src/assets/test_data.wav differ diff --git a/functions/master/pyannote_audio/1.3.0/src/function.yaml b/functions/master/pyannote_audio/1.3.0/src/function.yaml new file mode 100644 index 00000000..b4cd9ad9 --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/src/function.yaml @@ -0,0 +1,133 @@ +kind: job +spec: + command: '' + disable_auto_mount: false + image: '' + build: + code_origin: '' + requirements: + - pyannote.audio + - pyannote.core + - torchaudio + - tqdm + base_image: mlrun/mlrun-gpu + origin_filename: '' + functionSourceCode:  + default_handler: 
diarize + entry_points: + open_mpi_handler: + name: open_mpi_handler + has_varargs: false + lineno: 61 + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + has_kwargs: false + doc: '' + decorator: + name: decorator + has_varargs: false + lineno: 73 + parameters: + - name: handler + has_kwargs: false + doc: '' + wrapper: + name: wrapper + has_varargs: false + lineno: 78 + has_kwargs: true + doc: '' + diarize: + name: diarize + has_varargs: false + lineno: 139 + outputs: + - doc: 'A tuple of:' + type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]] + parameters: + - name: data_path + type: Union[str, List[str]] + doc: A directory of the audio files, a single file or a list of files to transcribe. + - name: model_name + type: str + doc: 'One of the official diarization model names (referred as diarization + pipelines) of `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".' + default: pyannote/speaker-diarization-3.0 + - name: access_token + type: str + doc: An access token to pass for using the `pyannote.audio` models. If not + provided, it will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". + If MLRun is available, it will look for a secret "HUGGING_FACE_HUB_TOKEN". + default: null + - name: device + type: str + doc: Device to load the model. Can be one of {"cuda", "cpu"}. Default will + prefer "cuda" if available. + default: null + - name: speakers_labels + type: List[str] + doc: 'Labels to use for the recognized speakers. Default: numeric labels (0, + 1, ...).' + default: null + - name: speaker_prefix + type: str + doc: 'A prefix to add for the speakers labels. This parameter is ignored if + `speakers_labels` is not None. Default: "speaker".' 
+ default: speaker_ + - name: separate_by_channels + type: bool + doc: If each speaker is speaking in a separate channel, you can diarize each + channel and combine the result into a single diarization. Each label set + in the `speakers_labels` parameter will be assigned to a specific channel + by order. + default: false + - name: minimum_speakers + type: int + doc: Set the minimum expected amount of speakers to be in the audio files. + This parameter is ignored if `speakers_labels` is not None. + default: null + - name: maximum_speakers + type: int + doc: Set the maximum expected amount of speakers to be in the audio files. + This parameter is ignored if `speakers_labels` is not None. + default: null + - name: verbose + type: bool + doc: 'Whether to present logs of a progress bar and errors. Default: True.' + default: false + has_kwargs: false + doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\ + The end result is a dictionary with the file names as keys and their diarization\ + \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\ + \nTo use the `pyannote.audio` models you must pass a Huggingface token and\ + \ get access to the required models. The\ntoken can be passed in one of the\ + \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\ + \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\ + \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\ + \ models on Huggingface, visit their page. 
For example, to use the default\ + \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\ + ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\ + * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\ + \ the recognized speakers in the diarization output you can choose one of\ + \ the following methods:\n\n* For a known speakers amount, you may set speaker\ + \ labels via the `speakers_labels` parameter that will be used in\n the order\ + \ of speaking in the audio (first person speaking be the first label in the\ + \ list). In addition, you can do\n diarization per channel (setting the parameter\ + \ `separate_by_channels` to True). Each label will be assigned to a\n specific\ + \ channel by order (first label to channel 0, second label to channel 1 and\ + \ so on). Notice, this will\n increase runtime.\n* For unknown speakers amount,\ + \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\ + \ number.\n You can also help the diarization by setting the speakers range\ + \ via the `speakers_amount_range` parameter." 
+ description: pyannote's speech diarization of audio files +metadata: + name: pyannote-audio + tag: '' + categories: + - deep-learning + - audio +verbose: false diff --git a/functions/master/pyannote_audio/1.3.0/src/item.yaml b/functions/master/pyannote_audio/1.3.0/src/item.yaml new file mode 100644 index 00000000..b6dbccdd --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/src/item.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +categories: +- deep-learning +- audio +description: pyannote's speech diarization of audio files +doc: '' +example: pyannote_audio.ipynb +generationDate: 2023-12-03:14-30 +hidden: false +icon: '' +labels: + author: guyl +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: pyannote-audio +platformVersion: 3.5.3 +spec: + filename: pyannote_audio.py + handler: diarize + image: mlrun/mlrun-gpu + kind: job + requirements: + - pyannote.audio + - pyannote.core + - torchaudio + - tqdm +url: '' +version: 1.3.0 diff --git a/functions/master/pyannote_audio/1.3.0/src/pyannote_audio.ipynb b/functions/master/pyannote_audio/1.3.0/src/pyannote_audio.ipynb new file mode 100644 index 00000000..9901cc4f --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/src/pyannote_audio.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4f17e477-db37-41b6-a76e-c69dbeea53db", + "metadata": {}, + "source": [ + "# Speech diarization example notebook" + ] + }, + { + "cell_type": "markdown", + "id": "46e7131b-42fe-4f3c-a268-08d6d4ff9cdf", + "metadata": {}, + "source": [ + "In this notebook we will utilize a call diarization capability to get per-speaker speech durations from a call recording.
    \n", + "This can be useful for quantifying participation rates in calls for things like customer service analysis.
    \n", + "\n", + "We will demonstrate this by:
    \n", + "\n", + "1. Loading in a sample call recording between multiple participants\n", + "2. Using a diarize() function to automatically detect speakers and estimate per-speaker talk time\n", + "3. Return a dictionary of described results, and a df of errors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "53d25661-15eb-40c0-8ec8-4af9838c1d04", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "68b84d16-d0aa-4e86-a716-5d92e38c9236", + "metadata": {}, + "outputs": [], + "source": [ + "# To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The\n", + "# token can be passed in one of the following options:\n", + "#\n", + "# * Use the parameter `access_token`.\n", + "# * Set an environment variable named \"HUGGING_FACE_HUB_TOKEN\".\n", + "# * If using MLRun, you can pass it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n", + "os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = <\"add your token here\">\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2a0b1f97-6fba-400f-aacf-fe1da28e35d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-05 15:28:51,758 [info] Project loaded successfully: {'project_name': 'diarization-test'}\n" + ] + } + ], + "source": [ + "# Create an mlrun project\n", + "project = mlrun.get_or_create_project(\"diarization-test\")\n", + "\n", + "# Import the function from the yaml file, once it's in the the we can import from there \n", + "speech_diarization = project.set_function(func=\"hub://speech_diarization\", name=\"speech_diarization\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "50d9a797-a3f2-4824-b6e2-8245f6e30b17", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the desired run params and files\n", + "audio_files = os.path.join(\"test_data.wav\")\n", + 
"device = \"cpu\"\n", + "speakers_labels = [\"Agent\", \"Client\"]\n", + "separate_by_channels = True" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "110080e5-3f54-4117-a61b-0e09f1422b1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-05 15:28:52,229 [info] Storing function: {'name': 'speech-diarization-diarize', 'uid': 'ec6cd014e4674966b30303ea14048acf', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    diarization-test0Dec 05 15:28:52completedspeech-diarization-diarize
    v3io_user=zeevr
    kind=local
    owner=zeevr
    host=jupyter-zeev-gpu-5995df47dc-rtpvr
    data_path
    device=cpu
    speakers_labels=['Agent', 'Client']
    separate_by_channels=True
    speech-diarization
    diarize-errors
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-05 15:28:53,350 [info] Run execution finished: {'status': 'completed', 'name': 'speech-diarization-diarize'}\n" + ] + } + ], + "source": [ + "# Run the imported function with desired file/s and params\n", + "diarize_run = speech_diarization.run(\n", + " handler=\"diarize\",\n", + " inputs={\"data_path\": audio_files},\n", + " params={\n", + " \"device\": device,\n", + " \"speakers_labels\": speakers_labels,\n", + " \"separate_by_channels\": separate_by_channels,\n", + " },\n", + " returns=[\"speech-diarization: file\", \"diarize-errors: file\"],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ede77975-8843-424f-b521-b9dd56ddad28", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/pyannote_audio/1.3.0/src/pyannote_audio.py b/functions/master/pyannote_audio/1.3.0/src/pyannote_audio.py new file mode 100644 index 00000000..6271da6a --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/src/pyannote_audio.py @@ -0,0 +1,376 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import heapq +import logging +import operator +import os +import pathlib +from functools import reduce, wraps +from typing import Any, Dict, List, Tuple, Union + +import pandas as pd +import pyannote.audio +import pyannote.core +import torch +import torchaudio +from tqdm import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." 
+ ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." 
+ ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + diarization_dictionary = reduce( + operator.ior, [dia for dia, _ in output], {} + ) + errors_dictionary = reduce(operator.ior, [err for _, err in output], {}) + return diarization_dictionary, errors_dictionary + return None + + return wrapper + + return decorator + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def diarize( + data_path: Union[str, List[str]], + model_name: str = "pyannote/speaker-diarization-3.0", + access_token: str = None, + device: str = None, + speakers_labels: List[str] = None, + speaker_prefix: str = "speaker_", + separate_by_channels: bool = False, + minimum_speakers: int = None, + maximum_speakers: int = None, + verbose: bool = False, +) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]: + """ + Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio). + The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list + of tuples: (start, end, speaker_label). + + To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The + token can be passed in one of the following options: + + * Use the parameter `access_token`. + * Set an environment variable named "HUGGING_FACE_HUB_TOKEN". 
+ * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN". + + To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set + in this function ("pyannote/speaker-diarization-3.0"), you need access for these two models: + + * https://huggingface.co/pyannote/segmentation-3.0 + * https://huggingface.co/pyannote/speaker-diarization-3.0 + + Note: To control the recognized speakers in the diarization output you can choose one of the following methods: + + * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in + the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do + diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a + specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will + increase runtime. + * For unknown speakers amount, you can set the `speaker_prefix` parameter to add a prefix for each speaker number. + You can also help the diarization by setting the speakers range via the `speakers_amount_range` parameter. + + :param data_path: A directory of the audio files, a single file or a list of files to transcribe. + :param model_name: One of the official diarization model names (referred as diarization pipelines) of + `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0". + :param access_token: An access token to pass for using the `pyannote.audio` models. If not provided, it + will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is + available, it will look for a secret "HUGGING_FACE_HUB_TOKEN". + :param device: Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if + available. + :param speakers_labels: Labels to use for the recognized speakers. 
Default: numeric labels (0, 1, ...). + :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and + combine the result into a single diarization. Each label set in the `speakers_labels` + parameter will be assigned to a specific channel by order. + :param speaker_prefix: A prefix to add for the speakers labels. This parameter is ignored if + `speakers_labels` is not None. Default: "speaker". + :param minimum_speakers: Set the minimum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param maximum_speakers: Set the maximum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + :returns: A tuple of: + + * Speech diarization dictionary. + * A dictionary of errored files that were not transcribed. + """ + global _LOGGER + + # Get the input audio files to diarize: + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + audio_files = _get_audio_files(data_path=data_path) + else: # Should be a list of files. + audio_files = data_path + + # Get the Huggingface access token: + access_token = _get_access_token(parameter=access_token) + if access_token is None: + raise ValueError( + "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed " + "via one of the following options:\n" + "* Use the parameter `access_token`.\n" + "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n" + "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'." 
+ ) + + # Load the diarization pipeline: + pipeline = pyannote.audio.Pipeline.from_pretrained( + checkpoint_path=model_name, use_auth_token=access_token + ) + + # Set the device: + device = device or ("cuda" if torch.cuda.is_available() else "cpu") + if device != "cpu": + pipeline.to(torch.device(device)) + + # Prepare the successes dataframe and errors dictionary to be returned: + diarizations = {} + errors = {} + + # Prepare the diarization keyword arguments: + diarize_kwargs = {} + if speakers_labels: + diarize_kwargs["num_speakers"] = len(speakers_labels) + else: + if minimum_speakers: + diarize_kwargs["min_speakers"] = minimum_speakers + if maximum_speakers: + diarize_kwargs["max_speakers"] = maximum_speakers + + # Go over the audio files and diarize: + for audio_file in tqdm( + audio_files, desc="Diarizing", unit="file", disable=not verbose + ): + try: + # Load audio file: + audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True) + # Get the diarization (if provided): + diarizations[audio_file.name] = _diarize( + audio=audio, + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=speakers_labels, + separate_by_channels=separate_by_channels, + speaker_prefix=speaker_prefix, + diarize_kwargs=diarize_kwargs, + ) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{audio_file.name}'") + errors[str(audio_file.name)] = str(exception) + continue + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n") + return diarizations, errors + + +def _get_audio_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. 
The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return audio_files + + +def _get_access_token(parameter: str) -> str: + # If given as a parameter, return it: + if parameter: + return parameter + + # Otherwise, look at the environment variable: + environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN") + if environment_variable: + return environment_variable + + # Lastly, try look in the set secrets in MLRun: + secret = None + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN") + except ModuleNotFoundError: + pass + + return secret + + +def _diarize( + audio: torch.Tensor, + sample_rate: int, + pipeline: pyannote.audio.Pipeline, + speakers_labels: List[str], + separate_by_channels: bool, + speaker_prefix: str, + diarize_kwargs: dict, +) -> List[Tuple[float, float, str]]: + # If there is no need for separation by channels, we diarize and return: + if not separate_by_channels: + # Diarize: + diarization: pyannote.core.Annotation = pipeline( + file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs + ) + # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring + # through the pipeline): + if speakers_labels: + given_speakers = len(speakers_labels) + found_speakers = len(set(diarization.labels())) + if given_speakers < found_speakers: + raise ValueError( + f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization " + f"recognized {found_speakers} speakers." 
+ ) + # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label + # returned is "SPEAKER_i" so we take only the index out of it): + return [ + ( + segment.start, + segment.end, + speakers_labels[int(label.split("_")[1])] + if speakers_labels + else f"{speaker_prefix}{int(label.split('_')[1])}", + ) + for segment, track, label in diarization.itertracks(yield_label=True) + ] + + # Separate to channels and diarize (we expect only one speaker per channel): + channel_diarizations = [ + _diarize( + audio=audio[channel].unsqueeze( + 0 + ), # Take channel and add a channel dimension to it. + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=[ + speakers_labels[channel] + ], # Take the channel's label only. + separate_by_channels=False, + speaker_prefix=speaker_prefix, + diarize_kwargs={"num_speakers": 1}, # Set to one speaker. + ) + for channel in range(audio.shape[0]) + ] + + # Merge the channel diarizations into a single sorted list: + return list(heapq.merge(*channel_diarizations)) diff --git a/functions/master/pyannote_audio/1.3.0/src/test_pyannote_audio.py b/functions/master/pyannote_audio/1.3.0/src/test_pyannote_audio.py new file mode 100644 index 00000000..93da5083 --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/src/test_pyannote_audio.py @@ -0,0 +1,25 @@ +import os + +import mlrun +import pytest + + +@pytest.mark.skipif("HUGGING_FACE_HUB_TOKEN" not in os.environ, reason="no token") +def test_speech_diarization(): + project = mlrun.new_project("diarization-test2") + speech_diarization = project.set_function( + func="./function.yaml", name="speech_diarization", image="mlrun/mlrun" + ) + + diarize_run = speech_diarization.run( + handler="diarize", + inputs={"data_path": os.path.join("assets", "test_data.wav")}, + params={ + "device": "cpu", + "speakers_labels": ["Agent", "Client"], + "separate_by_channels": True, + }, + returns=["speech_diarization: file", "diarize_errors: file"], + 
local=True, + ) + assert diarize_run.outputs["speech_diarization"] diff --git a/functions/master/pyannote_audio/1.3.0/static/documentation.html b/functions/master/pyannote_audio/1.3.0/static/documentation.html new file mode 100644 index 00000000..e28a243b --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/static/documentation.html @@ -0,0 +1,305 @@ + + + + + + + +pyannote_audio package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +

    pyannote_audio package

    + +
    + +
    +
    + +
    +
    +

    pyannote_audio package#

    +
    +

    Submodules#

    +
    +
    +

    pyannote_audio.pyannote_audio module#

    +
    +
    +pyannote_audio.pyannote_audio.diarize(data_path: str | List[str], model_name: str = 'pyannote/speaker-diarization-3.0', access_token: str | None = None, device: str | None = None, speakers_labels: List[str] | None = None, speaker_prefix: str = 'speaker_', separate_by_channels: bool = False, minimum_speakers: int | None = None, maximum_speakers: int | None = None, verbose: bool = False) Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]][source]#
    +

    Perform speech diarization on given audio files using pyannote-audio (pyannote/pyannote-audio). +The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list +of tuples: (start, end, speaker_label).

    +

    To use the pyannote.audio models you must pass a Huggingface token and get access to the required models. The +token can be passed in one of the following options:

    +
      +
    • Use the parameter access_token.

    • +
    • Set an environment variable named “HUGGING_FACE_HUB_TOKEN”.

    • +
    • If using MLRun, you can pass it as a secret named “HUGGING_FACE_HUB_TOKEN”.

    • +
    +

    To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set +in this function (“pyannote/speaker-diarization-3.0”), you need access for these two models:

    + +

    Note: To control the recognized speakers in the diarization output you can choose one of the following methods:

    +
      +
    • For a known speakers amount, you may set speaker labels via the speakers_labels parameter that will be used in +the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do +diarization per channel (setting the parameter separate_by_channels to True). Each label will be assigned to a +specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will +increase runtime.

    • +
    • For unknown speakers amount, you can set the speaker_prefix parameter to add a prefix for each speaker number. +You can also help the diarization by setting the speakers range via the minimum_speakers and maximum_speakers parameters.

    • +
    +
    +
    Parameters:
    +
      +
    • data_path – A directory of the audio files, a single file or a list of files to diarize.

    • +
    • model_name – One of the official diarization model names (referred as diarization pipelines) of +pyannote.audio Huggingface page. Default: “pyannote/speaker-diarization-3.0”.

    • +
    • access_token – An access token to pass for using the pyannote.audio models. If not provided, it +will be looking for the environment variable “HUGGING_FACE_HUB_TOKEN”. If MLRun is +available, it will look for a secret “HUGGING_FACE_HUB_TOKEN”.

    • +
    • device – Device to load the model. Can be one of {“cuda”, “cpu”}. Default will prefer “cuda” if +available.

    • +
    • speakers_labels – Labels to use for the recognized speakers. Default: numeric labels (0, 1, …).

    • +
    • separate_by_channels – If each speaker is speaking in a separate channel, you can diarize each channel and +combine the result into a single diarization. Each label set in the speakers_labels +parameter will be assigned to a specific channel by order.

    • +
    • speaker_prefix – A prefix to add for the speakers labels. This parameter is ignored if +speakers_labels is not None. Default: “speaker_”.

    • +
    • minimum_speakers – Set the minimum expected amount of speakers to be in the audio files. This parameter is +ignored if speakers_labels is not None.

    • +
    • maximum_speakers – Set the maximum expected amount of speakers to be in the audio files. This parameter is +ignored if speakers_labels is not None.

    • +
    • verbose – Whether to present logs of a progress bar and errors. Default: False.

    • +
    +
    +
    Returns:
    +

    A tuple of:

    +
      +
    • Speech diarization dictionary.

    • +
    • A dictionary of errored files that were not diarized.

    • +
    +

    +
    +
    +
    +
    +
    +pyannote_audio.pyannote_audio.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.3.0/static/example.html b/functions/master/pyannote_audio/1.3.0/static/example.html new file mode 100644 index 00000000..fd6ba740 --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/static/example.html @@ -0,0 +1,470 @@ + + + + + + + +Speech diarization example notebook + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    +
    +
    +
    +
    +

    Speech diarization example notebook

    + +
    +
    +
    +
    +
    + +
    +
    +

    Speech diarization example notebook#

    +

    In this notebook we will utilize a call diarization capability to get per-speaker speech durations from a call recording.
    +This can be useful for quantifying participation rates in calls for things like customer service analysis.

    +

    We will demonstrate this by:

    +
      +
    1. Loading in a sample call recording between multiple participants

    2. +
    3. Using a diarize() function to automatically detect speakers and estimate per-speaker talk time

    4. +
    5. Returning a dictionary of diarization results and a dictionary of errors

    6. +
    +
    +
    +
    import os
    +import mlrun
    +
    +
    +
    +
    +
    +
    +
    # To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
    +#    token can be passed in one of the following options:
    +#
    +#    * Use the parameter `access_token`.
    +#    * Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
    +#    * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".
    +os.environ["HUGGING_FACE_HUB_TOKEN"] = <"add your token here">
    +
    +
    +
    +
    +
    +
    +
    # Create an mlrun project
    +project = mlrun.get_or_create_project("diarization-test")
    +
    +# Import the function from the yaml file, once it's in the hub we can import from there
    +speech_diarization = project.set_function(func="hub://speech_diarization", name="speech_diarization")
    +
    +
    +
    +
    +
    > 2023-12-05 15:28:51,758 [info] Project loaded successfully: {'project_name': 'diarization-test'}
    +
    +
    +
    +
    +
    +
    +
    # Set the desired run params and files
    +audio_files = os.path.join("test_data.wav")
    +device = "cpu"
    +speakers_labels = ["Agent", "Client"]
    +separate_by_channels = True
    +
    +
    +
    +
    +
    +
    +
    # Run the imported function with desired file/s and params
    +diarize_run = speech_diarization.run(
    +    handler="diarize",
    +    inputs={"data_path": audio_files},
    +    params={
    +        "device": device,
    +        "speakers_labels": speakers_labels,
    +        "separate_by_channels": separate_by_channels,
    +    },
    +    returns=["speech-diarization: file", "diarize-errors: file"],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    > 2023-12-05 15:28:52,229 [info] Storing function: {'name': 'speech-diarization-diarize', 'uid': 'ec6cd014e4674966b30303ea14048acf', 'db': 'http://mlrun-api:8080'}
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    diarization-test0Dec 05 15:28:52completedspeech-diarization-diarize
    v3io_user=zeevr
    kind=local
    owner=zeevr
    host=jupyter-zeev-gpu-5995df47dc-rtpvr
    data_path
    device=cpu
    speakers_labels=['Agent', 'Client']
    separate_by_channels=True
    speech-diarization
    diarize-errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-12-05 15:28:53,350 [info] Run execution finished: {'status': 'completed', 'name': 'speech-diarization-diarize'}
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.3.0/static/function.html b/functions/master/pyannote_audio/1.3.0/static/function.html new file mode 100644 index 00000000..36399af3 --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/static/function.html @@ -0,0 +1,168 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +spec:
    +  command: ''
    +  disable_auto_mount: false
    +  image: ''
    +  build:
    +    code_origin: ''
    +    requirements:
    +    - pyannote.audio
    +    - pyannote.core
    +    - torchaudio
    +    - tqdm
    +    base_image: mlrun/mlrun-gpu
    +    origin_filename: ''
    +    functionSourceCode: 
    +  default_handler: diarize
    +  entry_points:
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      has_varargs: false
    +      lineno: 61
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      has_kwargs: false
    +      doc: ''
    +    decorator:
    +      name: decorator
    +      has_varargs: false
    +      lineno: 73
    +      parameters:
    +      - name: handler
    +      has_kwargs: false
    +      doc: ''
    +    wrapper:
    +      name: wrapper
    +      has_varargs: false
    +      lineno: 78
    +      has_kwargs: true
    +      doc: ''
    +    diarize:
    +      name: diarize
    +      has_varargs: false
    +      lineno: 139
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]
    +      parameters:
    +      - name: data_path
    +        type: Union[str, List[str]]
    +        doc: A directory of the audio files, a single file or a list of files to diarize.
    +      - name: model_name
    +        type: str
    +        doc: 'One of the official diarization model names (referred as diarization
    +          pipelines) of `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".'
    +        default: pyannote/speaker-diarization-3.0
    +      - name: access_token
    +        type: str
    +        doc: An access token to pass for using the `pyannote.audio` models. If not
    +          provided, it will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN".
    +          If MLRun is available, it will look for a secret "HUGGING_FACE_HUB_TOKEN".
    +        default: null
    +      - name: device
    +        type: str
    +        doc: Device to load the model. Can be one of {"cuda", "cpu"}. Default will
    +          prefer "cuda" if available.
    +        default: null
    +      - name: speakers_labels
    +        type: List[str]
    +        doc: 'Labels to use for the recognized speakers. Default: numeric labels (0,
    +          1, ...).'
    +        default: null
    +      - name: speaker_prefix
    +        type: str
    +        doc: 'A prefix to add for the speakers labels. This parameter is ignored if
    +          `speakers_labels` is not None. Default: "speaker_".'
    +        default: speaker_
    +      - name: separate_by_channels
    +        type: bool
    +        doc: If each speaker is speaking in a separate channel, you can diarize each
    +          channel and combine the result into a single diarization. Each label set
    +          in the `speakers_labels` parameter will be assigned to a specific channel
    +          by order.
    +        default: false
    +      - name: minimum_speakers
    +        type: int
    +        doc: Set the minimum expected amount of speakers to be in the audio files.
    +          This parameter is ignored if `speakers_labels` is not None.
    +        default: null
    +      - name: maximum_speakers
    +        type: int
    +        doc: Set the maximum expected amount of speakers to be in the audio files.
    +          This parameter is ignored if `speakers_labels` is not None.
    +        default: null
    +      - name: verbose
    +        type: bool
    +        doc: 'Whether to present logs of a progress bar and errors. Default: False.'
    +        default: false
    +      has_kwargs: false
    +      doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\
    +        The end result is a dictionary with the file names as keys and their diarization\
    +        \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\
    +        \nTo use the `pyannote.audio` models you must pass a Huggingface token and\
    +        \ get access to the required models. The\ntoken can be passed in one of the\
    +        \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\
    +        \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\
    +        \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\
    +        \ models on Huggingface, visit their page. For example, to use the default\
    +        \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\
    +        ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\
    +        * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\
    +        \ the recognized speakers in the diarization output you can choose one of\
    +        \ the following methods:\n\n* For a known speakers amount, you may set speaker\
    +        \ labels via the `speakers_labels` parameter that will be used in\n  the order\
    +        \ of speaking in the audio (first person speaking be the first label in the\
    +        \ list). In addition, you can do\n  diarization per channel (setting the parameter\
    +        \ `separate_by_channels` to True). Each label will be assigned to a\n  specific\
    +        \ channel by order (first label to channel 0, second label to channel 1 and\
    +        \ so on). Notice, this will\n  increase runtime.\n* For unknown speakers amount,\
    +        \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\
    +        \ number.\n  You can also help the diarization by setting the speakers range\
    +        \ via the `minimum_speakers` and `maximum_speakers` parameters."
    +  description: pyannote's speech diarization of audio files
    +metadata:
    +  name: pyannote-audio
    +  tag: ''
    +  categories:
    +  - deep-learning
    +  - audio
    +verbose: false
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.3.0/static/item.html b/functions/master/pyannote_audio/1.3.0/static/item.html new file mode 100644 index 00000000..2fbd98e6 --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/static/item.html @@ -0,0 +1,64 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- deep-learning
    +- audio
    +description: pyannote's speech diarization of audio files
    +doc: ''
    +example: pyannote_audio.ipynb
    +generationDate: 2023-12-03:14-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: guyl
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.7.0
    +name: pyannote-audio
    +platformVersion: 3.5.3
    +spec:
    +  filename: pyannote_audio.py
    +  handler: diarize
    +  image: mlrun/mlrun-gpu
    +  kind: job
    +  requirements:
    +  - pyannote.audio
    +  - pyannote.core
    +  - torchaudio
    +  - tqdm
    +url: ''
    +version: 1.3.0
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.3.0/static/pyannote_audio.html b/functions/master/pyannote_audio/1.3.0/static/pyannote_audio.html new file mode 100644 index 00000000..2d9f2aa3 --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/static/pyannote_audio.html @@ -0,0 +1,554 @@ + + + + + + + +pyannote_audio.pyannote_audio + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for pyannote_audio.pyannote_audio

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import heapq
    +import logging
    +import operator
    +import os
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import pyannote.audio
    +import pyannote.core
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +
    +[docs] +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." 
+ ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + diarization_dictionary = reduce( + operator.ior, [dia for dia, _ in output], {} + ) + errors_dictionary = reduce(operator.ior, [err for _, err in output], {}) + return diarization_dictionary, errors_dictionary + return None + + return wrapper + + return decorator
    + + + +
    +[docs] +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def diarize( + data_path: Union[str, List[str]], + model_name: str = "pyannote/speaker-diarization-3.0", + access_token: str = None, + device: str = None, + speakers_labels: List[str] = None, + speaker_prefix: str = "speaker_", + separate_by_channels: bool = False, + minimum_speakers: int = None, + maximum_speakers: int = None, + verbose: bool = False, +) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]: + """ + Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio). + The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list + of tuples: (start, end, speaker_label). + + To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The + token can be passed in one of the following options: + + * Use the parameter `access_token`. + * Set an environment variable named "HUGGING_FACE_HUB_TOKEN". + * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN". + + To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set + in this function ("pyannote/speaker-diarization-3.0"), you need access for these two models: + + * https://huggingface.co/pyannote/segmentation-3.0 + * https://huggingface.co/pyannote/speaker-diarization-3.0 + + Note: To control the recognized speakers in the diarization output you can choose one of the following methods: + + * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in + the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do + diarization per channel (setting the parameter `separate_by_channels` to True). 
Each label will be assigned to a + specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will + increase runtime. + * For unknown speakers amount, you can set the `speaker_prefix` parameter to add a prefix for each speaker number. + You can also help the diarization by setting the speakers range via the `speakers_amount_range` parameter. + + :param data_path: A directory of the audio files, a single file or a list of files to transcribe. + :param model_name: One of the official diarization model names (referred as diarization pipelines) of + `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0". + :param access_token: An access token to pass for using the `pyannote.audio` models. If not provided, it + will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is + available, it will look for a secret "HUGGING_FACE_HUB_TOKEN". + :param device: Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if + available. + :param speakers_labels: Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...). + :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and + combine the result into a single diarization. Each label set in the `speakers_labels` + parameter will be assigned to a specific channel by order. + :param speaker_prefix: A prefix to add for the speakers labels. This parameter is ignored if + `speakers_labels` is not None. Default: "speaker". + :param minimum_speakers: Set the minimum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param maximum_speakers: Set the maximum expected amount of speakers to be in the audio files. This parameter is + ignored if `speakers_labels` is not None. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. 
+ + :returns: A tuple of: + + * Speech diarization dictionary. + * A dictionary of errored files that were not transcribed. + """ + global _LOGGER + + # Get the input audio files to diarize: + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + audio_files = _get_audio_files(data_path=data_path) + else: # Should be a list of files. + audio_files = data_path + + # Get the Huggingface access token: + access_token = _get_access_token(parameter=access_token) + if access_token is None: + raise ValueError( + "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed " + "via one of the following options:\n" + "* Use the parameter `access_token`.\n" + "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n" + "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'." + ) + + # Load the diarization pipeline: + pipeline = pyannote.audio.Pipeline.from_pretrained( + checkpoint_path=model_name, use_auth_token=access_token + ) + + # Set the device: + device = device or ("cuda" if torch.cuda.is_available() else "cpu") + if device != "cpu": + pipeline.to(torch.device(device)) + + # Prepare the successes dataframe and errors dictionary to be returned: + diarizations = {} + errors = {} + + # Prepare the diarization keyword arguments: + diarize_kwargs = {} + if speakers_labels: + diarize_kwargs["num_speakers"] = len(speakers_labels) + else: + if minimum_speakers: + diarize_kwargs["min_speakers"] = minimum_speakers + if maximum_speakers: + diarize_kwargs["max_speakers"] = maximum_speakers + + # Go over the audio files and diarize: + for audio_file in tqdm( + audio_files, desc="Diarizing", unit="file", disable=not verbose + ): + try: + # Load audio file: + audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True) + # Get the diarization (if provided): + diarizations[audio_file.name] = _diarize( + audio=audio, + sample_rate=sample_rate, + pipeline=pipeline, + 
speakers_labels=speakers_labels, + separate_by_channels=separate_by_channels, + speaker_prefix=speaker_prefix, + diarize_kwargs=diarize_kwargs, + ) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{audio_file.name}'") + errors[str(audio_file.name)] = str(exception) + continue + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n") + return diarizations, errors
    + + + +def _get_audio_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return audio_files + + +def _get_access_token(parameter: str) -> str: + # If given as a parameter, return it: + if parameter: + return parameter + + # Otherwise, look at the environment variable: + environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN") + if environment_variable: + return environment_variable + + # Lastly, try look in the set secrets in MLRun: + secret = None + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN") + except ModuleNotFoundError: + pass + + return secret + + +def _diarize( + audio: torch.Tensor, + sample_rate: int, + pipeline: pyannote.audio.Pipeline, + speakers_labels: List[str], + separate_by_channels: bool, + speaker_prefix: str, + diarize_kwargs: dict, +) -> List[Tuple[float, float, str]]: + # If there is no need for separation by channels, we diarize and return: + if not separate_by_channels: + # Diarize: + diarization: pyannote.core.Annotation = pipeline( + file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs + ) + # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring + # through the pipeline): + if speakers_labels: + given_speakers = len(speakers_labels) + found_speakers = len(set(diarization.labels())) + if given_speakers < found_speakers: + raise ValueError( + f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization " + f"recognized {found_speakers} speakers." 
+ ) + # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label + # returned is "SPEAKER_i" so we take only the index out of it): + return [ + ( + segment.start, + segment.end, + speakers_labels[int(label.split("_")[1])] + if speakers_labels + else f"{speaker_prefix}{int(label.split('_')[1])}", + ) + for segment, track, label in diarization.itertracks(yield_label=True) + ] + + # Separate to channels and diarize (we expect only one speaker per channel): + channel_diarizations = [ + _diarize( + audio=audio[channel].unsqueeze( + 0 + ), # Take channel and add a channel dimension to it. + sample_rate=sample_rate, + pipeline=pipeline, + speakers_labels=[ + speakers_labels[channel] + ], # Take the channel's label only. + separate_by_channels=False, + speaker_prefix=speaker_prefix, + diarize_kwargs={"num_speakers": 1}, # Set to one speaker. + ) + for channel in range(audio.shape[0]) + ] + + # Merge the channel diarizations into a single sorted list: + return list(heapq.merge(*channel_diarizations)) +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/1.3.0/static/source.html b/functions/master/pyannote_audio/1.3.0/static/source.html new file mode 100644 index 00000000..f3f0617e --- /dev/null +++ b/functions/master/pyannote_audio/1.3.0/static/source.html @@ -0,0 +1,411 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import heapq
    +import logging
    +import operator
    +import os
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import pyannote.audio
    +import pyannote.core
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    # Check if MLRun is available, set the global logger to MLRun's:
    +    if context:
    +        _LOGGER = context.logger
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, str):
    +                    input_argument = _get_audio_files(
    +                        data_path=pathlib.Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
    +                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +            if rank == 0:
    +                # Join the outputs:
    +                context.logger.info("Collecting data from workers to root worker.")
    +                diarization_dictionary = reduce(
    +                    operator.ior, [dia for dia, _ in output], {}
    +                )
    +                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
    +                return diarization_dictionary, errors_dictionary
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def diarize(
    +    data_path: Union[str, List[str]],
    +    model_name: str = "pyannote/speaker-diarization-3.0",
    +    access_token: str = None,
    +    device: str = None,
    +    speakers_labels: List[str] = None,
    +    speaker_prefix: str = "speaker_",
    +    separate_by_channels: bool = False,
    +    minimum_speakers: int = None,
    +    maximum_speakers: int = None,
    +    verbose: bool = False,
    +) -> Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]:
    +    """
    +    Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).
    +    The end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list
    +    of tuples: (start, end, speaker_label).
    +
    +    To use the `pyannote.audio` models you must pass a Huggingface token and get access to the required models. The
    +    token can be passed in one of the following options:
    +
    +    * Use the parameter `access_token`.
    +    * Set an environment variable named "HUGGING_FACE_HUB_TOKEN".
    +    * If using MLRun, you can pass it as a secret named "HUGGING_FACE_HUB_TOKEN".
    +
    +    To get access to the models on Huggingface, visit their page. For example, to use the default diarization model set
    +    in this function ("pyannote/speaker-diarization-3.0"), you need access for these two models:
    +
    +    * https://huggingface.co/pyannote/segmentation-3.0
    +    * https://huggingface.co/pyannote/speaker-diarization-3.0
    +
    +    Note: To control the recognized speakers in the diarization output you can choose one of the following methods:
    +
    +    * For a known speakers amount, you may set speaker labels via the `speakers_labels` parameter that will be used in
    +      the order of speaking in the audio (first person speaking be the first label in the list). In addition, you can do
    +      diarization per channel (setting the parameter `separate_by_channels` to True). Each label will be assigned to a
    +      specific channel by order (first label to channel 0, second label to channel 1 and so on). Notice, this will
    +      increase runtime.
    +    * For unknown speakers amount, you can set the `speaker_prefix` parameter to add a prefix for each speaker number.
    +      You can also help the diarization by setting the speakers range via the `speakers_amount_range` parameter.
    +
    +    :param data_path:            A directory of the audio files, a single file or a list of files to transcribe.
    +    :param model_name:           One of the official diarization model names (referred as diarization pipelines) of
    +                                 `pyannote.audio` Huggingface page. Default: "pyannote/speaker-diarization-3.0".
    +    :param access_token:         An access token to pass for using the `pyannote.audio` models. If not provided, it
    +                                 will be looking for the environment variable "HUGGING_FACE_HUB_TOKEN". If MLRun is
    +                                 available, it will look for a secret "HUGGING_FACE_HUB_TOKEN".
    +    :param device:               Device to load the model. Can be one of {"cuda", "cpu"}. Default will prefer "cuda" if
    +                                 available.
    +    :param speakers_labels:      Labels to use for the recognized speakers. Default: numeric labels (0, 1, ...).
    +    :param separate_by_channels: If each speaker is speaking in a separate channel, you can diarize each channel and
    +                                 combine the result into a single diarization. Each label set in the `speakers_labels`
    +                                 parameter will be assigned to a specific channel by order.
    +    :param speaker_prefix:       A prefix to add for the speakers labels. This parameter is ignored if
    +                                 `speakers_labels` is not None. Default: "speaker".
    +    :param minimum_speakers:     Set the minimum expected amount of speakers to be in the audio files. This parameter is
    +                                 ignored if `speakers_labels` is not None.
    +    :param maximum_speakers:     Set the maximum expected amount of speakers to be in the audio files. This parameter is
    +                                 ignored if `speakers_labels` is not None.
    +    :param verbose:              Whether to present logs of a progress bar and errors. Default: True.
    +
    +    :returns: A tuple of:
    +
    +              * Speech diarization dictionary.
    +              * A dictionary of errored files that were not transcribed.
    +    """
    +    global _LOGGER
    +
    +    # Get the input audio files to diarize:
    +    if isinstance(data_path, str):
    +        data_path = pathlib.Path(data_path).absolute()
    +        audio_files = _get_audio_files(data_path=data_path)
    +    else:  # Should be a list of files.
    +        audio_files = data_path
    +
    +    # Get the Huggingface access token:
    +    access_token = _get_access_token(parameter=access_token)
    +    if access_token is None:
    +        raise ValueError(
    +            "A Huggingface access token must be provided to use `pyannote.audio` models. Access token can be passed "
    +            "via one of the following options:\n"
    +            "* Use the parameter `access_token`.\n"
    +            "* Set an environment variable named 'HUGGING_FACE_HUB_TOKEN'.\n"
    +            "* If using MLRun, you can pass it as a secret named 'HUGGING_FACE_HUB_TOKEN'."
    +        )
    +
    +    # Load the diarization pipeline:
    +    pipeline = pyannote.audio.Pipeline.from_pretrained(
    +        checkpoint_path=model_name, use_auth_token=access_token
    +    )
    +
    +    # Set the device:
    +    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    +    if device != "cpu":
    +        pipeline.to(torch.device(device))
    +
    +    # Prepare the successes dataframe and errors dictionary to be returned:
    +    diarizations = {}
    +    errors = {}
    +
    +    # Prepare the diarization keyword arguments:
    +    diarize_kwargs = {}
    +    if speakers_labels:
    +        diarize_kwargs["num_speakers"] = len(speakers_labels)
    +    else:
    +        if minimum_speakers:
    +            diarize_kwargs["min_speakers"] = minimum_speakers
    +        if maximum_speakers:
    +            diarize_kwargs["max_speakers"] = maximum_speakers
    +
    +    # Go over the audio files and diarize:
    +    for audio_file in tqdm(
    +        audio_files, desc="Diarizing", unit="file", disable=not verbose
    +    ):
    +        try:
    +            # Load audio file:
    +            audio, sample_rate = torchaudio.load(uri=audio_file, channels_first=True)
    +            # Get the diarization (if provided):
    +            diarizations[audio_file.name] = _diarize(
    +                audio=audio,
    +                sample_rate=sample_rate,
    +                pipeline=pipeline,
    +                speakers_labels=speakers_labels,
    +                separate_by_channels=separate_by_channels,
    +                speaker_prefix=speaker_prefix,
    +                diarize_kwargs=diarize_kwargs,
    +            )
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            if verbose:
    +                _LOGGER.warning(f"Error in file: '{audio_file.name}'")
    +            errors[str(audio_file.name)] = str(exception)
    +            continue
    +
    +    # Print the head of the produced dataframe and return:
    +    if verbose:
    +        _LOGGER.info(f"Done ({len(diarizations)}/{len(audio_files)})\n")
    +    return diarizations, errors
    +
    +
    +def _get_audio_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        audio_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        audio_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return audio_files
    +
    +
    +def _get_access_token(parameter: str) -> str:
    +    # If given as a parameter, return it:
    +    if parameter:
    +        return parameter
    +
    +    # Otherwise, look at the environment variable:
    +    environment_variable = os.environ.get("HUGGING_FACE_HUB_TOKEN")
    +    if environment_variable:
    +        return environment_variable
    +
    +    # Lastly, try look in the set secrets in MLRun:
    +    secret = None
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        secret = context.get_secret(key="HUGGING_FACE_HUB_TOKEN")
    +    except ModuleNotFoundError:
    +        pass
    +
    +    return secret
    +
    +
    +def _diarize(
    +    audio: torch.Tensor,
    +    sample_rate: int,
    +    pipeline: pyannote.audio.Pipeline,
    +    speakers_labels: List[str],
    +    separate_by_channels: bool,
    +    speaker_prefix: str,
    +    diarize_kwargs: dict,
    +) -> List[Tuple[float, float, str]]:
    +    # If there is no need for separation by channels, we diarize and return:
    +    if not separate_by_channels:
    +        # Diarize:
    +        diarization: pyannote.core.Annotation = pipeline(
    +            file={"waveform": audio, "sample_rate": sample_rate}, **diarize_kwargs
    +        )
    +        # Verify speakers labels (should not fail here as we set `num_speakers=len(speakers_labels)` when inferring
    +        # through the pipeline):
    +        if speakers_labels:
    +            given_speakers = len(speakers_labels)
    +            found_speakers = len(set(diarization.labels()))
    +            if given_speakers < found_speakers:
    +                raise ValueError(
    +                    f"Not enough `speakers_labels` were given. Got {given_speakers} labels but the diarization "
    +                    f"recognized {found_speakers} speakers."
    +                )
    +        # Return as a diarization list - a sorted list of tuples of start time, end time and a label (the default label
    +        # returned is "SPEAKER_i" so we take only the index out of it):
    +        return [
    +            (
    +                segment.start,
    +                segment.end,
    +                speakers_labels[int(label.split("_")[1])]
    +                if speakers_labels
    +                else f"{speaker_prefix}{int(label.split('_')[1])}",
    +            )
    +            for segment, track, label in diarization.itertracks(yield_label=True)
    +        ]
    +
    +    # Separate to channels and diarize (we expect only one speaker per channel):
    +    channel_diarizations = [
    +        _diarize(
    +            audio=audio[channel].unsqueeze(
    +                0
    +            ),  # Take channel and add a channel dimension to it.
    +            sample_rate=sample_rate,
    +            pipeline=pipeline,
    +            speakers_labels=[
    +                speakers_labels[channel]
    +            ],  # Take the channel's label only.
    +            separate_by_channels=False,
    +            speaker_prefix=speaker_prefix,
    +            diarize_kwargs={"num_speakers": 1},  # Set to one speaker.
    +        )
    +        for channel in range(audio.shape[0])
    +    ]
    +
    +    # Merge the channel diarizations into a single sorted list:
    +    return list(heapq.merge(*channel_diarizations))
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/pyannote_audio/latest/src/function.yaml b/functions/master/pyannote_audio/latest/src/function.yaml index 30870afa..b4cd9ad9 100644 --- a/functions/master/pyannote_audio/latest/src/function.yaml +++ b/functions/master/pyannote_audio/latest/src/function.yaml @@ -1,86 +1,53 @@ kind: job -metadata: - name: pyannote-audio - tag: '' - hash: aed670a0534ebf30690dd2af7acad35595c7d5b1 - project: '' - labels: - author: guyl - categories: - - deep-learning - - huggingface - - audio spec: command: '' - args: [] + disable_auto_mount: false image: '' build: - functionSourceCode:  - base_image: mlrun/mlrun-gpu - commands: [] code_origin: '' - origin_filename: '' requirements: - pyannote.audio - pyannote.core - torchaudio - tqdm + base_image: mlrun/mlrun-gpu + origin_filename: '' + functionSourceCode:  + default_handler: diarize entry_points: open_mpi_handler: name: open_mpi_handler - doc: '' + has_varargs: false + lineno: 61 parameters: - name: worker_inputs type: List[str] - name: root_worker_inputs type: Dict[str, Any] default: null - outputs: [] - lineno: 61 - has_varargs: false has_kwargs: false + doc: '' decorator: name: decorator - doc: '' + has_varargs: false + lineno: 73 parameters: - name: handler - outputs: [] - lineno: 73 - has_varargs: false has_kwargs: false + doc: '' wrapper: name: wrapper - doc: '' - parameters: [] - outputs: [] - lineno: 78 has_varargs: false + lineno: 78 has_kwargs: true + doc: '' diarize: name: diarize - doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\ - The end result is a dictionary with the file names as keys and their diarization\ - \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\ - \nTo use the `pyannote.audio` models you must pass a Huggingface token and\ - \ get access to the required models. 
The\ntoken can be passed in one of the\ - \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\ - \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\ - \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\ - \ models on Huggingface, visit their page. For example, to use the default\ - \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\ - ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\ - * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\ - \ the recognized speakers in the diarization output you can choose one of\ - \ the following methods:\n\n* For a known speakers amount, you may set speaker\ - \ labels via the `speakers_labels` parameter that will be used in\n the order\ - \ of speaking in the audio (first person speaking be the first label in the\ - \ list). In addition, you can do\n diarization per channel (setting the parameter\ - \ `separate_by_channels` to True). Each label will be assigned to a\n specific\ - \ channel by order (first label to channel 0, second label to channel 1 and\ - \ so on). Notice, this will\n increase runtime.\n* For unknown speakers amount,\ - \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\ - \ number.\n You can also help the diarization by setting the speakers range\ - \ via the `speakers_amount_range` parameter." + has_varargs: false + lineno: 139 + outputs: + - doc: 'A tuple of:' + type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]] parameters: - name: data_path type: Union[str, List[str]] @@ -132,20 +99,35 @@ spec: type: bool doc: 'Whether to present logs of a progress bar and errors. Default: True.' 
default: false - outputs: - - doc: 'A tuple of:' - type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]] - lineno: 139 - has_varargs: false has_kwargs: false + doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\ + The end result is a dictionary with the file names as keys and their diarization\ + \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\ + \nTo use the `pyannote.audio` models you must pass a Huggingface token and\ + \ get access to the required models. The\ntoken can be passed in one of the\ + \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\ + \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\ + \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\ + \ models on Huggingface, visit their page. For example, to use the default\ + \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\ + ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\ + * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\ + \ the recognized speakers in the diarization output you can choose one of\ + \ the following methods:\n\n* For a known speakers amount, you may set speaker\ + \ labels via the `speakers_labels` parameter that will be used in\n the order\ + \ of speaking in the audio (first person speaking be the first label in the\ + \ list). In addition, you can do\n diarization per channel (setting the parameter\ + \ `separate_by_channels` to True). Each label will be assigned to a\n specific\ + \ channel by order (first label to channel 0, second label to channel 1 and\ + \ so on). 
Notice, this will\n increase runtime.\n* For unknown speakers amount,\ + \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\ + \ number.\n You can also help the diarization by setting the speakers range\ + \ via the `speakers_amount_range` parameter." description: pyannote's speech diarization of audio files - default_handler: diarize - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} +metadata: + name: pyannote-audio + tag: '' + categories: + - deep-learning + - audio verbose: false diff --git a/functions/master/pyannote_audio/latest/src/item.yaml b/functions/master/pyannote_audio/latest/src/item.yaml index b69add9e..b6dbccdd 100644 --- a/functions/master/pyannote_audio/latest/src/item.yaml +++ b/functions/master/pyannote_audio/latest/src/item.yaml @@ -1,7 +1,6 @@ apiVersion: v1 categories: - deep-learning -- huggingface - audio description: pyannote's speech diarization of audio files doc: '' @@ -13,7 +12,7 @@ labels: author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: pyannote-audio platformVersion: 3.5.3 spec: @@ -27,4 +26,4 @@ spec: - torchaudio - tqdm url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/functions/master/pyannote_audio/latest/static/documentation.html b/functions/master/pyannote_audio/latest/static/documentation.html index 8c247d59..e28a243b 100644 --- a/functions/master/pyannote_audio/latest/static/documentation.html +++ b/functions/master/pyannote_audio/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/pyannote_audio/latest/static/example.html b/functions/master/pyannote_audio/latest/static/example.html index 47ad38d5..fd6ba740 100644 --- a/functions/master/pyannote_audio/latest/static/example.html +++ b/functions/master/pyannote_audio/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git 
a/functions/master/pyannote_audio/latest/static/function.html b/functions/master/pyannote_audio/latest/static/function.html index bf24893b..36399af3 100644 --- a/functions/master/pyannote_audio/latest/static/function.html +++ b/functions/master/pyannote_audio/latest/static/function.html @@ -29,88 +29,55 @@
             
     kind: job
    -metadata:
    -  name: pyannote-audio
    -  tag: ''
    -  hash: aed670a0534ebf30690dd2af7acad35595c7d5b1
    -  project: ''
    -  labels:
    -    author: guyl
    -  categories:
    -  - deep-learning
    -  - huggingface
    -  - audio
     spec:
       command: ''
    -  args: []
    +  disable_auto_mount: false
       image: ''
       build:
    -    functionSourceCode: 
    -    base_image: mlrun/mlrun-gpu
    -    commands: []
         code_origin: ''
    -    origin_filename: ''
         requirements:
         - pyannote.audio
         - pyannote.core
         - torchaudio
         - tqdm
    +    base_image: mlrun/mlrun-gpu
    +    origin_filename: ''
    +    functionSourceCode: 
    +  default_handler: diarize
       entry_points:
         open_mpi_handler:
           name: open_mpi_handler
    -      doc: ''
    +      has_varargs: false
    +      lineno: 61
           parameters:
           - name: worker_inputs
             type: List[str]
           - name: root_worker_inputs
             type: Dict[str, Any]
             default: null
    -      outputs: []
    -      lineno: 61
    -      has_varargs: false
           has_kwargs: false
    +      doc: ''
         decorator:
           name: decorator
    -      doc: ''
    +      has_varargs: false
    +      lineno: 73
           parameters:
           - name: handler
    -      outputs: []
    -      lineno: 73
    -      has_varargs: false
           has_kwargs: false
    +      doc: ''
         wrapper:
           name: wrapper
    -      doc: ''
    -      parameters: []
    -      outputs: []
    -      lineno: 78
           has_varargs: false
    +      lineno: 78
           has_kwargs: true
    +      doc: ''
         diarize:
           name: diarize
    -      doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\
    -        The end result is a dictionary with the file names as keys and their diarization\
    -        \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\
    -        \nTo use the `pyannote.audio` models you must pass a Huggingface token and\
    -        \ get access to the required models. The\ntoken can be passed in one of the\
    -        \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\
    -        \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\
    -        \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\
    -        \ models on Huggingface, visit their page. For example, to use the default\
    -        \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\
    -        ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\
    -        * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\
    -        \ the recognized speakers in the diarization output you can choose one of\
    -        \ the following methods:\n\n* For a known speakers amount, you may set speaker\
    -        \ labels via the `speakers_labels` parameter that will be used in\n  the order\
    -        \ of speaking in the audio (first person speaking be the first label in the\
    -        \ list). In addition, you can do\n  diarization per channel (setting the parameter\
    -        \ `separate_by_channels` to True). Each label will be assigned to a\n  specific\
    -        \ channel by order (first label to channel 0, second label to channel 1 and\
    -        \ so on). Notice, this will\n  increase runtime.\n* For unknown speakers amount,\
    -        \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\
    -        \ number.\n  You can also help the diarization by setting the speakers range\
    -        \ via the `speakers_amount_range` parameter."
    +      has_varargs: false
    +      lineno: 139
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]
           parameters:
           - name: data_path
             type: Union[str, List[str]]
    @@ -162,22 +129,37 @@
             type: bool
             doc: 'Whether to present logs of a progress bar and errors. Default: True.'
             default: false
    -      outputs:
    -      - doc: 'A tuple of:'
    -        type: Tuple[Dict[str, List[Tuple[float, float, str]]], Dict[str, str]]
    -      lineno: 139
    -      has_varargs: false
           has_kwargs: false
    +      doc: "Perform speech diarization on given audio files using pyannote-audio (https://github.com/pyannote/pyannote-audio).\n\
    +        The end result is a dictionary with the file names as keys and their diarization\
    +        \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\
    +        \nTo use the `pyannote.audio` models you must pass a Huggingface token and\
    +        \ get access to the required models. The\ntoken can be passed in one of the\
    +        \ following options:\n\n* Use the parameter `access_token`.\n* Set an environment\
    +        \ variable named \"HUGGING_FACE_HUB_TOKEN\".\n* If using MLRun, you can pass\
    +        \ it as a secret named \"HUGGING_FACE_HUB_TOKEN\".\n\nTo get access to the\
    +        \ models on Huggingface, visit their page. For example, to use the default\
    +        \ diarization model set\nin this function (\"pyannote/speaker-diarization-3.0\"\
    +        ), you need access for these two models:\n\n* https://huggingface.co/pyannote/segmentation-3.0\n\
    +        * https://huggingface.co/pyannote/speaker-diarization-3.0\n\nNote: To control\
    +        \ the recognized speakers in the diarization output you can choose one of\
    +        \ the following methods:\n\n* For a known speakers amount, you may set speaker\
    +        \ labels via the `speakers_labels` parameter that will be used in\n  the order\
    +        \ of speaking in the audio (first person speaking be the first label in the\
    +        \ list). In addition, you can do\n  diarization per channel (setting the parameter\
    +        \ `separate_by_channels` to True). Each label will be assigned to a\n  specific\
    +        \ channel by order (first label to channel 0, second label to channel 1 and\
    +        \ so on). Notice, this will\n  increase runtime.\n* For unknown speakers amount,\
    +        \ you can set the `speaker_prefix` parameter to add a prefix for each speaker\
    +        \ number.\n  You can also help the diarization by setting the speakers range\
    +        \ via the `speakers_amount_range` parameter."
       description: pyannote's speech diarization of audio files
    -  default_handler: diarize
    -  disable_auto_mount: false
    -  clone_target_dir: ''
    -  env: []
    -  priority_class_name: ''
    -  preemption_mode: prevent
    -  affinity: null
    -  tolerations: null
    -  security_context: {}
    +metadata:
    +  name: pyannote-audio
    +  tag: ''
    +  categories:
    +  - deep-learning
    +  - audio
     verbose: false
     
             
    diff --git a/functions/master/pyannote_audio/latest/static/item.html b/functions/master/pyannote_audio/latest/static/item.html
    index b3e1fd35..2fbd98e6 100644
    --- a/functions/master/pyannote_audio/latest/static/item.html
    +++ b/functions/master/pyannote_audio/latest/static/item.html
    @@ -31,7 +31,6 @@
     apiVersion: v1
     categories:
     - deep-learning
    -- huggingface
     - audio
     description: pyannote's speech diarization of audio files
     doc: ''
    @@ -43,7 +42,7 @@
       author: guyl
     maintainers: []
     marketplaceType: ''
    -mlrunVersion: 1.5.2
    +mlrunVersion: 1.7.0
     name: pyannote-audio
     platformVersion: 3.5.3
     spec:
    @@ -57,7 +56,7 @@
       - torchaudio
       - tqdm
     url: ''
    -version: 1.2.0
    +version: 1.3.0
     
             
         
    diff --git a/functions/master/pyannote_audio/latest/static/pyannote_audio.html b/functions/master/pyannote_audio/latest/static/pyannote_audio.html index b73f1595..2d9f2aa3 100644 --- a/functions/master/pyannote_audio/latest/static/pyannote_audio.html +++ b/functions/master/pyannote_audio/latest/static/pyannote_audio.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/question_answering/0.5.0/src/data/test-data.txt b/functions/master/question_answering/0.5.0/src/data/test-data.txt new file mode 100644 index 00000000..efe6b646 --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/data/test-data.txt @@ -0,0 +1 @@ +The apple color is red. \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/src/function.yaml b/functions/master/question_answering/0.5.0/src/function.yaml new file mode 100644 index 00000000..21f741aa --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/function.yaml @@ -0,0 +1,197 @@ +metadata: + name: question-answering + tag: '' + categories: + - genai +verbose: false +kind: job +spec: + command: '' + default_handler: answer_questions + build: + origin_filename: '' + base_image: mlrun/mlrun + requirements: + - transformers + - torch + - tqdm + code_origin: '' + functionSourceCode:  + entry_points: + open_mpi_handler: + name: open_mpi_handler + has_varargs: false + doc: '' + lineno: 58 + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + has_kwargs: false + decorator: + name: decorator + has_varargs: false + doc: '' + lineno: 66 + parameters: + - name: handler + has_kwargs: false + wrapper: + name: wrapper + has_varargs: false + doc: '' + lineno: 71 + has_kwargs: true + answer_questions: + outputs: + - doc: 'A tuple of:' + type: Tuple[pd.DataFrame, dict] + name: answer_questions + has_varargs: false + doc: 'Answer questions with a context to the given text files contents by a + pretrained LLM model. 
Each text file will have + + the following prompt built: + + + start of `text_wrapper` + + + + end of `text_wrapper` + + + start of `questions_wrapper` + + 1. + + 2. + + ... + + n. + + end of `questions_wrapper`' + lineno: 130 + parameters: + - name: data_path + type: Union[str, List[str]] + doc: A path to a directory of text files or a path to a text file to ask questions + about. + - name: model_name + type: str + doc: The pre-trained model name from the huggingface hub to use for asking + questions. + - name: questions + type: Union[List[str], List[List[str]]] + doc: The questions to ask. A list of lists of questions to ask per text file, + and devided by question groups, the groups can be dtermained by size (in + order to avoid large inputs to the llm) or by questioning method (regular + or poll like questioning). + - name: device_map + type: Union[str, dict] + doc: A map to use for loading the model on multiple devices. + default: null + - name: model_kwargs + type: dict + doc: Keyword arguments to pass for loading the model using HuggingFace's `transformers.AutoModelForCausalLM.from_pretrained` + function. + default: null + - name: auto_gptq_exllama_max_input_length + type: int + doc: For AutoGPTQ models to set and extend the model's input buffer size. + default: null + - name: tokenizer_name + type: str + doc: The tokenizer name from the huggingface hub to use. If not given, the + model name will be used. + default: null + - name: tokenizer_kwargs + type: dict + doc: Keyword arguments to pass for loading the tokenizer using HuggingFace's + `transformers.AutoTokenizer.from_pretrained` function. + default: null + - name: text_wrapper + type: Union[str, List[str]] + doc: A wrapper for the file's text. Will be added at the start of the prompt. + Must have a placeholder ('{}') for the text of the file. + default: '' + - name: questions_wrapper + type: Union[str, List[str]] + doc: A wrapper for the questions received. 
Will be added after the text wrapper + in the prompt template. Must have a placeholder ('{}') for the questions. + default: '' + - name: generation_config + type: Union[Dict, List[Dict]] + doc: HuggingFace's `GenerationConfig` keyword arguments to pass to the `generate` + method. + default: null + - name: questions_config + type: Union[Dict, List[Dict]] + doc: A dictionary or list of dictionaries containing specific ways to answer + questions (using a poll for example), each dictionary in the list is for + corresponding question group and determines the question asking method for + said group. + default: null + - name: batch_size + type: int + doc: Batch size for inference. + default: 1 + - name: questions_columns + type: List[str] + doc: Columns to use for the dataframe returned. + default: null + - name: verbose + type: bool + doc: 'Whether to present logs of a progress bar and errors. Default: True.' + default: false + has_kwargs: false + answer: + outputs: + - type: List[List[str]] + name: answer + has_varargs: false + doc: Answer questions with a context to the given text files contents by a pretrained + LLM model in given pipeline. + lineno: 674 + parameters: + - name: self + - name: questions_amount + type: int + - name: batched_input + type: List[str] + - name: generation_pipeline + type: Pipeline + - name: generation_config + type: GenerationConfig + has_kwargs: false + most_common: + name: most_common + has_varargs: false + doc: Calculate the most common answer for a given list of answers. + lineno: 637 + parameters: + - name: answers + has_kwargs: false + average: + name: average + has_varargs: false + doc: Calculate the average answer for a given list of answers. + lineno: 646 + parameters: + - name: answers + has_kwargs: false + do: + name: do + has_varargs: false + doc: Perform the strategy. 
+ lineno: 662 + parameters: + - name: self + - name: answers + has_kwargs: false + image: '' + description: GenAI approach of question answering on a given data + disable_auto_mount: false diff --git a/functions/master/question_answering/0.5.0/src/item.yaml b/functions/master/question_answering/0.5.0/src/item.yaml new file mode 100755 index 00000000..741bab80 --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/item.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +categories: +- genai +description: GenAI approach of question answering on a given data +doc: '' +example: question_answering.ipynb +generationDate: 2023-08-07:11-30 +hidden: false +icon: '' +labels: + author: yonish +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: question_answering +platformVersion: 3.5.0 +spec: + filename: question_answering.py + handler: answer_questions + image: mlrun/mlrun + kind: job + requirements: + - transformers + - torch + - tqdm +url: '' +version: 0.5.0 diff --git a/functions/master/question_answering/0.5.0/src/question_answering.ipynb b/functions/master/question_answering/0.5.0/src/question_answering.ipynb new file mode 100644 index 00000000..7c506688 --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/question_answering.ipynb @@ -0,0 +1,903 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "75860292-80d3-4dfb-89e4-66579321c78b", + "metadata": {}, + "source": [ + "# Question Answering" + ] + }, + { + "cell_type": "markdown", + "id": "4593a39d-6e91-4f92-9e7e-09dcd7dbcab7", + "metadata": {}, + "source": [ + "## Short description and explenation" + ] + }, + { + "cell_type": "markdown", + "id": "14dc0595-8b8a-4a13-b6a7-2a1bc43d8d50", + "metadata": {}, + "source": [ + "This function enables ad-hoc question answering over documents by ingesting text into a language model and returning formatted responses.
    \n", + "It accepts:
    \n", + "\n", + "* A language model
    \n", + "* Text files with content
    \n", + "* Questions to answer
    \n", + "* More inputs can be given for configuration
    \n", + "\n", + "The model processes the files to build understanding. Questions posed are then answered in one of two modes:\n", + "\n", + "Default mode:
    \n", + "The model directly answers each question using its own capabilities.\n", + "\n", + "Poll mode:
    \n", + "Additional models are included to separately answer each question. An aggregation algorithm determines the best response through consensus between models.
    \n", + "Two options exist for consensus methodology:
    \n", + "\n", + "Average Answer:
    \n", + "Each model's answer is scored. The response with the average highest score amongst models is selected. Useful for numeric or ranked responses.\n", + "\n", + "Most Common Answer:
    The answer that occurs most frequently across models is selected. Useful for textual responses to avoid outliers.\n", + "\n", + "Using multiple models via the poll mode provides accuracy improvements for questions lacking definitive answers, as it refines responses through an ensemble process.
    " + ] + }, + { + "cell_type": "markdown", + "id": "ae957ac3-2c26-4a0b-8e44-8315caeb2953", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "id": "3a351565-6f2c-4fa3-a024-4b5d658db311", + "metadata": {}, + "source": [ + "At the core, advanced natural language processing (NLP) models called foundation models are being leveraged to read and comprehend the input text files.
    \n", + "Specifically, models such as GPT-3 or Codex from OpenAI are used as the base language model.\n", + "\n", + "When documents are fed into the function, the background process invokes these models to ingest and digest the information.
    \n", + "\n", + "This provides the knowledge base for the models to then offer informed answers tailored to any queries about the documents.
    \n", + "The parameters controlling model size and computation time provide tradeoffs between cost, speed, and sophistication of comprehension.\n", + "\n", + "Additionally, the poll option expands on a single model by sampling responses from a number of models as mentioned above.
    " + ] + }, + { + "cell_type": "markdown", + "id": "a6fc4aaa-530a-4e9e-8447-737a0cfd6ed5", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "markdown", + "id": "685d9000-37e1-462b-93c8-1bbfcdf6aaa1", + "metadata": {}, + "source": [ + "`transformers`
    \n", + "`torch`
    \n", + "`tqdm`
    " + ] + }, + { + "cell_type": "markdown", + "id": "73d9b369-1c36-42e8-b106-491ad911f281", + "metadata": {}, + "source": [ + "## Documentation" + ] + }, + { + "cell_type": "markdown", + "id": "68e3a54d-0cd9-4845-ae14-f24068052bf3", + "metadata": {}, + "source": [ + "`data_path`: A path to a directory of text files or a path to a text file to ask questions about.
    \n", + "\n", + "`model_name`: The pre-trained model name from the huggingface hub to use for answering questions.
    \n", + "\n", + "`questions`: The questions to ask. A list of lists of questions to ask per text file, and devided
    \n", + "\n", + "`questions`: The questions to ask. A list of lists of questions to ask per text file, and divided
    \n", + " by question groups, the groups can be determined by size (in order to
    \n", + " \n", + "`device_map`: A map to use for loading the model on multiple devices.
    \n", + "\n", + "`model_kwargs`: Keyword arguments to pass for loading the model using HuggingFace's
    \n", + " _transformers.AutoModelForCausalLM.from_pretrained_ function.
    \n", + " \n", + "`auto_gptq_exllama_max_input_length`: For AutoGPTQ models to set and extend the model's input buffer size.
    \n", + "\n", + "`tokenizer_name`: The tokenizer name from the huggingface hub to use. If not given, the given model name will be used.
    \n", + " \n", + "`tokenizer_kwargs`: Keyword arguments to pass for loading the tokenizer using HuggingFace's
    \n", + " _transformers.AutoTokenizer.from_pretrained_ function.
    \n", + " \n", + "`text_wrapper`: Must have a placeholder ('{}') for the text of the file.
    \n", + "\n", + "`questions_wrapper`: A wrapper for the questions received. Will be added after the text wrapper in the prompt template.
    \n", + " Must have a placeholder ('{}') for the questions.
    \n", + " \n", + "`generation_config`: HuggingFace's _GenerationConfig_ keyword arguments to pass to the _generate_ method.
    \n", + " \n", + "`questions_config`: A dictionary or list of dictionaries containing specific ways to answer questions (using a poll for example),
    \n", + " each dictionary in the list is for corresponding question group and determines the question asking method
    \n", + " for said group.
    \n", + " \n", + "`batch_size`: Batch size for inference.
    \n", + "\n", + "`questions_columns`: Columns to use for the dataframe returned.
    \n", + "\n", + "`verbose`: Whether to present logs of a progress bar and errors. Default: True.
    \n" + ] + }, + { + "cell_type": "markdown", + "id": "716e5fac-3def-4cdd-8ca5-d1c93ee64f2e", + "metadata": {}, + "source": [ + "## Demo 1" + ] + }, + { + "cell_type": "markdown", + "id": "3bf4bc9b-fc5e-4155-8563-0575c22cef05", + "metadata": {}, + "source": [ + "This is a short and simple example to show the basic use of the function." + ] + }, + { + "cell_type": "markdown", + "id": "c95dcfdb-22e1-4b82-b0a3-9c89487a216f", + "metadata": {}, + "source": [ + "### (1.) Import the function (import mlrun, set project and import function)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60161e5f-468c-47c9-be98-e6554b899c9c", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "import transformers\n", + "import tempfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1267b60b-35d1-48bf-8ea0-dfe7a5f366e7", + "metadata": {}, + "outputs": [], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"call-center-demo-1\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " parameters={\n", + " \"default_image\": \"mlrun/mlrun\",\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c8b39b-433c-40b8-9260-94923c9cbb6c", + "metadata": {}, + "outputs": [], + "source": [ + "func = project.set_function(\n", + " \"question-answering.py\",\n", + " name=\"question-answering\",\n", + " kind=\"job\",\n", + " handler=\"answer_questions\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "b9744a13-6530-4aa0-a30c-a88db94ce853", + "metadata": {}, + "source": [ + "We create a text file that the model can be asked about" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "503b874a-0c64-4a66-9b30-fe99191b5fd3", + "metadata": {}, + "outputs": [], + "source": [ + "def _make_data_dir_for_test():\n", + " data_dir = tempfile.mkdtemp()\n", + " # The information the model will need in order to answer our question\n", + " content 
= \"The apple is red.\"\n", + " with open(data_dir + \"/test_data.txt\", \"w\") as f:\n", + " f.write(content)\n", + " return data_dir" + ] + }, + { + "cell_type": "markdown", + "id": "7fadd06e-210b-45aa-b7ea-686058b6e7f4", + "metadata": {}, + "source": [ + "### (2.) Usage\n", + "Then we set where to take the path to the text file we want to ask about, the questions, and column name for the answer table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a634b19-d809-4436-bbdd-469fc1d61c6e", + "metadata": {}, + "outputs": [], + "source": [ + "input_path = _make_data_dir_for_test()\n", + "# The question for the model to answer\n", + "question = [\"What is the color of the apple?\"]\n", + "# The column of the answer in the data frame returned by the function\n", + "column_name = [\"color\"]" + ] + }, + { + "cell_type": "markdown", + "id": "0364ce68-079e-4769-89b6-661fcdc1d475", + "metadata": {}, + "source": [ + "Now we run the function with all the parameters we prepered earlier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "448bada9-8b52-4175-9839-ecb409ab3e35", + "metadata": {}, + "outputs": [], + "source": [ + "demo1_run = func.run(\n", + " handler=\"answer_questions\",\n", + " params={\n", + " \"model\": \"distilgpt2\",\n", + " \"input_path\": input_path,\n", + " \"questions\": question,\n", + " \"questions_columns\": column_name,\n", + " \"generation_config\": {\n", + " \"do_sample\": True,\n", + " \"temperature\": 0.8,\n", + " \"top_p\": 0.9,\n", + " \"early_stopping\": True,\n", + " \"max_new_tokens\": 20,\n", + " },\n", + " },\n", + " returns=[\n", + " \"question_answering_df: dataset\",\n", + " \"question_answering_errors: result\",\n", + " ],\n", + " local=True,\n", + " artifact_path=\"./\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "474505db-2fc8-48fd-a634-2bada802a449", + "metadata": {}, + "source": [ + "### (3.) 
Review results\n", + "and after the run is finished we can take a look and see our answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4560b51d-5f96-465d-9826-e88c7d4d46aa", + "metadata": {}, + "outputs": [], + "source": [ + "demo1_run.outputs" + ] + }, + { + "cell_type": "markdown", + "id": "31a401a5-2f8a-427f-bf62-2f31f94f5ee7", + "metadata": {}, + "source": [ + "## Demo 2" + ] + }, + { + "cell_type": "markdown", + "id": "503b8a40-ad61-445f-900b-4fdaa036e417", + "metadata": {}, + "source": [ + "This is a much larger example, we will show how we use this function to analyze a number of calls between agents and customers of an internet company (all the data is generated by Iguazio).
    \n", + "For something like this, we recommend using a strong model, and putting some time into making the prompts." + ] + }, + { + "cell_type": "markdown", + "id": "759c521b-df3d-498f-8642-863182107618", + "metadata": {}, + "source": [ + "### (1.) Import the function (import mlrun, set project and import function)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bde6a480-a3d9-4b8c-a9c0-daa235f0f0c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}\n" + ] + } + ], + "source": [ + "import os\n", + "import mlrun\n", + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "187a4643-53e9-40bb-a337-5096df7946d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"call-center-demo-2\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " parameters={\n", + " \"default_image\": \"mlrun/mlrun\",\n", + " })\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "17eb7783-9ced-482b-9bdf-c41e55995faf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "func = project.set_function(\n", + " \"question-answering.py\",\n", + " name=\"question-answering\",\n", + 
" kind=\"job\",\n", + " handler=\"answer_questions\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "91d3ebb2-7d4a-4e52-89ed-45287c06eb76", + "metadata": {}, + "source": [ + "### (2.) Usage\n", + "\n", + "This example is a bit more complicated as we mentioned, we give the model a list of questions, for some of them we give the model a list of answers to choose from." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2bc065e4-2dbf-4d7a-9772-6b7039f428bc", + "metadata": {}, + "outputs": [], + "source": [ + "QUESTIONS = [\n", + " \"1. Write a long summary of the text, focus on the topic (max 50 words).\",\n", + " \"2. Was the Client's concern addressed, (choose only one) [Yes, No]?\",\n", + " ]\n", + "\n", + "qa_questions_columns = [\n", + " \"Summary\",\n", + " \"is_fixed\",\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "id": "aa89f316-0d1b-4ada-9990-d2293546eee3", + "metadata": {}, + "source": [ + "Another thing we give the model this time is answer examples (one/few shot answering), this can be done to show the model how you want the answer to be structured or caculated.
    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbc093ad-dab4-46a1-b36a-2a7551cef018", + "metadata": {}, + "outputs": [], + "source": [ + "# For every file we ask about, the model will be presented with this example of a call and how we want the answers.\n", + "DEMO_CALL = (\n", + " \"Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist \"\n", + " \"you today?\\n\"\n", + " \"Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\\n\"\n", + " \"Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears \"\n", + " \"there was an error in the charges. I apologize for the inconvenience.\\n\"\n", + " \"Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\\n\"\n", + " \"Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department \"\n", + " \"for investigation and correction. You should see the adjustments on your next statement.\\n\"\n", + " \"Customer: That sounds good, Megan. I appreciate your help.\\n\"\n", + " \"Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\\n\"\n", + ")\n", + "\n", + "DEMO_ANSWERS = (\n", + " \"1. The customer, contacted the call center regarding billing discrepancies on her statement. The agent, \"\n", + " \"acknowledged the issue, assured The customer it would be resolved, and escalated it to the billing department for \"\n", + " \"correction.\\n\"\n", + " \"2. Yes.\\n\"" + ] + }, + { + "cell_type": "markdown", + "id": "8b44ded3-fee3-4911-a02a-6a51a62a7020", + "metadata": {}, + "source": [ + "Then we need to wrap it all nicely to be given to the model as a single prompt, this is done with a text wrapper, and a question wrapper.
    \n", + "Both of them will be concatenated inside the function with the questions and passed to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2108f5aa-75a6-402d-83a6-bf45f0d7223a", + "metadata": {}, + "outputs": [], + "source": [ + "# The wrappers are built according to the model's convensions to improve result\n", + "TEXT_WRAPPER = (\n", + " f\"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " f\"{DEMO_CALL}\\n\"\n", + " f\"answer the questions as accurately as you can:\\n\"\n", + " f\"{QUESTIONS}<|im_end|>\\n\"\n", + " f\"<|im_start|>assistant:\\n\"\n", + " f\"{DEMO_ANSWERS}<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " \"{}\"\n", + ") \n", + "QUESTIONS_WRAPPER = (\n", + " \" answer the given questions as accurately as you can, do not write more answers the questions:\\n\"\n", + " \"{}<|im_end|>\\n\"\n", + " \"<|im_start|>assistant:\\n\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1a44b391-87d2-447d-aafa-66ed45f06ba5", + "metadata": {}, + "source": [ + "The last few parameters we need to set are the model we will use, the input length (not available for all models) and the batch size.
    \n", + "The batch size determines how many files we want to process at each epoch, and the larger we go the faster the process will be, as long as our memory is sufficient. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "528cae4c-541b-49a3-b24d-deb94f7130fb", + "metadata": {}, + "outputs": [], + "source": [ + "# We like this version of mistral's model, which is small and fast but also gives great results\n", + "qa_model = \"TheBloke/Mistral-7B-OpenOrca-GPTQ\"" + ] + }, + { + "cell_type": "markdown", + "id": "47fa4eaa-f3b0-457f-b98a-18a8ee5ba4d8", + "metadata": {}, + "source": [ + "Finally, we run the function with all the parameters we prepared. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d200706-e852-4ce9-9b9a-61686b30e5b7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Question answering:\n", + "demo2_run = func.run(\n", + " function=\"question-answering\",\n", + " local=True,\n", + " handler=\"answer_questions\",\n", + " inputs={\"data_path\": os.path.abspath(\"./calls\")},\n", + " params={\n", + " \"model_name\": qa_model,\n", + " \"device_map\": \"auto\",\n", + " \"text_wrapper\":TEXT_WRAPPER,\n", + " \"questions\": QUESTIONS,\n", + " \"questions_wrapper\": QUESTIONS_WRAPPER,\n", + " \"questions_columns\": qa_questions_columns,\n", + " },\n", + " returns=[\n", + " \"question_answering_df: dataset\",\n", + " \"question_answering_errors: result\",\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "0d505915-49b5-47fb-9f50-ce15fe6dc392", + "metadata": {}, + "source": [ + "### (3.) 
Review results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa39c5bf-c959-4ff5-ad60-4ad68d00f22c", + "metadata": {}, + "outputs": [], + "source": [ + "demo2_run.outputs" + ] + }, + { + "cell_type": "markdown", + "id": "947d6ce8-b330-44ab-b13f-b6eec20e839e", + "metadata": {}, + "source": [ + "## Demo 3" + ] + }, + { + "cell_type": "markdown", + "id": "66b916d2-96b0-448d-8e51-b51fb5a5a1a7", + "metadata": {}, + "source": [ + "This is also a large example, in this case we use another option of the function to ask questions in the form of a poll." + ] + }, + { + "cell_type": "markdown", + "id": "9ec66fc7-f50b-4417-a7cc-3c42848b1f01", + "metadata": {}, + "source": [ + "### (1.) Import the function (import mlrun, set project and import function)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dfcab8d0-5022-40e5-92ff-14b02cfa2eaa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}\n" + ] + } + ], + "source": [ + "import os\n", + "import mlrun\n", + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "49bc523b-9bca-46c5-917d-320d5641506a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"call-center-demo-3\",\n", + " context=\"./\",\n", + " user_project=True,\n", + " 
parameters={\n", + " \"default_image\": \"mlrun/mlrun\",\n", + " })\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "370f3780-0dfc-4b9c-87aa-1dd124e62249", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "func = project.set_function(\n", + " \"question-answering.py\",\n", + " name=\"question-answering\",\n", + " kind=\"job\",\n", + " handler=\"answer_questions\",\n", + ")\n", + "project.save()" + ] + }, + { + "cell_type": "markdown", + "id": "88dbe941-b9af-40bb-a038-7fcc812d506c", + "metadata": {}, + "source": [ + "### (2.) Usage\n", + "\n", + "Like in the second demo, we make a list of questions for the function to answer." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f9b02aaa-2a31-4ade-ba26-a2d73c5d03ab", + "metadata": {}, + "outputs": [], + "source": [ + "# These questions are harder to answer, as there is no right answer.\n", + "# So we want it to be at least consistent, for that we use the poll option.\n", + "QUESTIONS = [\n", + " \"1. Rate the agent's level of empathy (The ability to understand and share the feelings of others) on a scale of 1-5.\",\n", + " \"2. Rate the agent's level of professionalism (Conducting oneself in a way that is appropriate for the workplace) on a scale of 1-5.\",\n", + "]\n", + "\n", + "qa_questions_columns = [\n", + " \"empathy\",\n", + " \"professionalism\",\n", + "\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "id": "6ed8a0e3-9c5d-4524-bbe1-b345b981694a", + "metadata": {}, + "source": [ + "Another thing we give the model this time is answer examples (one/few shot answering), this can be done to show the model how you want the answer to be structured or caculated.
    \n", + "So for every file we ask about, the model will be presented with this example of a call and how we want the answers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d14e79d6-687c-4424-a01f-68376ad3dd30", + "metadata": {}, + "outputs": [], + "source": [ + "# For every file we ask about, the model will be presented with this example of a call and how we want the answers.\n", + "DEMO_CALL = (\n", + " \"Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist \"\n", + " \"you today?\\n\"\n", + " \"Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\\n\"\n", + " \"Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears \"\n", + " \"there was an error in the charges. I apologize for the inconvenience.\\n\"\n", + " \"Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\\n\"\n", + " \"Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department \"\n", + " \"for investigation and correction. You should see the adjustments on your next statement.\\n\"\n", + " \"Customer: That sounds good, Megan. I appreciate your help.\\n\"\n", + " \"Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\\n\"\n", + ")\n", + "\n", + "\n", + "DEMO_ANSWERS = (\n", + " \"1. 4\\n\"\n", + " \"2. 5\\n\"\n", + "\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "86099fb8-895c-4e2c-979d-6bda9782ccd3", + "metadata": {}, + "source": [ + "Then we need to wrap it all nicely to be given to the model as a single prompt, this is done with a text wrapper, and a question wrapper.
    \n", + "Both of them will be concatenated inside the function with the questions and passed to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5efac70-cd2c-4fc7-bc9c-4c04d18077a1", + "metadata": {}, + "outputs": [], + "source": [ + "TEXT_WRAPPER = (\n", + " f\"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " f\"{DEMO_CALL}\\n\"\n", + " f\"answer the questions as accurately as you can:\\n\"\n", + " f\"{QUESTIONS}<|im_end|>\\n\"\n", + " f\"<|im_start|>assistant:\\n\"\n", + " f\"{DEMO_ANSWERS}<|im_end|>\\n\"\n", + " f\"<|im_start|>user: Given the following text:\\n\"\n", + " \"{}\"\n", + ") \n", + "\n", + "QUESTIONS_WRAPPER = (\n", + " \" answer the given questions as accurately as you can, do not write more answers the questions:\\n\"\n", + " \"{}<|im_end|>\\n\"\n", + " \"<|im_start|>assistant:\\n\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9339816e-d436-4add-b8f3-b48e577f4bfe", + "metadata": {}, + "source": [ + "The config is for the second questioning method, which we call \"poll\", and in which we need to choose how many voting models we want participating,
    \n", + "and in what way we want to decide the result; we currently support `average` and `most_common`, as shown here.
    \n", + "\n", + "\n", + "*An explanation of both questioning methods can be found at the beginning of this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6330db65-9806-44a6-8046-0b156d2a3228", + "metadata": {}, + "outputs": [], + "source": [ + "questions_config = \n", + " {\n", + " \"type\": \"poll\",\n", + " \"poll_count\": 3, # How many 'voters'\n", + " \"poll_strategy\": \"most_common\"\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaa0ae3d-9302-4b73-92f1-8c43ec92e9cd", + "metadata": {}, + "outputs": [], + "source": [ + "qa_model = \"TheBloke/Mistral-7B-OpenOrca-GPTQ\"" + ] + }, + { + "cell_type": "markdown", + "id": "20c0e1eb-49cf-426e-b125-eb133d440fbd", + "metadata": {}, + "source": [ + "Finally, we run the function with all the parameters we prepared. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03d6d619-618a-49d6-b0be-43c300902927", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Question answering:\n", + "demo3_run = func.run(\n", + " function=\"question-answering\",\n", + " local=True,\n", + " handler=\"answer_questions\",\n", + " inputs={\"data_path\": os.path.abspath(\"./calls\")},\n", + " params={\n", + " \"model_name\": qa_model,\n", + " \"device_map\": \"auto\",\n", + " \"text_wrapper\":TEXT_WRAPPER,\n", + " \"questions\": QUESTIONS,\n", + " \"questions_wrapper\": QUESTIONS_WRAPPER,\n", + " \"questions_columns\": qa_questions_columns,\n", + " \"questions_config\": questions_config, # This time we add 'questions_config'\n", + " },\n", + " returns=[\n", + " \"question_answering_df: dataset\",\n", + " \"question_answering_errors: result\",\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "534edd4e-1e5b-4663-a2bb-bc6da7b603ca", + "metadata": {}, + "source": [ + "### (3.) 
Review results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a61f06ad-ee28-45c9-b7da-d93c5a296810", + "metadata": {}, + "outputs": [], + "source": [ + "demo3_run.outputs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/question_answering/0.5.0/src/question_answering.py b/functions/master/question_answering/0.5.0/src/question_answering.py new file mode 100644 index 00000000..2e4e96d0 --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/question_answering.py @@ -0,0 +1,736 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import enum +import logging +import operator +import pathlib +from collections import Counter +from functools import reduce, wraps +from typing import Any, Dict, List, Tuple, Union + +import pandas as pd +import transformers +from tqdm import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + global _LOGGER + + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + _LOGGER = context.logger + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." + ) + raise mpi4py_not_found + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_text_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. 
" + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." + ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + dataframe = pd.concat(objs=[df for df, _ in output], axis=0) + errors_dictionary = reduce(operator.ior, [err for _, err in output], {}) + return dataframe, errors_dictionary + return None + + return wrapper + + return decorator + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def answer_questions( + data_path: Union[str, List[str]], + model_name: str, + questions: Union[List[str], List[List[str]]], + device_map: Union[str, dict] = None, + model_kwargs: dict = None, + auto_gptq_exllama_max_input_length: int = None, + tokenizer_name: str = None, + tokenizer_kwargs: dict = None, + text_wrapper: Union[str, List[str]] = "", + questions_wrapper: Union[str, List[str]] = "", + generation_config: Union[Dict, List[Dict]] = None, + questions_config: Union[Dict, List[Dict]] = None, + batch_size: int = 1, + questions_columns: List[str] = None, + verbose: bool = False, +) -> Tuple[pd.DataFrame, dict]: + """ + Answer questions with a context to the given 
text files contents by a pretrained LLM model. Each text file will have + the following prompt built: + + start of `text_wrapper` + + end of `text_wrapper` + + start of `questions_wrapper` + 1. + 2. + ... + n. + end of `questions_wrapper` + + :param data_path: A path to a directory of text files or a path to a text file to ask + questions about. + :param model_name: The pre-trained model name from the huggingface hub to use for asking + questions. + :param questions: The questions to ask. + A list of lists of questions to ask per text file, and devided + by question groups, the groups can be dtermained by size (in order to + avoid large inputs to the llm) or by questioning method + (regular or poll like questioning). + :param device_map: A map to use for loading the model on multiple devices. + :param model_kwargs: Keyword arguments to pass for loading the model using HuggingFace's + `transformers.AutoModelForCausalLM.from_pretrained` function. + :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size. + :param tokenizer_name: The tokenizer name from the huggingface hub to use. If not given, the + model name will be used. + :param tokenizer_kwargs: Keyword arguments to pass for loading the tokenizer using HuggingFace's + `transformers.AutoTokenizer.from_pretrained` function. + :param text_wrapper: A wrapper for the file's text. Will be added at the start of the prompt. + Must have a placeholder ('{}') for the text of the file. + :param questions_wrapper: A wrapper for the questions received. Will be added after the text + wrapper in the prompt template. Must have a placeholder ('{}') for the + questions. + :param generation_config: HuggingFace's `GenerationConfig` keyword arguments to pass to the + `generate` method. 
+ :param questions_config: A dictionary or list of dictionaries containing specific ways to answer + questions (using a poll for example), each dictionary in the list is for + corresponding question group and determines the question asking method + for said group. + :param batch_size: Batch size for inference. + :param questions_columns: Columns to use for the dataframe returned. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + + :returns: A tuple of: + + * A dataframe dataset of the questions answers. + * A dictionary of errored files that were not inferred or were not answered properly. + """ + global _LOGGER + + # Set configs to empty dict if not given: + if generation_config is None: + generation_config = {} + if questions_config is None: + questions_config = {} + + # Get the input text files to question: + if verbose: + _LOGGER.info("Collecting text files.") + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + text_files = _get_text_files(data_path=data_path) + else: + text_files = data_path + if verbose: + _LOGGER.info(f"Collected {len(text_files)} text files.") + + # Get the prompt template: + if verbose: + _LOGGER.info("Creating prompt template.") + + # Organize questions as a list of list, and count number of sub-lists for future use + number_of_question_groups = 1 if isinstance(questions[0], str) else len(questions) + questions = _to_group_list( + argument_value=questions, + argument_name="questions", + length=number_of_question_groups, + ) + + # Organize prompt parts at proper length + text_wrapper = _to_group_list( + argument_value=text_wrapper, + argument_name="text_wrapper", + length=number_of_question_groups, + ) + questions_wrapper = _to_group_list( + argument_value=questions_wrapper, + argument_name="questions_wrapper", + length=number_of_question_groups, + ) + + # Create a list of prompt according to given parts and questions + prompt_template = [] + questions = questions if 
isinstance(questions[0], list) else [questions] + + # Build all prompts + for i in range(number_of_question_groups): + prompt_template.append( + _get_prompt_template( + text_wrapper=text_wrapper[i], + questions_wrapper=questions_wrapper[i], + questions=questions[i], + ) + ) + if verbose: + _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n") + + # Get the total amount of questions: + questions_amount = sum([len(sublist) for sublist in questions]) + + # Get the questions columns: + questions_columns = questions_columns or [ + f"q{i}" for i in range(1, questions_amount + 1) + ] + + # Check if we have the correct amount of questions columns: + if len(questions_columns) != questions_amount: + raise ValueError( + f"The provided questions columns length ({len(questions_columns)}) " + f"does not match the questions amount ({questions_amount})" + ) + + # Load the generation config: + if verbose: + _LOGGER.info("Loading generation configuration.") + generation_config = [ + transformers.GenerationConfig(**(cfg or {})) + for cfg in _to_group_list( + argument_value=generation_config, + argument_name="generation_config", + length=number_of_question_groups, + ) + ] + if verbose: + _LOGGER.info(f"Generation configuration loaded: {generation_config}") + + # Load the model and tokenizer into a pipeline object: + if verbose: + _LOGGER.info(f"Loading model '{model_name}'.") + generation_pipeline = _get_generation_pipeline( + model_name=model_name, + device_map=device_map, + tokenizer_name=tokenizer_name or model_name, + model_kwargs=model_kwargs or {}, + tokenizer_kwargs=tokenizer_kwargs or {}, + auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length, + batch_size=batch_size, + ) + if verbose: + _LOGGER.info("Model loaded.") + + # Prepare the successes dataframe and errors dictionary to be returned: + successes = [] + errors = {} + + # Split the files into batches: + file_batches = [ + text_files[i : i + batch_size] + if i + batch_size < len(text_files) + 
else text_files[i:] + for i in range(0, len(text_files), batch_size) + ] + questions_config = _to_group_list( + argument_value=questions_config, + argument_name="questions_config", + length=number_of_question_groups, + ) + + # Create a list of question handlers according to given configs + handlers = [] + for cfg in questions_config: + question_type = cfg.pop("type", "default") + handlers.append(QUESTION_MAPPING.get(question_type)(**cfg)) + + # Go over the batches of text files and question them: + for file_batch in tqdm( + file_batches, + desc="Generating answers", + unit=f"file (batch of {batch_size})", + disable=not verbose, + ): + try: + total_answers = [[] for _ in range(batch_size)] + + # Go over all question group per batch of documents + for question_group in range(number_of_question_groups): + current_questions_amount = len(questions[question_group]) + + # Read batch (read the text from the text files): + batched_input = _read_file_batch( + file_batch=file_batch, + prompt_template=prompt_template[question_group], + ) + + # Answer the questions with each question handler: + batched_answers = handlers[question_group].answer( + questions_amount=current_questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config[question_group], + ) + + # Put the answers in the correct place in the total answers list according to the place in the batch: + for i in range(batch_size): + total_answers[i].extend(batched_answers[i]) + + # Collect the answers and attach the file name: + successes.extend( + [ + [file.name, *answers] + for file, answers in zip(file_batch, total_answers) + ] + ) + except Exception as exception: + # Note the exception as error in the dictionary: + batch_file_names = ", ".join([file.name for file in file_batch]) + if verbose: + _LOGGER.warning( + f"Error in batch '{batch_file_names}': {str(exception)}" + ) + errors[batch_file_names] = str(exception) + continue + + # Construct the 
answers dataframe: + columns = [ + "text_file", + *questions_columns, + ] + + # Create a data frame of answers by files + successes = pd.DataFrame( + successes, + columns=columns, + ) + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(text_files)})\n" + f"Answers summary:\n" + f"{successes.head()}" + ) + return successes, errors + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return text_files + + +def _get_prompt_template( + text_wrapper: str, + questions_wrapper: str, + questions: List[str], +) -> str: + + # Validate and build the text wrapper: + text_wrapper = text_wrapper or ( + "Given the following text:\n" "-----\n" "{}\n" "-----" + ) + if text_wrapper.count("{}") != 1: + raise ValueError( + "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about." + ) + + # Validate and build the question wrapper: + questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}" + if questions_wrapper.count("{}") != 1: + raise ValueError( + "The `questions_wrapper` must include one placeholder '{}' for the list of questions." + ) + + # Validate and parse the questions: + if len(questions) == 0: + raise ValueError("Please include at least one question.") + questions = "\n".join( + [f"{i}. 
{question}" for i, question in enumerate(questions, 1)] + ) + + # Construct the template: + return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n" + + +def _get_generation_pipeline( + model_name: str, + device_map: Union[str, dict], + tokenizer_name: str, + model_kwargs: dict, + tokenizer_kwargs: dict, + auto_gptq_exllama_max_input_length: int = None, + batch_size: int = 1, +): + # Load the model: + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name, device_map=device_map, **model_kwargs + ) + + # Set exllama max input length if provided: + # This changes the model's context size. + if auto_gptq_exllama_max_input_length: + from auto_gptq import exllama_set_max_input_length + + model = exllama_set_max_input_length( + model=model, max_input_length=auto_gptq_exllama_max_input_length + ) + + # Load the tokenizer: + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, **tokenizer_kwargs + ) + + # Initialize a generation pipline and return: + pipe = transformers.pipeline( + task="text-generation", + model=model, + tokenizer=tokenizer, + batch_size=batch_size, + ) + pipe.tokenizer.pad_token_id = model.config.eos_token_id + return pipe + + +def _read_file_batch( + file_batch: List[pathlib.Path], + prompt_template: str, +) -> List[str]: + batch = [] + + # Go over all files and read in usable format + for file in file_batch: + with open(file, "r", encoding="utf-8") as fp: + batch.append(prompt_template.format(fp.read())) + return batch + + +def _to_group_list(argument_value: list, argument_name: str, length: int): + + # Check if is list, turn to list if not + argument_value = ( + argument_value if isinstance(argument_value, list) else [argument_value] + ) + list_len = len(argument_value) + + # If not a list, or is a list of len 1 we duplicate for correct length + # If list in wrong length throw an error + if list_len != length: + if list_len == 1: + return argument_value * length + raise ValueError( + f"The argument value 
of '{argument_name}' is not equal to the length of the given questions - {length}" + ) + return argument_value + + +class QuestionHandler: + """ + A class for handling questions answering for a given question type. + This class is used as a base class for all question types, and for default question type (regular question + answering without any special handling). + """ + + class ConfigKeys: + pass + + def __init__(self): + pass + + @staticmethod + def _get_answers(generated_text: str, questions_amount: int) -> List[str]: + + # Clear answer start (part before numbers): + # TODO find better way to verify, for list of questions this is redundant for example + if "1." not in generated_text: + raise ValueError( + f"Answer 1. is missing from the generated text: '{generated_text}'" + ) + text = generated_text.split("1.", 1)[1] + + # Start extracting the answers: + answers = [] + for i in range(1, questions_amount + 1): + # If it's the last answer to look for, take the rest of the text: + if i == questions_amount: + answer_i = text + # Verify there is a question number in the text: + elif f"{i + 1}." not in text: + raise ValueError( + f"Answer {i + 1}. 
is missing from the generated text: '{generated_text}'" + ) + # Take i's answer: + else: + answer_i, text = text.split(f"{i + 1}.", 1) + # Collect the answer removing redundant spaces: + answers.append(answer_i.strip()) + + return answers + + def _infer_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + + # Infer through the llm: + batched_output = generation_pipeline( + batched_input, + generation_config=generation_config, + eos_token_id=generation_pipeline.tokenizer.eos_token_id, + return_full_text=False, + num_return_sequences=1, + ) + + # Process the outputs to get the answers: + batched_answers = [] + for output in batched_output: + # Get the generated answers: + answers = self._get_answers( + generated_text=output[0]["generated_text"], + questions_amount=questions_amount, + ) + # Collect the processed answers: + batched_answers.append(answers) + return batched_answers + + def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + + +class PollQuestionHandler(QuestionHandler): + """ + Static class to hold all the possible poll question configurations options keys + """ + + class ConfigKeys: + """ + A class for handling questions answering for poll type questions. + These type of question are answered by asking the same question multiple times + and choosing the most common answer or the average answer. + """ + + #: The number of times to ask the same question. 
+ POLL_COUNT = "poll_count" + + #: The strategy to use for choosing the answer from the poll. + POLL_STRATEGY = "poll_strategy" + + class Strategy(enum.Enum): + #: The most common answer strategy. + MOST_COMMON = "most_common" + + #: The average answer strategy. + AVERAGE = "average" + + @staticmethod + def most_common(answers): + """ + Calculate the most common answer for a given list of answers. + """ + count = Counter(answers) + most_common = count.most_common(1) + return most_common[0][0] + + @staticmethod + def average(answers): + """ + Calculate the average answer for a given list of answers. + """ + if isinstance(answers[0], str): + raise ValueError( + "Cannot perform poll with average answer strategy of non numeric values," + " please change the question to give numeric data, or choose 'most_common' as strategy." + ) + else: + numeric_values = answers + avg = sum(numeric_values) / len(numeric_values) + + # Round to the closest integer and return corresponding value + return round(avg) + + def do(self, answers): + """ + Perform the strategy. + """ + return getattr(self, self.value)(answers) + + def __init__( + self, poll_count: int = 5, poll_strategy: str = "most_common"): + super().__init__() + self.poll_count = poll_count + self.poll_strategy = self.Strategy(poll_strategy) + + def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. 
+ """ + return self._answer_poll_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + + def _answer_poll_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + votes = [] + + # Run the poll for each question + for _ in range(self.poll_count): + batched_answers = self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + votes.append(batched_answers) + answers = [] + + # Collect the answers according to the poll strategy + # Average strategy works for numeric values only + for batch in range(len(votes[0])): + batched_answers = [] + for question in range(questions_amount): + # Create a list of all answers to relevant question + answer = [ + votes[voter][batch][question] for voter in range(self.poll_count) + ] + answer = self.poll_strategy.do(answer) + batched_answers.append(answer) + answers.append(batched_answers) + return answers + + +# Holds names of QuestionHandles +class QuestionTypes: + DEFAULT = "default" + POLL = "poll" + + +# Maps question types to their handlers +QUESTION_MAPPING = { + QuestionTypes.DEFAULT: QuestionHandler, + QuestionTypes.POLL: PollQuestionHandler, +} diff --git a/functions/master/question_answering/0.5.0/src/requirements.txt b/functions/master/question_answering/0.5.0/src/requirements.txt new file mode 100644 index 00000000..d05cb777 --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/requirements.txt @@ -0,0 +1,4 @@ +transformers +tqdm +torch +einops \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/src/test_question_answering.py b/functions/master/question_answering/0.5.0/src/test_question_answering.py new file mode 
100644 index 00000000..f35b4364 --- /dev/null +++ b/functions/master/question_answering/0.5.0/src/test_question_answering.py @@ -0,0 +1,76 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import mlrun +import transformers +import tempfile + +APPLE_COLOR = "red" + + +def mock_pipeline_call(*args, **kwargs): + return [[{"generated_text": "1. " + APPLE_COLOR}]] + + +def _make_data_dir_for_test(): + data_dir = tempfile.mkdtemp() + content = "The apple color is red." 
+ with open(data_dir + "/test_data.txt", "w") as f: + f.write(content) + return data_dir + + +def test_question_answering(monkeypatch): + monkeypatch.setattr(transformers.Pipeline, "__call__", mock_pipeline_call) + input_path = "./data" + artifact_path = tempfile.mkdtemp() + project = mlrun.new_project("qa", context="./") + fn = project.set_function("question_answering.py", "answer_questions", kind="job", image="mlrun/mlrun") + qa_run = fn.run( + handler="answer_questions", + params={ + "model_name": "distilgpt2", + "data_path": input_path, + "text_wrapper": ( + "Given the following sentence:\n" + "-----\n" + "{}\n" + "-----" + ), + "questions": [ + "What is the color of the apple?", + ], + "questions_columns": [ + "color", + ], + "generation_config": { + "do_sample": True, + "temperature": 0.8, + "top_p": 0.9, + "early_stopping": True, + "max_new_tokens": 20, + }, + }, + returns=[ + "question_answering_df: dataset", + "question_answering_errors: result", + ], + local=True, + artifact_path=artifact_path + ) + qa_df = mlrun.get_dataitem( + qa_run.status.artifacts[0]["spec"]["target_path"] + ).as_df() + assert qa_df["color"][0] == APPLE_COLOR + assert qa_run.outputs["question_answering_errors"] == {} diff --git a/functions/master/question_answering/0.5.0/static/documentation.html b/functions/master/question_answering/0.5.0/static/documentation.html new file mode 100644 index 00000000..602f654b --- /dev/null +++ b/functions/master/question_answering/0.5.0/static/documentation.html @@ -0,0 +1,447 @@ + + + + + + + +question_answering package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    question_answering package#

    +
    +

    Submodules#

    +
    +
    +

    question_answering.question_answering module#

    +
    +
    +class question_answering.question_answering.PollQuestionHandler(poll_count: int = 5, poll_strategy: str = 'most_common')[source]#
    +

    Bases: QuestionHandler

    +

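    A class for handling question answering for poll type questions. These types of questions are answered by asking the same question multiple times and choosing the most common answer or the average answer.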

    +
    +
    +class ConfigKeys[source]#
    +

    Bases: object

    +

    Static class to hold all the possible poll question configuration option keys.

    +
    +
    +POLL_COUNT = 'poll_count'#
    +

    The number of times to ask the same question.

    +
    +
    +
    +POLL_STRATEGY = 'poll_strategy'#
    +

    The strategy to use for choosing the answer from the poll.

    +
    +
    +
    +
    +class Strategy(value)[source]#
    +

    Bases: Enum

    +

    An enumeration.

    +
    +
    +AVERAGE = 'average'#
    +

    The average answer strategy.

    +
    +
    +
    +MOST_COMMON = 'most_common'#
    +

    The most common answer strategy.

    +
    +
    +
    +static average(answers)[source]#
    +

    Calculate the average answer for a given list of answers.

    +
    +
    +
    +do(answers)[source]#
    +

    Perform the strategy.

    +
    +
    +
    +static most_common(answers)[source]#
    +

    Calculate the most common answer for a given list of answers.

    +
    +
    +
    +
    +answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) List[List[str]][source]#
    +

    Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline.

    +
    +
    +
    +
    +class question_answering.question_answering.QuestionHandler[source]#
    +

    Bases: object

    +

    A class for handling questions answering for a given question type. +This class is used as a base class for all question types, and for default question type (regular question +answering without any special handling).

    +
    +
    +class ConfigKeys[source]#
    +

    Bases: object

    +
    +
    +
    +answer(questions_amount: int, batched_input: List[str], generation_pipeline: transformers.Pipeline, generation_config: transformers.GenerationConfig) List[List[str]][source]#
    +

    Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline.

    +
    +
    +
    +
    +class question_answering.question_answering.QuestionTypes[source]#
    +

    Bases: object

    +
    +
    +DEFAULT = 'default'#
    +
    +
    +
    +POLL = 'poll'#
    +
    +
    +
    +
    +question_answering.question_answering.answer_questions(data_path: str | List[str], model_name: str, questions: List[str] | List[List[str]], device_map: str | dict | None = None, model_kwargs: dict | None = None, auto_gptq_exllama_max_input_length: int | None = None, tokenizer_name: str | None = None, tokenizer_kwargs: dict | None = None, text_wrapper: str | List[str] = '', questions_wrapper: str | List[str] = '', generation_config: Dict | List[Dict] | None = None, questions_config: Dict | List[Dict] | None = None, batch_size: int = 1, questions_columns: List[str] | None = None, verbose: bool = False) Tuple[DataFrame, dict][source]#
    +

    Answer questions with a context to the given text files contents by a pretrained LLM model. Each text file will have +the following prompt built:

    +

    start of text_wrapper +<text file content> +end of text_wrapper

    +

    start of questions_wrapper +1. <questions[0]> +2. <questions[1]> +… +n. <questions[n-1]> +end of questions_wrapper

    +
    +
    Parameters:
    +
      +
    • data_path – A path to a directory of text files or a path to a text file to ask +questions about.

    • +
    • model_name – The pre-trained model name from the huggingface hub to use for asking +questions.

    • +
    • questions – The questions to ask. +A list of lists of questions to ask per text file, and divided +by question groups, the groups can be determined by size (in order to +avoid large inputs to the llm) or by questioning method +(regular or poll like questioning).

    • +
    • device_map – A map to use for loading the model on multiple devices.

    • +
    • model_kwargs – Keyword arguments to pass for loading the model using HuggingFace’s +transformers.AutoModelForCausalLM.from_pretrained function.

    • +
    • auto_gptq_exllama_max_input_length – For AutoGPTQ models to set and extend the model’s input buffer size.

    • +
    • tokenizer_name – The tokenizer name from the huggingface hub to use. If not given, the +model name will be used.

    • +
    • tokenizer_kwargs – Keyword arguments to pass for loading the tokenizer using HuggingFace’s +transformers.AutoTokenizer.from_pretrained function.

    • +
    • text_wrapper – A wrapper for the file’s text. Will be added at the start of the prompt. +Must have a placeholder (‘{}’) for the text of the file.

    • +
    • questions_wrapper – A wrapper for the questions received. Will be added after the text +wrapper in the prompt template. Must have a placeholder (‘{}’) for the +questions.

    • +
    • generation_config – HuggingFace’s GenerationConfig keyword arguments to pass to the +generate method.

    • +
    • questions_config – A dictionary or list of dictionaries containing specific ways to answer +questions (using a poll for example), each dictionary in the list is for +corresponding question group and determines the question asking method +for said group.

    • +
    • batch_size – Batch size for inference.

    • +
    • questions_columns – Columns to use for the dataframe returned.

    • +
    • verbose – Whether to present logs of a progress bar and errors. Default: False.

    • +
    +
    +
    Returns:
    +

    A tuple of:

    +
      +
    • A dataframe dataset of the questions answers.

    • +
    • A dictionary of errored files that were not inferred or were not answered properly.

    • +
    +

    +
    +
    +
    +
    +
    +question_answering.question_answering.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/static/example.html b/functions/master/question_answering/0.5.0/static/example.html new file mode 100644 index 00000000..e54ca5e5 --- /dev/null +++ b/functions/master/question_answering/0.5.0/static/example.html @@ -0,0 +1,798 @@ + + + + + + + +Question Answering + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    Question Answering#

    +
    +

    Short description and explanation#

    +

    This function enables ad-hoc question answering over documents by ingesting text into a language model and returning formatted responses.
    +It accepts:

    +
      +
    • A language model

    • +
    • Text files with content

    • +
    • Questions to answer

    • +
    • More inputs can be given for configuration

    • +
    +

    The model processes the files to build understanding. Questions posed are then answered in one of two modes:

    +

    Default mode:
    +The model directly answers each question using its own capabilities.

    +

    Poll mode:
    +The same question is asked several times (`poll_count` runs of the same model). An aggregation strategy then determines the final response from the collected answers.
    +Two options exist for consensus methodology:

    +

    Average Answer:
    +The numeric answers collected in the poll are averaged and the result is rounded to the nearest integer. Useful for numeric or ranked responses.

    +

    Most Common Answer:
    The answer that occurs most frequently across the poll runs is selected. Useful for textual responses to avoid outliers.

    +

    Using multiple runs via the poll mode provides accuracy improvements for questions lacking definitive answers, as it refines responses through an ensemble process.

    +
    +
    +

    Background#

    +

    At the core, advanced natural language processing (NLP) models called foundation models are being leveraged to read and comprehend the input text files.
    +Specifically, models such as GPT-3 or Codex from OpenAI are used as the base language model.

    +

    When documents are fed into the function, the background process invokes these models to ingest and digest the information.

    +

    This provides the knowledge base for the models to then offer informed answers tailored to any queries about the documents.
    +The parameters controlling model size and computation time provide tradeoffs between cost, speed, and sophistication of comprehension.

    +

    Additionally, the poll option expands on a single answer by sampling several responses from the same model, as mentioned above.

    +
    +
    +

    Requirements#

    +

    transformers
    +torch
    +tqdm

    +
    +
    +

    Documentation#

    +

    data_path: A path to a directory of text files or a path to a text file to ask questions about.

    +

    model_name: The pre-trained model name from the huggingface hub to use for answering questions.

    +

    questions: The questions to ask. A list of lists of questions to ask per text file, and divided
    +by question groups, the groups can be determined by size (in order to
    +avoid large inputs to the llm) or by questioning method (regular or poll like questioning).

    +

    device_map: A map to use for loading the model on multiple devices.

    +

    model_kwargs: Keyword arguments to pass for loading the model using HuggingFace’s
    +transformers.AutoModelForCausalLM.from_pretrained function.

    +

    auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model’s input buffer size.

    +

    tokenizer_name: The tokenizer name from the huggingface hub to use. If not given, the given model name will be used.

    +

    tokenizer_kwargs: Keyword arguments to pass for loading the tokenizer using HuggingFace’s
    +transformers.AutoTokenizer.from_pretrained function.

    +

    text_wrapper: Must have a placeholder (‘{}’) for the text of the file.

    +

    questions_wrapper: A wrapper for the questions received. Will be added after the text wrapper in the prompt template.
    +Must have a placeholder (‘{}’) for the questions.

    +

    generation_config: HuggingFace’s GenerationConfig keyword arguments to pass to the generate method.

    +

    questions_config: A dictionary or list of dictionaries containing specific ways to answer questions (using a poll for example),
    +each dictionary in the list is for corresponding question group and determines the question asking method
    +for said group.

    +

    batch_size: Batch size for inference.

    +

    questions_columns: Columns to use for the dataframe returned.

    +

    verbose: Whether to present logs of a progress bar and errors. Default: False.

    +
    +
    +

    Demo 1#

    +

    This is a short and simple example to show the basic use of the function.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import mlrun
    +import transformers
    +import tempfile
    +
    +
    +
    +
    +
    +
    +
    project = mlrun.get_or_create_project(
    +    name="call-center-demo-1",
    +    context="./",
    +    user_project=True,
    +    parameters={
    +        "default_image": "mlrun/mlrun",
    +    })
    +
    +
    +
    +
    +
    +
    +
    func = project.set_function(
    +    "question-answering.py",
    +    name="question-answering",
    +    kind="job",
    +    handler="answer_questions",
    +)
    +project.save()
    +
    +
    +
    +
    +

    We create a text file that the model can be asked about

    +
    +
    +
    def _make_data_dir_for_test():
    +    data_dir = tempfile.mkdtemp()
    +    # The information the model will need in order to answer our question
    +    content = "The apple is red."
    +    with open(data_dir + "/test_data.txt", "w") as f:
    +        f.write(content)
    +    return data_dir
    +
    +
    +
    +
    +
    +
    +

    (2.) Usage#

    +

    Then we set where to take the path to the text file we want to ask about, the questions, and column name for the answer table.

    +
    +
    +
    input_path = _make_data_dir_for_test()
    +# The question for the model to answer
    +question = ["What is the color of the apple?"]
    +# The column of the answer in the data frame returned by the function
    +column_name = ["color"]
    +
    +
    +
    +
    +

    Now we run the function with all the parameters we prepared earlier

    +
    +
    +
    demo1_run = func.run(
    +    handler="answer_questions",
    +    params={
    +        "model": "distilgpt2",
    +        "input_path": input_path,
    +        "questions": question,
    +        "questions_columns": column_name,
    +        "generation_config": {
    +            "do_sample": True,
    +            "temperature": 0.8,
    +            "top_p": 0.9,
    +            "early_stopping": True,
    +            "max_new_tokens": 20,
    +        },
    +    },
    +    returns=[
    +        "question_answering_df: dataset",
    +        "question_answering_errors: result",
    +    ],
    +    local=True,
    +    artifact_path="./"
    +)
    +
    +
    +
    +
    +
    +
    +

    (3.) Review results#

    +

    and after the run is finished we can take a look and see our answer

    +
    +
    +
    demo1_run.outputs
    +
    +
    +
    +
    +
    +
    +
    +

    Demo 2#

    +

    This is a much larger example, we will show how we use this function to analyze a number of calls between agents and customers of an internet company (all the data is generated by Iguazio).
    +For something like this, we recommend using a strong model, and putting some time into making the prompts.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import os
    +import mlrun
    +import torch
    +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}
    +
    +
    +
    +
    +
    +
    +
    project = mlrun.get_or_create_project(
    +    name="call-center-demo-2",
    +    context="./",
    +    user_project=True,
    +    parameters={
    +        "default_image": "mlrun/mlrun",
    +    })
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}
    +
    +
    +
    +
    +
    +
    +
    func = project.set_function(
    +    "question-answering.py",
    +    name="question-answering",
    +    kind="job",
    +    handler="answer_questions",
    +)
    +project.save()
    +
    +
    +
    +
    +
    <mlrun.projects.project.MlrunProject at 0x7f8bc5b0a370>
    +
    +
    +
    +
    +
    +
    +

    (2.) Usage#

    +

    This example is a bit more complicated as we mentioned, we give the model a list of questions, for some of them we give the model a list of answers to choose from.

    +
    +
    +
    QUESTIONS = [
    +    "1. Write a long summary of the text, focus on the topic (max 50 words).",
    +    "2. Was the Client's concern addressed, (choose only one) [Yes, No]?",
    +    ]
    +
    +qa_questions_columns = [
    +                        "Summary",
    +                        "is_fixed",
    +                        ]
    +
    +
    +
    +
    +

    Another thing we give the model this time is answer examples (one/few shot answering), this can be done to show the model how you want the answer to be structured or calculated.

    +
    +
    +
    # For every file we ask about, the model will be presented with this example of a call and how we want the answers.
    +DEMO_CALL = (
    +    "Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist "
    +    "you today?\n"
    +    "Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\n"
    +    "Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears "
    +    "there was an error in the charges. I apologize for the inconvenience.\n"
    +    "Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\n"
    +    "Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department "
    +    "for investigation and correction. You should see the adjustments on your next statement.\n"
    +    "Customer: That sounds good, Megan. I appreciate your help.\n"
    +    "Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\n"
    +)
    +
    +DEMO_ANSWERS = (
    +    "1. The customer, contacted the call center regarding billing discrepancies on her statement. The agent, "
    +    "acknowledged the issue, assured The customer it would be resolved, and escalated it to the billing department for "
    +    "correction.\n"
    +    "2. Yes.\n"
    +
    +
    +
    +
    +

    Then we need to wrap it all nicely to be given to the model as a single prompt, this is done with a text wrapper, and a question wrapper.
    +both of them will be concatenated inside the function with the questions and passed to the model.

    +
    +
    +
    # The wrappers are built according to the model's conventions to improve results
    +TEXT_WRAPPER = (
    +    f"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    f"{DEMO_CALL}\n"
    +    f"answer the questions as accurately as you can:\n"
    +    f"{QUESTIONS}<|im_end|>\n"
    +    f"<|im_start|>assistant:\n"
    +    f"{DEMO_ANSWERS}<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    "{}"
    +) 
    +QUESTIONS_WRAPPER = (
    +    " answer the given questions as accurately as you can, do not write more answers the questions:\n"
    +    "{}<|im_end|>\n"
    +    "<|im_start|>assistant:\n"
    +)
    +
    +
    +
    +
    +

    The last few parameters we need to set are the model we will use, the input length (not available for all models) and the batch size.
    +The batch size determines how many files we want processed at each epoch, and the larger we go the faster the process will be, as long as our memory is sufficient.

    +
    +
    +
    # We like this version of mistral's model, which is small and fast but also gives great results
    +qa_model = "TheBloke/Mistral-7B-OpenOrca-GPTQ"
    +
    +
    +
    +
    +

    Finally, we run the function with all the parameters we prepared.

    +
    +
    +
    # Question answering:
    +demo2_run = func.run(
    +    function="question-answering",
    +    local=True,
    +    handler="answer_questions",
    +    inputs={"data_path": os.path.abspath("./calls")},
    +    params={
    +        "model_name": qa_model,
    +        "device_map": "auto",
    +        "text_wrapper":TEXT_WRAPPER,
    +        "questions": QUESTIONS,
    +        "questions_wrapper": QUESTIONS_WRAPPER,
    +        "questions_columns": qa_questions_columns,
    +    },
    +    returns=[
    +        "question_answering_df: dataset",
    +        "question_answering_errors: result",
    +    ],
    +)
    +
    +
    +
    +
    +
    +
    +

    (3.) Review results#

    +
    +
    +
    demo2_run.outputs
    +
    +
    +
    +
    +
    +
    +
    +

    Demo 3#

    +

    This is also a large example, in this case we use another option of the function to ask questions in the form of a poll.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import os
    +import mlrun
    +import torch
    +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:37,490 [warning] Client version with higher version than server version isn't supported, align your client to the server version: {'parsed_server_version': Version(major=1, minor=5, patch=2, prerelease='rc1', build='track'), 'parsed_client_version': Version(major=1, minor=6, patch=0, prerelease='rc11', build=None)}
    +
    +
    +
    +
    +
    +
    +
    project = mlrun.get_or_create_project(
    +    name="call-center-demo-3",
    +    context="./",
    +    user_project=True,
    +    parameters={
    +        "default_image": "mlrun/mlrun",
    +    })
    +
    +
    +
    +
    +
    > 2023-12-18 10:18:51,651 [info] Project loaded successfully: {'project_name': 'call-center-demo-zeev55'}
    +
    +
    +
    +
    +
    +
    +
    func = project.set_function(
    +    "question-answering.py",
    +    name="question-answering",
    +    kind="job",
    +    handler="answer_questions",
    +)
    +project.save()
    +
    +
    +
    +
    +
    <mlrun.projects.project.MlrunProject at 0x7f8bc5b0a370>
    +
    +
    +
    +
    +
    +
    +

    (2.) Usage#

    +

    Like in the second demo, we make a list of questions for the function to answer.

    +
    +
    +
    # These questions are harder to answer, as there is no right answer.
    +# So we want it to be at least consistent, for that we use the poll option.
    +QUESTIONS = [
    +    "1. Rate the agent's level of empathy (The ability to understand and share the feelings of others) on a scale of 1-5.",
    +    "2. Rate the agent's level of professionalism (Conducting oneself in a way that is appropriate for the workplace) on a scale of 1-5.",
    +]
    +
    +qa_questions_columns = [
    +                        "empathy",
    +                        "professionalism",
    +
    +                        ]
    +
    +
    +
    +
    +

    Another thing we give the model this time is answer examples (one/few shot answering). This can be done to show the model how you want the answer to be structured or calculated.
    +So for every file we ask about, the model will be presented with this example of a call and how we want the answers.

    +
    +
    +
    # For every file we ask about, the model will be presented with this example of a call and how we want the answers.
    +DEMO_CALL = (
    +    "Agent: Good afternoon, you've reached [Internet Service Provider] customer support. I'm Megan. How can I assist "
    +    "you today?\n"
    +    "Customer: Hello, Megan. This is Lisa. I've noticed some billing discrepancies on my last statement.\n"
    +    "Agent: Thank you, Lisa. Let me pull up your account. I see the billing discrepancies you mentioned. It appears "
    +    "there was an error in the charges. I apologize for the inconvenience.\n"
    +    "Customer: Thank you for acknowledging the issue, Megan. Can you please help me get it resolved?\n"
    +    "Agent: Absolutely, Lisa. I've made note of the discrepancies, and I'll escalate this to our billing department "
    +    "for investigation and correction. You should see the adjustments on your next statement.\n"
    +    "Customer: That sounds good, Megan. I appreciate your help.\n"
    +    "Agent: Not a problem, Lisa. Have a wonderful day, and we'll get this sorted out for you.\n"
    +)
    +
    +
    +DEMO_ANSWERS = (
    +    "1. 4\n"
    +    "2. 5\n"
    +
    +)
    +
    +
    +
    +
    +

    Then we need to wrap it all nicely to be given to the model as a single prompt, this is done with a text wrapper, and a question wrapper.
    +Both of them will be concatenated inside the function with the questions and passed to the model.

    +
    +
    +
    TEXT_WRAPPER = (
    +    f"<|im_start|>system: You are an AI assistant that answers questions accurately and shortly<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    f"{DEMO_CALL}\n"
    +    f"answer the questions as accurately as you can:\n"
    +    f"{QUESTIONS}<|im_end|>\n"
    +    f"<|im_start|>assistant:\n"
    +    f"{DEMO_ANSWERS}<|im_end|>\n"
    +    f"<|im_start|>user: Given the following text:\n"
    +    "{}"
    +) 
    +
    +QUESTIONS_WRAPPER = (
    +    " answer the given questions as accurately as you can, do not write more answers the questions:\n"
    +    "{}<|im_end|>\n"
    +    "<|im_start|>assistant:\n"
    +)
    +
    +
    +
    +
    +

    The config is for the second questioning method, which we call “poll”, in which we need to choose how many voting models we want participating,
    +and in what way we want to decide the result. We currently support average and most_common, as shown here.

    +

    *An explanation of both questioning methods can be found at the beginning of this notebook

    +
    +
    +
    # Poll configuration for the question handler: the same questions are asked
    # `poll_count` times and the final answer is decided by `poll_strategy`
    # ("most_common" here; "average" is the other supported strategy).
    # The opening brace must sit on the assignment line - a bare `questions_config =`
    # followed by a newline is a SyntaxError.
    questions_config = {
        "type": "poll",
        "poll_count": 3,  # How many 'voters'
        "poll_strategy": "most_common",
    }
    +
    +
    +
    +
    +
    +
    +
    qa_model = "TheBloke/Mistral-7B-OpenOrca-GPTQ"
    +
    +
    +
    +
    +

    Finally, we run the function with all the parameters we prepared.

    +
    +
    +
    # Question answering:
    +demo3_run = func.run(
    +    function="question-answering",
    +    local=True,
    +    handler="answer_questions",
    +    inputs={"data_path": os.path.abspath("./calls")},
    +    params={
    +        "model_name": qa_model,
    +        "device_map": "auto",
    +        "text_wrapper":TEXT_WRAPPER,
    +        "questions": QUESTIONS,
    +        "questions_wrapper": QUESTIONS_WRAPPER,
    +        "questions_columns": qa_questions_columns,
    +        "questions_config": questions_config, # This time we add 'questions_config'
    +    },
    +    returns=[
    +        "question_answering_df: dataset",
    +        "question_answering_errors: result",
    +    ],
    +)
    +
    +
    +
    +
    +
    +
    +

    (3.) Review results#

    +
    +
    +
    demo3_run.outputs
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/static/function.html b/functions/master/question_answering/0.5.0/static/function.html new file mode 100644 index 00000000..658c382a --- /dev/null +++ b/functions/master/question_answering/0.5.0/static/function.html @@ -0,0 +1,232 @@ + + + + + + + + + + + Source + + + + +
    +        
    +metadata:
    +  name: question-answering
    +  tag: ''
    +  categories:
    +  - genai
    +verbose: false
    +kind: job
    +spec:
    +  command: ''
    +  default_handler: answer_questions
    +  build:
    +    origin_filename: ''
    +    base_image: mlrun/mlrun
    +    requirements:
    +    - transformers
    +    - torch
    +    - tqdm
    +    code_origin: ''
    +    functionSourceCode: 
    +  entry_points:
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      has_varargs: false
    +      doc: ''
    +      lineno: 58
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      has_kwargs: false
    +    decorator:
    +      name: decorator
    +      has_varargs: false
    +      doc: ''
    +      lineno: 66
    +      parameters:
    +      - name: handler
    +      has_kwargs: false
    +    wrapper:
    +      name: wrapper
    +      has_varargs: false
    +      doc: ''
    +      lineno: 71
    +      has_kwargs: true
    +    answer_questions:
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[pd.DataFrame, dict]
    +      name: answer_questions
    +      has_varargs: false
    +      doc: 'Answer questions with a context to the given text files contents by a
    +        pretrained LLM model. Each text file will have
    +
    +        the following prompt built:
    +
    +
    +        start of `text_wrapper`
    +
    +        
    +
    +        end of `text_wrapper`
    +
    +
    +        start of `questions_wrapper`
    +
    +        1. 
    +
    +        2. 
    +
    +        ...
    +
    +        n. 
    +
    +        end of `questions_wrapper`'
    +      lineno: 130
    +      parameters:
    +      - name: data_path
    +        type: Union[str, List[str]]
    +        doc: A path to a directory of text files or a path to a text file to ask questions
    +          about.
    +      - name: model_name
    +        type: str
    +        doc: The pre-trained model name from the huggingface hub to use for asking
    +          questions.
    +      - name: questions
    +        type: Union[List[str], List[List[str]]]
    +        doc: The questions to ask. A list of lists of questions to ask per text file,
    +          and divided by question groups, the groups can be determined by size (in
    +          order to avoid large inputs to the llm) or by questioning method (regular
    +          or poll like questioning).
    +      - name: device_map
    +        type: Union[str, dict]
    +        doc: A map to use for loading the model on multiple devices.
    +        default: null
    +      - name: model_kwargs
    +        type: dict
    +        doc: Keyword arguments to pass for loading the model using HuggingFace's `transformers.AutoModelForCausalLM.from_pretrained`
    +          function.
    +        default: null
    +      - name: auto_gptq_exllama_max_input_length
    +        type: int
    +        doc: For AutoGPTQ models to set and extend the model's input buffer size.
    +        default: null
    +      - name: tokenizer_name
    +        type: str
    +        doc: The tokenizer name from the huggingface hub to use. If not given, the
    +          model name will be used.
    +        default: null
    +      - name: tokenizer_kwargs
    +        type: dict
    +        doc: Keyword arguments to pass for loading the tokenizer using HuggingFace's
    +          `transformers.AutoTokenizer.from_pretrained` function.
    +        default: null
    +      - name: text_wrapper
    +        type: Union[str, List[str]]
    +        doc: A wrapper for the file's text. Will be added at the start of the prompt.
    +          Must have a placeholder ('{}') for the text of the file.
    +        default: ''
    +      - name: questions_wrapper
    +        type: Union[str, List[str]]
    +        doc: A wrapper for the questions received. Will be added after the text wrapper
    +          in the prompt template. Must have a placeholder ('{}') for the questions.
    +        default: ''
    +      - name: generation_config
    +        type: Union[Dict, List[Dict]]
    +        doc: HuggingFace's `GenerationConfig` keyword arguments to pass to the `generate`
    +          method.
    +        default: null
    +      - name: questions_config
    +        type: Union[Dict, List[Dict]]
    +        doc: A dictionary or list of dictionaries containing specific ways to answer
    +          questions (using a poll for example), each dictionary in the list is for
    +          corresponding question group and determines the question asking method for
    +          said group.
    +        default: null
    +      - name: batch_size
    +        type: int
    +        doc: Batch size for inference.
    +        default: 1
    +      - name: questions_columns
    +        type: List[str]
    +        doc: Columns to use for the dataframe returned.
    +        default: null
    +      - name: verbose
    +        type: bool
    +        doc: 'Whether to present logs of a progress bar and errors. Default: True.'
    +        default: false
    +      has_kwargs: false
    +    answer:
    +      outputs:
    +      - type: List[List[str]]
    +      name: answer
    +      has_varargs: false
    +      doc: Answer questions with a context to the given text files contents by a pretrained
    +        LLM model in given pipeline.
    +      lineno: 674
    +      parameters:
    +      - name: self
    +      - name: questions_amount
    +        type: int
    +      - name: batched_input
    +        type: List[str]
    +      - name: generation_pipeline
    +        type: Pipeline
    +      - name: generation_config
    +        type: GenerationConfig
    +      has_kwargs: false
    +    most_common:
    +      name: most_common
    +      has_varargs: false
    +      doc: Calculate the most common answer for a given list of answers.
    +      lineno: 637
    +      parameters:
    +      - name: answers
    +      has_kwargs: false
    +    average:
    +      name: average
    +      has_varargs: false
    +      doc: Calculate the average answer for a given list of answers.
    +      lineno: 646
    +      parameters:
    +      - name: answers
    +      has_kwargs: false
    +    do:
    +      name: do
    +      has_varargs: false
    +      doc: Perform the strategy.
    +      lineno: 662
    +      parameters:
    +      - name: self
    +      - name: answers
    +      has_kwargs: false
    +  image: ''
    +  description: GenAI approach of question answering on a given data
    +  disable_auto_mount: false
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/static/item.html b/functions/master/question_answering/0.5.0/static/item.html new file mode 100644 index 00000000..64e45da7 --- /dev/null +++ b/functions/master/question_answering/0.5.0/static/item.html @@ -0,0 +1,62 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- genai
    +description: GenAI approach of question answering on a given data
    +doc: ''
    +example: question_answering.ipynb
    +generationDate: 2023-08-07:11-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: yonish
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.7.0
    +name: question_answering
    +platformVersion: 3.5.0
    +spec:
    +  filename: question_answering.py
    +  handler: answer_questions
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - transformers
    +    - torch
    +    - tqdm
    +url: ''
    +version: 0.5.0
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/static/question_answering.html b/functions/master/question_answering/0.5.0/static/question_answering.html new file mode 100644 index 00000000..79c5db68 --- /dev/null +++ b/functions/master/question_answering/0.5.0/static/question_answering.html @@ -0,0 +1,947 @@ + + + + + + + +question_answering.question_answering + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for question_answering.question_answering

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import enum
    +import logging
    +import operator
    +import pathlib
    +from collections import Counter
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import transformers
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
        """
        Probe for an MLRun context and, when running as an 'mpijob', for OpenMPI.

        When `mlrun` is importable, the module level `_LOGGER` is replaced with the MLRun context logger.

        :returns: A tuple of the MLRun context and `MPI.COMM_WORLD` when the context's "kind" label is "mpijob" and
                  `mpi4py` is importable. In every other non-raising case `(None, None)` is returned.

        :raises ModuleNotFoundError: If the run is labeled as an 'mpijob' but a module (e.g. `mpi4py`) is missing.
        """
        global _LOGGER

        running_as_mpijob = False
        try:
            import mlrun

            ctx = mlrun.get_or_create_ctx(name="mlrun")
            _LOGGER = ctx.logger
            running_as_mpijob = ctx.labels.get("kind", "job") == "mpijob"

            # Only an 'mpijob' needs the communicator:
            if not running_as_mpijob:
                return None, None

            try:
                from mpi4py import MPI

                return ctx, MPI.COMM_WORLD
            except ModuleNotFoundError as mpi4py_missing:
                ctx.logger.error(
                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
                )
                raise mpi4py_missing
        except ModuleNotFoundError as missing_module:
            # A missing module is only fatal when we were asked to run distributed:
            if running_as_mpijob:
                raise missing_module
        return None, None
    +
    +
    +
    def open_mpi_handler(
        worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    ):
        """
        Create a decorator that splits a handler's inputs between OpenMPI workers and gathers the results at rank 0.

        If no communicator is available, or it has a single worker, the decorated handler is returned untouched.

        :param worker_inputs:      Names of the keyword arguments to split between the workers. A string value is
                                   resolved to a list of files first; lists and dataframes are sliced per rank.
        :param root_worker_inputs: Keyword arguments to override only on the root worker (rank #0).

        :returns: The decorator. The wrapped handler returns, on rank #0, a tuple of the concatenated dataframes and
                  the merged errors dictionaries of all workers, and `None` on every other rank.
        """
        global _LOGGER

        # Check for MLRun and OpenMPI availability:
        context, comm = _check_mlrun_and_open_mpi()

        def decorator(handler):
            # Nothing to distribute without a communicator or with a single worker:
            if comm is None or comm.Get_size() == 1:
                return handler

            @wraps(handler)
            def wrapper(**kwargs):
                # Get the open mpi environment properties:
                world_size = comm.Get_size()
                rank = comm.Get_rank()
                is_last_rank = not (rank + 1 < world_size)

                # Hand each worker its own chunk of every distributed input:
                for input_name in worker_inputs:
                    value = kwargs[input_name]
                    if value is None:
                        continue
                    if isinstance(value, str):
                        value = _get_text_files(
                            data_path=pathlib.Path(value).absolute()
                        )
                    total = len(value)
                    if total < world_size:
                        raise ValueError(
                            f"Cannot split the input '{input_name}' of length {total} to {world_size} workers. "
                            f"Please reduce the amount of workers for this input."
                        )

                    # Even chunks, the last rank also takes the remainder:
                    chunk_size = total // world_size
                    start = rank * chunk_size
                    stop = total if is_last_rank else start + chunk_size
                    context.logger.info(
                        f"Rank #{rank}: Processing input chunk of '{input_name}' "
                        f"from index {start} to {stop}."
                    )
                    if isinstance(value, list):
                        value = value[start:stop]
                    elif isinstance(value, pd.DataFrame):
                        value = value.iloc[start:stop, :]
                    kwargs[input_name] = value

                # Set the root worker only arguments:
                if rank == 0 and root_worker_inputs:
                    kwargs.update(root_worker_inputs)

                # Run the worker and send its output to the root rank (rank #0):
                gathered = comm.gather(handler(**kwargs), root=0)
                if rank != 0:
                    return None

                # Join the outputs:
                context.logger.info("Collecting data from workers to root worker.")
                dataframe = pd.concat(objs=[df for df, _ in gathered], axis=0)
                errors_dictionary = {}
                for _, worker_errors in gathered:
                    errors_dictionary |= worker_errors
                return dataframe, errors_dictionary

            return wrapper

        return decorator
    + + + +
    @open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    def answer_questions(
        data_path: Union[str, List[str]],
        model_name: str,
        questions: Union[List[str], List[List[str]]],
        device_map: Union[str, dict] = None,
        model_kwargs: dict = None,
        auto_gptq_exllama_max_input_length: int = None,
        tokenizer_name: str = None,
        tokenizer_kwargs: dict = None,
        text_wrapper: Union[str, List[str]] = "",
        questions_wrapper: Union[str, List[str]] = "",
        generation_config: Union[Dict, List[Dict]] = None,
        questions_config: Union[Dict, List[Dict]] = None,
        batch_size: int = 1,
        questions_columns: List[str] = None,
        verbose: bool = False,
    ) -> Tuple[pd.DataFrame, dict]:
        """
        Answer questions with a context to the given text files contents by a pretrained LLM model. Each text file will have
        the following prompt built:

        start of `text_wrapper`
        <text file content>
        end of `text_wrapper`

        start of `questions_wrapper`
        1. <questions[0]>
        2. <questions[1]>
        ...
        n. <questions[n-1]>
        end of `questions_wrapper`

        :param data_path:                          A path to a directory of text files or a path to a text file to ask
                                                   questions about. A list of file paths is used as is.
        :param model_name:                         The pre-trained model name from the huggingface hub to use for asking
                                                   questions.
        :param questions:                          The questions to ask. Either a flat list of questions (a single
                                                   question group) or a list of lists of questions to ask per text file,
                                                   divided by question groups. The groups can be determined by size (in
                                                   order to avoid large inputs to the llm) or by questioning method
                                                   (regular or poll like questioning).
        :param device_map:                         A map to use for loading the model on multiple devices.
        :param model_kwargs:                       Keyword arguments to pass for loading the model using HuggingFace's
                                                   `transformers.AutoModelForCausalLM.from_pretrained` function.
        :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size.
        :param tokenizer_name:                     The tokenizer name from the huggingface hub to use. If not given, the
                                                   model name will be used.
        :param tokenizer_kwargs:                   Keyword arguments to pass for loading the tokenizer using HuggingFace's
                                                   `transformers.AutoTokenizer.from_pretrained` function.
        :param text_wrapper:                       A wrapper for the file's text. Will be added at the start of the prompt.
                                                   Must have a placeholder ('{}') for the text of the file.
        :param questions_wrapper:                  A wrapper for the questions received. Will be added after the text
                                                   wrapper in the prompt template. Must have a placeholder ('{}') for the
                                                   questions.
        :param generation_config:                  HuggingFace's `GenerationConfig` keyword arguments to pass to the
                                                   `generate` method.
        :param questions_config:                   A dictionary or list of dictionaries containing specific ways to answer
                                                   questions (using a poll for example), each dictionary in the list is for
                                                   corresponding question group and determines the question asking method
                                                   for said group. The given dictionaries are not modified.
        :param batch_size:                         Batch size for inference.
        :param questions_columns:                  Columns to use for the dataframe returned.
        :param verbose:                            Whether to present logs of a progress bar and errors. Default: False
                                                   (when distributed with OpenMPI, the root worker is set to True).

        :returns: A tuple of:

                  * A dataframe dataset of the questions answers.
                  * A dictionary of errored files that were not inferred or were not answered properly.

        :raises ValueError: If no questions were given, if a per-group argument does not match the number of question
                            groups, or if `questions_columns` does not match the total amount of questions.
        """
        global _LOGGER

        # Set configs to empty dict if not given:
        if generation_config is None:
            generation_config = {}
        if questions_config is None:
            questions_config = {}

        # Get the input text files to question:
        if verbose:
            _LOGGER.info("Collecting text files.")
        if isinstance(data_path, str):
            data_path = pathlib.Path(data_path).absolute()
            text_files = _get_text_files(data_path=data_path)
        else:
            # A list may hold plain strings, while `file.name` is used later on - normalize to paths:
            text_files = [pathlib.Path(file) for file in data_path]
        if verbose:
            _LOGGER.info(f"Collected {len(text_files)} text files.")

        # Get the prompt template:
        if verbose:
            _LOGGER.info("Creating prompt template.")

        # Organize questions as a list of lists (question groups). A flat list of questions is a single group and
        # must be wrapped here - passing it through `_to_group_list` would reject any flat list longer than one:
        if not questions:
            raise ValueError("Please include at least one question.")
        if isinstance(questions[0], str):
            questions = [questions]
        number_of_question_groups = len(questions)

        # Organize prompt parts at proper length
        text_wrapper = _to_group_list(
            argument_value=text_wrapper,
            argument_name="text_wrapper",
            length=number_of_question_groups,
        )
        questions_wrapper = _to_group_list(
            argument_value=questions_wrapper,
            argument_name="questions_wrapper",
            length=number_of_question_groups,
        )

        # Build a prompt per question group:
        prompt_template = [
            _get_prompt_template(
                text_wrapper=text_wrapper[i],
                questions_wrapper=questions_wrapper[i],
                questions=questions[i],
            )
            for i in range(number_of_question_groups)
        ]
        if verbose:
            _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n")

        # Get the total amount of questions:
        questions_amount = sum(len(sublist) for sublist in questions)

        # Get the questions columns:
        questions_columns = questions_columns or [
            f"q{i}" for i in range(1, questions_amount + 1)
        ]

        # Check if we have the correct amount of questions columns:
        if len(questions_columns) != questions_amount:
            raise ValueError(
                f"The provided questions columns length ({len(questions_columns)}) "
                f"does not match the questions amount ({questions_amount})"
            )

        # Load the generation config:
        if verbose:
            _LOGGER.info("Loading generation configuration.")
        generation_config = [
            transformers.GenerationConfig(**(cfg or {}))
            for cfg in _to_group_list(
                argument_value=generation_config,
                argument_name="generation_config",
                length=number_of_question_groups,
            )
        ]
        if verbose:
            _LOGGER.info(f"Generation configuration loaded: {generation_config}")

        # Load the model and tokenizer into a pipeline object:
        if verbose:
            _LOGGER.info(f"Loading model '{model_name}'.")
        generation_pipeline = _get_generation_pipeline(
            model_name=model_name,
            device_map=device_map,
            tokenizer_name=tokenizer_name or model_name,
            model_kwargs=model_kwargs or {},
            tokenizer_kwargs=tokenizer_kwargs or {},
            auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length,
            batch_size=batch_size,
        )
        if verbose:
            _LOGGER.info("Model loaded.")

        # Prepare the successes dataframe and errors dictionary to be returned:
        successes = []
        errors = {}

        # Split the files into batches (slicing past the end simply yields a shorter last batch):
        file_batches = [
            text_files[i : i + batch_size]
            for i in range(0, len(text_files), batch_size)
        ]
        questions_config = _to_group_list(
            argument_value=questions_config,
            argument_name="questions_config",
            length=number_of_question_groups,
        )

        # Create a list of question handlers according to given configs. Each config is copied before popping its
        # type: the same dictionary may be shared between all groups (and belongs to the caller), so popping from it
        # directly would strip the type for every group after the first one:
        handlers = []
        for cfg in questions_config:
            cfg = dict(cfg or {})
            question_type = cfg.pop("type", "default")
            handlers.append(QUESTION_MAPPING.get(question_type)(**cfg))

        # Go over the batches of text files and question them:
        for file_batch in tqdm(
            file_batches,
            desc="Generating answers",
            unit=f"file (batch of {batch_size})",
            disable=not verbose,
        ):
            try:
                # The last batch may be smaller than `batch_size`, so size by the actual batch:
                total_answers = [[] for _ in file_batch]

                # Go over all question group per batch of documents
                for question_group in range(number_of_question_groups):
                    current_questions_amount = len(questions[question_group])

                    # Read batch (read the text from the text files):
                    batched_input = _read_file_batch(
                        file_batch=file_batch,
                        prompt_template=prompt_template[question_group],
                    )

                    # Answer the questions with each question handler:
                    batched_answers = handlers[question_group].answer(
                        questions_amount=current_questions_amount,
                        batched_input=batched_input,
                        generation_pipeline=generation_pipeline,
                        generation_config=generation_config[question_group],
                    )

                    # Put the answers in the correct place in the total answers list according to the place in the batch:
                    for i in range(len(file_batch)):
                        total_answers[i].extend(batched_answers[i])

                # Collect the answers and attach the file name:
                successes.extend(
                    [
                        [file.name, *answers]
                        for file, answers in zip(file_batch, total_answers)
                    ]
                )
            except Exception as exception:
                # Note the exception as error in the dictionary:
                batch_file_names = ", ".join([file.name for file in file_batch])
                if verbose:
                    _LOGGER.warning(
                        f"Error in batch '{batch_file_names}': {str(exception)}"
                    )
                errors[batch_file_names] = str(exception)
                continue

        # Construct the answers dataframe:
        columns = [
            "text_file",
            *questions_columns,
        ]

        # Create a data frame of answers by files
        successes = pd.DataFrame(
            successes,
            columns=columns,
        )

        # Print the head of the produced dataframe and return:
        if verbose:
            _LOGGER.info(
                f"Done ({successes.shape[0]}/{len(text_files)})\n"
                f"Answers summary:\n"
                f"{successes.head()}"
            )
        return successes, errors
    + + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return text_files + + +def _get_prompt_template( + text_wrapper: str, + questions_wrapper: str, + questions: List[str], +) -> str: + + # Validate and build the text wrapper: + text_wrapper = text_wrapper or ( + "Given the following text:\n" "-----\n" "{}\n" "-----" + ) + if text_wrapper.count("{}") != 1: + raise ValueError( + "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about." + ) + + # Validate and build the question wrapper: + questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}" + if questions_wrapper.count("{}") != 1: + raise ValueError( + "The `questions_wrapper` must include one placeholder '{}' for the list of questions." + ) + + # Validate and parse the questions: + if len(questions) == 0: + raise ValueError("Please include at least one question.") + questions = "\n".join( + [f"{i}. {question}" for i, question in enumerate(questions, 1)] + ) + + # Construct the template: + return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n" + + +def _get_generation_pipeline( + model_name: str, + device_map: Union[str, dict], + tokenizer_name: str, + model_kwargs: dict, + tokenizer_kwargs: dict, + auto_gptq_exllama_max_input_length: int = None, + batch_size: int = 1, +): + # Load the model: + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name, device_map=device_map, **model_kwargs + ) + + # Set exllama max input length if provided: + # This changes the model's context size. 
+ if auto_gptq_exllama_max_input_length: + from auto_gptq import exllama_set_max_input_length + + model = exllama_set_max_input_length( + model=model, max_input_length=auto_gptq_exllama_max_input_length + ) + + # Load the tokenizer: + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, **tokenizer_kwargs + ) + + # Initialize a generation pipline and return: + pipe = transformers.pipeline( + task="text-generation", + model=model, + tokenizer=tokenizer, + batch_size=batch_size, + ) + pipe.tokenizer.pad_token_id = model.config.eos_token_id + return pipe + + +def _read_file_batch( + file_batch: List[pathlib.Path], + prompt_template: str, +) -> List[str]: + batch = [] + + # Go over all files and read in usable format + for file in file_batch: + with open(file, "r", encoding="utf-8") as fp: + batch.append(prompt_template.format(fp.read())) + return batch + + +def _to_group_list(argument_value: list, argument_name: str, length: int): + + # Check if is list, turn to list if not + argument_value = ( + argument_value if isinstance(argument_value, list) else [argument_value] + ) + list_len = len(argument_value) + + # If not a list, or is a list of len 1 we duplicate for correct length + # If list in wrong length throw an error + if list_len != length: + if list_len == 1: + return argument_value * length + raise ValueError( + f"The argument value of '{argument_name}' is not equal to the length of the given questions - {length}" + ) + return argument_value + + +
    +[docs] +class QuestionHandler: + """ + A class for handling questions answering for a given question type. + This class is used as a base class for all question types, and for default question type (regular question + answering without any special handling). + """ + +
    +[docs] + class ConfigKeys: + pass
    + + + def __init__(self): + pass + + @staticmethod + def _get_answers(generated_text: str, questions_amount: int) -> List[str]: + + # Clear answer start (part before numbers): + # TODO find better way to verify, for list of questions this is redundant for example + if "1." not in generated_text: + raise ValueError( + f"Answer 1. is missing from the generated text: '{generated_text}'" + ) + text = generated_text.split("1.", 1)[1] + + # Start extracting the answers: + answers = [] + for i in range(1, questions_amount + 1): + # If it's the last answer to look for, take the rest of the text: + if i == questions_amount: + answer_i = text + # Verify there is a question number in the text: + elif f"{i + 1}." not in text: + raise ValueError( + f"Answer {i + 1}. is missing from the generated text: '{generated_text}'" + ) + # Take i's answer: + else: + answer_i, text = text.split(f"{i + 1}.", 1) + # Collect the answer removing redundant spaces: + answers.append(answer_i.strip()) + + return answers + + def _infer_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + + # Infer through the llm: + batched_output = generation_pipeline( + batched_input, + generation_config=generation_config, + eos_token_id=generation_pipeline.tokenizer.eos_token_id, + return_full_text=False, + num_return_sequences=1, + ) + + # Process the outputs to get the answers: + batched_answers = [] + for output in batched_output: + # Get the generated answers: + answers = self._get_answers( + generated_text=output[0]["generated_text"], + questions_amount=questions_amount, + ) + # Collect the processed answers: + batched_answers.append(answers) + return batched_answers + +
    +[docs] + def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + )
    +
    + + + +
    +[docs] +class PollQuestionHandler(QuestionHandler): + """ + Static class to hold all the possible poll question configurations options keys + """ + +
    +[docs] + class ConfigKeys: + """ + A class for handling questions answering for poll type questions. + These type of question are answered by asking the same question multiple times + and choosing the most common answer or the average answer. + """ + + #: The number of times to ask the same question. + POLL_COUNT = "poll_count" + + #: The strategy to use for choosing the answer from the poll. + POLL_STRATEGY = "poll_strategy"
    + + +
    +[docs] + class Strategy(enum.Enum): + #: The most common answer strategy. + MOST_COMMON = "most_common" + + #: The average answer strategy. + AVERAGE = "average" + +
    +[docs] + @staticmethod + def most_common(answers): + """ + Calculate the most common answer for a given list of answers. + """ + count = Counter(answers) + most_common = count.most_common(1) + return most_common[0][0]
    + + +
    +[docs] + @staticmethod + def average(answers): + """ + Calculate the average answer for a given list of answers. + """ + if isinstance(answers[0], str): + raise ValueError( + "Cannot perform poll with average answer strategy of non numeric values," + " please change the question to give numeric data, or choose 'most_common' as strategy." + ) + else: + numeric_values = answers + avg = sum(numeric_values) / len(numeric_values) + + # Round to the closest integer and return corresponding value + return round(avg)
    + + +
    +[docs] + def do(self, answers): + """ + Perform the strategy. + """ + return getattr(self, self.value)(answers)
    +
    + + + def __init__( + self, poll_count: int = 5, poll_strategy: str = "most_common"): + super().__init__() + self.poll_count = poll_count + self.poll_strategy = self.Strategy(poll_strategy) + +
    +[docs] + def answer( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + """ + Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + """ + return self._answer_poll_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + )
    + + + def _answer_poll_questions( + self, + questions_amount: int, + batched_input: List[str], + generation_pipeline: transformers.Pipeline, + generation_config: transformers.GenerationConfig, + ) -> List[List[str]]: + votes = [] + + # Run the poll for each question + for _ in range(self.poll_count): + batched_answers = self._infer_questions( + questions_amount=questions_amount, + batched_input=batched_input, + generation_pipeline=generation_pipeline, + generation_config=generation_config, + ) + votes.append(batched_answers) + answers = [] + + # Collect the answers according to the poll strategy + # Average strategy works for numeric values only + for batch in range(len(votes[0])): + batched_answers = [] + for question in range(questions_amount): + # Create a list of all answers to relevant question + answer = [ + votes[voter][batch][question] for voter in range(self.poll_count) + ] + answer = self.poll_strategy.do(answer) + batched_answers.append(answer) + answers.append(batched_answers) + return answers
    + + + +# Holds names of QuestionHandles +
    +[docs] +class QuestionTypes: + DEFAULT = "default" + POLL = "poll"
    + + + +# Maps question types to their handlers +QUESTION_MAPPING = { + QuestionTypes.DEFAULT: QuestionHandler, + QuestionTypes.POLL: PollQuestionHandler, +} +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/question_answering/0.5.0/static/source.html b/functions/master/question_answering/0.5.0/static/source.html new file mode 100644 index 00000000..40c2e482 --- /dev/null +++ b/functions/master/question_answering/0.5.0/static/source.html @@ -0,0 +1,771 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import enum
    +import logging
    +import operator
    +import pathlib
    +from collections import Counter
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import transformers
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    global _LOGGER
    +
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        _LOGGER = context.logger
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, str):
    +                    input_argument = _get_text_files(
    +                        data_path=pathlib.Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
    +                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +            if rank == 0:
    +                # Join the outputs:
    +                context.logger.info("Collecting data from workers to root worker.")
    +                dataframe = pd.concat(objs=[df for df, _ in output], axis=0)
    +                errors_dictionary = reduce(operator.ior, [err for _, err in output], {})
    +                return dataframe, errors_dictionary
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def answer_questions(
    +    data_path: Union[str, List[str]],
    +    model_name: str,
    +    questions: Union[List[str], List[List[str]]],
    +    device_map: Union[str, dict] = None,
    +    model_kwargs: dict = None,
    +    auto_gptq_exllama_max_input_length: int = None,
    +    tokenizer_name: str = None,
    +    tokenizer_kwargs: dict = None,
    +    text_wrapper: Union[str, List[str]] = "",
    +    questions_wrapper: Union[str, List[str]] = "",
    +    generation_config: Union[Dict, List[Dict]] = None,
    +    questions_config: Union[Dict, List[Dict]] = None,
    +    batch_size: int = 1,
    +    questions_columns: List[str] = None,
    +    verbose: bool = False,
    +) -> Tuple[pd.DataFrame, dict]:
    +    """
    +    Answer questions with a context to the given text files contents by a pretrained LLM model. Each text file will have
    +    the following prompt built:
    +
    +    start of `text_wrapper`
    +    
    +    end of `text_wrapper`
    +
    +    start of `questions_wrapper`
    +    1. 
    +    2. 
    +    ...
    +    n. 
    +    end of `questions_wrapper`
    +
    +    :param data_path:                          A path to a directory of text files or a path to a text file to ask
    +                                               questions about.
    +    :param model_name:                         The pre-trained model name from the huggingface hub to use for asking
    +                                               questions.
    +    :param questions:                          The questions to ask.
    +                                               A list of lists of questions to ask per text file, and devided
    +                                               by question groups, the groups can be dtermained by size (in order to
    +                                               avoid large inputs to the llm) or by questioning method
    +                                               (regular or poll like questioning).
    +    :param device_map:                         A map to use for loading the model on multiple devices.
    +    :param model_kwargs:                       Keyword arguments to pass for loading the model using HuggingFace's
    +                                               `transformers.AutoModelForCausalLM.from_pretrained` function.
    +    :param auto_gptq_exllama_max_input_length: For AutoGPTQ models to set and extend the model's input buffer size.
    +    :param tokenizer_name:                     The tokenizer name from the huggingface hub to use. If not given, the
    +                                               model name will be used.
    +    :param tokenizer_kwargs:                   Keyword arguments to pass for loading the tokenizer using HuggingFace's
    +                                               `transformers.AutoTokenizer.from_pretrained` function.
    +    :param text_wrapper:                       A wrapper for the file's text. Will be added at the start of the prompt.
    +                                               Must have a placeholder ('{}') for the text of the file.
    +    :param questions_wrapper:                  A wrapper for the questions received. Will be added after the text
    +                                               wrapper in the prompt template. Must have a placeholder ('{}') for the
    +                                               questions.
    +    :param generation_config:                  HuggingFace's `GenerationConfig` keyword arguments to pass to the
    +                                               `generate` method.
    +    :param questions_config:                   A dictionary or list of dictionaries containing specific ways to answer
    +                                               questions (using a poll for example), each dictionary in the list is for
    +                                               corresponding question group and determines the question asking method
    +                                               for said group.
    +    :param batch_size:                         Batch size for inference.
    +    :param questions_columns:                  Columns to use for the dataframe returned.
    +    :param verbose:                            Whether to present logs of a progress bar and errors. Default: True.
    +
    +
    +    :returns: A tuple of:
    +
    +              * A dataframe dataset of the questions answers.
    +              * A dictionary of errored files that were not inferred or were not answered properly.
    +    """
    +    global _LOGGER
    +
    +    # Set configs to empty dict if not given:
    +    if generation_config is None:
    +        generation_config = {}
    +    if questions_config is None:
    +        questions_config = {}
    +
    +    # Get the input text files to question:
    +    if verbose:
    +        _LOGGER.info("Collecting text files.")
    +    if isinstance(data_path, str):
    +        data_path = pathlib.Path(data_path).absolute()
    +        text_files = _get_text_files(data_path=data_path)
    +    else:
    +        text_files = data_path
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(text_files)} text files.")
    +
    +    # Get the prompt template:
    +    if verbose:
    +        _LOGGER.info("Creating prompt template.")
    +
    +    # Organize questions as a list of list, and count number of sub-lists for future use
    +    number_of_question_groups = 1 if isinstance(questions[0], str) else len(questions)
    +    questions = _to_group_list(
    +        argument_value=questions,
    +        argument_name="questions",
    +        length=number_of_question_groups,
    +    )
    +
    +    # Organize prompt parts at proper length
    +    text_wrapper = _to_group_list(
    +        argument_value=text_wrapper,
    +        argument_name="text_wrapper",
    +        length=number_of_question_groups,
    +    )
    +    questions_wrapper = _to_group_list(
    +        argument_value=questions_wrapper,
    +        argument_name="questions_wrapper",
    +        length=number_of_question_groups,
    +    )
    +
    +    # Create a list of prompt according to given parts and questions
    +    prompt_template = []
    +    questions = questions if isinstance(questions[0], list) else [questions]
    +
    +    # Build all prompts
    +    for i in range(number_of_question_groups):
    +        prompt_template.append(
    +            _get_prompt_template(
    +                text_wrapper=text_wrapper[i],
    +                questions_wrapper=questions_wrapper[i],
    +                questions=questions[i],
    +            )
    +        )
    +    if verbose:
    +        _LOGGER.info(f"Prompt template created:\n\n{prompt_template}\n")
    +
    +    # Get the total amount of questions:
    +    questions_amount = sum([len(sublist) for sublist in questions])
    +
    +    # Get the questions columns:
    +    questions_columns = questions_columns or [
    +        f"q{i}" for i in range(1, questions_amount + 1)
    +    ]
    +
    +    # Check if we have the correct amount of questions columns:
    +    if len(questions_columns) != questions_amount:
    +        raise ValueError(
    +            f"The provided questions columns length ({len(questions_columns)}) "
    +            f"does not match the questions amount ({questions_amount})"
    +        )
    +
    +    # Load the generation config:
    +    if verbose:
    +        _LOGGER.info("Loading generation configuration.")
    +    generation_config = [
    +        transformers.GenerationConfig(**(cfg or {}))
    +        for cfg in _to_group_list(
    +            argument_value=generation_config,
    +            argument_name="generation_config",
    +            length=number_of_question_groups,
    +        )
    +    ]
    +    if verbose:
    +        _LOGGER.info(f"Generation configuration loaded: {generation_config}")
    +
    +    # Load the model and tokenizer into a pipeline object:
    +    if verbose:
    +        _LOGGER.info(f"Loading model '{model_name}'.")
    +    generation_pipeline = _get_generation_pipeline(
    +        model_name=model_name,
    +        device_map=device_map,
    +        tokenizer_name=tokenizer_name or model_name,
    +        model_kwargs=model_kwargs or {},
    +        tokenizer_kwargs=tokenizer_kwargs or {},
    +        auto_gptq_exllama_max_input_length=auto_gptq_exllama_max_input_length,
    +        batch_size=batch_size,
    +    )
    +    if verbose:
    +        _LOGGER.info("Model loaded.")
    +
    +    # Prepare the successes dataframe and errors dictionary to be returned:
    +    successes = []
    +    errors = {}
    +
    +    # Split the files into batches:
    +    file_batches = [
    +        text_files[i : i + batch_size]
    +        if i + batch_size < len(text_files)
    +        else text_files[i:]
    +        for i in range(0, len(text_files), batch_size)
    +    ]
    +    questions_config = _to_group_list(
    +        argument_value=questions_config,
    +        argument_name="questions_config",
    +        length=number_of_question_groups,
    +    )
    +
    +    # Create a list of question handlers according to given configs
    +    handlers = []
    +    for cfg in questions_config:
    +        question_type = cfg.pop("type", "default")
    +        handlers.append(QUESTION_MAPPING.get(question_type)(**cfg))
    +
    +    # Go over the batches of text files and question them:
    +    for file_batch in tqdm(
    +        file_batches,
    +        desc="Generating answers",
    +        unit=f"file (batch of {batch_size})",
    +        disable=not verbose,
    +    ):
    +        try:
    +            total_answers = [[] for _ in range(batch_size)]
    +
    +            # Go over all question group per batch of documents
    +            for question_group in range(number_of_question_groups):
    +                current_questions_amount = len(questions[question_group])
    +
    +                # Read batch (read the text from the text files):
    +                batched_input = _read_file_batch(
    +                    file_batch=file_batch,
    +                    prompt_template=prompt_template[question_group],
    +                )
    +
    +                # Answer the questions with each question handler:
    +                batched_answers = handlers[question_group].answer(
    +                    questions_amount=current_questions_amount,
    +                    batched_input=batched_input,
    +                    generation_pipeline=generation_pipeline,
    +                    generation_config=generation_config[question_group],
    +                )
    +
    +                # Put the answers in the correct place in the total answers list according to the place in the batch:
    +                for i in range(batch_size):
    +                    total_answers[i].extend(batched_answers[i])
    +
    +            # Collect the answers and attach the file name:
    +            successes.extend(
    +                [
    +                    [file.name, *answers]
    +                    for file, answers in zip(file_batch, total_answers)
    +                ]
    +            )
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            batch_file_names = ", ".join([file.name for file in file_batch])
    +            if verbose:
    +                _LOGGER.warning(
    +                    f"Error in batch '{batch_file_names}': {str(exception)}"
    +                )
    +            errors[batch_file_names] = str(exception)
    +            continue
    +
    +    # Construct the answers dataframe:
    +    columns = [
    +        "text_file",
    +        *questions_columns,
    +    ]
    +
    +    # Create a data frame of answers by files
    +    successes = pd.DataFrame(
    +        successes,
    +        columns=columns,
    +    )
    +
    +    # Print the head of the produced dataframe and return:
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(text_files)})\n"
    +            f"Answers summary:\n"
    +            f"{successes.head()}"
    +        )
    +    return successes, errors
    +
    +
    +def _get_text_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +
    +        # Get all files inside the directory:
    +        text_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        text_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return text_files
    +
    +
    +def _get_prompt_template(
    +    text_wrapper: str,
    +    questions_wrapper: str,
    +    questions: List[str],
    +) -> str:
    +
    +    # Validate and build the text wrapper:
    +    text_wrapper = text_wrapper or (
    +        "Given the following text:\n" "-----\n" "{}\n" "-----"
    +    )
    +    if text_wrapper.count("{}") != 1:
    +        raise ValueError(
    +            "The `text_wrapper` must include one placeholder '{}' for the text of the file to be asked about."
    +        )
    +
    +    # Validate and build the question wrapper:
    +    questions_wrapper = questions_wrapper or "Answer the questions:\n" "{}"
    +    if questions_wrapper.count("{}") != 1:
    +        raise ValueError(
    +            "The `questions_wrapper` must include one placeholder '{}' for the list of questions."
    +        )
    +
    +    # Validate and parse the questions:
    +    if len(questions) == 0:
    +        raise ValueError("Please include at least one question.")
    +    questions = "\n".join(
    +        [f"{i}. {question}" for i, question in enumerate(questions, 1)]
    +    )
    +
    +    # Construct the template:
    +    return f"{text_wrapper}\n{questions_wrapper.format(questions)}\n"
    +
    +
    +def _get_generation_pipeline(
    +    model_name: str,
    +    device_map: Union[str, dict],
    +    tokenizer_name: str,
    +    model_kwargs: dict,
    +    tokenizer_kwargs: dict,
    +    auto_gptq_exllama_max_input_length: int = None,
    +    batch_size: int = 1,
    +):
    +    # Load the model:
    +    model = transformers.AutoModelForCausalLM.from_pretrained(
    +        model_name, device_map=device_map, **model_kwargs
    +    )
    +
    +    # Set exllama max input length if provided:
    +    # This changes the model's context size.
    +    if auto_gptq_exllama_max_input_length:
    +        from auto_gptq import exllama_set_max_input_length
    +
    +        model = exllama_set_max_input_length(
    +            model=model, max_input_length=auto_gptq_exllama_max_input_length
    +        )
    +
    +    # Load the tokenizer:
    +    tokenizer = transformers.AutoTokenizer.from_pretrained(
    +        tokenizer_name, **tokenizer_kwargs
    +    )
    +
    +    # Initialize a generation pipline and return:
    +    pipe = transformers.pipeline(
    +        task="text-generation",
    +        model=model,
    +        tokenizer=tokenizer,
    +        batch_size=batch_size,
    +    )
    +    pipe.tokenizer.pad_token_id = model.config.eos_token_id
    +    return pipe
    +
    +
    +def _read_file_batch(
    +    file_batch: List[pathlib.Path],
    +    prompt_template: str,
    +) -> List[str]:
    +    batch = []
    +
    +    # Go over all files and read in usable format
    +    for file in file_batch:
    +        with open(file, "r", encoding="utf-8") as fp:
    +            batch.append(prompt_template.format(fp.read()))
    +    return batch
    +
    +
    +def _to_group_list(argument_value: list, argument_name: str, length: int):
    +
    +    # Check if is list, turn to list if not
    +    argument_value = (
    +        argument_value if isinstance(argument_value, list) else [argument_value]
    +    )
    +    list_len = len(argument_value)
    +
    +    # If not a list, or is a list of len 1 we duplicate for correct length
    +    # If list in wrong length throw an error
    +    if list_len != length:
    +        if list_len == 1:
    +            return argument_value * length
    +        raise ValueError(
    +            f"The argument value of '{argument_name}' is not equal to the length of the given questions - {length}"
    +        )
    +    return argument_value
    +
    +
    +class QuestionHandler:
    +    """
    +    A class for handling questions answering for a given question type.
    +    This class is used as a base class for all question types, and for default question type (regular question
    +    answering without any special handling).
    +    """
    +
    +    class ConfigKeys:
    +        pass
    +
    +    def __init__(self):
    +        pass
    +
    +    @staticmethod
    +    def _get_answers(generated_text: str, questions_amount: int) -> List[str]:
    +
    +        # Clear answer start (part before numbers):
    +        # TODO find better way to verify, for list of questions this is redundant for example
    +        if "1." not in generated_text:
    +            raise ValueError(
    +                f"Answer 1. is missing from the generated text: '{generated_text}'"
    +            )
    +        text = generated_text.split("1.", 1)[1]
    +
    +        # Start extracting the answers:
    +        answers = []
    +        for i in range(1, questions_amount + 1):
    +            # If it's the last answer to look for, take the rest of the text:
    +            if i == questions_amount:
    +                answer_i = text
    +            # Verify there is a question number in the text:
    +            elif f"{i + 1}." not in text:
    +                raise ValueError(
    +                    f"Answer {i + 1}. is missing from the generated text: '{generated_text}'"
    +                )
    +            # Take i's answer:
    +            else:
    +                answer_i, text = text.split(f"{i + 1}.", 1)
    +            # Collect the answer removing redundant spaces:
    +            answers.append(answer_i.strip())
    +
    +        return answers
    +
    +    def _infer_questions(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +
    +        # Infer through the llm:
    +        batched_output = generation_pipeline(
    +            batched_input,
    +            generation_config=generation_config,
    +            eos_token_id=generation_pipeline.tokenizer.eos_token_id,
    +            return_full_text=False,
    +            num_return_sequences=1,
    +        )
    +
    +        # Process the outputs to get the answers:
    +        batched_answers = []
    +        for output in batched_output:
    +            # Get the generated answers:
    +            answers = self._get_answers(
    +                generated_text=output[0]["generated_text"],
    +                questions_amount=questions_amount,
    +            )
    +            # Collect the processed answers:
    +            batched_answers.append(answers)
    +        return batched_answers
    +
    +    def answer(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +        """
    +        Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline.
    +        """
    +        return self._infer_questions(
    +            questions_amount=questions_amount,
    +            batched_input=batched_input,
    +            generation_pipeline=generation_pipeline,
    +            generation_config=generation_config,
    +        )
    +
    +
    +class PollQuestionHandler(QuestionHandler):
    +    """
    +    Static class to hold all the possible poll question configurations options keys
    +    """
    +
    +    class ConfigKeys:
    +        """
    +        A class for handling questions answering for poll type questions.
    +        These type of question are answered by asking the same question multiple times
    +        and choosing the most common answer or the average answer.
    +        """
    +
    +        #: The number of times to ask the same question.
    +        POLL_COUNT = "poll_count"
    +
    +        #: The strategy to use for choosing the answer from the poll.
    +        POLL_STRATEGY = "poll_strategy"
    +
    +    class Strategy(enum.Enum):
    +        #: The most common answer strategy.
    +        MOST_COMMON = "most_common"
    +
    +        #: The average answer strategy.
    +        AVERAGE = "average"
    +
    +        @staticmethod
    +        def most_common(answers):
    +            """
    +            Calculate the most common answer for a given list of answers.
    +            """
    +            count = Counter(answers)
    +            most_common = count.most_common(1)
    +            return most_common[0][0]
    +
    +        @staticmethod
    +        def average(answers):
    +            """
    +            Calculate the average answer for a given list of answers.
    +            """
    +            if isinstance(answers[0], str):
    +                raise ValueError(
    +                    "Cannot perform poll with average answer strategy of non numeric values,"
    +                    " please change the question to give numeric data, or choose 'most_common' as strategy."
    +                )
    +            else:
    +                numeric_values = answers
    +            avg = sum(numeric_values) / len(numeric_values)
    +
    +            # Round to the closest integer and return corresponding value
    +            return round(avg)
    +
    +        def do(self, answers):
    +            """
    +            Perform the strategy.
    +            """
    +            return getattr(self, self.value)(answers)
    +
    +    def __init__(
    +        self, poll_count: int = 5, poll_strategy: str = "most_common"):
    +        super().__init__()
    +        self.poll_count = poll_count
    +        self.poll_strategy = self.Strategy(poll_strategy)
    +
    +    def answer(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +        """
    +        Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline.
    +        """
    +        return self._answer_poll_questions(
    +            questions_amount=questions_amount,
    +            batched_input=batched_input,
    +            generation_pipeline=generation_pipeline,
    +            generation_config=generation_config,
    +        )
    +
    +    def _answer_poll_questions(
    +        self,
    +        questions_amount: int,
    +        batched_input: List[str],
    +        generation_pipeline: transformers.Pipeline,
    +        generation_config: transformers.GenerationConfig,
    +    ) -> List[List[str]]:
    +        votes = []
    +
    +        # Run the poll for each question
    +        for _ in range(self.poll_count):
    +            batched_answers = self._infer_questions(
    +                questions_amount=questions_amount,
    +                batched_input=batched_input,
    +                generation_pipeline=generation_pipeline,
    +                generation_config=generation_config,
    +            )
    +            votes.append(batched_answers)
    +        answers = []
    +
    +        # Collect the answers according to the poll strategy
    +        # Average strategy works for numeric values only
    +        for batch in range(len(votes[0])):
    +            batched_answers = []
    +            for question in range(questions_amount):
    +                # Create a list of all answers to relevant question
    +                answer = [
    +                    votes[voter][batch][question] for voter in range(self.poll_count)
    +                ]
    +                answer = self.poll_strategy.do(answer)
    +                batched_answers.append(answer)
    +            answers.append(batched_answers)
    +        return answers
    +
    +
    +# Holds names of QuestionHandlers
    +class QuestionTypes:
    +    # The names of the question types; each one is a key of `QUESTION_MAPPING`.
    +
    +    #: The default question type, mapped to `QuestionHandler`.
    +    DEFAULT = "default"
    +    #: The poll question type, mapped to `PollQuestionHandler`.
    +    POLL = "poll"
    +
    +
    +# Maps question types (the `QuestionTypes` names) to their handler classes - the values are the classes
    +# themselves, not instances:
    +QUESTION_MAPPING = {
    +    QuestionTypes.DEFAULT: QuestionHandler,
    +    QuestionTypes.POLL: PollQuestionHandler,
    +}
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/question_answering/latest/src/function.yaml b/functions/master/question_answering/latest/src/function.yaml index 7491b17e..21f741aa 100644 --- a/functions/master/question_answering/latest/src/function.yaml +++ b/functions/master/question_answering/latest/src/function.yaml @@ -1,62 +1,55 @@ -kind: job metadata: name: question-answering tag: '' - hash: aed62db95f17576c69b457767e3595c2de1d5465 - project: '' - labels: - author: yonish categories: - genai - - huggingface - - machine-learning +verbose: false +kind: job spec: command: '' - args: [] - image: '' + default_handler: answer_questions build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' origin_filename: '' + base_image: mlrun/mlrun requirements: - transformers - torch - tqdm + code_origin: '' + functionSourceCode:  entry_points: open_mpi_handler: name: open_mpi_handler + has_varargs: false doc: '' + lineno: 58 parameters: - name: worker_inputs type: List[str] - name: root_worker_inputs type: Dict[str, Any] default: null - outputs: [] - lineno: 58 - has_varargs: false has_kwargs: false decorator: name: decorator + has_varargs: false doc: '' + lineno: 66 parameters: - name: handler - outputs: [] - lineno: 66 - has_varargs: false has_kwargs: false wrapper: name: wrapper + has_varargs: false doc: '' - parameters: [] - outputs: [] lineno: 71 - has_varargs: false has_kwargs: true answer_questions: + outputs: + - doc: 'A tuple of:' + type: Tuple[pd.DataFrame, dict] name: answer_questions + has_varargs: false doc: 'Answer questions with a context to the given text files contents by a pretrained LLM model. Each text file will have @@ -81,6 +74,7 @@ spec: n. end of `questions_wrapper`' + lineno: 130 parameters: - name: data_path type: Union[str, List[str]] @@ -153,16 +147,15 @@ spec: type: bool doc: 'Whether to present logs of a progress bar and errors. Default: True.' 
default: false - outputs: - - doc: 'A tuple of:' - type: Tuple[pd.DataFrame, dict] - lineno: 130 - has_varargs: false has_kwargs: false answer: + outputs: + - type: List[List[str]] name: answer + has_varargs: false doc: Answer questions with a context to the given text files contents by a pretrained LLM model in given pipeline. + lineno: 674 parameters: - name: self - name: questions_amount @@ -173,47 +166,32 @@ spec: type: Pipeline - name: generation_config type: GenerationConfig - outputs: - - type: List[List[str]] - lineno: 674 - has_varargs: false has_kwargs: false most_common: name: most_common + has_varargs: false doc: Calculate the most common answer for a given list of answers. + lineno: 637 parameters: - name: answers - outputs: [] - lineno: 637 - has_varargs: false has_kwargs: false average: name: average + has_varargs: false doc: Calculate the average answer for a given list of answers. + lineno: 646 parameters: - name: answers - outputs: [] - lineno: 646 - has_varargs: false has_kwargs: false do: name: do + has_varargs: false doc: Perform the strategy. 
+ lineno: 662 parameters: - name: self - name: answers - outputs: [] - lineno: 662 - has_varargs: false has_kwargs: false + image: '' description: GenAI approach of question answering on a given data - default_handler: answer_questions disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/master/question_answering/latest/src/item.yaml b/functions/master/question_answering/latest/src/item.yaml index 56fc5a5e..741bab80 100755 --- a/functions/master/question_answering/latest/src/item.yaml +++ b/functions/master/question_answering/latest/src/item.yaml @@ -1,8 +1,6 @@ apiVersion: v1 categories: - genai -- huggingface -- machine-learning description: GenAI approach of question answering on a given data doc: '' example: question_answering.ipynb @@ -13,7 +11,7 @@ labels: author: yonish maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: question_answering platformVersion: 3.5.0 spec: @@ -26,4 +24,4 @@ spec: - torch - tqdm url: '' -version: 0.4.0 +version: 0.5.0 diff --git a/functions/master/question_answering/latest/static/documentation.html b/functions/master/question_answering/latest/static/documentation.html index 868c1682..602f654b 100644 --- a/functions/master/question_answering/latest/static/documentation.html +++ b/functions/master/question_answering/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/question_answering/latest/static/example.html b/functions/master/question_answering/latest/static/example.html index 5f8422d9..e54ca5e5 100644 --- a/functions/master/question_answering/latest/static/example.html +++ b/functions/master/question_answering/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/question_answering/latest/static/function.html b/functions/master/question_answering/latest/static/function.html index 
53c64f1c..658c382a 100644 --- a/functions/master/question_answering/latest/static/function.html +++ b/functions/master/question_answering/latest/static/function.html @@ -28,65 +28,58 @@
             
    -kind: job
     metadata:
       name: question-answering
       tag: ''
    -  hash: aed62db95f17576c69b457767e3595c2de1d5465
    -  project: ''
    -  labels:
    -    author: yonish
       categories:
       - genai
    -  - huggingface
    -  - machine-learning
    +verbose: false
    +kind: job
     spec:
       command: ''
    -  args: []
    -  image: ''
    +  default_handler: answer_questions
       build:
    -    functionSourceCode: 
    -    base_image: mlrun/mlrun
    -    commands: []
    -    code_origin: ''
         origin_filename: ''
    +    base_image: mlrun/mlrun
         requirements:
         - transformers
         - torch
         - tqdm
    +    code_origin: ''
    +    functionSourceCode: 
       entry_points:
         open_mpi_handler:
           name: open_mpi_handler
    +      has_varargs: false
           doc: ''
    +      lineno: 58
           parameters:
           - name: worker_inputs
             type: List[str]
           - name: root_worker_inputs
             type: Dict[str, Any]
             default: null
    -      outputs: []
    -      lineno: 58
    -      has_varargs: false
           has_kwargs: false
         decorator:
           name: decorator
    +      has_varargs: false
           doc: ''
    +      lineno: 66
           parameters:
           - name: handler
    -      outputs: []
    -      lineno: 66
    -      has_varargs: false
           has_kwargs: false
         wrapper:
           name: wrapper
    +      has_varargs: false
           doc: ''
    -      parameters: []
    -      outputs: []
           lineno: 71
    -      has_varargs: false
           has_kwargs: true
         answer_questions:
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[pd.DataFrame, dict]
           name: answer_questions
    +      has_varargs: false
           doc: 'Answer questions with a context to the given text files contents by a
             pretrained LLM model. Each text file will have
     
    @@ -111,6 +104,7 @@
             n. 
     
             end of `questions_wrapper`'
    +      lineno: 130
           parameters:
           - name: data_path
             type: Union[str, List[str]]
    @@ -183,16 +177,15 @@
             type: bool
             doc: 'Whether to present logs of a progress bar and errors. Default: True.'
             default: false
    -      outputs:
    -      - doc: 'A tuple of:'
    -        type: Tuple[pd.DataFrame, dict]
    -      lineno: 130
    -      has_varargs: false
           has_kwargs: false
         answer:
    +      outputs:
    +      - type: List[List[str]]
           name: answer
    +      has_varargs: false
           doc: Answer questions with a context to the given text files contents by a pretrained
             LLM model in given pipeline.
    +      lineno: 674
           parameters:
           - name: self
           - name: questions_amount
    @@ -203,50 +196,35 @@
             type: Pipeline
           - name: generation_config
             type: GenerationConfig
    -      outputs:
    -      - type: List[List[str]]
    -      lineno: 674
    -      has_varargs: false
           has_kwargs: false
         most_common:
           name: most_common
    +      has_varargs: false
           doc: Calculate the most common answer for a given list of answers.
    +      lineno: 637
           parameters:
           - name: answers
    -      outputs: []
    -      lineno: 637
    -      has_varargs: false
           has_kwargs: false
         average:
           name: average
    +      has_varargs: false
           doc: Calculate the average answer for a given list of answers.
    +      lineno: 646
           parameters:
           - name: answers
    -      outputs: []
    -      lineno: 646
    -      has_varargs: false
           has_kwargs: false
         do:
           name: do
    +      has_varargs: false
           doc: Perform the strategy.
    +      lineno: 662
           parameters:
           - name: self
           - name: answers
    -      outputs: []
    -      lineno: 662
    -      has_varargs: false
           has_kwargs: false
    +  image: ''
       description: GenAI approach of question answering on a given data
    -  default_handler: answer_questions
       disable_auto_mount: false
    -  clone_target_dir: ''
    -  env: []
    -  priority_class_name: ''
    -  preemption_mode: prevent
    -  affinity: null
    -  tolerations: null
    -  security_context: {}
    -verbose: false
     
             
         
    diff --git a/functions/master/question_answering/latest/static/item.html b/functions/master/question_answering/latest/static/item.html index 0baa2db8..64e45da7 100644 --- a/functions/master/question_answering/latest/static/item.html +++ b/functions/master/question_answering/latest/static/item.html @@ -31,8 +31,6 @@ apiVersion: v1 categories: - genai -- huggingface -- machine-learning description: GenAI approach of question answering on a given data doc: '' example: question_answering.ipynb @@ -43,7 +41,7 @@ author: yonish maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: question_answering platformVersion: 3.5.0 spec: @@ -56,7 +54,7 @@ - torch - tqdm url: '' -version: 0.4.0 +version: 0.5.0
    diff --git a/functions/master/question_answering/latest/static/question_answering.html b/functions/master/question_answering/latest/static/question_answering.html index d077dfed..79c5db68 100644 --- a/functions/master/question_answering/latest/static/question_answering.html +++ b/functions/master/question_answering/latest/static/question_answering.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/send_email/1.2.0/static/documentation.html b/functions/master/send_email/1.2.0/static/documentation.html index a5315478..ab35e980 100644 --- a/functions/master/send_email/1.2.0/static/documentation.html +++ b/functions/master/send_email/1.2.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/send_email/1.2.0/static/example.html b/functions/master/send_email/1.2.0/static/example.html index 752eb14c..4e6a237e 100644 --- a/functions/master/send_email/1.2.0/static/example.html +++ b/functions/master/send_email/1.2.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/send_email/1.2.0/static/send_email.html b/functions/master/send_email/1.2.0/static/send_email.html index 6333b120..e806359d 100644 --- a/functions/master/send_email/1.2.0/static/send_email.html +++ b/functions/master/send_email/1.2.0/static/send_email.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/send_email/latest/static/documentation.html b/functions/master/send_email/latest/static/documentation.html index a5315478..ab35e980 100644 --- a/functions/master/send_email/latest/static/documentation.html +++ b/functions/master/send_email/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/send_email/latest/static/example.html b/functions/master/send_email/latest/static/example.html index 752eb14c..4e6a237e 100644 --- a/functions/master/send_email/latest/static/example.html +++ b/functions/master/send_email/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git 
a/functions/master/send_email/latest/static/send_email.html b/functions/master/send_email/latest/static/send_email.html index 6333b120..e806359d 100644 --- a/functions/master/send_email/latest/static/send_email.html +++ b/functions/master/send_email/latest/static/send_email.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/silero_vad/1.4.0/src/assets/test_data.wav b/functions/master/silero_vad/1.4.0/src/assets/test_data.wav new file mode 100644 index 00000000..a3a993c2 Binary files /dev/null and b/functions/master/silero_vad/1.4.0/src/assets/test_data.wav differ diff --git a/functions/master/silero_vad/1.4.0/src/function.yaml b/functions/master/silero_vad/1.4.0/src/function.yaml new file mode 100644 index 00000000..fd637f1c --- /dev/null +++ b/functions/master/silero_vad/1.4.0/src/function.yaml @@ -0,0 +1,273 @@ +metadata: + tag: '' + categories: + - deep-learning + - audio + name: silero-vad +verbose: false +spec: + description: Silero VAD (Voice Activity Detection) functions. + build: + code_origin: '' + base_image: mlrun/mlrun + requirements: + - torch + - torchaudio + - tqdm + - onnxruntime + functionSourceCode:  + origin_filename: '' + image: '' + command: '' + entry_points: + audio_file: + doc: Get the audio file of the task. + lineno: 43 + has_varargs: false + outputs: + - doc: The audio file of the task. + type: Path + parameters: + - name: self + has_kwargs: false + name: audio_file + do_task: + doc: Do the task on the given speech timestamps. The task will diarize the VAD + speech timestamps into speakers. + lineno: 94 + has_varargs: false + parameters: + - name: self + - name: speech_timestamps + type: List[List[Dict[str, int]]] + doc: The speech timestamps per channel to do the task on as outputted from + the VAD. + has_kwargs: false + name: do_task + get_result: + doc: Get the result of the task. A tuple of the audio file name and the result. + lineno: 61 + has_varargs: false + outputs: + - doc: The result of the task. 
+ type: Tuple[str, list] + parameters: + - name: self + has_kwargs: false + name: get_result + to_tuple: + doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing + to pass in queue). + lineno: 116 + has_varargs: false + outputs: + - doc: The converted task. + type: Tuple[str, dict] + parameters: + - name: self + has_kwargs: false + name: to_tuple + create_task: + doc: Create a task with the given audio file. + lineno: 146 + has_varargs: false + outputs: + - doc: The created task. + type: BaseTask + parameters: + - name: self + - name: audio_file + type: Path + doc: The audio file to assign to the task. + has_kwargs: false + name: create_task + from_tuple: + doc: Create a task from a tuple of the audio file name and the task kwargs. + lineno: 157 + has_varargs: false + outputs: + - doc: The created task. + type: BaseTask + parameters: + - name: cls + - name: task_tuple + type: Tuple[str, dict] + doc: The task tuple to create the task from. + has_kwargs: false + name: from_tuple + load: + doc: Load the VAD model. + lineno: 234 + has_varargs: false + parameters: + - name: self + - name: force_reload + type: bool + doc: Whether to force reload the model even if it was already loaded. Default + is True. + default: true + has_kwargs: false + name: load + detect_voice: + doc: "Perform voice activity detection on given audio files using the silero\ + \ VAD model -\nhttps://github.com/snakers4/silero-vad. 
The end result is a\ + \ dictionary with the file names as keys and their\nVAD timestamps dictionaries\ + \ as value.\n\nFor example::\n\n {\n \"file_1.wav\": [\n \ + \ {\"start\": 0, \"end\": 16000},\n {\"start\": 16000, \"end\"\ + : 32000},\n {\"start\": 32000, \"end\": 48000},\n ...\n\ + \ ],\n \"file_2.wav\": [\n {\"start\": 0, \"end\"\ + : 16000},\n {\"start\": 16000, \"end\": 32000},\n {\"\ + start\": 32000, \"end\": 48000},\n ...\n ],\n ...\n\ + \ }" + lineno: 393 + has_varargs: false + parameters: + - name: data_path + type: Union[str, Path, List[Union[str, Path]]] + doc: The path to the audio files to diarize. Can be a path to a single file, + a path to a directory or a list of paths to files. + - name: use_onnx + type: bool + doc: Whether to use ONNX for inference. Default is True. + default: true + - name: force_onnx_cpu + type: bool + doc: Whether to force ONNX to use CPU for inference. Default is True. + default: true + - name: threshold + type: float + doc: Speech threshold. Silero VAD outputs speech probabilities for each audio + chunk, probabilities ABOVE this value are considered as SPEECH. It is better + to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty + good for most datasets. + default: 0.5 + - name: sampling_rate + type: int + doc: Currently, silero VAD models support 8000 and 16000 sample rates. + default: 16000 + - name: min_speech_duration_ms + type: int + doc: Final speech chunks shorter min_speech_duration_ms are thrown out. + default: 250 + - name: max_speech_duration_s + type: float + doc: Maximum duration of speech chunks in seconds. Chunks longer than `max_speech_duration_s` + will be split at the timestamp of the last silence that lasts more than + 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split + aggressively just before max_speech_duration_s. 
+ default: float('inf') + - name: min_silence_duration_ms + type: int + doc: In the end of each speech chunk wait for min_silence_duration_ms before + separating it. + default: 100 + - name: window_size_samples + type: int + doc: Audio chunks of window_size_samples size are fed to the silero VAD model. + default: 512 + - name: speech_pad_ms + type: int + doc: Final speech chunks are padded by speech_pad_ms each side. + default: 30 + - name: return_seconds + type: bool + doc: Whether return timestamps in seconds. False means to return timestamps + in samples (default - False). + default: false + - name: per_channel + type: bool + doc: Whether to return timestamps per channel (default - False). This will + run VAD on each channel separately and return a list of timestamps per channel. + default: false + - name: use_multiprocessing + type: int + doc: The number of workers to use for multiprocessing. If 0, no multiprocessing + will be used. Default is 0. + default: 0 + - name: verbose + type: bool + doc: Verbosity. + default: false + has_kwargs: false + name: detect_voice + diarize: + doc: "Perform speech diarization on given audio files using the silero VAD model\ + \ - https://github.com/snakers4/silero-vad.\nThe speech diarization is performed\ + \ per channel so that each channel in the audio belong to a different speaker.\ + \ The\nend result is a dictionary with the file names as keys and their diarization\ + \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\ + \nFor example::\n\n {\n \"file_1.wav\": [\n (0.0, 1.0,\ + \ \"speaker_0\"),\n (1.0, 2.0, \"speaker_1\"),\n (2.0,\ + \ 3.0, \"speaker_0\"),\n ...\n ],\n \"file_2.wav\"\ + : [\n (0.0, 1.0, \"speaker_0\"),\n (1.0, 2.0, \"speaker_1\"\ + ),\n (2.0, 3.0, \"speaker_0\"),\n ...\n ],\n\ + \ ...\n }" + lineno: 517 + has_varargs: false + parameters: + - name: data_path + type: Union[str, Path, List[Union[str, Path]]] + doc: The path to the audio files to diarize. 
Can be a path to a single file, + a path to a directory or a list of paths to files. + - name: use_onnx + type: bool + doc: Whether to use ONNX for inference. Default is True. + default: true + - name: force_onnx_cpu + type: bool + doc: Whether to force ONNX to use CPU for inference. Default is True. + default: true + - name: threshold + type: float + doc: Speech threshold. Silero VAD outputs speech probabilities for each audio + chunk, probabilities ABOVE this value are considered as SPEECH. It is better + to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty + good for most datasets. + default: 0.5 + - name: sampling_rate + type: int + doc: Currently, silero VAD models support 8000 and 16000 sample rates. + default: 16000 + - name: min_speech_duration_ms + type: int + doc: Final speech chunks shorter min_speech_duration_ms are thrown out. + default: 250 + - name: max_speech_duration_s + type: float + doc: Maximum duration of speech chunks in seconds. Chunks longer than `max_speech_duration_s` + will be split at the timestamp of the last silence that lasts more than + 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split + aggressively just before max_speech_duration_s. + default: float('inf') + - name: min_silence_duration_ms + type: int + doc: In the end of each speech chunk wait for min_silence_duration_ms before + separating it. + default: 100 + - name: window_size_samples + type: int + doc: Audio chunks of window_size_samples size are fed to the silero VAD model. + default: 512 + - name: speech_pad_ms + type: int + doc: Final speech chunks are padded by speech_pad_ms each side. + default: 30 + - name: speaker_labels + type: List[str] + doc: The speaker labels to use for the diarization. If not given, the speakers + will be named "speaker_0", "speaker_1", etc. + default: null + - name: use_multiprocessing + type: int + doc: The number of workers to use for multiprocessing. If 0, no multiprocessing + will be used. 
Default is 0. + default: 0 + - name: verbose + type: bool + doc: Verbosity. + default: false + has_kwargs: false + name: diarize + disable_auto_mount: false + default_handler: detect_voice +kind: job diff --git a/functions/master/silero_vad/1.4.0/src/item.yaml b/functions/master/silero_vad/1.4.0/src/item.yaml new file mode 100644 index 00000000..49adfcd9 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/src/item.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +categories: +- deep-learning +- audio +description: Silero VAD (Voice Activity Detection) functions. +doc: '' +example: silero_vad.ipynb +generationDate: 2023-12-03:14-30 +hidden: false +icon: '' +labels: + author: guyl +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: silero_vad +platformVersion: 3.5.3 +spec: + filename: silero_vad.py + handler: detect_voice + image: mlrun/mlrun + kind: job + requirements: + - torch + - torchaudio + - tqdm + - onnxruntime +url: '' +version: 1.4.0 diff --git a/functions/master/silero_vad/1.4.0/src/silero_vad.ipynb b/functions/master/silero_vad/1.4.0/src/silero_vad.ipynb new file mode 100644 index 00000000..29cd7437 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/src/silero_vad.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/silero_vad/1.4.0/src/silero_vad.py b/functions/master/silero_vad/1.4.0/src/silero_vad.py new file mode 100644 index 00000000..a477d4ec --- /dev/null +++ 
b/functions/master/silero_vad/1.4.0/src/silero_vad.py @@ -0,0 +1,847 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from multiprocessing import Process, Queue +from pathlib import Path +from types import FunctionType +from typing import Dict, List, Tuple, Type, Union + +import torch +import torchaudio +from tqdm import tqdm + + +class BaseTask: + """ + A base class for a task to complete after VAD. + """ + + def __init__(self, audio_file: Path): + """ + Initialize the base task. + + :param audio_file: The audio file assigned to the task. + """ + # Store the audio file: + self._audio_file = audio_file + + # Prepare the result: + self._result = None + + @property + def audio_file(self) -> Path: + """ + Get the audio file of the task. + + :returns: The audio file of the task. + """ + return self._audio_file + + def do_task( + self, speech_timestamps: Union[List[Dict[str, int]], List[List[Dict[str, int]]]] + ): + """ + Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result. + + :param speech_timestamps: The speech timestamps to do the task on as outputted from the VAD. + """ + self._result = speech_timestamps + + def get_result(self) -> Tuple[str, list]: + """ + Get the result of the task. A tuple of the audio file name and the result. + + :returns: The result of the task. 
+ """ + return self._audio_file.name, self._result + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + return self.__class__.__name__, {"audio_file": self._audio_file} + + +class SpeechDiarizationTask(BaseTask): + """ + A speech diarization task. The task will diarize the VAD speech timestamps into speakers. + """ + + def __init__(self, audio_file: Path, speaker_labels: List[str]): + """ + Initialize the speech diarization task. + + :param audio_file: The audio file assigned to the task. + :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be named + "speaker_0", "speaker_1", etc. + """ + super().__init__(audio_file=audio_file) + self._speaker_labels = speaker_labels + + def do_task(self, speech_timestamps: List[List[Dict[str, int]]]): + """ + Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers. + + :param speech_timestamps: The speech timestamps per channel to do the task on as outputted from the VAD. + """ + # Get the speaker labels (set default if not given): + speaker_labels = self._speaker_labels or [ + f"speaker_{i}" for i in range(len(speech_timestamps)) + ] + + # Diarize - organize the speech timestamps into a single list of speakers and sort it by start time: + speech_diarization = [ + (speech_timestamp["start"], speech_timestamp["end"], speaker_label) + for speaker_label, channel_speech_timestamps in zip( + speaker_labels, speech_timestamps + ) + for speech_timestamp in channel_speech_timestamps + ] + speech_diarization.sort() + self._result = speech_diarization + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. 
class TaskCreator:
    """
    A factory of the tasks that run after the VAD. It is configured once with a task type and its extra keyword
    arguments and then creates one task per audio file.
    """

    #: Lookup from a task class name to the task class itself, used in `from_tuple`:
    _MAP = {
        task_class.__name__: task_class
        for task_class in (BaseTask, SpeechDiarizationTask)
    }

    def __init__(self, task_type: Type[BaseTask], task_kwargs: dict = None):
        """
        Initialize the task creator.

        :param task_type:   The task type - a `BaseTask` subclass.
        :param task_kwargs: Additional keyword arguments to pass to each created task. `None` means no additional
                            keyword arguments.
        """
        self._task_type = task_type
        self._task_kwargs = {} if not task_kwargs else task_kwargs

    def create_task(self, audio_file: Path) -> BaseTask:
        """
        Create a task of the configured type for the given audio file.

        :param audio_file: The audio file to assign to the task.

        :returns: The created task.
        """
        task = self._task_type(audio_file=audio_file, **self._task_kwargs)
        return task

    @classmethod
    def from_tuple(cls, task_tuple: Tuple[str, dict]) -> BaseTask:
        """
        Rebuild a task from the tuple produced by a task's `to_tuple` method: the task class name and the keyword
        arguments to initialize it with.

        :param task_tuple: The task tuple to create the task from.

        :returns: The created task.
        """
        class_name, init_kwargs = task_tuple
        task_type = cls._MAP[class_name]
        return task_type(**init_kwargs)
class VoiceActivityDetector:
    """
    A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad.
    """

    def __init__(
        self,
        # Model loading kwargs:
        use_onnx: bool = True,
        force_onnx_cpu: bool = True,
        # Detection kwargs:
        threshold: float = 0.5,
        sampling_rate: int = 16_000,
        min_speech_duration_ms: int = 250,
        max_speech_duration_s: float = float("inf"),
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
        return_seconds: bool = False,
        per_channel: bool = False,
    ):
        """
        Initialize the voice activity detector. The model itself is not loaded here, call `load` for that.

        :param use_onnx:                Whether to use ONNX for inference. Default is True.
        :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
        :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                        probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                        this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                        most datasets.
        :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates. Audio read
                                        at a different rate is resampled to this rate.
        :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
        :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                        `max_speech_duration_s` will be split at the timestamp of the last silence that
                                        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise,
                                        they will be split aggressively just before max_speech_duration_s.
        :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before
                                        separating it.
        :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
                                        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                        sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                        these may affect model performance!
        :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
        :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
                                        samples (default - False).
        :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD
                                        on each channel separately and return a list of timestamps per channel.
        """
        # Store configurations:
        self._use_onnx = use_onnx
        self._force_onnx_cpu = force_onnx_cpu
        self._threshold = threshold
        self._sampling_rate = sampling_rate
        self._min_speech_duration_ms = min_speech_duration_ms
        self._max_speech_duration_s = max_speech_duration_s
        self._min_silence_duration_ms = min_silence_duration_ms
        self._window_size_samples = window_size_samples
        self._speech_pad_ms = speech_pad_ms
        self._return_seconds = return_seconds
        self._per_channel = per_channel

        # Prepare the model variables (set by `load`):
        self._model: torch.nn.Module = None
        self._get_speech_timestamps: FunctionType = None

    def load(self, force_reload: bool = True):
        """
        Load the VAD model through `torch.hub`.

        :param force_reload: Passed as is to `torch.hub.load` - whether to force a reload even if the model was
                             already loaded. Default is True.
        """
        model, utils = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            force_reload=force_reload,
            onnx=self._use_onnx,
            force_onnx_cpu=self._force_onnx_cpu,
        )
        self._model = model
        # The utilities are (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks). Only the
        # first one is needed, so index it instead of unpacking a fixed-length tuple:
        self._get_speech_timestamps = utils[0]

    def detect_voice(
        self,
        audio_file: Path,
    ) -> Union[List[Dict[str, int]], List[List[Dict[str, int]]]]:
        """
        Infer the audio through the VAD model and return the speech timestamps.

        :param audio_file: The audio file to infer.

        :returns: The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the
                  following keys:

                  * "start": The start of the speech in the audio.
                  * "end":   The end of the speech in the audio.

                  The values are sample indices, or seconds if `return_seconds` is True. If `per_channel` is True, a
                  list of timestamps per channel will be returned.

        :raises RuntimeError: If the model was not loaded (`load` was not called).
        """
        # Fail with a clear message instead of a "'NoneType' object is not callable" error:
        if self._get_speech_timestamps is None:
            raise RuntimeError(
                "The VAD model is not loaded. Call `load` before calling `detect_voice`."
            )

        # Read the audio as a tensor:
        audio = self._read_audio(audio_file)

        # Detect speech on the (mono) audio:
        if not self._per_channel:
            return self._detect(audio)

        # Per channel:
        return [self._detect(channel) for channel in audio]

    def _detect(self, audio: torch.Tensor) -> List[Dict[str, int]]:
        """
        Run the loaded `get_speech_timestamps` utility on a single channel using the stored configuration.

        :param audio: The single channel audio to infer.

        :returns: The speech timestamps of the given audio.
        """
        return self._get_speech_timestamps(
            audio,
            self._model,
            threshold=self._threshold,
            min_speech_duration_ms=self._min_speech_duration_ms,
            max_speech_duration_s=self._max_speech_duration_s,
            min_silence_duration_ms=self._min_silence_duration_ms,
            speech_pad_ms=self._speech_pad_ms,
            sampling_rate=self._sampling_rate,
            window_size_samples=self._window_size_samples,
            return_seconds=self._return_seconds,
        )

    def _read_audio(
        self,
        path: Path,
    ) -> torch.Tensor:
        """
        Read the audio from the given path and return it as a tensor.

        :param path: The path to the audio file.

        :returns: The audio as a tensor. All channels are kept if `per_channel` is True, otherwise the channels are
                  averaged and the channel dimension is squeezed out.
        """
        # Read the audio:
        audio, sampling_rate = torchaudio.load(str(path))

        # Check if the audio is stereo and if so, convert it to mono (only if not per channel):
        if audio.size(0) > 1 and not self._per_channel:
            audio = audio.mean(dim=0, keepdim=True)

        # Resample the audio if needed:
        if sampling_rate != self._sampling_rate:
            transform = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, new_freq=self._sampling_rate
            )
            audio = transform(audio)

        # Return the audio (squeeze if not per channel):
        return audio if self._per_channel else audio.squeeze(0)


#: The value to send into multiprocessing queues to stop the process:
_MULTIPROCESSING_STOP_MARK = "STOP"
def _multiprocessing_complete_tasks(
    vad_init_kwargs: dict, tasks_queue: Queue, results_queue: Queue
):
    """
    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
    the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process.

    Each result is a tuple of `(is_error, (audio file name, result or error message))`. A stop mark is put in the
    results queue once the worker is done, as the collecting side counts these marks to know when to stop.

    :param vad_init_kwargs: The VAD initialization kwargs.
    :param tasks_queue:     A queue to get the tasks from.
    :param results_queue:   A queue to put the results in.
    """
    # Initialize and load the VAD. A failure here is kept and reported per task instead of crashing the process:
    # a dead worker never sends its stop mark and the collecting side would wait on the results queue forever.
    vad = None
    load_error = None
    try:
        vad = VoiceActivityDetector(**vad_init_kwargs)
        vad.load(force_reload=False)
    except Exception as exception:
        load_error = exception

    # Start listening to the tasks queue:
    while True:
        # Get the task:
        task_tuple: Tuple[str, dict] = tasks_queue.get()
        if task_tuple == _MULTIPROCESSING_STOP_MARK:
            break

        # Resolve the file name up front so an error can be reported even if the task itself cannot be rebuilt
        # (the original handler read it from the task object, which does not exist if `from_tuple` failed):
        try:
            audio_file_name = Path(task_tuple[1]["audio_file"]).name
        except Exception:
            audio_file_name = str(task_tuple)

        try:
            if load_error is not None:
                raise load_error
            # Create the task:
            task = TaskCreator.from_tuple(task_tuple=task_tuple)
            # Run the file through the VAD:
            speech_timestamps = vad.detect_voice(audio_file=task.audio_file)
            # Complete the task:
            task.do_task(speech_timestamps=speech_timestamps)
            # Build the result:
            result = (False, task.get_result())
        except Exception as exception:
            # Build the error:
            result = (True, (audio_file_name, str(exception)))
        # Collect the result / error:
        results_queue.put(result)

    # Mark the end of the tasks:
    results_queue.put(_MULTIPROCESSING_STOP_MARK)


# Get the global logger (the MLRun context logger if MLRun is installed, the root logger otherwise):
try:
    import mlrun

    _LOGGER = mlrun.get_or_create_ctx("silero_vad").logger
except ModuleNotFoundError:
    _LOGGER = logging.getLogger()
def detect_voice(
    # Input kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    # Model loading kwargs:
    use_onnx: bool = True,
    force_onnx_cpu: bool = True,
    # Detection kwargs:
    threshold: float = 0.5,
    sampling_rate: int = 16_000,
    min_speech_duration_ms: int = 250,
    max_speech_duration_s: float = float("inf"),
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 512,
    speech_pad_ms: int = 30,
    return_seconds: bool = False,
    per_channel: bool = False,
    # Other kwargs:
    use_multiprocessing: int = 0,
    verbose: bool = False,
):
    """
    Perform voice activity detection on given audio files using the silero VAD model -
    https://github.com/snakers4/silero-vad. The outcome is a tuple of two dictionaries: the first maps each
    successfully processed file name to its VAD timestamps and the second maps each failed file name to its error
    message.

    A successes dictionary for example::

        {
            "file_1.wav": [
                {"start": 0, "end": 16000},
                {"start": 16000, "end": 32000},
                {"start": 32000, "end": 48000},
                ...
            ],
            "file_2.wav": [
                {"start": 0, "end": 16000},
                {"start": 16000, "end": 32000},
                {"start": 32000, "end": 48000},
                ...
            ],
            ...
        }


    :param data_path:               The path to the audio files to run the VAD on. Can be a path to a single file, a
                                    path to a directory or a list of paths to files.
    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                    most datasets.
    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
                                    be split aggressively just before max_speech_duration_s.
    :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating
                                    it.
    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.

                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                    these may affect model performance!
    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    :param return_seconds:          Whether to return timestamps in seconds. False means to return timestamps in
                                    samples (default - False).
    :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD on
                                    each channel separately and return a list of timestamps per channel.
    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
                                    be used. Default is 0.
    :param verbose:                 Verbosity.

    :returns: A tuple of the successes dictionary and the errors dictionary.
    """

    def _log(message: str):
        # Log only when asked to be verbose:
        if verbose:
            _LOGGER.info(message)

    # Collect the input audio files:
    _log("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    _log(f"Collected {len(audio_files)} audio files.")

    # Gather the VAD initialization keyword arguments:
    vad_init_kwargs = dict(
        use_onnx=use_onnx,
        force_onnx_cpu=force_onnx_cpu,
        threshold=threshold,
        sampling_rate=sampling_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        max_speech_duration_s=max_speech_duration_s,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
        return_seconds=return_seconds,
        per_channel=per_channel,
    )

    # The arguments shared by the sequential and the parallel runs (plain VAD, so the base task is used):
    run_kwargs = dict(
        audio_files=audio_files,
        description="Detecting voice",
        vad_init_kwargs=vad_init_kwargs,
        task_creator=TaskCreator(task_type=BaseTask),
        verbose=verbose,
    )

    # Run the detection:
    if use_multiprocessing:
        results = _parallel_run(n_workers=use_multiprocessing, **run_kwargs)
    else:
        results = _run(**run_kwargs)

    # Process the results:
    return _process_results(results=results, verbose=verbose)
def diarize(
    # Input / Output kwargs:
    data_path: Union[str, Path, List[Union[str, Path]]],
    # Model loading kwargs:
    use_onnx: bool = True,
    force_onnx_cpu: bool = True,
    # Detection kwargs:
    threshold: float = 0.5,
    sampling_rate: int = 16_000,
    min_speech_duration_ms: int = 250,
    max_speech_duration_s: float = float("inf"),
    min_silence_duration_ms: int = 100,
    window_size_samples: int = 512,
    speech_pad_ms: int = 30,
    # Diarization kwargs:
    speaker_labels: List[str] = None,
    # Other kwargs:
    use_multiprocessing: int = 0,
    verbose: bool = False,
):
    """
    Perform speech diarization on given audio files using the silero VAD model - https://github.com/snakers4/silero-vad.
    The speech diarization is performed per channel so that each channel in the audio belongs to a different speaker.
    The VAD always runs per channel and with timestamps in seconds here. The outcome is a tuple of two dictionaries:
    the first maps each successfully processed file name to its diarization and the second maps each failed file name
    to its error message. A diarization is a list of tuples: (start, end, speaker_label).

    A successes dictionary for example::

        {
            "file_1.wav": [
                (0.0, 1.0, "speaker_0"),
                (1.0, 2.0, "speaker_1"),
                (2.0, 3.0, "speaker_0"),
                ...
            ],
            "file_2.wav": [
                (0.0, 1.0, "speaker_0"),
                (1.0, 2.0, "speaker_1"),
                (2.0, 3.0, "speaker_0"),
                ...
            ],
            ...
        }


    :param data_path:               The path to the audio files to diarize. Can be a path to a single file, a path to a
                                    directory or a list of paths to files.
    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
                                    most datasets.
    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
                                    be split aggressively just before max_speech_duration_s.
    :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating
                                    it.
    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.

                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
                                    these may affect model performance!
    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    :param speaker_labels:          The speaker labels to use for the diarization. If not given, the speakers will be
                                    named "speaker_0", "speaker_1", etc.
    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
                                    be used. Default is 0.
    :param verbose:                 Verbosity.

    :returns: A tuple of the successes dictionary and the errors dictionary.
    """

    def _log(message: str):
        # Log only when asked to be verbose:
        if verbose:
            _LOGGER.info(message)

    # Collect the input audio files:
    _log("Collecting audio files.")
    audio_files = _get_audio_files(data_path=data_path)
    _log(f"Collected {len(audio_files)} audio files.")

    # Gather the VAD initialization keyword arguments (diarization needs seconds and a per channel detection):
    vad_init_kwargs = dict(
        use_onnx=use_onnx,
        force_onnx_cpu=force_onnx_cpu,
        threshold=threshold,
        sampling_rate=sampling_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        max_speech_duration_s=max_speech_duration_s,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
        return_seconds=True,
        per_channel=True,
    )

    # The arguments shared by the sequential and the parallel runs:
    run_kwargs = dict(
        audio_files=audio_files,
        description="Diarizing",
        vad_init_kwargs=vad_init_kwargs,
        task_creator=TaskCreator(
            task_type=SpeechDiarizationTask,
            task_kwargs={"speaker_labels": speaker_labels},
        ),
        verbose=verbose,
    )

    # Run the diarization:
    if use_multiprocessing:
        results = _parallel_run(n_workers=use_multiprocessing, **run_kwargs)
    else:
        results = _run(**run_kwargs)

    # Process the results:
    return _process_results(results=results, verbose=verbose)
audio_files=audio_files, + description="Diarizing", + vad_init_kwargs=vad_init_kwargs, + task_creator=task_creator, + verbose=verbose, + ) + + # Process the results: + return _process_results(results=results, verbose=verbose) + + +def _get_audio_files( + data_path: Union[Path, str, list], +) -> List[Path]: + """ + Get the audio files from the data path. If a path to a directory is given, all files in the directory will be + collected. + + :param data_path: The data path to collect the audio files from. + + :returns: The audio files list. + """ + # Check if given a list of paths: + if isinstance(data_path, list): + audio_files = [] + for path in data_path: + audio_files.extend(_get_audio_files(data_path=path)) + return audio_files + + # Check if given a single string path to cast it to a `pathlib.Path`: + if isinstance(data_path, str): + data_path = Path(data_path).absolute() + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a " + f"file. Given: {str(data_path)} " + ) + + return audio_files + + +def _run( + audio_files: List[Path], + description: str, + vad_init_kwargs: dict, + task_creator: TaskCreator, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, list]]]: + """ + Load a VAD and use it to complete the tasks that will be created on the provided files using the given task creator. + + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. + :param vad_init_kwargs: The VAD initialization keyword arguments. + :param task_creator: The task creator to use to create the tasks. + :param verbose: Verbosity. + + :returns: The collected results. 
def _run(
    audio_files: List[Path],
    description: str,
    vad_init_kwargs: dict,
    task_creator: TaskCreator,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, list]]]:
    """
    Load a single VAD in the current process and complete, file by file, the tasks the given task creator creates for
    the provided files. A failure of one file is collected as an error and does not stop the run.

    :param audio_files:     The audio files to use.
    :param description:     The description to use for the progress bar.
    :param vad_init_kwargs: The VAD initialization keyword arguments.
    :param task_creator:    The task creator to use to create the tasks.
    :param verbose:         Verbosity.

    :returns: The collected results - per file, a tuple of an error flag and a tuple of the file name and its result
              (or its error message if the flag is on).
    """
    # Load the VAD:
    vad = VoiceActivityDetector(**vad_init_kwargs)
    if verbose:
        _LOGGER.info("Loading the VAD model.")
    vad.load()
    if verbose:
        _LOGGER.info("VAD model loaded.")

    def _complete(audio_file: Path) -> Tuple[bool, Tuple[str, list]]:
        # Create the task, run the file through the VAD and complete the task with the detected timestamps:
        try:
            task = task_creator.create_task(audio_file=audio_file)
            task.do_task(speech_timestamps=vad.detect_voice(audio_file=audio_file))
            return False, task.get_result()
        except Exception as exception:
            return True, (audio_file.name, str(exception))

    # Run the VAD on the audio files and collect the results:
    progress = tqdm(
        audio_files,
        desc=description,
        unit="file",
        total=len(audio_files),
        disable=not verbose,
    )
    return [_complete(audio_file) for audio_file in progress]
def _parallel_run(
    n_workers: int,
    audio_files: List[Path],
    description: str,
    vad_init_kwargs: dict,
    task_creator: TaskCreator,
    verbose: bool,
) -> List[Tuple[bool, Tuple[str, list]]]:
    """
    Run multiple VAD workers with multiprocessing to complete the tasks that will be created on the provided files using
    the given task creator.

    :param n_workers:       The number of workers to use. It is capped by the number of audio files and raised to 1
                            if a non-positive value is given.
    :param audio_files:     The audio files to use.
    :param description:     The description to use for the progress bar.
    :param vad_init_kwargs: The VAD initialization keyword arguments.
    :param task_creator:    The task creator to use to create the tasks.
    :param verbose:         Verbosity.

    :returns: The collected results - per file, a tuple of an error flag and a tuple of the file name and its result
              (or its error message if the flag is on). The order follows completion, not the input order.
    """
    # Nothing to do. Return before loading the model or starting workers: with no files the workers count drops to
    # zero, no stop mark is ever sent and the results collection below would block forever.
    if not audio_files:
        return []

    # Load the VAD (download once, and it will be loaded then per process later on):
    if verbose:
        _LOGGER.info("Loading the VAD model.")
    vad = VoiceActivityDetector(**vad_init_kwargs)
    vad.load()
    if verbose:
        _LOGGER.info("VAD model loaded.")

    # Check the number of workers:
    if n_workers > len(audio_files):
        _LOGGER.warning(
            f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). "
            f"Setting the number of workers to {len(audio_files)}."
        )
        n_workers = len(audio_files)
    elif n_workers < 1:
        # A non-positive count starts no process at all, which blocks the results collection the same way:
        _LOGGER.warning(
            f"The number of workers ({n_workers}) must be positive. Setting the number of workers to 1."
        )
        n_workers = 1

    # Initialize the multiprocessing queues:
    tasks_queue = Queue()
    results_queue = Queue()

    # Initialize the multiprocessing processes:
    task_completion_processes = [
        Process(
            target=_multiprocessing_complete_tasks,
            kwargs={
                "vad_init_kwargs": vad_init_kwargs,
                "tasks_queue": tasks_queue,
                "results_queue": results_queue,
            },
        )
        for _ in range(n_workers)
    ]

    # Start the multiprocessing processes:
    for p in task_completion_processes:
        p.start()

    # Put the tasks in the queue (as tuples, the workers rebuild the tasks on their side):
    for audio_file in audio_files:
        tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple())

    # Put the stop marks in the queue, one per worker:
    for _ in range(n_workers):
        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)

    # Collect the results until every worker reported its stop mark:
    results = []
    stop_marks_counter = 0
    with tqdm(
        desc=description,
        unit="file",
        total=len(audio_files),
        disable=not verbose,
    ) as progressbar:
        while stop_marks_counter < n_workers:
            # Get a result from the queue:
            result: Tuple[bool, Tuple[str, list]] = results_queue.get()
            if result == _MULTIPROCESSING_STOP_MARK:
                stop_marks_counter += 1
            else:
                # Collect the result:
                results.append(result)
                progressbar.update(1)

    # Wait for the processes to finish:
    for p in task_completion_processes:
        p.join()

    return results
Process the results of the tasks. + + :param results: The results to process. + :param verbose: Verbosity. + + :returns: The processed results as a tuple of successes and errors. + """ + if verbose: + _LOGGER.info("Summarizing the results.") + successes = {} + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes[result[0]] = result[1] + if verbose: + _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n") + + return successes, errors diff --git a/functions/master/silero_vad/1.4.0/src/test_silero_vad.py b/functions/master/silero_vad/1.4.0/src/test_silero_vad.py new file mode 100644 index 00000000..d46471a5 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/src/test_silero_vad.py @@ -0,0 +1,44 @@ +import os +import tempfile + +import mlrun +import pytest + + +@pytest.fixture() +def setup_test(): + with tempfile.TemporaryDirectory() as artifact_path: + project = mlrun.get_or_create_project(name="default", context=artifact_path) + func = project.set_function( + func=os.path.abspath("./function.yaml"), + name="silero-vad", + image="mlrun/mlrun", + ) + yield func, artifact_path + + +def test_detect_voice(setup_test): + silero_vad_function, artifact_path = setup_test + run = silero_vad_function.run( + handler="detect_voice", + inputs={"data_path": "./assets"}, + returns=["vad_outputs: file", "errors: file"], + artifact_path=artifact_path, + local=True, + ) + assert run.outputs["vad_outputs"] + + +def test_diarize(setup_test): + silero_vad_function, artifact_path = setup_test + run = silero_vad_function.run( + handler="diarize", + inputs={"data_path": "./assets"}, + params={ + "speakers_labels": ["Agent", "Client"], + }, + returns=["speech_diarization: file", "errors: file"], + artifact_path=artifact_path, + local=True, + ) + assert run.outputs["speech_diarization"] diff --git a/functions/master/silero_vad/1.4.0/static/documentation.html 
b/functions/master/silero_vad/1.4.0/static/documentation.html new file mode 100644 index 00000000..344c2421 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/static/documentation.html @@ -0,0 +1,541 @@ + + + + + + + +silero_vad package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    silero_vad package#

    +
    +

    Submodules#

    +
    +
    +

    silero_vad.silero_vad module#

    +
    +
    +class silero_vad.silero_vad.BaseTask(audio_file: Path)[source]#
    +

    Bases: object

    +

    A base class for a task to complete after VAD.

    +
    +
    +property audio_file: Path#
    +

    Get the audio file of the task.

    +
    +
    Returns:
    +

    The audio file of the task.

    +
    +
    +
    +
    +
    +do_task(speech_timestamps: List[Dict[str, int]] | List[List[Dict[str, int]]])[source]#
    +

    Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.

    +
    +
    Parameters:
    +

    speech_timestamps – The speech timestamps to do the task on as outputted from the VAD.

    +
    +
    +
    +
    +
    +get_result() Tuple[str, list][source]#
    +

    Get the result of the task. A tuple of the audio file name and the result.

    +
    +
    Returns:
    +

    The result of the task.

    +
    +
    +
    +
    +
    +to_tuple() Tuple[str, dict][source]#
    +

    Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

    +
    +
    Returns:
    +

    The converted task.

    +
    +
    +
    +
    +
    +
    +class silero_vad.silero_vad.SpeechDiarizationTask(audio_file: Path, speaker_labels: List[str])[source]#
    +

    Bases: BaseTask

    +

    A speech diarization task. The task will diarize the VAD speech timestamps into speakers.

    +
    +
    +do_task(speech_timestamps: List[List[Dict[str, int]]])[source]#
    +

    Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.

    +
    +
    Parameters:
    +

    speech_timestamps – The speech timestamps per channel to do the task on as outputted from the VAD.

    +
    +
    +
    +
    +
    +to_tuple() Tuple[str, dict][source]#
    +

    Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

    +
    +
    Returns:
    +

    The converted task.

    +
    +
    +
    +
    +
    +
    +class silero_vad.silero_vad.TaskCreator(task_type: Type[BaseTask], task_kwargs: dict | None = None)[source]#
    +

    Bases: object

    +

    A task creator to create different tasks to run after the VAD.

    +
    +
    +create_task(audio_file: Path) BaseTask[source]#
    +

    Create a task with the given audio file.

    +
    +
    Parameters:
    +

    audio_file – The audio file to assign to the task.

    +
    +
    Returns:
    +

    The created task.

    +
    +
    +
    +
    +
    +classmethod from_tuple(task_tuple: Tuple[str, dict]) BaseTask[source]#
    +

    Create a task from a tuple of the audio file name and the task kwargs.

    +
    +
    Parameters:
    +

    task_tuple – The task tuple to create the task from.

    +
    +
    Returns:
    +

    The created task.

    +
    +
    +
    +
    +
    +
    +class silero_vad.silero_vad.VoiceActivityDetector(use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, return_seconds: bool = False, per_channel: bool = False)[source]#
    +

    Bases: object

    +

    A voice activity detection wrapper for the silero VAD model - snakers4/silero-vad.

    +
    +
    +detect_voice(audio_file: Path) List[Dict[str, int]] | List[List[Dict[str, int]]][source]#
    +

    Infer the audio through the VAD model and return the speech timestamps.

    +
    +
    Parameters:
    +

    audio_file – The audio file to infer.

    +
    +
    Returns:
    +

    The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the +following keys:

    +
      +
    • ”start”: The start sample index of the speech in the audio.

    • +
    • ”end”: The end sample index of the speech in the audio.

    • +
    +

    If per_channel is True, a list of timestamps per channel will be returned.

    +

    +
    +
    +
    +
    +
    +load(force_reload: bool = True)[source]#
    +

    Load the VAD model.

    +
    +
    Parameters:
    +

    force_reload – Whether to force reload the model even if it was already loaded. Default is True.

    +
    +
    +
    +
    +
    +
    +silero_vad.silero_vad.detect_voice(data_path: str | Path | List[str | Path], use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, return_seconds: bool = False, per_channel: bool = False, use_multiprocessing: int = 0, verbose: bool = False)[source]#
    +

    Perform voice activity detection on given audio files using the silero VAD model - +snakers4/silero-vad. The end result is a dictionary with the file names as keys and their +VAD timestamps dictionaries as value.

    +

    For example:

    +
    {
    +    "file_1.wav": [
    +        {"start": 0, "end": 16000},
    +        {"start": 16000, "end": 32000},
    +        {"start": 32000, "end": 48000},
    +        ...
    +    ],
    +    "file_2.wav": [
    +        {"start": 0, "end": 16000},
    +        {"start": 16000, "end": 32000},
    +        {"start": 32000, "end": 48000},
    +        ...
    +    ],
    +    ...
    +}
    +
    +
    +
    +
    Parameters:
    +
      +
    • data_path – The path to the audio files to diarize. Can be a path to a single file, a path to a +directory or a list of paths to files.

    • +
    • use_onnx – Whether to use ONNX for inference. Default is True.

    • +
    • force_onnx_cpu – Whether to force ONNX to use CPU for inference. Default is True.

    • +
    • threshold – Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, +probabilities ABOVE this value are considered as SPEECH. It is better to tune +this parameter for each dataset separately, but “lazy” 0.5 is pretty good for +most datasets.

    • +
    • sampling_rate – Currently, silero VAD models support 8000 and 16000 sample rates.

    • +
    • min_speech_duration_ms – Final speech chunks shorter than min_speech_duration_ms are thrown out.

    • +
    • max_speech_duration_s – Maximum duration of speech chunks in seconds. Chunks longer than +max_speech_duration_s will be split at the timestamp of the last silence that +lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will +be split aggressively just before max_speech_duration_s.

    • +
    • min_silence_duration_ms – At the end of each speech chunk, wait for min_silence_duration_ms before separating +it.

    • +
    • window_size_samples

      Audio chunks of window_size_samples size are fed to the silero VAD model.

      +

      WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 +sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than +these may affect model performance!

      +

    • +
    • speech_pad_ms – Final speech chunks are padded by speech_pad_ms each side.

    • +
    • return_seconds – Whether to return timestamps in seconds. False means to return timestamps in samples +(default - False).

    • +
    • per_channel – Whether to return timestamps per channel (default - False). This will run VAD on +each channel separately and return a list of timestamps per channel.

    • +
    • use_multiprocessing – The number of workers to use for multiprocessing. If 0, no multiprocessing will +be used. Default is 0.

    • +
    • verbose – Verbosity.

    • +
    +
    +
    +
    +
    +
    +silero_vad.silero_vad.diarize(data_path: str | Path | List[str | Path], use_onnx: bool = True, force_onnx_cpu: bool = True, threshold: float = 0.5, sampling_rate: int = 16000, min_speech_duration_ms: int = 250, max_speech_duration_s: float = inf, min_silence_duration_ms: int = 100, window_size_samples: int = 512, speech_pad_ms: int = 30, speaker_labels: List[str] | None = None, use_multiprocessing: int = 0, verbose: bool = False)[source]#
    +

    Perform speech diarization on given audio files using the silero VAD model - snakers4/silero-vad. +The speech diarization is performed per channel so that each channel in the audio belongs to a different speaker. The +end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list +of tuples: (start, end, speaker_label).

    +

    For example:

    +
    {
    +    "file_1.wav": [
    +        (0.0, 1.0, "speaker_0"),
    +        (1.0, 2.0, "speaker_1"),
    +        (2.0, 3.0, "speaker_0"),
    +        ...
    +    ],
    +    "file_2.wav": [
    +        (0.0, 1.0, "speaker_0"),
    +        (1.0, 2.0, "speaker_1"),
    +        (2.0, 3.0, "speaker_0"),
    +        ...
    +    ],
    +    ...
    +}
    +
    +
    +
    +
    Parameters:
    +
      +
    • data_path – The path to the audio files to diarize. Can be a path to a single file, a path to a +directory or a list of paths to files.

    • +
    • use_onnx – Whether to use ONNX for inference. Default is True.

    • +
    • force_onnx_cpu – Whether to force ONNX to use CPU for inference. Default is True.

    • +
    • threshold – Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, +probabilities ABOVE this value are considered as SPEECH. It is better to tune +this parameter for each dataset separately, but “lazy” 0.5 is pretty good for +most datasets.

    • +
    • sampling_rate – Currently, silero VAD models support 8000 and 16000 sample rates.

    • +
    • min_speech_duration_ms – Final speech chunks shorter than min_speech_duration_ms are thrown out.

    • +
    • max_speech_duration_s – Maximum duration of speech chunks in seconds. Chunks longer than +max_speech_duration_s will be split at the timestamp of the last silence that +lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will +be split aggressively just before max_speech_duration_s.

    • +
    • min_silence_duration_ms – At the end of each speech chunk, wait for min_silence_duration_ms before separating +it.

    • +
    • window_size_samples

      Audio chunks of window_size_samples size are fed to the silero VAD model.

      +

      WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 +sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than +these may affect model performance!

      +

    • +
    • speech_pad_ms – Final speech chunks are padded by speech_pad_ms each side.

    • +
    • speaker_labels – The speaker labels to use for the diarization. If not given, the speakers will be +named “speaker_0”, “speaker_1”, etc.

    • +
    • use_multiprocessing – The number of workers to use for multiprocessing. If 0, no multiprocessing will +be used. Default is 0.

    • +
    • verbose – Verbosity.

    • +
    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/1.4.0/static/example.html b/functions/master/silero_vad/1.4.0/static/example.html new file mode 100644 index 00000000..0eb7542f --- /dev/null +++ b/functions/master/silero_vad/1.4.0/static/example.html @@ -0,0 +1,212 @@ + + + + + + + +<no title> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +

    Contents

    +
    + +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/1.4.0/static/function.html b/functions/master/silero_vad/1.4.0/static/function.html new file mode 100644 index 00000000..839ab544 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/static/function.html @@ -0,0 +1,308 @@ + + + + + + + + + + + Source + + + + +
    +        
    +metadata:
    +  tag: ''
    +  categories:
    +  - deep-learning
    +  - audio
    +  name: silero-vad
    +verbose: false
    +spec:
    +  description: Silero VAD (Voice Activity Detection) functions.
    +  build:
    +    code_origin: ''
    +    base_image: mlrun/mlrun
    +    requirements:
    +    - torch
    +    - torchaudio
    +    - tqdm
    +    - onnxruntime
    +    functionSourceCode: 
    +    origin_filename: ''
    +  image: ''
    +  command: ''
    +  entry_points:
    +    audio_file:
    +      doc: Get the audio file of the task.
    +      lineno: 43
    +      has_varargs: false
    +      outputs:
    +      - doc: The audio file of the task.
    +        type: Path
    +      parameters:
    +      - name: self
    +      has_kwargs: false
    +      name: audio_file
    +    do_task:
    +      doc: Do the task on the given speech timestamps. The task will diarize the VAD
    +        speech timestamps into speakers.
    +      lineno: 94
    +      has_varargs: false
    +      parameters:
    +      - name: self
    +      - name: speech_timestamps
    +        type: List[List[Dict[str, int]]]
    +        doc: The speech timestamps per channel to do the task on as outputted from
    +          the VAD.
    +      has_kwargs: false
    +      name: do_task
    +    get_result:
    +      doc: Get the result of the task. A tuple of the audio file name and the result.
    +      lineno: 61
    +      has_varargs: false
    +      outputs:
    +      - doc: The result of the task.
    +        type: Tuple[str, list]
    +      parameters:
    +      - name: self
    +      has_kwargs: false
    +      name: get_result
    +    to_tuple:
    +      doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing
    +        to pass in queue).
    +      lineno: 116
    +      has_varargs: false
    +      outputs:
    +      - doc: The converted task.
    +        type: Tuple[str, dict]
    +      parameters:
    +      - name: self
    +      has_kwargs: false
    +      name: to_tuple
    +    create_task:
    +      doc: Create a task with the given audio file.
    +      lineno: 146
    +      has_varargs: false
    +      outputs:
    +      - doc: The created task.
    +        type: BaseTask
    +      parameters:
    +      - name: self
    +      - name: audio_file
    +        type: Path
    +        doc: The audio file to assign to the task.
    +      has_kwargs: false
    +      name: create_task
    +    from_tuple:
    +      doc: Create a task from a tuple of the audio file name and the task kwargs.
    +      lineno: 157
    +      has_varargs: false
    +      outputs:
    +      - doc: The created task.
    +        type: BaseTask
    +      parameters:
    +      - name: cls
    +      - name: task_tuple
    +        type: Tuple[str, dict]
    +        doc: The task tuple to create the task from.
    +      has_kwargs: false
    +      name: from_tuple
    +    load:
    +      doc: Load the VAD model.
    +      lineno: 234
    +      has_varargs: false
    +      parameters:
    +      - name: self
    +      - name: force_reload
    +        type: bool
    +        doc: Whether to force reload the model even if it was already loaded. Default
    +          is True.
    +        default: true
    +      has_kwargs: false
    +      name: load
    +    detect_voice:
    +      doc: "Perform voice activity detection on given audio files using the silero\
    +        \ VAD model -\nhttps://github.com/snakers4/silero-vad. The end result is a\
    +        \ dictionary with the file names as keys and their\nVAD timestamps dictionaries\
    +        \ as value.\n\nFor example::\n\n    {\n        \"file_1.wav\": [\n       \
    +        \     {\"start\": 0, \"end\": 16000},\n            {\"start\": 16000, \"end\"\
    +        : 32000},\n            {\"start\": 32000, \"end\": 48000},\n            ...\n\
    +        \        ],\n        \"file_2.wav\": [\n            {\"start\": 0, \"end\"\
    +        : 16000},\n            {\"start\": 16000, \"end\": 32000},\n            {\"\
    +        start\": 32000, \"end\": 48000},\n            ...\n        ],\n        ...\n\
    +        \    }"
    +      lineno: 393
    +      has_varargs: false
    +      parameters:
    +      - name: data_path
    +        type: Union[str, Path, List[Union[str, Path]]]
    +        doc: The path to the audio files to run voice activity detection on. Can be a path to a single file,
    +          a path to a directory or a list of paths to files.
    +      - name: use_onnx
    +        type: bool
    +        doc: Whether to use ONNX for inference. Default is True.
    +        default: true
    +      - name: force_onnx_cpu
    +        type: bool
    +        doc: Whether to force ONNX to use CPU for inference. Default is True.
    +        default: true
    +      - name: threshold
    +        type: float
    +        doc: Speech threshold. Silero VAD outputs speech probabilities for each audio
    +          chunk, probabilities ABOVE this value are considered as SPEECH. It is better
    +          to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty
    +          good for most datasets.
    +        default: 0.5
    +      - name: sampling_rate
    +        type: int
    +        doc: Currently, silero VAD models support 8000 and 16000 sample rates.
    +        default: 16000
    +      - name: min_speech_duration_ms
    +        type: int
    +        doc: Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +        default: 250
    +      - name: max_speech_duration_s
    +        type: float
    +        doc: Maximum duration of speech chunks in seconds. Chunks longer than `max_speech_duration_s`
    +          will be split at the timestamp of the last silence that lasts more than
    +          100ms (if any), to prevent aggressive cutting. Otherwise, they will be split
    +          aggressively just before max_speech_duration_s.
    +        default: float('inf')
    +      - name: min_silence_duration_ms
    +        type: int
    +        doc: At the end of each speech chunk, wait for min_silence_duration_ms before
    +          separating it.
    +        default: 100
    +      - name: window_size_samples
    +        type: int
    +        doc: Audio chunks of window_size_samples size are fed to the silero VAD model.
    +        default: 512
    +      - name: speech_pad_ms
    +        type: int
    +        doc: Final speech chunks are padded by speech_pad_ms each side.
    +        default: 30
    +      - name: return_seconds
    +        type: bool
    +        doc: Whether to return timestamps in seconds. False means to return timestamps
    +          in samples (default - False).
    +        default: false
    +      - name: per_channel
    +        type: bool
    +        doc: Whether to return timestamps per channel (default - False). This will
    +          run VAD on each channel separately and return a list of timestamps per channel.
    +        default: false
    +      - name: use_multiprocessing
    +        type: int
    +        doc: The number of workers to use for multiprocessing. If 0, no multiprocessing
    +          will be used. Default is 0.
    +        default: 0
    +      - name: verbose
    +        type: bool
    +        doc: Verbosity.
    +        default: false
    +      has_kwargs: false
    +      name: detect_voice
    +    diarize:
    +      doc: "Perform speech diarization on given audio files using the silero VAD model\
    +        \ - https://github.com/snakers4/silero-vad.\nThe speech diarization is performed\
    +        \ per channel so that each channel in the audio belongs to a different speaker.\
    +        \ The\nend result is a dictionary with the file names as keys and their diarization\
    +        \ as value. A diarization is a list\nof tuples: (start, end, speaker_label).\n\
    +        \nFor example::\n\n    {\n        \"file_1.wav\": [\n            (0.0, 1.0,\
    +        \ \"speaker_0\"),\n            (1.0, 2.0, \"speaker_1\"),\n            (2.0,\
    +        \ 3.0, \"speaker_0\"),\n            ...\n        ],\n        \"file_2.wav\"\
    +        : [\n            (0.0, 1.0, \"speaker_0\"),\n            (1.0, 2.0, \"speaker_1\"\
    +        ),\n            (2.0, 3.0, \"speaker_0\"),\n            ...\n        ],\n\
    +        \        ...\n    }"
    +      lineno: 517
    +      has_varargs: false
    +      parameters:
    +      - name: data_path
    +        type: Union[str, Path, List[Union[str, Path]]]
    +        doc: The path to the audio files to diarize. Can be a path to a single file,
    +          a path to a directory or a list of paths to files.
    +      - name: use_onnx
    +        type: bool
    +        doc: Whether to use ONNX for inference. Default is True.
    +        default: true
    +      - name: force_onnx_cpu
    +        type: bool
    +        doc: Whether to force ONNX to use CPU for inference. Default is True.
    +        default: true
    +      - name: threshold
    +        type: float
    +        doc: Speech threshold. Silero VAD outputs speech probabilities for each audio
    +          chunk, probabilities ABOVE this value are considered as SPEECH. It is better
    +          to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty
    +          good for most datasets.
    +        default: 0.5
    +      - name: sampling_rate
    +        type: int
    +        doc: Currently, silero VAD models support 8000 and 16000 sample rates.
    +        default: 16000
    +      - name: min_speech_duration_ms
    +        type: int
    +        doc: Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +        default: 250
    +      - name: max_speech_duration_s
    +        type: float
    +        doc: Maximum duration of speech chunks in seconds. Chunks longer than `max_speech_duration_s`
    +          will be split at the timestamp of the last silence that lasts more than
    +          100ms (if any), to prevent aggressive cutting. Otherwise, they will be split
    +          aggressively just before max_speech_duration_s.
    +        default: float('inf')
    +      - name: min_silence_duration_ms
    +        type: int
    +        doc: At the end of each speech chunk, wait for min_silence_duration_ms before
    +          separating it.
    +        default: 100
    +      - name: window_size_samples
    +        type: int
    +        doc: Audio chunks of window_size_samples size are fed to the silero VAD model.
    +        default: 512
    +      - name: speech_pad_ms
    +        type: int
    +        doc: Final speech chunks are padded by speech_pad_ms each side.
    +        default: 30
    +      - name: speaker_labels
    +        type: List[str]
    +        doc: The speaker labels to use for the diarization. If not given, the speakers
    +          will be named "speaker_0", "speaker_1", etc.
    +        default: null
    +      - name: use_multiprocessing
    +        type: int
    +        doc: The number of workers to use for multiprocessing. If 0, no multiprocessing
    +          will be used. Default is 0.
    +        default: 0
    +      - name: verbose
    +        type: bool
    +        doc: Verbosity.
    +        default: false
    +      has_kwargs: false
    +      name: diarize
    +  disable_auto_mount: false
    +  default_handler: detect_voice
    +kind: job
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/1.4.0/static/item.html b/functions/master/silero_vad/1.4.0/static/item.html new file mode 100644 index 00000000..9e297535 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/static/item.html @@ -0,0 +1,64 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- deep-learning
    +- audio
    +description: Silero VAD (Voice Activity Detection) functions.
    +doc: ''
    +example: silero_vad.ipynb
    +generationDate: 2023-12-03:14-30
    +hidden: false
    +icon: ''
    +labels:
    +  author: guyl
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.7.0
    +name: silero_vad
    +platformVersion: 3.5.3
    +spec:
    +  filename: silero_vad.py
    +  handler: detect_voice
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +  - torch
    +  - torchaudio
    +  - tqdm
    +  - onnxruntime
    +url: ''
    +version: 1.4.0
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/1.4.0/static/silero_vad.html b/functions/master/silero_vad/1.4.0/static/silero_vad.html new file mode 100644 index 00000000..feae13a6 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/static/silero_vad.html @@ -0,0 +1,1064 @@ + + + + + + + +silero_vad.silero_vad + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for silero_vad.silero_vad

    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from types import FunctionType
    +from typing import Dict, List, Tuple, Type, Union
    +
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +
    +
    +[docs] +class BaseTask: + """ + A base class for a task to complete after VAD. + """ + + def __init__(self, audio_file: Path): + """ + Initialize the base task. + + :param audio_file: The audio file assigned to the task. + """ + # Store the audio file: + self._audio_file = audio_file + + # Prepare the result: + self._result = None + + @property + def audio_file(self) -> Path: + """ + Get the audio file of the task. + + :returns: The audio file of the task. + """ + return self._audio_file + +
    +[docs] + def do_task( + self, speech_timestamps: Union[List[Dict[str, int]], List[List[Dict[str, int]]]] + ): + """ + Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result. + + :param speech_timestamps: The speech timestamps to do the task on as outputted from the VAD. + """ + self._result = speech_timestamps
    + + +
    +[docs] + def get_result(self) -> Tuple[str, list]: + """ + Get the result of the task. A tuple of the audio file name and the result. + + :returns: The result of the task. + """ + return self._audio_file.name, self._result
    + + +
    +[docs] + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + return self.__class__.__name__, {"audio_file": self._audio_file}
    +
    + + + +
    +[docs] +class SpeechDiarizationTask(BaseTask): + """ + A speech diarization task. The task will diarize the VAD speech timestamps into speakers. + """ + + def __init__(self, audio_file: Path, speaker_labels: List[str]): + """ + Initialize the speech diarization task. + + :param audio_file: The audio file assigned to the task. + :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be named + "speaker_0", "speaker_1", etc. + """ + super().__init__(audio_file=audio_file) + self._speaker_labels = speaker_labels + +
    +[docs] + def do_task(self, speech_timestamps: List[List[Dict[str, int]]]): + """ + Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers. + + :param speech_timestamps: The speech timestamps per channel to do the task on as outputted from the VAD. + """ + # Get the speaker labels (set default if not given): + speaker_labels = self._speaker_labels or [ + f"speaker_{i}" for i in range(len(speech_timestamps)) + ] + + # Diarize - organize the speech timestamps into a single list of speakers and sort it by start time: + speech_diarization = [ + (speech_timestamp["start"], speech_timestamp["end"], speaker_label) + for speaker_label, channel_speech_timestamps in zip( + speaker_labels, speech_timestamps + ) + for speech_timestamp in channel_speech_timestamps + ] + speech_diarization.sort() + self._result = speech_diarization
    + + +
    +[docs] + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + task_class, task_kwargs = super().to_tuple() + return task_class, {**task_kwargs, "speaker_labels": self._speaker_labels}
    +
    + + + +
    +[docs] +class TaskCreator: + """ + A task creator to create different tasks to run after the VAD. + """ + + #: A map from task class name to task class to use in `from_tuple`: + _MAP = { + BaseTask.__name__: BaseTask, + SpeechDiarizationTask.__name__: SpeechDiarizationTask, + } + + def __init__(self, task_type: Type[BaseTask], task_kwargs: dict = None): + """ + Initialize the task creator. + :param task_type: The task type - a `BaseTask` subclass. + :param task_kwargs: Additional keyword arguments to pass to the to be created tasks. + """ + self._task_type = task_type + self._task_kwargs = task_kwargs or {} + +
    +[docs] + def create_task(self, audio_file: Path) -> BaseTask: + """ + Create a task with the given audio file. + + :param audio_file: The audio file to assign to the task. + + :returns: The created task. + """ + return self._task_type(audio_file=audio_file, **self._task_kwargs)
    + + +
    +[docs] + @classmethod + def from_tuple(cls, task_tuple: Tuple[str, dict]) -> BaseTask: + """ + Create a task from a tuple of the audio file name and the task kwargs. + + :param task_tuple: The task tuple to create the task from. + + :returns: The created task. + """ + task_class, task_kwargs = task_tuple + return cls._MAP[task_class](**task_kwargs)
    +
    + + + +
    +[docs] +class VoiceActivityDetector: + """ + A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad. + """ + + def __init__( + self, + # Model loading kwargs: + use_onnx: bool = True, + force_onnx_cpu: bool = True, + # Detection kwargs: + threshold: float = 0.5, + sampling_rate: int = 16_000, + min_speech_duration_ms: int = 250, + max_speech_duration_s: float = float("inf"), + min_silence_duration_ms: int = 100, + window_size_samples: int = 512, + speech_pad_ms: int = 30, + return_seconds: bool = False, + per_channel: bool = False, + ): + """ + Initialize the voice activity detector. + + :param use_onnx: Whether to use ONNX for inference. Default is True. + :param force_onnx_cpu: Whether to force ONNX to use CPU for inference. Default is True. + :param threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, + probabilities ABOVE this value are considered as SPEECH. It is better to tune + this parameter for each dataset separately, but "lazy" 0.5 is pretty good for + most datasets. + :param sampling_rate: Currently, silero VAD models support 8000 and 16000 sample rates. + :param min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. + :param max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer than + `max_speech_duration_s` will be split at the timestamp of the last silence that + lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, + they will be split aggressively just before max_speech_duration_s. + :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before + separating it. + :param window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model. + WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 + sample rate and 256, 512, 768 samples for 8000 sample rate. 
Values other than + these may affect model performance! + :param speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side. + :param return_seconds: Whether return timestamps in seconds. False means to return timestamps in + samples (default - False). + :param per_channel: Whether to return timestamps per channel (default - False). This will run VAD + on each channel separately and return a list of timestamps per channel. + """ + # Store configurations: + self._use_onnx = use_onnx + self._force_onnx_cpu = force_onnx_cpu + self._threshold = threshold + self._sampling_rate = sampling_rate + self._min_speech_duration_ms = min_speech_duration_ms + self._max_speech_duration_s = max_speech_duration_s + self._min_silence_duration_ms = min_silence_duration_ms + self._window_size_samples = window_size_samples + self._speech_pad_ms = speech_pad_ms + self._return_seconds = return_seconds + self._per_channel = per_channel + + # Prepare the model variables + self._model: torch.Module = None + self._get_speech_timestamps: FunctionType = None + +
    +[docs] + def load(self, force_reload: bool = True): + """ + Load the VAD model. + + :param force_reload: Whether to force reload the model even if it was already loaded. Default is True. + """ + model, utils = torch.hub.load( + repo_or_dir="snakers4/silero-vad", + model="silero_vad", + force_reload=force_reload, + onnx=self._use_onnx, + force_onnx_cpu=self._force_onnx_cpu, + ) + self._model = model + ( + self._get_speech_timestamps, + _, # save_audio, + _, # read_audio, + _, # VADIterator, + _, # collect_chunks + ) = utils
    + + +
    +[docs] + def detect_voice( + self, + audio_file: Path, + ) -> Union[List[Dict[str, int]], List[List[Dict[str, int]]]]: + """ + Infer the audio through the VAD model and return the speech timestamps. + + :param audio_file: The audio file to infer. + + :returns: The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the + following keys: + + * "start": The start sample index of the speech in the audio. + * "end": The end sample index of the speech in the audio. + + If `per_channel` is True, a list of timestamps per channel will be returned. + """ + # Cast to a numpy array: + audio = self._read_audio(audio_file) + + # Detect speech: + if not self._per_channel: + return self._get_speech_timestamps( + audio, + self._model, + threshold=self._threshold, + min_speech_duration_ms=self._min_speech_duration_ms, + max_speech_duration_s=self._max_speech_duration_s, + min_silence_duration_ms=self._min_silence_duration_ms, + speech_pad_ms=self._speech_pad_ms, + sampling_rate=self._sampling_rate, + window_size_samples=self._window_size_samples, + return_seconds=self._return_seconds, + ) + + # Per channel: + speech_timestamps = [] + for channel in audio: + speech_timestamps.append( + self._get_speech_timestamps( + channel, + self._model, + threshold=self._threshold, + min_speech_duration_ms=self._min_speech_duration_ms, + max_speech_duration_s=self._max_speech_duration_s, + min_silence_duration_ms=self._min_silence_duration_ms, + speech_pad_ms=self._speech_pad_ms, + sampling_rate=self._sampling_rate, + window_size_samples=self._window_size_samples, + return_seconds=self._return_seconds, + ) + ) + + return speech_timestamps
    + + + def _read_audio( + self, + path: Path, + ) -> torch.Tensor: + """ + Read the audio from the given path and return it as a tensor. + + :param path: The path to the audio file. + + :returns: The audio as a tensor. + """ + # Read the audio: + audio, sampling_rate = torchaudio.load(str(path)) + + # Check if the audio is stereo and if so, convert it to mono (only if not per channel): + if audio.size(0) > 1 and not self._per_channel: + audio = audio.mean(dim=0, keepdim=True) + + # Resample the audio if needed: + if sampling_rate != self._sampling_rate: + transform = torchaudio.transforms.Resample( + orig_freq=sampling_rate, new_freq=self._sampling_rate + ) + audio = transform(audio) + + # Return the audio (squeeze if not per channel): + return audio if self._per_channel else audio.squeeze(0)
    + + + +#: The value to send into multiprocessing queues to stop the process: +_MULTIPROCESSING_STOP_MARK = "STOP" + + +def _multiprocessing_complete_tasks( + vad_init_kwargs: dict, tasks_queue: Queue, results_queue: Queue +): + """ + Complete the tasks in the given queue and put the results in the given results queue. The function will stop when + the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param vad_init_kwargs: The VAD initialization kwargs. + :param tasks_queue: A queue to get the tasks from. + :param results_queue: A queue to put the results in. + """ + # Initialize and load the VAD: + vad = VoiceActivityDetector(**vad_init_kwargs) + vad.load(force_reload=False) + + # Start listening to the tasks queue: + while True: + # Get the task: + task: Tuple[str, dict] = tasks_queue.get() + if task == _MULTIPROCESSING_STOP_MARK: + break + try: + # Create the task: + task = TaskCreator.from_tuple(task_tuple=task) + # Run the file through the VAD: + speech_timestamps = vad.detect_voice(audio_file=task.audio_file) + # Complete the task: + task.do_task(speech_timestamps=speech_timestamps) + # Build the result: + result = (False, task.get_result()) + except Exception as exception: + # Build the error: + result = (True, (task.audio_file.name, str(exception))) + # Collect the result / error: + results_queue.put(result) + + # Mark the end of the tasks: + results_queue.put(_MULTIPROCESSING_STOP_MARK) + + +# Get the global logger: +try: + import mlrun + + _LOGGER = mlrun.get_or_create_ctx("silero_vad").logger +except ModuleNotFoundError: + _LOGGER = logging.getLogger() + + +
    +[docs] +def detect_voice( + # Input kwargs: + data_path: Union[str, Path, List[Union[str, Path]]], + # Model loading kwargs: + use_onnx: bool = True, + force_onnx_cpu: bool = True, + # Detection kwargs: + threshold: float = 0.5, + sampling_rate: int = 16_000, + min_speech_duration_ms: int = 250, + max_speech_duration_s: float = float("inf"), + min_silence_duration_ms: int = 100, + window_size_samples: int = 512, + speech_pad_ms: int = 30, + return_seconds: bool = False, + per_channel: bool = False, + # Other kwargs: + use_multiprocessing: int = 0, + verbose: bool = False, +): + """ + Perform voice activity detection on given audio files using the silero VAD model - + https://github.com/snakers4/silero-vad. The end result is a dictionary with the file names as keys and their + VAD timestamps dictionaries as value. + + For example:: + + { + "file_1.wav": [ + {"start": 0, "end": 16000}, + {"start": 16000, "end": 32000}, + {"start": 32000, "end": 48000}, + ... + ], + "file_2.wav": [ + {"start": 0, "end": 16000}, + {"start": 16000, "end": 32000}, + {"start": 32000, "end": 48000}, + ... + ], + ... + } + + + :param data_path: The path to the audio files to diarize. Can be a path to a single file, a path to a + directory or a list of paths to files. + :param use_onnx: Whether to use ONNX for inference. Default is True. + :param force_onnx_cpu: Whether to force ONNX to use CPU for inference. Default is True. + :param threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, + probabilities ABOVE this value are considered as SPEECH. It is better to tune + this parameter for each dataset separately, but "lazy" 0.5 is pretty good for + most datasets. + :param sampling_rate: Currently, silero VAD models support 8000 and 16000 sample rates. + :param min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. + :param max_speech_duration_s: Maximum duration of speech chunks in seconds. 
Chunks longer than + `max_speech_duration_s` will be split at the timestamp of the last silence that + lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will + be split aggressively just before max_speech_duration_s. + :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating + it. + :param window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model. + + WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 + sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than + these may affect model performance! + :param speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side. + :param return_seconds: Whether return timestamps in seconds. False means to return timestamps in samples + (default - False). + :param per_channel: Whether to return timestamps per channel (default - False). This will run VAD on + each channel separately and return a list of timestamps per channel. + :param use_multiprocessing: The number of workers to use for multiprocessing. If 0, no multiprocessing will + be used. Default is 0. + :param verbose: Verbosity. 
+ """ + global _LOGGER + + # Get the input audio files to transcribe: + if verbose: + _LOGGER.info("Collecting audio files.") + audio_files = _get_audio_files(data_path=data_path) + if verbose: + _LOGGER.info(f"Collected {len(audio_files)} audio files.") + + # Initialize the transcription pipeline: + vad_init_kwargs = { + "use_onnx": use_onnx, + "force_onnx_cpu": force_onnx_cpu, + "threshold": threshold, + "sampling_rate": sampling_rate, + "min_speech_duration_ms": min_speech_duration_ms, + "max_speech_duration_s": max_speech_duration_s, + "min_silence_duration_ms": min_silence_duration_ms, + "window_size_samples": window_size_samples, + "speech_pad_ms": speech_pad_ms, + "return_seconds": return_seconds, + "per_channel": per_channel, + } + + # Create the task creator: + task_creator = TaskCreator(task_type=BaseTask) + + # Run the transcription: + if use_multiprocessing: + results = _parallel_run( + n_workers=use_multiprocessing, + audio_files=audio_files, + description="Detecting voice", + vad_init_kwargs=vad_init_kwargs, + task_creator=task_creator, + verbose=verbose, + ) + else: + results = _run( + audio_files=audio_files, + description="Detecting voice", + vad_init_kwargs=vad_init_kwargs, + task_creator=task_creator, + verbose=verbose, + ) + + # Process the results: + return _process_results(results=results, verbose=verbose)
    + + + +
    +[docs] +def diarize( + # Input / Output kwargs: + data_path: Union[str, Path, List[Union[str, Path]]], + # Model loading kwargs: + use_onnx: bool = True, + force_onnx_cpu: bool = True, + # Detection kwargs: + threshold: float = 0.5, + sampling_rate: int = 16_000, + min_speech_duration_ms: int = 250, + max_speech_duration_s: float = float("inf"), + min_silence_duration_ms: int = 100, + window_size_samples: int = 512, + speech_pad_ms: int = 30, + # Diarization kwargs: + speaker_labels: List[str] = None, + # Other kwargs: + use_multiprocessing: int = 0, + verbose: bool = False, +): + """ + Perform speech diarization on given audio files using the silero VAD model - https://github.com/snakers4/silero-vad. + The speech diarization is performed per channel so that each channel in the audio belong to a different speaker. The + end result is a dictionary with the file names as keys and their diarization as value. A diarization is a list + of tuples: (start, end, speaker_label). + + For example:: + + { + "file_1.wav": [ + (0.0, 1.0, "speaker_0"), + (1.0, 2.0, "speaker_1"), + (2.0, 3.0, "speaker_0"), + ... + ], + "file_2.wav": [ + (0.0, 1.0, "speaker_0"), + (1.0, 2.0, "speaker_1"), + (2.0, 3.0, "speaker_0"), + ... + ], + ... + } + + + :param data_path: The path to the audio files to diarize. Can be a path to a single file, a path to a + directory or a list of paths to files. + :param use_onnx: Whether to use ONNX for inference. Default is True. + :param force_onnx_cpu: Whether to force ONNX to use CPU for inference. Default is True. + :param threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, + probabilities ABOVE this value are considered as SPEECH. It is better to tune + this parameter for each dataset separately, but "lazy" 0.5 is pretty good for + most datasets. + :param sampling_rate: Currently, silero VAD models support 8000 and 16000 sample rates. 
+ :param min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. + :param max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer than + `max_speech_duration_s` will be split at the timestamp of the last silence that + lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will + be split aggressively just before max_speech_duration_s. + :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating + it. + :param window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model. + + WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 + sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than + these may affect model performance! + :param speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side. + :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be + named "speaker_0", "speaker_1", etc. + :param use_multiprocessing: The number of workers to use for multiprocessing. If 0, no multiprocessing will + be used. Default is 0. + :param verbose: Verbosity. 
+ """ + global _LOGGER + + # Get the input audio files to transcribe: + if verbose: + _LOGGER.info("Collecting audio files.") + audio_files = _get_audio_files(data_path=data_path) + if verbose: + _LOGGER.info(f"Collected {len(audio_files)} audio files.") + + # Initialize the transcription pipeline: + vad_init_kwargs = { + "use_onnx": use_onnx, + "force_onnx_cpu": force_onnx_cpu, + "threshold": threshold, + "sampling_rate": sampling_rate, + "min_speech_duration_ms": min_speech_duration_ms, + "max_speech_duration_s": max_speech_duration_s, + "min_silence_duration_ms": min_silence_duration_ms, + "window_size_samples": window_size_samples, + "speech_pad_ms": speech_pad_ms, + "return_seconds": True, + "per_channel": True, + } + + # Create the task creator: + task_creator = TaskCreator( + task_type=SpeechDiarizationTask, task_kwargs={"speaker_labels": speaker_labels} + ) + + # Run the transcription: + if use_multiprocessing: + results = _parallel_run( + n_workers=use_multiprocessing, + audio_files=audio_files, + description="Diarizing", + vad_init_kwargs=vad_init_kwargs, + task_creator=task_creator, + verbose=verbose, + ) + else: + results = _run( + audio_files=audio_files, + description="Diarizing", + vad_init_kwargs=vad_init_kwargs, + task_creator=task_creator, + verbose=verbose, + ) + + # Process the results: + return _process_results(results=results, verbose=verbose)
    + + + +def _get_audio_files( + data_path: Union[Path, str, list], +) -> List[Path]: + """ + Get the audio files from the data path. If a path to a directory is given, all files in the directory will be + collected. + + :param data_path: The data path to collect the audio files from. + + :returns: The audio files list. + """ + # Check if given a list of paths: + if isinstance(data_path, list): + audio_files = [] + for path in data_path: + audio_files.extend(_get_audio_files(data_path=path)) + return audio_files + + # Check if given a single string path to cast it to a `pathlib.Path`: + if isinstance(data_path, str): + data_path = Path(data_path).absolute() + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a " + f"file. Given: {str(data_path)} " + ) + + return audio_files + + +def _run( + audio_files: List[Path], + description: str, + vad_init_kwargs: dict, + task_creator: TaskCreator, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, list]]]: + """ + Load a VAD and use it to complete the tasks that will be created on the provided files using the given task creator. + + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. + :param vad_init_kwargs: The VAD initialization keyword arguments. + :param task_creator: The task creator to use to create the tasks. + :param verbose: Verbosity. + + :returns: The collected results. 
+ """ + # Load the VAD: + vad = VoiceActivityDetector(**vad_init_kwargs) + if verbose: + _LOGGER.info(f"Loading the VAD model.") + vad.load() + if verbose: + _LOGGER.info("VAD model loaded.") + + # Run the VAD on the audio files and collect the results: + results = [] + for audio_file in tqdm( + audio_files, + desc=description, + unit="file", + total=len(audio_files), + disable=not verbose, + ): + try: + # Create the task: + task = task_creator.create_task(audio_file=audio_file) + # Run the file through the VAD: + speech_timestamps = vad.detect_voice(audio_file=audio_file) + # Complete the task: + task.do_task(speech_timestamps=speech_timestamps) + # Collect the result: + results.append((False, task.get_result())) + except Exception as exception: + # Collect the error: + results.append((True, (audio_file.name, str(exception)))) + + return results + + +def _parallel_run( + n_workers: int, + audio_files: List[Path], + description: str, + vad_init_kwargs: dict, + task_creator: TaskCreator, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, list]]]: + """ + Run multiple VAD workers with multiprocessing to complete the tasks that will be created on the provided files using + the given task creator. + + :param n_workers: The number of workers to use. + :param audio_files: The audio files to use. + :param description: The description to use for the progress bar. + :param vad_init_kwargs: The VAD initialization keyword arguments. + :param task_creator: The task creator to use to create the tasks. + :param verbose: Verbosity. + + :returns: The collected results. 
+ """ + # Load the VAD (download once, and it will be loaded then per process later on): + if verbose: + _LOGGER.info(f"Loading the VAD model.") + vad = VoiceActivityDetector(**vad_init_kwargs) + vad.load() + if verbose: + _LOGGER.info("VAD model loaded.") + + # Check the number of workers: + if n_workers > len(audio_files): + _LOGGER.warning( + f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). " + f"Setting the number of workers to {len(audio_files)}." + ) + n_workers = len(audio_files) + + # Initialize the multiprocessing queues: + tasks_queue = Queue() + results_queue = Queue() + + # Initialize the multiprocessing processes: + task_completion_processes = [ + Process( + target=_multiprocessing_complete_tasks, + kwargs={ + "vad_init_kwargs": vad_init_kwargs, + "tasks_queue": tasks_queue, + "results_queue": results_queue, + }, + ) + for _ in range(n_workers) + ] + + # Start the multiprocessing processes: + for p in task_completion_processes: + p.start() + + # Put the tasks in the queue: + for audio_file in audio_files: + tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple()) + + # Put the stop marks in the queue: + for _ in range(n_workers): + tasks_queue.put(_MULTIPROCESSING_STOP_MARK) + + # Collect the results: + results = [] + stop_marks_counter = 0 + with tqdm( + desc=description, + unit="file", + total=len(audio_files), + disable=not verbose, + ) as progressbar: + while True: + # Get a result from the queue: + result: Tuple[bool, Tuple[str, list]] = results_queue.get() + if result == _MULTIPROCESSING_STOP_MARK: + stop_marks_counter += 1 + if stop_marks_counter == n_workers: + break + else: + # Collect the result: + results.append(result) + progressbar.update(1) + + # Wait for the processes to finish: + for p in task_completion_processes: + p.join() + + return results + + +def _process_results( + results: List[Tuple[bool, Tuple[str, list]]], verbose: bool +) -> Tuple[dict, dict]: + """ + 
Process the results of the tasks. + + :param results: The results to process. + :param verbose: Verbosity. + + :returns: The processed results as a tuple of successes and errors. + """ + if verbose: + _LOGGER.info("Summarizing the results.") + successes = {} + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes[result[0]] = result[1] + if verbose: + _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n") + + return successes, errors +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/1.4.0/static/source.html b/functions/master/silero_vad/1.4.0/static/source.html new file mode 100644 index 00000000..6a255822 --- /dev/null +++ b/functions/master/silero_vad/1.4.0/static/source.html @@ -0,0 +1,882 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from types import FunctionType
    +from typing import Dict, List, Tuple, Type, Union
    +
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +
    +
    +class BaseTask:
    +    """
    +    A base class for a task to complete after VAD.
    +    """
    +
    +    def __init__(self, audio_file: Path):
    +        """
    +        Initialize the base task.
    +
    +        :param audio_file: The audio file assigned to the task.
    +        """
    +        # Store the audio file:
    +        self._audio_file = audio_file
    +
    +        # Prepare the result:
    +        self._result = None
    +
    +    @property
    +    def audio_file(self) -> Path:
    +        """
    +        Get the audio file of the task.
    +
    +        :returns: The audio file of the task.
    +        """
    +        return self._audio_file
    +
    +    def do_task(
    +        self, speech_timestamps: Union[List[Dict[str, int]], List[List[Dict[str, int]]]]
    +    ):
    +        """
    +        Do the task on the given speech timestamps. The base task will simply save the speech timestamps as the result.
    +
    +        :param speech_timestamps: The speech timestamps to do the task on as outputted from the VAD.
    +        """
    +        self._result = speech_timestamps
    +
    +    def get_result(self) -> Tuple[str, list]:
    +        """
    +        Get the result of the task. A tuple of the audio file name and the result.
    +
    +        :returns: The result of the task.
    +        """
    +        return self._audio_file.name, self._result
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        return self.__class__.__name__, {"audio_file": self._audio_file}
    +
    +
    +class SpeechDiarizationTask(BaseTask):
    +    """
    +    A speech diarization task. The task will diarize the VAD speech timestamps into speakers.
    +    """
    +
    +    def __init__(self, audio_file: Path, speaker_labels: List[str]):
    +        """
    +        Initialize the speech diarization task.
    +
    +        :param audio_file:     The audio file assigned to the task.
    +        :param speaker_labels: The speaker labels to use for the diarization. If not given, the speakers will be named
    +                               "speaker_0", "speaker_1", etc.
    +        """
    +        super().__init__(audio_file=audio_file)
    +        self._speaker_labels = speaker_labels
    +
    +    def do_task(self, speech_timestamps: List[List[Dict[str, int]]]):
    +        """
    +        Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers.
    +
    +        :param speech_timestamps: The speech timestamps per channel to do the task on as outputted from the VAD.
    +        """
    +        # Get the speaker labels (set default if not given):
    +        speaker_labels = self._speaker_labels or [
    +            f"speaker_{i}" for i in range(len(speech_timestamps))
    +        ]
    +
    +        # Diarize - organize the speech timestamps into a single list of speakers and sort it by start time:
    +        speech_diarization = [
    +            (speech_timestamp["start"], speech_timestamp["end"], speaker_label)
    +            for speaker_label, channel_speech_timestamps in zip(
    +                speaker_labels, speech_timestamps
    +            )
    +            for speech_timestamp in channel_speech_timestamps
    +        ]
    +        speech_diarization.sort()
    +        self._result = speech_diarization
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        task_class, task_kwargs = super().to_tuple()
    +        return task_class, {**task_kwargs, "speaker_labels": self._speaker_labels}
    +
    +
    +class TaskCreator:
    +    """
    +    A task creator to create different tasks to run after the VAD.
    +    """
    +
    +    #: A map from task class name to task class to use in `from_tuple`:
    +    _MAP = {
    +        BaseTask.__name__: BaseTask,
    +        SpeechDiarizationTask.__name__: SpeechDiarizationTask,
    +    }
    +
    +    def __init__(self, task_type: Type[BaseTask], task_kwargs: dict = None):
    +        """
    +        Initialize the task creator.
    +        :param task_type: The task type - a `BaseTask` subclass.
    +        :param task_kwargs: Additional keyword arguments to pass to the to be created tasks.
    +        """
    +        self._task_type = task_type
    +        self._task_kwargs = task_kwargs or {}
    +
    +    def create_task(self, audio_file: Path) -> BaseTask:
    +        """
    +        Create a task with the given audio file.
    +
    +        :param audio_file: The audio file to assign to the task.
    +
    +        :returns: The created task.
    +        """
    +        return self._task_type(audio_file=audio_file, **self._task_kwargs)
    +
    +    @classmethod
    +    def from_tuple(cls, task_tuple: Tuple[str, dict]) -> BaseTask:
    +        """
    +        Create a task from a tuple of the audio file name and the task kwargs.
    +
    +        :param task_tuple: The task tuple to create the task from.
    +
    +        :returns: The created task.
    +        """
    +        task_class, task_kwargs = task_tuple
    +        return cls._MAP[task_class](**task_kwargs)
    +
    +
    +class VoiceActivityDetector:
    +    """
    +    A voice activity detection wrapper for the silero VAD model - https://github.com/snakers4/silero-vad.
    +    """
    +
    +    def __init__(
    +        self,
    +        # Model loading kwargs:
    +        use_onnx: bool = True,
    +        force_onnx_cpu: bool = True,
    +        # Detection kwargs:
    +        threshold: float = 0.5,
    +        sampling_rate: int = 16_000,
    +        min_speech_duration_ms: int = 250,
    +        max_speech_duration_s: float = float("inf"),
    +        min_silence_duration_ms: int = 100,
    +        window_size_samples: int = 512,
    +        speech_pad_ms: int = 30,
    +        return_seconds: bool = False,
    +        per_channel: bool = False,
    +    ):
    +        """
    +        Initialize the voice activity detector.
    +
    +        :param use_onnx:                Whether to use ONNX for inference. Default is True.
    +        :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    +        :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
    +                                        probabilities ABOVE this value are considered as SPEECH. It is better to tune
    +                                        this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
    +                                        most datasets.
    +        :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    +        :param min_speech_duration_ms:  Final speech chunks shorter min_speech_duration_ms are thrown out.
    +        :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
    +                                        `max_speech_duration_s` will be split at the timestamp of the last silence that
    +                                        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise,
    +                                        they will be split aggressively just before max_speech_duration_s.
    +        :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before
    +                                        separating it.
    +        :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
    +                                        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
    +                                        sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
    +                                        these may affect model performance!
    +        :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    +        :param return_seconds:          Whether return timestamps in seconds. False means to return timestamps in
    +                                        samples (default - False).
    +        :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD
    +                                        on each channel separately and return a list of timestamps per channel.
    +        """
    +        # Store configurations:
    +        self._use_onnx = use_onnx
    +        self._force_onnx_cpu = force_onnx_cpu
    +        self._threshold = threshold
    +        self._sampling_rate = sampling_rate
    +        self._min_speech_duration_ms = min_speech_duration_ms
    +        self._max_speech_duration_s = max_speech_duration_s
    +        self._min_silence_duration_ms = min_silence_duration_ms
    +        self._window_size_samples = window_size_samples
    +        self._speech_pad_ms = speech_pad_ms
    +        self._return_seconds = return_seconds
    +        self._per_channel = per_channel
    +
    +        # Prepare the model variables
    +        self._model: torch.Module = None
    +        self._get_speech_timestamps: FunctionType = None
    +
    +    def load(self, force_reload: bool = True):
    +        """
    +        Load the VAD model.
    +
    +        :param force_reload: Whether to force reload the model even if it was already loaded. Default is True.
    +        """
    +        model, utils = torch.hub.load(
    +            repo_or_dir="snakers4/silero-vad",
    +            model="silero_vad",
    +            force_reload=force_reload,
    +            onnx=self._use_onnx,
    +            force_onnx_cpu=self._force_onnx_cpu,
    +        )
    +        self._model = model
    +        (
    +            self._get_speech_timestamps,
    +            _,  # save_audio,
    +            _,  # read_audio,
    +            _,  # VADIterator,
    +            _,  # collect_chunks
    +        ) = utils
    +
    +    def detect_voice(
    +        self,
    +        audio_file: Path,
    +    ) -> Union[List[Dict[str, int]], List[List[Dict[str, int]]]]:
    +        """
    +        Infer the audio through the VAD model and return the speech timestamps.
    +
    +        :param audio_file: The audio file to infer.
    +
    +        :returns: The speech timestamps in the audio. A list of timestamps where each timestamp is a dictionary with the
    +                 following keys:
    +
    +                 * "start": The start sample index of the speech in the audio.
    +                 * "end":   The end sample index of the speech in the audio.
    +
    +                 If `per_channel` is True, a list of timestamps per channel will be returned.
    +        """
    +        # Cast to a numpy array:
    +        audio = self._read_audio(audio_file)
    +
    +        # Detect speech:
    +        if not self._per_channel:
    +            return self._get_speech_timestamps(
    +                audio,
    +                self._model,
    +                threshold=self._threshold,
    +                min_speech_duration_ms=self._min_speech_duration_ms,
    +                max_speech_duration_s=self._max_speech_duration_s,
    +                min_silence_duration_ms=self._min_silence_duration_ms,
    +                speech_pad_ms=self._speech_pad_ms,
    +                sampling_rate=self._sampling_rate,
    +                window_size_samples=self._window_size_samples,
    +                return_seconds=self._return_seconds,
    +            )
    +
    +        # Per channel:
    +        speech_timestamps = []
    +        for channel in audio:
    +            speech_timestamps.append(
    +                self._get_speech_timestamps(
    +                    channel,
    +                    self._model,
    +                    threshold=self._threshold,
    +                    min_speech_duration_ms=self._min_speech_duration_ms,
    +                    max_speech_duration_s=self._max_speech_duration_s,
    +                    min_silence_duration_ms=self._min_silence_duration_ms,
    +                    speech_pad_ms=self._speech_pad_ms,
    +                    sampling_rate=self._sampling_rate,
    +                    window_size_samples=self._window_size_samples,
    +                    return_seconds=self._return_seconds,
    +                )
    +            )
    +
    +        return speech_timestamps
    +
    +    def _read_audio(
    +        self,
    +        path: Path,
    +    ) -> torch.Tensor:
    +        """
    +        Read the audio from the given path and return it as a tensor.
    +
    +        :param path: The path to the audio file.
    +
    +        :returns: The audio as a tensor.
    +        """
    +        # Read the audio:
    +        audio, sampling_rate = torchaudio.load(str(path))
    +
    +        # Check if the audio is stereo and if so, convert it to mono (only if not per channel):
    +        if audio.size(0) > 1 and not self._per_channel:
    +            audio = audio.mean(dim=0, keepdim=True)
    +
    +        # Resample the audio if needed:
    +        if sampling_rate != self._sampling_rate:
    +            transform = torchaudio.transforms.Resample(
    +                orig_freq=sampling_rate, new_freq=self._sampling_rate
    +            )
    +            audio = transform(audio)
    +
    +        # Return the audio (squeeze if not per channel):
    +        return audio if self._per_channel else audio.squeeze(0)
    +
    +
    +#: The value to send into multiprocessing queues to stop the process:
    +_MULTIPROCESSING_STOP_MARK = "STOP"
    +
    +
    +def _multiprocessing_complete_tasks(
    +    vad_init_kwargs: dict, tasks_queue: Queue, results_queue: Queue
    +):
    +    """
    +    Complete the tasks in the given queue and put the results in the given results queue. The function will stop when
    +    the given tasks queue will receive the stop mark. It is aimed to be used with multiprocessing as a process.
    +
    +    Each result is a tuple of `(is_error, payload)`: on success the payload is the result of the task and on failure it
    +    is a tuple of an identifier of the failed task and the error message. The stop mark is put in the results queue
    +    whenever the function exits - including when it exits because of an error - so the process collecting the results
    +    will not wait forever for this worker.
    +
    +    :param vad_init_kwargs: The VAD initialization kwargs.
    +    :param tasks_queue:     A queue to get the tasks from.
    +    :param results_queue:   A queue to put the results in.
    +    """
    +    try:
    +        # Initialize and load the VAD:
    +        vad = VoiceActivityDetector(**vad_init_kwargs)
    +        vad.load(force_reload=False)
    +
    +        # Start listening to the tasks queue:
    +        while True:
    +            # Get the task (kept as received, so it can still be reported if the task cannot be reconstructed):
    +            task_tuple: Tuple[str, dict] = tasks_queue.get()
    +            if task_tuple == _MULTIPROCESSING_STOP_MARK:
    +                break
    +            task = None
    +            try:
    +                # Create the task:
    +                task = TaskCreator.from_tuple(task_tuple=task_tuple)
    +                # Run the file through the VAD:
    +                speech_timestamps = vad.detect_voice(audio_file=task.audio_file)
    +                # Complete the task:
    +                task.do_task(speech_timestamps=speech_timestamps)
    +                # Build the result:
    +                result = (False, task.get_result())
    +            except Exception as exception:
    +                # Build the error. When the task itself could not be created there is no task object to read the
    +                # audio file name from, so the raw task tuple is reported instead:
    +                task_name = (
    +                    task.audio_file.name if task is not None else str(task_tuple)
    +                )
    +                result = (True, (task_name, str(exception)))
    +            # Collect the result / error:
    +            results_queue.put(result)
    +    finally:
    +        # Mark the end of the tasks (always, otherwise the collecting process keeps waiting for this worker's mark):
    +        results_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +
    +# Get the global logger. When MLRun is installed the logger of the "silero_vad" MLRun context is used, otherwise the
    +# function falls back to the root logger of the standard `logging` module:
    +try:
    +    import mlrun
    +
    +    _LOGGER = mlrun.get_or_create_ctx("silero_vad").logger
    +except ModuleNotFoundError:
    +    _LOGGER = logging.getLogger()
    +
    +
    +def detect_voice(
    +    # Input kwargs:
    +    data_path: Union[str, Path, List[Union[str, Path]]],
    +    # Model loading kwargs:
    +    use_onnx: bool = True,
    +    force_onnx_cpu: bool = True,
    +    # Detection kwargs:
    +    threshold: float = 0.5,
    +    sampling_rate: int = 16_000,
    +    min_speech_duration_ms: int = 250,
    +    max_speech_duration_s: float = float("inf"),
    +    min_silence_duration_ms: int = 100,
    +    window_size_samples: int = 512,
    +    speech_pad_ms: int = 30,
    +    return_seconds: bool = False,
    +    per_channel: bool = False,
    +    # Other kwargs:
    +    use_multiprocessing: int = 0,
    +    verbose: bool = False,
    +) -> Tuple[dict, dict]:
    +    """
    +    Perform voice activity detection on given audio files using the silero VAD model -
    +    https://github.com/snakers4/silero-vad. The end result is a tuple of two dictionaries: the successes, with the file
    +    names as keys and their VAD timestamps dictionaries as value, and the errors, with the names of the files that
    +    failed as keys and their error messages as value.
    +
    +    For example, the successes dictionary::
    +
    +        {
    +            "file_1.wav": [
    +                {"start": 0, "end": 16000},
    +                {"start": 16000, "end": 32000},
    +                {"start": 32000, "end": 48000},
    +                ...
    +            ],
    +            "file_2.wav": [
    +                {"start": 0, "end": 16000},
    +                {"start": 16000, "end": 32000},
    +                {"start": 32000, "end": 48000},
    +                ...
    +            ],
    +            ...
    +        }
    +
    +
    +    :param data_path:               The path to the audio files to detect voice in. Can be a path to a single file, a
    +                                    path to a directory or a list of paths to files.
    +    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    +    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    +    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
    +                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
    +                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
    +                                    most datasets.
    +    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    +    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
    +                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
    +                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
    +                                    be split aggressively just before max_speech_duration_s.
    +    :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating
    +                                    it.
    +    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
    +
    +                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
    +                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
    +                                    these may affect model performance!
    +    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    +    :param return_seconds:          Whether return timestamps in seconds. False means to return timestamps in samples
    +                                    (default - False).
    +    :param per_channel:             Whether to return timestamps per channel (default - False). This will run VAD on
    +                                    each channel separately and return a list of timestamps per channel.
    +    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
    +                                    be used. Default is 0.
    +    :param verbose:                 Verbosity.
    +
    +    :returns: A tuple of the successes dictionary (file name to VAD timestamps) and the errors dictionary (file name to
    +              error message).
    +    """
    +    # Get the input audio files to detect voice in:
    +    if verbose:
    +        _LOGGER.info("Collecting audio files.")
    +    audio_files = _get_audio_files(data_path=data_path)
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(audio_files)} audio files.")
    +
    +    # Gather the VAD initialization kwargs (the VAD itself is created by the runner, once per process):
    +    vad_init_kwargs = {
    +        "use_onnx": use_onnx,
    +        "force_onnx_cpu": force_onnx_cpu,
    +        "threshold": threshold,
    +        "sampling_rate": sampling_rate,
    +        "min_speech_duration_ms": min_speech_duration_ms,
    +        "max_speech_duration_s": max_speech_duration_s,
    +        "min_silence_duration_ms": min_silence_duration_ms,
    +        "window_size_samples": window_size_samples,
    +        "speech_pad_ms": speech_pad_ms,
    +        "return_seconds": return_seconds,
    +        "per_channel": per_channel,
    +    }
    +
    +    # Create the task creator:
    +    task_creator = TaskCreator(task_type=BaseTask)
    +
    +    # Run the voice activity detection:
    +    if use_multiprocessing:
    +        results = _parallel_run(
    +            n_workers=use_multiprocessing,
    +            audio_files=audio_files,
    +            description="Detecting voice",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +    else:
    +        results = _run(
    +            audio_files=audio_files,
    +            description="Detecting voice",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +
    +    # Process the results:
    +    return _process_results(results=results, verbose=verbose)
    +
    +
    +def diarize(
    +    # Input / Output kwargs:
    +    data_path: Union[str, Path, List[Union[str, Path]]],
    +    # Model loading kwargs:
    +    use_onnx: bool = True,
    +    force_onnx_cpu: bool = True,
    +    # Detection kwargs:
    +    threshold: float = 0.5,
    +    sampling_rate: int = 16_000,
    +    min_speech_duration_ms: int = 250,
    +    max_speech_duration_s: float = float("inf"),
    +    min_silence_duration_ms: int = 100,
    +    window_size_samples: int = 512,
    +    speech_pad_ms: int = 30,
    +    # Diarization kwargs:
    +    speaker_labels: List[str] = None,
    +    # Other kwargs:
    +    use_multiprocessing: int = 0,
    +    verbose: bool = False,
    +) -> Tuple[dict, dict]:
    +    """
    +    Perform speech diarization on given audio files using the silero VAD model - https://github.com/snakers4/silero-vad.
    +    The speech diarization is performed per channel so that each channel in the audio belong to a different speaker. The
    +    end result is a tuple of two dictionaries: the successes, with the file names as keys and their diarization as
    +    value, and the errors, with the names of the files that failed as keys and their error messages as value. A
    +    diarization is a list of tuples: (start, end, speaker_label).
    +
    +    For example, the successes dictionary::
    +
    +        {
    +            "file_1.wav": [
    +                (0.0, 1.0, "speaker_0"),
    +                (1.0, 2.0, "speaker_1"),
    +                (2.0, 3.0, "speaker_0"),
    +                ...
    +            ],
    +            "file_2.wav": [
    +                (0.0, 1.0, "speaker_0"),
    +                (1.0, 2.0, "speaker_1"),
    +                (2.0, 3.0, "speaker_0"),
    +                ...
    +            ],
    +            ...
    +        }
    +
    +
    +    :param data_path:               The path to the audio files to diarize. Can be a path to a single file, a path to a
    +                                    directory or a list of paths to files.
    +    :param use_onnx:                Whether to use ONNX for inference. Default is True.
    +    :param force_onnx_cpu:          Whether to force ONNX to use CPU for inference. Default is True.
    +    :param threshold:               Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
    +                                    probabilities ABOVE this value are considered as SPEECH. It is better to tune
    +                                    this parameter for each dataset separately, but "lazy" 0.5 is pretty good for
    +                                    most datasets.
    +    :param sampling_rate:           Currently, silero VAD models support 8000 and 16000 sample rates.
    +    :param min_speech_duration_ms:  Final speech chunks shorter than min_speech_duration_ms are thrown out.
    +    :param max_speech_duration_s:   Maximum duration of speech chunks in seconds. Chunks longer than
    +                                    `max_speech_duration_s` will be split at the timestamp of the last silence that
    +                                    lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will
    +                                    be split aggressively just before max_speech_duration_s.
    +    :param min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating
    +                                    it.
    +    :param window_size_samples:     Audio chunks of window_size_samples size are fed to the silero VAD model.
    +
    +                                    WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000
    +                                    sample rate and 256, 512, 768 samples for 8000 sample rate. Values other than
    +                                    these may affect model performance!
    +    :param speech_pad_ms:           Final speech chunks are padded by speech_pad_ms each side.
    +    :param speaker_labels:          The speaker labels to use for the diarization. If not given, the speakers will be
    +                                    named "speaker_0", "speaker_1", etc.
    +    :param use_multiprocessing:     The number of workers to use for multiprocessing. If 0, no multiprocessing will
    +                                    be used. Default is 0.
    +    :param verbose:                 Verbosity.
    +
    +    :returns: A tuple of the successes dictionary (file name to diarization) and the errors dictionary (file name to
    +              error message).
    +    """
    +    # Get the input audio files to diarize:
    +    if verbose:
    +        _LOGGER.info("Collecting audio files.")
    +    audio_files = _get_audio_files(data_path=data_path)
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(audio_files)} audio files.")
    +
    +    # Gather the VAD initialization kwargs (the VAD itself is created by the runner, once per process). The diarization
    +    # always asks for timestamps in seconds and per channel:
    +    vad_init_kwargs = {
    +        "use_onnx": use_onnx,
    +        "force_onnx_cpu": force_onnx_cpu,
    +        "threshold": threshold,
    +        "sampling_rate": sampling_rate,
    +        "min_speech_duration_ms": min_speech_duration_ms,
    +        "max_speech_duration_s": max_speech_duration_s,
    +        "min_silence_duration_ms": min_silence_duration_ms,
    +        "window_size_samples": window_size_samples,
    +        "speech_pad_ms": speech_pad_ms,
    +        "return_seconds": True,
    +        "per_channel": True,
    +    }
    +
    +    # Create the task creator:
    +    task_creator = TaskCreator(
    +        task_type=SpeechDiarizationTask, task_kwargs={"speaker_labels": speaker_labels}
    +    )
    +
    +    # Run the diarization:
    +    if use_multiprocessing:
    +        results = _parallel_run(
    +            n_workers=use_multiprocessing,
    +            audio_files=audio_files,
    +            description="Diarizing",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +    else:
    +        results = _run(
    +            audio_files=audio_files,
    +            description="Diarizing",
    +            vad_init_kwargs=vad_init_kwargs,
    +            task_creator=task_creator,
    +            verbose=verbose,
    +        )
    +
    +    # Process the results:
    +    return _process_results(results=results, verbose=verbose)
    +
    +
    +def _get_audio_files(
    +    data_path: Union[Path, str, list],
    +) -> List[Path]:
    +    """
    +    Get the audio files from the data path. If a path to a directory is given, the files directly inside it that match
    +    the "*.*" pattern (meaning their name contains a dot) will be collected, sorted by their path. Sub-directories are
    +    not collected and are not searched.
    +
    +    :param data_path: The data path to collect the audio files from. Can be a path to a file, a path to a directory or
    +                      a list of such paths.
    +
    +    :returns: The audio files list.
    +
    +    :raises ValueError: If a given path is neither an existing directory nor an existing file.
    +    """
    +    # Check if given a list of paths (each item is collected recursively and the results are flattened):
    +    if isinstance(data_path, list):
    +        return [
    +            audio_file
    +            for path in data_path
    +            for audio_file in _get_audio_files(data_path=path)
    +        ]
    +
    +    # Check if given a single string path to cast it to a `pathlib.Path`:
    +    if isinstance(data_path, str):
    +        data_path = Path(data_path).absolute()
    +
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get the files inside the directory. The pattern can also match a sub-directory with a dot in its name, so
    +        # only regular files are kept, and they are sorted because the order of `glob` depends on the file system:
    +        return sorted(path for path in data_path.glob("*.*") if path.is_file())
    +    if data_path.is_file():
    +        return [data_path]
    +    raise ValueError(
    +        f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
    +        f"file. Given: {str(data_path)} "
    +    )
    +
    +
    +def _run(
    +    audio_files: List[Path],
    +    description: str,
    +    vad_init_kwargs: dict,
    +    task_creator: TaskCreator,
    +    verbose: bool,
    +) -> List[Tuple[bool, Tuple[str, list]]]:
    +    """
    +    Complete, in the current process, the tasks the given task creator creates for the provided files, using a single
    +    VAD that is loaded once. A failure on a file does not stop the run - it is collected as an error result.
    +
    +    :param audio_files:     The audio files to use.
    +    :param description:     The description to use for the progress bar.
    +    :param vad_init_kwargs: The VAD initialization keyword arguments.
    +    :param task_creator:    The task creator to use to create the tasks.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results, one per audio file: a tuple of an error flag and the result of the task or, on
    +              error, a tuple of the file name and the error message.
    +    """
    +    # Initialize the detector and load its model:
    +    detector = VoiceActivityDetector(**vad_init_kwargs)
    +    if verbose:
    +        _LOGGER.info(f"Loading the VAD model.")
    +    detector.load()
    +    if verbose:
    +        _LOGGER.info("VAD model loaded.")
    +
    +    def _complete(file: Path) -> Tuple[bool, Tuple[str, list]]:
    +        # Create the task of the file, feed it with the detected speech timestamps and return its outcome:
    +        try:
    +            file_task = task_creator.create_task(audio_file=file)
    +            timestamps = detector.detect_voice(audio_file=file)
    +            file_task.do_task(speech_timestamps=timestamps)
    +            return False, file_task.get_result()
    +        except Exception as error:
    +            return True, (file.name, str(error))
    +
    +    # Go over the audio files with a progress bar (shown only when verbose) and collect the outcomes:
    +    progress = tqdm(
    +        audio_files,
    +        desc=description,
    +        unit="file",
    +        total=len(audio_files),
    +        disable=not verbose,
    +    )
    +    return [_complete(file) for file in progress]
    +
    +
    +def _parallel_run(
    +    n_workers: int,
    +    audio_files: List[Path],
    +    description: str,
    +    vad_init_kwargs: dict,
    +    task_creator: TaskCreator,
    +    verbose: bool,
    +) -> List[Tuple[bool, Tuple[str, list]]]:
    +    """
    +    Run multiple VAD workers with multiprocessing to complete the tasks that will be created on the provided files using
    +    the given task creator.
    +
    +    The tasks are sent to the workers through a tasks queue, followed by one stop mark per worker. Each worker puts its
    +    results in a shared results queue and then its own stop mark, so the collection ends once a stop mark was received
    +    from every worker.
    +
    +    :param n_workers:       The number of workers to use. Must be at least 1. If it is larger than the number of audio
    +                            files, it is reduced to the number of audio files.
    +    :param audio_files:     The audio files to use.
    +    :param description:     The description to use for the progress bar.
    +    :param vad_init_kwargs: The VAD initialization keyword arguments.
    +    :param task_creator:    The task creator to use to create the tasks.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results, in the order they were received from the workers.
    +
    +    :raises ValueError: If the number of workers is smaller than 1.
    +    """
    +    # Validate the number of workers (without workers no stop mark is ever sent back and the collection loop below
    +    # would wait forever):
    +    if n_workers < 1:
    +        raise ValueError(
    +            f"The number of workers must be at least 1. Given: {n_workers}"
    +        )
    +
    +    # Nothing to do without audio files (the workers would have been capped to 0, waiting forever as explained above):
    +    if not audio_files:
    +        return []
    +
    +    # Load the VAD (download once, and it will be loaded then per process later on):
    +    if verbose:
    +        _LOGGER.info("Loading the VAD model.")
    +    vad = VoiceActivityDetector(**vad_init_kwargs)
    +    vad.load()
    +    if verbose:
    +        _LOGGER.info("VAD model loaded.")
    +
    +    # Check the number of workers:
    +    if n_workers > len(audio_files):
    +        _LOGGER.warning(
    +            f"The number of workers ({n_workers}) is larger than the number of audio files ({len(audio_files)}). "
    +            f"Setting the number of workers to {len(audio_files)}."
    +        )
    +        n_workers = len(audio_files)
    +
    +    # Initialize the multiprocessing queues:
    +    tasks_queue = Queue()
    +    results_queue = Queue()
    +
    +    # Initialize the multiprocessing processes:
    +    task_completion_processes = [
    +        Process(
    +            target=_multiprocessing_complete_tasks,
    +            kwargs={
    +                "vad_init_kwargs": vad_init_kwargs,
    +                "tasks_queue": tasks_queue,
    +                "results_queue": results_queue,
    +            },
    +        )
    +        for _ in range(n_workers)
    +    ]
    +
    +    # Start the multiprocessing processes:
    +    for p in task_completion_processes:
    +        p.start()
    +
    +    # Put the tasks in the queue (as tuples, to be reconstructed into tasks by the workers):
    +    for audio_file in audio_files:
    +        tasks_queue.put(task_creator.create_task(audio_file=audio_file).to_tuple())
    +
    +    # Put the stop marks in the queue (one per worker, as each worker stops on the first mark it gets):
    +    for _ in range(n_workers):
    +        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +    # Collect the results:
    +    results = []
    +    stop_marks_counter = 0
    +    with tqdm(
    +        desc=description,
    +        unit="file",
    +        total=len(audio_files),
    +        disable=not verbose,
    +    ) as progressbar:
    +        while True:
    +            # Get a result from the queue:
    +            result: Tuple[bool, Tuple[str, list]] = results_queue.get()
    +            if result == _MULTIPROCESSING_STOP_MARK:
    +                # A worker is done, stop once all of them are:
    +                stop_marks_counter += 1
    +                if stop_marks_counter == n_workers:
    +                    break
    +            else:
    +                # Collect the result:
    +                results.append(result)
    +                progressbar.update(1)
    +
    +    # Wait for the processes to finish:
    +    for p in task_completion_processes:
    +        p.join()
    +
    +    return results
    +
    +
    +def _process_results(
    +    results: List[Tuple[bool, Tuple[str, list]]], verbose: bool
    +) -> Tuple[dict, dict]:
    +    """
    +    Split the collected results of the tasks into successes and errors.
    +
    +    :param results: The results to process. Each result is a tuple of an error flag and a pair whose first item is used
    +                    as the key and whose second item is used as the value.
    +    :param verbose: Verbosity.
    +
    +    :returns: The processed results as a tuple of successes and errors.
    +    """
    +    if verbose:
    +        _LOGGER.info("Summarizing the results.")
    +
    +    # Route each result into its dictionary by the error flag (a repeated key overrides the previous value):
    +    successes, errors = {}, {}
    +    for is_error, result in results:
    +        target = errors if is_error else successes
    +        target[result[0]] = result[1]
    +
    +    if verbose:
    +        _LOGGER.info(f"Done ({len(successes)}/{len(successes) + len(errors)})\n")
    +
    +    return successes, errors
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/silero_vad/latest/src/function.yaml b/functions/master/silero_vad/latest/src/function.yaml index 8ec121a6..fd637f1c 100644 --- a/functions/master/silero_vad/latest/src/function.yaml +++ b/functions/master/silero_vad/latest/src/function.yaml @@ -1,110 +1,104 @@ -kind: job metadata: - name: silero-vad tag: '' - hash: 59336f808643a74f3a2c5d506977387010427208 - project: '' - labels: - author: guyl categories: - deep-learning - - pytorch - audio + name: silero-vad +verbose: false spec: - command: '' - args: [] - image: '' + description: Silero VAD (Voice Activity Detection) functions. build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] code_origin: '' - origin_filename: '' + base_image: mlrun/mlrun requirements: - torch - torchaudio - tqdm - onnxruntime + functionSourceCode:  + origin_filename: '' + image: '' + command: '' entry_points: audio_file: - name: audio_file doc: Get the audio file of the task. - parameters: - - name: self + lineno: 43 + has_varargs: false outputs: - doc: The audio file of the task. type: Path - lineno: 43 - has_varargs: false + parameters: + - name: self has_kwargs: false + name: audio_file do_task: - name: do_task doc: Do the task on the given speech timestamps. The task will diarize the VAD speech timestamps into speakers. + lineno: 94 + has_varargs: false parameters: - name: self - name: speech_timestamps type: List[List[Dict[str, int]]] doc: The speech timestamps per channel to do the task on as outputted from the VAD. - outputs: [] - lineno: 94 - has_varargs: false has_kwargs: false + name: do_task get_result: - name: get_result doc: Get the result of the task. A tuple of the audio file name and the result. - parameters: - - name: self + lineno: 61 + has_varargs: false outputs: - doc: The result of the task. 
type: Tuple[str, list] - lineno: 61 - has_varargs: false + parameters: + - name: self has_kwargs: false + name: get_result to_tuple: - name: to_tuple doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). - parameters: - - name: self + lineno: 116 + has_varargs: false outputs: - doc: The converted task. type: Tuple[str, dict] - lineno: 116 - has_varargs: false + parameters: + - name: self has_kwargs: false + name: to_tuple create_task: - name: create_task doc: Create a task with the given audio file. + lineno: 146 + has_varargs: false + outputs: + - doc: The created task. + type: BaseTask parameters: - name: self - name: audio_file type: Path doc: The audio file to assign to the task. - outputs: - - doc: The created task. - type: BaseTask - lineno: 146 - has_varargs: false has_kwargs: false + name: create_task from_tuple: - name: from_tuple doc: Create a task from a tuple of the audio file name and the task kwargs. + lineno: 157 + has_varargs: false + outputs: + - doc: The created task. + type: BaseTask parameters: - name: cls - name: task_tuple type: Tuple[str, dict] doc: The task tuple to create the task from. - outputs: - - doc: The created task. - type: BaseTask - lineno: 157 - has_varargs: false has_kwargs: false + name: from_tuple load: - name: load doc: Load the VAD model. + lineno: 234 + has_varargs: false parameters: - name: self - name: force_reload @@ -112,12 +106,9 @@ spec: doc: Whether to force reload the model even if it was already loaded. Default is True. default: true - outputs: [] - lineno: 234 - has_varargs: false has_kwargs: false + name: load detect_voice: - name: detect_voice doc: "Perform voice activity detection on given audio files using the silero\ \ VAD model -\nhttps://github.com/snakers4/silero-vad. 
The end result is a\ \ dictionary with the file names as keys and their\nVAD timestamps dictionaries\ @@ -128,6 +119,8 @@ spec: : 16000},\n {\"start\": 16000, \"end\": 32000},\n {\"\ start\": 32000, \"end\": 48000},\n ...\n ],\n ...\n\ \ }" + lineno: 393 + has_varargs: false parameters: - name: data_path type: Union[str, Path, List[Union[str, Path]]] @@ -195,12 +188,9 @@ spec: type: bool doc: Verbosity. default: false - outputs: [] - lineno: 393 - has_varargs: false has_kwargs: false + name: detect_voice diarize: - name: diarize doc: "Perform speech diarization on given audio files using the silero VAD model\ \ - https://github.com/snakers4/silero-vad.\nThe speech diarization is performed\ \ per channel so that each channel in the audio belong to a different speaker.\ @@ -212,6 +202,8 @@ spec: : [\n (0.0, 1.0, \"speaker_0\"),\n (1.0, 2.0, \"speaker_1\"\ ),\n (2.0, 3.0, \"speaker_0\"),\n ...\n ],\n\ \ ...\n }" + lineno: 517 + has_varargs: false parameters: - name: data_path type: Union[str, Path, List[Union[str, Path]]] @@ -274,18 +266,8 @@ spec: type: bool doc: Verbosity. default: false - outputs: [] - lineno: 517 - has_varargs: false has_kwargs: false - description: Silero VAD (Voice Activity Detection) functions. - default_handler: detect_voice + name: diarize disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false + default_handler: detect_voice +kind: job diff --git a/functions/master/silero_vad/latest/src/item.yaml b/functions/master/silero_vad/latest/src/item.yaml index 9ce9a5d2..49adfcd9 100644 --- a/functions/master/silero_vad/latest/src/item.yaml +++ b/functions/master/silero_vad/latest/src/item.yaml @@ -1,7 +1,6 @@ apiVersion: v1 categories: - deep-learning -- pytorch - audio description: Silero VAD (Voice Activity Detection) functions. 
doc: '' @@ -13,7 +12,7 @@ labels: author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: silero_vad platformVersion: 3.5.3 spec: @@ -27,4 +26,4 @@ spec: - tqdm - onnxruntime url: '' -version: 1.3.0 +version: 1.4.0 diff --git a/functions/master/silero_vad/latest/static/documentation.html b/functions/master/silero_vad/latest/static/documentation.html index a24baa8e..344c2421 100644 --- a/functions/master/silero_vad/latest/static/documentation.html +++ b/functions/master/silero_vad/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/silero_vad/latest/static/example.html b/functions/master/silero_vad/latest/static/example.html index bb7366a9..0eb7542f 100644 --- a/functions/master/silero_vad/latest/static/example.html +++ b/functions/master/silero_vad/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/silero_vad/latest/static/function.html b/functions/master/silero_vad/latest/static/function.html index 051e0cf1..839ab544 100644 --- a/functions/master/silero_vad/latest/static/function.html +++ b/functions/master/silero_vad/latest/static/function.html @@ -28,113 +28,107 @@
             
    -kind: job
     metadata:
    -  name: silero-vad
       tag: ''
    -  hash: 59336f808643a74f3a2c5d506977387010427208
    -  project: ''
    -  labels:
    -    author: guyl
       categories:
       - deep-learning
    -  - pytorch
       - audio
    +  name: silero-vad
    +verbose: false
     spec:
    -  command: ''
    -  args: []
    -  image: ''
    +  description: Silero VAD (Voice Activity Detection) functions.
       build:
    -    functionSourceCode: 
    -    base_image: mlrun/mlrun
    -    commands: []
         code_origin: ''
    -    origin_filename: ''
    +    base_image: mlrun/mlrun
         requirements:
         - torch
         - torchaudio
         - tqdm
         - onnxruntime
    +    functionSourceCode: 
    +    origin_filename: ''
    +  image: ''
    +  command: ''
       entry_points:
         audio_file:
    -      name: audio_file
           doc: Get the audio file of the task.
    -      parameters:
    -      - name: self
    +      lineno: 43
    +      has_varargs: false
           outputs:
           - doc: The audio file of the task.
             type: Path
    -      lineno: 43
    -      has_varargs: false
    +      parameters:
    +      - name: self
           has_kwargs: false
    +      name: audio_file
         do_task:
    -      name: do_task
           doc: Do the task on the given speech timestamps. The task will diarize the VAD
             speech timestamps into speakers.
    +      lineno: 94
    +      has_varargs: false
           parameters:
           - name: self
           - name: speech_timestamps
             type: List[List[Dict[str, int]]]
             doc: The speech timestamps per channel to do the task on as outputted from
               the VAD.
    -      outputs: []
    -      lineno: 94
    -      has_varargs: false
           has_kwargs: false
    +      name: do_task
         get_result:
    -      name: get_result
           doc: Get the result of the task. A tuple of the audio file name and the result.
    -      parameters:
    -      - name: self
    +      lineno: 61
    +      has_varargs: false
           outputs:
           - doc: The result of the task.
             type: Tuple[str, list]
    -      lineno: 61
    -      has_varargs: false
    +      parameters:
    +      - name: self
           has_kwargs: false
    +      name: get_result
         to_tuple:
    -      name: to_tuple
           doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing
             to pass in queue).
    -      parameters:
    -      - name: self
    +      lineno: 116
    +      has_varargs: false
           outputs:
           - doc: The converted task.
             type: Tuple[str, dict]
    -      lineno: 116
    -      has_varargs: false
    +      parameters:
    +      - name: self
           has_kwargs: false
    +      name: to_tuple
         create_task:
    -      name: create_task
           doc: Create a task with the given audio file.
    +      lineno: 146
    +      has_varargs: false
    +      outputs:
    +      - doc: The created task.
    +        type: BaseTask
           parameters:
           - name: self
           - name: audio_file
             type: Path
             doc: The audio file to assign to the task.
    -      outputs:
    -      - doc: The created task.
    -        type: BaseTask
    -      lineno: 146
    -      has_varargs: false
           has_kwargs: false
    +      name: create_task
         from_tuple:
    -      name: from_tuple
           doc: Create a task from a tuple of the audio file name and the task kwargs.
    +      lineno: 157
    +      has_varargs: false
    +      outputs:
    +      - doc: The created task.
    +        type: BaseTask
           parameters:
           - name: cls
           - name: task_tuple
             type: Tuple[str, dict]
             doc: The task tuple to create the task from.
    -      outputs:
    -      - doc: The created task.
    -        type: BaseTask
    -      lineno: 157
    -      has_varargs: false
           has_kwargs: false
    +      name: from_tuple
         load:
    -      name: load
           doc: Load the VAD model.
    +      lineno: 234
    +      has_varargs: false
           parameters:
           - name: self
           - name: force_reload
    @@ -142,12 +136,9 @@
             doc: Whether to force reload the model even if it was already loaded. Default
               is True.
             default: true
    -      outputs: []
    -      lineno: 234
    -      has_varargs: false
           has_kwargs: false
    +      name: load
         detect_voice:
    -      name: detect_voice
           doc: "Perform voice activity detection on given audio files using the silero\
             \ VAD model -\nhttps://github.com/snakers4/silero-vad. The end result is a\
             \ dictionary with the file names as keys and their\nVAD timestamps dictionaries\
    @@ -158,6 +149,8 @@
             : 16000},\n            {\"start\": 16000, \"end\": 32000},\n            {\"\
             start\": 32000, \"end\": 48000},\n            ...\n        ],\n        ...\n\
             \    }"
    +      lineno: 393
    +      has_varargs: false
           parameters:
           - name: data_path
             type: Union[str, Path, List[Union[str, Path]]]
    @@ -225,12 +218,9 @@
             type: bool
             doc: Verbosity.
             default: false
    -      outputs: []
    -      lineno: 393
    -      has_varargs: false
           has_kwargs: false
    +      name: detect_voice
         diarize:
    -      name: diarize
           doc: "Perform speech diarization on given audio files using the silero VAD model\
             \ - https://github.com/snakers4/silero-vad.\nThe speech diarization is performed\
             \ per channel so that each channel in the audio belong to a different speaker.\
    @@ -242,6 +232,8 @@
             : [\n            (0.0, 1.0, \"speaker_0\"),\n            (1.0, 2.0, \"speaker_1\"\
             ),\n            (2.0, 3.0, \"speaker_0\"),\n            ...\n        ],\n\
             \        ...\n    }"
    +      lineno: 517
    +      has_varargs: false
           parameters:
           - name: data_path
             type: Union[str, Path, List[Union[str, Path]]]
    @@ -304,21 +296,11 @@
             type: bool
             doc: Verbosity.
             default: false
    -      outputs: []
    -      lineno: 517
    -      has_varargs: false
           has_kwargs: false
    -  description: Silero VAD (Voice Activity Detection) functions.
    -  default_handler: detect_voice
    +      name: diarize
       disable_auto_mount: false
    -  clone_target_dir: ''
    -  env: []
    -  priority_class_name: ''
    -  preemption_mode: prevent
    -  affinity: null
    -  tolerations: null
    -  security_context: {}
    -verbose: false
    +  default_handler: detect_voice
    +kind: job
     
             
         
    diff --git a/functions/master/silero_vad/latest/static/item.html b/functions/master/silero_vad/latest/static/item.html index a5696cbf..9e297535 100644 --- a/functions/master/silero_vad/latest/static/item.html +++ b/functions/master/silero_vad/latest/static/item.html @@ -31,7 +31,6 @@ apiVersion: v1 categories: - deep-learning -- pytorch - audio description: Silero VAD (Voice Activity Detection) functions. doc: '' @@ -43,7 +42,7 @@ author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.2 +mlrunVersion: 1.7.0 name: silero_vad platformVersion: 3.5.3 spec: @@ -57,7 +56,7 @@ - tqdm - onnxruntime url: '' -version: 1.3.0 +version: 1.4.0
    diff --git a/functions/master/silero_vad/latest/static/silero_vad.html b/functions/master/silero_vad/latest/static/silero_vad.html index 502167b2..feae13a6 100644 --- a/functions/master/silero_vad/latest/static/silero_vad.html +++ b/functions/master/silero_vad/latest/static/silero_vad.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier/1.1.1/static/documentation.html b/functions/master/sklearn_classifier/1.1.1/static/documentation.html index 48056cda..1f3427a2 100644 --- a/functions/master/sklearn_classifier/1.1.1/static/documentation.html +++ b/functions/master/sklearn_classifier/1.1.1/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier/1.1.1/static/example.html b/functions/master/sklearn_classifier/1.1.1/static/example.html index 9c4249a6..05d293d8 100644 --- a/functions/master/sklearn_classifier/1.1.1/static/example.html +++ b/functions/master/sklearn_classifier/1.1.1/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier/1.1.1/static/sklearn_classifier.html b/functions/master/sklearn_classifier/1.1.1/static/sklearn_classifier.html index db33df83..e229f241 100644 --- a/functions/master/sklearn_classifier/1.1.1/static/sklearn_classifier.html +++ b/functions/master/sklearn_classifier/1.1.1/static/sklearn_classifier.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier/latest/static/documentation.html b/functions/master/sklearn_classifier/latest/static/documentation.html index 48056cda..1f3427a2 100644 --- a/functions/master/sklearn_classifier/latest/static/documentation.html +++ b/functions/master/sklearn_classifier/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier/latest/static/example.html b/functions/master/sklearn_classifier/latest/static/example.html index 9c4249a6..05d293d8 100644 --- a/functions/master/sklearn_classifier/latest/static/example.html +++ 
b/functions/master/sklearn_classifier/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier/latest/static/sklearn_classifier.html b/functions/master/sklearn_classifier/latest/static/sklearn_classifier.html index db33df83..e229f241 100644 --- a/functions/master/sklearn_classifier/latest/static/sklearn_classifier.html +++ b/functions/master/sklearn_classifier/latest/static/sklearn_classifier.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier_dask/1.1.1/static/documentation.html b/functions/master/sklearn_classifier_dask/1.1.1/static/documentation.html index d467f89f..38aa65c5 100644 --- a/functions/master/sklearn_classifier_dask/1.1.1/static/documentation.html +++ b/functions/master/sklearn_classifier_dask/1.1.1/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier_dask/1.1.1/static/example.html b/functions/master/sklearn_classifier_dask/1.1.1/static/example.html index 5fcdfa90..c37419fa 100644 --- a/functions/master/sklearn_classifier_dask/1.1.1/static/example.html +++ b/functions/master/sklearn_classifier_dask/1.1.1/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier_dask/1.1.1/static/sklearn_classifier_dask.html b/functions/master/sklearn_classifier_dask/1.1.1/static/sklearn_classifier_dask.html index 4a133c6c..9d03ccb2 100644 --- a/functions/master/sklearn_classifier_dask/1.1.1/static/sklearn_classifier_dask.html +++ b/functions/master/sklearn_classifier_dask/1.1.1/static/sklearn_classifier_dask.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier_dask/latest/static/documentation.html b/functions/master/sklearn_classifier_dask/latest/static/documentation.html index d467f89f..38aa65c5 100644 --- a/functions/master/sklearn_classifier_dask/latest/static/documentation.html +++ b/functions/master/sklearn_classifier_dask/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git 
a/functions/master/sklearn_classifier_dask/latest/static/example.html b/functions/master/sklearn_classifier_dask/latest/static/example.html index 5fcdfa90..c37419fa 100644 --- a/functions/master/sklearn_classifier_dask/latest/static/example.html +++ b/functions/master/sklearn_classifier_dask/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/sklearn_classifier_dask/latest/static/sklearn_classifier_dask.html b/functions/master/sklearn_classifier_dask/latest/static/sklearn_classifier_dask.html index 4a133c6c..9d03ccb2 100644 --- a/functions/master/sklearn_classifier_dask/latest/static/sklearn_classifier_dask.html +++ b/functions/master/sklearn_classifier_dask/latest/static/sklearn_classifier_dask.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/structured_data_generator/1.6.0/src/function.yaml b/functions/master/structured_data_generator/1.6.0/src/function.yaml new file mode 100644 index 00000000..4e8a3562 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/src/function.yaml @@ -0,0 +1,56 @@ +spec: + build: + origin_filename: '' + requirements: + - langchain + - tqdm + code_origin: '' + functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIElmIHRoZSBrZXkgaXMgbm90IGluIHRo
ZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZmllbGRzOgogICAgICAgICMgU3BsaXQg
dGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB3ZSBnZW5lcmF0ZSBhIGNodW5rIG9m
IGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo= + base_image: mlrun/mlrun + entry_points: + generate_data: + has_varargs: false + name: generate_data + has_kwargs: false + doc: 'Structured data of elements according to the given parameters. + + The data can be later logged as a structured file with MLRun''s `returns` + parameter.' + parameters: + - name: fields + type: list + doc: A list of fields to randomly generate. 
+ - name: amount + type: int + doc: The number of variants to generate. + default: 10 + - name: model_name + type: str + doc: 'The name of the model to use for conversation generation. You should + choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. + Default: ''gpt-3.5-turbo''.' + default: gpt-3.5-turbo + - name: language + type: str + doc: The language to use for the generated conversation text. + default: en + - name: chunk_size + type: int + doc: Number of samples generated at each GPT query. + default: 50 + outputs: + - type: list + lineno: 59 + command: '' + description: GenAI approach of generating structured data according to a given schema + default_handler: generate_data + disable_auto_mount: false + image: '' +metadata: + name: structured-data-generator + tag: '' + categories: + - data-generation + - genai +verbose: false +kind: job diff --git a/functions/master/structured_data_generator/1.6.0/src/item.yaml b/functions/master/structured_data_generator/1.6.0/src/item.yaml new file mode 100755 index 00000000..6e01aefb --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/src/item.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +categories: +- data-generation +- genai +description: GenAI approach of generating structured data according to a given schema +doc: '' +example: structured_data_generator.ipynb +generationDate: 2023-12-14:10-50 +hidden: false +icon: '' +labels: + author: zeevr +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.8.0 +name: structured_data_generator +platformVersion: 3.5.5 +spec: + filename: structured_data_generator.py + handler: generate_data + image: mlrun/mlrun + kind: job + requirements: + - langchain + - tqdm +url: '' +version: 1.6.0 diff --git a/functions/master/structured_data_generator/1.6.0/src/structured_data_generator.ipynb b/functions/master/structured_data_generator/1.6.0/src/structured_data_generator.ipynb new file mode 100644 index 00000000..12f87cf0 --- /dev/null +++ 
b/functions/master/structured_data_generator/1.6.0/src/structured_data_generator.ipynb @@ -0,0 +1,137 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9f7d79e7-8199-4680-919f-5039e8d7a0fe", + "metadata": {}, + "source": [ + "# structured_data_generator example" + ] + }, + { + "cell_type": "markdown", + "id": "4df1c846-2391-49a4-b65f-e7cff69dcdd9", + "metadata": {}, + "source": [ + "Introducing our innovative hub function, structured_data_generator, designed to streamline the process of creating structured files based on a list of fields.
    \n", + "This powerful function takes user-provided fields as input and dynamically generates relevant data, crafting a comprehensive structured file that aligns with the specified themes.
    \n", + "Whether you're working on content creation, testing scenarios, or simply need diverse data for development purposes, structured_data_generator is your go-to tool.
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3913a3b7-48c1-4b5a-8a28-8f2c93fc05d1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "010c16e7-9d0a-42b1-9f09-141b72048885", + "metadata": {}, + "outputs": [], + "source": [ + "# OpenAI tokens:\n", + "OPENAI_API_KEY = \"\"\n", + "OPENAI_API_BASE = \"\"\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n", + "os.environ[\"OPENAI_API_BASE\"] = OPENAI_API_BASE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "324f2120-bcd9-4b61-a418-9c810709b6cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Create mlrun project\n", + "project = mlrun.get_or_create_project(\"structured-data-generator-test\")\n", + "\n", + "# Import the function from the yaml file, once it's in the hub we can import from there \n", + "data_generation = project.set_function(func=\"./structured_data_generator.py\", name=\"structured_data_generator\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "999739d0-c8bf-48c3-8f57-b3c9ffec1a7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the imported function with desired file/s and params\n", + "data_generation_run = data_generation.run(\n", + " handler=\"generate_data\",\n", + " params={\n", + " \"amount\": 5,\n", + " \"model_name\": \"gpt-4\",\n", + " \"language\": \"en\",\n", + " \"fields\": [\"first name\", \"last_name\", \"phone_number: at least 9 digits long\", \"email\", \"client_id: at least 8 digits long, only numbers\"],\n", + " },\n", + " returns=[\n", + " \"clients: file\",\n", + " ],\n", + " local=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dde97e2b-8570-4df4-84aa-04c341f455c9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d70ceee-d17b-4901-9e8c-c9eda72f4e57", + "metadata": 
{}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3ea3341-80cc-4c87-a914-f2f3ffa1d491", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24983bf4-9fb0-4ebd-97cb-20e87859c22a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/structured_data_generator/1.6.0/src/structured_data_generator.py b/functions/master/structured_data_generator/1.6.0/src/structured_data_generator.py new file mode 100644 index 00000000..34fa36d4 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/src/structured_data_generator.py @@ -0,0 +1,142 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import ast +import os + +import tqdm +from langchain.chat_models import ChatOpenAI + + +def _set_openai_secrets() -> bool: + key = "OPENAI_API_KEY" + base = "OPENAI_API_BASE" + # Check if the key is already in the environment variables: + if key in os.environ and base in os.environ: + return True + # Check if mlrun is installed: + try: + import mlrun + except ModuleNotFoundError: + raise EnvironmentError( + f"One or more of the OpenAI required environment variables ('{key}', '{base}') are missing." + f"Please set them as environment variables or install mlrun (`pip install mlrun`)" + f"and set them as project secrets using `projecy.set_secrets`." + ) + + # Check if the key is in the secrets: + context = mlrun.get_or_create_ctx(name="context") + openai_key = context.get_secret(key) + openai_base = context.get_secret(base) + + # If the key is not in the secrets, return False: + if not openai_key: + raise EnvironmentError( + f"Could not find OpenAI API key in the environment variables or secrets," + f" please set it as: {key}." + ) + if not openai_base: + raise EnvironmentError( + f"Could not find OpenAI API base in the environment variables or secrets," + f" please set it as: {base}." + ) + # If the key is in the secrets, set it in the environment variables and return True: + os.environ[key] = openai_key + os.environ[base] = openai_base + return True + + +def generate_data( + fields: list, + amount: int = 10, + model_name: str = "gpt-3.5-turbo", + language: str = "en", + chunk_size: int = 50, +) -> list: + """ + Structured data of elements according to the given parameters. + The data can be later logged as a structured file with MLRun's `returns` parameter. + + :param fields: A list of fields to randomly generate. + :param amount: The number of variants to generate. + :param model_name: The name of the model to use for conversation generation. + You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. 
+ Default: 'gpt-3.5-turbo'. + :param language: The language to use for the generated conversation text. + :param chunk_size: Number of samples generated at each GPT query. + """ + instructions = "" + for field in fields: + # Split the field to key and instruction: + if ":" in field: + key, instruction = field.split(":", 1) + else: + key, instruction = field, "no special instruction" + # Replace spaces with underscores for the key to be used as a json key: + key = key.strip().replace(" ", "_") + instructions += f"* {key}: {instruction}\n" + + # Create the prompt structure: + prompt_structure = ( + f"generate the following values {amount} times randomly, in an order that creates a json table.\n" + f"Use the following keys and instructions (example: 'key: instruction or no special instruction'): " + f"{instructions}.\n" + f"Please generate the values in {language} language. \n" + f"Make sure the names of the keys are the same as the given field name.\n" + f"Please return only the json format without any introduction and ending" + ) + + # Set the OpenAI secrets: + _set_openai_secrets() + + # Load the OpenAI model using langchain: + llm = ChatOpenAI(model=model_name) + + # Start generating data: + data = [] + for _ in tqdm.tqdm(range((amount // chunk_size) + 1), desc="Generating"): + # We try to generate the data 3 times, if we fail we raise an error: + for tryout in range(3): + # If the amount wanted is bigger than the chunk size, we generate a chunk of data in the size of the chunk + # and decrease the amount by the chunk size. 
+ # otherwise we generate a chunk of data in the size of the amount: + if amount > chunk_size: + current_chunk_size = chunk_size + amount -= chunk_size + else: + current_chunk_size = amount + + # Create the prompt: + prompt = prompt_structure.format( + amount=current_chunk_size, + ) + + # Generate a chunk of data: + chunk_data = llm.predict(text=prompt) + + # Validate the response for correct python `list` structure + chunk_data = chunk_data[chunk_data.find("[") : chunk_data.rfind("]") + 1] + if chunk_data.count("[") != chunk_data.count("]"): + print( + "Failed to get proper json format from model, number of '[' doesn't match number of ']'." + ) + continue + chunk_data = ast.literal_eval(chunk_data) + data += chunk_data + break + if tryout == 3: + raise RuntimeError( + f"Could not generate a proper json format for the given fields, using given model: {model_name}." + f" Hint: Gpt-4 works best for most scenarios." + ) + return data diff --git a/functions/master/structured_data_generator/1.6.0/src/test_structured_data_generator.py b/functions/master/structured_data_generator/1.6.0/src/test_structured_data_generator.py new file mode 100644 index 00000000..3a7a7aa5 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/src/test_structured_data_generator.py @@ -0,0 +1,37 @@ +import os +import mlrun +import pytest + + +@pytest.mark.skipif("OPENAI_API_KEY" not in os.environ, reason="no token") +def test_structured_data_generator(): + # Create mlrun project + project = mlrun.get_or_create_project("structured-data-generator-test") + + #Set secrets + # project.set_secrets({"OPENAI_API_KEY": "", "OPENAI_API_BASE": ""}) + + # Import the function from the yaml file, once it's in the hub we can import from there + data_generation = project.set_function(func="structured_data_generator.py", name="structured_data_generator") + + # Run the imported function with desired file/s and params + data_generation_run = data_generation.run( + handler="generate_data", + params={ 
+ "amount": 3, + "model_name": "gpt-4", + "language": "en", + "fields": [ + "first name", + "last_name", + "phone_number: at least 9 digits long", + "email", + "client_id: at least 8 digits long, only numbers" + ], + }, + returns=[ + "clients: file", + ], + local=True, + ) + assert data_generation_run.outputs["clients"] \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.6.0/static/documentation.html b/functions/master/structured_data_generator/1.6.0/static/documentation.html new file mode 100644 index 00000000..060d59d9 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/static/documentation.html @@ -0,0 +1,255 @@ + + + + + + + +structured_data_generator package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +

    structured_data_generator package

    + +
    + +
    +
    + +
    +
    +

    structured_data_generator package#

    +
    +

    Submodules#

    +
    +
    +

    structured_data_generator.structured_data_generator module#

    +
    +
    +structured_data_generator.structured_data_generator.generate_data(fields: list, amount: int = 10, model_name: str = 'gpt-3.5-turbo', language: str = 'en', chunk_size: int = 50) list[source]#
    +

    Structured data of elements according to the given parameters. +The data can be later logged as a structured file with MLRun’s returns parameter.

    +
    +
    Parameters:
    +
      +
    • fields – A list of fields to randomly generate.

    • +
    • amount – The number of variants to generate.

    • +
    • model_name – The name of the model to use for conversation generation. +You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. +Default: ‘gpt-3.5-turbo’.

    • +
    • language – The language to use for the generated conversation text.

    • +
    • chunk_size – Number of samples generated at each GPT query.

    • +
    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.6.0/static/example.html b/functions/master/structured_data_generator/1.6.0/static/example.html new file mode 100644 index 00000000..1573d754 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/static/example.html @@ -0,0 +1,248 @@ + + + + + + + +structured_data_generator example + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    +
    +
    +
    +
    +

    structured_data_generator example

    + +
    +
    +
    +
    +
    + +
    +
    +

    structured_data_generator example#

    +

    Introducing our innovative hub function, structured_data_generator, designed to streamline the process of creating structured files based on a list of fields.
    +This powerful function takes user-provided fields as input and dynamically generates relevant data, crafting a comprehensive structured file that aligns with the specified themes.
    +Whether you’re working on content creation, testing scenarios, or simply need diverse data for development purposes, structured_data_generator is your go-to tool.

    +
    +
    +
    import os
    +import mlrun
    +
    +
    +
    +
    +
    +
    +
    # OpenAI tokens:
    +OPENAI_API_KEY = ""
    +OPENAI_API_BASE = ""
    +os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    +os.environ["OPENAI_API_BASE"] = OPENAI_API_BASE
    +
    +
    +
    +
    +
    +
    +
    # Create mlrun project
    +project = mlrun.get_or_create_project("structured-data-generator-test")
    +
    +# Import the function from the yaml file, once it's in the hub we can import from there 
    +data_generation = project.set_function(func="./structured_data_generator.py", name="structured_data_generator")
    +
    +
    +
    +
    +
    +
    +
    # Run the imported function with desired file/s and params
    +data_generation_run = data_generation.run(
    +    handler="generate_data",
    +            params={
    +                "amount": 5,
    +                "model_name": "gpt-4",
    +                "language": "en",
    +                "fields": ["first name", "last_name", "phone_number: at least 9 digits long", "email", "client_id: at least 8 digits long, only numbers"],
    +            },
    +            returns=[
    +                "clients: file",
    +            ],
    +    local=True,
    +)
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.6.0/static/function.html b/functions/master/structured_data_generator/1.6.0/static/function.html new file mode 100644 index 00000000..11fdbe2f --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/static/function.html @@ -0,0 +1,91 @@ + + + + + + + + + + + Source + + + + +
    +        
    +spec:
    +  build:
    +    origin_filename: ''
    +    requirements:
    +    - langchain
    +    - tqdm
    +    code_origin: ''
    +    functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIEl
mIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZml
lbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB
3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo=
    +    base_image: mlrun/mlrun
    +  entry_points:
    +    generate_data:
    +      has_varargs: false
    +      name: generate_data
    +      has_kwargs: false
    +      doc: 'Structured data of elements according to the given parameters.
    +
    +        The data can be later logged as a structured file with MLRun''s `returns`
    +        parameter.'
    +      parameters:
    +      - name: fields
    +        type: list
    +        doc: A list of fields to randomly generate.
    +      - name: amount
    +        type: int
    +        doc: The number of variants to generate.
    +        default: 10
    +      - name: model_name
    +        type: str
    +        doc: 'The name of the model to use for conversation generation. You should
    +          choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models.
    +          Default: ''gpt-3.5-turbo''.'
    +        default: gpt-3.5-turbo
    +      - name: language
    +        type: str
    +        doc: The language to use for the generated conversation text.
    +        default: en
    +      - name: chunk_size
    +        type: int
    +        doc: Number of samples generated at each GPT query.
    +        default: 50
    +      outputs:
    +      - type: list
    +      lineno: 59
    +  command: ''
    +  description: GenAI approach of generating structured data according to a given schema
    +  default_handler: generate_data
    +  disable_auto_mount: false
    +  image: ''
    +metadata:
    +  name: structured-data-generator
    +  tag: ''
    +  categories:
    +  - data-generation
    +  - genai
    +verbose: false
    +kind: job
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.6.0/static/item.html b/functions/master/structured_data_generator/1.6.0/static/item.html new file mode 100644 index 00000000..90c770e7 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/static/item.html @@ -0,0 +1,62 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- data-generation
    +- genai
    +description: GenAI approach of generating structured data according to a given schema
    +doc: ''
    +example: structured_data_generator.ipynb
    +generationDate: 2023-12-14:10-50
    +hidden: false
    +icon: ''
    +labels:
    +  author: zeevr
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.8.0
    +name: structured_data_generator
    +platformVersion: 3.5.5
    +spec:
    +  filename: structured_data_generator.py
    +  handler: generate_data
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +  - langchain
    +  - tqdm
    +url: ''
    +version: 1.6.0
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.6.0/static/source.html b/functions/master/structured_data_generator/1.6.0/static/source.html new file mode 100644 index 00000000..334b6e1e --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/static/source.html @@ -0,0 +1,177 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import ast
    +import os
    +
    +import tqdm
    +from langchain.chat_models import ChatOpenAI
    +
    +
    +def _set_openai_secrets() -> bool:
    +    key = "OPENAI_API_KEY"
    +    base = "OPENAI_API_BASE"
    +    # Check if the key is already in the environment variables:
    +    if key in os.environ and base in os.environ:
    +        return True
    +    # Check if mlrun is installed:
    +    try:
    +        import mlrun
    +    except ModuleNotFoundError:
    +        raise EnvironmentError(
    +            f"One or more of the OpenAI required environment variables ('{key}', '{base}') are missing."
    +            f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
    +            f"and set them as project secrets using `projecy.set_secrets`."
    +        )
    +
    +    # Check if the key is in the secrets:
    +    context = mlrun.get_or_create_ctx(name="context")
    +    openai_key = context.get_secret(key)
    +    openai_base = context.get_secret(base)
    +
    +    # If the key is not in the secrets, return False:
    +    if not openai_key:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API key in the environment variables or secrets,"
    +            f" please set it as: {key}."
    +        )
    +    if not openai_base:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API base in the environment variables or secrets,"
    +            f" please set it as: {base}."
    +        )
    +    # If the key is in the secrets, set it in the environment variables and return True:
    +    os.environ[key] = openai_key
    +    os.environ[base] = openai_base
    +    return True
    +
    +
    +def generate_data(
    +    fields: list,
    +    amount: int = 10,
    +    model_name: str = "gpt-3.5-turbo",
    +    language: str = "en",
    +    chunk_size: int = 50,
    +) -> list:
    +    """
    +    Structured data of elements according to the given parameters.
    +    The data can be later logged as a structured file with MLRun's `returns` parameter.
    +
    +    :param fields: A list of fields to randomly generate.
    +    :param amount: The number of variants to generate.
    +    :param model_name: The name of the model to use for conversation generation.
    +                       You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models.
    +                       Default: 'gpt-3.5-turbo'.
    +    :param language: The language to use for the generated conversation text.
    +    :param chunk_size: Number of samples generated at each GPT query.
    +    """
    +    instructions = ""
    +    for field in fields:
    +        # Split the field to key and instruction:
    +        if ":" in field:
    +            key, instruction = field.split(":", 1)
    +        else:
    +            key, instruction = field, "no special instruction"
    +        # Replace spaces with underscores for the key to be used as a json key:
    +        key = key.strip().replace(" ", "_")
    +        instructions += f"* {key}: {instruction}\n"
    +
    +    # Create the prompt structure:
    +    prompt_structure = (
    +        f"generate the following values {amount} times randomly, in an order that creates a json table.\n"
    +        f"Use the following keys and instructions (example: 'key: instruction or no special instruction'): "
    +        f"{instructions}.\n"
    +        f"Please generate the values in {language} language. \n"
    +        f"Make sure the names of the keys are the same as the given field name.\n"
    +        f"Please return only the json format without any introduction and ending"
    +    )
    +
    +    # Set the OpenAI secrets:
    +    _set_openai_secrets()
    +
    +    # Load the OpenAI model using langchain:
    +    llm = ChatOpenAI(model=model_name)
    +
    +    # Start generating data:
    +    data = []
    +    for _ in tqdm.tqdm(range((amount // chunk_size) + 1), desc="Generating"):
    +        # We try to generate the data 3 times, if we fail we raise an error:
    +        for tryout in range(3):
    +            # If the amount wanted is bigger than the chunk size, we generate a chunk of data in the size of the chunk
    +            # and decrease the amount by the chunk size.
    +            # otherwise we generate a chunk of data in the size of the amount:
    +            if amount > chunk_size:
    +                current_chunk_size = chunk_size
    +                amount -= chunk_size
    +            else:
    +                current_chunk_size = amount
    +
    +            # Create the prompt:
    +            prompt = prompt_structure.format(
    +                amount=current_chunk_size,
    +            )
    +
    +            # Generate a chunk of data:
    +            chunk_data = llm.predict(text=prompt)
    +
    +            # Validate the response for correct python `list` structure
    +            chunk_data = chunk_data[chunk_data.find("[") : chunk_data.rfind("]") + 1]
    +            if chunk_data.count("[") != chunk_data.count("]"):
    +                print(
    +                    "Failed to get proper json format from model, number of '[' doesn't match number of ']'."
    +                )
    +                continue
    +            chunk_data = ast.literal_eval(chunk_data)
    +            data += chunk_data
    +            break
    +        if tryout == 3:
    +            raise RuntimeError(
    +                f"Could not generate a proper json format for the given fields, using given model: {model_name}."
    +                f" Hint: Gpt-4 works best for most scenarios."
    +            )
    +    return data
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/1.6.0/static/structured_data_generator.html b/functions/master/structured_data_generator/1.6.0/static/structured_data_generator.html new file mode 100644 index 00000000..b665f268 --- /dev/null +++ b/functions/master/structured_data_generator/1.6.0/static/structured_data_generator.html @@ -0,0 +1,317 @@ + + + + + + + +structured_data_generator.structured_data_generator + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for structured_data_generator.structured_data_generator

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import ast
    +import os
    +
    +import tqdm
    +from langchain.chat_models import ChatOpenAI
    +
    +
    +def _set_openai_secrets() -> bool:
    +    key = "OPENAI_API_KEY"
    +    base = "OPENAI_API_BASE"
    +    # Check if the key is already in the environment variables:
    +    if key in os.environ and base in os.environ:
    +        return True
    +    # Check if mlrun is installed:
    +    try:
    +        import mlrun
    +    except ModuleNotFoundError:
    +        raise EnvironmentError(
    +            f"One or more of the OpenAI required environment variables ('{key}', '{base}') are missing."
    +            f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
    +            f"and set them as project secrets using `projecy.set_secrets`."
    +        )
    +
    +    # Check if the key is in the secrets:
    +    context = mlrun.get_or_create_ctx(name="context")
    +    openai_key = context.get_secret(key)
    +    openai_base = context.get_secret(base)
    +
    +    # If the key is not in the secrets, return False:
    +    if not openai_key:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API key in the environment variables or secrets,"
    +            f" please set it as: {key}."
    +        )
    +    if not openai_base:
    +        raise EnvironmentError(
    +            f"Could not find OpenAI API base in the environment variables or secrets,"
    +            f" please set it as: {base}."
    +        )
    +    # If the key is in the secrets, set it in the environment variables and return True:
    +    os.environ[key] = openai_key
    +    os.environ[base] = openai_base
    +    return True
    +
    +
    +
    +[docs] +def generate_data( + fields: list, + amount: int = 10, + model_name: str = "gpt-3.5-turbo", + language: str = "en", + chunk_size: int = 50, +) -> list: + """ + Structured data of elements according to the given parameters. + The data can be later logged as a structured file with MLRun's `returns` parameter. + + :param fields: A list of fields to randomly generate. + :param amount: The number of variants to generate. + :param model_name: The name of the model to use for conversation generation. + You should choose one of GPT-4 or GPT-3.5 from the list here: https://platform.openai.com/docs/models. + Default: 'gpt-3.5-turbo'. + :param language: The language to use for the generated conversation text. + :param chunk_size: Number of samples generated at each GPT query. + """ + instructions = "" + for field in fields: + # Split the field to key and instruction: + if ":" in field: + key, instruction = field.split(":", 1) + else: + key, instruction = field, "no special instruction" + # Replace spaces with underscores for the key to be used as a json key: + key = key.strip().replace(" ", "_") + instructions += f"* {key}: {instruction}\n" + + # Create the prompt structure: + prompt_structure = ( + f"generate the following values {amount} times randomly, in an order that creates a json table.\n" + f"Use the following keys and instructions (example: 'key: instruction or no special instruction'): " + f"{instructions}.\n" + f"Please generate the values in {language} language. 
\n" + f"Make sure the names of the keys are the same as the given field name.\n" + f"Please return only the json format without any introduction and ending" + ) + + # Set the OpenAI secrets: + _set_openai_secrets() + + # Load the OpenAI model using langchain: + llm = ChatOpenAI(model=model_name) + + # Start generating data: + data = [] + for _ in tqdm.tqdm(range((amount // chunk_size) + 1), desc="Generating"): + # We try to generate the data 3 times, if we fail we raise an error: + for tryout in range(3): + # If the amount wanted is bigger than the chunk size, we generate a chunk of data in the size of the chunk + # and decrease the amount by the chunk size. + # otherwise we generate a chunk of data in the size of the amount: + if amount > chunk_size: + current_chunk_size = chunk_size + amount -= chunk_size + else: + current_chunk_size = amount + + # Create the prompt: + prompt = prompt_structure.format( + amount=current_chunk_size, + ) + + # Generate a chunk of data: + chunk_data = llm.predict(text=prompt) + + # Validate the response for correct python `list` structure + chunk_data = chunk_data[chunk_data.find("[") : chunk_data.rfind("]") + 1] + if chunk_data.count("[") != chunk_data.count("]"): + print( + "Failed to get proper json format from model, number of '[' doesn't match number of ']'." + ) + continue + chunk_data = ast.literal_eval(chunk_data) + data += chunk_data + break + if tryout == 3: + raise RuntimeError( + f"Could not generate a proper json format for the given fields, using given model: {model_name}." + f" Hint: Gpt-4 works best for most scenarios." + ) + return data
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/structured_data_generator/latest/src/function.yaml b/functions/master/structured_data_generator/latest/src/function.yaml index 1093e178..4e8a3562 100644 --- a/functions/master/structured_data_generator/latest/src/function.yaml +++ b/functions/master/structured_data_generator/latest/src/function.yaml @@ -1,32 +1,17 @@ -kind: job -metadata: - name: structured-data-generator - tag: '' - hash: 44bb39f4bc55b38fc7ead1df24cb02bcf7f05bc9 - project: '' - labels: - author: zeevr - categories: - - machine-learning - - data-preparation - - data-generation - - genai spec: - command: '' - args: [] - image: '' build: - functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgIC
AgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIElmIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cG
FyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZmllbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aG
UgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgIC
AgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo= - base_image: mlrun/mlrun - commands: [] - code_origin: '' origin_filename: '' requirements: - langchain - tqdm + code_origin: '' + functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50I
HZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIElmIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZ
iBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZmllbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzY
z0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgY
mVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo= + base_image: mlrun/mlrun entry_points: generate_data: + has_varargs: false name: generate_data + has_kwargs: false doc: 'Structured data of elements according to the given parameters. The data can be later logged as a structured file with MLRun''s `returns` @@ -56,16 +41,16 @@ spec: outputs: - type: list lineno: 59 - has_varargs: false - has_kwargs: false + command: '' description: GenAI approach of generating structured data according to a given schema default_handler: generate_data disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} + image: '' +metadata: + name: structured-data-generator + tag: '' + categories: + - data-generation + - genai verbose: false +kind: job diff --git a/functions/master/structured_data_generator/latest/src/item.yaml b/functions/master/structured_data_generator/latest/src/item.yaml index be2a2a94..6e01aefb 100755 --- a/functions/master/structured_data_generator/latest/src/item.yaml +++ b/functions/master/structured_data_generator/latest/src/item.yaml @@ -1,7 +1,5 @@ apiVersion: v1 categories: -- machine-learning -- data-preparation - data-generation - genai description: GenAI approach of generating structured data according to a given schema @@ -14,7 +12,7 @@ labels: author: zeevr maintainers: [] marketplaceType: '' -mlrunVersion: 1.6.1 +mlrunVersion: 1.8.0 name: structured_data_generator platformVersion: 3.5.5 spec: @@ -26,4 +24,4 @@ spec: - langchain - tqdm url: '' -version: 1.5.0 +version: 1.6.0 diff --git a/functions/master/structured_data_generator/latest/static/documentation.html b/functions/master/structured_data_generator/latest/static/documentation.html index 67e6b6d6..060d59d9 100644 --- a/functions/master/structured_data_generator/latest/static/documentation.html +++ 
b/functions/master/structured_data_generator/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/structured_data_generator/latest/static/example.html b/functions/master/structured_data_generator/latest/static/example.html index 589080f3..1573d754 100644 --- a/functions/master/structured_data_generator/latest/static/example.html +++ b/functions/master/structured_data_generator/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/structured_data_generator/latest/static/function.html b/functions/master/structured_data_generator/latest/static/function.html index e9070b88..11fdbe2f 100644 --- a/functions/master/structured_data_generator/latest/static/function.html +++ b/functions/master/structured_data_generator/latest/static/function.html @@ -28,35 +28,20 @@
             
    -kind: job
    -metadata:
    -  name: structured-data-generator
    -  tag: ''
    -  hash: 44bb39f4bc55b38fc7ead1df24cb02bcf7f05bc9
    -  project: ''
    -  labels:
    -    author: zeevr
    -  categories:
    -  - machine-learning
    -  - data-preparation
    -  - data-generation
    -  - genai
     spec:
    -  command: ''
    -  args: []
    -  image: ''
       build:
    -    functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIEl
mIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZml
lbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB
3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo=
    -    base_image: mlrun/mlrun
    -    commands: []
    -    code_origin: ''
         origin_filename: ''
         requirements:
         - langchain
         - tqdm
    +    code_origin: ''
    +    functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgYXN0CmltcG9ydCBvcwoKaW1wb3J0IHRxZG0KZnJvbSBsYW5nY2hhaW4uY2hhdF9tb2RlbHMgaW1wb3J0IENoYXRPcGVuQUkKCgpkZWYgX3NldF9vcGVuYWlfc2VjcmV0cygpIC0+IGJvb2w6CiAgICBrZXkgPSAiT1BFTkFJX0FQSV9LRVkiCiAgICBiYXNlID0gIk9QRU5BSV9BUElfQkFTRSIKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICBpZiBrZXkgaW4gb3MuZW52aXJvbiBhbmQgYmFzZSBpbiBvcy5lbnZpcm9uOgogICAgICAgIHJldHVybiBUcnVlCiAgICAjIENoZWNrIGlmIG1scnVuIGlzIGluc3RhbGxlZDoKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgIGYiT25lIG9yIG1vcmUgb2YgdGhlIE9wZW5BSSByZXF1aXJlZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMgKCd7a2V5fScsICd7YmFzZX0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgIGYiUGxlYXNlIHNldCB0aGVtIGFzIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBpbnN0YWxsIG1scnVuIChgcGlwIGluc3RhbGwgbWxydW5gKSIKICAgICAgICAgICAgZiJhbmQgc2V0IHRoZW0gYXMgcHJvamVjdCBzZWNyZXRzIHVzaW5nIGBwcm9qZWN5LnNldF9zZWNyZXRzYC4iCiAgICAgICAgKQoKICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBpbiB0aGUgc2VjcmV0czoKICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJjb250ZXh0IikKICAgIG9wZW5haV9rZXkgPSBjb250ZXh0LmdldF9zZWNyZXQoa2V5KQogICAgb3BlbmFpX2Jhc2UgPSBjb250ZXh0LmdldF9zZWNyZXQoYmFzZSkKCiAgICAjIEl
mIHRoZSBrZXkgaXMgbm90IGluIHRoZSBzZWNyZXRzLCByZXR1cm4gRmFsc2U6CiAgICBpZiBub3Qgb3BlbmFpX2tleToKICAgICAgICByYWlzZSBFbnZpcm9ubWVudEVycm9yKAogICAgICAgICAgICBmIkNvdWxkIG5vdCBmaW5kIE9wZW5BSSBBUEkga2V5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXMgb3Igc2VjcmV0cywiCiAgICAgICAgICAgIGYiIHBsZWFzZSBzZXQgaXQgYXM6IHtrZXl9LiIKICAgICAgICApCiAgICBpZiBub3Qgb3BlbmFpX2Jhc2U6CiAgICAgICAgcmFpc2UgRW52aXJvbm1lbnRFcnJvcigKICAgICAgICAgICAgZiJDb3VsZCBub3QgZmluZCBPcGVuQUkgQVBJIGJhc2UgaW4gdGhlIGVudmlyb25tZW50IHZhcmlhYmxlcyBvciBzZWNyZXRzLCIKICAgICAgICAgICAgZiIgcGxlYXNlIHNldCBpdCBhczoge2Jhc2V9LiIKICAgICAgICApCiAgICAjIElmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHMsIHNldCBpdCBpbiB0aGUgZW52aXJvbm1lbnQgdmFyaWFibGVzIGFuZCByZXR1cm4gVHJ1ZToKICAgIG9zLmVudmlyb25ba2V5XSA9IG9wZW5haV9rZXkKICAgIG9zLmVudmlyb25bYmFzZV0gPSBvcGVuYWlfYmFzZQogICAgcmV0dXJuIFRydWUKCgpkZWYgZ2VuZXJhdGVfZGF0YSgKICAgIGZpZWxkczogbGlzdCwKICAgIGFtb3VudDogaW50ID0gMTAsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSAiZ3B0LTMuNS10dXJibyIsCiAgICBsYW5ndWFnZTogc3RyID0gImVuIiwKICAgIGNodW5rX3NpemU6IGludCA9IDUwLAopIC0+IGxpc3Q6CiAgICAiIiIKICAgIFN0cnVjdHVyZWQgZGF0YSBvZiBlbGVtZW50cyBhY2NvcmRpbmcgdG8gdGhlIGdpdmVuIHBhcmFtZXRlcnMuCiAgICBUaGUgZGF0YSBjYW4gYmUgbGF0ZXIgbG9nZ2VkIGFzIGEgc3RydWN0dXJlZCBmaWxlIHdpdGggTUxSdW4ncyBgcmV0dXJuc2AgcGFyYW1ldGVyLgoKICAgIDpwYXJhbSBmaWVsZHM6IEEgbGlzdCBvZiBmaWVsZHMgdG8gcmFuZG9tbHkgZ2VuZXJhdGUuCiAgICA6cGFyYW0gYW1vdW50OiBUaGUgbnVtYmVyIG9mIHZhcmlhbnRzIHRvIGdlbmVyYXRlLgogICAgOnBhcmFtIG1vZGVsX25hbWU6IFRoZSBuYW1lIG9mIHRoZSBtb2RlbCB0byB1c2UgZm9yIGNvbnZlcnNhdGlvbiBnZW5lcmF0aW9uLgogICAgICAgICAgICAgICAgICAgICAgIFlvdSBzaG91bGQgY2hvb3NlIG9uZSBvZiBHUFQtNCBvciBHUFQtMy41IGZyb20gdGhlIGxpc3QgaGVyZTogaHR0cHM6Ly9wbGF0Zm9ybS5vcGVuYWkuY29tL2RvY3MvbW9kZWxzLgogICAgICAgICAgICAgICAgICAgICAgIERlZmF1bHQ6ICdncHQtMy41LXR1cmJvJy4KICAgIDpwYXJhbSBsYW5ndWFnZTogVGhlIGxhbmd1YWdlIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRlZCBjb252ZXJzYXRpb24gdGV4dC4KICAgIDpwYXJhbSBjaHVua19zaXplOiBOdW1iZXIgb2Ygc2FtcGxlcyBnZW5lcmF0ZWQgYXQgZWFjaCBHUFQgcXVlcnkuCiAgICAiIiIKICAgIGluc3RydWN0aW9ucyA9ICIiCiAgICBmb3IgZmllbGQgaW4gZml
lbGRzOgogICAgICAgICMgU3BsaXQgdGhlIGZpZWxkIHRvIGtleSBhbmQgaW5zdHJ1Y3Rpb246CiAgICAgICAgaWYgIjoiIGluIGZpZWxkOgogICAgICAgICAgICBrZXksIGluc3RydWN0aW9uID0gZmllbGQuc3BsaXQoIjoiLCAxKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGtleSwgaW5zdHJ1Y3Rpb24gPSBmaWVsZCwgIm5vIHNwZWNpYWwgaW5zdHJ1Y3Rpb24iCiAgICAgICAgIyBSZXBsYWNlIHNwYWNlcyB3aXRoIHVuZGVyc2NvcmVzIGZvciB0aGUga2V5IHRvIGJlIHVzZWQgYXMgYSBqc29uIGtleToKICAgICAgICBrZXkgPSBrZXkuc3RyaXAoKS5yZXBsYWNlKCIgIiwgIl8iKQogICAgICAgIGluc3RydWN0aW9ucyArPSBmIioge2tleX06IHtpbnN0cnVjdGlvbn1cbiIKCiAgICAjIENyZWF0ZSB0aGUgcHJvbXB0IHN0cnVjdHVyZToKICAgIHByb21wdF9zdHJ1Y3R1cmUgPSAoCiAgICAgICAgZiJnZW5lcmF0ZSB0aGUgZm9sbG93aW5nIHZhbHVlcyB7YW1vdW50fSB0aW1lcyByYW5kb21seSwgaW4gYW4gb3JkZXIgdGhhdCBjcmVhdGVzIGEganNvbiB0YWJsZS5cbiIKICAgICAgICBmIlVzZSB0aGUgZm9sbG93aW5nIGtleXMgYW5kIGluc3RydWN0aW9ucyAoZXhhbXBsZTogJ2tleTogaW5zdHJ1Y3Rpb24gb3Igbm8gc3BlY2lhbCBpbnN0cnVjdGlvbicpOiAiCiAgICAgICAgZiJ7aW5zdHJ1Y3Rpb25zfS5cbiIKICAgICAgICBmIlBsZWFzZSBnZW5lcmF0ZSB0aGUgdmFsdWVzIGluIHtsYW5ndWFnZX0gbGFuZ3VhZ2UuIFxuIgogICAgICAgIGYiTWFrZSBzdXJlIHRoZSBuYW1lcyBvZiB0aGUga2V5cyBhcmUgdGhlIHNhbWUgYXMgdGhlIGdpdmVuIGZpZWxkIG5hbWUuXG4iCiAgICAgICAgZiJQbGVhc2UgcmV0dXJuIG9ubHkgdGhlIGpzb24gZm9ybWF0IHdpdGhvdXQgYW55IGludHJvZHVjdGlvbiBhbmQgZW5kaW5nIgogICAgKQoKICAgICMgU2V0IHRoZSBPcGVuQUkgc2VjcmV0czoKICAgIF9zZXRfb3BlbmFpX3NlY3JldHMoKQoKICAgICMgTG9hZCB0aGUgT3BlbkFJIG1vZGVsIHVzaW5nIGxhbmdjaGFpbjoKICAgIGxsbSA9IENoYXRPcGVuQUkobW9kZWw9bW9kZWxfbmFtZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgZGF0YToKICAgIGRhdGEgPSBbXQogICAgZm9yIF8gaW4gdHFkbS50cWRtKHJhbmdlKChhbW91bnQgLy8gY2h1bmtfc2l6ZSkgKyAxKSwgZGVzYz0iR2VuZXJhdGluZyIpOgogICAgICAgICMgV2UgdHJ5IHRvIGdlbmVyYXRlIHRoZSBkYXRhIDMgdGltZXMsIGlmIHdlIGZhaWwgd2UgcmFpc2UgYW4gZXJyb3I6CiAgICAgICAgZm9yIHRyeW91dCBpbiByYW5nZSgzKToKICAgICAgICAgICAgIyBJZiB0aGUgYW1vdW50IHdhbnRlZCBpcyBiaWdnZXIgdGhhbiB0aGUgY2h1bmsgc2l6ZSwgd2UgZ2VuZXJhdGUgYSBjaHVuayBvZiBkYXRhIGluIHRoZSBzaXplIG9mIHRoZSBjaHVuawogICAgICAgICAgICAjIGFuZCBkZWNyZWFzZSB0aGUgYW1vdW50IGJ5IHRoZSBjaHVuayBzaXplLgogICAgICAgICAgICAjIG90aGVyd2lzZSB
3ZSBnZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGEgaW4gdGhlIHNpemUgb2YgdGhlIGFtb3VudDoKICAgICAgICAgICAgaWYgYW1vdW50ID4gY2h1bmtfc2l6ZToKICAgICAgICAgICAgICAgIGN1cnJlbnRfY2h1bmtfc2l6ZSA9IGNodW5rX3NpemUKICAgICAgICAgICAgICAgIGFtb3VudCAtPSBjaHVua19zaXplCiAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICBjdXJyZW50X2NodW5rX3NpemUgPSBhbW91bnQKCiAgICAgICAgICAgICMgQ3JlYXRlIHRoZSBwcm9tcHQ6CiAgICAgICAgICAgIHByb21wdCA9IHByb21wdF9zdHJ1Y3R1cmUuZm9ybWF0KAogICAgICAgICAgICAgICAgYW1vdW50PWN1cnJlbnRfY2h1bmtfc2l6ZSwKICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhIGNodW5rIG9mIGRhdGE6CiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBsbG0ucHJlZGljdCh0ZXh0PXByb21wdCkKCiAgICAgICAgICAgICMgVmFsaWRhdGUgdGhlIHJlc3BvbnNlIGZvciBjb3JyZWN0IHB5dGhvbiBgbGlzdGAgc3RydWN0dXJlCiAgICAgICAgICAgIGNodW5rX2RhdGEgPSBjaHVua19kYXRhW2NodW5rX2RhdGEuZmluZCgiWyIpIDogY2h1bmtfZGF0YS5yZmluZCgiXSIpICsgMV0KICAgICAgICAgICAgaWYgY2h1bmtfZGF0YS5jb3VudCgiWyIpICE9IGNodW5rX2RhdGEuY291bnQoIl0iKToKICAgICAgICAgICAgICAgIHByaW50KAogICAgICAgICAgICAgICAgICAgICJGYWlsZWQgdG8gZ2V0IHByb3BlciBqc29uIGZvcm1hdCBmcm9tIG1vZGVsLCBudW1iZXIgb2YgJ1snIGRvZXNuJ3QgbWF0Y2ggbnVtYmVyIG9mICddJy4iCiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBjb250aW51ZQogICAgICAgICAgICBjaHVua19kYXRhID0gYXN0LmxpdGVyYWxfZXZhbChjaHVua19kYXRhKQogICAgICAgICAgICBkYXRhICs9IGNodW5rX2RhdGEKICAgICAgICAgICAgYnJlYWsKICAgICAgICBpZiB0cnlvdXQgPT0gMzoKICAgICAgICAgICAgcmFpc2UgUnVudGltZUVycm9yKAogICAgICAgICAgICAgICAgZiJDb3VsZCBub3QgZ2VuZXJhdGUgYSBwcm9wZXIganNvbiBmb3JtYXQgZm9yIHRoZSBnaXZlbiBmaWVsZHMsIHVzaW5nIGdpdmVuIG1vZGVsOiB7bW9kZWxfbmFtZX0uIgogICAgICAgICAgICAgICAgZiIgSGludDogR3B0LTQgd29ya3MgYmVzdCBmb3IgbW9zdCBzY2VuYXJpb3MuIgogICAgICAgICAgICApCiAgICByZXR1cm4gZGF0YQo=
    +    base_image: mlrun/mlrun
       entry_points:
         generate_data:
    +      has_varargs: false
           name: generate_data
    +      has_kwargs: false
           doc: 'Structured data of elements according to the given parameters.
     
             The data can be later logged as a structured file with MLRun''s `returns`
    @@ -86,19 +71,19 @@
           outputs:
           - type: list
           lineno: 59
    -      has_varargs: false
    -      has_kwargs: false
    +  command: ''
       description: GenAI approach of generating structured data according to a given schema
       default_handler: generate_data
       disable_auto_mount: false
    -  clone_target_dir: ''
    -  env: []
    -  priority_class_name: ''
    -  preemption_mode: prevent
    -  affinity: null
    -  tolerations: null
    -  security_context: {}
    +  image: ''
    +metadata:
    +  name: structured-data-generator
    +  tag: ''
    +  categories:
    +  - data-generation
    +  - genai
     verbose: false
    +kind: job
     
             
         
    diff --git a/functions/master/structured_data_generator/latest/static/item.html b/functions/master/structured_data_generator/latest/static/item.html index ffc6817d..90c770e7 100644 --- a/functions/master/structured_data_generator/latest/static/item.html +++ b/functions/master/structured_data_generator/latest/static/item.html @@ -30,8 +30,6 @@ apiVersion: v1 categories: -- machine-learning -- data-preparation - data-generation - genai description: GenAI approach of generating structured data according to a given schema @@ -44,7 +42,7 @@ author: zeevr maintainers: [] marketplaceType: '' -mlrunVersion: 1.6.1 +mlrunVersion: 1.8.0 name: structured_data_generator platformVersion: 3.5.5 spec: @@ -56,7 +54,7 @@ - langchain - tqdm url: '' -version: 1.5.0 +version: 1.6.0 diff --git a/functions/master/structured_data_generator/latest/static/structured_data_generator.html b/functions/master/structured_data_generator/latest/static/structured_data_generator.html index 51e05a22..b665f268 100644 --- a/functions/master/structured_data_generator/latest/static/structured_data_generator.html +++ b/functions/master/structured_data_generator/latest/static/structured_data_generator.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tags.json b/functions/master/tags.json index 04f007e8..52b80d8b 100644 --- a/functions/master/tags.json +++ b/functions/master/tags.json @@ -1 +1 @@ -{"kind": ["serving", "nuclio:serving", "job"], "categories": ["utils", "model-serving", "deep-learning", "huggingface", "etl", "machine-learning", "model-testing", "data-generation", "NLP", "data-preparation", "pytorch", "genai", "monitoring", "model-training", "data-analysis", "audio"]} \ No newline at end of file +{"categories": ["deep-learning", "data-generation", "audio", "NLP", "data-analysis", "model-testing", "monitoring", "data-preparation", "model-serving", "model-training", "machine-learning", "genai", "utils"], "kind": ["serving", "nuclio:serving", "job"]} \ No newline at end of file diff 
--git a/functions/master/test_classifier/1.1.0/static/documentation.html b/functions/master/test_classifier/1.1.0/static/documentation.html index e36e9403..ecb59d84 100644 --- a/functions/master/test_classifier/1.1.0/static/documentation.html +++ b/functions/master/test_classifier/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/test_classifier/1.1.0/static/example.html b/functions/master/test_classifier/1.1.0/static/example.html index 2d5bc2f0..7317badc 100644 --- a/functions/master/test_classifier/1.1.0/static/example.html +++ b/functions/master/test_classifier/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/test_classifier/1.1.0/static/test_classifier.html b/functions/master/test_classifier/1.1.0/static/test_classifier.html index 68421377..0ee946fa 100644 --- a/functions/master/test_classifier/1.1.0/static/test_classifier.html +++ b/functions/master/test_classifier/1.1.0/static/test_classifier.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/test_classifier/latest/static/documentation.html b/functions/master/test_classifier/latest/static/documentation.html index e36e9403..ecb59d84 100644 --- a/functions/master/test_classifier/latest/static/documentation.html +++ b/functions/master/test_classifier/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/test_classifier/latest/static/example.html b/functions/master/test_classifier/latest/static/example.html index 2d5bc2f0..7317badc 100644 --- a/functions/master/test_classifier/latest/static/example.html +++ b/functions/master/test_classifier/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/test_classifier/latest/static/test_classifier.html b/functions/master/test_classifier/latest/static/test_classifier.html index 68421377..0ee946fa 100644 --- a/functions/master/test_classifier/latest/static/test_classifier.html +++ b/functions/master/test_classifier/latest/static/test_classifier.html 
@@ -20,7 +20,7 @@ - + diff --git a/functions/master/text_to_audio_generator/1.3.0/src/function.yaml b/functions/master/text_to_audio_generator/1.3.0/src/function.yaml index f7fe5286..8edbde74 100644 --- a/functions/master/text_to_audio_generator/1.3.0/src/function.yaml +++ b/functions/master/text_to_audio_generator/1.3.0/src/function.yaml @@ -1,28 +1,8 @@ -metadata: - name: text-to-audio-generator - categories: - - data-preparation - - machine-learning - - pytorch - tag: '' spec: - command: '' - build: - functionSourceCode:  - code_origin: '' - base_image: mlrun/mlrun - requirements: - - torchaudio - - pydub - origin_filename: '' - image: '' + default_handler: generate_multi_speakers_audio disable_auto_mount: false entry_points: generate_multi_speakers_audio: - has_kwargs: false - name: generate_multi_speakers_audio - doc: Generate audio files from text files. - has_varargs: false lineno: 38 parameters: - name: data_path @@ -89,11 +69,30 @@ spec: doc: Changes the bit depth for the supported formats. Supported only in "wav" or "flac" formats. default: null + name: generate_multi_speakers_audio + has_kwargs: false + has_varargs: false outputs: - doc: 'A tuple of: - The output directory path. - The generated audio files dataframe. - The errors'' dictionary.' type: Tuple[str, pd.DataFrame, dict] - default_handler: generate_multi_speakers_audio + doc: Generate audio files from text files. 
+ command: '' + image: '' description: Generate audio file from text using different speakers -verbose: false + build: + requirements: + - torchaudio + - pydub + base_image: mlrun/mlrun + code_origin: '' + origin_filename: '' + functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgaW1wb3J0bGliCmltcG9ydCBpbwppbXBvcnQgbG9nZ2luZwppbXBvcnQgb3MKaW1wb3J0IHBhdGhsaWIKaW1wb3J0IHJhbmRvbQppbXBvcnQgdGVtcGZpbGUKZnJvbSBhYmMgaW1wb3J0IEFCQywgYWJzdHJhY3RtZXRob2QKZnJvbSB0eXBpbmcgaW1wb3J0IERpY3QsIExpc3QsIE9wdGlvbmFsLCBUdXBsZSwgVW5pb24KCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgcGFuZGFzIGFzIHBkCmltcG9ydCB0b3JjaAppbXBvcnQgdG9yY2hhdWRpbwppbXBvcnQgdHFkbQoKIyBHZXQgdGhlIGdsb2JhbCBsb2dnZXI6Cl9MT0dHRVIgPSBsb2dnaW5nLmdldExvZ2dlcigpCgpPUEVOQUlfQVBJX0tFWSA9ICJPUEVOQUlfQVBJX0tFWSIKT1BFTkFJX0JBU0VfVVJMID0gIk9QRU5BSV9BUElfQkFTRSIKU0FNUExFX1JBVEUgPSAyNDAwMAoKCmRlZiBnZW5lcmF0ZV9tdWx0aV9zcGVha2Vyc19hdWRpbygKICAgIGRhdGFfcGF0aDogc3RyLAogICAgc3BlYWtlcnM6IFVuaW9uW0xpc3Rbc3RyXSwgRGljdFtzdHIsIGludF1dLAogICAgYXZhaWxhYmxlX3ZvaWNlczogTGlzdFtzdHJdLAogICAgZW5naW5lOiBzdHIgPSAib3BlbmFpIiwKICAgIG91dHB1dF9kaXJlY3Rvcnk6IHN0ciA9IE5vbmUsCiAgICB1c2VfZ3B1OiBPcHRpb25hbFtib29sXSA9IE5vbmUsCiAgICB1c2Vfc21hbGxfbW9kZWxzOiBPcHRpb25hbFtib29sXSA9IE5vbmUsCiAgICBvZmZsb2FkX2NwdTogT3B0aW9uYWxbYm9vbF0gPSBOb25lLAogICAgbW9k
ZWw6IE9wdGlvbmFsW3N0cl0gPSBOb25lLAogICAgc3BlZWQ6IE9wdGlvbmFsW2Zsb2F0XSA9IE5vbmUsCiAgICBzYW1wbGVfcmF0ZTogaW50ID0gMTYwMDAsCiAgICBmaWxlX2Zvcm1hdDogc3RyID0gIndhdiIsCiAgICB2ZXJib3NlOiBib29sID0gVHJ1ZSwKICAgIGJpdHNfcGVyX3NhbXBsZTogT3B0aW9uYWxbaW50XSA9IE5vbmUsCikgLT4gVHVwbGVbc3RyLCBwZC5EYXRhRnJhbWUsIGRpY3RdOgogICAgIiIiCiAgICBHZW5lcmF0ZSBhdWRpbyBmaWxlcyBmcm9tIHRleHQgZmlsZXMuCgogICAgOnBhcmFtIGRhdGFfcGF0aDogICAgICAgICAgIFBhdGggdG8gdGhlIHRleHQgZmlsZSBvciBkaXJlY3RvcnkgY29udGFpbmluZyB0aGUgdGV4dCBmaWxlcyB0byBnZW5lcmF0ZSBhdWRpbyBmcm9tLgogICAgOnBhcmFtIHNwZWFrZXJzOiAgICAgICAgICAgIExpc3QgLyBEaWN0IG9mIHNwZWFrZXJzIHRvIGdlbmVyYXRlIGF1ZGlvIGZvci4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJZiBhIGxpc3QgaXMgZ2l2ZW4sIHRoZSBzcGVha2VycyB3aWxsIGJlIGFzc2lnbmVkIHRvIGNoYW5uZWxzIGluIHRoZSBvcmRlciBnaXZlbi4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJZiBkaWN0aW9uYXJ5LCB0aGUga2V5cyB3aWxsIGJlIHRoZSBzcGVha2VycyBhbmQgdGhlIHZhbHVlcyB3aWxsIGJlIHRoZSBjaGFubmVscy4KICAgIDpwYXJhbSBhdmFpbGFibGVfdm9pY2VzOiAgICBMaXN0IG9mIGF2YWlsYWJsZSB2b2ljZXMgdG8gdXNlIGZvciB0aGUgZ2VuZXJhdGlvbi4KICAgICAgICAgICAgICAgICAgICAgICAgU2VlIGhlcmUgZm9yIHRoZSBhdmFpbGFibGUgdm9pY2VzIGZvciBiYXJrIGVuZ2luZToKICAgICAgICAgICAgICAgICAgICAgICAgaHR0cHM6Ly9zdW5vLWFpLm5vdGlvbi5zaXRlLzhiOGU4NzQ5ZWQ1MTRiMGNiZjNmNjk5MDEzNTQ4NjgzP3Y9YmM2N2NmZjc4NmIwNGI1MGIzY2ViNzU2ZmQwNWY2OGMKICAgICAgICAgICAgICAgICAgICAgICAgU2VlIGhlcmUgZm9yIHRoZSBhdmFpbGFibGUgdm9pY2VzIGZvciBvcGVuYWkgZW5naW5lOgogICAgICAgICAgICAgICAgICAgICAgICBodHRwczovL2JldGEub3BlbmFpLmNvbS9kb2NzL2FwaS1yZWZlcmVuY2Uvc3BlZWNoCiAgICA6cGFyYW0gZW5naW5lOiAgICAgICAgICAgICAgVGhlIGVuZ2luZSB0byB1c2UgZm9yIHRoZSBnZW5lcmF0aW9uLiBTZWxlY3QgZWl0aGVyICJiYXJrIiBvciAib3BlbmFpIi4gRGVmYXVsdCBpcyAib3BlbmFpIi4KICAgIDpwYXJhbSBvdXRwdXRfZGlyZWN0b3J5OiAgICBQYXRoIHRvIHRoZSBkaXJlY3RvcnkgdG8gc2F2ZSB0aGUgZ2VuZXJhdGVkIGF1ZGlvIGZpbGVzIHRvLgogICAgOnBhcmFtIHVzZV9ncHU6ICAgICAgICAgICAgIFdoZXRoZXIgdG8gdXNlIHRoZSBHUFUgZm9yIHRoZSBnZW5lcmF0aW9uLiBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIHVzZV9zbWFsbF9tb2RlbHM6ICAgIFdoZXRoZXIgdG8g
dXNlIHRoZSBzbWFsbCBtb2RlbHMgZm9yIHRoZSBnZW5lcmF0aW9uLiBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIG9mZmxvYWRfY3B1OiAgICAgICAgIFRvIHJlZHVjZSB0aGUgbWVtb3J5IGZvb3RwcmludCwgdGhlIG1vZGVscyBjYW4gYmUgb2ZmbG9hZGVkIHRvIHRoZSBDUFUgYWZ0ZXIgbG9hZGluZy4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIG1vZGVsOiAgICAgICAgICAgICAgIFdoaWNoIG1vZGVsIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRpb24uIFN1cHBvcnRlZCBvbmx5IGluICJvcGVuYWkiIGVuZ2luZS4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0IGlzICJ0dHMtMSIuCiAgICA6cGFyYW0gc3BlZWQ6ICAgICAgICAgICAgICAgVGhlIHNwZWVkIG9mIHRoZSBnZW5lcmF0ZWQgYXVkaW8uIFNlbGVjdCBhIHZhbHVlIGZyb20gYDAuMjVgIHRvIGA0LjBgLiBgMS4wYCBpcyB0aGUgZGVmYXVsdC4KICAgIDpwYXJhbSBzYW1wbGVfcmF0ZTogICAgICAgICBUaGUgc2FtcGxpbmcgcmF0ZSBvZiB0aGUgZ2VuZXJhdGVkIGF1ZGlvLgogICAgOnBhcmFtIGZpbGVfZm9ybWF0OiAgICAgICAgIFRoZSBmb3JtYXQgb2YgdGhlIGdlbmVyYXRlZCBhdWRpbyBmaWxlcy4KICAgIDpwYXJhbSB2ZXJib3NlOiAgICAgICAgICAgICBXaGV0aGVyIHRvIHByaW50IHRoZSBwcm9ncmVzcyBvZiB0aGUgZ2VuZXJhdGlvbi4KICAgIDpwYXJhbSBiaXRzX3Blcl9zYW1wbGU6ICAgICBDaGFuZ2VzIHRoZSBiaXQgZGVwdGggZm9yIHRoZSBzdXBwb3J0ZWQgZm9ybWF0cy4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBTdXBwb3J0ZWQgb25seSBpbiAid2F2IiBvciAiZmxhYyIgZm9ybWF0cy4KCiAgICA6cmV0dXJuczogICAgICAgICAgICAgICAgICAgQSB0dXBsZSBvZjoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAtIFRoZSBvdXRwdXQgZGlyZWN0b3J5IHBhdGguCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgLSBUaGUgZ2VuZXJhdGVkIGF1ZGlvIGZpbGVzIGRhdGFmcmFtZS4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAtIFRoZSBlcnJvcnMnIGRpY3Rpb25hcnkuCiAgICAiIiIKCiAgICBnbG9iYWwgX0xPR0dFUgogICAgX0xPR0dFUiA9IF9nZXRfbG9nZ2VyKCkKICAgICMgR2V0IHRoZSBpbnB1dCB0ZXh0IGZpbGVzIHRvIHR1cm4gdG8gYXVkaW86CiAgICBkYXRhX3BhdGggPSBwYXRobGliLlBhdGgoZGF0YV9wYXRoKS5hYnNvbHV0ZSgpCiAgICB0ZXh0X2ZpbGVzID0gX2dldF90ZXh0X2ZpbGVzKGRhdGFfcGF0aD1kYXRhX3BhdGgpCgoKICAgICMgUHJlcGFyZSB0aGUgc3BlZWNoIGVuZ2luZToKICAgIGVuZ2luZSA9IF9nZXRfZW5naW5lKAogICAgICAgIGVuZ2luZT1lbmdpbmUsCiAgICAgICAgdXNlX2dwdT11c2VfZ3B1LAogICAgICAgIHVzZV9zbWFsbF9tb2Rl
bHM9dXNlX3NtYWxsX21vZGVscywKICAgICAgICBvZmZsb2FkX2NwdT1vZmZsb2FkX2NwdSwKICAgICAgICBtb2RlbD1tb2RlbCwKICAgICAgICBmaWxlX2Zvcm1hdD1maWxlX2Zvcm1hdCwKICAgICAgICBzcGVlZD1zcGVlZAogICAgKQoKICAgICMgQ2hlY2sgZm9yIHBlciBjaGFubmVsIGdlbmVyYXRpb246CiAgICBpZiBpc2luc3RhbmNlKHNwZWFrZXJzLCBkaWN0KToKICAgICAgICBzcGVha2VyX3Blcl9jaGFubmVsID0gVHJ1ZQogICAgICAgICMgU29ydCB0aGUgZ2l2ZW4gc3BlYWtlcnMgYnkgY2hhbm5lbHM6CiAgICAgICAgc3BlYWtlcnMgPSB7CiAgICAgICAgICAgIHNwZWFrZXI6IGNoYW5uZWwKICAgICAgICAgICAgZm9yIHNwZWFrZXIsIGNoYW5uZWwgaW4gc29ydGVkKHNwZWFrZXJzLml0ZW1zKCksIGtleT1sYW1iZGEgaXRlbTogaXRlbVsxXSkKICAgICAgICB9CiAgICBlbHNlOgogICAgICAgIHNwZWFrZXJfcGVyX2NoYW5uZWwgPSBGYWxzZQoKICAgICMgUHJlcGFyZSB0aGUgcmVzYW1wbGluZyBtb2R1bGU6CiAgICByZXNhbXBsZXIgPSB0b3JjaGF1ZGlvLnRyYW5zZm9ybXMuUmVzYW1wbGUoCiAgICAgICAgb3JpZ19mcmVxPVNBTVBMRV9SQVRFLCBuZXdfZnJlcT1zYW1wbGVfcmF0ZSwgZHR5cGU9dG9yY2guZmxvYXQzMgogICAgKQoKICAgICMgUHJlcGFyZSB0aGUgZ2FwIGJldHdlZW4gZWFjaCBzcGVha2VyOgogICAgZ2FwX2JldHdlZW5fc3BlYWtlcnMgPSBucC56ZXJvcyhpbnQoMC41ICogU0FNUExFX1JBVEUpKQoKICAgICMgUHJlcGFyZSB0aGUgc3VjY2Vzc2VzIGRhdGFmcmFtZSBhbmQgZXJyb3JzIGRpY3Rpb25hcnkgdG8gYmUgcmV0dXJuZWQ6CiAgICBzdWNjZXNzZXMgPSBbXQogICAgZXJyb3JzID0ge30KCiAgICAjIENyZWF0ZSB0aGUgb3V0cHV0IGRpcmVjdG9yeToKICAgIGlmIG91dHB1dF9kaXJlY3RvcnkgaXMgTm9uZToKICAgICAgICBvdXRwdXRfZGlyZWN0b3J5ID0gdGVtcGZpbGUubWtkdGVtcCgpCiAgICBvdXRwdXRfZGlyZWN0b3J5ID0gcGF0aGxpYi5QYXRoKG91dHB1dF9kaXJlY3RvcnkpCiAgICBpZiBub3Qgb3V0cHV0X2RpcmVjdG9yeS5leGlzdHMoKToKICAgICAgICBvdXRwdXRfZGlyZWN0b3J5Lm1rZGlyKGV4aXN0X29rPVRydWUsIHBhcmVudHM9VHJ1ZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgYXVkaW86CiAgICAjIEdvIG92ZXIgdGhlIGF1ZGlvIGZpbGVzIGFuZCB0cmFuc2NyaWJlOgogICAgZm9yIHRleHRfZmlsZSBpbiB0cWRtLnRxZG0oCiAgICAgICAgdGV4dF9maWxlcywgZGVzYz0iR2VuZXJhdGluZyIsIHVuaXQ9ImZpbGUiLCBkaXNhYmxlPW5vdCB2ZXJib3NlCiAgICApOgoKICAgICAgICB0cnk6CiAgICAgICAgICAgICMgUmFuZG9taXplIHZvaWNlcyBmb3IgZWFjaCBzcGVha2VyOgogICAgICAgICAgICBjaG9zZW5fdm9pY2VzID0ge30KICAgICAgICAgICAgYXZhaWxhYmxlX3ZvaWNlc19jb3B5ID0gYXZhaWxhYmxlX3ZvaWNlcy5jb3B5KCkKICAgICAgICAgICAgZm9yIHNwZWFrZXIg
aW4gc3BlYWtlcnM6CiAgICAgICAgICAgICAgICB2b2ljZSA9IHJhbmRvbS5jaG9pY2UoYXZhaWxhYmxlX3ZvaWNlc19jb3B5KQogICAgICAgICAgICAgICAgY2hvc2VuX3ZvaWNlc1tzcGVha2VyXSA9IHZvaWNlCiAgICAgICAgICAgICAgICBhdmFpbGFibGVfdm9pY2VzX2NvcHkucmVtb3ZlKHZvaWNlKQogICAgICAgICAgICAjIFJlYWQgdGV4dDoKICAgICAgICAgICAgd2l0aCBvcGVuKHRleHRfZmlsZSwgInIiKSBhcyBmcDoKICAgICAgICAgICAgICAgIHRleHQgPSBmcC5yZWFkKCkKICAgICAgICAgICAgIyBQcmVwYXJlIGEgaG9sZGVyIGZvciBhbGwgdGhlIGdlbmVyYXRlZCBwaWVjZXMgKGlmIHBlciBjaGFubmVsIGVhY2ggc3BlYWtlciB3aWxsIGhhdmUgaXRzIG93bik6CiAgICAgICAgICAgIGF1ZGlvX3BpZWNlcyA9ICgKICAgICAgICAgICAgICAgIHtzcGVha2VyOiBbXSBmb3Igc3BlYWtlciBpbiBzcGVha2Vyc30KICAgICAgICAgICAgICAgIGlmIHNwZWFrZXJfcGVyX2NoYW5uZWwKICAgICAgICAgICAgICAgIGVsc2UgeyJhbGwiOiBbXX0KICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhdWRpbyBwZXIgbGluZToKICAgICAgICAgICAgZm9yIGxpbmUgaW4gdGV4dC5zcGxpdGxpbmVzKCk6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIGxpbmUgaXMgaW4gY29ycmVjdCBzcGVha2VyIGZvcm1hdDoKCiAgICAgICAgICAgICAgICBpZiAiOiAiIG5vdCBpbiBsaW5lOgogICAgICAgICAgICAgICAgICAgIGlmIHZlcmJvc2U6CiAgICAgICAgICAgICAgICAgICAgICAgIF9MT0dHRVIud2FybmluZyhmIlNraXBwaW5nIGxpbmU6IHtsaW5lfSIpCiAgICAgICAgICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgICAgICMgU3BsaXQgbGluZSB0byBzcGVha2VyIGFuZCBoaXMgd29yZHM6CiAgICAgICAgICAgICAgICBjdXJyZW50X3NwZWFrZXIsIHNlbnRlbmNlcyA9IGxpbmUuc3BsaXQoIjogIiwgMSkKICAgICAgICAgICAgICAgICMgVmFsaWRhdGUgc3BlYWtlciBpcyBrbm93bjoKICAgICAgICAgICAgICAgIGlmIGN1cnJlbnRfc3BlYWtlciBub3QgaW4gc3BlYWtlcnM6CiAgICAgICAgICAgICAgICAgICAgcmFpc2UgVmFsdWVFcnJvcigKICAgICAgICAgICAgICAgICAgICAgICAgZiJVbmtub3duIHNwZWFrZXI6IHtjdXJyZW50X3NwZWFrZXJ9LiBHaXZlbiBzcGVha2VycyBhcmU6IHtzcGVha2Vyc30iCiAgICAgICAgICAgICAgICAgICAgKQogICAgICAgICAgICAgICAgZm9yIHNlbnRlbmNlIGluIF9zcGxpdF9saW5lKGxpbmU9c2VudGVuY2VzKToKICAgICAgICAgICAgICAgICAgICAjIEdlbmVyYXRlIHdvcmRzIGF1ZGlvOgogICAgICAgICAgICAgICAgICAgIGF1ZGlvID0gZW5naW5lLl9nZW5lcmF0ZV9hdWRpbygKICAgICAgICAgICAgICAgICAgICAgICAgdGV4dD1zZW50ZW5jZSwKICAgICAgICAgICAgICAgICAgICAgICAgdm9pY2U9Y2hvc2VuX3ZvaWNlc1tjdXJyZW50X3NwZWFrZXJdLAogICAgICAg
ICAgICAgICAgICAgICkKCiAgICAgICAgICAgICAgICAgICAgaWYgc3BlYWtlcl9wZXJfY2hhbm5lbDoKICAgICAgICAgICAgICAgICAgICAgICAgc2lsZW5jZSA9IG5wLnplcm9zX2xpa2UoYXVkaW8pCiAgICAgICAgICAgICAgICAgICAgICAgIGZvciBzcGVha2VyIGluIGF1ZGlvX3BpZWNlcy5rZXlzKCk6CiAgICAgICAgICAgICAgICAgICAgICAgICAgICBpZiBzcGVha2VyID09IGN1cnJlbnRfc3BlYWtlcjoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBhdWRpb19waWVjZXNbc3BlYWtlcl0gKz0gW2F1ZGlvLCBnYXBfYmV0d2Vlbl9zcGVha2Vyc10KICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYXVkaW9fcGllY2VzW3NwZWFrZXJdICs9IFtzaWxlbmNlLCBnYXBfYmV0d2Vlbl9zcGVha2Vyc10KICAgICAgICAgICAgICAgICAgICBlbHNlOgogICAgICAgICAgICAgICAgICAgICAgICBhdWRpb19waWVjZXNbImFsbCJdICs9IFthdWRpbywgZ2FwX2JldHdlZW5fc3BlYWtlcnNdCiAgICAgICAgICAgICMgQ29uc3RydWN0IGEgc2luZ2xlIGF1ZGlvIGFycmF5IGZyb20gYWxsIHRoZSBwaWVjZXMgYW5kIGNoYW5uZWxzOgoKICAgICAgICAgICAgYXVkaW8gPSBucC52c3RhY2soCiAgICAgICAgICAgICAgICBbbnAuY29uY2F0ZW5hdGUoYXVkaW9fcGllY2VzW3NwZWFrZXJdKSBmb3Igc3BlYWtlciBpbiBzcGVha2Vyc10KICAgICAgICAgICAgKS5hc3R5cGUoZHR5cGU9bnAuZmxvYXQzMikKICAgICAgICAgICAgIyBSZXNhbXBsZToKICAgICAgICAgICAgYXVkaW8gPSB0b3JjaC5mcm9tX251bXB5KGF1ZGlvKQogICAgICAgICAgICBhdWRpbyA9IHJlc2FtcGxlcihhdWRpbykKICAgICAgICAgICAgIyBTYXZlIHRvIGF1ZGlvIGZpbGU6CiAgICAgICAgICAgIGF1ZGlvX2ZpbGUgPSBvdXRwdXRfZGlyZWN0b3J5IC8gZiJ7dGV4dF9maWxlLnN0ZW19LntmaWxlX2Zvcm1hdH0iCgogICAgICAgICAgICB0b3JjaGF1ZGlvLnNhdmUoCiAgICAgICAgICAgICAgICB1cmk9c3RyKGF1ZGlvX2ZpbGUpLAogICAgICAgICAgICAgICAgc3JjPWF1ZGlvLAogICAgICAgICAgICAgICAgc2FtcGxlX3JhdGU9c2FtcGxlX3JhdGUsCiAgICAgICAgICAgICAgICBmb3JtYXQ9ZmlsZV9mb3JtYXQsCiAgICAgICAgICAgICAgICBiaXRzX3Blcl9zYW1wbGU9Yml0c19wZXJfc2FtcGxlLAogICAgICAgICAgICApCgogICAgICAgICAgICAjIENvbGxlY3QgdG8gdGhlIHN1Y2Nlc3NlczoKICAgICAgICAgICAgc3VjY2Vzc2VzLmFwcGVuZChbdGV4dF9maWxlLm5hbWUsIGF1ZGlvX2ZpbGUubmFtZV0pCiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBleGNlcHRpb246CiAgICAgICAgICAgICMgTm90ZSB0aGUgZXhjZXB0aW9uIGFzIGVycm9yIGluIHRoZSBkaWN0aW9uYXJ5OgogICAgICAgICAgICBpZiB2ZXJib3NlOgogICAgICAgICAgICAgICAgX0xPR0dFUi53YXJuaW5nKGYiRXJyb3IgaW4g
ZmlsZTogJ3t0ZXh0X2ZpbGUubmFtZX0nIikKICAgICAgICAgICAgcHJpbnQoZXhjZXB0aW9uKQogICAgICAgICAgICBlcnJvcnNbdGV4dF9maWxlLm5hbWVdID0gc3RyKGV4Y2VwdGlvbikKCiAgICAjIENvbnN0cnVjdCB0aGUgdHJhbnNsYXRpb25zIGRhdGFmcmFtZToKICAgIHN1Y2Nlc3NlcyA9IHBkLkRhdGFGcmFtZSgKICAgICAgICBzdWNjZXNzZXMsCiAgICAgICAgY29sdW1ucz1bInRleHRfZmlsZSIsICJhdWRpb19maWxlIl0sCiAgICApCgogICAgIyBQcmludCB0aGUgaGVhZCBvZiB0aGUgcHJvZHVjZWQgZGF0YWZyYW1lIGFuZCByZXR1cm46CiAgICBpZiB2ZXJib3NlOgogICAgICAgIF9MT0dHRVIuaW5mbygKICAgICAgICAgICAgZiJEb25lICh7c3VjY2Vzc2VzLnNoYXBlWzBdfS97bGVuKHRleHRfZmlsZXMpfSlcbiIKICAgICAgICAgICAgZiJUcmFuc2xhdGlvbnMgc3VtbWFyeTpcbiIKICAgICAgICAgICAgZiJ7c3VjY2Vzc2VzLmhlYWQoKX0iCiAgICAgICAgKQogICAgcmV0dXJuIHN0cihvdXRwdXRfZGlyZWN0b3J5KSwgc3VjY2Vzc2VzLCBlcnJvcnMKCgpjbGFzcyBTcGVlY2hFbmdpbmUoQUJDKToKICAgIEBhYnN0cmFjdG1ldGhvZAogICAgZGVmIF9nZW5lcmF0ZV9hdWRpbyhzZWxmLCB0ZXh0OiBzdHIsIHZvaWNlOiBzdHIpIC0+IG5wLm5kYXJyYXk6CiAgICAgICAgcGFzcwoKCmNsYXNzIEJhcmtFbmdpbmUoU3BlZWNoRW5naW5lKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCB1c2VfZ3B1OiBib29sID0gVHJ1ZSwgdXNlX3NtYWxsX21vZGVsczogYm9vbCA9IEZhbHNlLCBvZmZsb2FkX2NwdTogYm9vbCA9IEZhbHNlKToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHNlbGYuYmFyayA9IGltcG9ydGxpYi5pbXBvcnRfbW9kdWxlKCJiYXJrIikKICAgICAgICBleGNlcHQgSW1wb3J0RXJyb3I6CiAgICAgICAgICAgIHJhaXNlIEltcG9ydEVycm9yKAogICAgICAgICAgICAgICAgIlRoZSAnYmFyaycgbGlicmFyeSBpcyByZXF1aXJlZCBmb3IgdGhlIEJhcmtFbmdpbmUuIFBsZWFzZSBpbnN0YWxsIGl0IHVzaW5nICdwaXAgaW5zdGFsbCBiYXJrLWFpJy4iCiAgICAgICAgICAgICkKCiAgICAgICAgc2VsZi5iYXJrLnByZWxvYWRfbW9kZWxzKAogICAgICAgICAgICB0ZXh0X3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgdGV4dF91c2Vfc21hbGw9dXNlX3NtYWxsX21vZGVscywKICAgICAgICAgICAgY29hcnNlX3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgY29hcnNlX3VzZV9zbWFsbD11c2Vfc21hbGxfbW9kZWxzLAogICAgICAgICAgICBmaW5lX3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgZmluZV91c2Vfc21hbGw9dXNlX3NtYWxsX21vZGVscywKICAgICAgICAgICAgY29kZWNfdXNlX2dwdT11c2VfZ3B1LAogICAgICAgICAgICBmb3JjZV9yZWxvYWQ9b2ZmbG9hZF9jcHUsCiAgICAgICAgKQoKICAgIGRlZiBfZ2VuZXJhdGVfYXVkaW8oc2VsZiwgdGV4dDogc3RyLCB2b2ljZTogc3RyKSAtPiBucC5uZGFycmF5
OgogICAgICAgICMgR2VuZXJhdGUgd29yZHMgYXVkaW86CiAgICAgICAgYXVkaW8gPSBzZWxmLmJhcmsuZ2VuZXJhdGVfYXVkaW8oCiAgICAgICAgICAgIHRleHQsCiAgICAgICAgICAgIGhpc3RvcnlfcHJvbXB0PXZvaWNlLAogICAgICAgICAgICBzaWxlbnQ9VHJ1ZSwKICAgICAgICApCiAgICAgICAgcmV0dXJuIGF1ZGlvCgoKY2xhc3MgT3BlbkFJRW5naW5lKFNwZWVjaEVuZ2luZSk6CiAgICBkZWYgX19pbml0X18oc2VsZiwgbW9kZWw6IHN0ciA9ICJ0dHMtMSIsIGZpbGVfZm9ybWF0OiBzdHIgPSAid2F2Iiwgc3BlZWQ6IGZsb2F0ID0gMS4wKToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHNlbGYub3BlbmFpID0gaW1wb3J0bGliLmltcG9ydF9tb2R1bGUoIm9wZW5haSIpCiAgICAgICAgICAgIHNlbGYucHlkdWIgPSBpbXBvcnRsaWIuaW1wb3J0X21vZHVsZSgicHlkdWIiKQogICAgICAgIGV4Y2VwdCBJbXBvcnRFcnJvcjoKICAgICAgICAgICAgcmFpc2UgSW1wb3J0RXJyb3IoCiAgICAgICAgICAgICAgICAiVGhlICdvcGVuYWknIGFuZCAncHlkdWInIGxpYnJhcmllcyBhcmUgcmVxdWlyZWQgZm9yIHRoZSBPcGVuQUlFbmdpbmUuIFBsZWFzZSBpbnN0YWxsIHRoZW0gdXNpbmcgJ3BpcCBpbnN0YWxsIG9wZW5haSBweWR1YicuIgogICAgICAgICAgICApCgogICAgICAgIGFwaV9rZXkgPSBvcy5nZXRlbnYoT1BFTkFJX0FQSV9LRVkpCiAgICAgICAgYmFzZV91cmwgPSBvcy5nZXRlbnYoT1BFTkFJX0JBU0VfVVJMKQogICAgICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICAgICAgaWYgbm90IGFwaV9rZXkgb3Igbm90IGJhc2VfdXJsOgogICAgICAgICAgICB0cnk6CiAgICAgICAgICAgICAgICBpbXBvcnQgbWxydW4KCiAgICAgICAgICAgICAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgobmFtZT0iY29udGV4dCIpCiAgICAgICAgICAgICAgICAjIENoZWNrIGlmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHM6CiAgICAgICAgICAgICAgICBhcGlfa2V5ID0gY29udGV4dC5nZXRfc2VjcmV0KE9QRU5BSV9BUElfS0VZKQogICAgICAgICAgICAgICAgYmFzZV91cmwgPSBjb250ZXh0LmdldF9zZWNyZXQoT1BFTkFJX0JBU0VfVVJMKQogICAgICAgICAgICBleGNlcHQgTW9kdWxlTm90Rm91bmRFcnJvcjoKICAgICAgICAgICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgICAgICAgICAgZiJPbmUgb3IgbW9yZSBvZiB0aGUgT3BlbkFJIHJlcXVpcmVkIGVudmlyb25tZW50IHZhcmlhYmxlcyAoJ3tPUEVOQUlfQVBJX0tFWX0nLCAne09QRU5BSV9CQVNFX1VSTH0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgICAgICAgICAgZiJQbGVhc2Ugc2V0IHRoZW0gYXMgZW52aXJvbm1lbnQgdmFyaWFibGVzIG9yIGluc3RhbGwgbWxydW4gKGBwaXAgaW5zdGFsbCBtbHJ1bmApIgogICAgICAgICAgICAgICAgICAgIGYiYW5kIHNldCB0aGVt
IGFzIHByb2plY3Qgc2VjcmV0cyB1c2luZyBgcHJvamVjdC5zZXRfc2VjcmV0c2AuIgogICAgICAgICAgICAgICAgKQoKICAgICAgICBzZWxmLmNsaWVudCA9IHNlbGYub3BlbmFpLk9wZW5BSShhcGlfa2V5PWFwaV9rZXksIGJhc2VfdXJsPWJhc2VfdXJsKQogICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbAogICAgICAgIHNlbGYuZmlsZV9mb3JtYXQgPSBmaWxlX2Zvcm1hdAogICAgICAgIHNlbGYuc3BlZWQgPSBzcGVlZAoKICAgIGRlZiBfZ2VuZXJhdGVfYXVkaW8oc2VsZiwgdGV4dDogc3RyLCB2b2ljZTogc3RyKSAtPiBucC5uZGFycmF5OgogICAgICAgICMgR2VuZXJhdGUgd29yZHMgYXVkaW86CiAgICAgICAgYXVkaW8gPSBzZWxmLmNsaWVudC5hdWRpby5zcGVlY2guY3JlYXRlKAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsLAogICAgICAgICAgICBpbnB1dD10ZXh0LAogICAgICAgICAgICB2b2ljZT12b2ljZSwKICAgICAgICAgICAgcmVzcG9uc2VfZm9ybWF0PXNlbGYuZmlsZV9mb3JtYXQsCiAgICAgICAgICAgIHNwZWVkPXNlbGYuc3BlZWQsCiAgICAgICAgKQogICAgICAgIGF1ZGlvID0gYXVkaW8uY29udGVudAogICAgICAgIGF1ZGlvID0gc2VsZi5fYnl0ZXNfdG9fbnBfYXJyYXkoYXVkaW89YXVkaW8pCiAgICAgICAgcmV0dXJuIGF1ZGlvCgogICAgZGVmIF9ieXRlc190b19ucF9hcnJheShzZWxmLCBhdWRpbzogYnl0ZXMpOgogICAgICAgIGlmIHNlbGYuZmlsZV9mb3JtYXQgPT0gIm1wMyI6CiAgICAgICAgICAgIGF1ZGlvX3NlZ21lbnQgPSBzZWxmLnB5ZHViLkF1ZGlvU2VnbWVudC5mcm9tX21wMyhpby5CeXRlc0lPKGF1ZGlvKSkKCiAgICAgICAgICAgICMgQ29udmVydCB0byByYXcgUENNIGF1ZGlvIGRhdGEKICAgICAgICAgICAgc2FtcGxlcyA9IGF1ZGlvX3NlZ21lbnQuZ2V0X2FycmF5X29mX3NhbXBsZXMoKQoKICAgICAgICAgICAgIyBDb252ZXJ0IHRvIG51bXB5IGFycmF5CiAgICAgICAgICAgIGF1ZGlvX2FycmF5ID0gbnAuYXJyYXkoc2FtcGxlcykKCiAgICAgICAgICAgICMgTm9ybWFsaXplIHRvIGZsb2F0IGJldHdlZW4gLTEgYW5kIDEKICAgICAgICAgICAgcmV0dXJuIGF1ZGlvX2FycmF5LmFzdHlwZShucC5mbG9hdDMyKSAvIG5wLmlpbmZvKHNhbXBsZXMudHlwZWNvZGUpLm1heAogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBucC5mcm9tYnVmZmVyKGF1ZGlvLCBkdHlwZT1ucC5pbnQxNikgLyAzMjc2OC4wCgoKZGVmIF9nZXRfZW5naW5lKGVuZ2luZTogc3RyLCBmaWxlX2Zvcm1hdDogc3RyLCAqKmt3YXJncykgLT4gU3BlZWNoRW5naW5lOgogICAgIyBlbGltaW5hdGUgdGhlIE5vbmUgdmFsdWVzOgogICAga3dhcmdzID0ge2tleTogdmFsdWUgZm9yIGtleSwgdmFsdWUgaW4ga3dhcmdzLml0ZW1zKCkgaWYgdmFsdWUgaXMgbm90IE5vbmV9CgogICAgaWYgZW5naW5lID09ICJiYXJrIjoKICAgICAgICByZXR1cm4gQmFya0VuZ2luZSgqKmt3YXJncykKICAgIGVsaWYgZW5naW5lID09ICJvcGVu
YWkiOgogICAgICAgIHJldHVybiBPcGVuQUlFbmdpbmUoZmlsZV9mb3JtYXQ9ZmlsZV9mb3JtYXQsICoqa3dhcmdzKQogICAgZWxzZToKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKAogICAgICAgICAgICBmIlVucmVjb2duaXplZCBlbmdpbmUuIFRoZSBwYXJhbWV0ZXIgYGVuZ2luZWAgbXVzdCBiZSBlaXRoZXIgJ2JhcmsnIG9yICdvcGVuYWknLiBHaXZlbjoge2VuZ2luZX0iCiAgICAgICAgKQoKZGVmIF9nZXRfdGV4dF9maWxlcygKICAgIGRhdGFfcGF0aDogcGF0aGxpYi5QYXRoLAopIC0+IExpc3RbcGF0aGxpYi5QYXRoXToKICAgICMgQ2hlY2sgaWYgdGhlIHBhdGggaXMgb2YgYSBkaXJlY3Rvcnkgb3IgYSBmaWxlOgogICAgaWYgZGF0YV9wYXRoLmlzX2RpcigpOgogICAgICAgICMgR2V0IGFsbCBmaWxlcyBpbnNpZGUgdGhlIGRpcmVjdG9yeToKICAgICAgICB0ZXh0X2ZpbGVzID0gbGlzdChkYXRhX3BhdGguZ2xvYigiKi4qIikpCiAgICBlbGlmIGRhdGFfcGF0aC5pc19maWxlKCk6CiAgICAgICAgdGV4dF9maWxlcyA9IFtkYXRhX3BhdGhdCiAgICBlbHNlOgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoCiAgICAgICAgICAgIGYiVW5yZWNvZ25pemVkIGRhdGEgcGF0aC4gVGhlIHBhcmFtZXRlciBgZGF0YV9wYXRoYCBtdXN0IGJlIGVpdGhlciBhIGRpcmVjdG9yeSBwYXRoIG9yIGEgZmlsZSBwYXRoLiAiCiAgICAgICAgICAgIGYiR2l2ZW46IHtzdHIoZGF0YV9wYXRoKX0gIgogICAgICAgICkKCiAgICByZXR1cm4gdGV4dF9maWxlcwoKCmRlZiBfc3BsaXRfbGluZShsaW5lOiBzdHIsIG1heF9sZW5ndGg6IGludCA9IDI1MCkgLT4gTGlzdFtzdHJdOgogICAgaWYgbGVuKGxpbmUpIDwgbWF4X2xlbmd0aDoKICAgICAgICByZXR1cm4gW2xpbmVdCgogICAgc2VudGVuY2VzID0gWwogICAgICAgIGYie3NlbnRlbmNlLnN0cmlwKCl9LiIgZm9yIHNlbnRlbmNlIGluIGxpbmUuc3BsaXQoIi4iKSBpZiBzZW50ZW5jZS5zdHJpcCgpCiAgICBdCgogICAgc3BsaXRzID0gW10KICAgIGN1cnJlbnRfbGVuZ3RoID0gbGVuKHNlbnRlbmNlc1swXSkKICAgIHNwbGl0ID0gc2VudGVuY2VzWzBdCiAgICBmb3Igc2VudGVuY2UgaW4gc2VudGVuY2VzWzE6XToKICAgICAgICBpZiBjdXJyZW50X2xlbmd0aCArIGxlbihzZW50ZW5jZSkgPiBtYXhfbGVuZ3RoOgogICAgICAgICAgICBzcGxpdHMuYXBwZW5kKHNwbGl0KQogICAgICAgICAgICBzcGxpdCA9IHNlbnRlbmNlCiAgICAgICAgICAgIGN1cnJlbnRfbGVuZ3RoID0gbGVuKHNlbnRlbmNlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGN1cnJlbnRfbGVuZ3RoICs9IGxlbihzZW50ZW5jZSkKICAgICAgICAgICAgc3BsaXQgKz0gIiAiICsgc2VudGVuY2UKICAgIGlmIHNwbGl0OgogICAgICAgIHNwbGl0cy5hcHBlbmQoc3BsaXQpCgogICAgcmV0dXJuIHNwbGl0cwoKCmRlZiBfZ2V0X2xvZ2dlcigpOgogICAgZ2xvYmFsIF9MT0dHRVIKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KCiAgICAgICAg
IyBDaGVjayBpZiBNTFJ1biBpcyBhdmFpbGFibGU6CiAgICAgICAgY29udGV4dCA9IG1scnVuLmdldF9vcl9jcmVhdGVfY3R4KG5hbWU9Im1scnVuIikKICAgICAgICByZXR1cm4gY29udGV4dC5sb2dnZXIKICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJldHVybiBfTE9HR0VSCg== +metadata: + categories: + - data-generation + - audio + tag: '' + name: text-to-audio-generator kind: job +verbose: false diff --git a/functions/master/text_to_audio_generator/1.3.0/src/item.yaml b/functions/master/text_to_audio_generator/1.3.0/src/item.yaml index e8235a08..3eba86ea 100644 --- a/functions/master/text_to_audio_generator/1.3.0/src/item.yaml +++ b/functions/master/text_to_audio_generator/1.3.0/src/item.yaml @@ -1,8 +1,7 @@ apiVersion: v1 categories: -- data-preparation -- machine-learning -- pytorch +- data-generation +- audio description: Generate audio file from text using different speakers doc: '' example: text_to_audio_generator.ipynb diff --git a/functions/master/text_to_audio_generator/1.3.0/static/documentation.html b/functions/master/text_to_audio_generator/1.3.0/static/documentation.html index e4ad7a88..b4660ebf 100644 --- a/functions/master/text_to_audio_generator/1.3.0/static/documentation.html +++ b/functions/master/text_to_audio_generator/1.3.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/text_to_audio_generator/1.3.0/static/example.html b/functions/master/text_to_audio_generator/1.3.0/static/example.html index 0b9f820f..096f4a9c 100644 --- a/functions/master/text_to_audio_generator/1.3.0/static/example.html +++ b/functions/master/text_to_audio_generator/1.3.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/text_to_audio_generator/1.3.0/static/function.html b/functions/master/text_to_audio_generator/1.3.0/static/function.html index 7e65f8e8..14516532 100644 --- a/functions/master/text_to_audio_generator/1.3.0/static/function.html +++ b/functions/master/text_to_audio_generator/1.3.0/static/function.html @@ -28,31 +28,11 @@
             
    -metadata:
    -  name: text-to-audio-generator
    -  categories:
    -  - data-preparation
    -  - machine-learning
    -  - pytorch
    -  tag: ''
     spec:
    -  command: ''
    -  build:
    -    functionSourceCode: 
    -    code_origin: ''
    -    base_image: mlrun/mlrun
    -    requirements:
    -    - torchaudio
    -    - pydub
    -    origin_filename: ''
    -  image: ''
    +  default_handler: generate_multi_speakers_audio
       disable_auto_mount: false
       entry_points:
         generate_multi_speakers_audio:
    -      has_kwargs: false
    -      name: generate_multi_speakers_audio
    -      doc: Generate audio files from text files.
    -      has_varargs: false
           lineno: 38
           parameters:
           - name: data_path
    @@ -119,14 +99,33 @@
             doc: Changes the bit depth for the supported formats. Supported only in "wav"
               or "flac" formats.
             default: null
    +      name: generate_multi_speakers_audio
    +      has_kwargs: false
    +      has_varargs: false
           outputs:
           - doc: 'A tuple of: - The output directory path. - The generated audio files
               dataframe. - The errors'' dictionary.'
             type: Tuple[str, pd.DataFrame, dict]
    -  default_handler: generate_multi_speakers_audio
    +      doc: Generate audio files from text files.
    +  command: ''
    +  image: ''
       description: Generate audio file from text using different speakers
    -verbose: false
    +  build:
    +    requirements:
    +    - torchaudio
    +    - pydub
    +    base_image: mlrun/mlrun
    +    code_origin: ''
    +    origin_filename: ''
    +    functionSourceCode: 
    +metadata:
    +  categories:
    +  - data-generation
    +  - audio
    +  tag: ''
    +  name: text-to-audio-generator
     kind: job
    +verbose: false
     
             
         
    diff --git a/functions/master/text_to_audio_generator/1.3.0/static/item.html b/functions/master/text_to_audio_generator/1.3.0/static/item.html index 4fc64dfb..2aeea892 100644 --- a/functions/master/text_to_audio_generator/1.3.0/static/item.html +++ b/functions/master/text_to_audio_generator/1.3.0/static/item.html @@ -30,9 +30,8 @@ apiVersion: v1 categories: -- data-preparation -- machine-learning -- pytorch +- data-generation +- audio description: Generate audio file from text using different speakers doc: '' example: text_to_audio_generator.ipynb diff --git a/functions/master/text_to_audio_generator/1.3.0/static/text_to_audio_generator.html b/functions/master/text_to_audio_generator/1.3.0/static/text_to_audio_generator.html index c9c93a42..f807a73b 100644 --- a/functions/master/text_to_audio_generator/1.3.0/static/text_to_audio_generator.html +++ b/functions/master/text_to_audio_generator/1.3.0/static/text_to_audio_generator.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/text_to_audio_generator/latest/src/function.yaml b/functions/master/text_to_audio_generator/latest/src/function.yaml index f7fe5286..8edbde74 100644 --- a/functions/master/text_to_audio_generator/latest/src/function.yaml +++ b/functions/master/text_to_audio_generator/latest/src/function.yaml @@ -1,28 +1,8 @@ -metadata: - name: text-to-audio-generator - categories: - - data-preparation - - machine-learning - - pytorch - tag: '' spec: - command: '' - build: - functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgaW1wb3J0bGliCmltcG9ydCBpbwppbXBvcnQgbG9nZ2luZwppbXBvcnQgb3MKaW1wb3J0IHBhdGhsaWIKaW1wb3J0IHJhbmRvbQppbXBvcnQgdGVtcGZpbGUKZnJvbSBhYmMgaW1wb3J0IEFCQywgYWJzdHJhY3RtZXRob2QKZnJvbSB0eXBpbmcgaW1wb3J0IERpY3QsIExpc3QsIE9wdGlvbmFsLCBUdXBsZSwgVW5pb24KCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgcGFuZGFzIGFzIHBkCmltcG9ydCB0b3JjaAppbXBvcnQgdG9yY2hhdWRpbwppbXBvcnQgdHFkbQoKIyBHZXQgdGhlIGdsb2JhbCBsb2dnZXI6Cl9MT0dHRVIgPSBsb2dnaW5nLmdldExvZ2dlcigpCgpPUEVOQUlfQVBJX0tFWSA9ICJPUEVOQUlfQVBJX0tFWSIKT1BFTkFJX0JBU0VfVVJMID0gIk9QRU5BSV9BUElfQkFTRSIKU0FNUExFX1JBVEUgPSAyNDAwMAoKCmRlZiBnZW5lcmF0ZV9tdWx0aV9zcGVha2Vyc19hdWRpbygKICAgIGRhdGFfcGF0aDogc3RyLAogICAgc3BlYWtlcnM6IFVuaW9uW0xpc3Rbc3RyXSwgRGljdFtzdHIsIGludF1dLAogICAgYXZhaWxhYmxlX3ZvaWNlczogTGlzdFtzdHJdLAogICAgZW5naW5lOiBzdHIgPSAib3BlbmFpIiwKICAgIG91dHB1dF9kaXJlY3Rvcnk6IHN0ciA9IE5vbmUsCiAgICB1c2VfZ3B1OiBPcHRpb25hbFtib29sXSA9IE5vbmUsCiAgICB1c2Vfc21hbGxfbW9kZWxzOiBPcHRpb25hbFtib29sXSA9IE5vbmUsCiAgICBvZmZsb2FkX2NwdTogT3B0aW9uYWxbYm9vbF0gPSBOb25lLAogICAgbW9kZWw6IE9wdGlvbmFsW3N0cl0gPSBOb25lLAogICAgc3BlZWQ6IE9wdGlvbmFsW2Zsb2F0XSA9IE5vbmUsCiAgICBzYW1wbGVfcmF0ZTogaW50ID0gMTYwMDAsCiAgICBmaWxlX2Zvcm1hdDogc3RyID0gIndhdiIsCiAgICB2ZXJib3NlOiBib29sID0gVHJ1ZSwKICAgIGJpdHNfcGVyX3NhbXBsZTogT3B0aW9uYWxbaW50XSA9IE5v
bmUsCikgLT4gVHVwbGVbc3RyLCBwZC5EYXRhRnJhbWUsIGRpY3RdOgogICAgIiIiCiAgICBHZW5lcmF0ZSBhdWRpbyBmaWxlcyBmcm9tIHRleHQgZmlsZXMuCgogICAgOnBhcmFtIGRhdGFfcGF0aDogICAgICAgICAgIFBhdGggdG8gdGhlIHRleHQgZmlsZSBvciBkaXJlY3RvcnkgY29udGFpbmluZyB0aGUgdGV4dCBmaWxlcyB0byBnZW5lcmF0ZSBhdWRpbyBmcm9tLgogICAgOnBhcmFtIHNwZWFrZXJzOiAgICAgICAgICAgIExpc3QgLyBEaWN0IG9mIHNwZWFrZXJzIHRvIGdlbmVyYXRlIGF1ZGlvIGZvci4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJZiBhIGxpc3QgaXMgZ2l2ZW4sIHRoZSBzcGVha2VycyB3aWxsIGJlIGFzc2lnbmVkIHRvIGNoYW5uZWxzIGluIHRoZSBvcmRlciBnaXZlbi4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJZiBkaWN0aW9uYXJ5LCB0aGUga2V5cyB3aWxsIGJlIHRoZSBzcGVha2VycyBhbmQgdGhlIHZhbHVlcyB3aWxsIGJlIHRoZSBjaGFubmVscy4KICAgIDpwYXJhbSBhdmFpbGFibGVfdm9pY2VzOiAgICBMaXN0IG9mIGF2YWlsYWJsZSB2b2ljZXMgdG8gdXNlIGZvciB0aGUgZ2VuZXJhdGlvbi4KICAgICAgICAgICAgICAgICAgICAgICAgU2VlIGhlcmUgZm9yIHRoZSBhdmFpbGFibGUgdm9pY2VzIGZvciBiYXJrIGVuZ2luZToKICAgICAgICAgICAgICAgICAgICAgICAgaHR0cHM6Ly9zdW5vLWFpLm5vdGlvbi5zaXRlLzhiOGU4NzQ5ZWQ1MTRiMGNiZjNmNjk5MDEzNTQ4NjgzP3Y9YmM2N2NmZjc4NmIwNGI1MGIzY2ViNzU2ZmQwNWY2OGMKICAgICAgICAgICAgICAgICAgICAgICAgU2VlIGhlcmUgZm9yIHRoZSBhdmFpbGFibGUgdm9pY2VzIGZvciBvcGVuYWkgZW5naW5lOgogICAgICAgICAgICAgICAgICAgICAgICBodHRwczovL2JldGEub3BlbmFpLmNvbS9kb2NzL2FwaS1yZWZlcmVuY2Uvc3BlZWNoCiAgICA6cGFyYW0gZW5naW5lOiAgICAgICAgICAgICAgVGhlIGVuZ2luZSB0byB1c2UgZm9yIHRoZSBnZW5lcmF0aW9uLiBTZWxlY3QgZWl0aGVyICJiYXJrIiBvciAib3BlbmFpIi4gRGVmYXVsdCBpcyAib3BlbmFpIi4KICAgIDpwYXJhbSBvdXRwdXRfZGlyZWN0b3J5OiAgICBQYXRoIHRvIHRoZSBkaXJlY3RvcnkgdG8gc2F2ZSB0aGUgZ2VuZXJhdGVkIGF1ZGlvIGZpbGVzIHRvLgogICAgOnBhcmFtIHVzZV9ncHU6ICAgICAgICAgICAgIFdoZXRoZXIgdG8gdXNlIHRoZSBHUFUgZm9yIHRoZSBnZW5lcmF0aW9uLiBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIHVzZV9zbWFsbF9tb2RlbHM6ICAgIFdoZXRoZXIgdG8gdXNlIHRoZSBzbWFsbCBtb2RlbHMgZm9yIHRoZSBnZW5lcmF0aW9uLiBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIG9mZmxvYWRfY3B1OiAgICAgICAgIFRvIHJlZHVjZSB0aGUgbWVtb3J5IGZvb3RwcmludCwgdGhlIG1vZGVscyBjYW4gYmUgb2ZmbG9hZGVkIHRvIHRoZSBDUFUgYWZ0ZXIgbG9h
ZGluZy4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIG1vZGVsOiAgICAgICAgICAgICAgIFdoaWNoIG1vZGVsIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRpb24uIFN1cHBvcnRlZCBvbmx5IGluICJvcGVuYWkiIGVuZ2luZS4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0IGlzICJ0dHMtMSIuCiAgICA6cGFyYW0gc3BlZWQ6ICAgICAgICAgICAgICAgVGhlIHNwZWVkIG9mIHRoZSBnZW5lcmF0ZWQgYXVkaW8uIFNlbGVjdCBhIHZhbHVlIGZyb20gYDAuMjVgIHRvIGA0LjBgLiBgMS4wYCBpcyB0aGUgZGVmYXVsdC4KICAgIDpwYXJhbSBzYW1wbGVfcmF0ZTogICAgICAgICBUaGUgc2FtcGxpbmcgcmF0ZSBvZiB0aGUgZ2VuZXJhdGVkIGF1ZGlvLgogICAgOnBhcmFtIGZpbGVfZm9ybWF0OiAgICAgICAgIFRoZSBmb3JtYXQgb2YgdGhlIGdlbmVyYXRlZCBhdWRpbyBmaWxlcy4KICAgIDpwYXJhbSB2ZXJib3NlOiAgICAgICAgICAgICBXaGV0aGVyIHRvIHByaW50IHRoZSBwcm9ncmVzcyBvZiB0aGUgZ2VuZXJhdGlvbi4KICAgIDpwYXJhbSBiaXRzX3Blcl9zYW1wbGU6ICAgICBDaGFuZ2VzIHRoZSBiaXQgZGVwdGggZm9yIHRoZSBzdXBwb3J0ZWQgZm9ybWF0cy4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBTdXBwb3J0ZWQgb25seSBpbiAid2F2IiBvciAiZmxhYyIgZm9ybWF0cy4KCiAgICA6cmV0dXJuczogICAgICAgICAgICAgICAgICAgQSB0dXBsZSBvZjoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAtIFRoZSBvdXRwdXQgZGlyZWN0b3J5IHBhdGguCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgLSBUaGUgZ2VuZXJhdGVkIGF1ZGlvIGZpbGVzIGRhdGFmcmFtZS4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAtIFRoZSBlcnJvcnMnIGRpY3Rpb25hcnkuCiAgICAiIiIKCiAgICBnbG9iYWwgX0xPR0dFUgogICAgX0xPR0dFUiA9IF9nZXRfbG9nZ2VyKCkKICAgICMgR2V0IHRoZSBpbnB1dCB0ZXh0IGZpbGVzIHRvIHR1cm4gdG8gYXVkaW86CiAgICBkYXRhX3BhdGggPSBwYXRobGliLlBhdGgoZGF0YV9wYXRoKS5hYnNvbHV0ZSgpCiAgICB0ZXh0X2ZpbGVzID0gX2dldF90ZXh0X2ZpbGVzKGRhdGFfcGF0aD1kYXRhX3BhdGgpCgoKICAgICMgUHJlcGFyZSB0aGUgc3BlZWNoIGVuZ2luZToKICAgIGVuZ2luZSA9IF9nZXRfZW5naW5lKAogICAgICAgIGVuZ2luZT1lbmdpbmUsCiAgICAgICAgdXNlX2dwdT11c2VfZ3B1LAogICAgICAgIHVzZV9zbWFsbF9tb2RlbHM9dXNlX3NtYWxsX21vZGVscywKICAgICAgICBvZmZsb2FkX2NwdT1vZmZsb2FkX2NwdSwKICAgICAgICBtb2RlbD1tb2RlbCwKICAgICAgICBmaWxlX2Zvcm1hdD1maWxlX2Zvcm1hdCwKICAgICAgICBzcGVlZD1zcGVlZAogICAgKQoKICAgICMgQ2hlY2sgZm9yIHBlciBjaGFubmVsIGdlbmVyYXRpb246CiAgICBpZiBpc2lu
c3RhbmNlKHNwZWFrZXJzLCBkaWN0KToKICAgICAgICBzcGVha2VyX3Blcl9jaGFubmVsID0gVHJ1ZQogICAgICAgICMgU29ydCB0aGUgZ2l2ZW4gc3BlYWtlcnMgYnkgY2hhbm5lbHM6CiAgICAgICAgc3BlYWtlcnMgPSB7CiAgICAgICAgICAgIHNwZWFrZXI6IGNoYW5uZWwKICAgICAgICAgICAgZm9yIHNwZWFrZXIsIGNoYW5uZWwgaW4gc29ydGVkKHNwZWFrZXJzLml0ZW1zKCksIGtleT1sYW1iZGEgaXRlbTogaXRlbVsxXSkKICAgICAgICB9CiAgICBlbHNlOgogICAgICAgIHNwZWFrZXJfcGVyX2NoYW5uZWwgPSBGYWxzZQoKICAgICMgUHJlcGFyZSB0aGUgcmVzYW1wbGluZyBtb2R1bGU6CiAgICByZXNhbXBsZXIgPSB0b3JjaGF1ZGlvLnRyYW5zZm9ybXMuUmVzYW1wbGUoCiAgICAgICAgb3JpZ19mcmVxPVNBTVBMRV9SQVRFLCBuZXdfZnJlcT1zYW1wbGVfcmF0ZSwgZHR5cGU9dG9yY2guZmxvYXQzMgogICAgKQoKICAgICMgUHJlcGFyZSB0aGUgZ2FwIGJldHdlZW4gZWFjaCBzcGVha2VyOgogICAgZ2FwX2JldHdlZW5fc3BlYWtlcnMgPSBucC56ZXJvcyhpbnQoMC41ICogU0FNUExFX1JBVEUpKQoKICAgICMgUHJlcGFyZSB0aGUgc3VjY2Vzc2VzIGRhdGFmcmFtZSBhbmQgZXJyb3JzIGRpY3Rpb25hcnkgdG8gYmUgcmV0dXJuZWQ6CiAgICBzdWNjZXNzZXMgPSBbXQogICAgZXJyb3JzID0ge30KCiAgICAjIENyZWF0ZSB0aGUgb3V0cHV0IGRpcmVjdG9yeToKICAgIGlmIG91dHB1dF9kaXJlY3RvcnkgaXMgTm9uZToKICAgICAgICBvdXRwdXRfZGlyZWN0b3J5ID0gdGVtcGZpbGUubWtkdGVtcCgpCiAgICBvdXRwdXRfZGlyZWN0b3J5ID0gcGF0aGxpYi5QYXRoKG91dHB1dF9kaXJlY3RvcnkpCiAgICBpZiBub3Qgb3V0cHV0X2RpcmVjdG9yeS5leGlzdHMoKToKICAgICAgICBvdXRwdXRfZGlyZWN0b3J5Lm1rZGlyKGV4aXN0X29rPVRydWUsIHBhcmVudHM9VHJ1ZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgYXVkaW86CiAgICAjIEdvIG92ZXIgdGhlIGF1ZGlvIGZpbGVzIGFuZCB0cmFuc2NyaWJlOgogICAgZm9yIHRleHRfZmlsZSBpbiB0cWRtLnRxZG0oCiAgICAgICAgdGV4dF9maWxlcywgZGVzYz0iR2VuZXJhdGluZyIsIHVuaXQ9ImZpbGUiLCBkaXNhYmxlPW5vdCB2ZXJib3NlCiAgICApOgoKICAgICAgICB0cnk6CiAgICAgICAgICAgICMgUmFuZG9taXplIHZvaWNlcyBmb3IgZWFjaCBzcGVha2VyOgogICAgICAgICAgICBjaG9zZW5fdm9pY2VzID0ge30KICAgICAgICAgICAgYXZhaWxhYmxlX3ZvaWNlc19jb3B5ID0gYXZhaWxhYmxlX3ZvaWNlcy5jb3B5KCkKICAgICAgICAgICAgZm9yIHNwZWFrZXIgaW4gc3BlYWtlcnM6CiAgICAgICAgICAgICAgICB2b2ljZSA9IHJhbmRvbS5jaG9pY2UoYXZhaWxhYmxlX3ZvaWNlc19jb3B5KQogICAgICAgICAgICAgICAgY2hvc2VuX3ZvaWNlc1tzcGVha2VyXSA9IHZvaWNlCiAgICAgICAgICAgICAgICBhdmFpbGFibGVfdm9pY2VzX2NvcHkucmVtb3ZlKHZvaWNlKQogICAgICAgICAgICAj
IFJlYWQgdGV4dDoKICAgICAgICAgICAgd2l0aCBvcGVuKHRleHRfZmlsZSwgInIiKSBhcyBmcDoKICAgICAgICAgICAgICAgIHRleHQgPSBmcC5yZWFkKCkKICAgICAgICAgICAgIyBQcmVwYXJlIGEgaG9sZGVyIGZvciBhbGwgdGhlIGdlbmVyYXRlZCBwaWVjZXMgKGlmIHBlciBjaGFubmVsIGVhY2ggc3BlYWtlciB3aWxsIGhhdmUgaXRzIG93bik6CiAgICAgICAgICAgIGF1ZGlvX3BpZWNlcyA9ICgKICAgICAgICAgICAgICAgIHtzcGVha2VyOiBbXSBmb3Igc3BlYWtlciBpbiBzcGVha2Vyc30KICAgICAgICAgICAgICAgIGlmIHNwZWFrZXJfcGVyX2NoYW5uZWwKICAgICAgICAgICAgICAgIGVsc2UgeyJhbGwiOiBbXX0KICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhdWRpbyBwZXIgbGluZToKICAgICAgICAgICAgZm9yIGxpbmUgaW4gdGV4dC5zcGxpdGxpbmVzKCk6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIGxpbmUgaXMgaW4gY29ycmVjdCBzcGVha2VyIGZvcm1hdDoKCiAgICAgICAgICAgICAgICBpZiAiOiAiIG5vdCBpbiBsaW5lOgogICAgICAgICAgICAgICAgICAgIGlmIHZlcmJvc2U6CiAgICAgICAgICAgICAgICAgICAgICAgIF9MT0dHRVIud2FybmluZyhmIlNraXBwaW5nIGxpbmU6IHtsaW5lfSIpCiAgICAgICAgICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgICAgICMgU3BsaXQgbGluZSB0byBzcGVha2VyIGFuZCBoaXMgd29yZHM6CiAgICAgICAgICAgICAgICBjdXJyZW50X3NwZWFrZXIsIHNlbnRlbmNlcyA9IGxpbmUuc3BsaXQoIjogIiwgMSkKICAgICAgICAgICAgICAgICMgVmFsaWRhdGUgc3BlYWtlciBpcyBrbm93bjoKICAgICAgICAgICAgICAgIGlmIGN1cnJlbnRfc3BlYWtlciBub3QgaW4gc3BlYWtlcnM6CiAgICAgICAgICAgICAgICAgICAgcmFpc2UgVmFsdWVFcnJvcigKICAgICAgICAgICAgICAgICAgICAgICAgZiJVbmtub3duIHNwZWFrZXI6IHtjdXJyZW50X3NwZWFrZXJ9LiBHaXZlbiBzcGVha2VycyBhcmU6IHtzcGVha2Vyc30iCiAgICAgICAgICAgICAgICAgICAgKQogICAgICAgICAgICAgICAgZm9yIHNlbnRlbmNlIGluIF9zcGxpdF9saW5lKGxpbmU9c2VudGVuY2VzKToKICAgICAgICAgICAgICAgICAgICAjIEdlbmVyYXRlIHdvcmRzIGF1ZGlvOgogICAgICAgICAgICAgICAgICAgIGF1ZGlvID0gZW5naW5lLl9nZW5lcmF0ZV9hdWRpbygKICAgICAgICAgICAgICAgICAgICAgICAgdGV4dD1zZW50ZW5jZSwKICAgICAgICAgICAgICAgICAgICAgICAgdm9pY2U9Y2hvc2VuX3ZvaWNlc1tjdXJyZW50X3NwZWFrZXJdLAogICAgICAgICAgICAgICAgICAgICkKCiAgICAgICAgICAgICAgICAgICAgaWYgc3BlYWtlcl9wZXJfY2hhbm5lbDoKICAgICAgICAgICAgICAgICAgICAgICAgc2lsZW5jZSA9IG5wLnplcm9zX2xpa2UoYXVkaW8pCiAgICAgICAgICAgICAgICAgICAgICAgIGZvciBzcGVha2VyIGluIGF1ZGlvX3BpZWNlcy5rZXlzKCk6CiAgICAgICAgICAg
ICAgICAgICAgICAgICAgICBpZiBzcGVha2VyID09IGN1cnJlbnRfc3BlYWtlcjoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBhdWRpb19waWVjZXNbc3BlYWtlcl0gKz0gW2F1ZGlvLCBnYXBfYmV0d2Vlbl9zcGVha2Vyc10KICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYXVkaW9fcGllY2VzW3NwZWFrZXJdICs9IFtzaWxlbmNlLCBnYXBfYmV0d2Vlbl9zcGVha2Vyc10KICAgICAgICAgICAgICAgICAgICBlbHNlOgogICAgICAgICAgICAgICAgICAgICAgICBhdWRpb19waWVjZXNbImFsbCJdICs9IFthdWRpbywgZ2FwX2JldHdlZW5fc3BlYWtlcnNdCiAgICAgICAgICAgICMgQ29uc3RydWN0IGEgc2luZ2xlIGF1ZGlvIGFycmF5IGZyb20gYWxsIHRoZSBwaWVjZXMgYW5kIGNoYW5uZWxzOgoKICAgICAgICAgICAgYXVkaW8gPSBucC52c3RhY2soCiAgICAgICAgICAgICAgICBbbnAuY29uY2F0ZW5hdGUoYXVkaW9fcGllY2VzW3NwZWFrZXJdKSBmb3Igc3BlYWtlciBpbiBzcGVha2Vyc10KICAgICAgICAgICAgKS5hc3R5cGUoZHR5cGU9bnAuZmxvYXQzMikKICAgICAgICAgICAgIyBSZXNhbXBsZToKICAgICAgICAgICAgYXVkaW8gPSB0b3JjaC5mcm9tX251bXB5KGF1ZGlvKQogICAgICAgICAgICBhdWRpbyA9IHJlc2FtcGxlcihhdWRpbykKICAgICAgICAgICAgIyBTYXZlIHRvIGF1ZGlvIGZpbGU6CiAgICAgICAgICAgIGF1ZGlvX2ZpbGUgPSBvdXRwdXRfZGlyZWN0b3J5IC8gZiJ7dGV4dF9maWxlLnN0ZW19LntmaWxlX2Zvcm1hdH0iCgogICAgICAgICAgICB0b3JjaGF1ZGlvLnNhdmUoCiAgICAgICAgICAgICAgICB1cmk9c3RyKGF1ZGlvX2ZpbGUpLAogICAgICAgICAgICAgICAgc3JjPWF1ZGlvLAogICAgICAgICAgICAgICAgc2FtcGxlX3JhdGU9c2FtcGxlX3JhdGUsCiAgICAgICAgICAgICAgICBmb3JtYXQ9ZmlsZV9mb3JtYXQsCiAgICAgICAgICAgICAgICBiaXRzX3Blcl9zYW1wbGU9Yml0c19wZXJfc2FtcGxlLAogICAgICAgICAgICApCgogICAgICAgICAgICAjIENvbGxlY3QgdG8gdGhlIHN1Y2Nlc3NlczoKICAgICAgICAgICAgc3VjY2Vzc2VzLmFwcGVuZChbdGV4dF9maWxlLm5hbWUsIGF1ZGlvX2ZpbGUubmFtZV0pCiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBleGNlcHRpb246CiAgICAgICAgICAgICMgTm90ZSB0aGUgZXhjZXB0aW9uIGFzIGVycm9yIGluIHRoZSBkaWN0aW9uYXJ5OgogICAgICAgICAgICBpZiB2ZXJib3NlOgogICAgICAgICAgICAgICAgX0xPR0dFUi53YXJuaW5nKGYiRXJyb3IgaW4gZmlsZTogJ3t0ZXh0X2ZpbGUubmFtZX0nIikKICAgICAgICAgICAgcHJpbnQoZXhjZXB0aW9uKQogICAgICAgICAgICBlcnJvcnNbdGV4dF9maWxlLm5hbWVdID0gc3RyKGV4Y2VwdGlvbikKCiAgICAjIENvbnN0cnVjdCB0aGUgdHJhbnNsYXRpb25zIGRhdGFmcmFtZToKICAgIHN1Y2Nlc3NlcyA9IHBkLkRhdGFGcmFtZSgKICAg
ICAgICBzdWNjZXNzZXMsCiAgICAgICAgY29sdW1ucz1bInRleHRfZmlsZSIsICJhdWRpb19maWxlIl0sCiAgICApCgogICAgIyBQcmludCB0aGUgaGVhZCBvZiB0aGUgcHJvZHVjZWQgZGF0YWZyYW1lIGFuZCByZXR1cm46CiAgICBpZiB2ZXJib3NlOgogICAgICAgIF9MT0dHRVIuaW5mbygKICAgICAgICAgICAgZiJEb25lICh7c3VjY2Vzc2VzLnNoYXBlWzBdfS97bGVuKHRleHRfZmlsZXMpfSlcbiIKICAgICAgICAgICAgZiJUcmFuc2xhdGlvbnMgc3VtbWFyeTpcbiIKICAgICAgICAgICAgZiJ7c3VjY2Vzc2VzLmhlYWQoKX0iCiAgICAgICAgKQogICAgcmV0dXJuIHN0cihvdXRwdXRfZGlyZWN0b3J5KSwgc3VjY2Vzc2VzLCBlcnJvcnMKCgpjbGFzcyBTcGVlY2hFbmdpbmUoQUJDKToKICAgIEBhYnN0cmFjdG1ldGhvZAogICAgZGVmIF9nZW5lcmF0ZV9hdWRpbyhzZWxmLCB0ZXh0OiBzdHIsIHZvaWNlOiBzdHIpIC0+IG5wLm5kYXJyYXk6CiAgICAgICAgcGFzcwoKCmNsYXNzIEJhcmtFbmdpbmUoU3BlZWNoRW5naW5lKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCB1c2VfZ3B1OiBib29sID0gVHJ1ZSwgdXNlX3NtYWxsX21vZGVsczogYm9vbCA9IEZhbHNlLCBvZmZsb2FkX2NwdTogYm9vbCA9IEZhbHNlKToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHNlbGYuYmFyayA9IGltcG9ydGxpYi5pbXBvcnRfbW9kdWxlKCJiYXJrIikKICAgICAgICBleGNlcHQgSW1wb3J0RXJyb3I6CiAgICAgICAgICAgIHJhaXNlIEltcG9ydEVycm9yKAogICAgICAgICAgICAgICAgIlRoZSAnYmFyaycgbGlicmFyeSBpcyByZXF1aXJlZCBmb3IgdGhlIEJhcmtFbmdpbmUuIFBsZWFzZSBpbnN0YWxsIGl0IHVzaW5nICdwaXAgaW5zdGFsbCBiYXJrLWFpJy4iCiAgICAgICAgICAgICkKCiAgICAgICAgc2VsZi5iYXJrLnByZWxvYWRfbW9kZWxzKAogICAgICAgICAgICB0ZXh0X3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgdGV4dF91c2Vfc21hbGw9dXNlX3NtYWxsX21vZGVscywKICAgICAgICAgICAgY29hcnNlX3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgY29hcnNlX3VzZV9zbWFsbD11c2Vfc21hbGxfbW9kZWxzLAogICAgICAgICAgICBmaW5lX3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgZmluZV91c2Vfc21hbGw9dXNlX3NtYWxsX21vZGVscywKICAgICAgICAgICAgY29kZWNfdXNlX2dwdT11c2VfZ3B1LAogICAgICAgICAgICBmb3JjZV9yZWxvYWQ9b2ZmbG9hZF9jcHUsCiAgICAgICAgKQoKICAgIGRlZiBfZ2VuZXJhdGVfYXVkaW8oc2VsZiwgdGV4dDogc3RyLCB2b2ljZTogc3RyKSAtPiBucC5uZGFycmF5OgogICAgICAgICMgR2VuZXJhdGUgd29yZHMgYXVkaW86CiAgICAgICAgYXVkaW8gPSBzZWxmLmJhcmsuZ2VuZXJhdGVfYXVkaW8oCiAgICAgICAgICAgIHRleHQsCiAgICAgICAgICAgIGhpc3RvcnlfcHJvbXB0PXZvaWNlLAogICAgICAgICAgICBzaWxlbnQ9VHJ1ZSwKICAgICAgICApCiAgICAgICAgcmV0dXJuIGF1ZGlvCgoK
Y2xhc3MgT3BlbkFJRW5naW5lKFNwZWVjaEVuZ2luZSk6CiAgICBkZWYgX19pbml0X18oc2VsZiwgbW9kZWw6IHN0ciA9ICJ0dHMtMSIsIGZpbGVfZm9ybWF0OiBzdHIgPSAid2F2Iiwgc3BlZWQ6IGZsb2F0ID0gMS4wKToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHNlbGYub3BlbmFpID0gaW1wb3J0bGliLmltcG9ydF9tb2R1bGUoIm9wZW5haSIpCiAgICAgICAgICAgIHNlbGYucHlkdWIgPSBpbXBvcnRsaWIuaW1wb3J0X21vZHVsZSgicHlkdWIiKQogICAgICAgIGV4Y2VwdCBJbXBvcnRFcnJvcjoKICAgICAgICAgICAgcmFpc2UgSW1wb3J0RXJyb3IoCiAgICAgICAgICAgICAgICAiVGhlICdvcGVuYWknIGFuZCAncHlkdWInIGxpYnJhcmllcyBhcmUgcmVxdWlyZWQgZm9yIHRoZSBPcGVuQUlFbmdpbmUuIFBsZWFzZSBpbnN0YWxsIHRoZW0gdXNpbmcgJ3BpcCBpbnN0YWxsIG9wZW5haSBweWR1YicuIgogICAgICAgICAgICApCgogICAgICAgIGFwaV9rZXkgPSBvcy5nZXRlbnYoT1BFTkFJX0FQSV9LRVkpCiAgICAgICAgYmFzZV91cmwgPSBvcy5nZXRlbnYoT1BFTkFJX0JBU0VfVVJMKQogICAgICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICAgICAgaWYgbm90IGFwaV9rZXkgb3Igbm90IGJhc2VfdXJsOgogICAgICAgICAgICB0cnk6CiAgICAgICAgICAgICAgICBpbXBvcnQgbWxydW4KCiAgICAgICAgICAgICAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgobmFtZT0iY29udGV4dCIpCiAgICAgICAgICAgICAgICAjIENoZWNrIGlmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHM6CiAgICAgICAgICAgICAgICBhcGlfa2V5ID0gY29udGV4dC5nZXRfc2VjcmV0KE9QRU5BSV9BUElfS0VZKQogICAgICAgICAgICAgICAgYmFzZV91cmwgPSBjb250ZXh0LmdldF9zZWNyZXQoT1BFTkFJX0JBU0VfVVJMKQogICAgICAgICAgICBleGNlcHQgTW9kdWxlTm90Rm91bmRFcnJvcjoKICAgICAgICAgICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgICAgICAgICAgZiJPbmUgb3IgbW9yZSBvZiB0aGUgT3BlbkFJIHJlcXVpcmVkIGVudmlyb25tZW50IHZhcmlhYmxlcyAoJ3tPUEVOQUlfQVBJX0tFWX0nLCAne09QRU5BSV9CQVNFX1VSTH0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgICAgICAgICAgZiJQbGVhc2Ugc2V0IHRoZW0gYXMgZW52aXJvbm1lbnQgdmFyaWFibGVzIG9yIGluc3RhbGwgbWxydW4gKGBwaXAgaW5zdGFsbCBtbHJ1bmApIgogICAgICAgICAgICAgICAgICAgIGYiYW5kIHNldCB0aGVtIGFzIHByb2plY3Qgc2VjcmV0cyB1c2luZyBgcHJvamVjdC5zZXRfc2VjcmV0c2AuIgogICAgICAgICAgICAgICAgKQoKICAgICAgICBzZWxmLmNsaWVudCA9IHNlbGYub3BlbmFpLk9wZW5BSShhcGlfa2V5PWFwaV9rZXksIGJhc2VfdXJsPWJhc2VfdXJsKQogICAgICAgIHNlbGYubW9kZWwgPSBtb2RlbAogICAgICAgIHNlbGYu
ZmlsZV9mb3JtYXQgPSBmaWxlX2Zvcm1hdAogICAgICAgIHNlbGYuc3BlZWQgPSBzcGVlZAoKICAgIGRlZiBfZ2VuZXJhdGVfYXVkaW8oc2VsZiwgdGV4dDogc3RyLCB2b2ljZTogc3RyKSAtPiBucC5uZGFycmF5OgogICAgICAgICMgR2VuZXJhdGUgd29yZHMgYXVkaW86CiAgICAgICAgYXVkaW8gPSBzZWxmLmNsaWVudC5hdWRpby5zcGVlY2guY3JlYXRlKAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsLAogICAgICAgICAgICBpbnB1dD10ZXh0LAogICAgICAgICAgICB2b2ljZT12b2ljZSwKICAgICAgICAgICAgcmVzcG9uc2VfZm9ybWF0PXNlbGYuZmlsZV9mb3JtYXQsCiAgICAgICAgICAgIHNwZWVkPXNlbGYuc3BlZWQsCiAgICAgICAgKQogICAgICAgIGF1ZGlvID0gYXVkaW8uY29udGVudAogICAgICAgIGF1ZGlvID0gc2VsZi5fYnl0ZXNfdG9fbnBfYXJyYXkoYXVkaW89YXVkaW8pCiAgICAgICAgcmV0dXJuIGF1ZGlvCgogICAgZGVmIF9ieXRlc190b19ucF9hcnJheShzZWxmLCBhdWRpbzogYnl0ZXMpOgogICAgICAgIGlmIHNlbGYuZmlsZV9mb3JtYXQgPT0gIm1wMyI6CiAgICAgICAgICAgIGF1ZGlvX3NlZ21lbnQgPSBzZWxmLnB5ZHViLkF1ZGlvU2VnbWVudC5mcm9tX21wMyhpby5CeXRlc0lPKGF1ZGlvKSkKCiAgICAgICAgICAgICMgQ29udmVydCB0byByYXcgUENNIGF1ZGlvIGRhdGEKICAgICAgICAgICAgc2FtcGxlcyA9IGF1ZGlvX3NlZ21lbnQuZ2V0X2FycmF5X29mX3NhbXBsZXMoKQoKICAgICAgICAgICAgIyBDb252ZXJ0IHRvIG51bXB5IGFycmF5CiAgICAgICAgICAgIGF1ZGlvX2FycmF5ID0gbnAuYXJyYXkoc2FtcGxlcykKCiAgICAgICAgICAgICMgTm9ybWFsaXplIHRvIGZsb2F0IGJldHdlZW4gLTEgYW5kIDEKICAgICAgICAgICAgcmV0dXJuIGF1ZGlvX2FycmF5LmFzdHlwZShucC5mbG9hdDMyKSAvIG5wLmlpbmZvKHNhbXBsZXMudHlwZWNvZGUpLm1heAogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBucC5mcm9tYnVmZmVyKGF1ZGlvLCBkdHlwZT1ucC5pbnQxNikgLyAzMjc2OC4wCgoKZGVmIF9nZXRfZW5naW5lKGVuZ2luZTogc3RyLCBmaWxlX2Zvcm1hdDogc3RyLCAqKmt3YXJncykgLT4gU3BlZWNoRW5naW5lOgogICAgIyBlbGltaW5hdGUgdGhlIE5vbmUgdmFsdWVzOgogICAga3dhcmdzID0ge2tleTogdmFsdWUgZm9yIGtleSwgdmFsdWUgaW4ga3dhcmdzLml0ZW1zKCkgaWYgdmFsdWUgaXMgbm90IE5vbmV9CgogICAgaWYgZW5naW5lID09ICJiYXJrIjoKICAgICAgICByZXR1cm4gQmFya0VuZ2luZSgqKmt3YXJncykKICAgIGVsaWYgZW5naW5lID09ICJvcGVuYWkiOgogICAgICAgIHJldHVybiBPcGVuQUlFbmdpbmUoZmlsZV9mb3JtYXQ9ZmlsZV9mb3JtYXQsICoqa3dhcmdzKQogICAgZWxzZToKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKAogICAgICAgICAgICBmIlVucmVjb2duaXplZCBlbmdpbmUuIFRoZSBwYXJhbWV0ZXIgYGVuZ2luZWAgbXVzdCBiZSBlaXRoZXIgJ2JhcmsnIG9y
ICdvcGVuYWknLiBHaXZlbjoge2VuZ2luZX0iCiAgICAgICAgKQoKZGVmIF9nZXRfdGV4dF9maWxlcygKICAgIGRhdGFfcGF0aDogcGF0aGxpYi5QYXRoLAopIC0+IExpc3RbcGF0aGxpYi5QYXRoXToKICAgICMgQ2hlY2sgaWYgdGhlIHBhdGggaXMgb2YgYSBkaXJlY3Rvcnkgb3IgYSBmaWxlOgogICAgaWYgZGF0YV9wYXRoLmlzX2RpcigpOgogICAgICAgICMgR2V0IGFsbCBmaWxlcyBpbnNpZGUgdGhlIGRpcmVjdG9yeToKICAgICAgICB0ZXh0X2ZpbGVzID0gbGlzdChkYXRhX3BhdGguZ2xvYigiKi4qIikpCiAgICBlbGlmIGRhdGFfcGF0aC5pc19maWxlKCk6CiAgICAgICAgdGV4dF9maWxlcyA9IFtkYXRhX3BhdGhdCiAgICBlbHNlOgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoCiAgICAgICAgICAgIGYiVW5yZWNvZ25pemVkIGRhdGEgcGF0aC4gVGhlIHBhcmFtZXRlciBgZGF0YV9wYXRoYCBtdXN0IGJlIGVpdGhlciBhIGRpcmVjdG9yeSBwYXRoIG9yIGEgZmlsZSBwYXRoLiAiCiAgICAgICAgICAgIGYiR2l2ZW46IHtzdHIoZGF0YV9wYXRoKX0gIgogICAgICAgICkKCiAgICByZXR1cm4gdGV4dF9maWxlcwoKCmRlZiBfc3BsaXRfbGluZShsaW5lOiBzdHIsIG1heF9sZW5ndGg6IGludCA9IDI1MCkgLT4gTGlzdFtzdHJdOgogICAgaWYgbGVuKGxpbmUpIDwgbWF4X2xlbmd0aDoKICAgICAgICByZXR1cm4gW2xpbmVdCgogICAgc2VudGVuY2VzID0gWwogICAgICAgIGYie3NlbnRlbmNlLnN0cmlwKCl9LiIgZm9yIHNlbnRlbmNlIGluIGxpbmUuc3BsaXQoIi4iKSBpZiBzZW50ZW5jZS5zdHJpcCgpCiAgICBdCgogICAgc3BsaXRzID0gW10KICAgIGN1cnJlbnRfbGVuZ3RoID0gbGVuKHNlbnRlbmNlc1swXSkKICAgIHNwbGl0ID0gc2VudGVuY2VzWzBdCiAgICBmb3Igc2VudGVuY2UgaW4gc2VudGVuY2VzWzE6XToKICAgICAgICBpZiBjdXJyZW50X2xlbmd0aCArIGxlbihzZW50ZW5jZSkgPiBtYXhfbGVuZ3RoOgogICAgICAgICAgICBzcGxpdHMuYXBwZW5kKHNwbGl0KQogICAgICAgICAgICBzcGxpdCA9IHNlbnRlbmNlCiAgICAgICAgICAgIGN1cnJlbnRfbGVuZ3RoID0gbGVuKHNlbnRlbmNlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGN1cnJlbnRfbGVuZ3RoICs9IGxlbihzZW50ZW5jZSkKICAgICAgICAgICAgc3BsaXQgKz0gIiAiICsgc2VudGVuY2UKICAgIGlmIHNwbGl0OgogICAgICAgIHNwbGl0cy5hcHBlbmQoc3BsaXQpCgogICAgcmV0dXJuIHNwbGl0cwoKCmRlZiBfZ2V0X2xvZ2dlcigpOgogICAgZ2xvYmFsIF9MT0dHRVIKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KCiAgICAgICAgIyBDaGVjayBpZiBNTFJ1biBpcyBhdmFpbGFibGU6CiAgICAgICAgY29udGV4dCA9IG1scnVuLmdldF9vcl9jcmVhdGVfY3R4KG5hbWU9Im1scnVuIikKICAgICAgICByZXR1cm4gY29udGV4dC5sb2dnZXIKICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJldHVybiBfTE9HR0VSCg== - code_origin: 
'' - base_image: mlrun/mlrun - requirements: - - torchaudio - - pydub - origin_filename: '' - image: '' + default_handler: generate_multi_speakers_audio disable_auto_mount: false entry_points: generate_multi_speakers_audio: - has_kwargs: false - name: generate_multi_speakers_audio - doc: Generate audio files from text files. - has_varargs: false lineno: 38 parameters: - name: data_path @@ -89,11 +69,30 @@ spec: doc: Changes the bit depth for the supported formats. Supported only in "wav" or "flac" formats. default: null + name: generate_multi_speakers_audio + has_kwargs: false + has_varargs: false outputs: - doc: 'A tuple of: - The output directory path. - The generated audio files dataframe. - The errors'' dictionary.' type: Tuple[str, pd.DataFrame, dict] - default_handler: generate_multi_speakers_audio + doc: Generate audio files from text files. + command: '' + image: '' description: Generate audio file from text using different speakers -verbose: false + build: + requirements: + - torchaudio + - pydub + base_image: mlrun/mlrun + code_origin: '' + origin_filename: '' + functionSourceCode:  +metadata: + categories: + - data-generation + - audio + tag: '' + name: text-to-audio-generator kind: job +verbose: false diff --git a/functions/master/text_to_audio_generator/latest/src/item.yaml b/functions/master/text_to_audio_generator/latest/src/item.yaml index e8235a08..3eba86ea 100644 --- a/functions/master/text_to_audio_generator/latest/src/item.yaml +++ b/functions/master/text_to_audio_generator/latest/src/item.yaml @@ -1,8 +1,7 @@ apiVersion: v1 categories: -- data-preparation -- machine-learning -- pytorch +- data-generation +- audio description: Generate audio file from text using different speakers doc: '' example: text_to_audio_generator.ipynb diff --git a/functions/master/text_to_audio_generator/latest/static/documentation.html b/functions/master/text_to_audio_generator/latest/static/documentation.html index e4ad7a88..b4660ebf 100644 --- 
a/functions/master/text_to_audio_generator/latest/static/documentation.html +++ b/functions/master/text_to_audio_generator/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/text_to_audio_generator/latest/static/example.html b/functions/master/text_to_audio_generator/latest/static/example.html index 0b9f820f..096f4a9c 100644 --- a/functions/master/text_to_audio_generator/latest/static/example.html +++ b/functions/master/text_to_audio_generator/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/text_to_audio_generator/latest/static/function.html b/functions/master/text_to_audio_generator/latest/static/function.html index 7e65f8e8..14516532 100644 --- a/functions/master/text_to_audio_generator/latest/static/function.html +++ b/functions/master/text_to_audio_generator/latest/static/function.html @@ -28,31 +28,11 @@
             
    -metadata:
    -  name: text-to-audio-generator
    -  categories:
    -  - data-preparation
    -  - machine-learning
    -  - pytorch
    -  tag: ''
     spec:
    -  command: ''
    -  build:
    -    functionSourceCode: 
    -    code_origin: ''
    -    base_image: mlrun/mlrun
    -    requirements:
    -    - torchaudio
    -    - pydub
    -    origin_filename: ''
    -  image: ''
    +  default_handler: generate_multi_speakers_audio
       disable_auto_mount: false
       entry_points:
         generate_multi_speakers_audio:
    -      has_kwargs: false
    -      name: generate_multi_speakers_audio
    -      doc: Generate audio files from text files.
    -      has_varargs: false
           lineno: 38
           parameters:
           - name: data_path
    @@ -119,14 +99,33 @@
             doc: Changes the bit depth for the supported formats. Supported only in "wav"
               or "flac" formats.
             default: null
    +      name: generate_multi_speakers_audio
    +      has_kwargs: false
    +      has_varargs: false
           outputs:
           - doc: 'A tuple of: - The output directory path. - The generated audio files
               dataframe. - The errors'' dictionary.'
             type: Tuple[str, pd.DataFrame, dict]
    -  default_handler: generate_multi_speakers_audio
    +      doc: Generate audio files from text files.
    +  command: ''
    +  image: ''
       description: Generate audio file from text using different speakers
    -verbose: false
    +  build:
    +    requirements:
    +    - torchaudio
    +    - pydub
    +    base_image: mlrun/mlrun
    +    code_origin: ''
    +    origin_filename: ''
    +    functionSourceCode: IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgaW1wb3J0bGliCmltcG9ydCBpbwppbXBvcnQgbG9nZ2luZwppbXBvcnQgb3MKaW1wb3J0IHBhdGhsaWIKaW1wb3J0IHJhbmRvbQppbXBvcnQgdGVtcGZpbGUKZnJvbSBhYmMgaW1wb3J0IEFCQywgYWJzdHJhY3RtZXRob2QKZnJvbSB0eXBpbmcgaW1wb3J0IERpY3QsIExpc3QsIE9wdGlvbmFsLCBUdXBsZSwgVW5pb24KCmltcG9ydCBudW1weSBhcyBucAppbXBvcnQgcGFuZGFzIGFzIHBkCmltcG9ydCB0b3JjaAppbXBvcnQgdG9yY2hhdWRpbwppbXBvcnQgdHFkbQoKIyBHZXQgdGhlIGdsb2JhbCBsb2dnZXI6Cl9MT0dHRVIgPSBsb2dnaW5nLmdldExvZ2dlcigpCgpPUEVOQUlfQVBJX0tFWSA9ICJPUEVOQUlfQVBJX0tFWSIKT1BFTkFJX0JBU0VfVVJMID0gIk9QRU5BSV9BUElfQkFTRSIKU0FNUExFX1JBVEUgPSAyNDAwMAoKCmRlZiBnZW5lcmF0ZV9tdWx0aV9zcGVha2Vyc19hdWRpbygKICAgIGRhdGFfcGF0aDogc3RyLAogICAgc3BlYWtlcnM6IFVuaW9uW0xpc3Rbc3RyXSwgRGljdFtzdHIsIGludF1dLAogICAgYXZhaWxhYmxlX3ZvaWNlczogTGlzdFtzdHJdLAogICAgZW5naW5lOiBzdHIgPSAib3BlbmFpIiwKICAgIG91dHB1dF9kaXJlY3Rvcnk6IHN0ciA9IE5vbmUsCiAgICB1c2VfZ3B1OiBPcHRpb25hbFtib29sXSA9IE5vbmUsCiAgICB1c2Vfc21hbGxfbW9kZWxzOiBPcHRpb25hbFtib29sXSA9IE5vbmUsCiAgICBvZmZsb2FkX2NwdTogT3B0aW9uYWxbYm9vbF0gPSBOb25lLAogICAgbW9kZWw6IE9wdGlvbmFsW3N0cl0gPSBOb25lLAogICAgc3BlZWQ6IE9wdGlvbmFsW2Zsb2F0XSA9IE5vbmUsCiAgICBzYW1wbGVfcmF0ZTogaW50ID0gMTYwMDAsCiAgICBmaWxlX2Zvcm1hdDogc3RyID0gIndhdiIsCiAgICB2ZXJib3NlOiBib29sID0gVHJ1ZSwKICAgIGJpdHNfcGVyX3NhbXB
sZTogT3B0aW9uYWxbaW50XSA9IE5vbmUsCikgLT4gVHVwbGVbc3RyLCBwZC5EYXRhRnJhbWUsIGRpY3RdOgogICAgIiIiCiAgICBHZW5lcmF0ZSBhdWRpbyBmaWxlcyBmcm9tIHRleHQgZmlsZXMuCgogICAgOnBhcmFtIGRhdGFfcGF0aDogICAgICAgICAgIFBhdGggdG8gdGhlIHRleHQgZmlsZSBvciBkaXJlY3RvcnkgY29udGFpbmluZyB0aGUgdGV4dCBmaWxlcyB0byBnZW5lcmF0ZSBhdWRpbyBmcm9tLgogICAgOnBhcmFtIHNwZWFrZXJzOiAgICAgICAgICAgIExpc3QgLyBEaWN0IG9mIHNwZWFrZXJzIHRvIGdlbmVyYXRlIGF1ZGlvIGZvci4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJZiBhIGxpc3QgaXMgZ2l2ZW4sIHRoZSBzcGVha2VycyB3aWxsIGJlIGFzc2lnbmVkIHRvIGNoYW5uZWxzIGluIHRoZSBvcmRlciBnaXZlbi4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJZiBkaWN0aW9uYXJ5LCB0aGUga2V5cyB3aWxsIGJlIHRoZSBzcGVha2VycyBhbmQgdGhlIHZhbHVlcyB3aWxsIGJlIHRoZSBjaGFubmVscy4KICAgIDpwYXJhbSBhdmFpbGFibGVfdm9pY2VzOiAgICBMaXN0IG9mIGF2YWlsYWJsZSB2b2ljZXMgdG8gdXNlIGZvciB0aGUgZ2VuZXJhdGlvbi4KICAgICAgICAgICAgICAgICAgICAgICAgU2VlIGhlcmUgZm9yIHRoZSBhdmFpbGFibGUgdm9pY2VzIGZvciBiYXJrIGVuZ2luZToKICAgICAgICAgICAgICAgICAgICAgICAgaHR0cHM6Ly9zdW5vLWFpLm5vdGlvbi5zaXRlLzhiOGU4NzQ5ZWQ1MTRiMGNiZjNmNjk5MDEzNTQ4NjgzP3Y9YmM2N2NmZjc4NmIwNGI1MGIzY2ViNzU2ZmQwNWY2OGMKICAgICAgICAgICAgICAgICAgICAgICAgU2VlIGhlcmUgZm9yIHRoZSBhdmFpbGFibGUgdm9pY2VzIGZvciBvcGVuYWkgZW5naW5lOgogICAgICAgICAgICAgICAgICAgICAgICBodHRwczovL2JldGEub3BlbmFpLmNvbS9kb2NzL2FwaS1yZWZlcmVuY2Uvc3BlZWNoCiAgICA6cGFyYW0gZW5naW5lOiAgICAgICAgICAgICAgVGhlIGVuZ2luZSB0byB1c2UgZm9yIHRoZSBnZW5lcmF0aW9uLiBTZWxlY3QgZWl0aGVyICJiYXJrIiBvciAib3BlbmFpIi4gRGVmYXVsdCBpcyAib3BlbmFpIi4KICAgIDpwYXJhbSBvdXRwdXRfZGlyZWN0b3J5OiAgICBQYXRoIHRvIHRoZSBkaXJlY3RvcnkgdG8gc2F2ZSB0aGUgZ2VuZXJhdGVkIGF1ZGlvIGZpbGVzIHRvLgogICAgOnBhcmFtIHVzZV9ncHU6ICAgICAgICAgICAgIFdoZXRoZXIgdG8gdXNlIHRoZSBHUFUgZm9yIHRoZSBnZW5lcmF0aW9uLiBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIHVzZV9zbWFsbF9tb2RlbHM6ICAgIFdoZXRoZXIgdG8gdXNlIHRoZSBzbWFsbCBtb2RlbHMgZm9yIHRoZSBnZW5lcmF0aW9uLiBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIG9mZmxvYWRfY3B1OiAgICAgICAgIFRvIHJlZHVjZSB0aGUgbWVtb3J5IGZvb3RwcmludCwgdGhlIG1vZGVscyBjYW4gYmUgb2ZmbG9hZGV
kIHRvIHRoZSBDUFUgYWZ0ZXIgbG9hZGluZy4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBTdXBwb3J0ZWQgb25seSBpbiAiYmFyayIgZW5naW5lLgogICAgOnBhcmFtIG1vZGVsOiAgICAgICAgICAgICAgIFdoaWNoIG1vZGVsIHRvIHVzZSBmb3IgdGhlIGdlbmVyYXRpb24uIFN1cHBvcnRlZCBvbmx5IGluICJvcGVuYWkiIGVuZ2luZS4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBEZWZhdWx0IGlzICJ0dHMtMSIuCiAgICA6cGFyYW0gc3BlZWQ6ICAgICAgICAgICAgICAgVGhlIHNwZWVkIG9mIHRoZSBnZW5lcmF0ZWQgYXVkaW8uIFNlbGVjdCBhIHZhbHVlIGZyb20gYDAuMjVgIHRvIGA0LjBgLiBgMS4wYCBpcyB0aGUgZGVmYXVsdC4KICAgIDpwYXJhbSBzYW1wbGVfcmF0ZTogICAgICAgICBUaGUgc2FtcGxpbmcgcmF0ZSBvZiB0aGUgZ2VuZXJhdGVkIGF1ZGlvLgogICAgOnBhcmFtIGZpbGVfZm9ybWF0OiAgICAgICAgIFRoZSBmb3JtYXQgb2YgdGhlIGdlbmVyYXRlZCBhdWRpbyBmaWxlcy4KICAgIDpwYXJhbSB2ZXJib3NlOiAgICAgICAgICAgICBXaGV0aGVyIHRvIHByaW50IHRoZSBwcm9ncmVzcyBvZiB0aGUgZ2VuZXJhdGlvbi4KICAgIDpwYXJhbSBiaXRzX3Blcl9zYW1wbGU6ICAgICBDaGFuZ2VzIHRoZSBiaXQgZGVwdGggZm9yIHRoZSBzdXBwb3J0ZWQgZm9ybWF0cy4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBTdXBwb3J0ZWQgb25seSBpbiAid2F2IiBvciAiZmxhYyIgZm9ybWF0cy4KCiAgICA6cmV0dXJuczogICAgICAgICAgICAgICAgICAgQSB0dXBsZSBvZjoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAtIFRoZSBvdXRwdXQgZGlyZWN0b3J5IHBhdGguCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgLSBUaGUgZ2VuZXJhdGVkIGF1ZGlvIGZpbGVzIGRhdGFmcmFtZS4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAtIFRoZSBlcnJvcnMnIGRpY3Rpb25hcnkuCiAgICAiIiIKCiAgICBnbG9iYWwgX0xPR0dFUgogICAgX0xPR0dFUiA9IF9nZXRfbG9nZ2VyKCkKICAgICMgR2V0IHRoZSBpbnB1dCB0ZXh0IGZpbGVzIHRvIHR1cm4gdG8gYXVkaW86CiAgICBkYXRhX3BhdGggPSBwYXRobGliLlBhdGgoZGF0YV9wYXRoKS5hYnNvbHV0ZSgpCiAgICB0ZXh0X2ZpbGVzID0gX2dldF90ZXh0X2ZpbGVzKGRhdGFfcGF0aD1kYXRhX3BhdGgpCgoKICAgICMgUHJlcGFyZSB0aGUgc3BlZWNoIGVuZ2luZToKICAgIGVuZ2luZSA9IF9nZXRfZW5naW5lKAogICAgICAgIGVuZ2luZT1lbmdpbmUsCiAgICAgICAgdXNlX2dwdT11c2VfZ3B1LAogICAgICAgIHVzZV9zbWFsbF9tb2RlbHM9dXNlX3NtYWxsX21vZGVscywKICAgICAgICBvZmZsb2FkX2NwdT1vZmZsb2FkX2NwdSwKICAgICAgICBtb2RlbD1tb2RlbCwKICAgICAgICBmaWxlX2Zvcm1hdD1maWxlX2Zvcm1hdCwKICAgICAgICBzcGVlZD1zcGVlZAogICAgKQoKICAgICMgQ2hlY2sgZm9yIHBlciBjaGFubmVsIGd
lbmVyYXRpb246CiAgICBpZiBpc2luc3RhbmNlKHNwZWFrZXJzLCBkaWN0KToKICAgICAgICBzcGVha2VyX3Blcl9jaGFubmVsID0gVHJ1ZQogICAgICAgICMgU29ydCB0aGUgZ2l2ZW4gc3BlYWtlcnMgYnkgY2hhbm5lbHM6CiAgICAgICAgc3BlYWtlcnMgPSB7CiAgICAgICAgICAgIHNwZWFrZXI6IGNoYW5uZWwKICAgICAgICAgICAgZm9yIHNwZWFrZXIsIGNoYW5uZWwgaW4gc29ydGVkKHNwZWFrZXJzLml0ZW1zKCksIGtleT1sYW1iZGEgaXRlbTogaXRlbVsxXSkKICAgICAgICB9CiAgICBlbHNlOgogICAgICAgIHNwZWFrZXJfcGVyX2NoYW5uZWwgPSBGYWxzZQoKICAgICMgUHJlcGFyZSB0aGUgcmVzYW1wbGluZyBtb2R1bGU6CiAgICByZXNhbXBsZXIgPSB0b3JjaGF1ZGlvLnRyYW5zZm9ybXMuUmVzYW1wbGUoCiAgICAgICAgb3JpZ19mcmVxPVNBTVBMRV9SQVRFLCBuZXdfZnJlcT1zYW1wbGVfcmF0ZSwgZHR5cGU9dG9yY2guZmxvYXQzMgogICAgKQoKICAgICMgUHJlcGFyZSB0aGUgZ2FwIGJldHdlZW4gZWFjaCBzcGVha2VyOgogICAgZ2FwX2JldHdlZW5fc3BlYWtlcnMgPSBucC56ZXJvcyhpbnQoMC41ICogU0FNUExFX1JBVEUpKQoKICAgICMgUHJlcGFyZSB0aGUgc3VjY2Vzc2VzIGRhdGFmcmFtZSBhbmQgZXJyb3JzIGRpY3Rpb25hcnkgdG8gYmUgcmV0dXJuZWQ6CiAgICBzdWNjZXNzZXMgPSBbXQogICAgZXJyb3JzID0ge30KCiAgICAjIENyZWF0ZSB0aGUgb3V0cHV0IGRpcmVjdG9yeToKICAgIGlmIG91dHB1dF9kaXJlY3RvcnkgaXMgTm9uZToKICAgICAgICBvdXRwdXRfZGlyZWN0b3J5ID0gdGVtcGZpbGUubWtkdGVtcCgpCiAgICBvdXRwdXRfZGlyZWN0b3J5ID0gcGF0aGxpYi5QYXRoKG91dHB1dF9kaXJlY3RvcnkpCiAgICBpZiBub3Qgb3V0cHV0X2RpcmVjdG9yeS5leGlzdHMoKToKICAgICAgICBvdXRwdXRfZGlyZWN0b3J5Lm1rZGlyKGV4aXN0X29rPVRydWUsIHBhcmVudHM9VHJ1ZSkKCiAgICAjIFN0YXJ0IGdlbmVyYXRpbmcgYXVkaW86CiAgICAjIEdvIG92ZXIgdGhlIGF1ZGlvIGZpbGVzIGFuZCB0cmFuc2NyaWJlOgogICAgZm9yIHRleHRfZmlsZSBpbiB0cWRtLnRxZG0oCiAgICAgICAgdGV4dF9maWxlcywgZGVzYz0iR2VuZXJhdGluZyIsIHVuaXQ9ImZpbGUiLCBkaXNhYmxlPW5vdCB2ZXJib3NlCiAgICApOgoKICAgICAgICB0cnk6CiAgICAgICAgICAgICMgUmFuZG9taXplIHZvaWNlcyBmb3IgZWFjaCBzcGVha2VyOgogICAgICAgICAgICBjaG9zZW5fdm9pY2VzID0ge30KICAgICAgICAgICAgYXZhaWxhYmxlX3ZvaWNlc19jb3B5ID0gYXZhaWxhYmxlX3ZvaWNlcy5jb3B5KCkKICAgICAgICAgICAgZm9yIHNwZWFrZXIgaW4gc3BlYWtlcnM6CiAgICAgICAgICAgICAgICB2b2ljZSA9IHJhbmRvbS5jaG9pY2UoYXZhaWxhYmxlX3ZvaWNlc19jb3B5KQogICAgICAgICAgICAgICAgY2hvc2VuX3ZvaWNlc1tzcGVha2VyXSA9IHZvaWNlCiAgICAgICAgICAgICAgICBhdmFpbGFibGVfdm9pY2VzX2NvcHkucmVtb3Z
lKHZvaWNlKQogICAgICAgICAgICAjIFJlYWQgdGV4dDoKICAgICAgICAgICAgd2l0aCBvcGVuKHRleHRfZmlsZSwgInIiKSBhcyBmcDoKICAgICAgICAgICAgICAgIHRleHQgPSBmcC5yZWFkKCkKICAgICAgICAgICAgIyBQcmVwYXJlIGEgaG9sZGVyIGZvciBhbGwgdGhlIGdlbmVyYXRlZCBwaWVjZXMgKGlmIHBlciBjaGFubmVsIGVhY2ggc3BlYWtlciB3aWxsIGhhdmUgaXRzIG93bik6CiAgICAgICAgICAgIGF1ZGlvX3BpZWNlcyA9ICgKICAgICAgICAgICAgICAgIHtzcGVha2VyOiBbXSBmb3Igc3BlYWtlciBpbiBzcGVha2Vyc30KICAgICAgICAgICAgICAgIGlmIHNwZWFrZXJfcGVyX2NoYW5uZWwKICAgICAgICAgICAgICAgIGVsc2UgeyJhbGwiOiBbXX0KICAgICAgICAgICAgKQoKICAgICAgICAgICAgIyBHZW5lcmF0ZSBhdWRpbyBwZXIgbGluZToKICAgICAgICAgICAgZm9yIGxpbmUgaW4gdGV4dC5zcGxpdGxpbmVzKCk6CiAgICAgICAgICAgICAgICAjIFZhbGlkYXRlIGxpbmUgaXMgaW4gY29ycmVjdCBzcGVha2VyIGZvcm1hdDoKCiAgICAgICAgICAgICAgICBpZiAiOiAiIG5vdCBpbiBsaW5lOgogICAgICAgICAgICAgICAgICAgIGlmIHZlcmJvc2U6CiAgICAgICAgICAgICAgICAgICAgICAgIF9MT0dHRVIud2FybmluZyhmIlNraXBwaW5nIGxpbmU6IHtsaW5lfSIpCiAgICAgICAgICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgICAgICMgU3BsaXQgbGluZSB0byBzcGVha2VyIGFuZCBoaXMgd29yZHM6CiAgICAgICAgICAgICAgICBjdXJyZW50X3NwZWFrZXIsIHNlbnRlbmNlcyA9IGxpbmUuc3BsaXQoIjogIiwgMSkKICAgICAgICAgICAgICAgICMgVmFsaWRhdGUgc3BlYWtlciBpcyBrbm93bjoKICAgICAgICAgICAgICAgIGlmIGN1cnJlbnRfc3BlYWtlciBub3QgaW4gc3BlYWtlcnM6CiAgICAgICAgICAgICAgICAgICAgcmFpc2UgVmFsdWVFcnJvcigKICAgICAgICAgICAgICAgICAgICAgICAgZiJVbmtub3duIHNwZWFrZXI6IHtjdXJyZW50X3NwZWFrZXJ9LiBHaXZlbiBzcGVha2VycyBhcmU6IHtzcGVha2Vyc30iCiAgICAgICAgICAgICAgICAgICAgKQogICAgICAgICAgICAgICAgZm9yIHNlbnRlbmNlIGluIF9zcGxpdF9saW5lKGxpbmU9c2VudGVuY2VzKToKICAgICAgICAgICAgICAgICAgICAjIEdlbmVyYXRlIHdvcmRzIGF1ZGlvOgogICAgICAgICAgICAgICAgICAgIGF1ZGlvID0gZW5naW5lLl9nZW5lcmF0ZV9hdWRpbygKICAgICAgICAgICAgICAgICAgICAgICAgdGV4dD1zZW50ZW5jZSwKICAgICAgICAgICAgICAgICAgICAgICAgdm9pY2U9Y2hvc2VuX3ZvaWNlc1tjdXJyZW50X3NwZWFrZXJdLAogICAgICAgICAgICAgICAgICAgICkKCiAgICAgICAgICAgICAgICAgICAgaWYgc3BlYWtlcl9wZXJfY2hhbm5lbDoKICAgICAgICAgICAgICAgICAgICAgICAgc2lsZW5jZSA9IG5wLnplcm9zX2xpa2UoYXVkaW8pCiAgICAgICAgICAgICAgICAgICAgICAgIGZvciBzcGVha2VyIGluIGF1ZGlvX3BpZWN
lcy5rZXlzKCk6CiAgICAgICAgICAgICAgICAgICAgICAgICAgICBpZiBzcGVha2VyID09IGN1cnJlbnRfc3BlYWtlcjoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBhdWRpb19waWVjZXNbc3BlYWtlcl0gKz0gW2F1ZGlvLCBnYXBfYmV0d2Vlbl9zcGVha2Vyc10KICAgICAgICAgICAgICAgICAgICAgICAgICAgIGVsc2U6CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYXVkaW9fcGllY2VzW3NwZWFrZXJdICs9IFtzaWxlbmNlLCBnYXBfYmV0d2Vlbl9zcGVha2Vyc10KICAgICAgICAgICAgICAgICAgICBlbHNlOgogICAgICAgICAgICAgICAgICAgICAgICBhdWRpb19waWVjZXNbImFsbCJdICs9IFthdWRpbywgZ2FwX2JldHdlZW5fc3BlYWtlcnNdCiAgICAgICAgICAgICMgQ29uc3RydWN0IGEgc2luZ2xlIGF1ZGlvIGFycmF5IGZyb20gYWxsIHRoZSBwaWVjZXMgYW5kIGNoYW5uZWxzOgoKICAgICAgICAgICAgYXVkaW8gPSBucC52c3RhY2soCiAgICAgICAgICAgICAgICBbbnAuY29uY2F0ZW5hdGUoYXVkaW9fcGllY2VzW3NwZWFrZXJdKSBmb3Igc3BlYWtlciBpbiBzcGVha2Vyc10KICAgICAgICAgICAgKS5hc3R5cGUoZHR5cGU9bnAuZmxvYXQzMikKICAgICAgICAgICAgIyBSZXNhbXBsZToKICAgICAgICAgICAgYXVkaW8gPSB0b3JjaC5mcm9tX251bXB5KGF1ZGlvKQogICAgICAgICAgICBhdWRpbyA9IHJlc2FtcGxlcihhdWRpbykKICAgICAgICAgICAgIyBTYXZlIHRvIGF1ZGlvIGZpbGU6CiAgICAgICAgICAgIGF1ZGlvX2ZpbGUgPSBvdXRwdXRfZGlyZWN0b3J5IC8gZiJ7dGV4dF9maWxlLnN0ZW19LntmaWxlX2Zvcm1hdH0iCgogICAgICAgICAgICB0b3JjaGF1ZGlvLnNhdmUoCiAgICAgICAgICAgICAgICB1cmk9c3RyKGF1ZGlvX2ZpbGUpLAogICAgICAgICAgICAgICAgc3JjPWF1ZGlvLAogICAgICAgICAgICAgICAgc2FtcGxlX3JhdGU9c2FtcGxlX3JhdGUsCiAgICAgICAgICAgICAgICBmb3JtYXQ9ZmlsZV9mb3JtYXQsCiAgICAgICAgICAgICAgICBiaXRzX3Blcl9zYW1wbGU9Yml0c19wZXJfc2FtcGxlLAogICAgICAgICAgICApCgogICAgICAgICAgICAjIENvbGxlY3QgdG8gdGhlIHN1Y2Nlc3NlczoKICAgICAgICAgICAgc3VjY2Vzc2VzLmFwcGVuZChbdGV4dF9maWxlLm5hbWUsIGF1ZGlvX2ZpbGUubmFtZV0pCiAgICAgICAgZXhjZXB0IEV4Y2VwdGlvbiBhcyBleGNlcHRpb246CiAgICAgICAgICAgICMgTm90ZSB0aGUgZXhjZXB0aW9uIGFzIGVycm9yIGluIHRoZSBkaWN0aW9uYXJ5OgogICAgICAgICAgICBpZiB2ZXJib3NlOgogICAgICAgICAgICAgICAgX0xPR0dFUi53YXJuaW5nKGYiRXJyb3IgaW4gZmlsZTogJ3t0ZXh0X2ZpbGUubmFtZX0nIikKICAgICAgICAgICAgcHJpbnQoZXhjZXB0aW9uKQogICAgICAgICAgICBlcnJvcnNbdGV4dF9maWxlLm5hbWVdID0gc3RyKGV4Y2VwdGlvbikKCiAgICAjIENvbnN0cnVjdCB0aGUgdHJhbnNsYXRpb25zIGRhdGFmcmFtZToKICAgIHN1Y2Nlc3N
lcyA9IHBkLkRhdGFGcmFtZSgKICAgICAgICBzdWNjZXNzZXMsCiAgICAgICAgY29sdW1ucz1bInRleHRfZmlsZSIsICJhdWRpb19maWxlIl0sCiAgICApCgogICAgIyBQcmludCB0aGUgaGVhZCBvZiB0aGUgcHJvZHVjZWQgZGF0YWZyYW1lIGFuZCByZXR1cm46CiAgICBpZiB2ZXJib3NlOgogICAgICAgIF9MT0dHRVIuaW5mbygKICAgICAgICAgICAgZiJEb25lICh7c3VjY2Vzc2VzLnNoYXBlWzBdfS97bGVuKHRleHRfZmlsZXMpfSlcbiIKICAgICAgICAgICAgZiJUcmFuc2xhdGlvbnMgc3VtbWFyeTpcbiIKICAgICAgICAgICAgZiJ7c3VjY2Vzc2VzLmhlYWQoKX0iCiAgICAgICAgKQogICAgcmV0dXJuIHN0cihvdXRwdXRfZGlyZWN0b3J5KSwgc3VjY2Vzc2VzLCBlcnJvcnMKCgpjbGFzcyBTcGVlY2hFbmdpbmUoQUJDKToKICAgIEBhYnN0cmFjdG1ldGhvZAogICAgZGVmIF9nZW5lcmF0ZV9hdWRpbyhzZWxmLCB0ZXh0OiBzdHIsIHZvaWNlOiBzdHIpIC0+IG5wLm5kYXJyYXk6CiAgICAgICAgcGFzcwoKCmNsYXNzIEJhcmtFbmdpbmUoU3BlZWNoRW5naW5lKToKICAgIGRlZiBfX2luaXRfXyhzZWxmLCB1c2VfZ3B1OiBib29sID0gVHJ1ZSwgdXNlX3NtYWxsX21vZGVsczogYm9vbCA9IEZhbHNlLCBvZmZsb2FkX2NwdTogYm9vbCA9IEZhbHNlKToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHNlbGYuYmFyayA9IGltcG9ydGxpYi5pbXBvcnRfbW9kdWxlKCJiYXJrIikKICAgICAgICBleGNlcHQgSW1wb3J0RXJyb3I6CiAgICAgICAgICAgIHJhaXNlIEltcG9ydEVycm9yKAogICAgICAgICAgICAgICAgIlRoZSAnYmFyaycgbGlicmFyeSBpcyByZXF1aXJlZCBmb3IgdGhlIEJhcmtFbmdpbmUuIFBsZWFzZSBpbnN0YWxsIGl0IHVzaW5nICdwaXAgaW5zdGFsbCBiYXJrLWFpJy4iCiAgICAgICAgICAgICkKCiAgICAgICAgc2VsZi5iYXJrLnByZWxvYWRfbW9kZWxzKAogICAgICAgICAgICB0ZXh0X3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgdGV4dF91c2Vfc21hbGw9dXNlX3NtYWxsX21vZGVscywKICAgICAgICAgICAgY29hcnNlX3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgY29hcnNlX3VzZV9zbWFsbD11c2Vfc21hbGxfbW9kZWxzLAogICAgICAgICAgICBmaW5lX3VzZV9ncHU9dXNlX2dwdSwKICAgICAgICAgICAgZmluZV91c2Vfc21hbGw9dXNlX3NtYWxsX21vZGVscywKICAgICAgICAgICAgY29kZWNfdXNlX2dwdT11c2VfZ3B1LAogICAgICAgICAgICBmb3JjZV9yZWxvYWQ9b2ZmbG9hZF9jcHUsCiAgICAgICAgKQoKICAgIGRlZiBfZ2VuZXJhdGVfYXVkaW8oc2VsZiwgdGV4dDogc3RyLCB2b2ljZTogc3RyKSAtPiBucC5uZGFycmF5OgogICAgICAgICMgR2VuZXJhdGUgd29yZHMgYXVkaW86CiAgICAgICAgYXVkaW8gPSBzZWxmLmJhcmsuZ2VuZXJhdGVfYXVkaW8oCiAgICAgICAgICAgIHRleHQsCiAgICAgICAgICAgIGhpc3RvcnlfcHJvbXB0PXZvaWNlLAogICAgICAgICAgICBzaWxlbnQ9VHJ1ZSwKICAgICAgICApCiA
gICAgICAgcmV0dXJuIGF1ZGlvCgoKY2xhc3MgT3BlbkFJRW5naW5lKFNwZWVjaEVuZ2luZSk6CiAgICBkZWYgX19pbml0X18oc2VsZiwgbW9kZWw6IHN0ciA9ICJ0dHMtMSIsIGZpbGVfZm9ybWF0OiBzdHIgPSAid2F2Iiwgc3BlZWQ6IGZsb2F0ID0gMS4wKToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHNlbGYub3BlbmFpID0gaW1wb3J0bGliLmltcG9ydF9tb2R1bGUoIm9wZW5haSIpCiAgICAgICAgICAgIHNlbGYucHlkdWIgPSBpbXBvcnRsaWIuaW1wb3J0X21vZHVsZSgicHlkdWIiKQogICAgICAgIGV4Y2VwdCBJbXBvcnRFcnJvcjoKICAgICAgICAgICAgcmFpc2UgSW1wb3J0RXJyb3IoCiAgICAgICAgICAgICAgICAiVGhlICdvcGVuYWknIGFuZCAncHlkdWInIGxpYnJhcmllcyBhcmUgcmVxdWlyZWQgZm9yIHRoZSBPcGVuQUlFbmdpbmUuIFBsZWFzZSBpbnN0YWxsIHRoZW0gdXNpbmcgJ3BpcCBpbnN0YWxsIG9wZW5haSBweWR1YicuIgogICAgICAgICAgICApCgogICAgICAgIGFwaV9rZXkgPSBvcy5nZXRlbnYoT1BFTkFJX0FQSV9LRVkpCiAgICAgICAgYmFzZV91cmwgPSBvcy5nZXRlbnYoT1BFTkFJX0JBU0VfVVJMKQogICAgICAgICMgQ2hlY2sgaWYgdGhlIGtleSBpcyBhbHJlYWR5IGluIHRoZSBlbnZpcm9ubWVudCB2YXJpYWJsZXM6CiAgICAgICAgaWYgbm90IGFwaV9rZXkgb3Igbm90IGJhc2VfdXJsOgogICAgICAgICAgICB0cnk6CiAgICAgICAgICAgICAgICBpbXBvcnQgbWxydW4KCiAgICAgICAgICAgICAgICBjb250ZXh0ID0gbWxydW4uZ2V0X29yX2NyZWF0ZV9jdHgobmFtZT0iY29udGV4dCIpCiAgICAgICAgICAgICAgICAjIENoZWNrIGlmIHRoZSBrZXkgaXMgaW4gdGhlIHNlY3JldHM6CiAgICAgICAgICAgICAgICBhcGlfa2V5ID0gY29udGV4dC5nZXRfc2VjcmV0KE9QRU5BSV9BUElfS0VZKQogICAgICAgICAgICAgICAgYmFzZV91cmwgPSBjb250ZXh0LmdldF9zZWNyZXQoT1BFTkFJX0JBU0VfVVJMKQogICAgICAgICAgICBleGNlcHQgTW9kdWxlTm90Rm91bmRFcnJvcjoKICAgICAgICAgICAgICAgIHJhaXNlIEVudmlyb25tZW50RXJyb3IoCiAgICAgICAgICAgICAgICAgICAgZiJPbmUgb3IgbW9yZSBvZiB0aGUgT3BlbkFJIHJlcXVpcmVkIGVudmlyb25tZW50IHZhcmlhYmxlcyAoJ3tPUEVOQUlfQVBJX0tFWX0nLCAne09QRU5BSV9CQVNFX1VSTH0nKSBhcmUgbWlzc2luZy4iCiAgICAgICAgICAgICAgICAgICAgZiJQbGVhc2Ugc2V0IHRoZW0gYXMgZW52aXJvbm1lbnQgdmFyaWFibGVzIG9yIGluc3RhbGwgbWxydW4gKGBwaXAgaW5zdGFsbCBtbHJ1bmApIgogICAgICAgICAgICAgICAgICAgIGYiYW5kIHNldCB0aGVtIGFzIHByb2plY3Qgc2VjcmV0cyB1c2luZyBgcHJvamVjdC5zZXRfc2VjcmV0c2AuIgogICAgICAgICAgICAgICAgKQoKICAgICAgICBzZWxmLmNsaWVudCA9IHNlbGYub3BlbmFpLk9wZW5BSShhcGlfa2V5PWFwaV9rZXksIGJhc2VfdXJsPWJhc2VfdXJsKQogICAgICAgIHNlbGYubW9kZWw
gPSBtb2RlbAogICAgICAgIHNlbGYuZmlsZV9mb3JtYXQgPSBmaWxlX2Zvcm1hdAogICAgICAgIHNlbGYuc3BlZWQgPSBzcGVlZAoKICAgIGRlZiBfZ2VuZXJhdGVfYXVkaW8oc2VsZiwgdGV4dDogc3RyLCB2b2ljZTogc3RyKSAtPiBucC5uZGFycmF5OgogICAgICAgICMgR2VuZXJhdGUgd29yZHMgYXVkaW86CiAgICAgICAgYXVkaW8gPSBzZWxmLmNsaWVudC5hdWRpby5zcGVlY2guY3JlYXRlKAogICAgICAgICAgICBtb2RlbD1zZWxmLm1vZGVsLAogICAgICAgICAgICBpbnB1dD10ZXh0LAogICAgICAgICAgICB2b2ljZT12b2ljZSwKICAgICAgICAgICAgcmVzcG9uc2VfZm9ybWF0PXNlbGYuZmlsZV9mb3JtYXQsCiAgICAgICAgICAgIHNwZWVkPXNlbGYuc3BlZWQsCiAgICAgICAgKQogICAgICAgIGF1ZGlvID0gYXVkaW8uY29udGVudAogICAgICAgIGF1ZGlvID0gc2VsZi5fYnl0ZXNfdG9fbnBfYXJyYXkoYXVkaW89YXVkaW8pCiAgICAgICAgcmV0dXJuIGF1ZGlvCgogICAgZGVmIF9ieXRlc190b19ucF9hcnJheShzZWxmLCBhdWRpbzogYnl0ZXMpOgogICAgICAgIGlmIHNlbGYuZmlsZV9mb3JtYXQgPT0gIm1wMyI6CiAgICAgICAgICAgIGF1ZGlvX3NlZ21lbnQgPSBzZWxmLnB5ZHViLkF1ZGlvU2VnbWVudC5mcm9tX21wMyhpby5CeXRlc0lPKGF1ZGlvKSkKCiAgICAgICAgICAgICMgQ29udmVydCB0byByYXcgUENNIGF1ZGlvIGRhdGEKICAgICAgICAgICAgc2FtcGxlcyA9IGF1ZGlvX3NlZ21lbnQuZ2V0X2FycmF5X29mX3NhbXBsZXMoKQoKICAgICAgICAgICAgIyBDb252ZXJ0IHRvIG51bXB5IGFycmF5CiAgICAgICAgICAgIGF1ZGlvX2FycmF5ID0gbnAuYXJyYXkoc2FtcGxlcykKCiAgICAgICAgICAgICMgTm9ybWFsaXplIHRvIGZsb2F0IGJldHdlZW4gLTEgYW5kIDEKICAgICAgICAgICAgcmV0dXJuIGF1ZGlvX2FycmF5LmFzdHlwZShucC5mbG9hdDMyKSAvIG5wLmlpbmZvKHNhbXBsZXMudHlwZWNvZGUpLm1heAogICAgICAgIGVsc2U6CiAgICAgICAgICAgIHJldHVybiBucC5mcm9tYnVmZmVyKGF1ZGlvLCBkdHlwZT1ucC5pbnQxNikgLyAzMjc2OC4wCgoKZGVmIF9nZXRfZW5naW5lKGVuZ2luZTogc3RyLCBmaWxlX2Zvcm1hdDogc3RyLCAqKmt3YXJncykgLT4gU3BlZWNoRW5naW5lOgogICAgIyBlbGltaW5hdGUgdGhlIE5vbmUgdmFsdWVzOgogICAga3dhcmdzID0ge2tleTogdmFsdWUgZm9yIGtleSwgdmFsdWUgaW4ga3dhcmdzLml0ZW1zKCkgaWYgdmFsdWUgaXMgbm90IE5vbmV9CgogICAgaWYgZW5naW5lID09ICJiYXJrIjoKICAgICAgICByZXR1cm4gQmFya0VuZ2luZSgqKmt3YXJncykKICAgIGVsaWYgZW5naW5lID09ICJvcGVuYWkiOgogICAgICAgIHJldHVybiBPcGVuQUlFbmdpbmUoZmlsZV9mb3JtYXQ9ZmlsZV9mb3JtYXQsICoqa3dhcmdzKQogICAgZWxzZToKICAgICAgICByYWlzZSBWYWx1ZUVycm9yKAogICAgICAgICAgICBmIlVucmVjb2duaXplZCBlbmdpbmUuIFRoZSBwYXJhbWV0ZXIgYGVuZ2luZWAgbXV
zdCBiZSBlaXRoZXIgJ2JhcmsnIG9yICdvcGVuYWknLiBHaXZlbjoge2VuZ2luZX0iCiAgICAgICAgKQoKZGVmIF9nZXRfdGV4dF9maWxlcygKICAgIGRhdGFfcGF0aDogcGF0aGxpYi5QYXRoLAopIC0+IExpc3RbcGF0aGxpYi5QYXRoXToKICAgICMgQ2hlY2sgaWYgdGhlIHBhdGggaXMgb2YgYSBkaXJlY3Rvcnkgb3IgYSBmaWxlOgogICAgaWYgZGF0YV9wYXRoLmlzX2RpcigpOgogICAgICAgICMgR2V0IGFsbCBmaWxlcyBpbnNpZGUgdGhlIGRpcmVjdG9yeToKICAgICAgICB0ZXh0X2ZpbGVzID0gbGlzdChkYXRhX3BhdGguZ2xvYigiKi4qIikpCiAgICBlbGlmIGRhdGFfcGF0aC5pc19maWxlKCk6CiAgICAgICAgdGV4dF9maWxlcyA9IFtkYXRhX3BhdGhdCiAgICBlbHNlOgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoCiAgICAgICAgICAgIGYiVW5yZWNvZ25pemVkIGRhdGEgcGF0aC4gVGhlIHBhcmFtZXRlciBgZGF0YV9wYXRoYCBtdXN0IGJlIGVpdGhlciBhIGRpcmVjdG9yeSBwYXRoIG9yIGEgZmlsZSBwYXRoLiAiCiAgICAgICAgICAgIGYiR2l2ZW46IHtzdHIoZGF0YV9wYXRoKX0gIgogICAgICAgICkKCiAgICByZXR1cm4gdGV4dF9maWxlcwoKCmRlZiBfc3BsaXRfbGluZShsaW5lOiBzdHIsIG1heF9sZW5ndGg6IGludCA9IDI1MCkgLT4gTGlzdFtzdHJdOgogICAgaWYgbGVuKGxpbmUpIDwgbWF4X2xlbmd0aDoKICAgICAgICByZXR1cm4gW2xpbmVdCgogICAgc2VudGVuY2VzID0gWwogICAgICAgIGYie3NlbnRlbmNlLnN0cmlwKCl9LiIgZm9yIHNlbnRlbmNlIGluIGxpbmUuc3BsaXQoIi4iKSBpZiBzZW50ZW5jZS5zdHJpcCgpCiAgICBdCgogICAgc3BsaXRzID0gW10KICAgIGN1cnJlbnRfbGVuZ3RoID0gbGVuKHNlbnRlbmNlc1swXSkKICAgIHNwbGl0ID0gc2VudGVuY2VzWzBdCiAgICBmb3Igc2VudGVuY2UgaW4gc2VudGVuY2VzWzE6XToKICAgICAgICBpZiBjdXJyZW50X2xlbmd0aCArIGxlbihzZW50ZW5jZSkgPiBtYXhfbGVuZ3RoOgogICAgICAgICAgICBzcGxpdHMuYXBwZW5kKHNwbGl0KQogICAgICAgICAgICBzcGxpdCA9IHNlbnRlbmNlCiAgICAgICAgICAgIGN1cnJlbnRfbGVuZ3RoID0gbGVuKHNlbnRlbmNlKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGN1cnJlbnRfbGVuZ3RoICs9IGxlbihzZW50ZW5jZSkKICAgICAgICAgICAgc3BsaXQgKz0gIiAiICsgc2VudGVuY2UKICAgIGlmIHNwbGl0OgogICAgICAgIHNwbGl0cy5hcHBlbmQoc3BsaXQpCgogICAgcmV0dXJuIHNwbGl0cwoKCmRlZiBfZ2V0X2xvZ2dlcigpOgogICAgZ2xvYmFsIF9MT0dHRVIKICAgIHRyeToKICAgICAgICBpbXBvcnQgbWxydW4KCiAgICAgICAgIyBDaGVjayBpZiBNTFJ1biBpcyBhdmFpbGFibGU6CiAgICAgICAgY29udGV4dCA9IG1scnVuLmdldF9vcl9jcmVhdGVfY3R4KG5hbWU9Im1scnVuIikKICAgICAgICByZXR1cm4gY29udGV4dC5sb2dnZXIKICAgIGV4Y2VwdCBNb2R1bGVOb3RGb3VuZEVycm9yOgogICAgICAgIHJldHVybiB
fTE9HR0VSCg==
    +metadata:
    +  categories:
    +  - data-generation
    +  - audio
    +  tag: ''
    +  name: text-to-audio-generator
     kind: job
    +verbose: false
     
             
         
    diff --git a/functions/master/text_to_audio_generator/latest/static/item.html b/functions/master/text_to_audio_generator/latest/static/item.html index 4fc64dfb..2aeea892 100644 --- a/functions/master/text_to_audio_generator/latest/static/item.html +++ b/functions/master/text_to_audio_generator/latest/static/item.html @@ -30,9 +30,8 @@ apiVersion: v1 categories: -- data-preparation -- machine-learning -- pytorch +- data-generation +- audio description: Generate audio file from text using different speakers doc: '' example: text_to_audio_generator.ipynb diff --git a/functions/master/text_to_audio_generator/latest/static/text_to_audio_generator.html b/functions/master/text_to_audio_generator/latest/static/text_to_audio_generator.html index c9c93a42..f807a73b 100644 --- a/functions/master/text_to_audio_generator/latest/static/text_to_audio_generator.html +++ b/functions/master/text_to_audio_generator/latest/static/text_to_audio_generator.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tf2_serving/1.1.0/static/documentation.html b/functions/master/tf2_serving/1.1.0/static/documentation.html index 7124689a..b92a2870 100644 --- a/functions/master/tf2_serving/1.1.0/static/documentation.html +++ b/functions/master/tf2_serving/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tf2_serving/1.1.0/static/example.html b/functions/master/tf2_serving/1.1.0/static/example.html index bded5cf6..a27ca4ac 100644 --- a/functions/master/tf2_serving/1.1.0/static/example.html +++ b/functions/master/tf2_serving/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tf2_serving/1.1.0/static/tf2_serving.html b/functions/master/tf2_serving/1.1.0/static/tf2_serving.html index 9e33ec1e..de5fa741 100644 --- a/functions/master/tf2_serving/1.1.0/static/tf2_serving.html +++ b/functions/master/tf2_serving/1.1.0/static/tf2_serving.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tf2_serving/latest/static/documentation.html 
b/functions/master/tf2_serving/latest/static/documentation.html index 7124689a..b92a2870 100644 --- a/functions/master/tf2_serving/latest/static/documentation.html +++ b/functions/master/tf2_serving/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tf2_serving/latest/static/example.html b/functions/master/tf2_serving/latest/static/example.html index bded5cf6..a27ca4ac 100644 --- a/functions/master/tf2_serving/latest/static/example.html +++ b/functions/master/tf2_serving/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/tf2_serving/latest/static/tf2_serving.html b/functions/master/tf2_serving/latest/static/tf2_serving.html index 9e33ec1e..de5fa741 100644 --- a/functions/master/tf2_serving/latest/static/tf2_serving.html +++ b/functions/master/tf2_serving/latest/static/tf2_serving.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/transcribe/1.2.0/src/data/error_file.txt b/functions/master/transcribe/1.2.0/src/data/error_file.txt new file mode 100644 index 00000000..e69de29b diff --git a/functions/master/transcribe/1.2.0/src/data/speech_01.mp3 b/functions/master/transcribe/1.2.0/src/data/speech_01.mp3 new file mode 100644 index 00000000..ae0e5c82 Binary files /dev/null and b/functions/master/transcribe/1.2.0/src/data/speech_01.mp3 differ diff --git a/functions/master/transcribe/1.2.0/src/data/speech_02.mp3 b/functions/master/transcribe/1.2.0/src/data/speech_02.mp3 new file mode 100644 index 00000000..1d5e6c03 Binary files /dev/null and b/functions/master/transcribe/1.2.0/src/data/speech_02.mp3 differ diff --git a/functions/master/transcribe/1.2.0/src/function.yaml b/functions/master/transcribe/1.2.0/src/function.yaml new file mode 100644 index 00000000..43e9b3a8 --- /dev/null +++ b/functions/master/transcribe/1.2.0/src/function.yaml @@ -0,0 +1,285 @@ +kind: job +metadata: + categories: + - audio + - genai + tag: '' + name: transcribe +verbose: false +spec: + build: + origin_filename: '' + 
requirements: + - transformers + - tqdm + - torchaudio + - torch + - accelerate + base_image: mlrun/mlrun + code_origin: '' + functionSourceCode:  + disable_auto_mount: false + description: Transcribe audio files into text files + image: '' + command: '' + default_handler: transcribe + entry_points: + do_task: + name: do_task + doc: Try to perform the task storing an error if occurred. + lineno: 348 + parameters: + - name: self + has_varargs: false + has_kwargs: false + is_failed: + name: is_failed + doc: Check if the task failed. + lineno: 70 + parameters: + - name: self + has_varargs: false + has_kwargs: false + outputs: + - doc: Whether the task failed. + type: bool + get_result: + name: get_result + doc: 'Get the result of the task. If the task failed, the error will be returned, + otherwise, the result will be the + + text file name.' + lineno: 78 + parameters: + - name: self + has_varargs: false + has_kwargs: false + outputs: + - doc: The task's result. + type: Tuple[str, str] + to_tuple: + name: to_tuple + doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing + to pass in queue). + lineno: 358 + parameters: + - name: self + has_varargs: false + has_kwargs: false + outputs: + - doc: The converted task. + type: Tuple[str, dict] + transcription_output_channels: + name: transcription_output_channels + doc: Get the transcription output channels. + lineno: 340 + parameters: + - name: self + has_varargs: false + has_kwargs: false + outputs: + - doc: The transcription output channels. + type: List[Tuple[str, dict]] + process_batch: + name: process_batch + doc: 'Process a batch of transcriptions. Tasks related to the given batch will + be created and stored in the batch + + processor.' + lineno: 575 + parameters: + - name: self + - name: batch + type: List[dict] + doc: The batch of transcriptions to process. + has_varargs: false + has_kwargs: false + get_tasks: + name: get_tasks + doc: Get the tasks to perform. 
+ lineno: 453 + parameters: + - name: self + has_varargs: false + has_kwargs: false + outputs: + - doc: The tasks to perform. + type: List[BaseTask] + do_tasks: + name: do_tasks + doc: Perform the tasks. Should be used if no multiprocessing queue is given + to a transcriber. + lineno: 463 + parameters: + - name: self + has_varargs: false + has_kwargs: false + get_results: + name: get_results + doc: Get the results of the tasks. The stored results are then cleared. + lineno: 471 + parameters: + - name: self + has_varargs: false + has_kwargs: false + outputs: + - doc: The results of the tasks. + type: List[Tuple[bool, Tuple[str, str]]] + load: + name: load + doc: Load the transcriber. Must be called before transcribing. + lineno: 695 + parameters: + - name: self + has_varargs: false + has_kwargs: false + transcribe: + name: transcribe + doc: "Transcribe audio files into text files and collect additional data. The\ + \ end result is a directory of transcribed\ntext files and a dataframe containing\ + \ the following columns:\n\n* audio_file - The audio file path.\n* transcription_file\ + \ - The transcribed text file name in the output directory.\n\nThe transcription\ + \ is based on Huggingface's ASR pipeline -\nhttps://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline\ + \ and\nis tested with OpenAI's Whisper models - https://huggingface.co/openai.\n\ + \nIf one of the speaker diarization parameters are given (either `speech_diarization`\ + \ or\n`speech_diarize_per_channel`), the transcription will be written in\ + \ a conversation format, where each speaker will\nbe written in a separate\ + \ line::\n\n speaker_1: text\n speaker_2: text\n speaker_1: text\n\ + \ ..." + lineno: 1097 + parameters: + - name: data_path + type: Union[str, Path, List[Union[str, Path]]] + doc: A directory of audio files or a single file or a list of files to transcribe. 
+ - name: output_directory + type: str + doc: Path to a directory to save all transcribed audio files. If not given, + will save the transcribed files in a temporary directory. + default: null + - name: model_name + type: str + doc: 'The model name to use. Should be a model from the OpenAI''s Whisper + models for best results (for example "tiny", "base", "large", etc.). See + here for more information: https://huggingface.co/openai?search_models=whisper.' + default: openai/whisper-tiny + - name: device + type: str + doc: The device to use for inference. If not given, will use GPU if available. + default: null + - name: use_flash_attention_2 + type: bool + doc: 'Whether to use the Flash Attention 2 implementation. It can be used + only with one of the following GPUs: Nvidia H series and Nvidia A series. + T4 support will be available soon.' + default: null + - name: use_better_transformers + type: bool + doc: Whether to use the Better Transformers library to further optimize the + model. Should be used for all use cases that do not support flash attention + 2. + default: null + - name: assistant_model + type: str + doc: 'The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant + as well. Should be a model from Huggingface''s distil-whisper (see here + for more information: https://github.com/huggingface/distil-whisper).' + default: null + - name: max_new_tokens + type: int + doc: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. + default: 128 + - name: chunk_length_s + type: int + doc: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + default: 30 + - name: batch_size + type: int + doc: The batch size to use for inference. Default is 2. + default: 8 + - name: spoken_language + type: str + doc: Aim whisper to know what language is spoken. If None, it will try to + detect it. 
+ default: null + - name: translate_to_english + type: bool + doc: Whether to translate the transcriptions to English. + default: false + - name: speech_diarization + type: Dict[str, List[Tuple[float, float, str]]] + doc: 'A speech diarization dictionary with the file names to transcribe as + keys and their diarization as value. The diarization is a list of tuples: + (start, end, speaker). An example for a diarization dictionary::' + default: null + - name: speech_diarize_per_channel + type: int + doc: 'Perform speech diarization per channel. Each speaker is expected to + belong to a separate channel in the audio. Notice: This will make the transcription + slower as each channel wil be transcribed separatly. If a speech diarization + is passed (via the `speech_diarization` parameter), this parameter is ignored.' + default: null + - name: speaker_labels + type: List[str] + doc: A list of speaker labels by channel order to use for writing the transcription + with respect to per channel speech diarization. This won't be used together + with a given speech diarization (via the `speech_diarization` parameter). + default: null + - name: use_multiprocessing + type: Union[bool, int] + doc: 'Whether to use multiprocessing to transcribe the audio files. Can be + either a boolean value or an integer. If `True`, will use the default amount + of workers (3): 1 for transcription, 1 for batch processing and 1 for task + completion (such as speech diarization and writing to files). To control + the amount of tasks completion workers, an integer can be provided to specify + the amount of workers. `False`, will use a single process. Default is `False`.' + default: false + - name: verbose + type: bool + doc: Whether to print the progress of the transcription. Default is `False`. 
+ default: false + has_varargs: false + has_kwargs: false + audio_iterator: + name: audio_iterator + doc: '' + lineno: 804 + has_varargs: false + has_kwargs: false + outputs: + - type: Generator[Union[dict, str], None, None] + batch_iterator: + name: batch_iterator + doc: '' + lineno: 816 + has_varargs: false + has_kwargs: false + outputs: + - type: Generator[List[Union[dict, str]], None, None] + open_mpi_handler: + name: open_mpi_handler + doc: '' + lineno: 957 + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + has_varargs: false + has_kwargs: false + decorator: + name: decorator + doc: '' + lineno: 969 + parameters: + - name: handler + has_varargs: false + has_kwargs: false + wrapper: + name: wrapper + doc: '' + lineno: 974 + has_varargs: false + has_kwargs: true diff --git a/functions/master/transcribe/1.2.0/src/item.yaml b/functions/master/transcribe/1.2.0/src/item.yaml new file mode 100644 index 00000000..6deaf710 --- /dev/null +++ b/functions/master/transcribe/1.2.0/src/item.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +categories: +- audio +- genai +description: Transcribe audio files into text files +doc: '' +example: transcribe.ipynb +generationDate: 2023-07-13:11-20 +hidden: false +icon: '' +labels: + author: yonatans +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: transcribe +platformVersion: 3.5.3 +spec: + filename: transcribe.py + handler: transcribe + image: mlrun/mlrun + kind: job + requirements: + - transformers + - tqdm + - torchaudio + - torch + - accelerate +url: '' +version: 1.2.0 \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/src/requirements.txt b/functions/master/transcribe/1.2.0/src/requirements.txt new file mode 100644 index 00000000..d16bfc9d --- /dev/null +++ b/functions/master/transcribe/1.2.0/src/requirements.txt @@ -0,0 +1,5 @@ +transformers +torch +torchaudio +tqdm +accelerate \ No newline at end of file diff --git 
a/functions/master/transcribe/1.2.0/src/test_transcribe.py b/functions/master/transcribe/1.2.0/src/test_transcribe.py new file mode 100644 index 00000000..f70b3856 --- /dev/null +++ b/functions/master/transcribe/1.2.0/src/test_transcribe.py @@ -0,0 +1,104 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import pathlib +import tempfile +from difflib import SequenceMatcher + +import mlrun +import pytest + + +expected_outputs = [ + "This is a speech to text test.", + "In the heart of the stadium, " + "cheers paint the air as the ball weaves its tale across the pitch. " + "With each kick, players chase their dreams, guided by the rhythmic dance of teamwork. 
" + "The crowd roars, a symphony of passion, " + "as the game writes its unpredictable story on the field of destiny.", +] +models = [ + + "openai/whisper-tiny", +] + + +@pytest.mark.skipif(os.system("which ffmpeg") != 0, reason="ffmpeg not installed") +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize("audio_path", ["./data", "./data/speech_01.mp3"]) +def test_transcribe(model_name: str, audio_path: str): + # Setting variables and importing function: + artifact_path = tempfile.mkdtemp() + project = mlrun.get_or_create_project("test") + transcribe_function = project.set_function("transcribe.py", "transcribe", kind="job", image="mlrun/mlrun") + # transcribe_function = mlrun.import_function("function.yaml") + temp_dir = tempfile.mkdtemp() + + # Running transcribe function: + transcribe_run = transcribe_function.run( + handler="transcribe", + params={ + "data_path": audio_path, + "model_name": model_name, + "device": "cpu", + "output_directory": temp_dir, + }, + local=True, + returns=["output_dir: path", "dataset: dataset", "errored_files"], + artifact_path=artifact_path, + ) + + artifact_path += ( + f"/{transcribe_run.metadata.name}/{transcribe_run.metadata.iteration}/" + ) + + # Getting actual files from run (text and errored): + input_files = ( + os.listdir(audio_path) + if pathlib.Path(audio_path).is_dir() + else [pathlib.Path(audio_path).name] + ) + expected_text_files = sorted([f for f in input_files if f.endswith("mp3")]) + error_files = list(set(input_files) - set(expected_text_files)) + expected_text_files = [f.replace("mp3", "txt") for f in expected_text_files] + text_files = sorted(os.listdir(temp_dir)) + + # Check that the text files are saved in output_directory: + assert text_files == expected_text_files + + # Check that the transcribed text was approximately (90%) generated from audio: + for text_file, expected in zip(text_files, expected_outputs): + with open(os.path.join(temp_dir, text_file), "r") as f: + output = 
f.readlines()[0] + ratio = SequenceMatcher(None, expected, output).ratio() + assert ratio >= 0.9 + + # Check that the dataframe is in the correct size: + df = mlrun.get_dataitem(artifact_path + "dataset.parquet").as_df() + assert len(df) == len(expected_text_files) + + # Check errored files: + if isinstance(transcribe_run.outputs["errored_files"], str): + actual_errored_files = [] + else: + actual_errored_files = [ + os.path.basename(errored) + for errored in transcribe_run.outputs["errored_files"].keys() + ] + assert actual_errored_files == error_files + + # Check output_dir: + zip_dir = mlrun.get_dataitem(artifact_path + "output_dir.zip") + assert zip_dir.kind == "file" diff --git a/functions/master/transcribe/1.2.0/src/transcribe.ipynb b/functions/master/transcribe/1.2.0/src/transcribe.ipynb new file mode 100644 index 00000000..5671160c --- /dev/null +++ b/functions/master/transcribe/1.2.0/src/transcribe.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a80305ba-ffff-4116-aa46-5c1b67368239", + "metadata": {}, + "source": [ + "# Transcribe tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bdb947f0-5b9a-492d-9676-374c38eee14a", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:13:48.565039Z", + "end_time": "2023-07-16T17:14:01.952515Z" + } + }, + "outputs": [], + "source": [ + "import tempfile\n", + "import mlrun" + ] + }, + { + "cell_type": "markdown", + "id": "b7364965-8dcd-419a-8764-dd0c87edb9f8", + "metadata": {}, + "source": [ + "## Importing the transcribe function from hub\n", + "\n", + "To import the function directly from hub, use:\n", + "```python \n", + "transcribe_fn = mlrun.import_function(\"hub://transcribe\")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "artifact_path = tempfile.mkdtemp()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:14:01.954022Z", + "end_time": 
"2023-07-16T17:14:01.955760Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2d9a80a2-8448-49cd-a92f-1ab2072fc720", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:14:01.956508Z", + "end_time": "2023-07-16T17:14:01.966758Z" + } + }, + "outputs": [], + "source": [ + "transcribe_fn = mlrun.import_function(\"function.yaml\")" + ] + }, + { + "cell_type": "markdown", + "id": "7fcb6c8a-f83b-42d9-b02e-9187e85fe232", + "metadata": {}, + "source": [ + "## Running transcribe" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1570b05f-cfb7-466d-84c8-98f4c9d54ad4", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:14:01.969912Z", + "end_time": "2023-07-16T17:14:12.724086Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:01,968 [info] Storing function: {'name': 'transcribe-transcribe', 'uid': 'd1384cb679bc4c178b0195d964b628a8', 'db': None}\n", + "> 2023-07-16 17:14:01,969 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:01,969 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:01,970 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:01,970 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:01,972 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:01,972 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:09,804 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:09,805 [warning] MLRUN_DBPATH is not set. 
Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:09,805 [info] Loading whisper model: 'tiny'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\n", + "IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:10,374 [info] Model loaded.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Transcribing: 67%|██████▋ | 2/3 [00:02<00:01, 1.04s/file]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,556 [warning] Error in file: '/Users/Yonatan_Shelach/projects/functions/transcribe/data/error_file.txt'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Transcribing: 100%|██████████| 3/3 [00:02<00:00, 1.39file/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,566 [info] Done:\n", + " audio_file transcription_file language length rate_of_speech\n", + "0 speech_01.mp3 speech_01.txt en 2.011333 3.480278\n", + "1 speech_02.mp3 speech_02.txt en 20.793500 2.548873\n", + "> 2023-07-16 17:14:12,596 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,597 [warning] MLRUN_DBPATH is not set. 
Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:12,659 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,660 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:12,671 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,672 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,707 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,707 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n", + "> 2023-07-16 17:14:12,708 [warning] Could not detect path to API server, not connected to API server!\n", + "> 2023-07-16 17:14:12,708 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect\n" + ] + }, + { + "data": { + "text/plain": "", + "text/html": "\n
    \n
    \n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    default
    ...b628a8
    0Jul 16 14:14:01completedtranscribe-transcribe
    kind=
    owner=Yonatan_Shelach
    host=M-QWXQJK77Q0
    model_name=tiny
    audio_files_directory=./data
    decoding_options={'fp16': False}
    output_directory=./output
    transcriptions
    transcriptions_df
    transcriptions_errors
    \n
    \n
    \n
    \n Title\n ×\n
    \n \n
    \n
    \n" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": "", + "text/html": " > to track results use the .show() or .logs() methods " + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-07-16 17:14:12,721 [info] Run execution finished: {'status': 'completed', 'name': 'transcribe-transcribe'}\n" + ] + } + ], + "source": [ + "transcribe_run = transcribe_fn.run(\n", + " handler=\"transcribe\",\n", + " params={\n", + " \"model_name\": \"tiny\",\n", + " \"input_path\": \"./data\",\n", + " \"decoding_options\": {\"fp16\": False},\n", + " \"output_directory\": \"./output\",\n", + " },\n", + " returns=[\n", + " \"transcriptions: path\",\n", + " \"transcriptions_df: dataset\",\n", + " {\"key\": \"transcriptions_errors\", \"artifact_type\": \"file\", \"file_format\": \"yaml\"},\n", + " ],\n", + " local=True,\n", + " artifact_path=artifact_path,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "407d1e6c-d2a4-42e7-b3e2-c51138cb30ea", + "metadata": { + "ExecuteTime": { + "start_time": "2023-07-16T17:14:12.726898Z", + "end_time": "2023-07-16T17:14:12.745521Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": "{'transcriptions': 'store://artifacts/default/transcribe-transcribe_transcriptions:d1384cb679bc4c178b0195d964b628a8',\n 'transcriptions_df': 'store://artifacts/default/transcribe-transcribe_transcriptions_df:d1384cb679bc4c178b0195d964b628a8',\n 'transcriptions_errors': 'store://artifacts/default/transcribe-transcribe_transcriptions_errors:d1384cb679bc4c178b0195d964b628a8'}" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcribe_run.outputs" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Notice**: If connected to mlrun server, you can simply use:\n", + "\n", + 
"```python\n", + "df = transcribe_run.artifact(\"transcriptions_df\")\n", + "```" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "artifact_path += f\"/{transcribe_run.metadata.name}/{transcribe_run.metadata.iteration}/\"" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:14:12.730064Z", + "end_time": "2023-07-16T17:14:12.748292Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [], + "source": [ + "df = mlrun.get_dataitem(artifact_path + \"transcriptions_df.parquet\").as_df()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:25:02.712455Z", + "end_time": "2023-07-16T17:25:02.719538Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [ + { + "data": { + "text/plain": " audio_file transcription_file language length rate_of_speech\n0 speech_01.mp3 speech_01.txt en 2.011333 3.480278\n1 speech_02.mp3 speech_02.txt en 20.793500 2.548873", + "text/html": "
    \n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
    audio_filetranscription_filelanguagelengthrate_of_speech
    0speech_01.mp3speech_01.txten2.0113333.480278
    1speech_02.mp3speech_02.txten20.7935002.548873
    \n
    " + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-07-16T17:25:07.878158Z", + "end_time": "2023-07-16T17:25:07.880514Z" + } + } + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/transcribe/1.2.0/src/transcribe.py b/functions/master/transcribe/1.2.0/src/transcribe.py new file mode 100644 index 00000000..9cabcb1e --- /dev/null +++ b/functions/master/transcribe/1.2.0/src/transcribe.py @@ -0,0 +1,1464 @@ +# Copyright 2024 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import operator +import os +import tempfile +from functools import reduce, wraps +from multiprocessing import Process, Queue +from pathlib import Path +from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union + +import pandas as pd +import torch +import torchaudio +from tqdm import tqdm +from transformers import ( + AutomaticSpeechRecognitionPipeline, + AutoModelForCausalLM, + pipeline, +) +from transformers.utils import is_flash_attn_2_available + + +class BaseTask: + """ + A task to write the transcription to file. + """ + + def __init__( + self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path + ): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param transcription_output: The transcription output from the pipeline. String means an exception was raised. + :param text_file: Path to the text file to write the transcription to. + """ + # Store the parameters: + self._audio_file = audio_file + self._transcription_output = transcription_output + self._text_file = text_file + + # Prepare the error variable: + self._error: str = None + + def do_task(self): + """ + Try to perform the task storing an error if occurred. + """ + if isinstance(self._transcription_output, str): + self._error = self._transcription_output + return + try: + self._do_task() + except Exception as exception: + self._error = str(exception) + + def is_failed(self) -> bool: + """ + Check if the task failed. + + :returns: Whether the task failed. + """ + return self._error is not None + + def get_result(self) -> Tuple[str, str]: + """ + Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the + text file name. + + :returns: The task's result. 
+ """ + if self.is_failed(): + return self._audio_file.name, self._error + return self._audio_file.name, self._text_file.name + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + return self.__class__.__name__, { + "audio_file": self._audio_file, + "transcription_output": self._transcription_output, + "text_file": self._text_file, + } + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path. + """ + # Checking for no duplications: + i = 1 + while self._text_file.exists(): + i += 1 + self._text_file = ( + self._text_file.parent + / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}" + ) + + # Make sure all directories are created: + self._text_file.parent.mkdir(exist_ok=True, parents=True) + + # Write to file: + with open(self._text_file, "w") as fp: + fp.write(self._transcription_output["text"]) + + +class SpeechDiarizationTask(BaseTask): + """ + A task to write the transcription to file with respect to a given speech diarization. + """ + + class _DiarizationSegment(NamedTuple): + """ + A speech diarization segment. + """ + + start: float + end: float + speaker: str + + class _WordTimestamp(NamedTuple): + """ + A word with its start and end timestamps. + """ + + start: float + end: float + text: str + + def __init__( + self, + audio_file: Path, + transcription_output: dict, + text_file: Path, + speech_diarization: List[Tuple[float, float, str]], + ): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param transcription_output: The transcription output from the pipeline. + :param text_file: Path to the text file to write the transcription to. + :param speech_diarization: A speech diarization as a list of tuples: (start, end, speaker). 
+ """ + super().__init__( + audio_file=audio_file, + transcription_output=transcription_output, + text_file=text_file, + ) + self._speech_diarization = speech_diarization + self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None + self._last_chosen_index = 0 + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + task_class, task_kwargs = super().to_tuple() + return task_class, { + **task_kwargs, + "speech_diarization": self._speech_diarization, + } + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path with respect to the given speech diarization. + """ + # Check if a speech diarization is given, if not, just write the transcription to file: + if not self._speech_diarization: + super()._do_task() + return + + # Cast the chunks to word timestamps tuples: + words = [ + SpeechDiarizationTask._WordTimestamp( + start=chunk["timestamp"][0], + end=chunk["timestamp"][1], + text=chunk["text"], + ) + for chunk in self._transcription_output["chunks"] + ] + + # Cast speech diarization to segments tuples: + self._segments = [ + SpeechDiarizationTask._DiarizationSegment(*segment) + for segment in self._speech_diarization + ] + + # Try to match the Whisper model predicted timestamps to the closest diarization segment (closest diarization + # segment will be the most overlapping with the word, and if there is no overlap, the closest segment to the + # word): + speaker = self._segments[self._last_chosen_index].speaker + text = f"{speaker}:" + for word in words: + # Get the next diarization segment: + self._get_next_segment(word=word) + # Check if the segment is of the same speaker: + if self._segments[self._last_chosen_index].speaker == speaker: + # Collect the word: + text += word.text + else: + # Append a newline and update the new speaker: + speaker = 
self._segments[self._last_chosen_index].speaker + text += f"\n{speaker}:{word.text}" + + # Update the transcription output with the new text to write it to file: + self._transcription_output["text"] = text + super()._do_task() + + def _get_next_segment( + self, + word: _WordTimestamp, + ): + """ + Get the next diarization segment the given word falls into. The `self._last_chosen_index` will be updated + accordingly. + + :param word: The word timestamp to match to the next segment. + """ + # If the last chosen segment is the last segment, return it: + if self._last_chosen_index == len(self._segments) - 1: + return + + # Get the last chosen diarization segment: + last_chosen = self._segments[self._last_chosen_index] + + # None value may appear if the word is the last word in the audio file, or it was split during inference. In + # that case, we'll set the last segment: + if word.end is None: + self._last_chosen_index = len(self._segments) - 1 + return + + # If the word ends before the last chosen segment: + if word.end <= last_chosen.start: + # Then it is still the closest segment + return + + # We check if it ends inside the last chosen segment: + if word.end < last_chosen.end: + # Then it still is the closest segment + return + + # The word ends after the segment, we need to collect all next segments up until the word ends before them: + possible_segments = [self._last_chosen_index] + for i in range(self._last_chosen_index + 1, len(self._segments)): + if word.end > self._segments[i].end: + possible_segments.append(i) + continue + possible_segments.append(i) + break + + # Check for the most overlapping option: + best_overlap = 0 + most_overlapping_segment_index = None + for i in possible_segments: + # If the word starts before segment: + if word.start <= self._segments[i].start: + # If it ends before the segment, there is an overlap from the start of the segment to the end of the + # word: + if word.end < self._segments[i].end: + overlap = word.end - 
self._segments[i].start + else: + # The word is wrapping the segment, the overlap is the segment's length: + overlap = self._segments[i].end - self._segments[i].start + # The word starts in segment, check if the word ends in it: + elif word.end < self._segments[i].end: + # The overlap is the word's length: + overlap = word.end - word.start + # The word start in segment but ends after it, the overlap is from the word's start to the segment's end: + else: + overlap = self._segments[i].end - word.start + # Check for new best overlap: + if overlap > best_overlap: + best_overlap = overlap + most_overlapping_segment_index = i + if most_overlapping_segment_index is not None: + self._last_chosen_index = most_overlapping_segment_index + return + + # If there is no overlapping segment, return the closest segment: + best_distance = None + closest_segment_index = None + for i in possible_segments: + distance = ( + word.start - self._segments[i].end + if word.start > self._segments[i].end + else self._segments[i].start - word.end + ) + if best_distance is None or distance < best_distance: + best_distance = distance + closest_segment_index = i + self._last_chosen_index = closest_segment_index + + +class SpeechDiarizationPerChannelTask(BaseTask): + """ + A task to write the transcription to file with respect to a given speech diarization per channel. + """ + + class _WordTimestamp(NamedTuple): + """ + A word with its start and end timestamps and speaker label (channel the word was taken from). + """ + + start: float + end: float + speaker: str + text: str + + def __init__(self, audio_file: Path, text_file: Path): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param text_file: Path to the text file to write the transcription to. 
+ """ + super().__init__( + audio_file=audio_file, transcription_output={}, text_file=text_file + ) + self._transcription_output_channels: List[Tuple[str, dict]] = [] + + @property + def transcription_output_channels(self) -> List[Tuple[str, dict]]: + """ + Get the transcription output channels. + + :returns: The transcription output channels. + """ + return self._transcription_output_channels + + def do_task(self): + """ + Try to perform the task storing an error if occurred. + """ + for _, channel_output in self._transcription_output_channels: + if isinstance(channel_output, str): + self._error = self._transcription_output_channels + return + super().do_task() + + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + task_class, task_kwargs = super().to_tuple() + task_kwargs.pop("transcription_output") + return task_class, task_kwargs + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path with respect to the given speech diarization + per channel. 
+ """ + # Cast the chunks to word timestamps tuples: + words_per_channel = [ + [ + SpeechDiarizationPerChannelTask._WordTimestamp( + start=chunk["timestamp"][0], + end=chunk["timestamp"][1], + speaker=speaker, + text=chunk["text"], + ) + for chunk in output["chunks"] + ] + for speaker, output in self._transcription_output_channels + ] + + # Merge and sort the words per channel by their start time: + words = operator.add(*words_per_channel) + words.sort() + + # Write the transcription to file: + current_speaker = words[0].speaker + text = f"{current_speaker}:" + for word in words: + # Check if the word's speaker is different from the current one: + if word.speaker != current_speaker: + # Append a newline and update the new speaker: + current_speaker = word.speaker + text += f"\n{current_speaker}:" + # Collect the word: + text += word.text + + # Update the transcription output with the new text to write it to file: + self._transcription_output["text"] = text + super()._do_task() + + +class BatchProcessor: + """ + A batch processor to process batches of transcriptions. The batch processor is creating tasks and is aimed to be + working along the transcriber. It can be used with multiprocessing queue or run the tasks directly using the + associated methods. + """ + + def __init__(self, audio_files: List[Path], output_directory: Path): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + """ + # Store the parameters: + self._audio_files = audio_files + self._output_directory = output_directory + + # Prepare the batching variables: + self._current_file_index = 0 + self._tasks: List[BaseTask] = [] + self._results: List[Tuple[bool, Tuple[str, str]]] = [] + + def process_batch(self, batch: List[Union[dict, str]]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. 
+ + :param batch: The batch of transcriptions to process. + """ + # Get the relevant files belongs to the given batch: + current_files = self._get_current_files(batch_size=len(batch)) + + # Build the diarization tasks: + self._tasks.extend( + [ + BaseTask( + audio_file=file, + transcription_output=batch[i], + text_file=self._output_directory / f"{file.stem}.txt", + ) + for i, file in enumerate(current_files) + ] + ) + + def get_tasks(self) -> List[BaseTask]: + """ + Get the tasks to perform. + + :returns: The tasks to perform. + """ + tasks = self._tasks + self._tasks = [] + return tasks + + def do_tasks(self): + """ + Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber. + """ + for task in self.get_tasks(): + task.do_task() + self._results.append((task.is_failed(), task.get_result())) + + def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Get the results of the tasks. The stored results are then cleared. + + :returns: The results of the tasks. + """ + results = self._results + self._results = [] + return results + + def _get_current_files(self, batch_size: int) -> List[Path]: + """ + Get the current files to process. + + :param batch_size: The batch size to progress the current file index. + + :returns: The current files to process. + """ + end_index = ( + self._current_file_index + batch_size + if self._current_file_index + batch_size < len(self._audio_files) + else len(self._audio_files) + ) + current_files = self._audio_files[self._current_file_index : end_index] + self._current_file_index = end_index + return current_files + + +class SpeechDiarizationBatchProcessor(BatchProcessor): + """ + A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch + processor is creating tasks and is aimed to be working along the transcriber. It can be used with multiprocessing + queue or run the tasks directly using the associated methods. 
+ """ + + def __init__( + self, audio_files: List[Path], output_directory: Path, speech_diarization: dict + ): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. + :param speech_diarization: A speech diarization dictionary to pass along with each processed batch. + """ + super().__init__(audio_files=audio_files, output_directory=output_directory) + self._speech_diarization = speech_diarization + self._audio_files = audio_files + + def process_batch(self, batch: List[dict]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Get the relevant files belongs to the given batch: + current_files = self._get_current_files(batch_size=len(batch)) + + # Build the diarization tasks: + self._tasks.extend( + [ + SpeechDiarizationTask( + audio_file=file, + transcription_output=batch[i], + text_file=self._output_directory / f"{file.stem}.txt", + speech_diarization=self._speech_diarization.get(file.name), + ) + for i, file in enumerate(current_files) + ] + ) + + +class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor): + """ + A batch processor to process batches of transcriptions per channel. The batch processor is creating tasks with the + selected amount of channels given and is aimed to be working along the transcriber. It can be used with + multiprocessing queue or run the tasks directly using the associated methods. + """ + + def __init__( + self, + audio_files: List[Path], + output_directory: Path, + n_channels: int, + speakers: List[str], + ): + """ + Initialize the batch processor. + + :param audio_files: The list of all audio files to transcribe. + :param output_directory: The output directory to write the transcriptions to. 
+ :param n_channels: The number of channels in each audio file to transcribe. + :param speakers: The speakers labels to use for each channel. + """ + super().__init__(audio_files=audio_files, output_directory=output_directory) + + # Store the parameters: + self._n_channels = n_channels + self._speakers = speakers + + # Prepare a channel buffer to store the channels until the current task created is fully covered: + self._task_in_process: SpeechDiarizationPerChannelTask = None + + def process_batch(self, batch: List[dict]): + """ + Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch + processor. + + :param batch: The batch of transcriptions to process. + """ + # Go over the batch and create the tasks: + for output in batch: + # Check if there is a task in process: + if not self._task_in_process: + # Create a new task: + self._task_in_process = SpeechDiarizationPerChannelTask( + audio_file=self._audio_files[self._current_file_index], + text_file=self._output_directory + / f"{self._audio_files[self._current_file_index].stem}.txt", + ) + # Get the channel's speaker: + speaker = self._speakers[ + len(self._task_in_process.transcription_output_channels) + ] + # Collect the channel into the processed task: + self._task_in_process.transcription_output_channels.append( + (speaker, output) + ) + # Check if the task is fully covered (all channels are collected): + if ( + len(self._task_in_process.transcription_output_channels) + == self._n_channels + ): + # Collect the task and reset the task in process: + self._tasks.append(self._task_in_process) + self._current_file_index += 1 + self._task_in_process = None + + +class Transcriber: + """ + A transcription wrapper for the Huggingface's ASR pipeline - + https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to + use with OpenAI's Whisper models - https://huggingface.co/openai. 
+ """ + + def __init__( + self, + model_name: str, + device: str = None, + use_flash_attention_2: bool = None, + use_better_transformers: bool = None, + assistant_model: str = None, + max_new_tokens: int = 128, + chunk_length_s: int = 30, + batch_size: int = 2, + spoken_language: str = None, + translate_to_english: bool = False, + return_timestamps: Union[bool, Literal["word"]] = False, + per_channel_transcription: int = 0, + ): + """ + Initialize the transcriber. + + :param model_name: The model name to use. Should be a model from the OpenAI's Whisper models for + best results (for example "tiny", "base", "large", etc.). + :param device: The device to use for inference. If not given, will use GPU if available. + :param use_flash_attention_2: Whether to use the Flash Attention 2 implementation. It can be used only with + one of the following GPUs: Nvidia H series and Nvidia A series. T4 support + will be available soon. + + Note: If both `use_flash_attention_2` and + `use_better_transformers` are `None`, the optimization will be chosen + automatically according to the available resources. + + :param use_better_transformers: Whether to use the Better Transformers library to further optimize the model. + Should be used for all use cases that do not support flash attention 2. + + Note: If both `use_flash_attention_2` and `use_better_transformers` are + `None`, the optimization will be chosen automatically according to the + available resources. + :param assistant_model: The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant + as well. Should be a model from Huggingface's distil-whisper (see here for + more information: https://github.com/huggingface/distil-whisper). + :param max_new_tokens: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. 
+ :param chunk_length_s: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + :param batch_size: The batch size to use for inference. Default is 2. + :param spoken_language: Aim whisper to know what language is spoken. If None, it will try to detect it + for each chunk. + :param translate_to_english: Whether to translate the transcriptions to English. Default is False. + :param return_timestamps: Whether to return the timestamps of the words. If "word", will return the + timestamps of each word. If True will return the timestamps of each chunk. + Default is False. Aimed to be used for speech diarization. + :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel + transcription, pass the number of channels expected for each audio file here. + 0 means regular transcription (merge channels). + + Note: If `per_channel_transcription` is not 0, `batch_size` must be treated to + be the number of channels and not audio files. Aimed to be used for per + channel speech diarization. + """ + # Store loading parameters: + self._model_name = model_name + self._device = device + self._use_flash_attention_2 = use_flash_attention_2 + self._use_better_transformers = use_better_transformers + self._max_new_tokens = max_new_tokens + self._chunk_length_s = chunk_length_s + self._batch_size = batch_size + self._return_timestamps = return_timestamps + self._per_channel_transcription = per_channel_transcription + + # Store generation parameters: + self._assistant_model = assistant_model + self._spoken_language = spoken_language + self._translate_to_english = translate_to_english + + # Prepare the transcription objects: + self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None + self._generate_kwargs: dict = None + + def load(self): + """ + Load the transcriber. Must be called before transcribing. 
+ """ + # Set the device and data type to use (prefer GPU if available): + device = torch.device( + self._device or "cuda" if torch.cuda.is_available() else "cpu" + ) + torch_dtype = torch.float16 if device.type == "cuda" else torch.float32 + + # Choose the optimization to use (in case the user did not specify any): + if ( + self._use_flash_attention_2 is None + and self._use_better_transformers is None + ): + # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture + # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla): + if device.type == "cuda" and is_flash_attn_2_available(): + cuda_device_name = torch.cuda.get_device_properties(device).name + if any( + cuda_device_name.startswith(gpu_name) + for gpu_name in [ + "NVIDIA A", # For Ampere architecture (e.g. A10, A30, A100) + "NVIDIA H", # For Hopper architecture (e.g. H100) + "NVIDIA L", # For Ada Lovelace architecture (e.g. L4, L40) + "NVIDIA RTX 30", # For Ada Lovelace architecture (RTX 30 series) + "NVIDIA RTX 40", # For Ada Lovelace architecture (RTX 40 series) + "NVIDIA RTX 50", # For Ada Lovelace architecture (RTX 50 series) + # Will be supported soon according to FlashAttention GitHub repo: + # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features + # "NVIDIA T4", # For Turing architecture (only T4) + # "NVIDIA RTX 20", # For Turing architecture (RTX 20 series) + ] + ): + self._use_flash_attention_2 = True + else: + self._use_better_transformers = True + else: + self._use_better_transformers = True + + # Build the optimizations kwargs: + model_kwargs = { + "low_cpu_mem_usage": True, + "use_safetensors": True, + } + if self._use_flash_attention_2: + if _LOGGER: + _LOGGER.info( + "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via " + "`pip install -U flash-attn --no-build-isolation`" + ) + model_kwargs["attn_implementation"] = "flash_attention_2" + elif 
self._use_better_transformers: + if _LOGGER: + _LOGGER.info( + "Using BetterTransformers optimization - make sure the `optimum` package is installed via " + "`pip install -U optimum`" + ) + model_kwargs["attn_implementation"] = "sdpa" + + # Initialize the speech recognition pipeline: + self._transcription_pipeline = pipeline( + task="automatic-speech-recognition", + model=self._model_name, + model_kwargs=model_kwargs.copy(), + batch_size=self._batch_size, + max_new_tokens=self._max_new_tokens, + chunk_length_s=self._chunk_length_s, + return_timestamps=self._return_timestamps, + torch_dtype=torch_dtype, + device=device, + ) + + # Prepare the generation kwargs: + self._generate_kwargs = { + "language": self._spoken_language, + "task": "translate" if self._translate_to_english else "transcribe", + } + + # Initialize the assistant model (if needed): + if self._assistant_model: + assistant_model = AutoModelForCausalLM.from_pretrained( + self._assistant_model, torch_dtype=torch_dtype, **model_kwargs + ) + assistant_model.to(device) + self._generate_kwargs["assistant_model"] = assistant_model + + def transcribe( + self, + audio_files: List[Path], + batch_processor: BatchProcessor = None, + batches_queue: Queue = None, + verbose: bool = False, + ) -> Union[List[List[dict]], None]: + """ + Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further + processing like writing to text files. If no queue or batch processor is given, the transcriptions outputs from + the pipeline will be returned. Otherwise, `None` is returned. + + :param audio_files: The audio files to transcribe. + :param batch_processor: A batch processor. + :param batches_queue: A multiprocessing queue to put the batches in. + :param verbose: Whether to show a progress bar. Default is False. + + :returns: The transcriptions outputs from the pipeline if no queue or batch processor is given, otherwise, + `None`. 
+ """ + # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with + # Huggingface's pipelines as they preload each input while inference is running): + def audio_iterator() -> Generator[Union[dict, str], None, None]: + if self._per_channel_transcription: + for audio_file in audio_files: + audio, sampling_rate = torchaudio.load(str(audio_file)) + audio = audio.numpy() + for channel in audio: + yield {"raw": channel, "sampling_rate": sampling_rate} + else: + for audio_file in audio_files: + yield str(audio_file) + + # Create a batch iterator: + def batch_iterator() -> Generator[List[Union[dict, str]], None, None]: + batch = [] + for audio in audio_iterator(): + batch.append(audio) + if len(batch) == self._batch_size: + yield batch + batch = [] + if batch: + yield batch + + # Prepare the successes dataframe and errors dictionary to be returned: + outputs = [] + + # Infer through the pipeline: + for input_batch in tqdm( + batch_iterator() if self._batch_size > 1 else audio_iterator(), + desc="Transcribing", + unit="channel" if self._per_channel_transcription else "audio file", + total=( + ( + (len(audio_files) // self._batch_size) + + (len(audio_files) % self._batch_size != 0) + ) + * (self._per_channel_transcription or 1) + ), + disable=not verbose, + ): + # Infer: + try: + output_batch = self._transcription_pipeline( + input_batch, + generate_kwargs=self._generate_kwargs, + ) + except Exception as exception: + # Collect the exception: + output_batch = str(exception) + # Align to batch size: + output_batch = ( + [output_batch] * len(input_batch) + if isinstance(input_batch, list) + else [output_batch] + ) + # To align with batching, if batch size is 1, wrap the output with a list: + if isinstance(output_batch, dict): + output_batch = [output_batch] + # If a batch processor is given, process the batch: + if batch_processor: + # Process it directly: + batch_processor.process_batch(batch=output_batch) + 
batch_processor.do_tasks() + elif batches_queue: + # Otherwise, queue the batch: + batches_queue.put(output_batch) + else: + # Otherwise, collect the output as is without processing: + outputs.append(output_batch) + + # Check if given a multiprocessing queue or a batch processor: + if batches_queue: + batches_queue.put(_MULTIPROCESSING_STOP_MARK) + + return outputs if not batch_processor else None + + +#: The value to send into multiprocessing queues to stop the process: +_MULTIPROCESSING_STOP_MARK = "STOP" + + +def _multiprocessing_process_batches( + batch_processor: BatchProcessor, + batches_queue: Queue, + tasks_queue: Queue, + n_task_completers: int, +): + """ + Process the batches in the given batches queue and put the tasks in the given tasks queue. The function will stop + when the given batches queue will receive the stop mark. It is aimed to be used with multiprocessing as a process. + + :param batch_processor: A batch processor to process the batches. + :param batches_queue: A queue to get the batches from. + :param tasks_queue: A queue to put the tasks in. + :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks` + function). A stop mark will be sent to the tasks queue for each task completer. + """ + while True: + # Get the batch: + batch: List[dict] = batches_queue.get() + if batch == _MULTIPROCESSING_STOP_MARK: + break + + # Process the batch: + batch_processor.process_batch(batch=batch) + + # Get the tasks: + tasks = batch_processor.get_tasks() + + # Queue the tasks: + for task in tasks: + tasks_queue.put(task.to_tuple()) + + # Mark the end of the batches: + for _ in range(n_task_completers): + tasks_queue.put(_MULTIPROCESSING_STOP_MARK) + + +def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue): + """ + Complete the tasks in the given queue and put the results in the given results queue. The function will stop when + the given tasks queue will receive the stop mark. 
It is aimed to be used with multiprocessing as a process. + + :param tasks_queue: A queue to get the tasks from. + :param results_queue: A queue to put the results in. + """ + tasks_map = { + BaseTask.__name__: BaseTask, + SpeechDiarizationTask.__name__: SpeechDiarizationTask, + SpeechDiarizationPerChannelTask.__name__: SpeechDiarizationPerChannelTask, + } + + while True: + # Get the task: + task = tasks_queue.get() + if task == _MULTIPROCESSING_STOP_MARK: + break + + # Reconstruct the task: + task_class, task_kwargs = task + task = tasks_map[task_class](**task_kwargs) + + # Complete the task: + task.do_task() + results_queue.put((task.is_failed(), task.get_result())) + + # Mark the end of the tasks: + results_queue.put(_MULTIPROCESSING_STOP_MARK) + + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, str): + input_argument = _get_audio_files( + data_path=Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." 
+ ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." + ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Save the output directory of this worker: + output_directory = Path(output[0]) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + + # Join the data from all workers: + if rank == 0: + context.logger.info("Collecting data from workers to root worker.") + + # Check if there are different output directories: + output_directories = set([Path(out_dir) for out_dir, _, _ in output]) + for r in range(1, size): + # True means the other workers should pass their files to the root worker (rank 0): + comm.send(len(output_directories) != 1, dest=r) + + # If there are different output directories, listen to the other workers: + if len(output_directories) != 1: + # Collect the files from the other workers: + files = [] + for r in range(1, size): + files.extend(comm.recv(source=r)) + # Write the files to the root worker's output directory: + for file_name, file_content in files: + with open(output_directory / file_name, "w") as f: + f.write(file_content) + + # Concatenate the dataframes: + dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0) + + # Concatenate the errors dictionaries: + errors_dictionary = reduce( + operator.ior, [err for _, _, err in output], {} + ) + + return 
str(output_directory), dataframe, errors_dictionary + + # Listen to rank 0 to see if there are different output directories and this rank need to send its files to + # it: + if comm.recv(source=0): + files = [] + for file in os.listdir(output_directory): + with open(output_directory / file, "r") as f: + files.append((file, f.read())) + comm.send(files, dest=0) + return None + + return wrapper + + return decorator + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." + ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def transcribe( + # Input / Output kwargs: + data_path: Union[str, Path, List[Union[str, Path]]], + output_directory: str = None, + # Model loading kwargs: + model_name: str = "openai/whisper-tiny", + device: str = None, + use_flash_attention_2: bool = None, + use_better_transformers: bool = None, + # Generation kwargs: + assistant_model: str = None, + max_new_tokens: int = 128, + chunk_length_s: int = 30, + batch_size: int = 8, + spoken_language: str = None, + translate_to_english: bool = False, + # Diarization kwargs: + speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None, + speech_diarize_per_channel: int = None, + speaker_labels: List[str] = None, + # Other kwargs: + use_multiprocessing: Union[bool, int] = False, + 
verbose: bool = False, +): + """ + Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed + text files and a dataframe containing the following columns: + + * audio_file - The audio file path. + * transcription_file - The transcribed text file name in the output directory. + + The transcription is based on Huggingface's ASR pipeline - + https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and + is tested with OpenAI's Whisper models - https://huggingface.co/openai. + + If one of the speaker diarization parameters are given (either `speech_diarization` or + `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker will + be written in a separate line:: + + speaker_1: text + speaker_2: text + speaker_1: text + ... + + :param data_path: A directory of audio files or a single file or a list of files to transcribe. + :param output_directory: Path to a directory to save all transcribed audio files. If not given, will save + the transcribed files in a temporary directory. + :param model_name: The model name to use. Should be a model from the OpenAI's Whisper models for + best results (for example "tiny", "base", "large", etc.). See here for more + information: https://huggingface.co/openai?search_models=whisper. + :param device: The device to use for inference. If not given, will use GPU if available. + :param use_flash_attention_2: Whether to use the Flash Attention 2 implementation. It can be used only with + one of the following GPUs: Nvidia H series and Nvidia A series. T4 support + will be available soon. + + Note: If both `use_flash_attention_2` and + `use_better_transformers` are `None`, the optimization will be chosen + automatically according to the available resources. + + :param use_better_transformers: Whether to use the Better Transformers library to further optimize the model. 
+ Should be used for all use cases that do not support flash attention 2. + + Note: If both `use_flash_attention_2` and `use_better_transformers` are + `None`, the optimization will be chosen automatically according to the + available resources. + :param assistant_model: The assistant model name to use for inference. Notice that the optimizations + (flash attention 2 and better transformers) will be applied for the assistant as + well. Should be a model from Huggingface's distil-whisper (see here for more + information: https://github.com/huggingface/distil-whisper). + + Note: Currently an assistant model is only usable with batch size of 1. + :param max_new_tokens: The maximum number of new tokens to generate. This is used to limit the + generation length. Default is 128 tokens. + :param chunk_length_s: The audio chunk to split the audio to (in seconds). Default is 30 seconds. + :param batch_size: The batch size to use for inference. Default is 8. + :param spoken_language: Aim whisper to know what language is spoken. If None, it will try to detect + it. + :param translate_to_english: Whether to translate the transcriptions to English. + :param speech_diarization: A speech diarization dictionary with the file names to transcribe as keys and + their diarization as value. The diarization is a list of tuples: + (start, end, speaker). An example + for a diarization dictionary:: + + { + "audio_file_name": [ + { + "start": 0.0, + "end": 2.0, + "speaker": "Agent", + }, + { + "start": 2.0, + "end": 4.0, + "speaker": "Client", + }, + ... + ], + ... + } + + Note: The diarization must be for the entire duration of the audio file (as long + as Whisper is predicting words up until then). + :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to + a separate channel in the audio. Notice: This will make the transcription + slower as each channel will be transcribed separately. 
If a speech diarization + is passed (via the `speech_diarization` parameter), this parameter is + ignored. + :param speaker_labels: A list of speaker labels by channel order to use for writing the + transcription with respect to per channel speech diarization. This won't be + used together with a given speech diarization (via the `speech_diarization` + parameter). + :param use_multiprocessing: Whether to use multiprocessing to transcribe the audio files. Can be either a + boolean value or an integer. If `True`, will use the default amount of workers + (3): 1 for transcription, 1 for batch processing and 1 for task completion (such + as speech diarization and writing to files). To control the amount of tasks + completion workers, an integer can be provided to specify the amount of workers. + `False`, will use a single process. Default is `False`. + :param verbose: Whether to print the progress of the transcription. Default is `False`. + """ + global _LOGGER + + # Get the input audio files to transcribe: + if verbose: + _LOGGER.info("Collecting audio files.") + audio_files = _get_audio_files(data_path=data_path) + if verbose: + _LOGGER.info(f"Collected {len(audio_files)} audio files.") + + # Get the output directory: + if output_directory is None: + if verbose: + _LOGGER.info("No output directory given, using temporary directory.") + output_directory = tempfile.mkdtemp() + output_directory = Path(output_directory).absolute() + output_directory.mkdir(exist_ok=True, parents=True) + if verbose: + _LOGGER.info(f"Transcriptions will be saved to: {output_directory}") + + # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization, + # speech diarization per channel): + if speech_diarization: + batch_processor = SpeechDiarizationBatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + speech_diarization=speech_diarization, + ) + elif speech_diarize_per_channel: + batch_processor = 
PerChannelSpeechDiarizationBatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + n_channels=speech_diarize_per_channel, + speakers=speaker_labels, + ) + else: + batch_processor = BatchProcessor( + audio_files=audio_files, + output_directory=output_directory, + ) + + # Initialize the transcription pipeline: + transcriber = Transcriber( + device=device, + use_flash_attention_2=use_flash_attention_2, + use_better_transformers=use_better_transformers, + assistant_model=assistant_model, + model_name=model_name, + max_new_tokens=max_new_tokens, + chunk_length_s=chunk_length_s, + batch_size=batch_size, + return_timestamps=( + "word" + if speech_diarization is not None or speech_diarize_per_channel is not None + else False + ), + per_channel_transcription=speech_diarize_per_channel or 0, + spoken_language=spoken_language, + translate_to_english=translate_to_english, + ) + + # Run the transcription: + if use_multiprocessing: + results = _parallel_run( + n_workers=use_multiprocessing + if isinstance(use_multiprocessing, int) + else 1, + audio_files=audio_files, + batch_processor=batch_processor, + transcriber=transcriber, + verbose=verbose, + ) + else: + results = _run( + audio_files=audio_files, + batch_processor=batch_processor, + transcriber=transcriber, + verbose=verbose, + ) + + # Process the results: + if verbose: + _LOGGER.info("Summarizing the results.") + successes = [] + errors = {} + for is_error, result in results: + if is_error: + errors[result[0]] = result[1] + else: + successes.append(result) + successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"]) + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(audio_files)})\n" + f"Transcriptions summary:\n" + f"{successes.head()}" + ) + + return str(output_directory), successes, errors + + +def _get_audio_files( + data_path: Union[Path, str, list], +) -> List[Path]: + """ + Get the audio files to transcribe. 
If a path to a directory is given, all files in the directory will be collected. + + :param data_path: The data path to collect the audio files from. + + :returns: The audio files list. + """ + # Check if given a list of paths: + if isinstance(data_path, list): + audio_files = [] + for path in data_path: + audio_files.extend(_get_audio_files(data_path=path)) + return audio_files + + # Check if given a single string path to cast it to a `pathlib.Path`: + if isinstance(data_path, str): + data_path = Path(data_path).absolute() + + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + audio_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + audio_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a " + f"file. Given: {str(data_path)} " + ) + + return audio_files + + +def _run( + audio_files: List[Path], + batch_processor: BatchProcessor, + transcriber: Transcriber, + verbose: bool, +) -> List[Tuple[bool, Tuple[str, str]]]: + """ + Run the transcription without multiprocessing. + + :param audio_files: The audio files to transcribe. + :param batch_processor: The batch processor to use. + :param transcriber: The transcriber to use. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Load the transcription pipeline: + if verbose: + _LOGGER.info(f"Loading the transcription pipeline.") + transcriber.load() + if verbose: + _LOGGER.info("Transcription pipeline loaded.") + + # Transcribe the files: + transcriber.transcribe( + audio_files=audio_files, + batch_processor=batch_processor, + verbose=verbose, + ) + + # Return the results: + return batch_processor.get_results() + + +def _parallel_run( + n_workers: int, + audio_files: List[Path], + batch_processor: BatchProcessor, + transcriber: Transcriber, + verbose: bool, +): + """ + Run the transcription with multiprocessing. 
+ + :param n_workers: The amount of workers to use as task completers. + :param audio_files: The audio files to transcribe. + :param batch_processor: The batch processor to use. + :param transcriber: The transcriber to use. + :param verbose: Verbosity. + + :returns: The collected results. + """ + # Initialize the multiprocessing queues: + batches_queue = Queue() + tasks_queue = Queue() + results_queue = Queue() + + # Initialize the multiprocessing processes: + batch_processing_process = Process( + target=_multiprocessing_process_batches, + kwargs={ + "batch_processor": batch_processor, + "batches_queue": batches_queue, + "tasks_queue": tasks_queue, + "n_task_completers": n_workers, + }, + ) + task_completion_processes = [ + Process( + target=_multiprocessing_complete_tasks, + kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue}, + ) + for _ in range(n_workers) + ] + + # Start the multiprocessing processes: + batch_processing_process.start() + for p in task_completion_processes: + p.start() + + # Load the transcription pipeline: + if verbose: + _LOGGER.info(f"Loading the transcription pipeline.") + transcriber.load() + if verbose: + _LOGGER.info("Transcription pipeline loaded.") + + # Transcribe the files: + transcriber.transcribe( + audio_files=audio_files, batches_queue=batches_queue, verbose=verbose + ) + + # Collect the results: + results = [] + stop_marks_counter = 0 + while True: + # Get a result from the queue: + result: Tuple[bool, Tuple[str, str]] = results_queue.get() + if result == _MULTIPROCESSING_STOP_MARK: + stop_marks_counter += 1 + if stop_marks_counter == n_workers: + break + else: + # Collect the result: + results.append(result) + + # Wait for the processes to finish: + results_queue.empty() + batch_processing_process.join() + for p in task_completion_processes: + p.join() + + return results \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/static/documentation.html 
b/functions/master/transcribe/1.2.0/static/documentation.html new file mode 100644 index 00000000..d92df103 --- /dev/null +++ b/functions/master/transcribe/1.2.0/static/documentation.html @@ -0,0 +1,627 @@ + + + + + + + +transcribe package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    transcribe package#

    +
    +

    Submodules#

    +
    +
    +

    transcribe.transcribe module#

    +
    +
    +class transcribe.transcribe.BaseTask(audio_file: Path, transcription_output: dict | str, text_file: Path)[source]#
    +

    Bases: object

    +

    A task to write the transcription to file.

    +
    +
    +do_task()[source]#
    +

    Try to perform the task storing an error if occurred.

    +
    +
    +
    +get_result() Tuple[str, str][source]#
    +

    Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the +text file name.

    +
    +
    Returns:
    +

    The task’s result.

    +
    +
    +
    +
    +
    +is_failed() bool[source]#
    +

    Check if the task failed.

    +
    +
    Returns:
    +

    Whether the task failed.

    +
    +
    +
    +
    +
    +to_tuple() Tuple[str, dict][source]#
    +

    Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

    +
    +
    Returns:
    +

    The converted task.

    +
    +
    +
    +
    +
    +
    +class transcribe.transcribe.BatchProcessor(audio_files: List[Path], output_directory: Path)[source]#
    +

    Bases: object

    +

    A batch processor to process batches of transcriptions. The batch processor is creating tasks and is aimed to be +working along the transcriber. It can be used with multiprocessing queue or run the tasks directly using the +associated methods.

    +
    +
    +do_tasks()[source]#
    +

    Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.

    +
    +
    +
    +get_results() List[Tuple[bool, Tuple[str, str]]][source]#
    +

    Get the results of the tasks. The stored results are then cleared.

    +
    +
    Returns:
    +

    The results of the tasks.

    +
    +
    +
    +
    +
    +get_tasks() List[BaseTask][source]#
    +

    Get the tasks to perform.

    +
    +
    Returns:
    +

    The tasks to perform.

    +
    +
    +
    +
    +
    +process_batch(batch: List[dict | str])[source]#
    +

    Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch +processor.

    +
    +
    Parameters:
    +

    batch – The batch of transcriptions to process.

    +
    +
    +
    +
    +
    +
    +class transcribe.transcribe.PerChannelSpeechDiarizationBatchProcessor(audio_files: List[Path], output_directory: Path, n_channels: int, speakers: List[str])[source]#
    +

    Bases: BatchProcessor

    +

    A batch processor to process batches of transcriptions per channel. The batch processor is creating tasks with the +selected amount of channels given and is aimed to be working along the transcriber. It can be used with +multiprocessing queue or run the tasks directly using the associated methods.

    +
    +
    +process_batch(batch: List[dict])[source]#
    +

    Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch +processor.

    +
    +
    Parameters:
    +

    batch – The batch of transcriptions to process.

    +
    +
    +
    +
    +
    +
    +class transcribe.transcribe.SpeechDiarizationBatchProcessor(audio_files: List[Path], output_directory: Path, speech_diarization: dict)[source]#
    +

    Bases: BatchProcessor

    +

    A batch processor to process batches of transcriptions with respect to a given speech diarization. The batch +processor is creating tasks and is aimed to be working along the transcriber. It can be used with multiprocessing +queue or run the tasks directly using the associated methods.

    +
    +
    +process_batch(batch: List[dict])[source]#
    +

    Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch +processor.

    +
    +
    Parameters:
    +

    batch – The batch of transcriptions to process.

    +
    +
    +
    +
    +
    +
    +class transcribe.transcribe.SpeechDiarizationPerChannelTask(audio_file: Path, text_file: Path)[source]#
    +

    Bases: BaseTask

    +

    A task to write the transcription to file with respect to a given speech diarization per channel.

    +
    +
    +do_task()[source]#
    +

    Try to perform the task storing an error if occurred.

    +
    +
    +
    +to_tuple() Tuple[str, dict][source]#
    +

    Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

    +
    +
    Returns:
    +

    The converted task.

    +
    +
    +
    +
    +
    +property transcription_output_channels: List[Tuple[str, dict]]#
    +

    Get the transcription output channels.

    +
    +
    Returns:
    +

    The transcription output channels.

    +
    +
    +
    +
    +
    +
    +class transcribe.transcribe.SpeechDiarizationTask(audio_file: Path, transcription_output: dict, text_file: Path, speech_diarization: List[Tuple[float, float, str]])[source]#
    +

    Bases: BaseTask

    +

    A task to write the transcription to file with respect to a given speech diarization.

    +
    +
    +to_tuple() Tuple[str, dict][source]#
    +

    Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

    +
    +
    Returns:
    +

    The converted task.

    +
    +
    +
    +
    +
    +
    +class transcribe.transcribe.Transcriber(model_name: str, device: str | None = None, use_flash_attention_2: bool | None = None, use_better_transformers: bool | None = None, assistant_model: str | None = None, max_new_tokens: int = 128, chunk_length_s: int = 30, batch_size: int = 2, spoken_language: str | None = None, translate_to_english: bool = False, return_timestamps: bool | Literal['word'] = False, per_channel_transcription: int = 0)[source]#
    +

    Bases: object

    +

    A transcription wrapper for the Huggingface’s ASR pipeline - +https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to +use with OpenAI’s Whisper models - https://huggingface.co/openai.

    +
    +
    +load()[source]#
    +

    Load the transcriber. Must be called before transcribing.

    +
    +
    +
    +transcribe(audio_files: List[Path], batch_processor: BatchProcessor | None = None, batches_queue: Queue | None = None, verbose: bool = False) List[List[dict]] | None[source]#
    +

    Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for further +processing like writing to text files. If no queue or batch processor is given, the transcriptions outputs from +the pipeline will be returned. Otherwise, None is returned.

    +
    +
    Parameters:
    +
      +
    • audio_files – The audio files to transcribe.

    • +
    • batch_processor – A batch processor.

    • +
    • batches_queue – A multiprocessing queue to put the batches in.

    • +
    • verbose – Whether to show a progress bar. Default is False.

    • +
    +
    +
    Returns:
    +

    The transcriptions outputs from the pipeline if no queue or batch processor is given, otherwise, +None.

    +
    +
    +
    +
    +
    +
    +transcribe.transcribe.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
    +
    +
    +
    +transcribe.transcribe.transcribe(data_path: str | Path | List[str | Path], output_directory: str | None = None, model_name: str = 'openai/whisper-tiny', device: str | None = None, use_flash_attention_2: bool | None = None, use_better_transformers: bool | None = None, assistant_model: str | None = None, max_new_tokens: int = 128, chunk_length_s: int = 30, batch_size: int = 8, spoken_language: str | None = None, translate_to_english: bool = False, speech_diarization: Dict[str, List[Tuple[float, float, str]]] | None = None, speech_diarize_per_channel: int | None = None, speaker_labels: List[str] | None = None, use_multiprocessing: bool | int = False, verbose: bool = False)[source]#
    +

    Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed +text files and a dataframe containing the following columns:

    +
      +
    • audio_file - The audio file path.

    • +
    • transcription_file - The transcribed text file name in the output directory.

    • +
    +

    The transcription is based on Huggingface’s ASR pipeline - +https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and +is tested with OpenAI’s Whisper models - https://huggingface.co/openai.

    +

    If one of the speaker diarization parameters is given (either speech_diarization or +speech_diarize_per_channel), the transcription will be written in a conversation format, where each speaker will +be written in a separate line:

    +
    speaker_1: text
    +speaker_2: text
    +speaker_1: text
    +...
    +
    +
    +
    +
    Parameters:
    +
      +
    • data_path – A directory of audio files or a single file or a list of files to transcribe.

    • +
    • output_directory – Path to a directory to save all transcribed audio files. If not given, will save +the transcribed files in a temporary directory.

    • +
    • model_name – The model name to use. Should be a model from the OpenAI’s Whisper models for +best results (for example “tiny”, “base”, “large”, etc.). See here for more +information: https://huggingface.co/openai?search_models=whisper.

    • +
    • device – The device to use for inference. If not given, will use GPU if available.

    • +
    • use_flash_attention_2

      Whether to use the Flash Attention 2 implementation. It can be used only with +one of the following GPUs: Nvidia H series and Nvidia A series. T4 support +will be available soon.

      +

      Note: If both use_flash_attention_2 and +use_better_transformers are None, the optimization will be chosen +automatically according to the available resources.

      +

    • +
    • use_better_transformers

      Whether to use the Better Transformers library to further optimize the model. +Should be used for all use cases that do not support flash attention 2.

      +

      Note: If both use_flash_attention_2 and use_better_transformers are +None, the optimization will be chosen automatically according to the +available resources.

      +

    • +
    • assistant_model

      The assistant model name to use for inference. Notice that the optimizations +(flash attention 2 and better transformers) will be applied for the assistant as +well. Should be a model from Huggingface’s distil-whisper (see here for more +information: huggingface/distil-whisper).

      +

      Note: Currently an assistant model is only usable with batch size of 1.

      +

    • +
    • max_new_tokens – The maximum number of new tokens to generate. This is used to limit the +generation length. Default is 128 tokens.

    • +
    • chunk_length_s – The audio chunk to split the audio to (in seconds). Default is 30 seconds.

    • +
    • batch_size – The batch size to use for inference. Default is 8.

    • +
    • spoken_language – Aim whisper to know what language is spoken. If None, it will try to detect +it.

    • +
    • translate_to_english – Whether to translate the transcriptions to English.

    • +
    • speech_diarization

      A speech diarization dictionary with the file names to transcribe as keys and +their diarization as value. The diarization is a list of tuples: +(start, end, speaker). An example +for a diarization dictionary:

      +
      {
      +
      +
      +
      +
      +
      ”audio_file_name”: [
      +
      {

      “start”: 0.0, +“end”: 2.0, +“speaker”: “Agent”,

      +
      +
      +

      }, +{

      +
      +

      ”start”: 2.0, +“end”: 4.0, +“speaker”: “Client”,

      +
      +
      +
      +
      +

      }

      +

      Note: The diarization must be for the entire duration of the audio file (as long +as Whisper is predicting words up until then).

      +

    • +
    • speech_diarize_per_channel – Perform speech diarization per channel. Each speaker is expected to belong to +a separate channel in the audio. Notice: This will make the transcription +slower as each channel will be transcribed separately. If a speech diarization +is passed (via the speech_diarization parameter), this parameter is +ignored.

    • +
    • speaker_labels – A list of speaker labels by channel order to use for writing the +transcription with respect to per channel speech diarization. This won’t be +used together with a given speech diarization (via the speech_diarization +parameter).

    • +
    • use_multiprocessing – Whether to use multiprocessing to transcribe the audio files. Can be either a +boolean value or an integer. If True, will use the default amount of workers +(3): 1 for transcription, 1 for batch processing and 1 for task completion (such +as speech diarization and writing to files). To control the amount of tasks +completion workers, an integer can be provided to specify the amount of workers. +If False, will use a single process. Default is False.

    • +
    • verbose – Whether to print the progress of the transcription. Default is False.

    • +
    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/static/example.html b/functions/master/transcribe/1.2.0/static/example.html new file mode 100644 index 00000000..261d8df0 --- /dev/null +++ b/functions/master/transcribe/1.2.0/static/example.html @@ -0,0 +1,605 @@ + + + + + + + +Transcribe tutorial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +

    Transcribe tutorial

    + +
    + +
    +
    + +
    +
    +

    Transcribe tutorial#

    +
    +
    +
    import tempfile
    +import mlrun
    +
    +
    +
    +
    +
    +

    Importing the transcribe function from hub#

    +

    To import the function directly from hub, use:

    +
    transcribe_fn = mlrun.import_function("hub://transcribe")
    +
    +
    +
    +
    +
    artifact_path = tempfile.mkdtemp()
    +
    +
    +
    +
    +
    +
    +
    transcribe_fn = mlrun.import_function("function.yaml")
    +
    +
    +
    +
    +
    +
    +

    Running transcribe#

    +
    +
    +
    transcribe_run = transcribe_fn.run(
    +    handler="transcribe",
    +    params={
    +        "model_name": "tiny",
    +        "input_path": "./data",
    +        "decoding_options": {"fp16": False},
    +        "output_directory": "./output",
    +    },
    +    returns=[
    +        "transcriptions: path",
    +        "transcriptions_df: dataset",
    +        {"key": "transcriptions_errors", "artifact_type": "file", "file_format": "yaml"},
    +    ],
    +    local=True,
    +    artifact_path=artifact_path,
    +)
    +
    +
    +
    +
    +
    > 2023-07-16 17:14:01,968 [info] Storing function: {'name': 'transcribe-transcribe', 'uid': 'd1384cb679bc4c178b0195d964b628a8', 'db': None}
    +> 2023-07-16 17:14:01,969 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:01,969 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:01,970 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:01,970 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:01,972 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:01,972 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:09,804 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:09,805 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:09,805 [info] Loading whisper model: 'tiny'
    +
    +
    +
    The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
    +IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
    +
    +
    +
    > 2023-07-16 17:14:10,374 [info] Model loaded.
    +
    +
    +
    Transcribing:  67%|██████▋   | 2/3 [00:02<00:01,  1.04s/file]
    +
    +
    +
    > 2023-07-16 17:14:12,556 [warning] Error in file: '/Users/Yonatan_Shelach/projects/functions/transcribe/data/error_file.txt'
    +
    +
    +
    Transcribing: 100%|██████████| 3/3 [00:02<00:00,  1.39file/s]
    +
    +
    +
    > 2023-07-16 17:14:12,566 [info] Done:
    +      audio_file transcription_file language     length  rate_of_speech
    +0  speech_01.mp3      speech_01.txt       en   2.011333        3.480278
    +1  speech_02.mp3      speech_02.txt       en  20.793500        2.548873
    +> 2023-07-16 17:14:12,596 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,597 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:12,659 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,660 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:12,671 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,672 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +
    +
    +
    
    +
    +
    +
    > 2023-07-16 17:14:12,707 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,707 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +> 2023-07-16 17:14:12,708 [warning] Could not detect path to API server, not connected to API server!
    +> 2023-07-16 17:14:12,708 [warning] MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server in order to connect
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    default
    ...b628a8
    0Jul 16 14:14:01completedtranscribe-transcribe
    kind=
    owner=Yonatan_Shelach
    host=M-QWXQJK77Q0
    model_name=tiny
    audio_files_directory=./data
    decoding_options={'fp16': False}
    output_directory=./output
    transcriptions
    transcriptions_df
    transcriptions_errors
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods
    > 2023-07-16 17:14:12,721 [info] Run execution finished: {'status': 'completed', 'name': 'transcribe-transcribe'}
    +
    +
    +
    +
    +
    +
    +
    transcribe_run.outputs
    +
    +
    +
    +
    +
    {'transcriptions': 'store://artifacts/default/transcribe-transcribe_transcriptions:d1384cb679bc4c178b0195d964b628a8',
    + 'transcriptions_df': 'store://artifacts/default/transcribe-transcribe_transcriptions_df:d1384cb679bc4c178b0195d964b628a8',
    + 'transcriptions_errors': 'store://artifacts/default/transcribe-transcribe_transcriptions_errors:d1384cb679bc4c178b0195d964b628a8'}
    +
    +
    +
    +
    +

    Notice: If connected to mlrun server, you can simply use:

    +
    df = transcribe_run.artifact("transcriptions_df")
    +
    +
    +
    +
    +
    artifact_path += f"/{transcribe_run.metadata.name}/{transcribe_run.metadata.iteration}/"
    +
    +
    +
    +
    +
    +
    +
    df = mlrun.get_dataitem(artifact_path + "transcriptions_df.parquet").as_df()
    +
    +
    +
    +
    +
    +
    +
    df.head()
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    audio_filetranscription_filelanguagelengthrate_of_speech
    0speech_01.mp3speech_01.txten2.0113333.480278
    1speech_02.mp3speech_02.txten20.7935002.548873
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/static/function.html b/functions/master/transcribe/1.2.0/static/function.html new file mode 100644 index 00000000..1f55b3ab --- /dev/null +++ b/functions/master/transcribe/1.2.0/static/function.html @@ -0,0 +1,320 @@ + + + + + + + + + + + Source + + + + +
    +        
    +kind: job
    +metadata:
    +  categories:
    +  - audio
    +  - genai
    +  tag: ''
    +  name: transcribe
    +verbose: false
    +spec:
    +  build:
    +    origin_filename: ''
    +    requirements:
    +    - transformers
    +    - tqdm
    +    - torchaudio
    +    - torch
    +    - accelerate
    +    base_image: mlrun/mlrun
    +    code_origin: ''
    +    functionSourceCode: 
    +  disable_auto_mount: false
    +  description: Transcribe audio files into text files
    +  image: ''
    +  command: ''
    +  default_handler: transcribe
    +  entry_points:
    +    do_task:
    +      name: do_task
    +      doc: Try to perform the task storing an error if occurred.
    +      lineno: 348
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +    is_failed:
    +      name: is_failed
    +      doc: Check if the task failed.
    +      lineno: 70
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - doc: Whether the task failed.
    +        type: bool
    +    get_result:
    +      name: get_result
    +      doc: 'Get the result of the task. If the task failed, the error will be returned,
    +        otherwise, the result will be the
    +
    +        text file name.'
    +      lineno: 78
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - doc: The task's result.
    +        type: Tuple[str, str]
    +    to_tuple:
    +      name: to_tuple
    +      doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing
    +        to pass in queue).
    +      lineno: 358
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - doc: The converted task.
    +        type: Tuple[str, dict]
    +    transcription_output_channels:
    +      name: transcription_output_channels
    +      doc: Get the transcription output channels.
    +      lineno: 340
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - doc: The transcription output channels.
    +        type: List[Tuple[str, dict]]
    +    process_batch:
    +      name: process_batch
    +      doc: 'Process a batch of transcriptions. Tasks related to the given batch will
    +        be created and stored in the batch
    +
    +        processor.'
    +      lineno: 575
    +      parameters:
    +      - name: self
    +      - name: batch
    +        type: List[dict]
    +        doc: The batch of transcriptions to process.
    +      has_varargs: false
    +      has_kwargs: false
    +    get_tasks:
    +      name: get_tasks
    +      doc: Get the tasks to perform.
    +      lineno: 453
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - doc: The tasks to perform.
    +        type: List[BaseTask]
    +    do_tasks:
    +      name: do_tasks
    +      doc: Perform the tasks. Should be used if no multiprocessing queue is given
    +        to a transcriber.
    +      lineno: 463
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +    get_results:
    +      name: get_results
    +      doc: Get the results of the tasks. The stored results are then cleared.
    +      lineno: 471
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - doc: The results of the tasks.
    +        type: List[Tuple[bool, Tuple[str, str]]]
    +    load:
    +      name: load
    +      doc: Load the transcriber. Must be called before transcribing.
    +      lineno: 695
    +      parameters:
    +      - name: self
    +      has_varargs: false
    +      has_kwargs: false
    +    transcribe:
    +      name: transcribe
    +      doc: "Transcribe audio files into text files and collect additional data. The\
    +        \ end result is a directory of transcribed\ntext files and a dataframe containing\
    +        \ the following columns:\n\n* audio_file - The audio file path.\n* transcription_file\
    +        \ - The transcribed text file name in the output directory.\n\nThe transcription\
    +        \ is based on Huggingface's ASR pipeline -\nhttps://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline\
    +        \ and\nis tested with OpenAI's Whisper models - https://huggingface.co/openai.\n\
    +        \nIf one of the speaker diarization parameters are given (either `speech_diarization`\
    +        \ or\n`speech_diarize_per_channel`), the transcription will be written in\
    +        \ a conversation format, where each speaker will\nbe written in a separate\
    +        \ line::\n\n    speaker_1: text\n    speaker_2: text\n    speaker_1: text\n\
    +        \    ..."
    +      lineno: 1097
    +      parameters:
    +      - name: data_path
    +        type: Union[str, Path, List[Union[str, Path]]]
    +        doc: A directory of audio files or a single file or a list of files to transcribe.
    +      - name: output_directory
    +        type: str
    +        doc: Path to a directory to save all transcribed audio files. If not given,
    +          will save the transcribed files in a temporary directory.
    +        default: null
    +      - name: model_name
    +        type: str
    +        doc: 'The model name to use. Should be a model from the OpenAI''s Whisper
    +          models for best results (for example "tiny", "base", "large", etc.). See
    +          here for more information: https://huggingface.co/openai?search_models=whisper.'
    +        default: openai/whisper-tiny
    +      - name: device
    +        type: str
    +        doc: The device to use for inference. If not given, will use GPU if available.
    +        default: null
    +      - name: use_flash_attention_2
    +        type: bool
    +        doc: 'Whether to use the Flash Attention 2 implementation. It can be used
    +          only with one of the following GPUs: Nvidia H series and Nvidia A series.
    +          T4 support will be available soon.'
    +        default: null
    +      - name: use_better_transformers
    +        type: bool
    +        doc: Whether to use the Better Transformers library to further optimize the
    +          model. Should be used for all use cases that do not support flash attention
    +          2.
    +        default: null
    +      - name: assistant_model
    +        type: str
    +        doc: 'The assistant model name to use for inference. Notice that the optimizations
    +          (flash attention 2 and better transformers) will be applied for the assistant
    +          as well. Should be a model from Huggingface''s distil-whisper (see here
    +          for more information: https://github.com/huggingface/distil-whisper).'
    +        default: null
    +      - name: max_new_tokens
    +        type: int
    +        doc: The maximum number of new tokens to generate. This is used to limit the
    +          generation length. Default is 128 tokens.
    +        default: 128
    +      - name: chunk_length_s
    +        type: int
    +        doc: The audio chunk to split the audio to (in seconds). Default is 30 seconds.
    +        default: 30
    +      - name: batch_size
    +        type: int
    +        doc: The batch size to use for inference. Default is 8.
    +        default: 8
    +      - name: spoken_language
    +        type: str
    +        doc: Aim whisper to know what language is spoken. If None, it will try to
    +          detect it.
    +        default: null
    +      - name: translate_to_english
    +        type: bool
    +        doc: Whether to translate the transcriptions to English.
    +        default: false
    +      - name: speech_diarization
    +        type: Dict[str, List[Tuple[float, float, str]]]
    +        doc: 'A speech diarization dictionary with the file names to transcribe as
    +          keys and their diarization as value. The diarization is a list of tuples:
    +          (start, end, speaker). An example for a diarization dictionary::'
    +        default: null
    +      - name: speech_diarize_per_channel
    +        type: int
    +        doc: 'Perform speech diarization per channel. Each speaker is expected to
    +          belong to a separate channel in the audio. Notice: This will make the transcription
    +          slower as each channel will be transcribed separately. If a speech diarization
    +          is passed (via the `speech_diarization` parameter), this parameter is ignored.'
    +        default: null
    +      - name: speaker_labels
    +        type: List[str]
    +        doc: A list of speaker labels by channel order to use for writing the transcription
    +          with respect to per channel speech diarization. This won't be used together
    +          with a given speech diarization (via the `speech_diarization` parameter).
    +        default: null
    +      - name: use_multiprocessing
    +        type: Union[bool, int]
    +        doc: 'Whether to use multiprocessing to transcribe the audio files. Can be
    +          either a boolean value or an integer. If `True`, will use the default amount
    +          of workers (3): 1 for transcription, 1 for batch processing and 1 for task
    +          completion (such as speech diarization and writing to files). To control
    +          the amount of tasks completion workers, an integer can be provided to specify
    +          the amount of workers. `False`, will use a single process. Default is `False`.'
    +        default: false
    +      - name: verbose
    +        type: bool
    +        doc: Whether to print the progress of the transcription. Default is `False`.
    +        default: false
    +      has_varargs: false
    +      has_kwargs: false
    +    audio_iterator:
    +      name: audio_iterator
    +      doc: ''
    +      lineno: 804
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - type: Generator[Union[dict, str], None, None]
    +    batch_iterator:
    +      name: batch_iterator
    +      doc: ''
    +      lineno: 816
    +      has_varargs: false
    +      has_kwargs: false
    +      outputs:
    +      - type: Generator[List[Union[dict, str]], None, None]
    +    open_mpi_handler:
    +      name: open_mpi_handler
    +      doc: ''
    +      lineno: 957
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      has_varargs: false
    +      has_kwargs: false
    +    decorator:
    +      name: decorator
    +      doc: ''
    +      lineno: 969
    +      parameters:
    +      - name: handler
    +      has_varargs: false
    +      has_kwargs: false
    +    wrapper:
    +      name: wrapper
    +      doc: ''
    +      lineno: 974
    +      has_varargs: false
    +      has_kwargs: true
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/static/item.html b/functions/master/transcribe/1.2.0/static/item.html new file mode 100644 index 00000000..b39372bd --- /dev/null +++ b/functions/master/transcribe/1.2.0/static/item.html @@ -0,0 +1,64 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- audio
    +- genai
    +description: Transcribe audio files into text files
    +doc: ''
    +example: transcribe.ipynb
    +generationDate: 2023-07-13:11-20
    +hidden: false
    +icon: ''
    +labels:
    +  author: yonatans
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.7.0
    +name: transcribe
    +platformVersion: 3.5.3
    +spec:
    +  filename: transcribe.py
    +  handler: transcribe
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - transformers
    +    - tqdm
    +    - torchaudio
    +    - torch
    +    - accelerate
    +url: ''
    +version: 1.2.0
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/static/source.html b/functions/master/transcribe/1.2.0/static/source.html new file mode 100644 index 00000000..3c63a460 --- /dev/null +++ b/functions/master/transcribe/1.2.0/static/source.html @@ -0,0 +1,1498 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +import operator
    +import os
    +import tempfile
    +from functools import reduce, wraps
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union
    +
    +import pandas as pd
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +from transformers import (
    +    AutomaticSpeechRecognitionPipeline,
    +    AutoModelForCausalLM,
    +    pipeline,
    +)
    +from transformers.utils import is_flash_attn_2_available
    +
    +
    +class BaseTask:
    +    """
    +    A task to write the transcription to file.
    +    """
    +
    +    def __init__(
    +        self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path
    +    ):
    +        """
    +        Initialize the task.
    +
    +        :param audio_file:           Path to the audio file that was transcribed.
    +        :param transcription_output: The transcription output from the pipeline. String means an exception was raised.
    +        :param text_file:            Path to the text file to write the transcription to.
    +        """
    +        # Store the parameters:
    +        self._audio_file = audio_file
    +        self._transcription_output = transcription_output
    +        self._text_file = text_file
    +
    +        # Prepare the error variable:
    +        self._error: str = None
    +
    +    def do_task(self):
    +        """
    +        Try to perform the task storing an error if occurred.
    +        """
    +        if isinstance(self._transcription_output, str):
    +            self._error = self._transcription_output
    +            return
    +        try:
    +            self._do_task()
    +        except Exception as exception:
    +            self._error = str(exception)
    +
    +    def is_failed(self) -> bool:
    +        """
    +        Check if the task failed.
    +
    +        :returns: Whether the task failed.
    +        """
    +        return self._error is not None
    +
    +    def get_result(self) -> Tuple[str, str]:
    +        """
    +        Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the
    +        text file name.
    +
    +        :returns: The task's result.
    +        """
    +        if self.is_failed():
    +            return self._audio_file.name, self._error
    +        return self._audio_file.name, self._text_file.name
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        return self.__class__.__name__, {
    +            "audio_file": self._audio_file,
    +            "transcription_output": self._transcription_output,
    +            "text_file": self._text_file,
    +        }
    +
    +    def _do_task(self):
    +        """
    +        Perform the task - write the transcription to the stored file path.
    +        """
    +        # Checking for no duplications:
    +        i = 1
    +        while self._text_file.exists():
    +            i += 1
    +            self._text_file = (
    +                self._text_file.parent
    +                / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}"
    +            )
    +
    +        # Make sure all directories are created:
    +        self._text_file.parent.mkdir(exist_ok=True, parents=True)
    +
    +        # Write to file:
    +        with open(self._text_file, "w") as fp:
    +            fp.write(self._transcription_output["text"])
    +
    +
    +class SpeechDiarizationTask(BaseTask):
    +    """
    +    A task to write the transcription to file with respect to a given speech diarization.
    +    """
    +
    +    class _DiarizationSegment(NamedTuple):
    +        """
    +        A speech diarization segment.
    +        """
    +
    +        start: float
    +        end: float
    +        speaker: str
    +
    +    class _WordTimestamp(NamedTuple):
    +        """
    +        A word with its start and end timestamps.
    +        """
    +
    +        start: float
    +        end: float
    +        text: str
    +
    +    def __init__(
    +        self,
    +        audio_file: Path,
    +        transcription_output: dict,
    +        text_file: Path,
    +        speech_diarization: List[Tuple[float, float, str]],
    +    ):
    +        """
    +        Initialize the task.
    +
    +        :param audio_file:           Path to the audio file that was transcribed.
    +        :param transcription_output: The transcription output from the pipeline.
    +        :param text_file:            Path to the text file to write the transcription to.
    +        :param speech_diarization:   A speech diarization as a list of tuples: (start, end, speaker).
    +        """
    +        super().__init__(
    +            audio_file=audio_file,
    +            transcription_output=transcription_output,
    +            text_file=text_file,
    +        )
    +        self._speech_diarization = speech_diarization
    +        self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None
    +        self._last_chosen_index = 0
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        task_class, task_kwargs = super().to_tuple()
    +        return task_class, {
    +            **task_kwargs,
    +            "speech_diarization": self._speech_diarization,
    +        }
    +
    +    def _do_task(self):
    +        """
    +        Perform the task - write the transcription to the stored file path with respect to the given speech diarization.
    +        """
    +        # Check if a speech diarization is given, if not, just write the transcription to file:
    +        if not self._speech_diarization:
    +            super()._do_task()
    +            return
    +
    +        # Cast the chunks to word timestamps tuples:
    +        words = [
    +            SpeechDiarizationTask._WordTimestamp(
    +                start=chunk["timestamp"][0],
    +                end=chunk["timestamp"][1],
    +                text=chunk["text"],
    +            )
    +            for chunk in self._transcription_output["chunks"]
    +        ]
    +
    +        # Cast speech diarization to segments tuples:
    +        self._segments = [
    +            SpeechDiarizationTask._DiarizationSegment(*segment)
    +            for segment in self._speech_diarization
    +        ]
    +
    +        # Try to match the Whisper model predicted timestamps to the closest diarization segment (closest diarization
    +        # segment will be the most overlapping with the word, and if there is no overlap, the closest segment to the
    +        # word):
    +        speaker = self._segments[self._last_chosen_index].speaker
    +        text = f"{speaker}:"
    +        for word in words:
    +            # Get the next diarization segment:
    +            self._get_next_segment(word=word)
    +            # Check if the segment is of the same speaker:
    +            if self._segments[self._last_chosen_index].speaker == speaker:
    +                # Collect the word:
    +                text += word.text
    +            else:
    +                # Append a newline and update the new speaker:
    +                speaker = self._segments[self._last_chosen_index].speaker
    +                text += f"\n{speaker}:{word.text}"
    +
    +        # Update the transcription output with the new text to write it to file:
    +        self._transcription_output["text"] = text
    +        super()._do_task()
    +
    +    def _get_next_segment(
    +        self,
    +        word: _WordTimestamp,
    +    ):
    +        """
    +        Get the next diarization segment the given word falls into. The `self._last_chosen_index` will be updated
    +        accordingly.
    +
    +        :param word: The word timestamp to match to the next segment.
    +        """
    +        # If the last chosen segment is the last segment, return it:
    +        if self._last_chosen_index == len(self._segments) - 1:
    +            return
    +
    +        # Get the last chosen diarization segment:
    +        last_chosen = self._segments[self._last_chosen_index]
    +
    +        # None value may appear if the word is the last word in the audio file, or it was split during inference. In
    +        # that case, we'll set the last segment:
    +        if word.end is None:
    +            self._last_chosen_index = len(self._segments) - 1
    +            return
    +
    +        # If the word ends before the last chosen segment:
    +        if word.end <= last_chosen.start:
    +            # Then it is still the closest segment
    +            return
    +
    +        # We check if it ends inside the last chosen segment:
    +        if word.end < last_chosen.end:
    +            # Then it still is the closest segment
    +            return
    +
    +        # The word ends after the segment, we need to collect all next segments up until the word ends before them:
    +        possible_segments = [self._last_chosen_index]
    +        for i in range(self._last_chosen_index + 1, len(self._segments)):
    +            if word.end > self._segments[i].end:
    +                possible_segments.append(i)
    +                continue
    +            possible_segments.append(i)
    +            break
    +
    +        # Check for the most overlapping option:
    +        best_overlap = 0
    +        most_overlapping_segment_index = None
    +        for i in possible_segments:
    +            # If the word starts before segment:
    +            if word.start <= self._segments[i].start:
    +                # If it ends before the segment, there is an overlap from the start of the segment to the end of the
    +                # word:
    +                if word.end < self._segments[i].end:
    +                    overlap = word.end - self._segments[i].start
    +                else:
    +                    # The word is wrapping the segment, the overlap is the segment's length:
    +                    overlap = self._segments[i].end - self._segments[i].start
    +            # The word starts in segment, check if the word ends in it:
    +            elif word.end < self._segments[i].end:
    +                # The overlap is the word's length:
    +                overlap = word.end - word.start
    +            # The word start in segment but ends after it, the overlap is from the word's start to the segment's end:
    +            else:
    +                overlap = self._segments[i].end - word.start
    +            # Check for new best overlap:
    +            if overlap > best_overlap:
    +                best_overlap = overlap
    +                most_overlapping_segment_index = i
    +        if most_overlapping_segment_index is not None:
    +            self._last_chosen_index = most_overlapping_segment_index
    +            return
    +
    +        # If there is no overlapping segment, return the closest segment:
    +        best_distance = None
    +        closest_segment_index = None
    +        for i in possible_segments:
    +            distance = (
    +                word.start - self._segments[i].end
    +                if word.start > self._segments[i].end
    +                else self._segments[i].start - word.end
    +            )
    +            if best_distance is None or distance < best_distance:
    +                best_distance = distance
    +                closest_segment_index = i
    +        self._last_chosen_index = closest_segment_index
    +
    +
    +class SpeechDiarizationPerChannelTask(BaseTask):
    +    """
    +    A task to write the transcription to file with respect to a given speech diarization per channel.
    +
    +    Every channel of the audio file is transcribed on its own and collected into the task as a
    +    `(speaker, transcription output)` tuple. When the task is performed, the words of all collected channels are merged
    +    by their timestamps into a single text in which every speaker turn starts on a new line prefixed with the speaker
    +    label.
    +    """
    +
    +    class _WordTimestamp(NamedTuple):
    +        """
    +        A word with its start and end timestamps and speaker label (channel the word was taken from).
    +        """
    +
    +        start: float
    +        end: float
    +        speaker: str
    +        text: str
    +
    +    def __init__(
    +        self,
    +        audio_file: Path,
    +        text_file: Path,
    +        transcription_output_channels: List[Tuple[str, dict]] = None,
    +    ):
    +        """
    +        Initialize the task.
    +
    +        :param audio_file:                    Path to the audio file that was transcribed.
    +        :param text_file:                     Path to the text file to write the transcription to.
    +        :param transcription_output_channels: Optional channels that were already collected, as
    +                                              `(speaker, transcription output)` tuples. Used to reconstruct a task
    +                                              from the output of `to_tuple`. Default is no channels.
    +        """
    +        super().__init__(
    +            audio_file=audio_file, transcription_output={}, text_file=text_file
    +        )
    +        # Copy the given channels so the task never shares its list with the caller:
    +        self._transcription_output_channels: List[Tuple[str, dict]] = (
    +            list(transcription_output_channels)
    +            if transcription_output_channels
    +            else []
    +        )
    +
    +    @property
    +    def transcription_output_channels(self) -> List[Tuple[str, dict]]:
    +        """
    +        Get the transcription output channels. The internal list is returned, so appending to it collects a new channel
    +        into the task.
    +
    +        :returns: The transcription output channels.
    +        """
    +        return self._transcription_output_channels
    +
    +    def do_task(self):
    +        """
    +        Try to perform the task storing an error if occurred. If one of the channels' output is a string (an error
    +        message collected instead of a transcription), it is stored as the task's error and the task is not performed.
    +        """
    +        for _, channel_output in self._transcription_output_channels:
    +            if isinstance(channel_output, str):
    +                # Store the error message of the failed channel (not the entire channels list):
    +                self._error = channel_output
    +                return
    +        super().do_task()
    +
    +    def to_tuple(self) -> Tuple[str, dict]:
    +        """
    +        Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).
    +
    +        :returns: The converted task.
    +        """
    +        task_class, task_kwargs = super().to_tuple()
    +        # The initializer of this task does not accept a `transcription_output`, it takes the channels instead:
    +        task_kwargs.pop("transcription_output", None)
    +        # Pass the collected channels along, otherwise the reconstructed task will have nothing to write:
    +        task_kwargs["transcription_output_channels"] = list(
    +            self._transcription_output_channels
    +        )
    +        return task_class, task_kwargs
    +
    +    def _do_task(self):
    +        """
    +        Perform the task - write the transcription to the stored file path with respect to the given speech diarization
    +        per channel.
    +        """
    +        # Cast the chunks of all channels (any amount of channels) to a single list of word timestamps tuples:
    +        words = [
    +            SpeechDiarizationPerChannelTask._WordTimestamp(
    +                start=chunk["timestamp"][0],
    +                end=chunk["timestamp"][1],
    +                speaker=speaker,
    +                text=chunk["text"],
    +            )
    +            for speaker, output in self._transcription_output_channels
    +            for chunk in output["chunks"]
    +        ]
    +
    +        # Sort the merged words by their start time (tuples are compared field by field, start comes first):
    +        words.sort()
    +
    +        # Build the transcription, starting a new line prefixed with the speaker label on each speaker change. No words
    +        # at all yield an empty text:
    +        text_parts = []
    +        current_speaker = None
    +        for word in words:
    +            if not text_parts:
    +                # First word - open the first speaker line (no newline before it):
    +                current_speaker = word.speaker
    +                text_parts.append(f"{current_speaker}:")
    +            elif word.speaker != current_speaker:
    +                # Append a newline and update the new speaker:
    +                current_speaker = word.speaker
    +                text_parts.append(f"\n{current_speaker}:")
    +            # Collect the word:
    +            text_parts.append(word.text)
    +
    +        # Update the transcription output with the new text to write it to file:
    +        self._transcription_output["text"] = "".join(text_parts)
    +        super()._do_task()
    +
    +
    +class BatchProcessor:
    +    """
    +    Turns batches of transcription outputs into tasks, one task per audio file. It is meant to work alongside the
    +    transcriber: either the tasks are pulled out (`get_tasks`) to be passed through a multiprocessing queue, or they
    +    are run in place (`do_tasks`) and their results are collected (`get_results`).
    +    """
    +
    +    def __init__(self, audio_files: List[Path], output_directory: Path):
    +        """
    +        Initialize the batch processor.
    +
    +        :param audio_files:      The list of all audio files to transcribe.
    +        :param output_directory: The output directory to write the transcriptions to.
    +        """
    +        # Batching state - the index of the next audio file that was not yet matched with a transcription, the
    +        # pending tasks and the results of the tasks that were performed:
    +        self._current_file_index = 0
    +        self._tasks: List[BaseTask] = []
    +        self._results: List[Tuple[bool, Tuple[str, str]]] = []
    +
    +        # The given parameters:
    +        self._output_directory = output_directory
    +        self._audio_files = audio_files
    +
    +    def process_batch(self, batch: List[Union[dict, str]]):
    +        """
    +        Process a batch of transcriptions. A task is created for each transcription in the batch and stored in the
    +        batch processor.
    +
    +        :param batch: The batch of transcriptions to process.
    +        """
    +        # The audio files that the outputs in the batch belong to (same order as the outputs):
    +        batch_files = self._get_current_files(batch_size=len(batch))
    +
    +        # Pair every file with its output and create its task:
    +        new_tasks = [
    +            BaseTask(
    +                audio_file=audio_file,
    +                transcription_output=transcription_output,
    +                text_file=self._output_directory / f"{audio_file.stem}.txt",
    +            )
    +            for audio_file, transcription_output in zip(batch_files, batch)
    +        ]
    +        self._tasks.extend(new_tasks)
    +
    +    def get_tasks(self) -> List[BaseTask]:
    +        """
    +        Get the tasks to perform. The stored tasks are then cleared.
    +
    +        :returns: The tasks to perform.
    +        """
    +        pending_tasks, self._tasks = self._tasks, []
    +        return pending_tasks
    +
    +    def do_tasks(self):
    +        """
    +        Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber.
    +        """
    +        for pending_task in self.get_tasks():
    +            pending_task.do_task()
    +            result = (pending_task.is_failed(), pending_task.get_result())
    +            self._results.append(result)
    +
    +    def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]:
    +        """
    +        Get the results of the tasks. The stored results are then cleared.
    +
    +        :returns: The results of the tasks.
    +        """
    +        collected_results, self._results = self._results, []
    +        return collected_results
    +
    +    def _get_current_files(self, batch_size: int) -> List[Path]:
    +        """
    +        Get the current files to process and advance the current file index past them.
    +
    +        :param batch_size: The batch size to progress the current file index.
    +
    +        :returns: The current files to process.
    +        """
    +        start_index = self._current_file_index
    +        # Never advance beyond the last audio file:
    +        stop_index = min(start_index + batch_size, len(self._audio_files))
    +        self._current_file_index = stop_index
    +        return self._audio_files[start_index:stop_index]
    +
    +
    +class SpeechDiarizationBatchProcessor(BatchProcessor):
    +    """
    +    A batch processor that creates speech diarization tasks: every transcription in a batch is paired with its audio
    +    file and with the speech diarization registered for that file. It is aimed to be working along the transcriber and
    +    can be used with multiprocessing queue or run the tasks directly using the associated methods.
    +    """
    +
    +    def __init__(
    +        self, audio_files: List[Path], output_directory: Path, speech_diarization: dict
    +    ):
    +        """
    +        Initialize the batch processor.
    +
    +        :param audio_files:        The list of all audio files to transcribe.
    +        :param output_directory:   The output directory to write the transcriptions to.
    +        :param speech_diarization: A speech diarization dictionary to pass along with each processed batch. It is
    +                                   looked up by the audio file's name.
    +        """
    +        # The parent initializer stores the audio files and the output directory:
    +        super().__init__(audio_files=audio_files, output_directory=output_directory)
    +        self._speech_diarization = speech_diarization
    +
    +    def process_batch(self, batch: List[dict]):
    +        """
    +        Process a batch of transcriptions. A speech diarization task is created for each transcription in the batch
    +        and stored in the batch processor.
    +
    +        :param batch: The batch of transcriptions to process.
    +        """
    +        # The audio files that the outputs in the batch belong to (same order as the outputs):
    +        batch_files = self._get_current_files(batch_size=len(batch))
    +
    +        # Pair every file with its output and its diarization (`None` if the file has none) and create its task:
    +        new_tasks = [
    +            SpeechDiarizationTask(
    +                audio_file=audio_file,
    +                transcription_output=transcription_output,
    +                text_file=self._output_directory / f"{audio_file.stem}.txt",
    +                speech_diarization=self._speech_diarization.get(audio_file.name),
    +            )
    +            for audio_file, transcription_output in zip(batch_files, batch)
    +        ]
    +        self._tasks.extend(new_tasks)
    +
    +
    +class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor):
    +    """
    +    A batch processor for per channel transcriptions. Every output in a batch is the transcription of a single
    +    channel, so a task is completed only once the configured amount of channels was collected into it - possibly
    +    across several batches. It is aimed to be working along the transcriber and can be used with multiprocessing queue
    +    or run the tasks directly using the associated methods.
    +    """
    +
    +    def __init__(
    +        self,
    +        audio_files: List[Path],
    +        output_directory: Path,
    +        n_channels: int,
    +        speakers: List[str],
    +    ):
    +        """
    +        Initialize the batch processor.
    +
    +        :param audio_files:      The list of all audio files to transcribe.
    +        :param output_directory: The output directory to write the transcriptions to.
    +        :param n_channels:       The number of channels in each audio file to transcribe.
    +        :param speakers:         The speakers labels to use for each channel (indexed by the channel's order).
    +        """
    +        super().__init__(audio_files=audio_files, output_directory=output_directory)
    +
    +        # The task that is currently collecting channels (kept between batches until all of its channels arrived):
    +        self._task_in_process: SpeechDiarizationPerChannelTask = None
    +
    +        # The given parameters:
    +        self._speakers = speakers
    +        self._n_channels = n_channels
    +
    +    def process_batch(self, batch: List[dict]):
    +        """
    +        Process a batch of per channel transcriptions. The channels are collected into tasks and every task that got
    +        all of its channels is stored in the batch processor.
    +
    +        :param batch: The batch of transcriptions to process.
    +        """
    +        for channel_output in batch:
    +            # Continue with the task in process or open a new one for the next audio file:
    +            task = self._task_in_process
    +            if not task:
    +                audio_file = self._audio_files[self._current_file_index]
    +                task = SpeechDiarizationPerChannelTask(
    +                    audio_file=audio_file,
    +                    text_file=self._output_directory / f"{audio_file.stem}.txt",
    +                )
    +                self._task_in_process = task
    +
    +            # The amount of channels collected so far is the index of the current channel's speaker:
    +            channels = task.transcription_output_channels
    +            channels.append((self._speakers[len(channels)], channel_output))
    +
    +            # Keep collecting until all the channels of the task are covered:
    +            if len(channels) != self._n_channels:
    +                continue
    +
    +            # The task is fully covered - store it and move on to the next audio file:
    +            self._tasks.append(task)
    +            self._current_file_index += 1
    +            self._task_in_process = None
    +
    +
    +class Transcriber:
    +    """
    +    A transcription wrapper for the Huggingface's ASR pipeline -
    +    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to
    +    use with OpenAI's Whisper models - https://huggingface.co/openai.
    +    """
    +
    +    def __init__(
    +        self,
    +        model_name: str,
    +        device: str = None,
    +        use_flash_attention_2: bool = None,
    +        use_better_transformers: bool = None,
    +        assistant_model: str = None,
    +        max_new_tokens: int = 128,
    +        chunk_length_s: int = 30,
    +        batch_size: int = 2,
    +        spoken_language: str = None,
    +        translate_to_english: bool = False,
    +        return_timestamps: Union[bool, Literal["word"]] = False,
    +        per_channel_transcription: int = 0,
    +    ):
    +        """
    +        Initialize the transcriber.
    +
    +        :param model_name:                The model name to use. Should be a model from the OpenAI's Whisper models for
    +                                          best results (for example "tiny", "base", "large", etc.).
    +        :param device:                    The device to use for inference. If not given, will use GPU if available.
    +        :param use_flash_attention_2:     Whether to use the Flash Attention 2 implementation. It can be used only with
    +                                          one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
    +                                          will be available soon.
    +
    +                                          Note: If both `use_flash_attention_2` and
    +                                          `use_better_transformers` are `None`, the optimization will be chosen
    +                                          automatically according to the available resources.
    +
    +        :param use_better_transformers:   Whether to use the Better Transformers library to further optimize the model.
    +                                          Should be used for all use cases that do not support flash attention 2.
    +
    +                                          Note: If both `use_flash_attention_2` and `use_better_transformers` are
    +                                          `None`, the optimization will be chosen automatically according to the
    +                                          available resources.
    +        :param assistant_model:           The assistant model name to use for inference. Notice that the optimizations
    +                                          (flash attention 2 and better transformers) will be applied for the assistant
    +                                          as well. Should be a model from Huggingface's distil-whisper (see here for
    +                                          more information: https://github.com/huggingface/distil-whisper).
    +        :param max_new_tokens:            The maximum number of new tokens to generate. This is used to limit the
    +                                          generation length. Default is 128 tokens.
    +        :param chunk_length_s:            The audio chunk to split the audio to (in seconds). Default is 30 seconds.
    +        :param batch_size:                The batch size to use for inference. Default is 2.
    +        :param spoken_language:           Aim whisper to know what language is spoken. If None, it will try to detect it
    +                                          for each chunk.
    +        :param translate_to_english:      Whether to translate the transcriptions to English. Default is False.
    +        :param return_timestamps:         Whether to return the timestamps of the words. If "word", will return the
    +                                          timestamps of each word. If True will return the timestamps of each chunk.
    +                                          Default is False. Aimed to be used for speech diarization.
    +        :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel
    +                                          transcription, pass the number of channels expected for each audio file here.
    +                                          0 means regular transcription (merge channels).
    +
    +                                          Note: If `per_channel_transcription` is not 0, `batch_size` must be treated to
    +                                          be the number of channels and not audio files. Aimed to be used for per
    +                                          channel speech diarization.
    +        """
    +        # Store loading parameters:
    +        self._model_name = model_name
    +        self._device = device
    +        self._use_flash_attention_2 = use_flash_attention_2
    +        self._use_better_transformers = use_better_transformers
    +        self._max_new_tokens = max_new_tokens
    +        self._chunk_length_s = chunk_length_s
    +        self._batch_size = batch_size
    +        self._return_timestamps = return_timestamps
    +        self._per_channel_transcription = per_channel_transcription
    +
    +        # Store generation parameters:
    +        self._assistant_model = assistant_model
    +        self._spoken_language = spoken_language
    +        self._translate_to_english = translate_to_english
    +
    +        # Prepare the transcription objects (initialized in `load`):
    +        self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None
    +        self._generate_kwargs: dict = None
    +
    +    def load(self):
    +        """
    +        Load the transcriber. Must be called before transcribing. The device and the attention optimization are
    +        resolved, and then the speech recognition pipeline, the generation kwargs and the assistant model (if one was
    +        given) are initialized.
    +        """
    +        # Set the device and data type to use. A device given by the user is always respected, otherwise prefer GPU if
    +        # available. Note the parentheses: a conditional expression binds weaker than `or`, so without them a given
    +        # device was ignored whenever cuda was not available:
    +        device = torch.device(
    +            self._device or ("cuda" if torch.cuda.is_available() else "cpu")
    +        )
    +        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
    +
    +        # Choose the optimization to use (in case the user did not specify any):
    +        if (
    +            self._use_flash_attention_2 is None
    +            and self._use_better_transformers is None
    +        ):
    +            # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to architecture
    +            # here: https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla):
    +            if device.type == "cuda" and is_flash_attn_2_available():
    +                cuda_device_name = torch.cuda.get_device_properties(device).name
    +                if any(
    +                    cuda_device_name.startswith(gpu_name)
    +                    for gpu_name in [
    +                        "NVIDIA A",  # For Ampere architecture (e.g. A10, A30, A100)
    +                        "NVIDIA H",  # For Hopper architecture (e.g. H100)
    +                        "NVIDIA L",  # For Ada Lovelace architecture (e.g. L4, L40)
    +                        "NVIDIA RTX 30",  # For Ampere architecture (RTX 30 series)
    +                        "NVIDIA RTX 40",  # For Ada Lovelace architecture (RTX 40 series)
    +                        "NVIDIA RTX 50",  # For Blackwell architecture (RTX 50 series)
    +                        # Will be supported soon according to FlashAttention GitHub repo:
    +                        # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
    +                        # "NVIDIA T4",  # For Turing architecture (only T4)
    +                        # "NVIDIA RTX 20",  # For Turing architecture (RTX 20 series)
    +                    ]
    +                ):
    +                    self._use_flash_attention_2 = True
    +                else:
    +                    self._use_better_transformers = True
    +            else:
    +                self._use_better_transformers = True
    +
    +        # Build the optimizations kwargs:
    +        model_kwargs = {
    +            "low_cpu_mem_usage": True,
    +            "use_safetensors": True,
    +        }
    +        if self._use_flash_attention_2:
    +            if _LOGGER:
    +                _LOGGER.info(
    +                    "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via "
    +                    "`pip install -U flash-attn --no-build-isolation`"
    +                )
    +            model_kwargs["attn_implementation"] = "flash_attention_2"
    +        elif self._use_better_transformers:
    +            if _LOGGER:
    +                _LOGGER.info(
    +                    "Using BetterTransformers optimization - make sure the `optimum` package is installed via "
    +                    "`pip install -U optimum`"
    +                )
    +            model_kwargs["attn_implementation"] = "sdpa"
    +
    +        # Initialize the speech recognition pipeline (a copy of the kwargs is passed as they are reused below for the
    +        # assistant model):
    +        self._transcription_pipeline = pipeline(
    +            task="automatic-speech-recognition",
    +            model=self._model_name,
    +            model_kwargs=model_kwargs.copy(),
    +            batch_size=self._batch_size,
    +            max_new_tokens=self._max_new_tokens,
    +            chunk_length_s=self._chunk_length_s,
    +            return_timestamps=self._return_timestamps,
    +            torch_dtype=torch_dtype,
    +            device=device,
    +        )
    +
    +        # Prepare the generation kwargs:
    +        self._generate_kwargs = {
    +            "language": self._spoken_language,
    +            "task": "translate" if self._translate_to_english else "transcribe",
    +        }
    +
    +        # Initialize the assistant model (if needed):
    +        if self._assistant_model:
    +            assistant_model = AutoModelForCausalLM.from_pretrained(
    +                self._assistant_model, torch_dtype=torch_dtype, **model_kwargs
    +            )
    +            assistant_model.to(device)
    +            self._generate_kwargs["assistant_model"] = assistant_model
    +
    +    def transcribe(
    +        self,
    +        audio_files: List[Path],
    +        batch_processor: BatchProcessor = None,
    +        batches_queue: Queue = None,
    +        verbose: bool = False,
    +    ) -> Union[List[List[dict]], None]:
    +        """
    +        Transcribe the given audio files. The transcriptions will be sent to a batch processor or a queue for further
    +        processing like writing to text files. If both are given, the batch processor is used for the batches and the
    +        queue only receives the stop mark. A batch that failed to be transcribed is replaced by its error message (one
    +        per input in the batch).
    +
    +        :param audio_files:     The audio files to transcribe.
    +        :param batch_processor: A batch processor.
    +        :param batches_queue:   A multiprocessing queue to put the batches in.
    +        :param verbose:         Whether to show a progress bar. Default is False.
    +
    +        :returns: The transcriptions outputs from the pipeline if no queue or batch processor is given. If a batch
    +                  processor is given, `None` is returned. If only a queue is given, the returned list is empty as the
    +                  outputs were all sent to the queue.
    +        """
    +        # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with
    +        # Huggingface's pipelines as they preload each input while inference is running):
    +        def audio_iterator() -> Generator[Union[dict, str], None, None]:
    +            if self._per_channel_transcription:
    +                for audio_file in audio_files:
    +                    audio, sampling_rate = torchaudio.load(str(audio_file))
    +                    audio = audio.numpy()
    +                    for channel in audio:
    +                        yield {"raw": channel, "sampling_rate": sampling_rate}
    +            else:
    +                for audio_file in audio_files:
    +                    yield str(audio_file)
    +
    +        # Create a batch iterator:
    +        def batch_iterator() -> Generator[List[Union[dict, str]], None, None]:
    +            batch = []
    +            for audio in audio_iterator():
    +                batch.append(audio)
    +                if len(batch) == self._batch_size:
    +                    yield batch
    +                    batch = []
    +            if batch:
    +                yield batch
    +
    +        # Prepare the outputs list to be returned:
    +        outputs = []
    +
    +        # Calculate the amount of iterations for the progress bar. In per channel transcription every channel is an
    +        # input of its own (each file is expected to have the configured number of channels):
    +        n_inputs = len(audio_files) * (self._per_channel_transcription or 1)
    +        if self._batch_size > 1:
    +            # Iterating over batches - round the amount of batches up to include the last partial batch:
    +            inputs_iterator = batch_iterator()
    +            total = (n_inputs + self._batch_size - 1) // self._batch_size
    +        else:
    +            # Iterating over the inputs one by one:
    +            inputs_iterator = audio_iterator()
    +            total = n_inputs
    +
    +        # Infer through the pipeline:
    +        for input_batch in tqdm(
    +            inputs_iterator,
    +            desc="Transcribing",
    +            unit="channel" if self._per_channel_transcription else "audio file",
    +            total=total,
    +            disable=not verbose,
    +        ):
    +            # Infer:
    +            try:
    +                output_batch = self._transcription_pipeline(
    +                    input_batch,
    +                    generate_kwargs=self._generate_kwargs,
    +                )
    +            except Exception as exception:
    +                # Collect the exception:
    +                output_batch = str(exception)
    +                # Align to batch size:
    +                output_batch = (
    +                    [output_batch] * len(input_batch)
    +                    if isinstance(input_batch, list)
    +                    else [output_batch]
    +                )
    +            # To align with batching, if batch size is 1, wrap the output with a list:
    +            if isinstance(output_batch, dict):
    +                output_batch = [output_batch]
    +            # If a batch processor is given, process the batch:
    +            if batch_processor:
    +                # Process it directly:
    +                batch_processor.process_batch(batch=output_batch)
    +                batch_processor.do_tasks()
    +            elif batches_queue:
    +                # Otherwise, queue the batch:
    +                batches_queue.put(output_batch)
    +            else:
    +                # Otherwise, collect the output as is without processing:
    +                outputs.append(output_batch)
    +
    +        # If given a multiprocessing queue, mark the end of the batches:
    +        if batches_queue:
    +            batches_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +        return outputs if not batch_processor else None
    +
    +
    +#: The value to send into multiprocessing queues to stop the process. Producers put it in a queue once they are done
    +#: and the consuming loops break when they receive it:
    +_MULTIPROCESSING_STOP_MARK = "STOP"
    +
    +
    +def _multiprocessing_process_batches(
    +    batch_processor: BatchProcessor,
    +    batches_queue: Queue,
    +    tasks_queue: Queue,
    +    n_task_completers: int,
    +):
    +    """
    +    Consume batches from the given batches queue, turn them into tasks using the batch processor and put the tasks
    +    (converted to tuples) in the given tasks queue. The function stops once the stop mark is received from the batches
    +    queue. It is aimed to be used with multiprocessing as a process.
    +
    +    :param batch_processor:   A batch processor to process the batches.
    +    :param batches_queue:     A queue to get the batches from.
    +    :param tasks_queue:       A queue to put the tasks in.
    +    :param n_task_completers: The number of task completers (processes that run the `_multiprocessing_complete_tasks`
    +                              function). A stop mark will be sent to the tasks queue for each task completer.
    +    """
    +    # Keep getting batches until the stop mark is received:
    +    for batch in iter(batches_queue.get, _MULTIPROCESSING_STOP_MARK):
    +        # Create the tasks of the batch:
    +        batch_processor.process_batch(batch=batch)
    +
    +        # Queue the created tasks as tuples to be reconstructed by the task completers:
    +        for ready_task in batch_processor.get_tasks():
    +            tasks_queue.put(ready_task.to_tuple())
    +
    +    # Mark the end of the batches, one stop mark per task completer:
    +    completers_left = n_task_completers
    +    while completers_left > 0:
    +        tasks_queue.put(_MULTIPROCESSING_STOP_MARK)
    +        completers_left -= 1
    +
    +
    +def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue):
    +    """
    +    Consume tasks (as tuples) from the given tasks queue, reconstruct and perform them, and put their results in the
    +    given results queue. The function stops once the stop mark is received from the tasks queue, passing a stop mark on
    +    to the results queue. It is aimed to be used with multiprocessing as a process.
    +
    +    :param tasks_queue:   A queue to get the tasks from.
    +    :param results_queue: A queue to put the results in.
    +    """
    +    # Map the known task classes by their names to reconstruct the tasks from their tuples:
    +    task_classes = {
    +        task_type.__name__: task_type
    +        for task_type in (
    +            BaseTask,
    +            SpeechDiarizationTask,
    +            SpeechDiarizationPerChannelTask,
    +        )
    +    }
    +
    +    # Keep getting tasks until the stop mark is received:
    +    for task_tuple in iter(tasks_queue.get, _MULTIPROCESSING_STOP_MARK):
    +        # Reconstruct the task from its class name and keyword arguments:
    +        class_name, init_kwargs = task_tuple
    +        reconstructed_task = task_classes[class_name](**init_kwargs)
    +
    +        # Complete the task and report its result:
    +        reconstructed_task.do_task()
    +        outcome = (reconstructed_task.is_failed(), reconstructed_task.get_result())
    +        results_queue.put(outcome)
    +
    +    # Mark the end of the tasks:
    +    results_queue.put(_MULTIPROCESSING_STOP_MARK)
    +
    +
    +# Get the global logger (the root logger by default, `open_mpi_handler` replaces it with the logger of the MLRun
    +# context when one is available):
    +_LOGGER = logging.getLogger()
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    # Check if MLRun is available, set the global logger to MLRun's:
    +    if context:
    +        _LOGGER = context.logger
    +
    +    def decorator(handler):
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                if isinstance(input_argument, str):
    +                    input_argument = _get_audio_files(
    +                        data_path=Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
    +                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Save the output directory of this worker:
    +            output_directory = Path(output[0])
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +
    +            # Join the data from all workers:
    +            if rank == 0:
    +                context.logger.info("Collecting data from workers to root worker.")
    +
    +                # Check if there are different output directories:
    +                output_directories = set([Path(out_dir) for out_dir, _, _ in output])
    +                for r in range(1, size):
    +                    # True means the other workers should pass their files to the root worker (rank 0):
    +                    comm.send(len(output_directories) != 1, dest=r)
    +
    +                # If there are different output directories, listen to the other workers:
    +                if len(output_directories) != 1:
    +                    # Collect the files from the other workers:
    +                    files = []
    +                    for r in range(1, size):
    +                        files.extend(comm.recv(source=r))
    +                    # Write the files to the root worker's output directory:
    +                    for file_name, file_content in files:
    +                        with open(output_directory / file_name, "w") as f:
    +                            f.write(file_content)
    +
    +                # Concatenate the dataframes:
    +                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)
    +
    +                # Concatenate the errors dictionaries:
    +                errors_dictionary = reduce(
    +                    operator.ior, [err for _, _, err in output], {}
    +                )
    +
    +                return str(output_directory), dataframe, errors_dictionary
    +
    +            # Listen to rank 0 to see if there are different output directories and this rank need to send its files to
    +            # it:
    +            if comm.recv(source=0):
    +                files = []
    +                for file in os.listdir(output_directory):
    +                    with open(output_directory / file, "r") as f:
    +                        files.append((file, f.read()))
    +                comm.send(files, dest=0)
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def transcribe(
    +    # Input / Output kwargs:
    +    data_path: Union[str, Path, List[Union[str, Path]]],
    +    output_directory: str = None,
    +    # Model loading kwargs:
    +    model_name: str = "openai/whisper-tiny",
    +    device: str = None,
    +    use_flash_attention_2: bool = None,
    +    use_better_transformers: bool = None,
    +    # Generation kwargs:
    +    assistant_model: str = None,
    +    max_new_tokens: int = 128,
    +    chunk_length_s: int = 30,
    +    batch_size: int = 8,
    +    spoken_language: str = None,
    +    translate_to_english: bool = False,
    +    # Diarization kwargs:
    +    speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None,
    +    speech_diarize_per_channel: int = None,
    +    speaker_labels: List[str] = None,
    +    # Other kwargs:
    +    use_multiprocessing: Union[bool, int] = False,
    +    verbose: bool = False,
    +):
    +    """
    +    Transcribe audio files into text files and collect additional data. The end result is a directory of transcribed
    +    text files and a dataframe containing the following columns:
    +
    +    * audio_file - The audio file path.
    +    * transcription_file - The transcribed text file name in the output directory.
    +
    +    The transcription is based on Huggingface's ASR pipeline -
    +    https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline and
    +    is tested with OpenAI's Whisper models - https://huggingface.co/openai.
    +
    +    If one of the speaker diarization parameters are given (either `speech_diarization` or
    +    `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker will
    +    be written in a separate line::
    +
    +        speaker_1: text
    +        speaker_2: text
    +        speaker_1: text
    +        ...
    +
    +    :param data_path:                  A directory of audio files or a single file or a list of files to transcribe.
    +    :param output_directory:           Path to a directory to save all transcribed audio files. If not given, will save
    +                                       the transcribed files in a temporary directory.
    +    :param model_name:                 The model name to use. Should be a model from the OpenAI's Whisper models for
    +                                       best results (for example "tiny", "base", "large", etc.). See here for more
    +                                       information: https://huggingface.co/openai?search_models=whisper.
    +    :param device:                     The device to use for inference. If not given, will use GPU if available.
    +    :param use_flash_attention_2:      Whether to use the Flash Attention 2 implementation. It can be used only with
    +                                       one of the following GPUs: Nvidia H series and Nvidia A series. T4 support
    +                                       will be available soon.
    +
    +                                       Note: If both `use_flash_attention_2` and
    +                                       `use_better_transformers` are `None`, the optimization will be chosen
    +                                       automatically according to the available resources.
    +
    +    :param use_better_transformers:    Whether to use the Better Transformers library to further optimize the model.
    +                                       Should be used for all use cases that do not support flash attention 2.
    +
    +                                       Note: If both `use_flash_attention_2` and `use_better_transformers` are
    +                                       `None`, the optimization will be chosen automatically according to the
    +                                       available resources.
    +    :param assistant_model:            The assistant model name to use for inference. Notice that the optimizations
    +                                       (flash attention 2 and better transformers) will be applied for the assistant as
    +                                       well. Should be a model from Huggingface's distil-whisper (see here for more
    +                                       information: https://github.com/huggingface/distil-whisper).
    +
    +                                       Note: Currently an assistant model is only usable with batch size of 1.
    +    :param max_new_tokens:             The maximum number of new tokens to generate. This is used to limit the
    +                                       generation length. Default is 128 tokens.
    +    :param chunk_length_s:             The audio chunk to split the audio to (in seconds). Default is 30 seconds.
    +    :param batch_size:                 The batch size to use for inference. Default is 2.
    +    :param spoken_language:            Aim whisper to know what language is spoken. If None, it will try to detect
    +                                       it.
    +    :param translate_to_english:       Whether to translate the transcriptions to English.
    +    :param speech_diarization:         A speech diarization dictionary with the file names to transcribe as keys and
    +                                       their diarization as value. The diarization is a list of tuples:
    +                                       (start, end, speaker). An example
    +                                       for a diarization dictionary::
    +
    +                                       {
    +                                           "audio_file_name": [
    +                                               {
    +                                                   "start": 0.0,
    +                                                   "end": 2.0,
    +                                                   "speaker": "Agent",
    +                                               },
    +                                               {
    +                                                   "start": 2.0,
    +                                                   "end": 4.0,
    +                                                   "speaker": "Client",
    +                                               },
    +                                               ...
    +                                           ],
    +                                           ...
    +                                       }
    +
    +                                       Note: The diarization must be for the entire duration of the audio file (as long
    +                                       as Whisper is predicting words up until then.
    +    :param speech_diarize_per_channel: Perform speech diarization per channel. Each speaker is expected to belong to
    +                                       a separate channel in the audio. Notice: This will make the transcription
    +                                       slower as each channel wil be transcribed separatly. If a speech diarization
    +                                       is passed (via the `speech_diarization` parameter), this parameter is
    +                                       ignored.
    +    :param speaker_labels:             A list of speaker labels by channel order to use for writing the
    +                                       transcription with respect to per channel speech diarization. This won't be
    +                                       used together with a given speech diarization (via the `speech_diarization`
    +                                       parameter).
    +    :param use_multiprocessing:        Whether to use multiprocessing to transcribe the audio files. Can be either a
    +                                       boolean value or an integer. If `True`, will use the default amount of workers
    +                                       (3): 1 for transcription, 1 for batch processing and 1 for task completion (such
    +                                       as speech diarization and writing to files). To control the amount of tasks
    +                                       completion workers, an integer can be provided to specify the amount of workers.
    +                                       `False`, will use a single process. Default is `False`.
    +    :param verbose:                    Whether to print the progress of the transcription. Default is `False`.
    +    """
    +    global _LOGGER
    +
    +    # Get the input audio files to transcribe:
    +    if verbose:
    +        _LOGGER.info("Collecting audio files.")
    +    audio_files = _get_audio_files(data_path=data_path)
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(audio_files)} audio files.")
    +
    +    # Get the output directory:
    +    if output_directory is None:
    +        if verbose:
    +            _LOGGER.info("No output directory given, using temporary directory.")
    +        output_directory = tempfile.mkdtemp()
    +    output_directory = Path(output_directory).absolute()
    +    output_directory.mkdir(exist_ok=True, parents=True)
    +    if verbose:
    +        _LOGGER.info(f"Transcriptions will be saved to: {output_directory}")
    +
    +    # Initialize a batch processor according to user requirements (no speech diarization, given speech diarization,
    +    # speech diarization per channel):
    +    if speech_diarization:
    +        batch_processor = SpeechDiarizationBatchProcessor(
    +            audio_files=audio_files,
    +            output_directory=output_directory,
    +            speech_diarization=speech_diarization,
    +        )
    +    elif speech_diarize_per_channel:
    +        batch_processor = PerChannelSpeechDiarizationBatchProcessor(
    +            audio_files=audio_files,
    +            output_directory=output_directory,
    +            n_channels=speech_diarize_per_channel,
    +            speakers=speaker_labels,
    +        )
    +    else:
    +        batch_processor = BatchProcessor(
    +            audio_files=audio_files,
    +            output_directory=output_directory,
    +        )
    +
    +    # Initialize the transcription pipeline:
    +    transcriber = Transcriber(
    +        device=device,
    +        use_flash_attention_2=use_flash_attention_2,
    +        use_better_transformers=use_better_transformers,
    +        assistant_model=assistant_model,
    +        model_name=model_name,
    +        max_new_tokens=max_new_tokens,
    +        chunk_length_s=chunk_length_s,
    +        batch_size=batch_size,
    +        return_timestamps=(
    +            "word"
    +            if speech_diarization is not None or speech_diarize_per_channel is not None
    +            else False
    +        ),
    +        per_channel_transcription=speech_diarize_per_channel or 0,
    +        spoken_language=spoken_language,
    +        translate_to_english=translate_to_english,
    +    )
    +
    +    # Run the transcription:
    +    if use_multiprocessing:
    +        results = _parallel_run(
    +            n_workers=use_multiprocessing
    +            if isinstance(use_multiprocessing, int)
    +            else 1,
    +            audio_files=audio_files,
    +            batch_processor=batch_processor,
    +            transcriber=transcriber,
    +            verbose=verbose,
    +        )
    +    else:
    +        results = _run(
    +            audio_files=audio_files,
    +            batch_processor=batch_processor,
    +            transcriber=transcriber,
    +            verbose=verbose,
    +        )
    +
    +    # Process the results:
    +    if verbose:
    +        _LOGGER.info("Summarizing the results.")
    +    successes = []
    +    errors = {}
    +    for is_error, result in results:
    +        if is_error:
    +            errors[result[0]] = result[1]
    +        else:
    +            successes.append(result)
    +    successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"])
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(audio_files)})\n"
    +            f"Transcriptions summary:\n"
    +            f"{successes.head()}"
    +        )
    +
    +    return str(output_directory), successes, errors
    +
    +
    +def _get_audio_files(
    +    data_path: Union[Path, str, list],
    +) -> List[Path]:
    +    """
    +    Get the audio files to transcribe. If a path to a directory is given, all files in the directory will be collected.
    +
    +    :param data_path: The data path to collect the audio files from.
    +
    +    :returns: The audio files list.
    +    """
    +    # Check if given a list of paths:
    +    if isinstance(data_path, list):
    +        audio_files = []
    +        for path in data_path:
    +            audio_files.extend(_get_audio_files(data_path=path))
    +        return audio_files
    +
    +    # Check if given a single string path to cast it to a `pathlib.Path`:
    +    if isinstance(data_path, str):
    +        data_path = Path(data_path).absolute()
    +
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory:
    +        audio_files = list(data_path.glob("*.*"))
    +    elif data_path.is_file():
    +        audio_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
    +            f"file. Given: {str(data_path)} "
    +        )
    +
    +    return audio_files
    +
    +
    +def _run(
    +    audio_files: List[Path],
    +    batch_processor: BatchProcessor,
    +    transcriber: Transcriber,
    +    verbose: bool,
    +) -> List[Tuple[bool, Tuple[str, str]]]:
    +    """
    +    Run the transcription without multiprocessing.
    +
    +    :param audio_files:     The audio files to transcribe.
    +    :param batch_processor: The batch processor to use.
    +    :param transcriber:     The transcriber to use.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results.
    +    """
    +    # Load the transcription pipeline:
    +    if verbose:
    +        _LOGGER.info(f"Loading the transcription pipeline.")
    +    transcriber.load()
    +    if verbose:
    +        _LOGGER.info("Transcription pipeline loaded.")
    +
    +    # Transcribe the files:
    +    transcriber.transcribe(
    +        audio_files=audio_files,
    +        batch_processor=batch_processor,
    +        verbose=verbose,
    +    )
    +
    +    # Return the results:
    +    return batch_processor.get_results()
    +
    +
    +def _parallel_run(
    +    n_workers: int,
    +    audio_files: List[Path],
    +    batch_processor: BatchProcessor,
    +    transcriber: Transcriber,
    +    verbose: bool,
    +):
    +    """
    +    Run the transcription with multiprocessing.
    +
    +    :param n_workers:       The amount of workers to use as task completers.
    +    :param audio_files:     The audio files to transcribe.
    +    :param batch_processor: The batch processor to use.
    +    :param transcriber:     The transcriber to use.
    +    :param verbose:         Verbosity.
    +
    +    :returns: The collected results.
    +    """
    +    # Initialize the multiprocessing queues:
    +    batches_queue = Queue()
    +    tasks_queue = Queue()
    +    results_queue = Queue()
    +
    +    # Initialize the multiprocessing processes:
    +    batch_processing_process = Process(
    +        target=_multiprocessing_process_batches,
    +        kwargs={
    +            "batch_processor": batch_processor,
    +            "batches_queue": batches_queue,
    +            "tasks_queue": tasks_queue,
    +            "n_task_completers": n_workers,
    +        },
    +    )
    +    task_completion_processes = [
    +        Process(
    +            target=_multiprocessing_complete_tasks,
    +            kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue},
    +        )
    +        for _ in range(n_workers)
    +    ]
    +
    +    # Start the multiprocessing processes:
    +    batch_processing_process.start()
    +    for p in task_completion_processes:
    +        p.start()
    +
    +    # Load the transcription pipeline:
    +    if verbose:
    +        _LOGGER.info(f"Loading the transcription pipeline.")
    +    transcriber.load()
    +    if verbose:
    +        _LOGGER.info("Transcription pipeline loaded.")
    +
    +    # Transcribe the files:
    +    transcriber.transcribe(
    +        audio_files=audio_files, batches_queue=batches_queue, verbose=verbose
    +    )
    +
    +    # Collect the results:
    +    results = []
    +    stop_marks_counter = 0
    +    while True:
    +        # Get a result from the queue:
    +        result: Tuple[bool, Tuple[str, str]] = results_queue.get()
    +        if result == _MULTIPROCESSING_STOP_MARK:
    +            stop_marks_counter += 1
    +            if stop_marks_counter == n_workers:
    +                break
    +        else:
    +            # Collect the result:
    +            results.append(result)
    +
    +    # Wait for the processes to finish:
    +    results_queue.empty()
    +    batch_processing_process.join()
    +    for p in task_completion_processes:
    +        p.join()
    +
    +    return results
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/transcribe/1.2.0/static/transcribe.html b/functions/master/transcribe/1.2.0/static/transcribe.html new file mode 100644 index 00000000..7f277c56 --- /dev/null +++ b/functions/master/transcribe/1.2.0/static/transcribe.html @@ -0,0 +1,1708 @@ + + + + + + + +transcribe.transcribe + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for transcribe.transcribe

    +# Copyright 2024 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +import logging
    +import operator
    +import os
    +import tempfile
    +from functools import reduce, wraps
    +from multiprocessing import Process, Queue
    +from pathlib import Path
    +from typing import Any, Dict, Generator, List, Literal, NamedTuple, Tuple, Union
    +
    +import pandas as pd
    +import torch
    +import torchaudio
    +from tqdm import tqdm
    +from transformers import (
    +    AutomaticSpeechRecognitionPipeline,
    +    AutoModelForCausalLM,
    +    pipeline,
    +)
    +from transformers.utils import is_flash_attn_2_available
    +
    +
    +
    +[docs] +class BaseTask: + """ + A task to write the transcription to file. + """ + + def __init__( + self, audio_file: Path, transcription_output: Union[dict, str], text_file: Path + ): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param transcription_output: The transcription output from the pipeline. String means an exception was raised. + :param text_file: Path to the text file to write the transcription to. + """ + # Store the parameters: + self._audio_file = audio_file + self._transcription_output = transcription_output + self._text_file = text_file + + # Prepare the error variable: + self._error: str = None + +
    +[docs] + def do_task(self): + """ + Try to perform the task storing an error if occurred. + """ + if isinstance(self._transcription_output, str): + self._error = self._transcription_output + return + try: + self._do_task() + except Exception as exception: + self._error = str(exception)
    + + +
    +[docs] + def is_failed(self) -> bool: + """ + Check if the task failed. + + :returns: Whether the task failed. + """ + return self._error is not None
    + + +
    +[docs] + def get_result(self) -> Tuple[str, str]: + """ + Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the + text file name. + + :returns: The task's result. + """ + if self.is_failed(): + return self._audio_file.name, self._error + return self._audio_file.name, self._text_file.name
    + + +
    +[docs] + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + return self.__class__.__name__, { + "audio_file": self._audio_file, + "transcription_output": self._transcription_output, + "text_file": self._text_file, + }
    + + + def _do_task(self): + """ + Perform the task - write the transcription to the stored file path. + """ + # Checking for no duplications: + i = 1 + while self._text_file.exists(): + i += 1 + self._text_file = ( + self._text_file.parent + / f"{self._text_file.stem.rsplit('_', 1)[0]}_{i}{self._text_file.suffix}" + ) + + # Make sure all directories are created: + self._text_file.parent.mkdir(exist_ok=True, parents=True) + + # Write to file: + with open(self._text_file, "w") as fp: + fp.write(self._transcription_output["text"])
    + + + +
    +[docs] +class SpeechDiarizationTask(BaseTask): + """ + A task to write the transcription to file with respect to a given speech diarization. + """ + + class _DiarizationSegment(NamedTuple): + """ + A speech diarization segment. + """ + + start: float + end: float + speaker: str + + class _WordTimestamp(NamedTuple): + """ + A word with its start and end timestamps. + """ + + start: float + end: float + text: str + + def __init__( + self, + audio_file: Path, + transcription_output: dict, + text_file: Path, + speech_diarization: List[Tuple[float, float, str]], + ): + """ + Initialize the task. + + :param audio_file: Path to the audio file that was transcribed. + :param transcription_output: The transcription output from the pipeline. + :param text_file: Path to the text file to write the transcription to. + :param speech_diarization: A speech diarization as a list of tuples: (start, end, speaker). + """ + super().__init__( + audio_file=audio_file, + transcription_output=transcription_output, + text_file=text_file, + ) + self._speech_diarization = speech_diarization + self._segments: List[SpeechDiarizationTask._DiarizationSegment] = None + self._last_chosen_index = 0 + +
    +[docs] + def to_tuple(self) -> Tuple[str, dict]: + """ + Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + + :returns: The converted task. + """ + task_class, task_kwargs = super().to_tuple() + return task_class, { + **task_kwargs, + "speech_diarization": self._speech_diarization, + }
    def _do_task(self):
        """
        Perform the task - write the transcription in a conversation format, starting a new line whenever the
        diarization segment matched to a word belongs to a different speaker. If no speech diarization was given,
        the transcription is written as is.
        """
        # Nothing to align against, so the parent writes the plain transcription:
        if not self._speech_diarization:
            super()._do_task()
            return

        # Wrap each transcription chunk as a word timestamp tuple:
        timed_words = [
            SpeechDiarizationTask._WordTimestamp(
                start=chunk["timestamp"][0],
                end=chunk["timestamp"][1],
                text=chunk["text"],
            )
            for chunk in self._transcription_output["chunks"]
        ]

        # Wrap each diarization entry as a segment tuple:
        self._segments = [
            SpeechDiarizationTask._DiarizationSegment(*entry)
            for entry in self._speech_diarization
        ]

        # Walk over the words, matching each one to its diarization segment (the most overlapping one, or the closest
        # one when nothing overlaps) and collect the text pieces:
        current_speaker = self._segments[self._last_chosen_index].speaker
        pieces = [f"{current_speaker}:"]
        for timed_word in timed_words:
            # Advance `self._last_chosen_index` to the segment of this word:
            self._get_next_segment(word=timed_word)
            matched_speaker = self._segments[self._last_chosen_index].speaker
            if matched_speaker != current_speaker:
                # Speaker changed - open a new line for the new speaker:
                current_speaker = matched_speaker
                pieces.append(f"\n{current_speaker}:")
            pieces.append(timed_word.text)

        # Store the conversation text and let the parent write it to file:
        self._transcription_output["text"] = "".join(pieces)
        super()._do_task()

    def _get_next_segment(
        self,
        word: _WordTimestamp,
    ):
        """
        Match the given word to a diarization segment and update `self._last_chosen_index` accordingly. Segments are
        only searched from the last chosen one onward.

        :param word: The word timestamp to match to the next segment.
        """
        segments = self._segments
        final_index = len(segments) - 1

        # Already on the final segment - there is nowhere to advance to:
        if self._last_chosen_index == final_index:
            return

        current = segments[self._last_chosen_index]

        # A missing end timestamp may appear on the last word of the audio or on a word that was split during
        # inference - it is assigned to the final segment:
        if word.end is None:
            self._last_chosen_index = final_index
            return

        # The word ends before the current segment starts, or inside it - the current segment is kept:
        if word.end <= current.start or word.end < current.end:
            return

        # The word ends after the current segment. Collect the current segment and the following ones, up to and
        # including the first segment that ends at or after the word's end:
        candidates = [self._last_chosen_index]
        for index in range(self._last_chosen_index + 1, len(segments)):
            candidates.append(index)
            if not word.end > segments[index].end:
                break

        # Choose the candidate with the largest positive overlap (first one wins on ties):
        best_overlap = 0
        best_index = None
        for index in candidates:
            segment = segments[index]
            # The overlap is the distance between the later start and the earlier end:
            overlap = min(word.end, segment.end) - max(word.start, segment.start)
            if overlap > best_overlap:
                best_overlap, best_index = overlap, index
        if best_index is not None:
            self._last_chosen_index = best_index
            return

        # No candidate overlaps the word - choose the closest one (first one wins on ties):
        def gap(index: int):
            segment = segments[index]
            if word.start > segment.end:
                return word.start - segment.end
            return segment.start - word.end

        self._last_chosen_index = min(candidates, key=gap)
    + + + +
    class SpeechDiarizationPerChannelTask(BaseTask):
        """
        A task to write the transcription to file with respect to a given speech diarization per channel.
        """

        class _WordTimestamp(NamedTuple):
            """
            A word with its start and end timestamps and speaker label (channel the word was taken from).
            """

            start: float
            end: float
            speaker: str
            text: str

        def __init__(
            self,
            audio_file: Path,
            text_file: Path,
            transcription_output_channels: List[Tuple[str, dict]] = None,
        ):
            """
            Initialize the task.

            :param audio_file:                    Path to the audio file that was transcribed.
            :param text_file:                     Path to the text file to write the transcription to.
            :param transcription_output_channels: Optional already collected channels as (speaker, output) tuples.
                                                  Used to reconstruct a task from the tuple returned by `to_tuple`.
                                                  Defaults to no channels.
            """
            super().__init__(
                audio_file=audio_file, transcription_output={}, text_file=text_file
            )
            self._transcription_output_channels: List[Tuple[str, dict]] = (
                list(transcription_output_channels)
                if transcription_output_channels
                else []
            )

        @property
        def transcription_output_channels(self) -> List[Tuple[str, dict]]:
            """
            Get the transcription output channels.

            :returns: The transcription output channels.
            """
            return self._transcription_output_channels

        def do_task(self):
            """
            Try to perform the task storing an error if occurred. A channel whose output is a string is a failed
            inference (the string is the error message), in which case the task is marked as failed with it.
            """
            for _, channel_output in self._transcription_output_channels:
                if isinstance(channel_output, str):
                    # Store the error message itself and not the entire channels collection:
                    self._error = channel_output
                    return
            super().do_task()

        def to_tuple(self) -> Tuple[str, dict]:
            """
            Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue).

            :returns: The converted task.
            """
            task_class, task_kwargs = super().to_tuple()
            # The initializer does not accept a `transcription_output`, the channels are passed instead so the
            # reconstructed task will have the collected outputs:
            task_kwargs.pop("transcription_output", None)
            task_kwargs["transcription_output_channels"] = (
                self._transcription_output_channels
            )
            return task_class, task_kwargs

        def _do_task(self):
            """
            Perform the task - write the transcription to the stored file path with respect to the given speech
            diarization per channel.
            """
            # Cast the chunks of all channels (any amount of channels) to one list of word timestamps tuples:
            words = [
                SpeechDiarizationPerChannelTask._WordTimestamp(
                    start=chunk["timestamp"][0],
                    end=chunk["timestamp"][1],
                    speaker=speaker,
                    text=chunk["text"],
                )
                for speaker, output in self._transcription_output_channels
                for chunk in output["chunks"]
            ]

            # Sort the words by their start time only. The sort is stable so words with equal start time keep
            # their channel order, and end timestamps (that may be `None`) are never compared:
            words.sort(key=lambda word: word.start)

            # Build the conversation text, opening a new line on each speaker change (an empty transcription
            # yields an empty text):
            pieces = []
            current_speaker = None
            for word in words:
                if not pieces:
                    current_speaker = word.speaker
                    pieces.append(f"{current_speaker}:")
                elif word.speaker != current_speaker:
                    current_speaker = word.speaker
                    pieces.append(f"\n{current_speaker}:")
                pieces.append(word.text)

            # Update the transcription output with the new text to write it to file:
            self._transcription_output["text"] = "".join(pieces)
            super()._do_task()
    + + + +
    class BatchProcessor:
        """
        Creates tasks out of transcription batches. It is meant to work alongside the transcriber: the tasks can be
        handed to a multiprocessing queue (via `get_tasks`) or be run directly (via `do_tasks`).
        """

        def __init__(self, audio_files: List[Path], output_directory: Path):
            """
            Initialize the batch processor.

            :param audio_files:      The list of all audio files to transcribe.
            :param output_directory: The output directory to write the transcriptions to.
            """
            self._audio_files = audio_files
            self._output_directory = output_directory

            # Batching state - position in the audio files list, pending tasks and collected results:
            self._current_file_index = 0
            self._tasks: List[BaseTask] = []
            self._results: List[Tuple[bool, Tuple[str, str]]] = []

        def process_batch(self, batch: List[Union[dict, str]]):
            """
            Create and store a task for each transcription in the given batch.

            :param batch: The batch of transcriptions to process.
            """
            # The files of this batch are the next `len(batch)` files that were not processed yet:
            batch_files = self._get_current_files(batch_size=len(batch))

            # Pair each file with its transcription output and create its task:
            self._tasks.extend(
                [
                    BaseTask(
                        audio_file=audio_file,
                        transcription_output=transcription_output,
                        text_file=self._output_directory / f"{audio_file.stem}.txt",
                    )
                    for transcription_output, audio_file in zip(batch, batch_files)
                ]
            )

        def get_tasks(self) -> List[BaseTask]:
            """
            Get the pending tasks. The stored tasks are then cleared.

            :returns: The tasks to perform.
            """
            pending, self._tasks = self._tasks, []
            return pending

        def do_tasks(self):
            """
            Run the pending tasks and store their results. Should be used if no multiprocessing queue is given to a
            transcriber.
            """
            for pending_task in self.get_tasks():
                pending_task.do_task()
                self._results.append(
                    (pending_task.is_failed(), pending_task.get_result())
                )

        def get_results(self) -> List[Tuple[bool, Tuple[str, str]]]:
            """
            Get the results of the tasks. The stored results are then cleared.

            :returns: The results of the tasks.
            """
            collected, self._results = self._results, []
            return collected

        def _get_current_files(self, batch_size: int) -> List[Path]:
            """
            Get the next files to process and advance the current file index past them.

            :param batch_size: The batch size to progress the current file index.

            :returns: The current files to process.
            """
            start_index = self._current_file_index
            # Never advance beyond the end of the audio files list:
            end_index = min(start_index + batch_size, len(self._audio_files))
            self._current_file_index = end_index
            return self._audio_files[start_index:end_index]
    + + + +
    class SpeechDiarizationBatchProcessor(BatchProcessor):
        """
        A batch processor that creates speech diarization tasks: each transcription is paired with the diarization
        given for its audio file. It is meant to work alongside the transcriber, either with a multiprocessing queue
        or by running the tasks directly.
        """

        def __init__(
            self, audio_files: List[Path], output_directory: Path, speech_diarization: dict
        ):
            """
            Initialize the batch processor.

            :param audio_files:        The list of all audio files to transcribe.
            :param output_directory:   The output directory to write the transcriptions to.
            :param speech_diarization: A speech diarization dictionary to pass along with each processed batch.
            """
            # The parent stores the audio files and the output directory:
            super().__init__(audio_files=audio_files, output_directory=output_directory)
            self._speech_diarization = speech_diarization

        def process_batch(self, batch: List[dict]):
            """
            Create and store a speech diarization task for each transcription in the given batch.

            :param batch: The batch of transcriptions to process.
            """
            # The files of this batch are the next `len(batch)` files that were not processed yet:
            batch_files = self._get_current_files(batch_size=len(batch))

            # Pair each file with its transcription output and its diarization (looked up by the file name, `None`
            # if the file has no diarization):
            self._tasks.extend(
                [
                    SpeechDiarizationTask(
                        audio_file=audio_file,
                        transcription_output=transcription_output,
                        text_file=self._output_directory / f"{audio_file.stem}.txt",
                        speech_diarization=self._speech_diarization.get(audio_file.name),
                    )
                    for transcription_output, audio_file in zip(batch, batch_files)
                ]
            )
    +
    + + + +
    class PerChannelSpeechDiarizationBatchProcessor(BatchProcessor):
        """
        A batch processor to process batches of transcriptions per channel. The batch processor is creating tasks with
        the selected amount of channels given and is aimed to be working along the transcriber. It can be used with
        multiprocessing queue or run the tasks directly using the associated methods.
        """

        def __init__(
            self,
            audio_files: List[Path],
            output_directory: Path,
            n_channels: int,
            speakers: List[str] = None,
        ):
            """
            Initialize the batch processor.

            :param audio_files:      The list of all audio files to transcribe.
            :param output_directory: The output directory to write the transcriptions to.
            :param n_channels:       The number of channels in each audio file to transcribe.
            :param speakers:         The speakers labels to use for each channel, by channel order. Channels without
                                     a label (`None` or a list shorter than `n_channels`) are labeled
                                     "speaker_<channel number>" (1-based).
            """
            super().__init__(audio_files=audio_files, output_directory=output_directory)

            # Store the parameters:
            self._n_channels = n_channels

            # Make sure every channel has a label so looking a speaker up by its channel index cannot fail:
            speakers = list(speakers or [])
            speakers.extend(
                f"speaker_{channel + 1}" for channel in range(len(speakers), n_channels)
            )
            self._speakers = speakers

            # The task that is currently collecting channels (kept until all of its channels are collected):
            self._task_in_process: SpeechDiarizationPerChannelTask = None

        def process_batch(self, batch: List[dict]):
            """
            Process a batch of transcriptions. Each item in the batch is the output of a single channel. Channels are
            collected into the task in process, and once `n_channels` channels were collected the task is stored and
            the next audio file is used.

            :param batch: The batch of transcriptions to process.
            """
            for output in batch:
                # Open a new task for the current audio file if none is collecting channels:
                if self._task_in_process is None:
                    audio_file = self._audio_files[self._current_file_index]
                    self._task_in_process = SpeechDiarizationPerChannelTask(
                        audio_file=audio_file,
                        text_file=self._output_directory / f"{audio_file.stem}.txt",
                    )
                channels = self._task_in_process.transcription_output_channels

                # The speaker of this channel is chosen by the amount of channels collected so far:
                channels.append((self._speakers[len(channels)], output))

                # Check if the task is fully covered (all channels are collected):
                if len(channels) == self._n_channels:
                    # Collect the task and reset the task in process:
                    self._tasks.append(self._task_in_process)
                    self._current_file_index += 1
                    self._task_in_process = None
    +
    + + + +
    class Transcriber:
        """
        A transcription wrapper for the Huggingface's ASR pipeline -
        https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline to
        use with OpenAI's Whisper models - https://huggingface.co/openai.
        """

        def __init__(
            self,
            model_name: str,
            device: str = None,
            use_flash_attention_2: bool = None,
            use_better_transformers: bool = None,
            assistant_model: str = None,
            max_new_tokens: int = 128,
            chunk_length_s: int = 30,
            batch_size: int = 2,
            spoken_language: str = None,
            translate_to_english: bool = False,
            return_timestamps: Union[bool, Literal["word"]] = False,
            per_channel_transcription: int = 0,
        ):
            """
            Initialize the transcriber.

            :param model_name:                The model name to use. Should be a model from the OpenAI's Whisper models
                                              for best results (for example "tiny", "base", "large", etc.).
            :param device:                    The device to use for inference. If not given, will use GPU if available.
                                              If a CUDA device is given but CUDA is not available, the CPU is used.
            :param use_flash_attention_2:     Whether to use the Flash Attention 2 implementation. It can be used only
                                              with one of the following GPUs: Nvidia H series and Nvidia A series. T4
                                              support will be available soon.

                                              Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                              `None`, the optimization will be chosen automatically according to the
                                              available resources.
            :param use_better_transformers:   Whether to use the Better Transformers library to further optimize the
                                              model. Should be used for all use cases that do not support flash
                                              attention 2.

                                              Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                              `None`, the optimization will be chosen automatically according to the
                                              available resources.
            :param assistant_model:           The assistant model name to use for inference. Notice that the
                                              optimizations (flash attention 2 and better transformers) will be applied
                                              for the assistant as well. Should be a model from Huggingface's
                                              distil-whisper (see here for more information:
                                              https://github.com/huggingface/distil-whisper).
            :param max_new_tokens:            The maximum number of new tokens to generate. This is used to limit the
                                              generation length. Default is 128 tokens.
            :param chunk_length_s:            The audio chunk to split the audio to (in seconds). Default is 30
                                              seconds.
            :param batch_size:                The batch size to use for inference. Default is 2.
            :param spoken_language:           Aim whisper to know what language is spoken. If None, it will try to
                                              detect it for each chunk.
            :param translate_to_english:      Whether to translate the transcriptions to English. Default is False.
            :param return_timestamps:         Whether to return the timestamps of the words. If "word", will return the
                                              timestamps of each word. If True will return the timestamps of each
                                              chunk. Default is False. Aimed to be used for speech diarization.
            :param per_channel_transcription: Whether to do per channel transcription. If needed to run per channel
                                              transcription, pass the number of channels expected for each audio file
                                              here. 0 means regular transcription (merge channels).

                                              Note: If `per_channel_transcription` is not 0, `batch_size` must be
                                              treated to be the number of channels and not audio files. Aimed to be
                                              used for per channel speech diarization.
            """
            # Store loading parameters:
            self._model_name = model_name
            self._device = device
            self._use_flash_attention_2 = use_flash_attention_2
            self._use_better_transformers = use_better_transformers
            self._max_new_tokens = max_new_tokens
            self._chunk_length_s = chunk_length_s
            self._batch_size = batch_size
            self._return_timestamps = return_timestamps
            self._per_channel_transcription = per_channel_transcription

            # Store generation parameters:
            self._assistant_model = assistant_model
            self._spoken_language = spoken_language
            self._translate_to_english = translate_to_english

            # Prepare the transcription objects (set in `load`):
            self._transcription_pipeline: AutomaticSpeechRecognitionPipeline = None
            self._generate_kwargs: dict = None

        def load(self):
            """
            Load the transcriber. Must be called before transcribing.
            """
            # Set the device to use. An explicitly requested device is honored, otherwise GPU is preferred if
            # available. A requested CUDA device falls back to CPU when CUDA is not available:
            cuda_available = torch.cuda.is_available()
            if not self._device:
                device_name = "cuda" if cuda_available else "cpu"
            elif str(self._device).startswith("cuda") and not cuda_available:
                if _LOGGER:
                    _LOGGER.warning(
                        f"The requested device '{self._device}' is not available, using 'cpu' instead."
                    )
                device_name = "cpu"
            else:
                device_name = self._device
            device = torch.device(device_name)

            # Half precision is used only on CUDA devices:
            torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

            # Choose the optimization to use (in case the user did not specify any):
            if (
                self._use_flash_attention_2 is None
                and self._use_better_transformers is None
            ):
                self._use_flash_attention_2 = False
                self._use_better_transformers = True
                # Prefer to use flash attention 2 if available and cuda device is supported (see GPU names to
                # architecture here:
                # https://en.wikipedia.org/wiki/List_of_Nvidia_graphics_processing_units#Tesla):
                if device.type == "cuda" and is_flash_attn_2_available():
                    cuda_device_name = torch.cuda.get_device_properties(device).name
                    supported_gpu_prefixes = (
                        "NVIDIA A",  # For Ampere architecture (e.g. A10, A30, A100)
                        "NVIDIA H",  # For Hopper architecture (e.g. H100)
                        "NVIDIA L",  # For Ada Lovelace architecture (e.g. L4, L40)
                        "NVIDIA RTX 30",  # For Ampere architecture (RTX 30 series)
                        "NVIDIA RTX 40",  # For Ada Lovelace architecture (RTX 40 series)
                        "NVIDIA RTX 50",  # For Blackwell architecture (RTX 50 series)
                        # Will be supported soon according to FlashAttention GitHub repo:
                        # https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
                        # "NVIDIA T4",  # For Turing architecture (only T4)
                        # "NVIDIA RTX 20",  # For Turing architecture (RTX 20 series)
                    )
                    if cuda_device_name.startswith(supported_gpu_prefixes):
                        self._use_flash_attention_2 = True
                        self._use_better_transformers = None
                if not self._use_flash_attention_2:
                    # Keep the untouched flag as `None`, only the chosen optimization is set:
                    self._use_flash_attention_2 = None

            # Build the optimizations kwargs:
            model_kwargs = {
                "low_cpu_mem_usage": True,
                "use_safetensors": True,
            }
            if self._use_flash_attention_2:
                if _LOGGER:
                    _LOGGER.info(
                        "Using FlashAttention2 optimization - make sure the `flash-attn` package is installed via "
                        "`pip install -U flash-attn --no-build-isolation`"
                    )
                model_kwargs["attn_implementation"] = "flash_attention_2"
            elif self._use_better_transformers:
                if _LOGGER:
                    _LOGGER.info(
                        "Using BetterTransformers optimization - make sure the `optimum` package is installed via "
                        "`pip install -U optimum`"
                    )
                model_kwargs["attn_implementation"] = "sdpa"

            # Initialize the speech recognition pipeline (a copy of the kwargs is passed as they are reused for the
            # assistant model):
            self._transcription_pipeline = pipeline(
                task="automatic-speech-recognition",
                model=self._model_name,
                model_kwargs=model_kwargs.copy(),
                batch_size=self._batch_size,
                max_new_tokens=self._max_new_tokens,
                chunk_length_s=self._chunk_length_s,
                return_timestamps=self._return_timestamps,
                torch_dtype=torch_dtype,
                device=device,
            )

            # Prepare the generation kwargs:
            self._generate_kwargs = {
                "language": self._spoken_language,
                "task": "translate" if self._translate_to_english else "transcribe",
            }

            # Initialize the assistant model (if needed):
            if self._assistant_model:
                assistant_model = AutoModelForCausalLM.from_pretrained(
                    self._assistant_model, torch_dtype=torch_dtype, **model_kwargs
                )
                assistant_model.to(device)
                self._generate_kwargs["assistant_model"] = assistant_model

        def transcribe(
            self,
            audio_files: List[Path],
            batch_processor: BatchProcessor = None,
            batches_queue: Queue = None,
            verbose: bool = False,
        ) -> Union[List[List[dict]], None]:
            """
            Transcribe the given audio files. The transcriptions will be sent to a queue or a batch processor for
            further processing like writing to text files. If no queue or batch processor is given, the transcriptions
            outputs from the pipeline will be returned.

            If an inference fails, the error message (a string) takes the place of each output of the failed batch.

            :param audio_files:     The audio files to transcribe.
            :param batch_processor: A batch processor. If given, each batch is processed and its tasks are run
                                    directly. It takes precedence over the queue.
            :param batches_queue:   A multiprocessing queue to put the batches in. A stop mark is put in it once all
                                    batches were queued.
            :param verbose:         Whether to show a progress bar. Default is False.

            :returns: `None` if a batch processor is given, otherwise the list of collected output batches (which is
                      empty if the batches were sent to a queue).
            """
            # Wrap the audio files with a function to iterate over them via a generator (save memory and runtime with
            # Huggingface's pipelines as they preload each input while inference is running):
            def audio_iterator() -> Generator[Union[dict, str], None, None]:
                if self._per_channel_transcription:
                    # Each channel of each file is yielded as a separate raw input:
                    for audio_file in audio_files:
                        audio, sampling_rate = torchaudio.load(str(audio_file))
                        audio = audio.numpy()
                        for channel in audio:
                            yield {"raw": channel, "sampling_rate": sampling_rate}
                else:
                    for audio_file in audio_files:
                        yield str(audio_file)

            # Create a batch iterator:
            def batch_iterator() -> Generator[List[Union[dict, str]], None, None]:
                batch = []
                for audio in audio_iterator():
                    batch.append(audio)
                    if len(batch) == self._batch_size:
                        yield batch
                        batch = []
                if batch:
                    yield batch

            # Choose the inputs iterator and the amount of iterations it is expected to yield (for the progress bar).
            # The expected amount of inputs is the amount of files times the expected channels per file:
            n_inputs = len(audio_files) * (self._per_channel_transcription or 1)
            if self._batch_size > 1:
                inputs = batch_iterator()
                # Ceil division - the last batch may be partial:
                total = -(-n_inputs // self._batch_size)
            else:
                inputs = audio_iterator()
                total = n_inputs

            # Prepare the list of output batches to return when no batch processor or queue is given:
            outputs = []

            # Infer through the pipeline:
            for input_batch in tqdm(
                inputs,
                desc="Transcribing",
                unit="channel" if self._per_channel_transcription else "audio file",
                total=total,
                disable=not verbose,
            ):
                # Infer:
                try:
                    output_batch = self._transcription_pipeline(
                        input_batch,
                        generate_kwargs=self._generate_kwargs,
                    )
                except Exception as exception:
                    # Collect the exception as a string and align it to the batch size so each input will have
                    # its own error:
                    output_batch = [str(exception)] * (
                        len(input_batch) if isinstance(input_batch, list) else 1
                    )
                # To align with batching, if batch size is 1, wrap the output with a list:
                if isinstance(output_batch, dict):
                    output_batch = [output_batch]
                # If a batch processor is given, process the batch:
                if batch_processor:
                    # Process it directly:
                    batch_processor.process_batch(batch=output_batch)
                    batch_processor.do_tasks()
                elif batches_queue:
                    # Otherwise, queue the batch:
                    batches_queue.put(output_batch)
                else:
                    # Otherwise, collect the output as is without processing:
                    outputs.append(output_batch)

            # If a multiprocessing queue was given, mark the end of the batches:
            if batches_queue:
                batches_queue.put(_MULTIPROCESSING_STOP_MARK)

            return outputs if not batch_processor else None
    +
    #: The value to send into multiprocessing queues to stop the process:
    _MULTIPROCESSING_STOP_MARK = "STOP"


    def _multiprocessing_process_batches(
        batch_processor: BatchProcessor,
        batches_queue: Queue,
        tasks_queue: Queue,
        n_task_completers: int,
    ):
        """
        Turn the batches read from the batches queue into tasks and put them (as tuples) in the tasks queue. Reading
        stops once the stop mark is read from the batches queue. It is aimed to be used with multiprocessing as a
        process.

        :param batch_processor:   A batch processor to process the batches.
        :param batches_queue:     A queue to get the batches from.
        :param tasks_queue:       A queue to put the tasks in.
        :param n_task_completers: The number of task completers (processes that run the
                                  `_multiprocessing_complete_tasks` function). A stop mark will be sent to the tasks
                                  queue for each task completer.
        """
        # Keep reading batches until the stop mark is read:
        for batch in iter(batches_queue.get, _MULTIPROCESSING_STOP_MARK):
            # Create the tasks of the batch and queue them in their tuple form:
            batch_processor.process_batch(batch=batch)
            for created_task in batch_processor.get_tasks():
                tasks_queue.put(created_task.to_tuple())

        # Mark the end of the tasks, once per task completer:
        for _ in range(n_task_completers):
            tasks_queue.put(_MULTIPROCESSING_STOP_MARK)


    def _multiprocessing_complete_tasks(tasks_queue: Queue, results_queue: Queue):
        """
        Reconstruct and run the tasks read from the tasks queue, putting each result in the results queue. Reading
        stops once the stop mark is read from the tasks queue, and a stop mark is then put in the results queue. It is
        aimed to be used with multiprocessing as a process.

        :param tasks_queue:   A queue to get the tasks from.
        :param results_queue: A queue to put the results in.
        """
        # Map the task class names (as sent in the tuples) to the task classes:
        tasks_map = {
            task_type.__name__: task_type
            for task_type in (
                BaseTask,
                SpeechDiarizationTask,
                SpeechDiarizationPerChannelTask,
            )
        }

        # Keep reading tasks until the stop mark is read:
        for task_class, task_kwargs in iter(tasks_queue.get, _MULTIPROCESSING_STOP_MARK):
            # Reconstruct the task and complete it:
            task = tasks_map[task_class](**task_kwargs)
            task.do_task()
            results_queue.put((task.is_failed(), task.get_result()))

        # Mark the end of the tasks:
        results_queue.put(_MULTIPROCESSING_STOP_MARK)


    # Get the global logger:
    _LOGGER = logging.getLogger()
    def open_mpi_handler(
        worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    ):
        """
        A decorator factory to distribute a handler between OpenMPI workers. If the function is not running as an
        MLRun 'mpijob' (or there is only a single worker), the handler is returned as is. Otherwise, each worker
        (rank) runs the handler on its own chunk of the worker inputs, and the root worker (rank 0) collects the
        outputs of all workers. The wrapped handler must be called with keyword arguments only and is expected to
        return a tuple of (output directory, dataframe, errors dictionary).

        If MLRun is available, the module's global logger is replaced with the MLRun context logger.

        :param worker_inputs:      Names of the handler's keyword arguments to split between the workers. An argument
                                   may be a string or a `Path` (collected into a list of audio files), a list or a
                                   dataframe. `None` valued arguments are skipped.
        :param root_worker_inputs: Keyword arguments to set only in the root worker's (rank 0) call.

        :returns: The decorator. The wrapped handler returns the joined (output directory, dataframe, errors
                  dictionary) tuple in the root worker and `None` in all other workers.
        """
        global _LOGGER

        # Check for MLRun and OpenMPI availability:
        context, comm = _check_mlrun_and_open_mpi()

        # Check if MLRun is available, set the global logger to MLRun's:
        if context:
            _LOGGER = context.logger

        def decorator(handler):
            # Nothing to distribute without OpenMPI or with a single worker:
            if comm is None or comm.Get_size() == 1:
                return handler

            @wraps(handler)
            def wrapper(**kwargs):
                # Get the open mpi environment properties:
                size = comm.Get_size()
                rank = comm.Get_rank()

                # Give the correct chunk of the workers inputs:
                for worker_input in worker_inputs:
                    input_argument = kwargs[worker_input]
                    if input_argument is None:
                        continue
                    # A path (given as a string or a `Path` object) is collected into a list of audio files:
                    if isinstance(input_argument, (str, Path)):
                        input_argument = _get_audio_files(
                            data_path=Path(input_argument).absolute()
                        )
                    if len(input_argument) < size:
                        raise ValueError(
                            f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
                            f"Please reduce the amount of workers for this input."
                        )
                    # All ranks get an even chunk, the last rank also takes the remainder:
                    even_chunk_size = len(input_argument) // size
                    chunk_start = rank * even_chunk_size
                    chunk_end = (
                        (rank + 1) * even_chunk_size
                        if rank + 1 < size
                        else len(input_argument)
                    )
                    context.logger.info(
                        f"Rank #{rank}: Processing input chunk of '{worker_input}' "
                        f"from index {chunk_start} to {chunk_end}."
                    )
                    if isinstance(input_argument, list):
                        input_argument = input_argument[chunk_start:chunk_end]
                    elif isinstance(input_argument, pd.DataFrame):
                        input_argument = input_argument.iloc[chunk_start:chunk_end, :]
                    kwargs[worker_input] = input_argument

                # Set the root worker only arguments:
                if rank == 0 and root_worker_inputs:
                    kwargs.update(root_worker_inputs)

                # Run the worker:
                output = handler(**kwargs)

                # Save the output directory of this worker:
                output_directory = Path(output[0])

                # Send the output to the root rank (rank #0):
                output = comm.gather(output, root=0)

                # Join the data from all workers:
                if rank == 0:
                    context.logger.info("Collecting data from workers to root worker.")

                    # Check if there are different output directories:
                    output_directories = set([Path(out_dir) for out_dir, _, _ in output])
                    for r in range(1, size):
                        # True means the other workers should pass their files to the root worker (rank 0):
                        comm.send(len(output_directories) != 1, dest=r)

                    # If there are different output directories, listen to the other workers:
                    if len(output_directories) != 1:
                        # Collect the files from the other workers:
                        files = []
                        for r in range(1, size):
                            files.extend(comm.recv(source=r))
                        # Write the files to the root worker's output directory:
                        for file_name, file_content in files:
                            with open(output_directory / file_name, "w") as f:
                                f.write(file_content)

                    # Concatenate the dataframes:
                    dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)

                    # Concatenate the errors dictionaries:
                    errors_dictionary = reduce(
                        operator.ior, [err for _, _, err in output], {}
                    )

                    return str(output_directory), dataframe, errors_dictionary

                # Listen to rank 0 to see if there are different output directories and this rank need to send its
                # files to it:
                if comm.recv(source=0):
                    files = []
                    for file in os.listdir(output_directory):
                        with open(output_directory / file, "r") as f:
                            files.append((file, f.read()))
                    comm.send(files, dest=0)
                return None

            return wrapper

        return decorator
    + + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." + ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +
    @open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    def transcribe(
        # Input / Output kwargs:
        data_path: Union[str, Path, List[Union[str, Path]]],
        output_directory: str = None,
        # Model loading kwargs:
        model_name: str = "openai/whisper-tiny",
        device: str = None,
        use_flash_attention_2: bool = None,
        use_better_transformers: bool = None,
        # Generation kwargs:
        assistant_model: str = None,
        max_new_tokens: int = 128,
        chunk_length_s: int = 30,
        batch_size: int = 8,
        spoken_language: str = None,
        translate_to_english: bool = False,
        # Diarization kwargs:
        speech_diarization: Dict[str, List[Tuple[float, float, str]]] = None,
        speech_diarize_per_channel: int = None,
        speaker_labels: List[str] = None,
        # Other kwargs:
        use_multiprocessing: Union[bool, int] = False,
        verbose: bool = False,
    ):
        """
        Transcribe audio files into text files and collect additional data. The end result is a directory of
        transcribed text files and a dataframe containing the following columns:

        * audio_file - The audio file path.
        * transcription_file - The transcribed text file name in the output directory.

        The transcription is based on Huggingface's ASR pipeline -
        https://huggingface.co/transformers/main_classes/pipelines.html#transformers.AutomaticSpeechRecognitionPipeline
        and is tested with OpenAI's Whisper models - https://huggingface.co/openai.

        If one of the speaker diarization parameters are given (either `speech_diarization` or
        `speech_diarize_per_channel`), the transcription will be written in a conversation format, where each speaker
        will be written in a separate line::

            speaker_1: text
            speaker_2: text
            speaker_1: text
            ...

        :param data_path:                  A directory of audio files or a single file or a list of files to
                                           transcribe.
        :param output_directory:           Path to a directory to save all transcribed audio files. If not given, will
                                           save the transcribed files in a temporary directory.
        :param model_name:                 The model name to use. Should be a model from the OpenAI's Whisper models
                                           for best results (for example "tiny", "base", "large", etc.). See here for
                                           more information: https://huggingface.co/openai?search_models=whisper.
        :param device:                     The device to use for inference. If not given, will use GPU if available.
        :param use_flash_attention_2:      Whether to use the Flash Attention 2 implementation. It can be used only
                                           with one of the following GPUs: Nvidia H series and Nvidia A series. T4
                                           support will be available soon.

                                           Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                           `None`, the optimization will be chosen automatically according to the
                                           available resources.
        :param use_better_transformers:    Whether to use the Better Transformers library to further optimize the
                                           model. Should be used for all use cases that do not support flash attention
                                           2.

                                           Note: If both `use_flash_attention_2` and `use_better_transformers` are
                                           `None`, the optimization will be chosen automatically according to the
                                           available resources.
        :param assistant_model:            The assistant model name to use for inference. Notice that the
                                           optimizations (flash attention 2 and better transformers) will be applied
                                           for the assistant as well. Should be a model from Huggingface's
                                           distil-whisper (see here for more information:
                                           https://github.com/huggingface/distil-whisper).

                                           Note: Currently an assistant model is only usable with batch size of 1.
        :param max_new_tokens:             The maximum number of new tokens to generate. This is used to limit the
                                           generation length. Default is 128 tokens.
        :param chunk_length_s:             The audio chunk to split the audio to (in seconds). Default is 30 seconds.
        :param batch_size:                 The batch size to use for inference. Default is 8.
        :param spoken_language:            Aim whisper to know what language is spoken. If None, it will try to detect
                                           it.
        :param translate_to_english:       Whether to translate the transcriptions to English.
        :param speech_diarization:         A speech diarization dictionary with the file names to transcribe as keys
                                           and their diarization as value. The diarization is a list of tuples:
                                           (start, end, speaker). An example for a diarization dictionary::

                                               {
                                                   "audio_file_name": [
                                                       (0.0, 2.0, "Agent"),
                                                       (2.0, 4.0, "Client"),
                                                       ...
                                                   ],
                                                   ...
                                               }

                                           Note: The diarization must be for the entire duration of the audio file (as
                                           long as Whisper is predicting words up until then).
        :param speech_diarize_per_channel: Perform speech diarization per channel, given as the number of channels in
                                           each audio file. Each speaker is expected to belong to a separate channel in
                                           the audio. Notice: This will make the transcription slower as each channel
                                           will be transcribed separately. If a speech diarization is passed (via the
                                           `speech_diarization` parameter), this parameter is ignored.
        :param speaker_labels:             A list of speaker labels by channel order to use for writing the
                                           transcription with respect to per channel speech diarization. This won't be
                                           used together with a given speech diarization (via the `speech_diarization`
                                           parameter).
        :param use_multiprocessing:        Whether to use multiprocessing to transcribe the audio files. Can be either
                                           a boolean value or an integer. If `True`, will use the default amount of
                                           workers (3): 1 for transcription, 1 for batch processing and 1 for task
                                           completion (such as speech diarization and writing to files). To control the
                                           amount of tasks completion workers, an integer can be provided to specify
                                           the amount of workers. `False`, will use a single process. Default is
                                           `False`.
        :param verbose:                    Whether to print the progress of the transcription. Default is `False`.

        :returns: A tuple of:

                  * The output directory path (string).
                  * A dataframe of the successful transcriptions (columns: "audio_file", "transcription_file").
                  * A dictionary of the failed files and their errors.
        """
        # Get the input audio files to transcribe:
        if verbose:
            _LOGGER.info("Collecting audio files.")
        audio_files = _get_audio_files(data_path=data_path)
        if verbose:
            _LOGGER.info(f"Collected {len(audio_files)} audio files.")

        # Get the output directory:
        if output_directory is None:
            if verbose:
                _LOGGER.info("No output directory given, using temporary directory.")
            output_directory = tempfile.mkdtemp()
        output_directory = Path(output_directory).absolute()
        output_directory.mkdir(exist_ok=True, parents=True)
        if verbose:
            _LOGGER.info(f"Transcriptions will be saved to: {output_directory}")

        # A given speech diarization takes precedence over per channel diarization. The amount of channels must be
        # ignored in that case, otherwise the transcriber would output per channel while the tasks expect per file:
        n_channels = 0 if speech_diarization else (speech_diarize_per_channel or 0)

        # Initialize a batch processor according to user requirements (no speech diarization, given speech
        # diarization, speech diarization per channel):
        if speech_diarization:
            batch_processor = SpeechDiarizationBatchProcessor(
                audio_files=audio_files,
                output_directory=output_directory,
                speech_diarization=speech_diarization,
            )
        elif n_channels:
            batch_processor = PerChannelSpeechDiarizationBatchProcessor(
                audio_files=audio_files,
                output_directory=output_directory,
                n_channels=n_channels,
                speakers=speaker_labels,
            )
        else:
            batch_processor = BatchProcessor(
                audio_files=audio_files,
                output_directory=output_directory,
            )

        # Initialize the transcription pipeline (word timestamps are required for the diarization):
        transcriber = Transcriber(
            device=device,
            use_flash_attention_2=use_flash_attention_2,
            use_better_transformers=use_better_transformers,
            assistant_model=assistant_model,
            model_name=model_name,
            max_new_tokens=max_new_tokens,
            chunk_length_s=chunk_length_s,
            batch_size=batch_size,
            return_timestamps=(
                "word"
                if speech_diarization is not None or speech_diarize_per_channel is not None
                else False
            ),
            per_channel_transcription=n_channels,
            spoken_language=spoken_language,
            translate_to_english=translate_to_english,
        )

        # Run the transcription:
        if use_multiprocessing:
            # A boolean `True` means a single task completion worker, an integer is the amount of them:
            n_workers = (
                use_multiprocessing
                if isinstance(use_multiprocessing, int)
                and not isinstance(use_multiprocessing, bool)
                else 1
            )
            results = _parallel_run(
                n_workers=n_workers,
                audio_files=audio_files,
                batch_processor=batch_processor,
                transcriber=transcriber,
                verbose=verbose,
            )
        else:
            results = _run(
                audio_files=audio_files,
                batch_processor=batch_processor,
                transcriber=transcriber,
                verbose=verbose,
            )

        # Process the results - split them into successes (dataframe rows) and errors (file to error message):
        if verbose:
            _LOGGER.info("Summarizing the results.")
        successes = []
        errors = {}
        for is_error, result in results:
            if is_error:
                errors[result[0]] = result[1]
            else:
                successes.append(result)
        successes = pd.DataFrame(successes, columns=["audio_file", "transcription_file"])
        if verbose:
            _LOGGER.info(
                f"Done ({successes.shape[0]}/{len(audio_files)})\n"
                f"Transcriptions summary:\n"
                f"{successes.head()}"
            )

        return str(output_directory), successes, errors
    def _get_audio_files(
        data_path: Union[Path, str, list],
    ) -> List[Path]:
        """
        Get the audio files to transcribe. If a path to a directory is given, all files in the directory (non-recursive,
        names matching `*.*`) will be collected.

        :param data_path: The data path to collect the audio files from. Can be a single file, a directory or a list of
                          such paths.

        :returns: The audio files list. Files collected from a directory are returned sorted so the order is
                  deterministic across platforms.

        :raises ValueError: If a given path is neither an existing directory nor an existing file.
        """
        # A list of paths is flattened recursively, keeping the order the user gave:
        if isinstance(data_path, list):
            audio_files = []
            for path in data_path:
                audio_files.extend(_get_audio_files(data_path=path))
            return audio_files

        # A single string path is cast to an absolute `pathlib.Path`:
        if isinstance(data_path, str):
            data_path = Path(data_path).absolute()

        # A directory contributes its direct children. The glob pattern also matches sub-directories that have a dot
        # in their name, so only regular files are kept:
        if data_path.is_dir():
            return sorted(path for path in data_path.glob("*.*") if path.is_file())

        # A single file is returned as is:
        if data_path.is_file():
            return [data_path]

        raise ValueError(
            f"Unrecognized data path. The parameter `data_path` must be a valid path to either a directory path or a "
            f"file. Given: {str(data_path)} "
        )


    def _run(
        audio_files: List[Path],
        batch_processor: BatchProcessor,
        transcriber: Transcriber,
        verbose: bool,
    ) -> List[Tuple[bool, Tuple[str, str]]]:
        """
        Run the transcription without multiprocessing: load the pipeline, transcribe all files in the current process
        and return the results collected by the batch processor.

        :param audio_files:     The audio files to transcribe.
        :param batch_processor: The batch processor to use.
        :param transcriber:     The transcriber to use.
        :param verbose:         Verbosity.

        :returns: The collected results.
        """
        # Load the transcription pipeline:
        if verbose:
            _LOGGER.info("Loading the transcription pipeline.")
        transcriber.load()
        if verbose:
            _LOGGER.info("Transcription pipeline loaded.")

        # Transcribe the files (the batch processor is handed over directly as no queue is used):
        transcriber.transcribe(
            audio_files=audio_files,
            batch_processor=batch_processor,
            verbose=verbose,
        )

        # Return the results:
        return batch_processor.get_results()


    def _parallel_run(
        n_workers: int,
        audio_files: List[Path],
        batch_processor: BatchProcessor,
        transcriber: Transcriber,
        verbose: bool,
    ) -> List[Tuple[bool, Tuple[str, str]]]:
        """
        Run the transcription with multiprocessing. One process handles the transcribed batches, `n_workers` processes
        complete the tasks, and the current process runs the transcription itself and collects the results.

        If loading the pipeline, transcribing or collecting the results fails, the child processes are terminated before
        the error is re-raised, so they are not left blocked on their queues.

        :param n_workers:       The amount of workers to use as task completers.
        :param audio_files:     The audio files to transcribe.
        :param batch_processor: The batch processor to use.
        :param transcriber:     The transcriber to use.
        :param verbose:         Verbosity.

        :returns: The collected results.
        """
        # Initialize the multiprocessing queues:
        batches_queue = Queue()
        tasks_queue = Queue()
        results_queue = Queue()

        # Initialize the multiprocessing processes:
        batch_processing_process = Process(
            target=_multiprocessing_process_batches,
            kwargs={
                "batch_processor": batch_processor,
                "batches_queue": batches_queue,
                "tasks_queue": tasks_queue,
                "n_task_completers": n_workers,
            },
        )
        task_completion_processes = [
            Process(
                target=_multiprocessing_complete_tasks,
                kwargs={"tasks_queue": tasks_queue, "results_queue": results_queue},
            )
            for _ in range(n_workers)
        ]
        processes = [batch_processing_process, *task_completion_processes]

        # Start the multiprocessing processes (batch processing first, then the task completers):
        for p in processes:
            p.start()

        results = []
        try:
            # Load the transcription pipeline:
            if verbose:
                _LOGGER.info("Loading the transcription pipeline.")
            transcriber.load()
            if verbose:
                _LOGGER.info("Transcription pipeline loaded.")

            # Transcribe the files:
            transcriber.transcribe(
                audio_files=audio_files, batches_queue=batches_queue, verbose=verbose
            )

            # Collect the results until one stop mark was received per task completer:
            stop_marks_counter = 0
            while stop_marks_counter < n_workers:
                # Get a result from the queue:
                result: Tuple[bool, Tuple[str, str]] = results_queue.get()
                if result == _MULTIPROCESSING_STOP_MARK:
                    stop_marks_counter += 1
                else:
                    # Collect the result:
                    results.append(result)
        except BaseException:
            # The children would otherwise wait forever on their queues and keep the interpreter from exiting:
            for p in processes:
                if p.is_alive():
                    p.terminate()
            raise
        finally:
            # Wait for the processes to finish:
            for p in processes:
                p.join()

        return results
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/transcribe/latest/src/function.yaml b/functions/master/transcribe/latest/src/function.yaml index d72751ad..43e9b3a8 100644 --- a/functions/master/transcribe/latest/src/function.yaml +++ b/functions/master/transcribe/latest/src/function.yaml @@ -1,25 +1,13 @@ kind: job metadata: - name: transcribe - tag: '' - hash: 8810ac74045bd15cee15a2e4e89563e8e29908d3 - project: '' - labels: - author: yonatans categories: - - data-preparation + - audio - genai - - huggingface - - machine-learning + tag: '' + name: transcribe +verbose: false spec: - command: '' - args: [] - image: '' build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' origin_filename: '' requirements: - transformers @@ -27,118 +15,122 @@ spec: - torchaudio - torch - accelerate + base_image: mlrun/mlrun + code_origin: '' + functionSourceCode:  + disable_auto_mount: false + description: Transcribe audio files into text files + image: '' + command: '' + default_handler: transcribe entry_points: do_task: name: do_task doc: Try to perform the task storing an error if occurred. + lineno: 348 parameters: - name: self - outputs: [] - lineno: 348 has_varargs: false has_kwargs: false is_failed: name: is_failed doc: Check if the task failed. + lineno: 70 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: Whether the task failed. type: bool - lineno: 70 - has_varargs: false - has_kwargs: false get_result: name: get_result doc: 'Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the text file name.' + lineno: 78 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The task's result. type: Tuple[str, str] - lineno: 78 - has_varargs: false - has_kwargs: false to_tuple: name: to_tuple doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). 
+ lineno: 358 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The converted task. type: Tuple[str, dict] - lineno: 358 - has_varargs: false - has_kwargs: false transcription_output_channels: name: transcription_output_channels doc: Get the transcription output channels. + lineno: 340 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The transcription output channels. type: List[Tuple[str, dict]] - lineno: 340 - has_varargs: false - has_kwargs: false process_batch: name: process_batch doc: 'Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch processor.' + lineno: 575 parameters: - name: self - name: batch type: List[dict] doc: The batch of transcriptions to process. - outputs: [] - lineno: 575 has_varargs: false has_kwargs: false get_tasks: name: get_tasks doc: Get the tasks to perform. + lineno: 453 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The tasks to perform. type: List[BaseTask] - lineno: 453 - has_varargs: false - has_kwargs: false do_tasks: name: do_tasks doc: Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber. + lineno: 463 parameters: - name: self - outputs: [] - lineno: 463 has_varargs: false has_kwargs: false get_results: name: get_results doc: Get the results of the tasks. The stored results are then cleared. + lineno: 471 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The results of the tasks. type: List[Tuple[bool, Tuple[str, str]]] - lineno: 471 - has_varargs: false - has_kwargs: false load: name: load doc: Load the transcriber. Must be called before transcribing. 
+ lineno: 695 parameters: - name: self - outputs: [] - lineno: 695 has_varargs: false has_kwargs: false transcribe: @@ -154,6 +146,7 @@ spec: \ a conversation format, where each speaker will\nbe written in a separate\ \ line::\n\n speaker_1: text\n speaker_2: text\n speaker_1: text\n\ \ ..." + lineno: 1097 parameters: - name: data_path type: Union[str, Path, List[Union[str, Path]]] @@ -246,66 +239,47 @@ spec: type: bool doc: Whether to print the progress of the transcription. Default is `False`. default: false - outputs: [] - lineno: 1097 has_varargs: false has_kwargs: false audio_iterator: name: audio_iterator doc: '' - parameters: [] - outputs: - - type: Generator[Union[dict, str], None, None] lineno: 804 has_varargs: false has_kwargs: false + outputs: + - type: Generator[Union[dict, str], None, None] batch_iterator: name: batch_iterator doc: '' - parameters: [] - outputs: - - type: Generator[List[Union[dict, str]], None, None] lineno: 816 has_varargs: false has_kwargs: false + outputs: + - type: Generator[List[Union[dict, str]], None, None] open_mpi_handler: name: open_mpi_handler doc: '' + lineno: 957 parameters: - name: worker_inputs type: List[str] - name: root_worker_inputs type: Dict[str, Any] default: null - outputs: [] - lineno: 957 has_varargs: false has_kwargs: false decorator: name: decorator doc: '' + lineno: 969 parameters: - name: handler - outputs: [] - lineno: 969 has_varargs: false has_kwargs: false wrapper: name: wrapper doc: '' - parameters: [] - outputs: [] lineno: 974 has_varargs: false has_kwargs: true - description: Transcribe audio files into text files - default_handler: transcribe - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/master/transcribe/latest/src/item.yaml b/functions/master/transcribe/latest/src/item.yaml index 7fddcf95..6deaf710 100644 --- 
a/functions/master/transcribe/latest/src/item.yaml +++ b/functions/master/transcribe/latest/src/item.yaml @@ -1,9 +1,7 @@ apiVersion: v1 categories: -- data-preparation +- audio - genai -- huggingface -- machine-learning description: Transcribe audio files into text files doc: '' example: transcribe.ipynb @@ -14,7 +12,7 @@ labels: author: yonatans maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.1 +mlrunVersion: 1.7.0 name: transcribe platformVersion: 3.5.3 spec: @@ -29,4 +27,4 @@ spec: - torch - accelerate url: '' -version: 1.1.0 \ No newline at end of file +version: 1.2.0 \ No newline at end of file diff --git a/functions/master/transcribe/latest/static/documentation.html b/functions/master/transcribe/latest/static/documentation.html index 80bf639b..d92df103 100644 --- a/functions/master/transcribe/latest/static/documentation.html +++ b/functions/master/transcribe/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/transcribe/latest/static/example.html b/functions/master/transcribe/latest/static/example.html index f25f86a8..261d8df0 100644 --- a/functions/master/transcribe/latest/static/example.html +++ b/functions/master/transcribe/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/transcribe/latest/static/function.html b/functions/master/transcribe/latest/static/function.html index 852a901f..1f55b3ab 100644 --- a/functions/master/transcribe/latest/static/function.html +++ b/functions/master/transcribe/latest/static/function.html @@ -30,26 +30,14 @@ kind: job metadata: - name: transcribe - tag: '' - hash: 8810ac74045bd15cee15a2e4e89563e8e29908d3 - project: '' - labels: - author: yonatans categories: - - data-preparation + - audio - genai - - huggingface - - machine-learning + tag: '' + name: transcribe +verbose: false spec: - command: '' - args: [] - image: '' build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' origin_filename: '' requirements: - transformers 
@@ -57,118 +45,122 @@ - torchaudio - torch - accelerate + base_image: mlrun/mlrun + code_origin: '' + functionSourceCode:  + disable_auto_mount: false + description: Transcribe audio files into text files + image: '' + command: '' + default_handler: transcribe entry_points: do_task: name: do_task doc: Try to perform the task storing an error if occurred. + lineno: 348 parameters: - name: self - outputs: [] - lineno: 348 has_varargs: false has_kwargs: false is_failed: name: is_failed doc: Check if the task failed. + lineno: 70 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: Whether the task failed. type: bool - lineno: 70 - has_varargs: false - has_kwargs: false get_result: name: get_result doc: 'Get the result of the task. If the task failed, the error will be returned, otherwise, the result will be the text file name.' + lineno: 78 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The task's result. type: Tuple[str, str] - lineno: 78 - has_varargs: false - has_kwargs: false to_tuple: name: to_tuple doc: Convert the task to a tuple to reconstruct it later (used for multiprocessing to pass in queue). + lineno: 358 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The converted task. type: Tuple[str, dict] - lineno: 358 - has_varargs: false - has_kwargs: false transcription_output_channels: name: transcription_output_channels doc: Get the transcription output channels. + lineno: 340 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The transcription output channels. type: List[Tuple[str, dict]] - lineno: 340 - has_varargs: false - has_kwargs: false process_batch: name: process_batch doc: 'Process a batch of transcriptions. Tasks related to the given batch will be created and stored in the batch processor.' + lineno: 575 parameters: - name: self - name: batch type: List[dict] doc: The batch of transcriptions to process. 
- outputs: [] - lineno: 575 has_varargs: false has_kwargs: false get_tasks: name: get_tasks doc: Get the tasks to perform. + lineno: 453 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The tasks to perform. type: List[BaseTask] - lineno: 453 - has_varargs: false - has_kwargs: false do_tasks: name: do_tasks doc: Perform the tasks. Should be used if no multiprocessing queue is given to a transcriber. + lineno: 463 parameters: - name: self - outputs: [] - lineno: 463 has_varargs: false has_kwargs: false get_results: name: get_results doc: Get the results of the tasks. The stored results are then cleared. + lineno: 471 parameters: - name: self + has_varargs: false + has_kwargs: false outputs: - doc: The results of the tasks. type: List[Tuple[bool, Tuple[str, str]]] - lineno: 471 - has_varargs: false - has_kwargs: false load: name: load doc: Load the transcriber. Must be called before transcribing. + lineno: 695 parameters: - name: self - outputs: [] - lineno: 695 has_varargs: false has_kwargs: false transcribe: @@ -184,6 +176,7 @@ \ a conversation format, where each speaker will\nbe written in a separate\ \ line::\n\n speaker_1: text\n speaker_2: text\n speaker_1: text\n\ \ ..." + lineno: 1097 parameters: - name: data_path type: Union[str, Path, List[Union[str, Path]]] @@ -276,69 +269,50 @@ type: bool doc: Whether to print the progress of the transcription. Default is `False`. 
default: false - outputs: [] - lineno: 1097 has_varargs: false has_kwargs: false audio_iterator: name: audio_iterator doc: '' - parameters: [] - outputs: - - type: Generator[Union[dict, str], None, None] lineno: 804 has_varargs: false has_kwargs: false + outputs: + - type: Generator[Union[dict, str], None, None] batch_iterator: name: batch_iterator doc: '' - parameters: [] - outputs: - - type: Generator[List[Union[dict, str]], None, None] lineno: 816 has_varargs: false has_kwargs: false + outputs: + - type: Generator[List[Union[dict, str]], None, None] open_mpi_handler: name: open_mpi_handler doc: '' + lineno: 957 parameters: - name: worker_inputs type: List[str] - name: root_worker_inputs type: Dict[str, Any] default: null - outputs: [] - lineno: 957 has_varargs: false has_kwargs: false decorator: name: decorator doc: '' + lineno: 969 parameters: - name: handler - outputs: [] - lineno: 969 has_varargs: false has_kwargs: false wrapper: name: wrapper doc: '' - parameters: [] - outputs: [] lineno: 974 has_varargs: false has_kwargs: true - description: Transcribe audio files into text files - default_handler: transcribe - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false diff --git a/functions/master/transcribe/latest/static/item.html b/functions/master/transcribe/latest/static/item.html index 7654d6c3..b39372bd 100644 --- a/functions/master/transcribe/latest/static/item.html +++ b/functions/master/transcribe/latest/static/item.html @@ -30,10 +30,8 @@ apiVersion: v1 categories: -- data-preparation +- audio - genai -- huggingface -- machine-learning description: Transcribe audio files into text files doc: '' example: transcribe.ipynb @@ -44,7 +42,7 @@ author: yonatans maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.1 +mlrunVersion: 1.7.0 name: transcribe platformVersion: 3.5.3 spec: @@ -59,7 +57,7 @@ - torch - accelerate url: 
'' -version: 1.1.0 +version: 1.2.0 diff --git a/functions/master/transcribe/latest/static/transcribe.html b/functions/master/transcribe/latest/static/transcribe.html index 215235bd..7f277c56 100644 --- a/functions/master/transcribe/latest/static/transcribe.html +++ b/functions/master/transcribe/latest/static/transcribe.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/translate/0.2.0/src/function.yaml b/functions/master/translate/0.2.0/src/function.yaml new file mode 100644 index 00000000..9595b77a --- /dev/null +++ b/functions/master/translate/0.2.0/src/function.yaml @@ -0,0 +1,115 @@ +spec: + entry_points: + open_mpi_handler: + lineno: 56 + parameters: + - name: worker_inputs + type: List[str] + - name: root_worker_inputs + type: Dict[str, Any] + default: null + name: open_mpi_handler + has_kwargs: false + doc: '' + has_varargs: false + decorator: + lineno: 68 + parameters: + - name: handler + name: decorator + has_kwargs: false + doc: '' + has_varargs: false + wrapper: + lineno: 73 + name: wrapper + has_kwargs: true + doc: '' + has_varargs: false + translate: + outputs: + - doc: 'A tuple of:' + type: Tuple[str, pd.DataFrame, dict] + lineno: 135 + parameters: + - name: data_path + type: Union[str, List[str], Path] + doc: A directory of text files or a single file or a list of files to translate. + - name: output_directory + type: str + doc: Directory where the translated files will be saved. + - name: model_name + type: str + doc: The name of a model to load. If None, the model name is constructed using + the source and target languages parameters. + default: null + - name: source_language + type: str + doc: The source language code (e.g., 'en' for English). + default: null + - name: target_language + type: str + doc: The target language code (e.g., 'en' for English). + default: null + - name: device + type: str + doc: The device index for transformers. Default will prefer cuda if available. 
+ default: null + - name: model_kwargs + type: dict + doc: Keyword arguments to pass regarding the loading of the model in HuggingFace's + `pipeline` function. + default: null + - name: batch_size + type: int + doc: The number of batches to use in translation. The files are translated + one by one, but the sentences can be batched. + default: 1 + - name: translation_kwargs + type: dict + doc: Additional keyword arguments to pass to a `transformers.TranslationPipeline` + when doing the translation inference. Notice the batch size here is being + added automatically. + default: null + - name: verbose + type: bool + doc: 'Whether to present logs of a progress bar and errors. Default: True.' + default: false + name: translate + has_kwargs: false + doc: 'Translate text files using a transformer model from Huggingface''s hub + according to the source and target languages + + given (or using the directly provided model name). The end result is a directory + of translated text files and a + + dataframe containing the following columns: + + + * text_file - The text file path. + + * translation_file - The translation text file name in the output directory.' 
+ has_varargs: false + build: + requirements: + - transformers + - sentencepiece + - torch + - tqdm + code_origin: '' + functionSourceCode:  + base_image: mlrun/mlrun + origin_filename: '' + image: '' + default_handler: translate + disable_auto_mount: false + command: '' + description: Translate text files from one language to another +verbose: false +metadata: + categories: + - genai + - NLP + tag: '' + name: translate +kind: job diff --git a/functions/master/translate/0.2.0/src/item.yaml b/functions/master/translate/0.2.0/src/item.yaml new file mode 100644 index 00000000..839d1efa --- /dev/null +++ b/functions/master/translate/0.2.0/src/item.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +categories: +- genai +- NLP +description: Translate text files from one language to another +doc: '' +example: translate.ipynb +generationDate: 2023-12-05:17-20 +hidden: false +icon: '' +labels: + author: guyl +maintainers: [] +marketplaceType: '' +mlrunVersion: 1.7.0 +name: translate +platformVersion: 3.5.3 +spec: + filename: translate.py + handler: translate + image: mlrun/mlrun + kind: job + requirements: + - transformers + - sentencepiece + - torch + - tqdm +url: '' +version: 0.2.0 +test_valid: True diff --git a/functions/master/translate/0.2.0/src/requirements.txt b/functions/master/translate/0.2.0/src/requirements.txt new file mode 100644 index 00000000..94e54846 --- /dev/null +++ b/functions/master/translate/0.2.0/src/requirements.txt @@ -0,0 +1,4 @@ +transformers +tqdm +torch +sentencepiece \ No newline at end of file diff --git a/functions/master/translate/0.2.0/src/test_translate.py b/functions/master/translate/0.2.0/src/test_translate.py new file mode 100644 index 00000000..a22dc899 --- /dev/null +++ b/functions/master/translate/0.2.0/src/test_translate.py @@ -0,0 +1,51 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os.path +import tempfile + +import mlrun + + +def test_translate(): + project = mlrun.new_project("test-translate") + translate_fn = project.set_function("translate.py", "translate", image="mlrun/mlrun") + input_text = "Ali her gece bir kitap okur." + expected_translation = "Ali reads a book every night." + + with tempfile.TemporaryDirectory() as test_dir: + with tempfile.TemporaryDirectory() as data_dir: + with open(os.path.join(data_dir, "test_tr.txt"), "w") as f: + f.write(input_text) + translate_run = translate_fn.run( + handler="translate", + inputs={ + "data_path": data_dir, + }, + params={ + "model_name": "Helsinki-NLP/opus-mt-tr-en", + "device": "cpu", + "output_directory": test_dir, + }, + local=True, + returns=[ + "files: path", + "text_files_dataframe: dataset", + "errors: dict", + ], + artifact_path=test_dir, + ) + assert translate_run.status.state == "completed" + with open(os.path.join(test_dir, "test_tr.txt")) as f: + assert f.read() == expected_translation + diff --git a/functions/master/translate/0.2.0/src/translate.ipynb b/functions/master/translate/0.2.0/src/translate.ipynb new file mode 100644 index 00000000..5e14ee87 --- /dev/null +++ b/functions/master/translate/0.2.0/src/translate.ipynb @@ -0,0 +1,658 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6d3c20aa-7129-4905-beaa-7011943373f5", + "metadata": {}, + "source": [ + "# Translate tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "afe4a3ee-f886-461c-9830-0fd9a5b625c3", + "metadata": {}, + "source": [ + "## Short description and explenation" + ] + }, + { + 
"cell_type": "markdown", + "id": "313ed5c3-7416-4bbb-a7fb-aa37ab1f8445", + "metadata": {}, + "source": [ + "Machine translation has made huge strides in recent years thanks to advances in deep learning, and our translate function makes it even easier to use.
    \n", + "Simply tell it where your file is and the languages you're working with (the one you're translating from and the one you want),
    \n", + "and this function takes care of the rest. It cleverly picks the right pre-trained model for your language pair, ensuring top-notch translations.
    \n", + "\n", + "No need to worry about finding the perfect model or dealing with complex setup – it's all handled behind the scenes.
    \n", + "\n", + "With this function, language translation becomes a breeze, making your documents accessible in any language without breaking a sweat." + ] + }, + { + "cell_type": "markdown", + "id": "9352f799-fe99-4ace-9b44-ca0e28bb1fb4", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "id": "6026a8bd-e2e7-454a-b325-9550561a587e", + "metadata": {}, + "source": [ + "The function takes two parameters: a model name or the source and target languages, and a path to one or more text files to translate.\n", + "\n", + "It first checks if a model name was passed. If so, it loads that Helsinki-NLP model.
    \n", + "If not, it looks at the source and target languages and loads the appropriate Helsinki-NLP translation model.\n", + "\n", + "It then reads in the text files and translates them using the loaded model.\n", + "\n", + "Finally, it writes the translated text out to new files and returns the filename or dir name.
    \n", + "\n", + "This allows the user to easily translate a text file to another language using Helsinki-NLP's pre-trained models by just passing the model name or language pair and source text file.
    \n", + "\n", + "This function's automatic model selection is based on the great translation models offered by Helsinki-NLP. Check them out at https://huggingface.co/Helsinki-NLP" + ] + }, + { + "cell_type": "markdown", + "id": "42ec9bc3-2b90-40f1-b10b-5493d9e2b75e", + "metadata": {}, + "source": [ + "## Requirements" + ] + }, + { + "cell_type": "markdown", + "id": "6b756726-e750-4da4-b032-bf5385f85311", + "metadata": {}, + "source": [ + "`transformers`
    \n", + "`tqdm`
    " + ] + }, + { + "cell_type": "markdown", + "id": "212b8161-3e75-459e-98f3-a5b7c5a15efe", + "metadata": {}, + "source": [ + "## Documentation" + ] + }, + { + "cell_type": "markdown", + "id": "9b5fe561-4fbb-4471-91bb-532fa55559f9", + "metadata": {}, + "source": [ + "`data_path`: A directory of text files or a single text file or a list of files to translate.\n", + "\n", + "`output_directory`: Directory where the translated files will be saved.\n", + "\n", + "`model_name`: The name of a model to load. If None, the model name is constructed using the source and
    \n", + " target languages parameters from the \"Helsinki-NLP\" group.\n", + " \n", + "`source_language`: The source language code (e.g., 'en' for English).\n", + "\n", + "`target_language`: The target language code (e.g., 'en' for English).\n", + "\n", + "`model_kwargs`: Keyword arguments to pass regarding the loading of the model in HuggingFace's \"pipeline\"\n", + " function.\n", + " \n", + "`device`: The device index for transformers. Default will prefer cuda if available.\n", + "\n", + "`batch_size`: The number of batches to use in translation. The files are translated one by one, but the\n", + " sentences can be batched.\n", + " \n", + "`translation_kwargs`: Additional keyword arguments to pass to a \"transformers.TranslationPipeline\" when doing
    \n", + " the translation inference. Notice the batch size here is being added automatically.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2e6f44a6-d6ac-48ed-a7d1-936d25e7426c", + "metadata": {}, + "source": [ + "## Demo " + ] + }, + { + "cell_type": "markdown", + "id": "2b231e4c-0224-41a2-87cf-400a4680e2b9", + "metadata": {}, + "source": [ + "The following demo will show an example of translating a text file written in Turkish to English using the _translate_ function.
    \n", + "\n", + "### (1.) Import the function (import mlrun, set project and import function)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "797ef0d4-f435-485c-b705-e1d6115fb8fd", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "markdown", + "id": "1ff51127-dc54-44d2-bd13-0b81165b2033", + "metadata": {}, + "source": [ + "We want to translate the following turkish sentence into english, so we will write it to a text file." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f9517cc8-a0d6-4169-b746-cf4c265e6a3b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing data.txt\n" + ] + } + ], + "source": [ + "%%writefile data.txt\n", + "Ali her gece bir kitap okur. # which means: \"Ali reads a book every night.\"" + ] + }, + { + "cell_type": "markdown", + "id": "c24d71a7-9400-475a-9472-424658801914", + "metadata": {}, + "source": [ + "Setting a project and importing the translate function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e61184ea-44a3-4184-9a2f-9c45b90fdc0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:44:05,223 [info] Created and saved project: {'name': 'test-translate', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n" + ] + } + ], + "source": [ + "project = mlrun.new_project(\"test-translate\")\n", + "translate_fn = project.set_function(\"hub://translate\", \"translate\")" + ] + }, + { + "cell_type": "markdown", + "id": "558260ce-e453-4e05-a6a7-b2df39cff1b9", + "metadata": {}, + "source": [ + "## Usage" + ] + }, + { + "cell_type": "markdown", + "id": "5a1781ee-a210-4dc1-82de-0f4f5d191173", + "metadata": {}, + "source": [ + "### (2.1.) Manual model selection\n", + "Here we run our function that we've imported from the MLRun Function Hub.
    \n", + "We select the specific model, give the function a path to the file and an output directory, and choose to run on the CPU." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9b3107fd-b78d-43de-b4a2-ad3863f72a03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:48:52,794 [info] Storing function: {'name': 'translate-translate', 'uid': '5768d0ddaf06469da053c85d47f61a47', 'db': 'http://mlrun-api:8080'}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Recommended: pip install sacremoses.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:48:56,190 [warning] Skipping logging an object with the log hint '{'key': 'errors', 'artifact_type': 'dict'}' due to the following error:\n", + "An exception was raised during the packing of '{}': No packager was found for the combination of 'object_type=builtins.dict' and 'artifact_type=dict'.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
    \n", + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    test-translate0Dec 06 14:48:52completedtranslate-translate
    v3io_user=yonis
    kind=local
    owner=yonis
    host=jupyter-yonis-7c9bdbfb4d-9g2p2
    data_path
    model_name=Helsinki-NLP/opus-mt-tr-en
    device=cpu
    output_directory=./
    files
    text_files_dataframe
    \n", + "
    \n", + "
    \n", + "
    \n", + " Title\n", + " ×\n", + "
    \n", + " \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2023-12-06 14:48:56,409 [info] Run execution finished: {'status': 'completed', 'name': 'translate-translate'}\n" + ] + } + ], + "source": [ + "translate_run = translate_fn.run(\n", + " handler=\"translate\",\n", + " inputs={\"data_path\": \"data.txt\"},\n", + " params={\n", + " \"model_name\": \"Helsinki-NLP/opus-mt-tr-en\",\n", + " \"device\": \"cpu\",\n", + " \"output_directory\": \"./\",\n", + " },\n", + " local=True,\n", + " returns=[\n", + " \"files: path\",\n", + " \"text_files_dataframe: dataset\",\n", + " \"errors: dict\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8b2fcf2b-3893-4dda-85e2-4a2b9ed0d963", + "metadata": {}, + "source": [ + "### (2.2.) Auto model detection" + ] + }, + { + "cell_type": "markdown", + "id": "8c3d24ca-8df7-4204-8b0d-e7a08d53d8c9", + "metadata": {}, + "source": [ + "Here we run our function that we've imported from the MLRun Function Hub.
    \n", + "We select the languages to use for choosing the model, give the function a path to the file and an output directory, and choose to run on the CPU." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbe10afd-5ede-4475-abc2-bb07dfdf33aa", + "metadata": {}, + "outputs": [], + "source": [ + "translate_run = translate_fn.run(\n", + " handler=\"translate\",\n", + " inputs={\"data_path\": \"data.txt\"},\n", + " params={\n", + " \"target_language\": \"en\",\n", + " \"source_language\": \"tr\",\n", + " \"device\": \"cpu\",\n", + " \"output_directory\": \"./\",\n", + " },\n", + " local=True,\n", + " returns=[\n", + " \"files: path\",\n", + " \"text_files_dataframe: dataset\",\n", + " \"errors: dict\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "40e4a666-9680-40d6-93ee-9466d31a9efc", + "metadata": {}, + "source": [ + "We can take a look at the file created" + ] + }, + { + "cell_type": "markdown", + "id": "89a1952c-f3c3-4a7b-bad4-b59c701a5af6", + "metadata": {}, + "source": [ + "### (3.) Review results" + ] + }, + { + "cell_type": "markdown", + "id": "9d583cf9-7e81-4d0d-982f-aba345d4cf9c", + "metadata": {}, + "source": [ + "We can look at the artifact returned by the run." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c3dab6f8-6089-46c2-b4b9-899a2442403f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    text_filetranslation_file
    0data.txtdata_2.txt
    \n", + "
    " + ], + "text/plain": [ + " text_file translation_file\n", + "0 data.txt data_2.txt" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "translate_run.artifact(\"text_files_dataframe\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "580a20a2-4877-48b4-8f83-59cbfc2f3b83", + "metadata": {}, + "source": [ + "Checking that translation is correct, we print the text file created by function, and can see the sentence is as expected." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0157bcaf-8f2c-4995-a214-32f2710da4c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translated text:\n", + "Ali reads a book every night.\n", + "\n" + ] + } + ], + "source": [ + "with open(\"data_2.txt\", \"r\") as f:\n", + " print(f\"Translated text:\\n{f.read()}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlrun-base", + "language": "python", + "name": "conda-env-mlrun-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/master/translate/0.2.0/src/translate.py b/functions/master/translate/0.2.0/src/translate.py new file mode 100644 index 00000000..360fa620 --- /dev/null +++ b/functions/master/translate/0.2.0/src/translate.py @@ -0,0 +1,396 @@ +# Copyright 2023 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import operator +import pathlib +from functools import reduce, wraps +from typing import Any, Dict, List, Tuple, Union + +import pandas as pd +import transformers +from tqdm import tqdm + +# Get the global logger: +_LOGGER = logging.getLogger() + + +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]: + is_mpi = False + try: + import mlrun + + context = mlrun.get_or_create_ctx(name="mlrun") + is_mpi = context.labels.get("kind", "job") == "mpijob" + + if is_mpi: + try: + from mpi4py import MPI + + return context, MPI.COMM_WORLD + except ModuleNotFoundError as mpi4py_not_found: + context.logger.error( + "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your " + "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi." 
+ ) + raise mpi4py_not_found + else: + return context, None + except ModuleNotFoundError as module_not_found: + if is_mpi: + raise module_not_found + return None, None + + +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, (str, pathlib.Path)): + input_argument = _get_text_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." 
+ ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + output_directory = output[0][0] + dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0) + errors_dictionary = reduce( + operator.ior, [err for _, _, err in output], {} + ) + return output_directory, dataframe, errors_dictionary + return None + + return wrapper + + return decorator + + +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def translate( + data_path: Union[str, List[str], pathlib.Path], + output_directory: str, + model_name: str = None, + source_language: str = None, + target_language: str = None, + device: str = None, + model_kwargs: dict = None, + batch_size: int = 1, + translation_kwargs: dict = None, + verbose: bool = False, +) -> Tuple[str, pd.DataFrame, dict]: + """ + Translate text files using a transformer model from Huggingface's hub according to the source and target languages + given (or using the directly provided model name). The end result is a directory of translated text files and a + dataframe containing the following columns: + + * text_file - The text file path. + * translation_file - The translation text file name in the output directory. + + :param data_path: A directory of text files or a single file or a list of files to translate. + :param output_directory: Directory where the translated files will be saved. + :param model_name: The name of a model to load. 
If None, the model name is constructed using the source and + target languages parameters. + :param source_language: The source language code (e.g., 'en' for English). + :param target_language: The target language code (e.g., 'en' for English). + :param model_kwargs: Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline` + function. + :param device: The device index for transformers. Default will prefer cuda if available. + :param batch_size: The number of batches to use in translation. The files are translated one by one, but the + sentences can be batched. + :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing + the translation inference. Notice the batch size here is being added automatically. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. + + :returns: A tuple of: + + * Path to the output directory. + * A dataframe dataset of the translated file names. + * A dictionary of errored files that were not translated. 
+ """ + global _LOGGER + + # Get the input text files to translate: + if verbose: + _LOGGER.info("Collecting text files.") + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + text_files = _get_text_files(data_path=data_path) + else: + text_files = data_path + if verbose: + _LOGGER.info(f"Collected {len(text_files)} text files.") + + # Get the translation pipeline: + if verbose: + _LOGGER.info(f"Loading model - using device '{device}'.") + translation_pipeline, model_name = _get_translation_pipeline( + model_name=model_name, + source_language=source_language, + target_language=target_language, + device=device, + model_kwargs=model_kwargs, + batch_size=batch_size if batch_size != 1 else None, + ) + if verbose: + _LOGGER.info(f"Model '{model_name}' was loaded successfully.") + + # Prepare the successes dataframe and errors dictionary to be returned: + successes = [] + errors = {} + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + output_directory.mkdir(parents=True, exist_ok=True) + + # Prepare the translation keyword arguments: + translation_kwargs = translation_kwargs or {} + + # Go over the audio files and transcribe: + for text_file in tqdm( + text_files, desc="Translating", unit="file", disable=not verbose + ): + try: + # Translate: + translation = _translate( + text_file=text_file, + translation_pipeline=translation_pipeline, + translation_kwargs=translation_kwargs, + ) + # Write the transcription to file: + translation_file = _save_to_file( + translation=translation, + file_name=text_file.stem, + output_directory=output_directory, + ) + # Note as a success in the list: + successes.append( + [ + text_file.name, + translation_file.name, + ] + ) + except Exception as exception: + # Note the exception as error in the dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{text_file.name}'") + errors[str(text_file.name)] = str(exception) + continue + + # Construct the translations 
dataframe: + columns = [ + "text_file", + "translation_file", + ] + successes = pd.DataFrame( + successes, + columns=columns, + ) + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(text_files)})\n" + f"Translations summary:\n" + f"{successes.head()}" + ) + return str(output_directory), successes, errors + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return text_files + + +def _get_translation_pipeline( + model_name: str = None, + source_language: str = None, + target_language: str = None, + device: str = None, + model_kwargs: dict = None, + batch_size: int = None, +) -> Tuple[transformers.Pipeline, str]: + # Construct the model name - if model name is provided (not None) then we take it, otherwise we check both source + # and target were provided to construct the model name: + if model_name is None and (source_language is None or target_language is None): + raise ValueError( + "No model name were given and missing source and / or target languages. In order to translate you must " + "pass a `model_name` or both `source_language` and `target_language`." 
+ ) + elif model_name is None: + model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}" + + # Initialize the translation pipeline: + try: + translation_pipeline = transformers.pipeline( + task="translation", + model=model_name, + tokenizer=model_name, + device=device, + model_kwargs=model_kwargs, + batch_size=batch_size, + ) + except OSError as load_exception: + if ( + "is not a valid model identifier listed on 'https://huggingface.co/models'" + in str(load_exception) + and source_language + ): + raise ValueError( + f"The model '{model_name}' is not a valid model identifier. " + f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for " + f"text to text generation, but the model created from the given languages does not exist. " + f"You may check language identifiers at " + f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one " + f"or more language code might be with 3 letters and needs to be found online. " + f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` " + f"parameter." + ) from load_exception + raise load_exception + + return translation_pipeline, model_name + + +def _translate( + text_file: pathlib.Path, + translation_pipeline: transformers.Pipeline, + translation_kwargs: dict, +) -> str: + # Read the text from file: + with open(text_file, "r") as fp: + text = fp.read() + + # Split to paragraphs and each paragraph to sentences: + paragraphs = [paragraph.split(".") for paragraph in text.split("\n")] + + # Discover the newline indexes to restore the file to its structure post translation: + newlines_indexes = [] + for paragraph in paragraphs[:-1]: + if len(newlines_indexes) == 0: + newlines_indexes.append(len(paragraph) - 1) + else: + newlines_indexes.append(newlines_indexes[-1] + len(paragraph)) + + # Prepare the batches (each sentence from the paragraphs). 
Notice we add a dot not only to restore the sentence + # structure but to ignore empty strings as it will ruin the translation: + sentences = [f"{line}." for paragraph in paragraphs for line in paragraph] + + # Translate the sentences: + translations = translation_pipeline(sentences, **translation_kwargs) + + # Restructure the full text from the sentences: + translated_text = [] + newline_index = newlines_indexes.pop(0) if newlines_indexes else None + for i, translation in enumerate(translations): + # Get the translation: + text = translation["translation_text"] + # Validate if it was an empty sentence before: + if text == ".": + text = "" + # Check if needed to insert a newline: + if newline_index and newline_index == i: + text += "\n" + newline_index = newlines_indexes.pop(0) if newlines_indexes else None + # Collect it: + translated_text.append(text) + translated_text = "".join(translated_text) + + return translated_text + + +def _save_to_file( + translation: str, file_name: str, output_directory: pathlib.Path +) -> pathlib.Path: + # Prepare the file full path (checking for no duplications): + translation_file = output_directory / f"{file_name}.txt" + i = 1 + while translation_file.exists(): + i += 1 + translation_file = output_directory / f"{file_name}_{i}.txt" + + # Make sure all directories are created: + translation_file.parent.mkdir(exist_ok=True, parents=True) + + # Write to file: + with open(translation_file, "w") as fp: + fp.write(translation) + + return translation_file diff --git a/functions/master/translate/0.2.0/static/documentation.html b/functions/master/translate/0.2.0/static/documentation.html new file mode 100644 index 00000000..9e4fdd01 --- /dev/null +++ b/functions/master/translate/0.2.0/static/documentation.html @@ -0,0 +1,282 @@ + + + + + + + +translate package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +

    translate package

    + +
    + +
    +
    + +
    +
    +

    translate package#

    +
    +

    Submodules#

    +
    +
    +

    translate.translate module#

    +
    +
    +translate.translate.open_mpi_handler(worker_inputs: List[str], root_worker_inputs: Dict[str, Any] | None = None)[source]#
    +
    +
    +
    +translate.translate.translate(data_path: str | List[str] | Path, output_directory: str, model_name: str | None = None, source_language: str | None = None, target_language: str | None = None, device: str | None = None, model_kwargs: dict | None = None, batch_size: int = 1, translation_kwargs: dict | None = None, verbose: bool = False) Tuple[str, DataFrame, dict][source]#
    +

    Translate text files using a transformer model from Huggingface’s hub according to the source and target languages +given (or using the directly provided model name). The end result is a directory of translated text files and a +dataframe containing the following columns:

    +
      +
    • text_file - The text file path.

    • +
    • translation_file - The translation text file name in the output directory.

    • +
    +
    +
    Parameters:
    +
      +
    • data_path – A directory of text files or a single file or a list of files to translate.

    • +
    • output_directory – Directory where the translated files will be saved.

    • +
    • model_name – The name of a model to load. If None, the model name is constructed using the source and +target languages parameters.

    • +
    • source_language – The source language code (e.g., ‘en’ for English).

    • +
    • target_language – The target language code (e.g., ‘en’ for English).

    • +
    • model_kwargs – Keyword arguments to pass regarding the loading of the model in HuggingFace’s pipeline +function.

    • +
    • device – The device index for transformers. Default will prefer cuda if available.

    • +
    • batch_size – The number of batches to use in translation. The files are translated one by one, but the +sentences can be batched.

    • +
    • translation_kwargs – Additional keyword arguments to pass to a transformers.TranslationPipeline when doing +the translation inference. Notice the batch size here is being added automatically.

    • +
    • verbose – Whether to present logs of a progress bar and errors. Default: False.

    • +
    +
    +
    Returns:
    +

    A tuple of:

    +
      +
    • Path to the output directory.

    • +
    • A dataframe dataset of the translated file names.

    • +
    • A dictionary of errored files that were not translated.

    • +
    +

    +
    +
    +
    +
    +
    +

    Module contents#

    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/translate/0.2.0/static/example.html b/functions/master/translate/0.2.0/static/example.html new file mode 100644 index 00000000..6b086df4 --- /dev/null +++ b/functions/master/translate/0.2.0/static/example.html @@ -0,0 +1,647 @@ + + + + + + + +Translate tutorial + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    + + +
    +
    +

    Translate tutorial#

    +
    +

    Short description and explanation#

    +

    Machine translation has made huge strides in recent years thanks to advances in deep learning, and our translate function makes it even easier to use.
    +Simply tell it where your file is and the languages you’re working with (the one you’re translating from and the one you want),
    +and this function takes care of the rest. It cleverly picks the right pre-trained model for your language pair, ensuring top-notch translations.

    +

    No need to worry about finding the perfect model or dealing with complex setup – it’s all handled behind the scenes.

    +

    With this function, language translation becomes a breeze, making your documents accessible in any language without breaking a sweat.

    +
    +
    +

    Background#

    +

    The function takes two parameters: a model name or the source and target languages, and a path to one or more text files to translate.

    +

    It first checks if a model name was passed. If so, it loads that Helsinki-NLP model.
    +If not, it looks at the source and target languages and loads the appropriate Helsinki-NLP translation model.

    +

    It then reads in the text files and translates them using the loaded model.

    +

    Finally, it writes the translated text out to new files and returns the filename or dir name.

    +

    This allows the user to easily translate a text file to another language using Helsinki-NLP’s pre-trained models by just passing the model name or language pair and source text file.

    +

    This function's automatic model selection is based on the great translation models offered by Helsinki-NLP. Check them out at https://huggingface.co/Helsinki-NLP

    +
    +
    +

    Requirements#

    +

    transformers
    +tqdm

    +
    +
    +

    Documentation#

    +

    data_path: A directory of text files or a single text file or a list of files to translate.

    +

    output_directory: Directory where the translated files will be saved.

    +

    model_name: The name of a model to load. If None, the model name is constructed using the source and
    +target languages parameters from the “Helsinki-NLP” group.

    +

    source_language: The source language code (e.g., ‘en’ for English).

    +

    target_language: The target language code (e.g., ‘en’ for English).

    +

    model_kwargs: Keyword arguments to pass regarding the loading of the model in HuggingFace’s “pipeline” +function.

    +

    device: The device index for transformers. Default will prefer cuda if available.

    +

    batch_size: The number of batches to use in translation. The files are translated one by one, but the +sentences can be batched.

    +

    translation_kwargs: Additional keyword arguments to pass to a “transformers.TranslationPipeline” when doing
    +the translation inference. Notice the batch size here is being added automatically.

    +
    +
    +

    Demo#

    +

    The following demo will show an example of translating a text file written in Turkish to English using the translate function.

    +
    +

    (1.) Import the function (import mlrun, set project and import function)#

    +
    +
    +
    import mlrun
    +
    +
    +
    +
    +

    We want to translate the following Turkish sentence into English, so we will write it to a text file.

    +
    +
    +
    %%writefile data.txt
    +Ali her gece bir kitap okur. # which means: "Ali reads a book every night."
    +
    +
    +
    +
    +
    Writing data.txt
    +
    +
    +
    +
    +

    Setting a project and importing the translate function

    +
    +
    +
    project = mlrun.new_project("test-translate")
    +translate_fn = project.set_function("hub://translate", "translate")
    +
    +
    +
    +
    +
    > 2023-12-06 14:44:05,223 [info] Created and saved project: {'name': 'test-translate', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}
    +
    +
    +
    +
    +
    +
    +
    +

    Usage#

    +
    +

    (2.1.) Manual model selection#

    +

    Here we run our function that we’ve imported from the MLRun Function Hub.
    +We select the specific model, give the function a path to the file and an output directory, and choose to run on the CPU.

    +
    +
    +
    translate_run = translate_fn.run(
    +    handler="translate",
    +    inputs={"data_path": "data.txt"},
    +    params={
    +        "model_name": "Helsinki-NLP/opus-mt-tr-en",
    +        "device": "cpu",
    +        "output_directory": "./",
    +    },
    +    local=True,
    +    returns=[
    +        "files: path",
    +        "text_files_dataframe: dataset",
    +        "errors: dict",
    +    ],
    +)
    +
    +
    +
    +
    +
    > 2023-12-06 14:48:52,794 [info] Storing function: {'name': 'translate-translate', 'uid': '5768d0ddaf06469da053c85d47f61a47', 'db': 'http://mlrun-api:8080'}
    +
    +
    +
    Recommended: pip install sacremoses.
    +
    +
    +
    > 2023-12-06 14:48:56,190 [warning] Skipping logging an object with the log hint '{'key': 'errors', 'artifact_type': 'dict'}' due to the following error:
    +An exception was raised during the packing of '{}': No packager was found for the combination of 'object_type=builtins.dict' and 'artifact_type=dict'.
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
    test-translate0Dec 06 14:48:52completedtranslate-translate
    v3io_user=yonis
    kind=local
    owner=yonis
    host=jupyter-yonis-7c9bdbfb4d-9g2p2
    data_path
    model_name=Helsinki-NLP/opus-mt-tr-en
    device=cpu
    output_directory=./
    files
    text_files_dataframe
    +
    + +
    +
    
    +
    +
    +
    > to track results use the .show() or .logs() methods or click here to open in UI
    > 2023-12-06 14:48:56,409 [info] Run execution finished: {'status': 'completed', 'name': 'translate-translate'}
    +
    +
    +
    +
    +
    +
    +

    (2.2.) Auto model detection#

    +

    Here we run our function that we’ve imported from the MLRun Function Hub.
    +We select the languages to use for choosing the model, give the function a path to the file and an output directory, and choose to run on the CPU.

    +
    +
    +
    translate_run = translate_fn.run(
    +    handler="translate",
    +    inputs={"data_path": "data.txt"},
    +    params={
    +        "target_language": "en",
    +        "source_language": "tr",
    +        "device": "cpu",
    +        "output_directory": "./",
    +    },
    +    local=True,
    +    returns=[
    +        "files: path",
    +        "text_files_dataframe: dataset",
    +        "errors: dict",
    +    ],
    +)
    +
    +
    +
    +
    +

    We can take a look at the file created.

    +
    +
    +

    (3.) Review results#

    +

    We can look at the artifact returned by the run.

    +
    +
    +
    translate_run.artifact("text_files_dataframe").show()
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + +
    text_filetranslation_file
    0data.txtdata_2.txt
    +
    +
    +

    Checking that the translation is correct, we print the text file created by the function, and can see the sentence is as expected.

    +
    +
    +
    with open("data_2.txt", "r") as f:
    +    print(f"Translated text:\n{f.read()}")
    +
    +
    +
    +
    +
    Translated text:
    +Ali reads a book every night.
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/translate/0.2.0/static/function.html b/functions/master/translate/0.2.0/static/function.html new file mode 100644 index 00000000..38951174 --- /dev/null +++ b/functions/master/translate/0.2.0/static/function.html @@ -0,0 +1,150 @@ + + + + + + + + + + + Source + + + + +
    +        
    +spec:
    +  entry_points:
    +    open_mpi_handler:
    +      lineno: 56
    +      parameters:
    +      - name: worker_inputs
    +        type: List[str]
    +      - name: root_worker_inputs
    +        type: Dict[str, Any]
    +        default: null
    +      name: open_mpi_handler
    +      has_kwargs: false
    +      doc: ''
    +      has_varargs: false
    +    decorator:
    +      lineno: 68
    +      parameters:
    +      - name: handler
    +      name: decorator
    +      has_kwargs: false
    +      doc: ''
    +      has_varargs: false
    +    wrapper:
    +      lineno: 73
    +      name: wrapper
    +      has_kwargs: true
    +      doc: ''
    +      has_varargs: false
    +    translate:
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[str, pd.DataFrame, dict]
    +      lineno: 135
    +      parameters:
    +      - name: data_path
    +        type: Union[str, List[str], Path]
    +        doc: A directory of text files or a single file or a list of files to translate.
    +      - name: output_directory
    +        type: str
    +        doc: Directory where the translated files will be saved.
    +      - name: model_name
    +        type: str
    +        doc: The name of a model to load. If None, the model name is constructed using
    +          the source and target languages parameters.
    +        default: null
    +      - name: source_language
    +        type: str
    +        doc: The source language code (e.g., 'en' for English).
    +        default: null
    +      - name: target_language
    +        type: str
    +        doc: The target language code (e.g., 'en' for English).
    +        default: null
    +      - name: device
    +        type: str
    +        doc: The device index for transformers. Default will prefer cuda if available.
    +        default: null
    +      - name: model_kwargs
    +        type: dict
    +        doc: Keyword arguments to pass regarding the loading of the model in HuggingFace's
    +          `pipeline` function.
    +        default: null
    +      - name: batch_size
    +        type: int
    +        doc: The number of batches to use in translation. The files are translated
    +          one by one, but the sentences can be batched.
    +        default: 1
    +      - name: translation_kwargs
    +        type: dict
    +        doc: Additional keyword arguments to pass to a `transformers.TranslationPipeline`
    +          when doing the translation inference. Notice the batch size here is being
    +          added automatically.
    +        default: null
    +      - name: verbose
    +        type: bool
    +        doc: 'Whether to present logs of a progress bar and errors. Default: False.'
    +        default: false
    +      name: translate
    +      has_kwargs: false
    +      doc: 'Translate text files using a transformer model from Huggingface''s hub
    +        according to the source and target languages
    +
    +        given (or using the directly provided model name). The end result is a directory
    +        of translated text files and a
    +
    +        dataframe containing the following columns:
    +
    +
    +        * text_file - The text file path.
    +
    +        * translation_file - The translation text file name in the output directory.'
    +      has_varargs: false
    +  build:
    +    requirements:
    +    - transformers
    +    - sentencepiece
    +    - torch
    +    - tqdm
    +    code_origin: ''
    +    functionSourceCode: 
    +    base_image: mlrun/mlrun
    +    origin_filename: ''
    +  image: ''
    +  default_handler: translate
    +  disable_auto_mount: false
    +  command: ''
    +  description: Translate text files from one language to another
    +verbose: false
    +metadata:
    +  categories:
    +  - genai
    +  - NLP
    +  tag: ''
    +  name: translate
    +kind: job
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/translate/0.2.0/static/item.html b/functions/master/translate/0.2.0/static/item.html new file mode 100644 index 00000000..0c48dfc6 --- /dev/null +++ b/functions/master/translate/0.2.0/static/item.html @@ -0,0 +1,65 @@ + + + + + + + + + + + Source + + + + +
    +        
    +apiVersion: v1
    +categories:
    +- genai
    +- NLP
    +description: Translate text files from one language to another
    +doc: ''
    +example: translate.ipynb
    +generationDate: 2023-12-05:17-20
    +hidden: false
    +icon: ''
    +labels:
    +  author: guyl
    +maintainers: []
    +marketplaceType: ''
    +mlrunVersion: 1.7.0
    +name: translate
    +platformVersion: 3.5.3
    +spec:
    +  filename: translate.py
    +  handler: translate
    +  image: mlrun/mlrun
    +  kind: job
    +  requirements:
    +    - transformers
    +    - sentencepiece
    +    - torch
    +    - tqdm
    +url: ''
    +version: 0.2.0
    +test_valid: True
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/translate/0.2.0/static/source.html b/functions/master/translate/0.2.0/static/source.html new file mode 100644 index 00000000..1582a69b --- /dev/null +++ b/functions/master/translate/0.2.0/static/source.html @@ -0,0 +1,431 @@ + + + + + + + + + + + Source + + + + +
    +        
    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import operator
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import transformers
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    is_mpi = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        is_mpi = context.labels.get("kind", "job") == "mpijob"
    +
    +        if is_mpi:
    +            try:
    +                from mpi4py import MPI
    +
    +                return context, MPI.COMM_WORLD
    +            except ModuleNotFoundError as mpi4py_not_found:
    +                context.logger.error(
    +                    "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                    "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +                )
    +                raise mpi4py_not_found
    +        else:
    +            return context, None
    +    except ModuleNotFoundError as module_not_found:
    +        if is_mpi:
    +            raise module_not_found
    +    return None, None
    +
    +
    +def open_mpi_handler(
    +    worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None
    +):
    +    """
    +    Decorator factory for distributing a handler's inputs between OpenMPI workers.
    +
    +    The environment is probed once, when this function is called (i.e. at decoration time). If MLRun is available,
    +    the module's global logger is replaced with the MLRun context's logger.
    +
    +    :param worker_inputs:      Names of the handler's keyword arguments to split into chunks, one chunk per worker.
    +    :param root_worker_inputs: Keyword arguments to set (overriding the given values) only on the root worker
    +                               (rank #0). Default: None.
    +
    +    :returns: A decorator. It returns the handler unchanged when there is no communicator or only a single worker,
    +              otherwise it returns a wrapper that accepts keyword arguments only.
    +    """
    +    global _LOGGER
    +
    +    # Check for MLRun and OpenMPI availability:
    +    context, comm = _check_mlrun_and_open_mpi()
    +
    +    # Check if MLRun is available, set the global logger to MLRun's:
    +    if context:
    +        _LOGGER = context.logger
    +
    +    def decorator(handler):
    +        """
    +        Wrap the handler for a multi-worker run, or return it as is when there is nothing to distribute.
    +        """
    +        # No communicator or a single worker - nothing to split, so the handler is used directly:
    +        if comm is None or comm.Get_size() == 1:
    +            return handler
    +
    +        @wraps(handler)
    +        def wrapper(**kwargs):
    +            """
    +            Run the handler on this worker's chunk of the inputs and join all outputs on the root worker.
    +
    +            :returns: On rank #0, a tuple of the first worker's output directory, the concatenated dataframes and the
    +                      merged errors dictionaries of all workers. On any other rank, None.
    +
    +            :raises ValueError: If a worker input has fewer items than there are workers.
    +            """
    +            # Get the open mpi environment properties:
    +            size = comm.Get_size()
    +            rank = comm.Get_rank()
    +
    +            # Give the correct chunk of the workers inputs:
    +            for worker_input in worker_inputs:
    +                input_argument = kwargs[worker_input]
    +                if input_argument is None:
    +                    continue
    +                # A single path (file or directory) is expanded to a list of files so it can be split:
    +                if isinstance(input_argument, (str, pathlib.Path)):
    +                    input_argument = _get_text_files(
    +                        data_path=pathlib.Path(input_argument).absolute()
    +                    )
    +                if len(input_argument) < size:
    +                    raise ValueError(
    +                        f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. "
    +                        f"Please reduce the amount of workers for this input."
    +                    )
    +                # Every rank takes an even chunk, the last rank also takes the remainder:
    +                even_chunk_size = len(input_argument) // size
    +                chunk_start = rank * even_chunk_size
    +                chunk_end = (
    +                    (rank + 1) * even_chunk_size
    +                    if rank + 1 < size
    +                    else len(input_argument)
    +                )
    +                context.logger.info(
    +                    f"Rank #{rank}: Processing input chunk of '{worker_input}' "
    +                    f"from index {chunk_start} to {chunk_end}."
    +                )
    +                # Only lists and dataframes are sliced, any other type is passed on as is:
    +                if isinstance(input_argument, list):
    +                    input_argument = input_argument[chunk_start:chunk_end]
    +                elif isinstance(input_argument, pd.DataFrame):
    +                    input_argument = input_argument.iloc[chunk_start:chunk_end:, :]
    +                kwargs[worker_input] = input_argument
    +
    +            # Set the root worker only arguments:
    +            if rank == 0 and root_worker_inputs:
    +                kwargs.update(root_worker_inputs)
    +
    +            # Run the worker:
    +            output = handler(**kwargs)
    +
    +            # Send the output to the root rank (rank #0):
    +            output = comm.gather(output, root=0)
    +            if rank == 0:
    +                # Join the outputs:
    +                context.logger.info("Collecting data from workers to root worker.")
    +                output_directory = output[0][0]
    +                dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0)
    +                # Merge the errors dictionaries of all workers into a single dictionary:
    +                errors_dictionary = reduce(
    +                    operator.ior, [err for _, _, err in output], {}
    +                )
    +                return output_directory, dataframe, errors_dictionary
    +            return None
    +
    +        return wrapper
    +
    +    return decorator
    +
    +
    +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True})
    +def translate(
    +    data_path: Union[str, List[str], pathlib.Path],
    +    output_directory: str,
    +    model_name: str = None,
    +    source_language: str = None,
    +    target_language: str = None,
    +    device: str = None,
    +    model_kwargs: dict = None,
    +    batch_size: int = 1,
    +    translation_kwargs: dict = None,
    +    verbose: bool = False,
    +) -> Tuple[str, pd.DataFrame, dict]:
    +    """
    +    Translate text files using a transformer model from Huggingface's hub according to the source and target languages
    +    given (or using the directly provided model name). The end result is a directory of translated text files and a
    +    dataframe containing the following columns:
    +
    +    * text_file - The text file path.
    +    * translation_file - The translation text file name in the output directory.
    +
    +    :param data_path:          A directory of text files or a single file or a list of files to translate.
    +    :param output_directory:   Directory where the translated files will be saved.
    +    :param model_name:         The name of a model to load. If None, the model name is constructed using the source and
    +                               target languages parameters.
    +    :param source_language:    The source language code (e.g., 'en' for English).
    +    :param target_language:    The target language code (e.g., 'en' for English).
    +    :param model_kwargs:       Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline`
    +                               function.
    +    :param device:             The device index for transformers. Default will prefer cuda if available.
    +    :param batch_size:         The number of batches to use in translation. The files are translated one by one, but the
    +                               sentences can be batched.
    +    :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing
    +                               the translation inference. Notice the batch size here is being added automatically.
    +    :param verbose:            Whether to present logs of a progress bar and errors. Default: True.
    +
    +    :returns: A tuple of:
    +
    +              * Path to the output directory.
    +              * A dataframe dataset of the translated file names.
    +              * A dictionary of errored files that were not translated.
    +    """
    +    global _LOGGER
    +
    +    # Get the input text files to translate:
    +    if verbose:
    +        _LOGGER.info("Collecting text files.")
    +    if isinstance(data_path, str):
    +        data_path = pathlib.Path(data_path).absolute()
    +        text_files = _get_text_files(data_path=data_path)
    +    else:
    +        text_files = data_path
    +    if verbose:
    +        _LOGGER.info(f"Collected {len(text_files)} text files.")
    +
    +    # Get the translation pipeline:
    +    if verbose:
    +        _LOGGER.info(f"Loading model - using device '{device}'.")
    +    translation_pipeline, model_name = _get_translation_pipeline(
    +        model_name=model_name,
    +        source_language=source_language,
    +        target_language=target_language,
    +        device=device,
    +        model_kwargs=model_kwargs,
    +        batch_size=batch_size if batch_size != 1 else None,
    +    )
    +    if verbose:
    +        _LOGGER.info(f"Model '{model_name}' was loaded successfully.")
    +
    +    # Prepare the successes dataframe and errors dictionary to be returned:
    +    successes = []
    +    errors = {}
    +
    +    # Create the output directory:
    +    output_directory = pathlib.Path(output_directory)
    +    output_directory.mkdir(parents=True, exist_ok=True)
    +
    +    # Prepare the translation keyword arguments:
    +    translation_kwargs = translation_kwargs or {}
    +
    +    # Go over the audio files and transcribe:
    +    for text_file in tqdm(
    +        text_files, desc="Translating", unit="file", disable=not verbose
    +    ):
    +        try:
    +            # Translate:
    +            translation = _translate(
    +                text_file=text_file,
    +                translation_pipeline=translation_pipeline,
    +                translation_kwargs=translation_kwargs,
    +            )
    +            # Write the translation to file:
    +            translation_file = _save_to_file(
    +                translation=translation,
    +                file_name=text_file.stem,
    +                output_directory=output_directory,
    +            )
    +            # Note as a success in the list:
    +            successes.append(
    +                [
    +                    text_file.name,
    +                    translation_file.name,
    +                ]
    +            )
    +        except Exception as exception:
    +            # Note the exception as error in the dictionary:
    +            if verbose:
    +                _LOGGER.warning(f"Error in file: '{text_file.name}'")
    +            errors[str(text_file.name)] = str(exception)
    +            continue
    +
    +    # Construct the translations dataframe:
    +    columns = [
    +        "text_file",
    +        "translation_file",
    +    ]
    +    successes = pd.DataFrame(
    +        successes,
    +        columns=columns,
    +    )
    +
    +    # Print the head of the produced dataframe and return:
    +    if verbose:
    +        _LOGGER.info(
    +            f"Done ({successes.shape[0]}/{len(text_files)})\n"
    +            f"Translations summary:\n"
    +            f"{successes.head()}"
    +        )
    +    return str(output_directory), successes, errors
    +
    +
    +def _get_text_files(
    +    data_path: pathlib.Path,
    +) -> List[pathlib.Path]:
    +    """
    +    Collect the files to translate from a directory path or a single file path.
    +
    +    :param data_path: Path of a directory holding the text files or path of a single text file.
    +
    +    :returns: For a directory, the regular files directly inside it whose name matches '*.*', sorted by path. For a
    +              file, a list holding only that file.
    +
    +    :raises ValueError: If `data_path` is neither an existing directory nor an existing file.
    +    """
    +    # Check if the path is of a directory or a file:
    +    if data_path.is_dir():
    +        # Get all files inside the directory. The '*.*' pattern also matches sub-directories that have a dot in their
    +        # name, so only regular files are kept. The result is sorted because `glob` yields entries in arbitrary order
    +        # while `open_mpi_handler` splits this list between the workers by index - every worker must see the same
    +        # order, otherwise files may be translated twice or skipped:
    +        text_files = sorted(path for path in data_path.glob("*.*") if path.is_file())
    +    elif data_path.is_file():
    +        text_files = [data_path]
    +    else:
    +        raise ValueError(
    +            f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. "
    +            f"Given: {str(data_path)} "
    +        )
    +
    +    return text_files
    +
    +
    +def _get_translation_pipeline(
    +    model_name: str = None,
    +    source_language: str = None,
    +    target_language: str = None,
    +    device: str = None,
    +    model_kwargs: dict = None,
    +    batch_size: int = None,
    +) -> Tuple[transformers.Pipeline, str]:
    +    # Construct the model name - if model name is provided (not None) then we take it, otherwise we check both source
    +    # and target were provided to construct the model name:
    +    if model_name is None and (source_language is None or target_language is None):
    +        raise ValueError(
    +            "No model name were given and missing source and / or target languages. In order to translate you must "
    +            "pass a `model_name` or both `source_language` and `target_language`."
    +        )
    +    elif model_name is None:
    +        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
    +
    +    # Initialize the translation pipeline:
    +    try:
    +        translation_pipeline = transformers.pipeline(
    +            task="translation",
    +            model=model_name,
    +            tokenizer=model_name,
    +            device=device,
    +            model_kwargs=model_kwargs,
    +            batch_size=batch_size,
    +        )
    +    except OSError as load_exception:
    +        if (
    +            "is not a valid model identifier listed on 'https://huggingface.co/models'"
    +            in str(load_exception)
    +            and source_language
    +        ):
    +            raise ValueError(
    +                f"The model '{model_name}' is not a valid model identifier. "
    +                f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for "
    +                f"text to text generation, but the model created from the given languages does not exist. "
    +                f"You may check language identifiers at "
    +                f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one "
    +                f"or more language code might be with 3 letters and needs to be found online. "
    +                f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` "
    +                f"parameter."
    +            ) from load_exception
    +        raise load_exception
    +
    +    return translation_pipeline, model_name
    +
    +
    +def _translate(
    +    text_file: pathlib.Path,
    +    translation_pipeline: transformers.Pipeline,
    +    translation_kwargs: dict,
    +) -> str:
    +    # Read the text from file:
    +    with open(text_file, "r") as fp:
    +        text = fp.read()
    +
    +    # Split to paragraphs and each paragraph to sentences:
    +    paragraphs = [paragraph.split(".") for paragraph in text.split("\n")]
    +
    +    # Discover the newline indexes to restore the file to its structure post translation:
    +    newlines_indexes = []
    +    for paragraph in paragraphs[:-1]:
    +        if len(newlines_indexes) == 0:
    +            newlines_indexes.append(len(paragraph) - 1)
    +        else:
    +            newlines_indexes.append(newlines_indexes[-1] + len(paragraph))
    +
    +    # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence
    +    # structure but to ignore empty strings as it will ruin the translation:
    +    sentences = [f"{line}." for paragraph in paragraphs for line in paragraph]
    +
    +    # Translate the sentences:
    +    translations = translation_pipeline(sentences, **translation_kwargs)
    +
    +    # Restructure the full text from the sentences:
    +    translated_text = []
    +    newline_index = newlines_indexes.pop(0) if newlines_indexes else None
    +    for i, translation in enumerate(translations):
    +        # Get the translation:
    +        text = translation["translation_text"]
    +        # Validate if it was an empty sentence before:
    +        if text == ".":
    +            text = ""
    +        # Check if needed to insert a newline:
    +        if newline_index and newline_index == i:
    +            text += "\n"
    +            newline_index = newlines_indexes.pop(0) if newlines_indexes else None
    +        # Collect it:
    +        translated_text.append(text)
    +    translated_text = "".join(translated_text)
    +
    +    return translated_text
    +
    +
    +def _save_to_file(
    +    translation: str, file_name: str, output_directory: pathlib.Path
    +) -> pathlib.Path:
    +    """
    +    Write a translation to a new '.txt' file in the output directory without overwriting an existing file.
    +
    +    :param translation:      The translated text to write.
    +    :param file_name:        The file name (without extension) to use for the translation file.
    +    :param output_directory: The directory to write the file into. It is created if missing.
    +
    +    :returns: The path of the written file - '<file_name>.txt', or '<file_name>_<n>.txt' (n starting from 2) when the
    +              name is already taken.
    +    """
    +    # Look for the first free name, the first fallback suffix is '_2':
    +    target = output_directory / f"{file_name}.txt"
    +    next_suffix = 2
    +    while target.exists():
    +        target = output_directory / f"{file_name}_{next_suffix}.txt"
    +        next_suffix += 1
    +
    +    # Make sure all directories are created:
    +    target.parent.mkdir(parents=True, exist_ok=True)
    +
    +    # Write to file:
    +    target.write_text(translation)
    +
    +    return target
    +
    +        
    +    
    + + \ No newline at end of file diff --git a/functions/master/translate/0.2.0/static/translate.html b/functions/master/translate/0.2.0/static/translate.html new file mode 100644 index 00000000..153b48a4 --- /dev/null +++ b/functions/master/translate/0.2.0/static/translate.html @@ -0,0 +1,574 @@ + + + + + + + +translate.translate + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +
    +

    + +
    +
    +
    +
    +
    + +
    +

    Source code for translate.translate

    +# Copyright 2023 Iguazio
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#   http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +import logging
    +import operator
    +import pathlib
    +from functools import reduce, wraps
    +from typing import Any, Dict, List, Tuple, Union
    +
    +import pandas as pd
    +import transformers
    +from tqdm import tqdm
    +
    +# Get the global logger:
    +_LOGGER = logging.getLogger()
    +
    +
    +def _check_mlrun_and_open_mpi() -> Tuple["mlrun.MLClientCtx", "mpi4py.MPI.Intracomm"]:
    +    """
    +    Get the MLRun context and, when the run is labeled as an 'mpijob', the MPI world communicator.
    +
    +    :returns: A tuple of the MLRun context and `MPI.COMM_WORLD` for an 'mpijob' run, the context and None for any
    +              other run kind, or two None values when a module import fails outside of an 'mpijob' run.
    +
    +    :raises ModuleNotFoundError: If the run is an 'mpijob' but a required module (e.g. `mpi4py`) cannot be imported.
    +    """
    +    mpi_requested = False
    +    try:
    +        import mlrun
    +
    +        context = mlrun.get_or_create_ctx(name="mlrun")
    +        # The run kind is read from the context labels, a missing label is treated as a regular 'job':
    +        mpi_requested = context.labels.get("kind", "job") == "mpijob"
    +        if not mpi_requested:
    +            return context, None
    +
    +        try:
    +            from mpi4py import MPI
    +
    +            communicator = MPI.COMM_WORLD
    +        except ModuleNotFoundError as mpi4py_not_found:
    +            context.logger.error(
    +                "To distribute the function using MLRun's 'mpijob' you need to have `mpi4py` package in your "
    +                "interpreter. Please run `pip install mpi4py` and make sure you have open-mpi."
    +            )
    +            raise mpi4py_not_found
    +        return context, communicator
    +    except ModuleNotFoundError as module_not_found:
    +        # A missing module is fatal only when distribution was requested, otherwise fall back to no context:
    +        if mpi_requested:
    +            raise module_not_found
    +    return None, None
    +
    +
    +
    +[docs] +def open_mpi_handler( + worker_inputs: List[str], root_worker_inputs: Dict[str, Any] = None +): + global _LOGGER + + # Check for MLRun and OpenMPI availability: + context, comm = _check_mlrun_and_open_mpi() + + # Check if MLRun is available, set the global logger to MLRun's: + if context: + _LOGGER = context.logger + + def decorator(handler): + if comm is None or comm.Get_size() == 1: + return handler + + @wraps(handler) + def wrapper(**kwargs): + # Get the open mpi environment properties: + size = comm.Get_size() + rank = comm.Get_rank() + + # Give the correct chunk of the workers inputs: + for worker_input in worker_inputs: + input_argument = kwargs[worker_input] + if input_argument is None: + continue + if isinstance(input_argument, (str, pathlib.Path)): + input_argument = _get_text_files( + data_path=pathlib.Path(input_argument).absolute() + ) + if len(input_argument) < size: + raise ValueError( + f"Cannot split the input '{worker_input}' of length {len(input_argument)} to {size} workers. " + f"Please reduce the amount of workers for this input." + ) + even_chunk_size = len(input_argument) // size + chunk_start = rank * even_chunk_size + chunk_end = ( + (rank + 1) * even_chunk_size + if rank + 1 < size + else len(input_argument) + ) + context.logger.info( + f"Rank #{rank}: Processing input chunk of '{worker_input}' " + f"from index {chunk_start} to {chunk_end}." 
+ ) + if isinstance(input_argument, list): + input_argument = input_argument[chunk_start:chunk_end] + elif isinstance(input_argument, pd.DataFrame): + input_argument = input_argument.iloc[chunk_start:chunk_end:, :] + kwargs[worker_input] = input_argument + + # Set the root worker only arguments: + if rank == 0 and root_worker_inputs: + kwargs.update(root_worker_inputs) + + # Run the worker: + output = handler(**kwargs) + + # Send the output to the root rank (rank #0): + output = comm.gather(output, root=0) + if rank == 0: + # Join the outputs: + context.logger.info("Collecting data from workers to root worker.") + output_directory = output[0][0] + dataframe = pd.concat(objs=[df for _, df, _ in output], axis=0) + errors_dictionary = reduce( + operator.ior, [err for _, _, err in output], {} + ) + return output_directory, dataframe, errors_dictionary + return None + + return wrapper + + return decorator
    + + + +
    +[docs] +@open_mpi_handler(worker_inputs=["data_path"], root_worker_inputs={"verbose": True}) +def translate( + data_path: Union[str, List[str], pathlib.Path], + output_directory: str, + model_name: str = None, + source_language: str = None, + target_language: str = None, + device: str = None, + model_kwargs: dict = None, + batch_size: int = 1, + translation_kwargs: dict = None, + verbose: bool = False, +) -> Tuple[str, pd.DataFrame, dict]: + """ + Translate text files using a transformer model from Huggingface's hub according to the source and target languages + given (or using the directly provided model name). The end result is a directory of translated text files and a + dataframe containing the following columns: + + * text_file - The text file path. + * translation_file - The translation text file name in the output directory. + + :param data_path: A directory of text files or a single file or a list of files to translate. + :param output_directory: Directory where the translated files will be saved. + :param model_name: The name of a model to load. If None, the model name is constructed using the source and + target languages parameters. + :param source_language: The source language code (e.g., 'en' for English). + :param target_language: The target language code (e.g., 'en' for English). + :param model_kwargs: Keyword arguments to pass regarding the loading of the model in HuggingFace's `pipeline` + function. + :param device: The device index for transformers. Default will prefer cuda if available. + :param batch_size: The number of batches to use in translation. The files are translated one by one, but the + sentences can be batched. + :param translation_kwargs: Additional keyword arguments to pass to a `transformers.TranslationPipeline` when doing + the translation inference. Notice the batch size here is being added automatically. + :param verbose: Whether to present logs of a progress bar and errors. Default: True. 
+ + :returns: A tuple of: + + * Path to the output directory. + * A dataframe dataset of the translated file names. + * A dictionary of errored files that were not translated. + """ + global _LOGGER + + # Get the input text files to translate: + if verbose: + _LOGGER.info("Collecting text files.") + if isinstance(data_path, str): + data_path = pathlib.Path(data_path).absolute() + text_files = _get_text_files(data_path=data_path) + else: + text_files = data_path + if verbose: + _LOGGER.info(f"Collected {len(text_files)} text files.") + + # Get the translation pipeline: + if verbose: + _LOGGER.info(f"Loading model - using device '{device}'.") + translation_pipeline, model_name = _get_translation_pipeline( + model_name=model_name, + source_language=source_language, + target_language=target_language, + device=device, + model_kwargs=model_kwargs, + batch_size=batch_size if batch_size != 1 else None, + ) + if verbose: + _LOGGER.info(f"Model '{model_name}' was loaded successfully.") + + # Prepare the successes dataframe and errors dictionary to be returned: + successes = [] + errors = {} + + # Create the output directory: + output_directory = pathlib.Path(output_directory) + output_directory.mkdir(parents=True, exist_ok=True) + + # Prepare the translation keyword arguments: + translation_kwargs = translation_kwargs or {} + + # Go over the audio files and transcribe: + for text_file in tqdm( + text_files, desc="Translating", unit="file", disable=not verbose + ): + try: + # Translate: + translation = _translate( + text_file=text_file, + translation_pipeline=translation_pipeline, + translation_kwargs=translation_kwargs, + ) + # Write the transcription to file: + translation_file = _save_to_file( + translation=translation, + file_name=text_file.stem, + output_directory=output_directory, + ) + # Note as a success in the list: + successes.append( + [ + text_file.name, + translation_file.name, + ] + ) + except Exception as exception: + # Note the exception as error in the 
dictionary: + if verbose: + _LOGGER.warning(f"Error in file: '{text_file.name}'") + errors[str(text_file.name)] = str(exception) + continue + + # Construct the translations dataframe: + columns = [ + "text_file", + "translation_file", + ] + successes = pd.DataFrame( + successes, + columns=columns, + ) + + # Print the head of the produced dataframe and return: + if verbose: + _LOGGER.info( + f"Done ({successes.shape[0]}/{len(text_files)})\n" + f"Translations summary:\n" + f"{successes.head()}" + ) + return str(output_directory), successes, errors
    + + + +def _get_text_files( + data_path: pathlib.Path, +) -> List[pathlib.Path]: + # Check if the path is of a directory or a file: + if data_path.is_dir(): + # Get all files inside the directory: + text_files = list(data_path.glob("*.*")) + elif data_path.is_file(): + text_files = [data_path] + else: + raise ValueError( + f"Unrecognized data path. The parameter `data_path` must be either a directory path or a file path. " + f"Given: {str(data_path)} " + ) + + return text_files + + +def _get_translation_pipeline( + model_name: str = None, + source_language: str = None, + target_language: str = None, + device: str = None, + model_kwargs: dict = None, + batch_size: int = None, +) -> Tuple[transformers.Pipeline, str]: + # Construct the model name - if model name is provided (not None) then we take it, otherwise we check both source + # and target were provided to construct the model name: + if model_name is None and (source_language is None or target_language is None): + raise ValueError( + "No model name were given and missing source and / or target languages. In order to translate you must " + "pass a `model_name` or both `source_language` and `target_language`." + ) + elif model_name is None: + model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}" + + # Initialize the translation pipeline: + try: + translation_pipeline = transformers.pipeline( + task="translation", + model=model_name, + tokenizer=model_name, + device=device, + model_kwargs=model_kwargs, + batch_size=batch_size, + ) + except OSError as load_exception: + if ( + "is not a valid model identifier listed on 'https://huggingface.co/models'" + in str(load_exception) + and source_language + ): + raise ValueError( + f"The model '{model_name}' is not a valid model identifier. " + f"The parameters `source_language` and `target_language` are used to construct a Helsinki model for " + f"text to text generation, but the model created from the given languages does not exist. 
" + f"You may check language identifiers at " + f"https://developers.google.com/admin-sdk/directory/v1/languages, and if the error was not fixed, one " + f"or more language code might be with 3 letters and needs to be found online. " + f"Remember, you can always choose a model directly from the Huggingface hub by using the `model_name` " + f"parameter." + ) from load_exception + raise load_exception + + return translation_pipeline, model_name + + +def _translate( + text_file: pathlib.Path, + translation_pipeline: transformers.Pipeline, + translation_kwargs: dict, +) -> str: + # Read the text from file: + with open(text_file, "r") as fp: + text = fp.read() + + # Split to paragraphs and each paragraph to sentences: + paragraphs = [paragraph.split(".") for paragraph in text.split("\n")] + + # Discover the newline indexes to restore the file to its structure post translation: + newlines_indexes = [] + for paragraph in paragraphs[:-1]: + if len(newlines_indexes) == 0: + newlines_indexes.append(len(paragraph) - 1) + else: + newlines_indexes.append(newlines_indexes[-1] + len(paragraph)) + + # Prepare the batches (each sentence from the paragraphs). Notice we add a dot not only to restore the sentence + # structure but to ignore empty strings as it will ruin the translation: + sentences = [f"{line}." 
for paragraph in paragraphs for line in paragraph] + + # Translate the sentences: + translations = translation_pipeline(sentences, **translation_kwargs) + + # Restructure the full text from the sentences: + translated_text = [] + newline_index = newlines_indexes.pop(0) if newlines_indexes else None + for i, translation in enumerate(translations): + # Get the translation: + text = translation["translation_text"] + # Validate if it was an empty sentence before: + if text == ".": + text = "" + # Check if needed to insert a newline: + if newline_index and newline_index == i: + text += "\n" + newline_index = newlines_indexes.pop(0) if newlines_indexes else None + # Collect it: + translated_text.append(text) + translated_text = "".join(translated_text) + + return translated_text + + +def _save_to_file( + translation: str, file_name: str, output_directory: pathlib.Path +) -> pathlib.Path: + # Prepare the file full path (checking for no duplications): + translation_file = output_directory / f"{file_name}.txt" + i = 1 + while translation_file.exists(): + i += 1 + translation_file = output_directory / f"{file_name}_{i}.txt" + + # Make sure all directories are created: + translation_file.parent.mkdir(exist_ok=True, parents=True) + + # Write to file: + with open(translation_file, "w") as fp: + fp.write(translation) + + return translation_file +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    + + \ No newline at end of file diff --git a/functions/master/translate/latest/src/function.yaml b/functions/master/translate/latest/src/function.yaml index bb165610..9595b77a 100644 --- a/functions/master/translate/latest/src/function.yaml +++ b/functions/master/translate/latest/src/function.yaml @@ -1,77 +1,36 @@ -kind: job -metadata: - name: translate - tag: '' - hash: 7eedf684bcebfbfd964e5503afbb56335c8f4097 - project: '' - labels: - author: guyl - categories: - - data-preparation - - huggingface - - machine-learning - - deep-learning - - NLP spec: - command: '' - args: [] - image: '' - build: - functionSourceCode:  - base_image: mlrun/mlrun - commands: [] - code_origin: '' - origin_filename: '' - requirements: - - transformers - - sentencepiece - - torch - - tqdm entry_points: open_mpi_handler: - name: open_mpi_handler - doc: '' + lineno: 56 parameters: - name: worker_inputs type: List[str] - name: root_worker_inputs type: Dict[str, Any] default: null - outputs: [] - lineno: 56 - has_varargs: false + name: open_mpi_handler has_kwargs: false - decorator: - name: decorator doc: '' + has_varargs: false + decorator: + lineno: 68 parameters: - name: handler - outputs: [] - lineno: 68 - has_varargs: false + name: decorator has_kwargs: false + doc: '' + has_varargs: false wrapper: + lineno: 73 name: wrapper + has_kwargs: true doc: '' - parameters: [] - outputs: [] - lineno: 73 has_varargs: false - has_kwargs: true translate: - name: translate - doc: 'Translate text files using a transformer model from Huggingface''s hub - according to the source and target languages - - given (or using the directly provided model name). The end result is a directory - of translated text files and a - - dataframe containing the following columns: - - - * text_file - The text file path. - - * translation_file - The translation text file name in the output directory.' 
+ outputs: + - doc: 'A tuple of:' + type: Tuple[str, pd.DataFrame, dict] + lineno: 135 parameters: - name: data_path type: Union[str, List[str], Path] @@ -116,20 +75,41 @@ spec: type: bool doc: 'Whether to present logs of a progress bar and errors. Default: True.' default: false - outputs: - - doc: 'A tuple of:' - type: Tuple[str, pd.DataFrame, dict] - lineno: 135 - has_varargs: false + name: translate has_kwargs: false - description: Translate text files from one language to another + doc: 'Translate text files using a transformer model from Huggingface''s hub + according to the source and target languages + + given (or using the directly provided model name). The end result is a directory + of translated text files and a + + dataframe containing the following columns: + + + * text_file - The text file path. + + * translation_file - The translation text file name in the output directory.' + has_varargs: false + build: + requirements: + - transformers + - sentencepiece + - torch + - tqdm + code_origin: '' + functionSourceCode: 
IyBDb3B5cmlnaHQgMjAyMyBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgoKaW1wb3J0IGxvZ2dpbmcKaW1wb3J0IG9wZXJhdG9yCmltcG9ydCBwYXRobGliCmZyb20gZnVuY3Rvb2xzIGltcG9ydCByZWR1Y2UsIHdyYXBzCmZyb20gdHlwaW5nIGltcG9ydCBBbnksIERpY3QsIExpc3QsIFR1cGxlLCBVbmlvbgoKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgdHJhbnNmb3JtZXJzCmZyb20gdHFkbSBpbXBvcnQgdHFkbQoKIyBHZXQgdGhlIGdsb2JhbCBsb2dnZXI6Cl9MT0dHRVIgPSBsb2dnaW5nLmdldExvZ2dlcigpCgoKZGVmIF9jaGVja19tbHJ1bl9hbmRfb3Blbl9tcGkoKSAtPiBUdXBsZVsibWxydW4uTUxDbGllbnRDdHgiLCAibXBpNHB5Lk1QSS5JbnRyYWNvbW0iXToKICAgIGlzX21waSA9IEZhbHNlCiAgICB0cnk6CiAgICAgICAgaW1wb3J0IG1scnVuCgogICAgICAgIGNvbnRleHQgPSBtbHJ1bi5nZXRfb3JfY3JlYXRlX2N0eChuYW1lPSJtbHJ1biIpCiAgICAgICAgaXNfbXBpID0gY29udGV4dC5sYWJlbHMuZ2V0KCJraW5kIiwgImpvYiIpID09ICJtcGlqb2IiCgogICAgICAgIGlmIGlzX21waToKICAgICAgICAgICAgdHJ5OgogICAgICAgICAgICAgICAgZnJvbSBtcGk0cHkgaW1wb3J0IE1QSQoKICAgICAgICAgICAgICAgIHJldHVybiBjb250ZXh0LCBNUEkuQ09NTV9XT1JMRAogICAgICAgICAgICBleGNlcHQgTW9kdWxlTm90Rm91bmRFcnJvciBhcyBtcGk0cHlfbm90X2ZvdW5kOgogICAgICAgICAgICAgICAgY29udGV4dC5sb2dnZXIuZXJyb3IoCiAgICAgICAgICAgICAgICAgICAgIlRvIGRpc3RyaWJ1dGUgdGhlIGZ1bmN0aW9uIHVzaW5nIE1MUnVuJ3MgJ21waWpvYicgeW91IG5lZWQgdG8gaGF2ZSBgbXBpNHB5YCBwYWNrYWdlIGluIHlvdXIgIgogICAgICAgICAgICAgICAgICAgICJpbnRlcnByZXRlci4gUGxlYXNlIHJ1biBgcGlwIGluc3RhbGwgbXBpNHB5YCBhbmQgbWFrZSBzdXJlIHlvdSBo
YXZlIG9wZW4tbXBpLiIKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgICAgIHJhaXNlIG1waTRweV9ub3RfZm91bmQKICAgICAgICBlbHNlOgogICAgICAgICAgICByZXR1cm4gY29udGV4dCwgTm9uZQogICAgZXhjZXB0IE1vZHVsZU5vdEZvdW5kRXJyb3IgYXMgbW9kdWxlX25vdF9mb3VuZDoKICAgICAgICBpZiBpc19tcGk6CiAgICAgICAgICAgIHJhaXNlIG1vZHVsZV9ub3RfZm91bmQKICAgIHJldHVybiBOb25lLCBOb25lCgoKZGVmIG9wZW5fbXBpX2hhbmRsZXIoCiAgICB3b3JrZXJfaW5wdXRzOiBMaXN0W3N0cl0sIHJvb3Rfd29ya2VyX2lucHV0czogRGljdFtzdHIsIEFueV0gPSBOb25lCik6CiAgICBnbG9iYWwgX0xPR0dFUgoKICAgICMgQ2hlY2sgZm9yIE1MUnVuIGFuZCBPcGVuTVBJIGF2YWlsYWJpbGl0eToKICAgIGNvbnRleHQsIGNvbW0gPSBfY2hlY2tfbWxydW5fYW5kX29wZW5fbXBpKCkKCiAgICAjIENoZWNrIGlmIE1MUnVuIGlzIGF2YWlsYWJsZSwgc2V0IHRoZSBnbG9iYWwgbG9nZ2VyIHRvIE1MUnVuJ3M6CiAgICBpZiBjb250ZXh0OgogICAgICAgIF9MT0dHRVIgPSBjb250ZXh0LmxvZ2dlcgoKICAgIGRlZiBkZWNvcmF0b3IoaGFuZGxlcik6CiAgICAgICAgaWYgY29tbSBpcyBOb25lIG9yIGNvbW0uR2V0X3NpemUoKSA9PSAxOgogICAgICAgICAgICByZXR1cm4gaGFuZGxlcgoKICAgICAgICBAd3JhcHMoaGFuZGxlcikKICAgICAgICBkZWYgd3JhcHBlcigqKmt3YXJncyk6CiAgICAgICAgICAgICMgR2V0IHRoZSBvcGVuIG1waSBlbnZpcm9ubWVudCBwcm9wZXJ0aWVzOgogICAgICAgICAgICBzaXplID0gY29tbS5HZXRfc2l6ZSgpCiAgICAgICAgICAgIHJhbmsgPSBjb21tLkdldF9yYW5rKCkKCiAgICAgICAgICAgICMgR2l2ZSB0aGUgY29ycmVjdCBjaHVuayBvZiB0aGUgd29ya2VycyBpbnB1dHM6CiAgICAgICAgICAgIGZvciB3b3JrZXJfaW5wdXQgaW4gd29ya2VyX2lucHV0czoKICAgICAgICAgICAgICAgIGlucHV0X2FyZ3VtZW50ID0ga3dhcmdzW3dvcmtlcl9pbnB1dF0KICAgICAgICAgICAgICAgIGlmIGlucHV0X2FyZ3VtZW50IGlzIE5vbmU6CiAgICAgICAgICAgICAgICAgICAgY29udGludWUKICAgICAgICAgICAgICAgIGlmIGlzaW5zdGFuY2UoaW5wdXRfYXJndW1lbnQsIChzdHIsIHBhdGhsaWIuUGF0aCkpOgogICAgICAgICAgICAgICAgICAgIGlucHV0X2FyZ3VtZW50ID0gX2dldF90ZXh0X2ZpbGVzKAogICAgICAgICAgICAgICAgICAgICAgICBkYXRhX3BhdGg9cGF0aGxpYi5QYXRoKGlucHV0X2FyZ3VtZW50KS5hYnNvbHV0ZSgpCiAgICAgICAgICAgICAgICAgICAgKQogICAgICAgICAgICAgICAgaWYgbGVuKGlucHV0X2FyZ3VtZW50KSA8IHNpemU6CiAgICAgICAgICAgICAgICAgICAgcmFpc2UgVmFsdWVFcnJvcigKICAgICAgICAgICAgICAgICAgICAgICAgZiJDYW5ub3Qgc3BsaXQgdGhlIGlucHV0ICd7d29ya2VyX2lucHV0fScgb2YgbGVuZ3RoIHtsZW4oaW5wdXRfYXJndW1lbnQp
fSB0byB7c2l6ZX0gd29ya2Vycy4gIgogICAgICAgICAgICAgICAgICAgICAgICBmIlBsZWFzZSByZWR1Y2UgdGhlIGFtb3VudCBvZiB3b3JrZXJzIGZvciB0aGlzIGlucHV0LiIKICAgICAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICBldmVuX2NodW5rX3NpemUgPSBsZW4oaW5wdXRfYXJndW1lbnQpIC8vIHNpemUKICAgICAgICAgICAgICAgIGNodW5rX3N0YXJ0ID0gcmFuayAqIGV2ZW5fY2h1bmtfc2l6ZQogICAgICAgICAgICAgICAgY2h1bmtfZW5kID0gKAogICAgICAgICAgICAgICAgICAgIChyYW5rICsgMSkgKiBldmVuX2NodW5rX3NpemUKICAgICAgICAgICAgICAgICAgICBpZiByYW5rICsgMSA8IHNpemUKICAgICAgICAgICAgICAgICAgICBlbHNlIGxlbihpbnB1dF9hcmd1bWVudCkKICAgICAgICAgICAgICAgICkKICAgICAgICAgICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oCiAgICAgICAgICAgICAgICAgICAgZiJSYW5rICN7cmFua306IFByb2Nlc3NpbmcgaW5wdXQgY2h1bmsgb2YgJ3t3b3JrZXJfaW5wdXR9JyAiCiAgICAgICAgICAgICAgICAgICAgZiJmcm9tIGluZGV4IHtjaHVua19zdGFydH0gdG8ge2NodW5rX2VuZH0uIgogICAgICAgICAgICAgICAgKQogICAgICAgICAgICAgICAgaWYgaXNpbnN0YW5jZShpbnB1dF9hcmd1bWVudCwgbGlzdCk6CiAgICAgICAgICAgICAgICAgICAgaW5wdXRfYXJndW1lbnQgPSBpbnB1dF9hcmd1bWVudFtjaHVua19zdGFydDpjaHVua19lbmRdCiAgICAgICAgICAgICAgICBlbGlmIGlzaW5zdGFuY2UoaW5wdXRfYXJndW1lbnQsIHBkLkRhdGFGcmFtZSk6CiAgICAgICAgICAgICAgICAgICAgaW5wdXRfYXJndW1lbnQgPSBpbnB1dF9hcmd1bWVudC5pbG9jW2NodW5rX3N0YXJ0OmNodW5rX2VuZDosIDpdCiAgICAgICAgICAgICAgICBrd2FyZ3Nbd29ya2VyX2lucHV0XSA9IGlucHV0X2FyZ3VtZW50CgogICAgICAgICAgICAjIFNldCB0aGUgcm9vdCB3b3JrZXIgb25seSBhcmd1bWVudHM6CiAgICAgICAgICAgIGlmIHJhbmsgPT0gMCBhbmQgcm9vdF93b3JrZXJfaW5wdXRzOgogICAgICAgICAgICAgICAga3dhcmdzLnVwZGF0ZShyb290X3dvcmtlcl9pbnB1dHMpCgogICAgICAgICAgICAjIFJ1biB0aGUgd29ya2VyOgogICAgICAgICAgICBvdXRwdXQgPSBoYW5kbGVyKCoqa3dhcmdzKQoKICAgICAgICAgICAgIyBTZW5kIHRoZSBvdXRwdXQgdG8gdGhlIHJvb3QgcmFuayAocmFuayAjMCk6CiAgICAgICAgICAgIG91dHB1dCA9IGNvbW0uZ2F0aGVyKG91dHB1dCwgcm9vdD0wKQogICAgICAgICAgICBpZiByYW5rID09IDA6CiAgICAgICAgICAgICAgICAjIEpvaW4gdGhlIG91dHB1dHM6CiAgICAgICAgICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCJDb2xsZWN0aW5nIGRhdGEgZnJvbSB3b3JrZXJzIHRvIHJvb3Qgd29ya2VyLiIpCiAgICAgICAgICAgICAgICBvdXRwdXRfZGlyZWN0b3J5ID0gb3V0cHV0WzBdWzBdCiAgICAgICAgICAgICAgICBkYXRhZnJhbWUgPSBw
ZC5jb25jYXQob2Jqcz1bZGYgZm9yIF8sIGRmLCBfIGluIG91dHB1dF0sIGF4aXM9MCkKICAgICAgICAgICAgICAgIGVycm9yc19kaWN0aW9uYXJ5ID0gcmVkdWNlKAogICAgICAgICAgICAgICAgICAgIG9wZXJhdG9yLmlvciwgW2VyciBmb3IgXywgXywgZXJyIGluIG91dHB1dF0sIHt9CiAgICAgICAgICAgICAgICApCiAgICAgICAgICAgICAgICByZXR1cm4gb3V0cHV0X2RpcmVjdG9yeSwgZGF0YWZyYW1lLCBlcnJvcnNfZGljdGlvbmFyeQogICAgICAgICAgICByZXR1cm4gTm9uZQoKICAgICAgICByZXR1cm4gd3JhcHBlcgoKICAgIHJldHVybiBkZWNvcmF0b3IKCgpAb3Blbl9tcGlfaGFuZGxlcih3b3JrZXJfaW5wdXRzPVsiZGF0YV9wYXRoIl0sIHJvb3Rfd29ya2VyX2lucHV0cz17InZlcmJvc2UiOiBUcnVlfSkKZGVmIHRyYW5zbGF0ZSgKICAgIGRhdGFfcGF0aDogVW5pb25bc3RyLCBMaXN0W3N0cl0sIHBhdGhsaWIuUGF0aF0sCiAgICBvdXRwdXRfZGlyZWN0b3J5OiBzdHIsCiAgICBtb2RlbF9uYW1lOiBzdHIgPSBOb25lLAogICAgc291cmNlX2xhbmd1YWdlOiBzdHIgPSBOb25lLAogICAgdGFyZ2V0X2xhbmd1YWdlOiBzdHIgPSBOb25lLAogICAgZGV2aWNlOiBzdHIgPSBOb25lLAogICAgbW9kZWxfa3dhcmdzOiBkaWN0ID0gTm9uZSwKICAgIGJhdGNoX3NpemU6IGludCA9IDEsCiAgICB0cmFuc2xhdGlvbl9rd2FyZ3M6IGRpY3QgPSBOb25lLAogICAgdmVyYm9zZTogYm9vbCA9IEZhbHNlLAopIC0+IFR1cGxlW3N0ciwgcGQuRGF0YUZyYW1lLCBkaWN0XToKICAgICIiIgogICAgVHJhbnNsYXRlIHRleHQgZmlsZXMgdXNpbmcgYSB0cmFuc2Zvcm1lciBtb2RlbCBmcm9tIEh1Z2dpbmdmYWNlJ3MgaHViIGFjY29yZGluZyB0byB0aGUgc291cmNlIGFuZCB0YXJnZXQgbGFuZ3VhZ2VzCiAgICBnaXZlbiAob3IgdXNpbmcgdGhlIGRpcmVjdGx5IHByb3ZpZGVkIG1vZGVsIG5hbWUpLiBUaGUgZW5kIHJlc3VsdCBpcyBhIGRpcmVjdG9yeSBvZiB0cmFuc2xhdGVkIHRleHQgZmlsZXMgYW5kIGEKICAgIGRhdGFmcmFtZSBjb250YWluaW5nIHRoZSBmb2xsb3dpbmcgY29sdW1uczoKCiAgICAqIHRleHRfZmlsZSAtIFRoZSB0ZXh0IGZpbGUgcGF0aC4KICAgICogdHJhbnNsYXRpb25fZmlsZSAtIFRoZSB0cmFuc2xhdGlvbiB0ZXh0IGZpbGUgbmFtZSBpbiB0aGUgb3V0cHV0IGRpcmVjdG9yeS4KCiAgICA6cGFyYW0gZGF0YV9wYXRoOiAgICAgICAgICBBIGRpcmVjdG9yeSBvZiB0ZXh0IGZpbGVzIG9yIGEgc2luZ2xlIGZpbGUgb3IgYSBsaXN0IG9mIGZpbGVzIHRvIHRyYW5zbGF0ZS4KICAgIDpwYXJhbSBvdXRwdXRfZGlyZWN0b3J5OiAgIERpcmVjdG9yeSB3aGVyZSB0aGUgdHJhbnNsYXRlZCBmaWxlcyB3aWxsIGJlIHNhdmVkLgogICAgOnBhcmFtIG1vZGVsX25hbWU6ICAgICAgICAgVGhlIG5hbWUgb2YgYSBtb2RlbCB0byBsb2FkLiBJZiBOb25lLCB0aGUgbW9kZWwgbmFtZSBpcyBjb25zdHJ1Y3RlZCB1c2luZyB0aGUgc291cmNlIGFuZAog
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdGFyZ2V0IGxhbmd1YWdlcyBwYXJhbWV0ZXJzLgogICAgOnBhcmFtIHNvdXJjZV9sYW5ndWFnZTogICAgVGhlIHNvdXJjZSBsYW5ndWFnZSBjb2RlIChlLmcuLCAnZW4nIGZvciBFbmdsaXNoKS4KICAgIDpwYXJhbSB0YXJnZXRfbGFuZ3VhZ2U6ICAgIFRoZSB0YXJnZXQgbGFuZ3VhZ2UgY29kZSAoZS5nLiwgJ2VuJyBmb3IgRW5nbGlzaCkuCiAgICA6cGFyYW0gbW9kZWxfa3dhcmdzOiAgICAgICBLZXl3b3JkIGFyZ3VtZW50cyB0byBwYXNzIHJlZ2FyZGluZyB0aGUgbG9hZGluZyBvZiB0aGUgbW9kZWwgaW4gSHVnZ2luZ0ZhY2UncyBgcGlwZWxpbmVgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBmdW5jdGlvbi4KICAgIDpwYXJhbSBkZXZpY2U6ICAgICAgICAgICAgIFRoZSBkZXZpY2UgaW5kZXggZm9yIHRyYW5zZm9ybWVycy4gRGVmYXVsdCB3aWxsIHByZWZlciBjdWRhIGlmIGF2YWlsYWJsZS4KICAgIDpwYXJhbSBiYXRjaF9zaXplOiAgICAgICAgIFRoZSBudW1iZXIgb2YgYmF0Y2hlcyB0byB1c2UgaW4gdHJhbnNsYXRpb24uIFRoZSBmaWxlcyBhcmUgdHJhbnNsYXRlZCBvbmUgYnkgb25lLCBidXQgdGhlCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzZW50ZW5jZXMgY2FuIGJlIGJhdGNoZWQuCiAgICA6cGFyYW0gdHJhbnNsYXRpb25fa3dhcmdzOiBBZGRpdGlvbmFsIGtleXdvcmQgYXJndW1lbnRzIHRvIHBhc3MgdG8gYSBgdHJhbnNmb3JtZXJzLlRyYW5zbGF0aW9uUGlwZWxpbmVgIHdoZW4gZG9pbmcKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHRoZSB0cmFuc2xhdGlvbiBpbmZlcmVuY2UuIE5vdGljZSB0aGUgYmF0Y2ggc2l6ZSBoZXJlIGlzIGJlaW5nIGFkZGVkIGF1dG9tYXRpY2FsbHkuCiAgICA6cGFyYW0gdmVyYm9zZTogICAgICAgICAgICBXaGV0aGVyIHRvIHByZXNlbnQgbG9ncyBvZiBhIHByb2dyZXNzIGJhciBhbmQgZXJyb3JzLiBEZWZhdWx0OiBUcnVlLgoKICAgIDpyZXR1cm5zOiBBIHR1cGxlIG9mOgoKICAgICAgICAgICAgICAqIFBhdGggdG8gdGhlIG91dHB1dCBkaXJlY3RvcnkuCiAgICAgICAgICAgICAgKiBBIGRhdGFmcmFtZSBkYXRhc2V0IG9mIHRoZSB0cmFuc2xhdGVkIGZpbGUgbmFtZXMuCiAgICAgICAgICAgICAgKiBBIGRpY3Rpb25hcnkgb2YgZXJyb3JlZCBmaWxlcyB0aGF0IHdlcmUgbm90IHRyYW5zbGF0ZWQuCiAgICAiIiIKICAgIGdsb2JhbCBfTE9HR0VSCgogICAgIyBHZXQgdGhlIGlucHV0IHRleHQgZmlsZXMgdG8gdHJhbnNsYXRlOgogICAgaWYgdmVyYm9zZToKICAgICAgICBfTE9HR0VSLmluZm8oIkNvbGxlY3RpbmcgdGV4dCBmaWxlcy4iKQogICAgaWYgaXNpbnN0YW5jZShkYXRhX3BhdGgsIHN0cik6CiAgICAgICAgZGF0YV9wYXRoID0gcGF0aGxpYi5QYXRoKGRhdGFfcGF0aCkuYWJzb2x1dGUoKQogICAgICAgIHRleHRfZmlsZXMgPSBfZ2V0X3RleHRfZmlsZXMoZGF0YV9wYXRoPWRhdGFfcGF0aCkKICAg
IGVsc2U6CiAgICAgICAgdGV4dF9maWxlcyA9IGRhdGFfcGF0aAogICAgaWYgdmVyYm9zZToKICAgICAgICBfTE9HR0VSLmluZm8oZiJDb2xsZWN0ZWQge2xlbih0ZXh0X2ZpbGVzKX0gdGV4dCBmaWxlcy4iKQoKICAgICMgR2V0IHRoZSB0cmFuc2xhdGlvbiBwaXBlbGluZToKICAgIGlmIHZlcmJvc2U6CiAgICAgICAgX0xPR0dFUi5pbmZvKGYiTG9hZGluZyBtb2RlbCAtIHVzaW5nIGRldmljZSAne2RldmljZX0nLiIpCiAgICB0cmFuc2xhdGlvbl9waXBlbGluZSwgbW9kZWxfbmFtZSA9IF9nZXRfdHJhbnNsYXRpb25fcGlwZWxpbmUoCiAgICAgICAgbW9kZWxfbmFtZT1tb2RlbF9uYW1lLAogICAgICAgIHNvdXJjZV9sYW5ndWFnZT1zb3VyY2VfbGFuZ3VhZ2UsCiAgICAgICAgdGFyZ2V0X2xhbmd1YWdlPXRhcmdldF9sYW5ndWFnZSwKICAgICAgICBkZXZpY2U9ZGV2aWNlLAogICAgICAgIG1vZGVsX2t3YXJncz1tb2RlbF9rd2FyZ3MsCiAgICAgICAgYmF0Y2hfc2l6ZT1iYXRjaF9zaXplIGlmIGJhdGNoX3NpemUgIT0gMSBlbHNlIE5vbmUsCiAgICApCiAgICBpZiB2ZXJib3NlOgogICAgICAgIF9MT0dHRVIuaW5mbyhmIk1vZGVsICd7bW9kZWxfbmFtZX0nIHdhcyBsb2FkZWQgc3VjY2Vzc2Z1bGx5LiIpCgogICAgIyBQcmVwYXJlIHRoZSBzdWNjZXNzZXMgZGF0YWZyYW1lIGFuZCBlcnJvcnMgZGljdGlvbmFyeSB0byBiZSByZXR1cm5lZDoKICAgIHN1Y2Nlc3NlcyA9IFtdCiAgICBlcnJvcnMgPSB7fQoKICAgICMgQ3JlYXRlIHRoZSBvdXRwdXQgZGlyZWN0b3J5OgogICAgb3V0cHV0X2RpcmVjdG9yeSA9IHBhdGhsaWIuUGF0aChvdXRwdXRfZGlyZWN0b3J5KQogICAgb3V0cHV0X2RpcmVjdG9yeS5ta2RpcihwYXJlbnRzPVRydWUsIGV4aXN0X29rPVRydWUpCgogICAgIyBQcmVwYXJlIHRoZSB0cmFuc2xhdGlvbiBrZXl3b3JkIGFyZ3VtZW50czoKICAgIHRyYW5zbGF0aW9uX2t3YXJncyA9IHRyYW5zbGF0aW9uX2t3YXJncyBvciB7fQoKICAgICMgR28gb3ZlciB0aGUgYXVkaW8gZmlsZXMgYW5kIHRyYW5zY3JpYmU6CiAgICBmb3IgdGV4dF9maWxlIGluIHRxZG0oCiAgICAgICAgdGV4dF9maWxlcywgZGVzYz0iVHJhbnNsYXRpbmciLCB1bml0PSJmaWxlIiwgZGlzYWJsZT1ub3QgdmVyYm9zZQogICAgKToKICAgICAgICB0cnk6CiAgICAgICAgICAgICMgVHJhbnNsYXRlOgogICAgICAgICAgICB0cmFuc2xhdGlvbiA9IF90cmFuc2xhdGUoCiAgICAgICAgICAgICAgICB0ZXh0X2ZpbGU9dGV4dF9maWxlLAogICAgICAgICAgICAgICAgdHJhbnNsYXRpb25fcGlwZWxpbmU9dHJhbnNsYXRpb25fcGlwZWxpbmUsCiAgICAgICAgICAgICAgICB0cmFuc2xhdGlvbl9rd2FyZ3M9dHJhbnNsYXRpb25fa3dhcmdzLAogICAgICAgICAgICApCiAgICAgICAgICAgICMgV3JpdGUgdGhlIHRyYW5zY3JpcHRpb24gdG8gZmlsZToKICAgICAgICAgICAgdHJhbnNsYXRpb25fZmlsZSA9IF9zYXZlX3RvX2ZpbGUoCiAgICAgICAgICAgICAgICB0cmFuc2xhdGlv
bj10cmFuc2xhdGlvbiwKICAgICAgICAgICAgICAgIGZpbGVfbmFtZT10ZXh0X2ZpbGUuc3RlbSwKICAgICAgICAgICAgICAgIG91dHB1dF9kaXJlY3Rvcnk9b3V0cHV0X2RpcmVjdG9yeSwKICAgICAgICAgICAgKQogICAgICAgICAgICAjIE5vdGUgYXMgYSBzdWNjZXNzIGluIHRoZSBsaXN0OgogICAgICAgICAgICBzdWNjZXNzZXMuYXBwZW5kKAogICAgICAgICAgICAgICAgWwogICAgICAgICAgICAgICAgICAgIHRleHRfZmlsZS5uYW1lLAogICAgICAgICAgICAgICAgICAgIHRyYW5zbGF0aW9uX2ZpbGUubmFtZSwKICAgICAgICAgICAgICAgIF0KICAgICAgICAgICAgKQogICAgICAgIGV4Y2VwdCBFeGNlcHRpb24gYXMgZXhjZXB0aW9uOgogICAgICAgICAgICAjIE5vdGUgdGhlIGV4Y2VwdGlvbiBhcyBlcnJvciBpbiB0aGUgZGljdGlvbmFyeToKICAgICAgICAgICAgaWYgdmVyYm9zZToKICAgICAgICAgICAgICAgIF9MT0dHRVIud2FybmluZyhmIkVycm9yIGluIGZpbGU6ICd7dGV4dF9maWxlLm5hbWV9JyIpCiAgICAgICAgICAgIGVycm9yc1tzdHIodGV4dF9maWxlLm5hbWUpXSA9IHN0cihleGNlcHRpb24pCiAgICAgICAgICAgIGNvbnRpbnVlCgogICAgIyBDb25zdHJ1Y3QgdGhlIHRyYW5zbGF0aW9ucyBkYXRhZnJhbWU6CiAgICBjb2x1bW5zID0gWwogICAgICAgICJ0ZXh0X2ZpbGUiLAogICAgICAgICJ0cmFuc2xhdGlvbl9maWxlIiwKICAgIF0KICAgIHN1Y2Nlc3NlcyA9IHBkLkRhdGFGcmFtZSgKICAgICAgICBzdWNjZXNzZXMsCiAgICAgICAgY29sdW1ucz1jb2x1bW5zLAogICAgKQoKICAgICMgUHJpbnQgdGhlIGhlYWQgb2YgdGhlIHByb2R1Y2VkIGRhdGFmcmFtZSBhbmQgcmV0dXJuOgogICAgaWYgdmVyYm9zZToKICAgICAgICBfTE9HR0VSLmluZm8oCiAgICAgICAgICAgIGYiRG9uZSAoe3N1Y2Nlc3Nlcy5zaGFwZVswXX0ve2xlbih0ZXh0X2ZpbGVzKX0pXG4iCiAgICAgICAgICAgIGYiVHJhbnNsYXRpb25zIHN1bW1hcnk6XG4iCiAgICAgICAgICAgIGYie3N1Y2Nlc3Nlcy5oZWFkKCl9IgogICAgICAgICkKICAgIHJldHVybiBzdHIob3V0cHV0X2RpcmVjdG9yeSksIHN1Y2Nlc3NlcywgZXJyb3JzCgoKZGVmIF9nZXRfdGV4dF9maWxlcygKICAgIGRhdGFfcGF0aDogcGF0aGxpYi5QYXRoLAopIC0+IExpc3RbcGF0aGxpYi5QYXRoXToKICAgICMgQ2hlY2sgaWYgdGhlIHBhdGggaXMgb2YgYSBkaXJlY3Rvcnkgb3IgYSBmaWxlOgogICAgaWYgZGF0YV9wYXRoLmlzX2RpcigpOgogICAgICAgICMgR2V0IGFsbCBmaWxlcyBpbnNpZGUgdGhlIGRpcmVjdG9yeToKICAgICAgICB0ZXh0X2ZpbGVzID0gbGlzdChkYXRhX3BhdGguZ2xvYigiKi4qIikpCiAgICBlbGlmIGRhdGFfcGF0aC5pc19maWxlKCk6CiAgICAgICAgdGV4dF9maWxlcyA9IFtkYXRhX3BhdGhdCiAgICBlbHNlOgogICAgICAgIHJhaXNlIFZhbHVlRXJyb3IoCiAgICAgICAgICAgIGYiVW5yZWNvZ25pemVkIGRhdGEgcGF0aC4gVGhlIHBhcmFtZXRlciBgZGF0YV9w
YXRoYCBtdXN0IGJlIGVpdGhlciBhIGRpcmVjdG9yeSBwYXRoIG9yIGEgZmlsZSBwYXRoLiAiCiAgICAgICAgICAgIGYiR2l2ZW46IHtzdHIoZGF0YV9wYXRoKX0gIgogICAgICAgICkKCiAgICByZXR1cm4gdGV4dF9maWxlcwoKCmRlZiBfZ2V0X3RyYW5zbGF0aW9uX3BpcGVsaW5lKAogICAgbW9kZWxfbmFtZTogc3RyID0gTm9uZSwKICAgIHNvdXJjZV9sYW5ndWFnZTogc3RyID0gTm9uZSwKICAgIHRhcmdldF9sYW5ndWFnZTogc3RyID0gTm9uZSwKICAgIGRldmljZTogc3RyID0gTm9uZSwKICAgIG1vZGVsX2t3YXJnczogZGljdCA9IE5vbmUsCiAgICBiYXRjaF9zaXplOiBpbnQgPSBOb25lLAopIC0+IFR1cGxlW3RyYW5zZm9ybWVycy5QaXBlbGluZSwgc3RyXToKICAgICMgQ29uc3RydWN0IHRoZSBtb2RlbCBuYW1lIC0gaWYgbW9kZWwgbmFtZSBpcyBwcm92aWRlZCAobm90IE5vbmUpIHRoZW4gd2UgdGFrZSBpdCwgb3RoZXJ3aXNlIHdlIGNoZWNrIGJvdGggc291cmNlCiAgICAjIGFuZCB0YXJnZXQgd2VyZSBwcm92aWRlZCB0byBjb25zdHJ1Y3QgdGhlIG1vZGVsIG5hbWU6CiAgICBpZiBtb2RlbF9uYW1lIGlzIE5vbmUgYW5kIChzb3VyY2VfbGFuZ3VhZ2UgaXMgTm9uZSBvciB0YXJnZXRfbGFuZ3VhZ2UgaXMgTm9uZSk6CiAgICAgICAgcmFpc2UgVmFsdWVFcnJvcigKICAgICAgICAgICAgIk5vIG1vZGVsIG5hbWUgd2VyZSBnaXZlbiBhbmQgbWlzc2luZyBzb3VyY2UgYW5kIC8gb3IgdGFyZ2V0IGxhbmd1YWdlcy4gSW4gb3JkZXIgdG8gdHJhbnNsYXRlIHlvdSBtdXN0ICIKICAgICAgICAgICAgInBhc3MgYSBgbW9kZWxfbmFtZWAgb3IgYm90aCBgc291cmNlX2xhbmd1YWdlYCBhbmQgYHRhcmdldF9sYW5ndWFnZWAuIgogICAgICAgICkKICAgIGVsaWYgbW9kZWxfbmFtZSBpcyBOb25lOgogICAgICAgIG1vZGVsX25hbWUgPSBmIkhlbHNpbmtpLU5MUC9vcHVzLW10LXtzb3VyY2VfbGFuZ3VhZ2V9LXt0YXJnZXRfbGFuZ3VhZ2V9IgoKICAgICMgSW5pdGlhbGl6ZSB0aGUgdHJhbnNsYXRpb24gcGlwZWxpbmU6CiAgICB0cnk6CiAgICAgICAgdHJhbnNsYXRpb25fcGlwZWxpbmUgPSB0cmFuc2Zvcm1lcnMucGlwZWxpbmUoCiAgICAgICAgICAgIHRhc2s9InRyYW5zbGF0aW9uIiwKICAgICAgICAgICAgbW9kZWw9bW9kZWxfbmFtZSwKICAgICAgICAgICAgdG9rZW5pemVyPW1vZGVsX25hbWUsCiAgICAgICAgICAgIGRldmljZT1kZXZpY2UsCiAgICAgICAgICAgIG1vZGVsX2t3YXJncz1tb2RlbF9rd2FyZ3MsCiAgICAgICAgICAgIGJhdGNoX3NpemU9YmF0Y2hfc2l6ZSwKICAgICAgICApCiAgICBleGNlcHQgT1NFcnJvciBhcyBsb2FkX2V4Y2VwdGlvbjoKICAgICAgICBpZiAoCiAgICAgICAgICAgICJpcyBub3QgYSB2YWxpZCBtb2RlbCBpZGVudGlmaWVyIGxpc3RlZCBvbiAnaHR0cHM6Ly9odWdnaW5nZmFjZS5jby9tb2RlbHMnIgogICAgICAgICAgICBpbiBzdHIobG9hZF9leGNlcHRpb24pCiAgICAgICAgICAgIGFuZCBzb3VyY2VfbGFu
Z3VhZ2UKICAgICAgICApOgogICAgICAgICAgICByYWlzZSBWYWx1ZUVycm9yKAogICAgICAgICAgICAgICAgZiJUaGUgbW9kZWwgJ3ttb2RlbF9uYW1lfScgaXMgbm90IGEgdmFsaWQgbW9kZWwgaWRlbnRpZmllci4gIgogICAgICAgICAgICAgICAgZiJUaGUgcGFyYW1ldGVycyBgc291cmNlX2xhbmd1YWdlYCBhbmQgYHRhcmdldF9sYW5ndWFnZWAgYXJlIHVzZWQgdG8gY29uc3RydWN0IGEgSGVsc2lua2kgbW9kZWwgZm9yICIKICAgICAgICAgICAgICAgIGYidGV4dCB0byB0ZXh0IGdlbmVyYXRpb24sIGJ1dCB0aGUgbW9kZWwgY3JlYXRlZCBmcm9tIHRoZSBnaXZlbiBsYW5ndWFnZXMgZG9lcyBub3QgZXhpc3QuICIKICAgICAgICAgICAgICAgIGYiWW91IG1heSBjaGVjayBsYW5ndWFnZSBpZGVudGlmaWVycyBhdCAiCiAgICAgICAgICAgICAgICBmImh0dHBzOi8vZGV2ZWxvcGVycy5nb29nbGUuY29tL2FkbWluLXNkay9kaXJlY3RvcnkvdjEvbGFuZ3VhZ2VzLCBhbmQgaWYgdGhlIGVycm9yIHdhcyBub3QgZml4ZWQsIG9uZSAiCiAgICAgICAgICAgICAgICBmIm9yIG1vcmUgbGFuZ3VhZ2UgY29kZSBtaWdodCBiZSB3aXRoIDMgbGV0dGVycyBhbmQgbmVlZHMgdG8gYmUgZm91bmQgb25saW5lLiAiCiAgICAgICAgICAgICAgICBmIlJlbWVtYmVyLCB5b3UgY2FuIGFsd2F5cyBjaG9vc2UgYSBtb2RlbCBkaXJlY3RseSBmcm9tIHRoZSBIdWdnaW5nZmFjZSBodWIgYnkgdXNpbmcgdGhlIGBtb2RlbF9uYW1lYCAiCiAgICAgICAgICAgICAgICBmInBhcmFtZXRlci4iCiAgICAgICAgICAgICkgZnJvbSBsb2FkX2V4Y2VwdGlvbgogICAgICAgIHJhaXNlIGxvYWRfZXhjZXB0aW9uCgogICAgcmV0dXJuIHRyYW5zbGF0aW9uX3BpcGVsaW5lLCBtb2RlbF9uYW1lCgoKZGVmIF90cmFuc2xhdGUoCiAgICB0ZXh0X2ZpbGU6IHBhdGhsaWIuUGF0aCwKICAgIHRyYW5zbGF0aW9uX3BpcGVsaW5lOiB0cmFuc2Zvcm1lcnMuUGlwZWxpbmUsCiAgICB0cmFuc2xhdGlvbl9rd2FyZ3M6IGRpY3QsCikgLT4gc3RyOgogICAgIyBSZWFkIHRoZSB0ZXh0IGZyb20gZmlsZToKICAgIHdpdGggb3Blbih0ZXh0X2ZpbGUsICJyIikgYXMgZnA6CiAgICAgICAgdGV4dCA9IGZwLnJlYWQoKQoKICAgICMgU3BsaXQgdG8gcGFyYWdyYXBocyBhbmQgZWFjaCBwYXJhZ3JhcGggdG8gc2VudGVuY2VzOgogICAgcGFyYWdyYXBocyA9IFtwYXJhZ3JhcGguc3BsaXQoIi4iKSBmb3IgcGFyYWdyYXBoIGluIHRleHQuc3BsaXQoIlxuIildCgogICAgIyBEaXNjb3ZlciB0aGUgbmV3bGluZSBpbmRleGVzIHRvIHJlc3RvcmUgdGhlIGZpbGUgdG8gaXRzIHN0cnVjdHVyZSBwb3N0IHRyYW5zbGF0aW9uOgogICAgbmV3bGluZXNfaW5kZXhlcyA9IFtdCiAgICBmb3IgcGFyYWdyYXBoIGluIHBhcmFncmFwaHNbOi0xXToKICAgICAgICBpZiBsZW4obmV3bGluZXNfaW5kZXhlcykgPT0gMDoKICAgICAgICAgICAgbmV3bGluZXNfaW5kZXhlcy5hcHBlbmQobGVuKHBhcmFncmFwaCkgLSAxKQogICAg
ICAgIGVsc2U6CiAgICAgICAgICAgIG5ld2xpbmVzX2luZGV4ZXMuYXBwZW5kKG5ld2xpbmVzX2luZGV4ZXNbLTFdICsgbGVuKHBhcmFncmFwaCkpCgogICAgIyBQcmVwYXJlIHRoZSBiYXRjaGVzIChlYWNoIHNlbnRlbmNlIGZyb20gdGhlIHBhcmFncmFwaHMpLiBOb3RpY2Ugd2UgYWRkIGEgZG90IG5vdCBvbmx5IHRvIHJlc3RvcmUgdGhlIHNlbnRlbmNlCiAgICAjIHN0cnVjdHVyZSBidXQgdG8gaWdub3JlIGVtcHR5IHN0cmluZ3MgYXMgaXQgd2lsbCBydWluIHRoZSB0cmFuc2xhdGlvbjoKICAgIHNlbnRlbmNlcyA9IFtmIntsaW5lfS4iIGZvciBwYXJhZ3JhcGggaW4gcGFyYWdyYXBocyBmb3IgbGluZSBpbiBwYXJhZ3JhcGhdCgogICAgIyBUcmFuc2xhdGUgdGhlIHNlbnRlbmNlczoKICAgIHRyYW5zbGF0aW9ucyA9IHRyYW5zbGF0aW9uX3BpcGVsaW5lKHNlbnRlbmNlcywgKip0cmFuc2xhdGlvbl9rd2FyZ3MpCgogICAgIyBSZXN0cnVjdHVyZSB0aGUgZnVsbCB0ZXh0IGZyb20gdGhlIHNlbnRlbmNlczoKICAgIHRyYW5zbGF0ZWRfdGV4dCA9IFtdCiAgICBuZXdsaW5lX2luZGV4ID0gbmV3bGluZXNfaW5kZXhlcy5wb3AoMCkgaWYgbmV3bGluZXNfaW5kZXhlcyBlbHNlIE5vbmUKICAgIGZvciBpLCB0cmFuc2xhdGlvbiBpbiBlbnVtZXJhdGUodHJhbnNsYXRpb25zKToKICAgICAgICAjIEdldCB0aGUgdHJhbnNsYXRpb246CiAgICAgICAgdGV4dCA9IHRyYW5zbGF0aW9uWyJ0cmFuc2xhdGlvbl90ZXh0Il0KICAgICAgICAjIFZhbGlkYXRlIGlmIGl0IHdhcyBhbiBlbXB0eSBzZW50ZW5jZSBiZWZvcmU6CiAgICAgICAgaWYgdGV4dCA9PSAiLiI6CiAgICAgICAgICAgIHRleHQgPSAiIgogICAgICAgICMgQ2hlY2sgaWYgbmVlZGVkIHRvIGluc2VydCBhIG5ld2xpbmU6CiAgICAgICAgaWYgbmV3bGluZV9pbmRleCBhbmQgbmV3bGluZV9pbmRleCA9PSBpOgogICAgICAgICAgICB0ZXh0ICs9ICJcbiIKICAgICAgICAgICAgbmV3bGluZV9pbmRleCA9IG5ld2xpbmVzX2luZGV4ZXMucG9wKDApIGlmIG5ld2xpbmVzX2luZGV4ZXMgZWxzZSBOb25lCiAgICAgICAgIyBDb2xsZWN0IGl0OgogICAgICAgIHRyYW5zbGF0ZWRfdGV4dC5hcHBlbmQodGV4dCkKICAgIHRyYW5zbGF0ZWRfdGV4dCA9ICIiLmpvaW4odHJhbnNsYXRlZF90ZXh0KQoKICAgIHJldHVybiB0cmFuc2xhdGVkX3RleHQKCgpkZWYgX3NhdmVfdG9fZmlsZSgKICAgIHRyYW5zbGF0aW9uOiBzdHIsIGZpbGVfbmFtZTogc3RyLCBvdXRwdXRfZGlyZWN0b3J5OiBwYXRobGliLlBhdGgKKSAtPiBwYXRobGliLlBhdGg6CiAgICAjIFByZXBhcmUgdGhlIGZpbGUgZnVsbCBwYXRoIChjaGVja2luZyBmb3Igbm8gZHVwbGljYXRpb25zKToKICAgIHRyYW5zbGF0aW9uX2ZpbGUgPSBvdXRwdXRfZGlyZWN0b3J5IC8gZiJ7ZmlsZV9uYW1lfS50eHQiCiAgICBpID0gMQogICAgd2hpbGUgdHJhbnNsYXRpb25fZmlsZS5leGlzdHMoKToKICAgICAgICBpICs9IDEKICAgICAgICB0cmFuc2xhdGlvbl9m
aWxlID0gb3V0cHV0X2RpcmVjdG9yeSAvIGYie2ZpbGVfbmFtZX1fe2l9LnR4dCIKCiAgICAjIE1ha2Ugc3VyZSBhbGwgZGlyZWN0b3JpZXMgYXJlIGNyZWF0ZWQ6CiAgICB0cmFuc2xhdGlvbl9maWxlLnBhcmVudC5ta2RpcihleGlzdF9vaz1UcnVlLCBwYXJlbnRzPVRydWUpCgogICAgIyBXcml0ZSB0byBmaWxlOgogICAgd2l0aCBvcGVuKHRyYW5zbGF0aW9uX2ZpbGUsICJ3IikgYXMgZnA6CiAgICAgICAgZnAud3JpdGUodHJhbnNsYXRpb24pCgogICAgcmV0dXJuIHRyYW5zbGF0aW9uX2ZpbGUK + base_image: mlrun/mlrun + origin_filename: '' + image: '' default_handler: translate disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} + command: '' + description: Translate text files from one language to another verbose: false +metadata: + categories: + - genai + - NLP + tag: '' + name: translate +kind: job diff --git a/functions/master/translate/latest/src/item.yaml b/functions/master/translate/latest/src/item.yaml index e6394734..839d1efa 100644 --- a/functions/master/translate/latest/src/item.yaml +++ b/functions/master/translate/latest/src/item.yaml @@ -1,9 +1,6 @@ apiVersion: v1 categories: -- data-preparation -- huggingface -- machine-learning -- deep-learning +- genai - NLP description: Translate text files from one language to another doc: '' @@ -15,7 +12,7 @@ labels: author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.1 +mlrunVersion: 1.7.0 name: translate platformVersion: 3.5.3 spec: @@ -29,5 +26,5 @@ spec: - torch - tqdm url: '' -version: 0.1.0 +version: 0.2.0 test_valid: True diff --git a/functions/master/translate/latest/static/documentation.html b/functions/master/translate/latest/static/documentation.html index 237b2231..9e4fdd01 100644 --- a/functions/master/translate/latest/static/documentation.html +++ b/functions/master/translate/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/translate/latest/static/example.html b/functions/master/translate/latest/static/example.html index e763bd83..6b086df4 100644 
--- a/functions/master/translate/latest/static/example.html +++ b/functions/master/translate/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/translate/latest/static/function.html b/functions/master/translate/latest/static/function.html index ca69dbf3..38951174 100644 --- a/functions/master/translate/latest/static/function.html +++ b/functions/master/translate/latest/static/function.html @@ -28,80 +28,39 @@
             
    -kind: job
    -metadata:
    -  name: translate
    -  tag: ''
    -  hash: 7eedf684bcebfbfd964e5503afbb56335c8f4097
    -  project: ''
    -  labels:
    -    author: guyl
    -  categories:
    -  - data-preparation
    -  - huggingface
    -  - machine-learning
    -  - deep-learning
    -  - NLP
     spec:
    -  command: ''
    -  args: []
    -  image: ''
    -  build:
    -    functionSourceCode: 
    -    base_image: mlrun/mlrun
    -    commands: []
    -    code_origin: ''
    -    origin_filename: ''
    -    requirements:
    -    - transformers
    -    - sentencepiece
    -    - torch
    -    - tqdm
       entry_points:
         open_mpi_handler:
    -      name: open_mpi_handler
    -      doc: ''
    +      lineno: 56
           parameters:
           - name: worker_inputs
             type: List[str]
           - name: root_worker_inputs
             type: Dict[str, Any]
             default: null
    -      outputs: []
    -      lineno: 56
    -      has_varargs: false
    +      name: open_mpi_handler
           has_kwargs: false
    -    decorator:
    -      name: decorator
           doc: ''
    +      has_varargs: false
    +    decorator:
    +      lineno: 68
           parameters:
           - name: handler
    -      outputs: []
    -      lineno: 68
    -      has_varargs: false
    +      name: decorator
           has_kwargs: false
    +      doc: ''
    +      has_varargs: false
         wrapper:
    +      lineno: 73
           name: wrapper
    +      has_kwargs: true
           doc: ''
    -      parameters: []
    -      outputs: []
    -      lineno: 73
           has_varargs: false
    -      has_kwargs: true
         translate:
    -      name: translate
    -      doc: 'Translate text files using a transformer model from Huggingface''s hub
    -        according to the source and target languages
    -
    -        given (or using the directly provided model name). The end result is a directory
    -        of translated text files and a
    -
    -        dataframe containing the following columns:
    -
    -
    -        * text_file - The text file path.
    -
    -        * translation_file - The translation text file name in the output directory.'
    +      outputs:
    +      - doc: 'A tuple of:'
    +        type: Tuple[str, pd.DataFrame, dict]
    +      lineno: 135
           parameters:
           - name: data_path
             type: Union[str, List[str], Path]
    @@ -146,23 +105,44 @@
             type: bool
             doc: 'Whether to present logs of a progress bar and errors. Default: True.'
             default: false
    -      outputs:
    -      - doc: 'A tuple of:'
    -        type: Tuple[str, pd.DataFrame, dict]
    -      lineno: 135
    -      has_varargs: false
    +      name: translate
           has_kwargs: false
    -  description: Translate text files from one language to another
    +      doc: 'Translate text files using a transformer model from Huggingface''s hub
    +        according to the source and target languages
    +
    +        given (or using the directly provided model name). The end result is a directory
    +        of translated text files and a
    +
    +        dataframe containing the following columns:
    +
    +
    +        * text_file - The text file path.
    +
    +        * translation_file - The translation text file name in the output directory.'
    +      has_varargs: false
    +  build:
    +    requirements:
    +    - transformers
    +    - sentencepiece
    +    - torch
    +    - tqdm
    +    code_origin: ''
    +    functionSourceCode: 
    +    base_image: mlrun/mlrun
    +    origin_filename: ''
    +  image: ''
       default_handler: translate
       disable_auto_mount: false
    -  clone_target_dir: ''
    -  env: []
    -  priority_class_name: ''
    -  preemption_mode: prevent
    -  affinity: null
    -  tolerations: null
    -  security_context: {}
    +  command: ''
    +  description: Translate text files from one language to another
     verbose: false
    +metadata:
    +  categories:
    +  - genai
    +  - NLP
    +  tag: ''
    +  name: translate
    +kind: job
     
             
         
    diff --git a/functions/master/translate/latest/static/item.html b/functions/master/translate/latest/static/item.html index 4e7aaf85..0c48dfc6 100644 --- a/functions/master/translate/latest/static/item.html +++ b/functions/master/translate/latest/static/item.html @@ -30,10 +30,7 @@ apiVersion: v1 categories: -- data-preparation -- huggingface -- machine-learning -- deep-learning +- genai - NLP description: Translate text files from one language to another doc: '' @@ -45,7 +42,7 @@ author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.5.1 +mlrunVersion: 1.7.0 name: translate platformVersion: 3.5.3 spec: @@ -59,7 +56,7 @@ - torch - tqdm url: '' -version: 0.1.0 +version: 0.2.0 test_valid: True diff --git a/functions/master/translate/latest/static/translate.html b/functions/master/translate/latest/static/translate.html index 1dc8ac2a..153b48a4 100644 --- a/functions/master/translate/latest/static/translate.html +++ b/functions/master/translate/latest/static/translate.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_server/1.2.0/static/documentation.html b/functions/master/v2_model_server/1.2.0/static/documentation.html index fc69d875..8c6b6a58 100644 --- a/functions/master/v2_model_server/1.2.0/static/documentation.html +++ b/functions/master/v2_model_server/1.2.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_server/1.2.0/static/example.html b/functions/master/v2_model_server/1.2.0/static/example.html index 6cc76013..c2a447d9 100644 --- a/functions/master/v2_model_server/1.2.0/static/example.html +++ b/functions/master/v2_model_server/1.2.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_server/1.2.0/static/v2_model_server.html b/functions/master/v2_model_server/1.2.0/static/v2_model_server.html index 9c6a96d5..2245fc64 100644 --- a/functions/master/v2_model_server/1.2.0/static/v2_model_server.html +++ 
b/functions/master/v2_model_server/1.2.0/static/v2_model_server.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_server/latest/static/documentation.html b/functions/master/v2_model_server/latest/static/documentation.html index fc69d875..8c6b6a58 100644 --- a/functions/master/v2_model_server/latest/static/documentation.html +++ b/functions/master/v2_model_server/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_server/latest/static/example.html b/functions/master/v2_model_server/latest/static/example.html index 6cc76013..c2a447d9 100644 --- a/functions/master/v2_model_server/latest/static/example.html +++ b/functions/master/v2_model_server/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_server/latest/static/v2_model_server.html b/functions/master/v2_model_server/latest/static/v2_model_server.html index 9c6a96d5..2245fc64 100644 --- a/functions/master/v2_model_server/latest/static/v2_model_server.html +++ b/functions/master/v2_model_server/latest/static/v2_model_server.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_tester/1.1.0/static/documentation.html b/functions/master/v2_model_tester/1.1.0/static/documentation.html index 1252dca9..b7cbc37b 100644 --- a/functions/master/v2_model_tester/1.1.0/static/documentation.html +++ b/functions/master/v2_model_tester/1.1.0/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_tester/1.1.0/static/example.html b/functions/master/v2_model_tester/1.1.0/static/example.html index f5d574bb..74c41d12 100644 --- a/functions/master/v2_model_tester/1.1.0/static/example.html +++ b/functions/master/v2_model_tester/1.1.0/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_tester/1.1.0/static/v2_model_tester.html b/functions/master/v2_model_tester/1.1.0/static/v2_model_tester.html index 2eb9604e..996c4470 100644 --- 
a/functions/master/v2_model_tester/1.1.0/static/v2_model_tester.html +++ b/functions/master/v2_model_tester/1.1.0/static/v2_model_tester.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_tester/latest/static/documentation.html b/functions/master/v2_model_tester/latest/static/documentation.html index 1252dca9..b7cbc37b 100644 --- a/functions/master/v2_model_tester/latest/static/documentation.html +++ b/functions/master/v2_model_tester/latest/static/documentation.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_tester/latest/static/example.html b/functions/master/v2_model_tester/latest/static/example.html index f5d574bb..74c41d12 100644 --- a/functions/master/v2_model_tester/latest/static/example.html +++ b/functions/master/v2_model_tester/latest/static/example.html @@ -20,7 +20,7 @@ - + diff --git a/functions/master/v2_model_tester/latest/static/v2_model_tester.html b/functions/master/v2_model_tester/latest/static/v2_model_tester.html index 2eb9604e..996c4470 100644 --- a/functions/master/v2_model_tester/latest/static/v2_model_tester.html +++ b/functions/master/v2_model_tester/latest/static/v2_model_tester.html @@ -20,7 +20,7 @@ - +