From 651e4b26005c878c456ffd47bb95e3f60f9e80d3 Mon Sep 17 00:00:00 2001 From: MOLYHECI Date: Sun, 7 Sep 2025 20:55:58 +0800 Subject: [PATCH 1/4] divide general_text operators into core_text, general_pt, general_sft --- dataflow/operators/core_text/__init__.py | 19 ++- .../eval}/perspective_scorer.py | 2 +- .../eval}/task2vec/task2vec.py | 0 .../eval}/task2vec/task_similarity.py | 0 .../eval}/task2vec/utils.py | 0 .../eval}/task2vec_scorer.py | 6 +- .../eval}/vendi_scorer.py | 2 +- .../filter/blocklist/en.txt | 0 .../filter/blocklist/zh.txt | 0 .../core_text/filter/blocklist_filter.py | 82 +++++++++++ .../filter/general_filter.py | 0 .../filter/hash_deduplicator.py | 2 +- .../filter/language_filter.py | 0 .../filter/llm_language_filter.py | 0 .../filter/minhash_deduplicator.py | 4 +- .../filter/ngramhash_deduplicator.py | 2 +- .../filter/perspective_filter.py | 4 +- .../filter/sem_deduplicator.py | 2 +- .../filter/simhash_deduplicator.py | 2 +- .../core_text/filter/word_number_filter.py | 64 +++++++++ dataflow/operators/general_pt/__init__.py | 83 +++++++++++ .../models => general_pt/eval}/Kenlm/model.py | 0 .../Qurating/modeling/modeling_flash_llama.py | 0 .../eval}/Qurating/qurater_annotate.py | 0 .../gen/bleu => general_pt/eval}/__init__.py | 0 .../eval/bert_sample_evaluator.py} | 2 +- .../eval/bleu}/__init__.py | 0 .../eval/gen => general_pt/eval}/bleu/bleu.py | 0 .../eval/bleu_sample_evaluator.py} | 4 +- .../eval/cider}/__init__.py | 0 .../gen => general_pt/eval}/cider/cider.py | 0 .../eval/cider_sample_evaluator.py} | 6 +- .../eval/debertav3_sample_evaluator.py} | 2 +- .../eval/fineweb_edu_sample_evaluator.py} | 2 +- .../eval/langkit_sample_evaluator.py} | 4 +- .../lexical_diversity_sample_evaluator.py} | 4 +- .../eval/meta_sample_evaluator.py} | 3 +- .../eval/ngram_sample_evaluator.py} | 2 +- .../eval/pair_qual_sample_evaluator.py} | 2 +- .../eval/perplexity_sample_evaluator.py} | 6 +- .../eval/presidio_sample_evaluator.py} | 2 +- .../eval/qurating_sample_evaluator.py} | 6 +- .../eval/textbook_sample_evaluator.py} | 2 +- .../filter/__init__.py | 0 .../filter/ccnet_deduplicate_filter.py} | 2 +- .../filter/debertav3_filter.py | 4 +- .../filter/fineweb_edu_filter.py | 4 +- .../filter/heuristics_filter.py} | 133 ------------------ .../filter/langkit_filter.py | 4 +- .../filter/lexical_diversity_filter.py | 4 +- .../filter/ngram_filter.py | 4 +- .../filter/pair_qual_filter.py | 4 +- .../filter/perplexity_filter.py | 4 +- .../filter/presidio_filter.py | 4 +- .../filter/qurating_filter.py | 6 +- .../filter/text_book_filter.py | 4 +- .../generate/phi4qa_generator.py} | 6 +- .../refine/html_entity_refiner.py | 0 .../refine/html_url_remover_refiner.py | 0 .../refine/lowercase_refiner.py | 0 .../refine/ner_refiner.py | 0 .../refine/pii_anonymize_refiner.py | 0 .../refine/ref_removal_refiner.py | 0 .../refine/remove_contractions_refiner.py | 0 .../refine/remove_emoji_refiner.py | 0 .../refine/remove_emoticons_refiner.py | 0 .../refine/remove_extra_spaces_refiner.py | 0 .../refine/remove_image_ref_refiner.py | 0 .../refine/remove_number_refiner.py | 0 .../refine/remove_punctuation_refiner.py | 0 .../remove_repetitions_punctuation_refiner.py | 0 .../refine/remove_stopwords_refiner.py | 0 .../refine/spelling_correction_refiner.py | 0 .../refine/stemming_lemmatization_refiner.py | 0 .../refine/text_normalization_refiner.py | 0 dataflow/operators/general_sft/__init__.py | 33 +++++ .../eval}/Superfiltering/data_analysis.py | 0 .../eval}/alpagasus_scorer.py | 2 +- .../eval}/deita_complexity_scorer.py | 
2 +- .../eval}/deita_quality_scorer.py | 2 +- .../eval}/instag_scorer.py | 2 +- .../models => general_sft/eval}/rm_scorer.py | 2 +- .../eval}/superfiltering_scorer.py | 4 +- .../eval}/treeinstruct_scorer.py | 2 +- .../filter/alpagasus_filter.py | 4 +- .../filter/deita_complexity_filter.py | 4 +- .../filter/deita_quality_filter.py | 4 +- .../filter/instag_filter.py | 4 +- .../filter/rm_filter.py} | 4 +- .../filter/superfiltering_filter.py | 4 +- .../filter/treeinstruct_filter.py | 4 +- .../generate/condor_generator.py | 0 .../generate/sft_generator_from_seed.py | 0 .../refine/condor_refiner.py | 0 dataflow/operators/general_text/__init__.py | 120 ---------------- dataflow/prompts/general_text.py | 2 +- .../statics/core_text_pipeline/core_filter.py | 2 +- .../core_sft_from_scratch.py | 2 +- .../text_sft_synthesis_pipeline.py | 6 +- .../pipelines/cpu_pipelines/text_pt_filter.py | 13 +- .../cpu_pipelines/text_sft_filter.py | 2 +- .../pipelines/gpu_pipelines/text_pt_filter.py | 12 +- .../gpu_pipelines/text_pt_synthetic.py | 16 ++- .../gpu_pipelines/text_sft_filter.py | 4 +- .../gpu_pipelines/text_sft_synthetic.py | 18 ++- dataflow/utils/registry.py | 3 +- test/test_autoop.py | 4 +- 107 files changed, 411 insertions(+), 374 deletions(-) rename dataflow/operators/{general_text/eval/APIcaller => core_text/eval}/perspective_scorer.py (98%) rename dataflow/operators/{general_text/eval/diversity => core_text/eval}/task2vec/task2vec.py (100%) rename dataflow/operators/{general_text/eval/diversity => core_text/eval}/task2vec/task_similarity.py (100%) rename dataflow/operators/{general_text/eval/diversity => core_text/eval}/task2vec/utils.py (100%) rename dataflow/operators/{general_text/eval/diversity => core_text/eval}/task2vec_scorer.py (96%) rename dataflow/operators/{general_text/eval/diversity => core_text/eval}/vendi_scorer.py (98%) rename dataflow/operators/{general_text => core_text}/filter/blocklist/en.txt (100%) rename dataflow/operators/{general_text => core_text}/filter/blocklist/zh.txt (100%) create mode 100644 dataflow/operators/core_text/filter/blocklist_filter.py rename dataflow/operators/{general_text => core_text}/filter/general_filter.py (100%) rename dataflow/operators/{general_text => core_text}/filter/hash_deduplicator.py (99%) rename dataflow/operators/{general_text => core_text}/filter/language_filter.py (100%) rename dataflow/operators/{general_text => core_text}/filter/llm_language_filter.py (100%) rename dataflow/operators/{general_text => core_text}/filter/minhash_deduplicator.py (98%) rename dataflow/operators/{general_text => core_text}/filter/ngramhash_deduplicator.py (99%) rename dataflow/operators/{general_text => core_text}/filter/perspective_filter.py (95%) rename dataflow/operators/{general_text => core_text}/filter/sem_deduplicator.py (99%) rename dataflow/operators/{general_text => core_text}/filter/simhash_deduplicator.py (99%) create mode 100644 dataflow/operators/core_text/filter/word_number_filter.py create mode 100644 dataflow/operators/general_pt/__init__.py rename dataflow/operators/{general_text/eval/models => general_pt/eval}/Kenlm/model.py (100%) rename dataflow/operators/{general_text/eval/models => general_pt/eval}/Qurating/modeling/modeling_flash_llama.py (100%) rename dataflow/operators/{general_text/eval/models => general_pt/eval}/Qurating/qurater_annotate.py (100%) rename dataflow/operators/{general_text/eval/gen/bleu => general_pt/eval}/__init__.py (100%) rename dataflow/operators/{general_text/eval/gen/bert_scorer.py => 
general_pt/eval/bert_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/gen/cider => general_pt/eval/bleu}/__init__.py (100%) rename dataflow/operators/{general_text/eval/gen => general_pt/eval}/bleu/bleu.py (100%) rename dataflow/operators/{general_text/eval/gen/bleu_scorer.py => general_pt/eval/bleu_sample_evaluator.py} (97%) rename dataflow/operators/{general_text/eval/statistics => general_pt/eval/cider}/__init__.py (100%) rename dataflow/operators/{general_text/eval/gen => general_pt/eval}/cider/cider.py (100%) rename dataflow/operators/{general_text/eval/gen/cider_scorer.py => general_pt/eval/cider_sample_evaluator.py} (95%) rename dataflow/operators/{general_text/eval/models/debertav3_scorer.py => general_pt/eval/debertav3_sample_evaluator.py} (99%) rename dataflow/operators/{general_text/eval/models/fineweb_edu_scorer.py => general_pt/eval/fineweb_edu_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/statistics/langkit_scorer.py => general_pt/eval/langkit_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/statistics/lexical_diversity_scorer.py => general_pt/eval/lexical_diversity_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/APIcaller/meta_scorer.py => general_pt/eval/meta_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/statistics/ngram_scorer.py => general_pt/eval/ngram_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/models/pair_qual_scorer.py => general_pt/eval/pair_qual_sample_evaluator.py} (99%) rename dataflow/operators/{general_text/eval/models/perplexity_scorer.py => general_pt/eval/perplexity_sample_evaluator.py} (95%) rename dataflow/operators/{general_text/eval/models/presidio_scorer.py => general_pt/eval/presidio_sample_evaluator.py} (98%) rename dataflow/operators/{general_text/eval/models/qurating_scorer.py => general_pt/eval/qurating_sample_evaluator.py} (96%) rename dataflow/operators/{general_text/eval/models/textbook_scorer.py => general_pt/eval/textbook_sample_evaluator.py} (98%) rename dataflow/operators/{general_text => general_pt}/filter/__init__.py (100%) rename dataflow/operators/{general_text/filter/ccnet_deduplicator.py => general_pt/filter/ccnet_deduplicate_filter.py} (99%) rename dataflow/operators/{general_text => general_pt}/filter/debertav3_filter.py (96%) rename dataflow/operators/{general_text => general_pt}/filter/fineweb_edu_filter.py (95%) rename dataflow/operators/{general_text/filter/heuristics.py => general_pt/filter/heuristics_filter.py} (90%) rename dataflow/operators/{general_text => general_pt}/filter/langkit_filter.py (98%) rename dataflow/operators/{general_text => general_pt}/filter/lexical_diversity_filter.py (96%) rename dataflow/operators/{general_text => general_pt}/filter/ngram_filter.py (95%) rename dataflow/operators/{general_text => general_pt}/filter/pair_qual_filter.py (95%) rename dataflow/operators/{general_text => general_pt}/filter/perplexity_filter.py (96%) rename dataflow/operators/{general_text => general_pt}/filter/presidio_filter.py (95%) rename dataflow/operators/{general_text => general_pt}/filter/qurating_filter.py (96%) rename dataflow/operators/{general_text => general_pt}/filter/text_book_filter.py (96%) rename dataflow/operators/{general_text/generate/pretrain_generator.py => general_pt/generate/phi4qa_generator.py} (95%) rename dataflow/operators/{general_text => general_pt}/refine/html_entity_refiner.py (100%) rename dataflow/operators/{general_text => 
general_pt}/refine/html_url_remover_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/lowercase_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/ner_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/pii_anonymize_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/ref_removal_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_contractions_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_emoji_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_emoticons_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_extra_spaces_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_image_ref_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_number_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_punctuation_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_repetitions_punctuation_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/remove_stopwords_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/spelling_correction_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/stemming_lemmatization_refiner.py (100%) rename dataflow/operators/{general_text => general_pt}/refine/text_normalization_refiner.py (100%) create mode 100644 dataflow/operators/general_sft/__init__.py rename dataflow/operators/{general_text/eval/models => general_sft/eval}/Superfiltering/data_analysis.py (100%) rename dataflow/operators/{general_text/eval/APIcaller => general_sft/eval}/alpagasus_scorer.py (98%) rename dataflow/operators/{general_text/eval/models => general_sft/eval}/deita_complexity_scorer.py (99%) rename dataflow/operators/{general_text/eval/models => general_sft/eval}/deita_quality_scorer.py (99%) rename dataflow/operators/{general_text/eval/models => general_sft/eval}/instag_scorer.py (99%) rename dataflow/operators/{general_text/eval/models => general_sft/eval}/rm_scorer.py (98%) rename dataflow/operators/{general_text/eval/models => general_sft/eval}/superfiltering_scorer.py (96%) rename dataflow/operators/{general_text/eval/APIcaller => general_sft/eval}/treeinstruct_scorer.py (98%) rename dataflow/operators/{general_text => general_sft}/filter/alpagasus_filter.py (96%) rename dataflow/operators/{general_text => general_sft}/filter/deita_complexity_filter.py (96%) rename dataflow/operators/{general_text => general_sft}/filter/deita_quality_filter.py (96%) rename dataflow/operators/{general_text => general_sft}/filter/instag_filter.py (95%) rename dataflow/operators/{general_text/filter/reward_model_filter.py => general_sft/filter/rm_filter.py} (96%) rename dataflow/operators/{general_text => general_sft}/filter/superfiltering_filter.py (97%) rename dataflow/operators/{general_text => general_sft}/filter/treeinstruct_filter.py (96%) rename dataflow/operators/{general_text => general_sft}/generate/condor_generator.py (100%) rename dataflow/operators/{general_text => general_sft}/generate/sft_generator_from_seed.py (100%) rename dataflow/operators/{general_text => general_sft}/refine/condor_refiner.py (100%) delete mode 100644 dataflow/operators/general_text/__init__.py diff --git a/dataflow/operators/core_text/__init__.py 
b/dataflow/operators/core_text/__init__.py index 20075855..98aada34 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -4,9 +4,26 @@ from .generate.prompted_generator import PromptedGenerator from .generate.paired_prompted_generator import PairedPromptedGenerator from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator + from .eval.prompted_eval import PromptedEvaluator - from .filter.prompted_filter import PromptedFilter + from .eval.perspective_scorer import PerspectiveSampleEvaluator + from .eval.task2vec_scorer import Task2VecDatasetEvaluator + from .eval.vendi_scorer import VendiDatasetEvaluator from .refine.prompted_refiner import PromptedRefiner + + from .filter.prompted_filter import PromptedFilter + from .filter.general_filter import GeneralFilter + from .filter.minhash_deduplicator import MinHashDeduplicateFilter + from .filter.ngramhash_deduplicator import NgramHashDeduplicateFilter + from .filter.sem_deduplicator import SemDeduplicateFilter + from .filter.simhash_deduplicator import SimHashDeduplicateFilter + from .filter.hash_deduplicator import HashDeduplicateFilter + from .filter.blocklist_filter import BlocklistFilter + from .filter.word_number_filter import WordNumberFilter + from .filter.perspective_filter import PerspectiveFilter + from .filter.language_filter import LanguageFilter + from .filter.llm_language_filter import LLMLanguageFilter + else: import sys from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking diff --git a/dataflow/operators/general_text/eval/APIcaller/perspective_scorer.py b/dataflow/operators/core_text/eval/perspective_scorer.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/perspective_scorer.py rename to dataflow/operators/core_text/eval/perspective_scorer.py index 9748cef9..44bd44a1 100644 --- a/dataflow/operators/general_text/eval/APIcaller/perspective_scorer.py +++ b/dataflow/operators/core_text/eval/perspective_scorer.py @@ -8,7 +8,7 @@ from dataflow.serving import PerspectiveAPIServing @OPERATOR_REGISTRY.register() -class PerspectiveScorer(OperatorABC): +class PerspectiveSampleEvaluator(OperatorABC): """Operator that assigns Perspective API toxicity scores to text inputs.""" def __init__(self, serving: PerspectiveAPIServing = None): self.logger = get_logger() diff --git a/dataflow/operators/general_text/eval/diversity/task2vec/task2vec.py b/dataflow/operators/core_text/eval/task2vec/task2vec.py similarity index 100% rename from dataflow/operators/general_text/eval/diversity/task2vec/task2vec.py rename to dataflow/operators/core_text/eval/task2vec/task2vec.py diff --git a/dataflow/operators/general_text/eval/diversity/task2vec/task_similarity.py b/dataflow/operators/core_text/eval/task2vec/task_similarity.py similarity index 100% rename from dataflow/operators/general_text/eval/diversity/task2vec/task_similarity.py rename to dataflow/operators/core_text/eval/task2vec/task_similarity.py diff --git a/dataflow/operators/general_text/eval/diversity/task2vec/utils.py b/dataflow/operators/core_text/eval/task2vec/utils.py similarity index 100% rename from dataflow/operators/general_text/eval/diversity/task2vec/utils.py rename to dataflow/operators/core_text/eval/task2vec/utils.py diff --git a/dataflow/operators/general_text/eval/diversity/task2vec_scorer.py b/dataflow/operators/core_text/eval/task2vec_scorer.py similarity index 96% rename from 
dataflow/operators/general_text/eval/diversity/task2vec_scorer.py rename to dataflow/operators/core_text/eval/task2vec_scorer.py index 54d4e925..7f864c3d 100644 --- a/dataflow/operators/general_text/eval/diversity/task2vec_scorer.py +++ b/dataflow/operators/core_text/eval/task2vec_scorer.py @@ -1,5 +1,5 @@ -from dataflow.operators.general_text.eval.diversity.task2vec.task2vec import Task2Vec -from dataflow.operators.general_text.eval.diversity.task2vec import task_similarity +from dataflow.operators.core_text.eval.task2vec.task2vec import Task2Vec +from dataflow.operators.core_text.eval.task2vec import task_similarity import torch import random from transformers import GPT2Tokenizer, GPT2LMHeadModel @@ -12,7 +12,7 @@ # Task2Vec dataset diversity evaluation # Cited from: Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data @OPERATOR_REGISTRY.register() -class Task2VecScorer(OperatorABC): +class Task2VecDatasetEvaluator(OperatorABC): def __init__(self, device='cuda', sample_nums=10, sample_size=1, method: Optional[str]='montecarlo', model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/diversity/vendi_scorer.py b/dataflow/operators/core_text/eval/vendi_scorer.py similarity index 98% rename from dataflow/operators/general_text/eval/diversity/vendi_scorer.py rename to dataflow/operators/core_text/eval/vendi_scorer.py index aacd3169..c0d18ad5 100644 --- a/dataflow/operators/general_text/eval/diversity/vendi_scorer.py +++ b/dataflow/operators/core_text/eval/vendi_scorer.py @@ -8,7 +8,7 @@ # VendiScore dataset diversity evaluation # Cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning @OPERATOR_REGISTRY.register() -class VendiScorer(OperatorABC): +class VendiDatasetEvaluator(OperatorABC): def __init__(self, device='cuda'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/filter/blocklist/en.txt b/dataflow/operators/core_text/filter/blocklist/en.txt similarity index 100% rename from dataflow/operators/general_text/filter/blocklist/en.txt rename to dataflow/operators/core_text/filter/blocklist/en.txt diff --git a/dataflow/operators/general_text/filter/blocklist/zh.txt b/dataflow/operators/core_text/filter/blocklist/zh.txt similarity index 100% rename from dataflow/operators/general_text/filter/blocklist/zh.txt rename to dataflow/operators/core_text/filter/blocklist/zh.txt diff --git a/dataflow/operators/core_text/filter/blocklist_filter.py b/dataflow/operators/core_text/filter/blocklist_filter.py new file mode 100644 index 00000000..51f74dfb --- /dev/null +++ b/dataflow/operators/core_text/filter/blocklist_filter.py @@ -0,0 +1,82 @@ +from tqdm import tqdm +import numpy as np +from dataflow import get_logger +from dataflow.core import OperatorABC +from dataflow.utils.storage import DataFlowStorage +from dataflow.utils.registry import OPERATOR_REGISTRY +from dataflow.cli_funcs.paths import DataFlowPath +from nltk.tokenize import word_tokenize + +@OPERATOR_REGISTRY.register() +class BlocklistFilter(OperatorABC): + + def __init__(self, language:str = 'en', threshold:int = 1, use_tokenizer:bool = False): + self.logger = get_logger() + self.language = language + self.threshold = threshold + self.use_tokenizer = use_tokenizer + self.logger.info(f"Initializing {self.__class__.__name__} with language = 
{self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...") + self.blocklist = self.load_blocklist() + + @staticmethod + def get_desc(lang: str = "zh"): + if lang == "zh": + return ( + "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n" + "输入参数:\n" + "- input_key:输入文本字段名,默认为'text'\n" + "- language:语言代码,默认为'en'\n" + "- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n" + "- threshold:匹配次数阈值,默认为1\n" + "- use_tokenizer:是否使用分词器,默认为False\n" + "- tokenizer:分词器对象,默认为None\n" + "输出参数:\n" + "- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n" + "- 返回包含输入字段名的列表,用于后续算子引用" + ) + elif lang == "en": + return ( + "This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n" + "Input Parameters:\n" + "- input_key: Input text field name, default is 'text'\n" + "- language: Language code, default is 'en'\n" + "- blocklist_dir: Blocklist file directory, default is './blocklists/'\n" + "- threshold: Matching count threshold, default is 1\n" + "- use_tokenizer: Whether to use tokenizer, default is False\n" + "- tokenizer: Tokenizer object, default is None\n" + "Output Parameters:\n" + "- Filtered DataFrame containing only rows without blocklist keywords\n" + "- List containing input field name for subsequent operator reference" + ) + else: + return "BlocklistFilter uses language-specific blocklists with optional tokenizer integration." + + def load_blocklist(self): + dataflow_dir = DataFlowPath.get_dataflow_dir() + file_path = f"{dataflow_dir}/operators/core_text/filter/blocklist/{self.language}.txt" + self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...") + with open(file_path, 'r', encoding='utf-8') as file: + blocklist = set(line.strip().lower() for line in file if line.strip()) + self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.") + return blocklist + + def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'): + self.input_key = input_key + self.output_key = output_key + dataframe = storage.read("dataframe") + self.logger.info(f"Running {self.__class__.__name__}...") + valid_checks = [] + for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): + if text: + if self.use_tokenizer: + text = word_tokenize(text.lower()) + else: + text = text.lower().split() + blocklist_count = sum(1 for word in text if word in self.blocklist) + valid_checks.append(blocklist_count <= self.threshold) + valid_checks = np.array(valid_checks, dtype=int) + dataframe[self.output_key] = valid_checks + filtered_dataframe = dataframe[valid_checks == 1] + storage.write(filtered_dataframe) + self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") + return [self.output_key] diff --git a/dataflow/operators/general_text/filter/general_filter.py b/dataflow/operators/core_text/filter/general_filter.py similarity index 100% rename from dataflow/operators/general_text/filter/general_filter.py rename to dataflow/operators/core_text/filter/general_filter.py diff --git a/dataflow/operators/general_text/filter/hash_deduplicator.py b/dataflow/operators/core_text/filter/hash_deduplicator.py similarity index 99% rename from dataflow/operators/general_text/filter/hash_deduplicator.py rename to dataflow/operators/core_text/filter/hash_deduplicator.py index 14d78f20..85354db9 100644 --- a/dataflow/operators/general_text/filter/hash_deduplicator.py +++ b/dataflow/operators/core_text/filter/hash_deduplicator.py @@ -7,7 +7,7 @@ from dataflow.utils.registry import OPERATOR_REGISTRY @OPERATOR_REGISTRY.register() -class HashDeduplicator(OperatorABC): +class HashDeduplicateFilter(OperatorABC): def __init__(self, hash_func: str = 'md5'): self.logger = get_logger() self.hash_func = hash_func diff --git a/dataflow/operators/general_text/filter/language_filter.py b/dataflow/operators/core_text/filter/language_filter.py similarity index 100% rename from dataflow/operators/general_text/filter/language_filter.py rename to dataflow/operators/core_text/filter/language_filter.py diff --git a/dataflow/operators/general_text/filter/llm_language_filter.py b/dataflow/operators/core_text/filter/llm_language_filter.py similarity index 100% rename from dataflow/operators/general_text/filter/llm_language_filter.py rename to dataflow/operators/core_text/filter/llm_language_filter.py diff --git a/dataflow/operators/general_text/filter/minhash_deduplicator.py b/dataflow/operators/core_text/filter/minhash_deduplicator.py similarity index 98% rename from dataflow/operators/general_text/filter/minhash_deduplicator.py rename to dataflow/operators/core_text/filter/minhash_deduplicator.py index e07c1f8d..db73df92 100644 --- a/dataflow/operators/general_text/filter/minhash_deduplicator.py +++ b/dataflow/operators/core_text/filter/minhash_deduplicator.py @@ -4,11 +4,9 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import NgramScorer - @OPERATOR_REGISTRY.register() -class MinHashDeduplicator(OperatorABC): +class MinHashDeduplicateFilter(OperatorABC): def __init__(self, num_perm=128, threshold=0.9, use_n_gram=True, ngram=5): self.logger = get_logger() self.num_perm = num_perm diff --git a/dataflow/operators/general_text/filter/ngramhash_deduplicator.py b/dataflow/operators/core_text/filter/ngramhash_deduplicator.py similarity index 99% rename from dataflow/operators/general_text/filter/ngramhash_deduplicator.py rename to dataflow/operators/core_text/filter/ngramhash_deduplicator.py index fef84aa6..44fa12cf 100644 --- a/dataflow/operators/general_text/filter/ngramhash_deduplicator.py +++ b/dataflow/operators/core_text/filter/ngramhash_deduplicator.py @@ -7,7 +7,7 @@ from dataflow.utils.registry import OPERATOR_REGISTRY @OPERATOR_REGISTRY.register() -class NgramHashDeduplicator(OperatorABC): +class NgramHashDeduplicateFilter(OperatorABC): def __init__(self, n_gram: int = 3, hash_func: str = 'md5', diff_size : int = 1): self.logger = get_logger() self.n_gram = n_gram diff --git a/dataflow/operators/general_text/filter/perspective_filter.py 
b/dataflow/operators/core_text/filter/perspective_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/perspective_filter.py rename to dataflow/operators/core_text/filter/perspective_filter.py index b394f5b3..72083839 100644 --- a/dataflow/operators/general_text/filter/perspective_filter.py +++ b/dataflow/operators/core_text/filter/perspective_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import PerspectiveScorer +from dataflow.operators.core_text import PerspectiveSampleEvaluator from dataflow.serving import PerspectiveAPIServing @OPERATOR_REGISTRY.register() @@ -14,7 +14,7 @@ def __init__(self, min_score: float = 0.0, max_score: float = 0.5): self.min_score = min_score self.max_score = max_score self.serving = PerspectiveAPIServing(max_workers=10) - self.scorer = PerspectiveScorer(serving=self.serving) + self.scorer = PerspectiveSampleEvaluator(serving=self.serving) @staticmethod def get_desc(lang: str = "zh"): diff --git a/dataflow/operators/general_text/filter/sem_deduplicator.py b/dataflow/operators/core_text/filter/sem_deduplicator.py similarity index 99% rename from dataflow/operators/general_text/filter/sem_deduplicator.py rename to dataflow/operators/core_text/filter/sem_deduplicator.py index 5975012f..737d63e2 100644 --- a/dataflow/operators/general_text/filter/sem_deduplicator.py +++ b/dataflow/operators/core_text/filter/sem_deduplicator.py @@ -60,7 +60,7 @@ def compute_cos_sim_matrix(embeddings): @OPERATOR_REGISTRY.register() -class SemDeduplicator(OperatorABC): +class SemDeduplicateFilter(OperatorABC): def __init__(self, eps: float = 0.05, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', model_cache_dir: str = './dataflow_cache', device: str = 'cuda'): self.logger = get_logger() self.eps = eps diff --git a/dataflow/operators/general_text/filter/simhash_deduplicator.py b/dataflow/operators/core_text/filter/simhash_deduplicator.py similarity index 99% rename from dataflow/operators/general_text/filter/simhash_deduplicator.py rename to dataflow/operators/core_text/filter/simhash_deduplicator.py index 7db1cac4..955cf252 100644 --- a/dataflow/operators/general_text/filter/simhash_deduplicator.py +++ b/dataflow/operators/core_text/filter/simhash_deduplicator.py @@ -12,7 +12,7 @@ def get_similarity(simhash, another_simhash): return similar @OPERATOR_REGISTRY.register() -class SimHashDeduplicator(OperatorABC): +class SimHashDeduplicateFilter(OperatorABC): def __init__(self, fingerprint_size: int = 64, bound: float = 0.1): self.logger = get_logger() self.fingerprint_size = fingerprint_size diff --git a/dataflow/operators/core_text/filter/word_number_filter.py b/dataflow/operators/core_text/filter/word_number_filter.py new file mode 100644 index 00000000..ce4fd933 --- /dev/null +++ b/dataflow/operators/core_text/filter/word_number_filter.py @@ -0,0 +1,64 @@ +from tqdm import tqdm +import numpy as np +from dataflow import get_logger +from dataflow.core import OperatorABC +from dataflow.utils.storage import DataFlowStorage +from dataflow.utils.registry import OPERATOR_REGISTRY + +@OPERATOR_REGISTRY.register() +class WordNumberFilter(OperatorABC): + + def __init__(self, min_words: int=20, max_words: int=100000): + self.logger = get_logger() + self.min_words = min_words + self.max_words = max_words + self.logger.info(f"Initializing {self.__class__.__name__} with min_words = {self.min_words}, 
max_words = {self.max_words}...") + + @staticmethod + def get_desc(lang: str = "zh"): + if lang == "zh": + return ( + "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n" + "输入参数:\n" + "- input_key:输入文本字段名,默认为'text'\n" + "- min_words:最小单词数量阈值,默认为20\n" + "- max_words:最大单词数量阈值,默认为100000\n" + "输出参数:\n" + "- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n" + "- 返回包含输入字段名的列表,用于后续算子引用" + ) + elif lang == "en": + return ( + "This operator filters text with word count outside the specified range, using space splitting for word counting.\n" + "Input Parameters:\n" + "- input_key: Input text field name, default is 'text'\n" + "- min_words: Minimum word count threshold, default is 20\n" + "- max_words: Maximum word count threshold, default is 100000\n" + "Output Parameters:\n" + "- Filtered DataFrame containing only rows with word count within specified range\n" + "- List containing input field name for subsequent operator reference" + ) + else: + return "WordNumberFilter filters text based on word count range using space splitting." + + def run(self, storage: DataFlowStorage, input_key: str, output_key: str='word_number_filter_label'): + self.input_key = input_key + self.output_key = output_key + dataframe = storage.read("dataframe") + self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...") + word_counts = [] + for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): + if text: + normalized_words = tuple(text.split()) + num_normalized_words = len(normalized_words) + word_counts.append(num_normalized_words) + else: + word_counts.append(0) + word_counts = np.array(word_counts) + metric_filter = (self.min_words <= word_counts) & (word_counts < self.max_words) + dataframe[self.output_key] = word_counts + filtered_dataframe = dataframe[metric_filter] + storage.write(filtered_dataframe) + self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") + return [self.output_key] + diff --git a/dataflow/operators/general_pt/__init__.py b/dataflow/operators/general_pt/__init__.py new file mode 100644 index 00000000..8592413f --- /dev/null +++ b/dataflow/operators/general_pt/__init__.py @@ -0,0 +1,83 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # filter + from .filter.ccnet_deduplicate_filter import CCNetDeduplicateFilter + from .filter.debertav3_filter import DebertaV3Filter + from .filter.fineweb_edu_filter import FineWebEduFilter + from .filter.heuristics_filter import ColonEndFilter + from .filter.heuristics_filter import SentenceNumberFilter + from .filter.heuristics_filter import LineEndWithEllipsisFilter + from .filter.heuristics_filter import ContentNullFilter + from .filter.heuristics_filter import SymbolWordRatioFilter + from .filter.heuristics_filter import AlphaWordsFilter + from .filter.heuristics_filter import HtmlEntityFilter + from .filter.heuristics_filter import IDCardFilter + from .filter.heuristics_filter import NoPuncFilter + from .filter.heuristics_filter import SpecialCharacterFilter + from .filter.heuristics_filter import WatermarkFilter + from .filter.heuristics_filter import MeanWordLengthFilter + from .filter.heuristics_filter import StopWordFilter + from .filter.heuristics_filter import CurlyBracketFilter + from .filter.heuristics_filter import CapitalWordsFilter + from .filter.heuristics_filter import LoremIpsumFilter + from .filter.heuristics_filter import UniqueWordsFilter + from .filter.heuristics_filter import CharNumberFilter + from .filter.heuristics_filter import LineStartWithBulletpointFilter + from .filter.heuristics_filter import LineWithJavascriptFilter + from .filter.langkit_filter import LangkitFilter + from .filter.lexical_diversity_filter import LexicalDiversityFilter + from .filter.ngram_filter import NgramFilter + from .filter.pair_qual_filter import PairQualFilter + from .filter.perplexity_filter import PerplexityFilter + from .filter.presidio_filter import PresidioFilter + from .filter.qurating_filter import QuratingFilter + from .filter.text_book_filter import TextbookFilter + + # generate + from .generate.phi4qa_generator import Phi4QAGenerator + + # refine + from .refine.html_entity_refiner import HtmlEntityRefiner + from .refine.html_url_remover_refiner import HtmlUrlRemoverRefiner + from .refine.lowercase_refiner import LowercaseRefiner + from .refine.ner_refiner import NERRefiner + from .refine.pii_anonymize_refiner import PIIAnonymizeRefiner + from .refine.ref_removal_refiner import ReferenceRemoverRefiner + from .refine.remove_contractions_refiner import RemoveContractionsRefiner + from .refine.remove_emoji_refiner import RemoveEmojiRefiner + from .refine.remove_emoticons_refiner import RemoveEmoticonsRefiner + from .refine.remove_extra_spaces_refiner import RemoveExtraSpacesRefiner + from .refine.remove_image_ref_refiner import RemoveImageRefsRefiner + from .refine.remove_number_refiner import RemoveNumberRefiner + from .refine.remove_punctuation_refiner import RemovePunctuationRefiner + from .refine.remove_repetitions_punctuation_refiner import RemoveRepetitionsPunctuationRefiner + from .refine.remove_stopwords_refiner import RemoveStopwordsRefiner + from .refine.spelling_correction_refiner import SpellingCorrectionRefiner + from .refine.stemming_lemmatization_refiner import StemmingLemmatizationRefiner + from .refine.text_normalization_refiner import TextNormalizationRefiner + + # eval + from 
.eval.ngram_sample_evaluator import NgramSampleEvaluator + from .eval.lexical_diversity_sample_evaluator import LexicalDiversitySampleEvaluator + from .eval.langkit_sample_evaluator import LangkitSampleEvaluator + from .eval.debertav3_sample_evaluator import DebertaV3SampleEvaluator + from .eval.fineweb_edu_sample_evaluator import FineWebEduSampleEvaluator + from .eval.pair_qual_sample_evaluator import PairQualSampleEvaluator + from .eval.presidio_sample_evaluator import PresidioSampleEvaluator + from .eval.textbook_sample_evaluator import TextbookSampleEvaluator + from .eval.qurating_sample_evaluator import QuratingSampleEvaluator + from .eval.perplexity_sample_evaluator import PerplexitySampleEvaluator + from .eval.meta_sample_evaluator import MetaSampleEvaluator + from .eval.bert_sample_evaluator import BertSampleEvaluator + from .eval.bleu_sample_evaluator import BleuSampleEvaluator + from .eval.cider_sample_evaluator import CiderSampleEvaluator + +else: + import sys + from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking + + cur_path = "dataflow/operators/general_pt/" + + _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) + sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_pt/", _import_structure) diff --git a/dataflow/operators/general_text/eval/models/Kenlm/model.py b/dataflow/operators/general_pt/eval/Kenlm/model.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Kenlm/model.py rename to dataflow/operators/general_pt/eval/Kenlm/model.py diff --git a/dataflow/operators/general_text/eval/models/Qurating/modeling/modeling_flash_llama.py b/dataflow/operators/general_pt/eval/Qurating/modeling/modeling_flash_llama.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Qurating/modeling/modeling_flash_llama.py rename to dataflow/operators/general_pt/eval/Qurating/modeling/modeling_flash_llama.py diff --git a/dataflow/operators/general_text/eval/models/Qurating/qurater_annotate.py b/dataflow/operators/general_pt/eval/Qurating/qurater_annotate.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Qurating/qurater_annotate.py rename to dataflow/operators/general_pt/eval/Qurating/qurater_annotate.py diff --git a/dataflow/operators/general_text/eval/gen/bleu/__init__.py b/dataflow/operators/general_pt/eval/__init__.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/bleu/__init__.py rename to dataflow/operators/general_pt/eval/__init__.py diff --git a/dataflow/operators/general_text/eval/gen/bert_scorer.py b/dataflow/operators/general_pt/eval/bert_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/gen/bert_scorer.py rename to dataflow/operators/general_pt/eval/bert_sample_evaluator.py index 2880b727..d40f4b5d 100644 --- a/dataflow/operators/general_text/eval/gen/bert_scorer.py +++ b/dataflow/operators/general_pt/eval/bert_sample_evaluator.py @@ -5,7 +5,7 @@ import evaluate @OPERATOR_REGISTRY.register() -class BERTScorer(OperatorABC): +class BertSampleEvaluator(OperatorABC): def __init__(self, lang='en', model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/gen/cider/__init__.py b/dataflow/operators/general_pt/eval/bleu/__init__.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/cider/__init__.py rename to 
dataflow/operators/general_pt/eval/bleu/__init__.py diff --git a/dataflow/operators/general_text/eval/gen/bleu/bleu.py b/dataflow/operators/general_pt/eval/bleu/bleu.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/bleu/bleu.py rename to dataflow/operators/general_pt/eval/bleu/bleu.py diff --git a/dataflow/operators/general_text/eval/gen/bleu_scorer.py b/dataflow/operators/general_pt/eval/bleu_sample_evaluator.py similarity index 97% rename from dataflow/operators/general_text/eval/gen/bleu_scorer.py rename to dataflow/operators/general_pt/eval/bleu_sample_evaluator.py index 9f92ffcf..5b0930a7 100644 --- a/dataflow/operators/general_text/eval/gen/bleu_scorer.py +++ b/dataflow/operators/general_pt/eval/bleu_sample_evaluator.py @@ -2,11 +2,11 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow import get_logger -from dataflow.operators.general_text.eval.gen.bleu.bleu import Bleu +from dataflow.operators.general_pt.eval.bleu.bleu import Bleu from tqdm import tqdm @OPERATOR_REGISTRY.register() -class BleuScorer(OperatorABC): +class BleuSampleEvaluator(OperatorABC): def __init__(self, n=4, eff="average", special_reflen=None): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/statistics/__init__.py b/dataflow/operators/general_pt/eval/cider/__init__.py similarity index 100% rename from dataflow/operators/general_text/eval/statistics/__init__.py rename to dataflow/operators/general_pt/eval/cider/__init__.py diff --git a/dataflow/operators/general_text/eval/gen/cider/cider.py b/dataflow/operators/general_pt/eval/cider/cider.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/cider/cider.py rename to dataflow/operators/general_pt/eval/cider/cider.py diff --git a/dataflow/operators/general_text/eval/gen/cider_scorer.py b/dataflow/operators/general_pt/eval/cider_sample_evaluator.py similarity index 95% rename from dataflow/operators/general_text/eval/gen/cider_scorer.py rename to dataflow/operators/general_pt/eval/cider_sample_evaluator.py index 9a59690d..db3f5d76 100644 --- a/dataflow/operators/general_text/eval/gen/cider_scorer.py +++ b/dataflow/operators/general_pt/eval/cider_sample_evaluator.py @@ -6,7 +6,7 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow import get_logger -from dataflow.operators.general_text.eval.gen.cider.cider import Cider +from dataflow.operators.general_pt.eval.cider.cider import Cider def load_idf(idf_path): with open(idf_path, 'rb') as f: @@ -14,8 +14,8 @@ def load_idf(idf_path): return idf @OPERATOR_REGISTRY.register() -class CiderScorer(OperatorABC): - def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/eval/GeneralText/gen/cider/coco-val-df.p"): +class CiderSampleEvaluator(OperatorABC): + def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/general_pt/eval/cider/coco-val-df.p"): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') self.score_name = 'CiderScore' diff --git a/dataflow/operators/general_text/eval/models/debertav3_scorer.py b/dataflow/operators/general_pt/eval/debertav3_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/debertav3_scorer.py rename to dataflow/operators/general_pt/eval/debertav3_sample_evaluator.py index 
c84344d5..7ce31cb8 100644 --- a/dataflow/operators/general_text/eval/models/debertav3_scorer.py +++ b/dataflow/operators/general_pt/eval/debertav3_sample_evaluator.py @@ -9,7 +9,7 @@ from dataflow import get_logger @OPERATOR_REGISTRY.register() -class DebertaV3Scorer(OperatorABC): +class DebertaV3SampleEvaluator(OperatorABC): def __init__(self, model_name, model_cache_dir='./dataflow_cache', device='cuda'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/fineweb_edu_scorer.py b/dataflow/operators/general_pt/eval/fineweb_edu_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/fineweb_edu_scorer.py rename to dataflow/operators/general_pt/eval/fineweb_edu_sample_evaluator.py index 292adf20..7f717197 100644 --- a/dataflow/operators/general_text/eval/models/fineweb_edu_scorer.py +++ b/dataflow/operators/general_pt/eval/fineweb_edu_sample_evaluator.py @@ -8,7 +8,7 @@ import numpy as np @OPERATOR_REGISTRY.register() -class FineWebEduScorer(OperatorABC): +class FineWebEduSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir: str = './dataflow_cache', device: str = 'cuda'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/statistics/langkit_scorer.py b/dataflow/operators/general_pt/eval/langkit_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/statistics/langkit_scorer.py rename to dataflow/operators/general_pt/eval/langkit_sample_evaluator.py index 2e73b6a3..17ed87d6 100644 --- a/dataflow/operators/general_text/eval/statistics/langkit_scorer.py +++ b/dataflow/operators/general_pt/eval/langkit_sample_evaluator.py @@ -7,7 +7,7 @@ from tqdm import tqdm @OPERATOR_REGISTRY.register() -class LangkitScorer(OperatorABC): +class LangkitSampleEvaluator(OperatorABC): def __init__(self): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') @@ -63,7 +63,7 @@ def _score_func(self, sample): def eval(self, dataframe, input_key): scores_list = [] self.logger.info(f"Evaluating {self.score_name}...") - for sample in tqdm(dataframe[input_key], desc="LangkitScorer Evaluating..."): + for sample in tqdm(dataframe[input_key], desc="LangkitScore Evaluating..."): scores = self._score_func(sample) scores_list.append(scores) self.logger.info("Evaluation complete!") diff --git a/dataflow/operators/general_text/eval/statistics/lexical_diversity_scorer.py b/dataflow/operators/general_pt/eval/lexical_diversity_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/statistics/lexical_diversity_scorer.py rename to dataflow/operators/general_pt/eval/lexical_diversity_sample_evaluator.py index 4124cab4..a77b064d 100644 --- a/dataflow/operators/general_text/eval/statistics/lexical_diversity_scorer.py +++ b/dataflow/operators/general_pt/eval/lexical_diversity_sample_evaluator.py @@ -90,7 +90,7 @@ def hdd(word_array, sample_size=42.0): @OPERATOR_REGISTRY.register() -class LexicalDiversityScorer(OperatorABC): +class LexicalDiversitySampleEvaluator(OperatorABC): def __init__(self): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') @@ -145,7 +145,7 @@ def _score_func(self, sample): def eval(self, dataframe, input_key): scores_list = [] self.logger.info(f"Evaluating {self.score_name}...") - for sample in tqdm(dataframe[input_key], 
desc="LexicalDiversityScorer Evaluating..."): + for sample in tqdm(dataframe[input_key], desc="LexicalDiversityScore Evaluating..."): scores = self._score_func(sample) scores_list.append(scores) self.logger.info("Evaluation complete!") diff --git a/dataflow/operators/general_text/eval/APIcaller/meta_scorer.py b/dataflow/operators/general_pt/eval/meta_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/meta_scorer.py rename to dataflow/operators/general_pt/eval/meta_sample_evaluator.py index 259fc021..f3fe77d2 100644 --- a/dataflow/operators/general_text/eval/APIcaller/meta_scorer.py +++ b/dataflow/operators/general_pt/eval/meta_sample_evaluator.py @@ -95,7 +95,7 @@ ] @OPERATOR_REGISTRY.register() -class MetaScorer(OperatorABC): +class MetaSampleEvaluator(OperatorABC): def __init__(self, llm_serving: LLMServingABC = None, dimensions: list[dict] = example_dimensions, @@ -118,7 +118,6 @@ def __init__(self, } ] } - You can refer to dataflow/operators/eval/GeneralText/APIcaller/meta_scorer.py for an example. """ self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/statistics/ngram_scorer.py b/dataflow/operators/general_pt/eval/ngram_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/statistics/ngram_scorer.py rename to dataflow/operators/general_pt/eval/ngram_sample_evaluator.py index 6dc602ab..434ec30d 100644 --- a/dataflow/operators/general_text/eval/statistics/ngram_scorer.py +++ b/dataflow/operators/general_pt/eval/ngram_sample_evaluator.py @@ -7,7 +7,7 @@ from dataflow import get_logger @OPERATOR_REGISTRY.register() -class NgramScorer(OperatorABC): +class NgramSampleEvaluator(OperatorABC): def __init__(self, ngrams=5): self.logger = get_logger() diff --git a/dataflow/operators/general_text/eval/models/pair_qual_scorer.py b/dataflow/operators/general_pt/eval/pair_qual_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/pair_qual_scorer.py rename to dataflow/operators/general_pt/eval/pair_qual_sample_evaluator.py index 2570678f..d877f19a 100644 --- a/dataflow/operators/general_text/eval/models/pair_qual_scorer.py +++ b/dataflow/operators/general_pt/eval/pair_qual_sample_evaluator.py @@ -9,7 +9,7 @@ import numpy as np @OPERATOR_REGISTRY.register() -class PairQualScorer(OperatorABC): +class PairQualSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir:str='./dataflow_cache', device="cuda", lang='en', max_length=512): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/perplexity_scorer.py b/dataflow/operators/general_pt/eval/perplexity_sample_evaluator.py similarity index 95% rename from dataflow/operators/general_text/eval/models/perplexity_scorer.py rename to dataflow/operators/general_pt/eval/perplexity_sample_evaluator.py index 6422bbbc..b009ee5e 100644 --- a/dataflow/operators/general_text/eval/models/perplexity_scorer.py +++ b/dataflow/operators/general_pt/eval/perplexity_sample_evaluator.py @@ -1,13 +1,13 @@ from dataflow.core import OperatorABC -from dataflow.operators.general_text.eval.models.Kenlm.model import KenlmModel +from dataflow.operators.general_pt.eval.Kenlm.model import KenlmModel from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.storage import DataFlowStorage from dataflow.utils.utils import get_logger # Kenlm models perplexity 
evaluation @OPERATOR_REGISTRY.register() -class PerplexityScorer(OperatorABC): +class PerplexitySampleEvaluator(OperatorABC): # Need to download model first! - def __init__(self, lang='en', model_name='dataflow/operators/eval/GeneralText/models/Kenlm/wikipedia'): + def __init__(self, lang='en', model_name='dataflow/operators/general_pt/eval/Kenlm/wikipedia'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') self.model_name = model_name diff --git a/dataflow/operators/general_text/eval/models/presidio_scorer.py b/dataflow/operators/general_pt/eval/presidio_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/presidio_scorer.py rename to dataflow/operators/general_pt/eval/presidio_sample_evaluator.py index 9afca545..ed9b5830 100644 --- a/dataflow/operators/general_text/eval/models/presidio_scorer.py +++ b/dataflow/operators/general_pt/eval/presidio_sample_evaluator.py @@ -9,7 +9,7 @@ # Presidio PII detection Scorer @OPERATOR_REGISTRY.register() -class PresidioScorer(OperatorABC): +class PresidioSampleEvaluator(OperatorABC): def __init__(self, device='cuda', lang='en', model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/qurating_scorer.py b/dataflow/operators/general_pt/eval/qurating_sample_evaluator.py similarity index 96% rename from dataflow/operators/general_text/eval/models/qurating_scorer.py rename to dataflow/operators/general_pt/eval/qurating_sample_evaluator.py index 3bd7dea1..69dc525e 100644 --- a/dataflow/operators/general_text/eval/models/qurating_scorer.py +++ b/dataflow/operators/general_pt/eval/qurating_sample_evaluator.py @@ -4,12 +4,12 @@ from datasets import Dataset from tqdm import tqdm from dataflow import get_logger -from dataflow.operators.general_text.eval.models.Qurating.qurater_annotate import ModelAnnotator -from dataflow.operators.general_text.eval.models.Qurating.qurater_annotate import TokenizeAndChunk +from dataflow.operators.general_pt.eval.Qurating.qurater_annotate import ModelAnnotator +from dataflow.operators.general_pt.eval.Qurating.qurater_annotate import TokenizeAndChunk import torch @OPERATOR_REGISTRY.register() -class QuratingScorer(OperatorABC): +class QuratingSampleEvaluator(OperatorABC): def __init__(self, map_batch_size: int = 512, num_workers: int = 1, device_batch_size: int = 16, device: str = 'cuda', labels: list = ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value'], model_cache_dir: str = './dataflow_cache'): self.logger = get_logger() diff --git a/dataflow/operators/general_text/eval/models/textbook_scorer.py b/dataflow/operators/general_pt/eval/textbook_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/textbook_scorer.py rename to dataflow/operators/general_pt/eval/textbook_sample_evaluator.py index 195f1cb7..3242fd01 100644 --- a/dataflow/operators/general_text/eval/models/textbook_scorer.py +++ b/dataflow/operators/general_pt/eval/textbook_sample_evaluator.py @@ -10,7 +10,7 @@ import numpy as np @OPERATOR_REGISTRY.register() -class TextbookScorer(OperatorABC): +class TextbookSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/filter/__init__.py 
b/dataflow/operators/general_pt/filter/__init__.py similarity index 100% rename from dataflow/operators/general_text/filter/__init__.py rename to dataflow/operators/general_pt/filter/__init__.py diff --git a/dataflow/operators/general_text/filter/ccnet_deduplicator.py b/dataflow/operators/general_pt/filter/ccnet_deduplicate_filter.py similarity index 99% rename from dataflow/operators/general_text/filter/ccnet_deduplicator.py rename to dataflow/operators/general_pt/filter/ccnet_deduplicate_filter.py index 1bae30df..1fd77e68 100644 --- a/dataflow/operators/general_text/filter/ccnet_deduplicator.py +++ b/dataflow/operators/general_pt/filter/ccnet_deduplicate_filter.py @@ -40,7 +40,7 @@ def sha1_hash(data: bytes, d: int = 32) -> int: @OPERATOR_REGISTRY.register() -class CCNetDeduplicator(OperatorABC): +class CCNetDeduplicateFilter(OperatorABC): def __init__(self, bit_length: int = 64): self.logger = get_logger() diff --git a/dataflow/operators/general_text/filter/debertav3_filter.py b/dataflow/operators/general_pt/filter/debertav3_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/debertav3_filter.py rename to dataflow/operators/general_pt/filter/debertav3_filter.py index f0c6a528..864e3417 100644 --- a/dataflow/operators/general_text/filter/debertav3_filter.py +++ b/dataflow/operators/general_pt/filter/debertav3_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import DebertaV3Scorer +from dataflow.operators.general_pt import DebertaV3SampleEvaluator @OPERATOR_REGISTRY.register() class DebertaV3Filter(OperatorABC): @@ -11,7 +11,7 @@ class DebertaV3Filter(OperatorABC): def __init__(self, allowed_scores : list = ['Medium', 'High'], model_name='nvidia/quality-classifier-deberta', model_cache_dir='./dataflow_cache', device='cuda', batch_size=16): self.logger = get_logger() self.allowed_scores = allowed_scores - self.scorer = DebertaV3Scorer( + self.scorer = DebertaV3SampleEvaluator( model_name=model_name, model_cache_dir=model_cache_dir, device=device, diff --git a/dataflow/operators/general_text/filter/fineweb_edu_filter.py b/dataflow/operators/general_pt/filter/fineweb_edu_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/fineweb_edu_filter.py rename to dataflow/operators/general_pt/filter/fineweb_edu_filter.py index 8fd037ab..990cad73 100644 --- a/dataflow/operators/general_text/filter/fineweb_edu_filter.py +++ b/dataflow/operators/general_pt/filter/fineweb_edu_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import FineWebEduScorer +from dataflow.operators.general_pt import FineWebEduSampleEvaluator from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY @@ -10,7 +10,7 @@ def __init__(self, min_score: float = 2.5, max_score: float = 10000, model_cache self.min_score = min_score self.max_score = max_score self.logger = get_logger() - self.scorer = FineWebEduScorer(model_cache_dir=model_cache_dir, device=device) + self.scorer = FineWebEduSampleEvaluator(model_cache_dir=model_cache_dir, device=device) self.filter_name = 'FineWebEduFilter' self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}, " f"device = {device}, model_cache_dir = {model_cache_dir}") diff --git a/dataflow/operators/general_text/filter/heuristics.py 
b/dataflow/operators/general_pt/filter/heuristics_filter.py similarity index 90% rename from dataflow/operators/general_text/filter/heuristics.py rename to dataflow/operators/general_pt/filter/heuristics_filter.py index 5c6500db..09b5f3b8 100644 --- a/dataflow/operators/general_text/filter/heuristics.py +++ b/dataflow/operators/general_pt/filter/heuristics_filter.py @@ -5,7 +5,6 @@ from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.utils import get_logger from dataflow.utils.storage import DataFlowStorage -from dataflow.cli_funcs.paths import DataFlowPath from tqdm import tqdm import re @@ -57,64 +56,6 @@ def run(self, storage: DataFlowStorage, input_key: str, output_key: str = None): self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.") return [self.output_key] -@OPERATOR_REGISTRY.register() -class WordNumberFilter(OperatorABC): - - def __init__(self, min_words: int=20, max_words: int=100000): - self.logger = get_logger() - self.min_words = min_words - self.max_words = max_words - self.logger.info(f"Initializing {self.__class__.__name__} with min_words = {self.min_words}, max_words = {self.max_words}...") - - @staticmethod - def get_desc(lang: str = "zh"): - if lang == "zh": - return ( - "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n" - "输入参数:\n" - "- input_key:输入文本字段名,默认为'text'\n" - "- min_words:最小单词数量阈值,默认为5\n" - "- max_words:最大单词数量阈值,默认为100\n" - "输出参数:\n" - "- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n" - "- 返回包含输入字段名的列表,用于后续算子引用" - ) - elif lang == "en": - return ( - "This operator filters text with word count outside the specified range, using space splitting for word counting.\n" - "Input Parameters:\n" - "- input_key: Input text field name, default is 'text'\n" - "- min_words: Minimum word count threshold, default is 5\n" - "- max_words: Maximum word count threshold, default is 100\n" - "Output Parameters:\n" - "- Filtered DataFrame containing only rows with word count within specified range\n" - "- List containing input field name for subsequent operator reference" - ) - else: - return "WordNumberFilter filters text based on word count range using space splitting." - - def run(self, storage: DataFlowStorage, input_key: str, output_key: str='word_number_filter_label'): - self.input_key = input_key - self.output_key = output_key - dataframe = storage.read("dataframe") - self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...") - word_counts = [] - for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): - if text: - normalized_words = tuple(text.split()) - num_normalized_words = len(normalized_words) - word_counts.append(num_normalized_words) - else: - word_counts.append(0) - word_counts = np.array(word_counts) - metric_filter = (self.min_words <= word_counts) & (word_counts < self.max_words) - dataframe[self.output_key] = word_counts - filtered_dataframe = dataframe[metric_filter] - storage.write(filtered_dataframe) - self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.") - return [self.output_key] - - @OPERATOR_REGISTRY.register() class SentenceNumberFilter(OperatorABC): @@ -1498,77 +1439,3 @@ def run(self, storage: DataFlowStorage, input_key: str, output_key: str='line_wi self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") return [self.output_key] - -@OPERATOR_REGISTRY.register() -class BlocklistFilter(OperatorABC): - - def __init__(self, language:str = 'en', threshold:int = 1, use_tokenizer:bool = False): - self.logger = get_logger() - self.language = language - self.threshold = threshold - self.use_tokenizer = use_tokenizer - self.logger.info(f"Initializing {self.__class__.__name__} with language = {self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...") - self.blocklist = self.load_blocklist() - - @staticmethod - def get_desc(lang: str = "zh"): - if lang == "zh": - return ( - "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n" - "输入参数:\n" - "- input_key:输入文本字段名,默认为'text'\n" - "- language:语言代码,默认为'zh'\n" - "- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n" - "- threshold:匹配次数阈值,默认为1\n" - "- use_tokenizer:是否使用分词器,默认为True\n" - "- tokenizer:分词器对象,默认为None\n" - "输出参数:\n" - "- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n" - "- 返回包含输入字段名的列表,用于后续算子引用" - ) - elif lang == "en": - return ( - "This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n" - "Input Parameters:\n" - "- input_key: Input text field name, default is 'text'\n" - "- language: Language code, default is 'zh'\n" - "- blocklist_dir: Blocklist file directory, default is './blocklists/'\n" - "- threshold: Matching count threshold, default is 1\n" - "- use_tokenizer: Whether to use tokenizer, default is True\n" - "- tokenizer: Tokenizer object, default is None\n" - "Output Parameters:\n" - "- Filtered DataFrame containing only rows without blocklist keywords\n" - "- List containing input field name for subsequent operator reference" - ) - else: - return "BlocklistFilter uses language-specific blocklists with optional tokenizer integration." - - def load_blocklist(self): - dataflow_dir = DataFlowPath.get_dataflow_dir() - file_path = f"{dataflow_dir}/operators/filter/GeneralText/blocklist/{self.language}.txt" - self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...") - with open(file_path, 'r', encoding='utf-8') as file: - blocklist = set(line.strip().lower() for line in file if line.strip()) - self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.") - return blocklist - - def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'): - self.input_key = input_key - self.output_key = output_key - dataframe = storage.read("dataframe") - self.logger.info(f"Running {self.__class__.__name__}...") - valid_checks = [] - for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): - if text: - if self.use_tokenizer: - text = word_tokenize(text.lower()) - else: - text = text.lower().split() - blocklist_count = sum(1 for word in text if word in self.blocklist) - valid_checks.append(blocklist_count <= self.threshold) - valid_checks = np.array(valid_checks, dtype=int) - dataframe[self.output_key] = valid_checks - filtered_dataframe = dataframe[valid_checks == 1] - storage.write(filtered_dataframe) - self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") - return [self.output_key] diff --git a/dataflow/operators/general_text/filter/langkit_filter.py b/dataflow/operators/general_pt/filter/langkit_filter.py similarity index 98% rename from dataflow/operators/general_text/filter/langkit_filter.py rename to dataflow/operators/general_pt/filter/langkit_filter.py index e9cf64a8..939957f8 100644 --- a/dataflow/operators/general_text/filter/langkit_filter.py +++ b/dataflow/operators/general_pt/filter/langkit_filter.py @@ -4,7 +4,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import LangkitScorer +from dataflow.operators.general_pt import LangkitSampleEvaluator @OPERATOR_REGISTRY.register() class LangkitFilter(OperatorABC): @@ -66,7 +66,7 @@ def __init__(self, if not self.min_scores.keys() == self.max_scores.keys(): raise ValueError("min_scores and max_scores must have the same keys") self.logger = get_logger() - self.scorer = LangkitScorer() + self.scorer = LangkitSampleEvaluator() self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_scores} and max_scores: {self.max_scores}...") @staticmethod diff --git a/dataflow/operators/general_text/filter/lexical_diversity_filter.py b/dataflow/operators/general_pt/filter/lexical_diversity_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/lexical_diversity_filter.py rename to dataflow/operators/general_pt/filter/lexical_diversity_filter.py index d6782a3b..bd65dee8 100644 --- a/dataflow/operators/general_text/filter/lexical_diversity_filter.py +++ b/dataflow/operators/general_pt/filter/lexical_diversity_filter.py @@ -4,7 +4,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import LexicalDiversityScorer +from dataflow.operators.general_pt import LexicalDiversitySampleEvaluator @OPERATOR_REGISTRY.register() class LexicalDiversityFilter(OperatorABC): @@ -20,7 +20,7 @@ def __init__(self, min_scores: dict = {'mtld': 50, 'hdd': 0.8}, max_scores: dict 'hdd': 'LexicalDiversityHD-DScore', 'mtld': 'LexicalDiversityMTLDScore', } - self.scorer = LexicalDiversityScorer() + self.scorer = LexicalDiversitySampleEvaluator() @staticmethod def get_desc(lang: str = "zh"): diff --git a/dataflow/operators/general_text/filter/ngram_filter.py b/dataflow/operators/general_pt/filter/ngram_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/ngram_filter.py rename to dataflow/operators/general_pt/filter/ngram_filter.py index 4c0f5eee..f332b06e 100644 --- a/dataflow/operators/general_text/filter/ngram_filter.py +++ b/dataflow/operators/general_pt/filter/ngram_filter.py @@ -2,7 +2,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import NgramScorer +from dataflow.operators.general_pt import NgramSampleEvaluator @OPERATOR_REGISTRY.register() class NgramFilter(OperatorABC): @@ -11,7 +11,7 @@ def __init__(self, min_score=0.8, max_score=1, ngrams=5): self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = NgramScorer(ngrams) + self.scorer = NgramSampleEvaluator(ngrams) self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: 
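The WordNumberFilter and BlocklistFilter implementations removed here are relocated rather than dropped: the pipeline files later in this patch import both classes from dataflow.operators.core_text. A minimal usage sketch under the new import path, restricted to the constructor arguments and run() signature shown in this patch (storage stands for whatever DataFlowStorage implementation the pipeline already uses):

from dataflow.operators.core_text import WordNumberFilter, BlocklistFilter

# Keep rows whose whitespace-split word count lies in [20, 100000);
# the count itself is written back under output_key.
word_number_filter = WordNumberFilter(min_words=20, max_words=100000)

# Keep rows containing at most `threshold` blocklisted words (English list, no tokenizer).
blocklist_filter = BlocklistFilter(language='en', threshold=1, use_tokenizer=False)

# Both operators are driven exactly as before the move:
#   word_number_filter.run(storage, input_key='text')
#   blocklist_filter.run(storage, input_key='text')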
{self.min_score} and max_scores: {self.max_score}...") @staticmethod diff --git a/dataflow/operators/general_text/filter/pair_qual_filter.py b/dataflow/operators/general_pt/filter/pair_qual_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/pair_qual_filter.py rename to dataflow/operators/general_pt/filter/pair_qual_filter.py index 6310bc88..0bca6b3b 100644 --- a/dataflow/operators/general_text/filter/pair_qual_filter.py +++ b/dataflow/operators/general_pt/filter/pair_qual_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import PairQualScorer +from dataflow.operators.general_pt import PairQualSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY @@ -12,7 +12,7 @@ def __init__(self, min_score=0, max_score=10000, model_cache_dir='./dataflow_cac self.min_score = min_score self.max_score = max_score - self.scorer = PairQualScorer(model_cache_dir=model_cache_dir, lang=lang) + self.scorer = PairQualSampleEvaluator(model_cache_dir=model_cache_dir, lang=lang) self.filter_name = 'PairQualFilter' self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}...") diff --git a/dataflow/operators/general_text/filter/perplexity_filter.py b/dataflow/operators/general_pt/filter/perplexity_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/perplexity_filter.py rename to dataflow/operators/general_pt/filter/perplexity_filter.py index 5c0b82e4..f0269183 100644 --- a/dataflow/operators/general_text/filter/perplexity_filter.py +++ b/dataflow/operators/general_pt/filter/perplexity_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import PerplexityScorer +from dataflow.operators.general_pt import PerplexitySampleEvaluator @OPERATOR_REGISTRY.register() class PerplexityFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: float = 10.0, max_score: float = 500.0, model_nam self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = PerplexityScorer( + self.scorer = PerplexitySampleEvaluator( model_name=model_name, lang=lang ) diff --git a/dataflow/operators/general_text/filter/presidio_filter.py b/dataflow/operators/general_pt/filter/presidio_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/presidio_filter.py rename to dataflow/operators/general_pt/filter/presidio_filter.py index 00530e13..29159c18 100644 --- a/dataflow/operators/general_text/filter/presidio_filter.py +++ b/dataflow/operators/general_pt/filter/presidio_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import PresidioScorer +from dataflow.operators.general_pt import PresidioSampleEvaluator @OPERATOR_REGISTRY.register() class PresidioFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: int = 0, max_score: int = 5, lang='en', device='cu self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = PresidioScorer(lang=lang, device=device, model_cache_dir=model_cache_dir) + self.scorer = PresidioSampleEvaluator(lang=lang, device=device, model_cache_dir=model_cache_dir) self.logger.info(f"Initializing 
{self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}") @staticmethod diff --git a/dataflow/operators/general_text/filter/qurating_filter.py b/dataflow/operators/general_pt/filter/qurating_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/qurating_filter.py rename to dataflow/operators/general_pt/filter/qurating_filter.py index 46a33d0c..6fc9483b 100644 --- a/dataflow/operators/general_text/filter/qurating_filter.py +++ b/dataflow/operators/general_pt/filter/qurating_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import QuratingScorer +from dataflow.operators.general_pt import QuratingSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY @@ -16,7 +16,7 @@ def __init__(self, min_scores: dict = {'writing_style': 0,'required_expertise': self.max_scores = max_scores # Initialize the QuratingScorer with the passed parameters - self.scorer = QuratingScorer(map_batch_size=map_batch_size, + self.scorer = QuratingSampleEvaluator(map_batch_size=map_batch_size, num_workers=num_workers, device_batch_size=device_batch_size, device=device, labels=labels, model_cache_dir=model_cache_dir) @@ -65,7 +65,7 @@ def get_desc(lang: str = "zh"): def run(self, storage: DataFlowStorage, input_key: str): self.input_key = input_key dataframe = storage.read("dataframe") - self.logger.info(f"Running {self.filter_name}...") + self.logger.info(f"Running {self.__class__.__name__}...") # Get the scores for filtering scores = self.scorer.eval(dataframe, self.input_key) diff --git a/dataflow/operators/general_text/filter/text_book_filter.py b/dataflow/operators/general_pt/filter/text_book_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/text_book_filter.py rename to dataflow/operators/general_pt/filter/text_book_filter.py index f7e336cd..4ec1c9a0 100644 --- a/dataflow/operators/general_text/filter/text_book_filter.py +++ b/dataflow/operators/general_pt/filter/text_book_filter.py @@ -2,7 +2,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import TextbookScorer +from dataflow.operators.general_pt import TextbookSampleEvaluator @OPERATOR_REGISTRY.register() class TextbookFilter(OperatorABC): @@ -11,7 +11,7 @@ def __init__(self, min_score=0.99, max_score=1, model_cache_dir:str='./dataflow_ self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = TextbookScorer(model_cache_dir=model_cache_dir) + self.scorer = TextbookSampleEvaluator(model_cache_dir=model_cache_dir) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}") @staticmethod diff --git a/dataflow/operators/general_text/generate/pretrain_generator.py b/dataflow/operators/general_pt/generate/phi4qa_generator.py similarity index 95% rename from dataflow/operators/general_text/generate/pretrain_generator.py rename to dataflow/operators/general_pt/generate/phi4qa_generator.py index 95a0cecd..81171f84 100644 --- a/dataflow/operators/general_text/generate/pretrain_generator.py +++ b/dataflow/operators/general_pt/generate/phi4qa_generator.py @@ -1,4 +1,4 @@ -from dataflow.prompts.general_text import PretrainGeneratorPrompt +from dataflow.prompts.general_text import Phi4QAGeneratorPrompt import pandas as pd from dataflow.utils.registry import 
OPERATOR_REGISTRY from dataflow import get_logger @@ -8,13 +8,13 @@ from dataflow.core import LLMServingABC @OPERATOR_REGISTRY.register() -class PretrainGenerator(OperatorABC): +class Phi4QAGenerator(OperatorABC): ''' Answer Generator is a class that generates answers for given questions. ''' def __init__(self, llm_serving: LLMServingABC): self.logger = get_logger() - self.prompts = PretrainGeneratorPrompt() + self.prompts = Phi4QAGeneratorPrompt() self.llm_serving = llm_serving @staticmethod diff --git a/dataflow/operators/general_text/refine/html_entity_refiner.py b/dataflow/operators/general_pt/refine/html_entity_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/html_entity_refiner.py rename to dataflow/operators/general_pt/refine/html_entity_refiner.py diff --git a/dataflow/operators/general_text/refine/html_url_remover_refiner.py b/dataflow/operators/general_pt/refine/html_url_remover_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/html_url_remover_refiner.py rename to dataflow/operators/general_pt/refine/html_url_remover_refiner.py diff --git a/dataflow/operators/general_text/refine/lowercase_refiner.py b/dataflow/operators/general_pt/refine/lowercase_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/lowercase_refiner.py rename to dataflow/operators/general_pt/refine/lowercase_refiner.py diff --git a/dataflow/operators/general_text/refine/ner_refiner.py b/dataflow/operators/general_pt/refine/ner_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/ner_refiner.py rename to dataflow/operators/general_pt/refine/ner_refiner.py diff --git a/dataflow/operators/general_text/refine/pii_anonymize_refiner.py b/dataflow/operators/general_pt/refine/pii_anonymize_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/pii_anonymize_refiner.py rename to dataflow/operators/general_pt/refine/pii_anonymize_refiner.py diff --git a/dataflow/operators/general_text/refine/ref_removal_refiner.py b/dataflow/operators/general_pt/refine/ref_removal_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/ref_removal_refiner.py rename to dataflow/operators/general_pt/refine/ref_removal_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_contractions_refiner.py b/dataflow/operators/general_pt/refine/remove_contractions_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_contractions_refiner.py rename to dataflow/operators/general_pt/refine/remove_contractions_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_emoji_refiner.py b/dataflow/operators/general_pt/refine/remove_emoji_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_emoji_refiner.py rename to dataflow/operators/general_pt/refine/remove_emoji_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_emoticons_refiner.py b/dataflow/operators/general_pt/refine/remove_emoticons_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_emoticons_refiner.py rename to dataflow/operators/general_pt/refine/remove_emoticons_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_extra_spaces_refiner.py b/dataflow/operators/general_pt/refine/remove_extra_spaces_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_extra_spaces_refiner.py rename to 
dataflow/operators/general_pt/refine/remove_extra_spaces_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_image_ref_refiner.py b/dataflow/operators/general_pt/refine/remove_image_ref_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_image_ref_refiner.py rename to dataflow/operators/general_pt/refine/remove_image_ref_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_number_refiner.py b/dataflow/operators/general_pt/refine/remove_number_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_number_refiner.py rename to dataflow/operators/general_pt/refine/remove_number_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_punctuation_refiner.py b/dataflow/operators/general_pt/refine/remove_punctuation_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_punctuation_refiner.py rename to dataflow/operators/general_pt/refine/remove_punctuation_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_repetitions_punctuation_refiner.py b/dataflow/operators/general_pt/refine/remove_repetitions_punctuation_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_repetitions_punctuation_refiner.py rename to dataflow/operators/general_pt/refine/remove_repetitions_punctuation_refiner.py diff --git a/dataflow/operators/general_text/refine/remove_stopwords_refiner.py b/dataflow/operators/general_pt/refine/remove_stopwords_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/remove_stopwords_refiner.py rename to dataflow/operators/general_pt/refine/remove_stopwords_refiner.py diff --git a/dataflow/operators/general_text/refine/spelling_correction_refiner.py b/dataflow/operators/general_pt/refine/spelling_correction_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/spelling_correction_refiner.py rename to dataflow/operators/general_pt/refine/spelling_correction_refiner.py diff --git a/dataflow/operators/general_text/refine/stemming_lemmatization_refiner.py b/dataflow/operators/general_pt/refine/stemming_lemmatization_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/stemming_lemmatization_refiner.py rename to dataflow/operators/general_pt/refine/stemming_lemmatization_refiner.py diff --git a/dataflow/operators/general_text/refine/text_normalization_refiner.py b/dataflow/operators/general_pt/refine/text_normalization_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/text_normalization_refiner.py rename to dataflow/operators/general_pt/refine/text_normalization_refiner.py diff --git a/dataflow/operators/general_sft/__init__.py b/dataflow/operators/general_sft/__init__.py new file mode 100644 index 00000000..bb32bac0 --- /dev/null +++ b/dataflow/operators/general_sft/__init__.py @@ -0,0 +1,33 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .eval.alpagasus_scorer import AlpagasusSampleEvaluator + from .eval.deita_quality_scorer import DeitaQualitySampleEvaluator + from .eval.deita_complexity_scorer import DeitaComplexitySampleEvaluator + from .eval.instag_scorer import InstagSampleEvaluator + from .eval.rm_scorer import RMSampleEvaluator + from .eval.superfiltering_scorer import SuperfilteringSampleEvaluator + from .eval.treeinstruct_scorer import TreeinstructSampleEvaluator + + + from .filter.alpagasus_filter import AlpagasusFilter + from 
.filter.deita_quality_filter import DeitaQualityFilter + from .filter.deita_complexity_filter import DeitaComplexityFilter + from .filter.instag_filter import InstagFilter + from .filter.rm_filter import RMFilter + from .filter.superfiltering_filter import SuperfilteringFilter + from .filter.treeinstruct_filter import TreeinstructFilter + + from .generate.condor_generator import CondorGenerator + from .generate.sft_generator_from_seed import SFTGeneratorSeed + + from .refine.condor_refiner import CondorRefiner + +else: + import sys + from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking + + cur_path = "dataflow/operators/general_sft/" + + _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) + sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_sft/", _import_structure) diff --git a/dataflow/operators/general_text/eval/models/Superfiltering/data_analysis.py b/dataflow/operators/general_sft/eval/Superfiltering/data_analysis.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Superfiltering/data_analysis.py rename to dataflow/operators/general_sft/eval/Superfiltering/data_analysis.py diff --git a/dataflow/operators/general_text/eval/APIcaller/alpagasus_scorer.py b/dataflow/operators/general_sft/eval/alpagasus_scorer.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/alpagasus_scorer.py rename to dataflow/operators/general_sft/eval/alpagasus_scorer.py index ae802bae..fef79466 100644 --- a/dataflow/operators/general_text/eval/APIcaller/alpagasus_scorer.py +++ b/dataflow/operators/general_sft/eval/alpagasus_scorer.py @@ -7,7 +7,7 @@ from dataflow.prompts.general_text import AlpagasusPrompt @OPERATOR_REGISTRY.register() -class AlpagasusScorer(OperatorABC): +class AlpagasusSampleEvaluator(OperatorABC): def __init__(self, llm_serving: LLMServingABC = None, dimension: str = 'quality'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/deita_complexity_scorer.py b/dataflow/operators/general_sft/eval/deita_complexity_scorer.py similarity index 99% rename from dataflow/operators/general_text/eval/models/deita_complexity_scorer.py rename to dataflow/operators/general_sft/eval/deita_complexity_scorer.py index 1f5f746d..e4ba8849 100644 --- a/dataflow/operators/general_text/eval/models/deita_complexity_scorer.py +++ b/dataflow/operators/general_sft/eval/deita_complexity_scorer.py @@ -9,7 +9,7 @@ from tqdm import tqdm @OPERATOR_REGISTRY.register() -class DeitaComplexityScorer(OperatorABC): +class DeitaComplexitySampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/deita_quality_scorer.py b/dataflow/operators/general_sft/eval/deita_quality_scorer.py similarity index 99% rename from dataflow/operators/general_text/eval/models/deita_quality_scorer.py rename to dataflow/operators/general_sft/eval/deita_quality_scorer.py index 065efa01..2bd63e34 100644 --- a/dataflow/operators/general_text/eval/models/deita_quality_scorer.py +++ b/dataflow/operators/general_sft/eval/deita_quality_scorer.py @@ -10,7 +10,7 @@ from tqdm import tqdm @OPERATOR_REGISTRY.register() -class DeitaQualityScorer(OperatorABC): +class DeitaQualitySampleEvaluator(OperatorABC): def 
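The new general_sft/__init__.py above follows the package-level lazy-import pattern used by the other operator packages: classes are listed under TYPE_CHECKING for static tooling, and at runtime the module object is replaced by a LazyLoader built from that same list. A short sketch of the intended downstream effect, assuming LazyLoader resolves names on first attribute access (an assumption about LazyLoader's behavior, not something this patch states):

import dataflow.operators.general_sft as general_sft

# Nothing model-heavy is imported yet; only the import structure derived
# from the TYPE_CHECKING block is known to the LazyLoader.
filter_cls = general_sft.AlpagasusFilter              # attribute access triggers the real import
evaluator_cls = general_sft.AlpagasusSampleEvaluator  # likewise, resolved on demand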
__init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/instag_scorer.py b/dataflow/operators/general_sft/eval/instag_scorer.py similarity index 99% rename from dataflow/operators/general_text/eval/models/instag_scorer.py rename to dataflow/operators/general_sft/eval/instag_scorer.py index d438cd82..53afdfe8 100644 --- a/dataflow/operators/general_text/eval/models/instag_scorer.py +++ b/dataflow/operators/general_sft/eval/instag_scorer.py @@ -10,7 +10,7 @@ import json @OPERATOR_REGISTRY.register() -class InstagScorer(OperatorABC): +class InstagSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir='./dataflow_cache', device='cuda', max_new_tokens=1024, temperature=0, do_sample=False, num_return_sequences=1, return_dict_in_generate=True): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/rm_scorer.py b/dataflow/operators/general_sft/eval/rm_scorer.py similarity index 98% rename from dataflow/operators/general_text/eval/models/rm_scorer.py rename to dataflow/operators/general_sft/eval/rm_scorer.py index d49ea56d..0be42ee3 100644 --- a/dataflow/operators/general_text/eval/models/rm_scorer.py +++ b/dataflow/operators/general_sft/eval/rm_scorer.py @@ -7,7 +7,7 @@ # RMScorer for evaluating based on reward-model-deberta-v3-large-v2 @OPERATOR_REGISTRY.register() -class RMScorer(OperatorABC): +class RMSampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', ): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/superfiltering_scorer.py b/dataflow/operators/general_sft/eval/superfiltering_scorer.py similarity index 96% rename from dataflow/operators/general_text/eval/models/superfiltering_scorer.py rename to dataflow/operators/general_sft/eval/superfiltering_scorer.py index d93d3087..50069ac0 100644 --- a/dataflow/operators/general_text/eval/models/superfiltering_scorer.py +++ b/dataflow/operators/general_sft/eval/superfiltering_scorer.py @@ -1,6 +1,6 @@ from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text.eval.models.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text +from dataflow.operators.general_sft.eval.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text from transformers import AutoTokenizer, AutoModelForCausalLM from tqdm import tqdm import torch @@ -11,7 +11,7 @@ # Superfiltering instruction quality (ifd) evaluation # cited from: Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning @OPERATOR_REGISTRY.register() -class SuperfilteringScorer(OperatorABC): +class SuperfilteringSampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/APIcaller/treeinstruct_scorer.py b/dataflow/operators/general_sft/eval/treeinstruct_scorer.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/treeinstruct_scorer.py rename to 
dataflow/operators/general_sft/eval/treeinstruct_scorer.py index da611e65..7642f20d 100644 --- a/dataflow/operators/general_text/eval/APIcaller/treeinstruct_scorer.py +++ b/dataflow/operators/general_sft/eval/treeinstruct_scorer.py @@ -7,7 +7,7 @@ from dataflow.prompts.general_text import TreeinstructPrompt @OPERATOR_REGISTRY.register() -class TreeinstructScorer(OperatorABC): +class TreeinstructSampleEvaluator(OperatorABC): def __init__(self, llm_serving: LLMServingABC = None): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/filter/alpagasus_filter.py b/dataflow/operators/general_sft/filter/alpagasus_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/alpagasus_filter.py rename to dataflow/operators/general_sft/filter/alpagasus_filter.py index 39f7e4c4..8bd5215b 100644 --- a/dataflow/operators/general_text/filter/alpagasus_filter.py +++ b/dataflow/operators/general_sft/filter/alpagasus_filter.py @@ -3,7 +3,7 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.core import LLMServingABC -from dataflow.operators.general_text import AlpagasusScorer +from dataflow.operators.general_sft import AlpagasusSampleEvaluator @OPERATOR_REGISTRY.register() class AlpagasusFilter(OperatorABC): @@ -13,7 +13,7 @@ def __init__(self, min_score=3, max_score=5, llm_serving: LLMServingABC = None, self.min_score = min_score self.max_score = max_score self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}...") - self.scorer = AlpagasusScorer(llm_serving, dimension) + self.scorer = AlpagasusSampleEvaluator(llm_serving, dimension) @staticmethod def get_desc(lang: str = "zh"): diff --git a/dataflow/operators/general_text/filter/deita_complexity_filter.py b/dataflow/operators/general_sft/filter/deita_complexity_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/deita_complexity_filter.py rename to dataflow/operators/general_sft/filter/deita_complexity_filter.py index 9c60e882..9dc9fa0f 100644 --- a/dataflow/operators/general_text/filter/deita_complexity_filter.py +++ b/dataflow/operators/general_sft/filter/deita_complexity_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import DeitaComplexityScorer +from dataflow.operators.general_sft import DeitaComplexitySampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY @@ -11,7 +11,7 @@ def __init__(self, min_score=3.0, max_score=5.0, device='cuda', model_cache_dir= self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = DeitaComplexityScorer( + self.scorer = DeitaComplexitySampleEvaluator( device=device, model_cache_dir=model_cache_dir, max_length=max_length, diff --git a/dataflow/operators/general_text/filter/deita_quality_filter.py b/dataflow/operators/general_sft/filter/deita_quality_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/deita_quality_filter.py rename to dataflow/operators/general_sft/filter/deita_quality_filter.py index 552ea2d1..fe72f991 100644 --- a/dataflow/operators/general_text/filter/deita_quality_filter.py +++ b/dataflow/operators/general_sft/filter/deita_quality_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import DeitaQualityScorer +from dataflow.operators.general_sft import 
DeitaQualitySampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY @@ -11,7 +11,7 @@ def __init__(self, min_score=2.5, max_score=10000.0, device='cuda', model_cache_ self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = DeitaQualityScorer( + self.scorer = DeitaQualitySampleEvaluator( device=device, model_cache_dir=model_cache_dir, max_length=max_length, diff --git a/dataflow/operators/general_text/filter/instag_filter.py b/dataflow/operators/general_sft/filter/instag_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/instag_filter.py rename to dataflow/operators/general_sft/filter/instag_filter.py index d5d67ebe..35355d05 100644 --- a/dataflow/operators/general_text/filter/instag_filter.py +++ b/dataflow/operators/general_sft/filter/instag_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import InstagScorer +from dataflow.operators.general_sft import InstagSampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY @@ -14,7 +14,7 @@ def __init__(self, min_score=0.0, max_score=1.0, model_cache_dir='./dataflow_cac self.max_score = max_score # Initialize the scorer - self.scorer = InstagScorer( + self.scorer = InstagSampleEvaluator( model_cache_dir=model_cache_dir, device=device, max_new_tokens=max_new_tokens diff --git a/dataflow/operators/general_text/filter/reward_model_filter.py b/dataflow/operators/general_sft/filter/rm_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/reward_model_filter.py rename to dataflow/operators/general_sft/filter/rm_filter.py index 50d439d4..c1592416 100644 --- a/dataflow/operators/general_text/filter/reward_model_filter.py +++ b/dataflow/operators/general_sft/filter/rm_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import RMScorer +from dataflow.operators.general_sft import RMSampleEvaluator @OPERATOR_REGISTRY.register() class RMFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: float = 0.2, max_score: float = 0.8, device='cuda' self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = RMScorer(device=device, model_cache_dir=model_cache_dir) + self.scorer = RMSampleEvaluator(device=device, model_cache_dir=model_cache_dir) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score}, max_score = {self.max_score}") @staticmethod diff --git a/dataflow/operators/general_text/filter/superfiltering_filter.py b/dataflow/operators/general_sft/filter/superfiltering_filter.py similarity index 97% rename from dataflow/operators/general_text/filter/superfiltering_filter.py rename to dataflow/operators/general_sft/filter/superfiltering_filter.py index 5c9ff010..59165abd 100644 --- a/dataflow/operators/general_text/filter/superfiltering_filter.py +++ b/dataflow/operators/general_sft/filter/superfiltering_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import SuperfilteringScorer +from dataflow.operators.general_sft import SuperfilteringSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY @@ -13,7 +13,7 @@ def __init__(self, min_score=0.0, max_score=1.0, device='cuda', model_cache_dir= 
self.min_score = min_score self.max_score = max_score - self.scorer = SuperfilteringScorer( + self.scorer = SuperfilteringSampleEvaluator( device=device, model_cache_dir=model_cache_dir, max_length=max_length diff --git a/dataflow/operators/general_text/filter/treeinstruct_filter.py b/dataflow/operators/general_sft/filter/treeinstruct_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/treeinstruct_filter.py rename to dataflow/operators/general_sft/filter/treeinstruct_filter.py index d216ef97..a45ce61a 100644 --- a/dataflow/operators/general_text/filter/treeinstruct_filter.py +++ b/dataflow/operators/general_sft/filter/treeinstruct_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC, LLMServingABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import TreeinstructScorer +from dataflow.operators.general_sft import TreeinstructSampleEvaluator @OPERATOR_REGISTRY.register() class TreeinstructFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: int = 7, max_score: int = 100, llm_serving: LLMSer self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = TreeinstructScorer(llm_serving=llm_serving) + self.scorer = TreeinstructSampleEvaluator(llm_serving=llm_serving) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}") @staticmethod diff --git a/dataflow/operators/general_text/generate/condor_generator.py b/dataflow/operators/general_sft/generate/condor_generator.py similarity index 100% rename from dataflow/operators/general_text/generate/condor_generator.py rename to dataflow/operators/general_sft/generate/condor_generator.py diff --git a/dataflow/operators/general_text/generate/sft_generator_from_seed.py b/dataflow/operators/general_sft/generate/sft_generator_from_seed.py similarity index 100% rename from dataflow/operators/general_text/generate/sft_generator_from_seed.py rename to dataflow/operators/general_sft/generate/sft_generator_from_seed.py diff --git a/dataflow/operators/general_text/refine/condor_refiner.py b/dataflow/operators/general_sft/refine/condor_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/condor_refiner.py rename to dataflow/operators/general_sft/refine/condor_refiner.py diff --git a/dataflow/operators/general_text/__init__.py b/dataflow/operators/general_text/__init__.py deleted file mode 100644 index 7ec7ebee..00000000 --- a/dataflow/operators/general_text/__init__.py +++ /dev/null @@ -1,120 +0,0 @@ -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - # filter - from .filter.alpagasus_filter import AlpagasusFilter - from .filter.ccnet_deduplicator import CCNetDeduplicator - from .filter.debertav3_filter import DebertaV3Filter - from .filter.deita_complexity_filter import DeitaComplexityFilter - from .filter.deita_quality_filter import DeitaQualityFilter - from .filter.fineweb_edu_filter import FineWebEduFilter - from .filter.general_filter import GeneralFilter - from .filter.hash_deduplicator import HashDeduplicator - from .filter.heuristics import ColonEndFilter - from .filter.heuristics import WordNumberFilter - from .filter.heuristics import SentenceNumberFilter - from .filter.heuristics import LineEndWithEllipsisFilter - from .filter.heuristics import ContentNullFilter - from .filter.heuristics import SymbolWordRatioFilter - from .filter.heuristics import AlphaWordsFilter - from 
.filter.heuristics import HtmlEntityFilter - from .filter.heuristics import IDCardFilter - from .filter.heuristics import NoPuncFilter - from .filter.heuristics import SpecialCharacterFilter - from .filter.heuristics import WatermarkFilter - from .filter.heuristics import MeanWordLengthFilter - from .filter.heuristics import StopWordFilter - from .filter.heuristics import CurlyBracketFilter - from .filter.heuristics import CapitalWordsFilter - from .filter.heuristics import LoremIpsumFilter - from .filter.heuristics import UniqueWordsFilter - from .filter.heuristics import CharNumberFilter - from .filter.heuristics import LineStartWithBulletpointFilter - from .filter.heuristics import LineWithJavascriptFilter - from .filter.heuristics import BlocklistFilter - from .filter.instag_filter import InstagFilter - from .filter.langkit_filter import LangkitFilter - from .filter.language_filter import LanguageFilter - from .filter.lexical_diversity_filter import LexicalDiversityFilter - from .filter.llm_language_filter import LLMLanguageFilter - from .filter.minhash_deduplicator import MinHashDeduplicator - from .filter.ngram_filter import NgramFilter - from .filter.ngramhash_deduplicator import NgramHashDeduplicator - from .filter.pair_qual_filter import PairQualFilter - from .filter.perplexity_filter import PerplexityFilter - from .filter.perspective_filter import PerspectiveFilter - from .filter.presidio_filter import PresidioFilter - from .filter.qurating_filter import QuratingFilter - from .filter.reward_model_filter import RMFilter - from .filter.sem_deduplicator import SemDeduplicator - from .filter.simhash_deduplicator import SimHashDeduplicator - from .filter.superfiltering_filter import SuperfilteringFilter - from .filter.text_book_filter import TextbookFilter - from .filter.treeinstruct_filter import TreeinstructFilter - - # generate - from .generate.condor_generator import CondorGenerator - from .generate.pretrain_generator import PretrainGenerator - from .generate.sft_generator_from_seed import SFTGeneratorSeed - - # refine - from .refine.condor_refiner import CondorRefiner - from .refine.html_entity_refiner import HtmlEntityRefiner - from .refine.html_url_remover_refiner import HtmlUrlRemoverRefiner - from .refine.lowercase_refiner import LowercaseRefiner - from .refine.ner_refiner import NERRefiner - from .refine.pii_anonymize_refiner import PIIAnonymizeRefiner - from .refine.ref_removal_refiner import ReferenceRemoverRefiner - from .refine.remove_contractions_refiner import RemoveContractionsRefiner - from .refine.remove_emoji_refiner import RemoveEmojiRefiner - from .refine.remove_emoticons_refiner import RemoveEmoticonsRefiner - from .refine.remove_extra_spaces_refiner import RemoveExtraSpacesRefiner - from .refine.remove_image_ref_refiner import RemoveImageRefsRefiner - from .refine.remove_number_refiner import RemoveNumberRefiner - from .refine.remove_punctuation_refiner import RemovePunctuationRefiner - from .refine.remove_repetitions_punctuation_refiner import RemoveRepetitionsPunctuationRefiner - from .refine.remove_stopwords_refiner import RemoveStopwordsRefiner - from .refine.spelling_correction_refiner import SpellingCorrectionRefiner - from .refine.stemming_lemmatization_refiner import StemmingLemmatizationRefiner - from .refine.text_normalization_refiner import TextNormalizationRefiner - - # eval - from .eval.statistics.ngram_scorer import NgramScorer - from .eval.statistics.lexical_diversity_scorer import LexicalDiversityScorer - from .eval.statistics.langkit_scorer 
import LangkitScorer - - from .eval.models.deita_quality_scorer import DeitaQualityScorer - from .eval.models.instag_scorer import InstagScorer - from .eval.models.debertav3_scorer import DebertaV3Scorer - from .eval.models.deita_complexity_scorer import DeitaComplexityScorer - from .eval.models.fineweb_edu_scorer import FineWebEduScorer - from .eval.models.pair_qual_scorer import PairQualScorer - from .eval.models.presidio_scorer import PresidioScorer - from .eval.models.rm_scorer import RMScorer - from .eval.models.textbook_scorer import TextbookScorer - from .eval.models.superfiltering_scorer import SuperfilteringScorer - from .eval.models.qurating_scorer import QuratingScorer - from .eval.models.perplexity_scorer import PerplexityScorer - - from .eval.APIcaller.alpagasus_scorer import AlpagasusScorer - from .eval.APIcaller.treeinstruct_scorer import TreeinstructScorer - from .eval.APIcaller.perspective_scorer import PerspectiveScorer - from .eval.APIcaller.meta_scorer import MetaScorer - - from .eval.diversity.vendi_scorer import VendiScorer - from .eval.diversity.task2vec_scorer import Task2VecScorer - - from .eval.gen.bert_scorer import BERTScorer - from .eval.gen.bleu_scorer import BleuScorer - from .eval.gen.cider_scorer import CiderScorer -else: - import sys - from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking - - - - cur_path = "dataflow/operators/general_text/" - - - _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) - sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_text/", _import_structure) diff --git a/dataflow/prompts/general_text.py b/dataflow/prompts/general_text.py index 4816d48f..e8d6bcac 100644 --- a/dataflow/prompts/general_text.py +++ b/dataflow/prompts/general_text.py @@ -1,7 +1,7 @@ ''' A collection of prompts for the general text operator. 
''' -class PretrainGeneratorPrompt: +class Phi4QAGeneratorPrompt: def __init__(self): pass diff --git a/dataflow/statics/core_text_pipeline/core_filter.py b/dataflow/statics/core_text_pipeline/core_filter.py index 6f1a80fe..566d350d 100644 --- a/dataflow/statics/core_text_pipeline/core_filter.py +++ b/dataflow/statics/core_text_pipeline/core_filter.py @@ -12,7 +12,7 @@ import pandas as pd from dataflow.operators.core_text import PromptedGenerator -from dataflow.operators.general_text import GeneralFilter +from dataflow.operators.core_text import GeneralFilter from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request from dataflow.prompts.core_filter import * diff --git a/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py b/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py index 36780e5a..9e0b611a 100644 --- a/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py +++ b/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py @@ -12,7 +12,7 @@ """ from dataflow.operators.core_text import RandomDomainKnowledgeRowGenerator from dataflow.operators.core_text import PromptedGenerator -from dataflow.operators.general_text import GeneralFilter +from dataflow.operators.core_text import GeneralFilter from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request from dataflow.prompts.core_sft_from_scratch import * diff --git a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py index 4419132b..75f16d39 100644 --- a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py +++ b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py @@ -1,7 +1,7 @@ -from dataflow.operators.general_text import AlpagasusFilter -from dataflow.operators.general_text import CondorGenerator -from dataflow.operators.general_text import CondorRefiner +from dataflow.operators.general_sft import AlpagasusFilter +from dataflow.operators.general_sft import CondorGenerator +from dataflow.operators.general_sft import CondorRefiner from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request diff --git a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py index 9e6177fc..d60b524f 100644 --- a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py @@ -1,8 +1,11 @@ -from dataflow.operators.general_text import ( - MinHashDeduplicator, - ColonEndFilter, +from dataflow.operators.core_text import ( WordNumberFilter, BlocklistFilter, + MinHashDeduplicateFilter +) +from dataflow.operators.general_pt import ( + MetaSampleEvaluator, + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -21,7 +24,7 @@ LineStartWithBulletpointFilter, LineWithJavascriptFilter ) -from dataflow.operators.refine import ( +from dataflow.operators.general_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner @@ -40,7 +43,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, 
ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() diff --git a/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py b/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py index da23e5ad..0abf069d 100644 --- a/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py +++ b/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import WordNumberFilter +from dataflow.operators.core_text import WordNumberFilter from dataflow.utils.storage import FileStorage diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py index 90938452..36cb4de4 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py @@ -1,9 +1,11 @@ -from dataflow.operators.general_text import ( - MinHashDeduplicator, +from dataflow.operators.core_text import ( + MinHashDeduplicateFilter, LanguageFilter, - ColonEndFilter, WordNumberFilter, BlocklistFilter, +) +from dataflow.operators.general_pt import ( + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -23,7 +25,7 @@ LineWithJavascriptFilter, PairQualFilter ) -from dataflow.operators.general_text import ( +from dataflow.operators.general_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner @@ -44,7 +46,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py index 46b36ed7..cb78149f 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py @@ -1,9 +1,11 @@ -from dataflow.operators.general_text import ( - MinHashDeduplicator, +from dataflow.operators.core_text import ( + MinHashDeduplicateFilter, LanguageFilter, - ColonEndFilter, WordNumberFilter, BlocklistFilter, +) +from dataflow.operators.general_pt import ( + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -24,12 +26,12 @@ PairQualFilter, QuratingFilter ) -from dataflow.operators.general_text import ( +from dataflow.operators.general_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner ) -from dataflow.operators.general_text import PretrainGenerator +from dataflow.operators.general_pt import Phi4QAGenerator from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage @@ -63,7 +65,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + 
self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() @@ -85,7 +87,7 @@ def __init__(self): self.line_start_with_bulletpoint_filter = LineStartWithBulletpointFilter(threshold=0.9) self.line_with_javascript_filter = LineWithJavascriptFilter(threshold=3) self.quality_filter = PairQualFilter(min_score=-2, max_score=10000, lang='en') - self.pt_generator = PretrainGenerator( + self.pt_generator = Phi4QAGenerator( llm_serving=self.llm_serving ) self.qurating_filter = QuratingFilter(min_scores = {'writing_style': 0,'required_expertise': 0,'facts_and_trivia': 0,'educational_value': 0}, max_scores = {'writing_style': 9,'required_expertise': 9,'facts_and_trivia': 9,'educational_value': 9}) diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py index f60a37bc..1137e52b 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py @@ -1,5 +1,7 @@ -from dataflow.operators.general_text import ( +from dataflow.operators.core_text import ( WordNumberFilter, +) +from dataflow.operators.general_sft import ( SuperfilteringFilter, DeitaQualityFilter, InstagFilter diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py index 4c65124e..64376ae7 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py @@ -1,9 +1,11 @@ -from dataflow.operators.general_text import ( - MinHashDeduplicator, +from dataflow.operators.core_text import ( + MinHashDeduplicateFilter, LanguageFilter, - ColonEndFilter, WordNumberFilter, BlocklistFilter, +) +from dataflow.operators.general_pt import ( + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -22,16 +24,18 @@ LineStartWithBulletpointFilter, LineWithJavascriptFilter, PairQualFilter, +) +from dataflow.operators.general_sft import ( SuperfilteringFilter, DeitaQualityFilter, - InstagFilter + InstagFilter, ) -from dataflow.operators.general_text import ( +from dataflow.operators.general_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner ) -from dataflow.operators.general_text import SFTGeneratorSeed +from dataflow.operators.general_sft import SFTGeneratorSeed from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage @@ -63,7 +67,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() diff --git a/dataflow/utils/registry.py b/dataflow/utils/registry.py index f0b9e5d1..b62fa040 100644 --- a/dataflow/utils/registry.py +++ b/dataflow/utils/registry.py @@ -205,8 +205,9 @@ def get_type_of_operator(self): 'core_text', 
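Taken together, the pipeline edits in this patch amount to the following import migration for downstream code; the sketch lists only names that actually appear in the patch (all formerly imported from dataflow.operators.general_text):

from dataflow.operators.core_text import (
    WordNumberFilter,
    BlocklistFilter,
    LanguageFilter,
    MinHashDeduplicateFilter,   # formerly MinHashDeduplicator
)
from dataflow.operators.general_pt import (
    ColonEndFilter,
    HtmlUrlRemoverRefiner,
    RemoveEmojiRefiner,
    RemoveExtraSpacesRefiner,
    Phi4QAGenerator,            # formerly PretrainGenerator
)
from dataflow.operators.general_sft import (
    AlpagasusFilter,
    CondorGenerator,
    CondorRefiner,
    SFTGeneratorSeed,
)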
'core_vision', 'db', - 'general_text', 'knowledge_cleaning', + 'general_pt', + 'general_sft', 'rare', 'reasoning', 'text2sql' diff --git a/test/test_autoop.py b/test/test_autoop.py index cb709b4d..088c2446 100644 --- a/test/test_autoop.py +++ b/test/test_autoop.py @@ -1,8 +1,8 @@ from dataflow.pipeline import PipelineABC -from dataflow.operators.general_text import ( +from dataflow.operators.core_text import ( LLMLanguageFilter, ) -from dataflow.operators.general_text import MetaScorer +from dataflow.operators.general_pt import MetaSampleEvaluator from dataflow.operators.core_text import PromptedGenerator from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm, LocalHostLLMAPIServing_vllm from dataflow.utils.storage import FileStorage From 9e3ea2ab48029384a7d21f76d0d6f28ff919f3f7 Mon Sep 17 00:00:00 2001 From: MOLYHECI Date: Mon, 8 Sep 2025 00:28:28 +0800 Subject: [PATCH 2/4] rename some operators --- dataflow/operators/core_text/__init__.py | 16 ++++++++-------- ...scorer.py => perspective_sample_evaluator.py} | 0 ...c_scorer.py => task2vec_dataset_evaluator.py} | 0 ...endi_scorer.py => vendi_dataset_evaluator.py} | 0 ...eduplicator.py => hash_deduplicate_filter.py} | 0 ...plicator.py => minhash_deduplicate_filter.py} | 0 ...icator.py => ngramhash_deduplicate_filter.py} | 0 ...deduplicator.py => sem_deduplicate_filter.py} | 0 ...plicator.py => simhash_deduplicate_filter.py} | 0 dataflow/operators/general_sft/__init__.py | 14 +++++++------- ...s_scorer.py => alpagasus_sample_evaluator.py} | 0 ...r.py => deita_complexity_sample_evaluator.py} | 0 ...orer.py => deita_quality_sample_evaluator.py} | 0 ...stag_scorer.py => instag_sample_evaluator.py} | 0 .../{rm_scorer.py => rm_sample_evaluator.py} | 0 ...rer.py => superfiltering_sample_evaluator.py} | 0 ...corer.py => treeinstruct_sample_evaluator.py} | 0 17 files changed, 15 insertions(+), 15 deletions(-) rename dataflow/operators/core_text/eval/{perspective_scorer.py => perspective_sample_evaluator.py} (100%) rename dataflow/operators/core_text/eval/{task2vec_scorer.py => task2vec_dataset_evaluator.py} (100%) rename dataflow/operators/core_text/eval/{vendi_scorer.py => vendi_dataset_evaluator.py} (100%) rename dataflow/operators/core_text/filter/{hash_deduplicator.py => hash_deduplicate_filter.py} (100%) rename dataflow/operators/core_text/filter/{minhash_deduplicator.py => minhash_deduplicate_filter.py} (100%) rename dataflow/operators/core_text/filter/{ngramhash_deduplicator.py => ngramhash_deduplicate_filter.py} (100%) rename dataflow/operators/core_text/filter/{sem_deduplicator.py => sem_deduplicate_filter.py} (100%) rename dataflow/operators/core_text/filter/{simhash_deduplicator.py => simhash_deduplicate_filter.py} (100%) rename dataflow/operators/general_sft/eval/{alpagasus_scorer.py => alpagasus_sample_evaluator.py} (100%) rename dataflow/operators/general_sft/eval/{deita_complexity_scorer.py => deita_complexity_sample_evaluator.py} (100%) rename dataflow/operators/general_sft/eval/{deita_quality_scorer.py => deita_quality_sample_evaluator.py} (100%) rename dataflow/operators/general_sft/eval/{instag_scorer.py => instag_sample_evaluator.py} (100%) rename dataflow/operators/general_sft/eval/{rm_scorer.py => rm_sample_evaluator.py} (100%) rename dataflow/operators/general_sft/eval/{superfiltering_scorer.py => superfiltering_sample_evaluator.py} (100%) rename dataflow/operators/general_sft/eval/{treeinstruct_scorer.py => treeinstruct_sample_evaluator.py} (100%) diff --git a/dataflow/operators/core_text/__init__.py 
b/dataflow/operators/core_text/__init__.py index 98aada34..5494ff31 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -6,18 +6,18 @@ from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator from .eval.prompted_eval import PromptedEvaluator - from .eval.perspective_scorer import PerspectiveSampleEvaluator - from .eval.task2vec_scorer import Task2VecDatasetEvaluator - from .eval.vendi_scorer import VendiDatasetEvaluator + from .eval.perspective_sample_evaluator import PerspectiveSampleEvaluator + from .eval.task2vec_dataset_evaluator import Task2VecDatasetEvaluator + from .eval.vendi_dataset_evaluator import VendiDatasetEvaluator from .refine.prompted_refiner import PromptedRefiner from .filter.prompted_filter import PromptedFilter from .filter.general_filter import GeneralFilter - from .filter.minhash_deduplicator import MinHashDeduplicateFilter - from .filter.ngramhash_deduplicator import NgramHashDeduplicateFilter - from .filter.sem_deduplicator import SemDeduplicateFilter - from .filter.simhash_deduplicator import SimHashDeduplicateFilter - from .filter.hash_deduplicator import HashDeduplicateFilter + from .filter.minhash_deduplicate_filter import MinHashDeduplicateFilter + from .filter.ngramhash_deduplicate_filter import NgramHashDeduplicateFilter + from .filter.sem_deduplicate_filter import SemDeduplicateFilter + from .filter.simhash_deduplicate_filter import SimHashDeduplicateFilter + from .filter.hash_deduplicate_filter import HashDeduplicateFilter from .filter.blocklist_filter import BlocklistFilter from .filter.word_number_filter import WordNumberFilter from .filter.perspective_filter import PerspectiveFilter diff --git a/dataflow/operators/core_text/eval/perspective_scorer.py b/dataflow/operators/core_text/eval/perspective_sample_evaluator.py similarity index 100% rename from dataflow/operators/core_text/eval/perspective_scorer.py rename to dataflow/operators/core_text/eval/perspective_sample_evaluator.py diff --git a/dataflow/operators/core_text/eval/task2vec_scorer.py b/dataflow/operators/core_text/eval/task2vec_dataset_evaluator.py similarity index 100% rename from dataflow/operators/core_text/eval/task2vec_scorer.py rename to dataflow/operators/core_text/eval/task2vec_dataset_evaluator.py diff --git a/dataflow/operators/core_text/eval/vendi_scorer.py b/dataflow/operators/core_text/eval/vendi_dataset_evaluator.py similarity index 100% rename from dataflow/operators/core_text/eval/vendi_scorer.py rename to dataflow/operators/core_text/eval/vendi_dataset_evaluator.py diff --git a/dataflow/operators/core_text/filter/hash_deduplicator.py b/dataflow/operators/core_text/filter/hash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/hash_deduplicator.py rename to dataflow/operators/core_text/filter/hash_deduplicate_filter.py diff --git a/dataflow/operators/core_text/filter/minhash_deduplicator.py b/dataflow/operators/core_text/filter/minhash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/minhash_deduplicator.py rename to dataflow/operators/core_text/filter/minhash_deduplicate_filter.py diff --git a/dataflow/operators/core_text/filter/ngramhash_deduplicator.py b/dataflow/operators/core_text/filter/ngramhash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/ngramhash_deduplicator.py rename to dataflow/operators/core_text/filter/ngramhash_deduplicate_filter.py diff --git 
a/dataflow/operators/core_text/filter/sem_deduplicator.py b/dataflow/operators/core_text/filter/sem_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/sem_deduplicator.py rename to dataflow/operators/core_text/filter/sem_deduplicate_filter.py diff --git a/dataflow/operators/core_text/filter/simhash_deduplicator.py b/dataflow/operators/core_text/filter/simhash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/simhash_deduplicator.py rename to dataflow/operators/core_text/filter/simhash_deduplicate_filter.py diff --git a/dataflow/operators/general_sft/__init__.py b/dataflow/operators/general_sft/__init__.py index bb32bac0..861640f4 100644 --- a/dataflow/operators/general_sft/__init__.py +++ b/dataflow/operators/general_sft/__init__.py @@ -1,13 +1,13 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from .eval.alpagasus_scorer import AlpagasusSampleEvaluator - from .eval.deita_quality_scorer import DeitaQualitySampleEvaluator - from .eval.deita_complexity_scorer import DeitaComplexitySampleEvaluator - from .eval.instag_scorer import InstagSampleEvaluator - from .eval.rm_scorer import RMSampleEvaluator - from .eval.superfiltering_scorer import SuperfilteringSampleEvaluator - from .eval.treeinstruct_scorer import TreeinstructSampleEvaluator + from .eval.alpagasus_sample_evaluator import AlpagasusSampleEvaluator + from .eval.deita_quality_sample_evaluator import DeitaQualitySampleEvaluator + from .eval.deita_complexity_sample_evaluator import DeitaComplexitySampleEvaluator + from .eval.instag_sample_evaluator import InstagSampleEvaluator + from .eval.rm_sample_evaluator import RMSampleEvaluator + from .eval.superfiltering_sample_evaluator import SuperfilteringSampleEvaluator + from .eval.treeinstruct_sample_evaluator import TreeinstructSampleEvaluator from .filter.alpagasus_filter import AlpagasusFilter diff --git a/dataflow/operators/general_sft/eval/alpagasus_scorer.py b/dataflow/operators/general_sft/eval/alpagasus_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/alpagasus_scorer.py rename to dataflow/operators/general_sft/eval/alpagasus_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/deita_complexity_scorer.py b/dataflow/operators/general_sft/eval/deita_complexity_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/deita_complexity_scorer.py rename to dataflow/operators/general_sft/eval/deita_complexity_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/deita_quality_scorer.py b/dataflow/operators/general_sft/eval/deita_quality_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/deita_quality_scorer.py rename to dataflow/operators/general_sft/eval/deita_quality_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/instag_scorer.py b/dataflow/operators/general_sft/eval/instag_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/instag_scorer.py rename to dataflow/operators/general_sft/eval/instag_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/rm_scorer.py b/dataflow/operators/general_sft/eval/rm_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/rm_scorer.py rename to dataflow/operators/general_sft/eval/rm_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/superfiltering_scorer.py 
b/dataflow/operators/general_sft/eval/superfiltering_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/superfiltering_scorer.py rename to dataflow/operators/general_sft/eval/superfiltering_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/treeinstruct_scorer.py b/dataflow/operators/general_sft/eval/treeinstruct_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/treeinstruct_scorer.py rename to dataflow/operators/general_sft/eval/treeinstruct_sample_evaluator.py From f239fe537a27b80fb53560ff4f459aaa147af7a6 Mon Sep 17 00:00:00 2001 From: MOLYHECI Date: Wed, 10 Sep 2025 10:54:04 +0800 Subject: [PATCH 3/4] divide into four folders --- dataflow/operators/core_text/__init__.py | 15 +---- .../{general_pt => general_text}/__init__.py | 34 +++++------ .../eval/__init__.py | 0 .../eval/bert_sample_evaluator.py | 0 .../eval/bleu/__init__.py | 0 .../eval/bleu/bleu.py | 0 .../eval/bleu_sample_evaluator.py | 2 +- .../eval/cider/__init__.py | 0 .../eval/cider/cider.py | 0 .../eval/cider_sample_evaluator.py | 2 +- .../eval/langkit_sample_evaluator.py | 0 .../lexical_diversity_sample_evaluator.py | 0 .../eval/ngram_sample_evaluator.py | 0 .../eval/perspective_sample_evaluator.py | 0 .../eval/presidio_sample_evaluator.py | 0 .../eval/task2vec/task2vec.py | 0 .../eval/task2vec/task_similarity.py | 0 .../eval/task2vec/utils.py | 0 .../eval/task2vec_dataset_evaluator.py | 4 +- .../eval/vendi_dataset_evaluator.py | 0 .../filter/__init__.py | 0 .../filter/blocklist/en.txt | 0 .../filter/blocklist/zh.txt | 0 .../filter/blocklist_filter.py | 2 +- .../filter/hash_deduplicate_filter.py | 0 .../filter/heuristics_filter.py | 0 .../filter/langkit_filter.py | 2 +- .../filter/language_filter.py | 0 .../filter/lexical_diversity_filter.py | 2 +- .../filter/llm_language_filter.py | 0 .../filter/minhash_deduplicate_filter.py | 0 .../filter/ngram_filter.py | 2 +- .../filter/ngramhash_deduplicate_filter.py | 0 .../filter/perspective_filter.py | 2 +- .../filter/presidio_filter.py | 2 +- .../filter/sem_deduplicate_filter.py | 0 .../filter/simhash_deduplicate_filter.py | 0 .../filter/word_number_filter.py | 0 .../refine/html_entity_refiner.py | 0 .../refine/html_url_remover_refiner.py | 0 .../refine/lowercase_refiner.py | 0 .../refine/ner_refiner.py | 0 .../refine/pii_anonymize_refiner.py | 0 .../refine/ref_removal_refiner.py | 0 .../refine/remove_contractions_refiner.py | 0 .../refine/remove_emoji_refiner.py | 0 .../refine/remove_emoticons_refiner.py | 0 .../refine/remove_extra_spaces_refiner.py | 0 .../refine/remove_image_ref_refiner.py | 0 .../refine/remove_number_refiner.py | 0 .../refine/remove_punctuation_refiner.py | 0 .../remove_repetitions_punctuation_refiner.py | 0 .../refine/remove_stopwords_refiner.py | 0 .../refine/spelling_correction_refiner.py | 0 .../refine/stemming_lemmatization_refiner.py | 0 .../refine/text_normalization_refiner.py | 0 dataflow/operators/text_pt/__init__.py | 32 ++++++++++ .../eval/Kenlm/model.py | 0 .../Qurating/modeling/modeling_flash_llama.py | 0 .../eval/Qurating/qurater_annotate.py | 0 dataflow/operators/text_pt/eval/__init__.py | 0 .../eval/debertav3_sample_evaluator.py | 0 .../eval/fineweb_edu_sample_evaluator.py | 0 .../eval/meta_sample_evaluator.py | 0 .../eval/pair_qual_sample_evaluator.py | 0 .../eval/perplexity_sample_evaluator.py | 2 +- .../eval/qurating_sample_evaluator.py | 4 +- .../eval/textbook_sample_evaluator.py | 0 dataflow/operators/text_pt/filter/__init__.py | 60 
+++++++++++++++++++ .../filter/ccnet_deduplicate_filter.py | 0 .../filter/debertav3_filter.py | 2 +- .../filter/fineweb_edu_filter.py | 2 +- .../filter/pair_qual_filter.py | 2 +- .../filter/perplexity_filter.py | 2 +- .../filter/qurating_filter.py | 2 +- .../filter/text_book_filter.py | 2 +- .../generate/phi4qa_generator.py | 0 .../{general_sft => text_sft}/__init__.py | 4 +- .../eval/Superfiltering/data_analysis.py | 0 .../eval/alpagasus_sample_evaluator.py | 0 .../eval/deita_complexity_sample_evaluator.py | 0 .../eval/deita_quality_sample_evaluator.py | 0 .../eval/instag_sample_evaluator.py | 0 .../eval/rm_sample_evaluator.py | 0 .../eval/superfiltering_sample_evaluator.py | 2 +- .../eval/treeinstruct_sample_evaluator.py | 0 .../filter/alpagasus_filter.py | 2 +- .../filter/deita_complexity_filter.py | 2 +- .../filter/deita_quality_filter.py | 2 +- .../filter/instag_filter.py | 2 +- .../filter/rm_filter.py | 2 +- .../filter/superfiltering_filter.py | 2 +- .../filter/treeinstruct_filter.py | 2 +- .../generate/condor_generator.py | 0 .../generate/sft_generator_from_seed.py | 0 .../refine/condor_refiner.py | 0 .../text_sft_synthesis_pipeline.py | 6 +- .../pipelines/cpu_pipelines/text_pt_filter.py | 4 +- .../pipelines/gpu_pipelines/text_pt_filter.py | 4 +- .../gpu_pipelines/text_pt_synthetic.py | 6 +- .../gpu_pipelines/text_sft_filter.py | 2 +- .../gpu_pipelines/text_sft_synthetic.py | 8 +-- dataflow/utils/registry.py | 5 +- test/test_autoop.py | 2 +- 104 files changed, 156 insertions(+), 80 deletions(-) rename dataflow/operators/{general_pt => general_text}/__init__.py (77%) rename dataflow/operators/{general_pt => general_text}/eval/__init__.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/bert_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/bleu/__init__.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/bleu/bleu.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/bleu_sample_evaluator.py (98%) rename dataflow/operators/{general_pt => general_text}/eval/cider/__init__.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/cider/cider.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/cider_sample_evaluator.py (98%) rename dataflow/operators/{general_pt => general_text}/eval/langkit_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/lexical_diversity_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/ngram_sample_evaluator.py (100%) rename dataflow/operators/{core_text => general_text}/eval/perspective_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => general_text}/eval/presidio_sample_evaluator.py (100%) rename dataflow/operators/{core_text => general_text}/eval/task2vec/task2vec.py (100%) rename dataflow/operators/{core_text => general_text}/eval/task2vec/task_similarity.py (100%) rename dataflow/operators/{core_text => general_text}/eval/task2vec/utils.py (100%) rename dataflow/operators/{core_text => general_text}/eval/task2vec_dataset_evaluator.py (97%) rename dataflow/operators/{core_text => general_text}/eval/vendi_dataset_evaluator.py (100%) rename dataflow/operators/{general_pt => general_text}/filter/__init__.py (100%) rename dataflow/operators/{core_text => general_text}/filter/blocklist/en.txt (100%) rename dataflow/operators/{core_text => general_text}/filter/blocklist/zh.txt (100%) rename dataflow/operators/{core_text => 
general_text}/filter/blocklist_filter.py (97%) rename dataflow/operators/{core_text => general_text}/filter/hash_deduplicate_filter.py (100%) rename dataflow/operators/{general_pt => general_text}/filter/heuristics_filter.py (100%) rename dataflow/operators/{general_pt => general_text}/filter/langkit_filter.py (99%) rename dataflow/operators/{core_text => general_text}/filter/language_filter.py (100%) rename dataflow/operators/{general_pt => general_text}/filter/lexical_diversity_filter.py (98%) rename dataflow/operators/{core_text => general_text}/filter/llm_language_filter.py (100%) rename dataflow/operators/{core_text => general_text}/filter/minhash_deduplicate_filter.py (100%) rename dataflow/operators/{general_pt => general_text}/filter/ngram_filter.py (97%) rename dataflow/operators/{core_text => general_text}/filter/ngramhash_deduplicate_filter.py (100%) rename dataflow/operators/{core_text => general_text}/filter/perspective_filter.py (97%) rename dataflow/operators/{general_pt => general_text}/filter/presidio_filter.py (98%) rename dataflow/operators/{core_text => general_text}/filter/sem_deduplicate_filter.py (100%) rename dataflow/operators/{core_text => general_text}/filter/simhash_deduplicate_filter.py (100%) rename dataflow/operators/{core_text => general_text}/filter/word_number_filter.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/html_entity_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/html_url_remover_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/lowercase_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/ner_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/pii_anonymize_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/ref_removal_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_contractions_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_emoji_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_emoticons_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_extra_spaces_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_image_ref_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_number_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_punctuation_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_repetitions_punctuation_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/remove_stopwords_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/spelling_correction_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/stemming_lemmatization_refiner.py (100%) rename dataflow/operators/{general_pt => general_text}/refine/text_normalization_refiner.py (100%) create mode 100644 dataflow/operators/text_pt/__init__.py rename dataflow/operators/{general_pt => text_pt}/eval/Kenlm/model.py (100%) rename dataflow/operators/{general_pt => text_pt}/eval/Qurating/modeling/modeling_flash_llama.py (100%) rename dataflow/operators/{general_pt => text_pt}/eval/Qurating/qurater_annotate.py (100%) create mode 100644 dataflow/operators/text_pt/eval/__init__.py rename dataflow/operators/{general_pt => text_pt}/eval/debertav3_sample_evaluator.py 
(100%) rename dataflow/operators/{general_pt => text_pt}/eval/fineweb_edu_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => text_pt}/eval/meta_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => text_pt}/eval/pair_qual_sample_evaluator.py (100%) rename dataflow/operators/{general_pt => text_pt}/eval/perplexity_sample_evaluator.py (98%) rename dataflow/operators/{general_pt => text_pt}/eval/qurating_sample_evaluator.py (97%) rename dataflow/operators/{general_pt => text_pt}/eval/textbook_sample_evaluator.py (100%) create mode 100644 dataflow/operators/text_pt/filter/__init__.py rename dataflow/operators/{general_pt => text_pt}/filter/ccnet_deduplicate_filter.py (100%) rename dataflow/operators/{general_pt => text_pt}/filter/debertav3_filter.py (98%) rename dataflow/operators/{general_pt => text_pt}/filter/fineweb_edu_filter.py (98%) rename dataflow/operators/{general_pt => text_pt}/filter/pair_qual_filter.py (98%) rename dataflow/operators/{general_pt => text_pt}/filter/perplexity_filter.py (97%) rename dataflow/operators/{general_pt => text_pt}/filter/qurating_filter.py (98%) rename dataflow/operators/{general_pt => text_pt}/filter/text_book_filter.py (98%) rename dataflow/operators/{general_pt => text_pt}/generate/phi4qa_generator.py (100%) rename dataflow/operators/{general_sft => text_sft}/__init__.py (94%) rename dataflow/operators/{general_sft => text_sft}/eval/Superfiltering/data_analysis.py (100%) rename dataflow/operators/{general_sft => text_sft}/eval/alpagasus_sample_evaluator.py (100%) rename dataflow/operators/{general_sft => text_sft}/eval/deita_complexity_sample_evaluator.py (100%) rename dataflow/operators/{general_sft => text_sft}/eval/deita_quality_sample_evaluator.py (100%) rename dataflow/operators/{general_sft => text_sft}/eval/instag_sample_evaluator.py (100%) rename dataflow/operators/{general_sft => text_sft}/eval/rm_sample_evaluator.py (100%) rename dataflow/operators/{general_sft => text_sft}/eval/superfiltering_sample_evaluator.py (97%) rename dataflow/operators/{general_sft => text_sft}/eval/treeinstruct_sample_evaluator.py (100%) rename dataflow/operators/{general_sft => text_sft}/filter/alpagasus_filter.py (98%) rename dataflow/operators/{general_sft => text_sft}/filter/deita_complexity_filter.py (98%) rename dataflow/operators/{general_sft => text_sft}/filter/deita_quality_filter.py (98%) rename dataflow/operators/{general_sft => text_sft}/filter/instag_filter.py (97%) rename dataflow/operators/{general_sft => text_sft}/filter/rm_filter.py (98%) rename dataflow/operators/{general_sft => text_sft}/filter/superfiltering_filter.py (98%) rename dataflow/operators/{general_sft => text_sft}/filter/treeinstruct_filter.py (98%) rename dataflow/operators/{general_sft => text_sft}/generate/condor_generator.py (100%) rename dataflow/operators/{general_sft => text_sft}/generate/sft_generator_from_seed.py (100%) rename dataflow/operators/{general_sft => text_sft}/refine/condor_refiner.py (100%) diff --git a/dataflow/operators/core_text/__init__.py b/dataflow/operators/core_text/__init__.py index 5494ff31..632484cd 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -6,24 +6,11 @@ from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator from .eval.prompted_eval import PromptedEvaluator - from .eval.perspective_sample_evaluator import PerspectiveSampleEvaluator - from .eval.task2vec_dataset_evaluator import Task2VecDatasetEvaluator - from 
.eval.vendi_dataset_evaluator import VendiDatasetEvaluator
+
     from .refine.prompted_refiner import PromptedRefiner
     from .filter.prompted_filter import PromptedFilter
     from .filter.general_filter import GeneralFilter
-    from .filter.minhash_deduplicate_filter import MinHashDeduplicateFilter
-    from .filter.ngramhash_deduplicate_filter import NgramHashDeduplicateFilter
-    from .filter.sem_deduplicate_filter import SemDeduplicateFilter
-    from .filter.simhash_deduplicate_filter import SimHashDeduplicateFilter
-    from .filter.hash_deduplicate_filter import HashDeduplicateFilter
-    from .filter.blocklist_filter import BlocklistFilter
-    from .filter.word_number_filter import WordNumberFilter
-    from .filter.perspective_filter import PerspectiveFilter
-    from .filter.language_filter import LanguageFilter
-    from .filter.llm_language_filter import LLMLanguageFilter
-
 else:
     import sys
     from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
diff --git a/dataflow/operators/general_pt/__init__.py b/dataflow/operators/general_text/__init__.py
similarity index 77%
rename from dataflow/operators/general_pt/__init__.py
rename to dataflow/operators/general_text/__init__.py
index 8592413f..180579f1 100644
--- a/dataflow/operators/general_pt/__init__.py
+++ b/dataflow/operators/general_text/__init__.py
@@ -2,9 +2,6 @@
 if TYPE_CHECKING:
     # filter
-    from .filter.ccnet_deduplicate_filter import CCNetDeduplicateFilter
-    from .filter.debertav3_filter import DebertaV3Filter
-    from .filter.fineweb_edu_filter import FineWebEduFilter
     from .filter.heuristics_filter import ColonEndFilter
     from .filter.heuristics_filter import SentenceNumberFilter
     from .filter.heuristics_filter import LineEndWithEllipsisFilter
@@ -28,14 +25,17 @@
     from .filter.langkit_filter import LangkitFilter
     from .filter.lexical_diversity_filter import LexicalDiversityFilter
     from .filter.ngram_filter import NgramFilter
-    from .filter.pair_qual_filter import PairQualFilter
-    from .filter.perplexity_filter import PerplexityFilter
     from .filter.presidio_filter import PresidioFilter
-    from .filter.qurating_filter import QuratingFilter
-    from .filter.text_book_filter import TextbookFilter
-
-    # generate
-    from .generate.phi4qa_generator import Phi4QAGenerator
+    from .filter.blocklist_filter import BlocklistFilter
+    from .filter.hash_deduplicate_filter import HashDeduplicateFilter
+    from .filter.language_filter import LanguageFilter
+    from .filter.llm_language_filter import LLMLanguageFilter
+    from .filter.minhash_deduplicate_filter import MinHashDeduplicateFilter
+    from .filter.ngramhash_deduplicate_filter import NgramHashDeduplicateFilter
+    from .filter.perspective_filter import PerspectiveFilter
+    from .filter.sem_deduplicate_filter import SemDeduplicateFilter
+    from .filter.simhash_deduplicate_filter import SimHashDeduplicateFilter
+    from .filter.word_number_filter import WordNumberFilter
 
     # refine
     from .refine.html_entity_refiner import HtmlEntityRefiner
@@ -61,23 +61,19 @@
     from .eval.ngram_sample_evaluator import NgramSampleEvaluator
     from .eval.lexical_diversity_sample_evaluator import LexicalDiversitySampleEvaluator
     from .eval.langkit_sample_evaluator import LangkitSampleEvaluator
-    from .eval.debertav3_sample_evaluator import DebertaV3SampleEvaluator
-    from .eval.fineweb_edu_sample_evaluator import FineWebEduSampleEvaluator
-    from .eval.pair_qual_sample_evaluator import PairQualSampleEvaluator
     from .eval.presidio_sample_evaluator import PresidioSampleEvaluator
-    from .eval.textbook_sample_evaluator import TextbookSampleEvaluator
-    from .eval.qurating_sample_evaluator import QuratingSampleEvaluator
-    from .eval.perplexity_sample_evaluator import PerplexitySampleEvaluator
-    from .eval.meta_sample_evaluator import MetaSampleEvaluator
     from .eval.bert_sample_evaluator import BertSampleEvaluator
     from .eval.bleu_sample_evaluator import BleuSampleEvaluator
     from .eval.cider_sample_evaluator import CiderSampleEvaluator
+    from .eval.perspective_sample_evaluator import PerspectiveSampleEvaluator
+    from .eval.task2vec_dataset_evaluator import Task2VecDatasetEvaluator
+    from .eval.vendi_dataset_evaluator import VendiDatasetEvaluator
 
 else:
     import sys
     from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
 
-    cur_path = "dataflow/operators/general_pt/"
+    cur_path = "dataflow/operators/general_text/"
 
     _import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
-    sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_pt/", _import_structure)
+    sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_text/", _import_structure)
diff --git a/dataflow/operators/general_pt/eval/__init__.py b/dataflow/operators/general_text/eval/__init__.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/__init__.py
rename to dataflow/operators/general_text/eval/__init__.py
diff --git a/dataflow/operators/general_pt/eval/bert_sample_evaluator.py b/dataflow/operators/general_text/eval/bert_sample_evaluator.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/bert_sample_evaluator.py
rename to dataflow/operators/general_text/eval/bert_sample_evaluator.py
diff --git a/dataflow/operators/general_pt/eval/bleu/__init__.py b/dataflow/operators/general_text/eval/bleu/__init__.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/bleu/__init__.py
rename to dataflow/operators/general_text/eval/bleu/__init__.py
diff --git a/dataflow/operators/general_pt/eval/bleu/bleu.py b/dataflow/operators/general_text/eval/bleu/bleu.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/bleu/bleu.py
rename to dataflow/operators/general_text/eval/bleu/bleu.py
diff --git a/dataflow/operators/general_pt/eval/bleu_sample_evaluator.py b/dataflow/operators/general_text/eval/bleu_sample_evaluator.py
similarity index 98%
rename from dataflow/operators/general_pt/eval/bleu_sample_evaluator.py
rename to dataflow/operators/general_text/eval/bleu_sample_evaluator.py
index 5b0930a7..c87a4a83 100644
--- a/dataflow/operators/general_pt/eval/bleu_sample_evaluator.py
+++ b/dataflow/operators/general_text/eval/bleu_sample_evaluator.py
@@ -2,7 +2,7 @@
 from dataflow.utils.storage import DataFlowStorage
 from dataflow.utils.registry import OPERATOR_REGISTRY
 from dataflow import get_logger
-from dataflow.operators.general_pt.eval.bleu.bleu import Bleu
+from dataflow.operators.general_text.eval.bleu.bleu import Bleu
 from tqdm import tqdm
 
 @OPERATOR_REGISTRY.register()
diff --git a/dataflow/operators/general_pt/eval/cider/__init__.py b/dataflow/operators/general_text/eval/cider/__init__.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/cider/__init__.py
rename to dataflow/operators/general_text/eval/cider/__init__.py
diff --git a/dataflow/operators/general_pt/eval/cider/cider.py b/dataflow/operators/general_text/eval/cider/cider.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/cider/cider.py
rename to dataflow/operators/general_text/eval/cider/cider.py
diff --git
a/dataflow/operators/general_pt/eval/cider_sample_evaluator.py b/dataflow/operators/general_text/eval/cider_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_pt/eval/cider_sample_evaluator.py rename to dataflow/operators/general_text/eval/cider_sample_evaluator.py index db3f5d76..e432c08c 100644 --- a/dataflow/operators/general_pt/eval/cider_sample_evaluator.py +++ b/dataflow/operators/general_text/eval/cider_sample_evaluator.py @@ -6,7 +6,7 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow import get_logger -from dataflow.operators.general_pt.eval.cider.cider import Cider +from dataflow.operators.general_text.eval.cider.cider import Cider def load_idf(idf_path): with open(idf_path, 'rb') as f: diff --git a/dataflow/operators/general_pt/eval/langkit_sample_evaluator.py b/dataflow/operators/general_text/eval/langkit_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/langkit_sample_evaluator.py rename to dataflow/operators/general_text/eval/langkit_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/lexical_diversity_sample_evaluator.py b/dataflow/operators/general_text/eval/lexical_diversity_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/lexical_diversity_sample_evaluator.py rename to dataflow/operators/general_text/eval/lexical_diversity_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/ngram_sample_evaluator.py b/dataflow/operators/general_text/eval/ngram_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/ngram_sample_evaluator.py rename to dataflow/operators/general_text/eval/ngram_sample_evaluator.py diff --git a/dataflow/operators/core_text/eval/perspective_sample_evaluator.py b/dataflow/operators/general_text/eval/perspective_sample_evaluator.py similarity index 100% rename from dataflow/operators/core_text/eval/perspective_sample_evaluator.py rename to dataflow/operators/general_text/eval/perspective_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/presidio_sample_evaluator.py b/dataflow/operators/general_text/eval/presidio_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/presidio_sample_evaluator.py rename to dataflow/operators/general_text/eval/presidio_sample_evaluator.py diff --git a/dataflow/operators/core_text/eval/task2vec/task2vec.py b/dataflow/operators/general_text/eval/task2vec/task2vec.py similarity index 100% rename from dataflow/operators/core_text/eval/task2vec/task2vec.py rename to dataflow/operators/general_text/eval/task2vec/task2vec.py diff --git a/dataflow/operators/core_text/eval/task2vec/task_similarity.py b/dataflow/operators/general_text/eval/task2vec/task_similarity.py similarity index 100% rename from dataflow/operators/core_text/eval/task2vec/task_similarity.py rename to dataflow/operators/general_text/eval/task2vec/task_similarity.py diff --git a/dataflow/operators/core_text/eval/task2vec/utils.py b/dataflow/operators/general_text/eval/task2vec/utils.py similarity index 100% rename from dataflow/operators/core_text/eval/task2vec/utils.py rename to dataflow/operators/general_text/eval/task2vec/utils.py diff --git a/dataflow/operators/core_text/eval/task2vec_dataset_evaluator.py b/dataflow/operators/general_text/eval/task2vec_dataset_evaluator.py similarity index 97% rename from dataflow/operators/core_text/eval/task2vec_dataset_evaluator.py rename to 
dataflow/operators/general_text/eval/task2vec_dataset_evaluator.py index 7f864c3d..5dbbe883 100644 --- a/dataflow/operators/core_text/eval/task2vec_dataset_evaluator.py +++ b/dataflow/operators/general_text/eval/task2vec_dataset_evaluator.py @@ -1,5 +1,5 @@ -from dataflow.operators.core_text.eval.task2vec.task2vec import Task2Vec -from dataflow.operators.core_text.eval.task2vec import task_similarity +from dataflow.operators.general_text.eval.task2vec.task2vec import Task2Vec +from dataflow.operators.general_text.eval.task2vec import task_similarity import torch import random from transformers import GPT2Tokenizer, GPT2LMHeadModel diff --git a/dataflow/operators/core_text/eval/vendi_dataset_evaluator.py b/dataflow/operators/general_text/eval/vendi_dataset_evaluator.py similarity index 100% rename from dataflow/operators/core_text/eval/vendi_dataset_evaluator.py rename to dataflow/operators/general_text/eval/vendi_dataset_evaluator.py diff --git a/dataflow/operators/general_pt/filter/__init__.py b/dataflow/operators/general_text/filter/__init__.py similarity index 100% rename from dataflow/operators/general_pt/filter/__init__.py rename to dataflow/operators/general_text/filter/__init__.py diff --git a/dataflow/operators/core_text/filter/blocklist/en.txt b/dataflow/operators/general_text/filter/blocklist/en.txt similarity index 100% rename from dataflow/operators/core_text/filter/blocklist/en.txt rename to dataflow/operators/general_text/filter/blocklist/en.txt diff --git a/dataflow/operators/core_text/filter/blocklist/zh.txt b/dataflow/operators/general_text/filter/blocklist/zh.txt similarity index 100% rename from dataflow/operators/core_text/filter/blocklist/zh.txt rename to dataflow/operators/general_text/filter/blocklist/zh.txt diff --git a/dataflow/operators/core_text/filter/blocklist_filter.py b/dataflow/operators/general_text/filter/blocklist_filter.py similarity index 97% rename from dataflow/operators/core_text/filter/blocklist_filter.py rename to dataflow/operators/general_text/filter/blocklist_filter.py index 51f74dfb..495b3403 100644 --- a/dataflow/operators/core_text/filter/blocklist_filter.py +++ b/dataflow/operators/general_text/filter/blocklist_filter.py @@ -53,7 +53,7 @@ def get_desc(lang: str = "zh"): def load_blocklist(self): dataflow_dir = DataFlowPath.get_dataflow_dir() - file_path = f"{dataflow_dir}/operators/core_text/filter/blocklist/{self.language}.txt" + file_path = f"{dataflow_dir}/operators/general_text/filter/blocklist/{self.language}.txt" self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...") with open(file_path, 'r', encoding='utf-8') as file: blocklist = set(line.strip().lower() for line in file if line.strip()) diff --git a/dataflow/operators/core_text/filter/hash_deduplicate_filter.py b/dataflow/operators/general_text/filter/hash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/hash_deduplicate_filter.py rename to dataflow/operators/general_text/filter/hash_deduplicate_filter.py diff --git a/dataflow/operators/general_pt/filter/heuristics_filter.py b/dataflow/operators/general_text/filter/heuristics_filter.py similarity index 100% rename from dataflow/operators/general_pt/filter/heuristics_filter.py rename to dataflow/operators/general_text/filter/heuristics_filter.py diff --git a/dataflow/operators/general_pt/filter/langkit_filter.py b/dataflow/operators/general_text/filter/langkit_filter.py similarity index 99% rename from dataflow/operators/general_pt/filter/langkit_filter.py 
rename to dataflow/operators/general_text/filter/langkit_filter.py index 939957f8..22c0ec0c 100644 --- a/dataflow/operators/general_pt/filter/langkit_filter.py +++ b/dataflow/operators/general_text/filter/langkit_filter.py @@ -4,7 +4,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import LangkitSampleEvaluator +from dataflow.operators.general_text import LangkitSampleEvaluator @OPERATOR_REGISTRY.register() class LangkitFilter(OperatorABC): diff --git a/dataflow/operators/core_text/filter/language_filter.py b/dataflow/operators/general_text/filter/language_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/language_filter.py rename to dataflow/operators/general_text/filter/language_filter.py diff --git a/dataflow/operators/general_pt/filter/lexical_diversity_filter.py b/dataflow/operators/general_text/filter/lexical_diversity_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/lexical_diversity_filter.py rename to dataflow/operators/general_text/filter/lexical_diversity_filter.py index bd65dee8..faa87b6c 100644 --- a/dataflow/operators/general_pt/filter/lexical_diversity_filter.py +++ b/dataflow/operators/general_text/filter/lexical_diversity_filter.py @@ -4,7 +4,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import LexicalDiversitySampleEvaluator +from dataflow.operators.general_text import LexicalDiversitySampleEvaluator @OPERATOR_REGISTRY.register() class LexicalDiversityFilter(OperatorABC): diff --git a/dataflow/operators/core_text/filter/llm_language_filter.py b/dataflow/operators/general_text/filter/llm_language_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/llm_language_filter.py rename to dataflow/operators/general_text/filter/llm_language_filter.py diff --git a/dataflow/operators/core_text/filter/minhash_deduplicate_filter.py b/dataflow/operators/general_text/filter/minhash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/minhash_deduplicate_filter.py rename to dataflow/operators/general_text/filter/minhash_deduplicate_filter.py diff --git a/dataflow/operators/general_pt/filter/ngram_filter.py b/dataflow/operators/general_text/filter/ngram_filter.py similarity index 97% rename from dataflow/operators/general_pt/filter/ngram_filter.py rename to dataflow/operators/general_text/filter/ngram_filter.py index f332b06e..047bebf1 100644 --- a/dataflow/operators/general_pt/filter/ngram_filter.py +++ b/dataflow/operators/general_text/filter/ngram_filter.py @@ -2,7 +2,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import NgramSampleEvaluator +from dataflow.operators.general_text import NgramSampleEvaluator @OPERATOR_REGISTRY.register() class NgramFilter(OperatorABC): diff --git a/dataflow/operators/core_text/filter/ngramhash_deduplicate_filter.py b/dataflow/operators/general_text/filter/ngramhash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/ngramhash_deduplicate_filter.py rename to dataflow/operators/general_text/filter/ngramhash_deduplicate_filter.py diff --git 
a/dataflow/operators/core_text/filter/perspective_filter.py b/dataflow/operators/general_text/filter/perspective_filter.py similarity index 97% rename from dataflow/operators/core_text/filter/perspective_filter.py rename to dataflow/operators/general_text/filter/perspective_filter.py index 72083839..97f8d027 100644 --- a/dataflow/operators/core_text/filter/perspective_filter.py +++ b/dataflow/operators/general_text/filter/perspective_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.core_text import PerspectiveSampleEvaluator +from dataflow.operators.general_text import PerspectiveSampleEvaluator from dataflow.serving import PerspectiveAPIServing @OPERATOR_REGISTRY.register() diff --git a/dataflow/operators/general_pt/filter/presidio_filter.py b/dataflow/operators/general_text/filter/presidio_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/presidio_filter.py rename to dataflow/operators/general_text/filter/presidio_filter.py index 29159c18..1febf290 100644 --- a/dataflow/operators/general_pt/filter/presidio_filter.py +++ b/dataflow/operators/general_text/filter/presidio_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import PresidioSampleEvaluator +from dataflow.operators.general_text import PresidioSampleEvaluator @OPERATOR_REGISTRY.register() class PresidioFilter(OperatorABC): diff --git a/dataflow/operators/core_text/filter/sem_deduplicate_filter.py b/dataflow/operators/general_text/filter/sem_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/sem_deduplicate_filter.py rename to dataflow/operators/general_text/filter/sem_deduplicate_filter.py diff --git a/dataflow/operators/core_text/filter/simhash_deduplicate_filter.py b/dataflow/operators/general_text/filter/simhash_deduplicate_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/simhash_deduplicate_filter.py rename to dataflow/operators/general_text/filter/simhash_deduplicate_filter.py diff --git a/dataflow/operators/core_text/filter/word_number_filter.py b/dataflow/operators/general_text/filter/word_number_filter.py similarity index 100% rename from dataflow/operators/core_text/filter/word_number_filter.py rename to dataflow/operators/general_text/filter/word_number_filter.py diff --git a/dataflow/operators/general_pt/refine/html_entity_refiner.py b/dataflow/operators/general_text/refine/html_entity_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/html_entity_refiner.py rename to dataflow/operators/general_text/refine/html_entity_refiner.py diff --git a/dataflow/operators/general_pt/refine/html_url_remover_refiner.py b/dataflow/operators/general_text/refine/html_url_remover_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/html_url_remover_refiner.py rename to dataflow/operators/general_text/refine/html_url_remover_refiner.py diff --git a/dataflow/operators/general_pt/refine/lowercase_refiner.py b/dataflow/operators/general_text/refine/lowercase_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/lowercase_refiner.py rename to dataflow/operators/general_text/refine/lowercase_refiner.py diff --git 
a/dataflow/operators/general_pt/refine/ner_refiner.py b/dataflow/operators/general_text/refine/ner_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/ner_refiner.py rename to dataflow/operators/general_text/refine/ner_refiner.py diff --git a/dataflow/operators/general_pt/refine/pii_anonymize_refiner.py b/dataflow/operators/general_text/refine/pii_anonymize_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/pii_anonymize_refiner.py rename to dataflow/operators/general_text/refine/pii_anonymize_refiner.py diff --git a/dataflow/operators/general_pt/refine/ref_removal_refiner.py b/dataflow/operators/general_text/refine/ref_removal_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/ref_removal_refiner.py rename to dataflow/operators/general_text/refine/ref_removal_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_contractions_refiner.py b/dataflow/operators/general_text/refine/remove_contractions_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_contractions_refiner.py rename to dataflow/operators/general_text/refine/remove_contractions_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_emoji_refiner.py b/dataflow/operators/general_text/refine/remove_emoji_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_emoji_refiner.py rename to dataflow/operators/general_text/refine/remove_emoji_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_emoticons_refiner.py b/dataflow/operators/general_text/refine/remove_emoticons_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_emoticons_refiner.py rename to dataflow/operators/general_text/refine/remove_emoticons_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_extra_spaces_refiner.py b/dataflow/operators/general_text/refine/remove_extra_spaces_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_extra_spaces_refiner.py rename to dataflow/operators/general_text/refine/remove_extra_spaces_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_image_ref_refiner.py b/dataflow/operators/general_text/refine/remove_image_ref_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_image_ref_refiner.py rename to dataflow/operators/general_text/refine/remove_image_ref_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_number_refiner.py b/dataflow/operators/general_text/refine/remove_number_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_number_refiner.py rename to dataflow/operators/general_text/refine/remove_number_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_punctuation_refiner.py b/dataflow/operators/general_text/refine/remove_punctuation_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_punctuation_refiner.py rename to dataflow/operators/general_text/refine/remove_punctuation_refiner.py diff --git a/dataflow/operators/general_pt/refine/remove_repetitions_punctuation_refiner.py b/dataflow/operators/general_text/refine/remove_repetitions_punctuation_refiner.py similarity index 100% rename from dataflow/operators/general_pt/refine/remove_repetitions_punctuation_refiner.py rename to dataflow/operators/general_text/refine/remove_repetitions_punctuation_refiner.py diff --git 
a/dataflow/operators/general_pt/refine/remove_stopwords_refiner.py b/dataflow/operators/general_text/refine/remove_stopwords_refiner.py
similarity index 100%
rename from dataflow/operators/general_pt/refine/remove_stopwords_refiner.py
rename to dataflow/operators/general_text/refine/remove_stopwords_refiner.py
diff --git a/dataflow/operators/general_pt/refine/spelling_correction_refiner.py b/dataflow/operators/general_text/refine/spelling_correction_refiner.py
similarity index 100%
rename from dataflow/operators/general_pt/refine/spelling_correction_refiner.py
rename to dataflow/operators/general_text/refine/spelling_correction_refiner.py
diff --git a/dataflow/operators/general_pt/refine/stemming_lemmatization_refiner.py b/dataflow/operators/general_text/refine/stemming_lemmatization_refiner.py
similarity index 100%
rename from dataflow/operators/general_pt/refine/stemming_lemmatization_refiner.py
rename to dataflow/operators/general_text/refine/stemming_lemmatization_refiner.py
diff --git a/dataflow/operators/general_pt/refine/text_normalization_refiner.py b/dataflow/operators/general_text/refine/text_normalization_refiner.py
similarity index 100%
rename from dataflow/operators/general_pt/refine/text_normalization_refiner.py
rename to dataflow/operators/general_text/refine/text_normalization_refiner.py
diff --git a/dataflow/operators/text_pt/__init__.py b/dataflow/operators/text_pt/__init__.py
new file mode 100644
index 00000000..d2f7b8b1
--- /dev/null
+++ b/dataflow/operators/text_pt/__init__.py
@@ -0,0 +1,32 @@
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    # filter
+    from .filter.ccnet_deduplicate_filter import CCNetDeduplicateFilter
+    from .filter.debertav3_filter import DebertaV3Filter
+    from .filter.fineweb_edu_filter import FineWebEduFilter
+    from .filter.pair_qual_filter import PairQualFilter
+    from .filter.perplexity_filter import PerplexityFilter
+    from .filter.qurating_filter import QuratingFilter
+    from .filter.text_book_filter import TextbookFilter
+
+    # generate
+    from .generate.phi4qa_generator import Phi4QAGenerator
+
+    # eval
+    from .eval.debertav3_sample_evaluator import DebertaV3SampleEvaluator
+    from .eval.fineweb_edu_sample_evaluator import FineWebEduSampleEvaluator
+    from .eval.pair_qual_sample_evaluator import PairQualSampleEvaluator
+    from .eval.textbook_sample_evaluator import TextbookSampleEvaluator
+    from .eval.qurating_sample_evaluator import QuratingSampleEvaluator
+    from .eval.perplexity_sample_evaluator import PerplexitySampleEvaluator
+    from .eval.meta_sample_evaluator import MetaSampleEvaluator
+
+else:
+    import sys
+    from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
+
+    cur_path = "dataflow/operators/text_pt/"
+
+    _import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
+    sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/text_pt/", _import_structure)
diff --git a/dataflow/operators/general_pt/eval/Kenlm/model.py b/dataflow/operators/text_pt/eval/Kenlm/model.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/Kenlm/model.py
rename to dataflow/operators/text_pt/eval/Kenlm/model.py
diff --git a/dataflow/operators/general_pt/eval/Qurating/modeling/modeling_flash_llama.py b/dataflow/operators/text_pt/eval/Qurating/modeling/modeling_flash_llama.py
similarity index 100%
rename from dataflow/operators/general_pt/eval/Qurating/modeling/modeling_flash_llama.py
rename to dataflow/operators/text_pt/eval/Qurating/modeling/modeling_flash_llama.py
diff
--git a/dataflow/operators/general_pt/eval/Qurating/qurater_annotate.py b/dataflow/operators/text_pt/eval/Qurating/qurater_annotate.py similarity index 100% rename from dataflow/operators/general_pt/eval/Qurating/qurater_annotate.py rename to dataflow/operators/text_pt/eval/Qurating/qurater_annotate.py diff --git a/dataflow/operators/text_pt/eval/__init__.py b/dataflow/operators/text_pt/eval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dataflow/operators/general_pt/eval/debertav3_sample_evaluator.py b/dataflow/operators/text_pt/eval/debertav3_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/debertav3_sample_evaluator.py rename to dataflow/operators/text_pt/eval/debertav3_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/fineweb_edu_sample_evaluator.py b/dataflow/operators/text_pt/eval/fineweb_edu_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/fineweb_edu_sample_evaluator.py rename to dataflow/operators/text_pt/eval/fineweb_edu_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/meta_sample_evaluator.py b/dataflow/operators/text_pt/eval/meta_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/meta_sample_evaluator.py rename to dataflow/operators/text_pt/eval/meta_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/pair_qual_sample_evaluator.py b/dataflow/operators/text_pt/eval/pair_qual_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/pair_qual_sample_evaluator.py rename to dataflow/operators/text_pt/eval/pair_qual_sample_evaluator.py diff --git a/dataflow/operators/general_pt/eval/perplexity_sample_evaluator.py b/dataflow/operators/text_pt/eval/perplexity_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_pt/eval/perplexity_sample_evaluator.py rename to dataflow/operators/text_pt/eval/perplexity_sample_evaluator.py index b009ee5e..aa7fed98 100644 --- a/dataflow/operators/general_pt/eval/perplexity_sample_evaluator.py +++ b/dataflow/operators/text_pt/eval/perplexity_sample_evaluator.py @@ -1,5 +1,5 @@ from dataflow.core import OperatorABC -from dataflow.operators.general_pt.eval.Kenlm.model import KenlmModel +from dataflow.operators.text_pt.eval.Kenlm.model import KenlmModel from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.storage import DataFlowStorage from dataflow.utils.utils import get_logger diff --git a/dataflow/operators/general_pt/eval/qurating_sample_evaluator.py b/dataflow/operators/text_pt/eval/qurating_sample_evaluator.py similarity index 97% rename from dataflow/operators/general_pt/eval/qurating_sample_evaluator.py rename to dataflow/operators/text_pt/eval/qurating_sample_evaluator.py index 69dc525e..3802ef61 100644 --- a/dataflow/operators/general_pt/eval/qurating_sample_evaluator.py +++ b/dataflow/operators/text_pt/eval/qurating_sample_evaluator.py @@ -4,8 +4,8 @@ from datasets import Dataset from tqdm import tqdm from dataflow import get_logger -from dataflow.operators.general_pt.eval.Qurating.qurater_annotate import ModelAnnotator -from dataflow.operators.general_pt.eval.Qurating.qurater_annotate import TokenizeAndChunk +from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import ModelAnnotator +from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import TokenizeAndChunk import torch @OPERATOR_REGISTRY.register() diff --git 
a/dataflow/operators/general_pt/eval/textbook_sample_evaluator.py b/dataflow/operators/text_pt/eval/textbook_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_pt/eval/textbook_sample_evaluator.py rename to dataflow/operators/text_pt/eval/textbook_sample_evaluator.py diff --git a/dataflow/operators/text_pt/filter/__init__.py b/dataflow/operators/text_pt/filter/__init__.py new file mode 100644 index 00000000..12beb159 --- /dev/null +++ b/dataflow/operators/text_pt/filter/__init__.py @@ -0,0 +1,60 @@ +# import sys +# from dataflow.utils.registry import LazyLoader + +# cur_path = "dataflow/operators/filter/" + +# _import_structure = { +# # Primary filters +# "NgramFilter": (cur_path + "ngram_filter.py", "NgramFilter"), +# "LanguageFilter": (cur_path + "language_filter.py", "LanguageFilter"), +# "DeitaQualityFilter": (cur_path + "deita_quality_filter.py", "DeitaQualityFilter"), +# "DeitaComplexityFilter": (cur_path + "deita_complexity_filter.py", "DeitaComplexityFilter"), +# "InstagFilter": (cur_path + "instag_filter.py", "InstagFilter"), +# "PairQualFilter": (cur_path + "pair_qual_filter.py", "PairQualFilter"), +# "QuratingFilter": (cur_path + "qurating_filter.py", "QuratingFilter"), +# "SuperfilteringFilter": (cur_path + "superfiltering_filter.py", "SuperfilteringFilter"), +# "FineWebEduFilter": (cur_path + "fineweb_edu_filter.py", "FineWebEduFilter"), +# "TextbookFilter": (cur_path + "text_book_filter.py", "TextbookFilter"), +# "AlpagasusFilter": (cur_path + "alpagasus_filter.py", "AlpagasusFilter"), +# "DebertaV3Filter": (cur_path + "debertav3_filter.py", "DebertaV3Filter"), +# "LangkitFilter": (cur_path + "langkit_filter.py", "LangkitFilter"), +# "LexicalDiversityFilter": (cur_path + "lexical_diversity_filter.py", "LexicalDiversityFilter"), +# "PerplexityFilter": (cur_path + "perplexity_filter.py", "PerplexityFilter"), +# "PerspectiveFilter": (cur_path + "perspective_filter.py", "PerspectiveFilter"), +# "PresidioFilter": (cur_path + "presidio_filter.py", "PresidioFilter"), +# "RMFilter": (cur_path + "reward_model_filter.py", "RMFilter"), +# "TreeinstructFilter": (cur_path + "treeinstruct_filter.py", "TreeinstructFilter"), + +# # Heuristic filters +# "ColonEndFilter": (cur_path + "heuristics.py", "ColonEndFilter"), +# "WordNumberFilter": (cur_path + "heuristics.py", "WordNumberFilter"), +# "BlocklistFilter": (cur_path + "heuristics.py", "BlocklistFilter"), +# "SentenceNumberFilter": (cur_path + "heuristics.py", "SentenceNumberFilter"), +# "LineEndWithEllipsisFilter": (cur_path + "heuristics.py", "LineEndWithEllipsisFilter"), +# "ContentNullFilter": (cur_path + "heuristics.py", "ContentNullFilter"), +# "MeanWordLengthFilter": (cur_path + "heuristics.py", "MeanWordLengthFilter"), +# "SymbolWordRatioFilter": (cur_path + "heuristics.py", "SymbolWordRatioFilter"), +# "HtmlEntityFilter": (cur_path + "heuristics.py", "HtmlEntityFilter"), +# "IDCardFilter": (cur_path + "heuristics.py", "IDCardFilter"), +# "NoPuncFilter": (cur_path + "heuristics.py", "NoPuncFilter"), +# "SpecialCharacterFilter": (cur_path + "heuristics.py", "SpecialCharacterFilter"), +# "WatermarkFilter": (cur_path + "heuristics.py", "WatermarkFilter"), +# "StopWordFilter": (cur_path + "heuristics.py", "StopWordFilter"), +# "CurlyBracketFilter": (cur_path + "heuristics.py", "CurlyBracketFilter"), +# "CapitalWordsFilter": (cur_path + "heuristics.py", "CapitalWordsFilter"), +# "LoremIpsumFilter": (cur_path + "heuristics.py", "LoremIpsumFilter"), +# "UniqueWordsFilter": (cur_path + "heuristics.py", 
"UniqueWordsFilter"), +# "CharNumberFilter": (cur_path + "heuristics.py", "CharNumberFilter"), +# "LineStartWithBulletpointFilter": (cur_path + "heuristics.py", "LineStartWithBulletpointFilter"), +# "LineWithJavascriptFilter": (cur_path + "heuristics.py", "LineWithJavascriptFilter"), + +# # Deduplicators +# "MinHashDeduplicator": (cur_path + "minhash_deduplicator.py", "MinHashDeduplicator"), +# "CCNetDeduplicator": (cur_path + "ccnet_deduplicator.py", "CCNetDeduplicator"), +# "HashDeduplicator": (cur_path + "hash_deduplicator.py", "HashDeduplicator"), +# "NgramHashDeduplicator": (cur_path + "ngramhash_deduplicator.py", "NgramHashDeduplicator"), +# "SemDeduplicator": (cur_path + "sem_deduplicator.py", "SemDeduplicator"), +# "SimHashDeduplicator": (cur_path + "simhash_deduplicator.py", "SimHashDeduplicator"), +# } + +# sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure) \ No newline at end of file diff --git a/dataflow/operators/general_pt/filter/ccnet_deduplicate_filter.py b/dataflow/operators/text_pt/filter/ccnet_deduplicate_filter.py similarity index 100% rename from dataflow/operators/general_pt/filter/ccnet_deduplicate_filter.py rename to dataflow/operators/text_pt/filter/ccnet_deduplicate_filter.py diff --git a/dataflow/operators/general_pt/filter/debertav3_filter.py b/dataflow/operators/text_pt/filter/debertav3_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/debertav3_filter.py rename to dataflow/operators/text_pt/filter/debertav3_filter.py index 864e3417..344e00bb 100644 --- a/dataflow/operators/general_pt/filter/debertav3_filter.py +++ b/dataflow/operators/text_pt/filter/debertav3_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import DebertaV3SampleEvaluator +from dataflow.operators.text_pt import DebertaV3SampleEvaluator @OPERATOR_REGISTRY.register() class DebertaV3Filter(OperatorABC): diff --git a/dataflow/operators/general_pt/filter/fineweb_edu_filter.py b/dataflow/operators/text_pt/filter/fineweb_edu_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/fineweb_edu_filter.py rename to dataflow/operators/text_pt/filter/fineweb_edu_filter.py index 990cad73..20e05d02 100644 --- a/dataflow/operators/general_pt/filter/fineweb_edu_filter.py +++ b/dataflow/operators/text_pt/filter/fineweb_edu_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_pt import FineWebEduSampleEvaluator +from dataflow.operators.text_pt import FineWebEduSampleEvaluator from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_pt/filter/pair_qual_filter.py b/dataflow/operators/text_pt/filter/pair_qual_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/pair_qual_filter.py rename to dataflow/operators/text_pt/filter/pair_qual_filter.py index 0bca6b3b..56515af7 100644 --- a/dataflow/operators/general_pt/filter/pair_qual_filter.py +++ b/dataflow/operators/text_pt/filter/pair_qual_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_pt import PairQualSampleEvaluator +from dataflow.operators.text_pt import PairQualSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_pt/filter/perplexity_filter.py 
b/dataflow/operators/text_pt/filter/perplexity_filter.py similarity index 97% rename from dataflow/operators/general_pt/filter/perplexity_filter.py rename to dataflow/operators/text_pt/filter/perplexity_filter.py index f0269183..b6469968 100644 --- a/dataflow/operators/general_pt/filter/perplexity_filter.py +++ b/dataflow/operators/text_pt/filter/perplexity_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import PerplexitySampleEvaluator +from dataflow.operators.text_pt import PerplexitySampleEvaluator @OPERATOR_REGISTRY.register() class PerplexityFilter(OperatorABC): diff --git a/dataflow/operators/general_pt/filter/qurating_filter.py b/dataflow/operators/text_pt/filter/qurating_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/qurating_filter.py rename to dataflow/operators/text_pt/filter/qurating_filter.py index 6fc9483b..c468e7ba 100644 --- a/dataflow/operators/general_pt/filter/qurating_filter.py +++ b/dataflow/operators/text_pt/filter/qurating_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_pt import QuratingSampleEvaluator +from dataflow.operators.text_pt import QuratingSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_pt/filter/text_book_filter.py b/dataflow/operators/text_pt/filter/text_book_filter.py similarity index 98% rename from dataflow/operators/general_pt/filter/text_book_filter.py rename to dataflow/operators/text_pt/filter/text_book_filter.py index 4ec1c9a0..68c18e6b 100644 --- a/dataflow/operators/general_pt/filter/text_book_filter.py +++ b/dataflow/operators/text_pt/filter/text_book_filter.py @@ -2,7 +2,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_pt import TextbookSampleEvaluator +from dataflow.operators.text_pt import TextbookSampleEvaluator @OPERATOR_REGISTRY.register() class TextbookFilter(OperatorABC): diff --git a/dataflow/operators/general_pt/generate/phi4qa_generator.py b/dataflow/operators/text_pt/generate/phi4qa_generator.py similarity index 100% rename from dataflow/operators/general_pt/generate/phi4qa_generator.py rename to dataflow/operators/text_pt/generate/phi4qa_generator.py diff --git a/dataflow/operators/general_sft/__init__.py b/dataflow/operators/text_sft/__init__.py similarity index 94% rename from dataflow/operators/general_sft/__init__.py rename to dataflow/operators/text_sft/__init__.py index 861640f4..5a5da160 100644 --- a/dataflow/operators/general_sft/__init__.py +++ b/dataflow/operators/text_sft/__init__.py @@ -27,7 +27,7 @@ import sys from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking - cur_path = "dataflow/operators/general_sft/" + cur_path = "dataflow/operators/text_sft/" _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) - sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_sft/", _import_structure) + sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/text_sft/", _import_structure) diff --git a/dataflow/operators/general_sft/eval/Superfiltering/data_analysis.py b/dataflow/operators/text_sft/eval/Superfiltering/data_analysis.py similarity index 100% rename from 
dataflow/operators/general_sft/eval/Superfiltering/data_analysis.py rename to dataflow/operators/text_sft/eval/Superfiltering/data_analysis.py diff --git a/dataflow/operators/general_sft/eval/alpagasus_sample_evaluator.py b/dataflow/operators/text_sft/eval/alpagasus_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/alpagasus_sample_evaluator.py rename to dataflow/operators/text_sft/eval/alpagasus_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/deita_complexity_sample_evaluator.py b/dataflow/operators/text_sft/eval/deita_complexity_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/deita_complexity_sample_evaluator.py rename to dataflow/operators/text_sft/eval/deita_complexity_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/deita_quality_sample_evaluator.py b/dataflow/operators/text_sft/eval/deita_quality_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/deita_quality_sample_evaluator.py rename to dataflow/operators/text_sft/eval/deita_quality_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/instag_sample_evaluator.py b/dataflow/operators/text_sft/eval/instag_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/instag_sample_evaluator.py rename to dataflow/operators/text_sft/eval/instag_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/rm_sample_evaluator.py b/dataflow/operators/text_sft/eval/rm_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/rm_sample_evaluator.py rename to dataflow/operators/text_sft/eval/rm_sample_evaluator.py diff --git a/dataflow/operators/general_sft/eval/superfiltering_sample_evaluator.py b/dataflow/operators/text_sft/eval/superfiltering_sample_evaluator.py similarity index 97% rename from dataflow/operators/general_sft/eval/superfiltering_sample_evaluator.py rename to dataflow/operators/text_sft/eval/superfiltering_sample_evaluator.py index 50069ac0..db6a816d 100644 --- a/dataflow/operators/general_sft/eval/superfiltering_sample_evaluator.py +++ b/dataflow/operators/text_sft/eval/superfiltering_sample_evaluator.py @@ -1,6 +1,6 @@ from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_sft.eval.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text +from dataflow.operators.text_sft.eval.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text from transformers import AutoTokenizer, AutoModelForCausalLM from tqdm import tqdm import torch diff --git a/dataflow/operators/general_sft/eval/treeinstruct_sample_evaluator.py b/dataflow/operators/text_sft/eval/treeinstruct_sample_evaluator.py similarity index 100% rename from dataflow/operators/general_sft/eval/treeinstruct_sample_evaluator.py rename to dataflow/operators/text_sft/eval/treeinstruct_sample_evaluator.py diff --git a/dataflow/operators/general_sft/filter/alpagasus_filter.py b/dataflow/operators/text_sft/filter/alpagasus_filter.py similarity index 98% rename from dataflow/operators/general_sft/filter/alpagasus_filter.py rename to dataflow/operators/text_sft/filter/alpagasus_filter.py index 8bd5215b..75fb49bb 100644 --- a/dataflow/operators/general_sft/filter/alpagasus_filter.py +++ b/dataflow/operators/text_sft/filter/alpagasus_filter.py @@ -3,7 +3,7 @@ 
from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.core import LLMServingABC -from dataflow.operators.general_sft import AlpagasusSampleEvaluator +from dataflow.operators.text_sft import AlpagasusSampleEvaluator @OPERATOR_REGISTRY.register() class AlpagasusFilter(OperatorABC): diff --git a/dataflow/operators/general_sft/filter/deita_complexity_filter.py b/dataflow/operators/text_sft/filter/deita_complexity_filter.py similarity index 98% rename from dataflow/operators/general_sft/filter/deita_complexity_filter.py rename to dataflow/operators/text_sft/filter/deita_complexity_filter.py index 9dc9fa0f..529af761 100644 --- a/dataflow/operators/general_sft/filter/deita_complexity_filter.py +++ b/dataflow/operators/text_sft/filter/deita_complexity_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_sft import DeitaComplexitySampleEvaluator +from dataflow.operators.text_sft import DeitaComplexitySampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_sft/filter/deita_quality_filter.py b/dataflow/operators/text_sft/filter/deita_quality_filter.py similarity index 98% rename from dataflow/operators/general_sft/filter/deita_quality_filter.py rename to dataflow/operators/text_sft/filter/deita_quality_filter.py index fe72f991..c7d36124 100644 --- a/dataflow/operators/general_sft/filter/deita_quality_filter.py +++ b/dataflow/operators/text_sft/filter/deita_quality_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_sft import DeitaQualitySampleEvaluator +from dataflow.operators.text_sft import DeitaQualitySampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_sft/filter/instag_filter.py b/dataflow/operators/text_sft/filter/instag_filter.py similarity index 97% rename from dataflow/operators/general_sft/filter/instag_filter.py rename to dataflow/operators/text_sft/filter/instag_filter.py index 35355d05..cf4c1052 100644 --- a/dataflow/operators/general_sft/filter/instag_filter.py +++ b/dataflow/operators/text_sft/filter/instag_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_sft import InstagSampleEvaluator +from dataflow.operators.text_sft import InstagSampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_sft/filter/rm_filter.py b/dataflow/operators/text_sft/filter/rm_filter.py similarity index 98% rename from dataflow/operators/general_sft/filter/rm_filter.py rename to dataflow/operators/text_sft/filter/rm_filter.py index c1592416..f36bf6b1 100644 --- a/dataflow/operators/general_sft/filter/rm_filter.py +++ b/dataflow/operators/text_sft/filter/rm_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_sft import RMSampleEvaluator +from dataflow.operators.text_sft import RMSampleEvaluator @OPERATOR_REGISTRY.register() class RMFilter(OperatorABC): diff --git a/dataflow/operators/general_sft/filter/superfiltering_filter.py b/dataflow/operators/text_sft/filter/superfiltering_filter.py similarity index 98% rename from dataflow/operators/general_sft/filter/superfiltering_filter.py rename to 
dataflow/operators/text_sft/filter/superfiltering_filter.py index 59165abd..a33c21cd 100644 --- a/dataflow/operators/general_sft/filter/superfiltering_filter.py +++ b/dataflow/operators/text_sft/filter/superfiltering_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_sft import SuperfilteringSampleEvaluator +from dataflow.operators.text_sft import SuperfilteringSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY diff --git a/dataflow/operators/general_sft/filter/treeinstruct_filter.py b/dataflow/operators/text_sft/filter/treeinstruct_filter.py similarity index 98% rename from dataflow/operators/general_sft/filter/treeinstruct_filter.py rename to dataflow/operators/text_sft/filter/treeinstruct_filter.py index a45ce61a..ee40cb8a 100644 --- a/dataflow/operators/general_sft/filter/treeinstruct_filter.py +++ b/dataflow/operators/text_sft/filter/treeinstruct_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC, LLMServingABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_sft import TreeinstructSampleEvaluator +from dataflow.operators.text_sft import TreeinstructSampleEvaluator @OPERATOR_REGISTRY.register() class TreeinstructFilter(OperatorABC): diff --git a/dataflow/operators/general_sft/generate/condor_generator.py b/dataflow/operators/text_sft/generate/condor_generator.py similarity index 100% rename from dataflow/operators/general_sft/generate/condor_generator.py rename to dataflow/operators/text_sft/generate/condor_generator.py diff --git a/dataflow/operators/general_sft/generate/sft_generator_from_seed.py b/dataflow/operators/text_sft/generate/sft_generator_from_seed.py similarity index 100% rename from dataflow/operators/general_sft/generate/sft_generator_from_seed.py rename to dataflow/operators/text_sft/generate/sft_generator_from_seed.py diff --git a/dataflow/operators/general_sft/refine/condor_refiner.py b/dataflow/operators/text_sft/refine/condor_refiner.py similarity index 100% rename from dataflow/operators/general_sft/refine/condor_refiner.py rename to dataflow/operators/text_sft/refine/condor_refiner.py diff --git a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py index 75f16d39..42f8d00c 100644 --- a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py +++ b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py @@ -1,7 +1,7 @@ -from dataflow.operators.general_sft import AlpagasusFilter -from dataflow.operators.general_sft import CondorGenerator -from dataflow.operators.general_sft import CondorRefiner +from dataflow.operators.text_sft import AlpagasusFilter +from dataflow.operators.text_sft import CondorGenerator +from dataflow.operators.text_sft import CondorRefiner from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request diff --git a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py index d60b524f..96a3d036 100644 --- a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py @@ -3,7 +3,7 @@ BlocklistFilter, MinHashDeduplicateFilter ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( MetaSampleEvaluator, ColonEndFilter, SentenceNumberFilter, @@ -24,7 
+24,7 @@ LineStartWithBulletpointFilter, LineWithJavascriptFilter ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py index 36cb4de4..241a2865 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py @@ -4,7 +4,7 @@ WordNumberFilter, BlocklistFilter, ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -25,7 +25,7 @@ LineWithJavascriptFilter, PairQualFilter ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py index cb78149f..7c2e2be6 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py @@ -4,7 +4,7 @@ WordNumberFilter, BlocklistFilter, ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -26,12 +26,12 @@ PairQualFilter, QuratingFilter ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner ) -from dataflow.operators.general_pt import Phi4QAGenerator +from dataflow.operators.text_pt import Phi4QAGenerator from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py index 1137e52b..d4bf3f14 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py @@ -1,7 +1,7 @@ from dataflow.operators.core_text import ( WordNumberFilter, ) -from dataflow.operators.general_sft import ( +from dataflow.operators.text_sft import ( SuperfilteringFilter, DeitaQualityFilter, InstagFilter diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py index 64376ae7..c5e7112e 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py @@ -4,7 +4,7 @@ WordNumberFilter, BlocklistFilter, ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -25,17 +25,17 @@ LineWithJavascriptFilter, PairQualFilter, ) -from dataflow.operators.general_sft import ( +from dataflow.operators.text_sft import ( SuperfilteringFilter, DeitaQualityFilter, InstagFilter, ) -from dataflow.operators.general_pt import ( +from dataflow.operators.text_pt import ( HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner ) -from dataflow.operators.general_sft import SFTGeneratorSeed +from dataflow.operators.text_sft import SFTGeneratorSeed from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import 
FileStorage diff --git a/dataflow/utils/registry.py b/dataflow/utils/registry.py index b62fa040..deab8f93 100644 --- a/dataflow/utils/registry.py +++ b/dataflow/utils/registry.py @@ -206,8 +206,9 @@ def get_type_of_operator(self): 'core_vision', 'db', 'knowledge_cleaning', - 'general_pt', - 'general_sft', + 'general_text', + 'text_pt', + 'text_sft', 'rare', 'reasoning', 'text2sql' diff --git a/test/test_autoop.py b/test/test_autoop.py index 088c2446..e4abe5a7 100644 --- a/test/test_autoop.py +++ b/test/test_autoop.py @@ -2,7 +2,7 @@ from dataflow.operators.core_text import ( LLMLanguageFilter, ) -from dataflow.operators.general_pt import MetaSampleEvaluator +from dataflow.operators.text_pt import MetaSampleEvaluator from dataflow.operators.core_text import PromptedGenerator from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm, LocalHostLLMAPIServing_vllm from dataflow.utils.storage import FileStorage From 351dd3b58a2ed5af31b927ad7bbed8fb43c85792 Mon Sep 17 00:00:00 2001 From: MOLYHECI Date: Thu, 11 Sep 2025 08:45:46 +0800 Subject: [PATCH 4/4] [debug] fix bugs for test and statics --- .../pipelines/cpu_pipelines/text_pt_filter.py | 14 ++++++-------- .../pipelines/cpu_pipelines/text_sft_filter.py | 2 +- .../pipelines/gpu_pipelines/text_pt_filter.py | 12 +++++------- .../pipelines/gpu_pipelines/text_pt_synthetic.py | 15 +++++++-------- .../pipelines/gpu_pipelines/text_sft_filter.py | 2 +- .../pipelines/gpu_pipelines/text_sft_synthetic.py | 14 ++++++-------- test/test_autoop.py | 2 +- 7 files changed, 27 insertions(+), 34 deletions(-) diff --git a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py index 96a3d036..c19147c9 100644 --- a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py @@ -1,10 +1,7 @@ -from dataflow.operators.core_text import ( +from dataflow.operators.general_text import ( WordNumberFilter, BlocklistFilter, - MinHashDeduplicateFilter -) -from dataflow.operators.text_pt import ( - MetaSampleEvaluator, + MinHashDeduplicateFilter, ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -22,13 +19,14 @@ UniqueWordsFilter, CharNumberFilter, LineStartWithBulletpointFilter, - LineWithJavascriptFilter -) -from dataflow.operators.text_pt import ( + LineWithJavascriptFilter, HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner ) +from dataflow.operators.text_pt import ( + MetaSampleEvaluator, +) from dataflow.utils.storage import FileStorage diff --git a/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py b/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py index 0abf069d..da23e5ad 100644 --- a/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py +++ b/dataflow/statics/pipelines/cpu_pipelines/text_sft_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.core_text import WordNumberFilter +from dataflow.operators.general_text import WordNumberFilter from dataflow.utils.storage import FileStorage diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py index 241a2865..28754614 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py @@ -1,10 +1,8 @@ -from dataflow.operators.core_text import ( +from dataflow.operators.general_text import ( MinHashDeduplicateFilter, LanguageFilter, WordNumberFilter, BlocklistFilter, -) -from 
dataflow.operators.text_pt import ( ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -23,12 +21,12 @@ CharNumberFilter, LineStartWithBulletpointFilter, LineWithJavascriptFilter, - PairQualFilter + RemoveExtraSpacesRefiner, + RemoveEmojiRefiner, + HtmlUrlRemoverRefiner, ) from dataflow.operators.text_pt import ( - HtmlUrlRemoverRefiner, - RemoveEmojiRefiner, - RemoveExtraSpacesRefiner + PairQualFilter ) from dataflow.utils.storage import FileStorage diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py index 7c2e2be6..923b2732 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py @@ -1,10 +1,8 @@ -from dataflow.operators.core_text import ( +from dataflow.operators.general_text import ( MinHashDeduplicateFilter, LanguageFilter, WordNumberFilter, BlocklistFilter, -) -from dataflow.operators.text_pt import ( ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -23,14 +21,15 @@ CharNumberFilter, LineStartWithBulletpointFilter, LineWithJavascriptFilter, - PairQualFilter, - QuratingFilter + RemoveExtraSpacesRefiner, + RemoveEmojiRefiner, + HtmlUrlRemoverRefiner, ) from dataflow.operators.text_pt import ( - HtmlUrlRemoverRefiner, - RemoveEmojiRefiner, - RemoveExtraSpacesRefiner + PairQualFilter, + QuratingFilter ) + from dataflow.operators.text_pt import Phi4QAGenerator from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py index d4bf3f14..1761c29e 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.core_text import ( +from dataflow.operators.general_text import ( WordNumberFilter, ) from dataflow.operators.text_sft import ( diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py index c5e7112e..c5a55f33 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py @@ -1,10 +1,11 @@ -from dataflow.operators.core_text import ( +from dataflow.operators.general_text import ( MinHashDeduplicateFilter, LanguageFilter, WordNumberFilter, BlocklistFilter, -) -from dataflow.operators.text_pt import ( + HtmlUrlRemoverRefiner, + RemoveEmojiRefiner, + RemoveExtraSpacesRefiner, ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, @@ -23,6 +24,8 @@ CharNumberFilter, LineStartWithBulletpointFilter, LineWithJavascriptFilter, +) +from dataflow.operators.text_pt import ( PairQualFilter, ) from dataflow.operators.text_sft import ( @@ -30,11 +33,6 @@ DeitaQualityFilter, InstagFilter, ) -from dataflow.operators.text_pt import ( - HtmlUrlRemoverRefiner, - RemoveEmojiRefiner, - RemoveExtraSpacesRefiner -) from dataflow.operators.text_sft import SFTGeneratorSeed from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage diff --git a/test/test_autoop.py b/test/test_autoop.py index e4abe5a7..2cefefa0 100644 --- a/test/test_autoop.py +++ b/test/test_autoop.py @@ -1,5 +1,5 @@ from dataflow.pipeline import PipelineABC -from 
dataflow.operators.core_text import ( +from dataflow.operators.general_text import ( LLMLanguageFilter, ) from dataflow.operators.text_pt import MetaSampleEvaluator