diff --git a/dataflow/operators/core_text/__init__.py b/dataflow/operators/core_text/__init__.py index 20075855..632484cd 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -4,9 +4,13 @@ from .generate.prompted_generator import PromptedGenerator from .generate.paired_prompted_generator import PairedPromptedGenerator from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator + from .eval.prompted_eval import PromptedEvaluator - from .filter.prompted_filter import PromptedFilter + from .refine.prompted_refiner import PromptedRefiner + + from .filter.prompted_filter import PromptedFilter + from .filter.general_filter import GeneralFilter else: import sys from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking diff --git a/dataflow/operators/general_text/filter/general_filter.py b/dataflow/operators/core_text/filter/general_filter.py similarity index 100% rename from dataflow/operators/general_text/filter/general_filter.py rename to dataflow/operators/core_text/filter/general_filter.py diff --git a/dataflow/operators/general_text/__init__.py b/dataflow/operators/general_text/__init__.py index 7ec7ebee..180579f1 100644 --- a/dataflow/operators/general_text/__init__.py +++ b/dataflow/operators/general_text/__init__.py @@ -2,63 +2,42 @@ if TYPE_CHECKING: # filter - from .filter.alpagasus_filter import AlpagasusFilter - from .filter.ccnet_deduplicator import CCNetDeduplicator - from .filter.debertav3_filter import DebertaV3Filter - from .filter.deita_complexity_filter import DeitaComplexityFilter - from .filter.deita_quality_filter import DeitaQualityFilter - from .filter.fineweb_edu_filter import FineWebEduFilter - from .filter.general_filter import GeneralFilter - from .filter.hash_deduplicator import HashDeduplicator - from .filter.heuristics import ColonEndFilter - from .filter.heuristics import WordNumberFilter - from .filter.heuristics import 
SentenceNumberFilter - from .filter.heuristics import LineEndWithEllipsisFilter - from .filter.heuristics import ContentNullFilter - from .filter.heuristics import SymbolWordRatioFilter - from .filter.heuristics import AlphaWordsFilter - from .filter.heuristics import HtmlEntityFilter - from .filter.heuristics import IDCardFilter - from .filter.heuristics import NoPuncFilter - from .filter.heuristics import SpecialCharacterFilter - from .filter.heuristics import WatermarkFilter - from .filter.heuristics import MeanWordLengthFilter - from .filter.heuristics import StopWordFilter - from .filter.heuristics import CurlyBracketFilter - from .filter.heuristics import CapitalWordsFilter - from .filter.heuristics import LoremIpsumFilter - from .filter.heuristics import UniqueWordsFilter - from .filter.heuristics import CharNumberFilter - from .filter.heuristics import LineStartWithBulletpointFilter - from .filter.heuristics import LineWithJavascriptFilter - from .filter.heuristics import BlocklistFilter - from .filter.instag_filter import InstagFilter + from .filter.heuristics_filter import ColonEndFilter + from .filter.heuristics_filter import SentenceNumberFilter + from .filter.heuristics_filter import LineEndWithEllipsisFilter + from .filter.heuristics_filter import ContentNullFilter + from .filter.heuristics_filter import SymbolWordRatioFilter + from .filter.heuristics_filter import AlphaWordsFilter + from .filter.heuristics_filter import HtmlEntityFilter + from .filter.heuristics_filter import IDCardFilter + from .filter.heuristics_filter import NoPuncFilter + from .filter.heuristics_filter import SpecialCharacterFilter + from .filter.heuristics_filter import WatermarkFilter + from .filter.heuristics_filter import MeanWordLengthFilter + from .filter.heuristics_filter import StopWordFilter + from .filter.heuristics_filter import CurlyBracketFilter + from .filter.heuristics_filter import CapitalWordsFilter + from .filter.heuristics_filter import LoremIpsumFilter + from 
.filter.heuristics_filter import UniqueWordsFilter + from .filter.heuristics_filter import CharNumberFilter + from .filter.heuristics_filter import LineStartWithBulletpointFilter + from .filter.heuristics_filter import LineWithJavascriptFilter from .filter.langkit_filter import LangkitFilter - from .filter.language_filter import LanguageFilter from .filter.lexical_diversity_filter import LexicalDiversityFilter - from .filter.llm_language_filter import LLMLanguageFilter - from .filter.minhash_deduplicator import MinHashDeduplicator from .filter.ngram_filter import NgramFilter - from .filter.ngramhash_deduplicator import NgramHashDeduplicator - from .filter.pair_qual_filter import PairQualFilter - from .filter.perplexity_filter import PerplexityFilter - from .filter.perspective_filter import PerspectiveFilter from .filter.presidio_filter import PresidioFilter - from .filter.qurating_filter import QuratingFilter - from .filter.reward_model_filter import RMFilter - from .filter.sem_deduplicator import SemDeduplicator - from .filter.simhash_deduplicator import SimHashDeduplicator - from .filter.superfiltering_filter import SuperfilteringFilter - from .filter.text_book_filter import TextbookFilter - from .filter.treeinstruct_filter import TreeinstructFilter - - # generate - from .generate.condor_generator import CondorGenerator - from .generate.pretrain_generator import PretrainGenerator - from .generate.sft_generator_from_seed import SFTGeneratorSeed + from .filter.blocklist_filter import BlocklistFilter + from .filter.hash_deduplicate_filter import HashDeduplicateFilter + from .filter.language_filter import LanguageFilter + from .filter.llm_language_filter import LLMLanguageFilter + from .filter.minhash_deduplicate_filter import MinHashDeduplicateFilter + from .filter.ngramhash_deduplicate_filter import NgramHashDeduplicateFilter + from .filter.perspective_filter import PerspectiveFilter + from .filter.sem_deduplicate_filter import SemDeduplicateFilter + from 
.filter.simhash_deduplicate_filter import SimHashDeduplicateFilter + from .filter.word_number_filter import WordNumberFilter # refine - from .refine.condor_refiner import CondorRefiner from .refine.html_entity_refiner import HtmlEntityRefiner from .refine.html_url_remover_refiner import HtmlUrlRemoverRefiner from .refine.lowercase_refiner import LowercaseRefiner @@ -79,42 +58,22 @@ from .refine.text_normalization_refiner import TextNormalizationRefiner # eval - from .eval.statistics.ngram_scorer import NgramScorer - from .eval.statistics.lexical_diversity_scorer import LexicalDiversityScorer - from .eval.statistics.langkit_scorer import LangkitScorer + from .eval.ngram_sample_evaluator import NgramSampleEvaluator + from .eval.lexical_diversity_sample_evaluator import LexicalDiversitySampleEvaluator + from .eval.langkit_sample_evaluator import LangkitSampleEvaluator + from .eval.presidio_sample_evaluator import PresidioSampleEvaluator + from .eval.bert_sample_evaluator import BertSampleEvaluator + from .eval.bleu_sample_evaluator import BleuSampleEvaluator + from .eval.cider_sample_evaluator import CiderSampleEvaluator + from .eval.perspective_sample_evaluator import PerspectiveSampleEvaluator + from .eval.task2vec_dataset_evaluator import Task2VecDatasetEvaluator + from .eval.vendi_dataset_evaluator import VendiDatasetEvaluator - from .eval.models.deita_quality_scorer import DeitaQualityScorer - from .eval.models.instag_scorer import InstagScorer - from .eval.models.debertav3_scorer import DebertaV3Scorer - from .eval.models.deita_complexity_scorer import DeitaComplexityScorer - from .eval.models.fineweb_edu_scorer import FineWebEduScorer - from .eval.models.pair_qual_scorer import PairQualScorer - from .eval.models.presidio_scorer import PresidioScorer - from .eval.models.rm_scorer import RMScorer - from .eval.models.textbook_scorer import TextbookScorer - from .eval.models.superfiltering_scorer import SuperfilteringScorer - from .eval.models.qurating_scorer 
import QuratingScorer - from .eval.models.perplexity_scorer import PerplexityScorer - - from .eval.APIcaller.alpagasus_scorer import AlpagasusScorer - from .eval.APIcaller.treeinstruct_scorer import TreeinstructScorer - from .eval.APIcaller.perspective_scorer import PerspectiveScorer - from .eval.APIcaller.meta_scorer import MetaScorer - - from .eval.diversity.vendi_scorer import VendiScorer - from .eval.diversity.task2vec_scorer import Task2VecScorer - - from .eval.gen.bert_scorer import BERTScorer - from .eval.gen.bleu_scorer import BleuScorer - from .eval.gen.cider_scorer import CiderScorer else: import sys from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking - cur_path = "dataflow/operators/general_text/" _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_text/", _import_structure) diff --git a/dataflow/operators/general_text/eval/gen/bleu/__init__.py b/dataflow/operators/general_text/eval/__init__.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/bleu/__init__.py rename to dataflow/operators/general_text/eval/__init__.py diff --git a/dataflow/operators/general_text/eval/gen/bert_scorer.py b/dataflow/operators/general_text/eval/bert_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/gen/bert_scorer.py rename to dataflow/operators/general_text/eval/bert_sample_evaluator.py index 2880b727..d40f4b5d 100644 --- a/dataflow/operators/general_text/eval/gen/bert_scorer.py +++ b/dataflow/operators/general_text/eval/bert_sample_evaluator.py @@ -5,7 +5,7 @@ import evaluate @OPERATOR_REGISTRY.register() -class BERTScorer(OperatorABC): +class BertSampleEvaluator(OperatorABC): def __init__(self, lang='en', model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git 
a/dataflow/operators/general_text/eval/gen/cider/__init__.py b/dataflow/operators/general_text/eval/bleu/__init__.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/cider/__init__.py rename to dataflow/operators/general_text/eval/bleu/__init__.py diff --git a/dataflow/operators/general_text/eval/gen/bleu/bleu.py b/dataflow/operators/general_text/eval/bleu/bleu.py similarity index 100% rename from dataflow/operators/general_text/eval/gen/bleu/bleu.py rename to dataflow/operators/general_text/eval/bleu/bleu.py diff --git a/dataflow/operators/general_text/eval/gen/bleu_scorer.py b/dataflow/operators/general_text/eval/bleu_sample_evaluator.py similarity index 97% rename from dataflow/operators/general_text/eval/gen/bleu_scorer.py rename to dataflow/operators/general_text/eval/bleu_sample_evaluator.py index 9f92ffcf..c87a4a83 100644 --- a/dataflow/operators/general_text/eval/gen/bleu_scorer.py +++ b/dataflow/operators/general_text/eval/bleu_sample_evaluator.py @@ -2,11 +2,11 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow import get_logger -from dataflow.operators.general_text.eval.gen.bleu.bleu import Bleu +from dataflow.operators.general_text.eval.bleu.bleu import Bleu from tqdm import tqdm @OPERATOR_REGISTRY.register() -class BleuScorer(OperatorABC): +class BleuSampleEvaluator(OperatorABC): def __init__(self, n=4, eff="average", special_reflen=None): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/statistics/__init__.py b/dataflow/operators/general_text/eval/cider/__init__.py similarity index 100% rename from dataflow/operators/general_text/eval/statistics/__init__.py rename to dataflow/operators/general_text/eval/cider/__init__.py diff --git a/dataflow/operators/general_text/eval/gen/cider/cider.py b/dataflow/operators/general_text/eval/cider/cider.py similarity index 100% 
rename from dataflow/operators/general_text/eval/gen/cider/cider.py rename to dataflow/operators/general_text/eval/cider/cider.py diff --git a/dataflow/operators/general_text/eval/gen/cider_scorer.py b/dataflow/operators/general_text/eval/cider_sample_evaluator.py similarity index 95% rename from dataflow/operators/general_text/eval/gen/cider_scorer.py rename to dataflow/operators/general_text/eval/cider_sample_evaluator.py index 9a59690d..e432c08c 100644 --- a/dataflow/operators/general_text/eval/gen/cider_scorer.py +++ b/dataflow/operators/general_text/eval/cider_sample_evaluator.py @@ -6,7 +6,7 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow import get_logger -from dataflow.operators.general_text.eval.gen.cider.cider import Cider +from dataflow.operators.general_text.eval.cider.cider import Cider def load_idf(idf_path): with open(idf_path, 'rb') as f: @@ -14,8 +14,8 @@ def load_idf(idf_path): return idf @OPERATOR_REGISTRY.register() -class CiderScorer(OperatorABC): - def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/eval/GeneralText/gen/cider/coco-val-df.p"): +class CiderSampleEvaluator(OperatorABC): + def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/general_text/eval/cider/coco-val-df.p"): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') self.score_name = 'CiderScore' diff --git a/dataflow/operators/general_text/eval/statistics/langkit_scorer.py b/dataflow/operators/general_text/eval/langkit_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/statistics/langkit_scorer.py rename to dataflow/operators/general_text/eval/langkit_sample_evaluator.py index 2e73b6a3..17ed87d6 100644 --- a/dataflow/operators/general_text/eval/statistics/langkit_scorer.py +++ b/dataflow/operators/general_text/eval/langkit_sample_evaluator.py @@ -7,7 +7,7 @@ 
from tqdm import tqdm @OPERATOR_REGISTRY.register() -class LangkitScorer(OperatorABC): +class LangkitSampleEvaluator(OperatorABC): def __init__(self): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') @@ -63,7 +63,7 @@ def _score_func(self, sample): def eval(self, dataframe, input_key): scores_list = [] self.logger.info(f"Evaluating {self.score_name}...") - for sample in tqdm(dataframe[input_key], desc="LangkitScorer Evaluating..."): + for sample in tqdm(dataframe[input_key], desc="LangkitScore Evaluating..."): scores = self._score_func(sample) scores_list.append(scores) self.logger.info("Evaluation complete!") diff --git a/dataflow/operators/general_text/eval/statistics/lexical_diversity_scorer.py b/dataflow/operators/general_text/eval/lexical_diversity_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/statistics/lexical_diversity_scorer.py rename to dataflow/operators/general_text/eval/lexical_diversity_sample_evaluator.py index 4124cab4..a77b064d 100644 --- a/dataflow/operators/general_text/eval/statistics/lexical_diversity_scorer.py +++ b/dataflow/operators/general_text/eval/lexical_diversity_sample_evaluator.py @@ -90,7 +90,7 @@ def hdd(word_array, sample_size=42.0): @OPERATOR_REGISTRY.register() -class LexicalDiversityScorer(OperatorABC): +class LexicalDiversitySampleEvaluator(OperatorABC): def __init__(self): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') @@ -145,7 +145,7 @@ def _score_func(self, sample): def eval(self, dataframe, input_key): scores_list = [] self.logger.info(f"Evaluating {self.score_name}...") - for sample in tqdm(dataframe[input_key], desc="LexicalDiversityScorer Evaluating..."): + for sample in tqdm(dataframe[input_key], desc="LexicalDiversityScore Evaluating..."): scores = self._score_func(sample) scores_list.append(scores) self.logger.info("Evaluation complete!") diff --git 
a/dataflow/operators/general_text/eval/statistics/ngram_scorer.py b/dataflow/operators/general_text/eval/ngram_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/statistics/ngram_scorer.py rename to dataflow/operators/general_text/eval/ngram_sample_evaluator.py index 6dc602ab..434ec30d 100644 --- a/dataflow/operators/general_text/eval/statistics/ngram_scorer.py +++ b/dataflow/operators/general_text/eval/ngram_sample_evaluator.py @@ -7,7 +7,7 @@ from dataflow import get_logger @OPERATOR_REGISTRY.register() -class NgramScorer(OperatorABC): +class NgramSampleEvaluator(OperatorABC): def __init__(self, ngrams=5): self.logger = get_logger() diff --git a/dataflow/operators/general_text/eval/APIcaller/perspective_scorer.py b/dataflow/operators/general_text/eval/perspective_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/perspective_scorer.py rename to dataflow/operators/general_text/eval/perspective_sample_evaluator.py index 9748cef9..44bd44a1 100644 --- a/dataflow/operators/general_text/eval/APIcaller/perspective_scorer.py +++ b/dataflow/operators/general_text/eval/perspective_sample_evaluator.py @@ -8,7 +8,7 @@ from dataflow.serving import PerspectiveAPIServing @OPERATOR_REGISTRY.register() -class PerspectiveScorer(OperatorABC): +class PerspectiveSampleEvaluator(OperatorABC): """Operator that assigns Perspective API toxicity scores to text inputs.""" def __init__(self, serving: PerspectiveAPIServing = None): self.logger = get_logger() diff --git a/dataflow/operators/general_text/eval/models/presidio_scorer.py b/dataflow/operators/general_text/eval/presidio_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/presidio_scorer.py rename to dataflow/operators/general_text/eval/presidio_sample_evaluator.py index 9afca545..ed9b5830 100644 --- a/dataflow/operators/general_text/eval/models/presidio_scorer.py +++ 
b/dataflow/operators/general_text/eval/presidio_sample_evaluator.py @@ -9,7 +9,7 @@ # Presidio PII detection Scorer @OPERATOR_REGISTRY.register() -class PresidioScorer(OperatorABC): +class PresidioSampleEvaluator(OperatorABC): def __init__(self, device='cuda', lang='en', model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/diversity/task2vec/task2vec.py b/dataflow/operators/general_text/eval/task2vec/task2vec.py similarity index 100% rename from dataflow/operators/general_text/eval/diversity/task2vec/task2vec.py rename to dataflow/operators/general_text/eval/task2vec/task2vec.py diff --git a/dataflow/operators/general_text/eval/diversity/task2vec/task_similarity.py b/dataflow/operators/general_text/eval/task2vec/task_similarity.py similarity index 100% rename from dataflow/operators/general_text/eval/diversity/task2vec/task_similarity.py rename to dataflow/operators/general_text/eval/task2vec/task_similarity.py diff --git a/dataflow/operators/general_text/eval/diversity/task2vec/utils.py b/dataflow/operators/general_text/eval/task2vec/utils.py similarity index 100% rename from dataflow/operators/general_text/eval/diversity/task2vec/utils.py rename to dataflow/operators/general_text/eval/task2vec/utils.py diff --git a/dataflow/operators/general_text/eval/diversity/task2vec_scorer.py b/dataflow/operators/general_text/eval/task2vec_dataset_evaluator.py similarity index 96% rename from dataflow/operators/general_text/eval/diversity/task2vec_scorer.py rename to dataflow/operators/general_text/eval/task2vec_dataset_evaluator.py index 54d4e925..5dbbe883 100644 --- a/dataflow/operators/general_text/eval/diversity/task2vec_scorer.py +++ b/dataflow/operators/general_text/eval/task2vec_dataset_evaluator.py @@ -1,5 +1,5 @@ -from dataflow.operators.general_text.eval.diversity.task2vec.task2vec import Task2Vec -from 
dataflow.operators.general_text.eval.diversity.task2vec import task_similarity +from dataflow.operators.general_text.eval.task2vec.task2vec import Task2Vec +from dataflow.operators.general_text.eval.task2vec import task_similarity import torch import random from transformers import GPT2Tokenizer, GPT2LMHeadModel @@ -12,7 +12,7 @@ # Task2Vec dataset diversity evaluation # Cited from: Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data @OPERATOR_REGISTRY.register() -class Task2VecScorer(OperatorABC): +class Task2VecDatasetEvaluator(OperatorABC): def __init__(self, device='cuda', sample_nums=10, sample_size=1, method: Optional[str]='montecarlo', model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/diversity/vendi_scorer.py b/dataflow/operators/general_text/eval/vendi_dataset_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/diversity/vendi_scorer.py rename to dataflow/operators/general_text/eval/vendi_dataset_evaluator.py index aacd3169..c0d18ad5 100644 --- a/dataflow/operators/general_text/eval/diversity/vendi_scorer.py +++ b/dataflow/operators/general_text/eval/vendi_dataset_evaluator.py @@ -8,7 +8,7 @@ # VendiScore dataset diversity evaluation # Cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning @OPERATOR_REGISTRY.register() -class VendiScorer(OperatorABC): +class VendiDatasetEvaluator(OperatorABC): def __init__(self, device='cuda'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/filter/blocklist_filter.py b/dataflow/operators/general_text/filter/blocklist_filter.py new file mode 100644 index 00000000..495b3403 --- /dev/null +++ b/dataflow/operators/general_text/filter/blocklist_filter.py @@ -0,0 +1,82 @@ +from tqdm import 
tqdm +import numpy as np +from dataflow import get_logger +from dataflow.core import OperatorABC +from dataflow.utils.storage import DataFlowStorage +from dataflow.utils.registry import OPERATOR_REGISTRY +from dataflow.cli_funcs.paths import DataFlowPath +from nltk.tokenize import word_tokenize + +@OPERATOR_REGISTRY.register() +class BlocklistFilter(OperatorABC): + + def __init__(self, language:str = 'en', threshold:int = 1, use_tokenizer:bool = False): + self.logger = get_logger() + self.language = language + self.threshold = threshold + self.use_tokenizer = use_tokenizer + self.logger.info(f"Initializing {self.__class__.__name__} with language = {self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...") + self.blocklist = self.load_blocklist() + + @staticmethod + def get_desc(lang: str = "zh"): + if lang == "zh": + return ( + "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n" + "输入参数:\n" + "- input_key:输入文本字段名,默认为'text'\n" + "- language:语言代码,默认为'zh'\n" + "- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n" + "- threshold:匹配次数阈值,默认为1\n" + "- use_tokenizer:是否使用分词器,默认为True\n" + "- tokenizer:分词器对象,默认为None\n" + "输出参数:\n" + "- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n" + "- 返回包含输入字段名的列表,用于后续算子引用" + ) + elif lang == "en": + return ( + "This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n" + "Input Parameters:\n" + "- input_key: Input text field name, default is 'text'\n" + "- language: Language code, default is 'zh'\n" + "- blocklist_dir: Blocklist file directory, default is './blocklists/'\n" + "- threshold: Matching count threshold, default is 1\n" + "- use_tokenizer: Whether to use tokenizer, default is True\n" + "- tokenizer: Tokenizer object, default is None\n" + "Output Parameters:\n" + "- Filtered DataFrame containing only rows without blocklist keywords\n" + "- List containing input field name for subsequent operator reference" + ) + else: + return "BlocklistFilter uses 
language-specific blocklists with optional tokenizer integration." + + def load_blocklist(self): + dataflow_dir = DataFlowPath.get_dataflow_dir() + file_path = f"{dataflow_dir}/operators/general_text/filter/blocklist/{self.language}.txt" + self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...") + with open(file_path, 'r', encoding='utf-8') as file: + blocklist = set(line.strip().lower() for line in file if line.strip()) + self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.") + return blocklist + + def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'): + self.input_key = input_key + self.output_key = output_key + dataframe = storage.read("dataframe") + self.logger.info(f"Running {self.__class__.__name__}...") + valid_checks = [] + for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): + if text: + if self.use_tokenizer: + text = word_tokenize(text.lower()) + else: + text = text.lower().split() + blocklist_count = sum(1 for word in text if word in self.blocklist) + valid_checks.append(blocklist_count <= self.threshold) + valid_checks = np.array(valid_checks, dtype=int) + dataframe[self.output_key] = valid_checks + filtered_dataframe = dataframe[valid_checks == 1] + storage.write(filtered_dataframe) + self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") + return [self.output_key] diff --git a/dataflow/operators/general_text/filter/hash_deduplicator.py b/dataflow/operators/general_text/filter/hash_deduplicate_filter.py similarity index 99% rename from dataflow/operators/general_text/filter/hash_deduplicator.py rename to dataflow/operators/general_text/filter/hash_deduplicate_filter.py index 14d78f20..85354db9 100644 --- a/dataflow/operators/general_text/filter/hash_deduplicator.py +++ b/dataflow/operators/general_text/filter/hash_deduplicate_filter.py @@ -7,7 +7,7 @@ from dataflow.utils.registry import OPERATOR_REGISTRY @OPERATOR_REGISTRY.register() -class HashDeduplicator(OperatorABC): +class HashDeduplicateFilter(OperatorABC): def __init__(self, hash_func: str = 'md5'): self.logger = get_logger() self.hash_func = hash_func diff --git a/dataflow/operators/general_text/filter/heuristics.py b/dataflow/operators/general_text/filter/heuristics_filter.py similarity index 90% rename from dataflow/operators/general_text/filter/heuristics.py rename to dataflow/operators/general_text/filter/heuristics_filter.py index 5c6500db..09b5f3b8 100644 --- a/dataflow/operators/general_text/filter/heuristics.py +++ b/dataflow/operators/general_text/filter/heuristics_filter.py @@ -5,7 +5,6 @@ from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.utils import get_logger from dataflow.utils.storage import DataFlowStorage -from dataflow.cli_funcs.paths import DataFlowPath from tqdm import tqdm import re @@ -57,64 +56,6 @@ def run(self, storage: DataFlowStorage, input_key: str, output_key: str = None): self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") return [self.output_key] -@OPERATOR_REGISTRY.register() -class WordNumberFilter(OperatorABC): - - def __init__(self, min_words: int=20, max_words: int=100000): - self.logger = get_logger() - self.min_words = min_words - self.max_words = max_words - self.logger.info(f"Initializing {self.__class__.__name__} with min_words = {self.min_words}, max_words = {self.max_words}...") - - @staticmethod - def get_desc(lang: str = "zh"): - if lang == "zh": - return ( - "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n" - "输入参数:\n" - "- input_key:输入文本字段名,默认为'text'\n" - "- min_words:最小单词数量阈值,默认为5\n" - "- max_words:最大单词数量阈值,默认为100\n" - "输出参数:\n" - "- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n" - "- 返回包含输入字段名的列表,用于后续算子引用" - ) - elif lang == "en": - return ( - "This operator filters text with word count outside the specified range, using space splitting for word counting.\n" - "Input Parameters:\n" - "- input_key: Input text field name, default is 'text'\n" - "- min_words: Minimum word count threshold, default is 5\n" - "- max_words: Maximum word count threshold, default is 100\n" - "Output Parameters:\n" - "- Filtered DataFrame containing only rows with word count within specified range\n" - "- List containing input field name for subsequent operator reference" - ) - else: - return "WordNumberFilter filters text based on word count range using space splitting." 
- - def run(self, storage: DataFlowStorage, input_key: str, output_key: str='word_number_filter_label'): - self.input_key = input_key - self.output_key = output_key - dataframe = storage.read("dataframe") - self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...") - word_counts = [] - for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): - if text: - normalized_words = tuple(text.split()) - num_normalized_words = len(normalized_words) - word_counts.append(num_normalized_words) - else: - word_counts.append(0) - word_counts = np.array(word_counts) - metric_filter = (self.min_words <= word_counts) & (word_counts < self.max_words) - dataframe[self.output_key] = word_counts - filtered_dataframe = dataframe[metric_filter] - storage.write(filtered_dataframe) - self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.") - return [self.output_key] - - @OPERATOR_REGISTRY.register() class SentenceNumberFilter(OperatorABC): @@ -1498,77 +1439,3 @@ def run(self, storage: DataFlowStorage, input_key: str, output_key: str='line_wi self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") return [self.output_key] - -@OPERATOR_REGISTRY.register() -class BlocklistFilter(OperatorABC): - - def __init__(self, language:str = 'en', threshold:int = 1, use_tokenizer:bool = False): - self.logger = get_logger() - self.language = language - self.threshold = threshold - self.use_tokenizer = use_tokenizer - self.logger.info(f"Initializing {self.__class__.__name__} with language = {self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...") - self.blocklist = self.load_blocklist() - - @staticmethod - def get_desc(lang: str = "zh"): - if lang == "zh": - return ( - "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n" - "输入参数:\n" - "- input_key:输入文本字段名,默认为'text'\n" - "- language:语言代码,默认为'zh'\n" - "- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n" - "- threshold:匹配次数阈值,默认为1\n" - "- use_tokenizer:是否使用分词器,默认为True\n" - "- tokenizer:分词器对象,默认为None\n" - "输出参数:\n" - "- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n" - "- 返回包含输入字段名的列表,用于后续算子引用" - ) - elif lang == "en": - return ( - "This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n" - "Input Parameters:\n" - "- input_key: Input text field name, default is 'text'\n" - "- language: Language code, default is 'zh'\n" - "- blocklist_dir: Blocklist file directory, default is './blocklists/'\n" - "- threshold: Matching count threshold, default is 1\n" - "- use_tokenizer: Whether to use tokenizer, default is True\n" - "- tokenizer: Tokenizer object, default is None\n" - "Output Parameters:\n" - "- Filtered DataFrame containing only rows without blocklist keywords\n" - "- List containing input field name for subsequent operator reference" - ) - else: - return "BlocklistFilter uses language-specific blocklists with optional tokenizer integration." 
- - def load_blocklist(self): - dataflow_dir = DataFlowPath.get_dataflow_dir() - file_path = f"{dataflow_dir}/operators/filter/GeneralText/blocklist/{self.language}.txt" - self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...") - with open(file_path, 'r', encoding='utf-8') as file: - blocklist = set(line.strip().lower() for line in file if line.strip()) - self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.") - return blocklist - - def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'): - self.input_key = input_key - self.output_key = output_key - dataframe = storage.read("dataframe") - self.logger.info(f"Running {self.__class__.__name__}...") - valid_checks = [] - for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): - if text: - if self.use_tokenizer: - text = word_tokenize(text.lower()) - else: - text = text.lower().split() - blocklist_count = sum(1 for word in text if word in self.blocklist) - valid_checks.append(blocklist_count <= self.threshold) - valid_checks = np.array(valid_checks, dtype=int) - dataframe[self.output_key] = valid_checks - filtered_dataframe = dataframe[valid_checks == 1] - storage.write(filtered_dataframe) - self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") - return [self.output_key] diff --git a/dataflow/operators/general_text/filter/langkit_filter.py b/dataflow/operators/general_text/filter/langkit_filter.py index e9cf64a8..22c0ec0c 100644 --- a/dataflow/operators/general_text/filter/langkit_filter.py +++ b/dataflow/operators/general_text/filter/langkit_filter.py @@ -4,7 +4,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import LangkitScorer +from dataflow.operators.general_text import LangkitSampleEvaluator @OPERATOR_REGISTRY.register() class LangkitFilter(OperatorABC): @@ -66,7 +66,7 @@ def __init__(self, if not self.min_scores.keys() == self.max_scores.keys(): raise ValueError("min_scores and max_scores must have the same keys") self.logger = get_logger() - self.scorer = LangkitScorer() + self.scorer = LangkitSampleEvaluator() self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_scores} and max_scores: {self.max_scores}...") @staticmethod diff --git a/dataflow/operators/general_text/filter/lexical_diversity_filter.py b/dataflow/operators/general_text/filter/lexical_diversity_filter.py index d6782a3b..faa87b6c 100644 --- a/dataflow/operators/general_text/filter/lexical_diversity_filter.py +++ b/dataflow/operators/general_text/filter/lexical_diversity_filter.py @@ -4,7 +4,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import LexicalDiversityScorer +from dataflow.operators.general_text import LexicalDiversitySampleEvaluator @OPERATOR_REGISTRY.register() class LexicalDiversityFilter(OperatorABC): @@ -20,7 +20,7 @@ def __init__(self, min_scores: dict = {'mtld': 50, 'hdd': 0.8}, max_scores: dict 'hdd': 'LexicalDiversityHD-DScore', 'mtld': 
'LexicalDiversityMTLDScore', } - self.scorer = LexicalDiversityScorer() + self.scorer = LexicalDiversitySampleEvaluator() @staticmethod def get_desc(lang: str = "zh"): diff --git a/dataflow/operators/general_text/filter/minhash_deduplicator.py b/dataflow/operators/general_text/filter/minhash_deduplicate_filter.py similarity index 98% rename from dataflow/operators/general_text/filter/minhash_deduplicator.py rename to dataflow/operators/general_text/filter/minhash_deduplicate_filter.py index e07c1f8d..db73df92 100644 --- a/dataflow/operators/general_text/filter/minhash_deduplicator.py +++ b/dataflow/operators/general_text/filter/minhash_deduplicate_filter.py @@ -4,11 +4,9 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import NgramScorer - @OPERATOR_REGISTRY.register() -class MinHashDeduplicator(OperatorABC): +class MinHashDeduplicateFilter(OperatorABC): def __init__(self, num_perm=128, threshold=0.9, use_n_gram=True, ngram=5): self.logger = get_logger() self.num_perm = num_perm diff --git a/dataflow/operators/general_text/filter/ngram_filter.py b/dataflow/operators/general_text/filter/ngram_filter.py index 4c0f5eee..047bebf1 100644 --- a/dataflow/operators/general_text/filter/ngram_filter.py +++ b/dataflow/operators/general_text/filter/ngram_filter.py @@ -2,7 +2,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import NgramScorer +from dataflow.operators.general_text import NgramSampleEvaluator @OPERATOR_REGISTRY.register() class NgramFilter(OperatorABC): @@ -11,7 +11,7 @@ def __init__(self, min_score=0.8, max_score=1, ngrams=5): self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = NgramScorer(ngrams) + self.scorer = NgramSampleEvaluator(ngrams) 
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_score} and max_scores: {self.max_score}...") @staticmethod diff --git a/dataflow/operators/general_text/filter/ngramhash_deduplicator.py b/dataflow/operators/general_text/filter/ngramhash_deduplicate_filter.py similarity index 99% rename from dataflow/operators/general_text/filter/ngramhash_deduplicator.py rename to dataflow/operators/general_text/filter/ngramhash_deduplicate_filter.py index fef84aa6..44fa12cf 100644 --- a/dataflow/operators/general_text/filter/ngramhash_deduplicator.py +++ b/dataflow/operators/general_text/filter/ngramhash_deduplicate_filter.py @@ -7,7 +7,7 @@ from dataflow.utils.registry import OPERATOR_REGISTRY @OPERATOR_REGISTRY.register() -class NgramHashDeduplicator(OperatorABC): +class NgramHashDeduplicateFilter(OperatorABC): def __init__(self, n_gram: int = 3, hash_func: str = 'md5', diff_size : int = 1): self.logger = get_logger() self.n_gram = n_gram diff --git a/dataflow/operators/general_text/filter/perspective_filter.py b/dataflow/operators/general_text/filter/perspective_filter.py index b394f5b3..97f8d027 100644 --- a/dataflow/operators/general_text/filter/perspective_filter.py +++ b/dataflow/operators/general_text/filter/perspective_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import PerspectiveScorer +from dataflow.operators.general_text import PerspectiveSampleEvaluator from dataflow.serving import PerspectiveAPIServing @OPERATOR_REGISTRY.register() @@ -14,7 +14,7 @@ def __init__(self, min_score: float = 0.0, max_score: float = 0.5): self.min_score = min_score self.max_score = max_score self.serving = PerspectiveAPIServing(max_workers=10) - self.scorer = PerspectiveScorer(serving=self.serving) + self.scorer = PerspectiveSampleEvaluator(serving=self.serving) @staticmethod def get_desc(lang: 
str = "zh"): diff --git a/dataflow/operators/general_text/filter/presidio_filter.py b/dataflow/operators/general_text/filter/presidio_filter.py index 00530e13..1febf290 100644 --- a/dataflow/operators/general_text/filter/presidio_filter.py +++ b/dataflow/operators/general_text/filter/presidio_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import PresidioScorer +from dataflow.operators.general_text import PresidioSampleEvaluator @OPERATOR_REGISTRY.register() class PresidioFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: int = 0, max_score: int = 5, lang='en', device='cu self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = PresidioScorer(lang=lang, device=device, model_cache_dir=model_cache_dir) + self.scorer = PresidioSampleEvaluator(lang=lang, device=device, model_cache_dir=model_cache_dir) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}") @staticmethod diff --git a/dataflow/operators/general_text/filter/sem_deduplicator.py b/dataflow/operators/general_text/filter/sem_deduplicate_filter.py similarity index 99% rename from dataflow/operators/general_text/filter/sem_deduplicator.py rename to dataflow/operators/general_text/filter/sem_deduplicate_filter.py index 5975012f..737d63e2 100644 --- a/dataflow/operators/general_text/filter/sem_deduplicator.py +++ b/dataflow/operators/general_text/filter/sem_deduplicate_filter.py @@ -60,7 +60,7 @@ def compute_cos_sim_matrix(embeddings): @OPERATOR_REGISTRY.register() -class SemDeduplicator(OperatorABC): +class SemDeduplicateFilter(OperatorABC): def __init__(self, eps: float = 0.05, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', model_cache_dir: str = './dataflow_cache', device: str = 'cuda'): self.logger = get_logger() 
self.eps = eps diff --git a/dataflow/operators/general_text/filter/simhash_deduplicator.py b/dataflow/operators/general_text/filter/simhash_deduplicate_filter.py similarity index 99% rename from dataflow/operators/general_text/filter/simhash_deduplicator.py rename to dataflow/operators/general_text/filter/simhash_deduplicate_filter.py index 7db1cac4..955cf252 100644 --- a/dataflow/operators/general_text/filter/simhash_deduplicator.py +++ b/dataflow/operators/general_text/filter/simhash_deduplicate_filter.py @@ -12,7 +12,7 @@ def get_similarity(simhash, another_simhash): return similar @OPERATOR_REGISTRY.register() -class SimHashDeduplicator(OperatorABC): +class SimHashDeduplicateFilter(OperatorABC): def __init__(self, fingerprint_size: int = 64, bound: float = 0.1): self.logger = get_logger() self.fingerprint_size = fingerprint_size diff --git a/dataflow/operators/general_text/filter/word_number_filter.py b/dataflow/operators/general_text/filter/word_number_filter.py new file mode 100644 index 00000000..ce4fd933 --- /dev/null +++ b/dataflow/operators/general_text/filter/word_number_filter.py @@ -0,0 +1,64 @@ +from tqdm import tqdm +import numpy as np +from dataflow import get_logger +from dataflow.core import OperatorABC +from dataflow.utils.storage import DataFlowStorage +from dataflow.utils.registry import OPERATOR_REGISTRY + +@OPERATOR_REGISTRY.register() +class WordNumberFilter(OperatorABC): + + def __init__(self, min_words: int=20, max_words: int=100000): + self.logger = get_logger() + self.min_words = min_words + self.max_words = max_words + self.logger.info(f"Initializing {self.__class__.__name__} with min_words = {self.min_words}, max_words = {self.max_words}...") + + @staticmethod + def get_desc(lang: str = "zh"): + if lang == "zh": + return ( + "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n" + "输入参数:\n" + "- input_key:输入文本字段名,默认为'text'\n" + "- min_words:最小单词数量阈值,默认为20\n" + "- max_words:最大单词数量阈值,默认为100000\n" + "输出参数:\n" + "- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n" + "- 
返回包含输入字段名的列表,用于后续算子引用" + ) + elif lang == "en": + return ( + "This operator filters text with word count outside the specified range, using space splitting for word counting.\n" + "Input Parameters:\n" + "- input_key: Input text field name, default is 'text'\n" + "- min_words: Minimum word count threshold, default is 20\n" + "- max_words: Maximum word count threshold, default is 100000\n" + "Output Parameters:\n" + "- Filtered DataFrame containing only rows with word count within specified range\n" + "- List containing input field name for subsequent operator reference" + ) + else: + return "WordNumberFilter filters text based on word count range using space splitting." + + def run(self, storage: DataFlowStorage, input_key: str, output_key: str='word_number_filter_label'): + self.input_key = input_key + self.output_key = output_key + dataframe = storage.read("dataframe") + self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...") + word_counts = [] + for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"): + if text: + normalized_words = tuple(text.split()) + num_normalized_words = len(normalized_words) + word_counts.append(num_normalized_words) + else: + word_counts.append(0) + word_counts = np.array(word_counts) + metric_filter = (self.min_words <= word_counts) & (word_counts < self.max_words) + dataframe[self.output_key] = word_counts + filtered_dataframe = dataframe[metric_filter] + storage.write(filtered_dataframe) + self.logger.info(f"Filtering completed. 
Total records passing filter: {len(filtered_dataframe)}.") + return [self.output_key] + diff --git a/dataflow/operators/text_pt/__init__.py b/dataflow/operators/text_pt/__init__.py new file mode 100644 index 00000000..d2f7b8b1 --- /dev/null +++ b/dataflow/operators/text_pt/__init__.py @@ -0,0 +1,32 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # filter + from .filter.ccnet_deduplicate_filter import CCNetDeduplicateFilter + from .filter.debertav3_filter import DebertaV3Filter + from .filter.fineweb_edu_filter import FineWebEduFilter + from .filter.pair_qual_filter import PairQualFilter + from .filter.perplexity_filter import PerplexityFilter + from .filter.qurating_filter import QuratingFilter + from .filter.text_book_filter import TextbookFilter + + # generate + from .generate.phi4qa_generator import Phi4QAGenerator + + # eval + from .eval.debertav3_sample_evaluator import DebertaV3SampleEvaluator + from .eval.fineweb_edu_sample_evaluator import FineWebEduSampleEvaluator + from .eval.pair_qual_sample_evaluator import PairQualSampleEvaluator + from .eval.textbook_sample_evaluator import TextbookSampleEvaluator + from .eval.qurating_sample_evaluator import QuratingSampleEvaluator + from .eval.perplexity_sample_evaluator import PerplexitySampleEvaluator + from .eval.meta_sample_evaluator import MetaSampleEvaluator + +else: + import sys + from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking + + cur_path = "dataflow/operators/text_pt/" + + _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) + sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/text_pt/", _import_structure) diff --git a/dataflow/operators/general_text/eval/models/Kenlm/model.py b/dataflow/operators/text_pt/eval/Kenlm/model.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Kenlm/model.py rename to dataflow/operators/text_pt/eval/Kenlm/model.py diff --git 
a/dataflow/operators/general_text/eval/models/Qurating/modeling/modeling_flash_llama.py b/dataflow/operators/text_pt/eval/Qurating/modeling/modeling_flash_llama.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Qurating/modeling/modeling_flash_llama.py rename to dataflow/operators/text_pt/eval/Qurating/modeling/modeling_flash_llama.py diff --git a/dataflow/operators/general_text/eval/models/Qurating/qurater_annotate.py b/dataflow/operators/text_pt/eval/Qurating/qurater_annotate.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Qurating/qurater_annotate.py rename to dataflow/operators/text_pt/eval/Qurating/qurater_annotate.py diff --git a/dataflow/operators/text_pt/eval/__init__.py b/dataflow/operators/text_pt/eval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dataflow/operators/general_text/eval/models/debertav3_scorer.py b/dataflow/operators/text_pt/eval/debertav3_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/debertav3_scorer.py rename to dataflow/operators/text_pt/eval/debertav3_sample_evaluator.py index c84344d5..7ce31cb8 100644 --- a/dataflow/operators/general_text/eval/models/debertav3_scorer.py +++ b/dataflow/operators/text_pt/eval/debertav3_sample_evaluator.py @@ -9,7 +9,7 @@ from dataflow import get_logger @OPERATOR_REGISTRY.register() -class DebertaV3Scorer(OperatorABC): +class DebertaV3SampleEvaluator(OperatorABC): def __init__(self, model_name, model_cache_dir='./dataflow_cache', device='cuda'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/fineweb_edu_scorer.py b/dataflow/operators/text_pt/eval/fineweb_edu_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/fineweb_edu_scorer.py rename to dataflow/operators/text_pt/eval/fineweb_edu_sample_evaluator.py index 
292adf20..7f717197 100644 --- a/dataflow/operators/general_text/eval/models/fineweb_edu_scorer.py +++ b/dataflow/operators/text_pt/eval/fineweb_edu_sample_evaluator.py @@ -8,7 +8,7 @@ import numpy as np @OPERATOR_REGISTRY.register() -class FineWebEduScorer(OperatorABC): +class FineWebEduSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir: str = './dataflow_cache', device: str = 'cuda'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/APIcaller/meta_scorer.py b/dataflow/operators/text_pt/eval/meta_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/meta_scorer.py rename to dataflow/operators/text_pt/eval/meta_sample_evaluator.py index 259fc021..f3fe77d2 100644 --- a/dataflow/operators/general_text/eval/APIcaller/meta_scorer.py +++ b/dataflow/operators/text_pt/eval/meta_sample_evaluator.py @@ -95,7 +95,7 @@ ] @OPERATOR_REGISTRY.register() -class MetaScorer(OperatorABC): +class MetaSampleEvaluator(OperatorABC): def __init__(self, llm_serving: LLMServingABC = None, dimensions: list[dict] = example_dimensions, @@ -118,7 +118,6 @@ def __init__(self, } ] } - You can refer to dataflow/operators/eval/GeneralText/APIcaller/meta_scorer.py for an example. 
""" self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/pair_qual_scorer.py b/dataflow/operators/text_pt/eval/pair_qual_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/pair_qual_scorer.py rename to dataflow/operators/text_pt/eval/pair_qual_sample_evaluator.py index 2570678f..d877f19a 100644 --- a/dataflow/operators/general_text/eval/models/pair_qual_scorer.py +++ b/dataflow/operators/text_pt/eval/pair_qual_sample_evaluator.py @@ -9,7 +9,7 @@ import numpy as np @OPERATOR_REGISTRY.register() -class PairQualScorer(OperatorABC): +class PairQualSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir:str='./dataflow_cache', device="cuda", lang='en', max_length=512): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/perplexity_scorer.py b/dataflow/operators/text_pt/eval/perplexity_sample_evaluator.py similarity index 95% rename from dataflow/operators/general_text/eval/models/perplexity_scorer.py rename to dataflow/operators/text_pt/eval/perplexity_sample_evaluator.py index 6422bbbc..aa7fed98 100644 --- a/dataflow/operators/general_text/eval/models/perplexity_scorer.py +++ b/dataflow/operators/text_pt/eval/perplexity_sample_evaluator.py @@ -1,13 +1,13 @@ from dataflow.core import OperatorABC -from dataflow.operators.general_text.eval.models.Kenlm.model import KenlmModel +from dataflow.operators.text_pt.eval.Kenlm.model import KenlmModel from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.storage import DataFlowStorage from dataflow.utils.utils import get_logger # Kenlm models perplexity evaluation @OPERATOR_REGISTRY.register() -class PerplexityScorer(OperatorABC): +class PerplexitySampleEvaluator(OperatorABC): # Need to download model first! 
- def __init__(self, lang='en', model_name='dataflow/operators/eval/GeneralText/models/Kenlm/wikipedia'): + def __init__(self, lang='en', model_name='dataflow/operators/text_pt/eval/Kenlm/wikipedia'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') self.model_name = model_name diff --git a/dataflow/operators/general_text/eval/models/qurating_scorer.py b/dataflow/operators/text_pt/eval/qurating_sample_evaluator.py similarity index 96% rename from dataflow/operators/general_text/eval/models/qurating_scorer.py rename to dataflow/operators/text_pt/eval/qurating_sample_evaluator.py index 3bd7dea1..3802ef61 100644 --- a/dataflow/operators/general_text/eval/models/qurating_scorer.py +++ b/dataflow/operators/text_pt/eval/qurating_sample_evaluator.py @@ -4,12 +4,12 @@ from datasets import Dataset from tqdm import tqdm from dataflow import get_logger -from dataflow.operators.general_text.eval.models.Qurating.qurater_annotate import ModelAnnotator -from dataflow.operators.general_text.eval.models.Qurating.qurater_annotate import TokenizeAndChunk +from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import ModelAnnotator +from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import TokenizeAndChunk import torch @OPERATOR_REGISTRY.register() -class QuratingScorer(OperatorABC): +class QuratingSampleEvaluator(OperatorABC): def __init__(self, map_batch_size: int = 512, num_workers: int = 1, device_batch_size: int = 16, device: str = 'cuda', labels: list = ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value'], model_cache_dir: str = './dataflow_cache'): self.logger = get_logger() diff --git a/dataflow/operators/general_text/eval/models/textbook_scorer.py b/dataflow/operators/text_pt/eval/textbook_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/textbook_scorer.py rename to dataflow/operators/text_pt/eval/textbook_sample_evaluator.py index 
195f1cb7..3242fd01 100644 --- a/dataflow/operators/general_text/eval/models/textbook_scorer.py +++ b/dataflow/operators/text_pt/eval/textbook_sample_evaluator.py @@ -10,7 +10,7 @@ import numpy as np @OPERATOR_REGISTRY.register() -class TextbookScorer(OperatorABC): +class TextbookSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir='./dataflow_cache'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/text_pt/filter/__init__.py b/dataflow/operators/text_pt/filter/__init__.py new file mode 100644 index 00000000..12beb159 --- /dev/null +++ b/dataflow/operators/text_pt/filter/__init__.py @@ -0,0 +1,60 @@ +# import sys +# from dataflow.utils.registry import LazyLoader + +# cur_path = "dataflow/operators/filter/" + +# _import_structure = { +# # Primary filters +# "NgramFilter": (cur_path + "ngram_filter.py", "NgramFilter"), +# "LanguageFilter": (cur_path + "language_filter.py", "LanguageFilter"), +# "DeitaQualityFilter": (cur_path + "deita_quality_filter.py", "DeitaQualityFilter"), +# "DeitaComplexityFilter": (cur_path + "deita_complexity_filter.py", "DeitaComplexityFilter"), +# "InstagFilter": (cur_path + "instag_filter.py", "InstagFilter"), +# "PairQualFilter": (cur_path + "pair_qual_filter.py", "PairQualFilter"), +# "QuratingFilter": (cur_path + "qurating_filter.py", "QuratingFilter"), +# "SuperfilteringFilter": (cur_path + "superfiltering_filter.py", "SuperfilteringFilter"), +# "FineWebEduFilter": (cur_path + "fineweb_edu_filter.py", "FineWebEduFilter"), +# "TextbookFilter": (cur_path + "text_book_filter.py", "TextbookFilter"), +# "AlpagasusFilter": (cur_path + "alpagasus_filter.py", "AlpagasusFilter"), +# "DebertaV3Filter": (cur_path + "debertav3_filter.py", "DebertaV3Filter"), +# "LangkitFilter": (cur_path + "langkit_filter.py", "LangkitFilter"), +# "LexicalDiversityFilter": (cur_path + "lexical_diversity_filter.py", "LexicalDiversityFilter"), +# "PerplexityFilter": (cur_path + 
"perplexity_filter.py", "PerplexityFilter"), +# "PerspectiveFilter": (cur_path + "perspective_filter.py", "PerspectiveFilter"), +# "PresidioFilter": (cur_path + "presidio_filter.py", "PresidioFilter"), +# "RMFilter": (cur_path + "reward_model_filter.py", "RMFilter"), +# "TreeinstructFilter": (cur_path + "treeinstruct_filter.py", "TreeinstructFilter"), + +# # Heuristic filters +# "ColonEndFilter": (cur_path + "heuristics.py", "ColonEndFilter"), +# "WordNumberFilter": (cur_path + "heuristics.py", "WordNumberFilter"), +# "BlocklistFilter": (cur_path + "heuristics.py", "BlocklistFilter"), +# "SentenceNumberFilter": (cur_path + "heuristics.py", "SentenceNumberFilter"), +# "LineEndWithEllipsisFilter": (cur_path + "heuristics.py", "LineEndWithEllipsisFilter"), +# "ContentNullFilter": (cur_path + "heuristics.py", "ContentNullFilter"), +# "MeanWordLengthFilter": (cur_path + "heuristics.py", "MeanWordLengthFilter"), +# "SymbolWordRatioFilter": (cur_path + "heuristics.py", "SymbolWordRatioFilter"), +# "HtmlEntityFilter": (cur_path + "heuristics.py", "HtmlEntityFilter"), +# "IDCardFilter": (cur_path + "heuristics.py", "IDCardFilter"), +# "NoPuncFilter": (cur_path + "heuristics.py", "NoPuncFilter"), +# "SpecialCharacterFilter": (cur_path + "heuristics.py", "SpecialCharacterFilter"), +# "WatermarkFilter": (cur_path + "heuristics.py", "WatermarkFilter"), +# "StopWordFilter": (cur_path + "heuristics.py", "StopWordFilter"), +# "CurlyBracketFilter": (cur_path + "heuristics.py", "CurlyBracketFilter"), +# "CapitalWordsFilter": (cur_path + "heuristics.py", "CapitalWordsFilter"), +# "LoremIpsumFilter": (cur_path + "heuristics.py", "LoremIpsumFilter"), +# "UniqueWordsFilter": (cur_path + "heuristics.py", "UniqueWordsFilter"), +# "CharNumberFilter": (cur_path + "heuristics.py", "CharNumberFilter"), +# "LineStartWithBulletpointFilter": (cur_path + "heuristics.py", "LineStartWithBulletpointFilter"), +# "LineWithJavascriptFilter": (cur_path + "heuristics.py", "LineWithJavascriptFilter"), + 
+# # Deduplicators +# "MinHashDeduplicator": (cur_path + "minhash_deduplicator.py", "MinHashDeduplicator"), +# "CCNetDeduplicator": (cur_path + "ccnet_deduplicator.py", "CCNetDeduplicator"), +# "HashDeduplicator": (cur_path + "hash_deduplicator.py", "HashDeduplicator"), +# "NgramHashDeduplicator": (cur_path + "ngramhash_deduplicator.py", "NgramHashDeduplicator"), +# "SemDeduplicator": (cur_path + "sem_deduplicator.py", "SemDeduplicator"), +# "SimHashDeduplicator": (cur_path + "simhash_deduplicator.py", "SimHashDeduplicator"), +# } + +# sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure) \ No newline at end of file diff --git a/dataflow/operators/general_text/filter/ccnet_deduplicator.py b/dataflow/operators/text_pt/filter/ccnet_deduplicate_filter.py similarity index 99% rename from dataflow/operators/general_text/filter/ccnet_deduplicator.py rename to dataflow/operators/text_pt/filter/ccnet_deduplicate_filter.py index 1bae30df..1fd77e68 100644 --- a/dataflow/operators/general_text/filter/ccnet_deduplicator.py +++ b/dataflow/operators/text_pt/filter/ccnet_deduplicate_filter.py @@ -40,7 +40,7 @@ def sha1_hash(data: bytes, d: int = 32) -> int: @OPERATOR_REGISTRY.register() -class CCNetDeduplicator(OperatorABC): +class CCNetDeduplicateFilter(OperatorABC): def __init__(self, bit_length: int = 64): self.logger = get_logger() diff --git a/dataflow/operators/general_text/filter/debertav3_filter.py b/dataflow/operators/text_pt/filter/debertav3_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/debertav3_filter.py rename to dataflow/operators/text_pt/filter/debertav3_filter.py index f0c6a528..344e00bb 100644 --- a/dataflow/operators/general_text/filter/debertav3_filter.py +++ b/dataflow/operators/text_pt/filter/debertav3_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from 
dataflow.operators.general_text import DebertaV3Scorer +from dataflow.operators.text_pt import DebertaV3SampleEvaluator @OPERATOR_REGISTRY.register() class DebertaV3Filter(OperatorABC): @@ -11,7 +11,7 @@ class DebertaV3Filter(OperatorABC): def __init__(self, allowed_scores : list = ['Medium', 'High'], model_name='nvidia/quality-classifier-deberta', model_cache_dir='./dataflow_cache', device='cuda', batch_size=16): self.logger = get_logger() self.allowed_scores = allowed_scores - self.scorer = DebertaV3Scorer( + self.scorer = DebertaV3SampleEvaluator( model_name=model_name, model_cache_dir=model_cache_dir, device=device, diff --git a/dataflow/operators/general_text/filter/fineweb_edu_filter.py b/dataflow/operators/text_pt/filter/fineweb_edu_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/fineweb_edu_filter.py rename to dataflow/operators/text_pt/filter/fineweb_edu_filter.py index 8fd037ab..20e05d02 100644 --- a/dataflow/operators/general_text/filter/fineweb_edu_filter.py +++ b/dataflow/operators/text_pt/filter/fineweb_edu_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import FineWebEduScorer +from dataflow.operators.text_pt import FineWebEduSampleEvaluator from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY @@ -10,7 +10,7 @@ def __init__(self, min_score: float = 2.5, max_score: float = 10000, model_cache self.min_score = min_score self.max_score = max_score self.logger = get_logger() - self.scorer = FineWebEduScorer(model_cache_dir=model_cache_dir, device=device) + self.scorer = FineWebEduSampleEvaluator(model_cache_dir=model_cache_dir, device=device) self.filter_name = 'FineWebEduFilter' self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}, " f"device = {device}, model_cache_dir = {model_cache_dir}") diff --git 
a/dataflow/operators/general_text/filter/pair_qual_filter.py b/dataflow/operators/text_pt/filter/pair_qual_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/pair_qual_filter.py rename to dataflow/operators/text_pt/filter/pair_qual_filter.py index 6310bc88..56515af7 100644 --- a/dataflow/operators/general_text/filter/pair_qual_filter.py +++ b/dataflow/operators/text_pt/filter/pair_qual_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import PairQualScorer +from dataflow.operators.text_pt import PairQualSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY @@ -12,7 +12,7 @@ def __init__(self, min_score=0, max_score=10000, model_cache_dir='./dataflow_cac self.min_score = min_score self.max_score = max_score - self.scorer = PairQualScorer(model_cache_dir=model_cache_dir, lang=lang) + self.scorer = PairQualSampleEvaluator(model_cache_dir=model_cache_dir, lang=lang) self.filter_name = 'PairQualFilter' self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}...") diff --git a/dataflow/operators/general_text/filter/perplexity_filter.py b/dataflow/operators/text_pt/filter/perplexity_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/perplexity_filter.py rename to dataflow/operators/text_pt/filter/perplexity_filter.py index 5c0b82e4..b6469968 100644 --- a/dataflow/operators/general_text/filter/perplexity_filter.py +++ b/dataflow/operators/text_pt/filter/perplexity_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import PerplexityScorer +from dataflow.operators.text_pt import PerplexitySampleEvaluator @OPERATOR_REGISTRY.register() class PerplexityFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: 
float = 10.0, max_score: float = 500.0, model_nam self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = PerplexityScorer( + self.scorer = PerplexitySampleEvaluator( model_name=model_name, lang=lang ) diff --git a/dataflow/operators/general_text/filter/qurating_filter.py b/dataflow/operators/text_pt/filter/qurating_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/qurating_filter.py rename to dataflow/operators/text_pt/filter/qurating_filter.py index 46a33d0c..c468e7ba 100644 --- a/dataflow/operators/general_text/filter/qurating_filter.py +++ b/dataflow/operators/text_pt/filter/qurating_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import QuratingScorer +from dataflow.operators.text_pt import QuratingSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY @@ -16,7 +16,7 @@ def __init__(self, min_scores: dict = {'writing_style': 0,'required_expertise': self.max_scores = max_scores # Initialize the QuratingScorer with the passed parameters - self.scorer = QuratingScorer(map_batch_size=map_batch_size, + self.scorer = QuratingSampleEvaluator(map_batch_size=map_batch_size, num_workers=num_workers, device_batch_size=device_batch_size, device=device, labels=labels, model_cache_dir=model_cache_dir) @@ -65,7 +65,7 @@ def get_desc(lang: str = "zh"): def run(self, storage: DataFlowStorage, input_key: str): self.input_key = input_key dataframe = storage.read("dataframe") - self.logger.info(f"Running {self.filter_name}...") + self.logger.info(f"Running {self.__class__.__name__}...") # Get the scores for filtering scores = self.scorer.eval(dataframe, self.input_key) diff --git a/dataflow/operators/general_text/filter/text_book_filter.py b/dataflow/operators/text_pt/filter/text_book_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/text_book_filter.py rename to 
dataflow/operators/text_pt/filter/text_book_filter.py index f7e336cd..68c18e6b 100644 --- a/dataflow/operators/general_text/filter/text_book_filter.py +++ b/dataflow/operators/text_pt/filter/text_book_filter.py @@ -2,7 +2,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import TextbookScorer +from dataflow.operators.text_pt import TextbookSampleEvaluator @OPERATOR_REGISTRY.register() class TextbookFilter(OperatorABC): @@ -11,7 +11,7 @@ def __init__(self, min_score=0.99, max_score=1, model_cache_dir:str='./dataflow_ self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = TextbookScorer(model_cache_dir=model_cache_dir) + self.scorer = TextbookSampleEvaluator(model_cache_dir=model_cache_dir) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}") @staticmethod diff --git a/dataflow/operators/general_text/generate/pretrain_generator.py b/dataflow/operators/text_pt/generate/phi4qa_generator.py similarity index 95% rename from dataflow/operators/general_text/generate/pretrain_generator.py rename to dataflow/operators/text_pt/generate/phi4qa_generator.py index 95a0cecd..81171f84 100644 --- a/dataflow/operators/general_text/generate/pretrain_generator.py +++ b/dataflow/operators/text_pt/generate/phi4qa_generator.py @@ -1,4 +1,4 @@ -from dataflow.prompts.general_text import PretrainGeneratorPrompt +from dataflow.prompts.general_text import Phi4QAGeneratorPrompt import pandas as pd from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow import get_logger @@ -8,13 +8,13 @@ from dataflow.core import LLMServingABC @OPERATOR_REGISTRY.register() -class PretrainGenerator(OperatorABC): +class Phi4QAGenerator(OperatorABC): ''' Answer Generator is a class that generates answers for given questions. 
''' def __init__(self, llm_serving: LLMServingABC): self.logger = get_logger() - self.prompts = PretrainGeneratorPrompt() + self.prompts = Phi4QAGeneratorPrompt() self.llm_serving = llm_serving @staticmethod diff --git a/dataflow/operators/text_sft/__init__.py b/dataflow/operators/text_sft/__init__.py new file mode 100644 index 00000000..5a5da160 --- /dev/null +++ b/dataflow/operators/text_sft/__init__.py @@ -0,0 +1,33 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .eval.alpagasus_sample_evaluator import AlpagasusSampleEvaluator + from .eval.deita_quality_sample_evaluator import DeitaQualitySampleEvaluator + from .eval.deita_complexity_sample_evaluator import DeitaComplexitySampleEvaluator + from .eval.instag_sample_evaluator import InstagSampleEvaluator + from .eval.rm_sample_evaluator import RMSampleEvaluator + from .eval.superfiltering_sample_evaluator import SuperfilteringSampleEvaluator + from .eval.treeinstruct_sample_evaluator import TreeinstructSampleEvaluator + + + from .filter.alpagasus_filter import AlpagasusFilter + from .filter.deita_quality_filter import DeitaQualityFilter + from .filter.deita_complexity_filter import DeitaComplexityFilter + from .filter.instag_filter import InstagFilter + from .filter.rm_filter import RMFilter + from .filter.superfiltering_filter import SuperfilteringFilter + from .filter.treeinstruct_filter import TreeinstructFilter + + from .generate.condor_generator import CondorGenerator + from .generate.sft_generator_from_seed import SFTGeneratorSeed + + from .refine.condor_refiner import CondorRefiner + +else: + import sys + from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking + + cur_path = "dataflow/operators/text_sft/" + + _import_structure = generate_import_structure_from_type_checking(__file__, cur_path) + sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/text_sft/", _import_structure) diff --git 
a/dataflow/operators/general_text/eval/models/Superfiltering/data_analysis.py b/dataflow/operators/text_sft/eval/Superfiltering/data_analysis.py similarity index 100% rename from dataflow/operators/general_text/eval/models/Superfiltering/data_analysis.py rename to dataflow/operators/text_sft/eval/Superfiltering/data_analysis.py diff --git a/dataflow/operators/general_text/eval/APIcaller/alpagasus_scorer.py b/dataflow/operators/text_sft/eval/alpagasus_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/alpagasus_scorer.py rename to dataflow/operators/text_sft/eval/alpagasus_sample_evaluator.py index ae802bae..fef79466 100644 --- a/dataflow/operators/general_text/eval/APIcaller/alpagasus_scorer.py +++ b/dataflow/operators/text_sft/eval/alpagasus_sample_evaluator.py @@ -7,7 +7,7 @@ from dataflow.prompts.general_text import AlpagasusPrompt @OPERATOR_REGISTRY.register() -class AlpagasusScorer(OperatorABC): +class AlpagasusSampleEvaluator(OperatorABC): def __init__(self, llm_serving: LLMServingABC = None, dimension: str = 'quality'): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/deita_complexity_scorer.py b/dataflow/operators/text_sft/eval/deita_complexity_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/deita_complexity_scorer.py rename to dataflow/operators/text_sft/eval/deita_complexity_sample_evaluator.py index 1f5f746d..e4ba8849 100644 --- a/dataflow/operators/general_text/eval/models/deita_complexity_scorer.py +++ b/dataflow/operators/text_sft/eval/deita_complexity_sample_evaluator.py @@ -9,7 +9,7 @@ from tqdm import tqdm @OPERATOR_REGISTRY.register() -class DeitaComplexityScorer(OperatorABC): +class DeitaComplexitySampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512): self.logger = get_logger() 
self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/deita_quality_scorer.py b/dataflow/operators/text_sft/eval/deita_quality_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/deita_quality_scorer.py rename to dataflow/operators/text_sft/eval/deita_quality_sample_evaluator.py index 065efa01..2bd63e34 100644 --- a/dataflow/operators/general_text/eval/models/deita_quality_scorer.py +++ b/dataflow/operators/text_sft/eval/deita_quality_sample_evaluator.py @@ -10,7 +10,7 @@ from tqdm import tqdm @OPERATOR_REGISTRY.register() -class DeitaQualityScorer(OperatorABC): +class DeitaQualitySampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/instag_scorer.py b/dataflow/operators/text_sft/eval/instag_sample_evaluator.py similarity index 99% rename from dataflow/operators/general_text/eval/models/instag_scorer.py rename to dataflow/operators/text_sft/eval/instag_sample_evaluator.py index d438cd82..53afdfe8 100644 --- a/dataflow/operators/general_text/eval/models/instag_scorer.py +++ b/dataflow/operators/text_sft/eval/instag_sample_evaluator.py @@ -10,7 +10,7 @@ import json @OPERATOR_REGISTRY.register() -class InstagScorer(OperatorABC): +class InstagSampleEvaluator(OperatorABC): def __init__(self, model_cache_dir='./dataflow_cache', device='cuda', max_new_tokens=1024, temperature=0, do_sample=False, num_return_sequences=1, return_dict_in_generate=True): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/rm_scorer.py b/dataflow/operators/text_sft/eval/rm_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/models/rm_scorer.py 
rename to dataflow/operators/text_sft/eval/rm_sample_evaluator.py index d49ea56d..0be42ee3 100644 --- a/dataflow/operators/general_text/eval/models/rm_scorer.py +++ b/dataflow/operators/text_sft/eval/rm_sample_evaluator.py @@ -7,7 +7,7 @@ # RMScorer for evaluating based on reward-model-deberta-v3-large-v2 @OPERATOR_REGISTRY.register() -class RMScorer(OperatorABC): +class RMSampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', ): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/models/superfiltering_scorer.py b/dataflow/operators/text_sft/eval/superfiltering_sample_evaluator.py similarity index 96% rename from dataflow/operators/general_text/eval/models/superfiltering_scorer.py rename to dataflow/operators/text_sft/eval/superfiltering_sample_evaluator.py index d93d3087..db6a816d 100644 --- a/dataflow/operators/general_text/eval/models/superfiltering_scorer.py +++ b/dataflow/operators/text_sft/eval/superfiltering_sample_evaluator.py @@ -1,6 +1,6 @@ from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text.eval.models.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text +from dataflow.operators.text_sft.eval.Superfiltering.data_analysis import get_perplexity_and_embedding_whole_text, get_perplexity_and_embedding_part_text from transformers import AutoTokenizer, AutoModelForCausalLM from tqdm import tqdm import torch @@ -11,7 +11,7 @@ # Superfiltering instruction quality (ifd) evaluation # cited from: Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning @OPERATOR_REGISTRY.register() -class SuperfilteringScorer(OperatorABC): +class SuperfilteringSampleEvaluator(OperatorABC): def __init__(self, device='cuda', model_cache_dir='./dataflow_cache', max_length=512): self.logger = 
get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/eval/APIcaller/treeinstruct_scorer.py b/dataflow/operators/text_sft/eval/treeinstruct_sample_evaluator.py similarity index 98% rename from dataflow/operators/general_text/eval/APIcaller/treeinstruct_scorer.py rename to dataflow/operators/text_sft/eval/treeinstruct_sample_evaluator.py index da611e65..7642f20d 100644 --- a/dataflow/operators/general_text/eval/APIcaller/treeinstruct_scorer.py +++ b/dataflow/operators/text_sft/eval/treeinstruct_sample_evaluator.py @@ -7,7 +7,7 @@ from dataflow.prompts.general_text import TreeinstructPrompt @OPERATOR_REGISTRY.register() -class TreeinstructScorer(OperatorABC): +class TreeinstructSampleEvaluator(OperatorABC): def __init__(self, llm_serving: LLMServingABC = None): self.logger = get_logger() self.logger.info(f'Initializing {self.__class__.__name__}...') diff --git a/dataflow/operators/general_text/filter/alpagasus_filter.py b/dataflow/operators/text_sft/filter/alpagasus_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/alpagasus_filter.py rename to dataflow/operators/text_sft/filter/alpagasus_filter.py index 39f7e4c4..75fb49bb 100644 --- a/dataflow/operators/general_text/filter/alpagasus_filter.py +++ b/dataflow/operators/text_sft/filter/alpagasus_filter.py @@ -3,7 +3,7 @@ from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.core import LLMServingABC -from dataflow.operators.general_text import AlpagasusScorer +from dataflow.operators.text_sft import AlpagasusSampleEvaluator @OPERATOR_REGISTRY.register() class AlpagasusFilter(OperatorABC): @@ -13,7 +13,7 @@ def __init__(self, min_score=3, max_score=5, llm_serving: LLMServingABC = None, self.min_score = min_score self.max_score = max_score self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = 
{self.max_score}...") - self.scorer = AlpagasusScorer(llm_serving, dimension) + self.scorer = AlpagasusSampleEvaluator(llm_serving, dimension) @staticmethod def get_desc(lang: str = "zh"): diff --git a/dataflow/operators/general_text/filter/deita_complexity_filter.py b/dataflow/operators/text_sft/filter/deita_complexity_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/deita_complexity_filter.py rename to dataflow/operators/text_sft/filter/deita_complexity_filter.py index 9c60e882..529af761 100644 --- a/dataflow/operators/general_text/filter/deita_complexity_filter.py +++ b/dataflow/operators/text_sft/filter/deita_complexity_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import DeitaComplexityScorer +from dataflow.operators.text_sft import DeitaComplexitySampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY @@ -11,7 +11,7 @@ def __init__(self, min_score=3.0, max_score=5.0, device='cuda', model_cache_dir= self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = DeitaComplexityScorer( + self.scorer = DeitaComplexitySampleEvaluator( device=device, model_cache_dir=model_cache_dir, max_length=max_length, diff --git a/dataflow/operators/general_text/filter/deita_quality_filter.py b/dataflow/operators/text_sft/filter/deita_quality_filter.py similarity index 97% rename from dataflow/operators/general_text/filter/deita_quality_filter.py rename to dataflow/operators/text_sft/filter/deita_quality_filter.py index 552ea2d1..c7d36124 100644 --- a/dataflow/operators/general_text/filter/deita_quality_filter.py +++ b/dataflow/operators/text_sft/filter/deita_quality_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import DeitaQualityScorer +from dataflow.operators.text_sft import DeitaQualitySampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import 
OPERATOR_REGISTRY @@ -11,7 +11,7 @@ def __init__(self, min_score=2.5, max_score=10000.0, device='cuda', model_cache_ self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = DeitaQualityScorer( + self.scorer = DeitaQualitySampleEvaluator( device=device, model_cache_dir=model_cache_dir, max_length=max_length, diff --git a/dataflow/operators/general_text/filter/instag_filter.py b/dataflow/operators/text_sft/filter/instag_filter.py similarity index 95% rename from dataflow/operators/general_text/filter/instag_filter.py rename to dataflow/operators/text_sft/filter/instag_filter.py index d5d67ebe..cf4c1052 100644 --- a/dataflow/operators/general_text/filter/instag_filter.py +++ b/dataflow/operators/text_sft/filter/instag_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import InstagScorer +from dataflow.operators.text_sft import InstagSampleEvaluator from dataflow.core import OperatorABC import numpy as np from dataflow.utils.registry import OPERATOR_REGISTRY @@ -14,7 +14,7 @@ def __init__(self, min_score=0.0, max_score=1.0, model_cache_dir='./dataflow_cac self.max_score = max_score # Initialize the scorer - self.scorer = InstagScorer( + self.scorer = InstagSampleEvaluator( model_cache_dir=model_cache_dir, device=device, max_new_tokens=max_new_tokens diff --git a/dataflow/operators/general_text/filter/reward_model_filter.py b/dataflow/operators/text_sft/filter/rm_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/reward_model_filter.py rename to dataflow/operators/text_sft/filter/rm_filter.py index 50d439d4..f36bf6b1 100644 --- a/dataflow/operators/general_text/filter/reward_model_filter.py +++ b/dataflow/operators/text_sft/filter/rm_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import RMScorer +from dataflow.operators.text_sft 
import RMSampleEvaluator @OPERATOR_REGISTRY.register() class RMFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: float = 0.2, max_score: float = 0.8, device='cuda' self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = RMScorer(device=device, model_cache_dir=model_cache_dir) + self.scorer = RMSampleEvaluator(device=device, model_cache_dir=model_cache_dir) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score}, max_score = {self.max_score}") @staticmethod diff --git a/dataflow/operators/general_text/filter/superfiltering_filter.py b/dataflow/operators/text_sft/filter/superfiltering_filter.py similarity index 97% rename from dataflow/operators/general_text/filter/superfiltering_filter.py rename to dataflow/operators/text_sft/filter/superfiltering_filter.py index 5c9ff010..a33c21cd 100644 --- a/dataflow/operators/general_text/filter/superfiltering_filter.py +++ b/dataflow/operators/text_sft/filter/superfiltering_filter.py @@ -1,4 +1,4 @@ -from dataflow.operators.general_text import SuperfilteringScorer +from dataflow.operators.text_sft import SuperfilteringSampleEvaluator import numpy as np from dataflow.core import OperatorABC from dataflow.utils.registry import OPERATOR_REGISTRY @@ -13,7 +13,7 @@ def __init__(self, min_score=0.0, max_score=1.0, device='cuda', model_cache_dir= self.min_score = min_score self.max_score = max_score - self.scorer = SuperfilteringScorer( + self.scorer = SuperfilteringSampleEvaluator( device=device, model_cache_dir=model_cache_dir, max_length=max_length diff --git a/dataflow/operators/general_text/filter/treeinstruct_filter.py b/dataflow/operators/text_sft/filter/treeinstruct_filter.py similarity index 96% rename from dataflow/operators/general_text/filter/treeinstruct_filter.py rename to dataflow/operators/text_sft/filter/treeinstruct_filter.py index d216ef97..ee40cb8a 100644 --- 
a/dataflow/operators/general_text/filter/treeinstruct_filter.py +++ b/dataflow/operators/text_sft/filter/treeinstruct_filter.py @@ -3,7 +3,7 @@ from dataflow.core import OperatorABC, LLMServingABC from dataflow.utils.storage import DataFlowStorage from dataflow.utils.registry import OPERATOR_REGISTRY -from dataflow.operators.general_text import TreeinstructScorer +from dataflow.operators.text_sft import TreeinstructSampleEvaluator @OPERATOR_REGISTRY.register() class TreeinstructFilter(OperatorABC): @@ -12,7 +12,7 @@ def __init__(self, min_score: int = 7, max_score: int = 100, llm_serving: LLMSer self.logger = get_logger() self.min_score = min_score self.max_score = max_score - self.scorer = TreeinstructScorer(llm_serving=llm_serving) + self.scorer = TreeinstructSampleEvaluator(llm_serving=llm_serving) self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}") @staticmethod diff --git a/dataflow/operators/general_text/generate/condor_generator.py b/dataflow/operators/text_sft/generate/condor_generator.py similarity index 100% rename from dataflow/operators/general_text/generate/condor_generator.py rename to dataflow/operators/text_sft/generate/condor_generator.py diff --git a/dataflow/operators/general_text/generate/sft_generator_from_seed.py b/dataflow/operators/text_sft/generate/sft_generator_from_seed.py similarity index 100% rename from dataflow/operators/general_text/generate/sft_generator_from_seed.py rename to dataflow/operators/text_sft/generate/sft_generator_from_seed.py diff --git a/dataflow/operators/general_text/refine/condor_refiner.py b/dataflow/operators/text_sft/refine/condor_refiner.py similarity index 100% rename from dataflow/operators/general_text/refine/condor_refiner.py rename to dataflow/operators/text_sft/refine/condor_refiner.py diff --git a/dataflow/prompts/general_text.py b/dataflow/prompts/general_text.py index 4816d48f..e8d6bcac 100644 --- a/dataflow/prompts/general_text.py +++ 
b/dataflow/prompts/general_text.py @@ -1,7 +1,7 @@ ''' A collection of prompts for the general text operator. ''' -class PretrainGeneratorPrompt: +class Phi4QAGeneratorPrompt: def __init__(self): pass diff --git a/dataflow/statics/core_text_pipeline/core_filter.py b/dataflow/statics/core_text_pipeline/core_filter.py index 6f1a80fe..566d350d 100644 --- a/dataflow/statics/core_text_pipeline/core_filter.py +++ b/dataflow/statics/core_text_pipeline/core_filter.py @@ -12,7 +12,7 @@ import pandas as pd from dataflow.operators.core_text import PromptedGenerator -from dataflow.operators.general_text import GeneralFilter +from dataflow.operators.core_text import GeneralFilter from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request from dataflow.prompts.core_filter import * diff --git a/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py b/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py index 36780e5a..9e0b611a 100644 --- a/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py +++ b/dataflow/statics/core_text_pipeline/core_sft_from_scratch.py @@ -12,7 +12,7 @@ """ from dataflow.operators.core_text import RandomDomainKnowledgeRowGenerator from dataflow.operators.core_text import PromptedGenerator -from dataflow.operators.general_text import GeneralFilter +from dataflow.operators.core_text import GeneralFilter from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request from dataflow.prompts.core_sft_from_scratch import * diff --git a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py index 4419132b..42f8d00c 100644 --- a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py +++ b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py @@ -1,7 +1,7 @@ -from dataflow.operators.general_text import AlpagasusFilter -from 
dataflow.operators.general_text import CondorGenerator -from dataflow.operators.general_text import CondorRefiner +from dataflow.operators.text_sft import AlpagasusFilter +from dataflow.operators.text_sft import CondorGenerator +from dataflow.operators.text_sft import CondorRefiner from dataflow.utils.storage import FileStorage from dataflow.serving import APILLMServing_request diff --git a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py index 9e6177fc..c19147c9 100644 --- a/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/cpu_pipelines/text_pt_filter.py @@ -1,8 +1,8 @@ from dataflow.operators.general_text import ( - MinHashDeduplicator, - ColonEndFilter, WordNumberFilter, BlocklistFilter, + MinHashDeduplicateFilter, + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -19,13 +19,14 @@ UniqueWordsFilter, CharNumberFilter, LineStartWithBulletpointFilter, - LineWithJavascriptFilter -) -from dataflow.operators.refine import ( + LineWithJavascriptFilter, HtmlUrlRemoverRefiner, RemoveEmojiRefiner, RemoveExtraSpacesRefiner ) +from dataflow.operators.text_pt import ( + MetaSampleEvaluator, +) from dataflow.utils.storage import FileStorage @@ -40,7 +41,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py 
b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py index 90938452..28754614 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_filter.py @@ -1,9 +1,9 @@ from dataflow.operators.general_text import ( - MinHashDeduplicator, + MinHashDeduplicateFilter, LanguageFilter, - ColonEndFilter, WordNumberFilter, BlocklistFilter, + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -21,12 +21,12 @@ CharNumberFilter, LineStartWithBulletpointFilter, LineWithJavascriptFilter, - PairQualFilter -) -from dataflow.operators.general_text import ( - HtmlUrlRemoverRefiner, + RemoveExtraSpacesRefiner, RemoveEmojiRefiner, - RemoveExtraSpacesRefiner + HtmlUrlRemoverRefiner, +) +from dataflow.operators.text_pt import ( + PairQualFilter ) from dataflow.utils.storage import FileStorage @@ -44,7 +44,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py index 46b36ed7..923b2732 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_pt_synthetic.py @@ -1,9 +1,9 @@ from dataflow.operators.general_text import ( - MinHashDeduplicator, + MinHashDeduplicateFilter, LanguageFilter, - ColonEndFilter, WordNumberFilter, BlocklistFilter, + ColonEndFilter, SentenceNumberFilter, 
LineEndWithEllipsisFilter, ContentNullFilter, @@ -21,15 +21,16 @@ CharNumberFilter, LineStartWithBulletpointFilter, LineWithJavascriptFilter, + RemoveExtraSpacesRefiner, + RemoveEmojiRefiner, + HtmlUrlRemoverRefiner, +) +from dataflow.operators.text_pt import ( PairQualFilter, QuratingFilter ) -from dataflow.operators.general_text import ( - HtmlUrlRemoverRefiner, - RemoveEmojiRefiner, - RemoveExtraSpacesRefiner -) -from dataflow.operators.general_text import PretrainGenerator + +from dataflow.operators.text_pt import Phi4QAGenerator from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage @@ -63,7 +64,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() @@ -85,7 +86,7 @@ def __init__(self): self.line_start_with_bulletpoint_filter = LineStartWithBulletpointFilter(threshold=0.9) self.line_with_javascript_filter = LineWithJavascriptFilter(threshold=3) self.quality_filter = PairQualFilter(min_score=-2, max_score=10000, lang='en') - self.pt_generator = PretrainGenerator( + self.pt_generator = Phi4QAGenerator( llm_serving=self.llm_serving ) self.qurating_filter = QuratingFilter(min_scores = {'writing_style': 0,'required_expertise': 0,'facts_and_trivia': 0,'educational_value': 0}, max_scores = {'writing_style': 9,'required_expertise': 9,'facts_and_trivia': 9,'educational_value': 9}) diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py 
b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py index f60a37bc..1761c29e 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_filter.py @@ -1,5 +1,7 @@ from dataflow.operators.general_text import ( WordNumberFilter, +) +from dataflow.operators.text_sft import ( SuperfilteringFilter, DeitaQualityFilter, InstagFilter diff --git a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py index 4c65124e..c5a55f33 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py +++ b/dataflow/statics/pipelines/gpu_pipelines/text_sft_synthetic.py @@ -1,9 +1,12 @@ from dataflow.operators.general_text import ( - MinHashDeduplicator, + MinHashDeduplicateFilter, LanguageFilter, - ColonEndFilter, WordNumberFilter, BlocklistFilter, + HtmlUrlRemoverRefiner, + RemoveEmojiRefiner, + RemoveExtraSpacesRefiner, + ColonEndFilter, SentenceNumberFilter, LineEndWithEllipsisFilter, ContentNullFilter, @@ -21,17 +24,16 @@ CharNumberFilter, LineStartWithBulletpointFilter, LineWithJavascriptFilter, +) +from dataflow.operators.text_pt import ( PairQualFilter, +) +from dataflow.operators.text_sft import ( SuperfilteringFilter, DeitaQualityFilter, - InstagFilter -) -from dataflow.operators.general_text import ( - HtmlUrlRemoverRefiner, - RemoveEmojiRefiner, - RemoveExtraSpacesRefiner + InstagFilter, ) -from dataflow.operators.general_text import SFTGeneratorSeed +from dataflow.operators.text_sft import SFTGeneratorSeed from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang from dataflow.utils.storage import FileStorage @@ -63,7 +65,7 @@ def __init__(self): self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner() self.remove_emoji_refiner = RemoveEmojiRefiner() self.html_remove_refiner = HtmlUrlRemoverRefiner() - self.minhash_deduplicator = MinHashDeduplicator(num_perm=128, threshold=0.9, 
use_n_gram=True, ngram=5) + self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5) self.blocklist_filter = BlocklistFilter() self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000) self.colon_end_filter = ColonEndFilter() diff --git a/dataflow/utils/registry.py b/dataflow/utils/registry.py index f0b9e5d1..deab8f93 100644 --- a/dataflow/utils/registry.py +++ b/dataflow/utils/registry.py @@ -205,8 +205,10 @@ def get_type_of_operator(self): 'core_text', 'core_vision', 'db', - 'general_text', 'knowledge_cleaning', + 'general_text', + 'text_pt', + 'text_sft', 'rare', 'reasoning', 'text2sql' diff --git a/test/test_autoop.py b/test/test_autoop.py index cb709b4d..2cefefa0 100644 --- a/test/test_autoop.py +++ b/test/test_autoop.py @@ -2,7 +2,7 @@ from dataflow.operators.general_text import ( LLMLanguageFilter, ) -from dataflow.operators.general_text import MetaScorer +from dataflow.operators.text_pt import MetaSampleEvaluator from dataflow.operators.core_text import PromptedGenerator from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm, LocalHostLLMAPIServing_vllm from dataflow.utils.storage import FileStorage