Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion dataflow/operators/core_text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@
from .generate.prompted_generator import PromptedGenerator
from .generate.paired_prompted_generator import PairedPromptedGenerator
from .generate.random_domain_knowledge_row_generator import RandomDomainKnowledgeRowGenerator

from .eval.prompted_eval import PromptedEvaluator
from .filter.prompted_filter import PromptedFilter

from .refine.prompted_refiner import PromptedRefiner

from .filter.prompted_filter import PromptedFilter
from .filter.general_filter import GeneralFilter
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
Expand Down
121 changes: 40 additions & 81 deletions dataflow/operators/general_text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,63 +2,42 @@

if TYPE_CHECKING:
# filter
from .filter.alpagasus_filter import AlpagasusFilter
from .filter.ccnet_deduplicator import CCNetDeduplicator
from .filter.debertav3_filter import DebertaV3Filter
from .filter.deita_complexity_filter import DeitaComplexityFilter
from .filter.deita_quality_filter import DeitaQualityFilter
from .filter.fineweb_edu_filter import FineWebEduFilter
from .filter.general_filter import GeneralFilter
from .filter.hash_deduplicator import HashDeduplicator
from .filter.heuristics import ColonEndFilter
from .filter.heuristics import WordNumberFilter
from .filter.heuristics import SentenceNumberFilter
from .filter.heuristics import LineEndWithEllipsisFilter
from .filter.heuristics import ContentNullFilter
from .filter.heuristics import SymbolWordRatioFilter
from .filter.heuristics import AlphaWordsFilter
from .filter.heuristics import HtmlEntityFilter
from .filter.heuristics import IDCardFilter
from .filter.heuristics import NoPuncFilter
from .filter.heuristics import SpecialCharacterFilter
from .filter.heuristics import WatermarkFilter
from .filter.heuristics import MeanWordLengthFilter
from .filter.heuristics import StopWordFilter
from .filter.heuristics import CurlyBracketFilter
from .filter.heuristics import CapitalWordsFilter
from .filter.heuristics import LoremIpsumFilter
from .filter.heuristics import UniqueWordsFilter
from .filter.heuristics import CharNumberFilter
from .filter.heuristics import LineStartWithBulletpointFilter
from .filter.heuristics import LineWithJavascriptFilter
from .filter.heuristics import BlocklistFilter
from .filter.instag_filter import InstagFilter
from .filter.heuristics_filter import ColonEndFilter
from .filter.heuristics_filter import SentenceNumberFilter
from .filter.heuristics_filter import LineEndWithEllipsisFilter
from .filter.heuristics_filter import ContentNullFilter
from .filter.heuristics_filter import SymbolWordRatioFilter
from .filter.heuristics_filter import AlphaWordsFilter
from .filter.heuristics_filter import HtmlEntityFilter
from .filter.heuristics_filter import IDCardFilter
from .filter.heuristics_filter import NoPuncFilter
from .filter.heuristics_filter import SpecialCharacterFilter
from .filter.heuristics_filter import WatermarkFilter
from .filter.heuristics_filter import MeanWordLengthFilter
from .filter.heuristics_filter import StopWordFilter
from .filter.heuristics_filter import CurlyBracketFilter
from .filter.heuristics_filter import CapitalWordsFilter
from .filter.heuristics_filter import LoremIpsumFilter
from .filter.heuristics_filter import UniqueWordsFilter
from .filter.heuristics_filter import CharNumberFilter
from .filter.heuristics_filter import LineStartWithBulletpointFilter
from .filter.heuristics_filter import LineWithJavascriptFilter
from .filter.langkit_filter import LangkitFilter
from .filter.language_filter import LanguageFilter
from .filter.lexical_diversity_filter import LexicalDiversityFilter
from .filter.llm_language_filter import LLMLanguageFilter
from .filter.minhash_deduplicator import MinHashDeduplicator
from .filter.ngram_filter import NgramFilter
from .filter.ngramhash_deduplicator import NgramHashDeduplicator
from .filter.pair_qual_filter import PairQualFilter
from .filter.perplexity_filter import PerplexityFilter
from .filter.perspective_filter import PerspectiveFilter
from .filter.presidio_filter import PresidioFilter
from .filter.qurating_filter import QuratingFilter
from .filter.reward_model_filter import RMFilter
from .filter.sem_deduplicator import SemDeduplicator
from .filter.simhash_deduplicator import SimHashDeduplicator
from .filter.superfiltering_filter import SuperfilteringFilter
from .filter.text_book_filter import TextbookFilter
from .filter.treeinstruct_filter import TreeinstructFilter

# generate
from .generate.condor_generator import CondorGenerator
from .generate.pretrain_generator import PretrainGenerator
from .generate.sft_generator_from_seed import SFTGeneratorSeed
from .filter.blocklist_filter import BlocklistFilter
from .filter.hash_deduplicate_filter import HashDeduplicateFilter
from .filter.language_filter import LanguageFilter
from .filter.llm_language_filter import LLMLanguageFilter
from .filter.minhash_deduplicate_filter import MinHashDeduplicateFilter
from .filter.ngramhash_deduplicate_filter import NgramHashDeduplicateFilter
from .filter.perspective_filter import PerspectiveFilter
from .filter.sem_deduplicate_filter import SemDeduplicateFilter
from .filter.simhash_deduplicate_filter import SimHashDeduplicateFilter
from .filter.word_number_filter import WordNumberFilter

# refine
from .refine.condor_refiner import CondorRefiner
from .refine.html_entity_refiner import HtmlEntityRefiner
from .refine.html_url_remover_refiner import HtmlUrlRemoverRefiner
from .refine.lowercase_refiner import LowercaseRefiner
Expand All @@ -79,42 +58,22 @@
from .refine.text_normalization_refiner import TextNormalizationRefiner

# eval
from .eval.statistics.ngram_scorer import NgramScorer
from .eval.statistics.lexical_diversity_scorer import LexicalDiversityScorer
from .eval.statistics.langkit_scorer import LangkitScorer
from .eval.ngram_sample_evaluator import NgramSampleEvaluator
from .eval.lexical_diversity_sample_evaluator import LexicalDiversitySampleEvaluator
from .eval.langkit_sample_evaluator import LangkitSampleEvaluator
from .eval.presidio_sample_evaluator import PresidioSampleEvaluator
from .eval.bert_sample_evaluator import BertSampleEvaluator
from .eval.bleu_sample_evaluator import BleuSampleEvaluator
from .eval.cider_sample_evaluator import CiderSampleEvaluator
from .eval.perspective_sample_evaluator import PerspectiveSampleEvaluator
from .eval.task2vec_dataset_evaluator import Task2VecDatasetEvaluator
from .eval.vendi_dataset_evaluator import VendiDatasetEvaluator

from .eval.models.deita_quality_scorer import DeitaQualityScorer
from .eval.models.instag_scorer import InstagScorer
from .eval.models.debertav3_scorer import DebertaV3Scorer
from .eval.models.deita_complexity_scorer import DeitaComplexityScorer
from .eval.models.fineweb_edu_scorer import FineWebEduScorer
from .eval.models.pair_qual_scorer import PairQualScorer
from .eval.models.presidio_scorer import PresidioScorer
from .eval.models.rm_scorer import RMScorer
from .eval.models.textbook_scorer import TextbookScorer
from .eval.models.superfiltering_scorer import SuperfilteringScorer
from .eval.models.qurating_scorer import QuratingScorer
from .eval.models.perplexity_scorer import PerplexityScorer

from .eval.APIcaller.alpagasus_scorer import AlpagasusScorer
from .eval.APIcaller.treeinstruct_scorer import TreeinstructScorer
from .eval.APIcaller.perspective_scorer import PerspectiveScorer
from .eval.APIcaller.meta_scorer import MetaScorer

from .eval.diversity.vendi_scorer import VendiScorer
from .eval.diversity.task2vec_scorer import Task2VecScorer

from .eval.gen.bert_scorer import BERTScorer
from .eval.gen.bleu_scorer import BleuScorer
from .eval.gen.cider_scorer import CiderScorer
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking



cur_path = "dataflow/operators/general_text/"


_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_text/", _import_structure)
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import evaluate

@OPERATOR_REGISTRY.register()
class BERTScorer(OperatorABC):
class BertSampleEvaluator(OperatorABC):
def __init__(self, lang='en', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.operators.general_text.eval.gen.bleu.bleu import Bleu
from dataflow.operators.general_text.eval.bleu.bleu import Bleu
from tqdm import tqdm

@OPERATOR_REGISTRY.register()
class BleuScorer(OperatorABC):
class BleuSampleEvaluator(OperatorABC):
def __init__(self, n=4, eff="average", special_reflen=None):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.operators.general_text.eval.gen.cider.cider import Cider
from dataflow.operators.general_text.eval.cider.cider import Cider

def load_idf(idf_path):
with open(idf_path, 'rb') as f:
idf = pickle.load(f, encoding='utf-8')
return idf

@OPERATOR_REGISTRY.register()
class CiderScorer(OperatorABC):
def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/eval/GeneralText/gen/cider/coco-val-df.p"):
class CiderSampleEvaluator(OperatorABC):
def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/general_pt/eval/cider/coco-val-df.p"):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.score_name = 'CiderScore'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tqdm import tqdm

@OPERATOR_REGISTRY.register()
class LangkitScorer(OperatorABC):
class LangkitSampleEvaluator(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down Expand Up @@ -63,7 +63,7 @@ def _score_func(self, sample):
def eval(self, dataframe, input_key):
scores_list = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="LangkitScorer Evaluating..."):
for sample in tqdm(dataframe[input_key], desc="LangkitScore Evaluating..."):
scores = self._score_func(sample)
scores_list.append(scores)
self.logger.info("Evaluation complete!")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def hdd(word_array, sample_size=42.0):


@OPERATOR_REGISTRY.register()
class LexicalDiversityScorer(OperatorABC):
class LexicalDiversitySampleEvaluator(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down Expand Up @@ -145,7 +145,7 @@ def _score_func(self, sample):
def eval(self, dataframe, input_key):
scores_list = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="LexicalDiversityScorer Evaluating..."):
for sample in tqdm(dataframe[input_key], desc="LexicalDiversityScore Evaluating..."):
scores = self._score_func(sample)
scores_list.append(scores)
self.logger.info("Evaluation complete!")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dataflow import get_logger

@OPERATOR_REGISTRY.register()
class NgramScorer(OperatorABC):
class NgramSampleEvaluator(OperatorABC):

def __init__(self, ngrams=5):
self.logger = get_logger()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dataflow.serving import PerspectiveAPIServing

@OPERATOR_REGISTRY.register()
class PerspectiveScorer(OperatorABC):
class PerspectiveSampleEvaluator(OperatorABC):
"""Operator that assigns Perspective API toxicity scores to text inputs."""
def __init__(self, serving: PerspectiveAPIServing = None):
self.logger = get_logger()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# Presidio PII detection Scorer
@OPERATOR_REGISTRY.register()
class PresidioScorer(OperatorABC):
class PresidioSampleEvaluator(OperatorABC):
def __init__(self, device='cuda', lang='en', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataflow.operators.general_text.eval.diversity.task2vec.task2vec import Task2Vec
from dataflow.operators.general_text.eval.diversity.task2vec import task_similarity
from dataflow.operators.general_text.eval.task2vec.task2vec import Task2Vec
from dataflow.operators.general_text.eval.task2vec import task_similarity
import torch
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel
Expand All @@ -12,7 +12,7 @@
# Task2Vec dataset diversity evaluation
# Cited from: Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data
@OPERATOR_REGISTRY.register()
class Task2VecScorer(OperatorABC):
class Task2VecDatasetEvaluator(OperatorABC):
def __init__(self, device='cuda', sample_nums=10, sample_size=1, method: Optional[str]='montecarlo', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# VendiScore dataset diversity evaluation
# Cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning
@OPERATOR_REGISTRY.register()
class VendiScorer(OperatorABC):
class VendiDatasetEvaluator(OperatorABC):
def __init__(self, device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
Expand Down
82 changes: 82 additions & 0 deletions dataflow/operators/general_text/filter/blocklist_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from tqdm import tqdm
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.cli_funcs.paths import DataFlowPath
from nltk.tokenize import word_tokenize

@OPERATOR_REGISTRY.register()
class BlocklistFilter(OperatorABC):
    """Filter rows whose text contains too many blocklisted words.

    A language-specific blocklist is loaded from the package data directory;
    each row is tokenized (NLTK or whitespace) and kept only if the number of
    blocklisted tokens does not exceed ``threshold``.
    """

    def __init__(self, language: str = 'en', threshold: int = 1, use_tokenizer: bool = False):
        # language: blocklist file to load (operators/general_text/filter/blocklist/<language>.txt)
        # threshold: maximum number of blocklisted tokens a row may contain and still pass
        # use_tokenizer: True -> nltk.word_tokenize; False -> simple whitespace split
        self.logger = get_logger()
        self.language = language
        self.threshold = threshold
        self.use_tokenizer = use_tokenizer
        self.logger.info(f"Initializing {self.__class__.__name__} with language = {self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
        self.blocklist = self.load_blocklist()

    @staticmethod
    def get_desc(lang: str = "zh"):
        # NOTE: descriptions kept consistent with the actual __init__/run signatures
        # (the operator has no blocklist_dir/tokenizer parameters).
        if lang == "zh":
            return (
                "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n"
                "输入参数:\n"
                "- input_key:输入文本字段名\n"
                "- language:语言代码,默认为'en'\n"
                "- threshold:匹配次数阈值,默认为1\n"
                "- use_tokenizer:是否使用NLTK分词器,默认为False\n"
                "输出参数:\n"
                "- 过滤后的DataFrame,仅保留阻止列表关键词数量不超过阈值的文本行\n"
                "- 返回包含输出字段名的列表,用于后续算子引用"
            )
        elif lang == "en":
            return (
                "This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n"
                "Input Parameters:\n"
                "- input_key: Input text field name\n"
                "- language: Language code, default is 'en'\n"
                "- threshold: Matching count threshold, default is 1\n"
                "- use_tokenizer: Whether to use the NLTK tokenizer, default is False\n"
                "Output Parameters:\n"
                "- Filtered DataFrame containing only rows whose blocklist-word count does not exceed the threshold\n"
                "- List containing output field name for subsequent operator reference"
            )
        else:
            return "BlocklistFilter uses language-specific blocklists with optional tokenizer integration."

    def load_blocklist(self):
        """Load the blocklist for ``self.language`` as a lowercased set of words."""
        dataflow_dir = DataFlowPath.get_dataflow_dir()
        file_path = f"{dataflow_dir}/operators/general_text/filter/blocklist/{self.language}.txt"
        self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as file:
            blocklist = set(line.strip().lower() for line in file if line.strip())
        self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.")
        return blocklist

    def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'):
        """Label every row, write back only the rows that pass the blocklist check.

        Returns a list containing ``output_key`` for downstream operators.
        """
        self.input_key = input_key
        self.output_key = output_key
        dataframe = storage.read("dataframe")
        self.logger.info(f"Running {self.__class__.__name__}...")
        valid_checks = []
        for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
            if text:
                if self.use_tokenizer:
                    tokens = word_tokenize(text.lower())
                else:
                    tokens = text.lower().split()
                blocklist_count = sum(1 for word in tokens if word in self.blocklist)
                valid_checks.append(blocklist_count <= self.threshold)
            else:
                # BUGFIX: the original skipped the append for empty/None text,
                # leaving valid_checks shorter than the dataframe and crashing the
                # column assignment below. Empty text contains zero blocklisted
                # words, so it trivially passes the threshold.
                valid_checks.append(True)
        valid_checks = np.array(valid_checks, dtype=int)
        dataframe[self.output_key] = valid_checks
        filtered_dataframe = dataframe[valid_checks == 1]
        storage.write(filtered_dataframe)
        self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
        return [self.output_key]
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dataflow.utils.registry import OPERATOR_REGISTRY

@OPERATOR_REGISTRY.register()
class HashDeduplicator(OperatorABC):
class HashDeduplicateFilter(OperatorABC):
def __init__(self, hash_func: str = 'md5'):
self.logger = get_logger()
self.hash_func = hash_func
Expand Down
Loading