Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions configs/deepseek-r1-eagle3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"architectures": [
"DeepseekV3ForCausalLMEagle3"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 0,
"eos_token_id": 1,
"ep_size": 1,
"first_k_dense_replace": 3,
"hidden_act": "silu",
"hidden_size": 7168,
"initializer_range": 0.02,
"intermediate_size": 18432,
"kv_lora_rank": 512,
"max_position_embeddings": 2048,
"model_type": "deepseek_v3",
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"n_group": 8,
"n_routed_experts": 256,
"n_shared_experts": 1,
"norm_topk_prob": true,
"num_attention_heads": 128,
"num_experts_per_tok": 8,
"num_hidden_layers": 1,
"num_key_value_heads": 128,
"num_nextn_predict_layers": 1,
"pad_token_id": 0,
"q_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"beta_fast": 32,
"beta_slow": 1,
"factor": 40,
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
},
"rope_theta": 10000,
"routed_scaling_factor": 2.5,
"scoring_func": "sigmoid",
"tie_word_embeddings": false,
"topk_group": 4,
"topk_method": "noaux_tc",
"torch_dtype": "float16",
"transformers_version": "4.28.1",
"use_cache": true,
"v_head_dim": 128,
"vocab_size": 129280,
"draft_vocab_size": 32000
}
21 changes: 21 additions & 0 deletions examples/run_dpsk_r1_eagle3_offline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Train an EAGLE3 draft model for DeepSeek-R1 in offline mode
# (target-model hidden states are read from a pre-generated cache).
# Usage: run_dpsk_r1_eagle3_offline.sh [NUM_GPUS]
set -euo pipefail

# Resolve the repo root relative to this script so it works from any CWD.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname "$SCRIPT_DIR")

# Number of training GPUs; defaults to 8, overridable via the first argument.
NUM_GPUS=${1:-8}

# Paths are quoted so a checkout under a directory with spaces still works.
# NOTE: the original ended with a bare trailing backslash after the last
# argument, which would have swallowed any line appended later — removed.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3_offline.py" \
    --target-model-path deepseek-ai/DeepSeek-R1-0528 \
    --draft-model-config "$ROOT_DIR/configs/deepseek-r1-eagle3.json" \
    --train-data-path "$ROOT_DIR/cache/dataset/sharegpt.jsonl" \
    --train-hidden-states-path "$ROOT_DIR/cache/hidden_states/" \
    --output-dir "$ROOT_DIR/outputs/Deepseek-r1-eagle3" \
    --num-epochs 10 \
    --batch-size 2 \
    --learning-rate 1e-5 \
    --max-length 2048 \
    --chat-template deepseek_r1 \
    --cache-dir "$ROOT_DIR/cache"
31 changes: 31 additions & 0 deletions specforge/data/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,34 @@ def get_all_template_names(self) -> List[str]:
end_of_turn_token="<|end|>",
),
)

# Kimi K2 chat format: each role header is an explicit sentinel token pair,
# <|im_user|>/<|im_assistant|> naming the role, then <|im_middle|> before the
# message body.
TEMPLATE_REGISTRY.register(
    name="kimi_k2",
    template=ChatTemplate(
        system_prompt="You are a helpful assistant.",
        user_header="<|im_user|>user<|im_middle|>",
        assistant_header="<|im_assistant|>assistant<|im_middle|>",
        end_of_turn_token="<|im_end|>",
    ),
)

# DeepSeek-V3 chat format. NOTE: the end-of-turn token deliberately contains
# U+2581 ("▁") characters, not ASCII underscores — do not "fix" them, the
# tokenizer expects these exact bytes.
TEMPLATE_REGISTRY.register(
    name="deepseek_v3",
    template=ChatTemplate(
        system_prompt="You are a helpful assistant.",
        user_header="<|User|>",
        assistant_header="<|Assistant|>",
        end_of_turn_token="<|end▁of▁sentence|>",
    ),
)


# DeepSeek-R1 uses the same chat format as DeepSeek-V3; it is registered under
# its own name so callers can request it explicitly
# (e.g. `--chat-template deepseek_r1` in the training scripts).
TEMPLATE_REGISTRY.register(
    name="deepseek_r1",
    template=ChatTemplate(
        system_prompt="You are a helpful assistant.",
        user_header="<|User|>",
        assistant_header="<|Assistant|>",
        end_of_turn_token="<|end▁of▁sentence|>",
    ),
)
2 changes: 2 additions & 0 deletions specforge/modeling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from .auto import AutoDistributedTargetModel, AutoDraftModelConfig, AutoEagle3DraftModel
from .draft.llama3_eagle import LlamaForCausalLMEagle3
from .draft.deepseekv3_eagle import DeepseekV3ForCausalLMEagle3

# Public API re-exported at the `specforge.modeling` package level: the auto
# factory classes plus the concrete Eagle3 draft-model implementations.
__all__ = [
    "AutoDraftModelConfig",
    "AutoEagle3DraftModel",
    "AutoDistributedTargetModel",
    "LlamaForCausalLMEagle3",
    "DeepseekV3ForCausalLMEagle3",
]
4 changes: 4 additions & 0 deletions specforge/modeling/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
Qwen2Config,
Qwen3Config,
Qwen3MoeConfig,
DeepseekV3Config,
modeling_utils,
)

from specforge.utils import default_torch_dtype

from .draft.llama3_eagle import LlamaForCausalLMEagle3
from .draft.deepseekv3_eagle import DeepseekV3ForCausalLMEagle3
from .target.llama import LlamaForCausalLM
from .target.llama4 import Llama4ForCausalLM
from .target.phi3 import Phi3ForCausalLM
Expand All @@ -34,6 +36,7 @@ class AutoEagle3DraftModel(AutoModelForCausalLMBase):
# the model mapping is currently hardcoded, we should support lazy model mapping via registry
_model_mapping = {
LlamaConfig: LlamaForCausalLMEagle3,
DeepseekV3Config: DeepseekV3ForCausalLMEagle3,
}

@classmethod
Expand Down Expand Up @@ -134,6 +137,7 @@ class AutoDraftModelConfig:

_config_mapping = {
"LlamaForCausalLMEagle3": LlamaConfig,
"DeepseekV3ForCausalLMEagle3": DeepseekV3Config,
}

@classmethod
Expand Down
3 changes: 2 additions & 1 deletion specforge/modeling/draft/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Draft-model subpackage: base class plus concrete Eagle3 implementations.
from .base import Eagle3DraftModel
from .deepseekv3_eagle import DeepseekV3ForCausalLMEagle3
from .llama3_eagle import LlamaForCausalLMEagle3

# Public API of the subpackage. The pasted chunk contained two consecutive
# `__all__` assignments (a stale one immediately overwritten); only the
# final, complete value is kept.
__all__ = ["Eagle3DraftModel", "LlamaForCausalLMEagle3", "DeepseekV3ForCausalLMEagle3"]
Loading
Loading