From 53733c79e7da4c290c5943d8c2499965175104c6 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Wed, 28 May 2025 18:45:35 -0700 Subject: [PATCH 1/5] Add community class StableDiffusionXL_T5Pipeline Will be used with base model opendiffusionai/stablediffusionxl_t5 --- .../pipeline_stable_diffusion_xl_t5.py | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 examples/community/pipeline_stable_diffusion_xl_t5.py diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py new file mode 100644 index 000000000000..c22ac5b38bc2 --- /dev/null +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -0,0 +1,186 @@ +# Copyright Philip Brown, ppbrown@github +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: At this time, the intent is to use the T5 encoder mentioned +# below, with zero changes. +# Therefore, the model deliberately does not store the T5 encoder model bytes, +# (Since they are not unique!) +# but instead takes advantage of huggingface hub cache loading + +T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly" + + +# Caller is expected to load this, or equivalent, as model name for now +# eg: pipe = StableDiffusionXL_T5Pipeline(SDXL_NAME) +SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0" + + + +from diffusers import StableDiffusionXLPipeline, DiffusionPipeline +from transformers import T5Tokenizer, T5EncoderModel +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor + + +from typing import Optional + +import torch.nn as nn, torch, types + +import torch.nn as nn + +class LinearWithDtype(nn.Linear): + @property + def dtype(self): + return self.weight.dtype + + +class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline): + _expected_modules = [ + "vae", "unet", "scheduler", "tokenizer", + "image_encoder", "feature_extractor", + "t5_encoder", "t5_projection", + ] + + _optional_components = [ + "image_encoder", "feature_extractor", + "t5_encoder", "t5_projection", + ] + + def __init__( + self, + vae: AutoencoderKL, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + tokenizer: CLIPTokenizer, + t5_encoder=None, + t5_projection=None, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + DiffusionPipeline.__init__(self) + + if t5_encoder is None: + self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, + torch_dtype=unet.dtype) + else: + self.t5_encoder = t5_encoder + + # ----- build T5 4096 => 2048 dim projection ----- + if t5_projection is None: + self.t5_projection = LinearWithDtype(4096, 2048) # trainable + else: +
self.t5_projection = t5_projection + self.t5_projection.to(dtype=unet.dtype) + + print("dtype of Linear is ",self.t5_projection.dtype) + + self.register_modules( + vae=vae, + unet=unet, + scheduler=scheduler, + tokenizer=tokenizer, + t5_encoder=self.t5_encoder, + t5_projection=self.t5_projection, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = ( + self.unet.config.sample_size + if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size") + else 128 + ) + + self.watermark = None + + # Parts of original SDXL class complain if these attributes are not + # at least PRESENT + self.text_encoder = self.text_encoder_2 = None + + # ------------------------------------------------------------------ + # Encode a text prompt (T5-XXL + 4096→2048 projection) + # Returns exactly four tensors in the order SDXL’s __call__ expects. + # ------------------------------------------------------------------ + def encode_prompt( + self, + prompt, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: str | None = None, + **_, + ): + """ + Returns + ------- + prompt_embeds : Tensor [B, T, 2048] + negative_prompt_embeds : Tensor [B, T, 2048] | None + pooled_prompt_embeds : Tensor [B, 1280] + negative_pooled_prompt_embeds: Tensor [B, 1280] | None + where B = batch * num_images_per_prompt + """ + + # --- helper to tokenize on the pipeline’s device ---------------- + def _tok(text: str): + tok_out = self.tokenizer( + text, + return_tensors="pt", + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + ).to(self.device) + return tok_out.input_ids, tok_out.attention_mask + + # ---------- positive stream ------------------------------------- + ids, mask = _tok(prompt) + h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] + tok_pos = self.t5_projection(h_pos) # [b, T, 2048] + pool_pos = tok_pos.mean(dim=1)[:, :1280] # [b, 1280] + + # expand for multiple images per prompt + tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) + pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) + + # ---------- negative / CFG stream -------------------------------- + if do_classifier_free_guidance: + neg_text = "" if negative_prompt is None else negative_prompt + ids_n, mask_n = _tok(neg_text) + h_neg = self.t5_encoder(ids_n, attention_mask=mask_n).last_hidden_state + tok_neg = self.t5_projection(h_neg) + pool_neg = tok_neg.mean(dim=1)[:, :1280] + + tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) + pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0) + else: + tok_neg = pool_neg = None + + # ----------------- final ordered return -------------------------- + # 1) positive token embeddings + # 2) negative token embeddings (or None) + # 3) positive pooled embeddings + # 4) negative pooled embeddings (or None) + return tok_pos, tok_neg, pool_pos, pool_neg From 23daa8182bf77c082d49d6b07950bffadc89cb41 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Thu, 29 May 2025 08:32:44 -0700 Subject: [PATCH 2/5] Changed pooled_embeds to use projection instead of slice --- .../pipeline_stable_diffusion_xl_t5.py | 20 +++++++++++++------ 1 file 
changed, 14 insertions(+), 6 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index c22ac5b38bc2..7434c90bff38 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -58,12 +58,12 @@ class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline): _expected_modules = [ "vae", "unet", "scheduler", "tokenizer", "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", + "t5_encoder", "t5_projection", "t5_pooled_projection", ] _optional_components = [ "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", + "t5_encoder", "t5_projection", "t5_pooled_projection", ] def __init__( @@ -74,6 +74,7 @@ def __init__( tokenizer: CLIPTokenizer, t5_encoder=None, t5_projection=None, + t5_pooled_projection=None, image_encoder: CLIPVisionModelWithProjection = None, feature_extractor: CLIPImageProcessor = None, force_zeros_for_empty_prompt: bool = True, @@ -93,6 +94,12 @@ def __init__( else: self.t5_projection = t5_projection self.t5_projection.to(dtype=unet.dtype) + # ----- build T5 4096 => 1280 dim projection ----- + if t5_pooled_projection is None: + self.t5_pooled_projection = LinearWithDtype(4096, 1280) # trainable + else: + self.t5_pooled_projection = t5_pooled_projection + self.t5_pooled_projection.to(dtype=unet.dtype) print("dtype of Linear is ",self.t5_projection.dtype) @@ -103,6 +110,7 @@ def __init__( tokenizer=tokenizer, t5_encoder=self.t5_encoder, t5_projection=self.t5_projection, + t5_pooled_projection=self.t5_pooled_projection, image_encoder=image_encoder, feature_extractor=feature_extractor, ) @@ -157,9 +165,9 @@ def _tok(text: str): # ---------- positive stream ------------------------------------- ids, mask = _tok(prompt) - h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] - tok_pos = self.t5_projection(h_pos) # [b, T, 2048] - pool_pos = tok_pos.mean(dim=1)[:, :1280] # [b, 1280] + h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] + tok_pos = self.t5_projection(h_pos) # [b, T, 2048] + pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1)) # [b, 1280] # expand for multiple images per prompt tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) @@ -171,7 +179,7 @@ def _tok(text: str): ids_n, mask_n = _tok(neg_text) h_neg = self.t5_encoder(ids_n, attention_mask=mask_n).last_hidden_state tok_neg = self.t5_projection(h_neg) - pool_neg = tok_neg.mean(dim=1)[:, :1280] + pool_neg = self.t5_pooled_projection(h_neg.mean(dim=1)) tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0) From 685ad1bc0f4c382776823c001374c686edb58ea2 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Mon, 9 Jun 2025 11:23:37 -0700 Subject: [PATCH 3/5] "make style" tweaks --- .../pipeline_stable_diffusion_xl_t5.py | 84 ++++++++++--------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index 7434c90bff38..4cde556710f3 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -12,41 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Note: At this time, the intent is to use the T5 encoder mentioned -# below, with zero changes.
-# Therefore, the model deliberately does not store the T5 encoder model bytes, -# (Since they are not unique!) -# but instead takes advantage of huggingface hub cache loading - -T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly" - - -# Caller is expected to load this, or equivalent, as model name for now -# eg: pipe = StableDiffusionXL_T5Pipeline(SDXL_NAME) -SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0" - +from typing import Optional -from diffusers import StableDiffusionXLPipeline, DiffusionPipeline -from transformers import T5Tokenizer, T5EncoderModel +import torch.nn as nn from transformers import ( CLIPImageProcessor, - CLIPTextModel, - CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection, + T5EncoderModel, ) -from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from diffusers import DiffusionPipeline, StableDiffusionXLPipeline +from diffusers.image_processor import VaeImageProcessor +from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from typing import Optional +# Note: At this time, the intent is to use the T5 encoder mentioned +# below, with zero changes. +# Therefore, the model deliberately does not store the T5 encoder model bytes, +# (Since they are not unique!) +# but instead takes advantage of huggingface hub cache loading -import torch.nn as nn, torch, types +T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly" + +# Caller is expected to load this, or equivalent, as model name for now +# eg: pipe = StableDiffusionXL_T5Pipeline(SDXL_NAME) +SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0" -import torch.nn as nn class LinearWithDtype(nn.Linear): @property @@ -56,14 +50,23 @@ def dtype(self): class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline): _expected_modules = [ - "vae", "unet", "scheduler", "tokenizer", - "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", "t5_pooled_projection", + "vae", + "unet", + "scheduler", + "tokenizer", + "image_encoder", + "feature_extractor", + "t5_encoder", + "t5_projection", + "t5_pooled_projection", ] _optional_components = [ - "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", "t5_pooled_projection", + "image_encoder", + "feature_extractor", + "t5_encoder", + "t5_projection", + "t5_pooled_projection", ] def __init__( @@ -83,25 +86,24 @@ def __init__( DiffusionPipeline.__init__(self) if t5_encoder is None: - self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, - torch_dtype=unet.dtype) + self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, torch_dtype=unet.dtype) else: - self.t5_encoder = t5_encoder + self.t5_encoder = t5_encoder # ----- build T5 4096 => 2048 dim projection ----- if t5_projection is None: - self.t5_projection = LinearWithDtype(4096, 2048) # trainable + self.t5_projection = LinearWithDtype(4096, 2048) # trainable else: - self.t5_projection = t5_projection + self.t5_projection = t5_projection self.t5_projection.to(dtype=unet.dtype) # ----- build T5 4096 => 1280 dim projection ----- if t5_pooled_projection is None: - self.t5_pooled_projection = LinearWithDtype(4096, 1280) # trainable + self.t5_pooled_projection = LinearWithDtype(4096, 1280) # trainable else: - self.t5_pooled_projection = t5_pooled_projection + self.t5_pooled_projection = t5_pooled_projection self.t5_pooled_projection.to(dtype=unet.dtype) - print("dtype of Linear is ",self.t5_projection.dtype) +
print("dtype of Linear is ", self.t5_projection.dtype) self.register_modules( vae=vae, @@ -165,13 +167,13 @@ def _tok(text: str): # ---------- positive stream ------------------------------------- ids, mask = _tok(prompt) - h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] - tok_pos = self.t5_projection(h_pos) # [b, T, 2048] - pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1)) # [b, 1280] + h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] + tok_pos = self.t5_projection(h_pos) # [b, T, 2048] + pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1)) # [b, 1280] # expand for multiple images per prompt - tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) - pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) + tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) + pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) # ---------- negative / CFG stream -------------------------------- if do_classifier_free_guidance: @@ -181,7 +183,7 @@ def _tok(text: str): tok_neg = self.t5_projection(h_neg) pool_neg = self.t5_pooled_projection(h_neg.mean(dim=1)) - tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) + tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0) else: tok_neg = pool_neg = None From 53c20386990401da533ad54239f9b77a008c2f68 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Mon, 9 Jun 2025 11:31:44 -0700 Subject: [PATCH 4/5] Added comments to top of code --- examples/community/pipeline_stable_diffusion_xl_t5.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index 4cde556710f3..1c3f84794b8c 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -12,6 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +########################################################################### +# This pipeline attempts to use a model that has SDXL vae, T5 text encoder, +# and SDXL unet. +# At the present time, there are no pretrained models that give pleasing +# output. So as yet, (2025/06/10) this pipeline is somewhat of a tech +# demo proving that the pieces can at least be put together. +# Hopefully, it will encourage someone with the hardware available to +# throw enough resources into training one up. + from typing import Optional From 3b8f13b0927b986e6d3f6066616eb6751546ac47 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 9 Jun 2025 19:50:45 +0000 Subject: [PATCH 5/5] Apply style fixes --- examples/community/pipeline_stable_diffusion_xl_t5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index 1c3f84794b8c..7659bd5bc832 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -13,8 +13,8 @@ # limitations under the License. ########################################################################### -# This pipeline attempts to use a model that has SDXL vae, T5 text encoder, -# and SDXL unet. +# This pipeline attempts to use a model that has SDXL vae, T5 text encoder, +# and SDXL unet. 
# At the present time, there are no pretrained models that give pleasing # output. So as yet, (2025/06/10) this pipeline is somewhat of a tech # demo proving that the pieces can at least be put together.
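Usage sketch (not part of the patch series above). This is a minimal example of driving the pipeline, assuming the standard diffusers custom-pipeline loading path and the checkpoint named in the first commit message (opendiffusionai/stablediffusionxl_t5); the prompt and generation settings are illustrative only:

    import torch
    from diffusers import DiffusionPipeline

    # Load the checkpoint together with the community pipeline class added by
    # this patch series. "custom_pipeline" may name the community pipeline
    # file or point at a local copy of it.
    pipe = DiffusionPipeline.from_pretrained(
        "opendiffusionai/stablediffusionxl_t5",
        custom_pipeline="pipeline_stable_diffusion_xl_t5",
        torch_dtype=torch.float16,
    )
    pipe.to("cuda")

    # encode_prompt() returns (tok_pos, tok_neg, pool_pos, pool_neg): the
    # positive/negative T5 token embeddings projected to 2048 dims and the
    # pooled embeddings projected to 1280 dims, in the order SDXL's
    # __call__ expects, so the inherited text-to-image call works unchanged.
    image = pipe(
        "a photograph of an astronaut riding a horse",
        num_inference_steps=30,
        guidance_scale=7.0,
    ).images[0]
    image.save("astronaut.png")

Note that because the T5 encoder weights are fetched from the hub cache (see T5_NAME at the top of the file) and the two Linear projections are freshly initialized when not supplied, output quality depends entirely on whether the loaded checkpoint provides trained t5_projection and t5_pooled_projection weights.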