From 53733c79e7da4c290c5943d8c2499965175104c6 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Wed, 28 May 2025 18:45:35 -0700 Subject: [PATCH 1/5] Add community class StableDiffusionXL_T5Pipeline Will be used with base model opendiffusionai/stablediffusionxl_t5 --- .../pipeline_stable_diffusion_xl_t5.py | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 examples/community/pipeline_stable_diffusion_xl_t5.py diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py new file mode 100644 index 000000000000..c22ac5b38bc2 --- /dev/null +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -0,0 +1,186 @@ +# Copyright Philip Brown, ppbrown@github +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: At this time, the intent is to use the T5 encoder mentioned +# below, with zero changes. +# Therefore, the model deliberately does not store the T5 encoder model bytes, +# (Since they are not unique!) +# but instead takes advantage of huggingface hub cache loading + +T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly" + + +# Caller is expected to load this, or equivalent, as model name for now +# eg: pipe = StableDiffusionXL_T5Pipeline(SDXL_NAME) +SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0" + + + +from diffusers import StableDiffusionXLPipeline, DiffusionPipeline +from transformers import T5Tokenizer, T5EncoderModel +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor + + +from typing import Optional + +import torch.nn as nn, torch, types + +import torch.nn as nn + +class LinearWithDtype(nn.Linear): + @property + def dtype(self): + return self.weight.dtype + + +class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline): + _expected_modules = [ + "vae", "unet", "scheduler", "tokenizer", + "image_encoder", "feature_extractor", + "t5_encoder", "t5_projection", + ] + + _optional_components = [ + "image_encoder", "feature_extractor", + "t5_encoder", "t5_projection", + ] + + def __init__( + self, + vae: AutoencoderKL, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + tokenizer: CLIPTokenizer, + t5_encoder=None, + t5_projection=None, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + DiffusionPipeline.__init__(self) + + if t5_encoder is None: + self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, + torch_dtype=unet.dtype) + else: + self.t5_encoder = t5_encoder + + # ----- build T5 4096 => 2048 dim projection ----- + if t5_projection is None: + self.t5_projection = LinearWithDtype(4096, 2048) # trainable + else: +
self.t5_projection = t5_projection + self.t5_projection.to(dtype=unet.dtype) + + print("dtype of Linear is ",self.t5_projection.dtype) + + self.register_modules( + vae=vae, + unet=unet, + scheduler=scheduler, + tokenizer=tokenizer, + t5_encoder=self.t5_encoder, + t5_projection=self.t5_projection, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = ( + self.unet.config.sample_size + if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size") + else 128 + ) + + self.watermark = None + + # Parts of original SDXL class complain if these attributes are not + # at least PRESENT + self.text_encoder = self.text_encoder_2 = None + + # ------------------------------------------------------------------ + # Encode a text prompt (T5-XXL + 4096→2048 projection) + # Returns exactly four tensors in the order SDXL’s __call__ expects. + # ------------------------------------------------------------------ + def encode_prompt( + self, + prompt, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: str | None = None, + **_, + ): + """ + Returns + ------- + prompt_embeds : Tensor [B, T, 2048] + negative_prompt_embeds : Tensor [B, T, 2048] | None + pooled_prompt_embeds : Tensor [B, 1280] + negative_pooled_prompt_embeds: Tensor [B, 1280] | None + where B = batch * num_images_per_prompt + """ + + # --- helper to tokenize on the pipeline’s device ---------------- + def _tok(text: str): + tok_out = self.tokenizer( + text, + return_tensors="pt", + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + ).to(self.device) + return tok_out.input_ids, tok_out.attention_mask + + # ---------- positive stream ------------------------------------- + ids, mask = _tok(prompt) + h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] + tok_pos = self.t5_projection(h_pos) # [b, T, 2048] + pool_pos = tok_pos.mean(dim=1)[:, :1280] # [b, 1280] + + # expand for multiple images per prompt + tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) + pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) + + # ---------- negative / CFG stream -------------------------------- + if do_classifier_free_guidance: + neg_text = "" if negative_prompt is None else negative_prompt + ids_n, mask_n = _tok(neg_text) + h_neg = self.t5_encoder(ids_n, attention_mask=mask_n).last_hidden_state + tok_neg = self.t5_projection(h_neg) + pool_neg = tok_neg.mean(dim=1)[:, :1280] + + tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) + pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0) + else: + tok_neg = pool_neg = None + + # ----------------- final ordered return -------------------------- + # 1) positive token embeddings + # 2) negative token embeddings (or None) + # 3) positive pooled embeddings + # 4) negative pooled embeddings (or None) + return tok_pos, tok_neg, pool_pos, pool_neg From 23daa8182bf77c082d49d6b07950bffadc89cb41 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Thu, 29 May 2025 08:32:44 -0700 Subject: [PATCH 2/5] Changed pooled_embeds to use projection instead of slice --- .../pipeline_stable_diffusion_xl_t5.py | 20 +++++++++++++------ 1 file 
changed, 14 insertions(+), 6 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index c22ac5b38bc2..7434c90bff38 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -58,12 +58,12 @@ class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline): _expected_modules = [ "vae", "unet", "scheduler", "tokenizer", "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", + "t5_encoder", "t5_projection", "t5_pooled_projection", ] _optional_components = [ "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", + "t5_encoder", "t5_projection", "t5_pooled_projection", ] def __init__( @@ -74,6 +74,7 @@ def __init__( tokenizer: CLIPTokenizer, t5_encoder=None, t5_projection=None, + t5_pooled_projection=None, image_encoder: CLIPVisionModelWithProjection = None, feature_extractor: CLIPImageProcessor = None, force_zeros_for_empty_prompt: bool = True, @@ -93,6 +94,12 @@ def __init__( else: self.t5_projection = t5_projection self.t5_projection.to(dtype=unet.dtype) + # ----- build T5 4096 => 1280 dim projection ----- + if t5_pooled_projection is None: + self.t5_pooled_projection = LinearWithDtype(4096, 1280) # trainable + else: + self.t5_pooled_projection = t5_pooled_projection + self.t5_pooled_projection.to(dtype=unet.dtype) print("dtype of Linear is ",self.t5_projection.dtype) @@ -103,6 +110,7 @@ def __init__( tokenizer=tokenizer, t5_encoder=self.t5_encoder, t5_projection=self.t5_projection, + t5_pooled_projection=self.t5_pooled_projection, image_encoder=image_encoder, feature_extractor=feature_extractor, ) @@ -157,9 +165,9 @@ def _tok(text: str): # ---------- positive stream ------------------------------------- ids, mask = _tok(prompt) - h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] - tok_pos = self.t5_projection(h_pos) # [b, T, 2048] - pool_pos = tok_pos.mean(dim=1)[:, :1280] # [b, 1280] + h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] + tok_pos = self.t5_projection(h_pos) # [b, T, 2048] + pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1)) # [b, 1280] # expand for multiple images per prompt tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) @@ -171,7 +179,7 @@ def _tok(text: str): ids_n, mask_n = _tok(neg_text) h_neg = self.t5_encoder(ids_n, attention_mask=mask_n).last_hidden_state tok_neg = self.t5_projection(h_neg) - pool_neg = tok_neg.mean(dim=1)[:, :1280] + pool_neg = self.t5_pooled_projection(h_neg.mean(dim=1)) tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0) From 685ad1bc0f4c382776823c001374c686edb58ea2 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Mon, 9 Jun 2025 11:23:37 -0700 Subject: [PATCH 3/5] "make style" tweaks --- .../pipeline_stable_diffusion_xl_t5.py | 84 ++++++++++--------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index 7434c90bff38..4cde556710f3 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -12,41 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Note: At this time, the intent is to use the T5 encoder mentioned -# below, with zero changes.
-# Therefore, the model deliberately does not store the T5 encoder model bytes, -# (Since they are not unique!) -# but instead takes advantage of huggingface hub cache loading - -T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly" - - -# Caller is expected to load this, or equivalent, as model name for now -# eg: pipe = StableDiffusionXL_T5Pipeline(SDXL_NAME) -SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0" - +from typing import Optional -from diffusers import StableDiffusionXLPipeline, DiffusionPipeline -from transformers import T5Tokenizer, T5EncoderModel +import torch.nn as nn from transformers import ( CLIPImageProcessor, - CLIPTextModel, - CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection, + T5EncoderModel, ) -from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from diffusers import DiffusionPipeline, StableDiffusionXLPipeline +from diffusers.image_processor import VaeImageProcessor +from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.schedulers import KarrasDiffusionSchedulers -from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from typing import Optional +# Note: At this time, the intent is to use the T5 encoder mentioned +# below, with zero changes. +# Therefore, the model deliberately does not store the T5 encoder model bytes, +# (Since they are not unique!) +# but instead takes advantage of huggingface hub cache loading -import torch.nn as nn, torch, types +T5_NAME = "mcmonkey/google_t5-v1_1-xxl_encoderonly" + +# Caller is expected to load this, or equivalent, as model name for now +# eg: pipe = StableDiffusionXL_T5Pipeline(SDXL_NAME) +SDXL_NAME = "stabilityai/stable-diffusion-xl-base-1.0" -import torch.nn as nn class LinearWithDtype(nn.Linear): @property @@ -56,14 +50,23 @@ def dtype(self): class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline): _expected_modules = [ - "vae", "unet", "scheduler", "tokenizer", - "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", "t5_pooled_projection", + "vae", + "unet", + "scheduler", + "tokenizer", + "image_encoder", + "feature_extractor", + "t5_encoder", + "t5_projection", + "t5_pooled_projection", ] _optional_components = [ - "image_encoder", "feature_extractor", - "t5_encoder", "t5_projection", "t5_pooled_projection", + "image_encoder", + "feature_extractor", + "t5_encoder", + "t5_projection", + "t5_pooled_projection", ] def __init__( @@ -83,25 +86,24 @@ def __init__( DiffusionPipeline.__init__(self) if t5_encoder is None: - self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, - torch_dtype=unet.dtype) + self.t5_encoder = T5EncoderModel.from_pretrained(T5_NAME, torch_dtype=unet.dtype) else: - self.t5_encoder = t5_encoder + self.t5_encoder = t5_encoder # ----- build T5 4096 => 2048 dim projection ----- if t5_projection is None: - self.t5_projection = LinearWithDtype(4096, 2048) # trainable + self.t5_projection = LinearWithDtype(4096, 2048) # trainable else: - self.t5_projection = t5_projection + self.t5_projection = t5_projection self.t5_projection.to(dtype=unet.dtype) # ----- build T5 4096 => 1280 dim projection ----- if t5_pooled_projection is None: - self.t5_pooled_projection = LinearWithDtype(4096, 1280) # trainable + self.t5_pooled_projection = LinearWithDtype(4096, 1280) # trainable else: - self.t5_pooled_projection = t5_pooled_projection + self.t5_pooled_projection = t5_pooled_projection self.t5_pooled_projection.to(dtype=unet.dtype) - print("dtype of Linear is ",self.t5_projection.dtype) +
print("dtype of Linear is ", self.t5_projection.dtype) self.register_modules( vae=vae, @@ -165,13 +167,13 @@ def _tok(text: str): # ---------- positive stream ------------------------------------- ids, mask = _tok(prompt) - h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] - tok_pos = self.t5_projection(h_pos) # [b, T, 2048] - pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1)) # [b, 1280] + h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state # [b, T, 4096] + tok_pos = self.t5_projection(h_pos) # [b, T, 2048] + pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1)) # [b, 1280] # expand for multiple images per prompt - tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) - pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) + tok_pos = tok_pos.repeat_interleave(num_images_per_prompt, 0) + pool_pos = pool_pos.repeat_interleave(num_images_per_prompt, 0) # ---------- negative / CFG stream -------------------------------- if do_classifier_free_guidance: @@ -181,7 +183,7 @@ def _tok(text: str): tok_neg = self.t5_projection(h_neg) pool_neg = self.t5_pooled_projection(h_neg.mean(dim=1)) - tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) + tok_neg = tok_neg.repeat_interleave(num_images_per_prompt, 0) pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0) else: tok_neg = pool_neg = None From 53c20386990401da533ad54239f9b77a008c2f68 Mon Sep 17 00:00:00 2001 From: ppbrown Date: Mon, 9 Jun 2025 11:31:44 -0700 Subject: [PATCH 4/5] Added comments to top of code --- examples/community/pipeline_stable_diffusion_xl_t5.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index 4cde556710f3..1c3f84794b8c 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -12,6 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +########################################################################### +# This pipeline attempts to use a model that has SDXL vae, T5 text encoder, +# and SDXL unet. +# At the present time, there are no pretrained models that give pleasing +# output. So as yet, (2025/06/10) this pipeline is somewhat of a tech +# demo proving that the pieces can at least be put together. +# Hopefully, it will encourage someone with the hardware available to +# throw enough resources into training one up. + from typing import Optional From 3b8f13b0927b986e6d3f6066616eb6751546ac47 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 9 Jun 2025 19:50:45 +0000 Subject: [PATCH 5/5] Apply style fixes --- examples/community/pipeline_stable_diffusion_xl_t5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_xl_t5.py b/examples/community/pipeline_stable_diffusion_xl_t5.py index 1c3f84794b8c..7659bd5bc832 100644 --- a/examples/community/pipeline_stable_diffusion_xl_t5.py +++ b/examples/community/pipeline_stable_diffusion_xl_t5.py @@ -13,8 +13,8 @@ # limitations under the License. ########################################################################### -# This pipeline attempts to use a model that has SDXL vae, T5 text encoder, -# and SDXL unet. +# This pipeline attempts to use a model that has SDXL vae, T5 text encoder, +# and SDXL unet. 
# At the present time, there are no pretrained models that give pleasing # output. So as yet, (2025/06/10) this pipeline is somewhat of a tech # demo proving that the pieces can at least be put together.
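Usage sketch (not part of the patch series above). This is a minimal example of driving the pipeline, assuming the standard diffusers custom-pipeline loading path and the checkpoint named in the first commit message (opendiffusionai/stablediffusionxl_t5); the prompt and generation settings are illustrative only:

    import torch
    from diffusers import DiffusionPipeline

    # Load the checkpoint together with the community pipeline class added by
    # this patch series. "custom_pipeline" may name the community pipeline
    # file or point at a local copy of it.
    pipe = DiffusionPipeline.from_pretrained(
        "opendiffusionai/stablediffusionxl_t5",
        custom_pipeline="pipeline_stable_diffusion_xl_t5",
        torch_dtype=torch.float16,
    )
    pipe.to("cuda")

    # encode_prompt() returns (tok_pos, tok_neg, pool_pos, pool_neg): the
    # positive/negative T5 token embeddings projected to 2048 dims and the
    # pooled embeddings projected to 1280 dims, in the order SDXL's
    # __call__ expects, so the inherited text-to-image call works unchanged.
    image = pipe(
        "a photograph of an astronaut riding a horse",
        num_inference_steps=30,
        guidance_scale=7.0,
    ).images[0]
    image.save("astronaut.png")

Note that because the T5 encoder weights are fetched from the hub cache (see T5_NAME at the top of the file) and the two Linear projections are freshly initialized when not supplied, output quality depends entirely on whether the loaded checkpoint provides trained t5_projection and t5_pooled_projection weights.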