|
@@ -20,7 +20,7 @@
 from typing import List, Optional, Tuple, Union

 import torch
-from transformers import AutoModel, AutoTokenizer
+from transformers import PreTrainedModel, PreTrainedTokenizerBase

 from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKL
@@ -143,13 +143,13 @@ class LuminaText2ImgPipeline(DiffusionPipeline): |
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`AutoModel`]):
+        text_encoder ([`PreTrainedModel`]):
             Frozen text-encoder. Lumina-T2I uses
             [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel), specifically the
             [t5-v1_1-xxl](https://huggingface.co/Alpha-VLLM/tree/main/t5-v1_1-xxl) variant.
-        tokenizer (`AutoModel`):
+        tokenizer (`AutoTokenizer`):
             Tokenizer of class
-            [AutoModel](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel).
+            [AutoTokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.AutoModel).
         transformer ([`Transformer2DModel`]):
             A text conditioned `Transformer2DModel` to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
@@ -180,8 +180,8 @@ def __init__( |
         transformer: LuminaNextDiT2DModel,
         scheduler: FlowMatchEulerDiscreteScheduler,
         vae: AutoencoderKL,
-        text_encoder: AutoModel,
-        tokenizer: AutoTokenizer,
+        text_encoder: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerBase,
     ):
         super().__init__()
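
For reference, a minimal usage sketch showing that the looser annotations stay compatible with the concrete classes `from_pretrained` resolves; the checkpoint id `Alpha-VLLM/Lumina-Next-SFT-diffusers`, the prompt, and the CUDA device are assumptions for illustration, not part of this diff:

```python
import torch
from transformers import PreTrainedModel, PreTrainedTokenizerBase

from diffusers import LuminaText2ImgPipeline

# Assumed Hub checkpoint; needs a CUDA device with enough memory.
pipe = LuminaText2ImgPipeline.from_pretrained(
    "Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
).to("cuda")

# The components loaded from the checkpoint are concrete subclasses of the new,
# more general base-class annotations, so pipeline behavior is unchanged.
assert isinstance(pipe.text_encoder, PreTrainedModel)
assert isinstance(pipe.tokenizer, PreTrainedTokenizerBase)

image = pipe(prompt="a photo of an astronaut riding a horse on the moon").images[0]
image.save("lumina_sample.png")
```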
|
|