@@ -574,8 +574,8 @@ def guidance_scale(self):
         return self._guidance_scale

     @property
-    def attention_kwargs(self):
-        return self._attention_kwargs
+    def cross_attention_kwargs(self):
+        return self._cross_attention_kwargs

     @property
     def do_classifier_free_guidance(self):
@@ -613,7 +613,7 @@ def __call__(
         return_dict: bool = True,
         clean_caption: bool = True,
         use_resolution_binning: bool = True,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 300,
@@ -686,7 +686,7 @@ def __call__(
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
-            attention_kwargs: TODO
+            cross_attention_kwargs: TODO
             clean_caption (`bool`, *optional*, defaults to `True`):
                 Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
                 be installed. If the dependencies are not installed, the embeddings will be created from the raw
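
Note on the docstring: the renamed `cross_attention_kwargs` entry above is still a TODO in this change. Purely as a reference, other diffusers pipelines (e.g. `StableDiffusionPipeline`) describe the parameter along these lines, with wording that varies slightly between versions:

            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
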
@@ -747,7 +747,7 @@ def __call__(
         )

         self._guidance_scale = guidance_scale
-        self._attention_kwargs = attention_kwargs
+        self._cross_attention_kwargs = cross_attention_kwargs
         self._interrupt = False

         # 2. Default height and width to transformer
@@ -759,7 +759,9 @@ def __call__(
             batch_size = prompt_embeds.shape[0]

         device = self._execution_device
-        lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+        )

         # 3. Encode input prompt
         (
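
The `scale` entry read here is used as the LoRA scale for the prompt-encoding step that follows (the value lands in `lora_scale`). A minimal caller-side sketch, assuming a checkpoint id and prompt of your own, and that the pipeline returns an object with an `images` field as diffusers pipelines generally do:

# Hypothetical usage sketch; the checkpoint id and prompt are placeholders.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("<checkpoint-id>", torch_dtype=torch.float16).to("cuda")
image = pipe(
    prompt="a photo of a red panda",
    cross_attention_kwargs={"scale": 0.7},  # the "scale" key is what the line above reads into lora_scale
).images[0]
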
@@ -829,7 +831,7 @@ def __call__(
                     encoder_attention_mask=prompt_attention_mask,
                     timestep=timestep,
                     return_dict=False,
-                    attention_kwargs=self.attention_kwargs,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
                 )[0]
                 noise_pred = noise_pred.float()