@@ -3648,8 +3648,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
36483648int clip_n_output_tokens (const struct clip_ctx * ctx, struct clip_image_f32 * img) {
36493649 const auto & params = ctx->model .hparams ;
36503650
3651- // only for models using fixed size square images
3652- int n_patches_sq = (params.image_size / params.patch_size ) * (params.image_size / params.patch_size );
3651+ // for models with fixed size image, the input image is already pre-processed and resized to square
3652+ int patch_size = params.patch_size ;
3653+ int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
36533654
36543655 projector_type proj = ctx->proj_type ();
36553656
@@ -3663,27 +3664,27 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
36633664 case PROJECTOR_TYPE_LDPV2:
36643665 case PROJECTOR_TYPE_GLM_EDGE:
36653666 {
3666- n_patches_sq /= 4 ;
3667+ n_patches /= 4 ;
36673668 if (ctx->model .mm_glm_tok_boi ) {
3668- n_patches_sq += 2 ; // for BOI and EOI token embeddings
3669+ n_patches += 2 ; // for BOI and EOI token embeddings
36693670 }
36703671 } break ;
36713672 case PROJECTOR_TYPE_MINICPMV:
36723673 {
36733674 // Use actual config value if available, otherwise fall back to hardcoded values
36743675 if (params.minicpmv_query_num > 0 ) {
3675- n_patches_sq = params.minicpmv_query_num ;
3676+ n_patches = params.minicpmv_query_num ;
36763677 } else {
36773678 // Fallback to hardcoded values for legacy models
36783679 if (params.minicpmv_version == 2 ) {
3679- n_patches_sq = 96 ;
3680+ n_patches = 96 ;
36803681 } else if (params.minicpmv_version == 3 ) {
3681- n_patches_sq = 64 ;
3682+ n_patches = 64 ;
36823683 } else if (params.minicpmv_version == 4 ) {
3683- n_patches_sq = 64 ;
3684+ n_patches = 64 ;
36843685 } else if (params.minicpmv_version == 5 ) {
36853686 // MiniCPM-V 4.0
3686- n_patches_sq = 64 ;
3687+ n_patches = 64 ;
36873688 } else {
36883689 GGML_ABORT (" Unknown minicpmv version" );
36893690 }
@@ -3692,67 +3693,56 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
36923693 case PROJECTOR_TYPE_QWEN2VL:
36933694 case PROJECTOR_TYPE_QWEN25VL:
36943695 {
3695- // dynamic size
3696+ // dynamic size (2 conv, so double patch size)
36963697 int patch_size = params.patch_size * 2 ;
36973698 int x_patch = img->nx / patch_size + (int )(img->nx % patch_size > 0 );
36983699 int y_patch = img->ny / patch_size + (int )(img->ny % patch_size > 0 );
3699- n_patches_sq = x_patch * y_patch;
3700+ n_patches = x_patch * y_patch;
37003701 } break ;
37013702 case PROJECTOR_TYPE_GEMMA3:
3702- {
3703- int n_per_side = params.image_size / params.patch_size ;
3704- int n_per_side_2d_pool = n_per_side / params.proj_scale_factor ;
3705- n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
3706- } break ;
37073703 case PROJECTOR_TYPE_IDEFICS3:
37083704 case PROJECTOR_TYPE_INTERNVL:
3705+ case PROJECTOR_TYPE_LLAMA4:
3706+ case PROJECTOR_TYPE_LFM2:
37093707 {
37103708 // both W and H are divided by proj_scale_factor
3711- n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor );
3709+ int scale_factor = ctx->model .hparams .proj_scale_factor ;
3710+ n_patches /= (scale_factor * scale_factor);
37123711 } break ;
37133712 case PROJECTOR_TYPE_PIXTRAL:
37143713 {
37153714 // dynamic size
37163715 int n_merge = params.spatial_merge_size ;
3717- int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1 );
3718- int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1 );
3719- n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1 ; // + one [IMG_BREAK] per row, except the last row
3720- } break ;
3721- case PROJECTOR_TYPE_LLAMA4:
3722- {
3723- int scale_factor = ctx->model .hparams .proj_scale_factor ;
3724- n_patches_sq /= (scale_factor * scale_factor);
3716+ int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1 );
3717+ int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1 );
3718+ n_patches = n_patches_y * n_patches_x + n_patches_y - 1 ; // + one [IMG_BREAK] per row, except the last row
37253719 } break ;
37263720 case PROJECTOR_TYPE_VOXTRAL:
37273721 case PROJECTOR_TYPE_ULTRAVOX:
37283722 case PROJECTOR_TYPE_QWEN2A:
37293723 {
3730- n_patches_sq = img->nx ;
3724+ n_patches = img->nx ;
37313725
37323726 const int proj_stack_factor = ctx->model .hparams .proj_stack_factor ;
37333727 if (ctx->model .audio_has_stack_frames ()) {
37343728 GGML_ASSERT (proj_stack_factor > 0 );
3735- const int n_len = CLIP_ALIGN (n_patches_sq , proj_stack_factor);
3736- n_patches_sq = n_len / proj_stack_factor;
3729+ const int n_len = CLIP_ALIGN (n_patches , proj_stack_factor);
3730+ n_patches = n_len / proj_stack_factor;
37373731 }
37383732
37393733 // whisper downscales input token by half after conv1d
3740- n_patches_sq /= 2 ;
3734+ n_patches /= 2 ;
37413735
37423736 if (ctx->model .audio_has_avgpool ()) {
37433737 // divide by 2 because of nn.AvgPool1d(2, stride=2)
3744- n_patches_sq /= 2 ;
3738+ n_patches /= 2 ;
37453739 }
37463740 } break ;
3747- case PROJECTOR_TYPE_LFM2:
3748- {
3749- n_patches_sq = (img->nx / (params.patch_size * params.proj_scale_factor )) * (img->ny / (params.patch_size * params.proj_scale_factor ));
3750- } break ;
37513741 default :
37523742 GGML_ABORT (" unsupported projector type" );
37533743 }
37543744
3755- return n_patches_sq ;
3745+ return n_patches ;
37563746}
37573747
37583748static std::vector<std::vector<std::vector<float >>> get_1d_sincos_pos_embed_from_grid_new (int embed_dim, const std::vector<std::vector<float >> & pos) {
0 commit comments