@@ -679,9 +679,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
679679}
680680
681681clip_image_u8 * make_clip_image_u8 () { return new clip_image_u8 (); }
682-
683682clip_image_f32 * make_clip_image_f32 () { return new clip_image_f32 (); }
684683
684+ void clip_image_u8_free (clip_image_u8 * img) { if (img->data ) { delete[] img->data ; } delete img; }
685+ void clip_image_f32_free (clip_image_f32 * img) { if (img->data ) { delete[] img->data ; } delete img; }
686+
685687static void build_clip_img_from_data (const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
686688 img->nx = nx;
687689 img->ny = ny;
@@ -726,39 +728,40 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
726728 // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
727729 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
728730
729- clip_image_u8 temp; // we will keep the input image data here temporarily
731+ clip_image_u8 * temp = make_clip_image_u8 () ; // we will keep the input image data here temporarily
730732 if (pad2square && img->nx != img->ny ) {
731733 int longer_side = std::max (img->nx , img->ny );
732- temp. nx = longer_side;
733- temp. ny = longer_side;
734- temp. size = 3 * longer_side * longer_side;
735- temp. data = new uint8_t [temp. size ]();
734+ temp-> nx = longer_side;
735+ temp-> ny = longer_side;
736+ temp-> size = 3 * longer_side * longer_side;
737+ temp-> data = new uint8_t [temp-> size ]();
736738 uint8_t bc[3 ] = {122 , 116 , 104 }; // background color in RGB from LLaVA
737739
738740 // fill with background color
739- for (size_t i = 0 ; i < temp. size ; i++) {
740- temp. data [i] = bc[i % 3 ];
741+ for (size_t i = 0 ; i < temp-> size ; i++) {
742+ temp-> data [i] = bc[i % 3 ];
741743 }
742744
743745 // copy from the input image
744746 for (int y = 0 ; y < img->ny ; y++) {
745747 for (int x = 0 ; x < img->nx ; x++) {
746748 const int i = 3 * (y * img->nx + x);
747- const int j = 3 * (y * temp. nx + x);
748- temp. data [j] = img->data [i];
749- temp. data [j+1 ] = img->data [i+1 ];
750- temp. data [j+2 ] = img->data [i+2 ];
749+ const int j = 3 * (y * temp-> nx + x);
750+ temp-> data [j] = img->data [i];
751+ temp-> data [j+1 ] = img->data [i+1 ];
752+ temp-> data [j+2 ] = img->data [i+2 ];
751753 }
752754 }
753755 } else {
754- temp.nx = img->nx ;
755- temp.ny = img->ny ;
756- temp.size = img->size ;
757- temp.data = img->data ;
756+ temp->nx = img->nx ;
757+ temp->ny = img->ny ;
758+ temp->size = img->size ;
759+ temp->data = new uint8_t [temp->size ]();
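760+ *temp->data = *img->data ; // FIXME: this assigns only the first byte; all temp->size bytes of img->data need to be copied (e.g. memcpy/std::copy), the rest of the buffer is still zero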
758761 }
759762
760- const int nx = temp. nx ;
761- const int ny = temp. ny ;
763+ const int nx = temp-> nx ;
764+ const int ny = temp-> ny ;
762765
763766 const int nx2 = ctx->vision_model .hparams .image_size ;
764767 const int ny2 = ctx->vision_model .hparams .image_size ;
@@ -797,10 +800,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
797800 const int j10 = 3 * (y1 * nx + x0) + c;
798801 const int j11 = 3 * (y1 * nx + x1) + c;
799802
800- const float v00 = temp. data [j00];
801- const float v01 = temp. data [j01];
802- const float v10 = temp. data [j10];
803- const float v11 = temp. data [j11];
803+ const float v00 = temp-> data [j00];
804+ const float v01 = temp-> data [j01];
805+ const float v10 = temp-> data [j10];
806+ const float v11 = temp-> data [j11];
804807
805808 const float v0 = v00 * (1 .0f - dx) + v01 * dx;
806809 const float v1 = v10 * (1 .0f - dx) + v11 * dx;
@@ -815,6 +818,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
815818 }
816819 }
817820 }
821+ clip_image_u8_free (temp);
818822
819823 return true ;
820824}
0 commit comments