@@ -270,6 +270,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
270270
271271// 64-bit compatibility
272272
273+ // vaddvq_s16
274+ // vpaddq_s16
275+ // vaddvq_s32
276+ // vaddvq_f32
277+ // vmaxvq_f32
278+ // vcvtnq_s32_f32
279+
273280inline static int32_t vaddvq_s16 (int16x8_t v ) {
274281 return
275282 (int32_t )vgetq_lane_s16 (v , 0 ) + (int32_t )vgetq_lane_s16 (v , 1 ) +
@@ -309,6 +316,82 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
309316 return res ;
310317}
311318
319+ // vld1q_s16_x2
320+ // vld1q_u8_x2
321+ // vld1q_u8_x4
322+ // vld1q_s8_x2
323+ // vld1q_s8_x4
324+ // TODO: double-check these work correctly
325+
326+ struct int16x8x2_t {
327+ int16x8_t val [2 ];
328+ };
329+
330+ inline static int16x8x2_t vld1q_s16_x2 (const int16_t * ptr ) {
331+ int16x8x2_t res ;
332+
333+ res .val [0 ] = vld1q_s16 (ptr + 0 );
334+ res .val [1 ] = vld1q_s16 (ptr + 8 );
335+
336+ return res ;
337+ }
338+
339+ struct uint8x16x2_t {
340+ uint8x16_t val [2 ];
341+ };
342+
343+ inline static uint8x16x2_t vld1q_u8_x2 (const uint8_t * ptr ) {
344+ uint8x16x2_t res ;
345+
346+ res .val [0 ] = vld1q_u8 (ptr + 0 );
347+ res .val [1 ] = vld1q_u8 (ptr + 16 );
348+
349+ return res ;
350+ }
351+
352+ struct uint8x16x4_t {
353+ uint8x16_t val [4 ];
354+ };
355+
356+ inline static uint8x16x4_t vld1q_u8_x4 (const uint8_t * ptr ) {
357+ uint8x16x4_t res ;
358+
359+ res .val [0 ] = vld1q_u8 (ptr + 0 );
360+ res .val [1 ] = vld1q_u8 (ptr + 16 );
361+ res .val [2 ] = vld1q_u8 (ptr + 32 );
362+ res .val [3 ] = vld1q_u8 (ptr + 48 );
363+
364+ return res ;
365+ }
366+
367+ struct int8x16x2_t {
368+ int8x16_t val [2 ];
369+ };
370+
371+ inline static int8x16x2_t vld1q_s8_x2 (const int8_t * ptr ) {
372+ int8x16x2_t res ;
373+
374+ res .val [0 ] = vld1q_s8 (ptr + 0 );
375+ res .val [1 ] = vld1q_s8 (ptr + 16 );
376+
377+ return res ;
378+ }
379+
380+ struct int8x16x4_t {
381+ int8x16_t val [4 ];
382+ };
383+
384+ inline static int8x16x4_t vld1q_s8_x4 (const int8_t * ptr ) {
385+ int8x16x4_t res ;
386+
387+ res .val [0 ] = vld1q_s8 (ptr + 0 );
388+ res .val [1 ] = vld1q_s8 (ptr + 16 );
389+ res .val [2 ] = vld1q_s8 (ptr + 32 );
390+ res .val [3 ] = vld1q_s8 (ptr + 48 );
391+
392+ return res ;
393+ }
394+
312395#endif
313396#endif
314397
0 commit comments