typedef short int int16_t; typedef unsigned char uint8_t; typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8))); typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8))); typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16))); typedef struct int8x8x4_t { int16x8_t val[4]; } int16x8x4_t; __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vqshrun_n_s16 (int16x8_t __a, const int __b) { return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 1); } __extension__ static __inline void __attribute__ ((__always_inline__)) vst1_u8 (uint8_t * __a, uint8x8_t __b) { __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b); } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) vqaddq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 1); } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) vmulq_s16 (int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_neon_vmulv8hi (__a, __b, 1); } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) vdupq_n_s16 (int16_t __a) { return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); } __extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) vld4q_s16 (const int16_t * __a) { union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a); return __rv.__i; } typedef unsigned char JSAMPLE; typedef short JCOEF; typedef unsigned int JDIMENSION; typedef JSAMPLE *JSAMPROW; typedef JSAMPROW *JSAMPARRAY; typedef JCOEF *JCOEFPTR; typedef struct { void * dct_table; } jpeg_component_info; typedef struct jpeg_decompress_struct * j_decompress_ptr; typedef struct { int16x8x4_t half[2]; } int16_8x8_t; void jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { int16_8x8_t coef; coef.half[0] = vld4q_s16((int16_t*) coef_block); coef.half[1] = vld4q_s16(((int16_t*) coef_block) + 32); { int16x8x4_t dequant; coef.half[0].val[1] = vmulq_s16(coef.half[0].val[1], dequant.val[1]); dequant = vld4q_s16(((int16_t*) compptr->dct_table) + 32); coef.half[1].val[3] = vmulq_s16(coef.half[1].val[3], dequant.val[3]); int16x8_t vector_one_half = vdupq_n_s16(0x80 << 5); vst1_u8((uint8_t*)(output_buf[0] + output_col), vqshrun_n_s16(vqaddq_s16(coef.half[0].val[0], vector_one_half), 5)); vst1_u8((uint8_t*)(output_buf[1] + output_col), vqshrun_n_s16(vqaddq_s16(coef.half[0].val[1], vector_one_half), 5)); vst1_u8((uint8_t*)(output_buf[7] + output_col), vqshrun_n_s16(vqaddq_s16(coef.half[1].val[3], vector_one_half), 5)); } }