본문 바로가기
SIMD/Intel Intrinsic

SIMD: Transpose

by 김뿡한 2015. 1. 26.

/**
 * @brief Transpose one 8x8 tile of 16-bit values with SSE2.
 *
 * Reads eight rows of eight shorts from pSrc and writes the transposed tile
 * to pDst. Both pointers step by iStride elements per row. Unaligned
 * loads/stores are used, so neither pointer needs 16-byte alignment.
 *
 * @param pSrc    top-left element of the source tile
 * @param pDst    top-left element of the destination tile
 * @param iStride row pitch (in elements) shared by source and destination
 */
static void Transpose8x8Block(const short *pSrc, short *pDst, int iStride)
{
  __m128i Line0, Line1, Line2, Line3, Line4, Line5, Line6, Line7;
  __m128i L0, L1, L2, L3, H0, H1, H2, H3;

  Line0 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line1 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line2 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line3 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line4 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line5 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line6 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iStride;
  Line7 = _mm_loadu_si128((const __m128i *)(pSrc));

  /* Stage 1: interleave 16-bit lanes of adjacent row pairs. */
  L0 = _mm_unpacklo_epi16(Line0, Line1);
  L1 = _mm_unpacklo_epi16(Line2, Line3);
  L2 = _mm_unpacklo_epi16(Line4, Line5);
  L3 = _mm_unpacklo_epi16(Line6, Line7);

  H0 = _mm_unpackhi_epi16(Line0, Line1);
  H1 = _mm_unpackhi_epi16(Line2, Line3);
  H2 = _mm_unpackhi_epi16(Line4, Line5);
  H3 = _mm_unpackhi_epi16(Line6, Line7);

  /* Stage 2: interleave 32-bit pairs; each register now holds two half-columns. */
  Line0 = _mm_unpacklo_epi32(L0, L1);
  Line1 = _mm_unpackhi_epi32(L0, L1);
  Line2 = _mm_unpacklo_epi32(H0, H1);
  Line3 = _mm_unpackhi_epi32(H0, H1);
  Line4 = _mm_unpacklo_epi32(L2, L3);
  Line5 = _mm_unpackhi_epi32(L2, L3);
  Line6 = _mm_unpacklo_epi32(H2, H3);
  Line7 = _mm_unpackhi_epi32(H2, H3);

  /* Stage 3: join the 64-bit halves from rows 0-3 and rows 4-7 into full columns. */
  L0 = _mm_unpacklo_epi64(Line0, Line4);   /* column 0 */
  H0 = _mm_unpackhi_epi64(Line0, Line4);   /* column 1 */
  L2 = _mm_unpacklo_epi64(Line1, Line5);   /* column 2 */
  H2 = _mm_unpackhi_epi64(Line1, Line5);   /* column 3 */
  L1 = _mm_unpacklo_epi64(Line2, Line6);   /* column 4 */
  H1 = _mm_unpackhi_epi64(Line2, Line6);   /* column 5 */
  L3 = _mm_unpacklo_epi64(Line3, Line7);   /* column 6 */
  H3 = _mm_unpackhi_epi64(Line3, Line7);   /* column 7 */

  /* Unaligned stores: the destination is not required to be 16-byte aligned. */
  _mm_storeu_si128((__m128i *)(pDst), L0); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), H0); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), L2); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), H2); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), L1); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), H1); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), L3); pDst += iStride;
  _mm_storeu_si128((__m128i *)(pDst), H3);
}

/**
 * @brief Transpose a square iTrSize x iTrSize matrix of 16-bit values.
 *
 * After the call, pBuf[c * iTrSize + r] == pResi[r * iTrSize + c] for every
 * row r and column c. The largest multiple-of-8 square is processed in 8x8
 * SSE2 tiles; any remaining rows/columns (iTrSize not a multiple of 8) are
 * handled by a scalar loop. A non-positive iTrSize is a no-op.
 *
 * The operation is out-of-place: pResi and pBuf must not overlap.
 * Neither buffer needs any particular alignment.
 *
 * @param pResi   source matrix, row-major, iTrSize * iTrSize elements
 * @param pBuf    destination buffer for the transposed matrix, same size
 * @param iTrSize width and height of the matrix, in elements
 */
void Transpose(short* pResi, short *pBuf, int iTrSize)
{
  int iSimdSize;
  int i, j;

  if (iTrSize <= 0)
  {
    return;
  }

  /* Side length of the region that can be covered by whole 8x8 tiles. */
  iSimdSize = iTrSize & ~7;

  for (i = 0; i < iSimdSize; i += 8)
  {
    for (j = 0; j < iSimdSize; j += 8)
    {
      /* Source tile at (row i, col j) lands at (row j, col i) in the output. */
      Transpose8x8Block(pResi + i * iTrSize + j, pBuf + j * iTrSize + i, iTrSize);
    }
  }

  /* Scalar pass for whatever the tiles did not cover. */
  if (iSimdSize != iTrSize)
  {
    for (i = 0; i < iTrSize; i++)
    {
      /* Rows inside the tiled area only need their tail columns;
         rows below it need every column. */
      j = (i < iSimdSize) ? iSimdSize : 0;
      for (; j < iTrSize; j++)
      {
        pBuf[j * iTrSize + i] = pResi[i * iTrSize + j];
      }
    }
  }
}

'SIMD > Intel Intrinsic' 카테고리의 다른 글

_mm256_hadd_epi16 (), _mm256_hadd_epi32()  (0) 2015.01.23
_mm_set_epi16 ,  (0) 2015.01.22
_mm_srai_epi16() , _mm_srai_epi32() : Shift  (0) 2015.01.22
__m128i _mm_blend_epi32 () : Mix data  (0) 2015.01.21
void _mm_prefetch ( )  (0) 2015.01.14