/**
* @[Description] : Transpose a square iTrSize x iTrSize block of 16-bit values.
*                  The block is processed in 8x8 tiles with SSE2 unpack
*                  instructions; rows/columns past the last full tile (when
*                  iTrSize is not a multiple of 8) are copied by a scalar loop.
* @ [Param] : pResi   - source block, row-major with a row stride of iTrSize
*                       elements. No alignment requirement.
* @ [Param] : pBuf    - destination for the transposed block, same layout as
*                       pResi. No alignment requirement. In-place use
*                       (pBuf == pResi) is not supported: a tile's destination
*                       is written before its mirror tile has been read.
* @ [Param] : iTrSize - width and height of the block in elements; values <= 0
*                       make the call a no-op.
*/
void Transpose(short* pResi, short *pBuf, int iTrSize)
{
    int iBlkCnt;            /* number of full 8x8 tiles per row / column       */
    int iSimdEnd;           /* first row/column index not covered by a tile    */
    int iBlkRow, iBlkCol;   /* tile coordinates inside the source block        */
    int iRow, iCol;         /* element coordinates for the scalar tail         */

    if (iTrSize <= 0)
    {
        return;
    }

    iBlkCnt  = iTrSize >> 3;
    iSimdEnd = iBlkCnt << 3;

    for (iBlkRow = 0; iBlkRow < iBlkCnt; iBlkRow++)
    {
        for (iBlkCol = 0; iBlkCol < iBlkCnt; iBlkCol++)
        {
            /* Source tile (iBlkRow, iBlkCol) lands at tile (iBlkCol, iBlkRow). */
            const short *pSrc = pResi + (iBlkCol * 8) + (iBlkRow * iTrSize * 8);
            short *pDst = pBuf + (iBlkRow * 8) + (iBlkCol * iTrSize * 8);
            __m128i Line0, Line1, Line2, Line3, Line4, Line5, Line6, Line7;
            __m128i L0, L1, L2, L3, H0, H1, H2, H3;

            /* Load the eight rows of the tile. */
            Line0 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line1 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line2 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line3 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line4 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line5 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line6 = _mm_loadu_si128((const __m128i *)(pSrc)); pSrc += iTrSize;
            Line7 = _mm_loadu_si128((const __m128i *)(pSrc));

            /* Stage 1: interleave 16-bit elements of adjacent row pairs.
               Lx holds columns 0..3, Hx holds columns 4..7 of each pair. */
            L0 = _mm_unpacklo_epi16(Line0, Line1);
            L1 = _mm_unpacklo_epi16(Line2, Line3);
            L2 = _mm_unpacklo_epi16(Line4, Line5);
            L3 = _mm_unpacklo_epi16(Line6, Line7);
            H0 = _mm_unpackhi_epi16(Line0, Line1);
            H1 = _mm_unpackhi_epi16(Line2, Line3);
            H2 = _mm_unpackhi_epi16(Line4, Line5);
            H3 = _mm_unpackhi_epi16(Line6, Line7);

            /* Stage 2: interleave 32-bit pairs. Line0..3 hold two columns each
               for rows 0..3, Line4..7 the same columns for rows 4..7. */
            Line0 = _mm_unpacklo_epi32(L0, L1);
            Line1 = _mm_unpackhi_epi32(L0, L1);
            Line2 = _mm_unpacklo_epi32(H0, H1);
            Line3 = _mm_unpackhi_epi32(H0, H1);
            Line4 = _mm_unpacklo_epi32(L2, L3);
            Line5 = _mm_unpackhi_epi32(L2, L3);
            Line6 = _mm_unpacklo_epi32(H2, H3);
            Line7 = _mm_unpackhi_epi32(H2, H3);

            /* Stage 3: join the 64-bit halves so each register is one full
               source column. Column order: L0,H0,L2,H2,L1,H1,L3,H3 = 0..7. */
            L0 = _mm_unpacklo_epi64(Line0, Line4);
            H0 = _mm_unpackhi_epi64(Line0, Line4);
            L2 = _mm_unpacklo_epi64(Line1, Line5);
            H2 = _mm_unpackhi_epi64(Line1, Line5);
            L1 = _mm_unpacklo_epi64(Line2, Line6);
            H1 = _mm_unpackhi_epi64(Line2, Line6);
            L3 = _mm_unpacklo_epi64(Line3, Line7);
            H3 = _mm_unpackhi_epi64(Line3, Line7);

            /* Unaligned stores: pBuf carries no alignment guarantee, matching
               the unaligned loads used on the source side. */
            _mm_storeu_si128((__m128i *)(pDst), L0); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), H0); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), L2); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), H2); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), L1); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), H1); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), L3); pDst += iTrSize;
            _mm_storeu_si128((__m128i *)(pDst), H3);
        }
    }

    /* Scalar tail: elements outside the tiled iSimdEnd x iSimdEnd area.
       Empty when iTrSize is a multiple of 8. */
    if (iSimdEnd < iTrSize)
    {
        for (iRow = 0; iRow < iTrSize; iRow++)
        {
            /* Rows inside the tiled area only need the columns past it. */
            iCol = (iRow < iSimdEnd) ? iSimdEnd : 0;
            for (; iCol < iTrSize; iCol++)
            {
                pBuf[iCol * iTrSize + iRow] = pResi[iRow * iTrSize + iCol];
            }
        }
    }
}
'SIMD > Intel Intrinsic' 카테고리의 다른 글
_mm256_hadd_epi16 (), _mm256_hadd_epi32() (0) | 2015.01.23 |
---|---|
_mm_set_epi16 , (0) | 2015.01.22 |
_mm_srai_epi16() , _mm_srai_epi32() : Shift (0) | 2015.01.22 |
__m128i _mm_blend_epi32 () : Mix data (0) | 2015.01.21 |
void _mm_prefetch ( ) (0) | 2015.01.14 |