@Spongcer
2015-03-13T04:32:08.000000Z
字数 3595
阅读 1864
Code
//在一个DWORD中找到第一个(或最后一个)为1的位#ifdef USE_NEW_BITMAPstatic INLINE Count32_t bm_find_one_in_dword(Count32_t dwWord, Bool8_t bDir){Count32_t dwPos = 0, dwShift = 0;if (0 == dwWord) return BM_NOT_FOUND;if (bDir){dwShift = ((0 == (dwWord & 0x0000FFFF)) << 4);dwPos += dwShift;dwWord >>= dwShift;dwShift = ((0 == (dwWord & 0x00FF)) << 3);dwPos += dwShift;dwWord >>= dwShift;dwShift = ((0 == (dwWord & 0x0F)) << 2);dwPos += dwShift;dwWord >>= dwShift;dwShift = ((0 == (dwWord & 0x03)) << 1);dwPos += dwShift;dwWord >>= dwShift;dwShift = (0 == (dwWord & 0x01));dwPos += dwShift;dwWord >>= dwShift;}else{dwShift = ((0 == (dwWord & 0xFFFF0000)) << 4);dwPos += dwShift;dwWord <<= dwShift;dwShift = ((0 == (dwWord & 0xFF000000)) << 3);dwPos += dwShift;dwWord <<= dwShift;dwShift = ((0 == (dwWord & 0xF0000000)) << 2);dwPos += dwShift;dwWord <<= dwShift;dwShift = ((0 == (dwWord & 0xC0000000)) << 1);dwPos += dwShift;dwWord <<= dwShift;dwShift = (0 == (dwWord & 0x80000000));dwPos += dwShift;dwWord <<= dwShift;dwPos += ((dwWord & 0x80000000) != 0);dwPos = 32 - dwPos;}return dwPos;}#elsestatic Count32_t bm_find_one_in_dword (Count32_t dwWord, Bool8_t bDir ){Count32_t i=0;Count32_t dwWord2;if (0 == dwWord) return BM_NOT_FOUND;if (bDir) {while(1) {dwWord2 = ((dwWord >> 1) << 1);if (dwWord == dwWord2) {dwWord >>= 1;i++;continue;}break;}} else {while(dwWord>1) {dwWord >>= 1;i++;}}return i;}#endif
/*
 * VectorBATZeroNullBitmap(pvbVector)
 *
 * Set all VECBAT_BITMAP_SIZE words of (pvbVector)->dwNullBitmap to zero.
 *
 * With USE_SIMD_OPT the bulk of the array is cleared with four unaligned
 * 128-bit stores per iteration (16 words at a time); the words left over
 * after the last full group are cleared one by one. Without USE_SIMD_OPT
 * the macro is a single MEMSET call.
 *
 * The SIMD variant is wrapped in do { } while (0) so that, like the MEMSET
 * variant, it behaves as one statement and must be followed by ';'.
 * pvbVector is evaluated exactly once, before any other macro-local name is
 * declared, so an argument such as &vec[i] cannot be captured by the
 * macro's own variables.
 *
 * NOTE(review): assumes Count32P_t is a pointer to Count32_t and that
 * Count32_t is 4 bytes wide (16 words == 4 x 128 bits) -- confirm.
 */
#ifdef USE_SIMD_OPT
#define VectorBATZeroNullBitmap(pvbVector) \
do { \
    Count32P_t vbz_pdwBitmap_ = (pvbVector)->dwNullBitmap; \
    __m128i *vbz_pmBitmap_ = (__m128i *)vbz_pdwBitmap_; \
    const __m128i vbz_mZero_ = _mm_setzero_si128(); \
    const Count32_t vbz_dwSize_ = VECBAT_BITMAP_SIZE; \
    const Count32_t vbz_dwBlock_ = vbz_dwSize_ >> 4; \
    Count32_t vbz_i_ = 0, vbz_j_ = 0; \
    \
    for (vbz_j_ = 0; vbz_j_ < vbz_dwBlock_; vbz_j_++) \
    { \
        _mm_storeu_si128(vbz_pmBitmap_, vbz_mZero_); \
        _mm_storeu_si128(vbz_pmBitmap_ + 1, vbz_mZero_); \
        _mm_storeu_si128(vbz_pmBitmap_ + 2, vbz_mZero_); \
        _mm_storeu_si128(vbz_pmBitmap_ + 3, vbz_mZero_); \
        vbz_pmBitmap_ += 4; \
    } \
    \
    /* Tail: words not covered by a full 16-word group. */ \
    for (vbz_i_ = vbz_dwBlock_ << 4; vbz_i_ < vbz_dwSize_; vbz_i_++) \
    { \
        vbz_pdwBitmap_[vbz_i_] = 0; \
    } \
} while (0)
#else
#define VectorBATZeroNullBitmap(pvbVector) \
    MEMSET((pvbVector)->dwNullBitmap, 0, VECBAT_BITMAP_SIZE * sizeof(Count32_t))
#endif
//采用8路SIMD进行浮点数求和运算static INLINE DatumX_sum_vec_Double_t(X_AGG_VEC_ARGS){Count_t i = 0;Double_t dtdst = 0;if (!bExistNull){#if (defined(OSC_64BIT_ARCH) && defined(USE_SIMD_OPT))__m128d __mSum0 = _mm_setzero_pd();__m128d __mSum1 = _mm_setzero_pd();__m128d __mSum2 = _mm_setzero_pd();__m128d __mSum3 = _mm_setzero_pd();__m128d __mSum4 = _mm_setzero_pd();__m128d __mSum5 = _mm_setzero_pd();__m128d __mSum6 = _mm_setzero_pd();__m128d __mSum7 = _mm_setzero_pd();__m128d __mLoad0, __mLoad1, __mLoad2, __mLoad3;__m128d __mLoad4, __mLoad5, __mLoad6, __mLoad7;Double_t *pDatum = (Double_t *)dtlft;Count_t dwBlock = (dwCount >> 4);Count_t j = 0;for (j = 0; j < dwBlock; j++){__mLoad0 = _mm_loadu_pd(pDatum + i);__mLoad1 = _mm_loadu_pd(pDatum + i + 2);__mLoad2 = _mm_loadu_pd(pDatum + i + 4);__mLoad3 = _mm_loadu_pd(pDatum + i + 6);__mLoad4 = _mm_loadu_pd(pDatum + i + 8);__mLoad5 = _mm_loadu_pd(pDatum + i + 10);__mLoad6 = _mm_loadu_pd(pDatum + i + 12);__mLoad7 = _mm_loadu_pd(pDatum + i + 14);__mSum0 = _mm_add_pd(__mSum0, __mLoad0);__mSum1 = _mm_add_pd(__mSum1, __mLoad1);__mSum2 = _mm_add_pd(__mSum2, __mLoad2);__mSum3 = _mm_add_pd(__mSum3, __mLoad3);__mSum4 = _mm_add_pd(__mSum4, __mLoad4);__mSum5 = _mm_add_pd(__mSum5, __mLoad5);__mSum6 = _mm_add_pd(__mSum6, __mLoad6);__mSum7 = _mm_add_pd(__mSum7, __mLoad7);i += 16;}__mSum0 = _mm_add_pd(__mSum0, __mSum1);__mSum2 = _mm_add_pd(__mSum2, __mSum3);__mSum4 = _mm_add_pd(__mSum4, __mSum5);__mSum6 = _mm_add_pd(__mSum6, __mSum7);__mSum0 = _mm_add_pd(__mSum0, __mSum2);__mSum4 = _mm_add_pd(__mSum4, __mSum6);__mSum0 = _mm_add_pd(__mSum0, __mSum4);pDatum = (Double_t *)(&__mSum0);dtdst = pDatum[0] + pDatum[1];#endiffor(; i < dwCount; i++){dtdst += (Double_t)DatumGetDouble_t(dtlft[i]);}}else{for(i = 0; i < dwCount; i++){if (BMTestZero_HY(dwNull, i)){dtdst += (Double_t)DatumGetDouble_t(dtlft[i]);}}}return Double_tGetDatum(dtdst);}