Back Original

Dividing unsigned 8-bit numbers

Step 1: Updating reminder

Unlike SSE/AVX2 code it's easier to actually perform shift right to place i-th bit at position zero. Then isolating the least siginificant bit and merging it with quotient can be expressed as a single ternary operation.

reminder <<= 1;
t0         = divisor >> i;
reminder   = reminder | (t0 & 1); // ternary operation

Step 2: Comparison

AVX512 supports unsigned byte comparison, and returns a mask.

Step 3: Conditional operations

This is straightforward use of masked operations.

Implementation

The actual AVX512 implementation is shown below. Unlike SSE code, the inner loop is manually unrolled. Also, there's no explictly use of the ternary logic intrinsic function — but examing the assembly code revelas that a compiler nicely fuses binary operation.

void avx512_long_div_u8(const uint8_t* A, const uint8_t* B, uint8_t* C, size_t n) {
    const __m512i one = _mm512_set1_epi8(1);

    for (size_t i=0; i < n; i += 64) {
        const __m512i dividend = _mm512_loadu_si512((const __m512*)(&A[i]));
        const __m512i divisor  = _mm512_loadu_si512((const __m512*)(&B[i]));

        const __m512i dividend_bit7 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 7), one);
        const __m512i dividend_bit6 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 6), one);
        const __m512i dividend_bit5 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 5), one);
        const __m512i dividend_bit4 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 4), one);
        const __m512i dividend_bit3 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 3), one);
        const __m512i dividend_bit2 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 2), one);
        const __m512i dividend_bit1 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 1), one);
        const __m512i dividend_bit0 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 0), one);

        __m512i quotient = _mm512_set1_epi32(0);
        __m512i reminder = dividend_bit7;

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit6);

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi32(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit5);

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi32(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit4);

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi32(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit3);

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi32(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit2);

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi32(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit1);

        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi32(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        reminder = _mm512_add_epi32(reminder, reminder);
        reminder = _mm512_or_epi32(reminder, dividend_bit0);


        {
            const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor);
            reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor);
            quotient = _mm512_add_epi8(quotient, quotient);
            quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one);
        }

        _mm512_storeu_si512((__m512*)&C[i], quotient);
    }
}