From 2ac55ad795487fe5c28c7599b021ef369b1670d0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 9 Jul 2025 11:33:34 +0000 Subject: [PATCH 1/8] Implement IEEE 754 rounding conditions for fp32 to fp16 conversion --- paddle/phi/common/float16.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 830fb52db9568b..c3f8dd1fa4118b 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -107,8 +107,8 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) - half tmp = __float2half(val); - x = *reinterpret_cast(&tmp); + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); #elif defined(PADDLE_WITH_NATIVE_FP16) float32x4_t tmp = vld1q_dup_f32(&val); @@ -131,6 +131,11 @@ struct PADDLE_ALIGN(2) float16 { v.si ^= (s.si ^ v.si) & -(minN > v.si); v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + // Rounding conditions (round to nearest, ties to even). https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even + if (v.ui < infN) { // Skip special values (infinity and NaN) + const uint32_t lsb = (v.ui >> shift) & 0x1; // Lowest significant bit of the retained part + v.ui = (v.ui + 0xFFF + lsb); // rounding up + } v.ui >>= shift; // logical shift v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); From b7e42a6023da72de0f360824252efda216bbf372 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 9 Jul 2025 11:40:27 +0000 Subject: [PATCH 2/8] fix indent --- paddle/phi/common/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index c3f8dd1fa4118b..f7e06bdf1c3245 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -107,8 +107,8 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) - half tmp = __float2half(val); - x = *reinterpret_cast(&tmp); + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); #elif defined(PADDLE_WITH_NATIVE_FP16) float32x4_t tmp = vld1q_dup_f32(&val); From 253fde4f11612ab88ed6502b8a42d34d5fc5d3df Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 9 Jul 2025 12:26:33 +0000 Subject: [PATCH 3/8] fix codestyle --- paddle/phi/common/float16.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index f7e06bdf1c3245..5871e8b56ceadc 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -131,10 +131,12 @@ struct PADDLE_ALIGN(2) float16 { v.si ^= (s.si ^ v.si) & -(minN > v.si); v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - // Rounding conditions (round to nearest, ties to even). https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even - if (v.ui < infN) { // Skip special values (infinity and NaN) - const uint32_t lsb = (v.ui >> shift) & 0x1; // Lowest significant bit of the retained part - v.ui = (v.ui + 0xFFF + lsb); // rounding up + // Rounding conditions (round to nearest, ties to even). + // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even + if (v.ui < infN) { // Skip special values (infinity and NaN) + // Lowest significant bit of the retained part + const uint32_t lsb = (v.ui >> shift) & 0x1; + v.ui = (v.ui + 0xFFF + lsb); // rounding up } v.ui >>= shift; // logical shift v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); From ea6bcac1c75eb19c8e71d292d1f0c431e75e477a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 21 Jul 2025 04:39:25 +0000 Subject: [PATCH 4/8] [fp32tofp16] fix subnormal round to normal error --- paddle/phi/common/float16.h | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 5871e8b56ceadc..5fa34d418a8e30 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -123,24 +123,37 @@ struct PADDLE_ALIGN(2) float16 { // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Bits v, s; v.f = val; + // Extract sign bit and clear from value uint32_t sign = v.si & sigN; v.si ^= sign; - sign >>= shiftSign; // logical shift + sign >>= shiftSign; + + // Handle subnormals: normalize using multiplication + const uint32_t subnormal_mask = -(minN > v.si); s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); + s.si = s.f * v.f; // Extract the fraction of the subnormal number through + // multiplication and conversion from float to int + v.si ^= (s.si ^ v.si) & subnormal_mask; + + // Handle special values: infinity and NaN v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - // Rounding conditions (round to nearest, ties to even). + + // Rounding: round to nearest, ties to even // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even - if (v.ui < infN) { // Skip special values (infinity and NaN) - // Lowest significant bit of the retained part - const uint32_t lsb = (v.ui >> shift) & 0x1; - v.ui = (v.ui + 0xFFF + lsb); // rounding up - } + const uint32_t lsb = + (v.ui >> shift) & 0x1; // Least significant retained bit + v.ui += (0xFFF + lsb) & -(v.ui < infN); // Round with overflow protection + v.ui >>= shift; // logical shift + + // Exponent adjustment for overflow (max values) v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + // Exponent adjustment for normal numbers + const uint32_t normal_mask = ~subnormal_mask; + v.si ^= ((v.si - minD) ^ v.si) & normal_mask; + + // Combine sign and value bits x = v.ui | sign; #endif From f6030f478874dc4bb782829bf4087cc84656f41c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 24 Jul 2025 06:13:06 +0000 Subject: [PATCH 5/8] =?UTF-8?q?fp32tofp16=EF=BC=8Call=20test=20pass!?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/phi/common/float16.h | 127 +++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 54 deletions(-) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 5fa34d418a8e30..cff3bd6841e399 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -119,40 +119,53 @@ struct PADDLE_ALIGN(2) float16 { x = _cvtss_sh(val, 0); #else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; + Bits v; v.f = val; - // Extract sign bit and clear from value - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; - - // Handle subnormals: normalize using multiplication - const uint32_t subnormal_mask = -(minN > v.si); - s.si = mulN; - s.si = s.f * v.f; // Extract the fraction of the subnormal number through - // multiplication and conversion from float to int - v.si ^= (s.si ^ v.si) & subnormal_mask; - - // Handle special values: infinity and NaN - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - - // Rounding: round to nearest, ties to even - // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even - const uint32_t lsb = - (v.ui >> shift) & 0x1; // Least significant retained bit - v.ui += (0xFFF + lsb) & -(v.ui < infN); // Round with overflow protection - - v.ui >>= shift; // logical shift - - // Exponent adjustment for overflow (max values) - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - // Exponent adjustment for normal numbers - const uint32_t normal_mask = ~subnormal_mask; - v.si ^= ((v.si - minD) ^ v.si) & normal_mask; + // 1. Extract sign bit and clear from value + const uint32_t sign = (v.ui & sigN) >> shiftSign; + v.ui &= ~sigN; + + // 2. Handle special values: infinity and NaN + const int32_t inf_cond = -((infN >= v.si) & (v.si >= minINF)); + const int32_t nan_cond = -((nanN > v.si) & (v.si > infN)); + v.si ^= (infN ^ v.si) & inf_cond; + v.si ^= (nanN ^ v.si) & nan_cond; + + const bool is_subnormal = (v.ui < minN); + if (is_subnormal) { + // 3. Handle subnormal numbers + // 3.1 Extract FP32 exponent and mantissa + const uint32_t exp = (v.ui >> 23) & exp_mask; + const uint32_t mantissa = (v.ui & mantissa_mask) | implicit_bit; + // 3.2 Compute required shift + const uint32_t shift_amount = exp_bias_diff - exp; + // 3.3 64-bit mantissa + uint64_t normalized_mantissa = static_cast(mantissa) + << precision_shift; + normalized_mantissa >>= shift_amount; + // 3.4 Round to nearest even + // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even + const uint32_t lsb = (normalized_mantissa >> mantissa_shift) & 0x1; + normalized_mantissa += rounding_bias + lsb; + v.ui = static_cast(normalized_mantissa >> mantissa_shift); + } else { + // 4. Handle normal numbers + // Round to nearest even + const uint32_t lsb = + (v.ui >> shift) & 0x1; // Least significant retained bit + const uint32_t rounding = + (0xFFF + lsb) & -(v.ui < infN); // Round with overflow protection + v.ui += rounding; + // inf and nan + const int32_t max_cond = -(v.ui >= infN); + // Align bits + v.ui >>= shift; + // Exponent adjustment for overflow + v.si ^= ((v.si - maxD) ^ v.si) & max_cond; + // Exponent adjustment for normal numbers + v.si ^= ((v.si - minD) ^ v.si); + } // Combine sign and value bits x = v.ui | sign; @@ -340,28 +353,34 @@ struct PADDLE_ALIGN(2) float16 { uint32_t ui; }; - static const int shift = 13; - static const int shiftSign = 16; - - static const int32_t infN = 0x7F800000; - static const int32_t maxN = 0x477FE000; // max flt16 as flt32 - static const int32_t minN = 0x38800000; // min flt16 normal as flt32 - static const int32_t sigN = 0x80000000; // sign bit - - static constexpr int32_t infC = infN >> shift; - static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 - static constexpr int32_t maxC = maxN >> shift; - static constexpr int32_t minC = minN >> shift; - static constexpr int32_t sigC = sigN >> shiftSign; - - static const int32_t mulN = 0x52000000; // (1 << 23) / minN - static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) - static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted - static const int32_t norC = 0x00400; // min flt32 normal downshifted - - static constexpr int32_t maxD = infC - maxC - 1; - static constexpr int32_t minD = minC - subC - 1; + static constexpr int shift = 13; + static constexpr int shiftSign = 16; + + static constexpr uint32_t infN = 0x7F800000; + static constexpr uint32_t maxN = 0x477FE000; // max flt16 as flt32 + static constexpr uint32_t minINF = 0x47800000; // min flt16 inf as flt32 + static constexpr uint32_t minN = 0x38800000; // min flt16 normal as flt32 + static constexpr uint32_t sigN = 0x80000000; // sign bit + + static constexpr uint32_t infC = infN >> shift; + static constexpr uint32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr uint32_t maxC = maxN >> shift; + static constexpr uint32_t minC = minN >> shift; + static constexpr uint32_t sigC = sigN >> shiftSign; + + static constexpr uint32_t subC = 0x003FF; // max flt32 subnormal downshifted + static constexpr uint32_t norC = 0x00400; // min flt32 normal downshifted + static constexpr uint32_t maxD = infC - maxC - 1; + static constexpr uint32_t minD = minC - subC - 1; + + static constexpr uint32_t exp_mask = 0xFF; + static constexpr uint32_t mantissa_mask = 0x7FFFFF; + static constexpr uint32_t implicit_bit = 0x800000; + static constexpr uint32_t exp_bias_diff = 113; // 127 - 14 + static constexpr uint64_t precision_shift = 40; + static constexpr uint64_t rounding_bias = 0xFFFFFFFFFFFFF; + static constexpr int mantissa_shift = 53; }; // Arithmetic operators on GPU From 519317bfa2eab515d692fb21bd28fc244f78d0a9 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 24 Jul 2025 06:49:47 +0000 Subject: [PATCH 6/8] fix --- paddle/phi/common/float16.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index cff3bd6841e399..c5e608b4e906e3 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -369,6 +369,7 @@ struct PADDLE_ALIGN(2) float16 { static constexpr uint32_t minC = minN >> shift; static constexpr uint32_t sigC = sigN >> shiftSign; + static const uint32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) static constexpr uint32_t subC = 0x003FF; // max flt32 subnormal downshifted static constexpr uint32_t norC = 0x00400; // min flt32 normal downshifted static constexpr uint32_t maxD = infC - maxC - 1; From 4cb193fe61c549fe263b2dba74c2163f126a432c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 28 Jul 2025 04:30:44 +0000 Subject: [PATCH 7/8] fix int to uint --- paddle/phi/common/float16.h | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index c5e608b4e906e3..9bbab2e1d4c703 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -127,10 +127,11 @@ struct PADDLE_ALIGN(2) float16 { v.ui &= ~sigN; // 2. Handle special values: infinity and NaN - const int32_t inf_cond = -((infN >= v.si) & (v.si >= minINF)); - const int32_t nan_cond = -((nanN > v.si) & (v.si > infN)); - v.si ^= (infN ^ v.si) & inf_cond; - v.si ^= (nanN ^ v.si) & nan_cond; + const uint32_t inf_cond = + (infN >= v.ui) && (v.ui >= minINF) ? 0xFFFFFFFF : 0; + const uint32_t nan_cond = (nanN > v.ui) && (v.ui > infN) ? 0xFFFFFFFF : 0; + v.ui ^= (infN ^ v.ui) & inf_cond; + v.ui ^= (nanN ^ v.ui) & nan_cond; const bool is_subnormal = (v.ui < minN); if (is_subnormal) { @@ -155,16 +156,16 @@ struct PADDLE_ALIGN(2) float16 { const uint32_t lsb = (v.ui >> shift) & 0x1; // Least significant retained bit const uint32_t rounding = - (0xFFF + lsb) & -(v.ui < infN); // Round with overflow protection + (v.ui < infN) ? (0xFFF + lsb) : 0; // Round with overflow protection v.ui += rounding; // inf and nan - const int32_t max_cond = -(v.ui >= infN); + const uint32_t max_cond = (v.ui >= infN) ? 0xFFFFFFFF : 0; // Align bits v.ui >>= shift; // Exponent adjustment for overflow - v.si ^= ((v.si - maxD) ^ v.si) & max_cond; + v.ui ^= ((v.ui - maxD) ^ v.ui) & max_cond; // Exponent adjustment for normal numbers - v.si ^= ((v.si - minD) ^ v.si); + v.ui ^= ((v.ui - minD) ^ v.ui); } // Combine sign and value bits x = v.ui | sign; @@ -291,18 +292,18 @@ struct PADDLE_ALIGN(2) float16 { // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Bits v; v.ui = this->x; - int32_t sign = v.si & sigC; - v.si ^= sign; + uint32_t sign = v.ui & sigC; + v.ui ^= sign; sign <<= shiftSign; - v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); - v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + v.ui ^= ((v.ui + minD) ^ v.ui) & -(int32_t)(v.ui > subC); + v.ui ^= ((v.ui + maxD) ^ v.ui) & -(int32_t)(v.ui > maxC); Bits s; - s.si = mulC; + s.ui = mulC; s.f *= v.si; - int32_t mask = -(norC > v.si); - v.si <<= shift; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; + int32_t mask = -(int32_t)(norC > v.ui); + v.ui <<= shift; + v.ui ^= (s.ui ^ v.ui) & mask; + v.ui |= sign; return v.f; #endif @@ -369,7 +370,7 @@ struct PADDLE_ALIGN(2) float16 { static constexpr uint32_t minC = minN >> shift; static constexpr uint32_t sigC = sigN >> shiftSign; - static const uint32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static constexpr uint32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) static constexpr uint32_t subC = 0x003FF; // max flt32 subnormal downshifted static constexpr uint32_t norC = 0x00400; // min flt32 normal downshifted static constexpr uint32_t maxD = infC - maxC - 1; From 507f29a528b94fb5fa0a4f9fdff075b9187c1149 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 30 Jul 2025 14:21:53 +0000 Subject: [PATCH 8/8] fix arm bug --- paddle/phi/common/float16.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 9bbab2e1d4c703..94e0e1d893fc62 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -118,6 +118,26 @@ struct PADDLE_ALIGN(2) float16 { #elif defined(__F16C__) and defined(__PADDLE_x86__) x = _cvtss_sh(val, 0); +#elif defined(PADDLE_WITH_ARM) + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & (int32_t)sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = 0x52000000; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -((int32_t)minN > v.si); + v.si ^= ((int32_t)infN ^ v.si) & + -(((int32_t)infN > v.si) & (v.si > (int32_t)maxN)); + v.si ^= ((int32_t)nanN ^ v.si) & + -(((int32_t)nanN > v.si) & (v.si > (int32_t)infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - (int32_t)maxD) ^ v.si) & -(v.si > (int32_t)maxC); + v.si ^= ((v.si - (int32_t)minD) ^ v.si) & -(v.si > (int32_t)subC); + x = v.ui | sign; + #else Bits v; v.f = val;