From 2ac55ad795487fe5c28c7599b021ef369b1670d0 Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Wed, 9 Jul 2025 11:33:34 +0000
Subject: [PATCH 1/8] Implement IEEE 754 rounding conditions for fp32 to fp16
 conversion

---
 paddle/phi/common/float16.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index 830fb52db9568b..c3f8dd1fa4118b 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -107,8 +107,8 @@ struct PADDLE_ALIGN(2) float16 {
   HOSTDEVICE inline explicit float16(float val) {
 #if defined(PADDLE_CUDA_FP16) && \
     (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300))
-    half tmp = __float2half(val);
-    x = *reinterpret_cast<uint16_t*>(&tmp);
+      half tmp = __float2half(val);
+      x = *reinterpret_cast<uint16_t*>(&tmp);
 
 #elif defined(PADDLE_WITH_NATIVE_FP16)
     float32x4_t tmp = vld1q_dup_f32(&val);
@@ -131,6 +131,11 @@ struct PADDLE_ALIGN(2) float16 {
     v.si ^= (s.si ^ v.si) & -(minN > v.si);
     v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
     v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+    // Rounding conditions (round to nearest, ties to even). https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
+    if (v.ui < infN) {                               // Skip special values (infinity and NaN)
+        const uint32_t lsb = (v.ui >> shift) & 0x1;  // Lowest significant bit of the retained part
+        v.ui = (v.ui + 0xFFF + lsb);                 // rounding up
+    }
     v.ui >>= shift;  // logical shift
     v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
     v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);

From b7e42a6023da72de0f360824252efda216bbf372 Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Wed, 9 Jul 2025 11:40:27 +0000
Subject: [PATCH 2/8] fix indent

---
 paddle/phi/common/float16.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index c3f8dd1fa4118b..f7e06bdf1c3245 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -107,8 +107,8 @@ struct PADDLE_ALIGN(2) float16 {
   HOSTDEVICE inline explicit float16(float val) {
 #if defined(PADDLE_CUDA_FP16) && \
     (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300))
-      half tmp = __float2half(val);
-      x = *reinterpret_cast<uint16_t*>(&tmp);
+    half tmp = __float2half(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
 
 #elif defined(PADDLE_WITH_NATIVE_FP16)
     float32x4_t tmp = vld1q_dup_f32(&val);

From 253fde4f11612ab88ed6502b8a42d34d5fc5d3df Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Wed, 9 Jul 2025 12:26:33 +0000
Subject: [PATCH 3/8] fix codestyle

---
 paddle/phi/common/float16.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index f7e06bdf1c3245..5871e8b56ceadc 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -131,10 +131,12 @@ struct PADDLE_ALIGN(2) float16 {
     v.si ^= (s.si ^ v.si) & -(minN > v.si);
     v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
     v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
-    // Rounding conditions (round to nearest, ties to even). https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
-    if (v.ui < infN) {                               // Skip special values (infinity and NaN)
-        const uint32_t lsb = (v.ui >> shift) & 0x1;  // Lowest significant bit of the retained part
-        v.ui = (v.ui + 0xFFF + lsb);                 // rounding up
+    // Rounding conditions (round to nearest, ties to even).
+    // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
+    if (v.ui < infN) {  // Skip special values (infinity and NaN)
+      // Lowest significant bit of the retained part
+      const uint32_t lsb = (v.ui >> shift) & 0x1;
+      v.ui = (v.ui + 0xFFF + lsb);  // rounding up
     }
     v.ui >>= shift;  // logical shift
     v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);

From ea6bcac1c75eb19c8e71d292d1f0c431e75e477a Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Mon, 21 Jul 2025 04:39:25 +0000
Subject: [PATCH 4/8] [fp32tofp16] fix subnormal round to normal error

---
 paddle/phi/common/float16.h | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index 5871e8b56ceadc..5fa34d418a8e30 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -123,24 +123,37 @@ struct PADDLE_ALIGN(2) float16 {
     // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
     Bits v, s;
     v.f = val;
+    // Extract sign bit and clear from value
     uint32_t sign = v.si & sigN;
     v.si ^= sign;
-    sign >>= shiftSign;  // logical shift
+    sign >>= shiftSign;
+
+    // Handle subnormals: normalize using multiplication
+    const uint32_t subnormal_mask = -(minN > v.si);
     s.si = mulN;
-    s.si = s.f * v.f;  // correct subnormals
-    v.si ^= (s.si ^ v.si) & -(minN > v.si);
+    s.si = s.f * v.f;  // Extract the fraction of the subnormal number through
+                       // multiplication and conversion from float to int
+    v.si ^= (s.si ^ v.si) & subnormal_mask;
+
+    // Handle special values: infinity and NaN
     v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
     v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
-    // Rounding conditions (round to nearest, ties to even).
+
+    // Rounding: round to nearest, ties to even
     // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
-    if (v.ui < infN) {  // Skip special values (infinity and NaN)
-      // Lowest significant bit of the retained part
-      const uint32_t lsb = (v.ui >> shift) & 0x1;
-      v.ui = (v.ui + 0xFFF + lsb);  // rounding up
-    }
+    const uint32_t lsb =
+        (v.ui >> shift) & 0x1;               // Least significant retained bit
+    v.ui += (0xFFF + lsb) & -(v.ui < infN);  // Round with overflow protection
+
     v.ui >>= shift;  // logical shift
+
+    // Exponent adjustment for overflow (max values)
     v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
-    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+    // Exponent adjustment for normal numbers
+    const uint32_t normal_mask = ~subnormal_mask;
+    v.si ^= ((v.si - minD) ^ v.si) & normal_mask;
+
+    // Combine sign and value bits
     x = v.ui | sign;
 
 #endif

From f6030f478874dc4bb782829bf4087cc84656f41c Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Thu, 24 Jul 2025 06:13:06 +0000
Subject: [PATCH 5/8] =?UTF-8?q?fp32tofp16=EF=BC=8Call=20test=20pass!?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddle/phi/common/float16.h | 127 +++++++++++++++++++++---------------
 1 file changed, 73 insertions(+), 54 deletions(-)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index 5fa34d418a8e30..cff3bd6841e399 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -119,40 +119,53 @@ struct PADDLE_ALIGN(2) float16 {
     x = _cvtss_sh(val, 0);
 
 #else
-    // Conversion routine adapted from
-    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-    Bits v, s;
+    Bits v;
     v.f = val;
-    // Extract sign bit and clear from value
-    uint32_t sign = v.si & sigN;
-    v.si ^= sign;
-    sign >>= shiftSign;
-
-    // Handle subnormals: normalize using multiplication
-    const uint32_t subnormal_mask = -(minN > v.si);
-    s.si = mulN;
-    s.si = s.f * v.f;  // Extract the fraction of the subnormal number through
-                       // multiplication and conversion from float to int
-    v.si ^= (s.si ^ v.si) & subnormal_mask;
-
-    // Handle special values: infinity and NaN
-    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
-    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
-
-    // Rounding: round to nearest, ties to even
-    // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
-    const uint32_t lsb =
-        (v.ui >> shift) & 0x1;               // Least significant retained bit
-    v.ui += (0xFFF + lsb) & -(v.ui < infN);  // Round with overflow protection
-
-    v.ui >>= shift;  // logical shift
-
-    // Exponent adjustment for overflow (max values)
-    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
-    // Exponent adjustment for normal numbers
-    const uint32_t normal_mask = ~subnormal_mask;
-    v.si ^= ((v.si - minD) ^ v.si) & normal_mask;
 
+    // 1. Extract sign bit and clear from value
+    const uint32_t sign = (v.ui & sigN) >> shiftSign;
+    v.ui &= ~sigN;
+
+    // 2. Handle special values: infinity and NaN
+    const int32_t inf_cond = -((infN >= v.si) & (v.si >= minINF));
+    const int32_t nan_cond = -((nanN > v.si) & (v.si > infN));
+    v.si ^= (infN ^ v.si) & inf_cond;
+    v.si ^= (nanN ^ v.si) & nan_cond;
+
+    const bool is_subnormal = (v.ui < minN);
+    if (is_subnormal) {
+      // 3. Handle subnormal numbers
+      // 3.1 Extract FP32 exponent and mantissa
+      const uint32_t exp = (v.ui >> 23) & exp_mask;
+      const uint32_t mantissa = (v.ui & mantissa_mask) | implicit_bit;
+      // 3.2 Compute required shift
+      const uint32_t shift_amount = exp_bias_diff - exp;
+      // 3.3 64-bit mantissa
+      uint64_t normalized_mantissa = static_cast<uint64_t>(mantissa)
+                                     << precision_shift;
+      normalized_mantissa >>= shift_amount;
+      // 3.4 Round to nearest even
+      // https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
+      const uint32_t lsb = (normalized_mantissa >> mantissa_shift) & 0x1;
+      normalized_mantissa += rounding_bias + lsb;
+      v.ui = static_cast<uint32_t>(normalized_mantissa >> mantissa_shift);
+    } else {
+      // 4. Handle normal numbers
+      // Round to nearest even
+      const uint32_t lsb =
+          (v.ui >> shift) & 0x1;  // Least significant retained bit
+      const uint32_t rounding =
+          (0xFFF + lsb) & -(v.ui < infN);  // Round with overflow protection
+      v.ui += rounding;
+      // inf and nan
+      const int32_t max_cond = -(v.ui >= infN);
+      // Align bits
+      v.ui >>= shift;
+      // Exponent adjustment for overflow
+      v.si ^= ((v.si - maxD) ^ v.si) & max_cond;
+      // Exponent adjustment for normal numbers
+      v.si ^= ((v.si - minD) ^ v.si);
+    }
     // Combine sign and value bits
     x = v.ui | sign;
 
@@ -340,28 +353,34 @@ struct PADDLE_ALIGN(2) float16 {
     uint32_t ui;
   };
 
-  static const int shift = 13;
-  static const int shiftSign = 16;
-
-  static const int32_t infN = 0x7F800000;
-  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
-  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
-  static const int32_t sigN = 0x80000000;  // sign bit
-
-  static constexpr int32_t infC = infN >> shift;
-  static constexpr int32_t nanN = (infC + 1)
-                                  << shift;  // minimum flt16 nan as float32
-  static constexpr int32_t maxC = maxN >> shift;
-  static constexpr int32_t minC = minN >> shift;
-  static constexpr int32_t sigC = sigN >> shiftSign;
-
-  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
-  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
-  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
-  static const int32_t norC = 0x00400;     // min flt32 normal downshifted
-
-  static constexpr int32_t maxD = infC - maxC - 1;
-  static constexpr int32_t minD = minC - subC - 1;
+  static constexpr int shift = 13;
+  static constexpr int shiftSign = 16;
+
+  static constexpr uint32_t infN = 0x7F800000;
+  static constexpr uint32_t maxN = 0x477FE000;    // max flt16 as flt32
+  static constexpr uint32_t minINF = 0x47800000;  // min flt16 inf as flt32
+  static constexpr uint32_t minN = 0x38800000;    // min flt16 normal as flt32
+  static constexpr uint32_t sigN = 0x80000000;    // sign bit
+
+  static constexpr uint32_t infC = infN >> shift;
+  static constexpr uint32_t nanN = (infC + 1)
+                                   << shift;  // minimum flt16 nan as float32
+  static constexpr uint32_t maxC = maxN >> shift;
+  static constexpr uint32_t minC = minN >> shift;
+  static constexpr uint32_t sigC = sigN >> shiftSign;
+
+  static constexpr uint32_t subC = 0x003FF;  // max flt32 subnormal downshifted
+  static constexpr uint32_t norC = 0x00400;  // min flt32 normal downshifted
+  static constexpr uint32_t maxD = infC - maxC - 1;
+  static constexpr uint32_t minD = minC - subC - 1;
+
+  static constexpr uint32_t exp_mask = 0xFF;
+  static constexpr uint32_t mantissa_mask = 0x7FFFFF;
+  static constexpr uint32_t implicit_bit = 0x800000;
+  static constexpr uint32_t exp_bias_diff = 113;  // 127 - 14
+  static constexpr uint64_t precision_shift = 40;
+  static constexpr uint64_t rounding_bias = 0xFFFFFFFFFFFFF;
+  static constexpr int mantissa_shift = 53;
 };
 
 // Arithmetic operators on GPU

From 519317bfa2eab515d692fb21bd28fc244f78d0a9 Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Thu, 24 Jul 2025 06:49:47 +0000
Subject: [PATCH 6/8] fix

---
 paddle/phi/common/float16.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index cff3bd6841e399..c5e608b4e906e3 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -369,6 +369,7 @@ struct PADDLE_ALIGN(2) float16 {
   static constexpr uint32_t minC = minN >> shift;
   static constexpr uint32_t sigC = sigN >> shiftSign;
 
+  static const uint32_t mulC = 0x33800000;   // minN / (1 << (23 - shift))
   static constexpr uint32_t subC = 0x003FF;  // max flt32 subnormal downshifted
   static constexpr uint32_t norC = 0x00400;  // min flt32 normal downshifted
   static constexpr uint32_t maxD = infC - maxC - 1;

From 4cb193fe61c549fe263b2dba74c2163f126a432c Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Mon, 28 Jul 2025 04:30:44 +0000
Subject: [PATCH 7/8] fix int to uint

---
 paddle/phi/common/float16.h | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index c5e608b4e906e3..9bbab2e1d4c703 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -127,10 +127,11 @@ struct PADDLE_ALIGN(2) float16 {
     v.ui &= ~sigN;
 
     // 2. Handle special values: infinity and NaN
-    const int32_t inf_cond = -((infN >= v.si) & (v.si >= minINF));
-    const int32_t nan_cond = -((nanN > v.si) & (v.si > infN));
-    v.si ^= (infN ^ v.si) & inf_cond;
-    v.si ^= (nanN ^ v.si) & nan_cond;
+    const uint32_t inf_cond =
+        (infN >= v.ui) && (v.ui >= minINF) ? 0xFFFFFFFF : 0;
+    const uint32_t nan_cond = (nanN > v.ui) && (v.ui > infN) ? 0xFFFFFFFF : 0;
+    v.ui ^= (infN ^ v.ui) & inf_cond;
+    v.ui ^= (nanN ^ v.ui) & nan_cond;
 
     const bool is_subnormal = (v.ui < minN);
     if (is_subnormal) {
@@ -155,16 +156,16 @@ struct PADDLE_ALIGN(2) float16 {
       const uint32_t lsb =
           (v.ui >> shift) & 0x1;  // Least significant retained bit
       const uint32_t rounding =
-          (0xFFF + lsb) & -(v.ui < infN);  // Round with overflow protection
+          (v.ui < infN) ? (0xFFF + lsb) : 0;  // Round with overflow protection
       v.ui += rounding;
       // inf and nan
-      const int32_t max_cond = -(v.ui >= infN);
+      const uint32_t max_cond = (v.ui >= infN) ? 0xFFFFFFFF : 0;
       // Align bits
       v.ui >>= shift;
       // Exponent adjustment for overflow
-      v.si ^= ((v.si - maxD) ^ v.si) & max_cond;
+      v.ui ^= ((v.ui - maxD) ^ v.ui) & max_cond;
       // Exponent adjustment for normal numbers
-      v.si ^= ((v.si - minD) ^ v.si);
+      v.ui ^= ((v.ui - minD) ^ v.ui);
     }
     // Combine sign and value bits
     x = v.ui | sign;
@@ -291,18 +292,18 @@ struct PADDLE_ALIGN(2) float16 {
     // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
     Bits v;
     v.ui = this->x;
-    int32_t sign = v.si & sigC;
-    v.si ^= sign;
+    uint32_t sign = v.ui & sigC;
+    v.ui ^= sign;
     sign <<= shiftSign;
-    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+    v.ui ^= ((v.ui + minD) ^ v.ui) & -(int32_t)(v.ui > subC);
+    v.ui ^= ((v.ui + maxD) ^ v.ui) & -(int32_t)(v.ui > maxC);
     Bits s;
-    s.si = mulC;
+    s.ui = mulC;
     s.f *= v.si;
-    int32_t mask = -(norC > v.si);
-    v.si <<= shift;
-    v.si ^= (s.si ^ v.si) & mask;
-    v.si |= sign;
+    int32_t mask = -(int32_t)(norC > v.ui);
+    v.ui <<= shift;
+    v.ui ^= (s.ui ^ v.ui) & mask;
+    v.ui |= sign;
     return v.f;
 
 #endif
@@ -369,7 +370,7 @@ struct PADDLE_ALIGN(2) float16 {
   static constexpr uint32_t minC = minN >> shift;
   static constexpr uint32_t sigC = sigN >> shiftSign;
 
-  static const uint32_t mulC = 0x33800000;   // minN / (1 << (23 - shift))
+  static constexpr uint32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
   static constexpr uint32_t subC = 0x003FF;  // max flt32 subnormal downshifted
   static constexpr uint32_t norC = 0x00400;  // min flt32 normal downshifted
   static constexpr uint32_t maxD = infC - maxC - 1;

From 507f29a528b94fb5fa0a4f9fdff075b9187c1149 Mon Sep 17 00:00:00 2001
From: Yi Liu <liuyi39@baidu.com>
Date: Wed, 30 Jul 2025 14:21:53 +0000
Subject: [PATCH 8/8] fix arm bug

---
 paddle/phi/common/float16.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index 9bbab2e1d4c703..94e0e1d893fc62 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -118,6 +118,26 @@ struct PADDLE_ALIGN(2) float16 {
 #elif defined(__F16C__) and defined(__PADDLE_x86__)
     x = _cvtss_sh(val, 0);
 
+#elif defined(PADDLE_WITH_ARM)
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v, s;
+    v.f = val;
+    uint32_t sign = v.si & (int32_t)sigN;
+    v.si ^= sign;
+    sign >>= shiftSign;  // logical shift
+    s.si = 0x52000000;
+    s.si = s.f * v.f;  // correct subnormals
+    v.si ^= (s.si ^ v.si) & -((int32_t)minN > v.si);
+    v.si ^= ((int32_t)infN ^ v.si) &
+            -(((int32_t)infN > v.si) & (v.si > (int32_t)maxN));
+    v.si ^= ((int32_t)nanN ^ v.si) &
+            -(((int32_t)nanN > v.si) & (v.si > (int32_t)infN));
+    v.ui >>= shift;  // logical shift
+    v.si ^= ((v.si - (int32_t)maxD) ^ v.si) & -(v.si > (int32_t)maxC);
+    v.si ^= ((v.si - (int32_t)minD) ^ v.si) & -(v.si > (int32_t)subC);
+    x = v.ui | sign;
+
 #else
     Bits v;
     v.f = val;