  *
  * Copies MR0-MR{n_untyped} from sender to receiver:
  * - MR0-MR7: From saved_mrs to receiver->ctx.regs[0-7]
- * - MR8-MR39: From sender->msg_buffer to receiver->msg_buffer (NEW)
+ * - MR8-MR39: From sender->msg_buffer to receiver->msg_buffer
  *
- * WCET: ~20 cycles (MR0-MR7) + ~100 cycles (MR8-MR39, if used)
+ * WCET: ~16-24 cycles (MR0-MR7 via ldmia/stmia) + ~100 cycles (MR8-MR39)
  */
 static inline void ipc_fastpath_copy_mrs(volatile uint32_t *saved_mrs,
                                          struct tcb *sender,
                                          struct tcb *receiver,
                                          int n_untyped)
 {
         int count = n_untyped + 1; /* +1 for tag in MR0 */
-        int i;
 
-        /* Phase 1: Copy MR0-MR7 from saved registers (R4-R11) */
-        for (i = 0; i < count && i < 8; i++)
-                receiver->ctx.regs[i] = saved_mrs[i];
+        /* MR0-MR7: Use ldmia/stmia for full 8-word copy (~16-24 cycles),
+         * otherwise C loop for partial copy (~3-5 cycles/word).
+         *
+         * ldmia/stmia constraints:
+         * - Base register must NOT be in the register list (UNPREDICTABLE)
+         * - Must clobber r4-r11 and use "memory" barrier
+         * - Both arrays are word-aligned (ctx.regs, __irq_saved_regs)
+         */
+        if (count >= 8) {
+                register uint32_t *src = (uint32_t *) saved_mrs;
+                register uint32_t *dst = (uint32_t *) receiver->ctx.regs;
+                __asm__ __volatile__(
+                        "ldmia %[src], {r4-r11}\n\t"
+                        "stmia %[dst], {r4-r11}\n\t"
+                        : [src] "+r"(src), [dst] "+r"(dst)
+                        :
+                        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "memory");
+        } else {
+                for (int i = 0; i < count; i++)
+                        receiver->ctx.regs[i] = saved_mrs[i];
+        }
 
-        /* Phase 2: Copy MR8-MR39 from sender's msg_buffer (if needed) */
+        /* MR8-MR39: C loop (ldmia/stmia not practical without spare registers) */
         if (count > 8) {
-                int buf_count = count - 8; /* Number of words in buffer */
+                int buf_count = count - 8;
                 if (buf_count > 32)
-                        buf_count = 32; /* Clamp to buffer size */
-
-                for (i = 0; i < buf_count; i++)
+                        buf_count = 32;
+                for (int i = 0; i < buf_count; i++)
                         receiver->msg_buffer[i] = sender->msg_buffer[i];
         }
 }
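
The revised WCET numbers check out against the usual Cortex-M3/M4 LDM/STM timing of roughly 1 + N cycles per instruction. The arithmetic below is a back-of-the-envelope sanity check assuming zero wait states, not a measurement from this kernel:

/* Rough WCET arithmetic (Cortex-M3/M4 LDM/STM ~ 1 + N cycles, zero wait
 * states assumed):
 *
 *   ldmia of 8 words  ~ 1 + 8 = 9 cycles
 *   stmia of 8 words  ~ 1 + 8 = 9 cycles
 *   -> ~18 cycles for the full MR0-MR7 copy, inside the quoted ~16-24
 *      range (the spread covers wait states and the partial C-loop path
 *      at ~3-5 cycles per word)
 *
 *   MR8-MR39: up to 32 words through the C loop
 *   -> 32 * ~3 cycles ~ 96 cycles, i.e. the quoted ~100 cycles
 */
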
@@ -152,20 +168,30 @@ static inline int ipc_fastpath_helper(struct tcb *caller, |
         caller->timeout_event = 0;
         to_thr->timeout_event = 0;
 
-        /* Receiver becomes runnable with IPC priority boost */
+        /* Receiver becomes runnable.
+         * Only boost priority if receiver was waiting for ANY message.
+         * If waiting for a specific reply, skip boost - thread was just
+         * processing an IPC and will return to user code immediately.
+         * This prevents priority inversion where reply receivers accumulate
+         * priority 3 and starve lower-priority threads indefinitely.
+         */
         to_thr->state = T_RUNNABLE;
+        if (to_thr->ipc_from == L4_ANYTHREAD)
+                sched_set_priority(to_thr, SCHED_PRIO_IPC);
         to_thr->ipc_from = L4_NILTHREAD;
-        sched_set_priority(to_thr, SCHED_PRIO_IPC);
         sched_enqueue(to_thr);
 
         /* Caller continues (send-only, no reply expected)
          * Fastpath only handles from_tid==NILTHREAD (simple send).
          * For L4_Call (send+receive), slowpath handles blocking.
          *
-         * Re-enqueue caller (was dequeued at SVC entry).
-         * It's safe to enqueue current thread - sched has double-enqueue
-         * protection.
+         * Restore caller's base priority before re-enqueueing.
+         * This mirrors slowpath behavior (thread_make_sender_runnable)
+         * and prevents IPC priority boost from accumulating, which would
+         * cause starvation of lower-priority threads.
          */
+        if (caller->priority != caller->base_priority)
+                sched_set_priority(caller, caller->base_priority);
         caller->state = T_RUNNABLE;
         sched_enqueue(caller);
 
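
To make the starvation problem this hunk addresses concrete, here is an illustrative sequence. The thread names are invented for the example; only the boost constant (SCHED_PRIO_IPC) and the open-wait condition come from the change itself:

/* Illustrative only -- hypothetical threads A, B, C:
 *
 * Old behavior (unconditional boost, no restore on the send path):
 *   1. A sends to B; B is boosted to SCHED_PRIO_IPC, and A keeps any
 *      boost it still carries from an earlier IPC.
 *   2. B replies; A, although it was waiting for a specific reply, is
 *      boosted to SCHED_PRIO_IPC as well.
 *   3. A and B ping-pong at the IPC priority indefinitely, so C, which
 *      would normally outrank their base priorities, never runs.
 *
 * New behavior:
 *   - the receiver is boosted only when it was in an open wait
 *     (ipc_from == L4_ANYTHREAD), and
 *   - the send-only caller is dropped back to base_priority before it
 *     is re-enqueued, so the boost cannot persist across calls.
 */
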
@@ -196,9 +222,27 @@ static inline int ipc_fastpath_helper(struct tcb *caller, |
 static inline int ipc_try_fastpath(struct tcb *caller, uint32_t *svc_param)
 {
         extern volatile uint32_t __irq_saved_regs[8];
+        uint32_t local_mrs[8];
 
-        /* Read from global __irq_saved_regs saved by SVC_HANDLER */
-        return ipc_fastpath_helper(caller, svc_param, __irq_saved_regs);
+        /* Copy __irq_saved_regs to local buffer IMMEDIATELY to prevent
+         * corruption from nested interrupts. A higher-priority IRQ could
+         * overwrite the global before we finish reading, corrupting MR0-MR7.
+         *
+         * This is safe because SVC is configured as the lowest-priority
+         * exception, so we cannot be preempted by another SVC, but we can
+         * be interrupted by higher-priority IRQs that also save to
+         * __irq_saved_regs.
+         */
+        local_mrs[0] = __irq_saved_regs[0];
+        local_mrs[1] = __irq_saved_regs[1];
+        local_mrs[2] = __irq_saved_regs[2];
+        local_mrs[3] = __irq_saved_regs[3];
+        local_mrs[4] = __irq_saved_regs[4];
+        local_mrs[5] = __irq_saved_regs[5];
+        local_mrs[6] = __irq_saved_regs[6];
+        local_mrs[7] = __irq_saved_regs[7];
+
+        return ipc_fastpath_helper(caller, svc_param, local_mrs);
 }
 
 #endif /* PLATFORM_IPC_FASTPATH_H_ */
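
Finally, the local_mrs snapshot in ipc_try_fastpath() is easiest to justify with a timeline of the race the old code was exposed to. The sketch below assumes only what the comment in the hunk states: higher-priority IRQ entry paths spill into the same __irq_saved_regs global.

/* Illustrative timeline of the race under the old code, which passed
 * __irq_saved_regs straight into ipc_fastpath_helper():
 *
 *   t0: SVC entry saves the caller's r4-r11 (MR0-MR7) to __irq_saved_regs
 *   t1: the fastpath copy has consumed __irq_saved_regs[0..3]
 *   t2: a higher-priority IRQ preempts and its entry path overwrites
 *       __irq_saved_regs with its own register spill
 *   t3: the copy resumes and reads [4..7], which are now stale
 *
 * Snapshotting all eight words into local_mrs as the very first action
 * means the rest of the fastpath works on a stable private copy instead
 * of a global that stays live for the whole IPC operation.
 */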