  *
  * Copies MR0-MR{n_untyped} from sender to receiver:
  * - MR0-MR7: From saved_mrs to receiver->ctx.regs[0-7]
- * - MR8-MR39: From sender->msg_buffer to receiver->msg_buffer (NEW)
+ * - MR8-MR39: From sender->msg_buffer to receiver->msg_buffer
  *
- * WCET: ~20 cycles (MR0-MR7) + ~100 cycles (MR8-MR39, if used)
+ * WCET: ~16-24 cycles (MR0-MR7 via ldmia/stmia) + ~100 cycles (MR8-MR39)
  */
 static inline void ipc_fastpath_copy_mrs(volatile uint32_t *saved_mrs,
                                          struct tcb *sender,
                                          struct tcb *receiver,
                                          int n_untyped)
 {
         int count = n_untyped + 1; /* +1 for tag in MR0 */
-        int i;
 
-        /* Phase 1: Copy MR0-MR7 from saved registers (R4-R11) */
-        for (i = 0; i < count && i < 8; i++)
-                receiver->ctx.regs[i] = saved_mrs[i];
+        /* MR0-MR7: Use ldmia/stmia for full 8-word copy (~16-24 cycles),
+         * otherwise C loop for partial copy (~3-5 cycles/word).
+         *
+         * ldmia/stmia constraints:
+         * - Base register must NOT be in the register list (UNPREDICTABLE)
+         * - Must clobber r4-r11 and use "memory" barrier
+         * - Both arrays are word-aligned (ctx.regs, __irq_saved_regs)
+         */
+        if (count >= 8) {
+                register uint32_t *src = (uint32_t *) saved_mrs;
+                register uint32_t *dst = (uint32_t *) receiver->ctx.regs;
+                __asm__ __volatile__(
+                        "ldmia %[src], {r4-r11}\n\t"
+                        "stmia %[dst], {r4-r11}\n\t"
+                        : [src] "+r"(src), [dst] "+r"(dst)
+                        :
+                        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "memory");
+        } else {
+                for (int i = 0; i < count; i++)
+                        receiver->ctx.regs[i] = saved_mrs[i];
+        }
 
-        /* Phase 2: Copy MR8-MR39 from sender's msg_buffer (if needed) */
+        /* MR8-MR39: C loop (ldmia/stmia not practical without spare registers) */
         if (count > 8) {
-                int buf_count = count - 8; /* Number of words in buffer */
+                int buf_count = count - 8;
                 if (buf_count > 32)
-                        buf_count = 32; /* Clamp to buffer size */
-
-                for (i = 0; i < buf_count; i++)
+                        buf_count = 32;
+                for (int i = 0; i < buf_count; i++)
                         receiver->msg_buffer[i] = sender->msg_buffer[i];
         }
 }
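
The revised WCET numbers check out against the usual Cortex-M3/M4 LDM/STM timing of roughly 1 + N cycles per instruction. The arithmetic below is a back-of-the-envelope sanity check assuming zero wait states, not a measurement from this kernel:

/* Rough WCET arithmetic (Cortex-M3/M4 LDM/STM ~ 1 + N cycles, zero wait
 * states assumed):
 *
 *   ldmia of 8 words  ~ 1 + 8 = 9 cycles
 *   stmia of 8 words  ~ 1 + 8 = 9 cycles
 *   -> ~18 cycles for the full MR0-MR7 copy, inside the quoted ~16-24
 *      range (the spread covers wait states and the partial C-loop path
 *      at ~3-5 cycles per word)
 *
 *   MR8-MR39: up to 32 words through the C loop
 *   -> 32 * ~3 cycles ~ 96 cycles, i.e. the quoted ~100 cycles
 */
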
@@ -152,20 +168,30 @@ static inline int ipc_fastpath_helper(struct tcb *caller, |
         caller->timeout_event = 0;
         to_thr->timeout_event = 0;
 
-        /* Receiver becomes runnable with IPC priority boost */
+        /* Receiver becomes runnable.
+         * Only boost priority if receiver was waiting for ANY message.
+         * If waiting for a specific reply, skip boost - thread was just
+         * processing an IPC and will return to user code immediately.
+         * This prevents priority inversion where reply receivers accumulate
+         * priority 3 and starve lower-priority threads indefinitely.
+         */
         to_thr->state = T_RUNNABLE;
+        if (to_thr->ipc_from == L4_ANYTHREAD)
+                sched_set_priority(to_thr, SCHED_PRIO_IPC);
         to_thr->ipc_from = L4_NILTHREAD;
-        sched_set_priority(to_thr, SCHED_PRIO_IPC);
         sched_enqueue(to_thr);
 
         /* Caller continues (send-only, no reply expected)
          * Fastpath only handles from_tid==NILTHREAD (simple send).
          * For L4_Call (send+receive), slowpath handles blocking.
          *
-         * Re-enqueue caller (was dequeued at SVC entry).
-         * It's safe to enqueue current thread - sched has double-enqueue
-         * protection.
+         * Restore caller's base priority before re-enqueueing.
+         * This mirrors slowpath behavior (thread_make_sender_runnable)
+         * and prevents IPC priority boost from accumulating, which would
+         * cause starvation of lower-priority threads.
          */
+        if (caller->priority != caller->base_priority)
+                sched_set_priority(caller, caller->base_priority);
         caller->state = T_RUNNABLE;
         sched_enqueue(caller);
 
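
To make the starvation problem this hunk addresses concrete, here is an illustrative sequence. The thread names are invented for the example; only the boost constant (SCHED_PRIO_IPC) and the open-wait condition come from the change itself:

/* Illustrative only -- hypothetical threads A, B, C:
 *
 * Old behavior (unconditional boost, no restore on the send path):
 *   1. A sends to B; B is boosted to SCHED_PRIO_IPC, and A keeps any
 *      boost it still carries from an earlier IPC.
 *   2. B replies; A, although it was waiting for a specific reply, is
 *      boosted to SCHED_PRIO_IPC as well.
 *   3. A and B ping-pong at the IPC priority indefinitely, so C, which
 *      would normally outrank their base priorities, never runs.
 *
 * New behavior:
 *   - the receiver is boosted only when it was in an open wait
 *     (ipc_from == L4_ANYTHREAD), and
 *   - the send-only caller is dropped back to base_priority before it
 *     is re-enqueued, so the boost cannot persist across calls.
 */
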
@@ -196,9 +222,27 @@ static inline int ipc_fastpath_helper(struct tcb *caller, |
 static inline int ipc_try_fastpath(struct tcb *caller, uint32_t *svc_param)
 {
         extern volatile uint32_t __irq_saved_regs[8];
+        uint32_t local_mrs[8];
 
-        /* Read from global __irq_saved_regs saved by SVC_HANDLER */
-        return ipc_fastpath_helper(caller, svc_param, __irq_saved_regs);
+        /* Copy __irq_saved_regs to local buffer IMMEDIATELY to prevent
+         * corruption from nested interrupts. A higher-priority IRQ could
+         * overwrite the global before we finish reading, corrupting MR0-MR7.
+         *
+         * This is safe because SVC is configured as the lowest-priority
+         * exception, so we cannot be preempted by another SVC, but we can
+         * be interrupted by higher-priority IRQs that also save to
+         * __irq_saved_regs.
+         */
+        local_mrs[0] = __irq_saved_regs[0];
+        local_mrs[1] = __irq_saved_regs[1];
+        local_mrs[2] = __irq_saved_regs[2];
+        local_mrs[3] = __irq_saved_regs[3];
+        local_mrs[4] = __irq_saved_regs[4];
+        local_mrs[5] = __irq_saved_regs[5];
+        local_mrs[6] = __irq_saved_regs[6];
+        local_mrs[7] = __irq_saved_regs[7];
+
+        return ipc_fastpath_helper(caller, svc_param, local_mrs);
 }
 
 #endif /* PLATFORM_IPC_FASTPATH_H_ */
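
Finally, the local_mrs snapshot in ipc_try_fastpath() is easiest to justify with a timeline of the race the old code was exposed to. The sketch below assumes only what the comment in the hunk states: higher-priority IRQ entry paths spill into the same __irq_saved_regs global.

/* Illustrative timeline of the race under the old code, which passed
 * __irq_saved_regs straight into ipc_fastpath_helper():
 *
 *   t0: SVC entry saves the caller's r4-r11 (MR0-MR7) to __irq_saved_regs
 *   t1: the fastpath copy has consumed __irq_saved_regs[0..3]
 *   t2: a higher-priority IRQ preempts and its entry path overwrites
 *       __irq_saved_regs with its own register spill
 *   t3: the copy resumes and reads [4..7], which are now stale
 *
 * Snapshotting all eight words into local_mrs as the very first action
 * means the rest of the fastpath works on a stable private copy instead
 * of a global that stays live for the whole IPC operation.
 */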