xukunzh · xukunzh · Jun 23, 2025 · Jun 13, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/capa/features/extractors/frida/extractor.py b/capa/features/extractors/frida/extractor.py
@@ -1,7 +1,7 @@
 from typing import Union, Iterator
 from pathlib import Path
 
-from .models import FridaReport, Call
+from models import FridaReport, Call
 from capa.features.common import Feature, String, OS, Arch, Format
 from capa.features.insn import API, Number
 from capa.features.address import (
@@ -28,6 +28,9 @@ class FridaExtractor(DynamicFeatureExtractor):
     Processes JSON output from Frida instrumentation to extract behavioral features.
     """
     def __init__(self, report: FridaReport):
+        # TODO: From what I've found, Frida cannot access the original APK file to compute hashes at runtime.
+        # We may need to require users to provide both the Frida-generated log file and the original file to capa,
+        # like we do with other extractors, e.g. BinExport, VMRay, etc.
         super().__init__(
             hashes=SampleHashes(md5="", sha1="", sha256="")
         )
@@ -39,12 +42,27 @@ def get_base_address(self) -> Union[_NoAddress, None]:
 
     def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
         """Basic global features"""
-        yield OS("android"), NO_ADDRESS
-        yield Arch("aarch64"), NO_ADDRESS 
-        yield Format("android"), NO_ADDRESS
+        yield OS("android"), NO_ADDRESS  # OS: Frida doesn't provide OS info
 
+        if self.report.processes:
+            process = self.report.processes[0]
+
+            if process.arch:
+                arch_mapping = {
+                    "arm64": "aarch64",
+                    "arm": "arm",
+                    "x64": "amd64", 
+                    "x86": "i386"
+                }
+                capa_arch = arch_mapping.get(process.arch, process.arch)
+                yield Arch(capa_arch), NO_ADDRESS
+
+            if process.platform:
+                # TODO: capa doesn't have a dedicated FORMAT_ANDROID constant yet.
+                yield Format("android"), NO_ADDRESS
+
     def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
-        """Baisc file features"""
+        """Basic file features"""
         yield String(self.report.package_name), NO_ADDRESS
 
     def get_processes(self) -> Iterator[ProcessHandle]:
@@ -78,20 +96,24 @@ def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]
         """Get all API calls in a specific thread"""
         for i, call in enumerate(ph.inner.calls):
             if call.thread_id == th.address.tid:
-                addr = DynamicCallAddress(thread=th.address, id=i)
+                addr = DynamicCallAddress(thread=th.address, id=call.call_id)
                 yield CallHandle(address=addr, inner=call)
 
     def extract_call_features(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
     ) -> Iterator[tuple[Feature, Address]]:
         """Extract features from individual API calls"""
-        # TODO: Implement call feature extraction
-
+        # TODO: Implement call feature extraction from arguments and return value
+        call: Call = ch.inner
+
+        yield API(call.api_name), ch.address
+
     def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
         """Format API call name and parameters"""
+        # TODO: Implement after extract_call_features handles arguments
         call: Call = ch.inner
 
         parts = []
-        parts.append(call.api)
+        parts.append(call.api_name)
         parts.append("(")
 
         if call.arguments:

diff --git a/capa/features/extractors/frida/models.py b/capa/features/extractors/frida/models.py
@@ -9,30 +9,57 @@ class FlexibleModel(BaseModel):
 
 class Call(FlexibleModel):
     """Represents a single API call captured by Frida"""
-    api: str           # API name like "java.io.File.<init>", not sure if need to seperate 'japi' 'napi' 'jni'...
-    thread_id: int                             
-    timestamp: Optional[str] = None
-    arguments: Dict[str, Any] = Field(default_factory=dict)
-    return_value: Optional[str] = None
-    caller: Optional[str] = None
+    api_name: str           # API name like "java.io.File.<init>", not sure if we need to separate 'japi' 'napi' 'jni'...
+    process_id: int
+    thread_id: int 
+    call_id: int                             
+    # timestamp: Optional[str] = None
+    # arguments: Dict[str, Any] = Field(default_factory=dict)
+    # return_value: Optional[Any] = None     # Not very sure if we should use str as the return value type
+    # caller: Optional[str] = None
 
 
 class Process(FlexibleModel):
     """Process information from Frida analysis"""
+    # ppid is omitted here as Android apps are usually single-process; it will be set to 0 in extractor.py
     pid: int
     package_name: str
+    arch: Optional[str] = None
+    platform: Optional[str] = None
     calls: List[Call] = Field(default_factory=list)
 
-
 class FridaReport(FlexibleModel):
     """Main report structure for Android analysis"""
+    # TODO: Some more file-level information may go here
     package_name: str
     processes: List[Process] = Field(default_factory=list)
 
     @classmethod
     def from_json_file(cls, json_path) -> "FridaReport":
-        """Load from JSON file created by log_converter.py"""
+        """Load from a JSON Lines file produced by java_monitor.js"""
+        metadata = None
+        api_calls = []
+
         with open(json_path, 'r') as f:
-            data = json.load(f)
-        return cls.model_validate(data) #
+            for line in f:
+                if line.strip():
+                    record = json.loads(line)
+
+                    if "metadata" in record:
+                        metadata = record["metadata"]
+                    elif "api" in record and "java_api" in record["api"]:
+                        api_calls.append(record["api"]["java_api"])
+
+        process = Process(
+            pid=metadata["process_id"],
+            package_name=metadata.get("package_name"),
+            arch=metadata.get("arch"),
+            platform=metadata.get("platform"),
+            calls=[Call(**call) for call in api_calls]
+        )
+
+        return cls(
+            package_name=metadata.get("package_name"),
+            processes=[process]
+        )
 
diff --git a/scripts/frida/README.md b/scripts/frida/README.md
@@ -2,28 +2,50 @@
 
 ## Usage
 
+**Environment Setup Guide:** [Frida Server + Rooted Emulator + Python Analysis Environment](https://docs.google.com/document/d/1fFf9Wu5y1q6OLojCpL4nPGvQ-Ne8ZpMeEBjdLe6Ef8c/edit?tab=t.t3e2ha7p49lk)
+
+### Device Preparation
+
+```bash
+# Create output directory with full permissions
+adb shell su -c "mkdir -p /data/local/tmp/frida_output && chmod 777 /data/local/tmp/frida_output"
+
+# Disable SELinux enforcement (resets on reboot)
+adb shell su -c "setenforce 0"
+
+# Start Frida server on device
+adb shell su -c "/data/local/tmp/frida-server &"
+```
+
 ### Step 1: Capture API calls with Frida
+
 ```bash
 # Attach Frida to the target app and log Java API calls
-frida -U -f com.example.app -l java_monitor.js --no-pause > frida_output.log
+frida -U -f com.example.app -l java_monitor.js
 ```
 
-### Step 2: Convert logs to capa format
+### Step 2: Retrieve Analysis Data
+
 ```bash
-# Convert raw Frida logs to capa-compatible JSON
-python log_converter.py frida_output.log com.example.app output.json
+# Check if the file exists
+adb shell su -c "ls -la /data/local/tmp/frida_output/"
+
+# Method 1: Using cat with root permissions
+adb shell su -c "cat /data/local/tmp/frida_output/api_calls.jsonl" > api_calls.jsonl
+
+# OR Method 2: Using adb pull
+adb pull /data/local/tmp/frida_output/api_calls.jsonl ./api_calls.jsonl
 ```
 
 ### Step 3: Analyze with capa
 ```bash
-# Run capa on the converted log file
-capa output.json
+capa api_calls.jsonl
 ```
 
 ## Architecture
-Android App → Frida Script → Log Converter → FridaExtractor → Capa Engine
+Android App → Frida Script → FridaExtractor → Capa Engine
 
-- **java_monitor.js**: Frida script for Java API monitoring
-- **log_converter.py**: Converts raw Frida logs to structured JSON
+- **java_monitor.js**: Frida script for Java API monitoring; outputs JSON compatible with capa.
 - **extractor.py**: Contains `FridaExtractor` class implementing capa’s dynamic analysis interface
 - **models.py**: Defines data models for API calls and process info
+- **api_calls.jsonl**: Example of the current JSON Lines output
diff --git a/scripts/frida/api_calls.jsonl b/scripts/frida/api_calls.jsonl
@@ -0,0 +1,10 @@
+{"id":0,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":0,"api_name":"java.io.File.<init>"}}}
+{"id":1,"metadata":{"process_id":14881,"arch":"arm64","platform":"linux","package_name":"com.example.fridatestjavaapp"}}
+{"id":2,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":1,"api_name":"java.io.File.<init>"}}}
+{"id":3,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":2,"api_name":"java.io.File.<init>"}}}
+{"id":4,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":3,"api_name":"java.io.FileOutputStream.write"}}}
+{"id":5,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":4,"api_name":"java.io.File.<init>"}}}
+{"id":6,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":5,"api_name":"java.io.File.<init>"}}}
+{"id":7,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":6,"api_name":"java.io.FileInputStream.<init>(File)"}}}
+{"id":8,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":7,"api_name":"java.io.File.<init>"}}}
+{"id":9,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":8,"api_name":"java.io.FileInputStream.<init>(File)"}}}
diff --git a/scripts/frida/java_monitor.js b/scripts/frida/java_monitor.js
@@ -3,42 +3,139 @@
  * This script monitors Java API calls and outputs data in JSON format
  * 
  * How to use it:
- * frida -U -f com.example.app -l java_monitor.js --no-pause > output.log
+ * frida -U -f com.example.app -l java_monitor.js
  */
 
 // TODO: Add Thread.backtrace to get position to ensure api call happens on target place
 // TODO: Auto create script with api list
 // TODO: Selective api list get from rules
 
+console.log("[+] Capa Frida Java Monitor initializing...");
+
+// TODO: Should we put a timestamp in the filename to support multiple runs, and let the user specify the output path via the command line?
+var timestamp = Date.now(); 
+var filePath = "/data/local/tmp/frida_output/api_calls.jsonl";
+// "/data/data/com.example.fridatestjavaapp/files/api_calls.json";
+// "/data/local/tmp/frida_output/frida_" + timestamp + ".json";
+
+var outputFile = null;
+var recordId = 0;
+var allMetadata = {};
+
+try {
+    outputFile = new File(filePath, "w");
+} catch (e) {
+    console.log("[ERROR] Failed to open file: " + e);
+}
+
+function writeRecord(record) {
+    if (outputFile) {
+        outputFile.write(JSON.stringify(record) + '\n');
+        outputFile.flush();
+        return true;
+    }
+    return false;
+}
+
+function writeMetadata() {
+    var record = {
+        "id": recordId++,
+        "metadata": allMetadata
+    };
+
+    if (writeRecord(record)) {
+        console.log("[+] Metadata written")
+    }
+}
+
+function writeJavaApiCall(apiData) {
+    var record = {
+        "id": recordId++,
+        "api": {
+            "java_api": apiData
+        }
+    };
+
+    if (writeRecord(record)) {
+        console.log("[+] API call written: " + apiData.api_name);
+    }
+}
+
+function collectBasicInfo() {
+    allMetadata.process_id = Process.id;
+    allMetadata.arch = Process.arch;
+    allMetadata.platform = Process.platform;
+    console.log("[+] Basic info collected");
+}
+
+collectBasicInfo();
+
 Java.perform(function() {
     console.log("[+] Capa Frida Java Monitor started");
+
+    // Debugging showed ActivityThread.currentApplication() is available after 1 second and returns null before that,
+    // but delaying means metadata is not guaranteed to be written as the first line of the output file.
+    // The current approach ensures each script reinjection writes complete metadata without requiring a device restart.
+    setTimeout(function() {
+
+        var ActivityThread = Java.use("android.app.ActivityThread");
+        var currentApp = ActivityThread.currentApplication();
+
+        if (currentApp && currentApp.getPackageName) {
+            allMetadata.package_name = currentApp.getPackageName().toString();
+            console.log("[+] Package name: " + allMetadata.package_name);
+        } else {
+            console.log("[!] Could not get package name, using fallback");
+            allMetadata.package_name = "unknown_package";
+        }
+
+        writeMetadata();
+    }, 1000);
+
+    var call_id = 0;
+
+    // Currently recordApiCall only captures the basic fields: process_id, thread_id, call_id, api_name.
+    // TODO: Add arguments and return_value once the current basic structure has been tested.
+    function recordApiCall(apiName) {
+        var apiCallRecord = {
+            "process_id": Process.id,
+            "thread_id": Process.getCurrentThreadId(),
+            "call_id": call_id++,
+            "api_name": apiName
+        };
+
+        writeJavaApiCall(apiCallRecord);
+    }
 
-    function JsonFormat(apiName, args, returnValue) {
+    function debugLog(apiName, args, returnValue) {
         var logEntry = {
             "type": "api",
             "name": apiName,
             "args": args || {},
             "return_value": returnValue,
             "timestamp": Date.now(),
+            "process_id": Process.id,
             "thread_id": Process.getCurrentThreadId(),
-            "method": "Unknown"    
+            "call_id": call_id,
         };
-        console.log("FRIDA_JSON:" + JSON.stringify(logEntry));
+        console.log("CAPA_API_LOG_ENTRY:" + JSON.stringify(logEntry));
     }
 
     // Monitor java.io.File
     try {
         var File = Java.use("java.io.File");
 
         File.$init.overload('java.lang.String').implementation = function(path) {
-            JsonFormat("java.io.File.<init>", {"path": path});
+            recordApiCall("java.io.File.<init>");
+            debugLog("java.io.File.<init>", {"path": path});
             return this.$init(path);
         };
 
         File.delete.implementation = function() {
             var path = this.getAbsolutePath();
             var result = this.delete();
-            JsonFormat("java.io.File.delete", {"path": path}, result);
+            recordApiCall("java.io.File.delete");
+            debugLog("java.io.File.delete", {"path": path}, result);
             return result;
         };
 
@@ -54,12 +151,14 @@ Java.perform(function() {
 
         FileInputStream.$init.overload('java.io.File').implementation = function(file) {
             var path = file.getAbsolutePath();
-            JsonFormat("java.io.FileInputStream.<init>", {"path": path});
+            recordApiCall("java.io.FileInputStream.<init>(File)");
+            debugLog("java.io.FileInputStream.<init>(File)", {"path": path});
             return this.$init(file);
         };
 
         FileInputStream.$init.overload('java.lang.String').implementation = function(path) {
-            JsonFormat("java.io.FileInputStream.<init>", {"path": path});
+            recordApiCall("java.io.FileInputStream.<init>(String)");
+            debugLog("java.io.FileInputStream.<init>(String)", {"path": path});
             return this.$init(path);
         };
 
@@ -74,12 +173,14 @@ Java.perform(function() {
         var FileOutputStream = Java.use("java.io.FileOutputStream");
 
         FileOutputStream.$init.overload('java.lang.String').implementation = function(path) {
-            JsonFormat("java.io.FileOutputStream.<init>", {"path": path});
+            recordApiCall("java.io.FileOutputStream.<init>");
+            debugLog("java.io.FileOutputStream.<init>", {"path": path});
             return this.$init(path);
         };
 
         FileOutputStream.write.overload('[B').implementation = function(bytes) {
-            JsonFormat("java.io.FileOutputStream.write", {"bytes": bytes.length});
+            recordApiCall("java.io.FileOutputStream.write");
+            debugLog("java.io.FileOutputStream.write", {"bytes": bytes.length});
             return this.write(bytes);
         };