Implement basic Frida JSONL output and parser

xukunzh · xukunzh · commit 8ed3cd12a802 · 2025-06-13T16:50:25.000-07:00
diff --git a/capa/features/extractors/frida/extractor.py b/capa/features/extractors/frida/extractor.py
@@ -1,7 +1,7 @@
 from typing import Union, Iterator
 from pathlib import Path
 
-from .models import FridaReport, Call
+from models import FridaReport, Call
 from capa.features.common import Feature, String, OS, Arch, Format
 from capa.features.insn import API, Number
 from capa.features.address import (
@@ -28,6 +28,9 @@ class FridaExtractor(DynamicFeatureExtractor):
     Processes JSON output from Frida instrumentation to extract behavioral features.
     """
     def __init__(self, report: FridaReport):
+        # TODO: From what I’ve found, Frida cannot access the original APK file to compute hashes at runtime.
+        # We may need to require users to provide both the Frida-generated log file and the original file to capa,
+        # as we do with other extractors, e.g. BinExport, VMRay, etc.
         super().__init__(
             hashes=SampleHashes(md5="", sha1="", sha256="")
         )
@@ -39,12 +42,27 @@ def get_base_address(self) -> Union[_NoAddress, None]:
 
     def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
         """Basic global features"""
-        yield OS("android"), NO_ADDRESS
-        yield Arch("aarch64"), NO_ADDRESS 
-        yield Format("android"), NO_ADDRESS
+        yield OS("android"), NO_ADDRESS # OS: Frida doesn't provide OS info
 
+        if self.report.processes:
+            process = self.report.processes[0]
+            
+            if process.arch:
+                arch_mapping = {
+                    "arm64": "aarch64",
+                    "arm": "arm",
+                    "x64": "amd64", 
+                    "x86": "i386"
+                }
+                capa_arch = arch_mapping.get(process.arch, process.arch)
+                yield Arch(capa_arch), NO_ADDRESS
+            
+            if process.platform:
+                # TODO: capa doesn't have a dedicated FORMAT_ANDROID constant yet.
+                yield Format("android"), NO_ADDRESS
+        
     def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
-        """Baisc file features"""
+        """Basic file features"""
         yield String(self.report.package_name), NO_ADDRESS
 
     def get_processes(self) -> Iterator[ProcessHandle]:
@@ -78,20 +96,24 @@ def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]
         """Get all API calls in a specific thread"""
         for i, call in enumerate(ph.inner.calls):
             if call.thread_id == th.address.tid:
-                addr = DynamicCallAddress(thread=th.address, id=i)
+                addr = DynamicCallAddress(thread=th.address, id=call.call_id)
                 yield CallHandle(address=addr, inner=call)
 
     def extract_call_features(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
     ) -> Iterator[tuple[Feature, Address]]:
         """Extract features from individual API calls"""
-        # TODO: Implement call feature extraction
-        
+        # TODO: Implement call feature extraction from arguments and return value
+        call: Call = ch.inner
+
+        yield API(call.api_name), ch.address
+
     def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
         """Format API call name and parameters"""
+        # TODO: Implement after extract_call_features
         call: Call = ch.inner
         
         parts = []
-        parts.append(call.api)
+        parts.append(call.api_name)
         parts.append("(")
         
         if call.arguments:
diff --git a/capa/features/extractors/frida/models.py b/capa/features/extractors/frida/models.py
@@ -9,30 +9,58 @@ class FlexibleModel(BaseModel):
 
 class Call(FlexibleModel):
     """Represents a single API call captured by Frida"""
-    api: str           # API name like "java.io.File.<init>", not sure if need to seperate 'japi' 'napi' 'jni'...
-    thread_id: int                             
-    timestamp: Optional[str] = None
-    arguments: Dict[str, Any] = Field(default_factory=dict)
-    return_value: Optional[str] = None
-    caller: Optional[str] = None
+    api_name: str           # API name like "java.io.File.<init>"; not sure whether we need to separate 'japi', 'napi', 'jni', ...
+    process_id: int
+    thread_id: int 
+    call_id: int                             
+    
+    # timestamp: Optional[str] = None
+    # arguments: Dict[str, Any] = Field(default_factory=dict)
+    # return_value: Optional[Any] = None     # Not very sure if we should use str as the return value type.
+    # caller: Optional[str] = None
 
 
 class Process(FlexibleModel):
     """Process information from Frida analysis"""
+    # ppid is not stored here because Android apps are usually single-process; extractor.py sets ppid=0 when processing.
     pid: int
     package_name: str
+    arch: Optional[str] = None
+    platform: Optional[str] = None
     calls: List[Call] = Field(default_factory=list)
 
-
 class FridaReport(FlexibleModel):
     """Main report structure for Android analysis"""
+    # TODO: Some more file-level information may go here.
     package_name: str
     processes: List[Process] = Field(default_factory=list)
     
     @classmethod
     def from_json_file(cls, json_path) -> "FridaReport":
-        """Load from JSON file created by log_converter.py"""
+        """Load from JSON Lines file created by log_converter.py"""
+        metadata = None
+        api_calls = []
+
         with open(json_path, 'r') as f:
-            data = json.load(f)
-        return cls.model_validate(data) #
+            for line in f:
+                if line.strip():
+                    record = json.loads(line)
+                    
+                    if "metadata" in record:
+                        metadata = record["metadata"]
+                    elif "api" in record and "java_api" in record["api"]:
+                        api_calls.append(record["api"]["java_api"])
+
+        process = Process(
+            pid=metadata["process_id"],
+            package_name=metadata.get("package_name"),
+            arch=metadata.get("arch"),
+            platform=metadata.get("platform"),
+            calls=[Call(**call) for call in api_calls]
+        )
+        
+        return cls(
+            package_name=metadata.get("package_name"),
+            processes=[process]
+        )
     
diff --git a/scripts/frida/README.md b/scripts/frida/README.md
@@ -2,28 +2,50 @@
 
 ## Usage
 
+**Environment Setup Guide:** [Frida Server + Rooted Emulator + Python Analysis Environment](https://docs.google.com/document/d/1fFf9Wu5y1q6OLojCpL4nPGvQ-Ne8ZpMeEBjdLe6Ef8c/edit?tab=t.t3e2ha7p49lk)
+
+### Device Preparation
+
+```bash
+# Create output directory with full permissions
+adb shell su -c "mkdir -p /data/local/tmp/frida_output && chmod 777 /data/local/tmp/frida_output"
+
+# Disable SELinux enforcement (resets on reboot)
+adb shell su -c "setenforce 0"
+
+# Start Frida server on device
+adb shell su -c "/data/local/tmp/frida-server &"
+```
+
 ### Step 1: Capture API calls with Frida
+
 ```bash
 # Attach Frida to the target app and log Java API calls
-frida -U -f com.example.app -l java_monitor.js --no-pause > frida_output.log
+frida -U -f com.example.app -l java_monitor.js
 ```
 
-### Step 2: Convert logs to capa format
+### Step 2: Retrieve Analysis Data
+
 ```bash
-# Convert raw Frida logs to capa-compatible JSON
-python log_converter.py frida_output.log com.example.app output.json
+# Check that the file exists
+adb shell su -c "ls -la /data/local/tmp/frida_output/"
+
+# Method 1: Using cat with root permissions
+adb shell su -c "cat /data/local/tmp/frida_output/api_calls.jsonl" > api_calls.jsonl
+
+# OR Method 2: Using adb pull
+adb pull /data/local/tmp/frida_output/api_calls.jsonl ./api_calls.jsonl
 ```
 
 ### Step 3: Analyze with capa
 ```bash
-# Run capa on the converted log file
-capa output.json
+capa api_calls.jsonl
 ```
 
 ## Architecture
-Android App → Frida Script → Log Converter → FridaExtractor → Capa Engine
+Android App → Frida Script → FridaExtractor → Capa Engine
 
-- **java_monitor.js**: Frida script for Java API monitoring
-- **log_converter.py**: Converts raw Frida logs to structured JSON
+- **java_monitor.js**: Frida script for Java API monitoring; outputs JSON compatible with capa.
 - **extractor.py**: Contains `FridaExtractor` class implementing capa’s dynamic analysis interface
 - **models.py**: Defines data models for API calls and process info
+- **api_calls.jsonl**: Current JSON Lines output example
diff --git a/scripts/frida/api_calls.jsonl b/scripts/frida/api_calls.jsonl
@@ -0,0 +1,10 @@
+{"id":0,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":0,"api_name":"java.io.File.<init>"}}}
+{"id":1,"metadata":{"process_id":14881,"arch":"arm64","platform":"linux","package_name":"com.example.fridatestjavaapp"}}
+{"id":2,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":1,"api_name":"java.io.File.<init>"}}}
+{"id":3,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":2,"api_name":"java.io.File.<init>"}}}
+{"id":4,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":3,"api_name":"java.io.FileOutputStream.write"}}}
+{"id":5,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":4,"api_name":"java.io.File.<init>"}}}
+{"id":6,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":5,"api_name":"java.io.File.<init>"}}}
+{"id":7,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":6,"api_name":"java.io.FileInputStream.<init>(File)"}}}
+{"id":8,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":7,"api_name":"java.io.File.<init>"}}}
+{"id":9,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":8,"api_name":"java.io.FileInputStream.<init>(File)"}}}
diff --git a/scripts/frida/java_monitor.js b/scripts/frida/java_monitor.js
diff --git a/scripts/frida/log_converter.py b/scripts/frida/log_converter.py