Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 31 additions & 9 deletions capa/features/extractors/frida/extractor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Union, Iterator
from pathlib import Path

from .models import FridaReport, Call
from models import FridaReport, Call
from capa.features.common import Feature, String, OS, Arch, Format
from capa.features.insn import API, Number
from capa.features.address import (
Expand All @@ -28,6 +28,9 @@ class FridaExtractor(DynamicFeatureExtractor):
Processes JSON output from Frida instrumentation to extract behavioral features.
"""
def __init__(self, report: FridaReport):
# TODO: As far as I can tell, Frida cannot access the original APK file to compute hashes at runtime.
# We may need to require users to provide both the Frida-generated log file and the original file to capa,
# as we do with other extractors, e.g. BinExport, VMRay, etc.
super().__init__(
hashes=SampleHashes(md5="", sha1="", sha256="")
)
Expand All @@ -39,12 +42,27 @@ def get_base_address(self) -> Union[_NoAddress, None]:

def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
"""Basic global features"""
yield OS("android"), NO_ADDRESS
yield Arch("aarch64"), NO_ADDRESS
yield Format("android"), NO_ADDRESS
yield OS("android"), NO_ADDRESS # OS: Frida doesn't provide OS info

if self.report.processes:
process = self.report.processes[0]

if process.arch:
arch_mapping = {
"arm64": "aarch64",
"arm": "arm",
"x64": "amd64",
"x86": "i386"
}
capa_arch = arch_mapping.get(process.arch, process.arch)
yield Arch(capa_arch), NO_ADDRESS

if process.platform:
# TODO: capa doesn't have a dedicated FORMAT_ANDROID constant yet.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add FORMAT_ANDROID in this PR so we won't run into issues later, e.g. using android vs Android in different places.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mike-hunhoff Your call, but do you think "android" is a good name here? The other formats are "pe", "elf", and "dotnet", which don't have a scope similar to "android".

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point @larchchen , @xukunzh what do you think about FORMAT_FRIDA? This is similar to what we use for VMRay and others.

Copy link
Owner Author

@xukunzh xukunzh Jun 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I realized I misunderstood the meaning of FORMAT. The format should represent the file format (like Android's APK, AAB), not the analysis method or platform. My fault.
We should let Frida access the source file to get the format. @mike-hunhoff @larchchen

yield Format("android"), NO_ADDRESS

def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
    """Yield file-level features.

    The only file feature emitted is the package name stored on the report,
    as a String feature with no associated address.
    """
    yield String(self.report.package_name), NO_ADDRESS

def get_processes(self) -> Iterator[ProcessHandle]:
Expand Down Expand Up @@ -78,20 +96,24 @@ def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]
"""Get all API calls in a specific thread"""
for i, call in enumerate(ph.inner.calls):
if call.thread_id == th.address.tid:
addr = DynamicCallAddress(thread=th.address, id=i)
addr = DynamicCallAddress(thread=th.address, id=call.call_id)
yield CallHandle(address=addr, inner=call)

def extract_call_features(
    self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
) -> Iterator[tuple[Feature, Address]]:
    """Yield the features of a single captured API call.

    Only the API name is emitted, paired with the address of the call handle.
    """
    # TODO: Implement call feature extraction from arguments and return value
    captured: Call = ch.inner
    yield API(captured.api_name), ch.address

def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
"""Format API call name and parameters"""
# TODO: Implement after extract_call_features supports arguments
call: Call = ch.inner

parts = []
parts.append(call.api)
parts.append(call.api_name)
parts.append("(")

if call.arguments:
Expand Down
47 changes: 37 additions & 10 deletions capa/features/extractors/frida/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,57 @@ class FlexibleModel(BaseModel):

class Call(FlexibleModel):
    """Represents a single API call captured by Frida."""

    # API name such as "java.io.File.<init>".
    # Not sure yet whether 'japi', 'napi' and 'jni' calls need to be separated.
    api_name: str
    process_id: int
    thread_id: int
    call_id: int
    # Fields below are not captured by the monitor script yet.
    # timestamp: Optional[str] = None
    # arguments: Dict[str, Any] = Field(default_factory=dict)
    # return_value: Optional[Any] = None  # not sure whether str should be the return value type
    # caller: Optional[str] = None


class Process(FlexibleModel):
    """Process information from Frida analysis."""

    # No ppid field: Android apps are usually single-process, so
    # extractor.py sets the parent pid to 0.
    pid: int
    package_name: str
    arch: Optional[str] = None
    platform: Optional[str] = None
    calls: List[Call] = Field(default_factory=list)


class FridaReport(FlexibleModel):
    """Main report structure for Android analysis."""

    # TODO: Some more file-level information may go here
    package_name: str
    processes: List[Process] = Field(default_factory=list)

    @classmethod
    def from_json_file(cls, json_path) -> "FridaReport":
        """Load a report from a JSON Lines file.

        Every non-blank line is parsed as one JSON record:

        - a record with a "metadata" key supplies the process information
          (if several are present, the last one wins);
        - a record whose "api" object contains "java_api" contributes one call.

        Records are accepted in any order, so the metadata record does not
        have to be the first line.

        Args:
            json_path: path of the JSON Lines file to read.

        Returns:
            A report holding a single process with all collected calls.

        Raises:
            ValueError: if the file contains no metadata record.
        """
        metadata = None
        api_calls = []

        with open(json_path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                record = json.loads(line)

                if "metadata" in record:
                    metadata = record["metadata"]
                elif "api" in record and "java_api" in record["api"]:
                    api_calls.append(record["api"]["java_api"])

        # Without this check a missing metadata record surfaces as an opaque
        # "'NoneType' object is not subscriptable" error below.
        if metadata is None:
            raise ValueError(f"no metadata record found in {json_path}")

        process = Process(
            pid=metadata["process_id"],
            package_name=metadata.get("package_name"),
            arch=metadata.get("arch"),
            platform=metadata.get("platform"),
            calls=[Call(**call) for call in api_calls],
        )

        return cls(
            package_name=metadata.get("package_name"),
            processes=[process],
        )

40 changes: 31 additions & 9 deletions scripts/frida/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,50 @@

## Usage

**Environment Setup Guide:** [Frida Server + Rooted Emulator + Python Analysis Environment](https://docs.google.com/document/d/1fFf9Wu5y1q6OLojCpL4nPGvQ-Ne8ZpMeEBjdLe6Ef8c/edit?tab=t.t3e2ha7p49lk)

### Device Preparation

```bash
# Create output directory with full permissions
adb shell su -c "mkdir -p /data/local/tmp/frida_output && chmod 777 /data/local/tmp/frida_output"

# Disable SELinux enforcement (resets on reboot)
adb shell su -c "setenforce 0"

# Start Frida server on device
adb shell su -c "/data/local/tmp/frida-server &"
```

### Step 1: Capture API calls with Frida

```bash
# Attach Frida to the target app and log Java API calls
frida -U -f com.example.app -l java_monitor.js --no-pause > frida_output.log
frida -U -f com.example.app -l java_monitor.js
```

### Step 2: Convert logs to capa format
### Step 2: Retrieve Analysis Data

```bash
# Convert raw Frida logs to capa-compatible JSON
python log_converter.py frida_output.log com.example.app output.json
# Check if the file exists
adb shell su -c "ls -la /data/local/tmp/frida_output/"

# Method 1: Using cat with root permissions
adb shell su -c "cat /data/local/tmp/frida_output/api_calls.jsonl" > api_calls.jsonl

# OR Method 2: Using adb pull
adb pull /data/local/tmp/frida_output/api_calls.jsonl ./api_calls.jsonl
```

### Step 3: Analyze with capa
```bash
# Run capa on the converted log file
capa output.json
capa api_calls.jsonl
```

## Architecture
Android App → Frida Script → Log Converter → FridaExtractor → Capa Engine
Android App → Frida Script → FridaExtractor → Capa Engine

- **java_monitor.js**: Frida script for Java API monitoring
- **log_converter.py**: Converts raw Frida logs to structured JSON
- **java_monitor.js**: Frida script for Java API monitoring; outputs JSON Lines compatible with capa.
- **extractor.py**: Contains `FridaExtractor` class implementing capa’s dynamic analysis interface
- **models.py**: Defines data models for API calls and process info
- **api_calls.jsonl**: Current JSON Lines output example
10 changes: 10 additions & 0 deletions scripts/frida/api_calls.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{"id":0,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":0,"api_name":"java.io.File.<init>"}}}
{"id":1,"metadata":{"process_id":14881,"arch":"arm64","platform":"linux","package_name":"com.example.fridatestjavaapp"}}
{"id":2,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":1,"api_name":"java.io.File.<init>"}}}
{"id":3,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":2,"api_name":"java.io.File.<init>"}}}
{"id":4,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":3,"api_name":"java.io.FileOutputStream.write"}}}
{"id":5,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":4,"api_name":"java.io.File.<init>"}}}
{"id":6,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":5,"api_name":"java.io.File.<init>"}}}
{"id":7,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":6,"api_name":"java.io.FileInputStream.<init>(File)"}}}
{"id":8,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":7,"api_name":"java.io.File.<init>"}}}
{"id":9,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":8,"api_name":"java.io.FileInputStream.<init>(File)"}}}
121 changes: 111 additions & 10 deletions scripts/frida/java_monitor.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,139 @@
* This script monitors Java API calls and outputs data in JSON format
*
* How to use it:
* frida -U -f com.example.app -l java_monitor.js --no-pause > output.log
* frida -U -f com.example.app -l java_monitor.js
*/

// TODO: Add Thread.backtrace to get position to ensure api call happens on target place
// TODO: Auto create script with api list
// TODO: Selective api list get from rules

console.log("[+] Capa Frida Java Monitor initializing...");

// TODO: Should we use timestamp in filename for multiple runs? and let user specify output path via command line?
var timestamp = Date.now();
var filePath = "/data/local/tmp/frida_output/api_calls.jsonl";
// "/data/data/com.example.fridatestjavaapp/files/api_calls.json";
// "/data/local/tmp/frida_output/frida_" + timestamp + ".json";

var outputFile = null;
var recordId = 0;
var allMetadata = {};

try {
outputFile = new File(filePath, "w");
} catch (e) {
console.log("[ERROR] Failed to open file: " + e);
}

/**
 * Append one record to the output file as a single JSON line and flush it.
 *
 * @param {Object} record - value to serialize with JSON.stringify.
 * @returns {boolean} true when the record was written, false when no
 *     output file is open.
 */
function writeRecord(record) {
    if (!outputFile) {
        return false;
    }

    var line = JSON.stringify(record) + '\n';
    outputFile.write(line);
    outputFile.flush();
    return true;
}

/**
 * Write the collected metadata as one record, consuming the next record id.
 * Logs a confirmation only when the record was actually written.
 */
function writeMetadata() {
    var written = writeRecord({
        "id": recordId++,
        "metadata": allMetadata
    });

    if (written) {
        console.log("[+] Metadata written")
    }
}

/**
 * Write one Java API call as a record of the form
 * {"id": ..., "api": {"java_api": apiData}}, consuming the next record id.
 *
 * @param {Object} apiData - call description; its api_name is echoed in the log.
 */
function writeJavaApiCall(apiData) {
    var written = writeRecord({
        "id": recordId++,
        "api": {
            "java_api": apiData
        }
    });

    if (written) {
        console.log("[+] API call written: " + apiData.api_name);
    }
}

/**
 * Copy the process id, architecture and platform reported by Frida's
 * Process object into the shared metadata object.
 */
function collectBasicInfo() {
    var basics = {
        process_id: Process.id,
        arch: Process.arch,
        platform: Process.platform
    };

    Object.keys(basics).forEach(function (key) {
        allMetadata[key] = basics[key];
    });

    console.log("[+] Basic info collected");
}

collectBasicInfo();

Java.perform(function() {
console.log("[+] Capa Frida Java Monitor started");

// Debugging showed that ActivityThread.currentApplication() only becomes available after about 1 second and returns null before that,
// so the metadata record is not guaranteed to be the first line of the output file.
// This approach ensures that each script reinjection writes complete metadata without requiring a device restart.
setTimeout(function() {

var ActivityThread = Java.use("android.app.ActivityThread");
var currentApp = ActivityThread.currentApplication();

if (currentApp && currentApp.getPackageName) {
allMetadata.package_name = currentApp.getPackageName().toString();
console.log("[+] Package name: " + allMetadata.package_name);
} else {
console.log("[!] Could not get package name, using fallback");
allMetadata.package_name = "unknown_package";
}

writeMetadata();
}, 1000);

var call_id = 0;

// recordApiCall currently captures only the basic fields: process_id, thread_id, call_id, api_name.
// TODO: Add the arguments and return_value parameters after testing the current basic structure.
/**
 * Record one API call: process id, current thread id, the next call id
 * (post-incremented) and the API name.
 *
 * @param {string} apiName - name of the hooked API.
 */
function recordApiCall(apiName) {
    writeJavaApiCall({
        "process_id": Process.id,
        "thread_id": Process.getCurrentThreadId(),
        "call_id": call_id++,
        "api_name": apiName
    });
}

/**
 * Print a verbose description of an API call to the console for debugging.
 *
 * @param {string} apiName - name of the hooked API.
 * @param {Object} [args] - call arguments; defaults to an empty object.
 * @param {*} [returnValue] - value returned by the call, if any.
 */
function debugLog(apiName, args, returnValue) {
    var logEntry = {
        "type": "api",
        "name": apiName,
        "args": args || {},
        "return_value": returnValue,
        "timestamp": Date.now(),
        "process_id": Process.id,
        "thread_id": Process.getCurrentThreadId(),
        // NOTE(review): the visible call sites invoke recordApiCall() first,
        // which post-increments call_id, so this value is one greater than the
        // call_id written to the output record — confirm whether that is intended.
        "call_id": call_id,
    };
    console.log("CAPA_API_LOG_ENTRY:" + JSON.stringify(logEntry));
}

// Monitor java.io.File
try {
var File = Java.use("java.io.File");

File.$init.overload('java.lang.String').implementation = function(path) {
JsonFormat("java.io.File.<init>", {"path": path});
recordApiCall("java.io.File.<init>");
debugLog("java.io.File.<init>", {"path": path});
return this.$init(path);
};

File.delete.implementation = function() {
var path = this.getAbsolutePath();
var result = this.delete();
JsonFormat("java.io.File.delete", {"path": path}, result);
recordApiCall("java.io.File.delete");
debugLog("java.io.File.delete", {"path": path}, result);
return result;
};

Expand All @@ -54,12 +151,14 @@ Java.perform(function() {

FileInputStream.$init.overload('java.io.File').implementation = function(file) {
var path = file.getAbsolutePath();
JsonFormat("java.io.FileInputStream.<init>", {"path": path});
recordApiCall("java.io.FileInputStream.<init>(File)");
debugLog("java.io.FileInputStream.<init>(File)", {"path": path});
return this.$init(file);
};

FileInputStream.$init.overload('java.lang.String').implementation = function(path) {
JsonFormat("java.io.FileInputStream.<init>", {"path": path});
recordApiCall("java.io.FileInputStream.<init>(String)");
debugLog("java.io.FileInputStream.<init>(String)", {"path": path});
return this.$init(path);
};

Expand All @@ -74,12 +173,14 @@ Java.perform(function() {
var FileOutputStream = Java.use("java.io.FileOutputStream");

FileOutputStream.$init.overload('java.lang.String').implementation = function(path) {
JsonFormat("java.io.FileOutputStream.<init>", {"path": path});
recordApiCall("java.io.FileOutputStream.<init>");
debugLog("java.io.FileOutputStream.<init>", {"path": path});
return this.$init(path);
};

FileOutputStream.write.overload('[B').implementation = function(bytes) {
JsonFormat("java.io.FileOutputStream.write", {"bytes": bytes.length});
recordApiCall("java.io.FileOutputStream.write");
debugLog("java.io.FileOutputStream.write", {"bytes": bytes.length});
return this.write(bytes);
};

Expand Down
Loading