Skip to content

Commit 8ed3cd1

Browse files
committed
Implement basic Frida JSONL output and parser
1 parent 5415459 commit 8ed3cd1

File tree

6 files changed

+221
-112
lines changed

6 files changed

+221
-112
lines changed

capa/features/extractors/frida/extractor.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Union, Iterator
22
from pathlib import Path
33

4-
from .models import FridaReport, Call
4+
from models import FridaReport, Call
55
from capa.features.common import Feature, String, OS, Arch, Format
66
from capa.features.insn import API, Number
77
from capa.features.address import (
@@ -28,6 +28,9 @@ class FridaExtractor(DynamicFeatureExtractor):
2828
Processes JSON output from Frida instrumentation to extract behavioral features.
2929
"""
3030
def __init__(self, report: FridaReport):
31+
# TODO: From what I’ve found, Frida cannot access the original APK file to compute hashes at runtime.
32+
# We may need to require users to provide both the Frida-generated log file and the original file to capa,
33+
# like we do with other extractors, e.g. BinExport, VMRay, etc.
3134
super().__init__(
3235
hashes=SampleHashes(md5="", sha1="", sha256="")
3336
)
@@ -39,12 +42,27 @@ def get_base_address(self) -> Union[_NoAddress, None]:
3942

4043
def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
4144
"""Basic global features"""
42-
yield OS("android"), NO_ADDRESS
43-
yield Arch("aarch64"), NO_ADDRESS
44-
yield Format("android"), NO_ADDRESS
45+
yield OS("android"), NO_ADDRESS # OS: Frida doesn't provide OS info
4546

47+
if self.report.processes:
48+
process = self.report.processes[0]
49+
50+
if process.arch:
51+
arch_mapping = {
52+
"arm64": "aarch64",
53+
"arm": "arm",
54+
"x64": "amd64",
55+
"x86": "i386"
56+
}
57+
capa_arch = arch_mapping.get(process.arch, process.arch)
58+
yield Arch(capa_arch), NO_ADDRESS
59+
60+
if process.platform:
61+
# TODO: capa doesn't have a dedicated FORMAT_ANDROID constant yet.
62+
yield Format("android"), NO_ADDRESS
63+
4664
def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
47-
"""Baisc file features"""
65+
"""Basic file features"""
4866
yield String(self.report.package_name), NO_ADDRESS
4967

5068
def get_processes(self) -> Iterator[ProcessHandle]:
@@ -78,20 +96,24 @@ def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]
7896
"""Get all API calls in a specific thread"""
7997
for i, call in enumerate(ph.inner.calls):
8098
if call.thread_id == th.address.tid:
81-
addr = DynamicCallAddress(thread=th.address, id=i)
99+
addr = DynamicCallAddress(thread=th.address, id=call.call_id)
82100
yield CallHandle(address=addr, inner=call)
83101

84102
def extract_call_features(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
85103
) -> Iterator[tuple[Feature, Address]]:
86104
"""Extract features from individual API calls"""
87-
# TODO: Implement call feature extraction
88-
105+
# TODO: Implement call feature extraction from arguments and return value
106+
call: Call = ch.inner
107+
108+
yield API(call.api_name), ch.address
109+
89110
def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
90111
"""Format API call name and parameters"""
112+
# TODO: Implement after extract_call_features
91113
call: Call = ch.inner
92114

93115
parts = []
94-
parts.append(call.api)
116+
parts.append(call.api_name)
95117
parts.append("(")
96118

97119
if call.arguments:

capa/features/extractors/frida/models.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,30 +9,58 @@ class FlexibleModel(BaseModel):
99

1010
class Call(FlexibleModel):
1111
"""Represents a single API call captured by Frida"""
12-
api: str # API name like "java.io.File.<init>", not sure if need to seperate 'japi' 'napi' 'jni'...
13-
thread_id: int
14-
timestamp: Optional[str] = None
15-
arguments: Dict[str, Any] = Field(default_factory=dict)
16-
return_value: Optional[str] = None
17-
caller: Optional[str] = None
12+
api_name: str # API name like "java.io.File.<init>", not sure if we need to separate 'japi' 'napi' 'jni'...
13+
process_id: int
14+
thread_id: int
15+
call_id: int
16+
17+
# timestamp: Optional[str] = None
18+
# arguments: Dict[str, Any] = Field(default_factory=dict)
19+
# return_value: Optional[Any] = None # Not very sure if we should use str as the return value type.
20+
# caller: Optional[str] = None
1821

1922

2023
class Process(FlexibleModel):
2124
"""Process information from Frida analysis"""
25+
# ppid is not stored here because Android apps are usually single-process; extractor.py sets ppid=0 when processing.
2226
pid: int
2327
package_name: str
28+
arch: Optional[str] = None
29+
platform: Optional[str] = None
2430
calls: List[Call] = Field(default_factory=list)
2531

26-
2732
class FridaReport(FlexibleModel):
2833
"""Main report structure for Android analysis"""
34+
# TODO: Some more file-level information may go here.
2935
package_name: str
3036
processes: List[Process] = Field(default_factory=list)
3137

3238
@classmethod
3339
def from_json_file(cls, json_path) -> "FridaReport":
34-
"""Load from JSON file created by log_converter.py"""
40+
"""Load from the JSON Lines file produced by the Frida script (java_monitor.js)"""
41+
metadata = None
42+
api_calls = []
43+
3544
with open(json_path, 'r') as f:
36-
data = json.load(f)
37-
return cls.model_validate(data) #
45+
for line in f:
46+
if line.strip():
47+
record = json.loads(line)
48+
49+
if "metadata" in record:
50+
metadata = record["metadata"]
51+
elif "api" in record and "java_api" in record["api"]:
52+
api_calls.append(record["api"]["java_api"])
53+
54+
process = Process(
55+
pid=metadata["process_id"],
56+
package_name=metadata.get("package_name"),
57+
arch=metadata.get("arch"),
58+
platform=metadata.get("platform"),
59+
calls=[Call(**call) for call in api_calls]
60+
)
61+
62+
return cls(
63+
package_name=metadata.get("package_name"),
64+
processes=[process]
65+
)
3866

scripts/frida/README.md

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,50 @@
22

33
## Usage
44

5+
**Environment Setup Guide:** [Frida Server + Rooted Emulator + Python Analysis Environment](https://docs.google.com/document/d/1fFf9Wu5y1q6OLojCpL4nPGvQ-Ne8ZpMeEBjdLe6Ef8c/edit?tab=t.t3e2ha7p49lk)
6+
7+
### Device Preparation
8+
9+
```bash
10+
# Create output directory with full permissions
11+
adb shell su -c "mkdir -p /data/local/tmp/frida_output && chmod 777 /data/local/tmp/frida_output"
12+
13+
# Disable SELinux enforcement (resets on reboot)
14+
adb shell su -c "setenforce 0"
15+
16+
# Start Frida server on device
17+
adb shell su -c "/data/local/tmp/frida-server &"
18+
```
19+
520
### Step 1: Capture API calls with Frida
21+
622
```bash
723
# Attach Frida to the target app and log Java API calls
8-
frida -U -f com.example.app -l java_monitor.js --no-pause > frida_output.log
24+
frida -U -f com.example.app -l java_monitor.js
925
```
1026

11-
### Step 2: Convert logs to capa format
27+
### Step 2: Retrieve Analysis Data
28+
1229
```bash
13-
# Convert raw Frida logs to capa-compatible JSON
14-
python log_converter.py frida_output.log com.example.app output.json
30+
# Check that the output file exists
31+
adb shell su -c "ls -la /data/local/tmp/frida_output/"
32+
33+
# Method 1: Using cat with root permissions
34+
adb shell su -c "cat /data/local/tmp/frida_output/api_calls.jsonl" > api_calls.jsonl
35+
36+
# OR Method 2: Using adb pull
37+
adb pull /data/local/tmp/frida_output/api_calls.jsonl ./api_calls.jsonl
1538
```
1639

1740
### Step 3: Analyze with capa
1841
```bash
19-
# Run capa on the converted log file
20-
capa output.json
42+
capa api_calls.jsonl
2143
```
2244

2345
## Architecture
24-
Android App → Frida Script → Log Converter → FridaExtractor → Capa Engine
46+
Android App → Frida Script → FridaExtractor → Capa Engine
2547

26-
- **java_monitor.js**: Frida script for Java API monitoring
27-
- **log_converter.py**: Converts raw Frida logs to structured JSON
48+
- **java_monitor.js**: Frida script for Java API monitoring; outputs JSON Lines compatible with capa.
2849
- **extractor.py**: Contains `FridaExtractor` class implementing capa’s dynamic analysis interface
2950
- **models.py**: Defines data models for API calls and process info
51+
- **api_calls.jsonl**: Current JSON Lines output example

scripts/frida/api_calls.jsonl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{"id":0,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":0,"api_name":"java.io.File.<init>"}}}
2+
{"id":1,"metadata":{"process_id":14881,"arch":"arm64","platform":"linux","package_name":"com.example.fridatestjavaapp"}}
3+
{"id":2,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":1,"api_name":"java.io.File.<init>"}}}
4+
{"id":3,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":2,"api_name":"java.io.File.<init>"}}}
5+
{"id":4,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":3,"api_name":"java.io.FileOutputStream.write"}}}
6+
{"id":5,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":4,"api_name":"java.io.File.<init>"}}}
7+
{"id":6,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":5,"api_name":"java.io.File.<init>"}}}
8+
{"id":7,"api":{"java_api":{"process_id":14881,"thread_id":14881,"call_id":6,"api_name":"java.io.FileInputStream.<init>(File)"}}}
9+
{"id":8,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":7,"api_name":"java.io.File.<init>"}}}
10+
{"id":9,"api":{"java_api":{"process_id":14881,"thread_id":14924,"call_id":8,"api_name":"java.io.FileInputStream.<init>(File)"}}}

0 commit comments

Comments
 (0)