Skip to content

Commit cc06df8

Browse files
committed
add basic Android dynamic extractor framework
1 parent 51f5114 commit cc06df8

File tree

3 files changed

+178
-0
lines changed

3 files changed

+178
-0
lines changed

capa/features/extractors/android/__init__.py

Whitespace-only changes.
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import logging
2+
from typing import Union, Iterator
3+
from pathlib import Path
4+
5+
from .models import AndroidReport, Call
6+
from capa.features.common import Feature
7+
from capa.features.address import (
8+
NO_ADDRESS,
9+
Address,
10+
ThreadAddress,
11+
ProcessAddress,
12+
DynamicCallAddress,
13+
_NoAddress
14+
)
15+
from capa.features.extractors.base_extractor import (
16+
CallHandle,
17+
SampleHashes,
18+
ThreadHandle,
19+
ProcessHandle,
20+
DynamicFeatureExtractor,
21+
)
22+
23+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
24+
25+
26+
class AndroidFeatureExtractor(DynamicFeatureExtractor):
    """Dynamic feature extractor backed by an ``AndroidReport``.

    Processes are taken directly from the report. Threads are not stored in
    the report; they are derived by grouping each process's calls by
    ``thread_id``. Feature extraction at every scope (global, file, process,
    thread, call) is still a stub that yields nothing.
    """

    def __init__(self, report: AndroidReport):
        """Wrap an already-parsed report.

        Args:
            report: the parsed Android report to enumerate.
        """
        # TODO: Not sure how to get APK hashes yet, will figure out later
        super().__init__(hashes=SampleHashes(md5="", sha1="", sha256=""))
        self.report: AndroidReport = report

        # (feature, address) pairs replayed by extract_global_features();
        # nothing in this class populates the list yet.
        self.global_features: list[tuple[Feature, Address]] = []

    def get_base_address(self) -> Union[_NoAddress, None]:
        """Return ``NO_ADDRESS``; this extractor does not report a base address."""
        return NO_ADDRESS

    def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
        """Yield the (currently empty) list of global features."""
        # TODO: Need to figure out what global features Android should have
        yield from self.global_features

    def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
        """Yield file-scope features; not implemented yet, so yields nothing."""
        # TODO: Will extract file-level features from Frida data later
        yield from []

    def get_processes(self) -> Iterator[ProcessHandle]:
        """Yield one handle per process in the report.

        The parent pid is not available from the report, so ``ppid`` is
        always 0. The handle's ``inner`` is the report's process object.
        """
        for process in self.report.processes:
            addr = ProcessAddress(pid=process.pid, ppid=0)
            yield ProcessHandle(address=addr, inner=process)

    def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, Address]]:
        """Yield process-scope features; not implemented yet, so yields nothing."""
        # TODO: Need to understand what process-level features make sense for Android
        yield from []

    def get_process_name(self, ph: ProcessHandle) -> str:
        """Return the package name of the process behind ``ph``."""
        return ph.inner.package_name

    def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
        """Yield one handle per distinct ``thread_id`` seen in the process's calls.

        Thread ids are yielded in ascending order so that repeated runs over
        the same report enumerate threads identically. The handle's ``inner``
        is a small dict ``{"tid": <thread id>}``.
        """
        thread_ids = {call.thread_id for call in ph.inner.calls}

        for tid in sorted(thread_ids):
            addr = ThreadAddress(process=ph.address, tid=tid)
            yield ThreadHandle(address=addr, inner={"tid": tid})

    def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[tuple[Feature, Address]]:
        """Yield thread-scope features; not implemented yet, so yields nothing."""
        # TODO: Need to understand what thread features would be useful for Android
        yield from []

    def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
        """Yield a handle for every call of the process that belongs to thread ``th``.

        The call id is the call's index in the process-wide call list, not
        its position within the thread.
        """
        for i, call in enumerate(ph.inner.calls):
            if call.thread_id != th.address.tid:
                continue
            addr = DynamicCallAddress(thread=th.address, id=i)
            yield CallHandle(address=addr, inner=call)

    def extract_call_features(
        self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
    ) -> Iterator[tuple[Feature, Address]]:
        """Yield call-scope features; not implemented yet, so yields nothing."""
        # TODO: Implement call feature extraction (not sure API names, arguments, return values)
        yield from []

    def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
        """Render a call as ``api(k=v, ...)`` with `` -> value`` appended when a return value is set.

        The return value is compared against ``None`` rather than tested for
        truthiness, so a captured empty-string return value is still shown.
        """
        call: Call = ch.inner

        args = ", ".join(f"{k}={v}" for k, v in call.arguments.items())
        rendered = f"{call.api}({args})"

        if call.return_value is not None:
            rendered += f" -> {call.return_value}"

        return rendered

    @classmethod
    def from_frida_log(cls, package_name: str, log_file: Path) -> "AndroidFeatureExtractor":
        """Create an extractor from a Frida log file - main entry point.

        Args:
            package_name: package name recorded on the report and its process.
            log_file: path to the UTF-8 encoded Frida log.

        Returns:
            An extractor wrapping the report parsed from the log's lines.
        """
        with open(log_file, "r", encoding="utf-8") as f:
            log_lines = f.readlines()

        report = AndroidReport.from_frida_logs(package_name, log_lines)
        return cls(report)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
from typing import Any, Dict, List, Optional
2+
from pydantic import BaseModel, Field
3+
4+
5+
class FlexibleModel(BaseModel):
    """Common pydantic base for the Android report models.

    Configured with ``extra = "allow"`` so that input keys which are not
    declared as fields on a subclass are accepted instead of rejected.
    """

    class Config:
        extra = "allow"
9+
10+
11+
class Call(FlexibleModel):
    """One API call captured by Frida."""

    # API name like "java.io.File.<init>".
    # TODO: not sure yet whether to separate 'japi' / 'napi' / 'jni' calls.
    api: str
    # Id of the thread that made the call; used to group calls into threads.
    thread_id: int
    timestamp: Optional[str] = None
    # Argument name -> captured value; defaults to an empty dict.
    arguments: Dict[str, Any] = Field(default_factory=dict)
    return_value: Optional[str] = None
    caller: Optional[str] = None
19+
20+
21+
class Process(FlexibleModel):
    """A single Android process and the API calls recorded for it."""

    pid: int
    package_name: str
    # Calls recorded for this process; starts out empty.
    calls: List[Call] = Field(default_factory=list)
26+
27+
28+
class AndroidReport(FlexibleModel):
    """Main report structure for Android analysis: a package and its processes."""

    package_name: str
    processes: List[Process] = Field(default_factory=list)

    @classmethod
    def from_frida_logs(cls, package_name: str, log_lines: List[str]) -> "AndroidReport":
        """Parse Frida JSON log lines into a structured report.

        Each line is searched for a JSON object spanning its first ``{`` to
        its last ``}``. Objects whose ``"type"`` is ``"api"`` become ``Call``
        entries on a single process with pid 1. Parsing is best-effort: lines
        without a JSON object, lines whose JSON does not decode, and "api"
        events lacking a ``"name"`` are skipped rather than aborting the
        whole parse.

        Args:
            package_name: package name stored on the report and its process.
            log_lines: raw lines of the Frida log.

        Returns:
            A report containing exactly one process with the parsed calls.
        """
        import json

        # TODO: Create a single process for now (maybe I can extend later)
        process = Process(pid=1, package_name=package_name)

        for line in log_lines:
            # Extract JSON from Frida log line: first "{" through last "}".
            start = line.find("{")
            end = line.rfind("}") + 1
            if start == -1 or end == 0:
                continue

            # Keep the try body to the one call that can raise the decode error.
            try:
                data = json.loads(line[start:end])
            except json.JSONDecodeError:
                continue

            if data.get("type") != "api":
                continue

            api_name = data.get("name")
            if api_name is None:
                # An api event without a name cannot be represented as a
                # Call; skip it instead of failing the whole report.
                continue

            process.calls.append(
                Call(
                    api=api_name,
                    thread_id=data.get("thread_id", 0),
                    arguments=data.get("args", {}),
                    caller=data.get("method", "unknown"),
                )
            )

        report = cls(package_name=package_name)
        report.processes.append(process)
        return report

0 commit comments

Comments
 (0)