Skip to content
7 changes: 5 additions & 2 deletions capa/features/extractors/binja/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.binja.helpers import read_c_string, unmangle_c_name
from capa.features.extractors.binja.helpers import va_to_file_offset


def check_segment_for_pe(bv: BinaryView, seg: Segment) -> Iterator[tuple[Feature, Address]]:
Expand All @@ -46,7 +47,8 @@ def check_segment_for_pe(bv: BinaryView, seg: Segment) -> Iterator[tuple[Feature
buf = bv.read(seg.start, seg.length)

for offset, _ in capa.features.extractors.helpers.carve_pe(buf, start):
yield Characteristic("embedded pe"), FileOffsetAddress(seg.start + offset)
file_off = va_to_file_offset(bv, seg.start + offset)
yield Characteristic("embedded pe"), FileOffsetAddress(file_off)


def extract_file_embedded_pe(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
Expand Down Expand Up @@ -122,7 +124,8 @@ def extract_file_section_names(bv: BinaryView) -> Iterator[tuple[Feature, Addres
def extract_file_strings(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
"""extract ASCII and UTF-16 LE strings"""
for s in bv.strings:
yield String(s.value), FileOffsetAddress(s.start)
file_off = va_to_file_offset(bv, s.start)
yield String(s.value), FileOffsetAddress(file_off)


def extract_file_function_names(bv: BinaryView) -> Iterator[tuple[Feature, Address]]:
Expand Down
27 changes: 27 additions & 0 deletions capa/features/extractors/binja/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,30 @@ def get_llil_instr_at_addr(bv: BinaryView, addr: int) -> Optional[LowLevelILInst
if arch.get_instruction_low_level_il(buffer, addr, llil) == 0:
return None
return llil[0]


def va_to_file_offset(bv: BinaryView, va: int) -> int:
    """Map a BinaryView virtual address to its offset in the underlying file.

    Binary Ninja already exposes this mapping directly through
    `BinaryView.get_data_offset_for_address`, so the lookup is delegated to it
    instead of walking segments and sections by hand. The native API only
    answers for addresses that are actually backed by file data, which avoids
    producing bogus offsets for regions such as bss.

    args:
      bv: the BinaryView that contains the address.
      va: the virtual address to translate.

    returns:
      the file offset that backs `va`.

    raises:
      RuntimeError: when `va` has no file-backed mapping in `bv`.
    """
    file_offset = bv.get_data_offset_for_address(va)
    if file_offset is None:
        # Strict mapping: there is deliberately no fallback to the raw VA, so
        # callers must handle addresses without file backing explicitly.
        raise RuntimeError(f"unable to map virtual address to file offset: 0x{va:x}")

    return int(file_offset)
17 changes: 10 additions & 7 deletions capa/features/extractors/ghidra/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,11 @@ def extract_file_embedded_pe() -> Iterator[tuple[Feature, Address]]:
continue

for off, _ in find_embedded_pe(capa.features.extractors.ghidra.helpers.get_block_bytes(block), mz_xor):
# add offset back to block start
ea: int = block.getStart().add(off).getOffset()
# add offset back to block start (Address)
addr = block.getStart().add(off)
off_file = capa.features.extractors.ghidra.helpers.addr_to_file_offset(addr)

yield Characteristic("embedded pe"), FileOffsetAddress(ea)
yield Characteristic("embedded pe"), FileOffsetAddress(int(off_file))


def extract_file_export_names() -> Iterator[tuple[Feature, Address]]:
Expand Down Expand Up @@ -140,12 +141,14 @@ def extract_file_strings() -> Iterator[tuple[Feature, Address]]:
p_bytes = capa.features.extractors.ghidra.helpers.get_block_bytes(block)

for s in capa.features.extractors.strings.extract_ascii_strings(p_bytes):
offset = block.getStart().getOffset() + s.offset
yield String(s.s), FileOffsetAddress(offset)
addr = block.getStart().add(s.offset)
offset = capa.features.extractors.ghidra.helpers.addr_to_file_offset(addr)
yield String(s.s), FileOffsetAddress(int(offset))

for s in capa.features.extractors.strings.extract_unicode_strings(p_bytes):
offset = block.getStart().getOffset() + s.offset
yield String(s.s), FileOffsetAddress(offset)
addr = block.getStart().add(s.offset)
offset = capa.features.extractors.ghidra.helpers.addr_to_file_offset(addr)
yield String(s.s), FileOffsetAddress(int(offset))


def extract_file_function_names() -> Iterator[tuple[Feature, Address]]:
Expand Down
29 changes: 29 additions & 0 deletions capa/features/extractors/ghidra/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,32 @@ def find_data_references_from_insn(insn, max_depth: int = 10):
break

yield to_addr


def addr_to_file_offset(addr: ghidra.program.model.address.Address) -> int:
    """Map a Ghidra Address to the offset of its byte in the imported file.

    Ghidra tracks, per address, where the backing byte originated. This asks
    `Memory.getAddressSourceInfo()` for that record rather than deriving the
    offset from memory block geometry, so the address space is respected and
    blocks assembled from several file ranges resolve correctly.

    Algorithm:
      - look up the source info for `addr` in the current program's memory
      - if it reports a file offset, return it
      - otherwise (address not in memory, or bytes that did not come from the
        file) fall back to subtracting the program image base

    args:
      addr: an address in the current program.

    returns:
      the file offset for `addr`, or the image-base-relative offset when
      Ghidra has no file origin for it.
    """
    prog = currentProgram()  # type: ignore[name-defined]

    info = prog.getMemory().getAddressSourceInfo(addr)
    if info is not None:
        file_offset = info.getFileOffset()
        # getFileOffset() reports -1 when the byte has no file origin
        if file_offset >= 0:
            return int(file_offset)

    # no file-backed source: fall back to image-base subtraction
    base = prog.getImageBase().getOffset()
    return int(addr.getOffset() - base)

10 changes: 7 additions & 3 deletions capa/features/extractors/ida/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import idaapi
import idautils
import ida_entry
import ida_loader

import capa.ida.helpers
import capa.features.extractors.common
Expand Down Expand Up @@ -87,7 +88,8 @@ def extract_file_embedded_pe() -> Iterator[tuple[Feature, Address]]:
"""
for seg in capa.features.extractors.ida.helpers.get_segments(skip_header_segments=True):
for ea, _ in check_segment_for_pe(seg):
yield Characteristic("embedded pe"), FileOffsetAddress(ea)
off = ida_loader.get_fileregion_offset(ea)
yield Characteristic("embedded pe"), FileOffsetAddress(off)


def extract_file_export_names() -> Iterator[tuple[Feature, Address]]:
Expand Down Expand Up @@ -161,10 +163,12 @@ def extract_file_strings() -> Iterator[tuple[Feature, Address]]:

# differing to common string extractor factor in segment offset here
for s in capa.features.extractors.strings.extract_ascii_strings(seg_buff):
yield String(s.s), FileOffsetAddress(seg.start_ea + s.offset)
off = ida_loader.get_fileregion_offset(seg.start_ea + s.offset)
yield String(s.s), FileOffsetAddress(off)

for s in capa.features.extractors.strings.extract_unicode_strings(seg_buff):
yield String(s.s), FileOffsetAddress(seg.start_ea + s.offset)
off = ida_loader.get_fileregion_offset(seg.start_ea + s.offset)
yield String(s.s), FileOffsetAddress(off)


def extract_file_function_names() -> Iterator[tuple[Feature, Address]]:
Expand Down
Loading