FRIDA x64 performance improvements (#985)

Co-authored-by: Your Name <you@example.com>
This commit is contained in:
WorksButNotTested 2023-01-05 10:51:58 +00:00 committed by GitHub
parent 1bb37e4b98
commit f27ca843e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -8,9 +8,11 @@ use std::{
rc::Rc, rc::Rc,
}; };
use dynasmrt::{dynasm, DynasmApi, DynasmLabelApi}; #[cfg(target_arch = "aarch64")]
use dynasmrt::DynasmLabelApi;
use dynasmrt::{dynasm, DynasmApi};
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
use frida_gum::instruction_writer::X86Register; use frida_gum::instruction_writer::X86InstructionWriter;
#[cfg(target_arch = "aarch64")] #[cfg(target_arch = "aarch64")]
use frida_gum::instruction_writer::{Aarch64Register, IndexMode}; use frida_gum::instruction_writer::{Aarch64Register, IndexMode};
use frida_gum::{instruction_writer::InstructionWriter, stalker::StalkerOutput}; use frida_gum::{instruction_writer::InstructionWriter, stalker::StalkerOutput};
@ -26,6 +28,7 @@ pub const MAP_SIZE: usize = 64 * 1024;
struct CoverageRuntimeInner { struct CoverageRuntimeInner {
map: [u8; MAP_SIZE], map: [u8; MAP_SIZE],
previous_pc: u64, previous_pc: u64,
#[cfg(target_arch = "aarch64")]
current_log_impl: u64, current_log_impl: u64,
blob_maybe_log: Option<Box<[u8]>>, blob_maybe_log: Option<Box<[u8]>>,
_pinned: PhantomPinned, _pinned: PhantomPinned,
@ -50,6 +53,7 @@ impl FridaRuntime for CoverageRuntime {
_ranges: &RangeMap<usize, (u16, String)>, _ranges: &RangeMap<usize, (u16, String)>,
_modules_to_instrument: &[&str], _modules_to_instrument: &[&str],
) { ) {
#[cfg(target_arch = "aarch64")]
self.generate_maybe_log_blob(); self.generate_maybe_log_blob();
} }
@ -75,6 +79,7 @@ impl CoverageRuntime {
Self(Rc::pin(RefCell::new(CoverageRuntimeInner { Self(Rc::pin(RefCell::new(CoverageRuntimeInner {
map: [0_u8; MAP_SIZE], map: [0_u8; MAP_SIZE],
previous_pc: 0, previous_pc: 0,
#[cfg(target_arch = "aarch64")]
current_log_impl: 0, current_log_impl: 0,
blob_maybe_log: None, blob_maybe_log: None,
_pinned: PhantomPinned, _pinned: PhantomPinned,
@ -126,82 +131,93 @@ impl CoverageRuntime {
Some(ops_vec[..ops_vec.len() - 8].to_vec().into_boxed_slice()); Some(ops_vec[..ops_vec.len() - 8].to_vec().into_boxed_slice());
} }
/// A minimal `maybe_log` implementation. We insert this into the transformed instruction stream /// Write inline instrumentation for coverage
/// every time we need a copy that is within a direct branch of the start of the transformed basic
/// block.
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
pub fn generate_maybe_log_blob(&mut self) { pub fn generate_inline_code(&mut self, writer: &X86InstructionWriter, h64: u64) {
let mut borrow = self.0.borrow_mut();
let prev_loc_ptr = addr_of_mut!(borrow.previous_pc);
let map_addr_ptr = addr_of_mut!(borrow.map);
let mut ops = dynasmrt::VecAssembler::<dynasmrt::x64::X64Relocation>::new(0); let mut ops = dynasmrt::VecAssembler::<dynasmrt::x64::X64Relocation>::new(0);
dynasm!(ops dynasm!(ops
; .arch x64 ; .arch x64
; pushfq // Store the context
; push rax ; mov QWORD [rsp-0x88], rax
; push rcx ; lahf
; push rdx ; mov QWORD [rsp-0x90], rax
; lea rax, [>map_addr] ; mov QWORD [rsp-0x98], rbx
; mov rax, QWORD [rax]
; lea rcx, [>previous_loc] // Load the previous_pc
; mov rdx, QWORD [rcx] ; mov rax, QWORD prev_loc_ptr as *mut u64 as _
; mov rdx, QWORD [rdx] ; mov rax, QWORD [rax]
; xor rdx, rdi
; inc BYTE [rax + rdx] // Calculate the edge id
; shr rdi, 1 ; mov ebx, WORD h64 as i32
; mov rax, QWORD [rcx] ; xor rax, rbx
; mov QWORD [rax], rdi
; pop rdx // Load the map byte address
; pop rcx ; mov rbx, QWORD map_addr_ptr as *mut [u8; MAP_SIZE] as _
; pop rax ; add rax, rbx
; popfq
; ret // Update the map byte
;map_addr: ; mov bl, BYTE [rax]
;.qword addr_of_mut!(self.0.borrow_mut().map) as i64 ; add bl,0x1
;previous_loc: ; adc bl,0x0
;.qword 0 ; mov BYTE [rax],bl
// Update the previous_pc value
; mov rax, QWORD prev_loc_ptr as *mut u64 as _
; mov ebx, WORD h64 as i32
; mov QWORD [rax], rbx
// Restore the context
; mov rbx, QWORD [rsp-0x98]
; mov rax, QWORD [rsp-0x90]
; sahf
; mov rax, QWORD [rsp-0x88]
); );
let ops_vec = ops.finalize().unwrap(); let ops_vec = ops.finalize().unwrap();
self.0.borrow_mut().blob_maybe_log =
Some(ops_vec[..ops_vec.len() - 8].to_vec().into_boxed_slice()); writer.put_bytes(&ops_vec[..ops_vec.len()].to_vec().into_boxed_slice());
} }
/// Emits coverage mapping into the current basic block. /// Emits coverage mapping into the current basic block.
#[inline] #[inline]
pub fn emit_coverage_mapping(&mut self, address: u64, output: &StalkerOutput) { pub fn emit_coverage_mapping(&mut self, address: u64, output: &StalkerOutput) {
let h64 = xxh3_rrmxmx_mixer(address); let h64 = xxh3_rrmxmx_mixer(address);
let writer = output.writer(); let writer = output.writer();
#[allow(clippy::cast_possible_wrap)] // gum redzone size is u32, we need an offset as i32.
let redzone_size = i64::from(frida_gum_sys::GUM_RED_ZONE_SIZE);
if self.0.borrow().current_log_impl == 0
|| !writer.can_branch_directly_to(self.0.borrow().current_log_impl)
|| !writer
.can_branch_directly_between(writer.pc() + 128, self.0.borrow().current_log_impl)
{
let after_log_impl = writer.code_offset() + 1;
#[cfg(target_arch = "x86_64")]
writer.put_jmp_near_label(after_log_impl);
#[cfg(target_arch = "aarch64")]
writer.put_b_label(after_log_impl);
self.0.borrow_mut().current_log_impl = writer.pc();
writer.put_bytes(&self.blob_maybe_log());
let prev_loc_pointer = addr_of_mut!(self.0.borrow_mut().previous_pc) as u64; // Get the pointer to self.previous_pc
writer.put_bytes(&prev_loc_pointer.to_ne_bytes());
writer.put_label(after_log_impl);
}
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
{ {
writer.put_lea_reg_reg_offset(X86Register::Rsp, X86Register::Rsp, -(redzone_size)); self.generate_inline_code(&writer, h64 & (MAP_SIZE as u64 - 1));
writer.put_push_reg(X86Register::Rdi);
writer.put_mov_reg_address(X86Register::Rdi, h64 & (MAP_SIZE as u64 - 1));
writer.put_call_address(self.0.borrow().current_log_impl);
writer.put_pop_reg(X86Register::Rdi);
writer.put_lea_reg_reg_offset(X86Register::Rsp, X86Register::Rsp, redzone_size);
} }
#[cfg(target_arch = "aarch64")] #[cfg(target_arch = "aarch64")]
{ {
#[allow(clippy::cast_possible_wrap)]
// gum redzone size is u32, we need an offset as i32.
let redzone_size = i64::from(frida_gum_sys::GUM_RED_ZONE_SIZE);
if self.0.borrow().current_log_impl == 0
|| !writer.can_branch_directly_to(self.0.borrow().current_log_impl)
|| !writer.can_branch_directly_between(
writer.pc() + 128,
self.0.borrow().current_log_impl,
)
{
let after_log_impl = writer.code_offset() + 1;
#[cfg(target_arch = "x86_64")]
writer.put_jmp_near_label(after_log_impl);
#[cfg(target_arch = "aarch64")]
writer.put_b_label(after_log_impl);
self.0.borrow_mut().current_log_impl = writer.pc();
writer.put_bytes(&self.blob_maybe_log());
let prev_loc_pointer = addr_of_mut!(self.0.borrow_mut().previous_pc) as u64; // Get the pointer to self.previous_pc
writer.put_bytes(&prev_loc_pointer.to_ne_bytes());
writer.put_label(after_log_impl);
}
writer.put_stp_reg_reg_reg_offset( writer.put_stp_reg_reg_reg_offset(
Aarch64Register::Lr, Aarch64Register::Lr,
Aarch64Register::X0, Aarch64Register::X0,