From d991ac4dcdcb19c226a3b92a975ce788836dc879 Mon Sep 17 00:00:00 2001 From: Jean-Marie 'Histausse' Mineau Date: Wed, 21 Jan 2026 01:16:52 +0100 Subject: [PATCH] add map layout feature for dex file --- androscalpel/Cargo.toml | 4 +- androscalpel/examples/map_layout.rs | 40 +++++++++++++ androscalpel/src/apk.rs | 20 ++++++- androscalpel_serializer/Cargo.toml | 6 ++ .../src/file_reader/map_dex_file.rs | 57 +++++++++++++++++++ .../{file_reader.rs => file_reader/mod.rs} | 52 ++++++++++++++++- androscalpel_serializer/src/lib.rs | 13 +++++ androscalpel_serializer_derive/src/lib.rs | 2 +- 8 files changed, 188 insertions(+), 6 deletions(-) create mode 100644 androscalpel/examples/map_layout.rs create mode 100644 androscalpel_serializer/src/file_reader/map_dex_file.rs rename androscalpel_serializer/src/{file_reader.rs => file_reader/mod.rs} (91%) diff --git a/androscalpel/Cargo.toml b/androscalpel/Cargo.toml index 8cd0ba1..83c33a9 100644 --- a/androscalpel/Cargo.toml +++ b/androscalpel/Cargo.toml @@ -16,6 +16,7 @@ androscalpel_platform_api_list = { version = "0.1.0", path = "../androscalpel_pl anyhow = { version = "1.0.75", features = ["backtrace"] } apk_frauder = { version = "0.1.0", path = "../apk_frauder" } log = "0.4.20" +# TODO: remove python support pyo3 = { version = "0.23.4", features = ["anyhow", "abi3-py38", "extension-module"], optional = true} pyo3-log = { version = "0.12.1", optional = true} rayon = "1.9.0" @@ -32,11 +33,12 @@ env_logger = "0.11.6" [features] default = ["code-analysis", "platform-list"] -# TODO: need refactoring to https://github.com/PyO3/pyo3/issues/2935#issuecomment-2560930677 or cfg_eval https://github.com/rust-lang/rust/issues/82679 +# TODO: remove python support python = ["pyo3", "pyo3-log"] # Currently not supported external-zip-reader = ["zip"] platform-list = ["androscalpel_platform_api_list"] code-analysis = [] +map_dex_file = ["androscalpel_serializer/map_dex_file"] [[example]] name = "list-method" diff --git a/androscalpel/examples/map_layout.rs b/androscalpel/examples/map_layout.rs new file mode 100644 index 0000000..49291d7 --- /dev/null +++ b/androscalpel/examples/map_layout.rs @@ -0,0 +1,40 @@ +use std::path::PathBuf; + +use androscalpel::Apk; + +use clap::Parser; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None, arg_required_else_help = true)] +struct Cli { + #[arg(short, long)] + dex: PathBuf, +} + +fn main() { + if cfg!(not(feature = "map_dex_file")) { + panic!("This program must be compiled with the map_dex_file feature: `cargo run --example map_layout --features map_dex_file`"); + } + env_logger::init(); + let cli = Cli::parse(); + let mut apk = Apk::new(); + let data = std::fs::read(&cli.dex).unwrap(); + apk.add_dex_file( + &cli.dex.into_os_string().into_string().unwrap(), + &data, + |_, _, _| None, + false, + ) + .unwrap(); + //load_apk(File::open(&cli.apk).unwrap(), |_, _, _| None, false).unwrap(); + for (name, dex) in &apk.dex_files { + println!("{name}:"); + #[cfg(feature = "map_dex_file")] + for (off, size, dscr) in dex.layout_map.iter() { + let end = off + *size as u32; + let mut dscr = dscr.clone(); + dscr.truncate(100); + println!("0x{off:x} -> 0x{end:x}: {dscr}"); + } + } +} diff --git a/androscalpel/src/apk.rs b/androscalpel/src/apk.rs index d6302cf..3b3ef63 100644 --- a/androscalpel/src/apk.rs +++ b/androscalpel/src/apk.rs @@ -33,6 +33,11 @@ pub struct DexFile { /// The binary of the dexfile. #[serde(skip_serializing)] pub(crate) bin_cache: Option>, // TODO: invalidate the cache !!! + /// Map chunks of the file to as string describing the struct. + /// Usefull to examine malformed file but slow down the parsing + /// and consume a lot of memory. + #[cfg(feature = "map_dex_file")] + pub layout_map: Vec<(u32, usize, String)>, } impl DexFile { @@ -115,11 +120,15 @@ impl DexFile { classes: classes0, not_referenced_strings: self.not_referenced_strings, bin_cache: None, + #[cfg(feature = "map_dex_file")] + layout_map: vec![], }, Self { classes: classes1, not_referenced_strings: HashSet::new(), bin_cache: None, + #[cfg(feature = "map_dex_file")] + layout_map: vec![], }, ) } @@ -155,6 +164,8 @@ impl VisitableMut for DexFile { classes }, bin_cache: None, + #[cfg(feature = "map_dex_file")] + layout_map: vec![], }) } } @@ -2943,6 +2954,8 @@ impl Apk { classes, not_referenced_strings, bin_cache, + #[cfg(feature = "map_dex_file")] + layout_map: _, }, )| { @@ -3181,7 +3194,7 @@ impl Apk { /// - `name`: the name of the dex file /// - `data`: the dex file binary /// - `label_ins`: Function that take a method id, instruction and address and return - /// a label, if a label needs to be inserted before the instruction. + /// a label, if a label needs to be inserted before the instruction. /// - `cache`: if set to true, copy and cache the binary data format. /// /// # Infos @@ -3220,7 +3233,10 @@ impl Apk { .map(DexString) .collect(), bin_cache: if cache { Some(data.to_vec()) } else { None }, + #[cfg(feature = "map_dex_file")] + layout_map: dex.get_chunks(), }; + self.dex_files.insert(name, dex_file); Ok(()) } @@ -3280,7 +3296,7 @@ impl Apk { /// - `name`: the name of the dex file /// - `data`: the dex file binary /// - `label_each_ins`: if set to true, insert a label before each instruction - /// indicating the instruction address + /// indicating the instruction address /// - `cache`: if set to true, copy and cache the binary data format. #[cfg_attr(feature = "python", pyo3(name = "add_dex_file", signature = (name, data, label_each_ins=false, cache=false)))] pub fn add_dex_file_py( diff --git a/androscalpel_serializer/Cargo.toml b/androscalpel_serializer/Cargo.toml index b447167..dccc987 100644 --- a/androscalpel_serializer/Cargo.toml +++ b/androscalpel_serializer/Cargo.toml @@ -7,3 +7,9 @@ license = "AGPL-3.0-or-later" [dependencies] androscalpel_serializer_derive = { path = "../androscalpel_serializer_derive" } log = "0.4.20" + +[features] +# Map sections of the binary dex file to the parsed value. +# Aims to explore malformated / strange dex files, but slows +# the parsing and consumes a lot of memory. +map_dex_file = [] diff --git a/androscalpel_serializer/src/file_reader/map_dex_file.rs b/androscalpel_serializer/src/file_reader/map_dex_file.rs new file mode 100644 index 0000000..f936c51 --- /dev/null +++ b/androscalpel_serializer/src/file_reader/map_dex_file.rs @@ -0,0 +1,57 @@ +//! Most of the logic for the map_dex_file feature. + +use crate::DexFileReader; + +impl<'a> DexFileReader<'a> { + /// List the different structures of the dex file, in order of + /// offset, including holes. + pub fn get_chunks(&self) -> Vec<(u32, usize, String)> { + let dex_size = self.data.len(); + let mut structs: Vec<_> = self + .layout_map + .lock() + .expect("Failed to acquire mutex lock on layout_map") + .iter() + .map(|((off, size), desc)| (*off, *size, desc.clone())) + .collect(); + structs.sort(); + let mut chunks = vec![]; + let mut last_off = 0; + for (off, size, desc) in structs.into_iter() { + if off > last_off { + let size = (off - last_off) as usize; + // ignore padding + if (size < 4) + && self.data[last_off as usize..off as usize] + .iter() + .all(|&b| b == 0) + { + chunks.push(( + last_off, + size, + format!( + "Padding: {:x?}", + &self.data[last_off as usize..off as usize] + ), + )); + } else { + chunks.push((last_off, size, "Unreferenced Data".into())); + } + last_off = off; + } + // TODO: do something with overlapping struct? + if off + size as u32 > last_off { + last_off = off + size as u32; + } + chunks.push((off, size, desc)); + } + if (last_off as usize) < dex_size { + chunks.push(( + last_off, + dex_size - last_off as usize, + "Unreferenced Data".into(), + )); + } + chunks + } +} diff --git a/androscalpel_serializer/src/file_reader.rs b/androscalpel_serializer/src/file_reader/mod.rs similarity index 91% rename from androscalpel_serializer/src/file_reader.rs rename to androscalpel_serializer/src/file_reader/mod.rs index 6f77979..db0e55e 100644 --- a/androscalpel_serializer/src/file_reader.rs +++ b/androscalpel_serializer/src/file_reader/mod.rs @@ -9,6 +9,12 @@ use log::{error, info, warn}; use std::io::{Cursor, Seek, SeekFrom}; use std::sync::atomic::{AtomicBool, Ordering}; +#[cfg(feature = "map_dex_file")] +use std::{collections::HashMap, sync::Mutex}; + +#[cfg(feature = "map_dex_file")] +pub mod map_dex_file; + #[derive(Debug)] pub struct DexFileReader<'a> { // Ideally, this would be a Read+Seek, but Read+Seek is not thread safe, while we can @@ -32,6 +38,8 @@ pub struct DexFileReader<'a> { method_handles: Vec, hiddenapi_class_data: Option, map_list: MapList, + #[cfg(feature = "map_dex_file")] + layout_map: Mutex>, } impl<'a> DexFileReader<'a> { @@ -53,7 +61,18 @@ impl<'a> DexFileReader<'a> { method_handles: vec![], hiddenapi_class_data: None, map_list: MapList { list: vec![] }, + #[cfg(feature = "map_dex_file")] + layout_map: Mutex::new(HashMap::new()), }; + #[cfg(feature = "map_dex_file")] + tmp_file + .layout_map + .lock() + .expect("Failed to acquire mutex lock on layout_map") + .insert( + (0, tmp_file.header.size()), + format!("{:x?}", tmp_file.header), + ); if tmp_file.header.map_off != 0 { tmp_file.map_list = tmp_file.get_struct_at_offset(tmp_file.header.map_off)?; } @@ -381,7 +400,11 @@ impl<'a> DexFileReader<'a> { Ok(()) } - fn get_item_list(&self, offset: u32, size: u32) -> Result> { + fn get_item_list( + &self, + offset: u32, + size: u32, + ) -> Result> { if offset == 0 { return Ok(vec![]); } @@ -401,6 +424,17 @@ impl<'a> DexFileReader<'a> { pos )) })?); + #[cfg(feature = "map_dex_file")] + if let Some(ref item) = list.last() { + let size = item.size(); + // Assume two != structs cannot be at the same offset with the same time + // maybe add struct name to index? + self.layout_map + .lock() + .expect("Failed to acquire mutex lock on layout_map") + .entry((pos as u32, size)) + .or_insert_with(|| format!("{item:x?}")); + } } Ok(list) } @@ -410,7 +444,10 @@ impl<'a> DexFileReader<'a> { /// # Warning /// /// If the offset is invalid, UB. - pub fn get_struct_at_offset(&self, offset: u32) -> Result { + pub fn get_struct_at_offset( + &self, + offset: u32, + ) -> Result { let mut buffer = Cursor::new(self.data); buffer.seek(SeekFrom::Start(offset as u64)).unwrap(); let r = T::deserialize(&mut buffer).map_err(|err| { @@ -433,6 +470,17 @@ impl<'a> DexFileReader<'a> { buffer.position() ); } + #[cfg(feature = "map_dex_file")] + if let Ok(ref r) = r { + let size = r.size(); + // Assume two != structs cannot be at the same offset with the same time + // maybe add struct name to index? + self.layout_map + .lock() + .expect("Failed to acquire mutex lock on layout_map") + .entry((offset, size)) + .or_insert_with(|| format!("{r:x?}")); + } r } diff --git a/androscalpel_serializer/src/lib.rs b/androscalpel_serializer/src/lib.rs index 5100d14..783c80a 100644 --- a/androscalpel_serializer/src/lib.rs +++ b/androscalpel_serializer/src/lib.rs @@ -1,3 +1,16 @@ +//! This crate parse/write dalvik structures from binary to rust strucs. +//! Those strucs are close to the binary representation and follow the dalvik as +//! define by google: +//! +//! +//! ## Features +//! +//! ### map_dex_file +//! +//! Map sections of the binary dex file to the parsed value. +//! Aims to explore malformated / strange dex files, but slows +//! the parsing and consumes a lot of memory. + pub mod annotation; pub mod array; pub mod consts; diff --git a/androscalpel_serializer_derive/src/lib.rs b/androscalpel_serializer_derive/src/lib.rs index 4c48c05..8906687 100644 --- a/androscalpel_serializer_derive/src/lib.rs +++ b/androscalpel_serializer_derive/src/lib.rs @@ -55,7 +55,7 @@ use syn::{ /// An enum can define ONE variant as the default variant. This variant is a catch all, /// and MUST be have named field or unnamed field (no unit variant!) and: /// - The first field of named fields variant must be named `prefix` and of the type -/// `prefix_type`. +/// `prefix_type`. /// - The first field of unnamed fields variant must be of the type `prefix_type`. /// /// The first field of the default variant store the prefix of the variant.