From a02329f7de5b688073c48f4a31ce2ce5a5f8cc50 Mon Sep 17 00:00:00 2001
From: Jean-Marie 'Histausse' Mineau <histausse@protonmail.com>
Date: Fri, 21 Feb 2025 19:47:54 +0100
Subject: [PATCH] resolve register types

---
 androscalpel/src/code_analysis/method_cfg.rs  |  38 +-
 androscalpel/src/code_analysis/mod.rs         |   3 +-
 .../src/code_analysis/register_type.rs        | 516 ++++++++++++++++++
 3 files changed, 541 insertions(+), 16 deletions(-)
 create mode 100644 androscalpel/src/code_analysis/register_type.rs
diff --git a/androscalpel/src/code_analysis/method_cfg.rs b/androscalpel/src/code_analysis/method_cfg.rs
index 7ba060e..0794316 100644
--- a/androscalpel/src/code_analysis/method_cfg.rs
+++ b/androscalpel/src/code_analysis/method_cfg.rs
@@ -1,27 +1,29 @@
 //! The Control Flow Graph for a method.
 
-use crate::{IdMethod, Instruction, Method, Result};
+use crate::{Instruction, Method, Result};
 use anyhow::Context;
 use std::collections::HashMap;
 
 const EMPTY_INSNS_SLICE: &[Instruction] = &[];
 
 /// A basic block of code of a method.
-struct MethodCFGNode<'a> {
+#[derive(Debug, PartialEq)]
+pub struct MethodCFGNode<'a> {
     /// Code represented by the block
-    code_block: &'a [Instruction],
+    pub code_block: &'a [Instruction],
     /// Labels at the begining of the node if they exists
-    labels: Vec<String>,
+    pub labels: Vec<String>,
     /// Indices in CodeGraph.nodes of the next nodes
-    next_nodes: Vec<usize>,
+    pub next_nodes: Vec<usize>,
     /// Indices in CodeGraph.nodes of the previous nodes
-    prev_nodes: Vec<usize>,
+    pub prev_nodes: Vec<usize>,
 }
 
 /// The CFG for a method, with potentially additionnal informations.
+#[derive(Debug, PartialEq)]
 pub struct MethodCFG<'a> {
-    method: &'a IdMethod,
-    nodes: Vec<MethodCFGNode<'a>>,
+    pub method: &'a Method,
+    pub nodes: Vec<MethodCFGNode<'a>>,
 }
 
 impl<'a> MethodCFG<'a> {
@@ -214,27 +216,33 @@ impl<'a> MethodCFG<'a> {
                 nodes[*j].prev_nodes.push(i);
             }
         }
-        Ok(Self {
-            method: &method.descriptor,
-            nodes,
-        })
+        Ok(Self { method, nodes })
     }
 
     /// Serialize the graph to dot format.
-    pub fn to_dot(&self) -> String {
+    pub fn to_dot(&self, add_reg_ty: bool) -> String {
         let mut dot_string: String = "digraph {\n".into();
         dot_string += "overlap=false;\n";
         dot_string += &self.to_dot_subgraph();
+        dot_string += "\n";
+        if add_reg_ty {
+            dot_string += &self.reg_types_dot();
+        }
         dot_string += "}";
         dot_string
     }
 
+    // method call: cluster_{mth}:node_{i}:i{j}:e -> cluster_{mth2}:n ?
+
     /// Serialize the graph to dot format.
     pub fn to_dot_subgraph(&self) -> String {
-        let mut dot_string = format!("subgraph \"cluster_{}\" {{\n", self.method.__str__());
+        let mut dot_string = format!(
+            "subgraph \"cluster_{}\" {{\n",
+            self.method.descriptor.__str__()
+        );
         dot_string += "    style=\"dashed\";\n";
         dot_string += "    color=\"black\";\n";
-        dot_string += &format!("    label=\"{}\";\n", self.method.__str__());
+        dot_string += &format!("    label=\"{}\";\n", self.method.descriptor.__str__());
         for (i, node) in self.nodes.iter().enumerate() {
             let block_name = if i == 0 {
                 "ENTRY".into()
diff --git a/androscalpel/src/code_analysis/mod.rs b/androscalpel/src/code_analysis/mod.rs
index c93e214..86c6b1c 100644
--- a/androscalpel/src/code_analysis/mod.rs
+++ b/androscalpel/src/code_analysis/mod.rs
@@ -3,5 +3,6 @@
 //! This is module is quite experimental but can be usefull.
 
 pub mod method_cfg;
-
+pub mod register_type;
 pub use method_cfg::*;
+pub use register_type::RegType;
diff --git a/androscalpel/src/code_analysis/register_type.rs b/androscalpel/src/code_analysis/register_type.rs
new file mode 100644
index 0000000..e7a1898
--- /dev/null
+++ b/androscalpel/src/code_analysis/register_type.rs
@@ -0,0 +1,516 @@
+//! Compute the register types at each label of a method.
+
+use super::{MethodCFG, MethodCFGNode};
+use crate::Instruction;
+use std::collections::HashMap;
+
+/// The different possible types of a register.
+#[derive(Debug, PartialEq, Clone, Copy)]
+pub enum RegType {
+    /// The content of the register is not defined yet.
+    Undefined,
+    /// The register contains a java object (classical object, array, type, etc).
+    Object,
+    /// The register contains a 32 bit scalar.
+    SimpleScalar,
+    /// The register contains the first 32 bits of a wide value. If not followed
+    /// by a `SecondWideScalar`, it should probably be seen as a `SimpleScalar`?
+    FirstWideScalar,
+    /// The register contains the last 32 bits of a wide value. If not preceded
+    /// by a `FirstWideScalar`, it should probably be seen as a `SimpleScalar`?
+    SecondWideScalar,
+    /// The register can be either a scalar or an object.
+    Any,
+}
+
+impl RegType {
+    /// Short lowercase name of the type (`undef`, `object`, `scalar`, ...).
+    pub fn to_str(&self) -> &'static str {
+        match self {
+            RegType::Undefined => "undef",
+            RegType::Object => "object",
+            RegType::SimpleScalar => "scalar",
+            RegType::FirstWideScalar => "wide1",
+            RegType::SecondWideScalar => "wide2",
+            RegType::Any => "any",
+        }
+    }
+}
+
+impl MethodCFG<'_> {
+    /// Compute the type of every register at each label of the method.
+    ///
+    /// The result maps each label to the register types when entering the block
+    /// that carries it, obtained by iterating over the CFG until the types stop
+    /// changing. It is empty when the method has no code or the CFG has no node.
+    pub fn get_reg_types(&self) -> HashMap<String, Vec<RegType>> {
+        let code = match self.method.code.as_ref() {
+            Some(code) if !self.nodes.is_empty() => code,
+            _ => return HashMap::new(),
+        };
+        let nb_reg = code.registers_size as usize;
+        // Types of the declared parameters, one entry per register they use.
+        let mut arg_tys = vec![];
+        for arg in &self.method.descriptor.proto.get_parameters() {
+            if arg.is_class() || arg.is_array() {
+                arg_tys.push(RegType::Object);
+            } else if arg.is_long() || arg.is_double() {
+                arg_tys.extend([RegType::FirstWideScalar, RegType::SecondWideScalar]);
+            } else {
+                arg_tys.push(RegType::SimpleScalar);
+            }
+        }
+        // The declared parameters sit in the last registers. When `ins_size`
+        // counts one more word, the register just before them holds `this`.
+        let first_arg = nb_reg.saturating_sub(arg_tys.len());
+        let mut entry_reg_tys = vec![RegType::Undefined; nb_reg];
+        for (ty, arg_ty) in entry_reg_tys[first_arg..].iter_mut().zip(&arg_tys) {
+            *ty = *arg_ty;
+        }
+        if code.ins_size as usize > arg_tys.len() && first_arg > 0 {
+            entry_reg_tys[first_arg - 1] = RegType::Object;
+        }
+        // The entry block is always fed the signature types: merging its
+        // (absent) predecessors would erase them after the first iteration.
+        let block_input = |i: usize, end_tys: &[Vec<RegType>]| match i {
+            0 => entry_reg_tys.clone(),
+            _ => merge_input(&self.nodes[i], nb_reg, end_tys),
+        };
+
+        let mut end_block_reg_tys = vec![vec![RegType::Undefined; nb_reg]; self.nodes.len()];
+        let mut changed = true;
+        while changed {
+            let new_end_block_reg_tys: Vec<_> = (0..self.nodes.len())
+                .map(|i| transform_reg_ty(&block_input(i, &end_block_reg_tys), &self.nodes[i]))
+                .collect();
+            changed = end_block_reg_tys != new_end_block_reg_tys;
+            end_block_reg_tys = new_end_block_reg_tys;
+        }
+        let mut reg_types = HashMap::new();
+        for (i, node) in self.nodes.iter().enumerate() {
+            let start_reg_tys = block_input(i, &end_block_reg_tys);
+            for label in &node.labels {
+                reg_types.insert(label.clone(), start_reg_tys.clone());
+            }
+        }
+        reg_types
+    }
+
+    /// Serialize the register types to a dot subgraph: one record node per label,
+    /// with one field per register.
+    pub(crate) fn reg_types_dot(&self) -> String {
+        let types = self.get_reg_types();
+        let mut dot_string = format!(
+            "subgraph \"cluster_reg_types_{}\" {{\n",
+            self.method.descriptor.__str__()
+        );
+        dot_string += "    style=\"dashed\";\n";
+        dot_string += "    color=\"black\";\n";
+        dot_string += &format!(
+            "    label=\"Register Types{}\";\n",
+            self.method.descriptor.__str__()
+        );
+
+        for (label, regs) in &types {
+            let mut node_label = String::new();
+            for reg in regs {
+                node_label += "|";
+                node_label += reg.to_str();
+            }
+            node_label += "|";
+            dot_string += &format!("    node_{label} [shape=record,style=filled,fillcolor=lightgrey,label=\"{node_label}\"];\n");
+        }
+
+        dot_string += "}\n";
+        dot_string
+    }
+}
+
+/// Merge the register types found at the end of every predecessor of `node`.
+///
+/// `inputs[i]` holds the types at the end of the node of index `i`, and
+/// `nb_reg` is the number of registers to track. The merge rules are:
+/// - `Undefined` never overrides the other type;
+/// - two identical types are kept as is;
+/// - scalars of different kinds degrade to `SimpleScalar`;
+/// - an object meeting a scalar, or anything meeting `Any`, gives `Any`.
+/// A node without predecessor gets `Undefined` for every register.
+fn merge_input(node: &MethodCFGNode, nb_reg: usize, inputs: &[Vec<RegType>]) -> Vec<RegType> {
+    use RegType::*;
+    let mut merged = vec![Undefined; nb_reg];
+    for &pred in &node.prev_nodes {
+        for (reg, &incoming) in inputs[pred].iter().enumerate() {
+            merged[reg] = match (merged[reg], incoming) {
+                // An undefined side never overrides the other one.
+                (Undefined, known) | (known, Undefined) => known,
+                // Once a register may hold anything, it stays that way.
+                (Any, _) | (_, Any) => Any,
+                (Object, Object) => Object,
+                // An object on one path and a scalar on another one.
+                (Object, _) | (_, Object) => Any,
+                (SimpleScalar, SimpleScalar) => SimpleScalar,
+                (FirstWideScalar, FirstWideScalar) => FirstWideScalar,
+                (SecondWideScalar, SecondWideScalar) => SecondWideScalar,
+                // Scalars that disagree on their width or on the half they hold.
+                (
+                    SimpleScalar | FirstWideScalar | SecondWideScalar,
+                    SimpleScalar | FirstWideScalar | SecondWideScalar,
+                ) => SimpleScalar,
+            };
+        }
+    }
+    merged
+}
+
+/// Compute the register types at the end of a basic block.
+///
+/// Starting from `input_types`, the types when entering the block, each matched
+/// instruction of `cfg.code_block` assigns the type implied by its opcode to the
+/// registers bound in its arm (destinations and, for most opcodes, operands).
+/// A wide value marks two consecutive registers. Other instructions leave the
+/// types untouched. Panics if a register index is out of `input_types` bounds.
+fn transform_reg_ty(input_types: &[RegType], cfg: &MethodCFGNode) -> Vec<RegType> {
+    use Instruction::*;
+    use RegType::*;
+    let mut types = input_types.to_vec();
+    for ins in cfg.code_block {
+        match ins {
+            Move { to, .. } => types[*to as usize] = SimpleScalar,
+            MoveWide { to, .. } => {
+                types[*to as usize] = FirstWideScalar;
+                types[*to as usize + 1] = SecondWideScalar;
+            }
+            MoveObject { to, .. } => types[*to as usize] = Object,
+            MoveResult { to: reg } | Return { reg } | Const { reg, .. } | Switch { reg, .. } => {
+                types[*reg as usize] = SimpleScalar
+            }
+            MoveResultWide { to: reg } | ReturnWide { reg } | ConstWide { reg, .. } => {
+                types[*reg as usize] = FirstWideScalar;
+                types[*reg as usize + 1] = SecondWideScalar;
+            }
+            MoveResultObject { to: reg }
+            | MoveException { to: reg }
+            | ReturnObject { reg }
+            | ConstString { reg, .. }
+            | ConstClass { reg, .. }
+            | NewInstance { reg, .. }
+            | Throw { reg }
+            | ConstMethodHandle { to: reg, .. }
+            | ConstMethodType { to: reg, .. } => types[*reg as usize] = Object,
+            InstanceOf { dest, obj: _, .. } => {
+                // types[*obj as usize] = Object; not sure about this one
+                types[*dest as usize] = SimpleScalar;
+            }
+            ArrayLength { dest, arr } => {
+                types[*arr as usize] = Object;
+                types[*dest as usize] = SimpleScalar;
+            }
+            NewArray { reg, size_reg, .. } => {
+                types[*reg as usize] = Object;
+                types[*size_reg as usize] = SimpleScalar;
+            }
+            FilledNewArray { type_, reg_values } => {
+                let reg_ty = if type_.is_class() || type_.is_array() {
+                    Object
+                // A wide (long or double) element type is not supposed to happen
+                // here, so it is not distinguished and defaults to a simple scalar,
+                // just in case.
+                } else {
+                    SimpleScalar
+                };
+                for i in reg_values {
+                    types[*i as usize] = reg_ty;
+                }
+            }
+            CmpLFloat { dest, b, c } | CmpGFloat { dest, b, c } => {
+                types[*dest as usize] = SimpleScalar;
+                types[*b as usize] = SimpleScalar;
+                types[*c as usize] = SimpleScalar;
+            }
+            CmpLDouble { dest, b, c } | CmpGDouble { dest, b, c } | CmpLong { dest, b, c } => {
+                types[*dest as usize] = SimpleScalar;
+                types[*b as usize] = FirstWideScalar;
+                types[*b as usize + 1] = SecondWideScalar;
+                types[*c as usize] = FirstWideScalar;
+                types[*c as usize + 1] = SecondWideScalar;
+            }
+            IfEq { a, b, .. }
+            | IfNe { a, b, .. }
+            | IfLt { a, b, .. }
+            | IfGe { a, b, .. }
+            | IfGt { a, b, .. }
+            | IfLe { a, b, .. } => {
+                types[*a as usize] = SimpleScalar;
+                types[*b as usize] = SimpleScalar;
+            }
+            IfEqZ { a, .. }
+            | IfNeZ { a, .. }
+            | IfLtZ { a, .. }
+            | IfGeZ { a, .. }
+            | IfGtZ { a, .. }
+            | IfLeZ { a, .. } => {
+                types[*a as usize] = SimpleScalar;
+            }
+            AGet {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | AGetBoolean {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | AGetByte {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | AGetChar {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | AGetShort {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | APut {
+                from: r,
+                arr: obj,
+                idx,
+            }
+            | APutBoolean {
+                from: r,
+                arr: obj,
+                idx,
+            }
+            | APutByte {
+                from: r,
+                arr: obj,
+                idx,
+            }
+            | APutChar {
+                from: r,
+                arr: obj,
+                idx,
+            }
+            | APutShort {
+                from: r,
+                arr: obj,
+                idx,
+            } => {
+                types[*r as usize] = SimpleScalar;
+                types[*obj as usize] = Object;
+                types[*idx as usize] = SimpleScalar;
+            }
+            AGetWide {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | APutWide {
+                from: r,
+                arr: obj,
+                idx,
+            } => {
+                types[*r as usize] = FirstWideScalar;
+                types[*r as usize + 1] = SecondWideScalar;
+                types[*obj as usize] = Object;
+                types[*idx as usize] = SimpleScalar;
+            }
+            AGetObject {
+                dest: r,
+                arr: obj,
+                idx,
+            }
+            | APutObject {
+                from: r,
+                arr: obj,
+                idx,
+            } => {
+                types[*r as usize] = Object;
+                types[*obj as usize] = Object;
+                types[*idx as usize] = SimpleScalar;
+            }
+            IGet { to: r, obj, .. }
+            | IGetBoolean { to: r, obj, .. }
+            | IGetByte { to: r, obj, .. }
+            | IGetChar { to: r, obj, .. }
+            | IGetShort { to: r, obj, .. }
+            | IPut { from: r, obj, .. }
+            | IPutBoolean { from: r, obj, .. }
+            | IPutByte { from: r, obj, .. }
+            | IPutChar { from: r, obj, .. }
+            | IPutShort { from: r, obj, .. } => {
+                types[*r as usize] = SimpleScalar;
+                types[*obj as usize] = Object;
+            }
+            IGetWide { to: r, obj, .. } | IPutWide { from: r, obj, .. } => {
+                types[*r as usize] = FirstWideScalar;
+                types[*r as usize + 1] = SecondWideScalar;
+                types[*obj as usize] = Object;
+            }
+            IGetObject { to: r, obj, .. } | IPutObject { from: r, obj, .. } => {
+                types[*r as usize] = Object;
+                types[*obj as usize] = Object;
+            }
+            SGet { to: r, .. }
+            | SGetBoolean { to: r, .. }
+            | SGetByte { to: r, .. }
+            | SGetChar { to: r, .. }
+            | SGetShort { to: r, .. }
+            | SPut { from: r, .. }
+            | SPutBoolean { from: r, .. }
+            | SPutByte { from: r, .. }
+            | SPutChar { from: r, .. }
+            | SPutShort { from: r, .. } => {
+                types[*r as usize] = SimpleScalar;
+            }
+            SGetWide { to: r, .. } | SPutWide { from: r, .. } => {
+                types[*r as usize] = FirstWideScalar;
+                types[*r as usize + 1] = SecondWideScalar;
+            }
+            SGetObject { to: r, .. } | SPutObject { from: r, .. } => {
+                types[*r as usize] = Object;
+            }
+            // The invoke-* instructions are deliberately not handled: the callee
+            // prototype gives the type of the arguments, but it is not necessary, the
+            // registers should already have their type from when they were initialized.
+            NegInt { dest, val }
+            | NotInt { dest, val }
+            | NegFloat { dest, val }
+            | NegDouble { dest, val }
+            | IntToFloat { dest, val }
+            | FloatToInt { dest, val }
+            | IntToByte { dest, val }
+            | IntToChar { dest, val }
+            | IntToShort { dest, val } => {
+                types[*dest as usize] = SimpleScalar;
+                types[*val as usize] = SimpleScalar;
+            }
+            NegLong { dest, val }
+            | NotLong { dest, val }
+            | LongToDouble { dest, val }
+            | DoubleToLong { dest, val } => {
+                types[*dest as usize] = FirstWideScalar;
+                types[*dest as usize + 1] = SecondWideScalar;
+                types[*val as usize] = FirstWideScalar;
+                types[*val as usize + 1] = SecondWideScalar;
+            }
+            IntToLong { dest, val }
+            | IntToDouble { dest, val }
+            | FloatToLong { dest, val }
+            | FloatToDouble { dest, val } => {
+                types[*dest as usize] = FirstWideScalar;
+                types[*dest as usize + 1] = SecondWideScalar;
+                types[*val as usize] = SimpleScalar;
+            }
+            LongToInt { dest, val }
+            | LongToFloat { dest, val }
+            | DoubleToInt { dest, val }
+            | DoubleToFloat { dest, val } => {
+                types[*dest as usize] = SimpleScalar;
+                types[*val as usize] = FirstWideScalar;
+                types[*val as usize + 1] = SecondWideScalar;
+            }
+            AddInt { dest, b, c }
+            | SubInt { dest, b, c }
+            | MulInt { dest, b, c }
+            | DivInt { dest, b, c }
+            | RemInt { dest, b, c }
+            | AndInt { dest, b, c }
+            | OrInt { dest, b, c }
+            | XorInt { dest, b, c }
+            | ShlInt { dest, b, c }
+            | ShrInt { dest, b, c }
+            | UshrInt { dest, b, c }
+            | AddFloat { dest, b, c }
+            | SubFloat { dest, b, c }
+            | MulFloat { dest, b, c }
+            | DivFloat { dest, b, c }
+            | RemFloat { dest, b, c } => {
+                types[*dest as usize] = SimpleScalar;
+                types[*b as usize] = SimpleScalar;
+                types[*c as usize] = SimpleScalar;
+            }
+            AddLong { dest, b, c }
+            | SubLong { dest, b, c }
+            | MulLong { dest, b, c }
+            | DivLong { dest, b, c }
+            | RemLong { dest, b, c }
+            | AndLong { dest, b, c }
+            | OrLong { dest, b, c }
+            | XorLong { dest, b, c }
+            | ShlLong { dest, b, c }
+            | ShrLong { dest, b, c }
+            | UshrLong { dest, b, c }
+            | AddDouble { dest, b, c }
+            | SubDouble { dest, b, c }
+            | MulDouble { dest, b, c }
+            | DivDouble { dest, b, c }
+            | RemDouble { dest, b, c } => {
+                types[*dest as usize] = FirstWideScalar;
+                types[*dest as usize + 1] = SecondWideScalar;
+                types[*b as usize] = FirstWideScalar;
+                types[*b as usize + 1] = SecondWideScalar;
+                types[*c as usize] = FirstWideScalar;
+                types[*c as usize + 1] = SecondWideScalar;
+            }
+            AddInt2Addr { dest, b }
+            | SubInt2Addr { dest, b }
+            | MulInt2Addr { dest, b }
+            | DivInt2Addr { dest, b }
+            | RemInt2Addr { dest, b }
+            | AndInt2Addr { dest, b }
+            | OrInt2Addr { dest, b }
+            | XorInt2Addr { dest, b }
+            | ShlInt2Addr { dest, b }
+            | ShrInt2Addr { dest, b }
+            | UshrInt2Addr { dest, b }
+            | AddFloat2Addr { dest, b }
+            | SubFloat2Addr { dest, b }
+            | MulFloat2Addr { dest, b }
+            | DivFloat2Addr { dest, b }
+            | RemFloat2Addr { dest, b }
+            | AddIntLit { dest, b, .. }
+            | RsubIntLit { dest, b, .. }
+            | MulIntLit { dest, b, .. }
+            | DivIntLit { dest, b, .. }
+            | RemIntLit { dest, b, .. }
+            | AndIntLit { dest, b, .. }
+            | OrIntLit { dest, b, .. }
+            | XorIntLit { dest, b, .. }
+            | ShlIntLit { dest, b, .. }
+            | ShrIntLit { dest, b, .. }
+            | UshrIntLit { dest, b, .. } => {
+                types[*dest as usize] = SimpleScalar;
+                types[*b as usize] = SimpleScalar;
+            }
+            AddLong2Addr { dest, b }
+            | SubLong2Addr { dest, b }
+            | MulLong2Addr { dest, b }
+            | DivLong2Addr { dest, b }
+            | RemLong2Addr { dest, b }
+            | AndLong2Addr { dest, b }
+            | OrLong2Addr { dest, b }
+            | XorLong2Addr { dest, b }
+            | ShlLong2Addr { dest, b }
+            | ShrLong2Addr { dest, b }
+            | UshrLong2Addr { dest, b }
+            | AddDouble2Addr { dest, b }
+            | SubDouble2Addr { dest, b }
+            | MulDouble2Addr { dest, b }
+            | DivDouble2Addr { dest, b }
+            | RemDouble2Addr { dest, b } => {
+                types[*dest as usize] = FirstWideScalar;
+                types[*dest as usize + 1] = SecondWideScalar;
+                types[*b as usize] = FirstWideScalar;
+                types[*b as usize + 1] = SecondWideScalar;
+            }
+            _ => (),
+        }
+    }
+    types
+}