From 0b572e1885af1314e8c46aa8e25661933394bc92 Mon Sep 17 00:00:00 2001 From: Jean-Marie Mineau Date: Tue, 19 Nov 2024 21:09:08 +0100 Subject: [PATCH] fuzzy class comp --- android_class_shadowing_scanner/__init__.py | 25 +++- android_class_shadowing_scanner/cmp_smali.py | 53 +++++++++ .../data_mining.py | 108 ++++++++++++++++++ run_exp_6.sh | 68 +++++++++++ 4 files changed, 252 insertions(+), 2 deletions(-) create mode 100644 android_class_shadowing_scanner/cmp_smali.py create mode 100644 run_exp_6.sh diff --git a/android_class_shadowing_scanner/__init__.py b/android_class_shadowing_scanner/__init__.py index 48f34e5..fafecf5 100644 --- a/android_class_shadowing_scanner/__init__.py +++ b/android_class_shadowing_scanner/__init__.py @@ -17,6 +17,7 @@ from androguard.core.apk import APK # type: ignore from .androzoo import download_apk from .data import ApkData, load_from_directory +from .cmp_smali import cmp_smali from .analysis import analyze @@ -618,7 +619,12 @@ def check_smali_platform(): for smalli_dir in smalli_dirs: if (smalli_dir / cl_f).exists(): with (smalli_dir / cl_f).open("r") as file: - if file.read() != plt_smali: + if not cmp_smali( + file.read(), + plt_smali, + sha256, + f"{a_sdk_dir / 'platform'}", + ): plat_diff_smalli.add(cl) break @@ -641,13 +647,28 @@ def check_smali_platform(): for smalli_dir in smalli_dirs: if (smalli_dir / cl_f).exists(): with (smalli_dir / cl_f).open("r") as file: - if file.read() != sdk_smali: + if not cmp_smali( + file.read(), sdk_smali, sha256, f"{a_sdk_dir / 'sdk'}" + ): sdk_diff_smalli.add(cl) break data[f"sdk_{sdk_v}_diff_smalli"] = list(sdk_diff_smalli) data[f"platform_{sdk_v}_diff_smalli"] = list(plat_diff_smalli) + for cl in data["duplicated_classes"]: + cl_f = cl.removesuffix(";").removeprefix("L") + ".smali" + smali = None + for cdir in smalli_dirs: + if (cdir / cl_f).exists(): + with (cdir / cl_f).open() as file: + smali_new = file.read() + if smali is None: + smali = smali_new + elif not cmp_smali(smali, smali_new, 
"""Fuzzy comparison of smali class files.

Two smali files are considered equivalent when they define the same methods
and, for every method, the same instruction lines once blank lines,
surrounding whitespace and `.line` debug directives are ignored. Anything
outside of a `.method` / `.end method` pair (fields, annotations, `.super`,
`.source`, ...) is not compared.
"""


def cmp_smali(sm1: str, sm2: str, sha256_1: str = "", sha256_2: str = "") -> bool:
    """Return True when the two smali sources define equivalent methods.

    Args:
        sm1: Text of the first smali file.
        sm2: Text of the second smali file.
        sha256_1: Label of the origin of `sm1`, only used in parsing
            diagnostics.
        sha256_2: Label of the origin of `sm2`, only used in parsing
            diagnostics.

    Returns:
        False when the sets of method keys differ, or when a method has a
        body in one file that has no identical body in the other file.
        True otherwise.
    """
    meths_1 = get_methods(sm1, sha256_1)
    meths_2 = get_methods(sm2, sha256_2)
    # Dict key views already compare like sets.
    if meths_1.keys() != meths_2.keys():
        return False
    # "Every body of one side has an equal body on the other side, and
    # conversely" is exactly the equality of the sets of distinct bodies.
    return all(
        _distinct_bodies(bodies) == _distinct_bodies(meths_2[meth])
        for meth, bodies in meths_1.items()
    )


def _distinct_bodies(bodies: list[list[str]]) -> set[tuple[str, ...]]:
    """Return the distinct method bodies as a set of hashable tuples."""
    return {tuple(body) for body in bodies}


def get_methods(sm: str, sha256: str = "") -> dict[str, list[list[str]]]:
    """Extract the method bodies of a smali file.

    Args:
        sm: Text of the smali file.
        sha256: Label of the origin of `sm`, only used in parsing
            diagnostics.

    Returns:
        A dict mapping the last space-separated token of each `.method` line
        to the list of bodies found under that key. A body is the list of
        stripped, non-empty lines between `.method` and `.end method`,
        without the `.line` directives. A method that is never closed by
        `.end method` is not reported.
    """
    class_name = "UNINITIALIZED"
    current_meth: None | str = None
    current_body: list[str] = []
    rest: dict[str, list[list[str]]] = {}
    for line in sm.split("\n"):
        striped = line.strip()
        if striped.startswith(".class "):
            class_name = striped.split(" ")[-1]
        if striped == ".end method":
            if current_meth is None:
                print(f"ERROR PARSING SMALI of {class_name} {sha256}")
            else:
                rest.setdefault(current_meth, []).append(current_body)
            current_body = []
            current_meth = None
        # `.line` directives only carry debug information (source line
        # numbers), they are ignored to make the comparison fuzzy.
        if current_meth is not None and striped and not striped.startswith(".line "):
            current_body.append(striped)
        # Checked last so that the `.method` line itself is not part of the body.
        if striped.startswith(".method "):
            if current_meth is not None:
                # Previous method was never closed: its body is discarded.
                print(f"ERROR PARSING SMALI of {class_name} {sha256}")
            current_meth = striped.split(" ")[-1]
            current_body = []
    return rest
f8c5f6c..f8b9459 100644 --- a/android_class_shadowing_scanner/data_mining.py +++ b/android_class_shadowing_scanner/data_mining.py @@ -6,9 +6,16 @@ from pathlib import Path from .platform_classes import MIN_MAX_SDK from matplotlib import pyplot as plt +import matplotlib def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: Path): + + occ_sdk34 = {} + occ_hid34 = {} + occ_self_redef = {} + occ_self = {} + nb_sdk_cl_redef = 0 nb_sdk_cl_id = 0 nb_app_sdk_cl_redef = 0 @@ -23,6 +30,9 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: l_nb_sdk_cl_id = 0 for cl in data["sdk_34_classes"]: nb_sdk_cl_redef += 1 + if cl not in occ_sdk34: + occ_sdk34[cl] = 0 + occ_sdk34[cl] += 1 if any( [ cl not in data[lst] @@ -45,6 +55,9 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: l_nb_hid_cl_id = 0 for cl in data["platform_non_sdk_34_classes"]: + if cl not in occ_hid34: + occ_hid34[cl] = 0 + occ_hid34[cl] += 1 nb_hid_cl_redef += 1 if any( [ @@ -72,6 +85,14 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: with detail_class_redef.open("r") as fd: data = json.load(fd) for v in data.values(): + for cl in v["redef_classes"]: + if cl not in occ_self_redef: + occ_self_redef[cl] = 0 + occ_self_redef[cl] += 1 + for cl in v["duplicated_classes"]: + if cl not in occ_self: + occ_self[cl] = 0 + occ_self[cl] += 1 if v["duplicated_classes"]: nb_app_self_shadow += 1 if v["duplicated_classes"] and not v["redef_classes"]: @@ -233,6 +254,88 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: for row in data_only: writer.writerow(row) + # occ_sdk34 = {} + # occ_hid34 = {} + # occ_self_redef = {} + # occ_self = {} + + print() + print( + "redefined class SDK occurences" + ) + print() + for cl in sorted(occ_sdk34.keys(), key=lambda x: occ_sdk34[x], reverse=True)[:10]: + print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print() + + print() + print( + 
"redefined class Hidden occurences" + ) + print() + for cl in sorted(occ_hid34.keys(), key=lambda x: occ_hid34[x], reverse=True)[:10]: + print(f"{cl:<70} {occ_hid34[cl]: >5}") + print() + + print() + print( + "collision class Self occurences" + ) + print() + for cl in sorted(occ_self.keys(), key=lambda x: occ_self[x], reverse=True)[:10]: + print(f"{cl:<70} {occ_self[cl]: >5}") + print() + + print() + print( + "redefined class Self occurences" + ) + print() + for cl in sorted( + occ_self_redef.keys(), key=lambda x: occ_self_redef[x], reverse=True + )[:10]: + print(f"{cl:<70} {occ_self_redef[cl]: >5}") + print() + + print() + print( + "redefined class SDK <= 7 occurences" + ) + print() + for cl in sorted( + filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()), + key=lambda x: occ_sdk34[x], + reverse=True, + )[:10]: + print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print() + + print() + print( + "redefined class SDK = 8 occurences" + ) + print() + for cl in sorted( + filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()), + key=lambda x: occ_sdk34[x], + reverse=True, + )[:10]: + print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print() + + print() + print( + "redefined class SDK = 16 occurences" + ) + print() + for cl in sorted( + filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()), + key=lambda x: occ_sdk34[x], + reverse=True, + )[:10]: + print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print() + def analyse_sdk_redef(folder: Path, db: Path, out: Path): with sqlite3.connect(db) as con: @@ -289,6 +392,7 @@ def analyse_sdk_redef(folder: Path, db: Path, out: Path): for cl, n in classes_occ.items(): cls_by_sdk[MIN_MAX_SDK[cl][0]] += n + matplotlib.rcParams.update({"font.size": 22}) plt.figure(figsize=(20, 9), dpi=80) plt.bar( ["<=7" if i == 7 else str(i) for i in range(7, 35)], @@ -308,6 +412,8 @@ def analyse_sdk_redef(folder: Path, db: Path, out: Path): edgecolor="black", ) plt.legend(loc="upper left") + plt.ylabel("Nb Classes") + plt.xlabel("First SDK containing the 
#!/usr/bin/bash
#
# Experiment 6: run `check-platf-reder` on every application of the
# experiment 4 database that defines at least one platform class of SDK 32,
# 33 or 34, or that contains at least one duplicated class.
#
# Expected in the current directory: platforms.zip
# Expected next to this script: apktool.jar, .ZOO_KEY, data/app-2023-xp4.db
# and the venv providing check-platf-reder.
#
# The workers are started in the background and the script returns without
# waiting for them.

WD=$(pwd)
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
PLATFORM_DIR=$(mktemp -d)
DB="${SCRIPT_DIR}/data/app-2023-xp4.db"
LIST=$(mktemp)
CHUNK_FOLDER="./app-2023-exp6"
APKTOOL="${SCRIPT_DIR}/apktool.jar"
ANDROZOO_KEY="${SCRIPT_DIR}/.ZOO_KEY"
OUT_DIR="app-2023-xp6.out"
# Number of background workers, i.e. maximum number of chunks.
NB_WORKERS=20

# Names of the chunk files generated by `split -a 2 -d`: 00, 01, ...
app_lst=()
for ((i = 0; i < NB_WORKERS; i++)); do
    app_lst+=("$(printf '%02d' "${i}")")
done

mkdir -p "${OUT_DIR}"
unzip platforms.zip -d "${PLATFORM_DIR}"

# Disassemble every jar of the platforms. apktool writes its output in the
# current directory, hence the cd.
for ad in "${PLATFORM_DIR}"/**/{platform,sdk}; do
    # Skip patterns that did not match any directory.
    [ -d "${ad}" ] || continue
    cd "${ad}" || exit 1
    for jar in "${ad}"/*.jar; do
        [ -f "${jar}" ] || continue
        java -Xmx8G -jar "${APKTOOL}" d "${jar}"
    done
done

cd "${WD}" || exit 1

sqlite3 "${DB}" 'SELECT sha256 FROM data WHERE nb_def_platform_32_classes >= 1 OR nb_def_platform_33_classes >= 1 OR nb_def_platform_34_classes >= 1 OR nb_duplicate_classes>=1;' > "${LIST}"

# Number of lines per chunk, so that there are at most NB_WORKERS chunks.
CHUNK_SIZE=$(( $(wc -l < "${LIST}") / NB_WORKERS + 1 ))
# -f: the folder does not exist on the first run.
rm -rf "${CHUNK_FOLDER}"
mkdir "${CHUNK_FOLDER}"
# The trailing '/' matters: split appends the suffix directly to the prefix,
# without it the chunks are created next to the folder instead of inside it.
split -a 2 -d -l "${CHUNK_SIZE}" "${LIST}" "${CHUNK_FOLDER}/"

# Analyse every application listed (one sha256 per line) in the file ${1}.
worker () {
    while read -r sha; do
        [ -n "${sha}" ] || continue
        # stdin is redirected so that the command cannot consume the list.
        "${SCRIPT_DIR}"/venv/bin/check-platf-reder --api-key-file "${ANDROZOO_KEY}" --sha256 "${sha}" --path-platform-smali "${PLATFORM_DIR}" --apktool-jar "${APKTOOL}" --output-dir "${OUT_DIR}" < /dev/null
    done < "${1}"
    echo "Finished ${1}"
}

for lst in "${app_lst[@]}"; do
    # There are fewer chunks than workers when the list is short.
    [ -f "${CHUNK_FOLDER}/${lst}" ] || continue
    worker "${CHUNK_FOLDER}/${lst}" &
    sleep 1
done

echo 'PROCESS LAUNCHED'