From cea77c2fc30972538cd18ad583c0ad96d5a9bb4d Mon Sep 17 00:00:00 2001 From: Jean-Marie Mineau Date: Tue, 26 Nov 2024 14:58:00 +0100 Subject: [PATCH] keep tweaking --- android_class_shadowing_scanner/__init__.py | 2 +- android_class_shadowing_scanner/cmp_smali.py | 5 +- .../data_mining.py | 195 +++++++++++++++--- 3 files changed, 175 insertions(+), 27 deletions(-) diff --git a/android_class_shadowing_scanner/__init__.py b/android_class_shadowing_scanner/__init__.py index 54d108a..75d7638 100644 --- a/android_class_shadowing_scanner/__init__.py +++ b/android_class_shadowing_scanner/__init__.py @@ -571,4 +571,4 @@ def data_mining(): args = parser.parse_args() stats(args.db, args.out, args.output_check_platform_redef) - # analyse_sdk_redef(args.output_check_platform_redef, args.db, args.out) + analyse_sdk_redef(args.output_check_platform_redef, args.db, args.out) diff --git a/android_class_shadowing_scanner/cmp_smali.py b/android_class_shadowing_scanner/cmp_smali.py index 498ab47..38ddf5a 100644 --- a/android_class_shadowing_scanner/cmp_smali.py +++ b/android_class_shadowing_scanner/cmp_smali.py @@ -50,8 +50,9 @@ def get_methods(sm: str, sha256: str = "") -> dict[str, list[list[str]]]: if ( current_meth is not None and striped - and not striped.startswith(".line ") - and not striped.startswith(".param ") + # and not striped.startswith(".line ") + # and not striped.startswith(".param ") + and not striped.startswith(".") ): current_body.append(striped) if striped.startswith(".method "): diff --git a/android_class_shadowing_scanner/data_mining.py b/android_class_shadowing_scanner/data_mining.py index 35421fe..f0484d9 100644 --- a/android_class_shadowing_scanner/data_mining.py +++ b/android_class_shadowing_scanner/data_mining.py @@ -10,6 +10,7 @@ import matplotlib def stats(db: Path, out: Path, folder_plat_diff_smali: Path): + VT_THRESH = 3 occ_sdk34 = {} occ_hid34 = {} @@ -143,6 +144,13 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): with
sqlite3.connect(db) as con: cur = con.cursor() nb_apk_all = cur.execute("SELECT COUNT(sha256) FROM data;").fetchone()[0] + nb_mal_any = cur.execute( + f"SELECT COUNT(sha256) FROM data WHERE vt_detection>={VT_THRESH} AND " + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);" + ).fetchone()[0] + nb_mal_all = cur.execute( + f"SELECT COUNT(sha256) FROM data WHERE vt_detection>={VT_THRESH};" + ).fetchone()[0] avg_target_sdk_tot = cur.execute( "SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0;" ).fetchone()[0] @@ -162,6 +170,37 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): "LIMIT 1 " "OFFSET (SELECT COUNT(*) FROM data) / 2;" ).fetchone()[0] + nb_apk_any = cur.execute( + "SELECT COUNT(sha256) FROM data WHERE " + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);" + ).fetchone()[0] + avg_target_sdk_any = cur.execute( + "SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND" + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);" + ).fetchone()[0] + avg_min_sdk_any = cur.execute( + "SELECT AVG(min_sdk_version) FROM data WHERE min_sdk_version>=0 AND " + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);" + ).fetchone()[0] + tot_avg_any = cur.execute( + "SELECT AVG(" + " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes" + ") FROM data WHERE " + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);" + ).fetchone()[0] + tot_median_any = cur.execute( + "SELECT " + " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes " + "FROM data WHERE " + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1) " + "ORDER BY " + " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes " + 
"LIMIT 1 " + "OFFSET (" + " SELECT COUNT(*) FROM data WHERE " + " (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1) " + ") / 2;" + ).fetchone()[0] data_all = [] data_only = [] for name, field in [ @@ -172,6 +211,9 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): nb_app, avg, avg_min_sdk = cur.execute( f"SELECT COUNT(sha256), AVG({field}), AVG(min_sdk_version) FROM data WHERE {field}>=1;" ).fetchone() + nb_mal = cur.execute( + f"SELECT COUNT(sha256) FROM data WHERE {field}>=1 AND vt_detection>={VT_THRESH};" + ).fetchone()[0] avg_target_sdk = cur.execute( f"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND {field}>=1;" ).fetchone()[0] @@ -202,6 +244,7 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): "avgtargetsdk": round(avg_target_sdk, 1), "avgminsdk": round(avg_min_sdk, 1), "ratioapp": round(100 * nb_app / nb_apk_all, 2), + "ratiomal": round(100 * nb_mal / nb_app, 2), } ) data_all.append( @@ -214,9 +257,23 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): "avgtargetsdk": round(avg_target_sdk_tot, 1), "avgminsdk": round(avg_min_sdk_tot, 1), "ratioapp": round(100 * nb_apk_all / nb_apk_all, 2), + "ratiomal": round(100 * nb_mal_all / nb_apk_all, 2), } ) + data_only.append( + { + "method": "Total", + "nbapp": nb_apk_any, + "avgshadow": round(tot_avg_any, 1), + "median": tot_median_any, + "id": tot_prop_id, + "avgtargetsdk": round(avg_target_sdk_any, 1), + "avgminsdk": round(avg_min_sdk_any, 1), + "ratioapp": round(100 * nb_apk_any / nb_apk_all, 2), + "ratiomal": round(100 * nb_mal_any / nb_apk_any, 2), + } + ) data_all.append( { "method": "Total", @@ -227,9 +284,15 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): "avgtargetsdk": round(avg_target_sdk_tot, 1), "avgminsdk": round(avg_min_sdk_tot, 1), "ratioapp": round(100 * nb_apk_all / nb_apk_all, 2), + "ratiomal": round(100 * nb_mal_all / nb_apk_all, 2), } ) + print(f"NB MALWARE: {nb_mal_all} 
({round(100*nb_mal_all/nb_apk_all, 2)}%)") + print( + f"NB MALWARE USING SHADOWING: {nb_mal_any} ({round(100*nb_mal_any/nb_apk_any, 2)}%)" + ) + with (out / "results_50k.csv").open("w") as fd: writer = csv.DictWriter( fd, @@ -242,6 +305,7 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): "avgtargetsdk", "avgminsdk", "ratioapp", + "ratiomal", ], ) writer.writeheader() @@ -259,6 +323,7 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): "avgtargetsdk", "avgminsdk", "ratioapp", + "ratiomal", ], ) writer.writeheader() @@ -300,38 +365,120 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path): print(f"{cl:<70} {occ_self_non_id[cl]: >5}") print() - print() - print(f"redefined class SDK <= 7 {' '*40} occurences disctinct") - print() - for cl in sorted( - filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()), - key=lambda x: occ_sdk34[x], - reverse=True, - )[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") - print() + with (out / "redef_sdk_7minus.csv").open("w") as fd: + writer = csv.DictWriter( + fd, + fieldnames=[ + "class", + "occ", + "id", + "idper", + ], + ) + writer.writeheader() + print() + print( + f"redefined class SDK <= 7 {' '*40} occurences identique identique%" + ) + print() + for cl in sorted( + filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()), + key=lambda x: occ_sdk34[x], + reverse=True, + )[:10]: + occ = occ_sdk34[cl] + id = occ - occ_sdk34_non_id.get(cl, 0) + id_per = round((100 * id) / occ, 2) + print(f"{cl:<70} {occ: >5} {id: >5} {id_per: >5}") + writer.writerow({"class": cl, "occ": occ, "id": id, "idper": id_per}) + print() + + with (out / "redef_sdk_8.csv").open("w") as fd: + writer = csv.DictWriter( + fd, + fieldnames=[ + "class", + "occ", + "id", + "idper", + ], + ) + writer.writeheader() + print() + print( + f"redefined class SDK = 8 {' '*40} occurences identique identique%" + ) + print() + for cl in sorted( + filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, 
occ_sdk34.keys()), + key=lambda x: occ_sdk34[x], + reverse=True, + )[:10]: + occ = occ_sdk34[cl] + id = occ - occ_sdk34_non_id.get(cl, 0) + id_per = round((100 * id) / occ, 2) + print(f"{cl:<70} {occ: >5} {id: >5} {id_per: >5}") + writer.writerow({"class": cl, "occ": occ, "id": id, "idper": id_per}) + print() + + with (out / "redef_sdk_16.csv").open("w") as fd: + writer = csv.DictWriter( + fd, + fieldnames=[ + "class", + "occ", + "id", + "idper", + ], + ) + writer.writeheader() + print() + print( + f"redefined class SDK = 16 {' '*40} occurences identique identique%" + ) + print() + for cl in sorted( + filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()), + key=lambda x: occ_sdk34[x], + reverse=True, + )[:10]: + occ = occ_sdk34[cl] + id = occ - occ_sdk34_non_id.get(cl, 0) + id_per = round((100 * id) / occ, 2) + print(f"{cl:<70} {occ: >5} {id: >5} {id_per: >5}") + writer.writerow({"class": cl, "occ": occ, "id": id, "idper": id_per}) + print() + + occ_package_hid34 = {} + for cl, occ in occ_hid34.items(): + package = ".".join(cl.removeprefix("L").removesuffix(";").split("/")[:-1]) + if package not in occ_package_hid34: + occ_package_hid34[package] = 0 + occ_package_hid34[package] += occ + occ_package_hid34_non_id = {} + for cl, occ in occ_hid34_non_id.items(): + package = ".".join(cl.removeprefix("L").removesuffix(";").split("/")[:-1]) + if package not in occ_package_hid34_non_id: + occ_package_hid34_non_id[package] = 0 + occ_package_hid34_non_id[package] += occ print() - print(f"redefined class SDK = 8 {' '*40} occurences disctinct") - print() - for cl in sorted( - filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()), - key=lambda x: occ_sdk34[x], + print("top 10 hidden package") + for pk in sorted( + occ_package_hid34.keys(), + key=lambda x: occ_package_hid34[x], reverse=True, )[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") + print(f" {pk:<70} {occ_package_hid34[pk]}") print() - print() - print(f"redefined class 
SDK = 16 {' '*40} occurences disctinct") - print() - for cl in sorted( - filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()), - key=lambda x: occ_sdk34[x], + print("top 10 hidden package non id") + for pk in sorted( + occ_package_hid34_non_id.keys(), + key=lambda x: occ_package_hid34_non_id[x], reverse=True, )[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") - print() + print(f" {pk:<70} {occ_package_hid34_non_id[pk]}") def analyse_sdk_redef(folder: Path, db: Path, out: Path):