diff --git a/android_class_shadowing_scanner/__init__.py b/android_class_shadowing_scanner/__init__.py index b40efe0..1a6541f 100644 --- a/android_class_shadowing_scanner/__init__.py +++ b/android_class_shadowing_scanner/__init__.py @@ -292,145 +292,6 @@ def collect_to_db(): def check_smali(): - parser = ArgumentParser( - prog="Smalli Check", - description="Check if duplicated classes have distinct smali", - ) - parser.add_argument( - "--db", - help="Path to the database storing the results", - type=Path, - required=True, - ) - parser.add_argument( - "--out", - help="Path to the file where to store the results", - type=Path, - required=True, - ) - parser.add_argument( - "--apktool-jar", - help="Path to the apktool jar file", - type=Path, - required=True, - ) - key_parser = parser.add_mutually_exclusive_group(required=False) - key_parser.add_argument( - "--api-key-file", - help="The path to a file containing the Androzoo API key", - type=Path, - ) - key_parser.add_argument( - "--api-key", help="The Androzoo API key (Usage NOT recommanded)", type=str - ) - SECRET_STORAGE_IMPORTED = False - try: - import secretstorage - - SECRET_STORAGE_IMPORTED = True - - key_parser.add_argument( - "--api-key-keyring-id", - help="The ID of the Androzoo API key in the secret service storage", - type=str, - ) - except ModuleNotFoundError: - pass - args = parser.parse_args() - - apktool = args.apktool_jar.resolve() - api_key = "" - if args.api_key: - api_key = args.api_key - if args.api_key_file: - with args.api_key_file.open("r") as file: - api_key = file.read().strip() - if SECRET_STORAGE_IMPORTED and not api_key: - if args.api_key_keyring_id: - key_id = args.api_key_keyring_id - else: - key_id = "androzoo" - try: - with secretstorage.dbus_init() as connection: - collection = secretstorage.get_default_collection(connection) - item = next(collection.search_items({"Title": key_id})) - item.unlock() - api_key = item.get_secret().decode("utf-8").strip() - except: - pass - if not api_key: - api_key = getpass(prompt="Androzoo API key: ").strip() - - with sqlite3.connect(args.db) as conn: - apks = list( - map( - lambda t: t[0], - conn.execute("SELECT sha256 FROM data WHERE nb_duplicate_classes >= 1"), - ) - ) - data = {} - for sha256 in apks: - with tempfile.TemporaryDirectory() as tmpdirname: - d = Path(tmpdirname) - apk_bin = download_apk(sha256, api_key, logfile=None) - if apk_bin is None: - continue - with (d / "app.apk").open("wb") as fp: - fp.write(apk_bin) - androguard_apk = APK(str(d / "app.apk")) - with zipfile.ZipFile(io.BytesIO(apk_bin)) as apk: - data[sha256] = {} - entry = analyze(apk, androguard_apk, sha256, json_out=data[sha256]) - r = subprocess.run( - [ - "java", - "-Xmx8G", - "-jar", - str(apktool), - "d", - "app.apk", - "-o", - "apktool_out", - ], - cwd=d, - ) - data[sha256]["apktool-finished"] = (r.returncode == 0) and ( - d / "apktool_out" / "apktool.yml" - ).exists() - smalli_dirs = [] - for dex in data[sha256]["class_dex"]: - if dex == "classes.dex": - smalli_dirs.append(d / "apktool_out" / "smali") - else: - smalli_dirs.append( - d / "apktool_out" / ("smali_" + dex.removesuffix(".dex")) - ) - dist_dup_classes = set() - for cl in data[sha256]["duplicated_classes"]: - cl_f = cl.removesuffix(";").removeprefix("L") + ".smali" - smali = None - for cdir in smalli_dirs: - if (cdir / cl_f).exists(): - print((cdir / cl_f)) - with (cdir / cl_f).open() as file: - smali_new = file.read() - if smali is None: - smali = smali_new - elif smali != smali_new: - dist_dup_classes.add(cl) - data[sha256]["redef_classes"] = list(dist_dup_classes) - if data[sha256]["redef_classes"]: - print(f"{sha256}:") - for c in data[sha256]["redef_classes"]: - print(f" {c}") - else: - print(f"{sha256}: No true redefinition") - - with args.out.open("w") as f: - json.dump(data, f) - - -def check_smali_platform(): parser = ArgumentParser( prog="Smalli Check", description="Check if duplicated classes are distinct from the actual sources", @@ -563,6 +424,7 @@ def check_smali_platform(): "apktool_out", ], cwd=d, + capture_output=True, # just avoid spamming ) data["apktool-finished"] = (r.returncode == 0) and ( d / "apktool_out" / "apktool.yml" @@ -698,27 +560,13 @@ def data_mining(): type=Path, required=True, ) - parser.add_argument( - "--output-dir-def-sdk34-classes", - help="The directory storing the classes already in SDK 34 redefined by apks", - type=Path, - required=True, - ) - parser.add_argument( - "--detail-class-redef", - help="Path to json file outputed by `check-class-redef`", - type=Path, - required=True, - ) parser.add_argument( "--output-check-platform-redef", - help="The directory storing the result of smali comparision between platform classes and classes defined in apk (--output-dir of `check-platf-reder`)", + help="The directory storing the result of smali comparision between platform classes and classes defined in apk (--output-dir of `check-smali`)", type=Path, required=True, ) args = parser.parse_args() - stats(args.db, args.out, args.output_check_platform_redef, args.detail_class_redef) - - if args.output_dir_def_sdk34_classes is not None: - analyse_sdk_redef(args.output_dir_def_sdk34_classes, args.db, args.out) + stats(args.db, args.out, args.output_check_platform_redef) + # analyse_sdk_redef(args.output_check_platform_redef, args.db, args.out) diff --git a/android_class_shadowing_scanner/cmp_smali.py b/android_class_shadowing_scanner/cmp_smali.py index 3dd273c..498ab47 100644 --- a/android_class_shadowing_scanner/cmp_smali.py +++ b/android_class_shadowing_scanner/cmp_smali.py @@ -6,6 +6,10 @@ def cmp_smali(sm1: str, sm2: str, sha256_1: str = "", sha256_2: str = "") -> boo for m in meths_1.keys(): s1 = meths_1[m] s2 = meths_2[m] + if len(s1) > 1: + print(f"method {m} in {sha256_1} has multiple implementation") + if len(s2) > 1: + print(f"method {m} in {sha256_2} has multiple implementation") for b1 in s1: match = False for b2 in s2: diff --git a/android_class_shadowing_scanner/data_mining.py b/android_class_shadowing_scanner/data_mining.py index f8b9459..35421fe 100644 --- a/android_class_shadowing_scanner/data_mining.py +++ b/android_class_shadowing_scanner/data_mining.py @@ -9,12 +9,14 @@ from matplotlib import pyplot as plt import matplotlib -def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: Path): +def stats(db: Path, out: Path, folder_plat_diff_smali: Path): occ_sdk34 = {} occ_hid34 = {} - occ_self_redef = {} occ_self = {} + occ_sdk34_non_id = {} + occ_hid34_non_id = {} + occ_self_non_id = {} nb_sdk_cl_redef = 0 nb_sdk_cl_id = 0 @@ -24,9 +26,32 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: nb_hid_cl_id = 0 nb_app_hid_cl_redef = 0 nb_app_hid_cl_redef_false_pos = 0 + nb_class_self_shadow = 0 + nb_class_self_shadow_id = 0 + nb_app_self_shadow = 0 + nb_app_self_shadow_false_pos = 0 + for file in folder_plat_diff_smali.iterdir(): with file.open("r") as fd: data = json.load(fd) + if not data["apktool-finished"]: + continue + for cl in data["redef_classes"]: + if cl not in occ_self_non_id: + occ_self_non_id[cl] = 0 + occ_self_non_id[cl] += 1 + for cl in data["duplicated_classes"]: + if cl not in occ_self: + occ_self[cl] = 0 + occ_self[cl] += 1 + if data["duplicated_classes"]: + nb_app_self_shadow += 1 + if data["duplicated_classes"] and not data["redef_classes"]: + nb_app_self_shadow_false_pos += 1 + nb_class_self_shadow += len(data["duplicated_classes"]) + nb_class_self_shadow_id += len(data["duplicated_classes"]) - len( + data["redef_classes"] + ) l_nb_sdk_cl_id = 0 for cl in data["sdk_34_classes"]: nb_sdk_cl_redef += 1 @@ -47,6 +72,10 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: ] ): l_nb_sdk_cl_id += 1 + else: + if cl not in occ_sdk34_non_id: + occ_sdk34_non_id[cl] = 0 + occ_sdk34_non_id[cl] += 1 nb_sdk_cl_id += l_nb_sdk_cl_id if data["sdk_34_classes"]: nb_app_sdk_cl_redef += 1 @@ -70,6 +99,10 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: ] ): l_nb_hid_cl_id += 1 + else: + if cl not in occ_hid34_non_id: + occ_hid34_non_id[cl] = 0 + occ_hid34_non_id[cl] += 1 nb_hid_cl_id += l_nb_hid_cl_id if data["platform_non_sdk_34_classes"]: nb_app_hid_cl_redef += 1 @@ -78,29 +111,6 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: and len(data["platform_non_sdk_34_classes"]) == l_nb_hid_cl_id ): nb_app_hid_cl_redef_false_pos += 1 - nb_class_self_shadow = 0 - nb_class_self_shadow_id = 0 - nb_app_self_shadow = 0 - nb_app_self_shadow_false_pos = 0 - with detail_class_redef.open("r") as fd: - data = json.load(fd) - for v in data.values(): - for cl in v["redef_classes"]: - if cl not in occ_self_redef: - occ_self_redef[cl] = 0 - occ_self_redef[cl] += 1 - for cl in v["duplicated_classes"]: - if cl not in occ_self: - occ_self[cl] = 0 - occ_self[cl] += 1 - if v["duplicated_classes"]: - nb_app_self_shadow += 1 - if v["duplicated_classes"] and not v["redef_classes"]: - nb_app_self_shadow_false_pos += 1 - nb_class_self_shadow += len(v["duplicated_classes"]) - nb_class_self_shadow_id += len(v["duplicated_classes"]) - len( - v["redef_classes"] - ) prop_id_self_cl = round(100 * nb_class_self_shadow_id / nb_class_self_shadow, 2) prop_id_self_ap = round(100 * nb_app_self_shadow_false_pos / nb_app_self_shadow, 2) @@ -110,10 +120,11 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: prop_id_hid_ap = round(100 * nb_app_hid_cl_redef_false_pos / nb_app_hid_cl_redef, 2) tot_prop_id = round( 100 - * (nb_class_self_shadow_id + nb_sdk_cl_id + 100 * nb_hid_cl_id) + * (nb_class_self_shadow_id + nb_sdk_cl_id + nb_hid_cl_id) / (nb_class_self_shadow + nb_sdk_cl_redef + nb_hid_cl_redef), 2, ) + print( f"Self classes: {nb_class_self_shadow_id}/{nb_class_self_shadow}: {prop_id_self_cl}%" ) @@ -260,80 +271,66 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: # occ_self = {} print() - print( - "redefined class SDK occurences" - ) + print(f"redefined class SDK {' '*40} occurences disctinct") print() for cl in sorted(occ_sdk34.keys(), key=lambda x: occ_sdk34[x], reverse=True)[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") print() print() - print( - "redefined class Hidden occurences" - ) + print(f"redefined class Hidden {' '*40} occurences disctinct") print() for cl in sorted(occ_hid34.keys(), key=lambda x: occ_hid34[x], reverse=True)[:10]: - print(f"{cl:<70} {occ_hid34[cl]: >5}") + print(f"{cl:<70} {occ_hid34[cl]: >5} {occ_hid34_non_id.get(cl, 0): >5}") print() print() - print( - "collision class Self occurences" - ) + print(f"collision class Self {' '*40} occurences disctinct") print() for cl in sorted(occ_self.keys(), key=lambda x: occ_self[x], reverse=True)[:10]: - print(f"{cl:<70} {occ_self[cl]: >5}") + print(f"{cl:<70} {occ_self[cl]: >5} {occ_self_non_id.get(cl, 0): >5}") print() print() - print( - "redefined class Self occurences" - ) + print(f"redefined class Self {' '*40} occurences") print() for cl in sorted( - occ_self_redef.keys(), key=lambda x: occ_self_redef[x], reverse=True + occ_self_non_id.keys(), key=lambda x: occ_self_non_id[x], reverse=True )[:10]: - print(f"{cl:<70} {occ_self_redef[cl]: >5}") + print(f"{cl:<70} {occ_self_non_id[cl]: >5}") print() print() - print( - "redefined class SDK <= 7 occurences" - ) + print(f"redefined class SDK <= 7 {' '*40} occurences disctinct") print() for cl in sorted( filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()), key=lambda x: occ_sdk34[x], reverse=True, )[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") print() print() - print( - "redefined class SDK = 8 occurences" - ) + print(f"redefined class SDK = 8 {' '*40} occurences disctinct") print() for cl in sorted( filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()), key=lambda x: occ_sdk34[x], reverse=True, )[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") print() print() - print( - "redefined class SDK = 16 occurences" - ) + print(f"redefined class SDK = 16 {' '*40} occurences disctinct") print() for cl in sorted( filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()), key=lambda x: occ_sdk34[x], reverse=True, )[:10]: - print(f"{cl:<70} {occ_sdk34[cl]: >5}") + print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}") print() @@ -360,18 +357,19 @@ def analyse_sdk_redef(folder: Path, db: Path, out: Path): apk_min_sdk = min_sdks[file.name] apk_targ_sdk = targ_sdks[file.name] with file.open("r") as fp: - for cl in fp: - if cl.strip(): - cls.add(cl.strip()) - cl_min_sdk, _ = MIN_MAX_SDK[cl.strip()] - if cl_min_sdk < apk_min_sdk: - cls_by_sdk_under_min[cl_min_sdk] += 1 - else: - cls_by_sdk_over_min[cl_min_sdk] += 1 - if cl_min_sdk < apk_targ_sdk: - cls_by_sdk_under_targ[cl_min_sdk] += 1 - else: - cls_by_sdk_over_targ[cl_min_sdk] += 1 + data = json.load(fp) + + for cl in data["sdk_34_classes"]: + cls.add(cl) + cl_min_sdk, _ = MIN_MAX_SDK[cl] + if cl_min_sdk < apk_min_sdk: + cls_by_sdk_under_min[cl_min_sdk] += 1 + else: + cls_by_sdk_over_min[cl_min_sdk] += 1 + if cl_min_sdk < apk_targ_sdk: + cls_by_sdk_under_targ[cl_min_sdk] += 1 + else: + cls_by_sdk_over_targ[cl_min_sdk] += 1 classes_by_app[file.name] = cls classes_occ = {} diff --git a/platforms.zip b/platforms.zip index b7f0acd..9bc7454 100644 --- a/platforms.zip +++ b/platforms.zip @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:149925aa91e73ca8f25f218d4e45aaf54997527a090a1648baf892303c518d9a -size 147045360 +oid sha256:ed844b127de4e391365a734e9a0dece6c88684887bb4d4670e01a5a9fc5a9402 +size 81365687 diff --git a/pyproject.toml b/pyproject.toml index ee0ad6b..e3d37c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,5 @@ build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] scan = 'android_class_shadowing_scanner.__init__:main' collect-scan = 'android_class_shadowing_scanner.__init__:collect_to_db' -check-class-redef = 'android_class_shadowing_scanner.__init__:check_smali' -check-platf-reder = 'android_class_shadowing_scanner.__init__:check_smali_platform' +check-smali = 'android_class_shadowing_scanner.__init__:check_smali' data-mining = 'android_class_shadowing_scanner.__init__:data_mining'