diff --git a/android_class_shadowing_scanner/__init__.py b/android_class_shadowing_scanner/__init__.py
index 6ac316b..48f34e5 100644
--- a/android_class_shadowing_scanner/__init__.py
+++ b/android_class_shadowing_scanner/__init__.py
@@ -658,7 +658,7 @@ def check_smali_platform():
 def data_mining():
     # use plt and numpy
     # those libs are iffy on the server so let's not import them when not needed
-    from .data_mining import analyse_sdk_redef
+    from .data_mining import analyse_sdk_redef, stats
 
     parser = ArgumentParser(
         prog="Data Mining",
@@ -668,15 +668,35 @@ def data_mining():
         "--db",
         help="Path to the database storing the results",
         type=Path,
+        required=True,
+    )
+    parser.add_argument(
+        "--out",
+        help="Directory where to store results",
+        type=Path,
+        required=True,
     )
     parser.add_argument(
         "--output-dir-def-sdk34-classes",
         help="The directory storing the classes already in SDK 34 redefined by apks",
         type=Path,
+        required=True,
+    )
+    parser.add_argument(
+        "--detail-class-redef",
+        help="Path to json file outputed by `check-class-redef`",
+        type=Path,
+        required=True,
+    )
+    parser.add_argument(
+        "--output-check-platform-redef",
+        help="The directory storing the result of smali comparision between platform classes and classes defined in apk (--output-dir of `check-platf-reder`)",
+        type=Path,
+        required=True,
     )
     args = parser.parse_args()
-    if args.db is not None:
-        pass
+
+    stats(args.db, args.out, args.output_check_platform_redef, args.detail_class_redef)
 
     if args.output_dir_def_sdk34_classes is not None:
-        analyse_sdk_redef(args.output_dir_def_sdk34_classes)
+        analyse_sdk_redef(args.output_dir_def_sdk34_classes, args.db, args.out)
diff --git a/android_class_shadowing_scanner/data_mining.py b/android_class_shadowing_scanner/data_mining.py
index 66d763b..f8c5f6c 100644
--- a/android_class_shadowing_scanner/data_mining.py
+++ b/android_class_shadowing_scanner/data_mining.py
@@ -1,17 +1,287 @@
+import sqlite3
+import csv
+import json
+
 from pathlib import Path
 
 from .platform_classes import MIN_MAX_SDK
 from matplotlib import pyplot as plt
 
 
-def analyse_sdk_redef(folder: Path):
+def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: Path):
+    """Print shadowing statistics and export them as CSV files in `out`.
+
+    The `*_id` counters hold the SDK / hidden classes missing from at least
+    one `*_diff_smalli` list, and the count of `duplicated_classes` minus the
+    count of `redef_classes`. Their proportions are printed, then merged with
+    aggregates of the `data` table of `db` into the two CSV files.
+    """
+    nb_sdk_cl_redef = 0
+    nb_sdk_cl_id = 0
+    nb_app_sdk_cl_redef = 0
+    nb_app_sdk_cl_redef_false_pos = 0
+    nb_hid_cl_redef = 0
+    nb_hid_cl_id = 0
+    nb_app_hid_cl_redef = 0
+    nb_app_hid_cl_redef_false_pos = 0
+    for file in folder_plat_diff_smali.iterdir():
+        with file.open("r") as fd:
+            data = json.load(fd)
+        l_nb_sdk_cl_id = 0
+        for cl in data["sdk_34_classes"]:
+            nb_sdk_cl_redef += 1
+            if any(
+                [
+                    cl not in data[lst]
+                    for lst in [
+                        "sdk_34_diff_smalli",
+                        "platform_34_diff_smalli",
+                        "sdk_33_diff_smalli",
+                        "platform_33_diff_smalli",
+                        "sdk_32_diff_smalli",
+                        "platform_32_diff_smalli",
+                    ]
+                ]
+            ):
+                l_nb_sdk_cl_id += 1
+        nb_sdk_cl_id += l_nb_sdk_cl_id
+        if data["sdk_34_classes"]:
+            nb_app_sdk_cl_redef += 1
+        if data["sdk_34_classes"] and len(data["sdk_34_classes"]) == l_nb_sdk_cl_id:
+            nb_app_sdk_cl_redef_false_pos += 1
+
+        l_nb_hid_cl_id = 0
+        for cl in data["platform_non_sdk_34_classes"]:
+            nb_hid_cl_redef += 1
+            if any(
+                [
+                    cl not in data[lst]
+                    for lst in [
+                        "platform_34_diff_smalli",
+                        "platform_33_diff_smalli",
+                        "platform_32_diff_smalli",
+                    ]
+                ]
+            ):
+                l_nb_hid_cl_id += 1
+        nb_hid_cl_id += l_nb_hid_cl_id
+        if data["platform_non_sdk_34_classes"]:
+            nb_app_hid_cl_redef += 1
+        if (
+            data["platform_non_sdk_34_classes"]
+            and len(data["platform_non_sdk_34_classes"]) == l_nb_hid_cl_id
+        ):
+            nb_app_hid_cl_redef_false_pos += 1
+    nb_class_self_shadow = 0
+    nb_class_self_shadow_id = 0
+    nb_app_self_shadow = 0
+    nb_app_self_shadow_false_pos = 0
+    with detail_class_redef.open("r") as fd:
+        data = json.load(fd)
+    for v in data.values():
+        if v["duplicated_classes"]:
+            nb_app_self_shadow += 1
+        if v["duplicated_classes"] and not v["redef_classes"]:
+            nb_app_self_shadow_false_pos += 1
+        nb_class_self_shadow += len(v["duplicated_classes"])
+        nb_class_self_shadow_id += len(v["duplicated_classes"]) - len(
+            v["redef_classes"]
+        )
+
+    prop_id_self_cl = round(100 * nb_class_self_shadow_id / nb_class_self_shadow, 2)
+    prop_id_self_ap = round(100 * nb_app_self_shadow_false_pos / nb_app_self_shadow, 2)
+    prop_id_sdk_cl = round(100 * nb_sdk_cl_id / nb_sdk_cl_redef, 2)
+    prop_id_sdk_ap = round(100 * nb_app_sdk_cl_redef_false_pos / nb_app_sdk_cl_redef, 2)
+    prop_id_hid_cl = round(100 * nb_hid_cl_id / nb_hid_cl_redef, 2)
+    prop_id_hid_ap = round(100 * nb_app_hid_cl_redef_false_pos / nb_app_hid_cl_redef, 2)
+    tot_prop_id = round(
+        100
+        * (nb_class_self_shadow_id + nb_sdk_cl_id + nb_hid_cl_id)
+        / (nb_class_self_shadow + nb_sdk_cl_redef + nb_hid_cl_redef),
+        2,
+    )
+    print(
+        f"Self classes: {nb_class_self_shadow_id}/{nb_class_self_shadow}: {prop_id_self_cl}%"
+    )
+    print(
+        f"Self apk: {nb_app_self_shadow_false_pos}/{nb_app_self_shadow}: {prop_id_self_ap}%"
+    )
+    print(f"SDK classes: {nb_sdk_cl_id}/{nb_sdk_cl_redef}: {prop_id_sdk_cl}%")
+    print(
+        f"SDK apk: {nb_app_sdk_cl_redef_false_pos}/{nb_app_sdk_cl_redef}: {prop_id_sdk_ap}%"
+    )
+    print(f"Hidden classes: {nb_hid_cl_id}/{nb_hid_cl_redef}: {prop_id_hid_cl}%")
+    print(
+        f"Hidden apk: {nb_app_hid_cl_redef_false_pos}/{nb_app_hid_cl_redef}: {prop_id_hid_ap}%"
+    )
+
+    with sqlite3.connect(db) as con:
+        cur = con.cursor()
+        nb_apk_all = cur.execute("SELECT COUNT(sha256) FROM data;").fetchone()[0]
+        avg_target_sdk_tot = cur.execute(
+            "SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0;"
+        ).fetchone()[0]
+        avg_min_sdk_tot = cur.execute(
+            "SELECT AVG(min_sdk_version) FROM data WHERE min_sdk_version>=0;"
+        ).fetchone()[0]
+        tot_avg_all = cur.execute(
+            "SELECT AVG("
+            " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes"
+            ") FROM data;"
+        ).fetchone()[0]
+        tot_median_all = cur.execute(
+            "SELECT "
+            " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes "
+            "FROM data ORDER BY "
+            " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes "
+            "LIMIT 1 "
+            "OFFSET (SELECT COUNT(*) FROM data) / 2;"
+        ).fetchone()[0]
+        data_all = []
+        data_only = []
+        for name, field in [
+            ("Self", "nb_duplicate_classes"),
+            ("Sdk", "nb_def_sdk_34_classes"),
+            ("Hidden", "nb_def_platform_non_sdk_34_classes"),
+        ]:
+            nb_app, avg, avg_min_sdk = cur.execute(
+                f"SELECT COUNT(sha256), AVG({field}), AVG(min_sdk_version) FROM data WHERE {field}>=1;"
+            ).fetchone()
+            avg_target_sdk = cur.execute(
+                f"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND {field}>=1;"
+            ).fetchone()[0]
+            avg_all = cur.execute(f"SELECT AVG({field}) FROM data;").fetchone()[0]
+            median = cur.execute(
+                f"SELECT {field} FROM data WHERE {field}>=1 ORDER BY {field} "
+                f"LIMIT 1 OFFSET (SELECT COUNT(*) FROM data WHERE {field}>=1) / 2;"
+            ).fetchone()[0]
+            median_all = cur.execute(
+                f"SELECT {field} FROM data ORDER BY {field} "
+                "LIMIT 1 OFFSET (SELECT COUNT(*) FROM data) / 2;"
+            ).fetchone()[0]
+            if name == "Self":
+                prop_id: str | float = prop_id_self_cl
+            elif name == "Sdk":
+                prop_id = prop_id_sdk_cl
+            elif name == "Hidden":
+                prop_id = prop_id_hid_cl
+            else:
+                prop_id = "TODO"
+            data_only.append(
+                {
+                    "method": name,
+                    "nbapp": nb_app,
+                    "avgshadow": round(avg, 1),
+                    "median": median,
+                    "id": prop_id,
+                    "avgtargetsdk": round(avg_target_sdk, 1),
+                    "avgminsdk": round(avg_min_sdk, 1),
+                    "ratioapp": round(100 * nb_app / nb_apk_all, 2),
+                }
+            )
+            data_all.append(
+                {
+                    "method": name,
+                    "nbapp": nb_apk_all,
+                    "avgshadow": round(avg_all, 1),
+                    "median": median_all,
+                    "id": prop_id,
+                    "avgtargetsdk": round(avg_target_sdk_tot, 1),
+                    "avgminsdk": round(avg_min_sdk_tot, 1),
+                    "ratioapp": round(100 * nb_apk_all / nb_apk_all, 2),
+                }
+            )
+
+        data_all.append(
+            {
+                "method": "Total",
+                "nbapp": nb_apk_all,
+                "avgshadow": round(tot_avg_all, 1),
+                "median": tot_median_all,
+                "id": tot_prop_id,
+                "avgtargetsdk": round(avg_target_sdk_tot, 1),
+                "avgminsdk": round(avg_min_sdk_tot, 1),
+                "ratioapp": round(100 * nb_apk_all / nb_apk_all, 2),
+            }
+        )
+
+    with (out / "results_50k.csv").open("w") as fd:
+        writer = csv.DictWriter(
+            fd,
+            fieldnames=[
+                "method",
+                "nbapp",
+                "avgshadow",
+                "median",
+                "id",
+                "avgtargetsdk",
+                "avgminsdk",
+                "ratioapp",
+            ],
+        )
+        writer.writeheader()
+        for row in data_all:
+            writer.writerow(row)
+    with (out / "results_only.csv").open("w") as fd:
+        writer = csv.DictWriter(
+            fd,
+            fieldnames=[
+                "method",
+                "nbapp",
+                "avgshadow",
+                "median",
+                "id",
+                "avgtargetsdk",
+                "avgminsdk",
+                "ratioapp",
+            ],
+        )
+        writer.writeheader()
+        for row in data_only:
+            writer.writerow(row)
+
+
+def analyse_sdk_redef(folder: Path, db: Path, out: Path):
+    """Plot the classes redefined by apks against the apk SDK versions.
+
+    `folder` holds one file per apk, named after its `sha256` in `db`, with
+    one class per line. Classes are bucketed by `MIN_MAX_SDK[cl][0]` and split
+    on being below the apk min / target SDK; charts are saved in `out`.
+    """
+    with sqlite3.connect(db) as con:
+        cur = con.cursor()
+        min_sdks = {
+            sha256: sdk
+            for sha256, sdk in cur.execute("SELECT sha256, min_sdk_version FROM data;")
+        }
+        targ_sdks = {
+            sha256: sdk
+            for sha256, sdk in cur.execute(
+                "SELECT sha256, target_sdk_version FROM data;"
+            )
+        }
+    cls_by_sdk_over_min = [0 for _ in range(35)]
+    cls_by_sdk_under_min = [0 for _ in range(35)]
+    cls_by_sdk_over_targ = [0 for _ in range(35)]
+    cls_by_sdk_under_targ = [0 for _ in range(35)]
     classes_by_app = {}
     for file in folder.iterdir():
         cls = set()
+        apk_min_sdk = min_sdks[file.name]
+        apk_targ_sdk = targ_sdks[file.name]
         with file.open("r") as fp:
             for cl in fp:
                 if cl.strip():
                     cls.add(cl.strip())
+                    cl_min_sdk, _ = MIN_MAX_SDK[cl.strip()]
+                    if cl_min_sdk < apk_min_sdk:
+                        cls_by_sdk_under_min[cl_min_sdk] += 1
+                    else:
+                        cls_by_sdk_over_min[cl_min_sdk] += 1
+                    if cl_min_sdk < apk_targ_sdk:
+                        cls_by_sdk_under_targ[cl_min_sdk] += 1
+                    else:
+                        cls_by_sdk_over_targ[cl_min_sdk] += 1
         classes_by_app[file.name] = cls
 
     classes_occ = {}
@@ -31,5 +301,50 @@ def analyse_sdk_redef(folder: Path):
     cls_by_sdk = [0 for _ in range(35)]
     for cl, n in classes_occ.items():
         cls_by_sdk[MIN_MAX_SDK[cl][0]] += n
-    plt.bar([i for i in range(7, 35)], cls_by_sdk[7:], bottom=0)
+
+    plt.figure(figsize=(20, 9), dpi=80)
+    plt.bar(
+        ["<=7" if i == 7 else str(i) for i in range(7, 35)],
+        cls_by_sdk_under_min[7:],
+        color="red",
+        hatch="x",
+        label="Class introduced before Apk Min SDK",
+        bottom=0,
+        edgecolor="black",
+    )
+    plt.bar(
+        ["<=7" if i == 7 else str(i) for i in range(7, 35)],
+        cls_by_sdk_over_min[7:],
+        color="green",
+        label="Class introducted after Apk Min SDK",
+        bottom=cls_by_sdk_under_min[7:],
+        edgecolor="black",
+    )
+    plt.legend(loc="upper left")
+    plt.savefig(out / "redef_sdk_relative_min_sdk.pdf", format="pdf")
+    plt.savefig(out / "redef_sdk_relative_min_sdk.svg", format="svg")
+    plt.show()
+    plt.close()
+
+    plt.figure(figsize=(20, 9), dpi=80)
+    plt.bar(
+        ["<=7" if i == 7 else str(i) for i in range(7, 35)],
+        cls_by_sdk_under_targ[7:],
+        color="red",
+        hatch="x",
+        label="Class introduced before Apk Target SDK",
+        bottom=0,
+        edgecolor="black",
+    )
+    plt.bar(
+        ["<=7" if i == 7 else str(i) for i in range(7, 35)],
+        cls_by_sdk_over_targ[7:],
+        color="green",
+        label="Class introducted after Apk Target SDK",
+        bottom=cls_by_sdk_under_targ[7:],  # stack on this chart's own red bars
+        edgecolor="black",
+    )
+    plt.legend(loc="upper left")
+    plt.savefig(out / "redef_sdk_relative_targ_sdk.pdf", format="pdf")
+    plt.savefig(out / "redef_sdk_relative_targ_sdk.svg", format="svg")
     plt.show()
diff --git a/run_exp_5.sh b/run_exp_5.sh
index d347877..951a409 100644
--- a/run_exp_5.sh
+++ b/run_exp_5.sh
@@ -8,6 +8,7 @@ DB="${SCRIPT_DIR}/data/app-2023-xp4.db"
 LIST=$(mktemp)
 APKTOOL="${SCRIPT_DIR}/apktool.jar"
 ANDROZOO_KEY="${SCRIPT_DIR}/.ZOO_KEY"
+OUT_DIR="app-2023-xp5.out"
 
 app_lst=(
     '00'
@@ -32,6 +33,7 @@ app_lst=(
     '19'
 )
 
+mkdir -p "${OUT_DIR}"
 unzip platforms.zip -d "${PLATFORM_DIR}"
 
 for ad in "${PLATFORM_DIR}"/**/{platform,sdk}; do
@@ -46,8 +48,8 @@ cd "${WD}"
 
 sqlite3 ${DB} 'SELECT sha256 FROM data WHERE nb_def_platform_32_classes >= 1 OR nb_def_platform_33_classes >= 1 OR nb_def_platform_34_classes >= 1;' > "${LIST}"
 N_CHUNK=$(python3 -c "print($(cat ${LIST} | wc -l)//20 + 1)")
-rm -r ./app-2023-exp4
-mkdir ./app-2023-exp4
-split -a 2 -d -l "${N_CHUNK}" "${LIST}" ./app-2023-exp4/
+rm -r ./app-2023-exp5
+mkdir ./app-2023-exp5
+split -a 2 -d -l "${N_CHUNK}" "${LIST}" ./app-2023-exp5/
 
 worker () {
@@ -58,8 +60,8 @@ worker () {
 }
 
 for lst in ${app_lst[@]}; do
-    worker "./app-2023-exp4/${lst}" &
-    echo 1
+    worker "./app-2023-exp5/${lst}" &
+    sleep 1
 done
 
 echo 'PROCESS LAUNCHED'