android_class_shadowing_sca.../android_class_shadowing_scanner/data_mining.py
Jean-Marie Mineau 626d6fedb8
update
2024-11-22 14:45:54 +01:00

443 lines
15 KiB
Python

import csv
import json
import sqlite3
from collections import Counter
from pathlib import Path

import matplotlib
from matplotlib import pyplot as plt

from .platform_classes import MIN_MAX_SDK
# JSON keys of the diff lists consulted for redefined SDK classes: a class is
# counted as identical as soon as it is absent from at least one of them.
_SDK_DIFF_LISTS = (
    "sdk_34_diff_smalli",
    "platform_34_diff_smalli",
    "sdk_33_diff_smalli",
    "platform_33_diff_smalli",
    "sdk_32_diff_smalli",
    "platform_32_diff_smalli",
)

# Same as above, for redefined hidden (platform, non-SDK) classes.
_HIDDEN_DIFF_LISTS = (
    "platform_34_diff_smalli",
    "platform_33_diff_smalli",
    "platform_32_diff_smalli",
)

# Column order shared by the two CSV files written by `stats`.
_RESULT_FIELDS = (
    "method",
    "nbapp",
    "avgshadow",
    "median",
    "id",
    "avgtargetsdk",
    "avgminsdk",
    "ratioapp",
)

# SQL expression giving the total number of shadowing classes of one APK row.
_SHADOW_TOTAL_EXPR = (
    "nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes"
)


def _percent(part, total):
    """Return `100 * part / total` rounded to 2 decimals, or 0.0 if `total` is 0."""
    if not total:
        return 0.0
    return round(100 * part / total, 2)


def _round_opt(value, ndigits=1):
    """Round `value`, passing through the None produced by SQL aggregates on no rows."""
    return None if value is None else round(value, ndigits)


def _scalar(cur, query):
    """Run `query` and return the first column of its first row (None if no row)."""
    row = cur.execute(query).fetchone()
    return None if row is None else row[0]


def _sql_median(cur, expr, where=None):
    """Return the value of `expr` at rank COUNT/2 among the (filtered) `data` rows."""
    clause = f" WHERE {where}" if where else ""
    return _scalar(
        cur,
        f"SELECT {expr} FROM data{clause} ORDER BY {expr} "
        f"LIMIT 1 OFFSET (SELECT COUNT(*) FROM data{clause}) / 2;",
    )


def _tally_platform_redefs(data, classes_key, diff_keys, occ, occ_non_id):
    """Count the classes of `data[classes_key]` and return how many are identical.

    Every class increments `occ`. A class missing from at least one of the
    `diff_keys` lists is counted as identical; the others increment
    `occ_non_id`.
    """
    classes = data[classes_key]
    if not classes:
        return 0
    # Sets built once per report: membership is tested for every class.
    diff_sets = [set(data[key]) for key in diff_keys]
    nb_identical = 0
    for cl in classes:
        occ[cl] += 1
        if any(cl not in diff for diff in diff_sets):
            nb_identical += 1
        else:
            occ_non_id[cl] += 1
    return nb_identical


def _write_results(path, rows):
    """Write `rows` (dicts keyed by `_RESULT_FIELDS`) to `path` as CSV with a header."""
    # newline="" is required by the csv module to avoid doubled line endings.
    with path.open("w", newline="") as fd:
        writer = csv.DictWriter(fd, fieldnames=_RESULT_FIELDS)
        writer.writeheader()
        writer.writerows(rows)


def _print_top(header, occ, occ_non_id=None, keep=None, limit=10):
    """Print the `limit` most frequent classes of `occ` under `header`.

    `keep` optionally filters the classes; when `occ_non_id` is given a second
    column shows its count for the class (0 if absent).
    """
    print()
    print(header)
    print()
    candidates = occ if keep is None else [cl for cl in occ if keep(cl)]
    for cl in sorted(candidates, key=occ.__getitem__, reverse=True)[:limit]:
        if occ_non_id is None:
            print(f"{cl:<70} {occ[cl]: >5}")
        else:
            print(f"{cl:<70} {occ[cl]: >5} {occ_non_id.get(cl, 0): >5}")
    print()


def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
    """Print shadowing statistics and write them to two CSV files.

    Args:
        db: SQLite database with a `data` table; the columns read are
            `sha256`, `target_sdk_version`, `min_sdk_version`,
            `nb_duplicate_classes`, `nb_def_sdk_34_classes` and
            `nb_def_platform_non_sdk_34_classes`.
        out: existing directory receiving `results_50k.csv` (statistics over
            all rows) and `results_only.csv` (statistics restricted to the
            rows where the category has at least one class).
        folder_plat_diff_smali: directory of JSON reports, one per file.
            Reports whose `apktool-finished` is false are skipped.

    Categories without any hit yield a proportion of 0.0 and empty
    average/median cells instead of raising.
    """
    occ_sdk34 = Counter()
    occ_hid34 = Counter()
    occ_self = Counter()
    occ_sdk34_non_id = Counter()
    occ_hid34_non_id = Counter()
    occ_self_non_id = Counter()

    nb_sdk_cl_redef = nb_sdk_cl_id = 0
    nb_app_sdk_cl_redef = nb_app_sdk_cl_redef_false_pos = 0
    nb_hid_cl_redef = nb_hid_cl_id = 0
    nb_app_hid_cl_redef = nb_app_hid_cl_redef_false_pos = 0
    nb_class_self_shadow = nb_class_self_shadow_id = 0
    nb_app_self_shadow = nb_app_self_shadow_false_pos = 0

    for file in folder_plat_diff_smali.iterdir():
        with file.open("r") as fd:
            data = json.load(fd)
        if not data["apktool-finished"]:
            continue

        # Self shadowing: classes defined several times inside the APK.
        duplicated = data["duplicated_classes"]
        redefined = data["redef_classes"]
        occ_self_non_id.update(redefined)
        occ_self.update(duplicated)
        if duplicated:
            nb_app_self_shadow += 1
            if not redefined:
                nb_app_self_shadow_false_pos += 1
        nb_class_self_shadow += len(duplicated)
        nb_class_self_shadow_id += len(duplicated) - len(redefined)

        # SDK classes redefined by the APK.
        sdk_classes = data["sdk_34_classes"]
        nb_identical = _tally_platform_redefs(
            data, "sdk_34_classes", _SDK_DIFF_LISTS, occ_sdk34, occ_sdk34_non_id
        )
        nb_sdk_cl_redef += len(sdk_classes)
        nb_sdk_cl_id += nb_identical
        if sdk_classes:
            nb_app_sdk_cl_redef += 1
            if len(sdk_classes) == nb_identical:
                nb_app_sdk_cl_redef_false_pos += 1

        # Hidden (platform, non-SDK) classes redefined by the APK.
        hidden_classes = data["platform_non_sdk_34_classes"]
        nb_identical = _tally_platform_redefs(
            data,
            "platform_non_sdk_34_classes",
            _HIDDEN_DIFF_LISTS,
            occ_hid34,
            occ_hid34_non_id,
        )
        nb_hid_cl_redef += len(hidden_classes)
        nb_hid_cl_id += nb_identical
        if hidden_classes:
            nb_app_hid_cl_redef += 1
            if len(hidden_classes) == nb_identical:
                nb_app_hid_cl_redef_false_pos += 1

    prop_id_self_cl = _percent(nb_class_self_shadow_id, nb_class_self_shadow)
    prop_id_self_ap = _percent(nb_app_self_shadow_false_pos, nb_app_self_shadow)
    prop_id_sdk_cl = _percent(nb_sdk_cl_id, nb_sdk_cl_redef)
    prop_id_sdk_ap = _percent(nb_app_sdk_cl_redef_false_pos, nb_app_sdk_cl_redef)
    prop_id_hid_cl = _percent(nb_hid_cl_id, nb_hid_cl_redef)
    prop_id_hid_ap = _percent(nb_app_hid_cl_redef_false_pos, nb_app_hid_cl_redef)
    tot_prop_id = _percent(
        nb_class_self_shadow_id + nb_sdk_cl_id + nb_hid_cl_id,
        nb_class_self_shadow + nb_sdk_cl_redef + nb_hid_cl_redef,
    )

    print(
        f"Self classes: {nb_class_self_shadow_id}/{nb_class_self_shadow}: {prop_id_self_cl}%"
    )
    print(
        f"Self apk: {nb_app_self_shadow_false_pos}/{nb_app_self_shadow}: {prop_id_self_ap}%"
    )
    print(f"SDK classes: {nb_sdk_cl_id}/{nb_sdk_cl_redef}: {prop_id_sdk_cl}%")
    print(
        f"SDK apk: {nb_app_sdk_cl_redef_false_pos}/{nb_app_sdk_cl_redef}: {prop_id_sdk_ap}%"
    )
    print(f"Hidden classes: {nb_hid_cl_id}/{nb_hid_cl_redef}: {prop_id_hid_cl}%")
    print(
        f"Hidden apk: {nb_app_hid_cl_redef_false_pos}/{nb_app_hid_cl_redef}: {prop_id_hid_ap}%"
    )

    data_all = []
    data_only = []
    # `with sqlite3.connect(...)` only scopes a transaction; close explicitly.
    con = sqlite3.connect(db)
    try:
        cur = con.cursor()
        nb_apk_all = _scalar(cur, "SELECT COUNT(sha256) FROM data;")
        avg_target_sdk_tot = _scalar(
            cur,
            "SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0;",
        )
        avg_min_sdk_tot = _scalar(
            cur, "SELECT AVG(min_sdk_version) FROM data WHERE min_sdk_version>=0;"
        )
        tot_avg_all = _scalar(cur, f"SELECT AVG({_SHADOW_TOTAL_EXPR}) FROM data;")
        tot_median_all = _sql_median(cur, _SHADOW_TOTAL_EXPR)

        # Column names are hard-coded here, so interpolating them is safe.
        for name, field, prop_id in [
            ("Self", "nb_duplicate_classes", prop_id_self_cl),
            ("Sdk", "nb_def_sdk_34_classes", prop_id_sdk_cl),
            ("Hidden", "nb_def_platform_non_sdk_34_classes", prop_id_hid_cl),
        ]:
            # Negative min SDK values are left out of the average, like in the
            # global average above (AVG skips the NULL of the CASE).
            nb_app, avg, avg_min_sdk = cur.execute(
                f"SELECT COUNT(sha256), AVG({field}), "
                "AVG(CASE WHEN min_sdk_version>=0 THEN min_sdk_version END) "
                f"FROM data WHERE {field}>=1;"
            ).fetchone()
            avg_target_sdk = _scalar(
                cur,
                "SELECT AVG(target_sdk_version) FROM data "
                f"WHERE target_sdk_version>=0 AND {field}>=1;",
            )
            avg_all = _scalar(cur, f"SELECT AVG({field}) FROM data;")
            median = _sql_median(cur, field, f"{field}>=1")
            median_all = _sql_median(cur, field)

            data_only.append(
                {
                    "method": name,
                    "nbapp": nb_app,
                    "avgshadow": _round_opt(avg),
                    "median": median,
                    "id": prop_id,
                    "avgtargetsdk": _round_opt(avg_target_sdk),
                    "avgminsdk": _round_opt(avg_min_sdk),
                    "ratioapp": _percent(nb_app, nb_apk_all),
                }
            )
            data_all.append(
                {
                    "method": name,
                    "nbapp": nb_apk_all,
                    "avgshadow": _round_opt(avg_all),
                    "median": median_all,
                    "id": prop_id,
                    "avgtargetsdk": _round_opt(avg_target_sdk_tot),
                    "avgminsdk": _round_opt(avg_min_sdk_tot),
                    "ratioapp": _percent(nb_apk_all, nb_apk_all),
                }
            )
        data_all.append(
            {
                "method": "Total",
                "nbapp": nb_apk_all,
                "avgshadow": _round_opt(tot_avg_all),
                "median": tot_median_all,
                "id": tot_prop_id,
                "avgtargetsdk": _round_opt(avg_target_sdk_tot),
                "avgminsdk": _round_opt(avg_min_sdk_tot),
                "ratioapp": _percent(nb_apk_all, nb_apk_all),
            }
        )
    finally:
        con.close()

    _write_results(out / "results_50k.csv", data_all)
    _write_results(out / "results_only.csv", data_only)

    _print_top(
        f"redefined class SDK {' '*40} occurences disctinct",
        occ_sdk34,
        occ_sdk34_non_id,
    )
    _print_top(
        f"redefined class Hidden {' '*40} occurences disctinct",
        occ_hid34,
        occ_hid34_non_id,
    )
    _print_top(
        f"collision class Self {' '*40} occurences disctinct",
        occ_self,
        occ_self_non_id,
    )
    _print_top(f"redefined class Self {' '*40} occurences", occ_self_non_id)
    # Same ranking, restricted to classes whose first SDK level is the given one.
    for first_sdk, label in [(7, "<= 7"), (8, "= 8"), (16, "= 16")]:
        _print_top(
            f"redefined class SDK {label} {' '*40} occurences disctinct",
            occ_sdk34,
            occ_sdk34_non_id,
            keep=lambda cl, first_sdk=first_sdk: MIN_MAX_SDK[cl][0] == first_sdk,
        )
def _plot_redef_by_first_sdk(under, over, under_label, over_label, out, stem):
    """Draw, save and show one stacked bar chart indexed by first SDK level.

    `under` and `over` are lists indexed by SDK level; only levels 7..34 are
    drawn. The `over` bars are stacked on top of the `under` bars of the same
    chart. The figure is saved as `<stem>.pdf` and `<stem>.svg` in `out`.
    """
    labels = ["<=7" if i == 7 else str(i) for i in range(7, 35)]
    plt.figure(figsize=(20, 9), dpi=80)
    plt.bar(
        labels,
        under[7:],
        color="red",
        hatch="x",
        label=under_label,
        bottom=0,
        edgecolor="black",
    )
    plt.bar(
        labels,
        over[7:],
        color="green",
        label=over_label,
        # Stack on the bars of THIS chart (the target-SDK chart used to be
        # stacked on the min-SDK counts).
        bottom=under[7:],
        edgecolor="black",
    )
    plt.legend(loc="upper left")
    plt.ylabel("Nb Classes")
    plt.xlabel("First SDK containing the class")
    plt.savefig(out / f"{stem}.pdf", format="pdf")
    plt.savefig(out / f"{stem}.svg", format="svg")
    plt.show()
    plt.close()


def analyse_sdk_redef(folder: Path, db: Path, out: Path):
    """Relate redefined SDK classes to the min/target SDK of the redefining APK.

    Args:
        folder: directory of JSON reports; each file name must be a `sha256`
            present in the database and each report must have a
            `sdk_34_classes` list.
        db: SQLite database with a `data` table providing `sha256`,
            `min_sdk_version` and `target_sdk_version`.
        out: existing directory receiving `redef_sdk_relative_min_sdk` and
            `redef_sdk_relative_targ_sdk` charts, each as `.pdf` and `.svg`.

    Prints the 10 most redefined classes with their first SDK level (taken
    from `MIN_MAX_SDK`), then draws the two stacked bar charts.
    """
    # `with sqlite3.connect(...)` only scopes a transaction; close explicitly.
    con = sqlite3.connect(db)
    try:
        cur = con.cursor()
        min_sdks = dict(cur.execute("SELECT sha256, min_sdk_version FROM data;"))
        targ_sdks = dict(
            cur.execute("SELECT sha256, target_sdk_version FROM data;")
        )
    finally:
        con.close()

    # Occurrence counts indexed by the first SDK level of the class (0..34).
    cls_by_sdk_over_min = [0] * 35
    cls_by_sdk_under_min = [0] * 35
    cls_by_sdk_over_targ = [0] * 35
    cls_by_sdk_under_targ = [0] * 35
    classes_by_app = {}
    for file in folder.iterdir():
        cls = set()
        apk_min_sdk = min_sdks[file.name]
        apk_targ_sdk = targ_sdks[file.name]
        with file.open("r") as fp:
            data = json.load(fp)
        for cl in data["sdk_34_classes"]:
            cls.add(cl)
            cl_min_sdk, _ = MIN_MAX_SDK[cl]
            if cl_min_sdk < apk_min_sdk:
                cls_by_sdk_under_min[cl_min_sdk] += 1
            else:
                cls_by_sdk_over_min[cl_min_sdk] += 1
            if cl_min_sdk < apk_targ_sdk:
                cls_by_sdk_under_targ[cl_min_sdk] += 1
            else:
                cls_by_sdk_over_targ[cl_min_sdk] += 1
        classes_by_app[file.name] = cls

    # Number of apps redefining each class (a class counts once per app).
    classes_occ = Counter(cl for cls in classes_by_app.values() for cl in cls)

    print()
    print("redefined class occurences | min sdk")
    print()
    for cl in sorted(classes_occ, key=classes_occ.__getitem__, reverse=True)[:10]:
        print(f"{cl:<50} {classes_occ[cl]: >5} | {MIN_MAX_SDK[cl][0]: >2}")

    # NOTE(review): not consumed in the visible part of this function --
    # confirm nothing further down relies on it before removing.
    cls_by_sdk = [0] * 35
    for cl, n in classes_occ.items():
        cls_by_sdk[MIN_MAX_SDK[cl][0]] += n

    matplotlib.rcParams.update({"font.size": 22})
    _plot_redef_by_first_sdk(
        cls_by_sdk_under_min,
        cls_by_sdk_over_min,
        "Class introduced before Apk Min SDK",
        "Class introducted after Apk Min SDK",
        out,
        "redef_sdk_relative_min_sdk",
    )
    _plot_redef_by_first_sdk(
        cls_by_sdk_under_targ,
        cls_by_sdk_over_targ,
        "Class introduced before Apk Target SDK",
        "Class introducted after Apk Target SDK",
        out,
        "redef_sdk_relative_targ_sdk",
    )