443 lines
15 KiB
Python
443 lines
15 KiB
Python
import sqlite3
|
|
import csv
|
|
import json
|
|
|
|
from pathlib import Path
|
|
from .platform_classes import MIN_MAX_SDK
|
|
|
|
from matplotlib import pyplot as plt
|
|
import matplotlib
|
|
|
|
|
|
# Column order shared by both CSV reports written by `stats`.
_STATS_CSV_FIELDS = (
    "method",
    "nbapp",
    "avgshadow",
    "median",
    "id",
    "avgtargetsdk",
    "avgminsdk",
    "ratioapp",
)

# Keys of the per-APK JSON report holding the diff lists consulted for SDK
# classes and for hidden (platform, non-SDK) classes respectively.
_STATS_SDK_DIFF_KEYS = (
    "sdk_34_diff_smalli",
    "platform_34_diff_smalli",
    "sdk_33_diff_smalli",
    "platform_33_diff_smalli",
    "sdk_32_diff_smalli",
    "platform_32_diff_smalli",
)
_STATS_HIDDEN_DIFF_KEYS = (
    "platform_34_diff_smalli",
    "platform_33_diff_smalli",
    "platform_32_diff_smalli",
)


def _stats_percentage(part, whole):
    """Return 100 * part / whole rounded to 2 decimals, or 0.0 when whole is 0."""
    if not whole:
        return 0.0
    return round(100 * part / whole, 2)


def _stats_rounded(value, ndigits=1):
    """Round value, passing through the None that SQL aggregates yield on no rows."""
    return None if value is None else round(value, ndigits)


def _stats_scalar(cur, query):
    """Run query and return the first column of its first row (None if no row)."""
    row = cur.execute(query).fetchone()
    return None if row is None else row[0]


def _stats_bump(counter, key):
    """Increment counter[key], creating the entry when it does not exist yet."""
    counter[key] = counter.get(key, 0) + 1


def _stats_tally(data, classes_key, diff_keys, occ, occ_non_id):
    """Count the classes of one report; return how many are missing from a diff list."""
    identical = 0
    for cl in data[classes_key]:
        _stats_bump(occ, cl)
        # A class absent from at least one diff list is counted in the "_id"
        # totals; the others feed the column printed as "disctinct".
        if any(cl not in data[key] for key in diff_keys):
            identical += 1
        else:
            _stats_bump(occ_non_id, cl)
    return identical


def _stats_print_top(title, occ, occ_non_id=None, keys=None, limit=10):
    """Print the `limit` most frequent classes of occ, framed by blank lines."""
    candidates = occ.keys() if keys is None else keys
    print()
    print(title)
    print()
    for cl in sorted(candidates, key=lambda x: occ[x], reverse=True)[:limit]:
        if occ_non_id is None:
            print(f"{cl:<70} {occ[cl]: >5}")
        else:
            print(f"{cl:<70} {occ[cl]: >5} {occ_non_id.get(cl, 0): >5}")
    print()


def _stats_write_csv(path, rows):
    """Write rows (dicts keyed by _STATS_CSV_FIELDS) to path with a header line."""
    # newline="" is what the csv module requires so that it controls line
    # endings itself (otherwise rows are separated by blank lines on Windows).
    with path.open("w", newline="") as fd:
        writer = csv.DictWriter(fd, fieldnames=_STATS_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(rows)


def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
    """Print class-shadowing statistics and write two CSV summaries.

    Every file of `folder_plat_diff_smali` is read as a JSON report; reports
    whose "apktool-finished" entry is falsy are skipped. For the three
    categories (self collisions, SDK 34 classes, platform non-SDK 34 classes)
    the function counts classes and APKs, prints the proportions, then reads
    aggregate values from the `data` table of the SQLite file `db`.

    Args:
        db: SQLite database containing a `data` table with the columns
            queried below (sha256, min/target SDK versions, nb_* counters).
        out: existing directory receiving "results_50k.csv" (statistics over
            all APKs plus a "Total" row) and "results_only.csv" (statistics
            restricted to APKs where the category counter is >= 1).
        folder_plat_diff_smali: directory of per-APK JSON reports.

    Proportions whose denominator is 0 are reported as 0.0, and SQL
    aggregates that match no row are written as empty CSV cells, instead of
    aborting the whole run.
    """
    occ_sdk34 = {}
    occ_hid34 = {}
    occ_self = {}
    occ_sdk34_non_id = {}
    occ_hid34_non_id = {}
    occ_self_non_id = {}

    nb_sdk_cl_redef = 0
    nb_sdk_cl_id = 0
    nb_app_sdk_cl_redef = 0
    nb_app_sdk_cl_redef_false_pos = 0
    nb_hid_cl_redef = 0
    nb_hid_cl_id = 0
    nb_app_hid_cl_redef = 0
    nb_app_hid_cl_redef_false_pos = 0
    nb_class_self_shadow = 0
    nb_class_self_shadow_id = 0
    nb_app_self_shadow = 0
    nb_app_self_shadow_false_pos = 0

    for file in folder_plat_diff_smali.iterdir():
        with file.open("r") as fd:
            data = json.load(fd)
        if not data["apktool-finished"]:
            continue

        # Classes defined several times inside the APK itself.
        redefined = data["redef_classes"]
        duplicated = data["duplicated_classes"]
        for cl in redefined:
            _stats_bump(occ_self_non_id, cl)
        for cl in duplicated:
            _stats_bump(occ_self, cl)
        if duplicated:
            nb_app_self_shadow += 1
            if not redefined:
                nb_app_self_shadow_false_pos += 1
        nb_class_self_shadow += len(duplicated)
        nb_class_self_shadow_id += len(duplicated) - len(redefined)

        # Classes also present in the SDK 34 list.
        sdk_classes = data["sdk_34_classes"]
        l_nb_sdk_cl_id = _stats_tally(
            data, "sdk_34_classes", _STATS_SDK_DIFF_KEYS, occ_sdk34, occ_sdk34_non_id
        )
        nb_sdk_cl_redef += len(sdk_classes)
        nb_sdk_cl_id += l_nb_sdk_cl_id
        if sdk_classes:
            nb_app_sdk_cl_redef += 1
            if len(sdk_classes) == l_nb_sdk_cl_id:
                nb_app_sdk_cl_redef_false_pos += 1

        # Classes also present in the platform but not in the SDK.
        hid_classes = data["platform_non_sdk_34_classes"]
        l_nb_hid_cl_id = _stats_tally(
            data,
            "platform_non_sdk_34_classes",
            _STATS_HIDDEN_DIFF_KEYS,
            occ_hid34,
            occ_hid34_non_id,
        )
        nb_hid_cl_redef += len(hid_classes)
        nb_hid_cl_id += l_nb_hid_cl_id
        if hid_classes:
            nb_app_hid_cl_redef += 1
            if len(hid_classes) == l_nb_hid_cl_id:
                nb_app_hid_cl_redef_false_pos += 1

    prop_id_self_cl = _stats_percentage(nb_class_self_shadow_id, nb_class_self_shadow)
    prop_id_self_ap = _stats_percentage(
        nb_app_self_shadow_false_pos, nb_app_self_shadow
    )
    prop_id_sdk_cl = _stats_percentage(nb_sdk_cl_id, nb_sdk_cl_redef)
    prop_id_sdk_ap = _stats_percentage(
        nb_app_sdk_cl_redef_false_pos, nb_app_sdk_cl_redef
    )
    prop_id_hid_cl = _stats_percentage(nb_hid_cl_id, nb_hid_cl_redef)
    prop_id_hid_ap = _stats_percentage(
        nb_app_hid_cl_redef_false_pos, nb_app_hid_cl_redef
    )
    tot_prop_id = _stats_percentage(
        nb_class_self_shadow_id + nb_sdk_cl_id + nb_hid_cl_id,
        nb_class_self_shadow + nb_sdk_cl_redef + nb_hid_cl_redef,
    )

    print(
        f"Self classes: {nb_class_self_shadow_id}/{nb_class_self_shadow}: {prop_id_self_cl}%"
    )
    print(
        f"Self apk: {nb_app_self_shadow_false_pos}/{nb_app_self_shadow}: {prop_id_self_ap}%"
    )
    print(f"SDK classes: {nb_sdk_cl_id}/{nb_sdk_cl_redef}: {prop_id_sdk_cl}%")
    print(
        f"SDK apk: {nb_app_sdk_cl_redef_false_pos}/{nb_app_sdk_cl_redef}: {prop_id_sdk_ap}%"
    )
    print(f"Hidden classes: {nb_hid_cl_id}/{nb_hid_cl_redef}: {prop_id_hid_cl}%")
    print(
        f"Hidden apk: {nb_app_hid_cl_redef_false_pos}/{nb_app_hid_cl_redef}: {prop_id_hid_ap}%"
    )

    prop_id_by_method = {
        "Self": prop_id_self_cl,
        "Sdk": prop_id_sdk_cl,
        "Hidden": prop_id_hid_cl,
    }
    total_expr = (
        "nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes"
    )
    data_all = []
    data_only = []

    # The sqlite3 connection context manager only commits/rolls back; the
    # connection has to be closed explicitly.
    con = sqlite3.connect(db)
    try:
        cur = con.cursor()
        nb_apk_all = _stats_scalar(cur, "SELECT COUNT(sha256) FROM data;")
        avg_target_sdk_tot = _stats_scalar(
            cur,
            "SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0;",
        )
        avg_min_sdk_tot = _stats_scalar(
            cur, "SELECT AVG(min_sdk_version) FROM data WHERE min_sdk_version>=0;"
        )
        tot_avg_all = _stats_scalar(cur, f"SELECT AVG( {total_expr}) FROM data;")
        tot_median_all = _stats_scalar(
            cur,
            f"SELECT {total_expr} FROM data ORDER BY {total_expr} "
            "LIMIT 1 OFFSET (SELECT COUNT(*) FROM data) / 2;",
        )

        # `field` only ever takes the three hard-coded column names below, so
        # interpolating it into the SQL text is safe.
        for name, field in (
            ("Self", "nb_duplicate_classes"),
            ("Sdk", "nb_def_sdk_34_classes"),
            ("Hidden", "nb_def_platform_non_sdk_34_classes"),
        ):
            nb_app, avg, avg_min_sdk = cur.execute(
                f"SELECT COUNT(sha256), AVG({field}), AVG(min_sdk_version) FROM data WHERE {field}>=1;"
            ).fetchone()
            avg_target_sdk = _stats_scalar(
                cur,
                f"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND {field}>=1;",
            )
            avg_all = _stats_scalar(cur, f"SELECT AVG({field}) FROM data;")
            median = _stats_scalar(
                cur,
                f"SELECT {field} FROM data WHERE {field}>=1 ORDER BY {field} "
                f"LIMIT 1 OFFSET (SELECT COUNT(*) FROM data WHERE {field}>=1) / 2;",
            )
            median_all = _stats_scalar(
                cur,
                f"SELECT {field} FROM data ORDER BY {field} "
                "LIMIT 1 OFFSET (SELECT COUNT(*) FROM data) / 2;",
            )
            prop_id = prop_id_by_method[name]
            data_only.append(
                {
                    "method": name,
                    "nbapp": nb_app,
                    "avgshadow": _stats_rounded(avg),
                    "median": median,
                    "id": prop_id,
                    "avgtargetsdk": _stats_rounded(avg_target_sdk),
                    "avgminsdk": _stats_rounded(avg_min_sdk),
                    "ratioapp": _stats_percentage(nb_app, nb_apk_all),
                }
            )
            data_all.append(
                {
                    "method": name,
                    "nbapp": nb_apk_all,
                    "avgshadow": _stats_rounded(avg_all),
                    "median": median_all,
                    "id": prop_id,
                    "avgtargetsdk": _stats_rounded(avg_target_sdk_tot),
                    "avgminsdk": _stats_rounded(avg_min_sdk_tot),
                    "ratioapp": _stats_percentage(nb_apk_all, nb_apk_all),
                }
            )
    finally:
        con.close()

    data_all.append(
        {
            "method": "Total",
            "nbapp": nb_apk_all,
            "avgshadow": _stats_rounded(tot_avg_all),
            "median": tot_median_all,
            "id": tot_prop_id,
            "avgtargetsdk": _stats_rounded(avg_target_sdk_tot),
            "avgminsdk": _stats_rounded(avg_min_sdk_tot),
            "ratioapp": _stats_percentage(nb_apk_all, nb_apk_all),
        }
    )

    _stats_write_csv(out / "results_50k.csv", data_all)
    _stats_write_csv(out / "results_only.csv", data_only)

    _stats_print_top(
        f"redefined class SDK {' '*40} occurences disctinct",
        occ_sdk34,
        occ_sdk34_non_id,
    )
    _stats_print_top(
        f"redefined class Hidden {' '*40} occurences disctinct",
        occ_hid34,
        occ_hid34_non_id,
    )
    _stats_print_top(
        f"collision class Self {' '*40} occurences disctinct",
        occ_self,
        occ_self_non_id,
    )
    _stats_print_top(
        f"redefined class Self {' '*40} occurences",
        occ_self_non_id,
    )

    # Same ranking as the first table, restricted to the SDK classes whose
    # first MIN_MAX_SDK entry equals a given value.
    for first_sdk, label in ((7, "<= 7"), (8, "= 8"), (16, "= 16")):
        _stats_print_top(
            f"redefined class SDK {label} {' '*40} occurences disctinct",
            occ_sdk34,
            occ_sdk34_non_id,
            keys=[cl for cl in occ_sdk34 if MIN_MAX_SDK[cl][0] == first_sdk],
        )
|
|
|
|
|
|
def _redef_load_sdk_versions(db: Path):
    """Return two dicts mapping sha256 to min_sdk_version and target_sdk_version."""
    # The sqlite3 connection context manager only handles the transaction,
    # so the connection is closed explicitly.
    con = sqlite3.connect(db)
    try:
        cur = con.cursor()
        min_sdks = dict(cur.execute("SELECT sha256, min_sdk_version FROM data;"))
        targ_sdks = dict(cur.execute("SELECT sha256, target_sdk_version FROM data;"))
    finally:
        con.close()
    return min_sdks, targ_sdks


def _redef_plot_by_sdk(under, over, reference: str, out: Path, stem: str) -> None:
    """Draw the stacked bar chart of `under`/`over` counts and save it as pdf and svg.

    `under` and `over` are lists indexed by SDK level; only indices 7..34 are
    drawn, `over` being stacked on top of `under`.
    """
    levels = ["<=7" if i == 7 else str(i) for i in range(7, 35)]
    plt.figure(figsize=(20, 9), dpi=80)
    plt.bar(
        levels,
        under[7:],
        color="red",
        hatch="x",
        label=f"Class introduced before Apk {reference} SDK",
        bottom=0,
        edgecolor="black",
    )
    plt.bar(
        levels,
        over[7:],
        color="green",
        label=f"Class introducted after Apk {reference} SDK",
        # Stack on the matching "under" series of the same figure.
        bottom=under[7:],
        edgecolor="black",
    )
    plt.legend(loc="upper left")
    plt.ylabel("Nb Classes")
    plt.xlabel("First SDK containing the class")
    plt.savefig(out / f"{stem}.pdf", format="pdf")
    plt.savefig(out / f"{stem}.svg", format="svg")
    plt.show()
    plt.close()


def analyse_sdk_redef(folder: Path, db: Path, out: Path):
    """Relate redefined SDK classes to the min/target SDK of the APK defining them.

    Each file of `folder` is a JSON report named after the sha256 of an APK;
    its "sdk_34_classes" entries are looked up in MIN_MAX_SDK and compared
    with the APK's min and target SDK versions read from the `data` table of
    the SQLite file `db`.

    The ten classes redefined by the most APKs are printed, then two stacked
    bar charts are shown and saved in `out` as
    "redef_sdk_relative_min_sdk.{pdf,svg}" and
    "redef_sdk_relative_targ_sdk.{pdf,svg}".

    Raises:
        KeyError: if a file name is not a sha256 of the `data` table, or a
            class is not a key of MIN_MAX_SDK.
    """
    min_sdks, targ_sdks = _redef_load_sdk_versions(db)

    # Counters indexed by the first element of MIN_MAX_SDK[class] (0..34).
    cls_by_sdk_over_min = [0] * 35
    cls_by_sdk_under_min = [0] * 35
    cls_by_sdk_over_targ = [0] * 35
    cls_by_sdk_under_targ = [0] * 35
    classes_by_app = {}

    for file in folder.iterdir():
        apk_min_sdk = min_sdks[file.name]
        apk_targ_sdk = targ_sdks[file.name]
        with file.open("r") as fp:
            data = json.load(fp)

        for cl in data["sdk_34_classes"]:
            cl_min_sdk, _ = MIN_MAX_SDK[cl]
            if cl_min_sdk < apk_min_sdk:
                cls_by_sdk_under_min[cl_min_sdk] += 1
            else:
                cls_by_sdk_over_min[cl_min_sdk] += 1
            if cl_min_sdk < apk_targ_sdk:
                cls_by_sdk_under_targ[cl_min_sdk] += 1
            else:
                cls_by_sdk_over_targ[cl_min_sdk] += 1
        classes_by_app[file.name] = set(data["sdk_34_classes"])

    # Number of APKs redefining each class (a class counts once per APK).
    classes_occ = {}
    for app_classes in classes_by_app.values():
        for cl in app_classes:
            classes_occ[cl] = classes_occ.get(cl, 0) + 1

    print()
    print("redefined class occurences | min sdk")
    print()
    for cl in sorted(classes_occ, key=lambda x: classes_occ[x], reverse=True)[:10]:
        print(f"{cl:<50} {classes_occ[cl]: >5} | {MIN_MAX_SDK[cl][0]: >2}")

    matplotlib.rcParams.update({"font.size": 22})
    _redef_plot_by_sdk(
        cls_by_sdk_under_min,
        cls_by_sdk_over_min,
        "Min",
        out,
        "redef_sdk_relative_min_sdk",
    )
    # The original stacked these bars on the *min* SDK series (copy-paste
    # slip), which misplaced the green bars of the target-SDK figure.
    _redef_plot_by_sdk(
        cls_by_sdk_under_targ,
        cls_by_sdk_over_targ,
        "Target",
        out,
        "redef_sdk_relative_targ_sdk",
    )
|