This commit is contained in:
Jean-Marie Mineau 2024-11-19 14:32:24 +01:00
parent c46e2ad096
commit ad789abc7b
3 changed files with 334 additions and 10 deletions

View file

@ -658,7 +658,7 @@ def check_smali_platform():
def data_mining():
# use plt and numpy
# those libs are iffy on the server so let's not import them when not needed
from .data_mining import analyse_sdk_redef
from .data_mining import analyse_sdk_redef, stats
parser = ArgumentParser(
prog="Data Mining",
@ -668,15 +668,35 @@ def data_mining():
"--db",
help="Path to the database storing the results",
type=Path,
required=True,
)
parser.add_argument(
"--out",
help="Directory where to store results",
type=Path,
required=True,
)
parser.add_argument(
"--output-dir-def-sdk34-classes",
help="The directory storing the classes already in SDK 34 redefined by apks",
type=Path,
required=True,
)
parser.add_argument(
"--detail-class-redef",
help="Path to json file outputed by `check-class-redef`",
type=Path,
required=True,
)
parser.add_argument(
"--output-check-platform-redef",
help="The directory storing the result of smali comparision between platform classes and classes defined in apk (--output-dir of `check-platf-reder`)",
type=Path,
required=True,
)
args = parser.parse_args()
if args.db is not None:
pass
stats(args.db, args.out, args.output_check_platform_redef, args.detail_class_redef)
if args.output_dir_def_sdk34_classes is not None:
analyse_sdk_redef(args.output_dir_def_sdk34_classes)
analyse_sdk_redef(args.output_dir_def_sdk34_classes, args.db, args.out)

View file

@ -1,17 +1,274 @@
import sqlite3
import csv
import json
from pathlib import Path
from .platform_classes import MIN_MAX_SDK
from matplotlib import pyplot as plt
def analyse_sdk_redef(folder: Path):
def _percent(part, whole) -> float:
    """Return ``100 * part / whole`` rounded to 2 decimals, or 0.0 if *whole* is 0."""
    if not whole:
        return 0.0
    return round(100 * part / whole, 2)


def _round1(value):
    """Round *value* to 1 decimal; ``None`` (SQL NULL, e.g. AVG over no rows) passes through."""
    return None if value is None else round(value, 1)


def _scalar(cur, query: str):
    """Run *query* and return the first column of its first row, or ``None`` if no row."""
    row = cur.execute(query).fetchone()
    return None if row is None else row[0]


def _count_identical(data, classes, diff_keys) -> int:
    """Count the classes that are absent from at least one ``data[key]`` diff list."""
    return sum(
        1 for cl in classes if any(cl not in data[key] for key in diff_keys)
    )


def _write_stats_csv(path, rows) -> None:
    """Write *rows* (list of dicts) to *path* with the fixed result columns."""
    fieldnames = [
        "method",
        "nbapp",
        "avgshadow",
        "median",
        "id",
        "avgtargetsdk",
        "avgminsdk",
        "ratioapp",
    ]
    # newline="" is what the csv module expects; it avoids doubled line
    # endings on platforms that translate "\n".
    with path.open("w", newline="") as fd:
        writer = csv.DictWriter(fd, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def stats(
    db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: Path
) -> None:
    """Print redefinition statistics and write two CSV summaries into *out*.

    Args:
        db: SQLite database containing a ``data`` table (columns read here:
            ``sha256``, ``target_sdk_version``, ``min_sdk_version``,
            ``nb_duplicate_classes``, ``nb_def_sdk_34_classes``,
            ``nb_def_platform_non_sdk_34_classes``).
        out: Existing directory receiving ``results_50k.csv`` (one row per
            method computed over every apk, plus a ``Total`` row) and
            ``results_only.csv`` (one row per method, restricted to apks
            where the matching counter is >= 1).
        folder_plat_diff_smali: Directory whose entries are all JSON reports
            with the keys ``sdk_34_classes``, ``platform_non_sdk_34_classes``
            and the ``{sdk,platform}_{32,33,34}_diff_smalli`` lists.
        detail_class_redef: JSON file mapping to objects holding
            ``duplicated_classes`` and ``redef_classes`` lists.

    A class is counted as "identical" when it is missing from at least one of
    the relevant diff lists. Percentages whose denominator is zero are
    reported as 0.0 and averages/medians over no rows as empty CSV cells,
    instead of raising.
    """
    sdk_diff_keys = [
        "sdk_34_diff_smalli",
        "platform_34_diff_smalli",
        "sdk_33_diff_smalli",
        "platform_33_diff_smalli",
        "sdk_32_diff_smalli",
        "platform_32_diff_smalli",
    ]
    hidden_diff_keys = [
        "platform_34_diff_smalli",
        "platform_33_diff_smalli",
        "platform_32_diff_smalli",
    ]

    nb_sdk_cl_redef = 0
    nb_sdk_cl_id = 0
    nb_app_sdk_cl_redef = 0
    nb_app_sdk_cl_redef_false_pos = 0
    nb_hid_cl_redef = 0
    nb_hid_cl_id = 0
    nb_app_hid_cl_redef = 0
    nb_app_hid_cl_redef_false_pos = 0
    for file in folder_plat_diff_smali.iterdir():
        with file.open("r") as fd:
            data = json.load(fd)

        sdk_classes = data["sdk_34_classes"]
        l_nb_sdk_cl_id = _count_identical(data, sdk_classes, sdk_diff_keys)
        nb_sdk_cl_redef += len(sdk_classes)
        nb_sdk_cl_id += l_nb_sdk_cl_id
        if sdk_classes:
            nb_app_sdk_cl_redef += 1
            # Every redefined class is identical: the whole app is a false positive.
            if len(sdk_classes) == l_nb_sdk_cl_id:
                nb_app_sdk_cl_redef_false_pos += 1

        hid_classes = data["platform_non_sdk_34_classes"]
        l_nb_hid_cl_id = _count_identical(data, hid_classes, hidden_diff_keys)
        nb_hid_cl_redef += len(hid_classes)
        nb_hid_cl_id += l_nb_hid_cl_id
        if hid_classes:
            nb_app_hid_cl_redef += 1
            if len(hid_classes) == l_nb_hid_cl_id:
                nb_app_hid_cl_redef_false_pos += 1

    nb_class_self_shadow = 0
    nb_class_self_shadow_id = 0
    nb_app_self_shadow = 0
    nb_app_self_shadow_false_pos = 0
    with detail_class_redef.open("r") as fd:
        data = json.load(fd)
    for v in data.values():
        duplicated = v["duplicated_classes"]
        redefined = v["redef_classes"]
        if duplicated:
            nb_app_self_shadow += 1
            if not redefined:
                nb_app_self_shadow_false_pos += 1
        nb_class_self_shadow += len(duplicated)
        nb_class_self_shadow_id += len(duplicated) - len(redefined)

    prop_id_self_cl = _percent(nb_class_self_shadow_id, nb_class_self_shadow)
    prop_id_self_ap = _percent(nb_app_self_shadow_false_pos, nb_app_self_shadow)
    prop_id_sdk_cl = _percent(nb_sdk_cl_id, nb_sdk_cl_redef)
    prop_id_sdk_ap = _percent(nb_app_sdk_cl_redef_false_pos, nb_app_sdk_cl_redef)
    prop_id_hid_cl = _percent(nb_hid_cl_id, nb_hid_cl_redef)
    prop_id_hid_ap = _percent(nb_app_hid_cl_redef_false_pos, nb_app_hid_cl_redef)
    # All three counters are plain class counts; the hidden-class counter must
    # not be scaled by 100 a second time.
    tot_prop_id = _percent(
        nb_class_self_shadow_id + nb_sdk_cl_id + nb_hid_cl_id,
        nb_class_self_shadow + nb_sdk_cl_redef + nb_hid_cl_redef,
    )

    print(
        f"Self classes: {nb_class_self_shadow_id}/{nb_class_self_shadow}: {prop_id_self_cl}%"
    )
    print(
        f"Self apk: {nb_app_self_shadow_false_pos}/{nb_app_self_shadow}: {prop_id_self_ap}%"
    )
    print(f"SDK classes: {nb_sdk_cl_id}/{nb_sdk_cl_redef}: {prop_id_sdk_cl}%")
    print(
        f"SDK apk: {nb_app_sdk_cl_redef_false_pos}/{nb_app_sdk_cl_redef}: {prop_id_sdk_ap}%"
    )
    print(f"Hidden classes: {nb_hid_cl_id}/{nb_hid_cl_redef}: {prop_id_hid_cl}%")
    print(
        f"Hidden apk: {nb_app_hid_cl_redef_false_pos}/{nb_app_hid_cl_redef}: {prop_id_hid_ap}%"
    )

    # (row label, counter column in `data`, share of identical classes).
    # The column names are hard-coded constants, so interpolating them into
    # the SQL text below does not expose the queries to external input.
    methods = [
        ("Self", "nb_duplicate_classes", prop_id_self_cl),
        ("Sdk", "nb_def_sdk_34_classes", prop_id_sdk_cl),
        ("Hidden", "nb_def_platform_non_sdk_34_classes", prop_id_hid_cl),
    ]
    total_expr = (
        " nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes"
    )

    data_all = []
    data_only = []
    # sqlite3's connection context manager only manages the transaction, it
    # does not close the connection; close it explicitly (read-only queries).
    con = sqlite3.connect(db)
    try:
        cur = con.cursor()
        nb_apk_all = _scalar(cur, "SELECT COUNT(sha256) FROM data;")
        avg_target_sdk_tot = _scalar(
            cur,
            "SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0;",
        )
        avg_min_sdk_tot = _scalar(
            cur, "SELECT AVG(min_sdk_version) FROM data WHERE min_sdk_version>=0;"
        )
        tot_avg_all = _scalar(cur, f"SELECT AVG({total_expr}) FROM data;")
        # Median = value at offset COUNT/2 of the sorted column.
        tot_median_all = _scalar(
            cur,
            f"SELECT {total_expr} FROM data ORDER BY {total_expr} "
            "LIMIT 1 OFFSET (SELECT COUNT(*) FROM data) / 2;",
        )

        for name, field, prop_id in methods:
            nb_app, avg, avg_min_sdk = cur.execute(
                f"SELECT COUNT(sha256), AVG({field}), AVG(min_sdk_version) FROM data WHERE {field}>=1;"
            ).fetchone()
            avg_target_sdk = _scalar(
                cur,
                f"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND {field}>=1;",
            )
            avg_all = _scalar(cur, f"SELECT AVG({field}) FROM data;")
            median = _scalar(
                cur,
                f"SELECT {field} FROM data WHERE {field}>=1 ORDER BY {field} "
                f"LIMIT 1 OFFSET (SELECT COUNT(*) FROM data WHERE {field}>=1) / 2;",
            )
            median_all = _scalar(
                cur,
                f"SELECT {field} FROM data ORDER BY {field} "
                "LIMIT 1 OFFSET (SELECT COUNT(*) FROM data) / 2;",
            )
            data_only.append(
                {
                    "method": name,
                    "nbapp": nb_app,
                    "avgshadow": _round1(avg),
                    "median": median,
                    "id": prop_id,
                    "avgtargetsdk": _round1(avg_target_sdk),
                    "avgminsdk": _round1(avg_min_sdk),
                    "ratioapp": _percent(nb_app, nb_apk_all),
                }
            )
            data_all.append(
                {
                    "method": name,
                    "nbapp": nb_apk_all,
                    "avgshadow": _round1(avg_all),
                    "median": median_all,
                    "id": prop_id,
                    "avgtargetsdk": _round1(avg_target_sdk_tot),
                    "avgminsdk": _round1(avg_min_sdk_tot),
                    "ratioapp": _percent(nb_apk_all, nb_apk_all),
                }
            )
    finally:
        con.close()

    data_all.append(
        {
            "method": "Total",
            "nbapp": nb_apk_all,
            "avgshadow": _round1(tot_avg_all),
            "median": tot_median_all,
            "id": tot_prop_id,
            "avgtargetsdk": _round1(avg_target_sdk_tot),
            "avgminsdk": _round1(avg_min_sdk_tot),
            "ratioapp": _percent(nb_apk_all, nb_apk_all),
        }
    )

    _write_stats_csv(out / "results_50k.csv", data_all)
    _write_stats_csv(out / "results_only.csv", data_only)
def analyse_sdk_redef(folder: Path, db: Path, out: Path):
with sqlite3.connect(db) as con:
cur = con.cursor()
min_sdks = {
sha256: sdk
for sha256, sdk in cur.execute("SELECT sha256, min_sdk_version FROM data;")
}
targ_sdks = {
sha256: sdk
for sha256, sdk in cur.execute(
"SELECT sha256, target_sdk_version FROM data;"
)
}
cls_by_sdk_over_min = [0 for _ in range(35)]
cls_by_sdk_under_min = [0 for _ in range(35)]
cls_by_sdk_over_targ = [0 for _ in range(35)]
cls_by_sdk_under_targ = [0 for _ in range(35)]
classes_by_app = {}
for file in folder.iterdir():
cls = set()
apk_min_sdk = min_sdks[file.name]
apk_targ_sdk = targ_sdks[file.name]
with file.open("r") as fp:
for cl in fp:
if cl.strip():
cls.add(cl.strip())
cl_min_sdk, _ = MIN_MAX_SDK[cl.strip()]
if cl_min_sdk < apk_min_sdk:
cls_by_sdk_under_min[cl_min_sdk] += 1
else:
cls_by_sdk_over_min[cl_min_sdk] += 1
if cl_min_sdk < apk_targ_sdk:
cls_by_sdk_under_targ[cl_min_sdk] += 1
else:
cls_by_sdk_over_targ[cl_min_sdk] += 1
classes_by_app[file.name] = cls
classes_occ = {}
@ -31,5 +288,50 @@ def analyse_sdk_redef(folder: Path):
cls_by_sdk = [0 for _ in range(35)]
for cl, n in classes_occ.items():
cls_by_sdk[MIN_MAX_SDK[cl][0]] += n
plt.bar([i for i in range(7, 35)], cls_by_sdk[7:], bottom=0)
plt.figure(figsize=(20, 9), dpi=80)
plt.bar(
["<=7" if i == 7 else str(i) for i in range(7, 35)],
cls_by_sdk_under_min[7:],
color="red",
hatch="x",
label="Class introduced before Apk Min SDK",
bottom=0,
edgecolor="black",
)
plt.bar(
["<=7" if i == 7 else str(i) for i in range(7, 35)],
cls_by_sdk_over_min[7:],
color="green",
label="Class introducted after Apk Min SDK",
bottom=cls_by_sdk_under_min[7:],
edgecolor="black",
)
plt.legend(loc="upper left")
plt.savefig(out / "redef_sdk_relative_min_sdk.pdf", format="pdf")
plt.savefig(out / "redef_sdk_relative_min_sdk.svg", format="svg")
plt.show()
plt.close()
# Second figure: redefined SDK classes per SDK level, split by whether the
# class's SDK level is below the apk's target SDK (same layout as the
# min-SDK figure).
plt.figure(figsize=(20, 9), dpi=80)
plt.bar(
    ["<=7" if i == 7 else str(i) for i in range(7, 35)],
    cls_by_sdk_under_targ[7:],
    color="red",
    hatch="x",
    label="Class introduced before Apk Target SDK",
    bottom=0,
    edgecolor="black",
)
plt.bar(
    ["<=7" if i == 7 else str(i) for i in range(7, 35)],
    cls_by_sdk_over_targ[7:],
    color="green",
    label="Class introducted after Apk Target SDK",
    # Stack on the target-SDK "under" series drawn just above, not on the
    # min-SDK series of the other figure, so each column's height is the
    # sum of the two target-SDK counters.
    bottom=cls_by_sdk_under_targ[7:],
    edgecolor="black",
)
plt.legend(loc="upper left")
plt.savefig(out / "redef_sdk_relative_targ_sdk.pdf", format="pdf")
plt.savefig(out / "redef_sdk_relative_targ_sdk.svg", format="svg")
plt.show()

View file

@ -8,6 +8,7 @@ DB="${SCRIPT_DIR}/data/app-2023-xp4.db"
LIST=$(mktemp)
APKTOOL="${SCRIPT_DIR}/apktool.jar"
ANDROZOO_KEY="${SCRIPT_DIR}/.ZOO_KEY"
OUT_DIR="app-2023-xp5.out"
app_lst=(
'00'
@ -32,6 +33,7 @@ app_lst=(
'19'
)
mkdir -p "${OUT_DIR}"
unzip platforms.zip -d "${PLATFORM_DIR}"
for ad in "${PLATFORM_DIR}"/**/{platform,sdk}; do
@ -46,8 +48,8 @@ cd "${WD}"
sqlite3 ${DB} 'SELECT sha256 FROM data WHERE nb_def_platform_32_classes >= 1 OR nb_def_platform_33_classes >= 1 OR nb_def_platform_34_classes >= 1;' > "${LIST}"
N_CHUNK=$(python3 -c "print($(cat ${LIST} | wc -l)//20 + 1)")
rm -r ./app-2023-exp4
mkdir ./app-2023-exp4
rm -r ./app-2023-exp5
mkdir ./app-2023-exp5
# Write the chunks into the directory that was just (re)created and that the
# workers read from; the old exp4 directory is no longer prepared here.
split -a 2 -d -l "${N_CHUNK}" "${LIST}" ./app-2023-exp5/
worker () {
@ -58,8 +60,8 @@ worker () {
}
for lst in ${app_lst[@]}; do
worker "./app-2023-exp4/${lst}" &
echo 1
worker "./app-2023-exp5/${lst}" &
sleep 1
done
echo 'PROCESS LAUNCHED'