keep tweaking

This commit is contained in:
Jean-Marie Mineau 2024-11-26 14:58:00 +01:00
parent fc38321d09
commit cea77c2fc3
3 changed files with 175 additions and 27 deletions

View file

@ -571,4 +571,4 @@ def data_mining():
args = parser.parse_args()
stats(args.db, args.out, args.output_check_platform_redef)
# analyse_sdk_redef(args.output_check_platform_redef, args.db, args.out)
analyse_sdk_redef(args.output_check_platform_redef, args.db, args.out)

View file

@ -50,8 +50,9 @@ def get_methods(sm: str, sha256: str = "") -> dict[str, list[list[str]]]:
if (
current_meth is not None
and striped
and not striped.startswith(".line ")
and not striped.startswith(".param ")
# and not striped.startswith(".line ")
# and not striped.startswith(".param ")
and not striped.startswith(".")
):
current_body.append(striped)
if striped.startswith(".method "):

View file

@ -10,6 +10,7 @@ import matplotlib
def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
VT_THRESH = 3
occ_sdk34 = {}
occ_hid34 = {}
@ -143,6 +144,13 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
with sqlite3.connect(db) as con:
cur = con.cursor()
nb_apk_all = cur.execute("SELECT COUNT(sha256) FROM data;").fetchone()[0]
nb_mal_any = cur.execute(
f"SELECT COUNT(sha256) FROM data WHERE vt_detection>={VT_THRESH} AND "
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);"
).fetchone()[0]
nb_mal_all = cur.execute(
f"SELECT COUNT(sha256) FROM data WHERE vt_detection>={VT_THRESH};"
).fetchone()[0]
avg_target_sdk_tot = cur.execute(
"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0;"
).fetchone()[0]
@ -162,6 +170,37 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
"LIMIT 1 "
"OFFSET (SELECT COUNT(*) FROM data) / 2;"
).fetchone()[0]
nb_apk_any = cur.execute(
"SELECT COUNT(sha256) FROM data WHERE "
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);"
).fetchone()[0]
avg_target_sdk_any = cur.execute(
"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND"
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);"
).fetchone()[0]
avg_min_sdk_any = cur.execute(
"SELECT AVG(min_sdk_version) FROM data WHERE min_sdk_version>=0 AND "
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);"
).fetchone()[0]
tot_avg_any = cur.execute(
"SELECT AVG("
" nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes"
") FROM data WHERE "
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1);"
).fetchone()[0]
tot_median_any = cur.execute(
"SELECT "
" nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes "
"FROM data WHERE "
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1) "
"ORDER BY "
" nb_duplicate_classes+nb_def_sdk_34_classes+nb_def_platform_non_sdk_34_classes "
"LIMIT 1 "
"OFFSET ("
" SELECT COUNT(*) FROM data WHERE "
" (nb_duplicate_classes>=1 OR nb_def_sdk_34_classes>=1 OR nb_def_platform_non_sdk_34_classes>=1) "
") / 2;"
).fetchone()[0]
data_all = []
data_only = []
for name, field in [
@ -172,6 +211,9 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
nb_app, avg, avg_min_sdk = cur.execute(
f"SELECT COUNT(sha256), AVG({field}), AVG(min_sdk_version) FROM data WHERE {field}>=1;"
).fetchone()
nb_mal = cur.execute(
f"SELECT COUNT(sha256) FROM data WHERE {field}>=1 AND vt_detection>={VT_THRESH};"
).fetchone()[0]
avg_target_sdk = cur.execute(
f"SELECT AVG(target_sdk_version) FROM data WHERE target_sdk_version>=0 AND {field}>=1;"
).fetchone()[0]
@ -202,6 +244,7 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
"avgtargetsdk": round(avg_target_sdk, 1),
"avgminsdk": round(avg_min_sdk, 1),
"ratioapp": round(100 * nb_app / nb_apk_all, 2),
"ratiomal": round(100 * nb_mal / nb_app, 2),
}
)
data_all.append(
@ -214,9 +257,23 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
"avgtargetsdk": round(avg_target_sdk_tot, 1),
"avgminsdk": round(avg_min_sdk_tot, 1),
"ratioapp": round(100 * nb_apk_all / nb_apk_all, 2),
"ratiomal": round(100 * nb_mal_all / nb_apk_all, 2),
}
)
data_only.append(
{
"method": "Total",
"nbapp": nb_apk_any,
"avgshadow": round(tot_avg_any, 1),
"median": tot_median_any,
"id": tot_prop_id,
"avgtargetsdk": round(avg_target_sdk_any, 1),
"avgminsdk": round(avg_min_sdk_any, 1),
"ratioapp": round(100 * nb_apk_any / nb_apk_all, 2),
"ratiomal": round(100 * nb_mal_any / nb_apk_any, 2),
}
)
data_all.append(
{
"method": "Total",
@ -227,9 +284,15 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
"avgtargetsdk": round(avg_target_sdk_tot, 1),
"avgminsdk": round(avg_min_sdk_tot, 1),
"ratioapp": round(100 * nb_apk_all / nb_apk_all, 2),
"ratiomal": round(100 * nb_mal_all / nb_apk_all, 2),
}
)
print(f"NB MALWARE: {nb_mal_all} ({round(100*nb_mal_all/nb_apk_all, 2)}%)")
print(
f"NB MALWARE USING SHADOWING: {nb_mal_any} ({round(100*nb_mal_any/nb_apk_any, 2)}%)"
)
with (out / "results_50k.csv").open("w") as fd:
writer = csv.DictWriter(
fd,
@ -242,6 +305,7 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
"avgtargetsdk",
"avgminsdk",
"ratioapp",
"ratiomal",
],
)
writer.writeheader()
@ -259,6 +323,7 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
"avgtargetsdk",
"avgminsdk",
"ratioapp",
"ratiomal",
],
)
writer.writeheader()
@ -300,38 +365,120 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
print(f"{cl:<70} {occ_self_non_id[cl]: >5}")
print()
print()
print(f"redefined class SDK <= 7 {' '*40} occurences disctinct")
print()
for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x],
reverse=True,
)[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print()
with (out / "redef_sdk_7minus.csv").open("w") as fd:
writer = csv.DictWriter(
fd,
fieldnames=[
"class",
"occ",
"id",
"idper",
],
)
writer.writeheader()
print()
print(
f"redefined class SDK <= 7 {' '*40} occurences identique identique%"
)
print()
for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x],
reverse=True,
)[:10]:
occ = occ_sdk34[cl]
id = occ - occ_sdk34_non_id.get(cl, 0)
id_per = round((100 * id) / occ, 2)
print(f"{cl:<70} {occ: >5} {id: >5} {id_per: >5}")
writer.writerow({"class": cl, "occ": occ, "id": id, "idper": id_per})
print()
with (out / "redef_sdk_8.csv").open("w") as fd:
writer = csv.DictWriter(
fd,
fieldnames=[
"class",
"occ",
"id",
"idper",
],
)
writer.writeheader()
print()
print(
f"redefined class SDK = 8 {' '*40} occurences identique identique%"
)
print()
for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x],
reverse=True,
)[:10]:
occ = occ_sdk34[cl]
id = occ - occ_sdk34_non_id.get(cl, 0)
id_per = round((100 * id) / occ, 2)
print(f"{cl:<70} {occ: >5} {id: >5} {id_per: >5}")
writer.writerow({"class": cl, "occ": occ, "id": id, "idper": id_per})
print()
with (out / "redef_sdk_16.csv").open("w") as fd:
writer = csv.DictWriter(
fd,
fieldnames=[
"class",
"occ",
"id",
"idper",
],
)
writer.writeheader()
print()
print(
f"redefined class SDK = 16 {' '*40} occurences identique identique%"
)
print()
for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x],
reverse=True,
)[:10]:
occ = occ_sdk34[cl]
id = occ - occ_sdk34_non_id.get(cl, 0)
id_per = round((100 * id) / occ, 2)
print(f"{cl:<70} {occ: >5} {id: >5} {id_per: >5}")
writer.writerow({"class": cl, "occ": occ, "id": id, "idper": id_per})
print()
occ_package_hid34 = {}
for cl, occ in occ_hid34.items():
package = ".".join(cl.removeprefix("L").removesuffix(";").split("/")[:-1])
if package not in occ_package_hid34:
occ_package_hid34[package] = 0
occ_package_hid34[package] += occ
occ_package_hid34_non_id = {}
for cl, occ in occ_hid34_non_id.items():
package = ".".join(cl.removeprefix("L").removesuffix(";").split("/")[:-1])
if package not in occ_package_hid34_non_id:
occ_package_hid34_non_id[package] = 0
occ_package_hid34_non_id[package] += occ
print()
print(f"redefined class SDK = 8 {' '*40} occurences disctinct")
print()
for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x],
print("top 10 hidden package")
for pk in sorted(
occ_package_hid34.keys(),
key=lambda x: occ_package_hid34[x],
reverse=True,
)[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print(f" {pk:<70} {occ_package_hid34[pk]}")
print()
print()
print(f"redefined class SDK = 16 {' '*40} occurences disctinct")
print()
for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x],
print("top 10 hidden package non id")
for pk in sorted(
occ_package_hid34_non_id.keys(),
key=lambda x: occ_package_hid34_non_id[x],
reverse=True,
)[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print()
print(f" {pk:<70} {occ_package_hid34_non_id[pk]}")
def analyse_sdk_redef(folder: Path, db: Path, out: Path):