Jean-Marie Mineau 2024-11-22 14:45:54 +01:00
parent e51f8e57a9
commit 7a83e118a3
5 changed files with 76 additions and 227 deletions


@@ -292,145 +292,6 @@ def collect_to_db():
def check_smali():
parser = ArgumentParser(
prog="Smalli Check",
description="Check if duplicated classes have distinct smali",
)
parser.add_argument(
"--db",
help="Path to the database storing the results",
type=Path,
required=True,
)
parser.add_argument(
"--out",
help="Path to the file where to store the results",
type=Path,
required=True,
)
parser.add_argument(
"--apktool-jar",
help="Path to the apktool jar file",
type=Path,
required=True,
)
key_parser = parser.add_mutually_exclusive_group(required=False)
key_parser.add_argument(
"--api-key-file",
help="The path to a file containing the Androzoo API key",
type=Path,
)
key_parser.add_argument(
"--api-key", help="The Androzoo API key (Usage NOT recommanded)", type=str
)
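# Note: secretstorage is optional; the --api-key-keyring-id flag is only registered when the module can be imported.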
SECRET_STORAGE_IMPORTED = False
try:
import secretstorage
SECRET_STORAGE_IMPORTED = True
key_parser.add_argument(
"--api-key-keyring-id",
help="The ID of the Androzoo API key in the secret service storage",
type=str,
)
except ModuleNotFoundError:
pass
args = parser.parse_args()
apktool = args.apktool_jar.resolve()
api_key = ""
if args.api_key:
api_key = args.api_key
if args.api_key_file:
with args.api_key_file.open("r") as file:
api_key = file.read().strip()
if SECRET_STORAGE_IMPORTED and not api_key:
if args.api_key_keyring_id:
key_id = args.api_key_keyring_id
else:
key_id = "androzoo"
try:
with secretstorage.dbus_init() as connection:
collection = secretstorage.get_default_collection(connection)
item = next(collection.search_items({"Title": key_id}))
item.unlock()
api_key = item.get_secret().decode("utf-8").strip()
except:
pass
if not api_key:
api_key = getpass(prompt="Androzoo API key: ").strip()
with sqlite3.connect(args.db) as conn:
apks = list(
map(
lambda t: t[0],
conn.execute("SELECT sha256 FROM data WHERE nb_duplicate_classes >= 1"),
)
)
data = {}
for sha256 in apks:
with tempfile.TemporaryDirectory() as tmpdirname:
d = Path(tmpdirname)
apk_bin = download_apk(sha256, api_key, logfile=None)
if apk_bin is None:
continue
with (d / "app.apk").open("wb") as fp:
fp.write(apk_bin)
androguard_apk = APK(str(d / "app.apk"))
with zipfile.ZipFile(io.BytesIO(apk_bin)) as apk:
data[sha256] = {}
entry = analyze(apk, androguard_apk, sha256, json_out=data[sha256])
r = subprocess.run(
[
"java",
"-Xmx8G",
"-jar",
str(apktool),
"d",
"app.apk",
"-o",
"apktool_out",
],
cwd=d,
)
data[sha256]["apktool-finished"] = (r.returncode == 0) and (
d / "apktool_out" / "apktool.yml"
).exists()
smalli_dirs = []
for dex in data[sha256]["class_dex"]:
if dex == "classes.dex":
smalli_dirs.append(d / "apktool_out" / "smali")
else:
smalli_dirs.append(
d / "apktool_out" / ("smali_" + dex.removesuffix(".dex"))
)
dist_dup_classes = set()
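# Compare the smali produced for each duplicated class across every dex of the apk; a class whose copies differ is a genuine redefinition.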
for cl in data[sha256]["duplicated_classes"]:
cl_f = cl.removesuffix(";").removeprefix("L") + ".smali"
smali = None
for cdir in smalli_dirs:
if (cdir / cl_f).exists():
print((cdir / cl_f))
with (cdir / cl_f).open() as file:
smali_new = file.read()
if smali is None:
smali = smali_new
elif smali != smali_new:
dist_dup_classes.add(cl)
data[sha256]["redef_classes"] = list(dist_dup_classes)
if data[sha256]["redef_classes"]:
print(f"{sha256}:")
for c in data[sha256]["redef_classes"]:
print(f" {c}")
else:
print(f"{sha256}: No true redefinition")
with args.out.open("w") as f:
json.dump(data, f)
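Aside (not part of the commit): the class-to-file mapping used above follows apktool's output layout and the dex type-descriptor syntax. A minimal, self-contained sketch; the helper name is illustrative and does not exist in the repository:

from pathlib import Path

def descriptor_to_smali_path(descriptor: str, out_dir: Path, dex: str = "classes.dex") -> Path:
    # Map a dex type descriptor (e.g. "Lcom/example/Foo;") to the smali file apktool
    # would emit for it; helper name and layout assumptions are illustrative only.
    smali_dir = "smali" if dex == "classes.dex" else "smali_" + dex.removesuffix(".dex")
    return out_dir / smali_dir / (descriptor.removeprefix("L").removesuffix(";") + ".smali")

# descriptor_to_smali_path("Lcom/example/Foo;", Path("apktool_out"), "classes2.dex")
# -> apktool_out/smali_classes2/com/example/Foo.smali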
def check_smali_platform():
parser = ArgumentParser(
prog="Smalli Check",
description="Check if duplicated classes are distinct from the actual sources",
@@ -563,6 +424,7 @@ def check_smali_platform():
"apktool_out", "apktool_out",
], ],
cwd=d, cwd=d,
capture_output=True, # just avoid spamming
) )
data["apktool-finished"] = (r.returncode == 0) and ( data["apktool-finished"] = (r.returncode == 0) and (
d / "apktool_out" / "apktool.yml" d / "apktool_out" / "apktool.yml"
@@ -698,27 +560,13 @@ def data_mining():
type=Path,
required=True,
)
-parser.add_argument(
-"--output-dir-def-sdk34-classes",
-help="The directory storing the classes already in SDK 34 redefined by apks",
-type=Path,
-required=True,
-)
-parser.add_argument(
-"--detail-class-redef",
-help="Path to json file outputed by `check-class-redef`",
-type=Path,
-required=True,
-)
parser.add_argument(
"--output-check-platform-redef",
-help="The directory storing the result of smali comparision between platform classes and classes defined in apk (--output-dir of `check-platf-reder`)",
+help="The directory storing the result of smali comparison between platform classes and classes defined in apk (--output-dir of `check-smali`)",
type=Path,
required=True,
)
args = parser.parse_args()
-stats(args.db, args.out, args.output_check_platform_redef, args.detail_class_redef)
+stats(args.db, args.out, args.output_check_platform_redef)
-# analyse_sdk_redef(args.output_check_platform_redef, args.db, args.out)
-if args.output_dir_def_sdk34_classes is not None:
-analyse_sdk_redef(args.output_dir_def_sdk34_classes, args.db, args.out)


@@ -6,6 +6,10 @@ def cmp_smali(sm1: str, sm2: str, sha256_1: str = "", sha256_2: str = "") -> boo
for m in meths_1.keys():
s1 = meths_1[m]
s2 = meths_2[m]
if len(s1) > 1:
print(f"method {m} in {sha256_1} has multiple implementations")
if len(s2) > 1:
print(f"method {m} in {sha256_2} has multiple implementations")
for b1 in s1:
match = False
for b2 in s2:


@@ -9,12 +9,14 @@ from matplotlib import pyplot as plt
import matplotlib
-def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef: Path):
+def stats(db: Path, out: Path, folder_plat_diff_smali: Path):
occ_sdk34 = {}
occ_hid34 = {}
-occ_self_redef = {}
occ_self = {}
occ_sdk34_non_id = {}
occ_hid34_non_id = {}
occ_self_non_id = {}
nb_sdk_cl_redef = 0
nb_sdk_cl_id = 0
@@ -24,9 +26,32 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef:
nb_hid_cl_id = 0
nb_app_hid_cl_redef = 0
nb_app_hid_cl_redef_false_pos = 0
nb_class_self_shadow = 0
nb_class_self_shadow_id = 0
nb_app_self_shadow = 0
nb_app_self_shadow_false_pos = 0
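# Each file in folder_plat_diff_smali is the per-apk JSON result written by the check-smali pass (the --output-dir referenced in data_mining).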
for file in folder_plat_diff_smali.iterdir():
with file.open("r") as fd:
data = json.load(fd)
if not data["apktool-finished"]:
continue
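# Self-shadowing inside the apk: duplicated_classes are names defined more than once; redef_classes are those whose decoded smali actually differs.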
for cl in data["redef_classes"]:
if cl not in occ_self_non_id:
occ_self_non_id[cl] = 0
occ_self_non_id[cl] += 1
for cl in data["duplicated_classes"]:
if cl not in occ_self:
occ_self[cl] = 0
occ_self[cl] += 1
if data["duplicated_classes"]:
nb_app_self_shadow += 1
if data["duplicated_classes"] and not data["redef_classes"]:
nb_app_self_shadow_false_pos += 1
nb_class_self_shadow += len(data["duplicated_classes"])
nb_class_self_shadow_id += len(data["duplicated_classes"]) - len(
data["redef_classes"]
)
l_nb_sdk_cl_id = 0
for cl in data["sdk_34_classes"]:
nb_sdk_cl_redef += 1
@@ -47,6 +72,10 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef:
]
):
l_nb_sdk_cl_id += 1
else:
if cl not in occ_sdk34_non_id:
occ_sdk34_non_id[cl] = 0
occ_sdk34_non_id[cl] += 1
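# occ_sdk34_non_id counts, per class name, the redefinitions whose smali differs from the platform copy.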
nb_sdk_cl_id += l_nb_sdk_cl_id
if data["sdk_34_classes"]:
nb_app_sdk_cl_redef += 1
@@ -70,6 +99,10 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef:
]
):
l_nb_hid_cl_id += 1
else:
if cl not in occ_hid34_non_id:
occ_hid34_non_id[cl] = 0
occ_hid34_non_id[cl] += 1
nb_hid_cl_id += l_nb_hid_cl_id
if data["platform_non_sdk_34_classes"]:
nb_app_hid_cl_redef += 1
@@ -78,29 +111,6 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef:
and len(data["platform_non_sdk_34_classes"]) == l_nb_hid_cl_id
):
nb_app_hid_cl_redef_false_pos += 1
-nb_class_self_shadow = 0
-nb_class_self_shadow_id = 0
-nb_app_self_shadow = 0
-nb_app_self_shadow_false_pos = 0
-with detail_class_redef.open("r") as fd:
-data = json.load(fd)
-for v in data.values():
-for cl in v["redef_classes"]:
-if cl not in occ_self_redef:
-occ_self_redef[cl] = 0
-occ_self_redef[cl] += 1
-for cl in v["duplicated_classes"]:
-if cl not in occ_self:
-occ_self[cl] = 0
-occ_self[cl] += 1
-if v["duplicated_classes"]:
-nb_app_self_shadow += 1
-if v["duplicated_classes"] and not v["redef_classes"]:
-nb_app_self_shadow_false_pos += 1
-nb_class_self_shadow += len(v["duplicated_classes"])
-nb_class_self_shadow_id += len(v["duplicated_classes"]) - len(
-v["redef_classes"]
-)
prop_id_self_cl = round(100 * nb_class_self_shadow_id / nb_class_self_shadow, 2)
prop_id_self_ap = round(100 * nb_app_self_shadow_false_pos / nb_app_self_shadow, 2)
@@ -110,10 +120,11 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef:
prop_id_hid_ap = round(100 * nb_app_hid_cl_redef_false_pos / nb_app_hid_cl_redef, 2)
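# Overall share of identical redefinitions: 100 * (self_id + sdk_id + hid_id) / (self_total + sdk_total + hid_total).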
tot_prop_id = round(
100
-* (nb_class_self_shadow_id + nb_sdk_cl_id + 100 * nb_hid_cl_id)
+* (nb_class_self_shadow_id + nb_sdk_cl_id + nb_hid_cl_id)
/ (nb_class_self_shadow + nb_sdk_cl_redef + nb_hid_cl_redef),
2,
)
print(
f"Self classes: {nb_class_self_shadow_id}/{nb_class_self_shadow}: {prop_id_self_cl}%"
)
@@ -260,80 +271,66 @@ def stats(db: Path, out: Path, folder_plat_diff_smali: Path, detail_class_redef:
# occ_self = {}
print()
print( print(f"redefined class SDK {' '*40} occurences disctinct")
"redefined class SDK occurences"
)
print() print()
for cl in sorted(occ_sdk34.keys(), key=lambda x: occ_sdk34[x], reverse=True)[:10]: for cl in sorted(occ_sdk34.keys(), key=lambda x: occ_sdk34[x], reverse=True)[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5}") print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print() print()
print() print()
print( print(f"redefined class Hidden {' '*40} occurences disctinct")
"redefined class Hidden occurences"
)
print() print()
for cl in sorted(occ_hid34.keys(), key=lambda x: occ_hid34[x], reverse=True)[:10]: for cl in sorted(occ_hid34.keys(), key=lambda x: occ_hid34[x], reverse=True)[:10]:
print(f"{cl:<70} {occ_hid34[cl]: >5}") print(f"{cl:<70} {occ_hid34[cl]: >5} {occ_hid34_non_id.get(cl, 0): >5}")
print() print()
print() print()
print( print(f"collision class Self {' '*40} occurences disctinct")
"collision class Self occurences"
)
print() print()
for cl in sorted(occ_self.keys(), key=lambda x: occ_self[x], reverse=True)[:10]: for cl in sorted(occ_self.keys(), key=lambda x: occ_self[x], reverse=True)[:10]:
print(f"{cl:<70} {occ_self[cl]: >5}") print(f"{cl:<70} {occ_self[cl]: >5} {occ_self_non_id.get(cl, 0): >5}")
print() print()
print() print()
print( print(f"redefined class Self {' '*40} occurences")
"redefined class Self occurences"
)
print() print()
for cl in sorted( for cl in sorted(
occ_self_redef.keys(), key=lambda x: occ_self_redef[x], reverse=True occ_self_non_id.keys(), key=lambda x: occ_self_non_id[x], reverse=True
)[:10]: )[:10]:
print(f"{cl:<70} {occ_self_redef[cl]: >5}") print(f"{cl:<70} {occ_self_non_id[cl]: >5}")
print() print()
print() print()
print( print(f"redefined class SDK <= 7 {' '*40} occurences disctinct")
"redefined class SDK <= 7 occurences"
)
print() print()
for cl in sorted( for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()), filter(lambda cl: MIN_MAX_SDK[cl][0] == 7, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x], key=lambda x: occ_sdk34[x],
reverse=True, reverse=True,
)[:10]: )[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5}") print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print() print()
print() print()
print( print(f"redefined class SDK = 8 {' '*40} occurences disctinct")
"redefined class SDK = 8 occurences"
)
print() print()
for cl in sorted( for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()), filter(lambda cl: MIN_MAX_SDK[cl][0] == 8, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x], key=lambda x: occ_sdk34[x],
reverse=True, reverse=True,
)[:10]: )[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5}") print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print() print()
print() print()
print( print(f"redefined class SDK = 16 {' '*40} occurences disctinct")
"redefined class SDK = 16 occurences"
)
print() print()
for cl in sorted( for cl in sorted(
filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()), filter(lambda cl: MIN_MAX_SDK[cl][0] == 16, occ_sdk34.keys()),
key=lambda x: occ_sdk34[x], key=lambda x: occ_sdk34[x],
reverse=True, reverse=True,
)[:10]: )[:10]:
print(f"{cl:<70} {occ_sdk34[cl]: >5}") print(f"{cl:<70} {occ_sdk34[cl]: >5} {occ_sdk34_non_id.get(cl, 0): >5}")
print() print()
@@ -360,18 +357,19 @@ def analyse_sdk_redef(folder: Path, db: Path, out: Path):
apk_min_sdk = min_sdks[file.name]
apk_targ_sdk = targ_sdks[file.name]
with file.open("r") as fp:
-for cl in fp:
-if cl.strip():
-cls.add(cl.strip())
-cl_min_sdk, _ = MIN_MAX_SDK[cl.strip()]
+data = json.load(fp)
+for cl in data["sdk_34_classes"]:
+cls.add(cl)
+cl_min_sdk, _ = MIN_MAX_SDK[cl]
if cl_min_sdk < apk_min_sdk:
cls_by_sdk_under_min[cl_min_sdk] += 1
else:
cls_by_sdk_over_min[cl_min_sdk] += 1
if cl_min_sdk < apk_targ_sdk:
cls_by_sdk_under_targ[cl_min_sdk] += 1
else:
cls_by_sdk_over_targ[cl_min_sdk] += 1
classes_by_app[file.name] = cls
classes_occ = {}

platforms.zip (BIN, stored with Git LFS)
Binary file not shown.


@@ -22,6 +22,5 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
scan = 'android_class_shadowing_scanner.__init__:main'
collect-scan = 'android_class_shadowing_scanner.__init__:collect_to_db'
-check-class-redef = 'android_class_shadowing_scanner.__init__:check_smali'
-check-platf-reder = 'android_class_shadowing_scanner.__init__:check_smali_platform'
+check-smali = 'android_class_shadowing_scanner.__init__:check_smali'
data-mining = 'android_class_shadowing_scanner.__init__:data_mining'