"""Helpers to retrieve apks (from AndroZoo or the local file system) and cache their metadata as json files."""
import csv
import hashlib
import json
import logging
import shutil
import tempfile
from enum import Enum
from pathlib import Path
from typing import Any, Optional

import requests
from androguard.core.bytecodes import apk as androguard_apk  # type: ignore

from utils import sha256_sum
|
|
|
|
# Folder (next to this module) where the information about each apk is cached,
# one `<SHA256>.json` file per apk.
APK_INFO_FOLDER = Path(__file__).parent / "apk_info"
# `exist_ok=True` makes the creation idempotent and removes the window between
# "check it exists" and "create it" in which a concurrent import could make
# `mkdir()` fail. A FileExistsError is still raised if the path exists but is
# not a directory, which surfaces a broken setup at import time.
APK_INFO_FOLDER.mkdir(exist_ok=True)
|
|
|
|
|
|
class ApkRef:
    """Reference to an apk stored in the AndroZoo repository or on the local file system.

    - If the apk is in AndroZoo, it is referred to by its sha256.
    - If the apk is on the local file system, it is referred to by its path.

    Attributes:
        type: the kind of reference (`ApkRef.RefType.ANDROZOO` or `ApkRef.RefType.LOCAL`).
        sha256: upper-case sha256 of the apk, or None while it is unknown.
        path: location of the apk on the local file system, or None.
    """

    RefType = Enum("RefType", ["ANDROZOO", "LOCAL"])

    def __init__(
        self,
        type_: "ApkRef.RefType",
        sha256: Optional[str] = None,
        path: Optional[Path] = None,
    ):
        """Build the reference and check that it is coherent.

        Args:
            type_: kind of reference.
            sha256: hash of the apk; surrounding whitespace and case are
                normalised, and a blank string is treated as "no hash".
            path: location of the apk; plain strings are accepted and
                converted to `Path`.

        Raises:
            RuntimeError: if the reference is not coherent (see `integrity_check`).
        """
        self.type = type_
        self.sha256: Optional[str] = None
        if sha256 is not None:
            # A blank hash carries no information: store None so that it is
            # reported by `integrity_check` or recomputed by `get_sha256`
            # instead of being used as if it were a real hash.
            self.sha256 = sha256.strip().upper() or None
        # Always store a `Path` so that `.name` and friends work even when the
        # caller handed over a plain string.
        self.path: Optional[Path] = Path(path) if path is not None else None
        self.integrity_check()

    def __str__(self) -> str:
        return f"APK<{str(self.type)}: sha256={self.sha256}, path={str(self.path)}>"

    def __repr__(self) -> str:
        # Same rendering as `str()` so references stay readable inside containers.
        return str(self)

    def integrity_check(self) -> None:
        """Check that the ApkRef is coherent.

        Raises:
            RuntimeError: if an AndroZoo reference has no sha256, or a local
                reference has no path.
        """
        if self.type == ApkRef.RefType.ANDROZOO and self.sha256 is None:
            raise RuntimeError(f"Androzoo ApkRef must have a sha256: {str(self)}")
        if self.type == ApkRef.RefType.LOCAL and self.path is None:
            raise RuntimeError(f"Local APkRef must have a path: {str(self)}")

    def get_path(self) -> Path:
        """Return the path to the apk.

        Raises:
            RuntimeError: if the reference has no path.
        """
        if self.path is None:
            raise RuntimeError(f"{str(self)} don't have a path")
        return self.path

    def get_sha256(self) -> str:
        """Return the upper-case sha256 of the apk.

        When the hash is not known yet it is computed from the file at
        `path` with `sha256_sum` and remembered for later calls.

        Raises:
            RuntimeError: if the hash is unknown and there is no path to
                compute it from.
        """
        if self.sha256 is None:
            if self.path is None:
                raise RuntimeError(f"Could not compute hash for {str(self)}")
            self.sha256 = sha256_sum(self.path).upper()
        return self.sha256
|
|
|
|
|
|
def get_apk(apk_ref: ApkRef, path: Path, api_key: bytes):
    """Put the apk designated by `apk_ref` at `path`.

    A local reference is copied from its path, an AndroZoo reference is
    downloaded. `api_key` is requested for every kind of reference, even
    though only the AndroZoo download uses it, to keep the call uniform.
    """
    kind = apk_ref.type
    if kind == ApkRef.RefType.LOCAL:
        shutil.copy(apk_ref.get_path(), path)
        return
    if kind == ApkRef.RefType.ANDROZOO:
        downlaod_apk(apk_ref.get_sha256(), api_key, path)
|
|
|
|
|
|
def downlaod_apk(
    apk_sha256: str,
    api_key: bytes,
    path: Path,
    timeout: Optional[float] = None,
):
    """Download an apk from AndroZoo and store it at the given location.

    The response is streamed to disk chunk by chunk so that a large apk is
    never held entirely in memory.

    Args:
        apk_sha256: sha256 of the apk to download.
        api_key: AndroZoo API key.
        path: file the apk is written to.
        timeout: optional timeout in seconds forwarded to `requests.get`;
            None (the default) keeps the previous behaviour of waiting
            without limit.

    Raises:
        requests.HTTPError: if AndroZoo answers with an error status. The
            error body is no longer written to `path` as if it were an apk.
    """
    chunk_size = 1024 * 1024
    logging.debug("Start downloading apk %s", apk_sha256)
    with requests.get(
        "https://androzoo.uni.lu/api/download",
        params={
            b"apikey": api_key,
            b"sha256": apk_sha256.encode("utf-8"),
        },
        stream=True,
        timeout=timeout,
    ) as resp:
        # Fail before touching `path` when the server refused the request.
        resp.raise_for_status()
        try:
            with path.open("bw") as file:
                for chunk in resp.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
        except BaseException:
            # Do not leave a truncated apk behind when the transfer breaks.
            path.unlink(missing_ok=True)
            raise
    logging.debug("Finished downloading apk %s", apk_sha256)
|
|
|
|
|
|
def get_apk_info(apk_ref: ApkRef, api_key: bytes) -> dict[str, Any]:
    """Return the information that can be extracted from the apk itself.

    The apk is retrieved with `get_apk` into a temporary directory created
    inside `APK_INFO_FOLDER`, inspected with androguard, and removed again
    when the function exits, whether it succeeds or raises.

    Args:
        apk_ref: reference to the apk to inspect.
        api_key: AndroZoo API key, forwarded to `get_apk`.

    Returns:
        A dict with the keys `apk_size` (size of the retrieved file),
        `sha256`, `file` (name of the local file, or None when the reference
        has no path), `name`, `min_sdk`, `max_sdk`, `target_sdk` and
        `total_dex_size`. When androguard cannot process the apk, `name` is
        `""` and the sdk / dex values are None.
    """
    sha256 = apk_ref.get_sha256()
    info: dict[str, Any] = {}
    # The scratch copy lives in its own temporary directory rather than at
    # `<sha256>.json`: that name is the cache entry checked by the loader, and
    # an apk left there by a crash would be mistaken for cached information.
    with tempfile.TemporaryDirectory(dir=APK_INFO_FOLDER) as tmp_dir:
        apk_path = Path(tmp_dir) / (sha256 + ".apk")
        get_apk(apk_ref, apk_path, api_key)
        info["apk_size"] = apk_path.stat().st_size
        info["sha256"] = sha256
        info["file"] = apk_ref.path.name if apk_ref.path is not None else None

        try:
            apk = androguard_apk.APK(apk_path)
            info["name"] = apk.get_app_name()  # redundant with pkg_name ?
            sdk_getters = (
                ("min_sdk", apk.get_min_sdk_version),
                ("max_sdk", apk.get_max_sdk_version),
                ("target_sdk", apk.get_target_sdk_version),
            )
            for key, getter in sdk_getters:
                value = getter()
                info[key] = int(value) if value is not None else None
            info["total_dex_size"] = sum(
                len(dex) for dex in apk.get_all_dex()
            )  # TODO: faster to open the zip and use st_size?
        except Exception:
            # Best effort: an apk androguard cannot handle still gets an entry,
            # with the androguard-derived fields blanked. `Exception` (not a
            # bare `except`) lets KeyboardInterrupt / SystemExit propagate.
            logging.warning(
                "Could not extract information from apk %s", sha256, exc_info=True
            )
            info.update(
                {
                    "name": "",
                    "min_sdk": None,
                    "max_sdk": None,
                    "target_sdk": None,
                    "total_dex_size": None,
                }
            )
    return info
|
|
|
|
|
|
def _split_androzoo_line(line: str) -> list[str]:
    """Split one line of the androzoo list into stripped fields, honouring csv quoting."""
    try:
        # Parsing line by line keeps a stray quote from swallowing the
        # following lines of the list.
        fields = next(csv.reader([line]), [])
    except csv.Error:
        # Fall back to the naive split; the caller's field-count check then
        # decides whether the line is usable.
        fields = line.split(",")
    return [field.strip() for field in fields]


def _parse_androzoo_int(value: str) -> Optional[int]:
    """Convert a numeric column of the androzoo list, mapping a blank value to None."""
    value = value.strip()
    return int(value) if value else None


def _parse_androzoo_entry(entrie_names: list[str], entries: list[str]) -> dict[str, Any]:
    """Build the info dict of one androzoo list line from its column names and values."""
    info: dict[str, Any] = dict(zip(entrie_names, entries))
    if "markets" in info:
        info["markets"] = [market.strip() for market in info["markets"].split("|")]
    for key in ("apk_size", "vt_detection", "dex_size"):
        if key in info:
            info[key] = _parse_androzoo_int(info[key])
    if "pkg_name" in info:
        # Usually a no-op once the csv parser removed the quotes; kept for
        # fields whose quotes were not recognised (e.g. preceded by a space).
        info["pkg_name"] = info["pkg_name"].removeprefix('"').removesuffix('"')
    return info


def _write_apk_info(sha256: str, info: dict[str, Any]) -> None:
    """Store `info` as `<sha256>.json` in `APK_INFO_FOLDER`."""
    apk_info_path = APK_INFO_FOLDER / (sha256 + ".json")
    with apk_info_path.open("w") as file:
        json.dump(info, file)


def load_apk_info(apks: list[ApkRef], androzoo_list: Path, api_key: bytes):
    """Load the information for the provided apks and store it in json files.

    For every apk that has no `<SHA256>.json` file in `APK_INFO_FOLDER` yet,
    the matching line of the androzoo list is merged with the result of
    `get_apk_info` (which takes precedence on common keys) and written to
    that file. Apks that are not found in the list are still written, with
    the list columns set to None.

    Args:
        apks: references of the apks to load.
        androzoo_list: path to the androzoo csv list; its first line must
            name the columns and contain `sha256`.
        api_key: AndroZoo API key, forwarded to `get_apk_info`.

    Raises:
        ValueError: if the first line of the list has no `sha256` column,
            or a non-blank numeric column cannot be converted to int.
    """
    logging.debug("Start extracting data from the androzoo list")
    # Keep only the apks without a cached file. Building the dict this way also
    # tolerates the same apk appearing several times in `apks`.
    apks_dict: dict[str, ApkRef] = {}
    for apk in apks:
        sha256 = apk.get_sha256().strip().upper()
        if not (APK_INFO_FOLDER / (sha256 + ".json")).exists():
            apks_dict[sha256] = apk

    with androzoo_list.open("r") as list_file:
        entrie_names = _split_androzoo_line(list_file.readline())
        try:
            sha256_index = entrie_names.index("sha256")
        except ValueError:
            raise ValueError(
                f"{androzoo_list} is not an androzoo list: "
                "no 'sha256' column in its first line"
            ) from None
        for line in list_file:
            if not apks_dict:
                # Everything was found: no need to read the rest of the list.
                break
            entries = _split_androzoo_line(line)
            if len(entries) != len(entrie_names):
                inter = {entry.upper() for entry in entries}.intersection(apks_dict)
                if inter:
                    logging.warning(
                        "The information for the apk %s may not be retreived from the list due to malformated line: %s",
                        inter,
                        line,
                    )
                continue
            sha256 = entries[sha256_index].upper()
            if sha256 not in apks_dict:
                continue
            info = _parse_androzoo_entry(entrie_names, entries)
            info |= get_apk_info(apks_dict[sha256], api_key)
            _write_apk_info(sha256, info)
            del apks_dict[sha256]

    # Whatever is left was not described by the list: fall back to what can be
    # extracted from the apk alone. `get_apk_info` is called once per apk, so
    # the apk is retrieved only once.
    for apk_hash, apk in apks_dict.items():
        logging.warning(
            "The information for the apk %s was not found in the androzoo list",
            apk_hash,
        )
        info = get_apk_info(apk, api_key)
        for key in entrie_names:
            info.setdefault(key, None)
        _write_apk_info(apk_hash, info)

    logging.debug(
        "Finished extracting the information about the apks from androzoo list"
    )
|