rasta/rasta_exp/apk.py
Jean-Marie Mineau cd1e91bb99
first commit
2023-11-16 14:30:24 +01:00

193 lines
7.5 KiB
Python

import requests
import logging
import shutil
import hashlib
import json
from utils import sha256_sum
from enum import Enum
from pathlib import Path
from typing import Any, Optional
from androguard.core.bytecodes import apk as androguard_apk # type: ignore
# Folder, next to this module, where the `<sha256>.json` info files are stored.
APK_INFO_FOLDER = Path(__file__).parent / "apk_info"
# `exist_ok=True` replaces the former `exists()` + `mkdir()` pair: there is no
# window between the check and the creation in which another process importing
# this module could create the folder and make `mkdir()` fail.
# NOTE: if the path exists but is not a directory, this raises FileExistsError
# at import time instead of failing later on the first write.
APK_INFO_FOLDER.mkdir(exist_ok=True)
class ApkRef:
    """Reference to an apk, located either in the androzoo repository or on
    the local file system.

    - An apk from androzoo is referred to by its sha256.
    - An apk on the local file system is referred to by its path.
    """

    RefType = Enum("RefType", ["ANDROZOO", "LOCAL"])

    def __init__(
        self,
        type_: "ApkRef.RefType",
        sha256: Optional[str] = None,
        path: Optional[Path] = None,
    ):
        self.type = type_
        self.path = path
        # Stored hashes are normalised: surrounding whitespace removed, upper case.
        self.sha256 = None if sha256 is None else sha256.strip().upper()
        self.integrity_check()

    def __str__(self):
        return f"APK<{str(self.type)}: sha256={self.sha256}, path={str(self.path)}>"

    def integrity_check(self):
        """Raise a RuntimeError if the reference lacks the field its type requires."""
        is_androzoo = self.type == ApkRef.RefType.ANDROZOO
        is_local = self.type == ApkRef.RefType.LOCAL
        if is_androzoo and self.sha256 is None:
            raise RuntimeError(f"Androzoo ApkRef must have a sha256: {str(self)}")
        if is_local and self.path is None:
            raise RuntimeError(f"Local APkRef must have a path: {str(self)}")

    def get_path(self) -> Path:
        """Return the path of the apk, or raise a RuntimeError if there is none."""
        if self.path is not None:
            return self.path
        raise RuntimeError(f"{str(self)} don't have a path")

    def get_sha256(self) -> str:
        """Return the sha256 of the apk.

        When the hash is not known yet it is computed from the file at
        `self.path` and cached on the instance. A RuntimeError is raised when
        there is neither a hash nor a path.
        """
        if self.sha256 is not None:
            return self.sha256
        if self.path is None:
            raise RuntimeError(f"Could not compute hash for {str(self)}")
        self.sha256 = sha256_sum(self.path).upper()
        return self.sha256
def get_apk(apk_ref: ApkRef, path: Path, api_key: bytes):
    """Fetch the apk designated by `apk_ref` and store it at `path`.

    A local reference is copied from its own path; an androzoo reference is
    downloaded using `api_key`. `api_key` is always requested, even for local
    apks, so that every caller uses the same signature. A reference of any
    other type is silently ignored.
    """
    ref_kind = apk_ref.type
    if ref_kind == ApkRef.RefType.LOCAL:
        shutil.copy(apk_ref.get_path(), path)
        return
    if ref_kind == ApkRef.RefType.ANDROZOO:
        downlaod_apk(apk_ref.get_sha256(), api_key, path)
def downlaod_apk(apk_sha256: str, api_key: bytes, path: Path):
    """Download an apk from androzoo and store it at the given location.

    Args:
        apk_sha256: sha256 of the apk to download.
        api_key: androzoo api key, sent as the `apikey` query parameter.
        path: file the apk content is written to (overwritten if it exists).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status. In
            that case nothing is written at `path`.
    """
    logging.debug(f"Start downloading apk {apk_sha256}")
    resp = requests.get(
        "https://androzoo.uni.lu/api/download",
        params={
            b"apikey": api_key,
            b"sha256": apk_sha256.encode("utf-8"),
        },
    )
    # Without this check the body of an error response (invalid key, unknown
    # hash, server error...) would be saved at `path` as if it were the apk.
    resp.raise_for_status()
    with path.open("bw") as file:
        file.write(resp.content)
    logging.debug(f"Finished downloading apk {apk_sha256}")
def get_apk_info(apk_ref: ApkRef, api_key: bytes) -> dict[str, Any]:
    """Return the information available about an application.

    The apk is fetched (downloaded or copied) into a scratch file inside
    `APK_INFO_FOLDER`, inspected, and the scratch file is always removed
    before returning, even when an error occurs.

    Args:
        apk_ref: reference of the apk to inspect.
        api_key: androzoo api key, forwarded to `get_apk`.

    Returns:
        A dict with the keys `apk_size`, `sha256`, `file`, `name`, `min_sdk`,
        `max_sdk`, `target_sdk` and `total_dex_size`. When androguard fails to
        analyse the apk, `name` is `""` and the last four values are `None`.
    """
    sha256 = apk_ref.get_sha256()
    # The scratch copy must NOT be named `<sha256>.json`: that is the name of
    # the info file, whose mere existence makes `load_apk_info` skip the apk.
    # An interrupted run would otherwise leave an apk behind under that name.
    apk_path = APK_INFO_FOLDER / (sha256 + ".apk")
    info: dict[str, Any] = {}
    try:
        get_apk(apk_ref, apk_path, api_key)
        info["apk_size"] = apk_path.stat().st_size
        info["sha256"] = sha256
        info["file"] = apk_ref.path.name if apk_ref.path is not None else None
        try:
            apk = androguard_apk.APK(apk_path)
            info["name"] = apk.get_app_name()  # redundant with pkg_name ?
            sdk_getters = (
                ("min_sdk", apk.get_min_sdk_version),
                ("max_sdk", apk.get_max_sdk_version),
                ("target_sdk", apk.get_target_sdk_version),
            )
            for key, getter in sdk_getters:
                version = getter()
                info[key] = int(version) if version is not None else None
            # presumably get_all_dex() yields the raw bytes of each dex file
            # TODO: faster to open the zip and use st_size?
            info["total_dex_size"] = sum(len(dex) for dex in apk.get_all_dex())
        except Exception:
            # Best effort: an apk androguard cannot parse still gets an entry.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate, and the failure is logged instead of being hidden.
            logging.warning(
                "Could not analyse %s with androguard", apk_ref, exc_info=True
            )
            info.update(
                name="",
                min_sdk=None,
                max_sdk=None,
                target_sdk=None,
                total_dex_size=None,
            )
    finally:
        # Never leave the scratch apk behind, whatever happened above.
        apk_path.unlink(missing_ok=True)
    return info
def _parse_androzoo_entry(
    entrie_names: list[str], entries: list[str]
) -> dict[str, Any]:
    """Convert one row of the androzoo list into a dict with typed values."""
    info: dict[str, Any] = dict(zip(entrie_names, entries))
    if "markets" in info:
        info["markets"] = [market.strip() for market in info["markets"].split("|")]
    for key in ("apk_size", "vt_detection", "dex_size"):
        if key in info:
            # An empty column becomes None instead of crashing in int("").
            # NOTE(review): the androzoo list reportedly leaves vt_detection
            # empty for apks that were never scanned -- confirm.
            info[key] = int(info[key]) if info[key] else None
    if "pkg_name" in info:
        info["pkg_name"] = info["pkg_name"].removeprefix('"').removesuffix('"')
    return info


def load_apk_info(apks: list[ApkRef], androzoo_list: Path, api_key: bytes):
    """Collect the information about `apks` and store it in json files.

    For every apk that has no `<sha256>.json` file in `APK_INFO_FOLDER` yet,
    the matching row of the androzoo list (a comma separated file whose first
    line holds the column names) is merged with the result of `get_apk_info`
    and dumped to `<sha256>.json`. Apks absent from the list get the
    `get_apk_info` data only, with every column of the list set to `None`.

    Args:
        apks: references of the apks to process.
        androzoo_list: path to the androzoo list.
        api_key: androzoo api key, forwarded to `get_apk_info`.

    Raises:
        ValueError: if the first line of `androzoo_list` has no `sha256` column.
    """
    logging.debug("Start extracting data from the androzoo list")
    apks_dict = {a.get_sha256().strip().upper(): a for a in apks}
    for apk in apks:
        apk_info_path = APK_INFO_FOLDER / (apk.get_sha256() + ".json")
        if apk_info_path.exists():
            # pop() instead of del: `apks` may reference the same apk twice,
            # in which case the key is already gone the second time.
            apks_dict.pop(apk.get_sha256(), None)

    with androzoo_list.open("r") as list_file:
        first_line = list_file.readline()
        entrie_names = [name.strip() for name in first_line.split(",")]
        if "sha256" not in entrie_names:
            raise ValueError(
                f"No 'sha256' column in the first line of {androzoo_list}: "
                "is it really an androzoo list?"
            )
        sha256_index = entrie_names.index("sha256")
        for line in list_file:
            if not apks_dict:
                # Every requested apk has been found: no need to read further.
                break
            entries = [entry.strip() for entry in line.split(",")]
            # TODO: don't parse the entries manually...
            if len(entries) != len(entrie_names):
                # Malformed line: the columns cannot be trusted, so only warn
                # when one of the wanted hashes appears somewhere in it.
                entries_set = {entry.upper() for entry in entries}
                inter = entries_set.intersection(apks_dict.keys())
                if inter:
                    logging.warning(
                        f"The information for the apk {inter} may not be retreived from the list due to malformated line: {line}"
                    )
                continue
            sha256 = entries[sha256_index].upper()
            if sha256 not in apks_dict:
                continue
            info = _parse_androzoo_entry(entrie_names, entries)
            # Values measured on the apk itself override those of the list.
            info |= get_apk_info(apks_dict[sha256], api_key)
            apk_info_path = APK_INFO_FOLDER / (sha256 + ".json")
            with apk_info_path.open("w") as file:
                json.dump(info, file)
            del apks_dict[sha256]

    # Whatever is left was not found in the list.
    for apk_hash, apk in apks_dict.items():
        logging.warning(
            f"The information for the apk {apk_hash} was not found in the androzoo list"
        )
        # Called once only: the apk used to be fetched and analysed a second
        # time just to merge identical values into `info`.
        info = get_apk_info(apk, api_key)
        for key in entrie_names:
            info.setdefault(key, None)
        apk_info_path = APK_INFO_FOLDER / (apk_hash + ".json")
        with apk_info_path.open("w") as file:
            json.dump(info, file)
    logging.debug(
        "Finished extracting the information about the apks from androzoo list"
    )