first commit

This commit is contained in:
Jean-Marie Mineau 2023-11-15 15:59:13 +01:00
commit cd1e91bb99
Signed by: histausse
GPG key ID: B66AEEDA9B645AD2
287 changed files with 86425 additions and 0 deletions

View file

@ -0,0 +1,3 @@
# Package metadata. Author/email are "annon" — presumably anonymized for
# double-blind review; TODO confirm before any public release.
__author__ = "annon"
__email__ = "annon"
__version__ = "0.2.0"

View file

@ -0,0 +1,115 @@
"""
Collect data about apks.
"""
import dateutil.parser as dp # type: ignore
import datetime
import numpy as np
import matplotlib.pyplot as plt # type: ignore
from typing import Any, IO, Callable
from pathlib import Path
from .utils import render
def plot_apk_info_by_generic_x(
    data: list[Any],
    x: str,
    title: str,
    extract_propertie: Callable,
    y_label: str,
    x_label: str | None = None,
    reductions: dict[str, Callable] | None = None,
    xscale: str = "linear",
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Plot an aggregated apk property against a generic x axis.

    `extract_propertie` is a function that takes a list of elements and
    returns a value representing that list, like a median or a mean.

    NOTE(review): currently a stub — the previous implementation (kept below,
    commented out) predates the migration to sqlite3 and always raises.
    """
    raise NotImplementedError("TODO: update function to use sqlite3")
    # groupped = group_by(x, data, reductions=reductions)
    # properties = {k: extract_propertie(v) for k, v in groupped.items()}
    # if x_label is None:
    #     x_label = x
    # x_values = list(set(filter(lambda x: x is not None, properties.keys())))
    # x_values.sort()
    # y_values = [properties[x] for x in x_values]
    #
    # plt.figure(figsize=(16, 9), dpi=80)
    # plt.plot(x_values, y_values)
    # plt.xscale(xscale)
    # # plt.ylim([-5, 105])
    # # plt.legend()
    # plt.xlabel(x_label)
    # plt.ylabel(y_label)
    # render(title, interactive, image_path)
    #
#
def plot_apk_size(
    apk_data: list[Any],
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Bar-plot the total dex size (MiB) of every apk, sorted ascending.

    Dashed red horizontal lines mark powers of 4 (4**7 .. 4**12 bytes) as
    scale references.
    """
    mib_sizes = np.sort(
        np.array([entry["total_dex_size"] for entry in apk_data]) / 1024 / 1024
    )
    plt.figure(figsize=(16, 9), dpi=80)
    plt.bar(np.arange(len(mib_sizes)), mib_sizes)
    plt.ylabel("Bytecode size (MiB)")
    # Individual apks are anonymous: hide the x axis ticks entirely.
    plt.tick_params(
        axis="x",
        which="both",
        bottom=False,
        top=False,
        labelbottom=False,
    )
    for exponent in range(7, 13):
        plt.axhline(y=(4**exponent) / 1024 / 1024, color="r", linestyle=":")
    render("Bytecode size of the apks", interactive, image_path)
def plot_apk_size_hl_subset(
    apk_data: list[Any],
    subset_sha: list[str],
    title: str,
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Bar-plot dex sizes (MiB) of all apks, highlighting a subset.

    Apks whose sha256 is in `subset_sha` are drawn as hatched orange bars over
    a zero-height default bar; the others as default bars with a zero-height
    highlight bar. NOTE: sorts `apk_data` in place by total_dex_size.
    """
    apk_data.sort(key=lambda entry: entry["total_dex_size"])
    base_heights = []
    highlight_heights = []
    for entry in apk_data:
        if entry["sha256"] in subset_sha:
            base_heights.append(0)
            highlight_heights.append(entry["total_dex_size"])
        else:
            base_heights.append(entry["total_dex_size"])
            highlight_heights.append(0)
    sizes = np.array(base_heights) / 1024 / 1024
    sizes_hl = np.array(highlight_heights) / 1024 / 1024
    positions = np.arange(len(sizes))
    plt.figure(figsize=(16, 9), dpi=80)
    plt.bar(positions, sizes, edgecolor="black")
    plt.bar(positions, sizes_hl, color="#D55E00", hatch="x", edgecolor="black")
    plt.ylabel("Bytecode size (MiB)")
    # Individual apks are anonymous: hide the x axis ticks entirely.
    plt.tick_params(
        axis="x",
        which="both",
        bottom=False,
        top=False,
        labelbottom=False,
    )
    for exponent in range(7, 13):
        plt.axhline(y=(4**exponent) / 1024 / 1024, color="r", linestyle=":")
    render(title, interactive, image_path)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,199 @@
import sqlite3
import csv
import sys
from pathlib import Path
from typing import Optional, Any
from matplotlib_venn import venn2 # type: ignore
from .utils import render
# Columns that together characterise one row of the `error` table.
ERROR_CARACT = (
    "error_type",
    "error",
    "msg",
    "file",
    "function",
    "level",
    "origin",
    "raised_info",
    "called_info",
)
# SQL expression concatenating every ERROR_CARACT column (NULL mapped to '')
# with '|' separators, producing one comparable/groupable message string.
ERROR_MSG = " || '|' || ".join(f"COALESCE({col}, '')" for col in ERROR_CARACT)
def ic3_venn(db: Path, interactive: bool = True, image_path: Path | None = None):
    """Draw a 2-set Venn diagram of apks failed by IC3 and/or its fork.

    Counts, per apk, whether the 'ic3' and 'ic3_fork' executions have
    tool_status = 'FAILED', then feeds the three disjoint counts to venn2.
    """
    values = {
        ("FAILED", "NOT_FAILED"): 0,
        ("FAILED", "FAILED"): 0,
        ("NOT_FAILED", "FAILED"): 0,
    }
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # BUG FIX: was "OUTER LEFT JOIN", which is not valid SQLite syntax
        # (the correct keyword order is "LEFT OUTER JOIN") and made this
        # query raise sqlite3.OperationalError.
        for ic3_s, ic3_fork_s, n in cur.execute(
            "SELECT ex1.tool_status, ex2.tool_status, COUNT(*) "
            "FROM exec AS ex1 LEFT OUTER JOIN exec AS ex2 ON ex1.sha256 = ex2.sha256 "
            "WHERE ex1.tool_name = 'ic3' AND ex2.tool_name = 'ic3_fork' "
            "GROUP BY ex1.tool_status, ex2.tool_status"
        ):
            if ic3_s == "FAILED" and ic3_fork_s == "FAILED":
                values[("FAILED", "FAILED")] += n
            elif ic3_s == "FAILED":
                values[("FAILED", "NOT_FAILED")] += n
            elif ic3_fork_s == "FAILED":
                values[("NOT_FAILED", "FAILED")] += n
    venn2(
        subsets=(
            values[("FAILED", "NOT_FAILED")],
            values[("NOT_FAILED", "FAILED")],
            values[("FAILED", "FAILED")],
        ),
        set_labels=("IC3 failed", "IC3 fork failed"),
    )
    render(
        "Number of application that IC3 \nand its fork failed to analyse",
        interactive,
        image_path,
    )
def _ic3_error_query(error_tool: str, ex1_failed: bool, ex2_failed: bool) -> str:
    """Build one of the six top-10 error queries used by `ic3_errors`.

    `error_tool` selects whose error rows are reported ('ic3' or 'ic3_fork');
    `ex1_failed`/`ex2_failed` select the (ic3, ic3_fork) status combination.
    The status comparisons are fixed by the WHERE clause, so grouping by
    tool_name/error/message is sufficient.
    """
    op1 = "=" if ex1_failed else "!="
    op2 = "=" if ex2_failed else "!="
    return (
        "SELECT ex1.tool_status = 'FAILED', ex2.tool_status = 'FAILED', "
        "  error.tool_name, error.error, COUNT(DISTINCT error.sha256) AS cnt, "
        f"  {ERROR_MSG} "
        "FROM exec AS ex1 "
        # BUG FIX: was "OUTER LEFT JOIN" — invalid SQLite syntax.
        "  LEFT OUTER JOIN exec AS ex2 ON ex1.sha256 = ex2.sha256 "
        f"  INNER JOIN error ON ex1.sha256 = error.sha256 AND error.tool_name = '{error_tool}' "
        "WHERE ex1.tool_name = 'ic3' AND ex2.tool_name = 'ic3_fork' AND "
        f"  ex1.tool_status {op1} 'FAILED' AND ex2.tool_status {op2} 'FAILED' "
        f"GROUP BY error.tool_name, error.error, {ERROR_MSG} "
        "ORDER BY cnt DESC "
        "LIMIT 10;"
    )


def ic3_errors(db: Path, file: Path | None = None):
    """Write a CSV of the 10 most common errors for each failure combination.

    For each of the two error sources ('ic3_fork' then 'ic3') and each
    (ic3, ic3_fork) status combination — only ic3 failed, only the fork
    failed, both failed — the top 10 errors are collected. Output goes to
    stdout when `file` is None.
    """
    fieldnames = [
        "ic3 failed",
        "ic3 fork failed",
        "tool",
        "error",
        "occurence",
        "msg",
    ]
    errors = []
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # The original spelled out the six queries by hand; they only differ
        # by the error source and the two status comparisons.
        for error_tool in ("ic3_fork", "ic3"):
            for ex1_failed, ex2_failed in ((True, False), (False, True), (True, True)):
                errors.extend(
                    cur.execute(_ic3_error_query(error_tool, ex1_failed, ex2_failed))
                )
    fp = sys.stdout if file is None else file.open("w")
    try:
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        for err in map(rewrite_msg, errors):
            # BUG FIX: rows are (ic3_failed, fork_failed, tool, error,
            # occurence, msg); the previous zip order listed "msg" before
            # "occurence" and therefore swapped those two CSV columns.
            writer.writerow(dict(zip(fieldnames, err)))
    finally:
        # Close only handles we opened ourselves (never stdout), and do so
        # even if writing fails.
        if file is not None:
            fp.close()
def rewrite_msg(
    err: tuple[int, int, str, str, int, str]
) -> tuple[int, int, str, str, int, str]:
    """Flatten the '|'-separated error characterisation into a readable msg.

    The incoming `msg` field is the ERROR_MSG concatenation of the nine
    ERROR_CARACT columns; each non-empty part gets a trailing space and the
    parts are reassembled in a human-friendly order. Note that the returned
    `error` field is the one parsed out of `msg` (trailing space included),
    not the raw input one — behaviour kept from the original.
    """
    ic3_failed, ic3_fork_failed, tool, error, occurence, msg = err
    (
        error_type,
        error,
        msg,
        file,
        function,
        level,
        origin,
        raised_info,
        called_info,
    ) = map(lambda s: "" if s == "" else s + " ", msg.split("|"))
    # BUG FIX: `called_info` was interpolated twice and `raised_info` was
    # computed but never used; include each exactly once.
    msg = f"{level}{error}{msg}{raised_info}{called_info}{file}{function}{origin}"
    return (ic3_failed, ic3_fork_failed, tool, error, occurence, msg)

View file

@ -0,0 +1,246 @@
import sqlite3
import time
import gzip
import csv
import datetime
import requests
import getpass
import dateutil.parser
from androguard.core.bytecodes import apk as androguard_apk
from pathlib import Path
def int_or_none(str_: str) -> int | None:
if str_:
return int(str_)
else:
return None
def create_apk_table(db: Path):
    """Create the `apk` table in `db` if it does not already exist."""
    schema = (
        "CREATE TABLE apk("
        " sha256, first_seen_year, apk_size,"
        " vt_detection, min_sdk, max_sdk,"
        " target_sdk, apk_size_decile, dex_date date,"
        " pkg_name, vercode, vt_scan_date date,"
        " dex_size, added date, markets, dex_size_decile, "
        " dex_size_decile_by_year"
        ")"
    )
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        already_there = cur.execute(
            "SELECT name FROM sqlite_master WHERE name='apk'"
        ).fetchone()
        if already_there is None:
            cur.execute(schema)
        con.commit()
def get_sha_set(dataset: Path) -> set[str]:
    """Read a set of sha256 hashes from a file (one hash per line)."""
    with dataset.open() as f:
        return {line.strip() for line in f}
def populate_from_year_and_sdk(db: Path, year_and_sdk: Path, apks: set[str]):
    """Insert the year_and_sdk.csv.gz info into the `apk` table for `apks`.

    Apks requested in `apks` but absent from the csv are still inserted with
    NULL metadata, so every requested sha256 ends up with a row.
    """

    def insert(cur, value: dict):
        # One row per apk; deciles are placeholders here: apk_size_decile is
        # computed at dataset generation, dex_size_decile later by the decile
        # pass in populate_db_apk.
        cur.execute(
            (
                "INSERT INTO apk ("
                " sha256, first_seen_year, vt_detection,"
                " min_sdk, max_sdk, target_sdk, apk_size_decile,"
                " dex_size_decile"
                ") VALUES("
                " :sha256, :first_seen_year, :vt_detection,"
                " :min_sdk, :max_sdk, :target_sdk, :apk_size_decile,"
                " :dex_size_decile"
                ");"
            ),
            value,
        )

    apks_not_found = apks.copy()
    # PERF FIX: the original reconnected to sqlite and committed once per csv
    # row; one connection and one commit for the whole pass is far faster.
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        with gzip.open(year_and_sdk, "rt", newline="") as f:
            reader = csv.DictReader(f, quotechar='"')
            assert reader.fieldnames is not None
            for row in reader:
                if row["sha256"] not in apks:
                    continue
                insert(
                    cur,
                    {
                        "sha256": row["sha256"],
                        "first_seen_year": int_or_none(row["first_seen_year"]),
                        "vt_detection": int_or_none(row["vt_detection"]),
                        "min_sdk": int_or_none(row["min_sdk"]),
                        "max_sdk": int_or_none(row["max_sdk"]),
                        "target_sdk": int_or_none(row["target_sdk"]),
                        "apk_size_decile": 0,  # Computed at dataset generation
                        "dex_size_decile": 0,  # Computed by compute_dex_decile
                    },
                )
                # BUG FIX: was .remove(), which raises KeyError when a sha256
                # appears more than once in the csv.
                apks_not_found.discard(row["sha256"])
        # Requested apks missing from the csv still get a (mostly NULL) row.
        for apk in apks_not_found:
            insert(
                cur,
                {
                    "sha256": apk,
                    "first_seen_year": None,
                    "vt_detection": None,
                    "min_sdk": None,
                    "max_sdk": None,
                    "target_sdk": None,
                    "apk_size_decile": 0,
                    "dex_size_decile": 0,  # Computed by compute_dex_decile
                },
            )
        con.commit()
def populate_from_latest_with_added_date(
    db: Path, latest_with_added_date: Path, apks: set[str]
):
    """Update `apk` rows with the info from latest_with-added-date.csv.gz.

    Only rows whose sha256 is in `apks` are touched. `dex_size` is not
    necessarily the right value when an apk ships multiple dex files; see
    `fix_dex_size()`.
    """
    # PERF FIX: the original reconnected to sqlite and committed once per csv
    # row; one connection and one commit for the whole pass is far faster.
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        with gzip.open(latest_with_added_date, "rt", newline="") as f:
            reader = csv.DictReader(f, quotechar='"')
            assert reader.fieldnames is not None
            for row in reader:
                if row["sha256"] not in apks:
                    continue
                value = {
                    "sha256": row["sha256"],
                    "apk_size": int_or_none(row["apk_size"]),
                    "dex_date": datetime.datetime.fromisoformat(row["dex_date"])
                    if row["dex_date"]
                    else None,
                    "pkg_name": row["pkg_name"],
                    "vercode": int_or_none(row["vercode"]),
                    "vt_scan_date": datetime.datetime.fromisoformat(
                        row["vt_scan_date"]
                    )
                    if row["vt_scan_date"]
                    else None,
                    # Not necessarily the right value if multiple dex are
                    # used, see 'fix_dex_size()'
                    "dex_size": int_or_none(row["dex_size"]),
                    # 'added' is not strict ISO, hence dateutil here.
                    "added": dateutil.parser.isoparse(row["added"])
                    if row["added"]
                    else None,
                    "markets": row["markets"],
                }
                cur.execute(
                    "UPDATE apk "
                    "SET apk_size = :apk_size,"
                    "    dex_date = :dex_date,"
                    "    pkg_name = :pkg_name,"
                    "    vercode = :vercode,"
                    "    vt_scan_date = :vt_scan_date,"
                    "    dex_size = :dex_size,"
                    "    added = :added,"
                    "    markets = :markets "
                    "WHERE"
                    "    sha256 = :sha256;",
                    value,
                )
        con.commit()
def download_apk(sha256: str, api_key: bytes) -> bytes:
    """Download an apk from AndroZoo by sha256 and return its raw bytes.

    Retries forever with a 1 s pause on any non-200 response (rate limiting,
    transient failures); the response and its body are printed for debugging.
    NOTE(review): there is no retry cap or backoff, so a permanent error
    (e.g. a bad API key) loops indefinitely.
    """
    while True:
        resp = requests.get(
            "https://androzoo.uni.lu/api/download",
            params={
                b"apikey": api_key,
                b"sha256": sha256.encode("utf-8"),
            },
        )
        if resp.status_code == 200:
            return resp.content
        else:
            print(resp)
            print(resp.content)
            time.sleep(1)
def fix_dex_size(db: Path, apks: set[str], androzoo_key: bytes):
    """Recompute dex_size as the summed size of every .dex in each apk.

    Downloads each apk from androzoo, sums the lengths of all its dex files,
    and stores the total in the database (the csv value is unreliable for
    multi-dex apks).
    """
    for sha256 in apks:
        raw = download_apk(sha256, androzoo_key)
        parsed = androguard_apk.APK(raw, raw=True, skip_analysis=True)
        total_dex_size = sum(len(dex) for dex in parsed.get_all_dex())
        with sqlite3.connect(db) as con:
            con.cursor().execute(
                "UPDATE apk SET dex_size = ? WHERE sha256 = ?;",
                (total_dex_size, sha256),
            )
            con.commit()
def populate_db_apk(
    db: Path,
    dataset: Path,
    year_and_sdk: Path,
    latest_with_added_date: Path,
    fix_dsize: bool,
):
    """Populate the database with the apk informations.

    Reads the sha256 list from `dataset`, fills the `apk` table from the two
    AndroZoo csv exports, optionally fixes dex_size by downloading every apk
    (requires an AndroZoo API key, prompted interactively), then computes the
    dex-size deciles, overall and per first_seen_year.
    """
    if fix_dsize:
        # Prompt for the key up front so the long-running steps below are
        # not interrupted later by an interactive prompt.
        androzoo_key = (
            getpass.getpass(prompt="androzoo apikey: ").strip().encode("utf-8")
        )
    create_apk_table(db)
    apks = get_sha_set(dataset)
    populate_from_year_and_sdk(db, year_and_sdk, apks)
    populate_from_latest_with_added_date(db, latest_with_added_date, apks)
    if fix_dsize:
        fix_dex_size(db, apks, androzoo_key)
    decile_updates = (
        (
            "UPDATE apk "
            "SET dex_size_decile = compute.decile "
            "FROM ("
            " SELECT NTILE ( 10 ) OVER ( ORDER BY dex_size ) decile, sha256 FROM apk"
            ") AS compute "
            "WHERE apk.sha256 = compute.sha256;"
        ),
        (
            "UPDATE apk "
            "SET dex_size_decile_by_year = compute.decile "
            "FROM ("
            " SELECT NTILE ( 10 ) "
            " OVER ( PARTITION BY first_seen_year ORDER BY dex_size ) decile, sha256 "
            " FROM apk"
            ") AS compute "
            "WHERE apk.sha256 = compute.sha256;"
        ),
    )
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for statement in decile_updates:
            cur.execute(statement)
        con.commit()

View file

@ -0,0 +1,186 @@
import sqlite3
import json
import datetime
from pathlib import Path
from .query_error import estimate_cause
def create_tables(db: Path):
    """Create the `exec` and `error` tables in `db` if they do not exist."""
    ddl_statements = (
        (
            "CREATE TABLE IF NOT EXISTS exec ("
            " sha256, id, rev, time, kernel_cpu_time, user_cpu_time, "
            " max_rss_mem, avg_rss_mem, avg_total_mem, page_size, "
            " nb_major_page_fault, nb_minor_page_fault, nb_fs_input, "
            " nb_fs_output, nb_socket_msg_received, nb_socket_msg_sent, "
            " nb_signal_delivered, exit_status, timeout, "
            " tool_status, tool_name, date date"
            ");"
        ),
        (
            "CREATE TABLE IF NOT EXISTS error ("
            " tool_name, sha256, error_type, error, msg, "
            " first_line, last_line, logfile_name, file, "
            " line, function, level, origin, raised_info, "
            " called_info, cause"
            ");"
        ),
    )
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for ddl in ddl_statements:
            cur.execute(ddl)
        con.commit()
def insert_errors(cur, tool, sha256, errors):
    """Insert `errors` (a list of error dicts) for one (tool, apk) execution.

    Mutates the dicts in place: missing fields are normalised to None, the
    raised/called location dicts are flattened to strings, and the stack
    trace is dropped before the bulk insert. `cause` is left empty; it is
    filled later by `estimate_cause`.
    """
    plain_fields = (
        "error_type",
        "error",
        "msg",
        "first_line",
        "last_line",
        "logfile_name",
        "file",
        "line",
        "function",
        "level",
        "origin",
    )
    for error in errors:
        error["tool_name"] = tool
        error["sha256"] = sha256
        for field in plain_fields:
            error.setdefault(field, None)
        error.setdefault("raised_info", None)
        if error["raised_info"] is not None:
            info = error["raised_info"]
            error["raised_info"] = 'Raised at {} in file "{}", line {}'.format(
                info["function"], info["file"], info["line"]
            )
        error.setdefault("called_info", None)
        if error["called_info"] is not None:
            info = error["called_info"]
            error["called_info"] = 'Called from {} in file "{}", line {}'.format(
                info["function"], info["file"], info["line"]
            )
        # The stack trace can be quite big without being very useful in
        # queries.
        error.pop("stack", None)
    cur.executemany(
        (
            "INSERT INTO error VALUES("
            " :tool_name, :sha256, :error_type, :error, :msg, "
            " :first_line, :last_line, :logfile_name, :file, "
            " :line, :function, :level, :origin, :raised_info, "
            " :called_info, ''"
            ");"
        ),
        errors,
    )
def fix_error(db: Path, report_with_correct_error: Path):
    """Replace stored errors with the ones from a corrective re-run.

    Unfortunately some errors were mis-parsed during the experiment, so
    another run was made for some (tool, apk) pairs to get the actual error.
    That pass ran in a different environment (different memory and disk
    constraints), so only the errors are replaced (after manual inspection
    they do not seem environment-related) and every other value from the
    original experiment is kept.
    """
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for path in report_with_correct_error.iterdir():
            with path.open() as f:
                exec_log = json.load(f)
            sha256 = exec_log["apk"].removesuffix(".apk")
            tool = exec_log["tool-name"]
            matching = cur.execute(
                "SELECT * FROM exec WHERE tool_name = ? AND sha256 = ?",
                (tool, sha256),
            ).fetchall()
            # Only touch executions that are unambiguously identified.
            if len(matching) != 1:
                continue
            cur.execute(
                "DELETE FROM error WHERE tool_name = ? AND sha256 = ?",
                (tool, sha256),
            )
            insert_errors(cur, tool, sha256, exec_log.pop("errors", []))
        con.commit()
def populate_execution_report(db: Path, report_folder: Path):
    """Add to the database the json reports stored in `report_folder`.

    Each json file describes one (tool, apk) execution: its `errors` list is
    inserted into the `error` table and everything else into `exec`, with
    missing metrics stored as NULL.
    """
    create_tables(db)
    # Every column of the `exec` table; absent report keys become NULL.
    exec_columns = [
        "sha256",
        "id",
        "rev",
        "time",
        "kernel_cpu_time",
        "user_cpu_time",
        "max_rss_mem",
        "avg_rss_mem",
        "avg_total_mem",
        "page_size",
        "nb_major_page_fault",
        "nb_minor_page_fault",
        "nb_fs_input",
        "nb_fs_output",
        "nb_socket_msg_received",
        "nb_socket_msg_sent",
        "nb_signal_delivered",
        "exit_status",
        "timeout",
        "tool_status",
        "tool_name",
        "date",
    ]
    i = 0
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for path in report_folder.iterdir():
            with path.open() as f:
                exec_log = json.load(f)
            exec_log["sha256"] = exec_log["apk"].removesuffix(".apk")
            exec_log["id"] = exec_log.get("_id", None)
            exec_log["rev"] = exec_log.get("_rev", None)
            errors = exec_log.pop("errors", [])
            exec_log["date"] = (
                datetime.datetime.fromisoformat(exec_log["date"])
                if exec_log.get("date", None)
                else None
            )
            del exec_log["apk"]
            exec_log.pop("_id", None)
            exec_log.pop("_rev", None)
            # Report keys use '-', the table columns use '_'.
            new_exec_log = {k.replace("-", "_"): v for k, v in exec_log.items()}
            for col in exec_columns:
                new_exec_log.setdefault(col, None)
            cur.execute(
                (
                    "INSERT INTO exec VALUES("
                    " :sha256, :id, :rev, :time, :kernel_cpu_time, :user_cpu_time, "
                    " :max_rss_mem, :avg_rss_mem, :avg_total_mem, :page_size, "
                    " :nb_major_page_fault, :nb_minor_page_fault, :nb_fs_input, "
                    " :nb_fs_output, :nb_socket_msg_received, :nb_socket_msg_sent, "
                    " :nb_signal_delivered, :exit_status, :timeout, "
                    " :tool_status, :tool_name, :date"
                    ");"
                ),
                new_exec_log,
            )
            insert_errors(cur, exec_log["tool-name"], exec_log["sha256"], errors)
            i += 1
            # BUG FIX: the original tested `i == 10_000` and therefore
            # committed only once, at exactly the 10 000th report; commit
            # every 10 000 reports to bound memory usage.
            if i % 10_000 == 0:
                # Not sure how much ram would be needed to commit in one go
                con.commit()
        con.commit()

View file

@ -0,0 +1,176 @@
import sqlite3
from pathlib import Path
# Per-tool feature matrix: which languages/runtimes and which common
# auxiliary components (soot, androguard, apktool) each analysis tool uses.
# Flags not set explicitly default to False (see the normalisation below).
TOOL_INFO = [
    {
        "tool_name": "adagio",
        "use_python": True,
        "use_androguard": True,
    },
    {
        "tool_name": "amandroid",
        "use_scala": True,
        "use_soot": False,
        "use_apktool": True,
    },
    {
        "tool_name": "anadroid",
        "use_python": True,
        "use_java": True,
        "use_scala": True,
        "use_soot": False,
        "use_apktool": True,
    },
    {
        "tool_name": "androguard",
        "use_python": True,
        "use_androguard": True,  # Duh
    },
    {
        "tool_name": "androguard_dad",
        "use_python": True,
        "use_androguard": True,
    },
    {
        "tool_name": "apparecium",
        "use_python": True,
        "use_androguard": True,
    },
    {
        "tool_name": "blueseal",
        "use_java": True,
        "use_soot": True,
        "use_apktool": True,
    },
    {
        "tool_name": "dialdroid",
        "use_java": True,
        "use_soot": True,
    },
    {
        "tool_name": "didfail",
        "use_python": True,
        "use_java": True,
        "use_soot": True,
    },
    {
        "tool_name": "droidsafe",
        "use_python": True,
        "use_java": True,
        "use_soot": True,
        "use_apktool": True,
    },
    {
        "tool_name": "flowdroid",
        "use_java": True,
        "use_soot": True,
    },
    {
        "tool_name": "gator",
        "use_python": True,
        "use_java": True,
        "use_soot": True,
        "use_apktool": True,
    },
    {
        "tool_name": "ic3",
        "use_java": True,
        "use_soot": True,
    },
    {
        "tool_name": "ic3_fork",
        "use_java": True,
        "use_soot": True,
    },
    {
        "tool_name": "iccta",
        "use_java": True,
        "use_soot": True,
        "use_apktool": True,
    },
    {
        "tool_name": "mallodroid",
        "use_python": True,
        "use_androguard": True,
    },
    {
        "tool_name": "perfchecker",
        "use_java": True,
        "use_soot": True,
    },
    {
        "tool_name": "redexer",
        "use_ocaml": True,
        "use_ruby": True,
        "use_apktool": True,
    },
    {
        "tool_name": "saaf",
        "use_java": True,
        "use_soot": False,
        "use_apktool": True,
    },
    {
        "tool_name": "wognsen_et_al",
        "use_python": True,
        "use_prolog": True,
        "use_apktool": True,
    },
]

# Normalise the matrix: any feature flag left unset above defaults to False.
_FEATURE_COLUMNS = (
    "use_python",
    "use_java",
    "use_scala",
    "use_ocaml",
    "use_ruby",
    "use_prolog",
    "use_soot",
    "use_androguard",
    "use_apktool",
)
for _tool_entry in TOOL_INFO:
    for _feature in _FEATURE_COLUMNS:
        _tool_entry.setdefault(_feature, False)
def create_tool_table(db: Path):
    """Create the `tool` table in `db` if it does not already exist."""
    schema = (
        "CREATE TABLE tool ("
        " tool_name, use_python, use_java, use_scala,"
        " use_ocaml, use_ruby, use_prolog, use_soot, "
        " use_androguard, use_apktool"
        ");"
    )
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        already_there = cur.execute(
            "SELECT name FROM sqlite_master WHERE name='tool';"
        ).fetchone()
        if already_there is None:
            cur.execute(schema)
        con.commit()
def populate_tool(
    db: Path,
):
    """Add the static tool feature matrix (TOOL_INFO) to the database."""
    create_tool_table(db)
    # TODO(review): re-running duplicates the rows — DROP the table or
    # replace the values instead?
    insert_sql = (
        "INSERT INTO tool VALUES("
        " :tool_name, :use_python, :use_java, :use_scala,"
        " :use_ocaml, :use_ruby, :use_prolog, :use_soot, "
        " :use_androguard, :use_apktool"
        ");"
    )
    with sqlite3.connect(db) as con:
        con.cursor().executemany(insert_sql, TOOL_INFO)
        con.commit()

View file

@ -0,0 +1,699 @@
import sqlite3
import sys
import csv
import matplotlib.pyplot as plt # type: ignore
from .utils import get_list_tools, radar_chart, render
from pathlib import Path
from typing import Optional, Any
# Columns that together characterise one row of the `error` table.
ERROR_CARACT = (
    "error_type",
    "error",
    "msg",
    "file",
    "function",
    "level",
    "origin",
    "raised_info",
    "called_info",
)
# Subquery that removes identical errors occurring multiple times in the
# same execution (same tool, apk, and full error characterisation).
DISTINCT_ERRORS = (
    "("
    f" SELECT DISTINCT tool_name, sha256, {', '.join(ERROR_CARACT)}"
    " FROM error"
    ") AS distinct_error"
)
# Same idea, but deduplicated only on the error class (error + error_type).
DISTINCT_ERROR_CLASS = (
    "("
    f" SELECT DISTINCT tool_name, sha256, error, error_type"
    " FROM error"
    ") AS distinct_error"
)
# Same idea, deduplicated on the estimated cause (see `estimate_cause`).
DISTINCT_CAUSES = (
    "("
    " SELECT DISTINCT tool_name, sha256, cause"
    " FROM error"
    ") AS distinct_cause"
)
def estimate_cause(db: Path):
    """Heuristically label every `error` row with a `cause`, for grouping.

    Resets all causes to '', then applies a sequence of pattern-matching
    UPDATEs (apktool, memory, soot, ... — later rules overwrite earlier ones
    when several match); anything still unlabelled at the end becomes
    'other'.
    """
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Start from a clean slate so re-runs are deterministic.
        cur.execute("UPDATE error SET cause = '';")
        con.commit()
        # brut.androlib is package defined in apktool
        # 'Expected: 0x001c0001, got: 0x00000000' errors are always
        # part of an apktool stacktrace:
        # SELECT COUNT(*) FROM error e1
        # WHERE e1.tool_name = '${tool}' AND
        # e1.msg = 'Expected: 0x001c0001, got: 0x00000000' AND
        # e1.sha256 NOT IN (
        # SELECT e2.sha256 FROM error e2
        # WHERE e2.tool_name = '${tool}' AND
        # e2.msg LIKE '%Could not decode arsc file%'
        # )
        # is always 0"
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'apktool' "
                "WHERE error = 'brut.androlib.AndrolibException' OR "
                " error LIKE 'brut.androlib.err.%' OR "
                " msg = 'Expected: 0x001c0001, got: 0x00000000' OR "
                " msg LIKE '%brut.androlib.AndrolibException: Could not decode arsc file%' OR "
                " msg LIKE 'bad magic value: %' OR "
                " error = 'brut.androlib.err.UndefinedResObject';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'memory' "
                "WHERE error = 'java.lang.StackOverflowError' OR "
                " error = 'java.lang.OutOfMemoryError' OR "
                " msg LIKE '%java.lang.OutOfMemoryError%' OR "
                " msg LIKE '%java.lang.StackOverflowError%' OR "
                " msg = 'Stack overflow';"
            )
        )
        # The long RefType message is passed as a bound parameter because it
        # contains quotes.
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'soot' "
                "WHERE msg LIKE ? OR "
                " msg LIKE '%No call graph present in Scene. Maybe you want Whole Program mode (-w)%' OR "
                " msg LIKE '%There were exceptions during IFDS analysis. Exiting.%' OR "  # More hero than soot?
                " msg = 'Could not find method' OR "
                " msg = 'No sources found, aborting analysis' OR "
                " msg = 'No sources or sinks found, aborting analysis' OR "
                " msg = 'Only phantom classes loaded, skipping analysis...';"
            ),
            (
                "%RefType java.lang.Object not loaded. If you tried to get the RefType of a library class, did you call loadNecessaryClasses()? Otherwise please check Soot's classpath.%",
            ),
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'index error' "
                "WHERE error = 'IndexError' OR "
                " msg = 'java.lang.ArrayIndexOutOfBoundsException' OR "
                " (error_type = 'Python' AND error = 'KeyError') OR "
                " error = 'java.lang.IndexOutOfBoundsException' OR "
                " error = 'java.lang.ArrayIndexOutOfBoundsException' OR "
                " msg LIKE 'java.lang.ArrayIndexOutOfBoundsException:%';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'arithmetique' "
                "WHERE error = 'java.lang.ArithmeticException';"
            )
        )
        cur.execute("UPDATE error SET cause = 'jasmin' WHERE error = 'jas.jasError';")
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'storage' "
                "WHERE msg = 'No space left on device' OR "
                " msg LIKE 'Error copying file: %' OR "
                " msg = 'java.io.IOException: No space left on device';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'redexe pattern maching failed' "
                "WHERE msg = 'File \"src/ext/logging.ml\", line 712, characters 12-17: Pattern matching failed';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'null pointer' "
                "WHERE error = 'java.lang.NullPointerException' OR "
                " msg LIKE ? OR "
                " msg LIKE 'undefined method % for nil:NilClass (NoMethodError)';"
            ),
            ("'NoneType' object has no attribute %",),
        )
        # Soot ?
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'unknown error in thread' "
                "WHERE msg = 'Worker thread execution failed: null';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'timeout' "
                "WHERE error = 'java.util.concurrent.TimeoutException';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'file name too long' "
                "WHERE msg = 'File name too long';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'encoding' "
                "WHERE error = 'UnicodeEncodeError';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'smali' "
                "WHERE error LIKE 'org.jf.dexlib2.%' OR error LIKE 'org.jf.util.%';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'redexer dex parser' "
                "WHERE msg LIKE 'Dex.Wrong_dex(\"%\")';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'bytecode not found' "
                "WHERE msg LIKE 'No method source set for method %' OR "
                " msg LIKE '% is an system library method.' OR "
                " msg LIKE '% is an unknown method.';"
            )
        )
        con.commit()
        # Default
        # default = " || '|' || ".join(map(lambda s: f"COALESCE({s}, '')", ERROR_CARACT))
        # cur.execute(f"UPDATE error SET cause = {default} WHERE cause = '';")
        # Fallback bucket for everything the rules above did not match.
        cur.execute("UPDATE error SET cause = 'other' WHERE cause = '';")
        con.commit()
def radar_cause_estimation(
    db: Path,
    tools: list[str] | None,
    interactive: bool,
    folder: Path | None,
):
    """Radar-chart the estimated error causes, per tool and combined.

    Prints the top-10 causes per tool to stdout (debug output), draws one
    radar chart per tool, then one combined chart with all tools. Assumes
    `estimate_cause` has already filled the `cause` column — the call here
    is commented out; TODO confirm it is run elsewhere.
    """
    # estimate_cause(db)
    if tools is None:
        tools = get_list_tools(db)
    with sqlite3.connect(db, timeout=60) as con:
        cur = con.cursor()
        # All distinct cause labels define the radar chart axes.
        causes = [
            v for v, in cur.execute("SELECT DISTINCT cause FROM error;").fetchall()
        ]
        for tool in tools:
            print(f"tool: {tool}")
            for cause, count in cur.execute(
                (
                    "SELECT cause, COUNT(*) AS cnt "
                    "FROM error "
                    "WHERE tool_name = ? "
                    "GROUP BY cause "
                    "ORDER BY cnt DESC LIMIT 10;"
                ),
                (tool,),
            ):
                print(f"{count: 6}: {cause}")
            print()
    values = []
    labels = tools
    for tool in tools:
        # One count per cause axis, defaulting to 0 for unseen causes.
        vals = [0 for _ in causes]
        with sqlite3.connect(db) as con:
            cur = con.cursor()
            for cause, cnt in cur.execute(
                (
                    "SELECT distinct_cause.cause, COUNT(*) AS cnt "
                    f"FROM {DISTINCT_CAUSES} "
                    "WHERE distinct_cause.cause != '' AND distinct_cause.tool_name = ? "
                    "GROUP BY distinct_cause.cause;"
                ),
                (tool,),
            ):
                print(f"{tool=}, {cause=}, {cnt=}")
                if cause in causes:
                    vals[causes.index(cause)] = cnt
        print(f"{tool=}, {vals=}")
        radar_chart(
            causes, [vals], [tool], f"Causes of error for {tool}", interactive, folder
        )
        values.append(vals)
    radar_chart(causes, values, labels, "Causes of error", interactive, folder)
def get_common_errors(
    db: Path,
    tool: Optional[str] = None,
    status: Optional[str] = None,
    use_androguard: Optional[bool] = None,
    use_java: Optional[bool] = None,
    use_prolog: Optional[bool] = None,
    use_ruby: Optional[bool] = None,
    use_soot: Optional[bool] = None,
    use_apktool: Optional[bool] = None,
    use_ocaml: Optional[bool] = None,
    use_python: Optional[bool] = None,
    use_scala: Optional[bool] = None,
    folder: Optional[Path] = None,
    limit: int = 10,
):
    """Write a CSV of the most common errors matching the given filters.

    Deduplicated errors are joined with the tool feature matrix and the
    execution table, filtered by the optional tool name, tool_status and
    `use_*` feature flags, grouped by their full characterisation and
    ordered by occurrence count. Output goes to stdout when `folder` is
    None; otherwise to an auto-named csv file inside `folder`.
    """
    args: dict[str, Any] = {"limit": limit}
    clauses = []
    if tool is not None:
        clauses.append("(distinct_error.tool_name = :tool)")
        args["tool"] = tool
    if status is not None:
        clauses.append("(exec.tool_status = :tool_status)")
        args["tool_status"] = status
    # Feature filters, in the order used for the output filename.
    # BUG FIX: use_androguard was accepted (and used in the filename) but
    # never added to the WHERE clause, so that filter was silently ignored.
    feature_filters = [
        ("use_androguard", use_androguard),
        ("use_java", use_java),
        ("use_prolog", use_prolog),
        ("use_ruby", use_ruby),
        ("use_soot", use_soot),
        ("use_apktool", use_apktool),
        ("use_ocaml", use_ocaml),
        ("use_python", use_python),
        ("use_scala", use_scala),
    ]
    for feature, value in feature_filters:
        if value is not None:
            clauses.append(f"(tool.{feature} = :{feature})")
            args[feature] = value
    where_clause = ""
    if clauses:
        where_clause = f"WHERE {' AND '.join(clauses)} "
    if folder is None:
        out = sys.stdout
    else:
        # Generate a filename describing the applied filters.
        tool_str = "" if tool is None else f"_for_{tool}"
        status_str = "" if status is None else f"_when_{status}"
        if all(value is None for _, value in feature_filters):
            features_str = ""
        else:
            features_str = "_using"
            for feature, value in feature_filters:
                if value:
                    features_str += "_" + feature.removeprefix("use_")
        name = f"{limit}_most_common_errors{tool_str}{status_str}{features_str}.csv"
        # Make sure the folder exists.
        folder.mkdir(parents=True, exist_ok=True)
        out = (folder / name).open("w")
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        writer = csv.DictWriter(out, fieldnames=["error", "msg", "count"])
        writer.writeheader()
        for row in cur.execute(
            (
                f"SELECT COUNT(*) AS cnt, {', '.join(ERROR_CARACT)} "
                f"FROM {DISTINCT_ERRORS} "
                "INNER JOIN tool ON distinct_error.tool_name = tool.tool_name "
                "INNER JOIN exec ON "
                "  distinct_error.tool_name = exec.tool_name AND "
                "  distinct_error.sha256 = exec.sha256 "
                f"{where_clause}"
                f"GROUP BY {', '.join(ERROR_CARACT)} "
                "ORDER BY cnt DESC LIMIT :limit;"
            ),
            args,
        ):
            row_d = {k: v for (k, v) in zip(("cnt", *ERROR_CARACT), row)}
            writer.writerow(reduce_error_row(row_d))
    if folder is not None:
        out.close()
def reduce_error_row(row: dict[str, Any]) -> dict[str, Any]:
    """Reduce an error from an sqlite row to a simpler row for svg.

    The input row holds the columns (cnt, error, msg, file, function,
    level, origin, raised_info, called_info); the output row has only
    "error", "msg" (a space-joined summary of all the fields) and "count".

    BUGFIX: the summary previously concatenated `called_info` twice and
    never used the `raised_info` column.
    """

    def _pad(key: str) -> str:
        # Non-empty values get a trailing separator space; None/"" become "".
        value = row[key]
        return f"{value} " if value else ""

    new_row: dict[str, Any] = {}
    new_row["error"] = row["error"]
    new_row["msg"] = (
        _pad("level")
        + _pad("error")
        + _pad("msg")
        + _pad("raised_info")
        + _pad("called_info")
        + _pad("file")
        + _pad("function")
        + _pad("origin")
    )
    new_row["count"] = row["cnt"]
    return new_row
def get_common_error_classes(
    db: Path,
    tool: Optional[str] = None,
    status: Optional[str] = None,
    use_androguard: Optional[bool] = None,
    use_java: Optional[bool] = None,
    use_prolog: Optional[bool] = None,
    use_ruby: Optional[bool] = None,
    use_soot: Optional[bool] = None,
    use_apktool: Optional[bool] = None,
    use_ocaml: Optional[bool] = None,
    use_python: Optional[bool] = None,
    use_scala: Optional[bool] = None,
    folder: Optional[Path] = None,
    limit: int = 10,
):
    """Get the most common errors classes.

    Counts the distinct (error, error_type) pairs that match the given
    filters and writes the `limit` most frequent ones as csv, either to
    stdout or to a generated file in `folder`.

    Args:
        db: path to the sqlite database.
        tool: restrict to executions of this tool.
        status: restrict to executions that ended with this status.
        use_*: restrict to tools with (True) / without (False) the feature;
            None means no filtering on that feature.
        folder: if given, write to a generated csv file in this folder,
            otherwise write to stdout.
        limit: number of error classes to report.
    """
    args: dict[str, Any] = {"limit": limit}
    clauses = []
    if tool is not None:
        clauses.append("(distinct_error.tool_name = :tool)")
        args["tool"] = tool
    if status is not None:
        clauses.append("(exec.tool_status = :tool_status)")
        args["tool_status"] = status
    # BUGFIX: use_androguard was accepted (and encoded in the generated
    # file name below) but never added to the WHERE clause, so the filter
    # was silently ignored.
    if use_androguard is not None:
        clauses.append("(tool.use_androguard = :use_androguard)")
        args["use_androguard"] = use_androguard
    if use_java is not None:
        clauses.append("(tool.use_java = :use_java)")
        args["use_java"] = use_java
    if use_prolog is not None:
        clauses.append("(tool.use_prolog = :use_prolog)")
        args["use_prolog"] = use_prolog
    if use_ruby is not None:
        clauses.append("(tool.use_ruby = :use_ruby)")
        args["use_ruby"] = use_ruby
    if use_soot is not None:
        clauses.append("(tool.use_soot = :use_soot)")
        args["use_soot"] = use_soot
    if use_apktool is not None:
        clauses.append("(tool.use_apktool = :use_apktool)")
        args["use_apktool"] = use_apktool
    if use_ocaml is not None:
        clauses.append("(tool.use_ocaml = :use_ocaml)")
        args["use_ocaml"] = use_ocaml
    if use_python is not None:
        clauses.append("(tool.use_python = :use_python)")
        args["use_python"] = use_python
    if use_scala is not None:
        clauses.append("(tool.use_scala = :use_scala)")
        args["use_scala"] = use_scala
    where_clause = ""
    if clauses:
        where_clause = f"WHERE {' AND '.join(clauses)}"
    if folder is None:
        out = sys.stdout
    else:
        # Generate filename encoding every active filter.
        features = [
            use_androguard,
            use_java,
            use_prolog,
            use_ruby,
            use_soot,
            use_apktool,
            use_ocaml,
            use_python,
            use_scala,
        ]
        if tool is None:
            tool_str = ""
        else:
            tool_str = f"_for_{tool}"
        if status is None:
            status_str = ""
        else:
            status_str = f"_when_{status}"
        if all(map(lambda x: x is None, features)):
            features_str = ""
        else:
            features_str = "_using"
            if use_androguard:
                features_str += "_androguard"
            if use_java:
                features_str += "_java"
            if use_prolog:
                features_str += "_prolog"
            if use_ruby:
                features_str += "_ruby"
            if use_soot:
                features_str += "_soot"
            if use_apktool:
                features_str += "_apktool"
            if use_ocaml:
                features_str += "_ocaml"
            if use_python:
                features_str += "_python"
            if use_scala:
                features_str += "_scala"
        name = f"{limit}_most_common_errors_classes{tool_str}{status_str}{features_str}.csv"
        # make sure the folder exist
        folder.mkdir(parents=True, exist_ok=True)
        out = (folder / name).open("w")
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        writer = csv.DictWriter(out, fieldnames=["type", "error", "count"])
        writer.writeheader()
        for row in cur.execute(
            (
                f"SELECT COUNT(*) AS cnt, distinct_error.error, distinct_error.error_type "
                f"FROM {DISTINCT_ERROR_CLASS} "
                "INNER JOIN tool ON distinct_error.tool_name = tool.tool_name "
                "INNER JOIN exec ON "
                " distinct_error.tool_name = exec.tool_name AND "
                " distinct_error.sha256 = exec.sha256 "
                f"{where_clause} "
                f"GROUP BY distinct_error.error, distinct_error.error_type "
                "ORDER BY cnt DESC LIMIT :limit;"
            ),
            args,
        ):
            # SELECT order is (cnt, error, error_type).
            row_d = {k: v for (k, v) in zip(("count", "error", "type"), row)}
            writer.writerow(row_d)
    if folder is not None:
        out.close()
def get_nb_error(
    db: Path,
    folder: Optional[Path] = None,
):
    """Write the average number of errors per execution, per tool and status.

    Outputs a csv (stdout or `average_number_of_error_by_exec.csv` in
    `folder`) with, for each status, one row of averages and one row of
    standard deviations, one column per tool.
    """
    # Subquery: number of error rows for every (tool, apk) pair, including
    # pairs with zero errors (hence the CROSS JOIN + LEFT JOIN).
    NB_ERR = (
        "("
        "SELECT "
        " exec_id.tool_name, exec_id.sha256, COUNT(error._rowid_) AS nb_err "
        "FROM ("
        " (SELECT tool_name FROM tool) CROSS JOIN (SELECT sha256 FROM apk)"
        ") AS exec_id LEFT JOIN error "
        "ON exec_id.tool_name=error.tool_name AND exec_id.sha256=error.sha256 "
        "GROUP BY exec_id.tool_name, exec_id.sha256"
        ") AS nb_err"
    )
    stats = {}
    tool_names = set()
    with sqlite3.connect(db) as con:
        query = (
            "SELECT nb_err.tool_name, exec.tool_status, AVG(nb_err.nb_err), "
            " AVG(nb_err.nb_err*nb_err.nb_err) - AVG(nb_err.nb_err)*AVG(nb_err.nb_err) "
            f"FROM {NB_ERR} "
            "INNER JOIN exec ON nb_err.tool_name = exec.tool_name AND nb_err.sha256 = exec.sha256 "
            "GROUP BY nb_err.tool_name, exec.tool_status;"
        )
        for tool_name, tool_status, average, variance in con.execute(query):
            tool_names.add(tool_name)
            stats[(tool_name, tool_status)] = (average, variance)
    header = ["", *sorted(tool_names)]
    if folder is None:
        out = sys.stdout
    else:
        out = (folder / "average_number_of_error_by_exec.csv").open("w")
    writer = csv.DictWriter(out, fieldnames=header)
    writer.writeheader()
    for tool_status in ("FINISHED", "FAILED", "TIMEOUT"):
        avg_row = {"": tool_status}
        std_row = {"": "standard deviation"}
        for tool_name in tool_names:
            average, variance = stats.get((tool_name, tool_status), (0, 0))
            avg_row[tool_name] = round(average, 2)
            std_row[tool_name] = round(variance ** (1 / 2), 2)
        writer.writerow(avg_row)
        writer.writerow(std_row)
    if folder is not None:
        out.close()
def error_type_repartition(
    db: Path, interactive: bool = True, folder: Optional[Path] = None
):
    """Plot a heatmap of the most common error types raised by each tool.

    For every tool the 3 most frequent error types are selected; the
    heatmap shows, for each selected error type, the percentage it
    represents among all non-empty errors raised by each tool.

    Args:
        db: path to the sqlite database.
        interactive: if True, display the figure.
        folder: if given, save the figure as a pdf in this folder.
    """
    # data[tool][error] = number of occurrences of `error` for `tool`
    data: dict[str, dict[str, int]] = {}
    # total[tool] = total number of non-empty errors raised by `tool`
    total: dict[str, int] = {}
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for tool, err, n in cur.execute(
            "SELECT tool_name, error, COUNT(*) FROM error GROUP BY tool_name, error;"
        ):
            if tool not in data:
                data[tool] = {}
                total[tool] = 0
            # Ignore rows with no error type (NULL or empty string).
            if err is not None and err != "":
                data[tool][err] = n
        for tool, n in cur.execute(
            "SELECT tool_name, COUNT(*) FROM error WHERE error IS NOT NULL AND error != '' GROUP BY tool_name;"
        ):
            total[tool] = n
    # Keep only the N most frequent error types of each tool.
    errors = set()
    N = 3
    for tool in data:
        for err in sorted(
            [err for err in data[tool]], key=lambda err: data[tool][err], reverse=True
        )[:N]:
            # TODO Check of > 10%?
            errors.add(err)
    tools = sorted(data.keys())
    errors_l = sorted(errors)
    # values[i][j] = % that errors_l[i] represents among tools[j]'s errors
    values = [
        [
            data[tool].get(err, 0) * 100 / total[tool] if total[tool] != 0 else 0
            for tool in tools
        ]
        for err in errors_l
    ]
    plt.figure(figsize=(22, 20))
    im = plt.imshow(values, cmap="Greys")
    cbar = plt.colorbar(im)
    cbar.ax.set_ylabel(
        "% of the error type among the error raised by the tool",
        rotation=-90,
        va="bottom",
    )
    import numpy as np

    plt.xticks(np.arange(len(tools)), labels=tools, rotation=80)
    plt.yticks(np.arange(len(errors_l)), labels=errors_l)
    # Minor ticks at cell boundaries draw the white grid between cells.
    plt.xticks(np.arange(len(tools) + 1) - 0.5, minor=True)
    plt.yticks(np.arange(len(errors_l) + 1) - 0.5, minor=True)
    plt.grid(which="minor", color="w", linestyle="-", linewidth=3)
    plt.tick_params(which="minor", bottom=False, left=False)
    plt.title("Repartition of error types among tools")
    # plt.figure().set_figheight(10)
    render(
        "Repartition of error types among tools",
        interactive,
        folder,
        tight_layout=False,
    )

View file

@ -0,0 +1,62 @@
import sqlite3
import sys
import csv
from pathlib import Path
from typing import Optional
def get_ressource(
    db: Path,
    folder: Optional[Path] = None,
):
    """Write the average runtime and memory usage per tool and status.

    Produces two csv outputs (stdout, or `average_time.csv` and
    `average_mem.csv` in `folder`): for each status, one row of averages
    followed by one row of standard deviations, one column per tool.
    """
    time_stats = {}
    mem_stats = {}
    tool_names = set()
    with sqlite3.connect(db) as con:
        rows = con.execute(
            "SELECT tool_name, exec.tool_status, "
            " AVG(time), AVG(time*time) - AVG(time)*AVG(time), "
            " AVG(max_rss_mem), AVG(max_rss_mem*max_rss_mem) - AVG(max_rss_mem)*AVG(max_rss_mem) "
            "FROM exec "
            "GROUP BY tool_name, tool_status;"
        )
        for tool_name, tool_status, avg_time, var_time, avg_mem, var_mem in rows:
            tool_names.add(tool_name)
            # A NULL variance (e.g. all values NULL) is treated as 0.
            time_stats[(tool_name, tool_status)] = (avg_time, (var_time or 0) ** (1 / 2))
            mem_stats[(tool_name, tool_status)] = (avg_mem, (var_mem or 0) ** (1 / 2))
    header = ["", *sorted(tool_names)]
    if folder is None:
        time_out = sys.stdout
        mem_out = sys.stdout
    else:
        time_out = (folder / "average_time.csv").open("w")
        mem_out = (folder / "average_mem.csv").open("w")
    time_writer = csv.DictWriter(time_out, fieldnames=header)
    mem_writer = csv.DictWriter(mem_out, fieldnames=header)
    time_writer.writeheader()
    mem_writer.writeheader()
    for tool_status in ("FINISHED", "FAILED", "TIMEOUT"):
        # For each status: time averages, mem averages, then both std devs.
        for index, label in ((0, tool_status), (1, "standard deviation")):
            for writer, stats in ((time_writer, time_stats), (mem_writer, mem_stats)):
                row = {"": label}
                for tool_name in tool_names:
                    row[tool_name] = round(
                        stats.get((tool_name, tool_status), (0, 0))[index], 2
                    )
                writer.writerow(row)
    if folder is not None:
        time_out.close()
        mem_out.close()

View file

@ -0,0 +1,446 @@
"""
Plots related to the tool status.
"""
import numpy as np
import sqlite3
from pathlib import Path
from matplotlib import pyplot as plt # type: ignore
from typing import Any, Callable, Optional
from .utils import (
render,
DENSE_DASH,
DENSE_DOT,
get_list_tools,
plot_generic,
MARKERS,
COLORS,
)
from .populate_db_tool import TOOL_INFO
# Line style used to plot each tool: dotted for Soot-based tools,
# dashed for the others (styles defined in .utils).
TOOL_LINE_STYLE = {
    tool_info["tool_name"]: DENSE_DOT if tool_info["use_soot"] else DENSE_DASH
    for tool_info in TOOL_INFO
}
def plot_status_by_tool(
    db: Path,
    interactive: bool = True,
    image_path: Path | None = None,
    tools: list[str] | None = None,
    title: str = "Exit Status",
):
    """Plot the repartition of status by tools.

    Draws one stacked bar per tool showing the percentage of apks whose
    analysis FINISHED, timed out (TIMEOUT), FAILED, or has no recorded
    status ("Other"), and prints each tool's finishing rate on stdout.

    Args:
        db: path to the sqlite database.
        interactive: if True, display the figure.
        image_path: if given, save the figure as a pdf in this folder.
        tools: tools to plot; defaults to every tool found in the database.
        title: title of the figure (used for the saved file name).
    """
    if tools is None:
        tools = get_list_tools(db)
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # One '?' placeholder per tool for the IN clause.
        tools_list_format = f"({','.join(['?' for _ in tools])})"
        nb_apk = cur.execute("SELECT COUNT(*) FROM apk;").fetchone()[0]
        status = cur.execute(
            (
                "SELECT tool_name, tool_status, COUNT(sha256) "
                "FROM exec "
                f"WHERE tool_name IN {tools_list_format}"
                "GROUP BY tool_name, tool_status;"
            ),
            tools,
        ).fetchall()
    # occurences[(tool, status)] = number of executions with that status
    occurences = {}
    for tool, stat, occurence in status:
        occurences[(tool, stat)] = occurence
    # tools.sort(key=lambda t: occurences.get((t, "FINISHED"), 0), reverse=True)
    tools.sort()
    values = {
        "Finished": np.zeros(len(tools)),
        "Time Out": np.zeros(len(tools)),
        "Other": np.zeros(len(tools)),
        "Failed": np.zeros(len(tools)),
    }
    colors = {
        "Finished": "#009E73",
        "Time Out": "#56B4E9",
        "Failed": "#D55E00",
        "Other": "#555555",  # TODO: better color
    }
    hatch = {
        "Finished": "/",
        "Time Out": "x",
        "Failed": "\\",
        "Other": ".",
    }
    for i, tool in enumerate(tools):
        values["Finished"][i] = occurences.get((tool, "FINISHED"), 0)
        values["Time Out"][i] = occurences.get((tool, "TIMEOUT"), 0)
        values["Failed"][i] = occurences.get((tool, "FAILED"), 0)
        # Apks with no FINISHED/TIMEOUT/FAILED record for this tool.
        values["Other"][i] = (
            nb_apk - values["Finished"][i] - values["Time Out"][i] - values["Failed"][i]
        )
    # Convert counts to percentages of the analysed apks.
    values["Finished"] = (100 * values["Finished"]) / nb_apk
    values["Time Out"] = (100 * values["Time Out"]) / nb_apk
    values["Failed"] = (100 * values["Failed"]) / nb_apk
    values["Other"] = (100 * values["Other"]) / nb_apk
    # BUGFIX: removed a dead `bottom = np.zeros(len(tools) * 2)` that was
    # immediately overwritten by the line below.
    bottom = np.zeros(len(tools))
    print("Finishing rate:")
    for t, p in zip(tools, values["Finished"]):
        print(f"{t}: {p:.2f}%")
    plt.figure(figsize=(20, 9), dpi=80)
    # Dotted reference lines at 15%, 50% and 85%.
    plt.axhline(y=50, linestyle="dotted")
    plt.axhline(y=85, linestyle="dotted")
    plt.axhline(y=15, linestyle="dotted")
    for stat in ["Finished", "Time Out", "Other", "Failed"]:
        plt.bar(
            tools,
            values[stat],
            label=stat,
            color=colors[stat],
            hatch=hatch[stat],
            bottom=bottom,
            width=0.6,
            edgecolor="black",
        )
        bottom += values[stat]
    plt.xticks(tools, tools, rotation=80)
    plt.legend()
    plt.ylabel("% of analysed apk")
    render(title, interactive, image_path)
def plot_status_by_tool_and_malware(
    db: Path,
    interactive: bool = True,
    image_path: Path | None = None,
    tools: list[str] | None = None,
    title: str = "Exit Status Goodware/Malware",
):
    """Plot the repartition of status by tools and if apk is a malware.

    For each tool, two stacked bars are drawn side by side: the exit
    status repartition (in %) on goodware (vt_detection == 0) and on
    malware (vt_detection != 0).

    Args:
        db: path to the sqlite database.
        interactive: if True, display the figure.
        image_path: if given, save the figure as a pdf in this folder.
        tools: tools to plot; defaults to every tool found in the database.
        title: title of the figure (used for the saved file name).
    """
    if tools is None:
        tools = get_list_tools(db)
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # One '?' placeholder per tool for the IN clause.
        tools_list_format = f"({','.join(['?' for _ in tools])})"
        nb_goodware = cur.execute(
            "SELECT COUNT(*) FROM apk WHERE vt_detection == 0;"
        ).fetchone()[0]
        nb_malware = cur.execute(
            "SELECT COUNT(*) FROM apk WHERE vt_detection != 0;"
        ).fetchone()[0]
        status = cur.execute(
            (
                "SELECT tool_name, tool_status, COUNT(exec.sha256), vt_detection != 0 "
                "FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 "
                f"WHERE tool_name IN {tools_list_format} "
                "GROUP BY tool_name, tool_status, vt_detection != 0;"
            ),
            tools,
        ).fetchall()
    # occurences[(tool, status, is_malware)] = number of executions
    occurences = {}
    for tool, stat, occurence, malware in status:
        occurences[(tool, stat, bool(malware))] = occurence
    # tools.sort(
    #     key=lambda t: occurences.get((t, "FINISHED", True), 0)
    #     + occurences.get((t, "FINISHED", False), 0),
    #     reverse=True,
    # )
    tools.sort()
    # Index 2*i holds the goodware bar of tools[i], 2*i+1 its malware bar.
    values = {
        "Finished": np.zeros(len(tools) * 2),
        "Time Out": np.zeros(len(tools) * 2),
        "Other": np.zeros(len(tools) * 2),
        "Failed": np.zeros(len(tools) * 2),
    }
    colors = {
        "Finished": "#009E73",
        "Time Out": "#56B4E9",
        "Other": "#555555",  # TODO: find beter color
        "Failed": "#D55E00",
    }
    hatch = {
        "Finished": "/",
        "Time Out": "x",
        "Other": ".",
        "Failed": "\\",
    }
    for i, tool in enumerate(tools):
        i_goodware = 2 * i
        i_malware = 2 * i + 1
        values["Finished"][i_goodware] = occurences.get((tool, "FINISHED", False), 0)
        values["Finished"][i_malware] = occurences.get((tool, "FINISHED", True), 0)
        values["Time Out"][i_goodware] = occurences.get((tool, "TIMEOUT", False), 0)
        values["Time Out"][i_malware] = occurences.get((tool, "TIMEOUT", True), 0)
        values["Failed"][i_goodware] = occurences.get((tool, "FAILED", False), 0)
        values["Failed"][i_malware] = occurences.get((tool, "FAILED", True), 0)
        # Apks with no FINISHED/TIMEOUT/FAILED record for this tool.
        values["Other"][i_goodware] = (
            nb_goodware
            - values["Finished"][i_goodware]
            - values["Time Out"][i_goodware]
            - values["Failed"][i_goodware]
        )
        values["Other"][i_malware] = (
            nb_malware
            - values["Finished"][i_malware]
            - values["Time Out"][i_malware]
            - values["Failed"][i_malware]
        )
        # Normalize each bar to a percentage of its own population,
        # guarding against empty goodware/malware sets.
        values["Finished"][i_goodware] = (
            0
            if nb_goodware == 0
            else (100 * values["Finished"][i_goodware]) / nb_goodware
        )
        values["Finished"][i_malware] = (
            0 if nb_malware == 0 else (100 * values["Finished"][i_malware]) / nb_malware
        )
        values["Time Out"][i_goodware] = (
            0
            if nb_goodware == 0
            else (100 * values["Time Out"][i_goodware]) / nb_goodware
        )
        values["Time Out"][i_malware] = (
            0 if nb_malware == 0 else (100 * values["Time Out"][i_malware]) / nb_malware
        )
        values["Failed"][i_goodware] = (
            0
            if nb_goodware == 0
            else (100 * values["Failed"][i_goodware]) / nb_goodware
        )
        values["Failed"][i_malware] = (
            0 if nb_malware == 0 else (100 * values["Failed"][i_malware]) / nb_malware
        )
        values["Other"][i_goodware] = (
            0 if nb_goodware == 0 else (100 * values["Other"][i_goodware]) / nb_goodware
        )
        values["Other"][i_malware] = (
            0 if nb_malware == 0 else (100 * values["Other"][i_malware]) / nb_malware
        )
    bottom = np.zeros(len(tools) * 2)
    # Bar positions: the two bars of a tool are `lstep` apart, consecutive
    # tools are separated by `bstep`.
    x_axis = np.zeros(len(tools) * 2)
    x_width = 3
    x_0 = x_width / 2
    lstep = 1
    bstep = 5
    for i in range(len(tools)):
        x_0 += bstep + x_width
        x_axis[2 * i] = x_0
        x_0 += lstep + x_width
        x_axis[2 * i + 1] = x_0
    # Only the goodware bar is labeled; the malware bar sits right next to it.
    tick_legend = []
    for tool in tools:
        tick_legend.append(f"{tool}")  # (f"{tool} on goodware")
        tick_legend.append("")  # (f"{tool} on malware")
    plt.figure(figsize=(20, 9), dpi=80)
    for stat in ["Finished", "Time Out", "Other", "Failed"]:
        plt.bar(
            x_axis,
            values[stat],
            label=stat,
            color=colors[stat],
            hatch=hatch[stat],
            bottom=bottom,
            width=x_width,
            edgecolor="black",
        )
        bottom += values[stat]
    plt.xticks(x_axis, tick_legend, rotation=80)
    plt.legend()
    plt.ylabel("% of analysed apk")
    render(title, interactive, image_path)
def plot_status_by_generic_x(
    tools: list[str],
    x_col: str,
    x_label: str,
    x_in_title: str,
    args,
    group_by: Optional[str] = None,
):
    """Plot the finishing rate of each tool against a generic apk property.

    group_by default to x_col, x_col must be uniq for a grouped by group_by

    For each tool, plots the percentage of apks whose execution FINISHED
    (split malware/goodware and combined), grouped by `group_by` and
    plotted against `x_col`.

    BUGFIX: the docstring was placed after the first statement of the
    function, so it was a no-op string expression instead of a docstring.

    Args:
        tools: the tools to plot.
        x_col: apk column used as the x axis value.
        x_label: label of the x axis.
        x_in_title: human readable name of x_col, used in the figure titles.
        args: parsed cli arguments (uses .data, .display, .figures_file).
        group_by: apk column used to group the apks; defaults to x_col.
    """
    tools.sort()
    if group_by is None:
        group_by = x_col
    with sqlite3.connect(args.data) as con:
        cur = con.cursor()
        # Number of goodware apks in each group (rate denominator).
        nb_goodware_res = cur.execute(
            f"SELECT {group_by}, COUNT(*) FROM apk WHERE vt_detection == 0 GROUP BY {group_by};",
        ).fetchall()
        nb_goodware = {}
        for x_group, count in nb_goodware_res:
            nb_goodware[x_group] = count
        # Number of malware apks in each group.
        nb_malware_res = cur.execute(
            f"SELECT {group_by}, COUNT(*) FROM apk WHERE vt_detection != 0 GROUP BY {group_by};",
        ).fetchall()
        nb_malware = {}
        for x_group, count in nb_malware_res:
            nb_malware[x_group] = count
        # FINISHED executions per (tool, group, malware-ness).
        statuses_res = cur.execute(
            (
                f"SELECT tool_name, {x_col}, {group_by}, COUNT(exec.sha256), vt_detection != 0 "
                "FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 "
                f"WHERE tool_status = 'FINISHED' "
                f"GROUP BY tool_name, tool_status, {group_by}, vt_detection != 0 "
                f"HAVING {x_col} IS NOT NULL;"
            )
        ).fetchall()
    # tots[(tool, group)] = [x value of the group, total FINISHED count]
    tots = {}
    for tool_, x_val, x_group, count, is_malware in statuses_res:
        if (tool_, x_group) not in tots:
            tots[(tool_, x_group)] = [x_val, 0]
        tots[(tool_, x_group)][1] += count
    plots = []
    plots_malgood = []
    metas = []
    metas_malgood = []
    for tool in tools:
        malware_plot = [
            (x_val, 100 * count / nb_malware[x_group])
            for (tool_, x_val, x_group, count, is_malware) in statuses_res
            if (tool_ == tool) and is_malware and nb_malware.get(x_group, 0) != 0
        ]
        malware_meta = (f"{tool} on malware", DENSE_DOT, MARKERS[tool], COLORS[tool])
        goodware_plot = [
            (x_val, 100 * count / nb_goodware[x_group])
            for (tool_, x_val, x_group, count, is_malware) in statuses_res
            if (tool_ == tool) and not is_malware and nb_goodware.get(x_group, 0) != 0
        ]
        goodware_meta = (f"{tool} on goodware", DENSE_DASH, MARKERS[tool], COLORS[tool])
        total_plot = [
            (
                x_val,
                100
                * count
                / (nb_malware.get(x_group, 0) + nb_goodware.get(x_group, 0)),
            )
            for ((tool_, x_group), (x_val, count)) in tots.items()
            if (tool_ == tool)
            and (nb_malware.get(x_group, 0) + nb_goodware.get(x_group, 0)) != 0
        ]
        total_meta = (f"{tool}", DENSE_DOT, MARKERS[tool], COLORS[tool])
        plots.append(total_plot)
        plots_malgood.append(malware_plot)
        plots_malgood.append(goodware_plot)
        metas.append(total_meta)
        metas_malgood.append(malware_meta)
        metas_malgood.append(goodware_meta)
        # Per-tool figures: malware vs goodware, then combined.
        plot_generic(
            [goodware_plot, malware_plot],
            [goodware_meta, malware_meta],
            x_label,
            "finishing rate",
            f"Finishing Rate by {x_in_title} for {tool} on malware and goodware",
            ylim=(-5, 105),
            interactive=args.display,
            image_path=args.figures_file,
        )
        plot_generic(
            [total_plot],
            [total_meta],
            x_label,
            "finishing rate",
            f"Finishing Rate by {x_in_title} for {tool}",
            ylim=(-5, 105),
            interactive=args.display,
            image_path=args.figures_file,
        )
    # Aggregated figures with every tool.
    plot_generic(
        plots_malgood,
        metas_malgood,
        x_label,
        "finishing rate",
        f"Finishing Rate by {x_in_title} on malware and goodware",
        ylim=(-5, 105),
        interactive=args.display,
        image_path=args.figures_file,
    )
    plot_generic(
        plots,
        metas,
        x_label,
        "finishing rate",
        f"Finishing Rate by {x_in_title}",
        ylim=(-5, 105),
        interactive=args.display,
        image_path=args.figures_file,
    )
def dbg(arg):
    """Identity pass-through, kept as a hook for ad-hoc query tracing."""
    # Uncomment to print the SQL queries built by the callers:
    # print(arg)
    return arg
def plot_all_status_by_generic_x(
    tools: list[str],
    x_col: str,
    x_label: str,
    title: str,
    args,
    condition: Optional[str] = None,
    apk_condition: Optional[str] = None,
    group_by: Optional[str] = None,
):
    """Plot, on one figure, each tool's finishing rate against a generic x.

    Args:
        tools: the tools to plot.
        x_col: column used as the x axis value.
        x_label: label of the x axis.
        title: title of the figure (used for the saved file name).
        args: parsed cli arguments (uses .data, .display, .figures_file).
        condition: extra SQL condition on the exec/apk/tool join
            (written without the leading AND).
        apk_condition: extra SQL condition on the apk table; it is also
            applied to the join so both sides are filtered consistently.
        group_by: column used to group the apks; defaults to x_col.
    """
    # Normalize the two optional filters into SQL fragments: `condition`
    # is appended after "WHERE tool_status = 'FINISHED'" (hence the
    # leading AND), `apk_condition` becomes the WHERE of the apk count.
    if condition is None and apk_condition is None:
        condition = ""
        apk_condition = ""
    elif apk_condition is None:
        condition = f"AND ({condition})"
        apk_condition = ""
    elif condition is None:
        condition = f"AND ({apk_condition})"
        apk_condition = f"WHERE ({apk_condition})"
    else:
        condition = f"AND ({apk_condition}) AND ({condition})"
        apk_condition = f"WHERE ({apk_condition})"
    if group_by is None:
        group_by = x_col
    # nb_apk[group] = number of apks in the group (rate denominator)
    nb_apk = {}
    tools.sort()
    with sqlite3.connect(args.data) as con:
        cur = con.cursor()
        for x_group, count in cur.execute(
            f"SELECT {group_by}, COUNT(*) FROM apk {apk_condition} GROUP BY {group_by};",
        ):
            nb_apk[x_group] = count
        # FINISHED executions per (tool, group).
        statuses_res = cur.execute(
            dbg(
                f"SELECT exec.tool_name, {x_col}, {group_by}, COUNT(exec.sha256) "
                "FROM exec "
                " INNER JOIN apk ON exec.sha256 = apk.sha256 "
                " INNER JOIN tool ON exec.tool_name = tool.tool_name "
                f"WHERE tool_status = 'FINISHED' {condition} "
                f"GROUP BY exec.tool_name, tool_status, {group_by} "
                f"HAVING {x_col} IS NOT NULL;"
            )
        ).fetchall()
    plots = []
    metas = []
    for tool in tools:
        # One (x value, finishing rate %) point per group for this tool.
        plot = [
            (x_val, 100 * count / nb_apk[x_group])
            for (tool_, x_val, x_group, count) in statuses_res
            if (tool_ == tool) and nb_apk.get(x_group, 0) != 0
        ]
        if len(plot) == 0:
            continue
        meta = (tool, TOOL_LINE_STYLE[tool], MARKERS[tool], COLORS[tool])
        plots.append(plot)
        metas.append(meta)
    plot_generic(
        plots,
        metas,
        x_label,
        "finishing rate",
        title,
        ylim=(-5, 105),
        interactive=args.display,
        image_path=args.figures_file,
    )

View file

@ -0,0 +1,185 @@
"""
Utils.
"""
import matplotlib.pyplot as plt # type: ignore
import numpy as np
from slugify import slugify # type: ignore
from typing import Any, Callable, Optional
from pathlib import Path
import sqlite3
# Custom matplotlib linestyles, given as (offset, (on_len, off_len)) dash
# tuples: a tight dash and a tight dot pattern.
DENSE_DASH = (0, (5, 1))
DENSE_DOT = (0, (1, 3))
# Matplotlib marker used to plot each tool (some are mathtext symbols so
# every tool gets a visually distinct marker).
MARKERS = {
    "adagio": ".",
    "amandroid": "o",
    "anadroid": "X",
    "androguard": "+",
    "androguard_dad": "v",
    "apparecium": "d",
    "blueseal": "^",
    "dialdroid": "<",
    "didfail": ">",
    "droidsafe": r"$\circ$",
    "flowdroid": r"$\boxplus$",
    "gator": r"$\otimes$",
    "ic3": "1",
    "ic3_fork": "s",
    "iccta": "P",
    "mallodroid": r"$\divideontimes$",
    "perfchecker": "*",
    "redexer": "x",
    "saaf": "D",
    "wognsen_et_al": r"$\rtimes$",
}
# Color used to plot each tool. BUGFIX: the original literal defined 8 keys
# twice (adagio, androguard, mallodroid, androguard_dad, wognsen_et_al,
# amandroid, apparecium, redexer); Python silently keeps only the last
# value of a duplicated dict key, so the duplicates are removed here while
# preserving the effective (last-written) color of each tool.
COLORS = {
    "didfail": "#1f77b4",
    "adagio": "#1f77b4",
    "iccta": "#2ca02c",
    "androguard": "#ff7f0e",
    "gator": "#9467bd",
    "mallodroid": "#2ca02c",
    "dialdroid": "#e377c2",
    "androguard_dad": "#d62728",
    "wognsen_et_al": "#9467bd",
    "perfchecker": "#17becf",
    "amandroid": "#8c564b",
    "ic3": "#ff7f0e",
    "apparecium": "#e377c2",
    "blueseal": "#d62728",
    "droidsafe": "#9467bd",
    "redexer": "#7f7f7f",
    "anadroid": "#e377c2",
    "saaf": "#7f7f7f",
    "ic3_fork": "#bcbd22",
    "flowdroid": "#17becf",
}
def get_list_tools(db: Path) -> list[str]:
    """Get the list of tool found in the database."""
    with sqlite3.connect(db) as con:
        rows = con.execute("SELECT DISTINCT tool_name FROM exec;").fetchall()
    # Each row is a 1-tuple (tool_name,).
    return [name for (name,) in rows]
def radar_chart(
    axes: list[str],
    values: list[list[Any]],
    labels: list[str],
    title: str,
    interactive: bool,
    image_path: Path | None,
):
    """Draw a radar (spider) chart with one polygon per labelled series.

    Args:
        axes: name of each axis of the chart.
        values: one list of values per series, aligned with `axes`.
        labels: name of each series, aligned with `values`.
        title: figure title (used for the saved file name).
        interactive: if True, display the figure.
        image_path: if given, save the figure as a pdf in this folder.
    """
    plt.rc("grid", linewidth=1, linestyle="-")
    plt.rc("xtick", labelsize=15)
    plt.rc("ytick", labelsize=15)
    # One angle per axis, plus a copy of the first angle so the polygon
    # closes on itself.
    angles = np.linspace(0, 2 * np.pi, len(axes), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))  # type: ignore
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, polar=True)
    for label, vals in zip(labels, values):
        # Repeat the first value to close the polygon.
        vals = vals + [vals[0]]
        ax.plot(angles, vals, label=label, marker=MARKERS.get(label, "."))
        ax.fill(angles, vals, alpha=0.25)
    ax.set_thetagrids(angles[:-1] * 180 / np.pi, axes)
    ax.set_ylim(bottom=0)
    ax.grid(True)
    # Legend below the chart, at most 5 entries per row.
    ncol = min(5, len(labels))
    ax.legend(
        loc="lower left",
        bbox_to_anchor=(0.0, -0.2, ncol * 1.0 / 5, 0.102),
        ncol=ncol,
        mode="expand",
        borderaxespad=0.0,
        fancybox=True,
        shadow=True,
        fontsize="xx-small",
    )
    render(title, interactive, image_path)
def render(
    title: str, interactive: bool, image_path: Path | None, tight_layout: bool = True
):
    """Render the figure. If `interactive`, display if, if `image_path`, save it.

    The figure is saved as `<slugified title>.pdf` inside `image_path`,
    then the current matplotlib figure is closed either way.
    """
    # plt.title(title)
    if tight_layout:
        plt.tight_layout()
    if image_path is not None:
        # exist_ok=True already covers the pre-existing-directory case, so
        # the previous `if not image_path.exists()` guard was redundant.
        image_path.mkdir(parents=True, exist_ok=True)
        plt.savefig(image_path / (slugify(title) + ".pdf"), format="pdf")
    if interactive:
        plt.show()
    plt.close()
def mean(field: str) -> Callable[[list[Any]], float]:
    """Return a reduction computing the mean of `field` over a list of rows.

    `None` values are ignored, consistently with `median` below (the
    previous version crashed on a `None` value); an input with no usable
    value yields 0.0.
    """

    def compute_mean(data: list[Any]) -> float:
        values = [e[field] for e in data if e[field] is not None]
        if not values:
            return 0.0
        return sum(values) / len(values)

    return compute_mean
def median(field: str) -> Callable[[list[Any]], float]:
    """Return a reduction computing the median of `field` over a list of
    rows, ignoring `None` values (upper median for even-sized inputs;
    0.0 when no usable value remains).
    """

    def compute_median(data: list[Any]) -> float:
        values = sorted(e[field] for e in data if e[field] is not None)
        if not values:
            return 0.0
        return values[len(values) // 2]

    return compute_median
def plot_generic(
    data: list[list[tuple[Any, Any]]],
    meta: list[tuple[str, Any, Any, str]],
    x_label: str,
    y_label: str,
    title: str,
    ylim: Optional[tuple[int, int]] = None,
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Plot a list of curve represented by list[(x, y)]. meta is the list of (label, linestyle)
    for each plot.

    Args:
        data: one list of (x, y) points per curve.
        meta: per-curve (label, linestyle, marker, color), aligned with `data`.
        x_label: label of the x axis.
        y_label: label of the y axis.
        title: figure title (used for the saved file name).
        ylim: optional (bottom, top) y axis limits.
        interactive: if True, display the figure.
        image_path: if given, save the figure as a pdf in this folder.
    """
    plt.figure(figsize=(16, 9), dpi=80)
    for i, plot in enumerate(data):
        label, linestyle, marker, color = meta[i]
        # Sort the points by x so the curve is drawn left to right.
        # NOTE: sorts the caller's list in place.
        plot.sort(key=lambda p: p[0])
        x_values = np.array([x for (x, _) in plot])
        y_values = np.array([y for (_, y) in plot])
        # Filter out points with a NaN y value before plotting.
        plt.plot(
            x_values[~np.isnan(y_values)],
            y_values[~np.isnan(y_values)],
            label=label,
            marker=marker,
            color=color,
            linestyle=linestyle,
        )
    if ylim is not None:
        plt.ylim(ylim)
    # Legend below the axes so it never covers the curves.
    plt.legend(loc="upper center", ncol=4, bbox_to_anchor=(0.5, -0.1))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    render(title, interactive, image_path)