first commit
This commit is contained in:
commit
cd1e91bb99
287 changed files with 86425 additions and 0 deletions
1
rasta_data_manipulation/.gitattributes
vendored
Normal file
1
rasta_data_manipulation/.gitattributes
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
data.db filter=lfs diff=lfs merge=lfs -text
|
40
rasta_data_manipulation/.gitignore
vendored
Normal file
40
rasta_data_manipulation/.gitignore
vendored
Normal file
|
@ -0,0 +1,40 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
env/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# virtualenv
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
*.db
|
||||
year_and_sdk.csv.gz
|
||||
latest_with-added-date.csv.gz
|
||||
figs_drebin/
|
||||
figs_rasta/
|
||||
figs
|
35
rasta_data_manipulation/README.md
Normal file
35
rasta_data_manipulation/README.md
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Rasta Triturage
|
||||
|
||||
Triturage de donnée for the Rasta Project
|
||||
|
||||
## Usage
|
||||
|
||||
This project is managed by poetry (trying new things :-) ). To use it without poetry, you can install it as a python package in a venv:
|
||||
|
||||
```
|
||||
git clone git@gitlab.inria.fr:jmineau/rasta_triturage.git
|
||||
cd rasta_triturage
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install . -e
|
||||
```
|
||||
|
||||
The reports and information about the apk are in the prepopulated database `data.db` (TODO: add script to populate the DB)
|
||||
|
||||
To generate all the figures in the file `figures`:
|
||||
|
||||
```
|
||||
rasta-triturage -d data.db -f figures
|
||||
```
|
||||
|
||||
To display all the figures:
|
||||
|
||||
```
|
||||
rasta-triturage -d data.db --display
|
||||
```
|
||||
|
||||
The option `-t` allows specifying the tools to compare.
|
||||
|
||||
## Author
|
||||
|
||||
- annon
|
3
rasta_data_manipulation/TODO.md
Normal file
3
rasta_data_manipulation/TODO.md
Normal file
|
@ -0,0 +1,3 @@
|
|||
- Regression en nuage de points: mem by ceiling log feels bad
|
||||
- IC3: Venn diagram
|
||||
- time / mem for specific category
|
25
rasta_data_manipulation/extract_result.sh
Executable file
25
rasta_data_manipulation/extract_result.sh
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env bash
# Generate all the figures from the rasta/drebin result databases.
# usage: ./extract_result.sh DATA_DIR
#
# Fix: all variable expansions are now quoted so the script works when
# DATA_DIR contains spaces.

DATA_DIR=$1
if [[ -z "${DATA_DIR}" ]]; then
    echo 'MISSING DATA_DIR parameter'
    echo 'usage: ./extract_result.sh DATA_DIR'
    exit 1
fi
DATA_DIR="$(readlink -f "$DATA_DIR")"

DB="${DATA_DIR}/results/rasta.db"
DB_DREBIN="${DATA_DIR}/results/drebin.db"
FOLDER="figs"

# Exit status overview for both datasets, plus success rate by year.
rasta-status -d "${DB}" -f "${FOLDER}" --title "Exit status for the Rasta dataset"
rasta-status -d "${DB_DREBIN}" -f "${FOLDER}" --title "Exit status for the Drebin dataset"
rasta-success-year -d "${DB}" -f "${FOLDER}/by_year"

# Error analysis figures.
rasta-common-errors -d "${DB}" -f "${FOLDER}/common_err" -s FAILED
rasta-avg-nb-errors -d "${DB}" -f "${FOLDER}/common_err"
rasta-error-repartition -d "${DB}" -f "${FOLDER}"
rasta-avg-ressource -d "${DB}" -f "${FOLDER}"

# Factor decorrelation plots at two decile cut points.
rasta-decorelate-factor -d "${DB}" -f "${FOLDER}/decorelation" --decile 8
rasta-decorelate-factor -d "${DB}" -f "${FOLDER}/decorelation" --decile 6
|
25
rasta_data_manipulation/find_apks_by_tool_error.sh
Executable file
25
rasta_data_manipulation/find_apks_by_tool_error.sh
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env bash
# List the report files of the apks for which <tool> failed with <error>.
#
# Fixes:
# - the shebang was `sh` while the script uses the bash-only `[[ ]]` test;
# - `find` read the misspelled, undefined REPPORT_FOLDER variable (the
#   parameter is stored in REPORT_FOLDER), so the search always ran on the
#   current directory tree;
# - the special PWD shell variable is no longer overwritten;
# - expansions are quoted.

TOOL=${1}
ERROR=${2}
DATABASE=${3:-'rasta.db'}
REPORT_FOLDER=${4:-"$(pwd)/../data/reports/rasta"}

USAGE=$(cat <<- EOM
usage: ${0} <tool> <error> [<database> [<report folder>]]
EOM
)

if [[ -z "$TOOL" ]] || [[ -z "$ERROR" ]] || [[ -z "$DATABASE" ]] || [[ -z "$REPORT_FOLDER" ]] ; then
    echo "${USAGE}"
    exit 1
fi

TMP_FILE=$(mktemp)
# Collect the '<sha256>_-_<tool>' identifiers of the failed runs that raised
# the requested error.
sqlite3 "${DATABASE}" "SELECT DISTINCT error.sha256 || '_-_' || error.tool_name FROM error INNER JOIN exec ON error.tool_name = exec.tool_name AND error.sha256 = exec.sha256 WHERE exec.tool_status = 'FAILED' AND error.tool_name = '$TOOL' and error = '$ERROR';" > "${TMP_FILE}"

# Keep only the report files whose name matches one of the identifiers.
find "${REPORT_FOLDER}" | grep -F -f "${TMP_FILE}"
rm "${TMP_FILE}"
|
||||
|
||||
|
35
rasta_data_manipulation/make_db.sh
Executable file
35
rasta_data_manipulation/make_db.sh
Executable file
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/env bash
# Build the rasta.db and drebin.db result databases from the raw data in DATA_DIR.
# usage: ./make_db.sh DATA_DIR
#
# Fix: variable expansions are quoted (paths with spaces); the ten
# status_setN folder names are generated with brace expansion instead of a
# hand-written list.

DATA_DIR=$1
if [[ -z "${DATA_DIR}" ]]; then
    echo 'MISSING DATA_DIR parameter'
    echo 'usage: ./make_db.sh DATA_DIR'
    exit 1
fi
DATA_DIR="$(readlink -f "$DATA_DIR")"

# Rasta dataset: merge the per-set apk lists, then populate the apk, tool and
# report tables.
all_rasta_apk=$(mktemp)
cat "${DATA_DIR}"/dataset/set* > "${all_rasta_apk}"
rasta-populate-db-apk -a "${all_rasta_apk}" \
    -d "${DATA_DIR}/results/rasta.db" \
    --year-and-sdk "${DATA_DIR}/androzoo/year_and_sdk.csv.gz" \
    --latest-with-added-date "${DATA_DIR}/androzoo/latest_with-added-date.csv.gz" \
    --fix-dex-file
rasta-populate-db-tool -d "${DATA_DIR}/results/rasta.db"
for folder in status_set{0..9}; do
    rasta-populate-db-report -d "${DATA_DIR}/results/rasta.db" -r "${DATA_DIR}/results/reports/rasta/${folder}"
done
rasta-populate-db-report -d "${DATA_DIR}/results/rasta.db" --estimate-cause

# Drebin dataset: same pipeline, single report folder.
rasta-populate-db-apk -a "${DATA_DIR}/dataset/drebin" \
    -d "${DATA_DIR}/results/drebin.db" \
    --year-and-sdk "${DATA_DIR}/androzoo/year_and_sdk.csv.gz" \
    --latest-with-added-date "${DATA_DIR}/androzoo/latest_with-added-date.csv.gz" \
    --fix-dex-file
rasta-populate-db-tool -d "${DATA_DIR}/results/drebin.db"
rasta-populate-db-report -d "${DATA_DIR}/results/drebin.db" -r "${DATA_DIR}/results/reports/drebin/status_drebin"
rasta-populate-db-report -d "${DATA_DIR}/results/drebin.db" --estimate-cause

rm "${all_rasta_apk}"
|
6
rasta_data_manipulation/means_size.sql
Normal file
6
rasta_data_manipulation/means_size.sql
Normal file
|
@ -0,0 +1,6 @@
|
|||
-- Average dex (bytecode) size and apk size over the apk table:
-- overall, then restricted by VirusTotal detection count
-- (vt_detection = 0: undetected; != 0: flagged by at least one engine).
SELECT AVG(dex_size) FROM apk;
SELECT AVG(dex_size) FROM apk WHERE vt_detection = 0;
SELECT AVG(dex_size) FROM apk WHERE vt_detection != 0;
SELECT AVG(apk_size) FROM apk;
SELECT AVG(apk_size) FROM apk WHERE vt_detection = 0;
SELECT AVG(apk_size) FROM apk WHERE vt_detection != 0;
|
6
rasta_data_manipulation/means_success_by_year.sql
Normal file
6
rasta_data_manipulation/means_success_by_year.sql
Normal file
|
@ -0,0 +1,6 @@
|
|||
-- Percentage of tool executions that ended FINISHED or UNKNOWN, grouped by
-- the apk's first_seen_year.
-- NOTE(review): the hard-coded 20 presumably is the number of tools run per
-- apk (so the denominator is total executions for that year) -- confirm
-- against the exec table contents.
SELECT apk1.first_seen_year, (COUNT(*) * 100) / (SELECT 20 * COUNT(*)
FROM apk AS apk2 WHERE apk2.first_seen_year = apk1.first_seen_year
)
FROM exec JOIN apk AS apk1 ON exec.sha256 = apk1.sha256
WHERE exec.tool_status = 'FINISHED' OR exec.tool_status = 'UNKNOWN'
GROUP BY apk1.first_seen_year ORDER BY apk1.first_seen_year;
|
2
rasta_data_manipulation/mypy.ini
Normal file
2
rasta_data_manipulation/mypy.ini
Normal file
|
@ -0,0 +1,2 @@
|
|||
[mypy]
|
||||
python_executable = .venv/bin/python
|
1419
rasta_data_manipulation/poetry.lock
generated
Normal file
1419
rasta_data_manipulation/poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
64
rasta_data_manipulation/pyproject.toml
Normal file
64
rasta_data_manipulation/pyproject.toml
Normal file
|
@ -0,0 +1,64 @@
|
|||
[tool.poetry]
|
||||
name = "rasta_triturage"
|
||||
version = "0.2.0"
|
||||
description = "'Triturage de donnée' for the Rasta Project"
|
||||
authors = ["anon"]
|
||||
readme = "README.md"
|
||||
#homepage = ""
|
||||
#repository = ""
|
||||
license = "Proprietary"
|
||||
|
||||
[tool.poetry.urls]
|
||||
#"Bug Tracker" = ""
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
matplotlib = "^3.7.1"
|
||||
pyqt5 = "^5.15.9"
|
||||
numpy = "^1.24.3"
|
||||
|
||||
seaborn = "^0.12.2"
|
||||
python-slugify = "^8.0.1"
|
||||
androguard = "^3.3.5"
|
||||
requests = "^2.31.0"
|
||||
matplotlib-venn = "^0.11.9"
|
||||
python-dateutil = "^2.8.2"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
rasta-triturage = "rasta_triturage.cli:main"
|
||||
rasta-status = "rasta_triturage.cli:show_status_by_tool"
|
||||
rasta-collect-apk-info = "rasta_triturage.cli:get_apk_info"
|
||||
rasta-success-target-sdk = "rasta_triturage.cli:show_success_rate_by_target_sdk"
|
||||
rasta-success-min-sdk = "rasta_triturage.cli:show_success_rate_by_min_sdk"
|
||||
rasta-success-year = "rasta_triturage.cli:show_success_rate_by_first_seen_year"
|
||||
rasta-success-size = "rasta_triturage.cli:show_success_rate_by_dex_size"
|
||||
rasta-success-apk-size = "rasta_triturage.cli:show_success_rate_by_size_decile"
|
||||
rasta-timeout-target-sdk = "rasta_triturage.cli:show_timeout_rate_by_target_sdk"
|
||||
rasta-timeout-min-sdk = "rasta_triturage.cli:show_timeout_rate_by_min_sdk"
|
||||
rasta-timeout-year = "rasta_triturage.cli:show_timeout_rate_by_estimated_year"
|
||||
rasta-populate-db-apk = "rasta_triturage.cli:populate_db_apk"
|
||||
rasta-populate-db-report = "rasta_triturage.cli:populate_db_exec"
|
||||
rasta-populate-db-tool = "rasta_triturage.cli:populate_db_tool"
|
||||
rasta-common-errors = "rasta_triturage.cli:show_common_errors"
|
||||
rasta-avg-nb-errors = "rasta_triturage.cli:average_nb_errors"
|
||||
rasta-error-causes-radar = "rasta_triturage.cli:show_error_cause_radar"
|
||||
rasta-error-repartition = "rasta_triturage.cli:show_error_type_repartition"
|
||||
rasta-avg-occ-by-exec = "rasta_triturage.cli:show_error_avg_occ_by_exec"
|
||||
rasta-ic3-analysis = "rasta_triturage.cli:ic3"
|
||||
rasta-avg-ressource = "rasta_triturage.cli:get_avg_ressource_consumption"
|
||||
rasta-decorelate-factor = "rasta_triturage.cli:plot_decorelated_factor"
|
||||
rasta-count-error-stacks = "rasta_triturage.cli:count_error_stacks"
|
||||
rasta-gen-dataset = "rasta_triturage.cli:generate_dataset"
|
||||
rasta-size-malware = "rasta_triturage.cli:size_malware"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "*"
|
||||
pytest-cov = "*"
|
||||
types-requests = "^2.31.0.0"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
addopts = "--cov"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
3
rasta_data_manipulation/rasta_triturage/__init__.py
Normal file
3
rasta_data_manipulation/rasta_triturage/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
# Package metadata for rasta_triturage; the version mirrors pyproject.toml.
__author__ = "annon"
__email__ = "annon"
__version__ = "0.2.0"
|
115
rasta_data_manipulation/rasta_triturage/apk.py
Normal file
115
rasta_data_manipulation/rasta_triturage/apk.py
Normal file
|
@ -0,0 +1,115 @@
|
|||
"""
|
||||
Collect data about apks.
|
||||
"""
|
||||
|
||||
import dateutil.parser as dp # type: ignore
|
||||
import datetime
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt # type: ignore
|
||||
|
||||
from typing import Any, IO, Callable
|
||||
from pathlib import Path
|
||||
|
||||
from .utils import render
|
||||
|
||||
|
||||
def plot_apk_info_by_generic_x(
|
||||
data: list[Any],
|
||||
x: str,
|
||||
title: str,
|
||||
extract_propertie: Callable,
|
||||
y_label: str,
|
||||
x_label: str | None = None,
|
||||
reductions: dict[str, Callable] | None = None,
|
||||
xscale: str = "linear",
|
||||
interactive: bool = True,
|
||||
image_path: Path | None = None,
|
||||
):
|
||||
"""`extract_propertie` is a founction that take a list of element and return
|
||||
a value representing the value of the list, like a median or a mean.
|
||||
"""
|
||||
raise NotImplementedError("TODO: update function to use sqlite3")
|
||||
|
||||
|
||||
# groupped = group_by(x, data, reductions=reductions)
|
||||
# properties = {k: extract_propertie(v) for k, v in groupped.items()}
|
||||
# if x_label is None:
|
||||
# x_label = x
|
||||
# x_values = list(set(filter(lambda x: x is not None, properties.keys())))
|
||||
# x_values.sort()
|
||||
# y_values = [properties[x] for x in x_values]
|
||||
#
|
||||
# plt.figure(figsize=(16, 9), dpi=80)
|
||||
# plt.plot(x_values, y_values)
|
||||
# plt.xscale(xscale)
|
||||
# # plt.ylim([-5, 105])
|
||||
# # plt.legend()
|
||||
# plt.xlabel(x_label)
|
||||
# plt.ylabel(y_label)
|
||||
# render(title, interactive, image_path)
|
||||
#
|
||||
|
||||
|
||||
def plot_apk_size(
    apk_data: list[Any],
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Bar-plot the total bytecode size (MiB) of every apk, sorted ascending.

    Dotted horizontal lines mark 4**7 .. 4**12 bytes (converted to MiB) as
    visual size references.
    """
    mib = 1024 * 1024
    dex_sizes = np.array([entry["total_dex_size"] for entry in apk_data]) / mib
    dex_sizes.sort()
    plt.figure(figsize=(16, 9), dpi=80)
    plt.bar(np.arange(len(dex_sizes)), dex_sizes)
    plt.ylabel("Bytecode size (MiB)")
    # Individual apks are anonymous on this plot: hide the x ticks entirely.
    plt.tick_params(
        axis="x",
        which="both",
        bottom=False,
        top=False,
        labelbottom=False,
    )
    for exponent in range(7, 13):
        plt.axhline(y=(4**exponent) / mib, color="r", linestyle=":")
    render("Bytecode size of the apks", interactive, image_path)
|
||||
|
||||
|
||||
def plot_apk_size_hl_subset(
    apk_data: list[Any],
    subset_sha: list[str],
    title: str,
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Bar-plot the bytecode size of every apk, highlighting a subset.

    Apks whose sha256 is in `subset_sha` are drawn as a second hatched
    orange series. The two series are mutually exclusive (for every apk one
    of the two bars is 0), so the result reads as one bar chart with some
    bars highlighted. Note: sorts `apk_data` in place.
    """
    mib = 1024 * 1024
    apk_data.sort(key=lambda entry: entry["total_dex_size"])
    highlighted_flags = [entry["sha256"] in subset_sha for entry in apk_data]
    base_sizes = (
        np.array(
            [
                0 if flag else entry["total_dex_size"]
                for flag, entry in zip(highlighted_flags, apk_data)
            ]
        )
        / mib
    )
    highlighted_sizes = (
        np.array(
            [
                entry["total_dex_size"] if flag else 0
                for flag, entry in zip(highlighted_flags, apk_data)
            ]
        )
        / mib
    )
    positions = np.arange(len(base_sizes))
    plt.figure(figsize=(16, 9), dpi=80)
    plt.bar(positions, base_sizes, edgecolor="black")
    plt.bar(
        positions, highlighted_sizes, color="#D55E00", hatch="x", edgecolor="black"
    )
    plt.ylabel("Bytecode size (MiB)")
    # Hide the meaningless per-apk x ticks.
    plt.tick_params(
        axis="x",
        which="both",
        bottom=False,
        top=False,
        labelbottom=False,
    )
    for exponent in range(7, 13):
        plt.axhline(y=(4**exponent) / mib, color="r", linestyle=":")
    render(title, interactive, image_path)
|
1129
rasta_data_manipulation/rasta_triturage/cli.py
Normal file
1129
rasta_data_manipulation/rasta_triturage/cli.py
Normal file
File diff suppressed because it is too large
Load diff
5124
rasta_data_manipulation/rasta_triturage/data_set.py
Normal file
5124
rasta_data_manipulation/rasta_triturage/data_set.py
Normal file
File diff suppressed because it is too large
Load diff
199
rasta_data_manipulation/rasta_triturage/ic3.py
Normal file
199
rasta_data_manipulation/rasta_triturage/ic3.py
Normal file
|
@ -0,0 +1,199 @@
|
|||
import sqlite3
|
||||
import csv
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, Any
|
||||
from matplotlib_venn import venn2 # type: ignore
|
||||
from .utils import render
|
||||
|
||||
# Columns of the `error` table that characterise an error; they are
# concatenated (NULL-safe, '|'-separated) into one string used for grouping
# and reporting.
ERROR_CARACT = (
    "error_type",
    "error",
    "msg",
    "file",
    "function",
    "level",
    "origin",
    "raised_info",
    "called_info",
)
# SQL expression producing the '|'-joined characterisation string.
ERROR_MSG = " || '|' || ".join(f"COALESCE({column}, '')" for column in ERROR_CARACT)
|
||||
|
||||
|
||||
def ic3_venn(db: Path, interactive: bool = True, image_path: Path | None = None):
    """Draw a Venn diagram of the apks that ic3 and/or its fork failed on.

    Args:
        db: path to the sqlite result database (reads the `exec` table).
        interactive: display the figure interactively when True.
        image_path: optional path where the figure is saved.
    """
    values = {
        ("FAILED", "NOT_FAILED"): 0,
        ("FAILED", "FAILED"): 0,
        ("NOT_FAILED", "FAILED"): 0,
    }
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # BUG FIX: SQLite rejects 'OUTER LEFT JOIN' as an unknown join type;
        # the valid spelling is 'LEFT OUTER JOIN'.
        for ic3_s, ic3_fork_s, n in cur.execute(
            "SELECT ex1.tool_status, ex2.tool_status, COUNT(*) "
            "FROM exec AS ex1 LEFT OUTER JOIN exec AS ex2 ON ex1.sha256 = ex2.sha256 "
            "WHERE ex1.tool_name = 'ic3' AND ex2.tool_name = 'ic3_fork' "
            "GROUP BY ex1.tool_status, ex2.tool_status"
        ):
            # Fold the per-status counts into the three Venn regions.
            if ic3_s == "FAILED" and ic3_fork_s == "FAILED":
                values[("FAILED", "FAILED")] += n
            elif ic3_s == "FAILED":
                values[("FAILED", "NOT_FAILED")] += n
            elif ic3_fork_s == "FAILED":
                values[("NOT_FAILED", "FAILED")] += n
    venn2(
        subsets=(
            values[("FAILED", "NOT_FAILED")],
            values[("NOT_FAILED", "FAILED")],
            values[("FAILED", "FAILED")],
        ),
        set_labels=("IC3 failed", "IC3 fork failed"),
    )
    render(
        "Number of application that IC3 \nand its fork failed to analyse",
        interactive,
        image_path,
    )
|
||||
|
||||
|
||||
def ic3_errors(db: Path, file: Path | None = None):
    """Export a CSV of the most frequent errors for ic3 vs its fork.

    For each combination of (whose errors are inspected: ic3_fork, then ic3)
    x (failure pattern: only ic3 failed, only the fork failed, both failed),
    the 10 most frequent error characterisations are collected and written
    to `file` (stdout when `file` is None).

    BUG FIXES over the original:
    - 'OUTER LEFT JOIN' is not valid SQLite syntax; replaced with
      'LEFT OUTER JOIN' (all six queries);
    - one query was missing a space between the GROUP BY list and
      'ORDER BY', producing invalid SQL -- the queries are now built by a
      single helper so they cannot drift apart;
    - the CSV columns 'msg' and 'occurence' were swapped with respect to the
      header row;
    - the output file is closed even if writing raises.
    """

    def _top_errors_query(error_tool: str, ic3_cmp: str, fork_cmp: str) -> str:
        # Build one "top 10 errors" query. `error_tool` selects whose errors
        # are listed ('ic3' or 'ic3_fork'); `ic3_cmp`/`fork_cmp` ('=' or
        # '!=') select which of the two tools failed.
        return (
            "SELECT ex1.tool_status = 'FAILED', ex2.tool_status = 'FAILED', "
            " error.tool_name, error.error, COUNT(DISTINCT error.sha256) AS cnt, "
            f" {ERROR_MSG} "
            "FROM exec AS ex1 "
            " LEFT OUTER JOIN exec AS ex2 ON ex1.sha256 = ex2.sha256 "
            f" INNER JOIN error ON ex1.sha256 = error.sha256 AND error.tool_name = '{error_tool}' "
            "WHERE ex1.tool_name = 'ic3' AND ex2.tool_name = 'ic3_fork' AND "
            f" ex1.tool_status {ic3_cmp} 'FAILED' AND ex2.tool_status {fork_cmp} 'FAILED' "
            "GROUP BY ex1.tool_status = 'FAILED', ex2.tool_status = 'FAILED', "
            f" error.tool_name, error.error, {ERROR_MSG} "
            "ORDER BY cnt DESC "
            "LIMIT 10;"
        )

    errors = []
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Same ordering as the original: fork errors first, then ic3 errors;
        # within each: only ic3 failed, only the fork failed, both failed.
        for error_tool in ("ic3_fork", "ic3"):
            for ic3_cmp, fork_cmp in (("=", "!="), ("!=", "="), ("=", "=")):
                errors.extend(
                    cur.execute(_top_errors_query(error_tool, ic3_cmp, fork_cmp))
                )

    fieldnames = [
        "ic3 failed",
        "ic3 fork failed",
        "tool",
        "error",
        "occurence",
        "msg",
    ]
    fp = sys.stdout if file is None else file.open("w")
    try:
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        for err in map(rewrite_msg, errors):
            # rewrite_msg returns the row fields in `fieldnames` order.
            writer.writerow(dict(zip(fieldnames, err)))
    finally:
        if file is not None:
            fp.close()
|
||||
|
||||
|
||||
def rewrite_msg(
    err: tuple[int, int, str, str, int, str]
) -> tuple[int, int, str, str, int, str]:
    """Expand the packed '|'-separated error characterisation of a row.

    `err` is (ic3_failed, ic3_fork_failed, tool, error, occurence, msg),
    where `msg` is the ERROR_MSG concatenation of the ERROR_CARACT columns.
    Returns the same tuple with `error` and `msg` replaced by the values
    re-extracted from the packed string, ordered for readability.
    """
    ic3_failed, ic3_fork_failed, tool, error, occurence, msg = err
    # Re-split the packed characterisation; every non-empty field gets a
    # trailing space so the concatenation below stays readable.
    (
        error_type,
        error,
        msg,
        file,
        function,
        level,
        origin,
        raised_info,
        called_info,
    ) = ("" if field == "" else field + " " for field in msg.split("|"))
    # BUG FIX: the original inserted `called_info` twice and dropped
    # `raised_info` entirely.
    msg = f"{level}{error}{msg}{raised_info}{called_info}{file}{function}{origin}"
    return (ic3_failed, ic3_fork_failed, tool, error, occurence, msg)
|
246
rasta_data_manipulation/rasta_triturage/populate_db_apk.py
Normal file
246
rasta_data_manipulation/rasta_triturage/populate_db_apk.py
Normal file
|
@ -0,0 +1,246 @@
|
|||
import sqlite3
|
||||
import time
|
||||
import gzip
|
||||
import csv
|
||||
import datetime
|
||||
import requests
|
||||
import getpass
|
||||
import dateutil.parser
|
||||
|
||||
from androguard.core.bytecodes import apk as androguard_apk
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def int_or_none(str_: str) -> int | None:
|
||||
if str_:
|
||||
return int(str_)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def create_apk_table(db: Path):
    """Create the apk table (and the db file) if it does not exist yet.

    Uses CREATE TABLE IF NOT EXISTS -- consistent with create_tables() in
    populate_db_exec.py -- instead of probing sqlite_master manually.
    """
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        cur.execute(
            (
                "CREATE TABLE IF NOT EXISTS apk("
                " sha256, first_seen_year, apk_size,"
                " vt_detection, min_sdk, max_sdk,"
                " target_sdk, apk_size_decile, dex_date date,"
                " pkg_name, vercode, vt_scan_date date,"
                " dex_size, added date, markets, dex_size_decile, "
                " dex_size_decile_by_year"
                ")"
            )
        )
        con.commit()
|
||||
|
||||
|
||||
def get_sha_set(dataset: Path) -> set[str]:
    """Read a set of sha256 hashes from a file (one hash per line).

    Blank lines are skipped: the original implementation added an empty
    string to the set whenever the file contained a blank line.
    """
    with dataset.open() as f:
        return {line.strip() for line in f if line.strip()}
|
||||
|
||||
|
||||
def populate_from_year_and_sdk(db: Path, year_and_sdk: Path, apks: set[str]):
    """Insert the info from year_and_sdk.csv.gz into the apk table.

    Only the hashes in `apks` are considered. Hashes of `apks` that are
    absent from the CSV are still inserted, with NULL metadata, so that
    every apk of the dataset gets a row.

    Improvements over the original: one database connection is reused for
    all inserts (instead of reconnecting for every row), the INSERT
    statement is defined once, and a duplicated sha256 in the CSV no longer
    raises KeyError (set.discard instead of set.remove).
    """
    insert_stmt = (
        "INSERT INTO apk ("
        " sha256, first_seen_year, vt_detection,"
        " min_sdk, max_sdk, target_sdk, apk_size_decile,"
        " dex_size_decile"
        ") VALUES("
        " :sha256, :first_seen_year, :vt_detection,"
        " :min_sdk, :max_sdk, :target_sdk, :apk_size_decile,"
        " :dex_size_decile"
        ");"
    )
    apks_not_found = apks.copy()
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        with gzip.open(year_and_sdk, "rt", newline="") as f:
            reader = csv.DictReader(f, quotechar='"')
            assert reader.fieldnames is not None
            for row in reader:
                if row["sha256"] not in apks:
                    continue
                cur.execute(
                    insert_stmt,
                    {
                        "sha256": row["sha256"],
                        "first_seen_year": int_or_none(row["first_seen_year"]),
                        "vt_detection": int_or_none(row["vt_detection"]),
                        "min_sdk": int_or_none(row["min_sdk"]),
                        "max_sdk": int_or_none(row["max_sdk"]),
                        "target_sdk": int_or_none(row["target_sdk"]),
                        "apk_size_decile": 0,  # Computed at dataset generation
                        "dex_size_decile": 0,  # Computed by compute_dex_decile
                    },
                )
                apks_not_found.discard(row["sha256"])
        # Apks unknown to the CSV: insert a row with NULL metadata so later
        # UPDATEs still find them.
        for sha256 in apks_not_found:
            cur.execute(
                insert_stmt,
                {
                    "sha256": sha256,
                    "first_seen_year": None,
                    "vt_detection": None,
                    "min_sdk": None,
                    "max_sdk": None,
                    "target_sdk": None,
                    "apk_size_decile": 0,
                    "dex_size_decile": 0,  # Computed by compute_dex_decile
                },
            )
        con.commit()
|
||||
|
||||
|
||||
def populate_from_latest_with_added_date(
    db: Path, latest_with_added_date: Path, apks: set[str]
):
    """Update the apk rows with the info from latest_with-added-date.csv.gz.

    Only the hashes in `apks` are updated. Improvement over the original:
    one database connection is reused for all updates instead of
    reconnecting per CSV row, and the optional ISO-date parsing is factored
    into a helper.
    """

    def _iso_or_none(text: str) -> datetime.datetime | None:
        # Empty CSV cells mean "unknown".
        return datetime.datetime.fromisoformat(text) if text else None

    with sqlite3.connect(db) as con:
        cur = con.cursor()
        with gzip.open(latest_with_added_date, "rt", newline="") as f:
            reader = csv.DictReader(f, quotechar='"')
            assert reader.fieldnames is not None
            for row in reader:
                if row["sha256"] not in apks:
                    continue
                value = {
                    "sha256": row["sha256"],
                    "apk_size": int_or_none(row["apk_size"]),
                    "dex_date": _iso_or_none(row["dex_date"]),
                    "pkg_name": row["pkg_name"],
                    "vercode": int_or_none(row["vercode"]),
                    "vt_scan_date": _iso_or_none(row["vt_scan_date"]),
                    # Not necessarily the right value if multiple dex files
                    # are used, see fix_dex_size().
                    "dex_size": int_or_none(row["dex_size"]),
                    # 'added' is parsed with dateutil, as in the original --
                    # presumably because it may carry a timezone suffix that
                    # fromisoformat does not accept on older Pythons.
                    "added": dateutil.parser.isoparse(row["added"])
                    if row["added"]
                    else None,
                    "markets": row["markets"],
                }
                cur.execute(
                    "UPDATE apk "
                    "SET apk_size = :apk_size,"
                    " dex_date = :dex_date,"
                    " pkg_name = :pkg_name,"
                    " vercode = :vercode,"
                    " vt_scan_date = :vt_scan_date,"
                    " dex_size = :dex_size,"
                    " added = :added,"
                    " markets = :markets "
                    "WHERE"
                    " sha256 = :sha256;",
                    value,
                )
        con.commit()
|
||||
|
||||
|
||||
def download_apk(sha256: str, api_key: bytes) -> bytes:
    """Download an apk from androzoo, retrying until the download succeeds.

    Best-effort endless retry: any non-200 response is printed and the
    request is re-issued after a one second pause.
    """
    params = {
        b"apikey": api_key,
        b"sha256": sha256.encode("utf-8"),
    }
    while True:
        response = requests.get("https://androzoo.uni.lu/api/download", params=params)
        if response.status_code == 200:
            return response.content
        print(response)
        print(response.content)
        time.sleep(1)
|
||||
|
||||
|
||||
def fix_dex_size(db: Path, apks: set[str], androzoo_key: bytes):
    """Recompute the dex_size column from the actual apk contents.

    Downloads every apk in `apks` from androzoo, sums the sizes of all of
    its .dex files, and stores the total in the database (the CSV value is
    not necessarily right when an apk contains several dex files).
    """
    for sha256 in apks:
        raw_apk = download_apk(sha256, androzoo_key)
        parsed = androguard_apk.APK(raw_apk, raw=True, skip_analysis=True)
        total_dex_size = sum(len(dex) for dex in parsed.get_all_dex())
        with sqlite3.connect(db) as con:
            cursor = con.cursor()
            cursor.execute(
                "UPDATE apk SET dex_size = ? WHERE sha256 = ?;",
                (total_dex_size, sha256),
            )
            con.commit()
|
||||
|
||||
|
||||
def populate_db_apk(
    db: Path,
    dataset: Path,
    year_and_sdk: Path,
    latest_with_added_date: Path,
    fix_dsize: bool,
):
    """Populate the database with the apk informations.

    Reads the dataset sha256 list, fills the apk table from the two androzoo
    CSV dumps, optionally recomputes dex_size from the downloaded apks
    (`fix_dsize`, needs an androzoo api key asked interactively up front),
    then computes the dex_size deciles, global and per first_seen_year.
    """
    # Ask for the key before the long-running steps so the run doesn't stall
    # midway waiting for input.
    if fix_dsize:
        androzoo_key = (
            getpass.getpass(prompt="androzoo apikey: ").strip().encode("utf-8")
        )
    create_apk_table(db)
    apks = get_sha_set(dataset)
    populate_from_year_and_sdk(db, year_and_sdk, apks)
    populate_from_latest_with_added_date(db, latest_with_added_date, apks)
    if fix_dsize:
        fix_dex_size(db, apks, androzoo_key)
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Global decile of dex_size.
        cur.execute(
            "UPDATE apk "
            "SET dex_size_decile = compute.decile "
            "FROM ("
            " SELECT NTILE ( 10 ) OVER ( ORDER BY dex_size ) decile, sha256 FROM apk"
            ") AS compute "
            "WHERE apk.sha256 = compute.sha256;"
        )
        # Decile of dex_size within each first_seen_year.
        cur.execute(
            "UPDATE apk "
            "SET dex_size_decile_by_year = compute.decile "
            "FROM ("
            " SELECT NTILE ( 10 ) "
            " OVER ( PARTITION BY first_seen_year ORDER BY dex_size ) decile, sha256 "
            " FROM apk"
            ") AS compute "
            "WHERE apk.sha256 = compute.sha256;"
        )
        con.commit()
|
186
rasta_data_manipulation/rasta_triturage/populate_db_exec.py
Normal file
186
rasta_data_manipulation/rasta_triturage/populate_db_exec.py
Normal file
|
@ -0,0 +1,186 @@
|
|||
import sqlite3
|
||||
import json
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .query_error import estimate_cause
|
||||
|
||||
|
||||
def create_tables(db: Path):
    """Create the exec and error tables (and the db file) if missing."""
    exec_ddl = (
        "CREATE TABLE IF NOT EXISTS exec ("
        " sha256, id, rev, time, kernel_cpu_time, user_cpu_time, "
        " max_rss_mem, avg_rss_mem, avg_total_mem, page_size, "
        " nb_major_page_fault, nb_minor_page_fault, nb_fs_input, "
        " nb_fs_output, nb_socket_msg_received, nb_socket_msg_sent, "
        " nb_signal_delivered, exit_status, timeout, "
        " tool_status, tool_name, date date"
        ");"
    )
    error_ddl = (
        "CREATE TABLE IF NOT EXISTS error ("
        " tool_name, sha256, error_type, error, msg, "
        " first_line, last_line, logfile_name, file, "
        " line, function, level, origin, raised_info, "
        " called_info, cause"
        ");"
    )
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for ddl in (exec_ddl, error_ddl):
            cur.execute(ddl)
        con.commit()
|
||||
|
||||
|
||||
def insert_errors(cur, tool, sha256, errors):
    """Normalize the error dicts of one execution and insert them in ``error``.

    Missing fields are set to None, the raised/called locations are rendered
    as human-readable strings, and the bulky stack trace is dropped.
    """
    optional_fields = (
        "error_type",
        "error",
        "msg",
        "first_line",
        "last_line",
        "logfile_name",
        "file",
        "line",
        "function",
        "level",
        "origin",
    )
    for entry in errors:
        entry["tool_name"] = tool
        entry["sha256"] = sha256
        for field in optional_fields:
            entry.setdefault(field, None)
        entry.setdefault("raised_info", None)
        raised = entry["raised_info"]
        if raised is not None:
            entry["raised_info"] = 'Raised at {} in file "{}", line {}'.format(
                raised["function"],
                raised["file"],
                raised["line"],
            )
        entry.setdefault("called_info", None)
        called = entry["called_info"]
        if called is not None:
            entry["called_info"] = 'Called from {} in file "{}", line {}'.format(
                called["function"],
                called["file"],
                called["line"],
            )
        # The stack trace can be quite big without being very useful in
        # queries.
        entry.pop("stack", None)
    cur.executemany(
        (
            "INSERT INTO error VALUES("
            " :tool_name, :sha256, :error_type, :error, :msg, "
            " :first_line, :last_line, :logfile_name, :file, "
            " :line, :function, :level, :origin, :raised_info, "
            " :called_info, ''"
            ");"
        ),
        errors,
    )
|
||||
|
||||
|
||||
def fix_error(db: Path, report_with_correct_error: Path):
    """Replace wrongly-parsed errors with the ones from a corrective re-run.

    Unfortunately there were some errors when parsing the errors during the
    experiment, so another run was made for some tool-apk pairs to get the
    actual error.  That pass was made in a different environment (different
    memory and space constraints), so we only replace the errors (after
    manual inspection they don't seem related to the environment) and keep
    the other values from the original experiment.

    :param db: path to the sqlite database to fix.
    :param report_with_correct_error: folder of JSON execution reports
        produced by the corrective run.
    """
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for path in report_with_correct_error.iterdir():
            with path.open() as f:
                exec_log = json.load(f)
            sha256 = exec_log["apk"].removesuffix(".apk")
            # Only fix tool-apk pairs that have exactly one recorded
            # execution: anything else is ambiguous and left untouched.
            if (
                len(
                    cur.execute(
                        "SELECT * FROM exec WHERE tool_name = ? AND sha256 = ?",
                        (exec_log["tool-name"], sha256),
                    ).fetchall()
                )
                == 1
            ):
                # Drop the mis-parsed errors before inserting the fixed ones.
                cur.execute(
                    "DELETE FROM error WHERE tool_name = ? AND sha256 = ?",
                    (exec_log["tool-name"], sha256),
                )
                errors = exec_log.pop("errors", [])
                insert_errors(cur, exec_log["tool-name"], sha256, errors)
        con.commit()
|
||||
|
||||
|
||||
def populate_execution_report(db: Path, report_folder: Path):
    """Add to the database the execution reports stored in report_folder.

    Each file in *report_folder* must be a JSON execution log.  The log is
    normalized (keys renamed to valid SQL identifiers, missing columns set
    to NULL, ISO date parsed) then inserted into the ``exec`` table; its
    errors are inserted into the ``error`` table.

    :param db: path of the sqlite database (tables created if needed).
    :param report_folder: folder containing one JSON report per execution.
    """
    # Columns of the `exec` table, in insertion order.
    expected_columns = (
        "sha256",
        "id",
        "rev",
        "time",
        "kernel_cpu_time",
        "user_cpu_time",
        "max_rss_mem",
        "avg_rss_mem",
        "avg_total_mem",
        "page_size",
        "nb_major_page_fault",
        "nb_minor_page_fault",
        "nb_fs_input",
        "nb_fs_output",
        "nb_socket_msg_received",
        "nb_socket_msg_sent",
        "nb_signal_delivered",
        "exit_status",
        "timeout",
        "tool_status",
        "tool_name",
        "date",
    )
    create_tables(db)
    i = 0
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for path in report_folder.iterdir():
            with path.open() as f:
                exec_log = json.load(f)
            exec_log["sha256"] = exec_log["apk"].removesuffix(".apk")
            exec_log["id"] = exec_log.get("_id", None)
            exec_log["rev"] = exec_log.get("_rev", None)
            errors = exec_log.pop("errors", [])

            exec_log["date"] = (
                datetime.datetime.fromisoformat(exec_log["date"])
                if exec_log.get("date", None)
                else None
            )
            del exec_log["apk"]
            exec_log.pop("_id", None)
            exec_log.pop("_rev", None)
            # JSON keys use '-', SQL columns use '_'.
            new_exec_log = {
                key.replace("-", "_"): value for key, value in exec_log.items()
            }
            for column in expected_columns:
                new_exec_log.setdefault(column, None)
            cur.execute(
                (
                    "INSERT INTO exec VALUES("
                    " :sha256, :id, :rev, :time, :kernel_cpu_time, :user_cpu_time, "
                    " :max_rss_mem, :avg_rss_mem, :avg_total_mem, :page_size, "
                    " :nb_major_page_fault, :nb_minor_page_fault, :nb_fs_input, "
                    " :nb_fs_output, :nb_socket_msg_received, :nb_socket_msg_sent, "
                    " :nb_signal_delivered, :exit_status, :timeout, "
                    " :tool_status, :tool_name, :date"
                    ");"
                ),
                new_exec_log,
            )
            insert_errors(cur, exec_log["tool-name"], exec_log["sha256"], errors)
            i += 1
            # BUG FIX: `if i == 10_000` committed only once, exactly at the
            # 10 000th report.  Commit every 10 000 reports instead, to
            # bound the memory used by the open transaction.
            if i % 10_000 == 0:
                con.commit()
        con.commit()
|
176
rasta_data_manipulation/rasta_triturage/populate_db_tool.py
Normal file
176
rasta_data_manipulation/rasta_triturage/populate_db_tool.py
Normal file
|
@ -0,0 +1,176 @@
|
|||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Static description of each analyzed tool: the languages it is written in
# and the main frameworks (soot, androguard, apktool, ...) it relies on.
TOOL_INFO = [
    {"tool_name": "adagio", "use_python": True, "use_androguard": True},
    {"tool_name": "amandroid", "use_scala": True, "use_soot": False, "use_apktool": True},
    {
        "tool_name": "anadroid",
        "use_python": True,
        "use_java": True,
        "use_scala": True,
        "use_soot": False,
        "use_apktool": True,
    },
    {"tool_name": "androguard", "use_python": True, "use_androguard": True},  # Duh
    {"tool_name": "androguard_dad", "use_python": True, "use_androguard": True},
    {"tool_name": "apparecium", "use_python": True, "use_androguard": True},
    {"tool_name": "blueseal", "use_java": True, "use_soot": True, "use_apktool": True},
    {"tool_name": "dialdroid", "use_java": True, "use_soot": True},
    {"tool_name": "didfail", "use_python": True, "use_java": True, "use_soot": True},
    {
        "tool_name": "droidsafe",
        "use_python": True,
        "use_java": True,
        "use_soot": True,
        "use_apktool": True,
    },
    {"tool_name": "flowdroid", "use_java": True, "use_soot": True},
    {
        "tool_name": "gator",
        "use_python": True,
        "use_java": True,
        "use_soot": True,
        "use_apktool": True,
    },
    {"tool_name": "ic3", "use_java": True, "use_soot": True},
    {"tool_name": "ic3_fork", "use_java": True, "use_soot": True},
    {"tool_name": "iccta", "use_java": True, "use_soot": True, "use_apktool": True},
    {"tool_name": "mallodroid", "use_python": True, "use_androguard": True},
    {"tool_name": "perfchecker", "use_java": True, "use_soot": True},
    {"tool_name": "redexer", "use_ocaml": True, "use_ruby": True, "use_apktool": True},
    {"tool_name": "saaf", "use_java": True, "use_soot": False, "use_apktool": True},
    {
        "tool_name": "wognsen_et_al",
        "use_python": True,
        "use_prolog": True,
        "use_apktool": True,
    },
]

# Every feature column not set explicitly above defaults to False.
_FEATURE_COLUMNS = (
    "use_python",
    "use_java",
    "use_scala",
    "use_ocaml",
    "use_ruby",
    "use_prolog",
    "use_soot",
    "use_androguard",
    "use_apktool",
)
for _tool in TOOL_INFO:
    for _column in _FEATURE_COLUMNS:
        _tool.setdefault(_column, False)
|
||||
|
||||
|
||||
def create_tool_table(db: Path):
    """Create the ``tool`` table in *db* if it is not already present."""
    with sqlite3.connect(db) as con:
        cursor = con.cursor()
        already_there = cursor.execute(
            "SELECT name FROM sqlite_master WHERE name='tool';"
        ).fetchone()
        if already_there is not None:
            return
        cursor.execute(
            "CREATE TABLE tool ("
            " tool_name, use_python, use_java, use_scala,"
            " use_ocaml, use_ruby, use_prolog, use_soot, "
            " use_androguard, use_apktool"
            ");"
        )
        con.commit()
|
||||
|
||||
|
||||
def populate_tool(
    db: Path,
):
    """Insert the static TOOL_INFO rows into the ``tool`` table of *db*."""
    create_tool_table(db)
    # DROP table if already exist? replace value?
    insert_query = (
        "INSERT INTO tool VALUES("
        " :tool_name, :use_python, :use_java, :use_scala,"
        " :use_ocaml, :use_ruby, :use_prolog, :use_soot, "
        " :use_androguard, :use_apktool"
        ");"
    )
    with sqlite3.connect(db) as con:
        con.cursor().executemany(insert_query, TOOL_INFO)
        con.commit()
|
699
rasta_data_manipulation/rasta_triturage/query_error.py
Normal file
699
rasta_data_manipulation/rasta_triturage/query_error.py
Normal file
|
@ -0,0 +1,699 @@
|
|||
import sqlite3
|
||||
import sys
|
||||
import csv
|
||||
import matplotlib.pyplot as plt # type: ignore
|
||||
from .utils import get_list_tools, radar_chart, render
|
||||
from pathlib import Path
|
||||
from typing import Optional, Any
|
||||
|
||||
# Columns that, together, characterise one error occurrence.
ERROR_CARACT = (
    "error_type",
    "error",
    "msg",
    "file",
    "function",
    "level",
    "origin",
    "raised_info",
    "called_info",
)

# Subquery that removes identical errors occurring multiple times in the
# same execution (same tool / apk pair).
DISTINCT_ERRORS = (
    "("
    f" SELECT DISTINCT tool_name, sha256, {', '.join(ERROR_CARACT)}"
    " FROM error"
    ") AS distinct_error"
)
# Same idea, keeping only the error class (exception name and type).
# NOTE: the original literal had a stray f-prefix with no placeholder.
DISTINCT_ERROR_CLASS = (
    "("
    " SELECT DISTINCT tool_name, sha256, error, error_type"
    " FROM error"
    ") AS distinct_error"
)
# Distinct estimated causes per tool / apk pair.
DISTINCT_CAUSES = (
    "("
    " SELECT DISTINCT tool_name, sha256, cause"
    " FROM error"
    ") AS distinct_cause"
)
|
||||
|
||||
|
||||
def estimate_cause(db: Path):
    """Estimate the cause of an error, to ease grouping.

    Fills the ``cause`` column of the ``error`` table by matching the error
    name/message against known signatures (apktool, memory exhaustion,
    soot, ...).  Errors matching no known signature get the cause 'other'.

    :param db: path to the sqlite database holding the ``error`` table.
    """
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Reset every cause before re-classifying from scratch.
        cur.execute("UPDATE error SET cause = '';")
        con.commit()
        # brut.androlib is package defined in apktool
        # 'Expected: 0x001c0001, got: 0x00000000' errors are always
        # part of an apktool stacktrace:
        # SELECT COUNT(*) FROM error e1
        # WHERE e1.tool_name = '${tool}' AND
        #     e1.msg = 'Expected: 0x001c0001, got: 0x00000000' AND
        #     e1.sha256 NOT IN (
        #         SELECT e2.sha256 FROM error e2
        #         WHERE e2.tool_name = '${tool}' AND
        #             e2.msg LIKE '%Could not decode arsc file%'
        #     )
        # is always 0
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'apktool' "
                "WHERE error = 'brut.androlib.AndrolibException' OR "
                " error LIKE 'brut.androlib.err.%' OR "
                " msg = 'Expected: 0x001c0001, got: 0x00000000' OR "
                " msg LIKE '%brut.androlib.AndrolibException: Could not decode arsc file%' OR "
                " msg LIKE 'bad magic value: %' OR "
                " error = 'brut.androlib.err.UndefinedResObject';"
            )
        )
        # Out-of-memory / stack-exhaustion signatures.
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'memory' "
                "WHERE error = 'java.lang.StackOverflowError' OR "
                " error = 'java.lang.OutOfMemoryError' OR "
                " msg LIKE '%java.lang.OutOfMemoryError%' OR "
                " msg LIKE '%java.lang.StackOverflowError%' OR "
                " msg = 'Stack overflow';"
            )
        )
        # Soot-specific failures (classpath, call graph, IFDS, ...).
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'soot' "
                "WHERE msg LIKE ? OR "
                " msg LIKE '%No call graph present in Scene. Maybe you want Whole Program mode (-w)%' OR "
                " msg LIKE '%There were exceptions during IFDS analysis. Exiting.%' OR "  # More hero than soot?
                " msg = 'Could not find method' OR "
                " msg = 'No sources found, aborting analysis' OR "
                " msg = 'No sources or sinks found, aborting analysis' OR "
                " msg = 'Only phantom classes loaded, skipping analysis...';"
            ),
            (
                "%RefType java.lang.Object not loaded. If you tried to get the RefType of a library class, did you call loadNecessaryClasses()? Otherwise please check Soot's classpath.%",
            ),
        )

        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'index error' "
                "WHERE error = 'IndexError' OR "
                " msg = 'java.lang.ArrayIndexOutOfBoundsException' OR "
                " (error_type = 'Python' AND error = 'KeyError') OR "
                " error = 'java.lang.IndexOutOfBoundsException' OR "
                " error = 'java.lang.ArrayIndexOutOfBoundsException' OR "
                " msg LIKE 'java.lang.ArrayIndexOutOfBoundsException:%';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'arithmetique' "
                "WHERE error = 'java.lang.ArithmeticException';"
            )
        )
        cur.execute("UPDATE error SET cause = 'jasmin' WHERE error = 'jas.jasError';")
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'storage' "
                "WHERE msg = 'No space left on device' OR "
                " msg LIKE 'Error copying file: %' OR "
                " msg = 'java.io.IOException: No space left on device';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'redexe pattern maching failed' "
                "WHERE msg = 'File \"src/ext/logging.ml\", line 712, characters 12-17: Pattern matching failed';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'null pointer' "
                "WHERE error = 'java.lang.NullPointerException' OR "
                " msg LIKE ? OR "
                " msg LIKE 'undefined method % for nil:NilClass (NoMethodError)';"
            ),
            ("'NoneType' object has no attribute %",),
        )
        # Soot ?
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'unknown error in thread' "
                "WHERE msg = 'Worker thread execution failed: null';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'timeout' "
                "WHERE error = 'java.util.concurrent.TimeoutException';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'file name too long' "
                "WHERE msg = 'File name too long';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'encoding' "
                "WHERE error = 'UnicodeEncodeError';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'smali' "
                "WHERE error LIKE 'org.jf.dexlib2.%' OR error LIKE 'org.jf.util.%';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'redexer dex parser' "
                "WHERE msg LIKE 'Dex.Wrong_dex(\"%\")';"
            )
        )
        cur.execute(
            (
                "UPDATE error "
                "SET cause = 'bytecode not found' "
                "WHERE msg LIKE 'No method source set for method %' OR "
                " msg LIKE '% is an system library method.' OR "
                " msg LIKE '% is an unknown method.';"
            )
        )
        con.commit()
        # Default
        # default = " || '|' || ".join(map(lambda s: f"COALESCE({s}, '')", ERROR_CARACT))
        # cur.execute(f"UPDATE error SET cause = {default} WHERE cause = '';")
        # Anything still unclassified falls in the catch-all bucket.
        cur.execute("UPDATE error SET cause = 'other' WHERE cause = '';")
        con.commit()
|
||||
|
||||
|
||||
def radar_cause_estimation(
    db: Path,
    tools: list[str] | None,
    interactive: bool,
    folder: Path | None,
):
    """Print per-tool cause statistics and draw radar charts of the causes.

    Prints the 10 most frequent causes for each tool, then draws one radar
    chart per tool plus a combined chart for all tools.

    :param db: sqlite database whose ``error.cause`` column is filled.
    :param tools: tools to consider; when None, all tools found in the db.
    :param interactive: display the figures interactively.
    :param folder: when given, the folder where the figures are saved.
    """
    # estimate_cause(db)
    if tools is None:
        tools = get_list_tools(db)

    with sqlite3.connect(db, timeout=60) as con:
        cur = con.cursor()
        # All distinct causes define the radar chart axes.
        causes = [
            v for v, in cur.execute("SELECT DISTINCT cause FROM error;").fetchall()
        ]
        for tool in tools:
            print(f"tool: {tool}")
            for cause, count in cur.execute(
                (
                    "SELECT cause, COUNT(*) AS cnt "
                    "FROM error "
                    "WHERE tool_name = ? "
                    "GROUP BY cause "
                    "ORDER BY cnt DESC LIMIT 10;"
                ),
                (tool,),
            ):
                print(f"{count: 6}: {cause}")
            print()

    values = []
    labels = tools
    for tool in tools:
        # One value per cause axis, defaulting to 0 for unseen causes.
        vals = [0 for _ in causes]
        with sqlite3.connect(db) as con:
            cur = con.cursor()
            for cause, cnt in cur.execute(
                (
                    "SELECT distinct_cause.cause, COUNT(*) AS cnt "
                    f"FROM {DISTINCT_CAUSES} "
                    "WHERE distinct_cause.cause != '' AND distinct_cause.tool_name = ? "
                    "GROUP BY distinct_cause.cause;"
                ),
                (tool,),
            ):
                print(f"{tool=}, {cause=}, {cnt=}")
                if cause in causes:
                    vals[causes.index(cause)] = cnt
        print(f"{tool=}, {vals=}")
        radar_chart(
            causes, [vals], [tool], f"Causes of error for {tool}", interactive, folder
        )
        values.append(vals)
    # Combined chart: one polygon per tool over the same cause axes.
    radar_chart(causes, values, labels, "Causes of error", interactive, folder)
|
||||
|
||||
|
||||
def get_common_errors(
    db: Path,
    tool: Optional[str] = None,
    status: Optional[str] = None,
    use_androguard: Optional[bool] = None,
    use_java: Optional[bool] = None,
    use_prolog: Optional[bool] = None,
    use_ruby: Optional[bool] = None,
    use_soot: Optional[bool] = None,
    use_apktool: Optional[bool] = None,
    use_ocaml: Optional[bool] = None,
    use_python: Optional[bool] = None,
    use_scala: Optional[bool] = None,
    folder: Optional[Path] = None,
    limit: int = 10,
):
    """Get the most common errors.

    Writes a CSV (columns ``error``, ``msg``, ``count``) with the *limit*
    most frequent distinct errors, optionally filtered by tool, execution
    status and tool features.  Output goes to stdout when *folder* is None,
    otherwise to a file named after the active filters.
    """
    args: dict[str, Any] = {"limit": limit}
    clauses = []
    if tool is not None:
        clauses.append("(distinct_error.tool_name = :tool)")
        args["tool"] = tool
    if status is not None:
        clauses.append("(exec.tool_status = :tool_status)")
        args["tool_status"] = status

    # Feature filters: restrict to tools that do (or do not) use a given
    # language / framework.  BUG FIX: use_androguard was previously
    # accepted and shown in the output file name but never added to the
    # WHERE clause; it is now filtered like every other feature.
    feature_filters = {
        "use_androguard": use_androguard,
        "use_java": use_java,
        "use_prolog": use_prolog,
        "use_ruby": use_ruby,
        "use_soot": use_soot,
        "use_apktool": use_apktool,
        "use_ocaml": use_ocaml,
        "use_python": use_python,
        "use_scala": use_scala,
    }
    for feature, value in feature_filters.items():
        if value is not None:
            clauses.append(f"(tool.{feature} = :{feature})")
            args[feature] = value
    where_clause = ""
    if clauses:
        where_clause = f"WHERE {' AND '.join(clauses)}"

    if folder is None:
        out = sys.stdout
    else:
        # Build a file name that encodes the active filters.
        tool_str = "" if tool is None else f"_for_{tool}"
        status_str = "" if status is None else f"_when_{status}"
        if all(value is None for value in feature_filters.values()):
            features_str = ""
        else:
            features_str = "_using" + "".join(
                "_" + feature.removeprefix("use_")
                for feature, value in feature_filters.items()
                if value
            )
        name = f"{limit}_most_common_errors{tool_str}{status_str}{features_str}.csv"
        # make sure the folder exists
        folder.mkdir(parents=True, exist_ok=True)
        out = (folder / name).open("w")

    with sqlite3.connect(db) as con:
        cur = con.cursor()
        writer = csv.DictWriter(out, fieldnames=["error", "msg", "count"])
        writer.writeheader()
        for row in cur.execute(
            (
                f"SELECT COUNT(*) AS cnt, {', '.join(ERROR_CARACT)} "
                f"FROM {DISTINCT_ERRORS} "
                "INNER JOIN tool ON distinct_error.tool_name = tool.tool_name "
                "INNER JOIN exec ON "
                " distinct_error.tool_name = exec.tool_name AND "
                " distinct_error.sha256 = exec.sha256 "
                f"{where_clause}"
                f"GROUP BY {', '.join(ERROR_CARACT)} "
                "ORDER BY cnt DESC LIMIT :limit;"
            ),
            args,
        ):
            row_d = {k: v for (k, v) in zip(("cnt", *ERROR_CARACT), row)}
            writer.writerow(reduce_error_row(row_d))
    if folder is not None:
        out.close()
|
||||
|
||||
|
||||
def reduce_error_row(row: dict[str, Any]) -> dict[str, Any]:
    """Reduce an error from an sqlite row to a simpler row for csv output.

    The result has three keys: ``error`` (unchanged), ``msg`` (all the
    descriptive fields concatenated, each non-empty one followed by a
    space) and ``count`` (taken from ``row["cnt"]``).
    """

    def _pad(value: Optional[str]) -> str:
        # None / empty string contributes nothing; otherwise append a
        # separating space.
        return f"{value} " if value else ""

    level = _pad(row["level"])
    error = _pad(row["error"])
    msg = _pad(row["msg"])
    raised_info = _pad(row["raised_info"])
    called_info = _pad(row["called_info"])
    file = _pad(row["file"])
    function = _pad(row["function"])
    origin = _pad(row["origin"])

    new_row: dict[str, Any] = {"error": row["error"]}
    # BUG FIX: the original concatenated called_info twice and never used
    # raised_info (which it had nevertheless computed).
    new_row["msg"] = (
        f"{level}{error}{msg}{raised_info}{called_info}{file}{function}{origin}"
    )
    new_row["count"] = row["cnt"]
    return new_row
|
||||
|
||||
|
||||
def get_common_error_classes(
    db: Path,
    tool: Optional[str] = None,
    status: Optional[str] = None,
    use_androguard: Optional[bool] = None,
    use_java: Optional[bool] = None,
    use_prolog: Optional[bool] = None,
    use_ruby: Optional[bool] = None,
    use_soot: Optional[bool] = None,
    use_apktool: Optional[bool] = None,
    use_ocaml: Optional[bool] = None,
    use_python: Optional[bool] = None,
    use_scala: Optional[bool] = None,
    folder: Optional[Path] = None,
    limit: int = 10,
):
    """Get the most common error classes.

    Like get_common_errors, but grouping only by exception name and error
    type.  Writes a CSV (columns ``type``, ``error``, ``count``) to stdout
    or to a file named after the active filters.
    """
    args: dict[str, Any] = {"limit": limit}
    clauses = []
    if tool is not None:
        clauses.append("(distinct_error.tool_name = :tool)")
        args["tool"] = tool
    if status is not None:
        clauses.append("(exec.tool_status = :tool_status)")
        args["tool_status"] = status

    # Feature filters.  BUG FIX: use_androguard was previously accepted and
    # shown in the output file name but never added to the WHERE clause; it
    # is now filtered like every other feature.
    feature_filters = {
        "use_androguard": use_androguard,
        "use_java": use_java,
        "use_prolog": use_prolog,
        "use_ruby": use_ruby,
        "use_soot": use_soot,
        "use_apktool": use_apktool,
        "use_ocaml": use_ocaml,
        "use_python": use_python,
        "use_scala": use_scala,
    }
    for feature, value in feature_filters.items():
        if value is not None:
            clauses.append(f"(tool.{feature} = :{feature})")
            args[feature] = value
    where_clause = ""
    if clauses:
        where_clause = f"WHERE {' AND '.join(clauses)}"

    if folder is None:
        out = sys.stdout
    else:
        # Build a file name that encodes the active filters.
        tool_str = "" if tool is None else f"_for_{tool}"
        status_str = "" if status is None else f"_when_{status}"
        if all(value is None for value in feature_filters.values()):
            features_str = ""
        else:
            features_str = "_using" + "".join(
                "_" + feature.removeprefix("use_")
                for feature, value in feature_filters.items()
                if value
            )
        name = (
            f"{limit}_most_common_errors_classes"
            f"{tool_str}{status_str}{features_str}.csv"
        )
        # make sure the folder exists
        folder.mkdir(parents=True, exist_ok=True)
        out = (folder / name).open("w")

    with sqlite3.connect(db) as con:
        cur = con.cursor()
        writer = csv.DictWriter(out, fieldnames=["type", "error", "count"])
        writer.writeheader()
        for row in cur.execute(
            (
                "SELECT COUNT(*) AS cnt, distinct_error.error, distinct_error.error_type "
                f"FROM {DISTINCT_ERROR_CLASS} "
                "INNER JOIN tool ON distinct_error.tool_name = tool.tool_name "
                "INNER JOIN exec ON "
                " distinct_error.tool_name = exec.tool_name AND "
                " distinct_error.sha256 = exec.sha256 "
                f"{where_clause} "
                "GROUP BY distinct_error.error, distinct_error.error_type "
                "ORDER BY cnt DESC LIMIT :limit;"
            ),
            args,
        ):
            row_d = {k: v for (k, v) in zip(("count", "error", "type"), row)}
            writer.writerow(row_d)
    if folder is not None:
        out.close()
|
||||
|
||||
|
||||
def get_nb_error(
    db: Path,
    folder: Optional[Path] = None,
):
    """Compute the average number of errors per execution for each tool.

    Writes ``average_number_of_error_by_exec.csv`` (or prints to stdout
    when *folder* is None): one column per tool, with average and standard
    deviation rows per execution status.
    """
    # Subquery: number of errors for every (tool, apk) pair, 0 included,
    # via a CROSS JOIN of all tools with all apks LEFT JOINed on error.
    NB_ERR = (
        "("
        "SELECT "
        " exec_id.tool_name, exec_id.sha256, COUNT(error._rowid_) AS nb_err "
        "FROM ("
        " (SELECT tool_name FROM tool) CROSS JOIN (SELECT sha256 FROM apk)"
        ") AS exec_id LEFT JOIN error "
        "ON exec_id.tool_name=error.tool_name AND exec_id.sha256=error.sha256 "
        "GROUP BY exec_id.tool_name, exec_id.sha256"
        ") AS nb_err"
    )
    data = {}
    tools = set()
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Variance computed as E[X^2] - E[X]^2.
        for tool, status, avg, variance in cur.execute(
            "SELECT nb_err.tool_name, exec.tool_status, AVG(nb_err.nb_err), "
            " AVG(nb_err.nb_err*nb_err.nb_err) - AVG(nb_err.nb_err)*AVG(nb_err.nb_err) "
            f"FROM {NB_ERR} "
            "INNER JOIN exec ON nb_err.tool_name = exec.tool_name AND nb_err.sha256 = exec.sha256 "
            "GROUP BY nb_err.tool_name, exec.tool_status;"
        ):
            tools.add(tool)
            data[(tool, status)] = (avg, variance)
    fieldnames = list(tools)
    fieldnames.sort()
    fieldnames = ["", *fieldnames]
    if folder is None:
        fd = sys.stdout
    else:
        fd = (folder / "average_number_of_error_by_exec.csv").open("w")
    writer = csv.DictWriter(fd, fieldnames=fieldnames)
    writer.writeheader()
    for status in ("FINISHED", "FAILED", "TIMEOUT"):
        row = {"": status}
        for tool in tools:
            row[tool] = round(data.get((tool, status), (0, 0))[0], 2)
        writer.writerow(row)
        # NOTE(review): indentation reconstructed from a mangled extraction;
        # the standard-deviation row is emitted once per status here —
        # confirm against the original layout.
        row = {"": "standard deviation"}
        for tool in tools:
            row[tool] = round(data.get((tool, status), (0, 0))[1] ** (1 / 2), 2)
        writer.writerow(row)
    if folder is not None:
        fd.close()
|
||||
|
||||
|
||||
def error_type_repartition(
    db: Path, interactive: bool = True, folder: Optional[Path] = None
):
    """Draw a heat map of the dominant error types for each tool.

    For every tool, keeps its 3 most frequent error types and plots, for
    each (error type, tool) cell, the percentage of that tool's errors
    having this type.

    :param db: sqlite database holding the ``error`` table.
    :param interactive: display the figure interactively.
    :param folder: when given, the folder where the figure is saved.
    """
    # data[tool][error] = occurrence count; total[tool] = all named errors.
    data: dict[str, dict[str, int]] = {}
    total: dict[str, int] = {}
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        for tool, err, n in cur.execute(
            "SELECT tool_name, error, COUNT(*) FROM error GROUP BY tool_name, error;"
        ):
            if tool not in data:
                data[tool] = {}
                total[tool] = 0
            if err is not None and err != "":
                data[tool][err] = n
        for tool, n in cur.execute(
            "SELECT tool_name, COUNT(*) FROM error WHERE error IS NOT NULL AND error != '' GROUP BY tool_name;"
        ):
            total[tool] = n
    # Keep the N most frequent error types of each tool as heat map rows.
    errors = set()
    N = 3
    for tool in data:
        for err in sorted(
            [err for err in data[tool]], key=lambda err: data[tool][err], reverse=True
        )[:N]:
            # TODO Check of > 10%?
            errors.add(err)
    tools = sorted(data.keys())
    errors_l = sorted(errors)
    # Cell value: share (in %) of the tool's errors having this type.
    values = [
        [
            data[tool].get(err, 0) * 100 / total[tool] if total[tool] != 0 else 0
            for tool in tools
        ]
        for err in errors_l
    ]
    plt.figure(figsize=(22, 20))
    im = plt.imshow(values, cmap="Greys")
    cbar = plt.colorbar(im)
    cbar.ax.set_ylabel(
        "% of the error type among the error raised by the tool",
        rotation=-90,
        va="bottom",
    )

    import numpy as np

    plt.xticks(np.arange(len(tools)), labels=tools, rotation=80)
    plt.yticks(np.arange(len(errors_l)), labels=errors_l)
    # Minor ticks at cell boundaries give the white grid between cells.
    plt.xticks(np.arange(len(tools) + 1) - 0.5, minor=True)
    plt.yticks(np.arange(len(errors_l) + 1) - 0.5, minor=True)
    plt.grid(which="minor", color="w", linestyle="-", linewidth=3)
    plt.tick_params(which="minor", bottom=False, left=False)
    plt.title("Repartition of error types among tools")
    # plt.figure().set_figheight(10)
    render(
        "Repartition of error types among tools",
        interactive,
        folder,
        tight_layout=False,
    )
|
62
rasta_data_manipulation/rasta_triturage/ressources.py
Normal file
62
rasta_data_manipulation/rasta_triturage/ressources.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
import sqlite3
|
||||
import sys
|
||||
import csv
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def get_ressource(
    db: Path,
    folder: Optional[Path] = None,
):
    """Export average run time and memory usage per tool and status as CSV.

    Two tables are produced, `average_time.csv` and `average_mem.csv`:
    one column per tool, one row per status (FINISHED, FAILED, TIMEOUT)
    plus a final "standard deviation" row.

    Args:
        db: path of the sqlite database (needs the `exec` table).
        folder: output directory for the CSV files; if None, both tables
            are written to stdout instead.
    """
    data_time = {}
    data_mem = {}
    tools = set()
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Variance computed directly in SQL as E[X^2] - E[X]^2.
        for tool, status, avg_time, var_time, avg_mem, var_mem in cur.execute(
            "SELECT tool_name, exec.tool_status, "
            " AVG(time), AVG(time*time) - AVG(time)*AVG(time), "
            " AVG(max_rss_mem), AVG(max_rss_mem*max_rss_mem) - AVG(max_rss_mem)*AVG(max_rss_mem) "
            "FROM exec "
            "GROUP BY tool_name, tool_status;"
        ):
            tools.add(tool)
            # AVG() is None when every value in the group is NULL.
            avg_time = avg_time if avg_time is not None else 0
            avg_mem = avg_mem if avg_mem is not None else 0
            # E[X^2] - E[X]^2 can come out slightly negative because of
            # floating-point rounding; clamp it so the square root below
            # never yields a complex number (which would crash round()).
            var_time = max(var_time or 0, 0)
            var_mem = max(var_mem or 0, 0)
            data_time[(tool, status)] = (avg_time, var_time ** (1 / 2))
            data_mem[(tool, status)] = (avg_mem, var_mem ** (1 / 2))
    # First (unnamed) column holds the status label, then one column per tool.
    fieldnames = ["", *sorted(tools)]
    if folder is None:
        fd_time = sys.stdout
        fd_mem = sys.stdout
    else:
        # newline="" is required by the csv module to avoid spurious blank
        # lines on platforms with \r\n line endings.
        fd_time = (folder / "average_time.csv").open("w", newline="")
        fd_mem = (folder / "average_mem.csv").open("w", newline="")
    try:
        writer_time = csv.DictWriter(fd_time, fieldnames=fieldnames)
        writer_mem = csv.DictWriter(fd_mem, fieldnames=fieldnames)
        writer_time.writeheader()
        writer_mem.writeheader()
        for status in ("FINISHED", "FAILED", "TIMEOUT"):
            row_time = {"": status}
            row_mem = {"": status}
            for tool in tools:
                row_time[tool] = round(data_time.get((tool, status), (0, 0))[0], 2)
                row_mem[tool] = round(data_mem.get((tool, status), (0, 0))[0], 2)
            writer_time.writerow(row_time)
            writer_mem.writerow(row_mem)
        # NOTE(review): `status` below is the leftover loop variable
        # ("TIMEOUT"), so this row reports the standard deviation of the
        # TIMEOUT runs only. Behavior preserved as found -- confirm intent.
        row_time = {"": "standard deviation"}
        row_mem = {"": "standard deviation"}
        for tool in tools:
            row_time[tool] = round(data_time.get((tool, status), (0, 0))[1], 2)
            row_mem[tool] = round(data_mem.get((tool, status), (0, 0))[1], 2)
        writer_time.writerow(row_time)
        writer_mem.writerow(row_mem)
    finally:
        # Only close the files we opened ourselves, never stdout.
        if folder is not None:
            fd_time.close()
            fd_mem.close()
|
446
rasta_data_manipulation/rasta_triturage/status.py
Normal file
446
rasta_data_manipulation/rasta_triturage/status.py
Normal file
|
@ -0,0 +1,446 @@
|
|||
"""
|
||||
Plots related to the tool status.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from matplotlib import pyplot as plt # type: ignore
|
||||
from typing import Any, Callable, Optional
|
||||
from .utils import (
|
||||
render,
|
||||
DENSE_DASH,
|
||||
DENSE_DOT,
|
||||
get_list_tools,
|
||||
plot_generic,
|
||||
MARKERS,
|
||||
COLORS,
|
||||
)
|
||||
from .populate_db_tool import TOOL_INFO
|
||||
|
||||
# Line style per tool: dotted for Soot-based tools, dashed for the others,
# so the two tool families are visually distinguishable in line plots.
TOOL_LINE_STYLE = {
    tool_info["tool_name"]: DENSE_DOT if tool_info["use_soot"] else DENSE_DASH
    for tool_info in TOOL_INFO
}
|
||||
|
||||
|
||||
def plot_status_by_tool(
    db: Path,
    interactive: bool = True,
    image_path: Path | None = None,
    tools: list[str] | None = None,
    title: str = "Exit Status",
):
    """Plot the repartition of status by tools as stacked percentage bars.

    Args:
        db: path of the sqlite database (needs the `apk` and `exec` tables).
        interactive: display the figure on screen.
        image_path: directory where the figure is saved (if not None).
        tools: tools to include; defaults to every tool present in `exec`.
        title: figure title, also used as the saved file name.
    """
    if tools is None:
        tools = get_list_tools(db)
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        # Placeholder list for the IN (...) clause, one '?' per tool.
        tools_list_format = f"({','.join(['?' for _ in tools])})"
        nb_apk = cur.execute("SELECT COUNT(*) FROM apk;").fetchone()[0]
        status = cur.execute(
            (
                "SELECT tool_name, tool_status, COUNT(sha256) "
                "FROM exec "
                f"WHERE tool_name IN {tools_list_format}"
                "GROUP BY tool_name, tool_status;"
            ),
            tools,
        ).fetchall()
    occurences = {}
    for tool, stat, occurence in status:
        occurences[(tool, stat)] = occurence
    # tools.sort(key=lambda t: occurences.get((t, "FINISHED"), 0), reverse=True)
    tools.sort()

    values = {
        "Finished": np.zeros(len(tools)),
        "Time Out": np.zeros(len(tools)),
        "Other": np.zeros(len(tools)),
        "Failed": np.zeros(len(tools)),
    }
    colors = {
        "Finished": "#009E73",
        "Time Out": "#56B4E9",
        "Failed": "#D55E00",
        "Other": "#555555",  # TODO: better color
    }
    hatch = {
        "Finished": "/",
        "Time Out": "x",
        "Failed": "\\",
        "Other": ".",
    }
    for i, tool in enumerate(tools):
        values["Finished"][i] = occurences.get((tool, "FINISHED"), 0)
        values["Time Out"][i] = occurences.get((tool, "TIMEOUT"), 0)
        values["Failed"][i] = occurences.get((tool, "FAILED"), 0)
        # Apks with no record (or an unexpected status) for this tool.
        values["Other"][i] = (
            nb_apk - values["Finished"][i] - values["Time Out"][i] - values["Failed"][i]
        )
    # Convert counts to percentages. Guard against an empty apk table
    # (division by zero), consistent with plot_status_by_tool_and_malware.
    if nb_apk != 0:
        for stat in values:
            values[stat] = (100 * values[stat]) / nb_apk
    bottom = np.zeros(len(tools))

    print("Finishing rate:")
    for t, p in zip(tools, values["Finished"]):
        print(f"{t}: {p:.2f}%")

    plt.figure(figsize=(20, 9), dpi=80)
    # Visual guide lines at 15%, 50% and 85%.
    plt.axhline(y=50, linestyle="dotted")
    plt.axhline(y=85, linestyle="dotted")
    plt.axhline(y=15, linestyle="dotted")
    for stat in ["Finished", "Time Out", "Other", "Failed"]:
        plt.bar(
            tools,
            values[stat],
            label=stat,
            color=colors[stat],
            hatch=hatch[stat],
            bottom=bottom,
            width=0.6,
            edgecolor="black",
        )
        bottom += values[stat]
    plt.xticks(tools, tools, rotation=80)
    plt.legend()
    plt.ylabel("% of analysed apk")
    render(title, interactive, image_path)
|
||||
|
||||
|
||||
def plot_status_by_tool_and_malware(
    db: Path,
    interactive: bool = True,
    image_path: Path | None = None,
    tools: list[str] | None = None,
    title: str = "Exit Status Goodware/Malware",
):
    """Plot the repartition of status by tools, split goodware/malware.

    For each tool two stacked percentage bars are drawn side by side: the
    left one for goodware (vt_detection == 0), the right one for malware.

    Args:
        db: path of the sqlite database (needs the `apk` and `exec` tables).
        interactive: display the figure on screen.
        image_path: directory where the figure is saved (if not None).
        tools: tools to include; defaults to every tool present in `exec`.
        title: figure title, also used as the saved file name.
    """

    def pct(count: float, total: int) -> float:
        # Percentage of `count` over `total`, 0 when the population is empty.
        return 0 if total == 0 else (100 * count) / total

    if tools is None:
        tools = get_list_tools(db)
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        tools_list_format = f"({','.join(['?' for _ in tools])})"
        nb_goodware = cur.execute(
            "SELECT COUNT(*) FROM apk WHERE vt_detection == 0;"
        ).fetchone()[0]
        nb_malware = cur.execute(
            "SELECT COUNT(*) FROM apk WHERE vt_detection != 0;"
        ).fetchone()[0]
        status = cur.execute(
            (
                "SELECT tool_name, tool_status, COUNT(exec.sha256), vt_detection != 0 "
                "FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 "
                f"WHERE tool_name IN {tools_list_format} "
                "GROUP BY tool_name, tool_status, vt_detection != 0;"
            ),
            tools,
        ).fetchall()
    occurences = {}
    for tool, stat, occurence, malware in status:
        occurences[(tool, stat, bool(malware))] = occurence
    tools.sort()

    values = {
        "Finished": np.zeros(len(tools) * 2),
        "Time Out": np.zeros(len(tools) * 2),
        "Other": np.zeros(len(tools) * 2),
        "Failed": np.zeros(len(tools) * 2),
    }
    colors = {
        "Finished": "#009E73",
        "Time Out": "#56B4E9",
        "Other": "#555555",  # TODO: find beter color
        "Failed": "#D55E00",
    }
    hatch = {
        "Finished": "/",
        "Time Out": "x",
        "Other": ".",
        "Failed": "\\",
    }
    for i, tool in enumerate(tools):
        # Even slots hold the goodware bar, odd slots the malware bar.
        i_goodware = 2 * i
        i_malware = 2 * i + 1
        for label, db_status in (
            ("Finished", "FINISHED"),
            ("Time Out", "TIMEOUT"),
            ("Failed", "FAILED"),
        ):
            values[label][i_goodware] = occurences.get((tool, db_status, False), 0)
            values[label][i_malware] = occurences.get((tool, db_status, True), 0)
        # Apks with no record (or an unexpected status) for this tool.
        values["Other"][i_goodware] = (
            nb_goodware
            - values["Finished"][i_goodware]
            - values["Time Out"][i_goodware]
            - values["Failed"][i_goodware]
        )
        values["Other"][i_malware] = (
            nb_malware
            - values["Finished"][i_malware]
            - values["Time Out"][i_malware]
            - values["Failed"][i_malware]
        )
        # Convert counts to percentages of each population.
        for label in ("Finished", "Time Out", "Failed", "Other"):
            values[label][i_goodware] = pct(values[label][i_goodware], nb_goodware)
            values[label][i_malware] = pct(values[label][i_malware], nb_malware)
    bottom = np.zeros(len(tools) * 2)

    # X position of each bar: the two bars of a pair are `lstep` apart,
    # consecutive pairs are `bstep` apart.
    x_axis = np.zeros(len(tools) * 2)
    x_width = 3
    x_0 = x_width / 2
    lstep = 1
    bstep = 5
    for i in range(len(tools)):
        x_0 += bstep + x_width
        x_axis[2 * i] = x_0
        x_0 += lstep + x_width
        x_axis[2 * i + 1] = x_0
    tick_legend = []
    for tool in tools:
        tick_legend.append(f"{tool}")  # (f"{tool} on goodware")
        tick_legend.append("")  # (f"{tool} on malware")

    plt.figure(figsize=(20, 9), dpi=80)
    for stat in ["Finished", "Time Out", "Other", "Failed"]:
        plt.bar(
            x_axis,
            values[stat],
            label=stat,
            color=colors[stat],
            hatch=hatch[stat],
            bottom=bottom,
            width=x_width,
            edgecolor="black",
        )
        bottom += values[stat]
    plt.xticks(x_axis, tick_legend, rotation=80)
    plt.legend()
    plt.ylabel("% of analysed apk")
    render(title, interactive, image_path)
|
||||
|
||||
|
||||
def plot_status_by_generic_x(
    tools: list[str],
    x_col: str,
    x_label: str,
    x_in_title: str,
    args,
    group_by: Optional[str] = None,
):
    # Plot, for each tool, the finishing rate against the apk column `x_col`,
    # split malware/goodware and combined. `args` must provide `data` (the
    # database path), `display` and `figures_file` (render options).
    # `x_col` / `group_by` are interpolated directly into the SQL: callers
    # pass trusted, hard-coded column names only.
    tools.sort()
    # NOTE(review): placed after the first statement, this string is a no-op
    # expression rather than the function docstring; kept as found.
    """group_by default to x_col, x_col must be uniq for a grouped by group_by"""
    if group_by is None:
        group_by = x_col
    with sqlite3.connect(args.data) as con:
        cur = con.cursor()
        # Number of goodware apks (vt_detection == 0) per x group.
        nb_goodware_res = cur.execute(
            f"SELECT {group_by}, COUNT(*) FROM apk WHERE vt_detection == 0 GROUP BY {group_by};",
        ).fetchall()
        nb_goodware = {}
        for x_group, count in nb_goodware_res:
            nb_goodware[x_group] = count
        # Number of malware apks (vt_detection != 0) per x group.
        nb_malware_res = cur.execute(
            f"SELECT {group_by}, COUNT(*) FROM apk WHERE vt_detection != 0 GROUP BY {group_by};",
        ).fetchall()
        nb_malware = {}
        for x_group, count in nb_malware_res:
            nb_malware[x_group] = count
        # FINISHED run count per (tool, x value, x group, malware flag).
        statuses_res = cur.execute(
            (
                f"SELECT tool_name, {x_col}, {group_by}, COUNT(exec.sha256), vt_detection != 0 "
                "FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 "
                f"WHERE tool_status = 'FINISHED' "
                f"GROUP BY tool_name, tool_status, {group_by}, vt_detection != 0 "
                f"HAVING {x_col} IS NOT NULL;"
            )
        ).fetchall()
    # Total FINISHED count per (tool, x group), malware and goodware together.
    # The stored x_val is the one of the first row seen for that group
    # (x_col is assumed unique within a group_by group, per the note above).
    tots = {}
    for tool_, x_val, x_group, count, is_malware in statuses_res:
        if not (tool_, x_group) in tots:
            tots[(tool_, x_group)] = [x_val, 0]
        tots[(tool_, x_group)][1] += count
    plots = []
    plots_malgood = []
    metas = []
    metas_malgood = []
    for tool in tools:
        # Finishing rate on malware only, one point per x value; groups with
        # no malware are skipped to avoid dividing by zero.
        malware_plot = [
            (x_val, 100 * count / nb_malware[x_group])
            for (tool_, x_val, x_group, count, is_malware) in statuses_res
            if (tool_ == tool) and is_malware and nb_malware.get(x_group, 0) != 0
        ]
        malware_meta = (f"{tool} on malware", DENSE_DOT, MARKERS[tool], COLORS[tool])
        # Same on goodware only.
        goodware_plot = [
            (x_val, 100 * count / nb_goodware[x_group])
            for (tool_, x_val, x_group, count, is_malware) in statuses_res
            if (tool_ == tool) and not is_malware and nb_goodware.get(x_group, 0) != 0
        ]
        goodware_meta = (f"{tool} on goodware", DENSE_DASH, MARKERS[tool], COLORS[tool])
        # Combined finishing rate over the whole population of the group.
        total_plot = [
            (
                x_val,
                100
                * count
                / (nb_malware.get(x_group, 0) + nb_goodware.get(x_group, 0)),
            )
            for ((tool_, x_group), (x_val, count)) in tots.items()
            if (tool_ == tool)
            and (nb_malware.get(x_group, 0) + nb_goodware.get(x_group, 0)) != 0
        ]
        total_meta = (f"{tool}", DENSE_DOT, MARKERS[tool], COLORS[tool])
        plots.append(total_plot)
        plots_malgood.append(malware_plot)
        plots_malgood.append(goodware_plot)
        metas.append(total_meta)
        metas_malgood.append(malware_meta)
        metas_malgood.append(goodware_meta)

        # Per-tool figures: malware vs goodware, then combined.
        plot_generic(
            [goodware_plot, malware_plot],
            [goodware_meta, malware_meta],
            x_label,
            "finishing rate",
            f"Finishing Rate by {x_in_title} for {tool} on malware and goodware",
            ylim=(-5, 105),
            interactive=args.display,
            image_path=args.figures_file,
        )
        plot_generic(
            [total_plot],
            [total_meta],
            x_label,
            "finishing rate",
            f"Finishing Rate by {x_in_title} for {tool}",
            ylim=(-5, 105),
            interactive=args.display,
            image_path=args.figures_file,
        )
    # Aggregate figures over every tool.
    plot_generic(
        plots_malgood,
        metas_malgood,
        x_label,
        "finishing rate",
        f"Finishing Rate by {x_in_title} on malware and goodware",
        ylim=(-5, 105),
        interactive=args.display,
        image_path=args.figures_file,
    )
    plot_generic(
        plots,
        metas,
        x_label,
        "finishing rate",
        f"Finishing Rate by {x_in_title}",
        ylim=(-5, 105),
        interactive=args.display,
        image_path=args.figures_file,
    )
|
||||
|
||||
|
||||
def dbg(arg):
    """Identity pass-through used as a debug hook on SQL query strings.

    Re-enable the print below to trace the queries built by the callers.
    """
    # print(arg)
    return arg
|
||||
|
||||
|
||||
def plot_all_status_by_generic_x(
    tools: list[str],
    x_col: str,
    x_label: str,
    title: str,
    args,
    condition: Optional[str] = None,
    apk_condition: Optional[str] = None,
    group_by: Optional[str] = None,
):
    """Plot every tool's finishing rate against the apk column `x_col`.

    `condition` is AND-ed into the WHERE clause of the exec/apk/tool join;
    `apk_condition` additionally restricts the apk population used as the
    denominator. SQL fragments are interpolated directly (f-strings), so
    callers must pass trusted, hard-coded fragments only. `args` must
    provide `data` (database path), `display` and `figures_file`.
    """
    # Normalize the two optional filters: the joined query always gets both
    # filters ("AND (...)"), the apk count only gets apk_condition ("WHERE (...)").
    if condition is None and apk_condition is None:
        condition = ""
        apk_condition = ""
    elif apk_condition is None:
        condition = f"AND ({condition})"
        apk_condition = ""
    elif condition is None:
        condition = f"AND ({apk_condition})"
        apk_condition = f"WHERE ({apk_condition})"
    else:
        condition = f"AND ({apk_condition}) AND ({condition})"
        apk_condition = f"WHERE ({apk_condition})"
    if group_by is None:
        group_by = x_col
    nb_apk = {}
    tools.sort()
    with sqlite3.connect(args.data) as con:
        cur = con.cursor()
        # Size of the apk population per x group (the denominator).
        for x_group, count in cur.execute(
            f"SELECT {group_by}, COUNT(*) FROM apk {apk_condition} GROUP BY {group_by};",
        ):
            nb_apk[x_group] = count
        # FINISHED run count per (tool, x value, x group); dbg() lets the
        # generated query be traced when debugging.
        statuses_res = cur.execute(
            dbg(
                f"SELECT exec.tool_name, {x_col}, {group_by}, COUNT(exec.sha256) "
                "FROM exec "
                " INNER JOIN apk ON exec.sha256 = apk.sha256 "
                " INNER JOIN tool ON exec.tool_name = tool.tool_name "
                f"WHERE tool_status = 'FINISHED' {condition} "
                f"GROUP BY exec.tool_name, tool_status, {group_by} "
                f"HAVING {x_col} IS NOT NULL;"
            )
        ).fetchall()
    plots = []
    metas = []
    for tool in tools:
        # One (x, finishing-rate %) point per x value; empty groups skipped.
        plot = [
            (x_val, 100 * count / nb_apk[x_group])
            for (tool_, x_val, x_group, count) in statuses_res
            if (tool_ == tool) and nb_apk.get(x_group, 0) != 0
        ]
        if len(plot) == 0:
            continue
        meta = (tool, TOOL_LINE_STYLE[tool], MARKERS[tool], COLORS[tool])
        plots.append(plot)
        metas.append(meta)
    plot_generic(
        plots,
        metas,
        x_label,
        "finishing rate",
        title,
        ylim=(-5, 105),
        interactive=args.display,
        image_path=args.figures_file,
    )
|
185
rasta_data_manipulation/rasta_triturage/utils.py
Normal file
185
rasta_data_manipulation/rasta_triturage/utils.py
Normal file
|
@ -0,0 +1,185 @@
|
|||
"""
|
||||
Utils.
|
||||
"""
|
||||
|
||||
import matplotlib.pyplot as plt # type: ignore
|
||||
import numpy as np
|
||||
from slugify import slugify # type: ignore
|
||||
from typing import Any, Callable, Optional
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
|
||||
# Dashed / dotted line styles (matplotlib (offset, on-off-seq) format).
DENSE_DASH = (0, (5, 1))
DENSE_DOT = (0, (1, 3))

# Matplotlib marker assigned to each tool, so curves stay distinguishable
# even in black and white.
MARKERS = {
    "adagio": ".",
    "amandroid": "o",
    "anadroid": "X",
    "androguard": "+",
    "androguard_dad": "v",
    "apparecium": "d",
    "blueseal": "^",
    "dialdroid": "<",
    "didfail": ">",
    "droidsafe": r"$\circ$",
    "flowdroid": r"$\boxplus$",
    "gator": r"$\otimes$",
    "ic3": "1",
    "ic3_fork": "s",
    "iccta": "P",
    "mallodroid": r"$\divideontimes$",
    "perfchecker": "*",
    "redexer": "x",
    "saaf": "D",
    "wognsen_et_al": r"$\rtimes$",
}

# Color assigned to each tool.
# NOTE(review): the original literal listed 8 tools twice; Python keeps the
# last occurrence of a duplicated key, so some tools end up sharing a color
# (e.g. didfail and adagio are both "#1f77b4"). The mapping below is the
# deduplicated, effective one -- runtime-identical to the original literal;
# confirm the palette collisions are intended.
COLORS = {
    "didfail": "#1f77b4",
    "adagio": "#1f77b4",
    "iccta": "#2ca02c",
    "androguard": "#ff7f0e",
    "gator": "#9467bd",
    "mallodroid": "#2ca02c",
    "dialdroid": "#e377c2",
    "androguard_dad": "#d62728",
    "wognsen_et_al": "#9467bd",
    "perfchecker": "#17becf",
    "amandroid": "#8c564b",
    "ic3": "#ff7f0e",
    "apparecium": "#e377c2",
    "blueseal": "#d62728",
    "droidsafe": "#9467bd",
    "redexer": "#7f7f7f",
    "anadroid": "#e377c2",
    "saaf": "#7f7f7f",
    "ic3_fork": "#bcbd22",
    "flowdroid": "#17becf",
}
|
||||
|
||||
|
||||
def get_list_tools(db: Path) -> list[str]:
    """Get the list of distinct tool names found in the database.

    Args:
        db: path of the sqlite database (needs the `exec` table).

    Returns:
        The distinct tool names, in database order.
    """
    # sqlite3's `with` block only manages the transaction, it does NOT close
    # the connection -- close it explicitly to avoid leaking file handles.
    con = sqlite3.connect(db)
    try:
        rows = con.execute("SELECT DISTINCT tool_name FROM exec;").fetchall()
    finally:
        con.close()
    return [row[0] for row in rows]
|
||||
|
||||
|
||||
def radar_chart(
    axes: list[str],
    values: list[list[Any]],
    labels: list[str],
    title: str,
    interactive: bool,
    image_path: Path | None,
):
    """Draw a radar (spider) chart with one closed polygon per label.

    `axes` names the spokes; `values[i]` gives one value per spoke for
    `labels[i]`. The figure is handed to `render` (shown and/or saved).
    """
    plt.rc("grid", linewidth=1, linestyle="-")
    plt.rc("xtick", labelsize=15)
    plt.rc("ytick", labelsize=15)
    # One angle per spoke, plus a copy of the first to close each polygon.
    spoke_angles = np.linspace(0, 2 * np.pi, len(axes), endpoint=False)
    spoke_angles = np.concatenate((spoke_angles, [spoke_angles[0]]))  # type: ignore
    figure = plt.figure(figsize=(8, 8))
    polar_ax = figure.add_subplot(111, polar=True)
    for curve_label, curve_values in zip(labels, values):
        closed_values = curve_values + [curve_values[0]]
        polar_ax.plot(
            spoke_angles,
            closed_values,
            label=curve_label,
            marker=MARKERS.get(curve_label, "."),
        )
        polar_ax.fill(spoke_angles, closed_values, alpha=0.25)
    # Spoke labels (degrees expected by set_thetagrids).
    polar_ax.set_thetagrids(spoke_angles[:-1] * 180 / np.pi, axes)
    polar_ax.set_ylim(bottom=0)
    polar_ax.grid(True)
    # Legend below the chart, at most 5 entries per row.
    ncol = min(5, len(labels))
    polar_ax.legend(
        loc="lower left",
        bbox_to_anchor=(0.0, -0.2, ncol * 1.0 / 5, 0.102),
        ncol=ncol,
        mode="expand",
        borderaxespad=0.0,
        fancybox=True,
        shadow=True,
        fontsize="xx-small",
    )
    render(title, interactive, image_path)
|
||||
|
||||
|
||||
def render(
    title: str, interactive: bool, image_path: Path | None, tight_layout: bool = True
):
    """Render the current matplotlib figure, then close it.

    Args:
        title: figure name; slugified to build the saved file name.
        interactive: if True, display the figure on screen.
        image_path: if not None, directory where the figure is saved as PDF.
        tight_layout: apply plt.tight_layout() before rendering.
    """
    # The title is intentionally not drawn on the figure; it only names the file.
    # plt.title(title)
    if tight_layout:
        plt.tight_layout()
    if image_path is not None:
        # exist_ok=True already tolerates an existing directory, so the racy
        # exists() pre-check was dropped.
        image_path.mkdir(parents=True, exist_ok=True)
        plt.savefig(image_path / (slugify(title) + ".pdf"), format="pdf")
    if interactive:
        plt.show()
    plt.close()
|
||||
|
||||
|
||||
def mean(field: str) -> Callable[[list[Any]], float]:
    """Build a function computing the mean of `field` over a list of rows.

    Rows where `field` is None are ignored (consistent with `median`,
    instead of raising TypeError). An empty input yields 0.0.
    """

    def compute_mean(data: list[Any]) -> float:
        values = [e[field] for e in data if e[field] is not None]
        return 0.0 if not values else sum(values) / len(values)

    return compute_mean
|
||||
|
||||
|
||||
def median(field: str) -> Callable[[list[Any]], float]:
    """Build a function returning the (upper) median of `field` over rows.

    Rows where `field` is None are skipped; an empty input yields 0.0.
    For an even number of values the upper middle element is returned
    (no averaging of the two middle values).
    """

    def compute_median(data: list[Any]) -> float:
        ordered = sorted(e[field] for e in data if e[field] is not None)
        if not ordered:
            return 0.0
        return ordered[len(ordered) // 2]

    return compute_median
|
||||
|
||||
|
||||
def plot_generic(
    data: list[list[tuple[Any, Any]]],
    meta: list[tuple[str, Any, Any, str]],
    x_label: str,
    y_label: str,
    title: str,
    ylim: Optional[tuple[int, int]] = None,
    interactive: bool = True,
    image_path: Path | None = None,
):
    """Plot a list of curves, each represented by a list[(x, y)].

    `meta[i]` is the (label, linestyle, marker, color) tuple for `data[i]`.
    The figure is handed to `render` (shown and/or saved depending on
    `interactive` / `image_path`).

    NOTE: the point lists in `data` are sorted in place (the caller's
    lists are mutated).
    """
    plt.figure(figsize=(16, 9), dpi=80)
    for i, plot in enumerate(data):
        label, linestyle, marker, color = meta[i]
        # Sort points by x so each line is drawn left to right.
        plot.sort(key=lambda p: p[0])
        x_values = np.array([x for (x, _) in plot])
        y_values = np.array([y for (_, y) in plot])
        # Drop points with NaN y values (mask applied to both coordinates).
        plt.plot(
            x_values[~np.isnan(y_values)],
            y_values[~np.isnan(y_values)],
            label=label,
            marker=marker,
            color=color,
            linestyle=linestyle,
        )
    if ylim is not None:
        plt.ylim(ylim)
    # Legend centered below the axes.
    plt.legend(loc="upper center", ncol=4, bbox_to_anchor=(0.5, -0.1))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    render(title, interactive, image_path)
|
Loading…
Add table
Add a link
Reference in a new issue