rasta/rasta_data_manipulation/rasta_triturage/cli.py

import argparse
import code
import json
import sys
import sqlite3
import seaborn as sns # type: ignore
sns.set_context("talk")
from pathlib import Path
from math import floor, log
from typing import Callable, Any
from .status import (
plot_status_by_tool,
plot_status_by_tool_and_malware,
plot_all_status_by_generic_x,
plot_status_by_generic_x,
plot_compare_status,
)
from .apk import (
plot_apk_info_by_generic_x,
plot_apk_size,
plot_apk_size_hl_subset,
)
from .populate_db_apk import populate_db_apk as populate_db_apk_
from .utils import (
mean,
median,
get_list_tools,
DENSE_DASH,
DENSE_DOT,
)
from .ressources import get_ressource
from .populate_db_exec import (
populate_execution_report,
fix_error,
estimate_cause,
)
from .populate_db_tool import populate_tool
from .query_error import (
get_common_errors,
get_common_error_classes,
get_nb_error,
radar_cause_estimation,
error_type_repartition,
)
from .ic3 import ic3_venn, ic3_errors
from .data_set import gen_dataset
def round_apk_size(size) -> float:
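    """Round a size down to the nearest power of 4."""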
return 4 ** floor(log(size, 4))
def get_common_arg(description: str = "") -> argparse.ArgumentParser:
"""Get the usual arguments."""
parser = argparse.ArgumentParser(prog=sys.argv[0], description=description)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database that contain the execution report and apk info",
)
parser.add_argument(
"-f",
"--figures-file",
type=Path,
help="The folder in which the figures must be stored",
)
parser.add_argument(
"--display",
action="store_true",
help="If the figures must be displayed",
)
parser.add_argument(
"-t",
"--tools",
nargs="+",
default=None,
help="The tools to analyse",
)
return parser
def main():
"""Console entrypoint."""
show_status_by_tool()
show_success_rate_by_first_seen_year()
show_success_rate_by_size_decile()
show_success_rate_by_min_sdk()
show_success_rate_by_target_sdk()
show_success_rate_by_dex_size()
# show_timeout_rate_by_estimated_year()
# show_timeout_rate_by_dex_size()
# show_timeout_rate_by_target_sdk()
# show_timeout_rate_by_min_sdk()
# show_data_set_relations()
# show_mem_by_dex_size()
# show_mem_by_min_sdk()
# show_mem_by_target_sdk()
# show_mem_by_estimated_year()
# show_time_by_dex_size()
# show_time_by_min_sdk()
# show_time_by_target_sdk()
# show_time_by_estimated_year()
def show_status_by_tool():
"""Display the repartition of status by tool."""
parser = get_common_arg("Display the repartition of status by tool")
parser.add_argument(
"--title",
default="Exit Status",
help="The title of the graph",
)
args = parser.parse_args()
plot_status_by_tool(
args.data,
interactive=args.display,
image_path=args.figures_file,
tools=args.tools,
title=args.title,
)
plot_status_by_tool_and_malware(
args.data,
interactive=args.display,
image_path=args.figures_file,
tools=args.tools,
title=f"{args.title} Goodware/Malware",
)
def show_success_rate_by_first_seen_year():
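    """Plot success rates by the year the apks were first seen."""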
args = get_common_arg(
"Plot success rate by the first year they were seen"
).parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
plot_status_by_generic_x(
tools, "first_seen_year", "first seen year", "App First Year Seen", args
)
plot_all_status_by_generic_x(
tools,
"first_seen_year",
"first seen year",
"Finishing Rate by Year of Java based tools",
args,
condition="tool.use_java = TRUE",
)
plot_all_status_by_generic_x(
tools,
"first_seen_year",
"first seen year",
"Finishing Rate by Year of Non Java based tools",
args,
condition="tool.use_java = FALSE",
)
def show_success_rate_by_size_decile():
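    """Plot success rates by apk size decile."""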
args = get_common_arg("Plot success rate by the size of the apk").parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
plot_status_by_generic_x(
tools,
"MAX(apk_size)/1000000",
"apk size (MB)",
"APK Size",
args,
group_by="apk_size_decile",
) # TODO bleurk those names...
plot_all_status_by_generic_x(
tools,
"MAX(apk_size)/1000000",
"apk size (MB)",
"Finishing Rate by APK size for tools using java",
args,
condition="tool.use_java = TRUE",
group_by="apk_size_decile",
)
plot_all_status_by_generic_x(
tools,
"MAX(apk_size)/1000000",
"apk size (MB)",
"Finishing Rate by APK size for tools not using java",
args,
condition="tool.use_java = FALSE",
group_by="apk_size_decile",
)
def show_success_rate_by_min_sdk():
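    """Plot success rates by min SDK version for Java and non-Java tools."""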
args = get_common_arg("Plot success rate by min sdk").parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
# plot_status_by_generic_x(tools, "min_sdk", "min SDK", "Min SDK", args)
plot_all_status_by_generic_x(
tools,
"min_sdk",
"min SDK",
"Finishing Rate by min SDK version for tools using java",
args,
condition="tool.use_java = TRUE",
)
plot_all_status_by_generic_x(
tools,
"min_sdk",
"min SDK",
"Finishing Rate by min SDK version for tools not using java",
args,
condition="tool.use_java = FALSE",
)
def show_success_rate_by_target_sdk():
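    """Plot success rates by target SDK version."""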
args = get_common_arg("Plot success rate by target sdk").parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
plot_status_by_generic_x(tools, "target_sdk", "target SDK", "Target SDK", args)
plot_all_status_by_generic_x(
tools,
"target_sdk",
"target SDK",
"Finishing Rate by target SDK version for tools using java",
args,
condition="tool.use_java = TRUE",
)
plot_all_status_by_generic_x(
tools,
"target_sdk",
"target SDK",
"Finishing Rate by target SDK version for tools not using java",
args,
condition="tool.use_java = FALSE",
)
def show_success_rate_by_dex_size():
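    """Plot success rates by bytecode (dex) size decile."""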
args = get_common_arg("Plot success rate by size of bytecode").parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
plot_status_by_generic_x(
tools,
"MAX(dex_size)/1000",
"bytecode size (KB)",
"bytecode Size",
args,
group_by="dex_size_decile",
) # TODO bleurk those names...
plot_all_status_by_generic_x(
tools,
"MAX(dex_size)/1000",
"bytecode size (KB)",
"Finishing Rate by bytecode size for tools using java",
args,
condition="tool.use_java = TRUE",
group_by="dex_size_decile",
)
plot_all_status_by_generic_x(
tools,
"MAX(dex_size)/1000",
"dex size (KB)",
"Finishing Rate by bytecode size for tools not using java",
args,
condition="tool.use_java = FALSE",
group_by="dex_size_decile",
)
def populate_db_apk():
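    """Populate the sqlite3 database with information about the apks of the dataset."""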
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Populate a sqlite3 database with informationn abouts the apks of the dataset",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database to populate",
)
parser.add_argument(
"-a",
"--apks",
type=Path,
help="The file listing the apks sha256",
)
parser.add_argument(
"--year-and-sdk",
type=Path,
help="The path to year_and_sdk.csv.gz",
)
parser.add_argument(
"--latest-with-added-date",
type=Path,
help="The path to latest_with-added-date.csv.gz",
)
parser.add_argument(
"--fix-dex-file",
action="store_true",
help=(
"If the dex_file column must be set by the sum of all .dex files size "
"(long operation, need to actually download the apks)"
),
)
args = parser.parse_args()
populate_db_apk_(
args.data,
args.apks,
args.year_and_sdk,
args.latest_with_added_date,
args.fix_dex_file,
)
def populate_db_exec():
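    """Populate the sqlite3 database with the execution reports."""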
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Populate a sqlite3 database with the execution reports",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database to populate",
)
parser.add_argument(
"-r",
"--reports",
type=Path,
default=None,
help="The folder containing the execution reports",
)
parser.add_argument(
"--substitue-error",
action="store_true",
help=(
"If the report should be used to substitute the errors of already populated "
"entry (to fix the parsing error)"
),
)
parser.add_argument(
"--estimate-cause",
action="store_true",
help=(
"If the cause of the errors must be estimated after populating the DB "
"(operation on all entries of the error table)"
),
)
args = parser.parse_args()
if not args.substitue_error and args.reports is not None:
populate_execution_report(
args.data,
args.reports,
)
    elif args.substitue_error and args.reports is not None:
fix_error(args.data, args.reports)
if args.estimate_cause:
estimate_cause(args.data)
def populate_db_tool():
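    """Populate the sqlite3 database with the tool information."""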
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Populate a sqlite3 database with the tool information",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database to populate",
)
args = parser.parse_args()
populate_tool(
args.data,
)
def show_common_errors():
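    """Print or export the most common errors and error classes."""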
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Find the most common errors matching given criterions",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database containing the results",
)
parser.add_argument(
"-n",
"--nb-errors",
type=int,
default=10,
help="the number of errors to find",
)
parser.add_argument(
"-t",
"--tool",
default=None,
help="restrict the error to the one raised by a specific tool",
)
parser.add_argument(
"-s",
"--status",
default=None,
help="restrict the error to the one raised when the tool FAILED, FINISHED, or TIMEOUT",
)
for carac in [
"androguard",
"java",
"prolog",
"ruby",
"soot",
"apktool",
"ocaml",
"python",
"scala",
]:
group = parser.add_mutually_exclusive_group()
group.add_argument(
f"--use-{carac}",
action="store_true",
help=(f"restrict the error to the one raised by tool that use {carac}"),
)
group.add_argument(
f"--dont-use-{carac}",
action="store_true",
help=(
f"restrict the error to the one raised by tool that do not use {carac}"
),
)
parser.add_argument(
"-f",
"--folder",
type=Path,
help="The folder in which the csv must be stored",
)
args = parser.parse_args()
carac_kargs = {}
for carac in [
"androguard",
"java",
"prolog",
"ruby",
"soot",
"apktool",
"ocaml",
"python",
"scala",
]:
if vars(args)[f"use_{carac}"]:
carac_kargs[f"use_{carac}"] = True
if vars(args)[f"dont_use_{carac}"]:
carac_kargs[f"use_{carac}"] = False
if args.folder is None:
print("Error:")
get_common_errors(
args.data,
tool=args.tool,
status=args.status,
limit=args.nb_errors,
folder=args.folder,
**carac_kargs,
)
if args.folder is None:
print("-" * 30)
print("Error classes:")
get_common_error_classes(
args.data,
tool=args.tool,
status=args.status,
limit=args.nb_errors,
folder=args.folder,
**carac_kargs,
)
if args.tool is None:
for tool in get_list_tools(args.data):
if args.folder is None:
print("-" * 30)
print(f"Top {args.nb_errors} errors for {tool}:")
get_common_errors(
args.data,
tool=tool,
status=args.status,
limit=args.nb_errors,
folder=args.folder,
**carac_kargs,
)
if args.tool is None:
for tool in get_list_tools(args.data):
if args.folder is None:
print("-" * 30)
print(f"Top {args.nb_errors} error classes for {tool}:")
get_common_error_classes(
args.data,
tool=tool,
status=args.status,
limit=args.nb_errors,
folder=args.folder,
**carac_kargs,
)
def show_error_avg_occ_by_exec():
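    """Compute the average number of occurrences of an error per execution."""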
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Compute average number of occurences in an exec for an error",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database containing the results",
)
parser.add_argument(
"-e",
"--error",
required=True,
type=str,
help="The error to count",
)
args = parser.parse_args()
print_avg_occ_in_exec(args.data, args.error)
def ic3():
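    """Analyse the executions of ic3 and ic3_fork (Venn diagram and error export)."""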
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Analyse the executions of ic3 and ic3_fork",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database containing the results",
)
parser.add_argument(
"-f",
"--figures-file",
type=Path,
help="The folder in which the figures must be stored",
)
parser.add_argument(
"--display",
action="store_true",
help="If the figures must be displayed",
)
args = parser.parse_args()
ic3_venn(args.data, interactive=args.display, image_path=args.figures_file)
ic3_errors(
args.data,
file=(
args.figures_file / "ic3_err.csv" if args.figures_file is not None else None
),
)
def show_error_cause_radar():
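    """Compute radar charts of common identifiable causes of crashes."""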
parser = get_common_arg(
"Compute radar charts that show common identifiable causes of crash"
)
args = parser.parse_args()
radar_cause_estimation(args.data, args.tools, args.display, args.figures_file)
def average_nb_errors():
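    """Compute the average number of errors per execution."""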
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Compute the average number of error by execution",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database containing the results",
)
parser.add_argument(
"-f",
"--figures-file",
type=Path,
help="The folder result must be stored",
)
args = parser.parse_args()
get_nb_error(args.data, args.figures_file)
def show_error_type_repartition():
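    """Compute a heat map of the error type distribution by tool."""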
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Compute a heat map of error type repartition by tool",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database containing the results",
)
parser.add_argument(
"-f",
"--figures-file",
type=Path,
help="The folder in which the figures must be stored",
)
parser.add_argument(
"--display",
action="store_true",
help="If the figures must be displayed",
)
args = parser.parse_args()
error_type_repartition(args.data, args.display, args.figures_file)
def show_apk_year_repartition_by_decil():
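    """Show how apk first-seen years are distributed inside each dex size decile."""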
args = get_common_arg("Test").parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
import sqlite3
import matplotlib.pyplot as plt # type: ignore
deciles = [{} for _ in range(11)]
with sqlite3.connect(args.data) as con:
cur = con.cursor()
for d, y, n in cur.execute(
"SELECT dex_size_decile, first_seen_year, COUNT(*) FROM apk GROUP BY dex_size_decile, first_seen_year;"
):
deciles[d][y] = n
fig, axes = plt.subplots(3, 4)
fig.set_size_inches((24, 12))
for d in range(1, 11):
years = sorted(deciles[d].keys())
ax = axes[(d - 1) % 3, (d - 1) // 3]
ax.bar(
years,
[deciles[d][y] for y in years],
edgecolor="black",
)
ax.legend()
ax.set_title(f"Decile {d}")
plt.show()
    # Size dispersion inside each decile
# SELECT dex_size_decile, (100*(MAX(dex_size) - MIN (dex_size)))/AVG(dex_size) FROM apk GROUP BY dex_size_decile;
# SELECT dex_size_decile, (MAX(dex_size) - MIN (dex_size)) FROM apk GROUP BY dex_size_decile;
with sqlite3.connect(args.data) as con:
cur = con.cursor()
for d, m, v in cur.execute(
"SELECT dex_size_decile, AVG(dex_size), AVG(dex_size*dex_size) - AVG(dex_size)*AVG(dex_size) "
"FROM apk GROUP BY dex_size_decile "
"ORDER BY dex_size_decile;"
):
m = round(m / 1024 / 1024, 2)
            s = round(v ** (1 / 2) / 1024 / 1024, 2)
            print(f"Decile {d}: mean: {m} MB, standard deviation: {s} MB")
def plot_decorelated_factor():
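    """Plot finishing rates while fixing one parameter to decorrelate the factors."""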
parser = get_common_arg("Plot graph while fixing to parameter to decorelate them")
parser.add_argument(
"--decile",
default=8,
type=int,
help="The decile to use for fixed size result",
)
args = parser.parse_args()
if args.tools is None:
tools = get_list_tools(args.data)
else:
tools = args.tools
plot_all_status_by_generic_x(
tools,
"(SELECT AVG(apk2.dex_size)/1000000 FROM apk AS apk2 WHERE apk2.dex_size_decile_by_year = apk.dex_size_decile_by_year AND apk2.first_seen_year = 2022)",
"bytecode size (MB)",
"Finishing Rate of java based tool by bytecode size of apks detected in 2022",
args,
condition="tool.use_java = TRUE",
apk_condition="apk.first_seen_year = 2022",
group_by="dex_size_decile_by_year",
)
plot_all_status_by_generic_x(
tools,
"(SELECT AVG(apk2.dex_size)/1000000 FROM apk AS apk2 WHERE apk2.dex_size_decile_by_year = apk.dex_size_decile_by_year AND apk2.first_seen_year = 2022)",
"bytecode size (MB)",
"Finishing Rate of non-java based tool by bytecode size of apks detected in 2022",
args,
condition="tool.use_java = FALSE",
apk_condition="apk.first_seen_year = 2022",
group_by="dex_size_decile_by_year",
)
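    # Year range kept for each dex size decile (index 0 is unused); presumably
    # chosen so that each decile only covers years that contain apks of that size.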
MIN_YEAR = [None, 2010, 2010, 2011, 2012, 2013, 2013, 2014, 2015, 2016, 2019]
MAX_YEAR = [None, 2017, 2017, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024]
import sqlite3
with sqlite3.connect(args.data) as con:
cur = con.cursor()
min_size, max_size = cur.execute(
"SELECT MIN(dex_size), MAX(dex_size) "
"FROM apk WHERE dex_size_decile = ?;",
(args.decile,),
).fetchone()
min_size = round(min_size / 1024 / 1024, 2)
max_size = round(max_size / 1024 / 1024, 2)
plot_all_status_by_generic_x(
tools,
"apk.first_seen_year",
"Year",
(
"Finishing Rate of java based tool by discovery year of apks with a bytecode "
f"size between {min_size} MB and {max_size} MB"
),
args,
condition="tool.use_java = TRUE",
apk_condition=(
f"apk.dex_size_decile = {args.decile} AND "
f"apk.first_seen_year >= {MIN_YEAR[args.decile]} AND "
f"apk.first_seen_year <= {MAX_YEAR[args.decile]}"
),
)
plot_all_status_by_generic_x(
tools,
"apk.first_seen_year",
"Year",
(
"Finishing Rate of non-java based tool by discovery year of apks with a bytecode "
f"size between {min_size} MB and {max_size} MB"
),
args,
condition="tool.use_java = FALSE",
apk_condition=(
f"apk.dex_size_decile = {args.decile} AND "
f"apk.first_seen_year >= {MIN_YEAR[args.decile]} AND "
f"apk.first_seen_year <= {MAX_YEAR[args.decile]}"
),
)
plot_all_status_by_generic_x(
tools,
"apk.min_sdk",
"Min SDK",
(
"Finishing Rate of java based tool by min SDK of apks with a bytecode "
f"size between {min_size} MB and {max_size} MB"
),
args,
condition="tool.use_java = TRUE",
apk_condition=(
f"apk.dex_size_decile = {args.decile} AND "
f"apk.first_seen_year >= {MIN_YEAR[args.decile]} AND "
f"apk.first_seen_year <= {MAX_YEAR[args.decile]}"
),
)
plot_all_status_by_generic_x(
tools,
"apk.min_sdk",
"Min SDK",
(
"Finishing Rate of non-java based tool by min SDK of apks with a bytecode "
f"size between {min_size} MB and {max_size} MB"
),
args,
condition="tool.use_java = FALSE",
apk_condition=(
f"apk.dex_size_decile = {args.decile} AND "
f"apk.first_seen_year >= {MIN_YEAR[args.decile]} AND "
f"apk.first_seen_year <= {MAX_YEAR[args.decile]}"
),
)
plot_all_status_by_generic_x(
tools,
"(SELECT AVG(apk2.dex_size)/1000000 FROM apk AS apk2 WHERE apk2.dex_size_decile = apk.dex_size_decile AND apk2.min_sdk = 16)",
"bytecode size (MB)",
"Finishing Rate of java based tool by bytecode size of apks with min SDK = 16",
args,
condition="tool.use_java = TRUE",
apk_condition="apk.min_sdk = 16",
group_by="dex_size_decile",
)
plot_all_status_by_generic_x(
tools,
"(SELECT AVG(apk2.dex_size)/1000000 FROM apk AS apk2 WHERE apk2.dex_size_decile = apk.dex_size_decile AND apk2.min_sdk = 16)",
"bytecode size (MB)",
"Finishing Rate of non java based tool by bytecode size of apks with min SDK = 16",
args,
condition="tool.use_java = FALSE",
apk_condition="apk.min_sdk = 16",
group_by="dex_size_decile",
)
def get_avg_ressource_consumption():
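    """Compute the average resource consumption of the tools."""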
parser = get_common_arg("Compute the average ressource consumption of the tools")
args = parser.parse_args()
get_ressource(args.data, args.figures_file)
def rate_malware_decile():
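    """Compare success rate and dex size between goodware and malware by decile."""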
parser = get_common_arg("Test")
args = parser.parse_args()
import sqlite3
tot_size_good = None
tot_size_mal = None
size_good = [0 for _ in range(11)]
size_mal = [0 for _ in range(11)]
tot_sd_good = None
tot_sd_mal = None
sd_good = [0 for _ in range(11)]
sd_mal = [0 for _ in range(11)]
nb_apk_good = [0 for _ in range(11)]
nb_apk_mal = [0 for _ in range(11)]
nb_success_good = [0 for _ in range(11)]
nb_success_mal = [0 for _ in range(11)]
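    # Hard-coded number of tools, assumed to match the RASTA tool set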
NB_TOOL = 20
with sqlite3.connect(args.data) as con:
cur = con.cursor()
for is_good, decile, n, m, v in cur.execute(
"SELECT vt_detection = 0, dex_size_decile, COUNT(*), "
" AVG(dex_size), AVG(dex_size*dex_size) - AVG(dex_size)*AVG(dex_size) "
"FROM apk GROUP BY vt_detection = 0, dex_size_decile;",
):
if is_good:
nb_apk_good[decile] = n
size_good[decile] = round(m / 1024 / 1024, 2)
sd_good[decile] = round(v ** (1 / 2) / 1024 / 1024, 2)
else:
nb_apk_mal[decile] = n
size_mal[decile] = round(m / 1024 / 1024, 2)
sd_mal[decile] = round(v ** (1 / 2) / 1024 / 1024, 2)
for is_good, m, v in cur.execute(
"SELECT vt_detection = 0, AVG(dex_size), AVG(dex_size*dex_size) - AVG(dex_size)*AVG(dex_size) "
"FROM apk GROUP BY vt_detection = 0",
):
if is_good:
tot_size_good = round(m / 1024 / 1024, 2)
tot_sd_good = round(v ** (1 / 2) / 1024 / 1024, 2)
else:
tot_size_mal = round(m / 1024 / 1024, 2)
tot_sd_mal = round(v ** (1 / 2) / 1024 / 1024, 2)
for is_good, decile, n in cur.execute(
"SELECT vt_detection = 0, dex_size_decile, COUNT(*) "
"FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 "
"WHERE tool_status = 'FINISHED' "
"GROUP BY vt_detection = 0, dex_size_decile;",
):
if is_good:
nb_success_good[decile] = n
else:
nb_success_mal[decile] = n
tot_apk_good = sum(nb_apk_good)
tot_apk_mal = sum(nb_apk_mal)
tot_success_good = sum(nb_success_good)
tot_success_mal = sum(nb_success_mal)
print(
" rate goodware rate malware avg size goodware (MB) avg size malware (MB)"
)
for d in range(1, 11):
rate_good = round(nb_success_good[d] / nb_apk_good[d] / NB_TOOL * 100, 2)
rate_mal = round(nb_success_mal[d] / nb_apk_mal[d] / NB_TOOL * 100, 2)
print(
f"decile {d: >2}: {rate_good: >15} {rate_mal: >15} {size_good[d]: >26} {size_mal[d]: >24}"
)
rate_good = round(tot_success_good / tot_apk_good / NB_TOOL * 100, 2)
rate_mal = round(tot_success_mal / tot_apk_mal / NB_TOOL * 100, 2)
print(
f"total: {rate_good: >15} {rate_mal: >15} {tot_size_good: >26} {tot_size_mal: >24}"
)
def count_error_stacks():
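    """Count occurrences of error stacks in the report files listed on stdin."""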
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description=(
"Read a list of report file in stdin and extract and sort "
"by occurences the error stack that match the parameters.\n"
"The list can be generated with find_apks_by_tool_error.sh: \n"
" `./find_apks_by_tool_error.sh androguard_dad OSError rasta.db | rasta-count-error-stacks -e OSError`"
),
)
parser.add_argument(
"-e",
"--error",
required=True,
help="The error type to studdy",
)
parser.add_argument(
"-m",
"--msg",
required=False,
default=None,
help="The error msg to studdy",
)
parser.add_argument(
"-s",
"--status",
required=False,
default=None,
choices=["FAILED", "FINISHED", "TIMEOUT"],
help="Restrict the search to the tool that exited with a specific status",
)
args = parser.parse_args()
import json
stacks = {}
while True:
try:
file = input()
except EOFError:
break
with open(file) as fp:
report = json.load(fp)
if args.status is not None and report.get("tool-status", None) != args.status:
continue
for err in report["errors"]:
if (
"error" in err
and err["error"] == args.error
and (args.msg is None or ("msg" in err and err["msg"] == args.msg))
):
k = json.dumps(err["stack"], indent=" ")
if k not in stacks:
stacks[k] = 0
stacks[k] += 1
keys = sorted(stacks.keys(), key=lambda k: stacks[k])
for k in keys:
print(f"{k}: {stacks[k]} occurences")
def generate_dataset():
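    """Generate the dataset from the AndroZoo lists."""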
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Generate a dataset from androzoo list. The default values are the one used for the RASTA dataset",
)
parser.add_argument(
"latest",
type=Path,
help="latest.csv.gz, the list of apk from androzoo",
)
parser.add_argument(
"year_sdk_csv",
type=Path,
help="year_and_sdk.csv.gz, the list of apk from androzoo with the pulication year and sdk version",
)
parser.add_argument(
"--out",
"-o",
required=False,
type=Path,
default=Path(".") / "rasta_dataset",
help="Where to store the dataset",
)
parser.add_argument(
"-s",
"--random_seed",
required=False,
default=1234567890,
type=int,
help="The seed to use for the random generator",
)
parser.add_argument(
"-n",
"--nb_sub_dataset",
required=False,
default=10,
type=int,
help="The number of subdatasets",
)
parser.add_argument(
"-ns",
"--nb_apk_by_bucket_by_subset",
default=50,
type=int,
help="The number of apk in each size bucket of each subset",
)
parser.add_argument(
"-mp",
"--proportion_malware",
default=0.07,
type=float,
help="The proportion of malware in the dataset",
)
parser.add_argument(
"-vt",
"--vt_threshold",
default=5,
type=int,
help="The number of virustotal detection from which we considere an apk to be a malware",
)
parser.add_argument(
"-nq",
"--nb_size_quantile",
default=10,
type=int,
help="The number of quantile to use for the size of the apk",
)
parser.add_argument(
"-ex",
"--exclution_prop",
default=0.01,
type=float,
help="The proportion of apk to exclude at each size extreme",
)
parser.add_argument(
"-my",
"--min_year",
default=2010,
type=int,
help="The min year of the year range",
)
parser.add_argument(
"-My",
"--max_year",
default=2023,
type=int,
help="The max year of the year range",
)
args = parser.parse_args()
gen_dataset(
args.latest,
args.year_sdk_csv,
args.out,
nb_sub_dataset=args.nb_sub_dataset,
nb_apk_by_bucket_by_subset=args.nb_apk_by_bucket_by_subset,
proportion_malware=args.proportion_malware,
vt_threshold=args.vt_threshold,
nb_bucket=args.nb_size_quantile,
exclution=args.exclution_prop,
min_year=args.min_year,
max_year=args.max_year,
random_seed=args.random_seed,
)
def size_malware():
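    """Compare dex size and finishing rate between goodware and malware."""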
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Compare size and success rate proportion for malwares and goodware",
)
parser.add_argument(
"-d",
"--data",
required=True,
type=Path,
help="The sqlite3 database that contain the execution report and apk info",
)
args = parser.parse_args()
nb_tool = len(get_list_tools(args.data))
nb_apk = {}
size_apk = {}
nb_finished = {}
with sqlite3.connect(args.data) as con:
cur = con.cursor()
for decile, is_goodware, n, avg_size in cur.execute(
"SELECT dex_size_decile, vt_detection = 0, COUNT(*), AVG(dex_size) FROM apk GROUP BY dex_size_decile, vt_detection = 0;"
):
nb_apk[(decile, bool(is_goodware))] = n
size_apk[(decile, bool(is_goodware))] = avg_size
for is_goodware, n, avg_size in cur.execute(
"SELECT vt_detection = 0, COUNT(*), AVG(dex_size) FROM apk GROUP BY vt_detection = 0;"
):
nb_apk[("total", bool(is_goodware))] = n
size_apk[("total", bool(is_goodware))] = avg_size
for decile, is_goodware, n in cur.execute(
"SELECT dex_size_decile, vt_detection = 0, COUNT(*) "
"FROM exec INNER JOIN apk ON apk.sha256=exec.sha256 "
"WHERE tool_status = 'FINISHED' OR tool_status = 'OTHER' "
"GROUP BY dex_size_decile, vt_detection = 0;"
):
nb_finished[(decile, bool(is_goodware))] = n
for is_goodware, n in cur.execute(
"SELECT vt_detection = 0, COUNT(*) "
"FROM exec INNER JOIN apk ON apk.sha256=exec.sha256 "
"WHERE tool_status = 'FINISHED' OR tool_status = 'OTHER' "
"GROUP BY vt_detection = 0;"
):
nb_finished[("total", bool(is_goodware))] = n
print(
"dex size decile, average size goodware, average size malware, finishing rate goodware, finishing rate malware, average size goodware/malware, finishing rate goodware/malware"
)
for size in ["total", *range(1, 11)]:
finishing_rate_goodware = nb_finished[(size, True)] / (
nb_tool * nb_apk[(size, True)]
)
finishing_rate_malware = nb_finished[(size, False)] / (
nb_tool * nb_apk[(size, False)]
)
print(
f"{size}, {size_apk[(size, True)]:.2f}, {size_apk[(size, False)]:.2f}, {finishing_rate_goodware:.2f}, {finishing_rate_malware:.2f}, {size_apk[(size, True)] / size_apk[(size, False)]:.2f}, {finishing_rate_goodware/finishing_rate_malware:.2f}"
)
def compare_status_by_tool():
"""Compare the repartition of status by tool from two result dbs"""
parser = argparse.ArgumentParser(
prog=sys.argv[0],
description="Compare the repartition of status by tool from two result dbs",
)
parser.add_argument(
"-d1",
"--data1",
required=True,
type=Path,
help="The sqlite3 database that contain the execution report of the first experiment",
)
parser.add_argument(
"-d2",
"--data2",
required=True,
type=Path,
help="The sqlite3 database that contain the execution report of the second experiment",
)
parser.add_argument(
"-f",
"--figures-file",
type=Path,
help="The folder in which the figures must be stored",
)
parser.add_argument(
"--display",
action="store_true",
help="If the figures must be displayed",
)
parser.add_argument(
"-t",
"--tools",
nargs="+",
default=None,
help="The tools to analyse",
)
parser.add_argument(
"--title",
default="Comparision of Exit Status",
help="The title of the graph",
)
parser.add_argument(
"--same-apks",
action="store_true",
help="If the apks are the same in the two databases. If so, the missings applications will be shown.",
)
args = parser.parse_args()
plot_compare_status(
args.data1,
args.data2,
interactive=args.display,
image_path=args.figures_file,
tools=args.tools,
same_apks=args.same_apks,
)