From 0d8ad49c94af85eb1ff3cad3a779ce1d5de19054 Mon Sep 17 00:00:00 2001
From: Jean-Marie Mineau
Date: Sun, 7 Sep 2025 17:08:42 +0200
Subject: [PATCH] add function to compare result of two experiment

---
 .gitignore                                    |   1 +
 rasta_data_manipulation/pyproject.toml        |  13 +-
 .../rasta_triturage/cli.py                    |  69 ++++++-
 .../rasta_triturage/status.py                 | 176 ++++++++++++++++++
 .../rasta_triturage/utils.py                  |   8 +-
 5 files changed, 256 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1269488..c93e84f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 data
+*.db
diff --git a/rasta_data_manipulation/pyproject.toml b/rasta_data_manipulation/pyproject.toml
index 3d223a6..a3f03ee 100644
--- a/rasta_data_manipulation/pyproject.toml
+++ b/rasta_data_manipulation/pyproject.toml
@@ -1,15 +1,15 @@
 [tool.poetry]
 name = "rasta_triturage"
-version = "0.2.0"
+version = "0.2.1"
 description = "'Triturage de donnée' for the Rasta Project"
-authors = ["anon"]
+authors = ["Jean-Marie Mineau "]
 readme = "README.md"
-#homepage = ""
-#repository = ""
-license = "Proprietary"
+homepage = "https://github.com/histausse/rasta/tree/main"
+repository = "https://github.com/histausse/rasta/tree/main"
+license = "GPLv3"
 
 [tool.poetry.urls]
-#"Bug Tracker" = ""
+"Bug Tracker" = "https://github.com/histausse/rasta/issues"
 
 [tool.poetry.dependencies]
 python = "^3.10"
@@ -50,6 +50,7 @@ rasta-decorelate-factor = "rasta_triturage.cli:plot_decorelated_factor"
 rasta-count-error-stacks = "rasta_triturage.cli:count_error_stacks"
 rasta-gen-dataset = "rasta_triturage.cli:generate_dataset"
 rasta-size-malware = "rasta_triturage.cli:size_malware"
+rasta-compare-status = "rasta_triturage.cli:compare_status_by_tool"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "*"
diff --git a/rasta_data_manipulation/rasta_triturage/cli.py b/rasta_data_manipulation/rasta_triturage/cli.py
index 6298e6d..a42d722 100644
--- a/rasta_data_manipulation/rasta_triturage/cli.py
+++ b/rasta_data_manipulation/rasta_triturage/cli.py
@@ -17,6 +17,7 @@ from .status import (
     plot_status_by_tool_and_malware,
     plot_all_status_by_generic_x,
     plot_status_by_generic_x,
+    plot_compare_status,
 )
 from .apk import (
     plot_apk_info_by_generic_x,
@@ -577,9 +578,9 @@ def ic3():
     ic3_venn(args.data, interactive=args.display, image_path=args.figures_file)
     ic3_errors(
         args.data,
-        file=args.figures_file / "ic3_err.csv"
-        if args.figures_file is not None
-        else None,
+        file=(
+            args.figures_file / "ic3_err.csv" if args.figures_file is not None else None
+        ),
     )
 
 
@@ -1127,3 +1128,65 @@ def size_malware():
         print(
             f"{size}, {size_apk[(size, True)]:.2f}, {size_apk[(size, False)]:.2f}, {finishing_rate_goodware:.2f}, {finishing_rate_malware:.2f}, {size_apk[(size, True)] / size_apk[(size, False)]:.2f}, {finishing_rate_goodware/finishing_rate_malware:.2f}"
         )
+
+
+def compare_status_by_tool():
+    """Compare the repartition of status by tool from two result dbs"""
+
+    parser = argparse.ArgumentParser(
+        prog=sys.argv[0],
+        description="Compare the repartition of status by tool from two result dbs",
+    )
+    parser.add_argument(
+        "-d1",
+        "--data1",
+        required=True,
+        type=Path,
+        help="The sqlite3 database that contain the execution report of the first experiment",
+    )
+    parser.add_argument(
+        "-d2",
+        "--data2",
+        required=True,
+        type=Path,
+        help="The sqlite3 database that contain the execution report of the second experiment",
+    )
+    parser.add_argument(
+        "-f",
+        "--figures-file",
+        type=Path,
+        help="The folder in which the figures must be stored",
+    )
+    parser.add_argument(
+        "--display",
+        action="store_true",
+        help="If the figures must be displayed",
+    )
+    parser.add_argument(
+        "-t",
+        "--tools",
+        nargs="+",
+        default=None,
+        help="The tools to analyse",
+    )
+    parser.add_argument(
+        "--title",
+        default="Comparision of Exit Status",
+        help="The title of the graph",
+    )
+    parser.add_argument(
+        "--same-apks",
+        action="store_true",
+        help="If the apks are the same in the two databases. If so, the missings applications will be shown.",
+    )
+    args = parser.parse_args()
+
+    plot_compare_status(
+        args.data1,
+        args.data2,
+        interactive=args.display,
+        image_path=args.figures_file,
+        tools=args.tools,
+        title=args.title,
+        same_apks=args.same_apks,
+    )
diff --git a/rasta_data_manipulation/rasta_triturage/status.py b/rasta_data_manipulation/rasta_triturage/status.py
index 819b14d..69cbd93 100644
--- a/rasta_data_manipulation/rasta_triturage/status.py
+++ b/rasta_data_manipulation/rasta_triturage/status.py
@@ -444,3 +444,179 @@ def plot_all_status_by_generic_x(
         interactive=args.display,
         image_path=args.figures_file,
     )
+
+
+def plot_compare_status(
+    db1: Path,
+    db2: Path,
+    interactive: bool = True,
+    image_path: Path | None = None,
+    tools: list[str] | None = None,
+    title: str = "Comparision of Exit Status",
+    same_apks: bool = False,
+):
+    """Plot and compare repartition of status by tools from two experiment.
+
+    db1 and db2 are the path to two result sqlite databases to compare
+    image_path is where to save the result
+    tools is the list of tools to compare, default will compare all tools found.
+    title is the title of the figure
+    same_apks indicate if the two databases uses the same apks. If so, the missing apks will be displayed in the plot.
+    """
+    if tools is None:
+        tools = list(set(get_list_tools(db1)) | set(get_list_tools(db2)))
+    tools_list_format = f"({','.join(['?' for _ in tools])})"
+    with sqlite3.connect(db1) as con:
+        cur = con.cursor()
+        status_1 = cur.execute(
+            (
+                "SELECT tool_name, tool_status, COUNT(sha256) "
+                "FROM exec "
+                f"WHERE tool_name IN {tools_list_format} "
+                "GROUP BY tool_name, tool_status;"
+            ),
+            tools,
+        ).fetchall()
+        apk_1 = set(cur.execute("SELECT sha256 FROM exec;").fetchall())
+    with sqlite3.connect(db2) as con:
+        cur = con.cursor()
+        status_2 = cur.execute(
+            (
+                "SELECT tool_name, tool_status, COUNT(sha256) "
+                "FROM exec "
+                f"WHERE tool_name IN {tools_list_format} "
+                "GROUP BY tool_name, tool_status;"
+            ),
+            tools,
+        ).fetchall()
+        apk_2 = set(cur.execute("SELECT sha256 FROM exec;").fetchall())
+
+    occurences = {}
+    for tool, stat, occurence in status_1:
+        occurences[(tool, stat, "db1")] = occurence
+    for tool, stat, occurence in status_2:
+        occurences[(tool, stat, "db2")] = occurence
+    # tools.sort(
+    #     key=lambda t: occurences.get((t, "FINISHED", "db1"), 0)
+    #     + occurences.get((t, "FINISHED", "db2"), 0),
+    #     reverse=True,
+    # )
+    tools.sort()
+
+    values = {
+        "Finished": np.zeros(len(tools) * 2),
+        "Time Out": np.zeros(len(tools) * 2),
+        "Other": np.zeros(len(tools) * 2),
+        "Failed": np.zeros(len(tools) * 2),
+    }
+    nb_apk_tot = len(apk_1 | apk_2)
+    if same_apks:
+        nb_apk_1 = nb_apk_tot
+        nb_apk_2 = nb_apk_tot
+        missing_1 = len(apk_2 - apk_1)
+        missing_2 = len(apk_1 - apk_2)
+        values["Missing"] = np.zeros(len(tools) * 2)
+        if nb_apk_tot != 0:  # both dbs empty: keep 0% instead of dividing by zero
+            values["Missing"][0::2] = (missing_1 * 100) / nb_apk_tot
+            values["Missing"][1::2] = (missing_2 * 100) / nb_apk_tot
+    else:
+        nb_apk_1 = len(apk_1)
+        nb_apk_2 = len(apk_2)
+    colors = {
+        "Finished": "#009E73",
+        "Time Out": "#56B4E9",
+        "Other": "#555555",  # TODO: find better color
+        "Failed": "#D55E00",
+        "Missing": "#555555",
+    }
+    hatch = {
+        "Finished": "/",
+        "Time Out": "x",
+        "Other": ".",
+        "Failed": "\\",
+        "Missing": "-",
+    }
+
+    for i, tool in enumerate(tools):
+        i_1 = 2 * i
+        i_2 = 2 * i + 1
+        values["Finished"][i_1] = occurences.get((tool, "FINISHED", "db1"), 0)
+        values["Finished"][i_2] = occurences.get((tool, "FINISHED", "db2"), 0)
+        values["Time Out"][i_1] = occurences.get((tool, "TIMEOUT", "db1"), 0)
+        values["Time Out"][i_2] = occurences.get((tool, "TIMEOUT", "db2"), 0)
+        values["Failed"][i_1] = occurences.get((tool, "FAILED", "db1"), 0)
+        values["Failed"][i_2] = occurences.get((tool, "FAILED", "db2"), 0)
+        values["Other"][i_1] = (
+            len(apk_1)
+            - values["Finished"][i_1]
+            - values["Time Out"][i_1]
+            - values["Failed"][i_1]
+        )
+        values["Other"][i_2] = (
+            len(apk_2)
+            - values["Finished"][i_2]
+            - values["Time Out"][i_2]
+            - values["Failed"][i_2]
+        )
+        values["Finished"][i_1] = (
+            0 if nb_apk_1 == 0 else (100 * values["Finished"][i_1]) / nb_apk_1
+        )
+        values["Finished"][i_2] = (
+            0 if nb_apk_2 == 0 else (100 * values["Finished"][i_2]) / nb_apk_2
+        )
+        values["Time Out"][i_1] = (
+            0 if nb_apk_1 == 0 else (100 * values["Time Out"][i_1]) / nb_apk_1
+        )
+        values["Time Out"][i_2] = (
+            0 if nb_apk_2 == 0 else (100 * values["Time Out"][i_2]) / nb_apk_2
+        )
+        values["Failed"][i_1] = (
+            0 if nb_apk_1 == 0 else (100 * values["Failed"][i_1]) / nb_apk_1
+        )
+        values["Failed"][i_2] = (
+            0 if nb_apk_2 == 0 else (100 * values["Failed"][i_2]) / nb_apk_2
+        )
+        values["Other"][i_1] = (
+            0 if nb_apk_1 == 0 else (100 * values["Other"][i_1]) / nb_apk_1
+        )
+        values["Other"][i_2] = (
+            0 if nb_apk_2 == 0 else (100 * values["Other"][i_2]) / nb_apk_2
+        )
+    bottom = np.zeros(len(tools) * 2)
+
+    x_axis = np.zeros(len(tools) * 2)
+    x_width = 3
+    x_0 = x_width / 2
+    lstep = 1
+    bstep = 5
+    for i in range(len(tools)):
+        x_0 += bstep + x_width
+        x_axis[2 * i] = x_0
+        x_0 += lstep + x_width
+        x_axis[2 * i + 1] = x_0
+    tick_legend = []
+    for tool in tools:
+        tick_legend.append(f"{tool}")  # (f"{tool} on goodware")
+        tick_legend.append("")  # (f"{tool} on malware")
+
+    plt.figure(figsize=(20, 9), dpi=80)
+    if same_apks:
+        stats = ["Finished", "Time Out", "Other", "Failed", "Missing"]
+    else:
+        stats = ["Finished", "Time Out", "Other", "Failed"]
+    for stat in stats:
+        plt.bar(
+            x_axis,
+            values[stat],
+            label=stat,
+            color=colors[stat],
+            hatch=hatch[stat],
+            bottom=bottom,
+            width=x_width,
+            edgecolor="black",
+        )
+        bottom += values[stat]
+    plt.xticks(x_axis, tick_legend, rotation=80)
+    plt.legend()
+    plt.ylabel("% of analysed apk")
+    render(title, interactive, image_path, format="svg")
diff --git a/rasta_data_manipulation/rasta_triturage/utils.py b/rasta_data_manipulation/rasta_triturage/utils.py
index 91cf08a..ee60c53 100644
--- a/rasta_data_manipulation/rasta_triturage/utils.py
+++ b/rasta_data_manipulation/rasta_triturage/utils.py
@@ -112,7 +112,11 @@ def radar_chart(
 
 
 def render(
-    title: str, interactive: bool, image_path: Path | None, tight_layout: bool = True
+    title: str,
+    interactive: bool,
+    image_path: Path | None,
+    tight_layout: bool = True,
+    format: str = "pdf",
 ):
     """Render the figure. If `interactive`, display if, if `image_path`, save it."""
     # plt.title(title)
@@ -121,7 +125,7 @@ def render(
     if image_path is not None:
         if not image_path.exists():
             image_path.mkdir(parents=True, exist_ok=True)
-        plt.savefig(image_path / (slugify(title) + ".pdf"), format="pdf")
+        plt.savefig(image_path / (slugify(title) + "." + format), format=format)
     if interactive:
         plt.show()
     plt.close()