Add function to compare the results of two experiments

2025-09-07 17:08:42 +02:00 · 2025-09-07 17:08:42 +02:00 · 0d8ad49c94
commit 0d8ad49c94
parent 1309d7ea24
5 changed files with 255 additions and 11 deletions
--- a/rasta_data_manipulation/rasta_triturage/status.py
+++ b/rasta_data_manipulation/rasta_triturage/status.py
@ -444,3 +444,179 @@ def plot_all_status_by_generic_x(
        interactive=args.display,
        image_path=args.figures_file,
    )
+
+
+def _compare_load_status(
+    db: Path, tools: list[str]
+) -> tuple[list[tuple[str, str, int]], set[tuple[str]]]:
+    """Read the per-tool status counts and the recorded apks from one database.
+
+    Returns a pair:
+    - the `(tool_name, tool_status, count)` rows of the `exec` table, grouped
+      by tool and status and restricted to the tools listed in `tools`;
+    - the set of `sha256` rows (1-tuples) found in the whole `exec` table.
+    """
+    from contextlib import closing
+
+    placeholders = ",".join("?" for _ in tools)
+    # sqlite3's own connection context manager only commits or rolls back the
+    # transaction; closing() is what actually releases the database handle.
+    with closing(sqlite3.connect(db)) as con:
+        cur = con.cursor()
+        status = cur.execute(
+            (
+                "SELECT tool_name, tool_status, COUNT(sha256) "
+                "FROM exec "
+                f"WHERE tool_name IN ({placeholders}) "
+                "GROUP BY tool_name, tool_status;"
+            ),
+            tools,
+        ).fetchall()
+        apks = set(cur.execute("SELECT sha256 FROM exec;").fetchall())
+    return status, apks
+
+
+def _compare_percent(count: float, total: int) -> float:
+    """Return `count` as a percentage of `total`, or 0 when `total` is 0."""
+    return 0 if total == 0 else (100 * count) / total
+
+
+def plot_compare_status(
+    db1: Path,
+    db2: Path,
+    interactive: bool = True,
+    image_path: Path | None = None,
+    tools: list[str] | None = None,
+    title: str = "Comparision of Exit Status",
+    same_apks: bool = False,
+):
+    """Plot and compare the distribution of exit statuses by tool for two experiments.
+
+    For each tool, two stacked bars are drawn side by side: the first one for
+    db1, the second one for db2. Each bar stacks the percentage of apks whose
+    status is FINISHED, TIMEOUT or FAILED, plus an "Other" share computed as
+    the number of apks recorded in the database minus those three counts.
+
+    db1 and db2 are the paths to the two result sqlite databases to compare.
+    interactive and image_path are forwarded to `render`; image_path is where
+    to save the result.
+    tools is the list of tools to compare; by default all the tools found in
+    either database are compared. The list is not modified.
+    title is the title of the figure.
+    same_apks indicates whether the two databases are expected to use the same
+    apks. If so, percentages are relative to the union of the apks of both
+    databases and the apks absent from a database are displayed as "Missing".
+    """
+    if tools is None:
+        tools = list(set(get_list_tools(db1)) | set(get_list_tools(db2)))
+    # Sort a copy: the caller's list must not be reordered as a side effect.
+    tools = sorted(tools)
+
+    status_1, apk_1 = _compare_load_status(db1, tools)
+    status_2, apk_2 = _compare_load_status(db2, tools)
+
+    occurences = {(tool, stat, "db1"): nb for tool, stat, nb in status_1}
+    occurences.update({(tool, stat, "db2"): nb for tool, stat, nb in status_2})
+
+    nb_bars = 2 * len(tools)
+    stats = ["Finished", "Time Out", "Other", "Failed"]
+    values = {stat: np.zeros(nb_bars) for stat in stats}
+
+    nb_apk_tot = len(apk_1 | apk_2)
+    if same_apks:
+        nb_apk_1 = nb_apk_tot
+        nb_apk_2 = nb_apk_tot
+        # The missing share does not depend on the tool: it is the same for
+        # every db1 bar (even indexes) and every db2 bar (odd indexes).
+        stats.append("Missing")
+        values["Missing"] = np.zeros(nb_bars)
+        values["Missing"][0::2] = _compare_percent(len(apk_2 - apk_1), nb_apk_1)
+        values["Missing"][1::2] = _compare_percent(len(apk_1 - apk_2), nb_apk_2)
+    else:
+        nb_apk_1 = len(apk_1)
+        nb_apk_2 = len(apk_2)
+
+    colors = {
+        "Finished": "#009E73",
+        "Time Out": "#56B4E9",
+        "Other": "#555555",  # TODO: find better color
+        "Failed": "#D55E00",
+        "Missing": "#555555",
+    }
+    hatch = {
+        "Finished": "/",
+        "Time Out": "x",
+        "Other": ".",
+        "Failed": "\\",
+        "Missing": "-",
+    }
+
+    # Label of the bar segment -> status value stored in the database.
+    db_status = {"Finished": "FINISHED", "Time Out": "TIMEOUT", "Failed": "FAILED"}
+    # (key used in `occurences`, nb of apks recorded in the db, denominator)
+    databases = (
+        ("db1", len(apk_1), nb_apk_1),
+        ("db2", len(apk_2), nb_apk_2),
+    )
+    for i, tool in enumerate(tools):
+        for offset, (db_key, nb_recorded, nb_apk) in enumerate(databases):
+            counts = {
+                label: occurences.get((tool, status, db_key), 0)
+                for label, status in db_status.items()
+            }
+            # Everything recorded in the db that is neither finished, timed
+            # out nor failed for this tool.
+            counts["Other"] = nb_recorded - sum(counts.values())
+            for label, count in counts.items():
+                values[label][2 * i + offset] = _compare_percent(count, nb_apk)
+
+    # Bars of the same tool are separated by `lstep`, two consecutive tools
+    # by the larger `bstep`.
+    x_axis = np.zeros(nb_bars)
+    x_width = 3
+    x_0 = x_width / 2
+    lstep = 1
+    bstep = 5
+    for i in range(len(tools)):
+        x_0 += bstep + x_width
+        x_axis[2 * i] = x_0
+        x_0 += lstep + x_width
+        x_axis[2 * i + 1] = x_0
+    # Only the first bar of each pair carries the tool name.
+    tick_legend = [label for tool in tools for label in (f"{tool}", "")]
+
+    plt.figure(figsize=(20, 9), dpi=80)
+    bottom = np.zeros(nb_bars)
+    for stat in stats:
+        plt.bar(
+            x_axis,
+            values[stat],
+            label=stat,
+            color=colors[stat],
+            hatch=hatch[stat],
+            bottom=bottom,
+            width=x_width,
+            edgecolor="black",
+        )
+        bottom += values[stat]
+    plt.xticks(x_axis, tick_legend, rotation=80)
+    plt.legend()
+    plt.ylabel("% of analysed apk")
+    render(title, interactive, image_path, format="svg")