Add function to compare the results of two experiments

2025-09-07 17:08:42 +02:00 · 2025-09-07 17:08:42 +02:00 · 0d8ad49c94
commit 0d8ad49c94
parent 1309d7ea24
5 changed files with 255 additions and 11 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
 data
 *.db
--- a/rasta_data_manipulation/pyproject.toml
+++ b/rasta_data_manipulation/pyproject.toml
@ -1,15 +1,15 @@
 [tool.poetry]
 name = "rasta_triturage"
-version = "0.2.0"
+version = "0.2.1"
 description = "'Triturage de donnée' for the Rasta Project"
-authors = ["anon"]
+authors = ["Jean-Marie Mineau <rasta-github@jean-marie.mineau.eu>"]
 readme = "README.md"
-#homepage = ""
+homepage = "https://github.com/histausse/rasta/tree/main"
-#repository = ""
+repository = "https://github.com/histausse/rasta/tree/main"
-license = "Proprietary"
+license = "GPLv3"
 [tool.poetry.urls]
-#"Bug Tracker" = ""
+"Bug Tracker" = "https://github.com/histausse/rasta/issues"
 [tool.poetry.dependencies]
 python = "^3.10"
@ -50,6 +50,7 @@ rasta-decorelate-factor = "rasta_triturage.cli:plot_decorelated_factor"
 rasta-count-error-stacks = "rasta_triturage.cli:count_error_stacks"
 rasta-gen-dataset = "rasta_triturage.cli:generate_dataset"
 rasta-size-malware = "rasta_triturage.cli:size_malware"
 rasta-compare-status = "rasta_triturage.cli:compare_status_by_tool"
 [tool.poetry.group.dev.dependencies]
 pytest = "*"
--- a/rasta_data_manipulation/rasta_triturage/cli.py
+++ b/rasta_data_manipulation/rasta_triturage/cli.py
@ -17,6 +17,7 @@ from .status import (
    plot_status_by_tool_and_malware,
    plot_all_status_by_generic_x,
    plot_status_by_generic_x,
    plot_compare_status,
 )
 from .apk import (
    plot_apk_info_by_generic_x,
@ -577,9 +578,9 @@ def ic3():
    ic3_venn(args.data, interactive=args.display, image_path=args.figures_file)
    ic3_errors(
        args.data,
-        file=args.figures_file / "ic3_err.csv"
+        file=(
-        if args.figures_file is not None
+            args.figures_file / "ic3_err.csv" if args.figures_file is not None else None
-        else None,
+        ),
    )
@ -1127,3 +1128,64 @@ def size_malware():
        print(
            f"{size}, {size_apk[(size, True)]:.2f}, {size_apk[(size, False)]:.2f}, {finishing_rate_goodware:.2f}, {finishing_rate_malware:.2f}, {size_apk[(size, True)] / size_apk[(size, False)]:.2f}, {finishing_rate_goodware/finishing_rate_malware:.2f}"
        )
 def compare_status_by_tool():
    """Compare the repartition of status by tool from two result dbs.

    Command line entry point: the options are read from `sys.argv` and
    forwarded to `plot_compare_status`.

    Options:
      -d1/--data1, -d2/--data2 (required): the two sqlite3 result databases.
      -f/--figures-file: folder in which the figure is stored.
      --display: display the figure.
      -t/--tools: one or more tool names; when omitted, `None` is forwarded.
      --title: title of the graph.
      --same-apks: flag forwarded as `same_apks`.
    """
    parser = argparse.ArgumentParser(
        prog=sys.argv[0],
        description="Compare the repartition of status by tool from two result dbs",
    )
    parser.add_argument(
        "-d1",
        "--data1",
        required=True,
        type=Path,
        help="The sqlite3 database that contain the execution report of the first experiment",
    )
    parser.add_argument(
        "-d2",
        "--data2",
        required=True,
        type=Path,
        help="The sqlite3 database that contain the execution report of the second experiment",
    )
    parser.add_argument(
        "-f",
        "--figures-file",
        type=Path,
        help="The folder in which the figures must be stored",
    )
    parser.add_argument(
        "--display",
        action="store_true",
        help="If the figures must be displayed",
    )
    parser.add_argument(
        "-t",
        "--tools",
        nargs="+",
        default=None,
        help="The tools to analyse",
    )
    parser.add_argument(
        "--title",
        default="Comparision of Exit Status",
        help="The title of the graph",
    )
    parser.add_argument(
        "--same-apks",
        action="store_true",
        help="If the apks are the same in the two databases. If so, the missings applications will be shown.",
    )
    args = parser.parse_args()
    plot_compare_status(
        args.data1,
        args.data2,
        interactive=args.display,
        image_path=args.figures_file,
        tools=args.tools,
        # Forward the user supplied title: without this the --title option
        # is parsed but has no effect.
        title=args.title,
        same_apks=args.same_apks,
    )
--- a/rasta_data_manipulation/rasta_triturage/status.py
+++ b/rasta_data_manipulation/rasta_triturage/status.py
@ -444,3 +444,179 @@ def plot_all_status_by_generic_x(
        interactive=args.display,
        image_path=args.figures_file,
    )
 def _percent(count, total):
    """Return `count` as a percentage of `total`, or 0 when `total` is 0."""
    return 0 if total == 0 else (100 * count) / total


 def _load_status_counts(db, tools):
    """Read the status counts and the apks from the `exec` table of `db`.

    Returns `(counts, apks)`:
    - `counts` maps `(tool_name, tool_status)` to the `COUNT(sha256)` of that
      group, restricted to the tools listed in `tools`,
    - `apks` is the set of `(sha256,)` rows found in the whole table.
    """
    from contextlib import closing

    placeholders = ",".join("?" for _ in tools)
    # `with sqlite3.connect(...)` only manages the transaction and leaves the
    # connection open, hence the explicit closing().
    with closing(sqlite3.connect(db)) as con:
        cur = con.cursor()
        rows = cur.execute(
            (
                "SELECT tool_name, tool_status, COUNT(sha256) "
                "FROM exec "
                f"WHERE tool_name IN ({placeholders}) "
                "GROUP BY tool_name, tool_status;"
            ),
            tools,
        ).fetchall()
        apks = set(cur.execute("SELECT sha256 FROM exec;").fetchall())
    counts = {(tool, status): nb for tool, status, nb in rows}
    return counts, apks


 def plot_compare_status(
    db1: Path,
    db2: Path,
    interactive: bool = True,
    image_path: Path | None = None,
    tools: list[str] | None = None,
    title: str = "Comparision of Exit Status",
    same_apks: bool = False,
 ):
    """Plot and compare repartition of status by tools from two experiment.

    Each tool gets two adjacent stacked bars (first `db1`, then `db2`) showing
    the percentage of apks in each status; only the first bar of a pair is
    labelled with the tool name.

    db1 and db2 are the path to two result sqlite databases to compare
    interactive is forwarded to `render` (display the figure)
    image_path is where to save the result (forwarded to `render`)
    tools is the list of tools to compare, default will compare all tools
    found in either database. The list given by the caller is not modified.
    title is the title of the figure (forwarded to `render`)
    same_apks indicate if the two databases uses the same apks. If so, the
    percentages are relative to the union of the apks of both databases and
    the apks missing from each database are displayed in the plot.
    """
    if tools is None:
        tools = set(get_list_tools(db1)) | set(get_list_tools(db2))
    # sorted() builds a new list: the caller's list is left untouched.
    # NOTE: tools are ordered alphabetically (not by number of FINISHED runs).
    tools = sorted(tools)

    counts_1, apk_1 = _load_status_counts(db1, tools)
    counts_2, apk_2 = _load_status_counts(db2, tools)

    if same_apks:
        nb_apk_1 = nb_apk_2 = len(apk_1 | apk_2)
    else:
        nb_apk_1 = len(apk_1)
        nb_apk_2 = len(apk_2)

    nb_bars = 2 * len(tools)
    values = {
        "Finished": np.zeros(nb_bars),
        "Time Out": np.zeros(nb_bars),
        "Other": np.zeros(nb_bars),
        "Failed": np.zeros(nb_bars),
    }
    # Even indexes hold the db1 bars, odd indexes the db2 bars.
    experiments = (
        (0, counts_1, apk_1, nb_apk_1),
        (1, counts_2, apk_2, nb_apk_2),
    )
    for i, tool in enumerate(tools):
        for offset, counts, apks, nb_apk in experiments:
            bar = 2 * i + offset
            finished = counts.get((tool, "FINISHED"), 0)
            timeout = counts.get((tool, "TIMEOUT"), 0)
            failed = counts.get((tool, "FAILED"), 0)
            # Everything in the database that is not in one of the three
            # known statuses for this tool.
            other = len(apks) - finished - timeout - failed
            values["Finished"][bar] = _percent(finished, nb_apk)
            values["Time Out"][bar] = _percent(timeout, nb_apk)
            values["Failed"][bar] = _percent(failed, nb_apk)
            values["Other"][bar] = _percent(other, nb_apk)
    if same_apks:
        # Same value for every tool: apks present in the other database only.
        # _percent() avoids a ZeroDivisionError when both databases are empty.
        values["Missing"] = np.zeros(nb_bars)
        values["Missing"][0::2] = _percent(len(apk_2 - apk_1), nb_apk_1)
        values["Missing"][1::2] = _percent(len(apk_1 - apk_2), nb_apk_2)

    colors = {
        "Finished": "#009E73",
        "Time Out": "#56B4E9",
        "Other": "#555555",  # TODO: find better color
        "Failed": "#D55E00",
        "Missing": "#555555",
    }
    hatch = {
        "Finished": "/",
        "Time Out": "x",
        "Other": ".",
        "Failed": "\\",
        "Missing": "-",
    }

    # Bar positions: a small gap (lstep) between the two bars of a tool and a
    # bigger one (bstep) between two tools.
    x_axis = np.zeros(nb_bars)
    x_width = 3
    x_0 = x_width / 2
    lstep = 1
    bstep = 5
    for i in range(len(tools)):
        x_0 += bstep + x_width
        x_axis[2 * i] = x_0
        x_0 += lstep + x_width
        x_axis[2 * i + 1] = x_0

    tick_legend = []
    for tool in tools:
        tick_legend.extend((f"{tool}", ""))

    plt.figure(figsize=(20, 9), dpi=80)
    # `values` was filled in stacking order ("Missing" last, when present).
    bottom = np.zeros(nb_bars)
    for stat, heights in values.items():
        plt.bar(
            x_axis,
            heights,
            label=stat,
            color=colors[stat],
            hatch=hatch[stat],
            bottom=bottom,
            width=x_width,
            edgecolor="black",
        )
        bottom += heights
    plt.xticks(x_axis, tick_legend, rotation=80)
    plt.legend()
    plt.ylabel("% of analysed apk")
    render(title, interactive, image_path, format="svg")
--- a/rasta_data_manipulation/rasta_triturage/utils.py
+++ b/rasta_data_manipulation/rasta_triturage/utils.py
@ -112,7 +112,11 @@ def radar_chart(
 def render(
-    title: str, interactive: bool, image_path: Path | None, tight_layout: bool = True
+    title: str,
    interactive: bool,
    image_path: Path | None,
    tight_layout: bool = True,
    format: str = "pdf",
 ):
    """Render the figure. If `interactive`, display it; if `image_path`, save it."""
    # plt.title(title)
@ -121,7 +125,7 @@ def render(
    if image_path is not None:
        if not image_path.exists():
            image_path.mkdir(parents=True, exist_ok=True)
-        plt.savefig(image_path / (slugify(title) + ".pdf"), format="pdf")
+        plt.savefig(image_path / (slugify(title) + "." + format), format=format)
    if interactive:
        plt.show()
    plt.close()