diff --git a/3_rasta/3_experiments.typ b/3_rasta/3_experiments.typ
index 15f7306..e26019c 100644
--- a/3_rasta/3_experiments.typ
+++ b/3_rasta/3_experiments.typ
@@ -1,4 +1,4 @@
-#import "../lib.typ": todo, highlight, num, paragraph, SDK, APK, DEX, FR
+#import "../lib.typ": todo, highlight, num, paragraph, SDK, APK, DEX, FR, APKs
 #import "X_var.typ": *
 #import "X_lib.typ": *
@@ -8,14 +8,69 @@
 === RQ1: Re-Usability Evaluation
 
-#todo[alt text for figure rasta-exit / rasta-exit-drebin]
 #figure(
-  image("figs/exit-status-for-the-drebin-dataset.svg", width: 100%),
+  image(
+    "figs/exit-status-for-the-drebin-dataset.svg",
+    width: 100%,
+    alt: "Bar chart showing the percentage of analysed APKs on the y-axis and the tools on the x-axis.
+    Horizontal blue dotted lines mark the 15%, 50% and 85% values.
+    Each bar represents a tool, with the finished analyses in green at the bottom, the analyses that timed out in blue, then on top in red the analyses that failed. There is a last color, grey, for the other category, only visible in the dialdroid bar, representing 5% of the results.
+    The results are (approximately) as follows:
+    adagio: 100% finished
+    amandroid: less than 5% timed out, the rest finished
+    anadroid: 85% failed, less than 5% timed out, the rest finished
+    androguard: 100% finished
+    androguard_dad: 5% failed, the rest finished
+    apparecium: around 1% failed, the rest finished
+    blueseal: less than 5% failed, a little more than 10% timed out, the rest (just under 85%) finished
+    dialdroid: a little more than 50% finished, less than 5% timed out, around 5% are marked as other, the rest failed
+    didfail: 70% finished, the rest failed
+    droidsafe: 40% finished, 45% timed out, 15% failed
+    flowdroid: 65% finished, the rest failed
+    gator: 100% finished
+    ic3: 99% finished, 1% failed
+    ic3_fork: 98% finished, 2% failed
+    iccta: 60% finished, less than 5% timed out, the rest failed
+    mallodroid: 100% finished
+    perfchecker: 75% finished, the rest failed
+    redexer: 100% finished
+    saaf: 90% finished, 5% timed out, 5% failed
+    wognsen_et_al: 75% finished, 1% failed, the rest timed out
+    "
+  ),
   caption: [Exit status for the Drebin dataset],
 )
 
 #figure(
-  image("figs/exit-status-for-the-rasta-dataset.svg", width: 100%),
+  image(
+    "figs/exit-status-for-the-rasta-dataset.svg",
+    width: 100%,
+    alt: "Bar chart showing the percentage of analysed APKs on the y-axis and the tools on the x-axis.
+    Horizontal blue dotted lines mark the 15%, 50% and 85% values.
+    Each bar represents a tool, with the finished analyses in green at the bottom, the analyses that timed out in blue, then on top in red the analyses that failed. There is a last color, grey, for the other category, only visible in the dialdroid bar, representing 10% of the results, and in the blueseal bar, for 5% of the results.
+    The results are (approximately) as follows:
+    adagio: 100% finished
+    amandroid: less than 5% failed, 10% timed out, the rest finished
+    anadroid: 95% failed, 1% timed out, the rest finished
+    androguard: 100% finished
+    androguard_dad: a little more than 45% finished, the rest failed
+    apparecium: around 5% failed, 1% timed out, the rest finished
+    blueseal: 20% finished, 15% timed out, 5% are marked other, the rest failed
+    dialdroid: 35% finished, 1% timed out, 10% are marked other, the rest failed
+    didfail: 25% finished, less than 5% timed out, the rest failed
+    droidsafe: less than 10% finished, 20% timed out, the rest failed
+    flowdroid: 55% finished, the rest failed
+    gator: a little more than 85% finished, 5% timed out, 10% failed
+    ic3: less than 80% finished, 5% timed out, the rest failed
+    ic3_fork: 60% finished, 5% timed out, the rest failed
+    iccta: 30% finished, 10% timed out, the rest failed
+    mallodroid: 100% finished
+    perfchecker: 25% finished, less than 5% timed out, the rest failed
+    redexer: 90% finished, the rest failed
+    saaf: 40% finished, the rest failed
+    wognsen_et_al: a little less than 15% finished, a little less than 20% failed, the rest timed out
+    "
+  ),
   caption: [Exit status for the Rasta dataset],
 )
@@ -218,75 +273,6 @@ The date is also correlated with the success rate for Java based tools only.
 
 === RQ3: Malware vs Goodware
 
-#todo[complete @sec:rasta-mal-vs-good by commenting the new figures]
-
-/*
-```
-sqlite> SELECT vt_detection == 0, COUNT(exec.sha256) FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 WHERE tool_status = 'FINISHED' AND dex_size_decile = 6 GROUP BY vt_detection == 0;
-0|2971 % malware
-1|60455 % goodware
-sqlite> SELECT vt_detection == 0, COUNT(DISTINCT sha256) FROM apk WHERE dex_size_decile = 6 GROUP BY vt_detection == 0;
-0|243
-1|6009
-```
-```
->>> 61.13168724279835
-0.4969812257050699
->>> 60455/6009/20 * 100
-50.30371110001665
-```
-
-            rate goodware  rate malware  avg size goodware (MB)  avg size malware (MB)
- decile 1:  85.42  82.02  0.13  0.11
- decile 2:  74.46  72.34  0.54  0.55
- decile 3:  63.38  65.67  1.37  1.25
- decile 4:  57.21  62.31  2.41  2.34
- decile 5:  53.36  59.27  3.56  3.55
- decile 6:  50.3   61.13  4.61  4.56
- decile 7:  46.76  56.54  5.87  5.91
- decile 8:  42.57  56.23  7.64  7.63
- decile 9:  39.09  57.94  11.39  11.26
- decile 10: 33.34  45.86  24.24  21.36
- total:     54.28  64.82  6.29   4.14
-*/
-
-
-#todo[Alt text for rasta-exit-goodmal]
-#figure(
-  image(
-    "figs/exit-status-for-the-rasta-dataset-goodware-malware.svg",
-    width: 100%,
-    alt: "",
-  ),
-  caption: [Exit status comparing goodware and malware for the Rasta dataset],
-)
-
-/*
-[15:25] Jean-Marie Mineau
-
-moyenne de la taille total des dex: 6464228.10027989
-
-[15:26] Jean-Marie Mineau
-
-(tout confondu)
-
-[15:26] Jean-Marie Mineau
-
-goodware: 6598464.94224066
-
-malware: 4337376.97252155
-
-```
-sqlite> SELECT AVG(apk_size) FROM apk;
-16918107.6526989
-sqlite> SELECT AVG(apk_size) FROM apk WHERE vt_detection = 0;
-16897989.4472311
-sqlite> SELECT AVG(apk_size) FROM apk WHERE vt_detection != 0;
-17236860.8903556
-```
-*/
-
-
 #figure({
   show table: set text(size: 0.80em)
   table(
   table.cell(colspan: 3/*4*/, inset: 3pt)[],
   table.hline(),
 )},
+  placement: none, // floating figure makes this table go in the previous section :grim:
   caption: [Average size and date of goodware/malware parts of the Rasta dataset],
 )
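+/*
+Note: a minimal sketch (not part of the evaluation pipeline) of how the size
+averages of @tab:rasta-sizes can be recomputed from the experiment database.
+The `apk` table and its `apk_size` and `vt_detection` columns appear in the
+sqlite notes of this section; the `dex_size` column name and the `rasta.db`
+file name are assumptions.
+```python
+import sqlite3
+
+con = sqlite3.connect("rasta.db")  # path to the experiment database (assumed)
+for label, cond in [("goodware", "vt_detection == 0"),
+                    ("malware", "vt_detection != 0")]:
+    avg_apk, avg_dex = con.execute(
+        f"SELECT AVG(apk_size), AVG(dex_size) FROM apk WHERE {cond}"
+    ).fetchone()
+    # sizes are stored in bytes; report them in MB as in the table
+    print(f"{label}: apk {avg_apk / 1e6:.2f} MB, dex {avg_dex / 1e6:.2f} MB")
+```
+*/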
+We sampled our dataset to have a variety of #APK sizes, but the size of an application is not entirely proportional to the size of its bytecode.
+Looking at @tab:rasta-sizes, we can see that although malware samples are on average bigger #APKs, they contain less bytecode than goodware.
+In the previous section, we saw that the size of the bytecode has the most significant impact on the finishing rate of analysis tools, and indeed, @fig:rasta-exit-goodmal reflects this.
+
+
+/*
+```
+sqlite> SELECT vt_detection == 0, COUNT(exec.sha256) FROM exec INNER JOIN apk ON exec.sha256 = apk.sha256 WHERE tool_status = 'FINISHED' AND dex_size_decile = 6 GROUP BY vt_detection == 0;
+0|2971 % malware
+1|60455 % goodware
+sqlite> SELECT vt_detection == 0, COUNT(DISTINCT sha256) FROM apk WHERE dex_size_decile = 6 GROUP BY vt_detection == 0;
+0|243
+1|6009
+```
+```
+>>> 61.13168724279835
+0.4969812257050699
+>>> 60455/6009/20 * 100
+50.30371110001665
+```
+
+            rate goodware  rate malware  avg size goodware (MB)  avg size malware (MB)
+ decile 1:  85.42  82.02  0.13  0.11
+ decile 2:  74.46  72.34  0.54  0.55
+ decile 3:  63.38  65.67  1.37  1.25
+ decile 4:  57.21  62.31  2.41  2.34
+ decile 5:  53.36  59.27  3.56  3.55
+ decile 6:  50.3   61.13  4.61  4.56
+ decile 7:  46.76  56.54  5.87  5.91
+ decile 8:  42.57  56.23  7.64  7.63
+ decile 9:  39.09  57.94  11.39  11.26
+ decile 10: 33.34  45.86  24.24  21.36
+ total:     54.28  64.82  6.29   4.14
+*/
+
+#figure(
+  image(
+    "figs/exit-status-for-the-rasta-dataset-goodware-malware.svg",
+    width: 100%,
+    alt: "Bar chart showing the percentage of analysed APKs on the y-axis and the tools on the x-axis.
+    Each tool has two bars, one for goodware and one for malware.
+    The goodware bars are the same as the ones in the figure Exit status for the Rasta dataset.
+    The timeout rate looks the same on both bars of each tool.
+    The finishing rate of the malware bar is a lot higher than in the goodware bar for androguard_dad, blueseal, didfail, iccta, perfchecker and wognsen_et_al.
+    The finishing rate of the malware bar is higher than in the goodware bar for ic3 and ic3_fork.
+    The only two tools where the finishing rate is better for goodware are apparecium (by around 15%) and redexer (by around 10%).
+    The other tools have similar finishing rates, slightly in favor of malware.
+    "
+  ),
+  caption: [Exit status comparing goodware (left bars) and malware (right bars) for the Rasta dataset],
+)
+
+/*
+[15:25] Jean-Marie Mineau
+
+average of the total dex sizes: 6464228.10027989
+
+[15:26] Jean-Marie Mineau
+
+(all APKs combined)
+
+[15:26] Jean-Marie Mineau
+
+goodware: 6598464.94224066
+
+malware: 4337376.97252155
+
+```
+sqlite> SELECT AVG(apk_size) FROM apk;
+16918107.6526989
+sqlite> SELECT AVG(apk_size) FROM apk WHERE vt_detection = 0;
+16897989.4472311
+sqlite> SELECT AVG(apk_size) FROM apk WHERE vt_detection != 0;
+17236860.8903556
+```
+*/
+
+In @fig:rasta-exit-goodmal, we compared the finishing rates of malware and goodware applications for the evaluated tools.
+We can see that malware and goodware seem to generate a similar number of timeouts.
+However, with the exception of apparecium and redexer, we can see a trend of goodware being harder to analyse than malware.
+Some tools, like DAD or perfchecker, have a finishing rate more than 20 points higher for malware than for goodware.
 
 #figure({
   show table: set text(size: 0.80em)
   table(
 )},
   caption: [#DEX size and Finishing Rate (#FR) per decile],
 )
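+/*
+Note: a minimal sketch of how the per-decile finishing rates of
+@tab:rasta-sizes-decile can be recomputed. The `exec` and `apk` tables and the
+`tool_status`, `dex_size_decile` and `vt_detection` columns come from the
+sqlite queries quoted above; the `rasta.db` file name is an assumption, and
+the 20 runs per APK follow the `60455/6009/20` computation in the notes.
+```python
+import sqlite3
+
+NB_TOOLS = 20  # tool runs per APK, as in the 60455/6009/20 note above
+con = sqlite3.connect("rasta.db")  # path to the experiment database (assumed)
+for decile in range(1, 11):
+    rate = {}
+    for label, cond in [("goodware", "vt_detection == 0"),
+                        ("malware", "vt_detection != 0")]:
+        finished = con.execute(
+            "SELECT COUNT(exec.sha256) FROM exec"
+            " INNER JOIN apk ON exec.sha256 = apk.sha256"
+            f" WHERE tool_status = 'FINISHED' AND dex_size_decile = ? AND {cond}",
+            (decile,)).fetchone()[0]
+        apks = con.execute(
+            "SELECT COUNT(DISTINCT sha256) FROM apk"
+            f" WHERE dex_size_decile = ? AND {cond}", (decile,)).fetchone()[0]
+        rate[label] = finished / (apks * NB_TOOLS) * 100  # finishing rate in %
+    print(f"decile {decile}: {rate} ratio {rate['goodware'] / rate['malware']:.2f}")
+```
+*/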
-
-We compared the finishing rate of malware and goodware applications for evaluated tools.
-Because, the size of applications impacts this finishing rate, it is interesting to compare the success rate for each decile of bytecode size.
-@tab:rasta-sizes-decile reports the bytecode size and the finishing rate of goodware and malware in each decile of size.
+We saw that the bytecode size may be an explanation for this difference.
+To investigate this further, @tab:rasta-sizes-decile reports the bytecode size and the finishing rate of goodware and malware in each decile of bytecode size.
 We also computed the ratio of the bytecode size and finishing rate for the two populations.
-We observe that the ratio for the finishing rate decreases from 1.04 to 0.73, while the ratio of the bytecode size is around 1.
-We conclude from this table that analyzing malware triggers less errors than for goodware.
+We observe that while the bytecode size ratio between goodware and malware stays close to one in each decile (excluding the two extremes), the goodware/malware finishing rate ratio decreases with each decile.
+It goes from 1.03 for the 2#super[nd] decile to 0.67 in the 9#super[th] decile.
+We conclude from this table that, at equal bytecode size, analyzing malware still triggers fewer errors than analyzing goodware, and that this difference increases with the bytecode size.
 
 #highlight()[
diff --git a/3_rasta/4_discussion.typ b/3_rasta/4_discussion.typ
index 4ee9053..0fd21a0 100644
--- a/3_rasta/4_discussion.typ
+++ b/3_rasta/4_discussion.typ
@@ -1,9 +1,12 @@
-#import "../lib.typ": todo, etal, paragraph
+#import "../lib.typ": todo, jfl-note
+#import "../lib.typ": etal, paragraph
 #import "X_var.typ": *
 #import "X_lib.typ": *
 
 == Discussion
 
+#todo[split into: error analysis, soa comp, recommendations and limitations]
+
 #figure({
   show table: set text(size: 0.50em)
   show table.cell.where(y: 0): it => if it.x == 0 { it } else { rotate(-90deg, reflow: true, it) }
@@ -151,8 +154,9 @@ Regarding errors linked to the disk space, we observe few ratios for the excepti
 Manual inspections revealed that those errors are often a consequence of a failed apktool execution.
 
 Second, the black squares indicate frequent errors that need to be investigated separately.
-In the rest of this section, we manually analyzed, when possible, the code that generates this high ratio of errors and we give feedback about the possible causes and difficulties to write a bug fix.
+In the next subsection, we manually analyze, when possible, the code that generates this high ratio of errors and we give feedback about the possible causes and the difficulty of writing a bug fix.
 
+=== Tool-by-Tool Failure Analysis
 
 /*
 Dialdroid: TODO
 com.google.common.util.concurrent.ExecutionError -> memory error: java.lang.StackOverflowError, java.lang.OutOfMemoryError: Java heap space, java.lang.OutOfMemoryError: GC overhead limit exceeded
@@ -211,7 +215,7 @@ Anadroid: DONE
 Surprisingly, while Androguard almost never fails to analyze an APK, the internal decompiler of Androguard (DAD) fails more than half of the time.
 The analysis of the logs shows that the issue comes from the way the decompiled methods are stored: each method is stored in a file named after the method name and signature, and this file name can quickly exceed the size limit (255 characters on most file systems).
 It should be noticed that Androguard_dad rarely fails on the Drebin dataset.
-This illustrate the importance to test tools on real and up-to-date APKs: even a bad handling of filenames can influence an analysis.
+This illustrates the importance of testing tools on real and up-to-date APKs: even a bad handling of filenames can influence an analysis.
 ]
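+/*
+Note: a hypothetical illustration (not DAD's actual naming code) of the
+filename issue described above: a file name derived from a method name and
+signature easily exceeds the 255-byte limit that most file systems put on a
+single file name component. The method and signature below are made up.
+```python
+# Hypothetical DAD-style output file name for one decompiled method.
+method = "com.example.app.internal.GeneratedSerializerFactory.create"
+signature = "(Ljava/util/Map;Ljava/util/List;Ljava/lang/String;" * 5 + ")Ljava/lang/Object;"
+filename = method + signature + ".java"
+# 255 bytes is the usual per-name limit (e.g. on ext4), not the full path:
+print(len(filename.encode("utf-8")))  # > 255, so creating this file raises OSError
+```
+*/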
 
 /*
@@ -303,7 +307,7 @@ jasError
 
 #paragraph([Flowdroid])[
 Our exchanges with the authors of Flowdroid led us to expect more timeouts from too long executions than failed run.
-#todo[Deja dit? : Surprisingly we only got #mypercent(37,NBTOTAL) of timeout, and a hight number of failures.]
+Surprisingly, we only got #mypercent(37,NBTOTAL) of timeouts and a high number of failures.
 We tried to detect recurring causes of failures, but the complexity of Flowdroid make the investigation difficult.
 Most exceptions seems to be related to concurrency. //or display a generic messages.
 Other errors that came up regularly are `java.nio.channels.ClosedChannelException` which is raised when Flowdoid fails to read from the APK, although we did not find the reason of the failure, null pointer exceptions when trying to check if a null value is in a `ConcurrentHashMap` (in `LazySummaryProvider.getClassFlows()`) and `StackOverflowError` from `StronglyConnectedComponentsFast.recurse()`.
@@ -329,32 +333,14 @@ Pauck: Flowdroid avg 2m on DIALDroid-Bench (real worlds apks)
 As a conclusion, we observe that a lot of errors can be linked to bugs in dependencies.
 Our attempts to upgrade those dependencies led to new errors appearing: we conclude that this is a no trivial task that require familiarity with the inner code of the tools.
 
-=== State of the art comparison
+=== State-of-the-art comparison
 
 Luo #etal released TaintBench~@luoTaintBenchAutomaticRealworld2022 a real-world benchmark and the associated recommendations to build such a benchmark.
 These benchmarks confirmed that some tools such as Amandroid and Flowdroid are less efficient on real-world applications.
-// Pauck #etal@pauckAndroidTaintAnalysis2018
-// Reaves #etal@reaves_droid_2016
+We confirm the hypothesis of Luo #etal that real-world applications lead to less efficient analysis than using hand-crafted test applications or old datasets~@luoTaintBenchAutomaticRealworld2022.
+In addition, even if Drebin is not hand-crafted, it is quite old and seems to present similar issues as hand-crafted datasets when used to evaluate a tool: we obtained really good results on it compared to the Rasta dataset, which is more representative of real-world applications.
 
-We finally compare our results to the conclusions and discussions of previous papers~@luoTaintBenchAutomaticRealworld2022 @pauckAndroidTaintAnalysis2018 @reaves_droid_2016.
-First we confirm the hypothesis of Luo #etal that real-world applications lead to less efficient analysis than using hand crafted test applications or old datasets~@luoTaintBenchAutomaticRealworld2022.
-Even if Drebin is not hand-crafted, it is quite old and we obtained really good results compared to the Rasta dataset.
-When considering real-world applications, the size is rather different from hand crafted application, which impacts the success rate.
-We believe that it is explained by the fact that the complexity of the code increases with its size.
-
-/*
-30*6
-180
-21+20+27+2+18+18
-106
-106/180*100
-58.88
-*/
-
-
-=== State-of-the-art comparison
-
-Our finding are consistent with the numerical results of Pauck #etal that showed that #mypercent(106, 180) of DIALDroid-Bench~@bosuCollusiveDataLeak2017 real-world applications are analyzed successfully with the 6 evaluated tools~@pauckAndroidTaintAnalysis2018.
+Our findings are also consistent with the numerical results of Pauck #etal, who showed that #mypercent(106, 180) of DIALDroid-Bench~@bosuCollusiveDataLeak2017 real-world applications are analyzed successfully with the 6 evaluated tools~@pauckAndroidTaintAnalysis2018.
 Six years after the release of DIALDroid-Bench, we obtain a lower ratio of #mypercent(40.05, 100) for the same set of 6 tools but using the Rasta dataset of #NBTOTALSTRING applications.
 We extended this result to a set of #nbtoolsvariationsrun tools and obtained a global success rate of #resultratio.
 We confirmed that most tools require a significant amount of work to get them running~@reaves_droid_2016.
wognsen_et_al|386
 Third, we extended to #nbtoolsselected different tools the work done by Reaves #etal on the usability of analysis tools (4 tools are in common, we added 16 new tools and two variations).
 We confirmed that most tools require a significant amount of work to get them running.
-We encounter similar issues with libraries and operating system incompatibilities, and noticed that, with time, dependencies issues may impact the build process.
+We encountered similar issues with library and operating system incompatibilities, and noticed that, as time passes, dependency issues may impact the build process.
 For instance we encountered cases where the repository hosting the dependencies were closed, or cases where maven failed to download dependencies because the OS version did not support SSL, now mandatory to access maven central.
 //, and even one case were the could not find anywhere the compiled version of sbt used to build a tool.
 
 === Recommendations
 
-Finally, we summarize some takeaways that developers should follow to improve the success of reusing their developed software.
+#jfl-note[Finally, we summarize some takeaways that developers should follow to improve the success of reusing their developed software.][*developer*: say that, in light of these results, we can think that some problems can be avoided or fixed by the user]
 
 For improving the reliability of their software, developers should use classical development best practices, for example continuous integration, testing, code review.
 For improving the reusability developers should write a documentation about the tool usage and provide a minimal working example and describe the expected results.
diff --git a/3_rasta/5_conclusion.typ b/3_rasta/5_conclusion.typ
index 2122818..15d44f0 100644
--- a/3_rasta/5_conclusion.typ
+++ b/3_rasta/5_conclusion.typ
@@ -1,17 +1,19 @@
 #import "@local/template-thesis-matisse:0.0.1": etal
-#import "../lib.typ": todo
+#import "../lib.typ": todo, jfl-note
 #import "X_var.typ": *
 
 #todo[Futur work: new systematic literature review, maybe check https://ieeexplore.ieee.org/abstract/document/9118907 ?]
 
 == Conclusion
 
+#todo[Answer pb1]
+
 This paper has assessed the suggested results of the literature~@luoTaintBenchAutomaticRealworld2022 @pauckAndroidTaintAnalysis2018 @reaves_droid_2016 about the reliability of static analysis tools for Android applications.
 With a dataset of #NBTOTALSTRING applications we established that #resultunusable of #nbtoolsselectedvariations tools are not reusable, when considering that a tool that has more than 50% of time a failure is unusable.
 In total, the analysis success rate of the tools that we could run for the entire dataset is #resultratio.
 The characteristics that have the most influence on the success rate is the bytecode size and min SDK version.
 Finally, we showed that malware APKs have a better finishing rate than goodware.
 
-In future works, we plan to investigate deeper the reported errors of the tools in order to analyze the most common types of errors, in particular for Java based tools.
+#jfl-note[In future work, we plan to investigate more deeply the errors reported by the tools in order to analyze the most common types of errors, in particular for Java-based tools.
 We also plan to extend this work with a selection of more recent tools performing static analysis.
-Following Reaves #etal recommendations~@reaves_droid_2016, we publish the Docker and Singularity images we built to run our experiments alongside the Docker files. This will allow the research community to use directly the tools without the build and installation penalty.
+Following Reaves #etal recommendations~@reaves_droid_2016, we publish the Docker and Singularity images we built to run our experiments alongside the Docker files. This will allow the research community to use the tools directly without the build and installation penalty.][*Developer*]