From a31cfd940618f14f667dc4ee698af22876b2e2de Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 16 Dec 2024 13:43:56 +0000 Subject: [PATCH] sequence scope: optimize matching --- capa/capabilities/dynamic.py | 48 +++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/capa/capabilities/dynamic.py b/capa/capabilities/dynamic.py index eeff1abeb..28baae51a 100644 --- a/capa/capabilities/dynamic.py +++ b/capa/capabilities/dynamic.py @@ -11,11 +11,13 @@ import collections from dataclasses import dataclass +from capa.features.address import NO_ADDRESS import capa.perf import capa.features.freeze as frz import capa.render.result_document as rdoc from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults +from capa.features.common import Feature from capa.capabilities.common import Capabilities, find_file_capabilities from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor @@ -90,9 +92,18 @@ def find_thread_capabilities( # # For each call, we consider the window of SEQUENCE_SIZE calls leading up to it, # merging all their features and doing a match. - # Here's the primary data structure: a deque of those features found in the prior calls. - # We'll append to it, and as it grows larger than SEQUENCE_SIZE, the oldest items are removed. - sequence: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE) + # + # We track these features in two data structures: + # 1. a deque of those features found in the prior calls. + # We'll append to it, and as it grows larger than SEQUENCE_SIZE, the oldest items are removed. + # 2. a live set of features seen in the sequence. + # As we pop from the deque, we remove features from the current set, + # and as we push to the deque, we insert features to the current set. + # With this approach, our algorithm performance is independent of SEQUENCE_SIZE. + # The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SEQUENCE_SIZE + # (that is, runtime gets slower the larger SEQUENCE_SIZE is). + sequence_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE) + sequence_features: FeatureSet = collections.defaultdict(set) # the names of rules matched at the last sequence, # so that we can deduplicate long strings of the same matche. @@ -109,14 +120,29 @@ def find_thread_capabilities( # # sequence scope matching # - # as we add items to the end of the deque, the oldest items will overflow and get dropped. - sequence.append(call_capabilities.features) - # collect all the features seen across the last SEQUENCE_SIZE calls, - # and match against them. - sequence_features: FeatureSet = collections.defaultdict(set) - for call in sequence: - for feature, vas in call.items(): - sequence_features[feature].update(vas) + # As we add items to the end of the deque, overflow and drop the oldest items (at the left end). + # While we could rely on `deque.append` with `maxlen` set (which we provide above), + # we want to use the dropped item first, to remove the old features, so we manually pop it here. + if len(sequence_feature_sets) == SEQUENCE_SIZE: + overflowing_feature_set = sequence_feature_sets.popleft() + + # these are the top-level features that will no longer have any associated addresses. + for feature, vas in overflowing_feature_set.items(): + if vas == { NO_ADDRESS, }: + # ignore the common case of global features getting added/removed/trimmed repeatedly, + # like arch/os/format. + continue + + feature_vas = sequence_features[feature] + feature_vas.difference_update(vas) + if not feature_vas: + del sequence_features[feature] + + # update the deque and set of features with the latest call's worth of features. + latest_features = call_capabilities.features + sequence_feature_sets.append(latest_features) + for feature, vas in latest_features.items(): + sequence_features[feature].update(vas) _, smatches = ruleset.match(Scope.SEQUENCE, sequence_features, ch.address) for rule_name, res in smatches.items():