diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index 3d22b05f7..9add4ef5c 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -76,4 +76,7 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-dncil.*] +ignore_missing_imports = True + +[mypy-tree_sitter.*] ignore_missing_imports = True \ No newline at end of file diff --git a/capa/features/address.py b/capa/features/address.py index 2033c24ef..0a20b4291 100644 --- a/capa/features/address.py +++ b/capa/features/address.py @@ -53,6 +53,26 @@ def __repr__(self): return f"file(0x{self:x})" +class FileOffsetRangeAddress(Address): + """an address range relative to the start of a file""" + + def __init__(self, start_byte, end_byte): + self.start_byte = start_byte + self.end_byte = end_byte + + def __eq__(self, other): + return (self.start_byte, self.end_byte) == (other.start_byte, other.end_byte) + + def __lt__(self, other): + return (self.start_byte, self.end_byte) < (other.start_byte, other.end_byte) + + def __hash__(self): + return hash((self.start_byte, self.end_byte)) + + def __repr__(self): + return f"file(0x{self.start_byte:x}, 0x{self.end_byte:x})" + + class DNTokenAddress(Address): """a .NET token""" diff --git a/capa/features/common.py b/capa/features/common.py index 30a4c0b25..b77514f7b 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -405,10 +405,17 @@ def __init__(self, value: str, description=None): self.name = "os" +class ScriptLanguage(Feature): + def __init__(self, value: str, description=None): + super().__init__(value, description=description) + self.name = "script language" + + FORMAT_PE = "pe" FORMAT_ELF = "elf" FORMAT_DOTNET = "dotnet" -VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET) +FORMAT_SCRIPT = "script" +VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_SCRIPT) # internal only, not to be used in rules FORMAT_AUTO = "auto" FORMAT_SC32 = "sc32" diff --git a/capa/features/extractors/common.py 
b/capa/features/extractors/common.py index 5f56e50d4..cd8be8e15 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -9,9 +9,21 @@ import capa.features import capa.features.extractors.elf import capa.features.extractors.pefile -from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, FORMAT_FREEZE, Arch, Format, String, Feature +from capa.features.common import ( + OS, + FORMAT_PE, + FORMAT_ELF, + OS_WINDOWS, + FORMAT_FREEZE, + FORMAT_SCRIPT, + Arch, + Format, + String, + Feature, +) from capa.features.freeze import is_freeze from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress +from capa.features.extractors.ts.autodetect import is_script logger = logging.getLogger(__name__) @@ -34,6 +46,8 @@ def extract_format(buf) -> Iterator[Tuple[Feature, Address]]: yield Format(FORMAT_ELF), NO_ADDRESS elif is_freeze(buf): yield Format(FORMAT_FREEZE), NO_ADDRESS + elif is_script(buf): + yield Format(FORMAT_SCRIPT), NO_ADDRESS else: # we likely end up here: # 1. handling a file format (e.g. 
macho) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py new file mode 100644 index 000000000..853623695 --- /dev/null +++ b/capa/features/extractors/script.py @@ -0,0 +1,41 @@ +from typing import Tuple, Iterator + +from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage +from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress + +# Can be used to instantiate tree_sitter Language objects (see ts/query.py) +LANG_CS = "c_sharp" +LANG_HTML = "html" +LANG_JS = "javascript" +LANG_PY = "python" +LANG_TEM = "embedded_template" + +EXT_ASPX = ("aspx", "aspx_") +EXT_CS = ("cs", "cs_") +EXT_HTML = ("html", "html_") +EXT_PY = ("py", "py_") + + +LANGUAGE_FEATURE_FORMAT = { + LANG_CS: "C#", + LANG_HTML: "HTML", + LANG_JS: "JavaScript", + LANG_PY: "Python", + LANG_TEM: "Embedded Template", +} + + +def extract_arch() -> Iterator[Tuple[Feature, Address]]: + yield Arch(ARCH_ANY), NO_ADDRESS + + +def extract_language(language: str, addr: FileOffsetRangeAddress) -> Iterator[Tuple[Feature, Address]]: + yield ScriptLanguage(LANGUAGE_FEATURE_FORMAT[language]), addr + + +def extract_os() -> Iterator[Tuple[Feature, Address]]: + yield OS(OS_ANY), NO_ADDRESS + + +def extract_format() -> Iterator[Tuple[Feature, Address]]: + yield Format(FORMAT_SCRIPT), NO_ADDRESS diff --git a/capa/features/extractors/ts/__init__.py b/capa/features/extractors/ts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py new file mode 100644 index 000000000..2883be9b8 --- /dev/null +++ b/capa/features/extractors/ts/autodetect.py @@ -0,0 +1,65 @@ +from typing import Optional + +from tree_sitter import Node, Tree, Parser, Language + +from capa.features.extractors.script import EXT_CS, EXT_PY, LANG_CS, LANG_PY, EXT_ASPX, EXT_HTML, LANG_TEM, LANG_HTML +from capa.features.extractors.ts.query import TS_LANGUAGES 
+ + +def is_script(buf: bytes) -> bool: + try: + return bool(get_language_ts(buf)) + except ValueError: + return False + + +def _parse(ts_language: Language, buf: bytes) -> Optional[Tree]: + try: + parser = Parser() + parser.set_language(ts_language) + return parser.parse(buf) + except ValueError: + return None + + +def _contains_errors(ts_language, node: Node) -> bool: + return ts_language.query("(ERROR) @error").captures(node) + + +def get_language_ts(buf: bytes) -> str: + for language, ts_language in TS_LANGUAGES.items(): + tree = _parse(ts_language, buf) + if tree and not _contains_errors(ts_language, tree.root_node): + return language + raise ValueError("failed to parse the language") + + +def get_template_language_ts(buf: bytes) -> str: + for language, ts_language in TS_LANGUAGES.items(): + if language in [LANG_TEM, LANG_HTML]: + continue + tree = _parse(ts_language, buf) + if tree and not _contains_errors(ts_language, tree.root_node): + return language + raise ValueError("failed to parse the language") + + +def get_language_from_ext(path: str) -> str: + if path.endswith(EXT_ASPX): + return LANG_TEM + if path.endswith(EXT_CS): + return LANG_CS + if path.endswith(EXT_HTML): + return LANG_HTML + if path.endswith(EXT_PY): + return LANG_PY + raise ValueError(f"{path} has an unrecognized or an unsupported extension.") + + +def get_language(path: str) -> str: + try: + with open(path, "rb") as f: + buf = f.read() + return get_language_ts(buf) + except ValueError: + return get_language_from_ext(path) diff --git a/capa/features/extractors/ts/build.py b/capa/features/extractors/ts/build.py new file mode 100644 index 000000000..2e73eaeea --- /dev/null +++ b/capa/features/extractors/ts/build.py @@ -0,0 +1,15 @@ +from tree_sitter import Language + +build_dir = "build/my-languages.so" +languages = [ + "vendor/tree-sitter-c-sharp", + "vendor/tree-sitter-embedded-template", + "vendor/tree-sitter-html", + "vendor/tree-sitter-javascript", + "vendor/tree-sitter-python", +] + + 
+class TSBuilder: + def __init__(self): + Language.build_library(build_dir, languages) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py new file mode 100644 index 000000000..e1be25330 --- /dev/null +++ b/capa/features/extractors/ts/engine.py @@ -0,0 +1,261 @@ +import re +from typing import List, Tuple, Iterator, Optional + +from tree_sitter import Node, Tree, Parser + +import capa.features.extractors.ts.autodetect +from capa.features.address import FileOffsetRangeAddress +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML +from capa.features.extractors.ts.query import ( + BINDINGS, + QueryBinding, + HTMLQueryBinding, + ScriptQueryBinding, + TemplateQueryBinding, +) +from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS, BaseNamespace, CSharpNamespace, LanguageToolkit + + +class TreeSitterBaseEngine: + buf: bytes + language: str + query: QueryBinding + tree: Tree + + def __init__(self, language: str, buf: bytes): + self.language = language + self.query = BINDINGS[language] + self.buf = buf + self.tree = self.parse() + + def parse(self) -> Tree: + parser = Parser() + parser.set_language(self.query.language) + return parser.parse(self.buf) + + def get_byte_range(self, node: Node) -> bytes: + return self.buf[node.start_byte : node.end_byte] + + def get_str(self, node: Node) -> str: + return self.get_byte_range(node).decode("utf-8") + + def get_address(self, node: Node) -> FileOffsetRangeAddress: + return FileOffsetRangeAddress(node.start_byte, node.end_byte) + + def get_default_address(self) -> FileOffsetRangeAddress: + return self.get_address(self.tree.root_node) + + +class TreeSitterExtractorEngine(TreeSitterBaseEngine): + query: ScriptQueryBinding + language_toolkit: LanguageToolkit + buf_offset: int + namespaces: set[BaseNamespace] + + def __init__( + self, + language: str, + buf: bytes, + buf_offset: int = 0, + additional_namespaces: set[BaseNamespace] = set(), + ): + 
super().__init__(language, buf) + self.buf_offset = buf_offset + self.language_toolkit = LANGUAGE_TOOLKITS[language] + self.namespaces = set(self.get_processed_namespaces()) + self.namespaces = self.namespaces.union(additional_namespaces) + + def get_address(self, node: Node) -> FileOffsetRangeAddress: + return FileOffsetRangeAddress(self.buf_offset + node.start_byte, self.buf_offset + node.end_byte) + + def get_new_object_names(self, node: Node) -> Iterator[Node]: + for obj_node, _ in self.query.new_object_name.captures(node): + yield obj_node + + def get_property_names(self, node: Node) -> Iterator[Node]: + for pt_node, _ in self.query.property_name.captures(node): + yield pt_node + + def get_processed_property_names(self, node: Node) -> Iterator[Tuple[Node, str]]: + """Generates captured property name nodes and their associated proper names (see process_property + for details), e.g.: [(node0, "StartInfo"), (node1, "RedirectStandardOutput")].""" + for pt_node in self.get_property_names(node): + pt_name = self.language_toolkit.process_property(pt_node, self.get_str(pt_node)) + if pt_name: + yield pt_node, pt_name + + def get_function_definitions(self, node: Optional[Node] = None) -> Iterator[Node]: + node = self.tree.root_node if node is None else node + for fd_node, _ in self.query.function_definition.captures(node): + yield fd_node + + def get_function_definition_name(self, node: Node) -> Node: + return node.child_by_field_name(self.query.function_definition_field_name) + + def get_function_definition_names(self, node: Node) -> Iterator[Node]: + for fd_node in self.get_function_definitions(node): + yield self.get_function_definition_name(fd_node) + + def get_function_call_names(self, node: Node) -> Iterator[Node]: + for fcn_node, _ in self.query.function_call_name.captures(node): + yield fcn_node + + def get_imported_constants(self, node: Node) -> Iterator[Node]: + for ic_node, _ in self.query.imported_constant_name.captures(node): + yield ic_node + + def 
get_processed_imported_constants(self, node: Node) -> Iterator[Tuple[Node, str]]: + """Generates captured imported constant nodes and their associated proper names (see process_imported_constant + for details), e.g.: [(node0, "ssl.CERT_NONE"), (node1, "win32con.FILE_ATTRIBUTE_HIDDEN")].""" + for ic_node in self.get_imported_constants(node): + ic_name = self.language_toolkit.process_imported_constant(ic_node, self.get_str(ic_node)) + if ic_name: + yield ic_node, ic_name + + def get_string_literals(self, node: Node) -> Iterator[Node]: + for str_node, _ in self.query.string_literal.captures(node): + yield str_node + + def get_integer_literals(self, node: Node) -> Iterator[Node]: + for int_node, _ in self.query.integer_literal.captures(node): + yield int_node + + def get_namespaces(self, node: Optional[Node] = None) -> List[Tuple[Node, str]]: + return self.query.namespace.captures(self.tree.root_node if node is None else node) + + def get_processed_namespaces(self, node: Optional[Node] = None) -> Iterator[BaseNamespace]: + for ns_node, query_name in self.get_namespaces(node): + for namespace in self.language_toolkit.process_namespace(ns_node, query_name, self.get_str): + yield namespace + + def get_global_statements(self) -> Iterator[Node]: + for node, _ in self.query.global_statement.captures(self.tree.root_node): + yield node + + def get_direct_method_call(self, node: Node) -> Optional[Node]: + captures = self.query.direct_method_call.captures(node) + if captures: + return captures[0][0] + return None + + +class TreeSitterTemplateEngine(TreeSitterBaseEngine): + query: TemplateQueryBinding + language_toolkit: LanguageToolkit + embedded_language: str + namespaces: set[BaseNamespace] + + def __init__(self, buf: bytes): + super().__init__(LANG_TEM, buf) + self.embedded_language = self.identify_language() + self.language_toolkit = LANGUAGE_TOOLKITS[self.embedded_language] + self.namespaces = set(self.get_namespaces()) + + def get_code_sections(self) -> Iterator[Node]: + 
for node, _ in self.query.code.captures(self.tree.root_node): + yield node + + def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: + for node in self.get_code_sections(): + # TODO: support JS + if self.embedded_language == LANG_CS: + yield TreeSitterExtractorEngine( + self.embedded_language, + self.get_byte_range(node), + node.start_byte, + self.namespaces, + ) + else: + raise ValueError(f"parsing of {self.embedded_language} is not supported") + + def get_content_sections(self) -> Iterator[Node]: + for node, _ in self.query.content.captures(self.tree.root_node): + yield node + + def identify_language(self) -> str: + for node in self.get_code_sections(): + if self.is_c_sharp(node): + return LANG_CS + try: + return capa.features.extractors.ts.autodetect.get_template_language_ts(self.get_byte_range(node)) + except: + continue + raise ValueError(f"failed to identify the template language") + + def get_imported_namespaces(self) -> Iterator[BaseNamespace]: + for node in self.get_code_sections(): + if self.is_aspx_import_directive(node): + namespace = self.get_aspx_namespace(node) + if namespace is not None: + yield namespace + + def get_namespaces(self) -> Iterator[BaseNamespace]: + yield from self.language_toolkit.get_default_namespaces(True) + yield from self.get_imported_namespaces() + + def is_c_sharp(self, node: Node) -> bool: + return bool( + re.match( + r'@ .*Page Language\s*=\s*"C#".*'.encode(), + self.get_byte_range(node), + re.IGNORECASE, + ) + ) + + def is_aspx_import_directive(self, node: Node) -> bool: + return bool( + re.match( + r"@\s*Import Namespace=".encode(), + self.get_byte_range(node), + re.IGNORECASE, + ) + ) + + def get_aspx_namespace(self, node: Node) -> Optional[BaseNamespace]: + match = re.search( + r'@\s*Import namespace="(.*?)"'.encode(), + self.get_byte_range(node), + re.IGNORECASE, + ) + return CSharpNamespace(match.group(1).decode("utf-8"), node) if match is not None else None + + +class 
TreeSitterHTMLEngine(TreeSitterBaseEngine): + query: HTMLQueryBinding + namespaces: set[BaseNamespace] + + def __init__(self, buf: bytes, namespaces: set[BaseNamespace] = set()): + super().__init__(LANG_HTML, buf) + self.namespaces = namespaces + + def get_scripts(self) -> Iterator[Node]: + for node, _ in self.query.script_element.captures(self.tree.root_node): + yield node + + def get_attributes(self, node: Node) -> Iterator[Node]: + for att_node, _ in self.query.attribute.captures(node): + yield att_node + + def get_identified_scripts(self) -> Iterator[Tuple[Node, str]]: + for node in self.get_scripts(): + for content_node in self.get_script_contents(node): + yield content_node, self.identify_language(node) + + def get_script_contents(self, node: Node) -> Iterator[Node]: + for sc_node, _ in self.query.script_content.captures(node): + yield sc_node + + def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: + for node, language in self.get_identified_scripts(): + # TODO: support JS + if language == LANG_CS: + yield TreeSitterExtractorEngine(language, self.get_byte_range(node), node.start_byte, self.namespaces) + + def identify_language(self, node: Node) -> str: + for att_node in self.get_attributes(node): + if self.is_server_side_c_sharp(att_node): + return LANG_CS + return LANG_JS + + def is_server_side_c_sharp(self, node: Node) -> bool: + return bool(re.findall(r'runat\s*=\s*"server"'.encode(), self.get_byte_range(node))) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py new file mode 100644 index 000000000..1447659d0 --- /dev/null +++ b/capa/features/extractors/ts/extractor.py @@ -0,0 +1,108 @@ +from typing import List, Tuple, Union, Iterator + +import capa.features.extractors.script +import capa.features.extractors.ts.file +import capa.features.extractors.ts.engine +import capa.features.extractors.ts.global_ +import capa.features.extractors.ts.function +import 
capa.features.extractors.ts.autodetect +from capa.exceptions import UnsupportedFormatError +from capa.features.common import Namespace +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress +from capa.features.extractors.script import LANG_TEM, LANG_HTML +from capa.features.extractors.ts.tools import BaseNamespace +from capa.features.extractors.ts.engine import TreeSitterHTMLEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine +from capa.features.extractors.ts.function import PSEUDO_MAIN, TSFunctionInner +from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor + + +class TreeSitterFeatureExtractor(FeatureExtractor): + engines: List[TreeSitterExtractorEngine] + template_engine: TreeSitterTemplateEngine + language: str + path: str + + def __init__(self, path: str): + super().__init__() + self.path = path + with open(self.path, "rb") as f: + buf = f.read() + + try: + self.language = capa.features.extractors.ts.autodetect.get_language(path) + self.template_engine = self.get_template_engine(buf) + self.engines = self.get_engines(buf) + except ValueError as e: + raise UnsupportedFormatError(e) + + def get_template_engine(self, buf: bytes): + if self.language == LANG_TEM: + return TreeSitterTemplateEngine(buf) + + def get_engines(self, buf: bytes) -> List[TreeSitterExtractorEngine]: + if self.language == LANG_TEM and self.template_engine: + return self.extract_code_from_template() + if self.language == LANG_HTML: + return self.extract_code_from_html(buf) + return [TreeSitterExtractorEngine(self.language, buf)] + + def extract_code_from_template(self) -> List[TreeSitterExtractorEngine]: + engines = list(self.template_engine.get_parsed_code_sections()) + for node in self.template_engine.get_content_sections(): + section_buf = self.template_engine.get_byte_range(node) + engines.extend(self.extract_code_from_html(section_buf, self.template_engine.namespaces)) + 
return engines + + def extract_code_from_html( + self, buf: bytes, namespaces: set[BaseNamespace] = set() + ) -> List[TreeSitterExtractorEngine]: + return list(TreeSitterHTMLEngine(buf, namespaces).get_parsed_code_sections()) + + def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: + return NO_ADDRESS + + def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: + for ns in self.template_engine.get_namespaces(): + address = NO_ADDRESS if ns.node is None else FileOffsetRangeAddress(ns.node.start_byte, ns.node.end_byte) + yield Namespace(ns.name), address + + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: + for engine in self.engines: + yield from capa.features.extractors.script.extract_language(engine.language, engine.get_default_address()) + yield from capa.features.extractors.ts.global_.extract_features() + + def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: + if self.language == LANG_TEM: + yield from self.extract_template_namespaces() + for engine in self.engines: + yield from capa.features.extractors.ts.file.extract_features(engine) + + def get_pseudo_main_function_inner(self, engine: TreeSitterExtractorEngine) -> TSFunctionInner: + return TSFunctionInner(engine.tree.root_node, PSEUDO_MAIN, engine) + + def get_pseudo_main_function(self, engine: TreeSitterExtractorEngine) -> FunctionHandle: + return FunctionHandle(engine.get_default_address(), self.get_pseudo_main_function_inner(engine)) + + def get_functions(self) -> Iterator[FunctionHandle]: + for engine in self.engines: + yield self.get_pseudo_main_function(engine) + for node in engine.get_function_definitions(): + name = engine.get_str(engine.get_function_definition_name(node)) + yield FunctionHandle(engine.get_address(node), TSFunctionInner(node, name, engine)) + + def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + yield from 
capa.features.extractors.ts.function.extract_features(f, f.inner.engine) + + def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: + yield from [] + + def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Feature, Address]]: + yield from [] + + def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: + yield from [] + + def extract_insn_features( + self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle + ) -> Iterator[Tuple[Feature, Address]]: + yield from [] diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py new file mode 100644 index 000000000..7aedef458 --- /dev/null +++ b/capa/features/extractors/ts/file.py @@ -0,0 +1,19 @@ +from typing import Tuple, Iterator + +from capa.features.common import Feature, Namespace +from capa.features.address import Address +from capa.features.extractors.ts.engine import TreeSitterExtractorEngine + + +def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for namespace in engine.get_processed_namespaces(): + yield Namespace(namespace.name), engine.get_address(namespace.node) + + +def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for file_handler in FILE_HANDLERS: + for feature, addr in file_handler(engine): + yield feature, addr + + +FILE_HANDLERS = (extract_namespaces,) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py new file mode 100644 index 000000000..854f116af --- /dev/null +++ b/capa/features/extractors/ts/function.py @@ -0,0 +1,179 @@ +import itertools +from typing import Tuple, Iterator +from dataclasses import dataclass + +from tree_sitter import Node + +from capa.features.insn import API, Number, Property +from capa.features.common import Class, String, Feature, Namespace +from capa.features.address import Address +from capa.features.extractors.ts.tools import 
BaseNamespace +from capa.features.extractors.ts.engine import TreeSitterExtractorEngine +from capa.features.extractors.base_extractor import FunctionHandle + +PSEUDO_MAIN = "PSEUDO MAIN" # all global statements in one function scope + + +@dataclass +class TSFunctionInner: + node: Node + name: str + engine: TreeSitterExtractorEngine + + +def is_pseudo_main_function(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> bool: + return ( + fh.address == engine.get_default_address() + and fh.inner.node == engine.tree.root_node + and fh.inner.name == PSEUDO_MAIN + ) + + +def extract_strings(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node in engine.get_string_literals(fn_node): + yield String(engine.language_toolkit.parse_string(engine.get_str(node))), engine.get_address(node) + + +def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node in engine.get_integer_literals(fn_node): + try: + yield Number(engine.language_toolkit.parse_integer(engine.get_str(node))), engine.get_address(node) + except ValueError: + continue + + +def get_possible_full_names(name: str, namespaces: set[BaseNamespace]) -> Iterator[str]: + yield name + for namespace in namespaces: + yield namespace.join(name) + + +def get_default_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: + for name_node in engine.get_new_object_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_class(full_name): + yield full_name + + +def get_custom_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: + for name_node in engine.get_function_call_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_constructor(full_name): + yield full_name + + +def get_classes(fn_node: 
Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: + yield from get_default_constructor(fn_node, engine) + yield from get_custom_constructor(fn_node, engine) + + +def _extract_default_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for name_node in engine.get_new_object_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_class(full_name): + yield Namespace(full_name), engine.get_address(name_node) + yield Class(engine.language_toolkit.format_imported_class(full_name)), engine.get_address(name_node) + yield API(engine.language_toolkit.format_imported_default_constructor(full_name)), engine.get_address( + name_node + ) + + +def _extract_custom_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for name_node in engine.get_function_call_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_constructor(full_name): + yield Namespace(full_name), engine.get_address(name_node) + yield Class(engine.language_toolkit.format_imported_class(full_name)), engine.get_address(name_node) + yield API(engine.language_toolkit.format_imported_custom_constructor(full_name)), engine.get_address( + name_node + ) + + +def _extract_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + yield from _extract_default_constructor(fn_node, engine) + yield from _extract_custom_constructor(fn_node, engine) + + +def _extract_constants(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for ic_node, ic_name in engine.get_processed_imported_constants(fn_node): + for full_name in get_possible_full_names(ic_name, engine.namespaces): + if engine.language_toolkit.is_imported_constant(full_name): + yield 
API(engine.language_toolkit.format_imported_constant(full_name)), engine.get_address(ic_node) + + +def _extract_properties( + fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine +) -> Iterator[Tuple[Feature, Address]]: + for pt_node, pt_name in engine.get_processed_property_names(fn_node): + for full_name in get_possible_full_names(pt_name, classes): + if engine.language_toolkit.is_imported_property(full_name): + yield Property(engine.language_toolkit.format_imported_property(full_name)), engine.get_address(pt_node) + + +def _extract_static_methods(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + if engine.language_toolkit.is_builtin(engine.get_str(node)): + yield API(engine.language_toolkit.get_builtin_name(engine.get_str(node))), engine.get_address(node) + for full_name in get_possible_full_names(engine.get_str(node), engine.namespaces): + if engine.language_toolkit.is_imported_function(full_name): + yield API(engine.language_toolkit.format_imported_function(full_name)), engine.get_address(node) + + +def _do_extract_instance_methods( + node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine +) -> Iterator[Tuple[Feature, Address]]: + for full_name in get_possible_full_names( + engine.language_toolkit.get_member_from_name(engine.get_str(node)), classes + ): + if engine.language_toolkit.is_imported_function(full_name): + yield API(engine.language_toolkit.format_imported_function(full_name)), engine.get_address(node) + + +def _extract_instance_methods( + node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine +) -> Iterator[Tuple[Feature, Address]]: + direct_method_call_node = engine.get_direct_method_call(node) # eg new Foo.Bar().direct_method_call(x, y, 3) + if direct_method_call_node: + yield from _do_extract_instance_methods(direct_method_call_node, classes, engine) + else: + yield from _do_extract_instance_methods(node, classes, engine) + + +def 
_extract_function_calls( + fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine +) -> Iterator[Tuple[Feature, Address]]: + for node in engine.get_function_call_names(fn_node): + yield from _extract_static_methods(node, engine) + yield from _extract_instance_methods(node, classes, engine) + + +def extract_imports(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + classes = {engine.language_toolkit.create_namespace(cls) for cls in get_classes(fn_node, engine)} + yield from _extract_classes(fn_node, engine) + yield from _extract_constants(fn_node, engine) + yield from _extract_properties(fn_node, classes, engine) + yield from _extract_function_calls(fn_node, classes, engine) + + +def _extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node in engine.get_global_statements(): + yield from _extract_features(node, engine) + + +def _extract_features(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for function_handler in FUNCTION_HANDLERS: + for feature, addr in function_handler(fn_node, engine): + yield feature, addr + + +def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + if is_pseudo_main_function(fh, engine): + yield from _extract_pseudo_main_features(engine) + else: + yield from _extract_features(fh.inner.node, engine) + + +FUNCTION_HANDLERS = ( + extract_imports, + extract_integers, + extract_strings, +) diff --git a/capa/features/extractors/ts/global_.py b/capa/features/extractors/ts/global_.py new file mode 100644 index 000000000..3ea55879d --- /dev/null +++ b/capa/features/extractors/ts/global_.py @@ -0,0 +1,26 @@ +from typing import Tuple, Iterator + +import capa.features.extractors.script +from capa.features.common import Feature +from capa.features.address import Address + + +def extract_arch() -> Iterator[Tuple[Feature, Address]]: + yield 
def extract_arch() -> Iterator[Tuple[Feature, Address]]:
    """Yield the arch features reported by ``capa.features.extractors.script.extract_arch``."""
    for feature, addr in capa.features.extractors.script.extract_arch():
        yield feature, addr


def extract_os() -> Iterator[Tuple[Feature, Address]]:
    """Yield the OS features reported by ``capa.features.extractors.script.extract_os``."""
    for feature, addr in capa.features.extractors.script.extract_os():
        yield feature, addr


def extract_file_format() -> Iterator[Tuple[Feature, Address]]:
    """Yield the format features reported by ``capa.features.extractors.script.extract_format``."""
    for feature, addr in capa.features.extractors.script.extract_format():
        yield feature, addr


def extract_features() -> Iterator[Tuple[Feature, Address]]:
    """Yield every feature produced by the handlers in ``GLOBAL_HANDLERS``, in handler order."""
    for handler in GLOBAL_HANDLERS:
        yield from handler()


# Handlers that take no arguments and describe the sample as a whole.
GLOBAL_HANDLERS = (extract_arch, extract_os, extract_file_format)
@dataclass
class QueryBinding:
    # tree-sitter language the queries below were compiled for
    language: Language


@dataclass
class ScriptQueryBinding(QueryBinding):
    new_object_name: Query
    function_definition: Query
    function_definition_field_name: str
    direct_method_call: Query
    function_call_name: Query
    property_name: Query
    imported_constant_name: Query
    string_literal: Query
    integer_literal: Query
    namespace: Query
    global_statement: Query  # except function definitions


@dataclass
class TemplateQueryBinding(QueryBinding):
    code: Query
    content: Query


@dataclass
class HTMLQueryBinding(QueryBinding):
    script_element: Query
    script_content: Query
    attribute: Query


def deserialize(language: str, binding: dict) -> dict:
    """Turn a textual binding description into keyword arguments for a binding dataclass.

    Every entry under ``"query"`` is compiled with the language registered in
    ``TS_LANGUAGES``; every entry under ``"field_name"`` is passed through
    unchanged under the key ``"<construct>_field_name"``.
    """
    compiled = {
        construct: TS_LANGUAGES[language].query(pattern)
        for construct, pattern in binding.get("query", {}).items()
    }
    compiled.update(
        (f"{construct}_field_name", field_name)
        for construct, field_name in binding.get("field_name", {}).items()
    )
    return compiled


def _make_binding(binding_cls, language: str, spec: dict) -> QueryBinding:
    """Instantiate ``binding_cls`` for ``language`` from its textual ``spec``."""
    return binding_cls(TS_LANGUAGES[language], **deserialize(language, spec))


# Instantiated only for its side effects.
# NOTE(review): presumably this builds the grammar library that Language() loads from build_dir - confirm.
capa.features.extractors.ts.build.TSBuilder()

TS_LANGUAGES: dict[str, Language] = {
    lang: Language(capa.features.extractors.ts.build.build_dir, lang)
    for lang in (LANG_CS, LANG_PY, LANG_TEM, LANG_HTML, LANG_JS)
}

BINDINGS: dict[str, QueryBinding] = {
    LANG_CS: _make_binding(
        ScriptQueryBinding,
        LANG_CS,
        {
            "query": {
                "new_object_name": "(object_creation_expression type: [(qualified_name) @new-object (identifier) @new-object])",
                "function_definition": "(local_function_statement) @function-definition",
                "function_call_name": "(invocation_expression function: [(member_access_expression name: (identifier)) @function-call (identifier) @function-call])",
                "property_name": "(member_access_expression) @property",
                "imported_constant_name": "(member_access_expression) @constant (equals_value_clause (identifier) @constant)",
                "string_literal": "(string_literal) @string-literal",
                "integer_literal": "(integer_literal) @integer-literal",
                "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])",
                "global_statement": "(global_statement [(if_statement) @global-statement (expression_statement) @global-statement (local_declaration_statement) @global-statement])",
                "direct_method_call": "(member_access_expression expression: (object_creation_expression) name: (identifier) @direct-method-call)",
            },
            "field_name": {
                "function_definition": "name",
            },
        },
    ),
    LANG_PY: _make_binding(
        ScriptQueryBinding,
        LANG_PY,
        {
            "query": {
                # Python makes no distinction between new object creation and a function call
                "new_object_name": "(call function: [(attribute) @new-object (identifier) @new-object])",
                "function_definition": "(function_definition) @function-definition",
                "function_call_name": "(call function: [(attribute) @function-call (identifier) @function-call])",
                "property_name": "(attribute) @property",
                "imported_constant_name": "(attribute) @constant (expression_statement (assignment right: (identifier) @constant))",
                "string_literal": "(string) @string-literal",
                "integer_literal": "(integer) @integer-literal",
                "namespace": "(import_from_statement) @import_from (import_statement) @import",
                "global_statement": "(module [(if_statement) @global-statement (expression_statement) @global-statement])",
                "direct_method_call": "(attribute object: (call) attribute: (identifier) @direct-method-call)",
            },
            "field_name": {
                "function_definition": "name",
            },
        },
    ),
    LANG_TEM: _make_binding(
        TemplateQueryBinding,
        LANG_TEM,
        {
            "query": {
                "code": "(code) @code",
                "content": "(content) @content",
            },
        },
    ),
    LANG_HTML: _make_binding(
        HTMLQueryBinding,
        LANG_HTML,
        {
            "query": {
                "script_element": "(script_element) @script-element",
                "attribute": "(attribute) @attribute",
                "script_content": "(raw_text) @script-content",
            },
        },
    ),
}
"System.Security.Cryptography.CryptoStream", + "System.Security.Cryptography.Rijndael", + "System.Security.Cryptography.RijndaelManaged", + "System.Security.Cryptography.RSACryptoServiceProvider", + "System.Security.Cryptography.SHA1", + "System.Security.Cryptography.SHA1CryptoServiceProvider", + "System.Security.Cryptography.SHA256", + "System.Security.Cryptography.SHA256CryptoServiceProvider" + ], + "constructors" : [ + "System.Security.Cryptography.Rijndael.Create" + ], + "functions": + [ + "System.Convert.ToBase64String", + "System.Convert.FromBase64String", + "System.Data.SqlClient.SqlCommand.ExecuteReader", + "System.Data.SqlClient.SqlConnection.Open", + "System.Diagnostics.Process.Start", + "System.IO.Directory.CreateDirectory", + "System.IO.File.Delete", + "System.IO.File.Write", + "System.IO.File.GetAttributes", + "System.IO.File.GetCreationTime", + "System.IO.File.GetLastAccessTime", + "System.IO.File.GetLastWriteTime", + "System.IO.File.ReadAllBytes", + "System.IO.File.ReadAllBytes", + "System.IO.File.ReadAllBytesAsync", + "System.IO.File.ReadAllLines", + "System.IO.File.ReadAllLinesAsync", + "System.IO.File.ReadAllText", + "System.IO.File.ReadAllTextAsync", + "System.IO.File.ReadLines", + "System.IO.File.ReadLinesAsync", + "System.IO.File.SetCreationTime", + "System.IO.File.SetLastAccessTime", + "System.IO.File.SetLastWriteTime", + "System.IO.File.WriteAllBytes", + "System.IO.File.WriteAllBytes", + "System.IO.File.WriteAllBytesAsync", + "System.IO.File.WriteAllLines", + "System.IO.File.WriteAllLinesAsync", + "System.IO.File.WriteAllText", + "System.IO.File.WriteAllTextAsync", + "System.IO.File.WriteLines", + "System.IO.File.WriteLinesAsync", + "System.IO.Path.GetTempPath", + "System.Security.Cryptography.RijndaelManaged.CreateDecryptor", + "System.Security.Cryptography.RijndaelManaged.CreateEncryptor", + "System.Security.Cryptography.RSACryptoServiceProvider.Encrypt", + "System.Security.Cryptography.SHA1CryptoServiceProvider.ComputeHash", + 
"System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" + ], + "properties": [ + "System.Diagnostics.Process.StartInfo.FileName", + "System.Diagnostics.Process.StartInfo.Arguments", + "System.Diagnostics.Process.StartInfo.RedirectStandardInput", + "System.Diagnostics.Process.StartInfo.RedirectStandardOutput", + "System.Diagnostics.Process.StartInfo.UseShellExecute", + "System.Diagnostics.Process.StartInfo.CreateNoWindow", + "System.Diagnostics.ProcessStartInfo.FileName", + "System.Diagnostics.ProcessStartInfo.Arguments", + "System.Diagnostics.ProcessStartInfo.RedirectStandardInput", + "System.Diagnostics.ProcessStartInfo.RedirectStandardOutput", + "System.Diagnostics.ProcessStartInfo.UseShellExecute", + "System.Diagnostics.ProcessStartInfo.CreateNoWindow" + ], + "constants": [], + "builtins": [], + "aspx_default_namespaces": + [ + "System", + "System.Collections", + "System.Collections.Specialized", + "System.Configuration", + "System.Text", + "System.Text.RegularExpressions", + "System.Web", + "System.Web.Caching", + "System.Web.Profile", + "System.Web.Security", + "System.Web.SessionState", + "System.Web.UI", + "System.Web.UI.HtmlControls", + "System.Web.UI.WebControls", + "System.Web.UI.WebControls.WebParts" + ] +} \ No newline at end of file diff --git a/capa/features/extractors/ts/signatures/py.json b/capa/features/extractors/ts/signatures/py.json new file mode 100644 index 000000000..667ffe1db --- /dev/null +++ b/capa/features/extractors/ts/signatures/py.json @@ -0,0 +1,47 @@ +{ + "classes": [ + "socket.socket", + "socket.error", + "urllib2.Request" + ], + "constructors": [ + "ssl.wrap_socket", + "win32com.client.Dispatch" + ], + "functions": [ + "subprocess.Popen", + "subprocess.PIPE", + "urllib2.urlopen", + "base64.encodestring", + "base64.b64encode", + "base64.b64decode", + "os.chdir", + "os.chmod", + "os.getcwd", + "os.popen", + "os.remove", + "os.path.expanduser", + "os.path.dirname", + "platform.mac_ver", + "shutil.copytree", + 
@dataclass(frozen=True)
class BaseNamespace(abc.ABC):
    """Abstract class for internal representation of the namespace concept, including aliases."""

    name: str
    node: Optional[Node] = None
    alias: str = ""

    def __hash__(self):
        # Hash on the name only; the dataclass-generated equality still compares every field.
        return hash(self.name)

    def join(self, name: str) -> str:
        """Combine this namespace with ``name``; implemented by the language-specific subclasses."""
        raise NotImplementedError()


class CSharpNamespace(BaseNamespace):
    def join(self, name: str) -> str:
        """using System; Diagnostics.ProcessStartInfo => System.Diagnostics.ProcessStartInfo"""
        return LANGUAGE_TOOLKITS[LANG_CS].join_names(self.name, name)


class PythonImport(BaseNamespace):
    def join(self, name: str) -> str:
        """Qualify ``name`` with the module part of this import.

        ``import subprocess``:          "subprocess.Popen" => "subprocess.Popen"
        ``from threading import Timer``: "Timer"            => "threading.Timer"
        """
        # Use the Python toolkit here (the original fetched the C# one by mistake).
        toolkit = LANGUAGE_TOOLKITS[LANG_PY]
        qualified_names = toolkit.split_name(self.name)
        if len(qualified_names) < 2:
            return name
        return toolkit.join_names(*(qualified_names[:-1] + [name]))


class LanguageToolkit(abc.ABC):
    """Language-specific helpers for resolving names and parsing literals.

    Subclasses provide the class-level configuration below and implement the
    three abstract namespace methods.
    """

    signature_file: str
    import_signatures: Dict[str, set[str]]
    method_call_query_type: str
    property_query_type: str
    string_delimiters: str
    integer_prefixes: List[
        Tuple[Union[str, Tuple[str, ...]], int]
    ]  # Tends to indicate a number system, e.g. (("0x", "0X"), 16)
    integer_suffixes: Tuple[str, ...]  # Tends to indicate unsigned (100u) or long (100l) integer literal

    def __init__(self):
        self.import_signatures = self.load_import_signatures(self.signature_file)

    def load_import_signatures(self, signature_file: str) -> Dict[str, set[str]]:
        """Load a JSON signature file from the signatures package as ``{category: set(names)}``."""
        signatures = json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file))
        return {category: set(names) for category, names in signatures.items()}

    def get_full_name(self, name: str, namespace: Optional[BaseNamespace] = None) -> str:
        """Resolve ``name`` against ``namespace``.

        With an aliased namespace only the *leading* component of ``name`` is
        expanded (``import socket as s``: "s.socket" => "socket.socket").
        A plain ``str.replace`` would also rewrite later occurrences of the
        alias text and corrupt the name. Names that do not start with the
        alias are returned unchanged.
        """
        if not namespace:
            return name
        if namespace.alias:
            head, *rest = self.split_name(name)
            if head == namespace.alias:
                return self.join_names(namespace.name, *rest)
            return name
        return namespace.join(name)

    def _is_imported(self, category: str, name: str, namespace: Optional[BaseNamespace]) -> bool:
        """Tell whether the fully-qualified ``name`` is listed under ``category`` in the signatures."""
        return self.get_full_name(name, namespace) in self.import_signatures[category]

    def is_imported_function(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool:
        return self._is_imported("functions", name, namespace)

    def is_imported_class(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool:
        return self._is_imported("classes", name, namespace)

    def is_imported_constructor(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool:
        return self._is_imported("constructors", name, namespace)

    def is_imported_property(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool:
        return self._is_imported("properties", name, namespace)

    def is_imported_constant(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool:
        return self._is_imported("constants", name, namespace)

    def is_builtin(self, func: str) -> bool:
        return func in self.import_signatures["builtins"]

    def get_builtin_name(self, func: str) -> str:
        return self.join_names("builtins", func)

    def join_names(self, *args: str) -> str:
        return ".".join(args)

    def split_name(self, name: str) -> List[str]:
        return name.split(".")

    def process_property(self, node: Node, name: str) -> str:
        """Return ``name`` without its first component, or "" when the node must be skipped."""
        if self.is_method_call(node):  # yield only p.StartInfo but not p.Start()
            return ""
        if self.is_recursive_property(node):  # yield only Current.Server.ClearError but not Current.Server and Current
            return ""
        return self.join_names(*self.split_name(name)[1:])

    def process_imported_constant(self, node: Node, name: str) -> Optional[str]:
        """Return ``name`` unchanged, or None when the node must be skipped."""
        if self.is_method_call(node):  # yield only ssl.CERT_NONE and not ssl.wrap_socket()
            return None
        if self.is_recursive_property(node):  # yield foo.foo.bar and not foo.bar or bar
            return None
        return name

    def get_namespace_from_name(self, name: str) -> str:
        """Return everything but the last component of ``name`` ("" for an unqualified name)."""
        qualified_names = self.split_name(name)
        if len(qualified_names) < 2:
            return ""
        return self.join_names(*qualified_names[:-1])

    def get_member_from_name(self, name: str) -> str:
        """Return everything but the first component of ``name`` (the name itself if unqualified)."""
        qualified_names = self.split_name(name)
        if len(qualified_names) < 2:
            return qualified_names[0]
        return self.join_names(*qualified_names[1:])

    def format_imported_class(self, name: str) -> str:
        return name

    def format_imported_class_members(self, name: str) -> str:
        """Format ``a.b.Class.member`` as ``a.b.Class::member``.

        Raises:
            ValueError: if ``name`` has no class or namespace component.
        """
        qualified_names = self.split_name(name)
        if len(qualified_names) < 2:
            raise ValueError(f"{name} does not have an associated class or namespace")
        if len(qualified_names) == 2:
            classname, membername = qualified_names[0], qualified_names[1]
            return f"{classname}::{membername}"
        namespace, classname, membername = qualified_names[:-2], qualified_names[-2], qualified_names[-1]
        return f"{'.'.join(namespace)}.{classname}::{membername}"

    def format_imported_function(self, name: str) -> str:
        return self.format_imported_class_members(name)

    def format_imported_custom_constructor(self, name: str) -> str:
        return self.format_imported_class_members(name)

    def format_imported_default_constructor(self, name: str) -> str:
        return self.format_imported_function(self.join_names(name, "ctor"))

    def format_imported_property(self, name: str) -> str:
        return self.format_imported_class_members(name)

    def format_imported_constant(self, name: str) -> str:
        return self.format_imported_class_members(name)

    def parse_integer(self, integer: str) -> int:
        """Convert the source text of an integer literal into an ``int``.

        Type suffixes are matched case-insensitively and may be combined
        (e.g. "10UL"), so they are stripped repeatedly, each by its own length.
        The literal is never stripped down to an empty string.

        Raises:
            ValueError: if the remaining text is not a valid integer.
        """
        literal = integer
        suffixes = tuple(suffix.lower() for suffix in self.integer_suffixes if suffix)
        while suffixes:
            lowered = literal.lower()
            matched = next((s for s in suffixes if len(literal) > len(s) and lowered.endswith(s)), None)
            if matched is None:
                break
            literal = literal[: -len(matched)]
        for prefixes, base in self.integer_prefixes:
            # str.startswith accepts a single prefix or a tuple of prefixes
            if literal.startswith(prefixes):
                return int(literal, base)
        return int(literal)

    def parse_string(self, string: str) -> str:
        return string.strip(self.string_delimiters)

    def is_method_call(self, node: Node) -> bool:
        # a node without a parent cannot be the callee of a call
        return node.parent is not None and node.parent.type == self.method_call_query_type

    def is_recursive_property(self, node: Node) -> bool:
        return node.parent is not None and node.parent.type == self.property_query_type

    @abc.abstractmethod
    def create_namespace(self, name: str) -> BaseNamespace:
        raise NotImplementedError()

    @abc.abstractmethod
    def process_namespace(self, node: Node, query_name: str, get_str: Callable) -> Iterator[BaseNamespace]:
        raise NotImplementedError()

    @abc.abstractmethod
    def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]:
        raise NotImplementedError()


class CSharpToolkit(LanguageToolkit):
    signature_file: str = "cs.json"
    method_call_query_type: str = "invocation_expression"
    property_query_type: str = "member_access_expression"
    string_delimiters: str = '"'
    integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [
        (("0b", "0B"), 2),  # binary literals, C# 7.0+
        (("0x", "0X"), 16),
    ]
    integer_suffixes: Tuple[str, ...] = ("u", "l")  # matched case-insensitively, combinations allowed

    def create_namespace(self, name: str) -> BaseNamespace:
        return CSharpNamespace(name)

    def process_namespace(self, node: Node, query_name: str, get_str: Callable) -> Iterator[BaseNamespace]:
        yield CSharpNamespace(get_str(node), node, "")

    def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]:
        """Return the ASPX default namespaces for embedded code, otherwise an empty set."""
        if embedded:
            return {CSharpNamespace(name) for name in self.import_signatures["aspx_default_namespaces"]}
        return set()


class PythonToolkit(LanguageToolkit):
    signature_file: str = "py.json"
    method_call_query_type: str = "call"
    property_query_type: str = "attribute"
    string_delimiters: str = "\"'"
    # Each prefix group must be a tuple of separate strings. The original
    # ("0b, 0B") was one string, so binary and octal literals never matched.
    integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [
        (("0b", "0B"), 2),
        (("0o", "0O"), 8),
        (("0x", "0X"), 16),
    ]
    integer_suffixes: Tuple[str, ...] = tuple()

    def create_namespace(self, name: str) -> BaseNamespace:
        return PythonImport(name)

    def get_import_name(self, name: str, module_name: Optional[str] = None) -> str:
        return self.join_names(module_name, name) if module_name else name

    def process_simple_import(self, node: Node, get_str: Callable, module_name: Optional[str] = None) -> BaseNamespace:
        return PythonImport(self.get_import_name(get_str(node), module_name), node)

    def process_aliased_import(self, node: Node, get_str: Callable, module_name: Optional[str] = None) -> BaseNamespace:
        name = self.get_import_name(get_str(node.get_child_by_field_name("name")), module_name)
        alias = get_str(node.get_child_by_field_name("alias"))
        return PythonImport(name, node, alias)

    def process_imports(
        self, nodes: List[Node], get_str: Callable, module_name: Optional[str] = None
    ) -> Iterator[BaseNamespace]:
        """Yield one ``PythonImport`` per plain or aliased import node; other node types are ignored."""
        for import_node in nodes:
            if import_node.type == "dotted_name":
                yield self.process_simple_import(import_node, get_str, module_name)
            elif import_node.type == "aliased_import":
                yield self.process_aliased_import(import_node, get_str, module_name)

    def get_wildcard_import(self, node: Node) -> Optional[Node]:
        for child_node in node.children:
            if child_node.type == "wildcard_import":
                return child_node
        return None

    def process_import_from(self, node: Node, import_nodes: List[Node], get_str: Callable) -> Iterator[BaseNamespace]:
        # the first named child is the module, the remaining ones are the imported names
        module_name, import_nodes = get_str(import_nodes[0]), import_nodes[1:]
        wildcard_import = self.get_wildcard_import(node)
        if wildcard_import:
            yield self.process_simple_import(wildcard_import, get_str, module_name)
        else:
            yield from self.process_imports(import_nodes, get_str, module_name)

    def process_namespace(self, node: Node, query_name: str, get_str: Callable) -> Iterator[BaseNamespace]:
        import_nodes = [child_node for child_node in node.children if child_node.is_named]
        if query_name == "import_from":
            yield from self.process_import_from(node, import_nodes, get_str)
        elif query_name == "import":
            yield from self.process_imports(import_nodes, get_str)

    def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]:
        return set()


LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit(), LANG_PY: PythonToolkit()}
cls(type=AddressType.DN_TOKEN, value=a.token.value) @@ -88,6 +92,10 @@ def to_capa(self) -> capa.features.address.Address: elif self.type is AddressType.FILE: return capa.features.address.FileOffsetAddress(self.value) + elif self.type is AddressType.FILE_RANGE: + start_byte, end_byte = self.value + return capa.features.address.FileOffsetRangeAddress(start_byte, end_byte) + elif self.type is AddressType.DN_TOKEN: return capa.features.address.DNTokenAddress(dncil.clr.token.Token(self.value)) diff --git a/capa/features/freeze/features.py b/capa/features/freeze/features.py index 8f8665ca5..17855987b 100644 --- a/capa/features/freeze/features.py +++ b/capa/features/freeze/features.py @@ -24,6 +24,9 @@ def to_capa(self) -> capa.features.common.Feature: elif isinstance(self, FormatFeature): return capa.features.common.Format(self.format, description=self.description) + elif isinstance(self, ScriptLanguageFeature): + return capa.features.common.ScriptLanguage(self.language, description=self.description) + elif isinstance(self, MatchFeature): return capa.features.common.MatchedRule(self.match, description=self.description) @@ -66,6 +69,9 @@ def to_capa(self) -> capa.features.common.Feature: elif isinstance(self, APIFeature): return capa.features.insn.API(self.api, description=self.description) + elif isinstance(self, PropertyFeature): + return capa.features.insn.Property(self.property, description=self.description) + elif isinstance(self, NumberFeature): return capa.features.insn.Number(self.number, description=self.description) @@ -106,6 +112,9 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature": elif isinstance(f, capa.features.common.Format): return FormatFeature(format=f.value, description=f.description) + elif isinstance(f, capa.features.common.ScriptLanguage): + return ScriptLanguageFeature(language=f.value, description=f.description) + elif isinstance(f, capa.features.common.MatchedRule): return MatchFeature(match=f.value, 
# capa/features/freeze/features.py

# Serialized (freeze) form of capa.features.common.ScriptLanguage; `language`
# carries the feature's value.
class ScriptLanguageFeature(FeatureModel):
    type: str = "script language"
    language: str
    description: Optional[str]


# Serialized (freeze) form of capa.features.insn.Property; `property` carries
# the feature's value.
class PropertyFeature(FeatureModel):
    type: str = "property"
    property: str
    description: Optional[str]


# capa/features/insn.py

# Feature whose value is a property name; usable in rules through the
# "property" key at instruction scope.
class Property(Feature):
    def __init__(self, name: str, description=None):
        super().__init__(name, description=description)
def is_script_format(format_: str):
    """
    Tell whether format_ is the script format; a recognized script format counts as supported.
    """
    is_script = format_ == FORMAT_SCRIPT
    return is_script


def get_script_arch() -> str:
    """Return the value of the first arch feature reported for scripts, or "unknown" if there is none."""
    first = next(iter(capa.features.extractors.script.extract_arch()), None)
    if first is None:
        return "unknown"

    feature, _ = first
    assert isinstance(feature.value, str)
    return feature.value


def get_script_os() -> str:
    """Return the value of the first OS feature reported for scripts, or "unknown" if there is none."""
    first = next(iter(capa.features.extractors.script.extract_os()), None)
    if first is None:
        return "unknown"

    feature, _ = first
    assert isinstance(feature.value, str)
    return feature.value
capa.features.common.MatchedRule, @@ -121,6 +122,7 @@ class Scope(str, Enum): }, INSTRUCTION_SCOPE: { capa.features.common.MatchedRule, + capa.features.insn.Property, capa.features.insn.API, capa.features.insn.Number, capa.features.common.String, @@ -254,6 +256,8 @@ def parse_feature(key: str): # keep this in sync with supported features if key == "api": return capa.features.insn.API + if key == "property": + return capa.features.insn.Property elif key == "string": return capa.features.common.StringFactory elif key == "substring": @@ -280,6 +284,8 @@ def parse_feature(key: str): return capa.features.common.MatchedRule elif key == "function-name": return capa.features.file.FunctionName + elif key == "language": + return capa.features.common.ScriptLanguage elif key == "os": return capa.features.common.OS elif key == "format": diff --git a/setup.py b/setup.py index 560533a8d..2c531fb38 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "dnfile==0.11.0", "dncil==1.0.0", "pydantic==1.9.1", + "tree_sitter==0.20.0", ] # this sets __version__ diff --git a/tests/data b/tests/data index 2e8257475..f032303b5 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 2e8257475ebfdc8808d7e180be9a3f94977fcf57 +Subproject commit f032303b50d0a4225fc436d35f0d8b215751f9aa diff --git a/tests/fixtures.py b/tests/fixtures.py index 88a63de19..6ba493dfe 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -10,10 +10,9 @@ import os import os.path import binascii -import itertools import contextlib import collections -from typing import Set, Dict +from typing import Set, Dict, Tuple, Union, Iterator from functools import lru_cache import pytest @@ -38,12 +37,17 @@ Feature, ) from capa.features.address import Address +from capa.features.extractors.script import LANG_CS, LANG_PY from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = 
# locations of the script samples inside the test data checkout
SOURCE_DIR = os.path.join(CD, "data", "source")
ASPX_DIR = os.path.join(SOURCE_DIR, "aspx")
CS_DIR = os.path.join(SOURCE_DIR, "cs")
PY_DIR = os.path.join(SOURCE_DIR, "py")


@lru_cache(maxsize=1)
def get_ts_extractor_engine(language, buf):
    """Build (and cache the most recent) tree-sitter extractor engine for ``language`` and ``buf``."""
    # imported lazily so the tree-sitter machinery is only loaded by tests that need it
    from capa.features.extractors.ts.engine import TreeSitterExtractorEngine

    return TreeSitterExtractorEngine(language, buf)


@lru_cache(maxsize=1)
def get_ts_template_engine(path):
    """Build (and cache the most recent) tree-sitter template engine from the bytes of ``path``."""
    from capa.features.extractors.ts.engine import TreeSitterTemplateEngine

    with open(path, "rb") as sample:
        content = sample.read()
    return TreeSitterTemplateEngine(content)


@lru_cache(maxsize=1)
def get_ts_extractor(path):
    """Build (and cache the most recent) tree-sitter feature extractor for the sample at ``path``."""
    from capa.features.extractors.ts.extractor import TreeSitterFeatureExtractor

    return TreeSitterFeatureExtractor(path)


# SHA-256 names of the ASPX samples; each fixture key is "aspx_" plus the first six hex digits.
_ASPX_SAMPLE_HASHES = (
    "4f6fa6a45017397c7e1c9cd5a17235ccb1ff0f5087dfa6b7384552bf507e7fe1",
    "5f959f480a66a33d37d9a0ef6c8f7d0059625ca2a8ae9236b49b194733622655",
    "10162feb5f063ea09c6a3d275f31abf0fe8a9e4e36fded0053b1f8e054da8161",
    "2b71dd245520d9eb5f1e4c633fee61c7d83687591d9f64f9390c26dc95057c3c",
    "f2bf20e7bb482d27da8f19aa0f8bd4927746a65300929b99166867074a38a4b4",
    "f39dc0dfd43477d65c1380a7cff89296ad72bfa7fc3afcfd8e294f195632030e",
    "ea2a01cae57c00df01bff6bb8a72585fdc0abb7a26a869dc1a0131bdff50b400",
    "6f3261eaaabf369bd928d179641b73ffd768184dfd4e00124da462a3075d4239",
    "1f8f4054932ed1d5d055e9a92aa1e2abba49af3370506674cb1b2c70146ae81a",
    "2e8c7eacd739ca3f3dc4112b41a024157035096b8d0c26ba79d8b893136391bc",
    "03bb5cab46b406bb8613ca6e32991ab3e10b5cd759d5c7813191e9e62868ea73",
    "606dbfebdc7751ecb6cb9a845853ae1905afd4b8a2cb54e1e4a98c932e268712",
    "f397cb676353873cdc8fcfbf0e3a317334353cc63946099e5ea22db6d1eebfb8",
    "b4bb14aeb692f7afc107ee89f86d096f1cd8f9761b6c50788f626a9dccc8b077",
    "54433dd57414773098a6d3292d262f91a6812855dfcbf8d421695608d1fad638",
    "a35878e74425cd97ad98e3ec4b2583867bb536f4275d821cd8b82bc19380ba1a",
    "a5c8934836f5b36bba3a722eab691a9f1f926c138fefe5bae07e9074e7c49ae3",
    "15eed42e4904205b2ef2ff285ff1ce6c8138296c12cf075a2562c69a5fafd1cb",
    "b75f163ca9b9240bf4b37ad92bc7556b40a17e27c2b8ed5c8991385fe07d17d0",
    "d460cae7d34c51059ef57c5aadb3de099469efbac5fffcf76d0528a511192a28",
)

ASPX_DATA_PATH_BY_NAME = {
    "aspx_" + sha256[:6]: os.path.join(ASPX_DIR, sha256 + ".aspx_") for sha256 in _ASPX_SAMPLE_HASHES
}
def get_function_id_ts(scope):
    """
    extract the function identifier from a scope fixture string.

    the scope looks like `function=<id>`, where `<id>` is either a function name
    (`function=foo_bar`) or a parenthesized range (`function=(0xbeef, 0xdead)`
    or `function=(123, 456)`).

    args:
      scope: the scope fixture string.

    returns:
      a tuple of ints for the range form, otherwise the identifier as a str.

    raises:
      ValueError: when nothing follows the `=`, or a range element is not an integer.
    """
    fid = scope.partition("=")[2]
    if not fid:
        # indexing `fid[0]` on an empty id used to surface as an opaque IndexError
        raise ValueError(f"missing function id in scope fixture: {scope}")

    if fid.startswith("(") and fid.endswith(")"):
        # elements prefixed with `0x` are parsed as hex, everything else as decimal;
        # int() itself tolerates the whitespace that follows each comma.
        return tuple(int(x, 16) if x.lstrip().startswith("0x") else int(x) for x in fid[1:-1].split(","))
    return fid
lambda extractor: extract_global_features(extractor) + elif scope == "file": + + def inner_fn(extractor): + features = extract_file_features(extractor) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features + + elif scope.startswith("function"): + # like `function=(0xbeef, 0xdead) or function=(123, 456) or function=foo_bar` + def inner_fn(extractor): + fid = get_function_id_ts(scope) + fhs = list(get_function_ts(extractor, fid)) + if not fhs: + raise ValueError("function not found") + features = collections.defaultdict(set) + for fh in fhs: + for k, vs in extract_function_features(extractor, fh).items(): + # print(f"{k}:{vs}") + features[k].update(vs) + for k, vs in extract_file_features(extractor).items(): + features[k].update(vs) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features + + else: + raise ValueError("unexpected scope fixture") + inner_fn.__name__ = scope + return inner_fn + + +@pytest.fixture +def scope_ts(request): + return resolve_scope_ts(request.param) + + def make_test_id(values): return "-".join(map(str, values)) @@ -904,3 +1040,120 @@ def _1c444_dotnetfile_extractor(): @pytest.fixture def _692f_dotnetfile_extractor(): return get_dnfile_extractor(get_data_path_by_name("_692f")) + + +@pytest.fixture +def cs_138cdc_extractor_engine(): + with open(get_data_path_by_name("cs_138cdc"), "rb") as f: + buf = f.read() + return get_ts_extractor_engine(LANG_CS, buf) + + +@pytest.fixture +def aspx_4f6fa6_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_4f6fa6"]) + + +@pytest.fixture +def aspx_5f959f_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_5f959f"]) + + +@pytest.fixture +def aspx_10162f_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_10162f"]) + + +@pytest.fixture +def aspx_2b71dd_template_engine(): + return 
get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_2b71dd"]) + + +@pytest.fixture +def aspx_f2bf20_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_f2bf20"]) + + +@pytest.fixture +def aspx_f39dc0_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_f39dc0"]) + + +@pytest.fixture +def aspx_ea2a01_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_ea2a01"]) + + +@pytest.fixture +def aspx_6f3261_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_6f3261"]) + + +@pytest.fixture +def aspx_1f8f40_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_1f8f40"]) + + +@pytest.fixture +def aspx_2e8c7e_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_2e8c7e"]) + + +@pytest.fixture +def aspx_03bb5c_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_03bb5c"]) + + +@pytest.fixture +def aspx_606dbf_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_606dbf"]) + + +@pytest.fixture +def aspx_f397cb_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_f397cb"]) + + +@pytest.fixture +def aspx_b4bb14_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_b4bb14"]) + + +@pytest.fixture +def aspx_54433d_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_54433d"]) + + +@pytest.fixture +def aspx_a35878_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_a35878"]) + + +@pytest.fixture +def aspx_a5c893_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_a5c893"]) + + +@pytest.fixture +def aspx_15eed4_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_15eed4"]) + + +@pytest.fixture +def aspx_b75f16_template_engine(): + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_b75f16"]) + + 
@pytest.fixture
def py_7f9cd1_template_engine():
    # despite the fixture name, this builds an extractor engine for a python sample.
    # the engine takes the source bytes (see `cs_138cdc_extractor_engine`), not a path,
    # so read the sample before handing it over.
    with open(PY_DATA_PATH_BY_NAME["py_7f9cd1"], "rb") as f:
        buf = f.read()
    return get_ts_extractor_engine(LANG_PY, buf)


@pytest.fixture
def py_ca0df6_template_engine():
    # despite the fixture name, this builds an extractor engine for a python sample.
    # the engine takes the source bytes (see `cs_138cdc_extractor_engine`), not a path,
    # so read the sample before handing it over.
    with open(PY_DATA_PATH_BY_NAME["py_ca0df6"], "rb") as f:
        buf = f.read()
    return get_ts_extractor_engine(LANG_PY, buf)
node: Node): + assert isinstance(engine.get_address(node), FileOffsetRangeAddress) + addr = engine.get_address(node) + assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + + +def do_test_ts_base_engine_get_default_address(engine: TreeSitterBaseEngine): + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + addr1 = engine.get_address(engine.tree.root_node) + addr2 = engine.get_default_address() + assert addr1.start_byte == addr2.start_byte and addr1.end_byte == addr2.end_byte + + +def do_test_ts_extractor_engine_init(engine: TreeSitterExtractorEngine, expected_language: str): + assert engine.language == expected_language + assert isinstance(engine.query, QueryBinding) + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + assert isinstance(engine.buf_offset, int) and engine.buf_offset >= 0 + addr = engine.get_default_address() + assert ( + addr.start_byte == engine.tree.root_node.start_byte + engine.buf_offset + and addr.end_byte == engine.tree.root_node.end_byte + engine.buf_offset + ) + + +def do_test_ts_extractor_engine_get_address( + engine: TreeSitterExtractorEngine, node: Node, expected_range: str, startswith: bool = False +): + assert engine.get_str(node).startswith(expected_range) if startswith else engine.get_str(node) == expected_range + + +def do_test_ts_extractor_engine_get_new_objects( + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] +): + assert len(list(engine.get_new_object_names(root_node))) == len(expected) + for node, (_, expected_name_range) in zip(engine.get_new_object_names(root_node), expected): + assert isinstance(node, Node) + do_test_ts_base_engine_get_str(engine, node, expected_name_range) + do_test_ts_base_engine_get_address(engine, node) + + +def do_test_ts_extractor_engine_get_function_definitions( + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] +): + assert 
list(engine.get_function_definitions(engine.tree.root_node)) == list(engine.get_function_definitions()) + assert len(list(engine.get_function_definitions(root_node))) == len(expected) + for node, (expected_range, expected_name_range) in zip(engine.get_function_definitions(root_node), expected): + assert isinstance(node, Node) + do_test_ts_base_engine_get_str(engine, node, expected_range, startswith=True) + do_test_ts_base_engine_get_address(engine, node) + do_test_ts_base_engine_get_str(engine, engine.get_function_definition_name(node), expected_name_range) + + assert len(list(engine.get_function_definition_names(root_node))) == len(expected) + for node, (_, expected_name_range) in zip(engine.get_function_definition_names(root_node), expected): + assert isinstance(node, Node) + do_test_ts_base_engine_get_str(engine, node, expected_name_range) + do_test_ts_base_engine_get_address(engine, node) + + +def do_test_ts_extractor_engine_get_function_calls( + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] +): + assert len(list(engine.get_function_call_names(root_node))) == len(expected) + for node, (_, expected_id_range) in zip(engine.get_function_call_names(root_node), expected): + assert isinstance(node, Node) + do_test_ts_base_engine_get_str(engine, node, expected_id_range) + do_test_ts_base_engine_get_address(engine, node) + + +def do_test_ts_extractor_engine_get_string_literals( + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] +): + assert len(list(engine.get_string_literals(root_node))) == len(expected) + for node, expected_range in zip(engine.get_string_literals(root_node), expected): + assert isinstance(node, Node) + do_test_ts_base_engine_get_str(engine, node, expected_range) + do_test_ts_base_engine_get_address(engine, node) + + +def do_test_ts_extractor_engine_get_integer_literals( + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] +): + assert 
def do_test_ts_extractor_engine_get_assigned_property_names(
    engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str]
):
    """
    check the properties the engine reports beneath `root_node`.

    args:
      engine: the extractor engine under test.
      root_node: the node whose subtree is searched for properties.
      expected: the expected property names, in the order the engine yields them.
    """
    properties = list(engine.get_processed_property_names(root_node))
    assert len(properties) == len(expected)
    for (node, name), expected_name in zip(properties, expected):
        assert isinstance(node, Node)
        # previously the names were unpacked but never compared, so the expectations went unchecked.
        # NOTE(review): assumes the yielded name is the processed string listed in `expected` - confirm.
        assert name == expected_name
        do_test_ts_base_engine_get_address(engine, node)
("void Page_Load(object sender, System.EventArgs e)", "Page_Load"), + ], + "all function calls": [ + ( + 'HttpContext.Current.Response.Write("

404 Not Found

")', + "HttpContext.Current.Response.Write", + ), + ( + "HttpContext.Current.Server.ClearError()", + "HttpContext.Current.Server.ClearError", + ), + ( + "HttpContext.Current.Response.End()", + "HttpContext.Current.Response.End", + ), + ( + "HttpContext.Current.Request.Headers[\"X-Forwarded-For\"].Split(new char[] { ',' })", + 'HttpContext.Current.Request.Headers["X-Forwarded-For"].Split', + ), + ( + "die()", + "die", + ), + ( + "p.Start()", + "p.Start", + ), + ( + "p.StandardOutput.ReadToEnd()", + "p.StandardOutput.ReadToEnd", + ), + ( + "p.StandardError.ReadToEnd()", + "p.StandardError.ReadToEnd", + ), + ], + "all string literals": [ + '""', + '""', + '"Not Found"', + '"

404 Not Found

"', + '"::1"', + '"192.168.0.1"', + '"127.0.0.1"', + '"X-Forwarded-For"', + '"X-Forwarded-For"', + '"c"', + '"cmd"', + '"/c "', + '"c"', + ], + "all integer literals": [ + "404", + "0", + ], + "namespaces": ["System"], + "global statements": [ + 'string stdout = "";', + 'string stderr = "";', + ], + "properties": [ + "Current.Response.StatusCode", + "Current.Response.StatusDescription", + "Current.Request.Headers", + "UserHostAddress", + "Current.Request.Headers", + "Form", + "Form", + "RedirectStandardOutput", + "RedirectStandardError", + "UseShellExecute", + "CreateNoWindow", + "StartInfo", + ], + }, + ), + ], +) +def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, expected: dict): + engine: TreeSitterExtractorEngine = request.getfixturevalue(engine_str) + do_test_ts_extractor_engine_init(engine, expected["language"]) + do_test_ts_extractor_engine_get_new_objects(engine, engine.tree.root_node, expected["all objects"]) + do_test_ts_extractor_engine_get_function_definitions( + engine, engine.tree.root_node, expected["all function definitions"] + ) + do_test_ts_extractor_engine_get_function_calls(engine, engine.tree.root_node, expected["all function calls"]) + do_test_ts_extractor_engine_get_string_literals(engine, engine.tree.root_node, expected["all string literals"]) + do_test_ts_extractor_engine_get_integer_literals(engine, engine.tree.root_node, expected["all integer literals"]) + do_test_ts_extractor_engine_get_assigned_property_names(engine, engine.tree.root_node, expected["properties"]) + do_test_ts_extractor_engine_get_global_statements(engine, expected["global statements"]) + do_test_ts_extractor_engine_get_namespaces(engine, expected["namespaces"]) + do_test_ts_base_engine_get_default_address(engine) + + +def do_test_ts_template_engine_init(engine: TreeSitterTemplateEngine): + assert engine.language == LANG_TEM + assert isinstance(engine.query, TemplateQueryBinding) + assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 + 
def do_test_ts_template_engine_get_template_namespaces(
    engine: TreeSitterTemplateEngine, expected_language: str, expected: List[str]
):
    """
    check the namespaces exposed by a template engine.

    args:
      engine: the template engine under test.
      expected_language: key into `LANGUAGE_TOOLKITS` used to look up the default namespaces.
      expected: names of the imported namespaces, in the order the engine yields them.
    """
    # the language's default namespaces must be among the namespaces the engine reports
    default_namespaces = LANGUAGE_TOOLKITS[expected_language].get_default_namespaces(True)
    template_namespaces = set(engine.get_namespaces())
    assert default_namespaces.issubset(template_namespaces)

    # materialize once so the length check and the pairwise check see the same items
    imported_namespaces = list(engine.get_imported_namespaces())
    assert len(imported_namespaces) == len(expected)
    for namespace, expected_namespace in zip(imported_namespaces, expected):
        assert isinstance(namespace.node, Node)
        assert engine.is_aspx_import_directive(namespace.node)
        aspx_namespace = engine.get_aspx_namespace(namespace.node)
        # separate asserts so a failure says whether the lookup or the name was wrong
        assert aspx_namespace is not None
        assert aspx_namespace.name == expected_namespace
        assert namespace.name == expected_namespace
expected_language: str, expected: List[Tuple[int, int]] +): + assert len(list(engine.get_parsed_code_sections())) == len(expected) + for extractor_engine, (expected_start_byte, _) in zip(engine.get_parsed_code_sections(), expected): + do_test_ts_extractor_engine_init(extractor_engine, expected_language) + assert extractor_engine.buf_offset == expected_start_byte + root = extractor_engine.tree.root_node + addr = extractor_engine.get_default_address() + assert ( + addr.start_byte == root.start_byte + expected_start_byte + and addr.end_byte == root.end_byte + expected_start_byte + ) + addr = extractor_engine.get_address(extractor_engine.tree.root_node) + assert ( + addr.start_byte == root.start_byte + expected_start_byte + and addr.end_byte == root.end_byte + expected_start_byte + ) + + +@parametrize( + "engine_str,expected", + [ + ( + "aspx_1f8f40_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Reflection"], + "code sections": [(2, 23), (27, 64), (68, 469)], + "content sections": [], + }, + ), + ( + "aspx_2b71dd_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 50), (55, 95), (100, 131)], + "content sections": [(52, 53), (97, 98), (133, 1273)], + }, + ), + ( + "aspx_2e8c7e_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 23), (28, 67), (72, 103)], + "content sections": [(25, 26), (69, 70), (105, 2919)], + }, + ), + ( + "aspx_03bb5c_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Web.UI.WebControls", "System.Diagnostics", "System.IO"], + "code sections": [(2, 47), (53, 100), (106, 146), (152, 183), (1659, 7702)], + "content sections": [(49, 51), (102, 104), (148, 150), (185, 1657), (7704, 10790)], + }, + ), + ( + "aspx_4f6fa6_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO", "System.IO.Compression"], + "code 
sections": [(2, 50), (55, 95), (100, 131), (136, 179), (186, 234)], + "content sections": [(52, 53), (97, 98), (133, 134), (181, 183), (237, 6039)], + }, + ), + ( + "aspx_a35878_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.IO", + "System.Diagnostics", + "System.Data", + "System.Management", + "System.Data.OleDb", + "Microsoft.Win32", + "System.Net.Sockets", + "System.Net", + "System.Web.UI", + "System.Runtime.InteropServices", + "System.DirectoryServices", + "System.ServiceProcess", + "System.Text.RegularExpressions", + "System.Threading", + "System.Data.SqlClient", + "Microsoft.VisualBasic", + ], + "code sections": [ + (2, 123), + (128, 158), + (163, 202), + (207, 239), + (244, 282), + (287, 325), + (330, 366), + (371, 411), + (416, 448), + (453, 487), + (492, 543), + (548, 593), + (598, 640), + (645, 696), + (701, 738), + (743, 785), + (790, 832), + (837, 943), + (948, 1047), + (1052, 1155), + (1160, 1266), + ], + "content sections": [ + (125, 126), + (160, 161), + (204, 205), + (241, 242), + (284, 285), + (327, 328), + (368, 369), + (413, 414), + (450, 451), + (489, 490), + (545, 546), + (595, 596), + (642, 643), + (698, 699), + (740, 741), + (787, 788), + (834, 835), + (945, 946), + (1049, 1050), + (1157, 1158), + (1268, 2680), + ], + }, + ), + ( + "aspx_10162f_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.IO"], + "code sections": [ + (2, 71), + (76, 106), + (162, 2122), + (25579, 25596), + (25625, 25642), + (25664, 25700), + (25738, 25747), + (25801, 25822), + (25960, 25973), + (26002, 26015), + (26092, 26115), + (26153, 26168), + (26278, 26295), + (26324, 26341), + (26402, 26455), + (26472, 26489), + (26550, 26555), + (26593, 26612), + (26752, 26765), + (26794, 26811), + (26863, 26880), + (26941, 26946), + (26995, 27020), + (27037, 27062), + (27123, 27128), + (27166, 27181), + (27291, 27308), + (27337, 27354), + (27456, 27475), + (27686, 27711), + (27740, 27761), + (27854, 27879), + (27896, 27926), 
+ (27992, 28002), + (28040, 28055), + (28167, 28188), + (28271, 28312), + (28374, 28443), + (28511, 28548), + (28610, 28675), + (28699, 28728), + (28789, 28794), + (28813, 28826), + (28871, 28876), + (28921, 28932), + (29044, 29077), + (29141, 29158), + (29220, 29226), + (29264, 29275), + (29359, 29384), + (29446, 29452), + (29490, 29501), + (29585, 29602), + (29664, 29670), + (29708, 29719), + (30163, 30170), + ], + "content sections": [ + (73, 74), + (108, 160), + (2124, 25576), + (25598, 25622), + (25644, 25661), + (25702, 25735), + (25749, 25798), + (25824, 25957), + (25975, 25999), + (26017, 26089), + (26117, 26150), + (26170, 26275), + (26297, 26321), + (26343, 26399), + (26457, 26469), + (26491, 26547), + (26557, 26590), + (26614, 26749), + (26767, 26791), + (26813, 26860), + (26882, 26938), + (26948, 26992), + (27022, 27034), + (27064, 27120), + (27130, 27163), + (27183, 27288), + (27310, 27334), + (27356, 27453), + (27477, 27683), + (27713, 27737), + (27763, 27851), + (27881, 27893), + (27928, 27989), + (28004, 28037), + (28057, 28164), + (28190, 28268), + (28314, 28371), + (28445, 28508), + (28550, 28607), + (28677, 28696), + (28730, 28786), + (28796, 28810), + (28828, 28868), + (28878, 28918), + (28934, 29041), + (29079, 29138), + (29160, 29217), + (29228, 29261), + (29277, 29356), + (29386, 29443), + (29454, 29487), + (29503, 29582), + (29604, 29661), + (29672, 29705), + (29721, 30160), + (30172, 30635), + ], + }, + ), + ( + "aspx_606dbf_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System", + "System.IO", + "System.Web", + "System.Web.SessionState", + "System.Web.UI", + "System.Web.Configuration", + "System.Threading", + "System.Net", + "System.Net.Sockets", + "System.Text", + ], + "code sections": [ + (2, 87), + (93, 121), + (127, 158), + (164, 196), + (202, 247), + (253, 288), + (294, 340), + (346, 384), + (390, 422), + (428, 468), + (474, 507), + ], + "content sections": [ + (89, 91), + (123, 125), + (160, 162), + (198, 
200), + (249, 251), + (290, 292), + (342, 344), + (386, 388), + (424, 426), + (470, 472), + (509, 7078), + ], + }, + ), + ( + "aspx_ea2a01_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO", "System.Security.Cryptography", "System"], + "code sections": [(2, 47), (53, 93), (99, 130), (136, 186), (192, 220), (228, 5811)], + "content sections": [(49, 51), (95, 97), (132, 134), (188, 190), (222, 226), (5813, 5818)], + }, + ), + ( + "aspx_a5c893_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Reflection"], + "code sections": [(2, 23), (27, 64), (68, 469)], + "content sections": [(471, 472)], + }, + ), + ( + "aspx_b75f16_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.IO"], + "code sections": [(2, 123), (127, 157), (303, 587)], + "content sections": [(159, 301), (589, 596)], + }, + ), + ( + "aspx_d460ca_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.Reflection", + "Microsoft.CSharp", + "System.CodeDom.Compiler", + "System.IO", + "System.Security.Cryptography", + ], + "code sections": [(2, 22), (27, 65), (70, 107), (112, 156), (161, 191), (196, 245)], + "content sections": [(24, 25), (67, 68), (109, 110), (158, 159), (193, 194), (247, 4866)], + }, + ), + ( + "aspx_b4bb14_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 50), (55, 95), (100, 131)], + "content sections": [(52, 53), (97, 98), (133, 1398)], + }, + ), + ( + "aspx_f2bf20_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.IO", + "System.IO.Compression", + "System.Diagnostics", + "System.Data", + "System.Data.OleDb", + "System.Data.Common", + "System.Data.SqlClient", + "System.Management", + "Microsoft.Win32", + "System.Net", + "System.Net.Sockets", + "System.Reflection", + "System.Runtime.InteropServices", + "System.DirectoryServices", + "System.ServiceProcess", + 
"System.Text.RegularExpressions", + "System.Security", + "System.Security.Permissions", + "System.Threading", + ], + "code sections": [ + (2, 125), + (133, 164), + (170, 213), + (219, 259), + (265, 298), + (304, 343), + (349, 389), + (395, 438), + (444, 483), + (489, 526), + (532, 564), + (570, 610), + (616, 655), + (661, 713), + (719, 765), + (771, 814), + (820, 872), + (878, 915), + (921, 970), + (976, 1014), + (1020, 1127), + (1133, 1233), + (1239, 1343), + (39508, 39563), + (45103, 45113), + (47599, 47609), + (48705, 48712), + ], + "content sections": [ + (127, 131), + (166, 168), + (215, 217), + (261, 263), + (300, 302), + (345, 347), + (391, 393), + (440, 442), + (485, 487), + (528, 530), + (566, 568), + (612, 614), + (657, 659), + (715, 717), + (767, 769), + (816, 818), + (874, 876), + (917, 919), + (972, 974), + (1016, 1018), + (1129, 1131), + (1235, 1237), + (1345, 39505), + (39565, 45100), + (45116, 47596), + (47612, 48702), + (48715, 55896), + ], + }, + ), + ( + "aspx_5f959f_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 50), (55, 95), (100, 131)], + "content sections": [(52, 53), (97, 98), (133, 1400)], + }, + ), + ( + "aspx_f39dc0_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO", "System.Net"], + "code sections": [(2, 50), (56, 96), (102, 133), (139, 171), (678, 1421)], + "content sections": [(52, 54), (98, 100), (135, 137), (173, 676), (1423, 1441)], + }, + ), + ( + "aspx_54433d_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.Diagnostics", + "System.IO", + "System.IO.Compression", + "Microsoft.VisualBasic", + ], + "code sections": [(2, 50), (55, 95), (100, 131), (136, 179), (184, 227), (233, 280)], + "content sections": [(52, 53), (97, 98), (133, 134), (181, 182), (229, 230), (283, 10444)], + }, + ), + ( + "aspx_f397cb_template_engine", + { + "language": LANG_CS, + "aspx namespaces": 
["System"], + "code sections": [(2, 22), (28, 56), (3950, 3981), (4033, 4064)], + "content sections": [(24, 26), (58, 3948), (3983, 4031), (4066, 4388)], + }, + ), + ( + "aspx_15eed4_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.IO", + "System.Diagnostics", + "System.Data", + "System.Management", + "System.Data.OleDb", + "Microsoft.Win32", + "System.Net.Sockets", + "System.Net", + "System.Runtime.InteropServices", + "System.DirectoryServices", + "System.ServiceProcess", + "System.Text.RegularExpressions", + "System.Threading", + "System.Data.SqlClient", + "Microsoft.VisualBasic", + ], + "code sections": [ + (2, 123), + (128, 158), + (163, 202), + (207, 239), + (244, 282), + (287, 325), + (330, 366), + (371, 411), + (416, 448), + (453, 504), + (509, 554), + (559, 601), + (606, 657), + (662, 699), + (704, 746), + (751, 793), + (798, 904), + (909, 1008), + (1013, 1116), + (1121, 1227), + (54081, 54091), + (55610, 55620), + (56304, 56315), + (57500, 57508), + (57995, 58004), + (58531, 58541), + (58984, 58994), + (59512, 59521), + (60014, 60024), + (60284, 60291), + (61559, 61564), + (62217, 62227), + (62711, 62721), + (66897, 66906), + (67954, 67962), + ], + "content sections": [ + (125, 126), + (160, 161), + (204, 205), + (241, 242), + (284, 285), + (327, 328), + (368, 369), + (413, 414), + (450, 451), + (506, 507), + (556, 557), + (603, 604), + (659, 660), + (701, 702), + (748, 749), + (795, 796), + (906, 907), + (1010, 1011), + (1118, 1119), + (1229, 54078), + (54094, 55607), + (55623, 56301), + (56318, 57497), + (57511, 57992), + (58007, 58528), + (58544, 58981), + (58997, 59509), + (59524, 60011), + (60027, 60281), + (60294, 61556), + (61567, 62214), + (62230, 62708), + (62724, 66894), + (66909, 67951), + (67965, 70053), + ], + }, + ), + ( + "aspx_6f3261_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Data", "System.Data.SqlClient"], + "code sections": [(2, 23), (28, 60), (65, 107)], + "content sections": 
[(25, 26), (62, 63), (109, 3303)],
+            },
+        ),
+    ],
+)
+def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, expected: dict):
+    """
+    check a template engine fixture against its expected language, namespaces and section ranges,
+    then build an HTML engine over every expected content section and check its initial state.
+    """
+    engine: TreeSitterTemplateEngine = request.getfixturevalue(engine_str)
+    do_test_ts_template_engine_init(engine)
+    assert engine.identify_language() == expected["language"]
+    do_test_ts_template_engine_get_template_namespaces(engine, expected["language"], expected["aspx namespaces"])
+    do_test_ts_template_engine_get_code_sections(engine, expected["code sections"])
+    do_test_ts_template_engine_get_parsed_code_sections(engine, expected["language"], expected["code sections"])
+    do_test_ts_template_engine_get_content_sections(engine, expected["content sections"])
+    # query the namespaces once instead of once per section; each HTML engine still gets its own set
+    namespaces = list(engine.get_namespaces())
+    for start_byte, end_byte in expected["content sections"]:
+        html_engine = TreeSitterHTMLEngine(engine.buf[start_byte:end_byte], set(namespaces))
+        do_test_ts_html_engine_init(html_engine)
+
+
+def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine):
+    """
+    assert the initial state of an HTML engine: HTML language and query binding, a non-empty bytes
+    buffer, a parsed tree, a namespace set, and a default address that spans the tree's root node.
+    """
+    assert engine.language == LANG_HTML
+    assert isinstance(engine.query, HTMLQueryBinding)
+    assert isinstance(engine.buf, bytes)
+    assert len(engine.buf) > 0
+    assert isinstance(engine.tree, Tree)
+    # fetch the address once; the isinstance check also narrows its type for the attribute reads below
+    addr = engine.get_default_address()
+    assert isinstance(addr, FileOffsetRangeAddress)
+    assert isinstance(engine.namespaces, set)
+    root = engine.tree.root_node
+    assert addr.start_byte == root.start_byte
+    assert addr.end_byte == root.end_byte
+
+
+# each case is (sample fixture name, scope, feature, expected presence)
+FEATURE_PRESENCE_TESTS_SCRIPTS = sorted(
+    [
+        ("cs_138cdc", "global", Arch(ARCH_ANY), True),
+        ("cs_138cdc", "global", OS(OS_ANY), True),
+        ("cs_138cdc", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("cs_138cdc", "file", Format(FORMAT_SCRIPT), True),
+        ("cs_138cdc", "file", Namespace("System"), True),
+        ("cs_138cdc", "function=PSEUDO MAIN", String(""), True),
+        ("cs_138cdc", "function=die", String("Not Found"), True),
+        ("cs_138cdc", "function=Page_Load", String("127.0.0.1"), True),
+        ("cs_138cdc", "function=Page_Load", Class("System.Diagnostics.ProcessStartInfo"), True),
+        ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.ProcessStartInfo::ctor"), True),
+        ("cs_138cdc", "function=Page_Load", Class("System.Diagnostics.Process"), True),
+        ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.Process::ctor"), True),
+        (
+            "cs_138cdc",
+            "function=Page_Load",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_4f6fa6", "global", Arch(ARCH_ANY), True),
+        ("aspx_4f6fa6", "global", OS(OS_ANY), True),
+        ("aspx_4f6fa6", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_4f6fa6", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_4f6fa6", "file", Namespace("System.Diagnostics"), True),
+        ("aspx_4f6fa6", "file", Namespace("System.IO"), True),
+        ("aspx_4f6fa6", "file", Namespace("System.IO.Compression"), True),
+        ("aspx_4f6fa6", "function=do_ps", String("powershell.exe"), True),
+        ("aspx_4f6fa6", "function=do_ps", Substring("-executionpolicy bypass"), True),
+        ("aspx_4f6fa6", "function=do_ps", Class("System.Diagnostics.ProcessStartInfo"), True),
+        ("aspx_4f6fa6", "function=do_ps", API("System.Diagnostics.ProcessStartInfo::ctor"), True),
+        ("aspx_4f6fa6", "function=do_ps", API("System.Diagnostics.Process::Start"), True),
+        ("aspx_4f6fa6", "function=ps", String("\\nPS> "), True),
+        ("aspx_4f6fa6", "function=ps", Substring("PS>"), True),
+        ("aspx_4f6fa6", "function=downloadbutton_Click", Substring("filename"), True),
+        ("aspx_4f6fa6", "function=base64encode", API("System.Convert::ToBase64String"), True),
+        ("aspx_5f959f", "global", Arch(ARCH_ANY), True),
+        ("aspx_5f959f", "global", OS(OS_ANY), True),
+        ("aspx_5f959f", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_5f959f", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_5f959f", "file", Namespace("System.Diagnostics"), True),
+        ("aspx_5f959f", "file", Namespace("System.IO"), True),
+        ("aspx_5f959f", "file", Namespace("System.Web.SessionState"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Class("System.Diagnostics.ProcessStartInfo"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo::ctor"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", String("cmd.exe"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Substring("/c"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.Process::Start"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::FileName"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True),
+        (
+            "aspx_5f959f",
+            "function=ExcuteCmd",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        # NOTE(review): these two literals arrived split over a line break (markup seems to have been
+        # stripped); "<pre>" / "</pre>" is a reconstruction - confirm against the sample.
+        ("aspx_5f959f", "function=cmdExe_Click", String("<pre>"), True),
+        ("aspx_5f959f", "function=cmdExe_Click", String("</pre>"), True),
+        ("aspx_10162f", "global", Arch(ARCH_ANY), True),
+        ("aspx_10162f", "global", OS(OS_ANY), True),
+        ("aspx_10162f", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_10162f", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_10162f", "file", Namespace("System.IO"), True),
+        ("aspx_10162f", "file", Namespace("System.Web.Security"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("data"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("gsize"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("cmd"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("ttar"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("sdfewq@#$51234234DF@#$!@#$ASDF"), True),
+        ("aspx_10162f", "function=rm", API("System.IO.File::Delete"), False),
+        ("aspx_10162f", "function=(0x564, 0x6af)", API("System.Convert::ToBase64String"), True),
+        ("aspx_10162f", "function=(0x564, 0x6af)", String("p"), True),
+        (
+            "aspx_10162f",
+            "function=c",
+            API("System.Security.Cryptography.SHA256CryptoServiceProvider::ComputeHash"),
+            True,
+        ),
+        ("aspx_10162f", "function=z", API("System.IO.File::ReadAllBytes"), True),
+        ("aspx_10162f", "function=ti", API("System.IO.File::GetCreationTime"), True),
+        ("aspx_10162f", "function=ti", API("System.IO.File::GetLastAccessTime"), True),
+        ("aspx_10162f", "function=g", API("System.IO.File::GetLastAccessTime"), True),
+        ("aspx_10162f", "function=g", API("System.IO.File::GetLastWriteTime"), True),
+        ("aspx_10162f", "function=g", API("System.IO.File::SetCreationTime"), True),
+        ("aspx_10162f", "function=g", API("System.IO.File::SetLastAccessTime"), True),
+        ("aspx_10162f", "function=g", API("System.IO.File::SetLastWriteTime"), True),
+        ("aspx_10162f", "function=h", API("System.IO.Path::GetTempPath"), True),
+        ("aspx_10162f", "function=h", API("System.IO.File::WriteAllBytes"), True),
+        ("aspx_10162f", "function=h", API("System.Convert::FromBase64String"), True),
+        ("aspx_10162f", "function=d", API("System.IO.File::Delete"), True),
+        ("aspx_10162f", "function=sq", Class("System.Data.SqlClient.SqlConnection"), True),
+        ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection::ctor"), True),
+        ("aspx_10162f", "function=sq", Class("System.Data.SqlClient.SqlCommand"), True),
+        ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlCommand::ctor"), True),
+        ("aspx_10162f", "function=sq", Class("System.Data.SqlClient.SqlDataAdapter"), True),
+        ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlDataAdapter::ctor"), True),
+        ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection::Open"), True),
+        ("aspx_10162f", "function=exec", Class("System.Diagnostics.Process"), True),
+        ("aspx_10162f", "function=exec", API("System.Diagnostics.Process::ctor"), True),
+        ("aspx_10162f", "function=exec", String("cmd.exe"), True),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::FileName"), True),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::UseShellExecute"), True),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::RedirectStandardInput"), True),
+        (
+            "aspx_10162f",
+            "function=exec",
+            Property("System.Diagnostics.Process.StartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::CreateNoWindow"), True),
+        ("aspx_10162f", "function=gsize", Substring("error"), True),
+        ("aspx_10162f", "function=exp", Substring("root"), True),
+        ("aspx_10162f", "function=exp", Substring("net use"), True),
+        ("aspx_10162f", "function=exp", Number(2), True),
+        ("aspx_10162f", "function=exp", Class("System.IO.DirectoryInfo"), True),
+        ("aspx_10162f", "function=exp", API("System.IO.DirectoryInfo::ctor"), True),
+        ("aspx_10162f", "function=exp", API("System.IO.File::GetAttributes"), True),
+        ("aspx_10162f", "function=GetDirSize", Number(0), True),
+        ("aspx_10162f", "function=createJsonDirectory", String('\\"dir\\":['), True),
+        ("aspx_10162f", "function=createJsonDirectory", Number(0), True),
+        ("aspx_10162f", "function=createJsonFile", Substring("file"), True),
+        ("aspx_10162f", "function=sizeFix", Number(1024), True),
+        ("aspx_10162f", "function=sizeFix", Number(2), True),
+        ("aspx_10162f", "function=sizeFix", Substring("GB"), True),
+        ("aspx_2b71dd", "global", Arch(ARCH_ANY), True),
+        ("aspx_2b71dd", "global", OS(OS_ANY), True),
+        ("aspx_2b71dd", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_2b71dd", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_2b71dd", "file", Namespace("System.Diagnostics"), True),
+        ("aspx_2b71dd", "file", Namespace("System.IO"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", Class("System.Diagnostics.ProcessStartInfo"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo::ctor"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", String("cmd.exe"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", Substring("/c"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.Process::Start"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::FileName"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True),
+        ("aspx_2b71dd", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True),
+        (
+            "aspx_2b71dd",
+            "function=ExcuteCmd",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_f2bf20", "global", Arch(ARCH_ANY), True),
+        ("aspx_f39dc0", "global", Arch(ARCH_ANY), True),
+        ("aspx_ea2a01", "global", Arch(ARCH_ANY), True),
+        ("aspx_6f3261", "global", Arch(ARCH_ANY), True),
+        ("aspx_6f3261", "global", OS(OS_ANY), True),
+        ("aspx_6f3261", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_6f3261", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_6f3261", "file", Namespace("System.Data"), True),
+        ("aspx_6f3261", "file", Namespace("System.Data.SqlClient"), True),
+        ("aspx_6f3261", "function=PSEUDO MAIN", String("woanware"), True),
+        ("aspx_6f3261", "function=btnExecute_Click", Class("System.Data.SqlClient.SqlConnection"), True),
+        ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection::ctor"), True),
+        ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection::Open"), True),
+        ("aspx_6f3261", "function=btnExecute_Click", Class("System.Data.SqlClient.SqlCommand"), True),
+        ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand::ctor"), True),
+        ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand::ExecuteReader"), True),
+        ("aspx_1f8f40", "global", Arch(ARCH_ANY), True),
+        ("aspx_1f8f40", "global", OS(OS_ANY), True),
+        ("aspx_1f8f40", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_1f8f40", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_1f8f40", "file", Namespace("System.Reflection"), True),
+        ("aspx_1f8f40", "function=PSEUDO MAIN", Class("System.Security.Cryptography.RijndaelManaged"), True),
+        ("aspx_1f8f40", "function=PSEUDO MAIN", API("System.Security.Cryptography.RijndaelManaged::ctor"), True),
+        (
+            "aspx_1f8f40",
+            "function=PSEUDO MAIN",
+            API("System.Security.Cryptography.RijndaelManaged::CreateDecryptor"),
+            True,
+        ),
+        ("aspx_2e8c7e", "global", Arch(ARCH_ANY), True),
+        ("aspx_2e8c7e", "global", OS(OS_ANY), True),
+        ("aspx_2e8c7e", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_2e8c7e", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_2e8c7e", "file", Namespace("System.Diagnostics"), True),
+        ("aspx_2e8c7e", "file", Namespace("System.IO"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", Class("System.Diagnostics.ProcessStartInfo"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", API("System.Diagnostics.ProcessStartInfo::ctor"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", String("cmd.exe"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", Substring("/c"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", API("System.Diagnostics.Process::Start"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", Property("System.Diagnostics.ProcessStartInfo::FileName"), True),
+        ("aspx_2e8c7e", "function=ExecuteCommand", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True),
+        (
+            "aspx_2e8c7e",
+            "function=ExecuteCommand",
+            Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"),
+            True,
+        ),
+        (
+            "aspx_2e8c7e",
+            "function=ExecuteCommand",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_03bb5c", "global", Arch(ARCH_ANY), True),
+        ("aspx_03bb5c", "global", OS(OS_ANY), True),
+        ("aspx_03bb5c", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True),
+        ("aspx_03bb5c", "file", Format(FORMAT_SCRIPT), True),
+        ("aspx_03bb5c", "file", Namespace("System.Diagnostics"), True),
+        ("aspx_03bb5c", "file", Namespace("System.IO"), True),
+        ("aspx_03bb5c", "function=PSEUDO MAIN", Class("System.Diagnostics.ProcessStartInfo"), True),
+        ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.ProcessStartInfo::ctor"), True),
+        ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.Process::Start"), True),
+        ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::FileName"), True),
+        ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True),
+        ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True),
+        (
+            "aspx_03bb5c",
+            "function=PSEUDO MAIN",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_606dbf", "global", Arch(ARCH_ANY), True),
+        ("aspx_f397cb", "global", Arch(ARCH_ANY), True),
+        ("aspx_b4bb14", "global", Arch(ARCH_ANY), True),
+        ("aspx_54433d", "global", Arch(ARCH_ANY), True),
+        ("aspx_a35878", "global", Arch(ARCH_ANY), True),
+        ("aspx_a5c893", "global", Arch(ARCH_ANY), True),
+        ("aspx_15eed4", "global", Arch(ARCH_ANY), True),
+        ("aspx_b75f16", "global", Arch(ARCH_ANY), True),
+        ("aspx_d460ca", "global", Arch(ARCH_ANY), True),
+        ("py_7f9cd1", "global", Arch(ARCH_ANY), True),
+        ("py_7f9cd1", "global", OS(OS_ANY), True),
+        ("py_7f9cd1", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_PY]), True),
+        ("py_7f9cd1", "file", Format(FORMAT_SCRIPT), True),
+        ("py_7f9cd1", "file", Namespace("socket"), True),
+        ("py_7f9cd1", "file", Namespace("threading.Timer"), True),
+        ("py_7f9cd1", "function=icloud_phish", API("subprocess::Popen"), True),
+        ("py_7f9cd1", "function=icloud_phish", Class("urllib2.Request"), True),
+        ("py_7f9cd1", "function=icloud_phish", API("base64::encodestring"), True),
+        ("py_7f9cd1", "function=icloud_phish", API("urllib2::urlopen"), True),
+        ("py_7f9cd1", "function=get_itunes_backups", String("IMEI"), True),
+        ("py_7f9cd1", "function=PSEUDO MAIN", String("[I] "), True),
+        ("py_7f9cd1", "function=PSEUDO MAIN", Substring("[!]"), True),
+        ("py_7f9cd1", "function=get_itunes_backups", Number(0), True),
+        ("py_7f9cd1", "function=get_itunes_backups", Number(1), True),
+        ("py_ca0df6", "file", Namespace("win32com.client"), True),
+        ("py_ca0df6", "file", Namespace("shutil"), True),
+        ("py_ca0df6", "function=PSEUDO MAIN", API("os::environ"), True),
+        ("py_ca0df6", "function=yut", API("shutil::copytree"), True),
+        ("py_ca0df6", "function=yut", API("os::getcwd"), True),
+        ("py_ca0df6", "function=takk", API("win32com.client::Dispatch"), True),
+        ("py_ca0df6", "function=takk", String("Schedule.Service"), True),
+        ("py_ca0df6", "function=takk", Substring("Updatewmplayer.exe"), True),
+        ("py_ca0df6", "function=llp", API("win32api::SetFileAttributes"), True),
+        ("py_ca0df6", "function=llp", Substring("KMPlayer"), True),
+        ("py_ca0df6", "function=fop", API("os::remove"), True),
+        ("py_ca0df6", "function=fop", Substring("Projec.exe"), True),
+        ("py_ca0df6", "function=htr", API("time::sleep"), True),
+        ("py_ca0df6", "function=htr", Number(30), True),
+        ("py_ca0df6", "function=htr", Number(25), True),
+        ("py_ca0df6", "function=htr", Number(10), True),
+        ("py_ca0df6", "function=vul", Number(5), True),
+        ("py_ca0df6", "function=vul", Number(1), True),
+        ("py_ca0df6", "function=vul", API("os::popen"), True),
+        ("py_ca0df6", "function=vul", String("Updatewmplayer"), True),
+        ("py_ca0df6", "function=vul", Substring("SCHTASKS"), True),
+        ("py_ca0df6", "function=llp", API("win32con::FILE_ATTRIBUTE_HIDDEN"), True),
+    ],
+    # order by (sample, scope) only, so sorting never has to compare the feature objects themselves
+    key=lambda case: (case[0], case[1]),
+)
+
+
+@parametrize(
+    "sample_ts, scope_ts, feature, expected", FEATURE_PRESENCE_TESTS_SCRIPTS, indirect=["sample_ts", "scope_ts"]
+)
+def test_ts_extractor(sample_ts, scope_ts, feature, expected):
+    """run the shared feature-presence check with the tree-sitter extractor for one (sample, scope, feature) case."""
+    fixtures.do_test_feature_presence(fixtures.get_ts_extractor, sample_ts, scope_ts, feature, expected)