data-dot-all · dlpzx · Oct 16, 2023 · Sep 13, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/backend/dataall/base/cdkproxy/cdk_cli_wrapper.py b/backend/dataall/base/cdkproxy/cdk_cli_wrapper.py
@@ -18,6 +18,7 @@
 from dataall.base.aws.sts import SessionHelper
 from dataall.base.db import Engine
 from dataall.base.utils.alarm_service import AlarmService
+from dataall.base.utils.shell_utils import CommandSanitizer
 
 logger = logging.getLogger('cdksass')
 
@@ -46,7 +47,7 @@ def aws_configure(profile_name='default'):
     print('..............................................')
     print(f"AWS_CONTAINER_CREDENTIALS_RELATIVE_URI: {os.getenv('AWS_CONTAINER_CREDENTIALS_RELATIVE_URI')}")
     cmd = ['curl', '169.254.170.2$AWS_CONTAINER_CREDENTIALS_RELATIVE_URI']
-    process = subprocess.run(' '.join(cmd), text=True, shell=True, encoding='utf-8', capture_output=True)  # nosec  # nosemgrep
+    process = subprocess.run(cmd, text=True, shell=False, encoding='utf-8', capture_output=True)
     creds = None
     if process.returncode == 0:
         creds = ast.literal_eval(process.stdout)
@@ -130,6 +131,16 @@ def deploy_cdk_stack(engine: Engine, stackid: str, app_path: str = None, path: s
             app_path = app_path or './app.py'
 
             logger.info(f'app_path: {app_path}')
+            input_args = [
+                stack.name,
+                stack.accountid,
+                stack.region,
+                stack.stack,
+                stack.targetUri
+            ]
+
+            CommandSanitizer(input_args)
+
             cmd = [
                 '' '. ~/.nvm/nvm.sh &&',
                 'cdk',
@@ -157,9 +168,11 @@ def deploy_cdk_stack(engine: Engine, stackid: str, app_path: str = None, path: s
                 f'"{sys.executable} {app_path}"',
                 '--verbose',
             ]
-
             logger.info(f"Running command : \n {' '.join(cmd)}")
 
+            # This command is too complex to be executed as a list of commands. We need to run it with shell=True
+            # However, the input arguments have be sanitized with the CommandSanitizer
+
             process = subprocess.run(  # nosemgrep
                 ' '.join(cmd),  # nosemgrep
                 text=True,  # nosemgrep
@@ -208,17 +221,28 @@ def describe_stack(stack, engine: Engine = None, stackid: str = None):
 
 
 def cdk_installed():
-    cmd = ['. ~/.nvm/nvm.sh && cdk', '--version']
-    logger.info(f"Running command {' '.join(cmd)}")
-
-    subprocess.run(  # nosemgrep
-        cmd,  # nosemgrep
-        text=True,  # nosemgrep
-        shell=True,  # nosec  # nosemgrep
-        encoding='utf-8',  # nosemgrep
-        stdout=subprocess.PIPE,  # nosemgrep
-        stderr=subprocess.PIPE,  # nosemgrep
-        cwd=os.path.dirname(__file__),  # nosemgrep
+    cmd1 = ['.', '~/.nvm/nvm.sh']
+    logger.info(f"Running command {' '.join(cmd1)}")
+    subprocess.run(
+        cmd1,
+        text=True,
+        shell=False,
+        encoding='utf-8',
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        cwd=os.path.dirname(__file__),
+    )
+
+    cmd2 = ['cdk', '--version']
+    logger.info(f"Running command {' '.join(cmd2)}")
+    subprocess.run(
+        cmd2,
+        text=True,
+        shell=False,
+        encoding='utf-8',
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        cwd=os.path.dirname(__file__),
     )
 
 

diff --git a/backend/dataall/base/utils/shell_utils.py b/backend/dataall/base/utils/shell_utils.py
@@ -0,0 +1,23 @@
+import re
+
+
+class CommandSanitizer:
+    """
+    Takes a list of arguments and verifies that each argument is alphanumeric or "-" or "_" and of type string
+    Trows a TypeError if any of these conditions is violated.
+    """
+    def __init__(self, args) -> None:
+        if not args:
+            raise TypeError("Arguments cannot be empty")
+
+        for arg in args:
+            if not isinstance(arg, str):
+                raise TypeError(f"arguments must be strings, not {type(arg)} of {str(arg)}")
+            if re.search(r"[^a-zA-Z0-9-_]", arg):
+                raise TypeError(f"argument contains invalid characters: {arg}")
+
+        self._arguments = args
+
+    @property
+    def arguments(self):
+        return self._arguments
diff --git a/backend/dataall/modules/datapipelines/cdk/datapipelines_cdk_pipeline.py b/backend/dataall/modules/datapipelines/cdk/datapipelines_cdk_pipeline.py
@@ -7,6 +7,7 @@
 
 from dataall.base import db
 from dataall.base.aws.sts import SessionHelper
+from dataall.base.utils.shell_utils import CommandSanitizer
 from dataall.core.environment.services.environment_service import EnvironmentService
 from dataall.modules.datapipelines.db.datapipelines_repositories import DatapipelinesRepository
 
@@ -71,6 +72,10 @@ def __init__(self, target_uri):
                     'COMMITID=$(aws codecommit get-branch --repository-name ${REPO_NAME} --branch-name main --query branch.commitId --output text)',
                     'aws codecommit put-file --repository-name ${REPO_NAME} --branch-name main --file-content file://app.py --file-path app.py --parent-commit-id ${COMMITID} --cli-binary-format raw-in-base64-out',
                 ]
+                CommandSanitizer(args=[self.pipeline.repo])
+
+                # This command is too complex to be executed as a list of commands. We need to run it with shell=True
+                # However, the input arguments have be sanitized with the CommandSanitizer
 
                 process = subprocess.run(  # nosemgrep
                     "; ".join(update_cmds),  # nosemgrep
@@ -99,6 +104,11 @@ def initialize_repo(self):
 
         logger.info(f"Running Commands: {'; '.join(cmd_init)}")
 
+        CommandSanitizer(args=[self.pipeline.repo, self.pipeline.SamlGroupName])
+
+        # This command is too complex to be executed as a list of commands. We need to run it with shell=True
+        # However, the input arguments have be sanitized with the CommandSanitizer
+
         process = subprocess.run(  # nosemgrep
             '; '.join(cmd_init),  # nosemgrep
             text=True,  # nosemgrep
@@ -201,6 +211,9 @@ def git_push_repo(self):
 
         logger.info(f"Running Commands: {'; '.join(git_cmds)}")
 
+        # This command does not include any customer upstream input
+        # no sanitization is needed and shell=true does not impose a risk
+
         process = subprocess.run(  # nosemgrep
             '; '.join(git_cmds),  # nosemgrep
             text=True,  # nosemgrep
@@ -225,13 +238,13 @@ def clean_up_repo(path):
             cwd = os.path.dirname(os.path.abspath(__file__))
             logger.info(f"Running command : \n {' '.join(precmd)}")
 
-            process = subprocess.run(  # nosemgrep
-                ' '.join(precmd),  # nosemgrep
-                text=True,  # nosemgrep
-                shell=True,  # nosec  # nosemgrep
-                encoding='utf-8',  # nosemgrep
-                capture_output=True,  # nosemgrep
-                cwd=cwd  # nosemgrep
+            process = subprocess.run(
+                precmd,
+                text=True,
+                shell=False,
+                encoding='utf-8',
+                capture_output=True,
+                cwd=cwd
             )
 
             if process.returncode == 0:

diff --git a/backend/dataall/modules/datapipelines/cdk/datapipelines_pipeline.py b/backend/dataall/modules/datapipelines/cdk/datapipelines_pipeline.py
@@ -22,6 +22,7 @@
 from dataall.modules.datapipelines.db.datapipelines_models import DataPipeline, DataPipelineEnvironment
 from dataall.modules.datapipelines.db.datapipelines_repositories import DatapipelinesRepository
 from dataall.base.utils.cdk_nag_utils import CDKNagUtil
+from dataall.base.utils.shell_utils import CommandSanitizer
 
 logger = logging.getLogger(__name__)
 
@@ -186,6 +187,11 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs):
                     'aws codecommit put-file --repository-name ${REPO_NAME} --branch-name main --file-content file://ddk.json --file-path ddk.json --parent-commit-id ${COMMITID} --cli-binary-format raw-in-base64-out',
                 ]
 
+                CommandSanitizer(args=[pipeline.repo])
+
+                # This command is too complex to be executed as a list of commands. We need to run it with shell=True
+                # However, the input arguments have be sanitized with the CommandSanitizer
+
                 process = subprocess.run(  # nosemgrep
                     "; ".join(update_cmds),  # nosemgrep
                     text=True,  # nosemgrep
@@ -507,6 +513,7 @@ def write_ddk_json_multienvironment(path, output_file, pipeline_environment, dev
         with open(f'{path}/{output_file}', 'w') as text_file:
             print(json, file=text_file)
 
+    @staticmethod
     def initialize_repo(pipeline, code_dir_path, env_vars):
 
         venv_name = ".venv"
@@ -521,6 +528,11 @@ def initialize_repo(pipeline, code_dir_path, env_vars):
 
         logger.info(f"Running Commands: {'; '.join(cmd_init)}")
 
+        CommandSanitizer(args=[pipeline.repo])
+
+        # This command is too complex to be executed as a list of commands. We need to run it with shell=True
+        # However, the input arguments have be sanitized with the CommandSanitizer
+
         process = subprocess.run(  # nosemgrep
             '; '.join(cmd_init),  # nosemgrep
             text=True,  # nosemgrep

diff --git a/deploy/stacks/solution_bundling.py b/deploy/stacks/solution_bundling.py
@@ -14,18 +14,21 @@ def __init__(self, source_path=None):
 
     def try_bundle(self, output_dir: str, options: BundlingOptions) -> bool:
         requirements_path = Path(self.source_path, 'requirements.txt')
-        command = [
-            f'cp -a {self.source_path}/. {output_dir}/ && pip install -r {requirements_path} -t {output_dir}'
-        ]
-        subprocess.check_output(  # nosemgrep
-            command,  # nosemgrep
-            stderr=subprocess.STDOUT,  # nosemgrep
-            shell=True,  # nosec  # nosemgrep
+        subprocess.check_output(
+            ['cp', '-a', f'{self.source_path}/.', f'{output_dir}/'],
+            stderr=subprocess.STDOUT,
+            shell=False,
         )
 
-        ls_output = subprocess.check_output(  # nosemgrep
-            [f'ls -ll {output_dir}'],  # nosemgrep
-            stderr=subprocess.STDOUT,  # nosemgrep
-            shell=True,  # nosec  # nosemgrep
+        subprocess.check_output(
+            ['pip', 'install', '-r', requirements_path, '-t', output_dir],
+            stderr=subprocess.STDOUT,
+            shell=False,
+        )
+
+        ls_output = subprocess.check_output(
+            ['ls', '-ll', f'{output_dir}'],
+            stderr=subprocess.STDOUT,
+            shell=False,
         )
         return True