From cfad81f2e6e829a6de6881483a836a30f80ad802 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst
Date: Tue, 29 Oct 2024 12:05:41 +0100
Subject: [PATCH 01/36] Fix wrong note in README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 606e824d09..4e9515d52d 100644
--- a/README.md
+++ b/README.md
@@ -49,8 +49,8 @@ hls_model = hls4ml.converters.keras_to_hls(config)
 hls4ml.utils.fetch_example_list()
 ```

-### Building a project with Xilinx Vivado HLS (after downloading and installing from [here](https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html))
-Note: Vitis HLS is not yet supported. Vivado HLS versions between 2018.2 and 2020.1 are recommended.
+### Building a project
+We will build the project using Xilinx Vivado HLS, which can be downloaded and installed from [here](https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html). Alongside Vivado HLS, hls4ml also supports Vitis HLS, Intel HLS, Catapult HLS and has some experimental support for Intel oneAPI. The target backend can be changed using the `backend` argument when building the model.

 ```Python
 # Use Vivado HLS to synthesize the model

From fabcf8c97295b34a2662dfa2f4a1602ff889f43c Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Mon, 4 Nov 2024 20:15:38 -0600
Subject: [PATCH 02/36] update the project status

---
 docs/setup.rst  | 19 +++++++++++++------
 docs/status.rst | 34 ++++++++++++++++++++++++----------
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/docs/setup.rst b/docs/setup.rst
index a735281c3f..d083e07c71 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -43,23 +43,30 @@ version can be installed directly from ``git``:
 Dependencies
 ============

-The ``hls4ml`` library depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed
+The ``hls4ml`` library requires Python 3.10 or later, and depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed
 by ``pip`` or ``conda``.

-* `TensorFlow `_ (version 2.4 and newer) and `QKeras `_ are required by the Keras converter.
+* `TensorFlow `_ (version 2.8 to 2.14) and `QKeras `_ are required by the Keras converter. One may want to install newer versions of QKeras from GitHub. Newer versions of TensorFlow can be used, but QKeras and hls4ml do not currently support Keras v3.
+
 * `ONNX `_ (version 1.4.0 and newer) is required by the ONNX converter.
+
 * `PyTorch `_ package is optional. If not installed, the PyTorch converter will not be available.

 Running C simulation from Python requires a C++11-compatible compiler. On Linux, a GCC C++ compiler ``g++`` is required. Any version from a recent
-Linux should work. On MacOS, the *clang*-based ``g++`` is enough.
+Linux should work. On MacOS, the *clang*-based ``g++`` is enough. For the oneAPI backend, one must have oneAPI installed, along with the FPGA compiler,
+to run C/SYCL simulations.

 To run FPGA synthesis, installation of following tools is required:

-* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs
+* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs using the ``Vivado`` backend.
+
+* Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend.
+
+* Intel Quartus 20.1 to 21.4 for the synthesis for Intel/Altera FPGAs using the ``Quartus`` backend.
- * Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend. +* oneAPI 2024.1 to 2025.0 with the FPGA compiler and recent Intel/Altara Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend. -* Intel Quartus 20.1 to 21.4 for the synthesis for Intel FPGAs +Catapult HLS 2024.1_1 or 2024.2 can be used to synthesize both for ASICs and FPGAs. Quick Start diff --git a/docs/status.rst b/docs/status.rst index 4ff4d33282..5d3f3591f2 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -18,8 +18,8 @@ A list of supported ML frameworks, HLS backends, and neural network architecture ML framework support: * (Q)Keras -* PyTorch (limited) -* (Q)ONNX (in development) +* PyTorch +* (Q)ONNX Neural network architectures: @@ -32,7 +32,9 @@ HLS backends: * Vivado HLS * Intel HLS -* Vitis HLS (experimental) +* Vitis HLS +* Catapult HLS +* oneAPI (experimental) A summary of the on-going status of the ``hls4ml`` tool is in the table below. @@ -46,35 +48,44 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. - Vivado HLS - Intel HLS - Vitis HLS + - Catapult HLS + - oneAPI * - MLP - ``supported`` - - ``limited`` - - ``in development`` + - ``supported`` + - ``supported`` + - ``supported`` + - ``supported`` - ``supported`` - ``supported`` - ``experimental`` * - CNN - ``supported`` - - ``limited`` - - ``in development`` + - ``supported`` + - ``supported`` + - ``supported`` + - ``supported`` - ``supported`` - ``supported`` - ``experimental`` * - RNN (LSTM) + - ``supported`` - ``supported`` - ``N/A`` - - ``in development`` - ``supported`` - ``supported`` - - ``N/A`` + - ``supported`` + - ``supported`` + - ``experimental`` * - GNN (GarNet) - ``supported`` + - ``in development`` + - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - Other feature notes: @@ -82,6 +93,9 @@ Other feature notes: * Vivado HLS versions 2018.2 to 2020.1 * Intel HLS versions 20.1 to 21.4 * Vitis HLS versions 2022.2 to 2024.1 + * Catapult HLS versions 2024.1_1 to 2024.2 + * oneAPI versions 2024.1 to 2025.0 + * Windows and macOS are not supported * BDT support has moved to the `Conifer `__ package From b844acf061facdd81f8fa7bb48931b85cdb17e01 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 5 Nov 2024 11:37:22 -0600 Subject: [PATCH 03/36] restructure of existing documentation --- docs/{ => advanced}/command.rst | 2 +- docs/{ => advanced}/flows.rst | 0 docs/advanced/oneapi.rst | 16 +++---- docs/{ => api}/details.rst | 0 docs/index.rst | 10 +++-- docs/setup.rst | 75 +++++++++++++++++---------------- 6 files changed, 53 insertions(+), 50 deletions(-) rename docs/{ => advanced}/command.rst (97%) rename docs/{ => advanced}/flows.rst (100%) rename docs/{ => api}/details.rst (100%) diff --git a/docs/command.rst b/docs/advanced/command.rst similarity index 97% rename from docs/command.rst rename to docs/advanced/command.rst index cb9d346e31..67f7e3fe2f 100644 --- a/docs/command.rst +++ b/docs/advanced/command.rst @@ -50,7 +50,7 @@ hls4ml config hls4ml config [-h] [-m MODEL] [-w WEIGHTS] [-o OUTPUT] -This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup ` page for more details on how to write a configuration file. +This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup <../setup>` page for more details on how to write a configuration file. 
**Arguments** diff --git a/docs/flows.rst b/docs/advanced/flows.rst similarity index 100% rename from docs/flows.rst rename to docs/advanced/flows.rst diff --git a/docs/advanced/oneapi.rst b/docs/advanced/oneapi.rst index ae0e0bc56b..fb926409eb 100644 --- a/docs/advanced/oneapi.rst +++ b/docs/advanced/oneapi.rst @@ -3,18 +3,17 @@ oneAPI Backend ============== The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually -replace the ``Quartus`` backend, which should really have been called the Intel HLS backend. (The actual Quartus -program continues to be used with IP produced by the ``oneAPI`` backend.) -This section discusses details of the ``oneAPI`` backend. +replace the ``Quartus`` backend, which targeted Intel HLS. (Quartus continues to be used with IP produced by the +``oneAPI`` backend.) This section discusses details of the ``oneAPI`` backend. The ``oneAPI`` code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the -accelerator style of programming. In the IP Component flow, which is currently the only flow supported, the +accelerator style of programming. In the SYCL HLS (IP Component) flow, which is currently the only flow supported, the kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on PCIe accelerator boards, is planned to be added in the future. The produced work areas use cmake to build the projects in a style based `oneAPI-samples `_. -The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` are supported. Additionally, ``make lib`` +The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` make targets are supported. Additionally, ``make lib`` produces the library used for calling the ``predict`` function from hls4ml. The ``compile`` and ``build`` commands in hls4ml interact with the cmake system, so one does not need to manually use the build system, but it there if desired. @@ -30,6 +29,7 @@ io_parallel and io_stream As mentioned in the :ref:`I/O Types` section, ``io_parallel`` is for small models, while ``io_stream`` is for larger models. In ``oneAPI``, there is an additional difference: ``io_stream`` implements each layer on its own ``task_sequence``. Thus, the layers run in parallel, with pipes connecting the inputs and outputs. This -is similar in style to the `dataflow` implementation on Vitis, but more explicit. On the other hand, ``io_parallel`` -always uses a single task, relying on pipelining within the task for good performance. In contrast, the Vitis -backend sometimes uses dataflow with ``io_parallel``. +is similar in style to the `dataflow` implementation on Vitis HLS, but more explicit. It is also a change +relative to the Intel HLS-based ``Quartus`` backend. On the other hand, ``io_parallel`` always uses a single task, +relying on pipelining within the task for good performance. In contrast, the Vitis backend sometimes uses dataflow +with ``io_parallel``. diff --git a/docs/details.rst b/docs/api/details.rst similarity index 100% rename from docs/details.rst rename to docs/api/details.rst diff --git a/docs/index.rst b/docs/index.rst index 339c4cfd42..2e417b1d74 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,9 +6,6 @@ status setup release_notes - details - flows - command reference .. toctree:: @@ -16,18 +13,23 @@ :glob: :caption: Quick API Reference - api/* + api/configuration + api/details + api/hls-model + api/profiling .. 
toctree:: :hidden: :caption: Advanced Features + advanced/flows advanced/qonnx advanced/fifo_depth advanced/extension advanced/oneapi advanced/accelerator advanced/model_optimization + advanced/command .. toctree:: :hidden: diff --git a/docs/setup.rst b/docs/setup.rst index d083e07c71..628f4ee69a 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -107,76 +107,77 @@ Done! You've built your first project using ``hls4ml``! To learn more about our If you want to configure your model further, check out our :doc:`Configuration ` page. -Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: +.. + Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: -Getting started with hls4ml CLI (deprecated) --------------------------------------------- + Getting started with hls4ml CLI (deprecated) + -------------------------------------------- -As an alternative to the recommended Python PI, the command-line interface is provided via the ``hls4ml`` command. + As an alternative to the recommended Python PI, the command-line interface is provided via the ``hls4ml`` command. -To follow this tutorial, you must first download our ``example-models`` repository: + To follow this tutorial, you must first download our ``example-models`` repository: -.. code-block:: bash + .. code-block:: bash - git clone https://github.com/fastmachinelearning/example-models + git clone https://github.com/fastmachinelearning/example-models -Alternatively, you can clone the ``hls4ml`` repository with submodules + Alternatively, you can clone the ``hls4ml`` repository with submodules -.. code-block:: bash + .. code-block:: bash - git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml + git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml -The model files, along with other configuration parameters, are defined in the ``.yml`` files. -Further information about ``.yml`` files can be found in :doc:`Configuration ` page. + The model files, along with other configuration parameters, are defined in the ``.yml`` files. + Further information about ``.yml`` files can be found in :doc:`Configuration ` page. -In order to create an example HLS project, first go to ``example-models/`` from the main directory: + In order to create an example HLS project, first go to ``example-models/`` from the main directory: -.. code-block:: bash + .. code-block:: bash - cd example-models/ + cd example-models/ -And use this command to translate a Keras model: + And use this command to translate a Keras model: -.. code-block:: bash + .. code-block:: bash - hls4ml convert -c keras-config.yml + hls4ml convert -c keras-config.yml -This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory. -To build the HLS project, do: + This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory. + To build the HLS project, do: -.. code-block:: bash + .. code-block:: bash - hls4ml build -p my-hls-test -a + hls4ml build -p my-hls-test -a -This will create a Vivado HLS project with your model implementation! + This will create a Vivado HLS project with your model implementation! 
-**NOTE:** For the last step, you can alternatively do the following to build the HLS project: + **NOTE:** For the last step, you can alternatively do the following to build the HLS project: -.. code-block:: Bash + .. code-block:: Bash - cd my-hls-test - vivado_hls -f build_prj.tcl + cd my-hls-test + vivado_hls -f build_prj.tcl -``vivado_hls`` can be controlled with: + ``vivado_hls`` can be controlled with: -.. code-block:: bash + .. code-block:: bash - vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1" + vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1" -Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``. + Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``. -Further help -^^^^^^^^^^^^ + Further help + ^^^^^^^^^^^^ -* For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h`` -* If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command`` -* We provide a detailed documentation for each of the command in the :doc:`Command Help <../command>` section + * For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h`` + * If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command`` + * We provide a detailed documentation for each of the command in the :doc:`Command Help ` section Existing examples ----------------- -* Examples of model files and weights can be found in `example_models `_ directory. * Training codes and examples of resources needed to train the models can be found in the `tutorial `__. +* Examples of model files and weights can be found in `example_models `_ directory. Uninstalling ------------ From 88e84f36fd1b2546ee39a86f90930521437a48f0 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 5 Nov 2024 17:02:27 -0600 Subject: [PATCH 04/36] add an internal layers section, and auto precision --- docs/api/auto.rst | 16 ++++++++++++++++ docs/api/configuration.rst | 4 ++-- docs/index.rst | 10 ++++++++++ docs/ir/activations.rst | 15 +++++++++++++++ docs/ir/conv.rst | 32 ++++++++++++++++++++++++++++++++ docs/ir/dense.rst | 25 +++++++++++++++++++++++++ hls4ml/model/layers.py | 2 +- 7 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 docs/api/auto.rst create mode 100644 docs/ir/activations.rst create mode 100644 docs/ir/conv.rst create mode 100644 docs/ir/dense.rst diff --git a/docs/api/auto.rst b/docs/api/auto.rst new file mode 100644 index 0000000000..7e52358ebb --- /dev/null +++ b/docs/api/auto.rst @@ -0,0 +1,16 @@ +============================= +Automatic precision inference +============================= + +The automatic precision inference (implemented in :py:class:`~hls4ml.model.optimizer.passes.infer_precision.InferPrecisionTypes`) attempts to infer the appropriate widths for a given precision. +It is initiated by configuring a precision in the configuration as 'auto'. Functions like :py:class:`~hls4ml.utils.config.config_from_keras_model` and :py:class:`~hls4ml.utils.config.config_from_onnx_model` +automatically set most precisions to 'auto' if the ``'name'`` granularity is used. + +.. note:: + It is recommended to pass the backend to the ``config_from_*`` functions so that they can properly extract all the configurable precisions. 
+ +The approach taken by the precision inference is to set accumulator and other precisions to never truncate, using only the bitwidths of the inputs (not the values). This is quite conservative, +especially in cases where post-training quantization is used, or if the bit widths were set fairly loosely. The recommended action in that case is to edit the configuration and explicitly set +some widths in it, potentially in an iterative process after seeing what precisions are automatically set. Another option, currently implemented in :py:class:`~hls4ml.utils.config.config_from_keras_model`, +is to pass a maximum bitwidth using the ``max_precison`` option. Then the automatic precision inference will never set a bitwdith larger than the bitwidth or an integer part larger than the integer part of +the ``max_precision`` that is passed. (The bitwidth and integer parts are treated separately.) \ No newline at end of file diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst index f0db50a9b6..c8bd61b59f 100644 --- a/docs/api/configuration.rst +++ b/docs/api/configuration.rst @@ -45,8 +45,8 @@ This python dictionary can be edited as needed. A more advanced configuration ca default_precision='fixed<16,6>', backend='Vitis') -This will include per-layer configuration based on the model. Including the backend is recommended because some configation options depend on the backend. Note, the precisions at the -higher granularites usually default to 'auto', which means that ``hls4ml`` will try to set it automatically. Note that higher granularity settings take precendence +This will include per-layer configuration based on the model. Including the backend is recommended because some configuration options depend on the backend. Note, the precisions at the +higher granularites usually default to 'auto', which means that ``hls4ml`` will try to set it automatically (see :ref:`Automatic precision inference`). Note that higher granularity settings take precedence over model-level settings. See :py:class:`~hls4ml.utils.config.config_from_keras_model` for more information on the various options. One can override specific values before using the configuration: diff --git a/docs/index.rst b/docs/index.rst index 2e417b1d74..299a8c0512 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,10 +14,20 @@ :caption: Quick API Reference api/configuration + api/auto api/details api/hls-model api/profiling +.. toctree:: + :hidden: + :glob: + :caption: Internal Layers + + ir/dense + ir/activations + ir/conv + .. toctree:: :hidden: :caption: Advanced Features diff --git a/docs/ir/activations.rst b/docs/ir/activations.rst new file mode 100644 index 0000000000..f80f0208ba --- /dev/null +++ b/docs/ir/activations.rst @@ -0,0 +1,15 @@ +=========== +Activations +=========== + +Most activations without extra parameters are represented with the ``Activation`` layer, and those with single parameters (leaky ReLU, thresholded ReLU, ELU) as ``ParametrizedActivation``. +``PReLU`` has its own class because it has a parameter matrix (stored as a weight). The hard (piecewise linear) sigmoid and tanh functions are implemented in a ``HardActivation`` layer, +and ``Softmax`` has its own layer class. + +Softmax has four implementations that the user can choose from by setting the ``implementation`` parameter: + +* **latency**: Good latency, but somewhat high resource usage. It does not work well if there are many output classes. +* **stable**: Slower but with better accuracy, useful in scenarios where higher accuracy is needed. 
+* **legacy**: An older implementation with poor accuracy, but good performance. Usually the latency implementation is preferred. +* **argmax**: If you don't care about normalized outputs and only care about which one has the highest value, using argmax saves a lot of resources. This sets the highest value to 1, the others to 0. + diff --git a/docs/ir/conv.rst b/docs/ir/conv.rst new file mode 100644 index 0000000000..6b6ca953fd --- /dev/null +++ b/docs/ir/conv.rst @@ -0,0 +1,32 @@ +================== +Convolution Layers +================== + +Standard convolutions +===================== + +These are the standard 1D and 2D convolutions currently supported by hls4ml, and the fallback if there is no special pointwise implementation. + +io_parallel +----------- + +Parallel convolutions are for cases where the model needs to be small and fast, though synthesizability limits can be quickly reached. Also note that skip connections +are not supported in io_parallel. + +For the Xilinx backends and Catapult, there is a very direct convolution implementation when using the ``Latency`` strategy. This is only for very small models because the +high number of nested loops. The ``Resource`` strategy in all cases defaults to an algorithm using the *im2col* transformation. This generally supports larger models. The ``Quartus``, +``oneAPI``, and ``Catapult`` backends also implement a ``Winograd`` algorithm choosable by setting the ``implementation`` to ``Winograd`` or ``combination``. Note that +the Winograd implementation is available for only a handful of filter size configurations, and it is less concerned about bit accuracy and overflow, but it can be faster. + +io_stream +--------- + +There are two main classes of io_stream implementations, ``LineBuffer`` and ``Encoded``. ``LineBuffer`` is always the default, and generally produces marginally better results, +while ``Catapult`` and ``Vivado`` also implement ``Encoded``, choosable with the ``convImplementation`` configuration option. In all cases, the data is processed serially, one pixel +at a time, with a pixel containing an array of all the channel values for the pixel. + +Depthwise convolutions +====================== + +Pointwise convolutions +====================== \ No newline at end of file diff --git a/docs/ir/dense.rst b/docs/ir/dense.rst new file mode 100644 index 0000000000..4e24324e6a --- /dev/null +++ b/docs/ir/dense.rst @@ -0,0 +1,25 @@ +============ +Dense Layers +============ + +One-dimensional Dense Layers +============================ + +One-dimensional dense layers implement a matrix multiply and bias add. The produced code is also used by other layers to implement the matrix multiplication. + + +io_parallel +----------- + +All the backends implement a ``Resource`` implementation, which explicitly iterates over the reuse factor. There are different implementations depending on whether the reuse factor is +smaller or bigger than the input size. The two Xilinx backends and Catapult also implement a ``Latency`` implementation, which only uses the reuse factor in pragmas. + +io_stream +--------- + +The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. The data is still transferred in parallel. 
+ +Multi-dimensional Dense Layers +============================== + +Multi-dimensional Dense layers are converted to pointwise convolutions, and do not directly use the above implementation \ No newline at end of file diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index fb548aa164..f4c0d3d1a5 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -930,7 +930,7 @@ def _get_act_function_name(self): class HardActivation(Activation): ''' - Implements the hard sigmoid and tan function in keras and qkeras + Implements the hard sigmoid and tanh function in keras and qkeras (Default parameters in qkeras are different, so should be configured) The hard sigmoid unction is clip(slope * x + shift, 0, 1), and the hard tanh function is 2 * hard_sigmoid - 1 From 6abc8ad59f5d21e4858d6690f36d4203a6742526 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 5 Nov 2024 17:06:57 -0600 Subject: [PATCH 05/36] pre-commit fixes --- docs/api/auto.rst | 2 +- docs/ir/activations.rst | 1 - docs/ir/conv.rst | 2 +- docs/ir/dense.rst | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/api/auto.rst b/docs/api/auto.rst index 7e52358ebb..9a18e70936 100644 --- a/docs/api/auto.rst +++ b/docs/api/auto.rst @@ -13,4 +13,4 @@ The approach taken by the precision inference is to set accumulator and other pr especially in cases where post-training quantization is used, or if the bit widths were set fairly loosely. The recommended action in that case is to edit the configuration and explicitly set some widths in it, potentially in an iterative process after seeing what precisions are automatically set. Another option, currently implemented in :py:class:`~hls4ml.utils.config.config_from_keras_model`, is to pass a maximum bitwidth using the ``max_precison`` option. Then the automatic precision inference will never set a bitwdith larger than the bitwidth or an integer part larger than the integer part of -the ``max_precision`` that is passed. (The bitwidth and integer parts are treated separately.) \ No newline at end of file +the ``max_precision`` that is passed. (The bitwidth and integer parts are treated separately.) diff --git a/docs/ir/activations.rst b/docs/ir/activations.rst index f80f0208ba..3515d3101b 100644 --- a/docs/ir/activations.rst +++ b/docs/ir/activations.rst @@ -12,4 +12,3 @@ Softmax has four implementations that the user can choose from by setting the `` * **stable**: Slower but with better accuracy, useful in scenarios where higher accuracy is needed. * **legacy**: An older implementation with poor accuracy, but good performance. Usually the latency implementation is preferred. * **argmax**: If you don't care about normalized outputs and only care about which one has the highest value, using argmax saves a lot of resources. This sets the highest value to 1, the others to 0. - diff --git a/docs/ir/conv.rst b/docs/ir/conv.rst index 6b6ca953fd..e2615b4795 100644 --- a/docs/ir/conv.rst +++ b/docs/ir/conv.rst @@ -29,4 +29,4 @@ Depthwise convolutions ====================== Pointwise convolutions -====================== \ No newline at end of file +====================== diff --git a/docs/ir/dense.rst b/docs/ir/dense.rst index 4e24324e6a..352a7d25b2 100644 --- a/docs/ir/dense.rst +++ b/docs/ir/dense.rst @@ -17,9 +17,9 @@ smaller or bigger than the input size. The two Xilinx backends and Catapult also io_stream --------- -The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. 
The data is still transferred in parallel. +The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. The data is still transferred in parallel. Multi-dimensional Dense Layers ============================== -Multi-dimensional Dense layers are converted to pointwise convolutions, and do not directly use the above implementation \ No newline at end of file +Multi-dimensional Dense layers are converted to pointwise convolutions, and do not directly use the above implementation From 09bbefba64443c327b865941b4825711e160c450 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 26 Nov 2024 18:30:51 +0100 Subject: [PATCH 06/36] Typo fixes --- docs/advanced/accelerator.rst | 2 +- docs/advanced/hgq.rst | 2 +- docs/advanced/model_optimization.rst | 4 ++-- docs/api/configuration.rst | 7 ++++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/advanced/accelerator.rst b/docs/advanced/accelerator.rst index 7a79d9dbdc..da945509d5 100644 --- a/docs/advanced/accelerator.rst +++ b/docs/advanced/accelerator.rst @@ -13,7 +13,7 @@ Currently ``hls4ml`` supports the following boards: * `alveo-u280 `_ (part: ``xcu280-fsvh2892-2L-e``) but, in principle, support can be extended to `any board supported by PYNQ `_. -For the Zynq-based boards, there are two components: an ARM-based processing system (PS) and FPGA-based programmable logic (PL), with various intefaces between the two. +For the Zynq-based boards, there are two components: an ARM-based processing system (PS) and FPGA-based programmable logic (PL), with various interfaces between the two. .. image:: ../img/zynq_interfaces.png :height: 300px diff --git a/docs/advanced/hgq.rst b/docs/advanced/hgq.rst index cf8f53d4d0..dd0faad7dc 100644 --- a/docs/advanced/hgq.rst +++ b/docs/advanced/hgq.rst @@ -9,7 +9,7 @@ High Granularity Quantization (HGQ) .. image:: https://img.shields.io/badge/arXiv-2405.00645-b31b1b.svg :target: https://arxiv.org/abs/2405.00645 -`High Granularity Quantization (HGQ) `_ is a library that performs gradient-based automatic bitwidth optimization and quantization-aware training algorithm for neural networks to be deployed on FPGAs. By laveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to per-weight and per-activation level. +`High Granularity Quantization (HGQ) `_ is a library that performs gradient-based automatic bitwidth optimization and quantization-aware training algorithm for neural networks to be deployed on FPGAs. By leveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to per-weight and per-activation level. .. image:: https://calad0i.github.io/HGQ/_images/overview.svg :alt: Overview of HGQ diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst index 41132ab619..302d646023 100644 --- a/docs/advanced/model_optimization.rst +++ b/docs/advanced/model_optimization.rst @@ -124,8 +124,8 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config: acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1)) print(f'Optimized Keras accuracy: {acc_optimized}') -There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation. 
-Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config: +There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilization and VivadoMultiObjectiveEstimator, aimed at optimizing BRAM and DSP utilization. +Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesizing HLS, by modifying the config: .. code-block:: Python diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst index c8bd61b59f..2f969603f6 100644 --- a/docs/api/configuration.rst +++ b/docs/api/configuration.rst @@ -124,7 +124,7 @@ There are a number of configuration options that you have. Let's go through the * **ProjectName**\ : the name of the HLS project IP that is produced * **KerasJson/KerasH5**\ : for Keras, the model architecture and weights are stored in a ``json`` and ``h5`` file. The path to those files are required here. We also support keras model's file obtained just from ``model.save()``. In this case you can just supply the ``h5`` file in ``KerasH5:`` field. -* **InputData/OutputPredictions**\ : path to your input/predictions of the model. If none is supplied, then hls4ml will create aritificial data for simulation. The data used above in the example can be found `here `__. We also support ``npy`` data files. We welcome suggestions on more input data types to support. +* **InputData/OutputPredictions**\ : path to your input/predictions of the model. If none is supplied, then hls4ml will create artificial data for simulation. The data used above in the example can be found `here `__. We also support ``npy`` data files. We welcome suggestions on more input data types to support. The backend-specific section of the configuration depends on the backend. You can get a starting point for the necessary settings using, for example `hls4ml.templates.get_backend('Vivado').create_initial_config()`. For Vivado backend the options are: @@ -134,12 +134,13 @@ For Vivado backend the options are: Then you have some optimization parameters for how your algorithm runs: * **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here `__. * **HLSConfig**\: the detailed configuration of precision and parallelism, including: + * **ReuseFactor**\ : in the case that you are pipelining, this defines the pipeline interval or initiation interval * **ParallelizationFactor**\ : The number of output "pixels" to compute in parallel in convolutional layers. Increasing this parameter results in significant increase in resources required on the FPGA. * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hl4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" or "unrolled" strategy. An example of using larger reuse factor can be found `here. `__ * **PipelineStyle**\ : Set the top level pipeline style. Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto". 
* **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network. - * **Precision**\ : this defines the precsion of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. + * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in fixed precision data type (\ ``ap_int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. You have a chance to further configure this more finely with per-layer configuration described below. 2.2 Per-Layer Configuration @@ -235,7 +236,7 @@ In your project, the file ``/firmware/.cpp`` is your top nnet::sigmoid(layer4_out, layer5_out); -You can see, for the simple 1-layer DNN, the computation (\ ``nnet::dense_latency``\ ) and activation (\ ``nnet::relu``\ /\ ``nnet::sigmoid``\ ) caluclation for each layer. For each layer, it has its own additional configuration parameters, e.g. ``config2``. +You can see, for the simple 1-layer DNN, the computation (\ ``nnet::dense_latency``\ ) and activation (\ ``nnet::relu``\ /\ ``nnet::sigmoid``\ ) calculation for each layer. For each layer, it has its own additional configuration parameters, e.g. ``config2``. In your project, the file ``/firmware/parameters.h`` stores all the configuration options for each neural network library. An example is `here `__. So for example, the detailed configuration options for an example DNN layer is: From 42cb368b2dcb4ca5b558089ac4549617f8ee731e Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst <59868635+bo3z@users.noreply.github.com> Date: Tue, 3 Dec 2024 11:15:09 +0100 Subject: [PATCH 07/36] Add video tutorial link --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e9515d52d..96984b35e4 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,9 @@ If you have any questions, comments, or ideas regarding hls4ml or just want to s # Documentation & Tutorial -For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/](https://fastmachinelearning.org/hls4ml/) +For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/](https://fastmachinelearning.org/hls4ml/). + +For introductory material on FPGAs, HLS and ML inferences using hls4ml, check out the [video](https://www.youtube.com/watch?v=2y3GNY4tf7A&ab_channel=SystemsGroupatETHZ%C3%BCrich). Detailed tutorials on how to use `hls4ml`'s various functionalities can be found [here](https://github.com/hls-fpga-machine-learning/hls4ml-tutorial). 

From fedf790b2fd04542e72145a1dd334b131361cb07 Mon Sep 17 00:00:00 2001
From: Jovan Mitrevski
Date: Wed, 4 Dec 2024 17:29:49 -0600
Subject: [PATCH 08/36] respond to some review comments and update some descriptions

---
 docs/api/auto.rst          | 22 ++++++++-----
 docs/api/configuration.rst | 66 ++++++++++++++++++++++++++------------
 docs/setup.rst             |  2 +-
 3 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/docs/api/auto.rst b/docs/api/auto.rst
index 9a18e70936..f944a11e54 100644
--- a/docs/api/auto.rst
+++ b/docs/api/auto.rst
@@ -2,15 +2,21 @@
 Automatic precision inference
 =============================

-The automatic precision inference (implemented in :py:class:`~hls4ml.model.optimizer.passes.infer_precision.InferPrecisionTypes`) attempts to infer the appropriate widths for a given precision.
-It is initiated by configuring a precision in the configuration as 'auto'. Functions like :py:class:`~hls4ml.utils.config.config_from_keras_model` and :py:class:`~hls4ml.utils.config.config_from_onnx_model`
-automatically set most precisions to 'auto' if the ``'name'`` granularity is used.
+The automatic precision inference (implemented in :py:class:`~hls4ml.model.optimizer.passes.infer_precision.InferPrecisionTypes`) attempts to infer the appropriate
+widths for a given precision. It is initiated by setting a precision in the configuration as ``'auto'``. (Note, only layer-level precisions can be set to ``'auto'``,
+not model-level.) Functions like :py:class:`~hls4ml.utils.config.config_from_keras_model`, :py:class:`~hls4ml.utils.config.config_from_onnx_model`,
+and :py:class:`~hls4ml.utils.config.config_from_pytorch_model` automatically set most precisions to ``'auto'`` if the ``'name'`` granularity is used.

 .. note::
   It is recommended to pass the backend to the ``config_from_*`` functions so that they can properly extract all the configurable precisions.

-The approach taken by the precision inference is to set accumulator and other precisions to never truncate, using only the bitwidths of the inputs (not the values). This is quite conservative,
-especially in cases where post-training quantization is used, or if the bit widths were set fairly loosely. The recommended action in that case is to edit the configuration and explicitly set
-some widths in it, potentially in an iterative process after seeing what precisions are automatically set. Another option, currently implemented in :py:class:`~hls4ml.utils.config.config_from_keras_model`,
-is to pass a maximum bitwidth using the ``max_precison`` option. Then the automatic precision inference will never set a bitwdith larger than the bitwidth or an integer part larger than the integer part of
-the ``max_precision`` that is passed. (The bitwidth and integer parts are treated separately.)
+The approach taken by the precision inference is to set accumulator (the internal variable used to accumulate values in the matrix multiplications) and other precisions
+to never truncate, using only the bitwidths of the inputs (not the values). This is quite conservative, especially in cases where post-training quantization is used, or
+if the bit widths were set fairly loosely. The recommended action in that case is to edit the configuration and explicitly set some widths in it, potentially in an iterative process
+after profiling the data. Another option is to pass a maximum precision using the ``max_precision`` parameter of the ``config_from_*`` functions. Then the automatic precision
+inference will never set a bitwidth larger than the bitwidth of the ``max_precision`` or an integer part larger than the integer part of the ``max_precision`` that is passed.
+(The bitwidth and integer parts of the ``max_precision`` are treated separately.)
+
+When manually setting bitwidths, the accumulator can overflow, and the precision may need to be reduced. For the accumulator, it is usually a bad idea to explicitly
+enable rounding or saturation modes since it dramatically increases the execution time. For other types (e.g. output types or weight types), however, rounding and saturation handling
+can be enabled as needed.
diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index 2f969603f6..6f7f04359b 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -34,20 +34,45 @@ Using hls4ml, you can quickly generate a simple configuration dictionary from a

     import hls4ml
     config = hls4ml.utils.config_from_keras_model(model, granularity='model')

-This python dictionary can be edited as needed. A more advanced configuration can be generated by, for example:
+This python dictionary can be edited as needed. More advanced configuration can be generated, for example for ONNX models:

 .. code-block:: python

     import hls4ml
-    config = hls4ml.utils.config_from_keras_model(
+    config = hls4ml.utils.config_from_onnx_model(
         model,
         granularity='name',
         default_precision='fixed<16,6>',
         backend='Vitis')

-This will include per-layer configuration based on the model. Including the backend is recommended because some configuration options depend on the backend. Note, the precisions at the
-higher granularites usually default to 'auto', which means that ``hls4ml`` will try to set it automatically (see :ref:`Automatic precision inference`). Note that higher granularity settings take precedence
-over model-level settings. See :py:class:`~hls4ml.utils.config.config_from_keras_model` for more information on the various options.
+for Keras models:
+
+.. code-block:: python
+
+    import hls4ml
+    config = hls4ml.utils.config_from_keras_model(
+        model,
+        granularity='name',
+        default_precision='fixed<16,6>',
+        backend='oneAPI')
+
+or for PyTorch models:
+
+.. code-block:: python
+
+    import hls4ml
+    config = hls4ml.utils.config_from_pytorch_model(
+        model,
+        granularity='name',
+        default_precision='fixed<16,6>',
+        backend='Catapult')
+
+
+The ``name`` granularity includes per-layer configuration based on the model. A ``'name'`` granularity is generally recommended because it allows for more tuning, and also because it allows
+for automatic setting of precisions. The layer-level precisions with the ``'name'`` granularity default to ``'auto'``, which means that hls4ml will try to set it automatically
+(see :ref:`Automatic precision inference`). Note that layer-level settings take precedence over model-level settings. A ``'name'`` granularity is required for QKeras
+and QONNX model parsing. Passing the backend to these functions is recommended because some configuration options depend on the backend. See :py:class:`~hls4ml.utils.config.config_from_keras_model`
+and similar for more information on the various options.

 One can override specific values before using the configuration:

@@ -59,7 +84,7 @@ Or to set the precision of a specific layer's weight:

 ..
code-block:: python - config['LayerName']['fc1']['Precision']['weight'] = 'ap_fixed<8,4>' + config['LayerName']['fc1']['Precision']['weight'] = 'fixed<8,4>' To better understand how the configuration hierachy works, refer to the next section for more details. @@ -75,7 +100,7 @@ Finally, one then uses the configuration to create an hls model: backend='Vitis' ) -See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. +See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch. ---- @@ -85,7 +110,7 @@ See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information 2.1 Top Level Configuration --------------------------- -Configuration files are YAML files in hls4ml (\ ``*.yml``\ ). An example configuration file is `here `__. +One can also use YAML configuration files in hls4ml (\ ``*.yml``\ ). An example configuration file is `here `__. It looks like this: @@ -108,7 +133,7 @@ It looks like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 Strategy: Latency LayerType: @@ -140,8 +165,7 @@ For Vivado backend the options are: * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hl4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" or "unrolled" strategy. An example of using larger reuse factor can be found `here. `__ * **PipelineStyle**\ : Set the top level pipeline style. Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto". * **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network. - * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. - Additionally, integers in fixed precision data type (\ ``ap_int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. You have a chance to further configure this more finely with per-layer configuration described below. + * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in the type (\ ``int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. The format follows ``ap_fixed`` and ``ap_int`` conventions. You have a chance to further configure this more finely with per-layer configuration described below. In the per-layer configuration (but not globally) one can also use ``'auto'`` precision. 2.2 Per-Layer Configuration --------------------------- @@ -154,10 +178,10 @@ Under the ``HLSConfig`` heading, these can be set for the ``Model``\ , per ``Lay HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 -This configuration use ``ap_fixed<16,6>`` for every variable and a ReuseFactor of 1 throughout. +This configuration use ``fixed<16,6>`` for every variable and a ReuseFactor of 1 throughout. 
Specify all ``Dense`` layers to use a different precision like this: @@ -165,13 +189,13 @@ Specify all ``Dense`` layers to use a different precision like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 LayerType: Dense: - Precision: ap_fixed<14,5> + Precision: fixed<14,5> -In this case, all variables in any ``Dense`` layers will be represented with ``ap_fixed<14,5>`` while any other layer types will use ``ap_fixed<16,6>``. +In this case, all variables in any ``Dense`` layers will be represented with ``fixed<14,5>`` while any other layer types will use ``fixed<16,6>``. A specific layer can be targeted like this: @@ -179,18 +203,18 @@ A specific layer can be targeted like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 16 LayerName: dense1: Precision: - weight: ap_fixed<14,2> - bias: ap_fixed<14,4> - result: ap_fixed<16,6> + weight: fixed<14,2> + bias: fixed<14,4> + result: fixed<16,6> ReuseFactor: 12 Strategy: Resource -In this case, the default model configuration will use ``ap_fixed<16,6>`` and a ``ReuseFactor`` of 16. The layer named ``dense1`` (defined in the user provided model architecture file) will instead use different precision for the ``weight``\ , ``bias``\ , and ``result`` (output) variables, a ``ReuseFactor`` of 12, and the ``Resource`` strategy (while the model default is ``Latency`` strategy. +In this case, the default model configuration will use ``fixed<16,6>`` and a ``ReuseFactor`` of 16. The layer named ``dense1`` (defined in the user provided model architecture file) will instead use different precision for the ``weight``\ , ``bias``\ , and ``result`` (output) variables, a ``ReuseFactor`` of 12, and the ``Resource`` strategy (while the model default is ``Latency`` strategy. More than one layer can have a configuration specified, e.g.: diff --git a/docs/setup.rst b/docs/setup.rst index 628f4ee69a..052b6d1fa7 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -64,7 +64,7 @@ To run FPGA synthesis, installation of following tools is required: * Intel Quartus 20.1 to 21.4 for the synthesis for Intel/Altera FPGAs using the ``Quartus`` backend. -* oneAPI 2024.1 to 2025.0 with the FPGA compiler and recent Intel/Altara Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend. +* oneAPI 2024.1 to 2025.0 with the FPGA compiler and recent Intel/Altera Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend. Catapult HLS 2024.1_1 or 2024.2 can be used to synthesize both for ASICs and FPGAs. From f28f3647fe3eb1b09f1e7f277cc3ff3641f6fc3a Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Thu, 5 Dec 2024 10:23:57 -0500 Subject: [PATCH 09/36] fix documentation of channels_last conversion for pytorch --- docs/api/configuration.rst | 3 ++- hls4ml/converters/__init__.py | 7 ++++--- hls4ml/utils/config.py | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst index 6f7f04359b..e5591f126b 100644 --- a/docs/api/configuration.rst +++ b/docs/api/configuration.rst @@ -72,7 +72,8 @@ The ``name`` granularity includes per-layer configuration based on the model. A for automatic setting of precisions. The layer-level precisions with the ``'name'`` granularity default to ``'auto'``, which means that hls4ml will try to set it automatically (see :ref:`Automatic precision inference`). Note that layer-level settings take precedence over model-level settings. A ``'name'`` granularity is required for QKeras and QONNX model parsing. 
Passing the backend to these functions is recommended because some configuration options depend on the backend. See :py:class:`~hls4ml.utils.config.config_from_keras_model`
-and similar for more information on the various options.
+and similar for more information on the various options. Note specifically the documentation of :py:class:`~hls4ml.utils.config.config_from_pytorch_model` on how to handle differences in input data
+formats between pytorch and keras (hls4ml follows keras conventions internally).

 One can override specific values before using the configuration:
diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
index 13e90df687..3d7ce1fe56 100644
--- a/hls4ml/converters/__init__.py
+++ b/hls4ml/converters/__init__.py
@@ -278,9 +278,10 @@ def convert_from_pytorch_model(
     Notes:
         Pytorch uses the "channels_first" data format for its tensors, while hls4ml expects the "channels_last" format
         used by keras. By default, hls4ml will automatically add layers to the model which transpose the inputs to the
-        "channels_last"format. Not that this is not supported for the "io_stream" io_type, for which the user will have
-        to transpose the input by hand before passing it to hls4ml. In that case the "inputs_channel_last" argument of
-        the "config_from_pytorch_model" function needs to be set to True. By default, the output of the model remains
+        "channels_last" format. Note that this is not supported for the "io_stream" io_type, for which the user will have
+        to transpose the input by hand before passing it to hls4ml. In that case the "channels_last_conversion" argument of
+        the "config_from_pytorch_model" function needs to be set to "internal". This argument can be used to completely
+        disable this internal conversion. By default, the output of the model remains
         in the "channels_last" data format. The "transpose_outputs" argument of the "config_from_pytorch_model" can be
         used to add a layer to the model that transposes back to "channels_first". As before, this will not work for
         io_stream.
diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py
index 1a25fb9c3f..4ca2f5e198 100644
--- a/hls4ml/utils/config.py
+++ b/hls4ml/utils/config.py
@@ -309,9 +309,10 @@ def config_from_pytorch_model(
             be an explicit precision: 'auto' is not allowed.
         default_reuse_factor (int, optional): Default reuse factor. Defaults to 1.
         channels_last_conversion (string, optional): Configures the conversion of pytorch layers to
-            'channels_last' dataformate. Can be set to 'full', 'internal', or 'off'. If 'full', both the inputs
-            and internal layers will be converted. If 'internal', only internal layers will be converted; this
-            assumes the inputs are converted by the user. If 'off', no conversion is performed.
+            'channels_last' data format used by hls4ml internally. Can be set to 'full', 'internal',
+            or 'off'. If 'full', both the inputs and internal layers will be converted. If 'internal',
+            only internal layers will be converted; this assumes the inputs are converted by the user.
+            If 'off', no conversion is performed.
         transpose_outputs (bool, optional): Set to 'False' if the output should not be transposed from channels_last
             into channels_first data format. Defaults to 'False'. If False, outputs needs to be transposed manually.
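
To make the channels-last handling described in the docstrings above concrete, here is a minimal conversion sketch. It assumes a PyTorch module named `model` with a 3-channel image input; the shape, backend and argument values are placeholders, and the exact keyword arguments should be checked against the installed hls4ml version.

```python
# Illustrative sketch: `model` is assumed to be a torch.nn.Module taking a 3x32x32 input.
import hls4ml

config = hls4ml.utils.config_from_pytorch_model(
    model,
    input_shape=(3, 32, 32),           # channels-first shape, excluding the batch dimension
    granularity='name',
    backend='Vitis',
    channels_last_conversion='full',   # hls4ml transposes the inputs and internal tensors
    transpose_outputs=False,           # leave the outputs in channels_last format
)

hls_model = hls4ml.converters.convert_from_pytorch_model(
    model,
    hls_config=config,
    backend='Vitis',
    io_type='io_parallel',
)
```

With `io_stream` the input transpose is not inserted automatically, so the input data would be transposed by hand and `channels_last_conversion='internal'` used instead.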

From e55b29caa493d0dbc7f97133750ece5ed07bcf6c Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Thu, 5 Dec 2024 10:35:36 -0500
Subject: [PATCH 10/36] slightly expand discussion of channels_last in pytorch

---
 hls4ml/utils/config.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py
index 4ca2f5e198..f5924ccf64 100644
--- a/hls4ml/utils/config.py
+++ b/hls4ml/utils/config.py
@@ -292,6 +292,15 @@ def config_from_pytorch_model(
     Users are advised to inspect the returned object to tweak the conversion configuration.
     The return object can be passed as `hls_config` parameter to `convert_from_pytorch_model`.

+    Note that hls4ml internally follows the keras convention for nested tensors known as
+    "channels last", whereas pytorch uses the "channels first" convention.
+    For example, for a tensor encoding an image with 3 channels, pytorch will expect the data
+    to be encoded as (Number_Of_Channels, Height, Width), whereas hls4ml expects
+    (Height, Width, Number_Of_Channels). By default, hls4ml will perform the necessary
+    conversions of the inputs and internal tensors automatically, but will return the output
+    in "channels last". However, this behavior can be controlled by the user using the
+    related arguments discussed below.
+
     Args:
         model: PyTorch model
         input_shape (tuple or list of tuples): The shape of the input tensor, excluding the batch size.

From 99e3be09e5acf11740c43cc01b181002d9166a72 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Thu, 5 Dec 2024 09:35:58 -0800
Subject: [PATCH 11/36] update requirements

---
 docs/requirements.txt | 1 -
 setup.cfg             | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 66aa579ea6..fe3c4f2544 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -4,5 +4,4 @@ sphinx>=3.2.1
 sphinx_contributors
 sphinx_github_changelog
 sphinx_rtd_theme
-tensorflow<=2.15
 toposort>=1.5.0
diff --git a/setup.cfg b/setup.cfg
index cc15eec49f..dc1075d9f3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,7 +33,7 @@ install_requires =
     tabulate
     tensorflow>=2.8.0,<=2.14.1
     tensorflow-model-optimization<=0.7.5
-python_requires = >=3.10, <=3.12
+python_requires = >=3.10, <3.12
 include_package_data = True
 scripts = scripts/hls4ml

From 96b530f04a7e3b485c81727b5bc19f5db42939be Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Thu, 5 Dec 2024 10:36:45 -0800
Subject: [PATCH 12/36] add pointwise documentation

---
 docs/ir/conv.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/ir/conv.rst b/docs/ir/conv.rst
index e2615b4795..293732d3b0 100644
--- a/docs/ir/conv.rst
+++ b/docs/ir/conv.rst
@@ -30,3 +30,9 @@ Depthwise convolutions

 Pointwise convolutions
 ======================
+
+Pointwise convolutions are a special case of convolution where the filter size is 1 for 1D or 1x1 for 2D.
+
+For the Xilinx backend, there is a dedicated io_parallel ``Latency`` strategy implementation of 1D pointwise convolutional layers integrated in `#881 `_.
+The reuse factor (RF) is used to split the layer execution and reuse the existing module RF times. The RF also limits the number of multipliers in each module.
+The initiation interval scales as the RF. One limitation is that it assumes ``in_width`` is divisible by the RF.
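
To show how the reuse factor interacts with this pointwise implementation, here is a small configuration sketch. The layer name `pointwise_conv` and the numbers are hypothetical; the one constraint carried over from the description above is that the RF should divide `in_width`.

```python
# Illustrative sketch: assumes a Keras model with a kernel-size-1 Conv1D layer named
# 'pointwise_conv' and in_width = 64, so ReuseFactor = 4 divides it evenly.
import hls4ml

config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vitis')
config['LayerName']['pointwise_conv']['ReuseFactor'] = 4  # split the layer into 4 reused passes

hls_model = hls4ml.converters.convert_from_keras_model(
    model,
    hls_config=config,
    backend='Vitis',
    io_type='io_parallel',  # the dedicated pointwise implementation targets io_parallel with the Latency strategy
)
```

A larger reuse factor uses fewer multipliers per module, at the cost of an initiation interval that scales with the RF.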
From a7b6f7997fec0c033c990893781c5298c39ea828 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Thu, 5 Dec 2024 10:37:34 -0800 Subject: [PATCH 13/36] update pointwise description --- docs/ir/conv.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ir/conv.rst b/docs/ir/conv.rst index 293732d3b0..f31e676e9d 100644 --- a/docs/ir/conv.rst +++ b/docs/ir/conv.rst @@ -33,6 +33,6 @@ Pointwise convolutions Pointwise convolutions are a special case of convolution where the filter size is 1 for 1D or 1x1 for 2D. -For the Xilinx backend, there is a dedicated io_parallel ``Latency`` strategy implementation of 1D pointwise convolutional layers integrated in `#881 `_. +For the Xilinx backend, there is a dedicated io_parallel ``Latency`` strategy implementation of 1D pointwise convolutional layers integrated in `#881 `_ developed for `arXiv:2402.01876 `_. The reuse factor (RF) is used to split the layer execution and reuse the existing module RF times. The RF also limits the number of multipliers in each module. The initiation interval scales as the RF. One limitation is that it assumes ``in_width`` is divisible by the RF. From 6af7fefbacd49373ab80f8c32ff723ae5bccc2c4 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 6 Dec 2024 16:06:58 +0100 Subject: [PATCH 14/36] Add FAQ to docs and readme --- README.md | 4 ++++ docs/faq.rst | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 3 files changed, 57 insertions(+) create mode 100644 docs/faq.rst diff --git a/README.md b/README.md index 96984b35e4..8d97bda3b6 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,10 @@ hls_model.build() hls4ml.report.read_vivado_report('my-hls-test') ``` +# FAQ + +List of frequently asked questions and common HLS synthesis can be found [here](https://fastmachinelearning.org/hls4ml/faq.html) + # Citation If you use this software in a publication, please cite the software ```bibtex diff --git a/docs/faq.rst b/docs/faq.rst new file mode 100644 index 0000000000..22b4c6c99a --- /dev/null +++ b/docs/faq.rst @@ -0,0 +1,52 @@ +Frequently asked questions +========================== + +**What is hls4ml?** + +``hls4ml`` is a tool for converting neural network models into FPGA firmware. hls4ml is aimed at low-latency applications, such as triggering at the Large Hadron Collider (LHC) at CERN, but is applicable to other domains requiring microsecond latency. See the full documentation for more details. + +**How does hls4ml work?** + +``hls4ml`` takes the models from Keras, PyTorch and ONNX (optionally quantized with the respective quantization libraries) and produces high-level synthesis code (based on C++) that can be converted to FPGA firmware using the HLS compilers from different vendors (AMD/Xilinx, Intel/Altera, Catapult...). + +**How is hls4ml so fast?** + +``hls4ml`` stores all weights on-chip for fast access and has tuneable parallelism. As a consequence, the size of the model that can be successfully converted into firmware with hls4ml largely depends on the amount of available resources on the target FPGA. Therefore it is highly recommended to compress the model with quantization (via QKeras or HGQ for Keras or Brevitas for PyTorch) and pruning. Additionally, ``hls4ml`` exploits the parallelism available in an FPGA or ASIC by implementing a spatial dataflow architecture. + +**Will my model work with hls4ml?** + +``hls4ml`` supports many common layers found in MLP, CNN and RNN architectures, however some seldom-used features of these layers may not be supported. 
Novel architectures such as graph networks or transformers are in various stages of development and are currently not stable for end-users. See the status and features page for more information. Models with custom layers can be supported through extension API. If you encounter a feature not yet supported, open a new issue. + +**Will my model with X parameters fit an FPGA model Y?** + +It depends. ``hls4ml`` has been successfully used with quantized models with `O` (10k) parameters, while for some architectures going beyond `O` (1000) parameters is not doable even on the largest FPGAs. The number of parameters of a model is generally not a good estimate of the performance on an FPGA as the computational complexity of different types of NN layers has big effects on the resource consumption on an FPGA. For example, a CNN or GNN may reuse the same parameter in many operations. Furthermore, model compression in the form of quantization and pruning can significantly change the footprint of the model on the FPGA. For these reasons, we discourage the use of this metric for estimating performance. + +If you're looking for a quick estimate of the resource usage and latency for a given model without synthesis, look into `rule4ml `_ and `wa-hls4ml `_ projects. + +LLMs and large vision transformers are not supported nor planned. + +**How do I get started with hls4ml?** + +We strongly recommend interested users unfamiliar with FPGAs or model compression techniques to review the `hls4ml tutorials `_ to get an overview of the features and conversion workflow. + +**How do I contribute to hls4ml development?** + +We're always welcoming new contributions. If you have an interesting feature in mind feel free to start a new discussion thread with your proposal. We also have regular meetings online to discuss the status of developments where you can be invited to present your work. To receive announcements, `request to be added to our CERN e-group `_. Furthermore, check the `CONTRIBUTING `_ document for a set of technical requirements for making contributions to the hls4ml project. + + +Common HLS synthesis issues +*************************** + +**Stop unrolling loop ... because it may cause large runtime and excessive memory usage due to increase in code size.** + +This error is common with models that are too large to fit on the FPGA given the ``IOType`` used. If you are using ``io_parallel``, consider switching to ``io_stream``, which prevents unrolling all arrays. It may help to also use the ``Resource`` strategy. Pruning or quantizing the model may not help as it is related to the size of the loops. If possible, try to reduce the number of neurons/filters of your model to reduce the size of the activation tensors and thus number of iterations of loops. + +**cannot open shared object file ...: No such file or directory.** + +This is usually an indication that the compilation failed due to incorrect HLS code being produced. It is most likely a bug in hls4ml. Please open a bug report. Note that the displayed error message may be the same but the cause can be different. Unless you're sure that the existing bug reports show the same underlying issue, it is better to open a separate bug report. + +**My hls4ml predictions don't match the original Keras/PyTorch/ONNX ones** + +``hls4ml`` uses fixed-point precision types to represent internal data structures, unlike the floating-point precision types used for computation in upstream ML toolkits. 
If the used bit width is not sufficiently wide, you may encounter issues with computation accuracy that propagates through the layers. This is especially true for models that are not fully quantized, or models with insufficient ``accum_t`` bitwidth. Look into automatic precision inference and profiling tools to resolve the issue. + +Note that bit-exact behavior is not always possible, as many math functions (used by activation functions) are approximated with lookup tables. diff --git a/docs/index.rst b/docs/index.rst index 950971fd26..d0976a67aa 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,6 +5,7 @@ concepts status setup + faq release_notes reference From eac61dd1478f47fe1947160de0e0de678a9fa103 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 6 Dec 2024 16:12:18 +0100 Subject: [PATCH 15/36] Nicer link to the tutorial --- docs/index.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index d0976a67aa..4a830de675 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -75,6 +75,4 @@ For the latest status including current and planned features, see the :ref:`Stat Tutorials ================================= -Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found at: - -https://github.com/fastmachinelearning/hls4ml-tutorial +Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found `here `_. From c65e9152a63191e8e0975ab33a1ba4fb74651b5f Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Fri, 6 Dec 2024 10:48:15 -0500 Subject: [PATCH 16/36] add doc strings to pytorch-specific padding calculation functions --- hls4ml/converters/utils.py | 39 +++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/hls4ml/converters/utils.py b/hls4ml/converters/utils.py index d1c9e050d5..f365916b55 100644 --- a/hls4ml/converters/utils.py +++ b/hls4ml/converters/utils.py @@ -45,7 +45,7 @@ def compute_padding_1d(pad_type, in_size, stride, filt_size): is odd, it will add the extra column to the right. Args: - pad_type (str): Padding type, one of ``same``, `valid`` or ``causal`` (case insensitive). + pad_type (str): Padding type, one of ``same``, ``valid`` or ``causal`` (case insensitive). in_size (int): Input size. stride (int): Stride length. filt_size (int): Length of the kernel window. @@ -135,6 +135,23 @@ def compute_padding_2d(pad_type, in_height, in_width, stride_height, stride_widt def compute_padding_1d_pytorch(pad_type, in_size, stride, filt_size, dilation): + """Computes the amount of padding required on each side of the 1D input tensor following pytorch conventions. + + In case of ``same`` padding, this routine tries to pad evenly left and right, but if the amount of columns to be added + is odd, it will add the extra column to the right. + + Args: + pad_type (str or int): Padding type. If string, one of ``same``, ``valid`` or ``causal`` (case insensitive). + in_size (int): Input size. + stride (int): Stride length. + filt_size (int): Length of the kernel window. + + Raises: + Exception: Raised if the padding type is unknown. + + Returns: + tuple: Tuple containing the padded input size, left and right padding values. 
+ """ if isinstance(pad_type, str): if pad_type.lower() == 'same': n_out = int( @@ -176,6 +193,26 @@ def compute_padding_1d_pytorch(pad_type, in_size, stride, filt_size, dilation): def compute_padding_2d_pytorch( pad_type, in_height, in_width, stride_height, stride_width, filt_height, filt_width, dilation_height, dilation_width ): + """Computes the amount of padding required on each side of the 2D input tensor following pytorch conventions. + + In case of ``same`` padding, this routine tries to pad evenly left and right (top and bottom), but if the amount of + columns to be added is odd, it will add the extra column to the right/bottom. + + Args: + pad_type (str or int): Padding type. If string, one of ``same`` or ``valid`` (case insensitive). + in_height (int): The height of the input tensor. + in_width (int): The width of the input tensor. + stride_height (int): Stride height. + stride_width (int): Stride width. + filt_height (int): Height of the kernel window. + filt_width (int): Width of the kernel window. + + Raises: + Exception: Raised if the padding type is unknown. + + Returns: + tuple: Tuple containing the padded input height, width, and top, bottom, left and right padding values. + """ if isinstance(pad_type, str): if pad_type.lower() == 'same': # Height From 4fc1ea9d331ae0a30064d79fa3edd0bdf9858431 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Fri, 6 Dec 2024 11:14:49 -0500 Subject: [PATCH 17/36] clarify default for channels last conversion in pytorch --- hls4ml/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index f5924ccf64..e450084095 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -318,7 +318,7 @@ def config_from_pytorch_model( be an explicit precision: 'auto' is not allowed. default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. channels_last_conversion (string, optional): Configures the conversion of pytorch layers to - 'channels_last' data format used by hls4ml internally. Can be set to 'full', 'internal', + 'channels_last' data format used by hls4ml internally. Can be set to 'full' (default), 'internal', or 'off'. If 'full', both the inputs and internal layers will be converted. If 'internal', only internal layers will be converted; this assumes the inputs are converted by the user. If 'off', no conversion is performed. 
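As a worked illustration of the "same" padding bookkeeping described in the docstrings above (this is not the library routine itself, just the usual convention of padding evenly and giving any leftover column to the right):

```Python
import math

def same_padding_1d(in_size, stride, filt_size, dilation=1):
    # Illustrative only: the output size rounds up, the total padding is split evenly,
    # and an odd leftover column goes to the right.
    effective_filt = dilation * (filt_size - 1) + 1
    n_out = math.ceil(in_size / stride)
    total_pad = max((n_out - 1) * stride + effective_filt - in_size, 0)
    pad_left = total_pad // 2
    pad_right = total_pad - pad_left
    return in_size + total_pad, pad_left, pad_right

print(same_padding_1d(10, 1, 3))  # (12, 1, 1) -> one column of padding on each side
print(same_padding_1d(10, 2, 3))  # (11, 0, 1) -> the single extra column goes to the right
```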
From 548c462224439448e6a647b6ccf6a315593bbefe Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 6 Dec 2024 22:00:43 +0100 Subject: [PATCH 18/36] Restructure documentation --- docs/{api => advanced}/auto.rst | 0 docs/{api => advanced}/profiling.rst | 0 docs/{advanced => api}/command.rst | 2 +- docs/api/concepts.rst | 78 + docs/api/configuration.rst | 2 +- docs/api/details.rst | 33 - docs/{advanced => backend}/accelerator.rst | 8 +- docs/backend/catapult.rst | 7 + docs/{advanced => backend}/oneapi.rst | 6 +- docs/backend/quartus.rst | 8 + docs/backend/sr.rst | 7 + docs/backend/vitis.rst | 7 + docs/concepts.rst | 69 - docs/frontend/keras.rst | 11 + docs/frontend/pytorch.rst | 9 + docs/{advanced => frontend}/qonnx.rst | 0 docs/index.rst | 60 +- docs/{ => intro}/faq.rst | 0 docs/intro/introduction.rst | 30 + docs/{ => intro}/reference.rst | 0 docs/{ => intro}/release_notes.rst | 0 docs/{ => intro}/setup.rst | 6 +- docs/{ => intro}/status.rst | 0 docs/ir/activations.rst | 14 - docs/ir/attributes.rst | 2802 +++++++++++++++++ docs/ir/conv.rst | 38 - docs/ir/dense.rst | 25 - docs/{advanced => ir}/flows.rst | 11 - docs/ir/ir.rst | 90 + docs/{api/hls-model.rst => ir/modelgraph.rst} | 24 +- 30 files changed, 3114 insertions(+), 233 deletions(-) rename docs/{api => advanced}/auto.rst (100%) rename docs/{api => advanced}/profiling.rst (100%) rename docs/{advanced => api}/command.rst (97%) create mode 100644 docs/api/concepts.rst delete mode 100644 docs/api/details.rst rename docs/{advanced => backend}/accelerator.rst (96%) create mode 100644 docs/backend/catapult.rst rename docs/{advanced => backend}/oneapi.rst (98%) create mode 100644 docs/backend/quartus.rst create mode 100644 docs/backend/sr.rst create mode 100644 docs/backend/vitis.rst delete mode 100644 docs/concepts.rst create mode 100644 docs/frontend/keras.rst create mode 100644 docs/frontend/pytorch.rst rename docs/{advanced => frontend}/qonnx.rst (100%) rename docs/{ => intro}/faq.rst (100%) create mode 100644 docs/intro/introduction.rst rename docs/{ => intro}/reference.rst (100%) rename docs/{ => intro}/release_notes.rst (100%) rename docs/{ => intro}/setup.rst (97%) rename docs/{ => intro}/status.rst (100%) delete mode 100644 docs/ir/activations.rst create mode 100644 docs/ir/attributes.rst delete mode 100644 docs/ir/conv.rst delete mode 100644 docs/ir/dense.rst rename docs/{advanced => ir}/flows.rst (84%) create mode 100644 docs/ir/ir.rst rename docs/{api/hls-model.rst => ir/modelgraph.rst} (58%) diff --git a/docs/api/auto.rst b/docs/advanced/auto.rst similarity index 100% rename from docs/api/auto.rst rename to docs/advanced/auto.rst diff --git a/docs/api/profiling.rst b/docs/advanced/profiling.rst similarity index 100% rename from docs/api/profiling.rst rename to docs/advanced/profiling.rst diff --git a/docs/advanced/command.rst b/docs/api/command.rst similarity index 97% rename from docs/advanced/command.rst rename to docs/api/command.rst index 67f7e3fe2f..1f821b7f35 100644 --- a/docs/advanced/command.rst +++ b/docs/api/command.rst @@ -50,7 +50,7 @@ hls4ml config hls4ml config [-h] [-m MODEL] [-w WEIGHTS] [-o OUTPUT] -This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup <../setup>` page for more details on how to write a configuration file. +This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup <../intro/setup>` page for more details on how to write a configuration file. 
**Arguments** diff --git a/docs/api/concepts.rst b/docs/api/concepts.rst new file mode 100644 index 0000000000..9087470cf3 --- /dev/null +++ b/docs/api/concepts.rst @@ -0,0 +1,78 @@ +======== +Concepts +======== + +How it Works +---------------------- + +.. image:: ../img/nn_map_paper_fig_2.png + :width: 70% + :align: center + + +Consider a multilayer neural network. At each neuron in a layer :math:`m` (containing :math:`N_m` neurons), we calculate an output value (part of the output vector :math:`\mathbf{x}_m` of said layer) using the sum of output values of the previous layer multiplied by independent weights for each of these values and a bias value. An activation function is performed on the result to get the final output value for the neuron. Representing the weights as a :math:`N_m` by :math:`N_{m-1}` matrix :math:`W_{m,m-1}`, the bias values as :math:`\mathbf{b}_m`, and the activation function as :math:`g_m`, we can express this compactly as: + + +.. math:: + + \mathbf{x}_m = g_m (W_{m,m-1} \mathbf{x}_{m-1} +\mathbf{b}_m) + +With hls4ml, each layer of output values is calculated independently in sequence, using pipelining to speed up the process by accepting new inputs after an initiation interval. +The activations, if nontrivial, are precomputed. + +To ensure optimal performance, the user can control aspects of their model, principally: + + +* **Size/Compression** - Though not explicitly part of the ``hls4ml`` package, this is an important optimization to efficiently use the FPGA resources +* **Precision** - Define the :doc:`precision <../advanced/profiling>` of the calculations in your model +* **Dataflow/Resource Reuse** - Control parallel or streaming model implementations with varying levels of pipelining +* **Quantization Aware Training** - Achieve best performance at low precision with tools like QKeras, and benefit automatically during inference with ``hls4ml`` parsing of QKeras models + + +.. image:: ../img/reuse_factor_paper_fig_8.png + :width: 70% + :align: center + + +Often, these decisions will be hardware dependent to maximize performance. +Of note is that simplifying the input network must be done before using ``hls4ml`` to generate HLS code, for optimal compression to provide a sizable speedup. +Also important to note is the use of fixed point arithmetic in ``hls4ml``. +This improves processing speed relative to floating point implementations. +The ``hls4ml`` package also offers the functionality of configuring binning and output bit width of the precomputed activation functions as necessary. With respect to parallelization and resource reuse, ``hls4ml`` offers a "reuse factor" parameter that determines the number of times each multiplier is used in order to compute a layer of neuron's values. Therefore, a reuse factor of one would split the computation so each multiplier had to only perform one multiplication in the computation of the output values of a layer, as shown above. Conversely, a reuse factor of four, in this case, uses a single multiplier four times sequentially. Low reuse factor achieves the lowest latency and highest throughput but uses the most resources, while high reuse factor save resources at the expense of longer latency and lower throughput. + + +Frontends and Backends +---------------------- + +``hls4ml`` has a concept of a **frontend** that parses the input NN into an internal model graph, and a **backend** that controls +what type of output is produced from the graph. Frontends and backends can be independently chosen. 
Examples of frontends are the parsers for Keras or ONNX, and examples of backends are Vivado HLS, Intel HLS, and Vitis HLS. See :ref:`Status and Features` for the
+currently supported frontends and backends or the dedicated sections for each frontend/backend.
+
+
+I/O Types
+---------
+
+``hls4ml`` supports multiple styles for handling data transfer to/from the network and between layers, known as the ``io_type``.
+
+io_parallel
+^^^^^^^^^^^
+In this processing style, data is passed in parallel between the layers. Conceptually this corresponds to a C/C++ array where all elements can be accessed at any time. This style allows for maximum parallelism and is well suited for MLP networks and small CNNs which aim for lowest latency. Due to the impact of parallel processing on resource utilization on FPGAs, the synthesis may fail for larger networks.
+
+io_stream
+^^^^^^^^^
+As opposed to the parallel processing style, in ``io_stream`` mode data is passed one "pixel" at a time. Each pixel is an array of channels, which are always sent in parallel. This method for sending data between layers is recommended for larger CNN and RNN networks. For one-dimensional ``Dense`` layers, all the inputs are streamed in parallel as a single array.
+
+With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers.
+The implementation of the FIFO buffers contributes to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization.
+Because neural networks can generally have complex architectures, it is hard to know a priori the correct depth of each FIFO buffer.
+By default ``hls4ml`` chooses the most conservative possible depth for each FIFO buffer, which can result in an unnecessary overutilization of resources.
+
+In order to reduce the impact on the resources used for FIFO buffer implementation, we have a FIFO depth optimization flow. This is described
+in the :ref:`FIFO Buffer Depth Optimization` section.
+
+
+Strategy
+---------
+
+**Strategy** in ``hls4ml`` refers to the implementation of the core matrix-vector multiplication routine, which can be latency-oriented, resource-saving oriented, or specialized. Different strategies will have an impact on the overall latency and resource consumption of each layer, and users are advised to choose based on their design goals. The availability of a particular strategy for a layer varies across backends, see the :doc:`Attributes <../ir/attributes>` section for a complete list of available strategies per-layer and per-backend.
diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index e5591f126b..1bc8f0676c 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -232,7 +232,7 @@ More than one layer can have a configuration specified, e.g.:
     dense2:
         ...

-For more information on the optimization parameters and what they mean, you can visit the :doc:`Concepts <../concepts>` chapter.
+For more information on the optimization parameters and what they mean, you can visit the :doc:`Concepts <../api/concepts>` section.
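As a quick sketch of how these concepts map onto the conversion configuration (the model file name below is hypothetical, and the individual keys are described in the Configuration section):

```Python
import hls4ml
from tensorflow.keras.models import load_model

model = load_model('my_model.h5')  # hypothetical trained Keras model

config = hls4ml.utils.config_from_keras_model(model, granularity='model', backend='Vitis')

# Trade latency for resources: the Resource strategy with a higher reuse factor
# time-multiplexes the multipliers instead of fully unrolling the layer.
config['Model']['Strategy'] = 'Resource'
config['Model']['ReuseFactor'] = 16

# io_stream connects layers through FIFO buffers and is recommended for larger CNNs/RNNs
hls_model = hls4ml.converters.convert_from_keras_model(
    model, hls_config=config, io_type='io_stream', backend='Vitis', output_dir='my-hls-test'
)
```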
---- diff --git a/docs/api/details.rst b/docs/api/details.rst deleted file mode 100644 index 750833001d..0000000000 --- a/docs/api/details.rst +++ /dev/null @@ -1,33 +0,0 @@ -================ -Software Details -================ - -Frontends and Backends ----------------------- - -In ``hls4ml`` there is a a concept of a *frontend* to parse the input NN into an internal model graph, and a *backend* that controls -what type of output is produced from the graph. Frontends and backends can be independently chosen. Examples of frontends are the -parsers for Keras or ONNX, and examples of backends are Vivado HLS, Intel HLS, and Vitis HLS. See :ref:`Status and Features` for the -currently supported frontends and backends. - -I/O Types ---------- - -``hls4ml`` supports multiple styles for handling data between layers, known as the ``io_type``. - -io_parallel -^^^^^^^^^^^ -Data is passed in parallel between the layers. This is good for MLP networks and small CNNs. Synthesis may fail for larger networks. - -io_stream -^^^^^^^^^ -Data is passed one "pixel" at a time. Each pixel is an array of channels, which are always sent in parallel. This method for sending -data between layers is recommended for larger CNNs. For ``Dense`` layers, all the inputs are streamed in parallel as a single array. - -With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers. -The implementation of the FIFO buffers contribute to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization. -Because the neural networks can have complex architectures generally, it is hard to know a priori the correct depth of each FIFO buffer. -By default ``hls4ml`` choses the most conservative possible depth for each FIFO buffer, which can result in a an unnecessary overutilization of resources. - -In order to reduce the impact on the resources used for FIFO buffer implementation, we have a FIFO depth optimization flow. This is described -in the :ref:`FIFO Buffer Depth Optimization` section. diff --git a/docs/advanced/accelerator.rst b/docs/backend/accelerator.rst similarity index 96% rename from docs/advanced/accelerator.rst rename to docs/backend/accelerator.rst index da945509d5..187bccaa2c 100644 --- a/docs/advanced/accelerator.rst +++ b/docs/backend/accelerator.rst @@ -1,8 +1,8 @@ -========================= -VivadoAccelerator Backend -========================= +================= +VivadoAccelerator +================= -The ``VivadoAccelerator`` backend of ``hls4ml`` leverages the `PYNQ `_ software stack to easily deploy models on supported devices. +The **VivadoAccelerator** backend of ``hls4ml`` leverages the `PYNQ `_ software stack to easily deploy models on supported devices. Currently ``hls4ml`` supports the following boards: * `pynq-z2 `_ (part: ``xc7z020clg400-1``) diff --git a/docs/backend/catapult.rst b/docs/backend/catapult.rst new file mode 100644 index 0000000000..00cf0fb98b --- /dev/null +++ b/docs/backend/catapult.rst @@ -0,0 +1,7 @@ +======== +Catapult +======== + +Support for Siemens Catapult HLS compiler has been added in ``hls4ml`` version 1.0.0. 
+ +*TODO expand this section* diff --git a/docs/advanced/oneapi.rst b/docs/backend/oneapi.rst similarity index 98% rename from docs/advanced/oneapi.rst rename to docs/backend/oneapi.rst index fb926409eb..3ee65fb41c 100644 --- a/docs/advanced/oneapi.rst +++ b/docs/backend/oneapi.rst @@ -1,6 +1,6 @@ -============== -oneAPI Backend -============== +====== +oneAPI +====== The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually replace the ``Quartus`` backend, which targeted Intel HLS. (Quartus continues to be used with IP produced by the diff --git a/docs/backend/quartus.rst b/docs/backend/quartus.rst new file mode 100644 index 0000000000..225ce2e12c --- /dev/null +++ b/docs/backend/quartus.rst @@ -0,0 +1,8 @@ +======= +Quartus +======= + +.. warning:: + Quartus backend is deprecated and will be removed in a future version. Users should migrate to oneAPI backend. + +*TODO expand this section* diff --git a/docs/backend/sr.rst b/docs/backend/sr.rst new file mode 100644 index 0000000000..93a247b63d --- /dev/null +++ b/docs/backend/sr.rst @@ -0,0 +1,7 @@ +================== +SymbolicExpression +================== + +This backend can be used to implement expressions obtained through symbolic regression tools such as `PySR `_ or `SymbolNet `_. The backend targets Vivado/Vitis HLS and relies on HLS math libraries provided with a licensed installation of these tools. + +*TODO expand this section* diff --git a/docs/backend/vitis.rst b/docs/backend/vitis.rst new file mode 100644 index 0000000000..17d87763a7 --- /dev/null +++ b/docs/backend/vitis.rst @@ -0,0 +1,7 @@ +============ +Vivado/Vitis +============ + +``Vivado`` and ``Vitis`` backends are aimed for use with Xilinx FPGAs. They are currently the most advanced and well-supported backends of ``hls4ml``. + +*TODO expand this section* diff --git a/docs/concepts.rst b/docs/concepts.rst deleted file mode 100644 index b788d5ba5d..0000000000 --- a/docs/concepts.rst +++ /dev/null @@ -1,69 +0,0 @@ -======== -Concepts -======== - -The goal of ``hls4ml`` is to provide an efficient and fast translation of machine learning models from open-source packages (like Keras and PyTorch) for training machine learning algorithms to high level synthesis (HLS) code that can then be transpiled to run on an FPGA. The resulting HLS project can be then used to produce an IP which can be plugged into more complex designs or be used to create a kernel for CPU co-processing. The user has freedom to define many of the parameters of their algorithm to best suit their needs. - -The ``hls4ml`` package enables fast prototyping of a machine learning algorithm implementation in FPGAs, -greatly reducing the time to results and giving the user intuition for how to best design a machine learning algorithm for their application while balancing performance, resource utilization and latency requirements. - -The Inspiration -=============== - -The inspiration for the creation of the ``hls4ml`` package stems from the high energy physics community at the CERN Large Hadron Collider (LHC). -While machine learning has already been proven to be extremely useful in analysis of data from detectors at the LHC, it is typically performed in an "offline" environment after the data is taken and agglomerated. -However, one of the largest problems at detectors on the LHC is that collisions, or "events", generate too much data for everything to be saved. -As such, filters called "triggers" are used to determine whether a given event should be kept. 
-Using FPGAs allows for significantly lower latency so machine learning algorithms can essentially be run "live" at the detector level for event selection. As a result, more events with potential signs of new physics can be preserved for analysis. - -The Solution: ``hls4ml`` -======================== - -.. image:: img/overview.jpg - - -With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU. -An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility. -FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions on the microsecond scale for latency. -The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping. - -How it Works -============= - -.. image:: img/nn_map_paper_fig_2.png - :width: 70% - :align: center - - -Consider a multilayer neural network. At each neuron in a layer :math:`m` (containing :math:`N_m` neurons), we calculate an output value (part of the output vector :math:`\mathbf{x}_m` of said layer) using the sum of output values of the previous layer multiplied by independent weights for each of these values and a bias value. An activation function is performed on the result to get the final output value for the neuron. Representing the weights as a :math:`N_m` by :math:`N_{m-1}` matrix :math:`W_{m,m-1}`, the bias values as :math:`\mathbf{b}_m`, and the activation function as :math:`g_m`, we can express this compactly as: - - -.. math:: - - \mathbf{x}_m = g_m (W_{m,m-1} \mathbf{x}_{m-1} +\mathbf{b}_m) - -With hls4ml, each layer of output values is calculated independently in sequence, using pipelining to speed up the process by accepting new inputs after an initiation interval. -The activations, if nontrivial, are precomputed. - -To ensure optimal performance, the user can control aspects of their model, principally: - - -* **Size/Compression** - Though not explicitly part of the ``hls4ml`` package, this is an important optimization to efficiently use the FPGA resources -* **Precision** - Define the :doc:`precision ` of the calculations in your model -* **Dataflow/Resource Reuse** - Control parallel or streaming model implementations with varying levels of pipelining -* **Quantization Aware Training** - Achieve best performance at low precision with tools like QKeras, and benefit automatically during inference with ``hls4ml`` parsing of QKeras models - - -.. image:: img/reuse_factor_paper_fig_8.png - :width: 70% - :align: center - - -Often, these decisions will be hardware dependent to maximize performance. -Of note is that simplifying the input network must be done before using ``hls4ml`` to generate HLS code, for optimal compression to provide a sizable speedup. -Also important to note is the use of fixed point arithmetic in ``hls4ml``. -This improves processing speed relative to floating point implementations. -The ``hls4ml`` package also offers the functionality of configuring binning and output bit width of the precomputed activation functions as necessary. 
With respect to parallelization and resource reuse, ``hls4ml`` offers a "reuse factor" parameter that determines the number of times each multiplier is used in order to compute a layer of neuron's values. Therefore, a reuse factor of one would split the computation so each multiplier had to only perform one multiplication in the computation of the output values of a layer, as shown above. Conversely, a reuse factor of four, in this case, uses a single multiplier four times sequentially. Low reuse factor achieves the lowest latency and highest throughput but uses the most resources, while high reuse factor save resources at the expense of longer latency and lower throughput.
-The reuse factor can be set using the configuration options defined on the :doc:`Setup ` page.
-
-Thereby, the ``hls4ml`` package builds efficient HLS code to implement neural networks on FPGAs for microsecond-scale latency on predictions. For more detailed information, take a look at our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_.
diff --git a/docs/frontend/keras.rst b/docs/frontend/keras.rst
new file mode 100644
index 0000000000..d6d42cb4b8
--- /dev/null
+++ b/docs/frontend/keras.rst
@@ -0,0 +1,11 @@
+================
+Keras and QKeras
+================
+
+Keras and the quantization library QKeras are well supported in ``hls4ml``. Currently, Keras v2 (``tf.keras``) is the preferred version, and future versions of ``hls4ml`` will expand support for Keras v3. The frontend is based on parsing the serialized JSON representation of the model.
+
+Currently, ``hls4ml`` can parse most Keras layers, including core layers, convolutional layers, pooling layers, recurrent layers, merging/reshaping layers and activation layers, implemented either via the sequential or functional API. Notably missing are the attention and normalization layers. The equivalent QKeras API and quantizers are also supported. ``Lambda`` layers don't save their state in the serialized format and are thus impossible to parse. In this case, ``Lambda`` layers can be implemented as custom layers and parsed via the :ref:`Extension API`.
+
+The ``data_format='channels_first'`` parameter of Keras layers is supported, but not extensively tested. All HLS implementations in ``hls4ml`` are based on the ``channels_last`` data format, so models need to be converted to that format before the HLS code can be emitted. We encourage users of ``channels_first`` to report their experiences to developers on GitHub.
+
+The development team of ``hls4ml`` is currently exploring options for a QKeras alternative and will provide a drop-in replacement API compatible with Keras v3.
diff --git a/docs/frontend/pytorch.rst b/docs/frontend/pytorch.rst
new file mode 100644
index 0000000000..923c17f438
--- /dev/null
+++ b/docs/frontend/pytorch.rst
@@ -0,0 +1,9 @@
+====================
+PyTorch and Brevitas
+====================
+
+The PyTorch frontend in ``hls4ml`` is implemented by parsing the symbolic trace produced by the ``torch.fx`` framework. This ensures that the proper execution graph is captured. Therefore, only models that can be traced with the FX framework can be parsed by ``hls4ml``.
+
+The PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set as the Keras parser. Feel free to reach out to developers if you find a missing feature that is present in the Keras parser and would like it implemented.
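Since only FX-traceable models can be parsed, a quick way to check a model before attempting conversion is to trace it directly. A small sketch (the model here is made up for illustration):

```Python
import torch
import torch.nn as nn
from torch.fx import symbolic_trace

class TinyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 32)
        self.fc2 = nn.Linear(32, 5)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

# symbolic_trace raises an error for models with unsupported (e.g. data-dependent) control flow
graph_module = symbolic_trace(TinyMLP())
print(graph_module.graph)  # the captured execution graph that the parser walks
```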
+ +The equivalent of Keras extension API is not yet available for PyTorch parser, and will be provided in the future. diff --git a/docs/advanced/qonnx.rst b/docs/frontend/qonnx.rst similarity index 100% rename from docs/advanced/qonnx.rst rename to docs/frontend/qonnx.rst diff --git a/docs/index.rst b/docs/index.rst index 4a830de675..62003f662b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,46 +2,66 @@ :hidden: :caption: Introduction - concepts - status - setup - faq - release_notes - reference + intro/introduction + intro/status + intro/setup + intro/faq + intro/release_notes + intro/reference .. toctree:: :hidden: :glob: - :caption: Quick API Reference + :caption: User Guide + api/concepts api/configuration - api/auto - api/details - api/hls-model - api/profiling + api/command .. toctree:: :hidden: :glob: - :caption: Internal Layers + :caption: Frontends - ir/dense - ir/activations - ir/conv + frontend/keras + frontend/pytorch + frontend/qonnx + +.. toctree:: + :hidden: + :glob: + :caption: Backends + + backend/vitis + backend/accelerator + backend/oneapi + backend/catapult + backend/quartus + backend/sr .. toctree:: :hidden: :caption: Advanced Features - advanced/flows + advanced/profiling + advanced/auto advanced/hgq - advanced/qonnx advanced/fifo_depth advanced/extension - advanced/oneapi - advanced/accelerator advanced/model_optimization - advanced/command + +.. toctree:: + :hidden: + :glob: + :caption: Internals + + ir/ir + ir/modelgraph + ir/flows + ir/dense + ir/activations + ir/conv + ir/attributes .. toctree:: :hidden: diff --git a/docs/faq.rst b/docs/intro/faq.rst similarity index 100% rename from docs/faq.rst rename to docs/intro/faq.rst diff --git a/docs/intro/introduction.rst b/docs/intro/introduction.rst new file mode 100644 index 0000000000..8d603bd78f --- /dev/null +++ b/docs/intro/introduction.rst @@ -0,0 +1,30 @@ +============ +Introduction +============ + +The goal of ``hls4ml`` is to provide an efficient and fast translation of machine learning models from open-source packages (like Keras and PyTorch) for training machine learning algorithms to high level synthesis (HLS) code that can then be transpiled to run on an FPGA. The resulting HLS project can be then used to produce an IP which can be plugged into more complex designs or be used to create a kernel for CPU co-processing. The user has freedom to define many of the parameters of their algorithm to best suit their needs. + +The ``hls4ml`` package enables fast prototyping of a machine learning algorithm implementation in FPGAs, +greatly reducing the time to results and giving the user intuition for how to best design a machine learning algorithm for their application while balancing performance, resource utilization and latency requirements. + +The Inspiration +=============== + +The inspiration for the creation of the ``hls4ml`` package stems from the high energy physics community at the CERN Large Hadron Collider (LHC). +While machine learning has already been proven to be extremely useful in analysis of data from detectors at the LHC, it is typically performed in an "offline" environment after the data is taken and agglomerated. +However, one of the largest problems at detectors on the LHC is that collisions, or "events", generate too much data for everything to be saved. +As such, filters called "triggers" are used to determine whether a given event should be kept. 
+Using FPGAs allows for significantly lower latency so machine learning algorithms can essentially be run "live" at the detector level for event selection. As a result, more events with potential signs of new physics can be preserved for analysis. + +The Solution: ``hls4ml`` +======================== + +.. image:: ../img/overview.jpg + + +With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU. +An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility. +FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions on the microsecond scale for latency. +The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping. + +For more detailed information on technical details of ``hls4ml``, read the "Internals" section of our documentation or our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_. diff --git a/docs/reference.rst b/docs/intro/reference.rst similarity index 100% rename from docs/reference.rst rename to docs/intro/reference.rst diff --git a/docs/release_notes.rst b/docs/intro/release_notes.rst similarity index 100% rename from docs/release_notes.rst rename to docs/intro/release_notes.rst diff --git a/docs/setup.rst b/docs/intro/setup.rst similarity index 97% rename from docs/setup.rst rename to docs/intro/setup.rst index 052b6d1fa7..6ba0c4ce0e 100644 --- a/docs/setup.rst +++ b/docs/intro/setup.rst @@ -14,7 +14,7 @@ The latest release of ``hls4ml`` can be installed with ``pip``: pip install hls4ml -If you want to use our :doc:`profiling ` toolbox, you might need to install extra dependencies: +If you want to use our :doc:`profiling <../advanced/profiling>` toolbox, you might need to install extra dependencies: .. code-block:: @@ -72,7 +72,7 @@ Catapult HLS 2024.1_1 or 2024.2 can be used to synthesize both for ASICs and FPG Quick Start ============= -For basic concepts to understand the tool, please visit the :doc:`Concepts ` chapter. +For basic concepts to understand the tool, please visit the :doc:`Concepts <../api/concepts>` chapter. Here we give line-by-line instructions to demonstrate the general workflow. .. code-block:: python @@ -105,7 +105,7 @@ After that, you can use :code:`Vivado HLS` to synthesize the model: Done! You've built your first project using ``hls4ml``! To learn more about our various API functionalities, check out our tutorials `here `__. -If you want to configure your model further, check out our :doc:`Configuration ` page. +If you want to configure your model further, check out our :doc:`Configuration <../api/configuration>` page. .. 
Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: diff --git a/docs/status.rst b/docs/intro/status.rst similarity index 100% rename from docs/status.rst rename to docs/intro/status.rst diff --git a/docs/ir/activations.rst b/docs/ir/activations.rst deleted file mode 100644 index 3515d3101b..0000000000 --- a/docs/ir/activations.rst +++ /dev/null @@ -1,14 +0,0 @@ -=========== -Activations -=========== - -Most activations without extra parameters are represented with the ``Activation`` layer, and those with single parameters (leaky ReLU, thresholded ReLU, ELU) as ``ParametrizedActivation``. -``PReLU`` has its own class because it has a parameter matrix (stored as a weight). The hard (piecewise linear) sigmoid and tanh functions are implemented in a ``HardActivation`` layer, -and ``Softmax`` has its own layer class. - -Softmax has four implementations that the user can choose from by setting the ``implementation`` parameter: - -* **latency**: Good latency, but somewhat high resource usage. It does not work well if there are many output classes. -* **stable**: Slower but with better accuracy, useful in scenarios where higher accuracy is needed. -* **legacy**: An older implementation with poor accuracy, but good performance. Usually the latency implementation is preferred. -* **argmax**: If you don't care about normalized outputs and only care about which one has the highest value, using argmax saves a lot of resources. This sets the highest value to 1, the others to 0. diff --git a/docs/ir/attributes.rst b/docs/ir/attributes.rst new file mode 100644 index 0000000000..dfbec51b1c --- /dev/null +++ b/docs/ir/attributes.rst @@ -0,0 +1,2802 @@ +================ +Layer attributes +================ + + +Input +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Constant +======== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* value: ndarray + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Activation +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ParametrizedActivation +====================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +PReLU +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* n_in: int + +* activation: str + +Weight attributes +----------------- +* param: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. 
Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Softmax +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* implementation: list [latency,stable,argmax,legacy] (Default: stable) + + * Choice of implementation of softmax function. "latency" provides good latency at the expense of extra resources. performs well on small number of classes. "stable" may require extra clock cycles but has better accuracy. "legacy" is the older implementation which has bad accuracy, but is fast and has low resource use. It is superseded by the "latency" implementation for most applications. "argmax" is a special implementation that can be used if only the output with the highest probability is important. Using this implementation will save resources and clock cycles. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* skip: bool (Default: False) + + * If enabled, skips the softmax node and returns the raw outputs. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* exp_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>) + + * The datatype (precision) used for the values of the lookup table. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* inv_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +TernaryTanh +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +HardActivation +============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* slope_t: NamedType + +* shift_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* slope: float (Default: 0.2) + +* shift: float (Default: 0.5) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* slope_t: NamedType + +* shift_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Reshape +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* target_shape: Sequence + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Dense +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Conv +==== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Conv1D +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Conv2D +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Conv2DBatchnorm +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. 
This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +SeparableConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* depth_multiplier: int (Default: 1) + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* depthwise: WeightVariable + +* pointwise: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* depthwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_result_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* dw_output_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * Available in: Catapult + +DepthwiseConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +* in_width: int + +* out_width: int + +* n_chan: int + +* depth_multiplier: int (Default: 1) + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +SeparableConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* depth_multiplier: int (Default: 1) + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* depthwise: WeightVariable + +* pointwise: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* depthwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_result_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* dw_output_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * Available in: Catapult + +DepthwiseConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* depth_multiplier: int (Default: 1) + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. 
Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +BatchNormalization +================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: -1) + +* use_gamma: bool (Default: True) + +* use_beta: bool (Default: True) + +Weight attributes +----------------- +* scale: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Pooling1D +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +* n_filt: int + +* pool_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +* count_pad: bool (Default: False) + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. 
This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Pooling2D +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_filt: int + +* pool_height: int + +* pool_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +* count_pad: bool (Default: False) + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +GlobalPooling1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +GlobalPooling2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* n_filt: int + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ZeroPadding1D +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* pad_left: int + +* pad_right: int + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +ZeroPadding2D +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Merge +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. 
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+MatMul
+======
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* accum_t: NamedType
+
+  * The datatype (precision) used to store intermediate results of the computation within the layer.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+Dot
+===
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* accum_t: NamedType
+
+  * The datatype (precision) used to store intermediate results of the computation within the layer.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+Concatenate
+===========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+Resize
+======
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* in_height: int
+
+* in_width: int
+
+* out_height: int
+
+* out_width: int
+
+* n_chan: int
+
+* align_corners: bool (Default: False)
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* algorithm: list [nearest,bilinear] (Default: nearest)
+
+Transpose
+=========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Embedding
+=========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* embeddings_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* n_in: int
+
+* n_out: int
+
+* vocab_size: int
+
+Weight attributes
+-----------------
+* embeddings: WeightVariable
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* embeddings_t: NamedType
+
+Backend-specific attributes
+---------------------------
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +SimpleRNN +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +LSTM +==== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. 
+ +* n_out: int + +* activation: str + +* recurrent_activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +* time_major: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +* recurrent_bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +GRU +=== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* recurrent_activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +* time_major: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +* recurrent_bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) 
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* direction: list [forward,backward] (Default: forward)
+
+* apply_reset_gate: list [before,after] (Default: after)
+
+* weight_t: NamedType
+
+* bias_t: NamedType
+
+* recurrent_weight_t: NamedType
+
+* recurrent_bias_t: NamedType
+
+Backend-specific attributes
+---------------------------
+* accum_t: NamedType
+
+  * The datatype (precision) used to store intermediate results of the computation within the layer.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+* recurrent_reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI
+
+* static: bool (Default: True)
+
+  * If set to True, will reuse the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Catapult
+
+* table_size: int (Default: 1024)
+
+  * The size of the lookup table used to approximate the function.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI
+
+* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>)
+
+  * The datatype (precision) used for the values of the lookup table.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI
+
+GarNet
+======
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+GarNetStack
+===========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+Quant
+=====
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* narrow: bool
+
+* rounding_mode: str
+
+* signed: bool
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+Backend-specific attributes
+---------------------------
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used. Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.
+
+  * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI
+
+ApplyAlpha
+==========
+Base attributes
+---------------
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* scale_t: NamedType
+
+* bias_t: NamedType
+
+Type attributes
+---------------
+* index: int
+
+  * Internal node counter used for bookkeeping and variable/tensor naming.
+
+* n_in: int
+
+* n_filt: int (Default: -1)
+
+* use_gamma: bool (Default: True)
+
+* use_beta: bool (Default: True)
+
+Weight attributes
+-----------------
+* scale: WeightVariable
+
+* bias: WeightVariable
+
+Configurable attributes
+-----------------------
+* trace: int (Default: False)
+
+  * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)
+
+* result_t: NamedType
+
+  * The datatype (precision) of the output tensor.
+
+* scale_t: NamedType
+
+* bias_t: NamedType
+
+Backend-specific attributes
+---------------------------
+* reuse_factor: int (Default: 1)
+
+  * The number of times each multiplier is used by controlling the amount of pipelining/unrolling.
Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +BatchNormOnnx +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +LayerGroup +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* layer_list: list + +* input_layers: list + +* output_layers: list + +* data_reader: object + +* output_shape: list + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +SymbolicExpression +================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* expression: list + +* n_symbols: int + +* lut_functions: list (Default: []) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +BiasAdd +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +FixedPointQuantizer +=================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +UnaryLUT +======== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Repack +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Clone +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +BatchNormalizationQuantizedTanh +=============================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* accum_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: 0) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* accum_t: NamedType + +* reuse_factor: int (Default: 1) + +PointwiseConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +PointwiseConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Broadcast +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. diff --git a/docs/ir/conv.rst b/docs/ir/conv.rst deleted file mode 100644 index f31e676e9d..0000000000 --- a/docs/ir/conv.rst +++ /dev/null @@ -1,38 +0,0 @@ -================== -Convolution Layers -================== - -Standard convolutions -===================== - -These are the standard 1D and 2D convolutions currently supported by hls4ml, and the fallback if there is no special pointwise implementation. - -io_parallel ------------ - -Parallel convolutions are for cases where the model needs to be small and fast, though synthesizability limits can be quickly reached. Also note that skip connections -are not supported in io_parallel. - -For the Xilinx backends and Catapult, there is a very direct convolution implementation when using the ``Latency`` strategy. This is only for very small models because the -high number of nested loops. The ``Resource`` strategy in all cases defaults to an algorithm using the *im2col* transformation. This generally supports larger models. The ``Quartus``, -``oneAPI``, and ``Catapult`` backends also implement a ``Winograd`` algorithm choosable by setting the ``implementation`` to ``Winograd`` or ``combination``. Note that -the Winograd implementation is available for only a handful of filter size configurations, and it is less concerned about bit accuracy and overflow, but it can be faster. - -io_stream ---------- - -There are two main classes of io_stream implementations, ``LineBuffer`` and ``Encoded``. ``LineBuffer`` is always the default, and generally produces marginally better results, -while ``Catapult`` and ``Vivado`` also implement ``Encoded``, choosable with the ``convImplementation`` configuration option. 
In all cases, the data is processed serially, one pixel -at a time, with a pixel containing an array of all the channel values for the pixel. - -Depthwise convolutions -====================== - -Pointwise convolutions -====================== - -Pointwise convolutions are a special case of convolution where the filter size is 1 for 1D or 1x1 for 2D. - -For the Xilinx backend, there is a dedicated io_parallel ``Latency`` strategy implementation of 1D pointwise convolutional layers integrated in `#881 `_ developed for `arXiv:2402.01876 `_. -The reuse factor (RF) is used to split the layer execution and reuse the existing module RF times. The RF also limits the number of multipliers in each module. -The initiation interval scales as the RF. One limitation is that it assumes ``in_width`` is divisible by the RF. diff --git a/docs/ir/dense.rst b/docs/ir/dense.rst deleted file mode 100644 index 352a7d25b2..0000000000 --- a/docs/ir/dense.rst +++ /dev/null @@ -1,25 +0,0 @@ -============ -Dense Layers -============ - -One-dimensional Dense Layers -============================ - -One-dimensional dense layers implement a matrix multiply and bias add. The produced code is also used by other layers to implement the matrix multiplication. - - -io_parallel ------------ - -All the backends implement a ``Resource`` implementation, which explicitly iterates over the reuse factor. There are different implementations depending on whether the reuse factor is -smaller or bigger than the input size. The two Xilinx backends and Catapult also implement a ``Latency`` implementation, which only uses the reuse factor in pragmas. - -io_stream ---------- - -The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. The data is still transferred in parallel. - -Multi-dimensional Dense Layers -============================== - -Multi-dimensional Dense layers are converted to pointwise convolutions, and do not directly use the above implementation diff --git a/docs/advanced/flows.rst b/docs/ir/flows.rst similarity index 84% rename from docs/advanced/flows.rst rename to docs/ir/flows.rst index 37b8b44ff9..dbdef58896 100644 --- a/docs/advanced/flows.rst +++ b/docs/ir/flows.rst @@ -2,17 +2,6 @@ Optimizer Passes and Flows ========================== -Internal Structure ------------------- - -The ``hls4ml`` library will parse models from Keras, PyTorch or ONNX into an internal execution graph. This model graph is represented with the -:py:class:`~hls4ml.model.graph.ModelGraph` class. The nodes in this graph, corresponding to the layer and operations of the input model are represented -by classes derived from the :py:class:`~hls4ml.model.layers.Layer` base class. - -Layers are required to have defined inputs and outputs that define how they are connected in the graph and what is the shape of their output. All information -about the layer's state and configuration is stored in its attributes. All weights, variables and data types are attributes and there are mapping views to sort through them. -Layers can define expected attributes and can be verified for correctness, or to produce a list of configurable attributes that user can tweak. 
-
 Optimizer passes
 ----------------
 
diff --git a/docs/ir/ir.rst b/docs/ir/ir.rst
new file mode 100644
index 0000000000..18b0a1c679
--- /dev/null
+++ b/docs/ir/ir.rst
@@ -0,0 +1,90 @@
+=======================
+Internal representation
+=======================
+
+The ``hls4ml`` library will parse models from Keras, PyTorch or ONNX into an internal execution graph. This model graph is represented with the
+:py:class:`~hls4ml.model.graph.ModelGraph` class. The nodes in this graph, loosely corresponding to the layers and operations of the input model, are represented
+by classes derived from the :py:class:`~hls4ml.model.layers.Layer` base class.
+
+Layers are required to have defined inputs and outputs that define how they are connected in the graph and what the shape of their output is. All information
+about the layer's state and configuration is stored in its attributes. All weights, variables and data types are attributes, and there are mapping views to sort through them.
+Layers can define expected attributes and can be verified for correctness, or to produce a list of configurable attributes that the user can tweak. The complete list of attributes can be found in the :doc:`Attributes ` page.
+
+
+Layers
+======
+
+Although the backends of ``hls4ml`` are independent from each other and free to implement features in any suitable way, most implementations share common concepts, which we describe here.
+
+Dense Layers
+------------
+
+One-dimensional Dense Layers
+****************************
+
+Dense layers over one-dimensional data perform a matrix-vector multiplication followed by an elementwise addition of the bias tensor. This routine is the underlying computation of many other layers as well and is reused as much as possible. It exists in several implementations across different backends, for different ``io_type`` settings and strategies.
+
+io_parallel
+^^^^^^^^^^^
+
+All the backends have a ``Resource`` implementation, which divides the computation into a loop of ``reuse_factor`` iterations, each iteration simultaneously accessing a different part of the array partitioned in BRAM. There are different implementations depending on whether the reuse factor is smaller or bigger than the input size. The two Xilinx backends and Catapult also provide a ``Latency`` implementation, which uses the reuse factor to control the amount of pipelining/unrolling of the whole function while the weight array is fully partitioned in registers.
+
+io_stream
+^^^^^^^^^
+
+The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. Internally, data is still accessed in parallel as an array.
+
+Multi-dimensional Dense Layers
+******************************
+
+Multi-dimensional Dense layers are converted to pointwise convolutions, and do not directly use the above implementation.
+
+
+Convolution Layers
+------------------
+
+Standard convolution
+********************
+
+By *standard* convolution we refer to the operation represented by the ``Conv1D/2D`` layers in Keras (``Conv1d/2d`` in PyTorch). Depending on the ``io_type`` option used, there are two classes of implementations in ``hls4ml``.
+
+io_parallel
+^^^^^^^^^^^
+
+Parallel IO is applicable to small models that require a low-latency implementation. Larger models face synthesizability limits very quickly.
+
+In the Vivado/Vitis backends, parallel convolution relies on the *im2col* transformation of the input, which turns convolution into a matrix-multiplication task.
This task is then implemented as a sequence of matrix-vector multiplications using the routine mentioned above. The ``Latency`` and ``Resource`` strategies refer to the implementation used for the matrix-vector multiplication routine, with ``Resource`` allowing slightly larger models to be synthesized. Parallelism can be further controlled via the ``ParallelizationFactor``. The Catapult backend, in turn, uses a direct implementation of convolution via nested loops. The ``Quartus``, ``oneAPI``, and ``Catapult`` backends also implement a ``Winograd`` algorithm, choosable by setting the ``implementation`` to ``Winograd`` or ``combination``. The Winograd implementation is available for only a handful of filter size configurations, and it is less concerned about bit accuracy and overflow. In certain conditions it can be faster.
+
+io_stream
+^^^^^^^^^
+
+There are two main classes of io_stream implementations, ``LineBuffer`` and ``Encoded``. ``LineBuffer`` is the default, and generally produces marginally better results,
+while ``Catapult`` and ``Vivado`` also implement ``Encoded``, choosable with the ``ConvImplementation`` configuration option. In all cases, the data is processed serially, one pixel at a time, with a pixel containing an array of all the channel values for the pixel.
+
+Depthwise convolution
+*********************
+
+The depthwise implementation replaces the matrix-vector multiplication in the kernel with an elementwise multiplication. The only implementation available is based on the ``Latency`` strategy and is used by both ``io_parallel`` and ``io_stream``.
+
+Pointwise convolution
+*********************
+
+Pointwise convolutions are a special case of convolution where the filter size is ``1`` for 1D or ``1x1`` for 2D.
+
+For the Vivado/Vitis backends, there is a dedicated ``io_parallel``/``Latency`` strategy implementation of 1D pointwise convolutional layers originally developed for `arXiv:2402.01876 `_.
+The reuse factor (RF) is used to split the layer execution and reuse the existing module RF times. The RF also limits the number of multipliers in each module.
+The initiation interval scales with the RF. One limitation is that it assumes ``in_width`` is divisible by the RF.
+
+Activations
+-----------
+
+Most activations without extra parameters are represented with the ``Activation`` layer, and those with a single parameter (leaky ReLU, thresholded ReLU, ELU) as ``ParametrizedActivation``. ``PReLU`` has its own class because it has a parameter matrix (stored as a weight). The hard (piecewise linear) sigmoid and tanh functions are implemented in a ``HardActivation`` layer, and ``Softmax`` has its own layer class.
+
+Backends have four softmax implementations that the user can choose from by setting the ``implementation`` parameter:
+
+* **latency**: Good latency, but somewhat high resource usage. It does not work well if there are many output classes.
+* **stable**: Slower but with better accuracy, useful in scenarios where higher accuracy is needed.
+* **legacy**: An older implementation with poor accuracy, but good performance. Usually the latency implementation is preferred.
+* **argmax**: If you don't care about normalized outputs and only care about which one has the highest value, using argmax saves a lot of resources. This sets the highest value to 1, the others to 0.
+
+The Vivado/Vitis backends additionally support completely skipping the softmax activation and returning the raw outputs.
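+
+As an illustration of how these choices are exposed to the user, the snippet below sets some of the attributes discussed above through the configuration dictionary. Treat it as a sketch: the exact configuration keys can differ between ``hls4ml`` versions and backends, and the layer names (``conv1`` and ``softmax``) are placeholders for the names in your own model.
+
+.. code-block:: python
+
+   import hls4ml
+
+   # per-layer granularity exposes the configurable attributes of each layer
+   config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name')
+
+   # hypothetical layer names; substitute the names from your own model
+   config['LayerName']['conv1']['Strategy'] = 'Resource'        # im2col-based matrix-vector implementation
+   config['LayerName']['conv1']['ParallelizationFactor'] = 4    # io_parallel: outputs computed in parallel
+   config['LayerName']['softmax']['Implementation'] = 'stable'  # one of: latency, stable, legacy, argmax
+
+   hls_model = hls4ml.converters.convert_from_keras_model(
+       keras_model, hls_config=config, backend='Vitis', output_dir='my_prj'
+   )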
diff --git a/docs/api/hls-model.rst b/docs/ir/modelgraph.rst
similarity index 58%
rename from docs/api/hls-model.rst
rename to docs/ir/modelgraph.rst
index bf0d8ee3ce..048e67e101 100644
--- a/docs/api/hls-model.rst
+++ b/docs/ir/modelgraph.rst
@@ -1,8 +1,8 @@
 ================
-HLS Model Class
+ModelGraph Class
 ================
 
-This page documents our hls_model class usage. You can generate generate an hls model object from a keras model through ``hls4ml``'s API:
+This page documents our ``ModelGraph`` class usage. You can generate an instance of this class through ``hls4ml``'s API, for example by converting a Keras model:
 
 .. code-block:: python
 
@@ -11,10 +11,10 @@ This page documents our hls_model class usage. You can generate generate an hls
     # Generate a simple configuration from keras model
     config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name')
 
-    # Convert to an hls model
+    # Convert to a ModelGraph instance (hls_model)
     hls_model = hls4ml.converters.convert_from_keras_model(keras_model, hls_config=config, output_dir='test_prj')
 
-After that, you can use several methods in that object. Here is a list of all the methods:
+This object can be used to perform common simulation and firmware-generation tasks. Here is a list of important user-facing methods:
 
 * :ref:`write `
 
@@ -23,8 +23,6 @@ After that, you can use several methods in that object. Here is a list of all th
 * :ref:`build `
 * :ref:`trace `
 
-Similar functionalities are also supported through command line interface. If you prefer using them, please refer to Command Help section.
-
 ----
 
 .. _write-method:
@@ -32,7 +30,7 @@ Similar functionalities are also supported through command line interface. If yo
 ``write`` method
 ====================
 
-Write your keras model as a hls project to ``hls_model``\ 's ``output_dir``\ :
+Write the ``ModelGraph`` to the output directory specified in the config:
 
 .. code-block:: python
 
@@ -45,7 +43,7 @@ Write your keras model as a hls project to ``hls_model``\ 's ``output_dir``\ :
 ``compile`` method
 ======================
 
-Compile your hls project.
+Compiles the written C++/HLS code and links it into the Python runtime. The compiled model can be used to evaluate performance (accuracy) through the ``predict()`` method.
 
 .. code-block:: python
 
@@ -58,7 +56,7 @@ Compile your hls project.
 ``predict`` method
 ======================
 
-Similar to ``keras``\ 's predict API, you can get the predictions of ``hls_model`` just by supplying an input ``numpy`` array:
+Similar to ``keras``\ 's predict API, you can get the predictions just by supplying an input ``numpy`` array:
 
 .. code-block:: python
 
@@ -67,7 +65,7 @@ Similar to ``keras``\ 's predict API, you can get the predictions of ``hls_model
     y = hls_model.predict(X)
 
-This is similar to doing ``csim`` simulation, but you can get your prediction results much faster. It's very helpful when you want to quickly prototype different configurations for your model.
+This is similar to doing a ``csim`` simulation, but without creating a testbench and supplying the data by hand. It's very helpful when you want to quickly prototype different configurations for your model.
 
 ----
 
@@ -76,13 +74,17 @@ This is similar to doing ``csim`` simulation, but you can get your prediction re
 ``build`` method
 ====================
 
+This method "builds" the generated HLS project. The parameters of ``build()`` are backend-specific and usually include simulation and synthesis. Refer to each backend for a complete list of supported parameters to ``build()``.
+
 ..
code-block:: python - hls_model.build() + report = hls_model.build() #You can also read the report of the build hls4ml.report.read_vivado_report('hls4ml_prj') +The returned ``report`` object will contain the result of build step, which may include C-simulation results, HLS synthesis estimates, co-simulation latency etc, depending on the backend used. + ---- .. _trace-method: From 4da52a4333dde0b79e87051b397113f88ffce560 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Fri, 6 Dec 2024 13:34:11 -0800 Subject: [PATCH 19/36] bump version to 1.0.0 --- CITATION.cff | 2 +- README.md | 4 ++-- docs/intro/reference.rst | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 9e1880f03f..91bf036a1d 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -4,7 +4,7 @@ type: software authors: - given-names: "FastML Team" title: "hls4ml" -version: "v0.8.1" +version: "v1.0.0" doi: 10.5281/zenodo.1201549 repository-code: "https://github.com/fastmachinelearning/hls4ml" url: "https://fastmachinelearning.org/hls4ml" diff --git a/README.md b/README.md index 8d97bda3b6..fd96763476 100644 --- a/README.md +++ b/README.md @@ -73,9 +73,9 @@ If you use this software in a publication, please cite the software @software{fastml_hls4ml, author = {{FastML Team}}, title = {fastmachinelearning/hls4ml}, - year = 2023, + year = 2024, publisher = {Zenodo}, - version = {v0.8.1}, + version = {v1.0.0}, doi = {10.5281/zenodo.1201549}, url = {https://github.com/fastmachinelearning/hls4ml} } diff --git a/docs/intro/reference.rst b/docs/intro/reference.rst index f271679620..0bd5912bb1 100644 --- a/docs/intro/reference.rst +++ b/docs/intro/reference.rst @@ -12,9 +12,9 @@ If you use this software in a publication, please cite the software @software{fastml_hls4ml, author = {{FastML Team}}, title = {fastmachinelearning/hls4ml}, - year = 2023, + year = 2024, publisher = {Zenodo}, - version = {v0.8.1}, + version = {v1.0.0}, doi = {10.5281/zenodo.1201549}, url = {https://github.com/fastmachinelearning/hls4ml} } From 6959c71a60355a67f6e7653f9fc81005f507a7fb Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 6 Dec 2024 16:34:23 -0600 Subject: [PATCH 20/36] remove obsolete file references --- docs/index.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 62003f662b..098a814803 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -58,9 +58,6 @@ ir/ir ir/modelgraph ir/flows - ir/dense - ir/activations - ir/conv ir/attributes .. toctree:: From 47d74357c65c11ca2a2213e5e954b61c5060968b Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 6 Dec 2024 17:33:26 -0600 Subject: [PATCH 21/36] add a touch of text on the backends --- docs/backend/oneapi.rst | 10 +++++----- docs/backend/quartus.rst | 8 ++++++-- docs/backend/vitis.rst | 9 +++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/docs/backend/oneapi.rst b/docs/backend/oneapi.rst index 3ee65fb41c..585bfc27cb 100644 --- a/docs/backend/oneapi.rst +++ b/docs/backend/oneapi.rst @@ -2,11 +2,11 @@ oneAPI ====== -The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually -replace the ``Quartus`` backend, which targeted Intel HLS. (Quartus continues to be used with IP produced by the -``oneAPI`` backend.) This section discusses details of the ``oneAPI`` backend. +The **oneAPI** backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually +replace the **Quartus** backend, which targeted Intel HLS. 
(Quartus continues to be used with IP produced by the +**oneAPI** backend.) This section discusses details of the **oneAPI** backend. -The ``oneAPI`` code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the +The **oneAPI** code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the accelerator style of programming. In the SYCL HLS (IP Component) flow, which is currently the only flow supported, the kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on PCIe accelerator boards, is planned to be added in the future. @@ -18,7 +18,7 @@ produces the library used for calling the ``predict`` function from hls4ml. The in hls4ml interact with the cmake system, so one does not need to manually use the build system, but it there if desired. -The ``oneAPI`` backend, like the ``Quartus`` backend, only implements the ``Resource`` strategy for the layers. There +The **oneAPI** backend, like the **Quartus** backend, only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers. Note: currently tracing and external weights (i.e. setting BramFactor) are not supported. diff --git a/docs/backend/quartus.rst b/docs/backend/quartus.rst index 225ce2e12c..e32ff50b72 100644 --- a/docs/backend/quartus.rst +++ b/docs/backend/quartus.rst @@ -3,6 +3,10 @@ Quartus ======= .. warning:: - Quartus backend is deprecated and will be removed in a future version. Users should migrate to oneAPI backend. + The **Quartus** backend is deprecated and will be removed in a future version. Users should migrate to the **oneAPI** backend. -*TODO expand this section* +The **Quartus** backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It uses the discontinued Intel HLS compiler. The **oneAPI** backend should be preferred for new projects. +The **oneAPI** backend contains the migrated the HLS code from this backend, with significantly better io_stream support, though the **oneAPI** backend does not yet support profiling, tracing, +or the BramFactor option supported by the **Quartus** backend. Nevertheless, little or no further development is expected for this backend. + +The **Quartus** backend only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers. \ No newline at end of file diff --git a/docs/backend/vitis.rst b/docs/backend/vitis.rst index 17d87763a7..bc851d59cd 100644 --- a/docs/backend/vitis.rst +++ b/docs/backend/vitis.rst @@ -2,6 +2,11 @@ Vivado/Vitis ============ -``Vivado`` and ``Vitis`` backends are aimed for use with Xilinx FPGAs. They are currently the most advanced and well-supported backends of ``hls4ml``. +The **Vivado** and **Vitis** backends are aimed for use with AMD/Xilinx FPGAs. The **Vivado** backend targets the discontinued ``Vivado HLS`` compiler, while +the **Vitis** backend targets the ``Vitis HLS`` compiler. Both are designed to produce IP for incorporation in ``Vivado`` designs. (See :doc:`VivadoAccelerator ` +for generating easily-deployable models with ``Vivado HLS``.) The ``Vitis`` accelerator flow is not directly supported, though HLS produced with the **Vitis** +backend can be easily incorporated into Vitis kernel. + +Users should generally use the **Vitis** backend for new designs that target AMD/Xilinx FPGAs; new ``hls4ml`` developments will not necessarily be backported to +the **Vivado** backend. 
-*TODO expand this section* From 05f8a45f3c57d3b38d1c18d40c1491819aa015da Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Sun, 8 Dec 2024 17:23:55 -0500 Subject: [PATCH 22/36] expand pytorch frontend documentation --- docs/frontend/pytorch.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/frontend/pytorch.rst b/docs/frontend/pytorch.rst index 923c17f438..87d1ec1f1d 100644 --- a/docs/frontend/pytorch.rst +++ b/docs/frontend/pytorch.rst @@ -4,6 +4,15 @@ PyTorch and Brevitas PyTorch frontend in ``hls4ml`` is implemented by parsing the symbolic trace of the ``torch.fx`` framework. This ensures proper execution graph is captured. Therefore, only models that can be traced with the FX framework can be parsed by ``hls4ml``. -PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set of the Keras parsers. Feel free to reach out to developers if you find a missing feature that is present in Keras parser and would like it implemented. +Provided the underlying opertion is supported in ``hls4ml``, we generally aim to support the use of both ``torch.nn`` classes and ``torch.nn.functional`` functions in the construction of PyTorch models. Generally, the use of classes is more thoroughly +tested. Please reach out if you experience any issues with either case. + +The PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set of the Keras parsers. Feel free to reach out to developers if you find a missing feature that is present in Keras parser and would like it implemented. +The direct ingestion of models quantized from brevitas is not yet support. Exporting brevitas models in the ONNX format (see `here `_) and reading those with the ``hls4ml`` QONNX frontend +might be possible, but is untested. + +For multi-dimensional tensors, ``hls4ml`` follows the channels-last convention adopted by Keras, whereas PyTorch uses channels-first. By default, ``hls4ml`` will automaticlly transpose any tensors associated with weights and biases of the internal layers +of the model. If the ``io_parallel`` I/O type (see :ref:`Concepts`) is used, a transpose node will be added to the model that also adjusts the input tensors. This is not available in the ``io_stream`` case and inputs must be transposed by the user. +Outputs are not transposed back by default, but in ``io_parallel`` case, a transpose node can be added. If not needed, these adjustments can also be switched off. See :py:class:`~hls4ml.utils.config.config_from_pytorch_model` for details. The equivalent of Keras extension API is not yet available for PyTorch parser, and will be provided in the future. 
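+
+As an illustration of the channels-last handling described above, the sketch below creates a configuration that controls the transposition behaviour. The argument names (``channels_last_conversion``, ``transpose_outputs``) and the example input shape are assumptions based on recent versions of :py:class:`~hls4ml.utils.config.config_from_pytorch_model`; check the API reference of the version in use, and substitute your own model and shapes.
+
+.. code-block:: python
+
+   import hls4ml
+
+   # input_shape excludes the batch dimension, e.g. (channels, height, width) for a 2D CNN
+   config = hls4ml.utils.config_from_pytorch_model(
+       pytorch_model,
+       input_shape=(3, 32, 32),
+       granularity='name',
+       backend='Vitis',
+       channels_last_conversion='full',  # 'full', 'internal', or 'off'
+       transpose_outputs=False,          # leave outputs in channels-last order
+   )
+
+   hls_model = hls4ml.converters.convert_from_pytorch_model(
+       pytorch_model, hls_config=config, backend='Vitis', output_dir='my_prj'
+   )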
From 536c069688f76e8d36f5753bf119d0895411bdb2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:51:14 +0000 Subject: [PATCH 23/36] [pre-commit.ci] auto fixes from pre-commit hooks --- docs/backend/quartus.rst | 2 +- docs/backend/vitis.rst | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/backend/quartus.rst b/docs/backend/quartus.rst index e32ff50b72..8cde5f97b2 100644 --- a/docs/backend/quartus.rst +++ b/docs/backend/quartus.rst @@ -9,4 +9,4 @@ The **Quartus** backend of hls4ml is designed for deploying NNs on Intel/Altera The **oneAPI** backend contains the migrated the HLS code from this backend, with significantly better io_stream support, though the **oneAPI** backend does not yet support profiling, tracing, or the BramFactor option supported by the **Quartus** backend. Nevertheless, little or no further development is expected for this backend. -The **Quartus** backend only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers. \ No newline at end of file +The **Quartus** backend only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers. diff --git a/docs/backend/vitis.rst b/docs/backend/vitis.rst index bc851d59cd..9528e89a93 100644 --- a/docs/backend/vitis.rst +++ b/docs/backend/vitis.rst @@ -9,4 +9,3 @@ backend can be easily incorporated into Vitis kernel. Users should generally use the **Vitis** backend for new designs that target AMD/Xilinx FPGAs; new ``hls4ml`` developments will not necessarily be backported to the **Vivado** backend. - From d9d09e03c5ea81ce0500da67cb06d431d731f7c9 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Mon, 9 Dec 2024 09:58:34 -0500 Subject: [PATCH 24/36] typos in pytorch frontend documentation --- docs/frontend/pytorch.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/frontend/pytorch.rst b/docs/frontend/pytorch.rst index 87d1ec1f1d..8a16d31a8d 100644 --- a/docs/frontend/pytorch.rst +++ b/docs/frontend/pytorch.rst @@ -2,13 +2,13 @@ PyTorch and Brevitas ==================== -PyTorch frontend in ``hls4ml`` is implemented by parsing the symbolic trace of the ``torch.fx`` framework. This ensures proper execution graph is captured. Therefore, only models that can be traced with the FX framework can be parsed by ``hls4ml``. +The PyTorch frontend in ``hls4ml`` is implemented by parsing the symbolic trace of the ``torch.fx`` framework. This ensures the proper execution graph is captured. Therefore, only models that can be traced with the FX framework can be parsed by ``hls4ml``. -Provided the underlying opertion is supported in ``hls4ml``, we generally aim to support the use of both ``torch.nn`` classes and ``torch.nn.functional`` functions in the construction of PyTorch models. Generally, the use of classes is more thoroughly +Provided the underlying operation is supported in ``hls4ml``, we generally aim to support the use of both ``torch.nn`` classes and ``torch.nn.functional`` functions in the construction of PyTorch models. Generally, the use of classes is more thoroughly tested. Please reach out if you experience any issues with either case. The PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set of the Keras parsers. Feel free to reach out to developers if you find a missing feature that is present in Keras parser and would like it implemented. 
-The direct ingestion of models quantized from brevitas is not yet support. Exporting brevitas models in the ONNX format (see `here `_) and reading those with the ``hls4ml`` QONNX frontend +The direct ingestion of models quantized from brevitas is not yet supported. Exporting brevitas models in the ONNX format (see `here `_) and reading those with the ``hls4ml`` QONNX frontend might be possible, but is untested. For multi-dimensional tensors, ``hls4ml`` follows the channels-last convention adopted by Keras, whereas PyTorch uses channels-first. By default, ``hls4ml`` will automaticlly transpose any tensors associated with weights and biases of the internal layers From e69a392aabbaf2cb113b18c6092ba339419bedbd Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Mon, 9 Dec 2024 10:18:38 -0500 Subject: [PATCH 25/36] improve description of brevtias -> QONNX -> hlsm4l workflow --- docs/frontend/pytorch.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/frontend/pytorch.rst b/docs/frontend/pytorch.rst index 8a16d31a8d..6e91d0c44e 100644 --- a/docs/frontend/pytorch.rst +++ b/docs/frontend/pytorch.rst @@ -8,8 +8,10 @@ Provided the underlying operation is supported in ``hls4ml``, we generally aim t tested. Please reach out if you experience any issues with either case. The PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set of the Keras parsers. Feel free to reach out to developers if you find a missing feature that is present in Keras parser and would like it implemented. -The direct ingestion of models quantized from brevitas is not yet supported. Exporting brevitas models in the ONNX format (see `here `_) and reading those with the ``hls4ml`` QONNX frontend -might be possible, but is untested. + +.. note:: + The direct ingestion of models quantized with brevitas is not supported currently. Instead, brevitas models shoud be exported in the ONNX format (see `here `_) and read with the ``hls4ml`` + QONNX frontend. Issues may arise, for example when non power-of-2 or non-scalar quantization scales are used. Please reach out if you encounter any problems with this workflow. For multi-dimensional tensors, ``hls4ml`` follows the channels-last convention adopted by Keras, whereas PyTorch uses channels-first. By default, ``hls4ml`` will automaticlly transpose any tensors associated with weights and biases of the internal layers of the model. If the ``io_parallel`` I/O type (see :ref:`Concepts`) is used, a transpose node will be added to the model that also adjusts the input tensors. This is not available in the ``io_stream`` case and inputs must be transposed by the user. From 896951a63561379f4ff28a3a9a31f273cd5a2751 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 9 Dec 2024 19:24:52 +0100 Subject: [PATCH 26/36] Add docs on BramFactor --- docs/advanced/bramfactor.rst | 42 ++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 43 insertions(+) create mode 100644 docs/advanced/bramfactor.rst diff --git a/docs/advanced/bramfactor.rst b/docs/advanced/bramfactor.rst new file mode 100644 index 0000000000..37fe766060 --- /dev/null +++ b/docs/advanced/bramfactor.rst @@ -0,0 +1,42 @@ +================================== +Loading weights from external BRAM +================================== + +.. note:: + This feature is being evaluated for re-implementation. We welcome feedback from users how to make the implementation more flexible. + +``hls4ml`` can optionally store weights in BRAMs external to the design. 
This is supported in Vivado/Vitis and Catapult backends. It is the responsibility of the user to ensure the weights are properly loaded during the operation of the design. + +The feature works as a threshold, exposed through a ``BramFactor`` config parameter. Layers with more weights above the threshold will be exposed as BRAM interface. Consider the following code: + +.. code-block:: Python + + model = tf.keras.models.Sequential() + model.add(Dense(10, activation="relu", input_shape=(12,), name="dense_1")) + model.add(Dense(20, activation="relu", name="dense_2")) + model.add(Dense(5, activation="softmax", name="dense_3")) + model.compile(optimizer='adam', loss='mse') + + config = hls4ml.utils.config_from_keras_model(model) + config["Model"]["Strategy"] = "Resource" + config["Model"]["BramFactor"] = 100 + + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + +Having set ``BramFactor=100``, only layers with more than 100 weights will be exposed as external BRAM, in this case layers ``dense_1`` and ``dense_2``. ``BramFactor`` can currently be only set at the model level. The generated code will now have weights as part of the interface. + +.. code-block:: C++ + + void myproject( + hls::stream &dense_1_input, + hls::stream &layer7_out, + model_default_t w2[120], + model_default_t w4[200] + ) { + #pragma HLS INTERFACE axis port=dense_1_input,layer7_out + #pragma HLS INTERFACE bram port=w2,w4 + ... + +When integrating the design, users can use the exposed interface to implement weight reloading scheme. diff --git a/docs/index.rst b/docs/index.rst index 098a814803..ff92a3d543 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -49,6 +49,7 @@ advanced/fifo_depth advanced/extension advanced/model_optimization + advanced/bramfactor .. 
toctree:: :hidden: From 5dd771538d6d513df8f730b0cf1458873307c2c6 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 9 Dec 2024 23:33:46 +0100 Subject: [PATCH 27/36] Temporary workaround for QKeras installation --- hls4ml/__init__.py | 32 +++++++++++++++++++++++++++++++- setup.cfg | 1 - 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/hls4ml/__init__.py b/hls4ml/__init__.py index 81b2859551..e3a7247b0d 100644 --- a/hls4ml/__init__.py +++ b/hls4ml/__init__.py @@ -1,4 +1,34 @@ -from hls4ml import converters, report, utils # noqa: F401 +# Temporary workaround for QKeras installation requirement, will be removed after 1.0.0 +def maybe_install_qkeras(): + import subprocess + import sys + + QKERAS_PKG_NAME = 'QKeras' + # QKERAS_PKG_SOURCE = QKERAS_PKG_NAME + QKERAS_PKG_SOURCE = 'qkeras@git+https://github.com/fastmachinelearning/qkeras.git' + + def pip_list(): + p = subprocess.run([sys.executable, '-m', 'pip', 'list'], check=True, capture_output=True) + return p.stdout.decode() + + def pip_install(package): + subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) + + all_pkgs = pip_list() + if QKERAS_PKG_NAME not in all_pkgs: + print('QKeras installation not found, installing one...') + pip_install(QKERAS_PKG_SOURCE) + print('QKeras installed.') + + +try: + maybe_install_qkeras() +except Exception: + print('Could not find QKeras installation, make sure you have QKeras installed.') + +# End of workaround + +from hls4ml import converters, report, utils # noqa: F401, E402 try: from ._version import version as __version__ diff --git a/setup.cfg b/setup.cfg index dc1075d9f3..0b81e7b592 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,7 +29,6 @@ install_requires = pydigitalwavetools==1.1 pyparsing pyyaml - qkeras@git+https://github.com/google/qkeras.git tabulate tensorflow>=2.8.0,<=2.14.1 tensorflow-model-optimization<=0.7.5 From 661731007fbe29c76963c5c981f495b38d4694bd Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 11 Dec 2024 00:26:16 -0600 Subject: [PATCH 28/36] don't overwrite already set accum_t, fix pointwise output res --- hls4ml/backends/fpga/fpga_layers.py | 10 ++++++---- hls4ml/model/layers.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_layers.py b/hls4ml/backends/fpga/fpga_layers.py index 356973517c..0026ebe213 100644 --- a/hls4ml/backends/fpga/fpga_layers.py +++ b/hls4ml/backends/fpga/fpga_layers.py @@ -73,12 +73,14 @@ def set_thresholds(self, scale, bias, ternary_threshold=0.5): class PointwiseConv1D(Conv1D): '''Optimized Conv1D implementation for 1x1 kernels.''' - # Nothing to do, will pick up function and config from class name - pass + def initialize(self): + # Do noting, values copied + pass class PointwiseConv2D(Conv2D): '''Optimized Conv2D implementation for 1x1 kernels.''' - # Nothing to do, will pick up function and config from class name - pass + def initialize(self): + # Do noting, values copied + pass diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index edd0051c6e..3847cda9cf 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -176,10 +176,12 @@ def _wrap_precision_to_type(self, name, precision): return NamedType(name=name, precision=precision) def _set_accum_t(self): - has_accum_t = any(a for a in self.expected_attributes if a.name == 'accum_t' and isinstance(a, TypeAttribute)) - if has_accum_t: - accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) - self.set_attr('accum_t', accum_t) + """Set the accumulator, but 
don't overwrite an existing one""" + if self.get_attr('accum_t') is None: + has_accum_t = any(a for a in self.expected_attributes if a.name == 'accum_t' and isinstance(a, TypeAttribute)) + if has_accum_t: + accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) + self.set_attr('accum_t', accum_t) def _set_type_t(self, name): has_type_t = any(a for a in self.expected_attributes if a.name == name + '_t' and isinstance(a, TypeAttribute)) From f211a0e32e7bbc4a608c19e353b8648f9d5c03b7 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Fri, 13 Dec 2024 15:07:03 -0500 Subject: [PATCH 29/36] split hgq tests and isolate qkeras tests to make tests run in under 1h --- test/pytest/generate_ci_yaml.py | 2 +- test/pytest/test_hgq_layers.py | 85 ---------------- test/pytest/test_hgq_players.py | 169 ++++++++++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 86 deletions(-) create mode 100644 test/pytest/test_hgq_players.py diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index b130b43cef..c83e7ad5c7 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ -24,7 +24,7 @@ BLACKLIST = {'test_reduction'} # Long-running tests will not be bundled with other tests -LONGLIST = {'test_hgq_layers'} +LONGLIST = {'test_hgq_layers','test_hgq_players','test_qkeras'} def path_to_name(test_path): diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index 92a7ea1876..4605394409 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -79,51 +79,6 @@ def run_model_test( _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) -def create_player_model(layer: str, rnd_strategy: str, io_type: str): - pa_config = get_default_paq_conf() - pa_config['rnd_strategy'] = rnd_strategy - pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' - set_default_paq_conf(pa_config) - - inp = keras.Input(shape=(15)) - if 'PConcatenate' in layer: - _inp = [HQuantize()(inp)] * 2 - out = eval(layer)(_inp) - out = HDense(15)(out) - return keras.Model(inp, out) - elif 'Signature' in layer: - _inp = eval(layer)(inp) - out = HDense(15)(_inp) - return keras.Model(inp, out) - elif 'Pool2D' in layer: - _inp = PReshape((3, 5, 1))(HQuantize()(inp)) - elif 'Pool1D' in layer: - _inp = PReshape((5, 3))(HQuantize()(inp)) - elif 'Dense' in layer or 'Activation' in layer: - _inp = HQuantize()(inp) - elif 'Flatten' in layer: - out = HQuantize()(inp) - out = PReshape((3, 5))(out) - out = HConv1D(2, 2)(out) - out = eval(layer)(out) - out = HDense(15)(out) - return keras.Model(inp, out) - else: - raise Exception(f'Please add test for {layer}') - - out = eval(layer)(_inp) - model = keras.Model(inp, out) - - for layer in model.layers: - # No weight bitwidths to randomize - # And activation bitwidths - if hasattr(layer, 'paq'): - fbw: tf.Variable = layer.paq.fbw - fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) - - return model - - def create_hlayer_model(layer: str, rnd_strategy: str, io_type: str): pa_config = get_default_paq_conf() pa_config['rnd_strategy'] = rnd_strategy @@ -222,43 +177,3 @@ def test_syn_hlayers(layer, N: int, rnd_strategy: str, io_type: str, cover_facto path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive, cond=cond) - - -@pytest.mark.parametrize( - 'layer', - [ - "PConcatenate()", - "PMaxPool1D(2, padding='same')", - 
"PMaxPool1D(4, padding='same')", - "PMaxPool2D((5,3), padding='same')", - "PMaxPool1D(2, padding='valid')", - "PMaxPool2D((2,3), padding='valid')", - "Signature(1,6,3)", - "PAvgPool1D(2, padding='same')", - "PAvgPool2D((1,2), padding='same')", - "PAvgPool2D((2,2), padding='same')", - "PAvgPool1D(2, padding='valid')", - "PAvgPool2D((1,2), padding='valid')", - "PAvgPool2D((2,2), padding='valid')", - "PFlatten()", - ], -) -@pytest.mark.parametrize("N", [1000]) -@pytest.mark.parametrize("rnd_strategy", ['floor', 'standard_round']) -@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) -@pytest.mark.parametrize("cover_factor", [1.0]) -@pytest.mark.parametrize("aggressive", [True, False]) -@pytest.mark.parametrize("backend", ['vivado', 'vitis']) -def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): - model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) - data = get_data((N, 15), 7, 1) - - path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' - - if 'Signature' in layer: - q = gfixed(1, 6, 3) - data = q(data).numpy() - if "padding='same'" in layer and io_type == 'io_stream': - pytest.skip("io_stream does not support padding='same' for pools at the moment") - - run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) diff --git a/test/pytest/test_hgq_players.py b/test/pytest/test_hgq_players.py new file mode 100644 index 0000000000..db44328a2d --- /dev/null +++ b/test/pytest/test_hgq_players.py @@ -0,0 +1,169 @@ +from pathlib import Path + +import HGQ # noqa: F401 +import numpy as np +import pytest +import tensorflow as tf +from HGQ import get_default_paq_conf, set_default_paq_conf, trace_minmax +from HGQ.layers import ( # noqa: F401 + HConv1D, + HDense, + HQuantize, + PAvgPool1D, + PAvgPool2D, + PConcatenate, + PFlatten, + PMaxPool1D, + PMaxPool2D, + PReshape, + Signature, +) +from HGQ.proxy import to_proxy_model +from HGQ.proxy.fixed_point_quantizer import gfixed +from tensorflow import keras + +from hls4ml.converters import convert_from_keras_model + +# tf.config.experimental_run_functions_eagerly(True) # noqa + + +test_path = Path(__file__).parent + + +def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, dir: str, cond=None): + + output_dir = dir + '/hls4ml_prj' + hls_model = convert_from_keras_model( + proxy, + io_type=io_type, + output_dir=output_dir, + backend=backend, + hls_config={'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}}, + ) + hls_model.compile() + + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + # Multiple output case. Check each output separately + if len(proxy.outputs) > 1: # type: ignore + r_proxy: list[np.ndarray] = [x.numpy() for x in proxy(data)] # type: ignore + r_hls: list[np.ndarray] = hls_model.predict(data) # type: ignore + r_hls = [x.reshape(r_proxy[i].shape) for i, x in enumerate(r_hls)] + else: + r_proxy: list[np.ndarray] = [proxy(data).numpy()] # type: ignore + r_hls: list[np.ndarray] = [hls_model.predict(data).reshape(r_proxy[0].shape)] # type: ignore + + errors = [] + for i, (p, h) in enumerate(zip(r_proxy, r_hls)): + try: + if cond is None: + mismatch_ph = p != h + assert ( + np.sum(mismatch_ph) == 0 + ), f"Proxy-HLS4ML mismatch for out {i}: {np.sum(np.any(mismatch_ph, axis=1))} out of {data_len} samples are different. 
Sample: {p[mismatch_ph].ravel()[:5]} vs {h[mismatch_ph].ravel()[:5]}" # noqa: E501 + else: + cond(p, h) + except AssertionError as e: + errors.append(e) + if len(errors) > 0: + msgs = [str(e) for e in errors] + raise AssertionError('\n'.join(msgs)) + + +def run_model_test( + model: keras.Model, cover_factor: float | None, data, io_type: str, backend: str, dir: str, aggressive: bool, cond=None +): + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + if cover_factor is not None: + trace_minmax(model, data, cover_factor=cover_factor, bsz=data_len) + proxy = to_proxy_model(model, aggressive=aggressive, unary_lut_max_table_size=4096) + _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) + + +def create_player_model(layer: str, rnd_strategy: str, io_type: str): + pa_config = get_default_paq_conf() + pa_config['rnd_strategy'] = rnd_strategy + pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' + set_default_paq_conf(pa_config) + + inp = keras.Input(shape=(15)) + if 'PConcatenate' in layer: + _inp = [HQuantize()(inp)] * 2 + out = eval(layer)(_inp) + out = HDense(15)(out) + return keras.Model(inp, out) + elif 'Signature' in layer: + _inp = eval(layer)(inp) + out = HDense(15)(_inp) + return keras.Model(inp, out) + elif 'Pool2D' in layer: + _inp = PReshape((3, 5, 1))(HQuantize()(inp)) + elif 'Pool1D' in layer: + _inp = PReshape((5, 3))(HQuantize()(inp)) + elif 'Dense' in layer or 'Activation' in layer: + _inp = HQuantize()(inp) + elif 'Flatten' in layer: + out = HQuantize()(inp) + out = PReshape((3, 5))(out) + out = HConv1D(2, 2)(out) + out = eval(layer)(out) + out = HDense(15)(out) + return keras.Model(inp, out) + else: + raise Exception(f'Please add test for {layer}') + + out = eval(layer)(_inp) + model = keras.Model(inp, out) + + for layer in model.layers: + # No weight bitwidths to randomize + # And activation bitwidths + if hasattr(layer, 'paq'): + fbw: tf.Variable = layer.paq.fbw + fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) + + return model + +def get_data(shape: tuple[int, ...], v: float, max_scale: float): + rng = np.random.default_rng() + a1 = rng.uniform(-v, v, shape).astype(np.float32) + a2 = rng.uniform(0, max_scale, (1, shape[1])).astype(np.float32) + return (a1 * a2).astype(np.float32) + +@pytest.mark.parametrize( + 'layer', + [ + "PConcatenate()", + "PMaxPool1D(2, padding='same')", + "PMaxPool1D(4, padding='same')", + "PMaxPool2D((5,3), padding='same')", + "PMaxPool1D(2, padding='valid')", + "PMaxPool2D((2,3), padding='valid')", + "Signature(1,6,3)", + "PAvgPool1D(2, padding='same')", + "PAvgPool2D((1,2), padding='same')", + "PAvgPool2D((2,2), padding='same')", + "PAvgPool1D(2, padding='valid')", + "PAvgPool2D((1,2), padding='valid')", + "PAvgPool2D((2,2), padding='valid')", + "PFlatten()", + ], +) +@pytest.mark.parametrize("N", [1000]) +@pytest.mark.parametrize("rnd_strategy", ['floor', 'standard_round']) +@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) +@pytest.mark.parametrize("cover_factor", [1.0]) +@pytest.mark.parametrize("aggressive", [True, False]) +@pytest.mark.parametrize("backend", ['vivado', 'vitis']) +def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): + model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) + data = get_data((N, 15), 7, 1) + + path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' + + if 'Signature' in 
layer: + q = gfixed(1, 6, 3) + data = q(data).numpy() + if "padding='same'" in layer and io_type == 'io_stream': + pytest.skip("io_stream does not support padding='same' for pools at the moment") + + run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) From 82ab6bfc9e4a71f4316b7cad437270495711dad4 Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Fri, 13 Dec 2024 15:09:23 -0500 Subject: [PATCH 30/36] pre-commit --- test/pytest/generate_ci_yaml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index c83e7ad5c7..0714a4acce 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ -18,13 +18,14 @@ EXAMPLEMODEL: {} """ + n_test_files_per_yml = int(os.environ.get('N_TESTS_PER_YAML', 4)) # Blacklisted tests will be skipped BLACKLIST = {'test_reduction'} # Long-running tests will not be bundled with other tests -LONGLIST = {'test_hgq_layers','test_hgq_players','test_qkeras'} +LONGLIST = {'test_hgq_layers', 'test_hgq_players', 'test_qkeras'} def path_to_name(test_path): From 96da3fe5d348e06b53cd4e1ce1130cbb4db41580 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Dec 2024 20:13:23 +0000 Subject: [PATCH 31/36] [pre-commit.ci] auto fixes from pre-commit hooks --- test/pytest/test_hgq_players.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/pytest/test_hgq_players.py b/test/pytest/test_hgq_players.py index db44328a2d..9c4b40f97f 100644 --- a/test/pytest/test_hgq_players.py +++ b/test/pytest/test_hgq_players.py @@ -123,12 +123,14 @@ def create_player_model(layer: str, rnd_strategy: str, io_type: str): return model + def get_data(shape: tuple[int, ...], v: float, max_scale: float): rng = np.random.default_rng() a1 = rng.uniform(-v, v, shape).astype(np.float32) a2 = rng.uniform(0, max_scale, (1, shape[1])).astype(np.float32) return (a1 * a2).astype(np.float32) + @pytest.mark.parametrize( 'layer', [ From 8a018f16773c368ca83b4ee5774e2da33e09d28c Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Fri, 13 Dec 2024 15:15:21 -0500 Subject: [PATCH 32/36] remove unnecessary import --- test/pytest/test_hgq_layers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index 4605394409..80d96fbcda 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -19,7 +19,6 @@ Signature, ) from HGQ.proxy import to_proxy_model -from HGQ.proxy.fixed_point_quantizer import gfixed from tensorflow import keras from hls4ml.converters import convert_from_keras_model From 46bdacc05c359531c5070f647fea23093dbc90f0 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 13 Dec 2024 14:38:29 -0600 Subject: [PATCH 33/36] update example-model --- example-models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example-models b/example-models index 6a82da23ad..c6bb3c0686 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 6a82da23ad24c238fe156ed4d0aa907db547dbcf +Subproject commit c6bb3c0686d52439d8c53d7407903bf78e852562 From 1d0cf1e28d5ecdccb1b1dc1786ee8e467cfd019e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 13 Dec 2024 14:39:21 -0600 Subject: [PATCH 34/36] change order of optimizers --- hls4ml/model/optimizer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/__init__.py 
b/hls4ml/model/optimizer/__init__.py
index 3302e3c691..a745eceba1 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -59,7 +59,6 @@
         'convert',
         [
             'channels_last_converter',
-            'merge_linear_activation',
             'seperable_to_depthwise_and_conv',
             'remove_transpose_before_flatten',
             'remove_nop_transpose',
@@ -74,6 +73,7 @@
             'replace_multidimensional_dense_with_conv',
             'enforce_proxy_model_embedded_config',
             'eliminate_linear_activation',
+            'merge_linear_activation',  # many of the above optimizers need to be done before this
             'infer_precision_types',
         ],

From eabb785dc8a748987429cfaefd11c82eef8d285a Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Fri, 13 Dec 2024 17:31:06 -0500
Subject: [PATCH 35/36] fix example-models setting for long running pytests

---
 test/pytest/generate_ci_yaml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py
index 0714a4acce..4ff9b85723 100644
--- a/test/pytest/generate_ci_yaml.py
+++ b/test/pytest/generate_ci_yaml.py
@@ -72,7 +72,7 @@ def generate_test_yaml(test_root='.'):
         name = path.stem.replace('test_', '')
         test_file = str(path.relative_to(test_root))
         needs_examples = uses_example_model(path)
-        diff_yml = yaml.safe_load(template.format(name, test_file, needs_examples))
+        diff_yml = yaml.safe_load(template.format(name, test_file, int(needs_examples)))
         yml.update(diff_yml)
     return yml

From fb120403ff800689059b6bf2a6adc320d185c68b Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Fri, 13 Dec 2024 19:27:15 -0500
Subject: [PATCH 36/36] add pytorch to long tests

---
 test/pytest/generate_ci_yaml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py
index 4ff9b85723..adc3d680ab 100644
--- a/test/pytest/generate_ci_yaml.py
+++ b/test/pytest/generate_ci_yaml.py
@@ -25,7 +25,7 @@
 BLACKLIST = {'test_reduction'}
 
 # Long-running tests will not be bundled with other tests
-LONGLIST = {'test_hgq_layers', 'test_hgq_players', 'test_qkeras'}
+LONGLIST = {'test_hgq_layers', 'test_hgq_players', 'test_qkeras', 'test_pytorch_api'}