diff --git a/candle-core/src/backend.rs b/candle-core/src/backend.rs
index afe3e40754..f98cb4f4fd 100644
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
@@ -1,3 +1,5 @@
+//! Traits to Define Backend Behavior
+//!
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Layout, Result, Shape};
 
diff --git a/candle-core/src/backprop.rs b/candle-core/src/backprop.rs
index a556677478..d19f099f71 100644
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@@ -1,4 +1,4 @@
-/// Methods for backpropagation of gradients.
+//! Methods for backpropagation of gradients.
 use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};
 use crate::{Error, Result, Tensor, TensorId};
 use std::collections::HashMap;
diff --git a/candle-core/src/conv.rs b/candle-core/src/conv.rs
index 7b3922dd73..4728c21a23 100644
--- a/candle-core/src/conv.rs
+++ b/candle-core/src/conv.rs
@@ -1,3 +1,5 @@
+//! 1D and 2D Convolutions
+//!
 use crate::{op::BackpropOp, op::Op, Error, Result, Tensor};
 
 #[derive(Debug, Clone, PartialEq, Eq)]
diff --git a/candle-core/src/cpu/mod.rs b/candle-core/src/cpu/mod.rs
index e7d8b6906f..be5b99128e 100644
--- a/candle-core/src/cpu/mod.rs
+++ b/candle-core/src/cpu/mod.rs
@@ -1,3 +1,5 @@
+//! Traits and methods for CPU-backed Tensors
+
 pub mod erf;
 pub mod kernels;
 
diff --git a/candle-core/src/cpu_backend/mod.rs b/candle-core/src/cpu_backend/mod.rs
index 58773c8020..229e3bbce1 100644
--- a/candle-core/src/cpu_backend/mod.rs
+++ b/candle-core/src/cpu_backend/mod.rs
@@ -1,3 +1,4 @@
+//! Implementation of Backend Fns for CPU
 use crate::backend::{BackendDevice, BackendStorage};
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{DType, Error, IntDType, Layout, Result, Shape, WithDType};
diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs
index f14e00d533..37fef5078e 100644
--- a/candle-core/src/cuda_backend/mod.rs
+++ b/candle-core/src/cuda_backend/mod.rs
@@ -1,3 +1,5 @@
+//! Implementation of Backend traits for CUDA device
+//!
 use crate::backend::{BackendDevice, BackendStorage};
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Layout, Result, Shape, WithDType};
diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs
index 18aa61aff7..9b1fb9ee00 100644
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@@ -11,6 +11,7 @@ pub enum DeviceLocation {
     Metal { gpu_id: usize },
 }
 
+/// Cpu, Cuda, or Metal
 #[derive(Debug, Clone)]
 pub enum Device {
     Cpu,
diff --git a/candle-core/src/display.rs b/candle-core/src/display.rs
index 7e6e3cf8f1..76d39010a9 100644
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@@ -1,6 +1,7 @@
-/// Pretty printing of tensors
-/// This implementation should be in line with the PyTorch version.
-/// https://github.com/pytorch/pytorch/blob/7b419e8513a024e172eae767e24ec1b849976b13/torch/_tensor_str.py
+//! Pretty printing of tensors
+//!
+//! This implementation should be in line with the [PyTorch version](https://github.com/pytorch/pytorch/blob/7b419e8513a024e172eae767e24ec1b849976b13/torch/_tensor_str.py).
+//!
 use crate::{DType, Result, Tensor, WithDType};
 use half::{bf16, f16};
 
diff --git a/candle-core/src/dummy_cuda_backend.rs b/candle-core/src/dummy_cuda_backend.rs
index b4f2e8aa00..9d30d8214d 100644
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@@ -1,3 +1,5 @@
+//! Implementation of the Cuda backend when Cuda support has not been compiled in.
+//!
 #![allow(dead_code)]
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Error, Layout, Result, Shape};
diff --git a/candle-core/src/error.rs b/candle-core/src/error.rs
index a35bec3cbe..15604c15a8 100644
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@@ -1,3 +1,4 @@
+//! Candle-specific Error and Result
 use crate::{DType, DeviceLocation, Layout, MetalError, Shape};
 
 #[derive(Debug, Clone)]
diff --git a/candle-core/src/layout.rs b/candle-core/src/layout.rs
index 7e3b7afbba..949695848b 100644
--- a/candle-core/src/layout.rs
+++ b/candle-core/src/layout.rs
@@ -1,3 +1,4 @@
+//! Tensor Layouts including contiguous or sparse strides
 use crate::{Error, Result, Shape};
 
 #[derive(Debug, PartialEq, Eq, Clone)]
diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs
index 4b73d00696..5f9a1c97a5 100644
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -7,8 +7,8 @@
 //!
 //! let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;
 //! let b = Tensor::arange(0f32, 12f32, &Device::Cpu)?.reshape((3, 4))?;
-//!
 //! let c = a.matmul(&b)?;
+//!
 //! # Ok(())}
 //! ```
 //!
@@ -140,7 +140,7 @@ impl ToUsize2 for (usize, usize) {
     }
 }
 
-// A simple trait defining a module with forward method using a single argument.
+/// A trait defining a module with a forward method using a single argument.
 pub trait Module {
     fn forward(&self, xs: &Tensor) -> Result<Tensor>;
 }
@@ -160,8 +160,8 @@ impl<M: Module> Module for Option<&M> {
     }
 }
 
-// A trait defining a module with forward method using a single tensor argument and a flag to
-// separate the training and evaluation behaviors.
+/// A trait defining a module with a forward method using a single tensor argument and a flag to
+/// separate the training and evaluation behaviors.
 pub trait ModuleT {
     fn forward_t(&self, xs: &Tensor, train: bool) -> Result<Tensor>;
 }
diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs
index de107a61b0..47f54c8d59 100644
--- a/candle-core/src/metal_backend/mod.rs
+++ b/candle-core/src/metal_backend/mod.rs
@@ -1,3 +1,5 @@
+//! Implementation of Backend traits for Metal
+//!
 use crate::backend::{BackendDevice, BackendStorage};
 use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvTranspose2D};
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs
index 49ba44be89..c5fc3fc475 100644
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -1,3 +1,5 @@
+//! Tensor Operation Enums and Traits
+//!
 #![allow(clippy::redundant_closure_call)]
 use crate::Tensor;
 use half::{bf16, f16};
diff --git a/candle-core/src/pickle.rs b/candle-core/src/pickle.rs
index 08335257c6..24f13d2025 100644
--- a/candle-core/src/pickle.rs
+++ b/candle-core/src/pickle.rs
@@ -1,4 +1,4 @@
-// Just enough pickle support to be able to read PyTorch checkpoints.
+//! Just enough pickle support to be able to read PyTorch checkpoints.
 // This hardcodes objects that are required for tensor reading, we may want to make this a bit more
 // composable/tensor agnostic at some point.
 use crate::{DType, Error as E, Layout, Result, Tensor};
diff --git a/candle-core/src/quantized/ggml_file.rs b/candle-core/src/quantized/ggml_file.rs
index 99200bbd06..0f7e9c118c 100644
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@@ -134,7 +134,7 @@ fn from_raw_data(
     super::QTensor::new(data, dims)
 }
 
-/// Creates a [Tensor] from a raw GGML tensor.
+/// Creates a Tensor from a raw GGML tensor.
 pub fn qtensor_from_ggml(
     ggml_dtype: GgmlDType,
     raw_data: &[u8],
diff --git a/candle-core/src/quantized/gguf_file.rs b/candle-core/src/quantized/gguf_file.rs
index d3fe4b5852..cdd1a1543e 100644
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@@ -1,6 +1,5 @@
-//! Support for the GGUF file format.
+//! Support for the [GGUF file format](https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md).
 //!
-//! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md
 
 use super::{GgmlDType, QTensor};
 use crate::{Device, Result};
diff --git a/candle-core/src/quantized/mod.rs b/candle-core/src/quantized/mod.rs
index d852d50410..236f5a9811 100644
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@@ -1,3 +1,4 @@
+//! Code for GGML and GGUF files
 use crate::{CpuStorage, DType, Device, Result, Shape, Storage, Tensor};
 use k_quants::*;
 use std::borrow::Cow;
diff --git a/candle-core/src/safetensors.rs b/candle-core/src/safetensors.rs
index 5ea1f192b3..618e391e34 100644
--- a/candle-core/src/safetensors.rs
+++ b/candle-core/src/safetensors.rs
@@ -1,3 +1,14 @@
+//! Module to load `safetensor` files into CPU/GPU memory.
+//!
+//! There are multiple ways to load tensors from safetensor files:
+//! - `load` function for loading directly into memory and returning a HashMap of tensors
+//! - `MmapedSafetensors` for memory mapping files and avoiding full allocation
+//! - `SliceSafetensors` for working with in-memory buffers
+//! - `BufferedSafetensors` for owning a buffer of data
+//!
+//! Tensors can also be serialized to safetensor format using the `save` function or
+//! `Tensor::save_safetensors` method.
+//!
 use crate::{DType, Device, Error, Result, Tensor, WithDType};
 use safetensors::tensor as st;
 use safetensors::tensor::SafeTensors;
diff --git a/candle-core/src/scalar.rs b/candle-core/src/scalar.rs
index 43e1f4c8c5..30308d11c0 100644
--- a/candle-core/src/scalar.rs
+++ b/candle-core/src/scalar.rs
@@ -1,3 +1,5 @@
+//! TensorScalar Enum and Trait
+//!
 use crate::{Result, Tensor, WithDType};
 
 pub enum TensorScalar {
diff --git a/candle-core/src/streaming.rs b/candle-core/src/streaming.rs
index f70ec51e6c..f4c0a9ff0b 100644
--- a/candle-core/src/streaming.rs
+++ b/candle-core/src/streaming.rs
@@ -1,3 +1,5 @@
+//! StreamTensor, useful for streaming ops.
+//!
 use crate::{Result, Shape, Tensor};
 
 pub trait Dim: crate::shape::Dim + Copy {}
diff --git a/candle-core/src/utils.rs b/candle-core/src/utils.rs
index 78c45a9a9d..aa4d2705ef 100644
--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@@ -1,3 +1,4 @@
+//! Useful functions for checking features.
 use std::str::FromStr;
 
 pub fn get_num_threads() -> usize {
diff --git a/candle-transformers/src/generation/mod.rs b/candle-transformers/src/generation/mod.rs
index c250a1865f..d95a05953a 100644
--- a/candle-transformers/src/generation/mod.rs
+++ b/candle-transformers/src/generation/mod.rs
@@ -1,3 +1,8 @@
+//! Logit Processing and Sampling
+//!
+//! Functionality for modeling sampling strategies and logits processing in text generation
+//! with support for temperature-based sampling, top-k filtering, nucleus sampling (top-p),
+//! and combinations thereof.
 use candle::{DType, Error, Result, Tensor};
 use rand::{distributions::Distribution, SeedableRng};
 
diff --git a/candle-transformers/src/object_detection.rs b/candle-transformers/src/object_detection.rs
index e922075fcc..d1b78cfa25 100644
--- a/candle-transformers/src/object_detection.rs
+++ b/candle-transformers/src/object_detection.rs
@@ -1,3 +1,9 @@
+//! Bounding Boxes and Intersection
+//!
+//! This module provides functionality for handling bounding boxes and their manipulation,
+//! particularly in the context of object detection. It includes tools for calculating
+//! intersection over union (IoU) and non-maximum suppression (NMS).
+
 /// A bounding box around an object.
 #[derive(Debug, Clone)]
 pub struct Bbox<D> {
diff --git a/candle-transformers/src/quantized_nn.rs b/candle-transformers/src/quantized_nn.rs
index 9298b80e7e..4a83253d2e 100644
--- a/candle-transformers/src/quantized_nn.rs
+++ b/candle-transformers/src/quantized_nn.rs
@@ -1,3 +1,9 @@
+//! Utilities for quantized network layers
+//!
+//! This module contains various implementations of standard neural network layers, modules and
+//! utilities including embedding, linear layers, and various normalization techniques.
+//! Most implementations provide quantized weights support.
+
 use crate::models::with_tracing::QMatMul;
 use crate::quantized_var_builder::VarBuilder;
 use candle::quantized::QTensor;
diff --git a/candle-transformers/src/quantized_var_builder.rs b/candle-transformers/src/quantized_var_builder.rs
index 875a2b454d..2ac64aa5e7 100644
--- a/candle-transformers/src/quantized_var_builder.rs
+++ b/candle-transformers/src/quantized_var_builder.rs
@@ -1,3 +1,9 @@
+//! VarBuilder for loading GGUF files
+//!
+//! VarBuilder is a utility for loading quantized tensors from a [GGUF model file](https://huggingface.co/docs/hub/gguf).
+//! These tensors can be loaded from disk using `from_gguf` or from an in-memory
+//! buffer using `from_gguf_buffer`.
+
 use candle::quantized::QTensor;
 use candle::{Device, Result, Shape};
 use std::sync::Arc;
diff --git a/candle-transformers/src/utils.rs b/candle-transformers/src/utils.rs
index 17e836946f..884d4f378a 100644
--- a/candle-transformers/src/utils.rs
+++ b/candle-transformers/src/utils.rs
@@ -1,3 +1,5 @@
+//! Apply penalty and repeat_kv
+
 use candle::{Result, Tensor};
 
 pub fn apply_repeat_penalty(logits: &Tensor, penalty: f32, context: &[u32]) -> Result<Tensor> {
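For reference, a minimal usage sketch of the `apply_repeat_penalty` helper documented in the last hunk. It assumes `candle-core` is imported under the name `candle` (as in the crates above); the logits values, penalty, and token ids are made-up examples rather than anything from the diff.

```rust
use candle::{Device, Result, Tensor};
use candle_transformers::utils::apply_repeat_penalty;

fn main() -> Result<()> {
    // Hypothetical logits over a 4-token vocabulary, as a rank-1 f32 tensor.
    let logits = Tensor::new(&[1.0f32, 2.0, 3.0, 4.0], &Device::Cpu)?;
    // Tokens 1 and 3 have already been generated; a penalty > 1 dampens them.
    let context = [1u32, 3];
    let penalized = apply_repeat_penalty(&logits, 1.2, &context)?;
    println!("{penalized}");
    Ok(())
}
```

Tokens listed in `context` get their logits scaled down by `penalty`, which discourages immediate repetition during sampling.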