From 67f623ed14e85d1642fb2d1120efc3b29cbf7b69 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Sun, 17 May 2020 14:36:38 +0200 Subject: [PATCH] beta version ready --- _config.yml | 6 +++--- content/parts/01/dtw.md | 6 +++--- content/parts/01/dtw/dtw_da.md | 12 ++++------- content/parts/01/dtw/dtw_gi.md | 16 +++++++-------- content/parts/01/ot.md | 32 ++++++++++++++--------------- content/parts/01/temporal_kernel.md | 15 ++++++++------ content/parts/01_metrics.md | 3 ++- content/parts/02/early.md | 15 +++++++------- content/parts/02/shapelets_cnn.md | 22 ++++++++++---------- content/parts/02/topic_models.md | 24 +++++++++++----------- content/parts/02_representations.md | 1 + content/parts/about.md | 2 +- content/parts/conclu.md | 6 +++--- content/parts/intro.md | 20 ++++++++++-------- 14 files changed, 92 insertions(+), 88 deletions(-) diff --git a/_config.yml b/_config.yml index ad44c56..6857e0a 100755 --- a/_config.yml +++ b/_config.yml @@ -64,7 +64,7 @@ jupyterhub_url : "" # The URL for your JupyterHub. If no URL, jupyterhub_interact_text : "Interact" # The text that interact buttons will contain. # Binder link settings -use_binder_button : true # If 'true', add a binder button for interactive links +use_binder_button : false # If 'true', add a binder button for interactive links binderhub_url : "https://mybinder.org" # The URL for your BinderHub. If no URL, use "" binder_repo_base : "https://github.com/" # The site on which the textbook repository is hosted binder_repo_org : "rtavenar" # The username or organization that owns this repository @@ -73,12 +73,12 @@ binder_repo_branch : "gh-pages" # The branch on which your textbo binderhub_interact_text : "Interact" # The text that interact buttons will contain. # Thebelab settings -use_thebelab_button : true # If 'true', display a button to allow in-page running code cells with Thebelab +use_thebelab_button : false # If 'true', display a button to allow in-page running code cells with Thebelab thebelab_button_text : "Thebelab" # The text to display inside the Thebelab initialization button codemirror_theme : "abcdef" # Theme for codemirror cells, for options see https://codemirror.net/doc/manual.html#config # nbinteract settings -use_show_widgets_button : true # If 'true', display a button to allow in-page running code cells with nbinteract +use_show_widgets_button : false # If 'true', display a button to allow in-page running code cells with nbinteract # Download settings use_download_button : true # If 'true', display a button to download a zip file for the notebook diff --git a/content/parts/01/dtw.md b/content/parts/01/dtw.md index d1e3c0e..ba8122e 100644 --- a/content/parts/01/dtw.md +++ b/content/parts/01/dtw.md @@ -15,7 +15,7 @@ jupyter: # Dynamic Time Warping -This section covers my works related to Dynamic Time Warping for time series. +This section covers works related to Dynamic Time Warping for time series. 
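As a quick, self-contained illustration (a minimal sketch relying on ``tslearn``, not tied to any specific experiment reported below), one can compute a DTW score and retrieve the associated optimal alignment path as follows:

```python
import numpy
from tslearn.metrics import dtw, dtw_path

# Two univariate time series of different lengths
x = numpy.array([0., 1., 2., 3., 2., 1., 0.])
x_prime = numpy.array([0., 2., 3., 2., 0.])

# DTW score (square root of the optimal cumulative squared cost)
print(dtw(x, x_prime))

# Optimal alignment path, as a list of (i, j) index pairs
path, score = dtw_path(x, x_prime)
print(path)
```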
**Note.** In ``tslearn``, such time series would be represented as arrays of @@ -48,8 +48,8 @@ optimization problem: \begin{equation} DTW(\mathbf{x}, \mathbf{x}^\prime) = - \sqrt{ \min_{\pi \in \mathcal{A}(\mathbf{x}, \mathbf{x}^\prime)} - \sum_{(i, j) \in \pi} d(x_i, x^\prime_j)^2 } + \min_{\pi \in \mathcal{A}(\mathbf{x}, \mathbf{x}^\prime)} + \sqrt{ \sum_{(i, j) \in \pi} d(x_i, x^\prime_j)^2 } \label{eq:dtw} \end{equation} diff --git a/content/parts/01/dtw/dtw_da.md b/content/parts/01/dtw/dtw_da.md index 3ce3aad..08f829e 100644 --- a/content/parts/01/dtw/dtw_da.md +++ b/content/parts/01/dtw/dtw_da.md @@ -34,7 +34,7 @@ Optimal Transport for Domain Adaptation {% cite courty:hal-02112785 %}. One significant difference however is that we rely on a reference modality for alignment, which is guided by our application context. -## Use case +## Motivating use case Phosphorus (P) transfer during storm events represents a significant part of annual P loads in streams and contributes to eutrophication in downstream water @@ -58,7 +58,7 @@ limit and test its ability to compare seasonal variability of P storm dynamics in two headwater watersheds. Both watersheds are ca. 5 km², have similar climate and geology, but differ in land use and P pressure intensity. -## Method +## Alignment-based resampling method In the above-described setting, we have access to one modality (discharge, commonly denoted $Q$) that is representative of the evolution of the flood. @@ -69,8 +69,8 @@ Indeed, time series may have 1. different starting times due to the discharge threshold at which the autosamplers were triggered, 2. different lengths and -3. differences in phase that yield different positions of the discharge peak -and of concentration data points relative to the hydrograph. +3. differences in phase that yield different temporal localization of the +discharge peak. To align time series, we use the path associated to DTW. This matching path can be viewed as the optimal way to perform point-wise @@ -83,10 +83,6 @@ The reference discharge time series used in this study is chosen as a storm event with full coverage of flow rise and flow recession phases. Alternatively, one could choose a synthetic idealized storm hydrograph. -As stated above, the continuity condition imposed on admissible paths results -in each element of reference time series $\mathbf{x}^\text{ref}_\text{Q}$ being -matched with at least one element in each discharge time series from the -dataset. We then use barycentric mapping based on obtained matches to realign other modalities to the timestamps of the reference time series, as shown in the following Figures: diff --git a/content/parts/01/dtw/dtw_gi.md b/content/parts/01/dtw/dtw_gi.md index f6fafc5..e200603 100644 --- a/content/parts/01/dtw/dtw_gi.md +++ b/content/parts/01/dtw/dtw_gi.md @@ -16,8 +16,8 @@ jupyter: # DTW with Global Invariances -**Note.** This work was part of Titouan Vayer's PhD thesis. -We were co-supervising Titouan together with Laetitia Chapel and Nicolas Courty. +**Note.** This work is part of Titouan Vayer's PhD thesis. +We are co-supervising Titouan together with Laetitia Chapel and Nicolas Courty. In this work we address the problem of comparing time series while taking @@ -43,10 +43,8 @@ lie. 
More formally, we define Dynamic Time Warping with Global Invariances \begin{equation} \text{DTW-GI}(\mathbf{x}, \mathbf{x^\prime}) = - \sqrt{ - \min_{f \in \mathcal{F}, \pi \in \mathcal{A}(\mathbf{x}, \mathbf{x^\prime})} - \sum_{(i, j) \in \pi} d(x_i, f(x^\prime_j))^2 - } , + \min_{f \in \mathcal{F}, \pi \in \mathcal{A}(\mathbf{x}, \mathbf{x^\prime})} + \sqrt{ \sum_{(i, j) \in \pi} d(x_i, f(x^\prime_j))^2 } \, , \label{eq:dtwgi} \end{equation} @@ -699,13 +697,15 @@ for idx_dataset, dataset_fun in enumerate(list_dataset_generators): We also introduce soft counterparts following the definition of softDTW from {% cite cuturi2017soft %}. +In this case, optimization consists in gradient descent and a wider variety of +feature space transformation families can be considered. -We validate the utility of this new metric on real world +We validate the utility of these similarity measures on real world datasets on the tasks of human motion prediction (where motion is captured under different points of view) and cover song identification (where song similarity is defined up to a key transposition). In both these settings, we observe that joint optimization on feature space -transformation and temporal alignment improves over standard techniques that +transformation and temporal alignment improves over standard approaches that consider these as two independent steps. ## References diff --git a/content/parts/01/ot.md b/content/parts/01/ot.md index 70ec195..6da4155 100644 --- a/content/parts/01/ot.md +++ b/content/parts/01/ot.md @@ -22,8 +22,8 @@ distance that interpolates between Wasserstein distance between node feature distributions and Gromov-Wasserstein distance between structures. -**Note.** This work was part of Titouan Vayer's PhD thesis. -We were co-supervising Titouan together with Laetitia Chapel and Nicolas Courty. +**Note.** This work is part of Titouan Vayer's PhD thesis. +We are co-supervising Titouan together with Laetitia Chapel and Nicolas Courty. Here, we first introduce both Wasserstein and Gromov-Wasserstein distances and @@ -49,9 +49,8 @@ beginning to end). \begin{equation} - W_p(\mu, \mu') = \left( - \min_{\pi \in \Pi(\mu, \mu^\prime)} - \sum_{i,j} d(x_i, x^\prime_j)^p \pi_{i,j} \right)^{\frac{1}{p}} + W_p(\mu, \mu') = \min_{\pi \in \Pi(\mu, \mu^\prime)} + \left(\sum_{i,j} d(x_i, x^\prime_j)^p \pi_{i,j} \right)^{\frac{1}{p}} \label{eq:wass} \end{equation} @@ -72,8 +71,8 @@ distances, as illustrated below: The corresponding distance is the Gromov-Wasserstein distance, defined as: \begin{equation} - GW_p(\mu, \mu') = \left( - \min_{\pi \in \Pi(\mu, \mu^\prime)} + GW_p(\mu, \mu') = \min_{\pi \in \Pi(\mu, \mu^\prime)} + \left( \sum_{i,j,k,l} \left| d_\mu(x_i, x_k) - d_{\mu'}(x^\prime_j, x^\prime_l) \right|^p \pi_{i,j} \pi_{k,l} @@ -112,13 +111,13 @@ More formally, we consider undirected labeled graphs as tuples of the form $\mat $(\mathcal{V},\mathcal{E})$ are the set of vertices and edges of the graph. $\ell_f: \mathcal{V} \rightarrow \Omega_f$ is a labelling function which associates each vertex $v_{i} \in \mathcal{V}$ with a feature -$a_{i}\stackrel{\text{def}}{=}\ell_f(v_{i})$ in some feature metric space +$a_{i} = \ell_f(v_{i})$ in some feature metric space $(\Omega_f,d)$. We will denote by _feature information_ the set of all the features $\{a_{i}\}_{i}$ of the graph. 
Similarly, $\ell_s: \mathcal{V} \rightarrow \Omega_s$ maps a vertex $v_i$ from the graph to its structure representation -$x_{i} \stackrel{\text{def}}{=} \ell_s(v_{i})$ in some structure space +$x_{i} = \ell_s(v_{i})$ in some structure space $(\Omega_s,C)$ specific to each graph. $C : \Omega_s \times \Omega_s \rightarrow \mathbb{R_{+}}$ is a symmetric application which aims at measuring the similarity between the nodes in the @@ -178,7 +177,7 @@ E_{q}(\mathcal{G}, \mathcal{G}', \pi) = The FGW distance looks for the coupling $\pi$ between vertices of the graphs that minimizes the cost $E_{q}$ which is a linear combination of a cost -$d(a_{i},a^\prime_j)$ of transporting one feature $a_{i}$ to a feature $a^\prime_j$ +$d(a_{i},a^\prime_j)$ of transporting feature $a_{i}$ to $a^\prime_j$ and a cost $|C(i,k)-C'(j,l)|$ of transporting pairs of nodes in each structure. As such, the optimal coupling tends to associate pairs of feature and structure points with similar distances within each structure pair and with @@ -200,14 +199,14 @@ between the structures; We also define a continuous counterpart for FGW which comes with a concentration inequality in {% cite vayer:hal-02174316 %}. -We have presented a Conditional Gradient algorithm for optimization on the +We present a Conditional Gradient algorithm for optimization on the above-defined loss. -We have also exposed a Block Coordinate Descent algorithm to compute graph +We also provide a Block Coordinate Descent algorithm to compute graph barycenters _w.r.t._ FGW. ### Results -We show that FGW allows to extract meaningful barycenters: +We have shown that FGW allows to extract meaningful barycenters: **Note.** The code provided here uses integration of FGW provided by the @@ -337,9 +336,10 @@ draw_graph(barycenter) plt.title('FGW Barycenter'); ``` -We also show that these barycenters can be used for graph clustering. -Finally, we exhibit classification results for FGW embedded in a Gaussian kernel -SVM which leads to state-of-the-art performance (even outperforming graph +These barycenters can be used for graph clustering. +Finally, we have exhibited classification results for FGW embedded in a +Gaussian kernel SVM which leads to state-of-the-art performance +(even outperforming graph neural network approaches) on a wide range of graph classification problems. ## References diff --git a/content/parts/01/temporal_kernel.md b/content/parts/01/temporal_kernel.md index c4c23a9..c4f8e0d 100644 --- a/content/parts/01/temporal_kernel.md +++ b/content/parts/01/temporal_kernel.md @@ -42,8 +42,11 @@ between feature sets embedded in the Reproducing Kernel Hilbert Space (RKHS) associated with $K$: \begin{equation} - SQFD(\mathbf{x}, \mathbf{x}^\prime)^2 = K(\mathbf{x}, \mathbf{x}) + - K(\mathbf{x}^\prime, \mathbf{x}^\prime) - 2 K(\mathbf{x}, \mathbf{x}^\prime). + SQFD(\mathbf{x}, \mathbf{x}^\prime) = + \sqrt{K(\mathbf{x}, \mathbf{x}) + + K(\mathbf{x}^\prime, \mathbf{x}^\prime) + - 2 K(\mathbf{x}, \mathbf{x}^\prime)} + \, . \end{equation} ## Local temporal kernel @@ -166,7 +169,7 @@ ax_s_y.plot(- s_y1, numpy.arange(s_y1.shape[0])[::-1], "b-", linewidth=3.); ``` -$k_t$ is then a RBF kernel itself, and kernel approximation techniques such as +$k_t$ is then a RBF kernel itself, and Random Fourier Features {% cite NIPS2007_3182 %} can be used in order to approximate it with a linear kernel. 
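As an illustration of this approximation scheme, here is a minimal ``numpy`` sketch of Random Fourier Features for a Gaussian RBF kernel (dimensions and bandwidth below are arbitrary and do not correspond to the actual $k_t$ used in our experiments):

```python
import numpy

numpy.random.seed(0)
d, D, sigma = 3, 2048, 1.  # input dim., feature map dim., RBF bandwidth

# Random directions drawn from the spectral density of the RBF kernel
W = numpy.random.randn(D, d) / sigma
b = numpy.random.uniform(low=0., high=2 * numpy.pi, size=D)

def phi(x):
    """Random feature map such that k(x, y) is approximated by phi(x).dot(phi(y))."""
    return numpy.sqrt(2. / D) * numpy.cos(W.dot(x) + b)

x, y = numpy.random.randn(d), numpy.random.randn(d)
exact_k = numpy.exp(- numpy.sum((x - y) ** 2) / (2 * sigma ** 2))
approx_k = phi(x).dot(phi(y))
print(exact_k, approx_k)  # the two values should be close for large D
```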
@@ -193,7 +196,7 @@ computation $b_\phi(\cdot)$ in the feature space (which can be done offline) followed by (ii) a Euclidean distance computation in $O(D)$ time, where $D$ is the dimension of the feature map $\phi(x)$. Overall, we have a distance between timestamped feature sets whose -complexity can be tuned via the map dimensionality $D$. +precision / complexity tradeoff can be tuned via the map dimensionality $D$. ## Evaluation @@ -205,9 +208,9 @@ computer vision community at the time of this work. However, in our small data context, they proved useful for the task at hand. -In order to evaluate the classifier presented above, we used the UCR Time +In order to evaluate the method presented above, we have used the UCR Time Series Classification archive, which, at the time, was made of monodimensional -data only. +time series only. We decided not to work on raw data but rather extract local features to describe our time series. We chose to rely on temporal SIFT features, that we had introduced in diff --git a/content/parts/01_metrics.md b/content/parts/01_metrics.md index 615ed09..2314113 100644 --- a/content/parts/01_metrics.md +++ b/content/parts/01_metrics.md @@ -16,4 +16,5 @@ Second, in [Sec. 1.2](01/dtw.html), time series are treated as sequences, which means that only ordering is of importance (time delay between observations is ignored) and variants of the Dynamic Time Warping algorithm are used. Finally, in [Sec. 1.3](01/ot.html), undirected labeled graphs are seen as -discrete distributions over the feature-structure product space. +discrete distributions over the feature-structure product space and we rely on +optimal transport distances. diff --git a/content/parts/02/early.md b/content/parts/02/early.md index 3ebed87..1db69ce 100644 --- a/content/parts/02/early.md +++ b/content/parts/02/early.md @@ -39,8 +39,7 @@ The cost function is of the following form: \label{eq:loss_early} \end{equation} -where $\hat{y}$ is the class predicted by the model, -$\mathcal{L}_c(\cdot,\cdot,\cdot)$ is a +where $\mathcal{L}_c(\cdot,\cdot,\cdot)$ is a classification loss and $t$ is the timestamp at which a decision is triggered by the system. In this setting, $\alpha$ drives the tradeoff between accuracy and earliness @@ -85,8 +84,8 @@ We are co-supervising François together with Laetitia Chapel and Chloé Friguet Relying on Equation \eqref{eq:dachraoui} to decide prediction time can be tricky. We show in the following that in some cases (related to specific -configurations of training time confusion matrices), such an approach will lead -to undesirable behaviors. +configurations of the training time confusion matrices), such an approach will +lead to undesirable behaviors. Using Bayes rule, Equation \eqref{eq:dachraoui} can be re-written @@ -136,8 +135,8 @@ cost of a larger computational complexity. We also showed that in order to limit inference time complexity, one could learn a _decision triggering classifier_ that, based on the time series -$\mathbf{x}_{\rightarrow t}$ -observed up to time $t$ predicts whether a decision should be triggered or not. +$\mathbf{x}_{\rightarrow t}$, predicts whether a decision should be triggered +or not. In this setting, the target values $\gamma_t$ used to train this _decision triggering classifier_ were computed from expected costs $f_\tau$ presented above: @@ -180,9 +179,9 @@ We have hence proposed a representation learning framework that covers these three limitations {% cite ruwurm:hal-02174314 %}. 
In more details, we rely on a feature extraction module (that can either be -made of convolutional or recurrent submodules) to extract a fixed-sized +made of causal convolutions or recurrent submodules) to extract a fixed-sized representation $h_t$ from an incoming time series $\mathbf{x}_{\rightarrow t}$. -An important point here is that this feature extractor can operate on time +An important point here is that this feature extractor should operate on time series whatever their length (and hence a different feature extractor need not to be learned for each time series length). Then, this feature is provided as input to two different heads, as shown in the diff --git a/content/parts/02/shapelets_cnn.md b/content/parts/02/shapelets_cnn.md index 040abd9..5bd9fe5 100644 --- a/content/parts/02/shapelets_cnn.md +++ b/content/parts/02/shapelets_cnn.md @@ -16,27 +16,27 @@ jupyter: # Shapelet-based Representations and Convolutional Models In this section, we will cover works that either relate to the Shapelet -representation for time series or to the family of (1d) Convolutional Neural +representation for time series or to the family of (1D) Convolutional Neural Networks, since these two families of methods are very similar in spirit {% cite lods:hal-01565207 %}. ## Data Augmentation for Time Series Classification -**Note.** This work is part of Arthur Le Guennec's Master internship. +**Note.** This work was part of Arthur Le Guennec's Master internship. We were co-supervising Arthur together with Simon Malinowski. We have shown in {% cite leguennec:halshs-01357973 %} that augmenting time series classification datasets was an efficient way to improve generalization -for shallow Convolutional Neural Networks. +for Convolutional Neural Networks. The data augmentation strategies that were investigated in this work are local warping and window slicing and both lead to improvements. ## Learning to Mimic a Target Distance -**Note.** This work is part of Arnaud Lods' Master internship. +**Note.** This work was part of Arnaud Lods' Master internship. We were co-supervising Arnaud together with Simon Malinowski. @@ -50,9 +50,9 @@ used similarity measure for time series. However, it suffers from its non differentiability and the fact that it does not satisfy metric properties. Our goal in {% cite lods:hal-01565207 %} was to introduce a Shapelet model that -extracts latent representations such that Euclidean distance in the latent -space is as close as possible to Dynamic Time Warping between original time -series. +extracts latent representations such that Euclidean distance between latent +representations is as close as possible to Dynamic Time Warping between original +time series. 
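Schematically, the training objective penalizes, for each pair of time series, the discrepancy between their distance in the latent space and their DTW score. A rough sketch of such a pairwise loss (with a hypothetical ``encode`` function standing in for the actual shapelet transform, hence not the exact formulation used in the paper) could read:

```python
import numpy
from tslearn.metrics import dtw

def pairwise_mimic_loss(encode, x, x_prime):
    """Squared gap between latent Euclidean distance and DTW (illustrative sketch only)."""
    latent_dist = numpy.linalg.norm(encode(x) - encode(x_prime))
    return (latent_dist - dtw(x, x_prime)) ** 2

def encode(x):
    # Crude stand-in encoder: mean and standard deviation of the series
    return numpy.array([x.mean(), x.std()])

x = numpy.array([0., 1., 2., 1., 0.])
x_prime = numpy.array([0., 2., 4., 2., 0.])
print(pairwise_mimic_loss(encode, x, x_prime))
```

In practice, such a loss is minimized over pairs of training series with respect to the parameters of the feature extractor.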
The resulting model is an instance of a Siamese Network: ![](../../images/siamese_ldps.png) @@ -433,8 +433,8 @@ Semi-Sparse Group Lasso) loss that allows to enforce sparsity on some individual variables only: \begin{equation} - \mathcal{L}^{\mathrm{SSGL}}(y, \hat{y}, \boldsymbol{\theta}) = - \mathcal{L}(y, \hat{y}, \boldsymbol{\theta}) + \mathcal{L}^{\mathrm{SSGL}}(\mathbf{x}, y, \boldsymbol{\theta}) = + \mathcal{L}(\mathbf{x}, y, \boldsymbol{\theta}) + \alpha \lambda \left\| \mathbf{M}_\text{ind} \boldsymbol{\beta} \right\|_1 + (1-\alpha) \lambda \sum_{k=1}^{K} \sqrt{p_k} @@ -447,7 +447,7 @@ features in our random shapelet case), $\boldsymbol{\theta}$ is the set of all model weights, including weights $\boldsymbol{\beta}$ that are directly connected to the features (_ie._ these are weights from the first layer), that are organized in groups $\boldsymbol{\beta}^{(k)}$ of size $p_k$ ($p_k=2$ in the -random shapelet context), each group corresponding to a different shapelet. +random shapelet context, each group corresponding to a different shapelet). ```python tags=["hide_input"] %config InlineBackend.figure_format = 'svg' @@ -610,7 +610,7 @@ terms of both Mean Squared Error (MSE) and estimation of zero coefficients. When applied to the specific case of random shapelets, we have shown that this lead to improved accuracy as soon as datasets are large enough for coefficients -to be properly estimated. +to be estimated properly. ## Learning Shapelets that Look Like Time Series Snippets diff --git a/content/parts/02/topic_models.md b/content/parts/02/topic_models.md index d96b51b..cbf0f21 100644 --- a/content/parts/02/topic_models.md +++ b/content/parts/02/topic_models.md @@ -31,9 +31,8 @@ In this work, we build upon the Hierarchical Dirichlet Latent Semantic Motifs This generative model relies on the extraction of motifs that encapsulate the temporal information of the data. It is able to automatically find both the underlying number of motifs needed to -model a given set of documents and the number of motif occurrences in each -document (which includes their temporal locations), as shown in the following -Figure: +model a given set of documents and the number and localization of motif +occurrences in each document, as shown in the following Figure: ![half-width](../../images/hdlsm.svg) @@ -41,7 +40,7 @@ The HDLSM model takes as input a set of quantized time series (aka temporal documents). More specifically, a time series is represented as a table of counts that informs, for -each pair $(w, t)$, whether word $w$ (typically a quantized feature) was +each pair $(w, t)$, whether word (or a quantized feature) $w$ was present in the time series at time index $t$ (in fact, it can also account for the _amount_ of presence of word $w$ at time $t$). @@ -60,12 +59,12 @@ a starting time $t_o$ and an associated motif $k$. As stated above, motifs are represented as probabilistic maps. Each map is drawn from a Dirichlet distribution. -This models makes intensive use of Dirichlet Processes (DP) to model the +This model makes intensive use of Dirichlet Processes (DP) to model the possibly infinite number of motifs and occurrences. -To learn the parameters of the model, a Gibbs sampling is applied, in which it +To learn the parameters of the model, Gibbs sampling is used, in which it is sufficient to re-sample motif assignments for both observations and -occurrences and starting time for each motif occurrence. +occurrences as well as occurrence starting times. 
Other variables are either integrated out or deduced, when a deterministic relation holds. @@ -115,7 +114,7 @@ in the segmentation of trajectories into homogeneous _movement modes_; * in order to scale to larger datasets, stochastic variational inference is used (in place of Gibbs sampling) for inference. -### Use case +### Motivating use case The monitoring of maritime traffic relies on several sources of data, in a rising context of maritime big data {% cite garnier2016exploiting %}. @@ -138,7 +137,7 @@ that can handle such complex data while being efficient on large databases, and that both cluster trajectories as a whole and detect common sub-trajectories. -### The model +### Model We define a parametric framework to model trajectory data, _i.e._ sequences of geographical positions recorded through time. @@ -342,9 +341,10 @@ computations involved can be distributed. We have provided [a dataset](https://github.com/rtavenar/ushant_ais) of several millions of observations in the AIS context. This dataset is used to validate our model qualitatively (through visual -analysis of extracted movement modes and trajectory clusters) and should -allow future competitive methods to compare on a real-world large-scale -trajectory dataset. +analysis of extracted movement modes and trajectory clusters) and compare it to +a standard $k$-means clustering. +We hope this dataset will be used for future competitive methods to compare on a +real-world large-scale trajectory dataset. **TODO: add ref to online tech report** diff --git a/content/parts/02_representations.md b/content/parts/02_representations.md index c12e020..9fc3727 100644 --- a/content/parts/02_representations.md +++ b/content/parts/02_representations.md @@ -8,6 +8,7 @@ represented as multinomial distributions over latent topics -- or intermediate neural networks feature maps (as in [Sec 2.2](02/shapelets_cnn.html) and [Sec 2.3](02/early.html)) -- and then time series are represented through filter activations they trigger. + More specifically, in [Sec 2.3](02/early.html), we focus on the task of early classification of time series. In this context, a method is introduced that learns an intermediate representation from which both the decision of diff --git a/content/parts/about.md b/content/parts/about.md index cc34963..b6dc13f 100644 --- a/content/parts/about.md +++ b/content/parts/about.md @@ -1,4 +1,4 @@ -# About this document +# Preamble This document is the Jupyter book version of my HDR thesis[^1]. It is being written at the moment, so one should not expect it to be stable diff --git a/content/parts/conclu.md b/content/parts/conclu.md index da54613..a18476c 100644 --- a/content/parts/conclu.md +++ b/content/parts/conclu.md @@ -1,4 +1,4 @@ -# Conclusion and Perspectives +# Perspectives In this part, I will first describe some current and future works that we plan to investigate. @@ -29,7 +29,7 @@ computing costs since the Bellmann recursion that is at the core of the Dynamic Time Warping algorithm cannot be used anymore. It is likely that approximate solvers will have to be used in this case. 
Also, one typical use-case for such a similarity measure would be to serve as -a loss function in a structured prediction setting, in which case the +a loss function in a forecasting setting, in which case the computational complexity would be an even higher concern which could necessitate to train dedicated Siamese networks (_e.g._ by taking inspiration from the method presented in @@ -96,7 +96,7 @@ literature up to now, despite recent advances such as In this context, I believe structure can be used as a guide. Typically, in the time series context, learning intermediate representations -that are suited for structured prediction (_i.e._ predicting both the future +that are suited for structured prediction (_i.e._ predicting future observations together with their emission times) is likely to capture the intrinsics of the data. Such approaches could rely on the recent revival of time series forecasting diff --git a/content/parts/intro.md b/content/parts/intro.md index d54547b..06783c6 100644 --- a/content/parts/intro.md +++ b/content/parts/intro.md @@ -25,11 +25,11 @@ In **TODO ref Gloaguen**, for the sake of efficiency, we have relied on a fully non-temporal pre-clustering of the data so as to be able, in a refinement step, to model series segments using a continuous-time model (hence re-introducing temporal information at the sub-segment level). -At the other extreme of the spectrum, we have +At the other extreme of the spectrum, in {% cite guilleme:hal-02513295 %} and {% cite tavenard:halshs-01561461 %}, -in which we have postulated that temporal localization information is key for +we have postulated that temporal localization information was key for prediction. -In these works, we have hence used timestamps as additional features of the +In these works, we hence use timestamps as additional features of the input data. Elastic alignment-based approaches (such as the well-known Dynamic Time Warping) somehow belong somewhere in-between those two extremes. @@ -37,15 +37,19 @@ Indeed, they only rely on temporal ordering (not on timestamps) to assess similarity between series. Note also that, compared to other approaches considered in this document, convolutional models presented in [Sec. 2.2](02/shapelets_cnn.html) make an -extra assumption about the regularity of the sampling process. +extra assumption about the regularity of the sampling process (_i.e._ +observations in a time series are supposed to be acquired at a fixed time +interval and this interval is the same for all time series in the considered +collection). + I have, more recently, turned my focus to other structured data such as graphs, and it appears that choosing an adequate encoding for the structural information -in these cases is also a very important question. +in this context is also a very important question. This study relies on the use of Optimal Transport distances that, surprisingly or not, use formulations that are very similar in spirit to the one of Dynamic Time Warping. 
-Coming back to the current document, my contributions are organized in two +In the present document, my contributions are organized in two parts, the first one being dedicated to the design of adequate similarity measures between structured data (_i.e._ graphs and time series), while the second one focuses on methods that @@ -66,7 +70,7 @@ Time series datasets will be denoted $(\mathbf{X}, \mathbf{y})$ (or just $\mathbf{X}$ for unsupervised methods) where $\mathbf{X} = \left( \mathbf{x}^{(0)}, \cdots, \mathbf{x}^{(N-1)} \right)$ is a set of $N$ time series (that do not necessarily share the same length) and -$\mathbf{y}$ is a vector of target values. +$\mathbf{y}$ is a vector of $N$ target values. When subseries have to be considered, we will denote by $\mathbf{x}_{t_1 \rightarrow t_2}$ the subseries extracted from $\mathbf{x}$ @@ -82,4 +86,4 @@ covers first $t$ timestamps. [^1]: `tslearn` is a general-purpose Python machine learning library for time series that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression, - and I started this project in 2017. + and I initiated this project in 2017.