%&latex
% Created from /l/mt/JOSOverview.tex
\newcommand{\theTitle}{Signal Processing Formulations of Sequence Models}
%\newcommand{\theTitle}{Inventing Sequence Models as Vectorized Signal Processors}
%\newcommand{\theTitle}{Sequence Models from a Signal Processing Perspective}
\newcommand{\theEvent}{\htmladdnormallink{DSP Online Conference}{https://www.dsponlineconference.com}}
%\newcommand{\theEvent}{\htmladdnormallink{West Coast Machine Learning}{https://www.youtube.com/channel/UCuoNQWLuEYwjP7mI23sZ3WQ}}
%\newcommand{\theDate}{June 20, 2024}
%\newcommand{\theDate}{July 18, 2024}
%\newcommand{\theDate}{August 8, 2024}
\newcommand{\theDate}{October 29-31, 2024}
%% \newcommand{\theTitle}{Inventing Modern Sequence Models as a Music 320 Project}
%% \newcommand{\theSubTitle}{Samples become Meaning Vectors}
%% \newcommand{\theEvent}{CCRMA Open House}
%% \newcommand{\theDate}{May 17, 2024}
%% % Classroom (Knoll 217), Friday 1:40-1:55pm}
\newcommand{\theAuthor}{\htmladdnormallink{Julius Smith}{http://ccrma.stanford.edu/~jos/}}
%Mohonk05 said:
\input ../latex/stdpreshdr.tex % /w/latex/stdpreshdr.tex
%\input ../latex/stddefs.tex % /w/latex/stddefs.tex
\input ../latex/wgtmac.tex % /w/latex/wgtmac.tex
\usepackage{xcolor}
%N:\usepackage[dvipsnames]{xcolor}
%N:\usepackage[svgnames]{xcolor}
\usepackage{url} % hyperref not able to handle # in URLs
% Uses package etoolbox for \newtoggle et al:
\usepackage{etoolbox} % for \newtoggle et al
\newtoggle{local}
%\toggletrue{local}
\togglefalse{local}
\input localremote.tex
\date{\theDate}
\title{\theTitle}
\author{\theAuthor\\
% CCRMA Open House\\
% Classroom (Knoll 217)\\
% Stanford University \\[10pt]
\theEvent\\
\htmladdnormallink{\texttt{www.dsponlineconference.com}}{https://www.dsponlineconference.com}
}
\newcommand{\onevec}{\underline{1}}
\begin{document}
% GALLEY HERE
%\end{document}
%\endinput
\maketitle
%\input abstract.tex
\input overview.tex
%\end{document}
%\end{input}
%\input jos-overview.tex
%\input courses-overview.tex
\begin{slide}[\slideopts,toc={Deep Learning}]{Example Deep Neural Network for F0 Estimation}
\vspace{-1.5em}
\centerline{Verma and Schafer, Interspeech 2016}
% \twocolumn{
\vspace{-2em}
\myFigureRotateToWidth{VermaSchafer1}{-90}{\twidth}{}
\vspace{-4em}
\myFigureRotateToWidth{VermaSchafer2}{-90}{\twidth}{}
% }{
\vspace{-2em}
\begin{itemize}
\mpitem ``Audio filter bank'' \emph{learned} in the first layer for the F0-estimation task
\mpitem Filter bands are denser in the F0 range
\mpitem Suggests: Replace first layer with a \emph{pre-structured
auditory filter bank} having a \emph{differentiable and convex} parameterization,
for \emph{data} and \emph{task adaptation} (see \href{https://github.com/google-research/leaf-audio}{LEAF})
%\mpitem See, \eg, \href{https://github.com/google-research/leaf-audio}{LEAF: a LEarnable Audio Frontend}
\end{itemize}
% }
\end{slide}
%\section[\sectopts,toc={Approach}]{Signal-Processing Ingredients}
%\section[\sectopts,toc={Basic Idea}]{Signal-Processing Class-Project Idea}
%\section[\sectopts,toc={Basic Idea}]{Music 320 (Signal Processing) Project Idea}
\section[\sectopts,toc={Basic Idea}]{Motivating Problem: Associative Memory}
\begin{slide}[\slideopts,toc={One Pole Filter}]{Task: Make an \emph{Associative Memory} using a \emph{Vectorized One-Pole Filter}}
\vspace{-2em}
\myFigureToWidth{one-pole-jos}{0.5\twidth}{
One Pole at $z = p$\\[5pt]
$y(n) = g\, x(n) + p\, y(n-1)$, $n=0,1,2,\ldots$\quad[difference equation]\\[5pt]
$H(z) = \frac{\displaystyle Y(z)}{\displaystyle X(z)} = \frac{\displaystyle g}{\displaystyle 1 - p\,\zi}$\quad[transfer function]}
\maybepause
\vspace{-1em}
\textbf{A Unit Delay (for Vectors) can be an Associative Memory:}
%\textbf{Idea: Let's Make an Associative Memory!}
\begin{itemize}
\mpitem Generalize $x(n)$ to a \emph{long vector} $\xv(n)\in\RN$ representing any ``label''
\mpitem Set $\geev = \onevec$ and $\peev = \onevec$ to make $\yv(n)$ a \emph{sum of all input vectors} (``integrator'')
\mpitem Choose the dimension $N$ so large that \emph{vectors in the sum are mostly orthogonal}
\mpitem Let $\xv(n)$ be \textbf{embedding vectors} (\eg, \tx{word2vec}) so that \emph{closeness = similarity}
\mpitem Retrieve similar vectors using a \emph{matched inner product} $\wv^T\xv > b$,\\
for some suitable threshold $b$ (Hey! That's a simulated neuron: the ``Perceptron'' [1957])
\end{itemize}
\end{slide}
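\begin{slide}[\slideopts,toc={}]{Sketch: Vectorized One-Pole Filter}
A minimal NumPy sketch of the vectorized one-pole filter above
(illustrative only; names and dimensions are arbitrary assumptions):
{\small
\begin{verbatim}
import numpy as np

def one_pole_vec(x, g=1.0, p=1.0):
    """Vectorized one-pole: y[n] = g*x[n] + p*y[n-1].

    x: (T, N) array of input vectors; returns the (T, N) state sequence."""
    y = np.zeros_like(x)
    y[0] = g * x[0]
    for n in range(1, len(x)):
        y[n] = g * x[n] + p * y[n - 1]  # g = p = 1: running sum of all inputs
    return y

y = one_pole_vec(np.random.randn(10, 4096))  # 10 "label" vectors in R^4096
\end{verbatim}
}
With $g=p=1$, the state $\yv(n)$ is the integrator (sum of all input vectors) used next.
\end{slide}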
\begin{slide}[\slideopts,toc={Inner Product}]{Vector Retrieval by Inner Product}
Given the sum of vectors
\[
\yv(n) = \sum_{m=0}^n \xv(m)
\]
and a ``query vector'' $\wv = \xv(k)$,\\
\maybepause
find the query in the sum using an \emph{inner product:}
\[
\wv^T\yv(n) \eqsp \sum_{m=0}^n \wv^T\xv(m) \;\approx\; \xv^T(k)\,\xv(k) \eqsp \|\xv(k)\|^2 \;>\; b(k)
\]
or ``found'' if $\wv^T\yv(n) - b(k) > 0$, where $b(k)$ is the \emph{detection threshold} for $\xv(k)$
\begin{itemize}
\mpitem This works because the spatial dimension is so large that $\xv^T(j)\,\xv(k)\approx \epsilon$ for $j\ne k$
\mpitem Retrieval threshold $b(k)$ depends on $\|\xv(k)\|^2$\\
$\Rightarrow$ \textbf{suggestion:} \emph{reserve the radial dimension for similarity scoring}
\mpitem \Ie, \emph{only populate the \textbf{hypersphere}} in $\RN$: $\norm{\xv(k)}^2 = 1 \mbox{ (or $N$)}, \, \forall k$
\mpitem We just invented \textbf{\texttt{RMSNorm}}, used extensively in neural networks (not \texttt{LayerNorm})
\end{itemize}
\end{slide}
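\begin{slide}[\slideopts,toc={}]{Sketch: Retrieval by Inner Product}
A numerical sketch of the retrieval test above, with unit-norm
(\tx{RMSNorm}-style) embeddings (all sizes illustrative):
{\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, n = 4096, 100                    # model dimension, stored vectors
X = rng.standard_normal((n, N))
X /= np.linalg.norm(X, axis=1, keepdims=True)  # populate the hypersphere

y = X.sum(axis=0)                   # integrator state: sum of all inputs
w = X[42]                           # query = one of the stored vectors
print(w @ y)                        # ~1 + interference: "found"

w_new = rng.standard_normal(N)
w_new /= np.linalg.norm(w_new)      # query not in the sum
print(w_new @ y)                    # ~0 +/- sqrt(n/N): "not found"
\end{verbatim}
}
A threshold $b$ between $\sqrt{n/N}\approx 0.16$ and $1$ separates the two cases here.
\end{slide}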
\begin{slide}[\slideopts,toc={}]{Decaying Vector Retrieval by Inner Product}
RNNs typically have a \emph{forgetting factor} $p<1$.
Given the sum of vectors
\[
\yv(n) = \sum_{m=0}^n p^{n-m}\xv(m)
\]
and a ``query vector'' $\wv = \xv(k)$,\\
\maybepause
the inner product now gives
\[
\wv^T\yv(n) \eqsp \sum_{m=0}^n \wv^Tp^{n-m}\xv(m) \;\approx\; p^{n-k}\xv^T(k)\,\xv(k) \eqsp p^{n-k} \;>\; b
\]
where $b$ is the detection threshold for $\xv(k)$, independent of $k$ if $\|\xv(k)\|=1$
\begin{itemize}
\mpitem \emph{Cannot retrieve} when $p^{n-k} < b$, setting an upper limit on $n$
\mpitem We need $p > b^{1/n}$, or equivalently $n_{\mbox{max}} \le \log(b)/\log(p)$
\mpitem Lower $b$ $\Rightarrow$ more memory, but also more false detections from ``interference''\\
(``other vectors in the sum'')
\end{itemize}
\end{slide}
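\begin{slide}[\slideopts,toc={}]{Sketch: Memory Horizon from the Forgetting Factor}
A quick numerical check of the bound $n_{\mbox{max}} \le \log(b)/\log(p)$
above (the values of $p$ and $b$ are arbitrary illustrations):
{\small
\begin{verbatim}
import numpy as np

p, b = 0.999, 0.1                 # forgetting factor, detection threshold
n_max = np.log(b) / np.log(p)     # solves p**n == b for the oldest usable lag
print(n_max)                      # ~2301 steps of retrievable memory
\end{verbatim}
}
Lowering $b$ or raising $p$ extends the horizon, at the cost of more
false detections from interference.
\end{slide}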
%---------------------------------------------------------------------------------------------------
\begin{slide}[\slideopts,toc={Orthogonality}]{Orthogonality in High Dimensions}
\vspace{-1em}
% \input orthogonality.tex
Let $\ab\in\reals^N$ and $\bb\in\reals^N$ be two normally random, real, unit-norm vectors in $N$ dimensions,
with $\|\ab\|=\|\bb\|=1$
\maybepause
The dot-product (inner product) of
$\ab^T=[a_1,a_2,\ldots,a_N]$ and
$\bb^T=[b_1,b_2,\ldots,b_N]$ is defined as
\[
\ab \cdot \bb = \ab^T\bb = \sum_{i=1}^{N} a_i b_i.
\]
\maybepause
The squared dot product is
\[
(\ab \cdot \bb)^2 = \left(\sum_{i=1}^{N} a_i b_i\right)^2 = \sum_{i=1}^{N} \sum_{j=1}^{N} a_i a_j b_i b_j.
\]
\maybepause
Expected value (average):
\[
E\left[(\ab \cdot \bb)^2\right]
= \sum_{i=1}^{N} \sum_{j=1}^{N} E[a_i a_j] \, E[b_i b_j]
= \sum_{i=1}^{N} \frac{1}{N}\,\frac{1}{N}
= \zbox{\frac{1}{N}}
\]
\end{slide}
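\begin{slide}[\slideopts,toc={}]{Sketch: Expected Squared Dot Product $= 1/N$}
A Monte Carlo check of the expectation just derived (sizes illustrative):
{\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, trials = 256, 20000
a = rng.standard_normal((trials, N))
a /= np.linalg.norm(a, axis=1, keepdims=True)  # random unit vectors
b = rng.standard_normal((trials, N))
b /= np.linalg.norm(b, axis=1, keepdims=True)
dots = np.einsum('ij,ij->i', a, b)             # row-wise dot products
print(np.mean(dots ** 2), 1 / N)               # both ~0.0039
\end{verbatim}
}
\end{slide}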
\begin{slide}[\slideopts,toc={}]{Orthogonality in High Dimensions, Continued}
\vspace{-1em}
We just showed the \emph{expected squared dot product of two normally random unit vectors in $\reals^N$ is $1/N$}, \ie,
\[
E\left[(\ab \cdot \bb)^2\right] = \zbox{\frac{1}{N}}
\]
since $E[a_i a_j]=E[b_i b_j]=0$ for $i \ne j$, $E[a_i^2] = E[b_i^2] = 1/N$, and $\ab$ and $\bb$ are independent.\\
\maybepause
\textbf{Suggestions for Training:}
\begin{itemize}
\mpitem \emph{Initialize biases (detection thresholds) near $1/N$}
\mpitem \emph{Divide the sum of $M$ vectors by $\sqrt{M}$:}
\begin{itemize}
\item ``power normalization''
\mpitem ``\tx{RMSNorm}-preserving''
% \mpitem Often seen in machine learning
% \mpitem Done in \htmladdnormallink{Hawk \& Griffin}{https://arxiv.org/abs/2402.19427}, \eg
\mpitem ``Keep vector sums near the unit sphere''
\end{itemize}
\mpitem Apply \tx{RMSNorm} when \emph{training} the initial \emph{vocabulary embedding} (``\tx{word2sphere}'')
\mpitem Set the \emph{model dimension} just sufficient for the \emph{layer width} at each level
\mpitem \textbf{Caveat:} We are only considering associative recall as \emph{one mechanism} here.\\
\emph{Other mechanisms are definitely learned}, such as ``attention sinks'', ``induction heads'', \ldots
\end{itemize}
% Note to self: embeddings have semantic proximity while feature maps may not
\end{slide}
\begin{slide}[\slideopts,toc={}]{Orthogonality of Random Sums}
\vspace{-1em}
Similarly,
\vspace{-1em}
\beas
E\left[\left(\wv^T\yv_n\right)^2\right]
&=& E\left[\left(\sum_{m=0}^n \wv^T\xv_m\right)^2\right]
\eqsp \sum_{l=0}^n\sum_{m=0}^n E \left[ \wv^T\xv_l\xv_m^T\wv \right]\\[5pt]
&=& \sum_{m=0}^n E \left[ \wv^T\xv_m\xv_m^T\wv \right]
\eqsp \sum_{m=0}^n E\left[\left(\wv^T\xv_m\right)^2\right]
\eqsp \zbox{\frac{n+1}{N}}
\eeas
assuming $\wv$ is not among the summed $\xv_m$, and $\|\wv\|=\|\xv_m\|=1$ for all $m$. Thus,
\emph{retrieval becomes unreliable when the number of summed vectors nears the model dimension $N$.}
\begin{itemize}
\mpitem $N$ is of course the maximum number of exactly orthogonal vectors possible in $N$ dimensions
% \mpitem We the importance of input and feedback \emph{gating}
% \mpitem Later it will similarly be the importance of \emph{selective attention}
\mpitem If $L$ vectors are typically in the sum, our Perceptron ``bias'' (detection threshold)
should be higher than $L/N$
\mpitem \textbf{Suggestion:} \emph{Keep the number of active vectors in memory well below $N$}
% Move this to attention section: \mpitem \textbf{Consider:} Devise vocabulary grids on which all subset sums are \emph{unique}\\
% (Pairs well with 1-bit weights)
% Can replace inner-product with direct lookup
\end{itemize}
\end{slide}
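\begin{slide}[\slideopts,toc={}]{Sketch: Interference Growth in Random Sums}
A simulation of the interference power $E[(\wv^T\yv_n)^2]$ as the number of
summed unit vectors approaches the model dimension (sizes illustrative):
{\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, trials = 128, 500
for n in (8, 32, 128):            # unit vectors in the sum
    X = rng.standard_normal((trials, n, N))
    X /= np.linalg.norm(X, axis=-1, keepdims=True)
    y = X.sum(axis=1)             # memory state after n writes
    w = rng.standard_normal((trials, N))
    w /= np.linalg.norm(w, axis=-1, keepdims=True)  # w not in the sum
    leak = np.einsum('ti,ti->t', w, y)
    print(n, np.mean(leak ** 2), n / N)  # interference power grows like n/N
\end{verbatim}
}
At $n=N$ the interference power reaches $\approx 1$, the same size as a true match.
\end{slide}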
\begin{slide}[\slideopts,toc={Language}]{How Many Summed Vectors Needed for Language Parsing? (BERT style)}
\maybepause
\vspace{-1em}
It is well known that \emph{phone numbers} were limited to \emph{7
digits} due to our \emph{limited short-term memory} for \emph{unrelated}
objects. \maybepause Can \emph{language} be parsed using 7 or fewer vectors
at each level? [The original Transformer paper had 8 attention heads and 6 layers (like the neocortex)]
\maybepause
\textbf{Layers (\eg):}
\vspace{-1em}
\begin{multicols}{2}
\begin{enumerate}
\mpitem Base vocabulary = characters\\
(26 for English)
\mpitem Syllable in 7 chars or less\\
(44 syllables in English;\\
107 in Int'l Phonetic Alphabet)
\mpitem Word in 7 syllables or fewer
\mpitem Noun + 6 or fewer modifying adjectives
\mpitem Verb + up to 6 adverbs
\mpitem Noun phrase
\columnbreak
\mpitem Direct or indirect object
\mpitem Prepositional phrase
\mpitem Subject, verb, [indirect object], object
\mpitem Sentence
\mpitem Paragraph
\mpitem Section
\mpitem Chapter
\mpitem Book
\mpitem Subject Area Hierarchy $\ldots$
% \mpitem $\ldots$
\end{enumerate}
\end{multicols}
\maybepause
\vspace{-1em}
Different cortical areas (6 layers each) are needed for this many levels.\\
\maybepause
\textbf{Complex Sentence Diagram Examples:}
{\tiny
\url{https://www.quora.com/In-regards-to-diagramming-sentences-which-one-is-the-most-difficult-youve-ever-come-across}
}
% Jamba block (3/24) goes 4 Mamba layers : Attention : three more Mambas (alternately MoE types)
% Zamba block (5/24) is 6 Mamba + Skip : Shared Attention : MLP
% Samba block (6/24) is Mamba : MLP : SlidingWindowAttention : MLP
\end{slide}
\begin{slide}[\slideopts,toc={}]{Pictorial Examples}
\begin{itemize}
\mpitem 6 dot-patterns on a die
\mpitem \htmladdnormallink{7 LED segments for numbers}{https://www.opledtw.com/wp-content/uploads/2021/11/7segment_display_arabic_numeral_7段顯示器數字顯示.png}
\mpitem 4 or fewer strokes to draw a letter
%\mpitem 10 digits driven by number of fingers
\end{itemize}
\maybepause
\emph{Hierarchy} used to keep the number of components per concept \emph{small}
\vspace{1em}
\maybepause
\textbf{Suggestions:}
\begin{itemize}
\mpitem \emph{Ready signal} when symbol is recognized (whole letter, word, phrase, etc.)
\mpitem \emph{Reset memory} after symbol recognition
\mpitem Memory can be \emph{small!}
\end{itemize}
Maybe these signals are happening in the layers already?
% 26 letter characters in the alphabet, but a lot of time is spent learning them, and in ~7 groups:
% ABCD, EFG, HIJK, LMNOP, QRST, UV, WX&Y&Z
% Chinese characters? https://www.ames.cam.ac.uk/files/introduction_to_chinese_characters.pdf
\end{slide}
\begin{slide}[\slideopts,toc={}]{Orthogonality of Exponentially Decaying Random Sums}
\vspace{-1em}
RNNs typically have a \emph{forgetting factor} $p<1$, in which case we have,\\
defining $\mu=n-m$ and $\lambda=n-l$:
\beas
E\left[\left(\wv^T\yv_n\right)^2\right]
&=& E\left[\left(\sum_{m=0}^n \wv^Tp^\mu\xv_m\right)^2\right]
\eqsp \sum_{l=0}^n\sum_{m=0}^n E \left[ \wv^Tp^\lambda\xv_lp^\mu\xv_m^T\wv \right]\\[5pt]
&=& \sum_{m=0}^n p^{2\mu} E \left[ \wv^T\xv_m\xv_m^T\wv \right]
\eqsp \sum_{m=0}^n p^{2\mu} E\left[\left(\wv^T\xv_m\right)^2\right]\\[5pt]
&=& \zbox{\frac{1}{N} \frac{1-p^{2(n+1)}}{1-p^2}}
\;\to\; \frac{1}{N} \frac{1}{1-p^2}\quad\mbox{(as $n\to\infty$)}
\eeas
\begin{itemize}
\mpitem For $1/(1-p^2) < N$, keep $p < \sqrt{(N-1)/N}$
\mpitem For $1/(1-p^2) < N/2$, keep $p < \sqrt{(N-2)/N}$
\end{itemize}
and so on. \emph{This gives us one way to calculate a maximum feedback coefficient $p$ in RNNs}
\end{slide}
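\begin{slide}[\slideopts,toc={}]{Sketch: Checking the Decaying-Sum Formula}
A simulation of the closed form just derived (parameters illustrative):
{\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, n, p, trials = 128, 200, 0.98, 2000
decay = p ** np.arange(n, -1, -1)      # p^(n-m) for m = 0..n
acc = 0.0
for _ in range(trials):
    X = rng.standard_normal((n + 1, N))
    X /= np.linalg.norm(X, axis=1, keepdims=True)
    w = rng.standard_normal(N)
    w /= np.linalg.norm(w)             # query not in the sum
    acc += (w @ (decay[:, None] * X).sum(axis=0)) ** 2
print(acc / trials)                                   # simulated
print((1 - p ** (2 * (n + 1))) / ((1 - p ** 2) * N))  # closed form above
\end{verbatim}
}
\end{slide}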
% \input ModelDims.tex
%---------------------------------------------------------------------------------------------------
\section[\sectopts,toc={Architectures}]{Architectures}
\begin{slide}[\slideopts,toc={Vector Memory}]{Cumulative Vector Memory}
\vspace{-1em}
\myFigureToWidth{integrator}{0.6\twidth}{}
\vspace{-1em}
\myFigureToWidth{ginaSphere1}{0.5\twidth}{MidJourney}
%\myTwoFiguresToWidth{integrator}{ginaSphere1}{0.5\twidth}{Vector summer}{MidJourney depiction for $N=3$}{}
\end{slide}
\begin{slide}[\slideopts,toc={Gating}]{Gated Vector Memory}
\myFigureToWidth{integrator}{0.6\twidth}{Input Vector Summer}
\begin{itemize}
\item \textbf{Problem:} Need a \emph{memory reset}
\mpitem \textbf{Solution:} Set \emph{feedback gain to zero} for one step to clear the memory
\item[]
\mpitem \textbf{Problem:} Need an \emph{input gate} to suppress unimportant inputs
\mpitem \textbf{Solution:} Set \emph{input gain to zero} for unimportant inputs
\item[]
\mpitem We just invented \textbf{gating}, used extensively in neural sequence models
\end{itemize}
\end{slide}
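\begin{slide}[\slideopts,toc={}]{Sketch: Gated Vector Memory}
A minimal sketch of the gated memory above, with per-step scalar gates
(names illustrative):
{\small
\begin{verbatim}
import numpy as np

def gated_memory(x, g, p):
    """y[n] = g[n]*x[n] + p[n]*y[n-1].

    g[n] = 0 suppresses an unimportant input;
    p[n] = 0 resets (clears) the memory for one step."""
    y = np.zeros_like(x)
    y[0] = g[0] * x[0]
    for n in range(1, len(x)):
        y[n] = g[n] * x[n] + p[n] * y[n - 1]
    return y
\end{verbatim}
}
For example, setting $p[n]=0$ at recognized symbol boundaries implements the
\emph{reset-after-recognition} suggestion from the previous section.
\end{slide}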
\begin{slide}[\slideopts,toc={Gated RNN}]{Gated Recurrent Network}
\vspace{-1em}
\textbf{Idea:} \emph{Learn} the input and feedback gates as functions of input $\xv_n$\\
based on many input-output examples $(\xv_n,\yv_n)$ (``training data''):
%% \mpitem Some item
%% \end{itemize}
\vspace{-1em}
\myFigureToWidth{one-pole-rnn}{0.4\twidth}{Vector Memory with Learned Input and Feedback Gates}
\vspace{-1em}
\maybepause
\textbf{Suggestions:}
\begin{itemize}
\mpitem Use learned, input-based, \emph{activations} for gating (LSTM, GRU, Mamba)
\mpitem While activated, \emph{optionally} set \emph{memory duration} via $\peev$ magnitude (SSMs, Mamba)
\begin{itemize}
\mpitem \emph{Initialize} $\peev$ for desired initial memory duration (exponential fade time)
\mpitem Learn $\peev(\xv_n)$ as $\Imtx\cdot e^{-\Delta}\approx \Imtx -\Imtx \Delta$,
%and $\geev(\xv_n)$ as $\mbox{Linear}(\xv_n,\yv_n)\cdot\Delta$,
where $\Delta = \mbox{softPlus}(\mbox{parameter}(\xv_n,\yv_n))$ (guaranteed stable --- no ``exploding gradients'')
[Also multiply $\geev(\xv_n)$ by $\Delta$] % for gain-normalization]
%\mpitem \emph{memory duration} $\Rightarrow$ \emph{linear projection}\\
% (SSMs, Mamba) ---
% \mpitem SSMs led to \emph{linear projection} (\emph{no activation} in feedback)
% based on cool \emph{history-polynomial-approximation} ideas that ultimately went away (S4D), but \emph{linearity survives}.\\
\mpitem Consider \emph{separate meaning-driven activation} multiplying feedback: $\sigma(\Lmtx\xv)\peev(\xv)$
% Example: activation by end-of-sentence detector, end-of-paragraph, etc. (e.g., for RAG sentence/paragraph embeddings)\\
% Analogy: clear Transformer input context buffer after processing a full sentence/paragraph
\end{itemize}
\end{itemize}
\end{slide}
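\begin{slide}[\slideopts,toc={}]{Sketch: $\Delta$-Parameterized Gates}
A sketch of the stable gate parameterization suggested above:
$\Delta = \mbox{softPlus}(\cdot) > 0$, so $\peev = e^{-\Delta}\in(0,1)$
and feedback can never explode (weights and sizes illustrative):
{\small
\begin{verbatim}
import numpy as np

def gated_rnn_step(x, y_prev, W_delta, b_delta):
    delta = np.logaddexp(0.0, W_delta @ x + b_delta)  # softplus: delta > 0
    p = np.exp(-delta)          # feedback gate in (0,1): guaranteed stable
    g = delta                   # input gain multiplied by delta, as above
    return g * x + p * y_prev

N = 64
rng = np.random.default_rng(0)
W_delta, b_delta = 0.1 * rng.standard_normal((N, N)), np.zeros(N)
y = np.zeros(N)
for x in rng.standard_normal((16, N)):  # toy input sequence
    y = gated_rnn_step(x, y, W_delta, b_delta)
\end{verbatim}
}
\end{slide}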
\begin{slide}[\slideopts,toc={Channel Mixing}]{Input Gating with Projection}
\textbf{Idea:} Learn a \emph{full matrix} for the input to provide \emph{arbitrary projection} as well as \emph{gating}
\myFigureToWidth{one-pole-rnn-mixing}{0.4\twidth}{Vector Memory with Learned Input Projection and Gating}
The added \emph{linear transformation} can
\begin{itemize}
\mpitem further optimize the \emph{input embedding} for the current \emph{task} and \emph{training data},
\mpitem change the spatial layout to make room for things like temporal encoding,
\mpitem up-project to a higher internal model dimension (``state expansion'' discussed later).
\end{itemize}
In \emph{state-space models} such as S4 and H3,
\begin{itemize}
\mpitem full \emph{feedback matrices} $\Pmtx(\xv_n)$ were investigated, but
\mpitem diagonal $\peev(\xv_n)$ were found to be sufficient (even a constant diagonal in Mamba-2).
\end{itemize}
\end{slide}
%% ERROR IN FIGURE - REMOVE FOR NOW
%% \begin{slide}[\slideopts,toc={MLP Layer}]{MLP Information Extraction}
%% %\vspace{-1em}
%% Let's hook up an MLP to detect vectors in our vector memory and assign meaning:
%% %\vspace{-1em}
%% \myFigureToWidth{one-pole-rnn-mlp}{0.8\twidth}{Gated Recurrent Unit into an MLP}
%% %\vspace{-1em}
%% \maybepause
%% \begin{itemize}
%% \mpitem Hidden layer(s) can have any dimension
%% \mpitem Final linear layer $\Wmtx_2$ typically projects back to the model dimension
%% % Not introduced yet, or in figure: \mpitem Skip connection normally provided, bypassing the ``residual stream''
%% \end{itemize}
%% \end{slide}
\begin{slide}[\slideopts,toc={Skip Connection}]{Output Gating}
\vspace{-1em}
\textbf{Idea:} Since we have input and feedback gates, why not an \textbf{\emph{output gate} and bypass?}
\myFigureToWidth{one-pole-rnn-skip}{0.6\twidth}{Gated RNN with \textbf{Skip Connection}}
\maybepause
Output gating allows the network to be ``bypassed'' when not helpful.
\begin{itemize}
\mpitem \textbf{``Obvious'' Suggestion:} The bypass path should be scaled for \emph{power normalization}\\
\mpitem \textbf{Better yet:} Don't scale the bypass and use \tx{RMSNorm} at the input of the next layer\\
(prevents a ``bad layer'' from isolating deeper layers from the input with garbage,\\
and equalizes gradient backpropagation to all layers)
\end{itemize}
\end{slide}
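\begin{slide}[\slideopts,toc={}]{Sketch: \tx{RMSNorm} at the Next Layer's Input}
A minimal \tx{RMSNorm} sketch for the unscaled-bypass suggestion above
(the learned per-channel scale used in practice is omitted for brevity):
{\small
\begin{verbatim}
import numpy as np

def rms_norm(x, eps=1e-6):
    """Rescale x so its RMS value is 1 (back toward the hypersphere)."""
    return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)

# Unscaled bypass, then normalize at the next layer's input:
# x_next = rms_norm(layer_output + x_bypass)
\end{verbatim}
}
\end{slide}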
\begin{slide}[\slideopts]{State Expansion}
\vspace{-1em}
\textbf{Idea:} \emph{Expand} vector-memory dimension to an integer multiple of the model dimension:
% \vspace{-4em}
% \myFigureToWidth{one-pole-rnn-seq}{0.6\twidth}{} % {Sequence Modeling with Gated RNNs}
%
\myFigureToWidth{statespace-rnn}{0.5\twidth}{}
\vspace{-1em}
\maybepause
\emph{``Structured State-Space Models''} (SSM) look like this (\eg, Mamba)
\begin{itemize}
\mpitem Increased storage capacity (more vectors can be summed and later retrieved)
\mpitem Feedback matrix $\Amtx$ typically \emph{diagonal} since 2022 (see ``S4D'')\\
\maybepause $\Rightarrow$ Parallel bank of vector one-poles (\emph{``linearly'' gated, state-expanded RNNs})
\mpitem In Mamba-2, $\Amtx = p\,\Imtx$, \ie, \emph{shared memory duration} across expanded state
% \mpitem Processed sequence (``context buffer'') is \emph{indefinitely long}
% \mpitem Multiple parallel memories (as in ``multi-head attention'')
\mpitem Gating matrices in Mamba[-2] are simple linear input projections:
% \[
$
[\Bmtx(\xv_n), \Cmtx(\xv_n)] = \Lmtx\,\xv_n % , \qquad \Dmtx(\xv_n) = \mbox{SiLU}(\Lmtx^\prime\xv_n)\]
$
% \]
% (See Mamba, \eg)
% \mpitem Conv1D mixing followed by SiLU on input for Mamba (only nonlinear activation)
% \mpitem $\Cmtx$ could become an \emph{attention matrix} across the expanded state; $\Amtx$ could make it \emph{shift} like a
% \emph{transformer context buffer} (using a unit subdiagonal, \eg)
% \mpitem $\Amtx = -p\Imtx$ as of Mamba-2, \ie, parallel RNNs all have the same memory duration
\end{itemize}
\end{slide}
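\begin{slide}[\slideopts,toc={}]{Sketch: Diagonal SSM with State Expansion}
A minimal recurrence in the spirit of the diagram above: $\Amtx = p\,\Imtx$
(Mamba-2 style), with $\Bmtx(\xv_n)$ and $\Cmtx(\xv_n)$ as linear input
projections (all sizes and weights illustrative):
{\small
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, M, T = 64, 4, 32             # model dim, expansion factor, steps
p = 0.95                        # shared memory duration: A = p*I
L_B = 0.1 * rng.standard_normal((M, N))   # [B, C] = L x (linear gates)
L_C = 0.1 * rng.standard_normal((M, N))

h = np.zeros((M, N))            # expanded state: M x model dimension
for x in rng.standard_normal((T, N)):
    B, C = L_B @ x, L_C @ x     # input-dependent gates, (M,) each
    h = p * h + np.outer(B, x)  # parallel bank of vector one-poles
    y = C @ h                   # read-out over the expanded state
\end{verbatim}
}
\end{slide}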
%Must shorten:
%\input memory-access.tex
\section[\sectopts,toc={Attention}]{Attention}
\begin{slide}[\slideopts,toc={Attention}]{Attention Layer}
\vspace{-1em}
\textbf{Idea:} Also use \emph{FIR Filtering} (SSM State Expansion Factor $M$, $\Amtx$ subdiagonal):
%\myFigureToWidth{one-pole-rnn-seq}{0.6\twidth}{} % {Sequence Modeling with Gated RNNs}
%\vspace{-4em}
\myFigureToWidth{attention}{0.8\twidth}{}
\vspace{-1em}
\emph{Separately learnable FIR coefficient matrices} $\dscalar_k[\xv(n)], \cscalar_j[\xv(n-j),k]$, depending on:
\begin{enumerate}
\mpitem input \emph{position} $j$ in the input sequence (``context buffer'' or ``expanded state'' + [W]RoPE)
\mpitem input \emph{vector} $\xv(n-j)$,\; $j=0,1,2,\dots,M$
\mpitem \emph{output-position} $k$ being computed,\; $k=0,1,2,\dots,M$ ($M+1$ outputs)
\end{enumerate}
\maybepause
\textbf{Idea:} Add \emph{relevance gating} suppressing unimportant inputs to each output (``attention'')
\vspace{0.2em}
\maybepause
\textbf{Idea:} Create \emph{new embedding vectors} as \emph{sums} of relevant input vectors (``attention'')
\vspace{0.2em}
\maybepause
\textbf{Idea:} Measure relevance using an \emph{inner product} between the output and input positions (``dot-product attention'')
\end{slide}
\begin{slide}[\slideopts]{Dot-Product Attention}
\vspace{-2em}
\myFigureToWidth{attention}{0.8\twidth}{}
\vspace{-2em}
\textbf{Relevance Gating:}\\
Let $\xv_k$ denote $\xv(n-k)$.
The contribution from input $\xv_j$ to the nonlinear FIR sum for output $\yv_k$ can be calculated as
\\
$
\cscalar_{kj}\xv_j = \left[\left(\sum_{m\in \Rscr(\xv_k)}\xv_m\right)^T \xv_j\right]\xv_j
$
\\
\maybepause
or more generally $\zbox{\cscalar_{kj} = \qv_k^T \xv_j}$, where
\\
%\begin{itemize}
% \mpitem
$\qv_k$ is called the \emph{query} vector for position $k$ in the output sequence
% \mpitem $k_j[\xv_j,j]$ is called the \emph{key} vector for position $j$ in the input sequence
% \mpitem $v_j[\xv_j]$ is called the \emph{value} vector for position $j$ in the input sequence
%\end{itemize}
\vspace{1em}
\maybepause
The query $\qv_k$ can be, for example, a \emph{sum of vectors allowed in the attention sum}:\\
$\qv_k = \xv_k + \xv_{m_1} + \cdots + \xv_{m_k}$\\
$\Rightarrow (\qv_k^T \xv_j)\xv_j \approx \xv_j$, if $\xv_j$ is similar to \emph{any vector} in the query sum.
%We still need a reason to generalize $\xv$ above to a \emph{key} $K(\xv)$ and \emph{value} $V(\xv)$
\end{slide}
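\begin{slide}[\slideopts,toc={}]{Sketch: Dot-Product Attention (Raw Scores)}
A single-head sketch of the relevance gating above, keeping the slide's raw
inner-product scores (no \tx{Softmax} or causal mask; all values illustrative):
{\small
\begin{verbatim}
import numpy as np

def dot_attention(X, Q):
    """y_k = sum_j (q_k^T x_j) x_j: relevance-gated sum of inputs.

    X: (T, N) inputs; Q: (T, N) queries, one per output position."""
    C = Q @ X.T                 # C[k, j] = q_k^T x_j (relevance scores)
    return C @ X                # relevance-weighted vector sums

rng = np.random.default_rng(0)
X = rng.standard_normal((8, 64))
X /= np.linalg.norm(X, axis=1, keepdims=True)
Q = np.tile(X[2] + X[5], (8, 1))  # query = sum of sought vectors
Y = dot_attention(X, Q)           # each row ~ x_2 + x_5 (+ interference)
\end{verbatim}
}
\end{slide}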
%% \begin{slide}[\slideopts]{Queries, Keys, and Values}
%% \begin{itemize}
%% A query $\qv_k$ formed as a sum of sought vectors, looks useful, but we can generalize further
%% \mpitem The query for a \emph{noun} can look for \emph{all verbs} and \emph{all adjectives} that could contextualize it
%% \mpitem The model space can have different partitions for nouns, verbs, and adjectives
%% \mpitem We can learn a projection $K(\xv)$ that projects any $\xv$ to a vector representing ``noun'', ``verb'', or ``adjective,'' etc.
%% \mpitem Finally, instead of adding the original vector into the sum, we can learn a different \emph{value vector} $V(\xv)$
%% for building attention-sums in some new sector of the model space, etc.
%% \end{itemize}
%% So now we're up to (Q,K,V) attention:
%% \[
%% \cscalar_{kj}\xv_j = (\qv_k^T k_j)v_j
%% \]
%% %Similarly, the key vector $k_j$ can be a sum of \emph{all queries} to which it applies, \eg,\\
%% %$k_j = \xv_j + \xv_{n_1} + \cdots + \xv_{n_k} \Rightarrow (\qv_k^T k_j)\xv_j \approx M\xv_j$, given $M$ similar vectors in both.
%% %% \textbf{Idea:} For more flexibility in the attention sum (or to learn
%% %% down-projections in \emph{multi-head} mode), replace $\xv_j$ in the
%% %% attention sum with a learned \emph{value vector} $v_j[\xv_j]$\\
%% \end{slide}
\begin{slide}[\slideopts]{Multi-Head Attention}
\textbf{Idea:} To support multiple meaning possibilities, \emph{partition the model space} into\\
parallel independent \emph{attention calculations} (``multi-head attention'')
%\myFigureToWidth{one-pole-rnn-seq}{0.6\twidth}{} % {Sequence Modeling with Gated RNNs}
%\vspace{-4em}
\begin{itemize}
\mpitem Each \emph{attention head} can form an independent input interpretation
\mpitem Useful for \emph{ambiguous} sequences, especially in the lower layers
\mpitem Also introduced in the Transformer paper (2017)
\end{itemize}
\maybepause
Now we need \emph{down-projections} of the relevance-calculation components\\
$\Rightarrow$ relevance of input $j$ to output $k$ in attention-head $l$ becomes proportional to
\[
\cscalar_{kj}\xv_j = (\qv_k^T \xv_j)\xv_j \;\longrightarrow\; \cscalar_{lkj}\xv_{lj} = \left[\qv_{lk}^T(\xv_k) k_{lj}(\xv_{j})\right]v_{lj}(\xv_j)
\]
where $\qv_{lk}$ (``query''), $k_{lj}$ (``key''), and $v_{lj}$
(``value'') vectors are learned \emph{down-projections} of the input $\xv_j$
for each attention-head $l$ and for all sequence indices $j$ and $k$
in the context buffer (``Transformer'')
%% \mpitem Filter coefficients computed from input as before, but smaller
%% \[
%% $(\qv_k^T k_j)v j$,
%% \]
%% using down-projected queries $\qv_k(\xv)$, \emph{keys} $k_j(\xv)$, and \emph{values} $V_j(\xv)$
\maybepause
Other useful generalizations can be imagined for these learned (Q,K,V)
vectors, such as grouping grammatical functions, creating new
model-space regions, etc.
\end{slide}
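\begin{slide}[\slideopts,toc={}]{Sketch: Multi-Head (Q,K,V) Attention}
A sketch of the multi-head generalization above, with learned down-projections
per head (again raw scores, no \tx{Softmax}; weights illustrative):
{\small
\begin{verbatim}
import numpy as np

def multi_head_attention(X, Wq, Wk, Wv):
    """Head l: c_{lkj} = (Wq[l] x_k)^T (Wk[l] x_j), summing values Wv[l] x_j.

    X: (T, N); Wq, Wk, Wv: (H, d, N) down-projections, d = N/H."""
    heads = []
    for Wq_l, Wk_l, Wv_l in zip(Wq, Wk, Wv):
        Q, K, V = X @ Wq_l.T, X @ Wk_l.T, X @ Wv_l.T  # (T, d) each
        heads.append((Q @ K.T) @ V)       # relevance scores times values
    return np.concatenate(heads, axis=1)  # back to (T, N)

rng = np.random.default_rng(0)
T, N, H = 8, 64, 4
W = lambda: 0.3 * rng.standard_normal((H, N // H, N))
Y = multi_head_attention(rng.standard_normal((T, N)), W(), W(), W())
\end{verbatim}
}
\end{slide}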
% Relatively weak/obvious points:
%% \begin{slide}[\slideopts]{Weight Tying}
%% When the sequence model maps input to output in the same ``language'' (e.g., English to English),
%% it makes sense to use the \emph{same embedding vectors} at the input and output layers, instead
%% of separately learning a set of weights for mapping to the final output.
%% This is called ``weight tying'' (many fewer parameters, better results).
%% \end{slide}
%% \begin{slide}[\slideopts]{Hierarchical Blocks}
%% \begin{itemize}
%% \mpitem Cascade blocks of attention + MLP and/or gated recurrence + MLP to model \emph{hierarchical relationships} like image features or grammatical constructs
%% \mpitem Attention and gated RNNs are called ``mixing layers'' (successive inputs are combined)
%% \mpitem MLPs are called ``point transformations'' (general mapping of any vector from one place to another)
%% \mpitem RMSNorm typical at the input to put it on the hypersphere --- also used internally\\
%% (see Hawk/Griffin e.g.)
%% \mpitem Thus, these ``point MLPs'' are effectively data-dependent \emph{rotation matrices}.\\
%% In model-dimension $N$, we need only learn $N-1$ \emph{rotation angles} for each input class.\\
%% The direct mapping of the coordinates used now seems efficient enough, but projection back to the sphere
%% during training might help. Then perhaps there is no need for RMSNorm at the output after summing the skip
%% connection.
%% \end{itemize}
%% \end{slide}
%% \begin{slide}[\slideopts,toc={Other Features}]{Other ``Obvious'' Features}
%% \begin{itemize}
%% \mpitem Tying output ``language modeling head'' weights to the input embedding weights
%% \mpitem Positional encoding within an RNN
%% \end{itemize}
%% Less obvious:
%% \begin{itemize}
%% \mpitem Multihead Attention (down-projection + independent spatial processing)
%% \mpitem (Q,K,V) matrices for ``dot-product attention'' (down-projection + inner-product function + value flexibility)
%% \end{itemize}
%% \end{slide}
%% \begin{wideslidewhite}[\slideopts,toc={Architectures}]{Architectures}
%% \vspace{-4em}
%% \myFigureToWidth{Architectures}{\twidth}{}
%% \vspace{-4em}
%% \end{wideslidewhite}
%% Too long to describe, and not as good as a diagram which comes next.
%% Also, IMPROVE THE NOTATION so that underbar always means an Nx1 column vector and nothing else:
%% \begin{slide}[\slideopts,toc={Unified State Space}]{State Space Unification of Transformers and GRNNs}
%% \vspace{-1em}
%% % State Transition Matrices
%% \begin{equation*}
%% \begin{array}{cc}
%% \underbrace{
%% \mathbf{A_T} = \ev^{j\Delta_n} \begin{pmatrix}
%% 0 & 0 & \cdots & 0 & 0 \\
%% \onevec & 0 & \cdots & 0 & 0 \\
%% 0 & \onevec & \cdots & 0 & 0 \\
%% \vdots & \vdots & \ddots & \vdots & \vdots \\
%% 0 & 0 & \cdots & \onevec & 0
%% \end{pmatrix}
%% }_{\mbox{Transformer}}
%% &\quad
%% \underbrace{
%% \mathbf{A_M} = \av_n \ev^{j\Delta_n} \begin{pmatrix}
%% \onevec & 0 & \cdots & 0 \\
%% 0 & \onevec & \cdots & 0 \\
%% \vdots & \vdots & \ddots & \vdots \\
%% 0 & 0 & \cdots & \onevec
%% \end{pmatrix}
%% }_{\mbox{Mamba-2 style RNN + [W]RoPE}}
%% \end{array}
%% \end{equation*}
%% \[
%% \mathbf{A_{TM}} = \ev^{j\Delta_n} \begin{pmatrix}
%% 0 & 0 & \cdots & 0 & 0 & 0 & \cdots & 0 \\
%% \onevec & 0 & \cdots & 0 & 0 & 0 & \cdots & 0 \\
%% 0 & \onevec & \cdots & 0 & 0 & 0 & \cdots & 0 \\
%% \vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots & \vdots \\
%% 0 & 0 & \cdots & \onevec & 0 & 0 & \cdots & 0 \\
%% % & & & & & \av_n \onevec & 0 & \cdots & 0 \\
%% 0 & 0 & \cdots & 0 & \bv_{1}(n) & \av_n & \cdots & 0 \\
%% \vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots & \vdots \\
%% 0 & 0 & \cdots & 0 & \bv_{N_M}(n) & 0 & \cdots & \av_n
%% \end{pmatrix}
%% \quad
%% \mathbf{B_{TM}} = \begin{pmatrix} \beev_1(n) \\ 0 \\ 0 \\ \vdots \\ 0 \end{pmatrix}
%% \]
%% \end{slide}
\begin{slide}[\slideopts,toc={TransMamba}]{Transformer followed by GRNN with 2x State Expansion (like Mamba)}
\vspace{-1.6em}
\myFigureToWidth{transmamba}{\twidth}{}
%!\myFigureToWidth{transmamba}{\twidth}{TransMamba} % MambaFormer is taken
%!\vspace{-2em}
%!\maybepause
% Prompt 1: "Friendly cartoon Mamba snake" (cute but boring):
% => \myFigureToWidth{cartoon-mamba}{0.9in}{} % And now we don't want MambaFormer after all - this is great!
% Prompt 2: "Make it trans" (over the top better!):=>
%!\myFigureToWidth{cartoon-mamba-t}{0.9in}{} % And now we don't want MambaFormer after all - this is great!
\end{slide}
%% % Not finished:
%% \begin{slide}[\slideopts,toc={MambaX}]{From Direct Form I to Direct Form II}
%% \vspace{-1.6em}
%% \textbf{Idea: Mamba for Short-Term Memory, Transformer for (Finite) Long-Term Memory}
%% \myFigureToWidth{mambax}{\twidth}{MambaX}
%% \vspace{-2em}
%% %\maybepause
%% %\myFigureToWidth{cartoon-mambax}{0.9in}{}
%% \end{slide}
\begin{slide}[\slideopts,toc={Direct Forms}]{``Direct Form I'' or ``Direct Form II''?}
\vspace{-1.6em}
\myFigureToWidth{directforms}{\twidth}{XMamba (``DF-I'') versus MambaX (``DF-II'')}
\vspace{-2em}
\begin{multicols}{2}
\begin{itemize}
\mpitem Perfect short-term memory
\mpitem Fuzzy, fading, long-term memory
\mpitem Like Infini-Attention
\columnbreak
\mpitem Both memories see latest input
\mpitem IIR part less efficiently used
\end{itemize}
\end{multicols}
\maybepause
Direct Form I looks preferable because it separates short- and longer-term memory functions.
%% Hold off on this discussion for now:
%% In \htmladdnormallink{``On the Power of Convolution Augmented
%% Transformer'' (CAT)}{https://arxiv.org/pdf/2407.05591}, they found that\\
%% ``the locality of the convolution synergizes with the global view of
%% the attention''.
%% \begin{itemize}
%% \mpitem Convolution can ``summarize'' the context window as\\
%% ``salient summary tokens'' for attention
%% \mpitem Summary $(Q, K, V)$ matrices computed as $(X\ast F_i)W_i$, for $i=q,k,v$,\\
%% where $X$ is the input, $F_i$ is a learned filter, and $W_i$ is a learned projection
%% \end{itemize}
%% % They do not cite Infini-Attention
%% % Code will be at https://github.com/umich-sota/CAT
\end{slide}
%% No time to update these right now:
%% \begin{slide}[\slideopts,toc={Plan}]{Possible Next Steps}
%% \vspace{-1em}
%% \begin{itemize}
%% \mpitem Try to improve [Trans]\textbf{Mamba}[-2] on small synthetic datasets testing \emph{memory}
%% \begin{itemize}
%% \mpitem Vocabulary embeddings trained to the unit hypersphere (\eg, \tx{word2sphere})
%% \mpitem Memory \emph{duration} and \emph{reset} functions separately trained and implemented
%% \mpitem Initial \emph{biases} at $\zv$ versus $L/N$ etc.
%% \mpitem Do \emph{power normalization} in place of \tx{RMSNorm} where possible (efficiency)
%% \mpitem Try \emph{power normalized attention} in place of $1/\sqrt{d_h}$ and \tx{Softmax} (efficiency)
%% \mpitem Adapt \emph{model dimension} to \emph{layer width} at each level (efficiency)
%% \mpitem Truncated Recurrent Neural Network (TRNN) sliding-window memory + linear RoPE
%% \mpitem Translational Positional Encoding (TraPE) in its own head (no \tx{RMSNorm})
%% \mpitem Explore other ``Control Heads'' that flow along purely for ``conditioning'' like TraPE
%% \end{itemize}
%% \mpitem Progress to date:
%% \begin{itemize}
%% \mpitem New synthetic benchmarks analogous to ``needle in a haystack''
%% \mpitem Adapted Andrej Karpathy's \tx{makemore} code, adding Mamba and new benchmarks
%% \mpitem Four papers started, aiming for Arxiv, GitHub, ``AI social media,'' blog
%% \end{itemize}
%% \mpitem Feel free to take over any of these! (and LMK so I can do something else)
%% \end{itemize}
%% \end{slide}
\begin{slide}[\slideopts,toc={Hypersphere}]{Thanks for your Attention!}
\vspace{-1em}
\myFigureToWidth{ginaSphere1}{\twidth}{}
\end{slide}
\section[\sectopts,toc={History Samples}]{Sequence Modeling Snapshots}
\begin{slidewhite}[\slideopts, toc={LSTM \& GRU}]{LSTM and GRU}
\vspace{-6em}
\myFigureRotateToWidth{Architectures1}{-90}{\twidth}{}
\end{slidewhite}
\begin{slidewhite}[\slideopts, toc={SSM \& Mamba}]{Structured State Space and Mamba}
\vspace{-6em}
\myFigureRotateToWidth{Architectures2}{-90}{\twidth}{}
\end{slidewhite}
\begin{slidewhite}[\slideopts,toc={Hawk \& Griffin}]{Hawk and Griffin}
\vspace{-6em}
\myFigureRotateToWidth{Architectures3}{-90}{\twidth}{}
\end{slidewhite}
\begin{slidewhite}[\slideopts,toc={HGRN2}]{Gated ``Linear'' RNNs with State Expansion}
\vspace{-6em}
\myFigureRotateToWidth{Architectures4}{-90}{\twidth}{}
\end{slidewhite}
\begin{slidewhite}[\slideopts,toc={RWKV+}]{RWKV, Eagle, Finch}
\vspace{-6em}
\myFigureRotateToWidth{Architectures5}{-90}{\twidth}{}
\end{slidewhite}
\begin{slidewhite}[\slideopts,toc={Hybrid}]{Jamba, Zamba, \& Samba Hybrid Architectures (Mamba then Attention)}
\vspace{-6em}
\myFigureRotateToWidth{JambaZambaSamba}{-90}{\twidth}{}
\end{slidewhite}
%\begin{slidewhite}[\slideopts,toc={}]{Architectures 6}
%\vspace{-6em}
%\myFigureRotateToWidth{Architectures6}{-90}{\twidth}{}
%\end{slidewhite}
%\input ai-reading-2022.tex
%\input ai-reading-2023.tex
%\input ai-projects.tex
%\section[\sectopts]{Differentiable DSP (DDSP)}
%\input ddsp.tex
\end{document}
\endinput