conference_101719.tex

%% DO NOT CHANGE
\documentclass[twoside,9pt,journal,letterpage]{IEEEtran}
\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{placeins}
\usepackage{tabularx}
%\usepackage{svg}
\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
    T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
%% DO NOT CHANGE

\renewcommand{\tabularxcolumn}[1]{m{#1}}

\let\labelindent\relax
\usepackage[shortlabels]{enumitem}
\usepackage{float}
\setlength{\textfloatsep}{5pt}

\usepackage{datetime}
\newdateformat{monthdayyeardate}{%
  \monthname[\THEMONTH]~\THEDAY, \THEYEAR}

\newcommand{\titlestr}{Adaptive Clocking Techniques for SoC Supply Droop Response in Predictive 7nm CMOS}
\title{\titlestr}

\usepackage{lipsum}

%\author{
%\authorblockN{Dan Fritchman}
%\authorblockA{Department of Electrical Engineering and Computer Sciences\\
%University of California, Berkeley\\
%Berkeley, California  94720\\
%Email: dan\_fritchman@berkeley.edu}
%\and
%\authorblockN{Wahid Rahman}
%\authorblockA{Department of Electrical Engineering and Computer Sciences\\
%University of California, Berkeley\\
%Berkeley, California  94720\\
%Email: wahid.rahman@berkeley.edu}
%}

\author{
	Dan Fritchman, \IEEEmembership{Member, IEEE} and Wahid Rahman, \IEEEmembership{Member, IEEE}
	\thanks{Date of publication \monthdayyeardate\today.}
	\thanks{
		D. Fritchman and W. Rahman are with the Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, Berkeley, CA 94720 USA (e-mail: dan\_fritchman@berkeley.edu; wahid.rahman@berkeley.edu).}
	\thanks{
		This work was supported by Professor Borivoje Nikoli\'{c}.}
}

\markboth{UC Berkeley Proceedings of EE241B, May 2020}{Fritchman \MakeLowercase{\textit{and}} Rahman: \titlestr}

\IEEEpubid{~\copyright~2020 University of California, Berkeley}

\begin{document}
\maketitle
\IEEEpeerreviewmaketitle

\begin{abstract}
Adaptive clock generation techniques have emerged in recent generations of high-performance SoCs to mitigate timing failure due to supply voltage droops. Resilient techniques vary from analog voltage mixing to digital sensing and clock actuation. This work identifies a taxonomy for adaptive clock generation systems: adaptive clock distribution (ACD) and adaptive PLL-based schemes. Reported realizations in state-of-the-art processor SoCs are reviewed and key performance metrics for comparisons are identified. A PLL-based design is evaluated in a predictive 7nm CMOS technology and compared with prior works.
\end{abstract}

\begin{IEEEkeywords}
Adaptive clocking, adaptive frequency, power efficiency, supply-droop mitigation, supply-voltage droop.
\end{IEEEkeywords}

\maketitle

\section{Introduction}

Power management techniques in modern system-on-chips (SoCs) are critical for energy-efficient processors ranging from data servers to mobile devices. SoC thermal dissipation constraints and energy-saving modes necessitate system-level power management to decrease the power supply or reduce the number of active processing cores. Such techniques exhibit a decrease (i.e. droop) in the SoC supply voltage ($V_{DD}$) due to: the controlled decrease of $V_{DD}$ from a supply regulator or DC-DC converter; and the transient $L \frac{di}{dt}$ supply voltage ripple due to sudden current changes through package inductances when dynamically enabling or disabling on-chip processing cores. To maximize processing throughput, SoCs are designed to operate close to the maximum possible clock frequency ($f_{MAX}$) for the target ($V_{DD}$) with minimal guardbanding. Decreasing $V_{DD}$ reduces the drain currents of CMOS transistors, thereby increasing propagation delays of digital logic, potentially leading to irrecoverable timing-induced failure. 

%\begin{figure}[h]
%	\centering
%	\includegraphics[width=0.85\columnwidth]{fig_fmax}
%	\caption{Reduction of $V_{DD}$ and corresponding reduction in $f_{MAX}$ (adapted from \cite{ahmad2017}).}
%	\label{fig:fmax}
%\end{figure}
\begin{figure}[h]
	\centering
	\includegraphics[width=\columnwidth]{fig_droop_schem}
	\includegraphics[width=\columnwidth]{fig_droop}
	\caption{Supply droop transients due to package inductances and load current (adapted from \cite{hashimoto2018}).}
	\label{fig:droop}
\end{figure}

To mitigate such failures, adaptive clock generation techniques have emerged in recent generations of high-performance SoCs\cite{ahmad2017,hashimoto2018,wilcox2015,floyd2017,bowman2016}. Adaptive-clock systems detect transient supply events and dynamically adjust clock frequencies to not exceed a changing $f_{MAX}$ as $V_{DD}$ decreases. 

Transient response of adaptive clock techniques is vital. The external package routes supplying power to the SoC impact supply voltage settling during changes to on-chip core utilization \cite{hashimoto2018}, as illustrated in Fig.\ \ref{fig:droop}. The range of inductances \& capacitances in this network induces a medley of fast and slow time constants. Resilient adaptive clock generation circuits react quickly to fast ripples in $V_{DD}$ ranging from 50-100MHz \cite{hashimoto2018,wilcox2015} while also tracking slower transient settling at 1MHz and below \cite{hashimoto2018,bowman2016,wilcox2015}.

Several schemes exist to realize supply-induced clock adaptation ranging from directly controlling the clock-synthesizing phase-locked loop (PLL)\cite{ahmad2017,hashimoto2018} to modulating a tunable delay line or phase rotator downstream in an adaptive clock distribution (ACD) network\cite{bowman2016,floyd2017,wilcox2015,kwak2016self}. In this work, we survey and contrast the adaptive PLL- and ACD-based mechanisms presented in \cite{hashimoto2018} and \cite{wilcox2015}, respectively. This brief is organized as follows: Section \ref{sec:overview} provides an overview of adaptive clocking schemes and defines the scope of this work; Section \ref{sec:details} discusses the operating principles of mechanisms presented in \cite{hashimoto2018} and \cite{wilcox2015}; Section \ref{sec:comparison} provides a comparison of \cite{hashimoto2018} and \cite{wilcox2015}. Sections \ref{sec:pll-design} and \ref{sec:discussion} present and discuss an adaptive all-digital PLL design and its implementation in 7nm predictive CMOS. Section \ref{sec:conclusion} discusses planned work and concludes this brief.

% If using pudid, place pubidadjcol so it lands in second column of first page
\IEEEpubidadjcol

\vspace{-10pt}
\section{Adaptive Clocking Schemes}
\label{sec:overview}

Adaptive clocking systems include two fundamental components:

\begin{enumerate}[(a)]
\item a \textit{power-supply sensor}, which measures and
reports transient droops in supply voltage; and
\item a \textit{clock period actuator}, which modulates the
system clock period in response to reports from
the power-supply sensor.
\end{enumerate}

During a supply drop, low-latency sensing and actuation is crucial to detect and adjust the clock frequency quickly to ensure error-free operation continues throughout the event. If and when $V_{DD}$ recovers from this event to its nominal value, additional circuitry can assist with managing the corresponding $f_{CLK}$ recovery.

ACD- and adaptive PLL-based systems differ in their implementation of the clock period actuator. ACD-based systems decrease the clock frequency by extending clock periods using an ACD circuit placed between the clock-synthesizing PLL and the global clock distribution network as illustrated in Fig.\ \ref{fig:overview_acd}. A digital code $CTRL_{FCLK}$ from the power supply sensor controls the ACD circuit such that it extends the period of the synthesized clock, $CLK_{PLL}$, to the desired frequency given the sensed change in $V_{DD}$. This lower-frequency clock, $CLK_{SOC}$, is then distributed through the global clock network to the SoC processing cores. Examples of this ACD circuit include tunable-length delay \cite{bowman2016} and phase rotators \cite{wilcox2015}, of which the latter is discussed in Section \ref{sec:details_acd}.

\begin{figure}[h]
	\centering
	\includegraphics[width=0.7\columnwidth]{fig_overview_acd}
	\caption{Adaptive clock distribution (ACD) based system.}
	\label{fig:overview_acd}
\end{figure}

\begin{figure}[h]
	\centering
	\includegraphics[width=\columnwidth]{fig_overview_pll}
	\caption{Adaptive PLL-based system with possible intra-PLL control variants.}
	\label{fig:overview_pll}
\end{figure}

Adaptive PLL-based systems, in contrast, incorporate information from the power-supply sensor to dynamically change the behaviour of PLL sub-components. Typically, these schemes affect oscillator control on top of the traditional PLL loop. In \cite{ahmad2017} for example, the oscillator is controlled indirectly through a loop filter modified for droop response: when a droop is detected, the traditional proportional- and integral-loop filter switches to proportional-only to form a type-I PLL that tunes the oscillator to frequency lock without hazardous over- or underclocking. In \cite{hashimoto2018}, this idea is further extended by interrupting the PLL feedback loop to modify the oscillator directly, reacting to a supply droop event with relatively low latency. The feedback divider is then slowly modified to restore PLL lock. This latter work is further discussed in Section \ref{sec:details_pll}.

This brief compares the effectiveness of \textit{clock period actuators} in ACD and PLL-based schemes. While high-resolution power-supply sensors and their low-latency integration are critical to the overall adaptation time of such schemes, they remain outside the scope of these comparisons.

\vspace{-10pt}
\section{Operating principles}
\label{sec:details}
The two main works for comparison are the PLL- and ACD-based adaptive clocking schemes in \cite{hashimoto2018} and \cite{wilcox2015} respectively. The operating principles of each are described in this section.

\vspace{-10pt}
\subsection{PLL-based adaptive clocking}
\label{sec:details_pll}
The scheme reported in \cite{hashimoto2018} presents a PLL-based adaptive clocking control for three SPARC processor cores in a 20nm CMOS testchip. A sense point placed near one of the SPARC cores transmits the core $V_{DD}$ through low-impedance on-package (off-chip) routing to the on-chip power-supply sensor. This sensor converts $V_{DD}$ droops into a thermometer-coded quantized digital signal, $q[n]$, by use of two calibrated delay lines and an 8-bit time-to-digital converter (TDC). 

A central contribution of \cite{hashimoto2018} is the use of the instantaneous value of $q[n]$ to immediately reduce the PLL's oscillator frequency to correct for high-frequency droops, while simultaneously utilizing a filtered version of $q[n]$ to correct for low frequency droops and ultimately re-lock the PLL feedback loop. As shown in Fig.\ \ref{fig:detail_pll}, $q[n]$ is processed by the frequency control logic to generate two control signals: $\Delta F_{code}$ that can instantaneously respond to changes in $q[n]$ to directly change the digitally-controlled oscillator (DCO) frequency, and $N^{sync}_{code}$ that relies only on a filtered value of $q[n]$ to track low-frequency droops and re-lock the PLL loop at the new frequency by changing the feedback division value. The minimum function and the non-linear filter together effectively construct an all-pass filter when $V_{DD}$ decreases and a low-pass filter when $V_{DD}$ increases. The combination minimizes latency during a $V_{DD}$ droop event. 

\begin{figure}[h]
	\centering
	\includegraphics[width=\columnwidth]{fig_detail_pll}
	\caption{Frequency control logic for PLL-based adaptive clocking in \cite{hashimoto2018}.}
	\label{fig:detail_pll}
\end{figure}

A fast response to high-frequency supply droops is an important criterion in \cite{hashimoto2018}. The reported design target is 8$\times$ faster than the first-droop frequency of 50MHz. This permits a time window of 2.5ns from the time $V_{DD}$ crosses the first supply sensor threshold to the time the SoC clock frequency is corrected. This narrow timing constraint is met by generating and propagating the critical oscillator control signal $\Delta F_{code}$ asynchronously. Crucially, the 8-bit thermometer-coded $\Delta F_{code}$ controls a DCO that can tolerate asynchronous changes in its code without glitching its output clock. The DCO reported in \cite{hashimoto2018} is a bank of nine ring inverters with eight of the rings independently enabled by each thermometer bit of $\Delta F_{code}$. Such a structure can tolerate asynchronous arrival of $\Delta F_{code}$ when suddenly decreasing frequency, as in the case of a droop. Note that frequency increases are restricted: the combination of non-linear filter memory with the masking of minimum function ensures $\Delta F_{code}$ only increases one bit at a time after droop recovery \cite{hashimoto2018}. This obviates synchronization of $\Delta F_{code}$, reducing adaptation latency.

The reduction in adaptation latency directly correlates with reported experimental results. Fast response to droops allows for tighter guardbanding and therefore higher $f_{MAX}$. While \cite{hashimoto2018} does not report the exact clock adaptation latency, the work improves $f_{MAX}$ by 7.5\%, from 4.65GHz without adaptive clocking to 5.00GHz with the proposed scheme. This is the largest frequency gain reported in compared works.

\vspace{-10pt}
\subsection{ACD-based adaptive clocking}
\label{sec:details_acd}
The scheme reported in \cite{wilcox2015} presents an ACD-based adaptive clocking control for AMD's Steamroller CPU in 28nm CMOS. The power supply sensor consists of a DLL-based droop detector acting as a delay line connected to the SoC core $V_{DD}$: as $V_{DD}$ changes, the DLL phases change with respect to a regulated PLL's output clock. A programmable threshold selects one of four DLL phases to compare against the reference PLL phase to signal a 1-bit droop activity.

Fig.\ \ref{fig:detail_acd} illustrates the reported scheme. A DLL-based phase rotator realizes the ACD circuit. When a droop activity is detected and $DroopDetected$ is asserted, phase rotation between the 40 DLL phases occurs to effectively ``stretch'' the clock period and reduce the clock frequency. To ensure glitchless phase rotation, small positive pre-programmed phase increments occur at each step, and the phase rotator rotates through all 40 phases before continuing the cycle. Rotation can be bypassed entirely by de-asserting $DroopEnable$. 

\begin{figure}[h]
	\centering
	\includegraphics[width=0.6\columnwidth]{fig_detail_acd}
	\caption{Frequency control logic for ACD-based adaptive clocking in \cite{wilcox2015}.}
	\label{fig:detail_acd}
\end{figure}

This scheme is reported to target fast initial droops of 100MHz and slower 1-5MHz secondary droop transients by means of the programmed droop detector threshold \cite{wilcox2015}. As the detector signal is binary, the ACD-based scheme reduces the SoC clock frequency by a fixed amount (phase rotator steps through a fixed phase step per cycle) or does not change the frequency at all. Voltage recovery is triggered when $V_{DD}$ crosses back beyond the detector threshold. The work reports a total adaptation latency of 3 cycles at 3.4GHz from droop detection to clock frequency reduction. While experimental results do not report frequency gain as in \cite{hashimoto2018}, silicon results demonstrated a reduction of 3-6\% in $V_{MIN}$, the minimum supply voltage tolerable at a given frequency for the SoC. No comparison was provided with prior works.

\section{Comparison of ACD and PLL Actuators}
\label{sec:comparison}

\subsection{Comparison Scope}
\label{sec:Scope of comparison}

Adaptive-clocking latency includes several sub-components illustrated in Fig.\ \ref{fig:scope}. Design of a complete adaptation system requires a $V_{DD}$ sense point, routing delay, detector and actuation latencies, and clock propagation delay. This comparison focuses on the detection logic, clock actuation, and their combined actuation latency. While the performance of other adaptive-system components (e.g. sensor resolution and bandwidth, capacity for digital integration) is crucial to overall system performance, their contributions are largely orthogonal to the period-actuator design. 

\begin{figure}[h]
	\centering
	\includegraphics[width=0.8\columnwidth]{fig_scope}
	\caption{Various latencies in adaptive clock system (adapted from \cite{hashimoto2018}).}
	\label{fig:scope}
\end{figure}

In comparing the PLL- and ACD-based adaptive clocking schemes, we identify five key metrics: actuation latency, power, area, calibration requirements, and frequency gain/$V_{MIN}$ reduction. We then project these metrics into the predictive 7nm technology ASAP7 \cite{asap7}.

\vspace{-10pt}
\subsection{Actuation Latency}
\label{sec:ActuationLatency}
Minimizing actuation latency helps reduce overall system response time. In \cite{hashimoto2018}, the PLL-based actuation latency varies depending on the direction of $V_{DD}$ change due to the non-linear adaptation filter. As previously discussed, for decreases in $V_{DD}$, the latency is less than one clock cycle and limited by asynchronous propagation delay through the minimum function in Fig.\ \ref{fig:detail_pll} and the control bandwidth of the oscillator. The control port time constant is approximately $\frac{T_{VCO}}{2N}$ for the reported oscillator \cite{hashimoto2018}. Assuming the routing parasitic from the minimum logic to the oscillator control port is negligible, this allows a latency below one clock cycle at the reported $f_{MAX}$ = 5GHz, i.e. $t_{act} <$ 200ps.

Conversely, the ACD-based actuation latency is reported as 2 clock cycles at 3.4 GHz \cite{wilcox2015}, implying $t_{act} =$ 588ps. This is due to synchronizing $DroopDetected$ to the PLL clock domain  and generating the corresponding phase rotation enable signal as was shown in Fig.\ \ref{fig:detail_acd}.

We predict these results will be comparable in ASAP7. The actuation latency in \cite{wilcox2015} will remain 2 clock cycles by design. That in \cite{hashimoto2018} will remain less than one cycle again by design, with the potential for $t_{act}$ to be reduced due to increased $f_{T}$. One drawback may be higher resistance routing which could decrease oscillator control bandwidth if not carefully designed.

%\vspace{-10pt}
%\subsection{Power/Area}
%\label{sec:power_area}
%While neither works report measured power and area overhead of the actuation or overall adaptation circuits, the use of a DLL in \cite{wilcox2015} is estimated to be of materially higher power and area. Consider phase mismatch and jitter amplification in the DLL \cite{wilcox2015}. The 40-phase DLL consists of 20 pseudo-differential delay cells, each designed as a pair of cross-coupled inverters with tunable MOS capacitor (MOSCAP) loads. In theory, the $k^{th}$ and $(k+20)^{th}$ phases have a standard deviation of $\sqrt{k} \sigma_{u}$ where $\sigma_{u}$ is one stage's delay variation. The DLL must, at a minimum, ensure phase monotonicity to guarantee glitchless phase rotations. $\sigma_{u}$ is directly correlates to $V_{t}$ and thereby area.

\vspace{-10pt}
\subsection{Clock Tree Insertion Delay}
\label{sec:clktree}

Modern SoCs incur several clock cycles of delay in clock distribution. When a supply droop occurs, it affects the delays of both data-path logic and clock buffers. PLL and ACD-based actuators feature different capacities for managing the latter clock-distribution delay. Adaptive PLLs correct for droops at the clock source; delay and supply dependence in clock distribution is typically beyond scope. ACD circuits, in contrast, can be distributed deeper into the clock distribution network. Compact implementations such as \cite{kwak2016self} can be inserted many times per SoC clock domain, each cleaning up a small sub-block's supply noise. Such distributed sensors may also benefit from the phenomenon of \textit{clock-data compensation} (CDC), originally reported in \cite{wong2006enhancing}, potentially relaxing their required actuation range. However, to enable compact area, such distributed actuators generally must rely on the PLL clock frequency as a time-domain reference-signal \cite{kwak2016self}, thereby injecting PLL-clock distribution noise directly into their supply-sensor measurements. 


\vspace{-10pt}
\subsection{Silicon Performance}
\label{sec:performance}
The key performance metric reported in both adaptation schemes is reducing voltage/frequency margin. As \cite{hashimoto2018} claims, this is directly proportional to adaptation latency: the shorter the adaptation response time, the closer the SoC can operate to safety limits. Both systems achieve similar results. In terms of $V_{MIN}$, the PLL-based scheme achieves 5\% reduction at 4.65GHz \cite{hashimoto2018} while the ACD-based scheme reports up to 6\% reduction \cite{wilcox2015} at 4GHz. The PLL-based scheme also reports a 7.5\% gain in $f_{MAX}$ to 5GHz if $V_{DD}$ is unchanged \cite{hashimoto2018}. These results require integration of the adaptation system with the complete SoC, which was not feasible in ASAP7 for this project scope. Adaptation latency is used as a proxy.

\section{Design of PLL-based Adaptive Clocking in ASAP7}
\label{sec:pll-design}

The PLL presented in \cite{hashimoto2018} was selected as a candidate for further feasibility study in FinFET technologies due to the reduced actuation latency and the lack of immediate power, area, and calibration overhead for similar reduction in $V_{MIN}$ reported in planar technologies \cite{hashimoto2018,wilcox2015}. The following sections introduce a nearly fully-synthesizable, all-digital PLL design implemented in ASAP7 predictive 7nm technology \cite{asap7}, at the system-, logic-, and transistor-level implementations.

\vspace{-5pt}
\subsection{System-level Design}
\label{sec:system-level_design}

The top-level PLL architecture is shown in Fig.\ \ref{fig:pll_model}. A bang-bang phase detector in the PLL digitizes the phase error between $CLK_{REF}$ and $CLK_{FB}$. A proportional-integral loop filter then tunes the DCO through a 9-bit 1st-order $\Delta\Sigma$ converter, which reduces the DCO quantization noise within the bandwidth of the PLL. A 5-bit 2nd-order $\Delta\Sigma$ single-modulus divider then divides the DCO output clock, $CLK_{PLL}$, to generate $CLK_{FB}$ and close the loop. A frequency feedback loop tunes the DCO direct control port $\Delta F_{CODE}$ and the feedback division ratio $N_{DIV}^{sync}$. 

The PLL operates at a maximum clock frequency $f_{MAX}$ of 5GHz, identical to that of \cite{hashimoto2018}. It operates with a nominal $CLK_{REF}$ frequency of 500MHz and a nominal division ratio $N_{DIV}^{sync}$ of 10. A multi-bank DCO supports two control ports with gains of 250MHz and 60MHz per LSB, respectively. Combined with an inverse bilinear transform of the loop filter with $K_{P}=1$ and $K_{I}=2^{-9}$ clocked at $f_{REF}$ of 500MHz, this yields a phase margin of $79^{\circ}$ and a bandwidth of 810kHz. A moving average of length 32 in the frequency rebound controller, similar to \cite{hashimoto2018}, sets its bandwidth to roughly $10\times$ that of the PLL.

\vspace{-10pt}
\begin{figure}[h]
	\centering
	\includegraphics[width=0.8\columnwidth]{fig_pll_model}
	\caption{Top-level implementation of PLL-based scheme from \cite{hashimoto2018}.}
	\label{fig:pll_model}
\end{figure}

The response to a droop event simulated in MATLAB is shown in Fig.\ \ref{fig:drooptransient}. Critical latencies from transistor-level simulations (Section \ref{sec:circuit_design}) are back-annotated into its system model. A short, maximum droop impulse is injected by $q[n]$. The frequency rebound controller then tunes $\Delta F_{CODE}$ that, after an actuation latency, directly changes the DCO frequency. The controller also slowly tunes the divider ratio to re-lock the PLL at the new DCO frequency and ramp the DCO back up to its nominal value. The PLL thereby recovers from the droop impulse after 540ns.

\vspace{-5pt}
\begin{figure}[h]
	\centering
	\includegraphics[width=0.8\columnwidth]{fig_drooptransient}
	\caption{PLL and frequency control transient response to maximum droop.}
	\label{fig:drooptransient}
\end{figure}

\vspace{-3pt}
\subsection{Logic Design}
\label{sec:logic_design}

Save for the DCO and phase detector, the PLL was designed entirely in IEEE1800-standard SystemVerilog. Broad parameterization enables a generator-like flexibility of phase-detector (multi-bit TDC, single-bit bang-bang), DCO, and loop filter parameters. 

The PLL's locking process is necessarily co-designed with its adaptive droop response. Lock is achieved through a three-step process: coarse frequency calibration, fine frequency locking, and finally phase. During droop response, the PLL remains in its fine-frequency locking mode. Coarse frequency control is frozen once initially calibrated. Droop management operates on the higher-resolution fine-frequency control. A sample logic simulation in Fig.\ \ref{fig:brake} shows the PLL lock to a target $f_{DCO}$ of 4.0GHz, respond to a droop event, temporarily reducing frequency to 3.5GHz, and finally recover. 

\begin{figure}[h]
	\centering
	\includegraphics[width=0.8\columnwidth]{brake1.png}
	\caption{Logic simulation of initial lock, droop response, and droop recovery from 4.0GHz to 3.5GHz and back to 4.0GHz. Main waveform shows DCO frequency vs. time.}
	\label{fig:brake}
\end{figure}

To minimize actuation latency, all logic from $DroopDetected$ to $\Delta F_{code}$ is asynchronous and thermometer-encoded, similar to that described in \cite{hashimoto2018}. This reduces the propagation delay to a single AND logic gate plus associated wiring delays, enabling a sub-cycle response time. Section \ref{sec:circuit_design} details the actuation latency and its constituent components. 

\vspace{-7pt}
\subsection{Circuit Design and Implementation}
\label{sec:circuit_design}

The PLL layout, generated by standard EDA tools and the HAMMER \cite{wanghammer} back-end toolchain, is shown in Fig.\ \ref{fig:layout}. Its area is 10,000 $\mu m^2$ (100$\mu m$$\times$100$\mu m$). This figure excludes the custom-designed DCO, which was estimated to comprise less than 10\% of the overall PLL area.

% DF - maybe it was our estimate, but the VCO only being 1% of a PLL area sounds crazy. Plus 10% fits in the outline anyway. 

\begin{figure}[h]
	\centering
	\includegraphics[width=0.5\columnwidth]{pnr3.png}
	\caption{PLL layout in ASAP7 predictive technology (100$\mu m\times100\mu m$). DCO is omitted.}
	\label{fig:layout}
\end{figure}

The PLL's custom-designed oscillator consists of an array of C$^2$MOS gated-inverter cells, as shown in Fig.\ \ref{fig:c2mos}. The top-level DCO schematic, shown in Fig.\ \ref{fig:dco}, is verified with cell-level RC extractions using Calibre. An estimated 60fF of wire capacitance is added to each node for performance characterization. 

\begin{figure}[h]
	\centering
	\includegraphics[width=0.9\columnwidth]{fig_dco}
	\caption{DCO schematic using custom C$^2$MOS cells.}
	\label{fig:dco}
\end{figure}

\begin{figure}[h]
	\centering
	\begin{minipage}{0.49\columnwidth}
	\includegraphics[width=\textwidth]{fig_c2mossch}
	%\caption{Custom C2MOS schematic.}
	\end{minipage}
	\hfill
	\begin{minipage}{0.49\columnwidth}
	\includegraphics[width=\textwidth]{fig_c2moslayout}
	%\caption{Custom C2MOS layout.}
	\end{minipage}
	\caption{Custom C$^2$MOS schematic and layout for DCO implementation.}
	\label{fig:c2mos}
\end{figure}

The DCO achieves an octave (2:1) frequency tuning range with sufficiently high resolution for desirable loop dynamics. Decoupling of range and resolution is achieved via segmented coarse and fine DACs. The fine DAC is designed to span approximately four coarse LSBs. After initial frequency calibration freezes its coarse control, the fine DAC must include sufficient range to support (a) droop-induced frequency shifts, and (b) its own drift and noise. The fine-control DAC tuning curve is pictured in Fig.\ \ref{fig:dco_oscctrl}.

\begin{figure}[h]
	\centering
	\includegraphics[width=0.75\columnwidth]{fig_dco_oscctrl}
	\caption{DCO frequency vs. coarse + fine DAC code at TT corner.}
	\label{fig:dco_oscctrl}
\end{figure}

The smaller droop control DAC was designed to tolerate a 10\% change in frequency at an $f_{MAX}$ of 5GHz. At high $V_{DD}$, this corresponds to roughly a 10\% droop in supply, corresponding to the maximum droop transient shown previously in Fig.\ \ref{fig:droop}. Eight thermometer bits were used for the total droop range, mapping to 60MHz/LSB. The response of this droop control is shown in Fig.\ \ref{fig:dco_droopctrl}.

\begin{figure}[h]
	\centering
	\includegraphics[width=0.75\columnwidth]{fig_dco_droopctrl}
	\caption{DCO frequency vs. droop code at TT corner (coarse + fine centred at 5GHz).}
	\label{fig:dco_droopctrl}
\end{figure}

The automatically placed and routed actuation logic from $DroopDetected$ to $\Delta F_{code}$ achieves a maximum delay of 40ps, or roughly one fifth of a 5GHz clock cycle. Results of timing-annotated simulation at its worst-case conditions (SS, 0.63V, $100^{\circ}$C) are pictured in Fig.\ \ref{fig:freqreboundctrl_postpandr_timing}. 

\begin{figure}[!htb]
	\centering
	\includegraphics[width=\columnwidth]{fig_freqreboundctrl_postpandr_timing}
	\caption{Post-place-and-route timing-annotated simulation of input to output delay in frequency rebound logic ($q[n]$ to $\Delta F_{CODE}$ at SS corner).}
	\label{fig:freqreboundctrl_postpandr_timing}
\end{figure}

The C$^2$MOS DCO latency through its droop control DAC was measured using a full-scale droop code change. Settling time was defined as the change from the original to new target frequency to fairly compare against the work presented in \cite{wilcox2015}. The simulated latency, shown in Fig.\ \ref{fig:dco_response}, was measured as 171ps.
Note that while this definition of DCO latency aligns for comparison with phase-rotator latency used in \cite{wilcox2015}, the use of a DCO can permit the adaptive clock system to adjust to a safe clock period before the DCO frequency fully settles. Fig.\ \ref{fig:subtledelay} illustrates this. The SoC critical path is $T_{CRIT,1}$ at high $V_{DD1}$ and a slower $T_{CRIT,2}$ at lower $V_{DD2}$. The DCO periods that satisfy timing are $T_{DCO,1}$ and a slower $T_{DCO,2}$, respectively. If a droop event occurs in the middle of a cycle (when $D_{1}$ is propagating through logic), the critical timing for that cycle is an intermediate $T_{CRIT,INT}$ between $T_{CRIT,1}$ and $T_{CRIT,2}$. Note that the adaptive clocking sensor adjusts the DCO at the droop event with the goal of ensuring $T_{DCO} > T_{CRIT}$ for each cycle. Crucially, although the transient $T_{DCO,INT}$ is not equal to the final desired DCO period ($T_{DCO,2}$), it may meet timing if $T_{DCO,INT} > T_{CRIT,INT}$. That is, if the DCO period is elongated beyond the critical logic delay in the intermediate cycle, the DCO does not necessarily have to settle fully to its new period: the use of a DCO allows a shorter effective latency than defined in \cite{wilcox2015}. However, for comparison purposes, the definition from \cite{wilcox2015} is used.

%The C$^2$MOS DCO implementation reacts essentially instantaneously to changes in $\Delta F_{code}$. This is illustrated in Fig.\ \ref{fig:dco_response}, which plots each of the five DCO stage's normalized rising- and falling-edge delays as $\Delta F_{code}$ is stepped from its minimum to maximum value. Each delay element reaches its final value within two gate delays. All but one, the initial rising edge of phase zero, reaches its target delay on its first transition following $\Delta F_{code}$'s arrival. 
%
%This high control-bandwidth is a salient feature of the C$^2$MOS implementation. In contrast to oscillators with reactive storage elements (e.g. LC tanks) or additional layers of control circuitry (e.g. discrete DACs), the C$^2$MOS oscillator's frequency-control signals are fed directly into each delay element. The attendant high control-to-frequency bandwidth is ideal for rapid frequency changes as desired by adaptive PLLs. 
%
%While changes to $\Delta F_{code}$ near-instantaneously effect the oscillator's instantaneous frequency (rate of phase accumulation), these changes generally require the completion of a clock-cycle to propagate to the PLL's output period. Clock-periods which are only partially effected by a droop event (i.e. droop occurs mid-cycle) are therefore only partially elongated. However a convenient phenomena similar to the clock-data compensation originally reported in \cite{wong2006enhancing} renders the time of change in \textit{instantaneous} frequency the relevant variable. 
%
%As an illustrative example, consider a zero-delay adaptive-clocking system set for a 200ps clock cycle, and programmed to extend its clock period by 10\% to 220ps during a droop event. If said system detects a step-function droop event half-way (100ps) into a clock period, it is causally limited to extend only the second half of the ongoing cycle, resulting in a single period of roughly 205ps. However any datapath logic on this system's critical path is similarly slowed for only the latter portion of this cycle. So long as the adaptive clock period is extended by an amount greater than its critical-path delay, its timing requirements are still met. 

\vspace{-10pt}
\begin{figure}[!htb]
	\centering
	\includegraphics[width=0.7\columnwidth]{fig_dco_response.png}
	\caption{DCO frequency vs. time with full-scale DCO droop code change.}
	\label{fig:dco_response}
\end{figure}
\FloatBarrier

\vspace{-10pt}
\begin{figure}[h]
	\centering
	\includegraphics[width=0.7\columnwidth]{fig_subtledelay.png}
	\caption{Intermediate changes in DCO frequency can still meet timing during a supply droop.}
	\label{fig:subtledelay}
\end{figure}

\section{Discussion}
\label{sec:discussion}
\subsection{Actuation Latency Penalty to $f_{MAX}$}
\label{sec:actuation_latency}

As previously discussed, actuation latency is the time delay from a supply droop to when the clock frequency settles to its target value \cite{wilcox2015}. This serves as a proxy to evaluate how $f_{MAX}$ is improved by an adaptive clocking technique for an SoC. More precisely, consider the droop example illustrated in Fig.\ \ref{fig:drooplatency}. A supply droop occurs where $V_{DD}$ suddenly decreases from its ideal 1.0V, and the adaptive clocking technique adjusts SoC clock frequency $f_{CLK}$ after actuation latency $T_{ACT}$. To mitigate timing failure during this latency, the SoC $f_{MAX}$ can only operate assuming $V_{DD,ACT}$, the supply voltage at the time $f_{CLK}$ changes. Actuation latency limits SoC performance.

\begin{figure}[!ht]
	\centering
	\includegraphics[width=0.6\columnwidth]{fig_drooplatency}
	\caption{Effect of droop actuation latency on $f_{MAX}$.}
	\label{fig:drooplatency}
\end{figure}

If the supply voltage initially follows $V_{DD}(t) = V_{DD} - \Delta V_{d}\sin(2\pi f_{d}t)$ from the linear $RLC$ supply network, where $V_{DD}$ is the nominal supply voltage and $\Delta V_{d}$ and $f_{d}$ are the voltage drop and frequency of the worst-case droop, then the slope of $V_{DD}(t)$ near the start of the droop is $-2\pi \Delta V_{d}f_{d}$. Using $T_{ACT}$, it can be shown that $V_{DD,ACT} = V_{DD}-2\pi \Delta V_{d}f_{d}T_{ACT}$. Assuming high $V_{DD}$ and a linear transistor model, the achievable $f_{MAX}$ is:

\begin{equation}
\label{eq:fmax_penalty}
f_{MAX} = \left(1 - \frac{2\pi\Delta V_{d}f_{d}}{V_{DD}}T_{ACT}\right)f_{MAX,ideal}
\end{equation} 
where $f_{MAX,ideal}$ is the maximum SoC clock frequency when $V_{DD}$ is at its maximum voltage. It is apparent from Eq.\ (\ref{eq:fmax_penalty}) that, to minimize impact on performance, either the actuation latency, droop voltage, or droop frequency must be reduced or the nominal supply voltage must be increased. In ASAP7 for example, the simulated logic latencies total $T_{ACT}$ = 40ps+171ps = 211ps, and $V_{DD}$ = 0.7V nominally. To target a 100mV first droop with 50MHz ripple as in \cite{hashimoto2018}, a 0.95\% $f_{MAX}$ penalty is incurred from $f_{MAX,ideal}$. Note that clock tree insertion delay can significantly change this: simulated delays in ASAP7 for a chain of 45 FO4 inverters incur a delay of 951ps, increasing $T_{ACT}$ to 1162ps and thus $f_{MAX}$ penalty to 5.2\%.

\vspace{-5pt}
\subsection{Comparison to Prior Works}
\label{sec:comp-priors}
Table \ref{table:comparison} shows a comparison of this feasibility study with prior works discussed. A key point in the analysis from Eq.\ (\ref{eq:fmax_penalty}) above is that prior works do not report the theoretical $f_{MAX,ideal}$, but only the comparisons between $f_{MAX}$ achieved with and without their reported adaptive clocking schemes. This includes clock tree insertion delay, which may dominate $T_{ACT}$ and thus the $f_{MAX}$ penalty.

\begin{table}[!ht]
\caption{Comparison to Prior Works} 
\label{table:comparison}
\centering
\begin{tabularx}{\columnwidth}{
	| >{\centering\arraybackslash}X 
	| >{\centering\arraybackslash}X 
	| >{\centering\arraybackslash}X 
	| >{\centering\arraybackslash}X | }
	\hline
	& JSSC 2015 \cite{wilcox2015} & JSSC 2018 \cite{hashimoto2018} & This work \\
	\hline
	Process & 28nm CMOS & 20nm CMOS & 7nm Predictive CMOS  \\
	\hline
	Detection Method & Droop sensor & Droop sensor & Droop sensor (assumed)  \\
	\hline
	Adaptation Method & DLL \& Phase Rotator & Adaptive PLL & Adaptive PLL  \\
	\hline
	Actuator Latency & 588ps & 200ps & 40ps  \\
	\hline
	Clock Freq.\ Latency & 294ps (phase rotator) & -- & 171ps (DCO, full settling) \\
	\hline
	Power & -- & -- & 2.25mW \\
	\hline
	Area & -- & -- & 10,000 $\mu m^2$ \\
	\hline
\end{tabularx}
\end{table}

% DF - DCO will fit inside the PnR area 

\vspace{-5pt}
\section{Conclusion}
\label{sec:conclusion}

Modern SoC architectures will continue to require innovation in mitigating supply noise and its effect on timing closure. We compare two popular schemes, ACD-based and PLL-based adaptive clocking. We find both systems well-suited to low-latency actuation and wide frequency-shift range. ACD-based systems exchange design complexity for higher power and area. We then introduce an adaptive all-digital PLL-based system similar to that of \cite{hashimoto2018}, implemented in predictive 7nm FinFET CMOS, and featuring a sub-cycle actuator with $5\times$-reduced latency. 

\vspace{-5pt}
\bibliographystyle{IEEEtran}
\begingroup
\raggedright
\bibliography{references}
\endgroup

\end{document}