%%% ToC #355, (v004/a003)
\documentclass{toc}
%\usepackage{pdfsync}

\tocdetails{%
   volume=4, number=3, year=2008, firstpage=53,
   received={November 28, 2007}, 
   published={May 15, 2008}
}

%\usepackage{psfrag,anysize, graphics, stmaryrd}
%\usepackage{bbm}
%\input{macros}
\providecommand{\equivaut}{\Leftrightarrow}
\providecommand{\drawn}[1]{\stackrel{#1}{\leftarrow}}
\providecommand{\drawnr}{\drawn{R}}
\providecommand{\eps}{\varepsilon}
\providecommand{\zo}{\{0, 1\}}
\providecommand{\supp}{\mathrm{supp}}
\providecommand{\P}{\mathbf{P}}
\providecommand{\NP}{\mathbf{NP}}
\DeclareMathOperator{\Exp}{\mathbb{E}}
\providecommand{\Tr}{\mathrm{Tr}}
\providecommand{\dans}{\rightarrow}
\providecommand{\poly}{\mathrm{poly}}
\providecommand{\angles}[1]{\langle #1 \rangle}
\providecommand{\appendixref}[1]{\hyperref[#1]{Appendix~\ref{#1}}}
\providecommand{\inequalityref}[1]{\hyperref[#1]{Inequality~\eqref{#1}}}
%% LB  in this and the next macro,  \ref --> \eqref
\providecommand{\equationref}[1]{\hyperref[#1]{Equation~\eqref{#1}}}
\providecommand{\tableref}[1]{\hyperref[#1]{Table~\ref{#1}}}
\providecommand{\definitionref}[1]{\hyperref[#1]{Definition~\ref{#1}}}
\providecommand{\calM}{\mathcal{M}}


\runningauthor{A.~Wigderson and D.~Xiao}
\copyrightauthor{Avi Wigderson and David Xiao}

\runningtitle{Derandomizing the Ahlswede-Winter matrix-valued Chernoff
  bound}% using pessimistic estimators, and applications}


\newcommand{\OPT}{\mathit{OPT}}

\begin{document}

\begin{frontmatter}[classification=float]

\tocpdftitle{Derandomizing the Ahlswede-Winter matrix-valued Chernoff bound using pessimistic estimators, and applications}
\tocpdfauthor{Avi Wigderson, David Xiao}
\title{Derandomizing the\\ Ahlswede-Winter matrix-valued Chernoff
  bound using pessimistic estimators,\\ and applications}
\author[avi]{Avi Wigderson\thanks{Partially supported by NSF grant
    CCR-0324906}}
\author[david]{David Xiao\thanks{Supported by an NDSEG Graduate Fellowship
    and a NSF Graduate Fellowship}}

\begin{abstract}
Ahlswede and Winter [IEEE Trans. Inf. Th. 2002] introduced a Chernoff
bound for matrix-valued random variables, which is a non-trivial
generalization of the usual Chernoff bound for real-valued random
variables.  We present an efficient derandomization of their bound
using the method of pessimistic estimators (see Raghavan [JCSS 1988]).
As a consequence, we derandomize an efficient construction by Alon
and Roichman [RSA 1994] of an expanding Cayley graph of
logarithmic degree on any (possibly non-abelian) group.  This gives an
optimal solution to the homomorphism testing problem of Shpilka and
Wigderson [STOC 2004].  We also apply these pessimistic estimators to
the problem of solving semidefinite  
covering problems, thus giving a
deterministic algorithm for the quantum hypergraph cover problem of
Ahslwede and Winter.

The results above appear as theorems in our paper ``A
randomness-efficient sampler for matrix-valued functions and
applications'' [FOCS 2005, ECCC 2005], as consequences of the main
claim of that paper: a randomness efficient sampler for matrix-valued
functions via expander walks.  However, we discovered an error in the
proof of that main theorem (which we briefly describe in the
appendix).  That claim stating that the expander walk sampler is good
for matrix-valued functions thus remains open.  One purpose of the
current paper is to show that the applications in that paper hold
despite our inability to prove the expander walk sampler theorem for
matrix-valued functions.
\end{abstract}

\tockeywords{Chernoff bounds, matrix-valued random variables, 
derandomization, pessimistic estimators}
  \tocacm{G.3, G.2.2, F.2.1, F.1.2}
  \tocams{68W20, 68R10, 60F10, 20D60, 81P68, 15A18}

\end{frontmatter}

\section{Introduction}
Chernoff bounds are extremely useful throughout theoretical computer
science.  Intuitively, they say that a random sample approximates the
average, with a probability of deviation that goes down exponentially
with the number of samples.  Typically we are concerned with
real-valued random variables, but recently several applications have
called for large-deviation bounds for matrix-valued random variables.
Such a bound was given by Ahlswede and Winter~\cite{AW} (see
\thmref{thm:aw} and \thmref{thm:aw2} for a precise statement of their
bounds).

In particular, the matrix-valued bound seems useful in giving new
proofs of probabilistic constructions of expander graphs~\cite{AR} and
also in the randomized rounding of semidefinite covering problems,
with further applications in quantum information theory~\cite{AW}.

In this paper we use the method of pessimistic estimators, originally
formulated in~\cite{Raghavan88},\footnote{The simpler method of
  conditional probabilities was described earlier in the first edition
  of~\cite{SpencerLectures}.  Ideas similar to those of~\cite{Raghavan88} also appeared in~\cite{BeckSpencer}.} to
derandomize the Chernoff bound of~\cite{AW}, and in the process
derandomize the Alon-Roichman theorem and the randomized rounding of
covering SDP's.

The results of this paper prove the claimed applications of our
previous paper~\cite{WX05}, and in fact supersede them in simplicity
and efficiency.  However, we discovered a fatal mistake in the
analysis of using an expander sampler in~\cite{WX05}, and it remains
open whether the expander sampler achieves the deviation bound claimed
there (or something asymptotically equivalent).  For details on the
problem with the previous work, see \appendixref{app:problem}.

Arora and Kale~\cite{ArKa07} independently reached results similar to
the ones presented in this paper that imply the applications to
constructing expanding Cayley graphs and semidefinite covering
programs.

The paper is organized as follows.  In \secref{sec:aw} we define
the linear algebra notation we use and prove the Chernoff bounds of
Ahlswede-Winter, given in \thmref{thm:aw} and
\thmref{thm:aw2}.  In \secref{sec:condprob} we review the
method of pessimistic estimators and how it is used to derandomize
algorithms.  In \secref{sec:awestimators} we construct pessimistic
estimators for the Ahlswede-Winter Chernoff bounds.  Finally we apply
these estimators to derandomize the construction of Cayley expanders
in \secref{sec:ar} and to derandomize the rounding of integer
covering SDP's in \secref{sec:sdp}.

\section{Matrix-valued random variables and the Chernoff
  bound of Alhswede and Winter}
\label{sec:aw}

 We will work with the set $\calM_d$ of real symmetric $d \times d$ 
 matrices.\footnote{All our results extend to complex Hermitian
  matrices, or abstractly to self-adjoint operators over any Hilbert
  space where the operations of addition, multiplication, trace,
  exponential, and norm are efficiently computable.}
We let $I_d$
denote the identity matrix in $\calM_d$, and will write simply $I$
when the dimension is clear.  For any $A \in \calM_d$ we let
$\lambda_1(A) \geq \ldots \geq \lambda_d(A)$ denote the eigenvalues of
$A$ in non-increasing order.  Recall that every matrix $A \in \calM_d$
has an orthonormal eigenbasis.                 

We will measure distance between matrices using the 
operator norm
$$
\|A\| = \max_v \frac{\|Av\|}{\|v\|} = \max_i |\lambda_i(A)|\,.
$$
We will also
frequently use the trace, $\Tr(A) = \sum_{i=1}^d \lambda_i(A)$.  It is
well-known that for any orthonormal basis $v_1, \ldots, v_d \in \R^d$
we have that $\Tr(A) = \sum_{i=1}^d \langle v_i, A v_i\rangle$, where
$\langle \cdot, \cdot \rangle$ denotes the usual inner product over
$\R^d$.

We say that a matrix $A \in \calM_d$ is positive semidefinite
(p.s.d.) if all its eigenvalues are non-negative.  We will use the
fact that $A$ is p.s.d.\ iff for all $v \in \R^d$, $\langle v, A v
\rangle \geq 0$.  We let $A \geq 0$ denote that $A$ is p.s.d.  We use
the ordering of symmetric matrices given by this definition, namely $A
\leq B$ iff $B - A \geq 0$.  For two matrices $A \leq B$, we will let
$[A, B]$ denote the set of all symmetric matrices $C$ such that $A
\leq C$ and $C \leq B$.

We will work with the matrix exponential, which is defined by
$$\exp(A) = \sum_{\ell=0}^\infty \frac{A^\ell}{\ell!}\,.$$
Recall that the matrix exponential is convergent for all matrices.
Furthermore, it is not hard to see for $A \in \calM_d$ that
an eigenbasis of $A$ is also an eigenbasis of $\exp(A)$  and that
$\lambda_i(\exp(A)) = e^{\lambda_i(A)}$ for all $1 \leq i \leq d$.
Also, for all $A \in \calM_d$, it holds that $\exp(A) \geq 0$.

We will consider matrix-valued random variables of the following form.
We let $f : [n] \dans [-I_d, I_d]$, where $[n] = \{1, \ldots, n\}$.
Let $X$ be a distribution (not necessarily uniform) over $[n]$, and
consider the variable $f(X)$.  This is a natural extension of bounded
discrete random variables over the reals, which may be thought of as
functions $f : [n] \dans [-1, 1]$.  We will let the expectation of
$f(X)$ be the obvious thing: $\Exp[f(X)] = \sum_{i=1}^n \Pr[X = i]
f(i)$.  Note that because $\Tr$ is linear, $\Exp$ and $\Tr$ commute:
$\Exp[\Tr(f(X))] = \Tr(\Exp[f(X)])$.  We let $\supp(X)$ denote the set
of all values of $X$ that occur with non-zero probability.  When we
say that something holds for a random variable $X$ \emph{always}, we
mean that it holds for every element in $\supp(X)$.

We will use the following useful facts several times:
\begin{fact}
\label{fact:tr}
If $A, B \in \calM_d$ and $B \geq 0$, then $\Tr(AB) \leq \|A\| \cdot
\Tr(B)$.
\end{fact}
\begin{proof}
Let $v_1, \ldots, v_d$ be the orthonormal 
eigenbasis of $A$, with    %%%%%%%%%%%%%%%%%
corresponding eigenvalues $\lambda_i = \lambda_i(A)$.  Then we may
write
$$
\Tr(AB)  = \sum_{i=1}^d \langle v_i, AB v_i \rangle = \sum_{i=1}^d \lambda_i \langle v_i, B v_i \rangle\,.
$$
Since $B \geq 0$ we know that $\langle v_i, B v_i \rangle
  \geq 0$, so we get
$$  
\Tr(AB) \leq \sum_{i=1}^d \max_j \lambda_j \langle v_i, B v_i \rangle  \leq \|A\| \cdot \Tr(B)\,.
$$
\end{proof}

\begin{theorem}[Golden-Thompson inequality,~\cite{Golden, Thompson}]
\label{thm:gt}
For $A, B \in \calM_d$, we have
$$\Tr(\exp(A+B)) \leq \Tr(\exp(A) \exp(B))\,.$$
\end{theorem}
The proof of this is outside the scope of this paper.

Ahlswede and Winter introduce a generalization of Markov's inequality
for matrix-valued random variables.
\begin{theorem}[Markov's inequality~\cite{AW}]
\label{thm:markov}
For any $\gamma > 0$, any function $g : [n] \dans \calM_d$ such that
$g(x) \geq 0$ for all $x \in [n]$, and for any random variable $X$
over $[n]$, we have
$$\Pr[g(X) \not\leq \gamma I] \leq \tfrac{1}{\gamma}\Tr(\Exp[g(X)])\,.$$
\end{theorem}
\begin{proof}
\begin{align*}
\Pr[g(X) \not\leq \gamma I] & = \Pr[\|g(X)\| > \gamma]  \leq \tfrac{1}{\gamma} \Exp[\|g(X)\|]\,. \\
\intertext{Since $g(X) \geq 0$ always, we have $\|g(X)\| \leq
  \Tr(g(X))$ always, so we get:}
& \leq \tfrac{1}{\gamma} \Exp[\Tr(g(X))]  = \tfrac{1}{\gamma} \Tr(\Exp[g(X)])\,.
\end{align*}
\end{proof}

The following \thmref{thm:main} is the main theorem proving~\cite{AW}'s Chernoff-type bound.  We will use \thmref{thm:main},
which holds for all distributions, to derive two corollaries
(\thmref{thm:aw} and \thmref{thm:aw2}), which hold for more
specific kinds of distributions.  In addition, the proof of
\thmref{thm:main} will give us the pessimistic estimators
corresponding to the two corollaries.
\begin{theorem}[\cite{AW}]
\label{thm:main}
Suppose $f : [n] \dans [ -I_d, I_d]$ and let $X_1, \ldots, X_k$ be
arbitrary independent random variables distributed over $[n]$.
Then for all $\gamma \in \R$:
$$
\Pr\Bigl[\tfrac{1}{k} \sum_{j=1}^k f(X_j) \not\leq \gamma I\Bigr] \leq d e^{-t
  \gamma k} \prod_{j=1}^k \bigl\|\Exp[\exp(t f(X_j))]\bigr\|\,.$$
\end{theorem}
\begin{proof}
The proof begins analogously to the real-valued case, generalizing the
classical Bernstein trick.  We first multiply by an optimization
constant $t > 0$ and exponentiate to obtain
$$\Pr\Bigl[\tfrac{1}{k} \sum_{j=1}^k f(X_j) \not\leq \gamma I\Bigr] = \Pr\Bigl[\exp\Bigl(t
  \sum_{j=1}^k f(X_j)\Bigr) \not\leq e^{t \gamma k} I\Bigr]\,.$$

The equality holds because for any $A \in \calM_d, \alpha \in \R$, the
statement $A \not\leq \alpha I$ is equivalent to saying some
eigenvalue of $A$ is larger than $\alpha$, which is the same as saying
that some eigenvalue of $\exp(A)$ is larger than $e^\alpha$, which in
turn is equivalent to $\exp(A) \not\leq e^\alpha I$.  Then the
following inequality is a direct consequence of \thmref{thm:markov}
since $\exp(A) \geq 0$ for all $A \in \calM_d$.
\begin{equation}
\label{eq:begin}
\Pr\Bigl[\tfrac{1}{k} \sum_{j=1}^k f(X_j) \not\leq \gamma I\Bigr] \leq e^{-t
  \gamma k} \Tr\left(\Exp\Bigl[\exp\Bigl(t \sum_{j=1}^k f(X_j)\Bigr)\Bigr]\right)\,.
\end{equation}

Then we apply \factref{fact:tr} and the Golden-Thompson Inequality
\thmref{thm:gt} to bound the expression in a manageable form.
This step will be expressed in the following lemma.

\begin{lemma}
\label{lemma:mid}
For any matrix $A \in \calM_d$, any $f : [n] \dans \calM_d$ and any
random variable $X$ over $[n]$, we have
$$
\Tr\bigl(\Exp_{X}[\exp(A + f(X))]\bigr) \leq \bigl\|\Exp[\exp(f(X))]\bigr\| \cdot
  \Tr(\exp(A))\,.
  $$
\end{lemma}

To obtain \thmref{thm:main}, we simply apply
\lemref{lemma:mid} to \inequalityref{eq:begin} repeatedly:
\begin{align}
\Pr\Bigl[\tfrac{1}{k} & \sum_{j=1}^k f(X_j) \not\leq \gamma I\Bigr] \leq e^{-t
  \gamma k} \Tr\left(\Exp\Bigl[\exp\Bigl(t \sum_{j=1}^k f(X_j)\Bigr)\Bigr]\right) & \nonumber\\
& = e^{-t \gamma k} \Exp_{X_1, \ldots, X_{k-1}} \left[ \Tr\left(\Exp_{X_k}\Bigl[
  \exp\Bigl(t \sum_{j=1}^{k-1} f(X_j) + t f(X_k)\Bigr)\Bigr]\right) \right] &
  \textit{(By independence)}\nonumber\\
& \leq e^{-t \gamma k} \Exp_{X_1, \ldots, X_{k-1}} \left[
  \bigl\|\Exp[\exp(t f(X_k))] \bigr\| \cdot \Tr\Bigl(\exp\Bigl(t \sum_{j=1}^{k-1} f(X_j)\Bigr)\Bigr)
  \right] & \textit{(Apply \lemref{lemma:mid})}\nonumber\\
& = e^{-t \gamma k}  \bigl\|\Exp[\exp(t f(X_k))] \bigr\| \cdot \Tr\left(\Exp_{X_1,
  \ldots, X_{k-1}}\Bigl[\exp\Bigl(t \sum_{j=1}^{k-1} f(X_j)\Bigr)\Bigr]\right) &
  \textit{(Put expectation back inside)} \nonumber\\
& \leq e^{-t \gamma k} \prod_{j=1}^k \bigl\| \Exp[\exp(t f(X_j))]\bigr\|\cdot \Tr(I)
  & \textit{(Repeat $k$ times \ldots)} \nonumber\\
& = d e^{-t \gamma k} \prod_{j=1}^k \bigl\| \Exp[\exp(t f(X_j))]\bigr\|\,.
\end{align}
This completes the proof modulo \lemref{lemma:mid}.
\end{proof}

\begin{proof}[Proof of \lemref{lemma:mid}]
\begin{align*}
\Tr(\Exp[\exp(A + f(X))]) & = \Exp[\Tr(\exp(A + f(X)))] 
&\textit{(Since trace and expectation commute)} \\
& \leq \Exp[\Tr(\exp(f(X))\exp(A))] &
\textit{(Applying the Golden-Thompson inequality)} \\
& \leq \Tr(\Exp[\exp(f(X))]\exp(A)) 
& \textit{(Commuting trace and expectation again)} \\
& \leq \bigl\|\Exp[\exp(t f(X))]\bigr\| \cdot \Tr(\exp(A))\,.
& \textit{(By \factref{fact:tr})} 
\end{align*}
\end{proof}

Now we will draw two corollaries from this main theorem.  These two
corollaries are useful in different settings; the first guarantees
that the probability of an additive deviation is small, while the
second that of a multiplicative deviation.
\begin{theorem}[\cite{AW}]
\label{thm:aw}
Let $f : [n] \dans [-I_d, I_d]$.  Let $X$ be distributed over $[n]$
with $\Exp_X [f(X)] = 0$, and let $X_1, \ldots, X_k$ be i.i.d. copies
of $X$.  Then for all $1 > \gamma > 0$:\footnote{For the sake of
  simplicity, no attempt was made to optimize the constant in the
  exponent of the bound in this analysis. To get a tighter bound, we
  can apply the analysis of~\cite{AW} to get a bound of $$d e^{-k
    D\bigl(\tfrac{1+\gamma}{2} \| \tfrac{1}{2}\bigr)}\,.$$  Here $D(p \| q) = p
  (\log p - \log q) + (1 - p) (\log(1-p) - \log (1 - q))$ is the
  relative entropy function, and using the approximation
  $D((1+\gamma)/2 \| 1/2) \geq \gamma^2 / (2 \ln 2)$,
  which can be shown by looking at the Taylor expansion of $D(\cdot \|
  \cdot)$, we have the improved bound of $de^{-k \gamma^2 / (2 \ln
    2)}$.} 
$$\Pr\Bigl[\tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\leq \gamma I\Bigr] \leq d
  e^{-\gamma^2 k/4}\,.$$
\end{theorem}
Note that the other direction $\tfrac{1}{k} \sum_{i=1}^k f(X_i)
\not\geq - \gamma I$ holds with the same bound by considering $-f$.
\begin{proof}
We require only \thmref{thm:main} and a simple claim.
Because all the $X_i$ are i.i.d. \thmref{thm:main} gives us
$$\Pr\Bigl[\tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\leq \gamma I\Bigr] \leq d e^{-
  t \gamma k} \bigl\|\Exp[ \exp(t f(X)) ] \bigr\|^k \,.$$

We use the following claim to bound the RHS.
\begin{claim}
\label{claim:additive}
$\bigl\|\Exp[\exp(t f(X))]\bigr\| \leq 1 + t^2$ for $t \leq 1/2$.
\end{claim}
\begin{proof}
This follows from the Taylor expansion of $\exp$:
\begin{align*}
\bigl\|\Exp[\exp(t f(X))\bigr\| & = \bigl\| \Exp[I + t f(X) + (t f(X))^2/2 +
  \ldots] \bigr\| \\
& = \bigl\|I + t \Exp[f(X)] + \Exp[(t f(X))^2/2 + \ldots] \bigr\|\,. \\
\intertext{Since $\Exp[f(X)] = 0$, applying the triangle inequality,
  and using $\|f(X)\| \leq 1$ always, we have}
& \leq 1 + \sum_{\ell=2}^\infty t^\ell / \ell! \,.\\
\intertext{Since $t = \gamma / 2 \leq 1/2$ this gives}
& \leq 1 + t^2\,.
\end{align*}
\end{proof}

We will choose $t = \gamma / 2 \leq 1/2$, so we may apply
\claimref{claim:additive} to \thmref{thm:main} to get
\begin{align*}
\Pr\Bigl[\tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\leq \gamma I\Bigr] & \leq d e^{-
  t \gamma k} (1 + t^2)^k & \\
& \leq d e^{- t \gamma k + t^2 k} 
& \textit{(Using $1+x \leq e^x$ for all $x \in \R$)}\\
& \leq d e^{-\gamma^2 k / 4}\,.
& \textit{(Choosing $t = \gamma / 2$)}
\end{align*}
\end{proof}

\begin{theorem}[\cite{AW}]
\label{thm:aw2}
Let $f : [n] \dans [0, I_d]$.  Let $X$ be distributed over $[n]$, with
$M = \Exp_X [f(X)] \geq \mu I$ for some $\mu \in (0, 1)$.  Let $X_1,
\ldots, X_k$ be i.i.d. copies of $X$.  Then we have, for all $\gamma
\in [0, 1/2]$,
$$\Pr\Bigl[ \tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\geq (1 - \gamma) \mu I\Bigr]
\leq d e^{-\gamma^2 \mu k / (2 \ln 2)}\,.$$
\end{theorem}
\begin{proof}
We can assume without loss of generality that $M = \mu I$.\footnote{If
not, we could work with $g(x) = \mu M^{-1/2} f(x) M^{-1/2}$ instead.}
Because the direction of this bound is the opposite of what we proved
in \thmref{thm:main}, we will work with $I - f$ to get:
\begin{align}
\Pr\Bigl[ \tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\geq (1 - \gamma) \mu I\Bigr] & =
\Pr\Bigl[ \tfrac{1}{k} \sum_{i=1}^k (I - f(X_i)) \not\leq (1 - (1 -
  \gamma)\mu) I\Bigr]\,. \\
\intertext{Applying \thmref{thm:main}}
& \leq d e^{-t (1 - (1 - \gamma) \mu) k} \bigl\|\Exp[\exp( t (I -
  f(X)))]\bigr\|^k \\
\label{eq:djfkj}
& = d \bigl\|\Exp[ \exp( - t f(X)) e^{t (1 - \gamma) \mu}]\bigr\|^k\,.
\end{align}

This last quantity was analyzed in the proof of Theorem 19 of~\cite{AW}, with the following
conclusion which we state without proof:
\begin{claim}[\cite{AW}]
For $t = \log \bigl( \frac{1 - (1 - \gamma) \mu}{1 - \mu} \frac{1}{(1 -
\gamma)}\bigr)$, we have
$$\bigl\|\Exp[\exp(- t f(X))] e^{t (1 - \gamma) \mu} \bigr\| \leq e^{-\gamma^2
  \mu / (2 \ln 2)}\,.$$ 
\end{claim}

Applying this claim to \inequalityref{eq:djfkj} gives us the theorem.
\end{proof}

\section{Method of pessimistic estimators}
\label{sec:condprob}

First we review the method of pessimistic estimators, due to Raghavan~\cite{Raghavan88}.  The setting is the following: we have a random
variable $X$ and we know that with some non-zero probability an event
$\sigma(X)$ occurs, \ie, $\Pr[\sigma(X) = 1] > 0$, where $\sigma :
\supp(X) \dans \zo$, $\sigma(x) = 1$ iff $x$ is in the event.  We
wish to efficiently and deterministically find a particular $x \in
\supp(X)$ such that $\sigma(x) = 1$.

Our application of pessimistic estimators is to derandomizing
probabilistic algorithms.  In particular, suppose we have a randomized
algorithm that constructs an object, and with some non-zero
probability that object satisfies some property.  Thus, our event
$\sigma$ is the event that the object satisfies the property, and our
goal is to deterministically and efficiently find the object.  In this
paper our two main applications are to deterministically and
efficiently find a small generating set of a group that satisfies
expansion, and to find an integer solution to a SDP covering problem
that satisfies feasibility and some approximation guarantee.  Both
problems were previously known to have randomized algorithms, and we
use our pessimistic estimators to derandomize these algorithms.

We will only be concerned with random variables with finite state
space with a product structure, and we will sub-divide the variable
into many parts.  Thus we use the notation $\vec{X}$ to denote a
random variable where \emph{w.l.o.g.} $\supp(\vec{X}) \subseteq [n]^k$
for some $k, n \in \N$ (these will be chosen according to the
application).  Let $\vec{X} = (X_1, \ldots, X_k)$, where each $X_i \in
[n]$.  To find a ``good'' setting of $\vec{X}$, we will iteratively
find settings of $X_1$, then $X_2$, and so forth until we have a
complete setting of $\vec{X}$.

By the definition of expectation
$$\Pr_{\vec{X}} [\sigma(\vec{X}) = 0] = \Exp_{X_1}
    \Bigl[\Pr\bigl[\sigma(\vec{X}) = 0 \mid X_1\bigr]\Bigr]\,.$$
Now by averaging there must exist at least one setting $x_1 \in [n]$
of $X_1$ such that
$$\Pr\bigl[\sigma(\vec{X}) = 0 \mid X_1 = x_1\bigr] \leq \Exp_{X_1}
 \Bigl[\Pr\bigl[\sigma(\vec{X}) = 0 \mid X_1\bigr]\Bigr]\,.$$ 
We set $X_1 = x_1$, and then repeat the same reasoning for $X_2,
\ldots, X_k$.  Let us denote the resulting setting of $\vec{X}$ by
$\vec{x}$.  Thus at the end we have $\Pr [\sigma(\vec{x}) = 0] \leq
\Pr[\sigma(\vec{X}) = 0]$.  But note that we supposed that
$\Pr[\sigma(\vec{X}) = 0] < 1$, and since $\vec{x}$ is a \emph{fixed}
vector, it must be that $\Pr[\sigma(\vec{x}) = 0] = 0$ and therefore
$\sigma(\vec{x}) = 1$. 

The difficulty with turning this into an algorithm is in calculating
the probabilities, for each $1 \leq i \leq k$ and, $\forall x_1,
\ldots, x_i \in [n] $
$$\Pr_{X_{i+1}, \ldots, X_k} [\sigma(\vec{X})= 0 \mid X_1 = x_1,
  \ldots, X_i = x_i]$$
since they may not be efficiently computable.  Fortunately we may
relax the requirements slightly by the following.\footnote{Our
definition is stronger than the standard definition of pessimistic
estimators, in that in the second condition usually all that is
required is for all $x_1, \ldots, x_i \in [n]$, there exists $x_{i+1}
\in [n]$ such that $\phi_{i+1}(x_1, \ldots, x_{i+1}) \leq \phi_i(x_1,
\ldots, x_i)$.  But our estimators satisfy the stronger
definition and we will find it useful, especially when
composing estimators (see \lemref{lemma:compose}).}

\begin{definition}
\label{def:estimators}
Let $\sigma : [n]^k \dans \zo$ be an event on a random variable
$\vec{X}$ distributed over $[n]^k$ and suppose $\Pr[\sigma(\vec{X}) = 1]
> 0$.  We say that $\phi_0, \ldots, \phi_k$, $\phi_i : [n]^i \dans [0,
1]$ (here $\phi_0$ is just a number in $[0, 1]$), are
\emph{pessimistic estimators} for $\sigma$ if the following hold.
\begin{enumerate}
\label{property:lt1}
\label{property:bound}
\item For any $i$ and any fixed $x_1, \ldots, x_i \in [n]$, we have
  that
$$\Pr_{X_{i+1}, \ldots, X_k} [\sigma(x_1, \ldots, x_i, X_{i+1}, \ldots,
    X_k) = 0] \leq \phi_i(x_1, \ldots, x_i)\,.$$ 
\label{property:decrease}
\item \label{property:sub} For any $i$ and any fixed $x_1, \ldots, x_i \in [n]$:
$$\Exp_{X_{i+1}} \phi_{i+1}(x_1, \ldots, x_i, X_{i+1}) \leq
  \phi_i(x_1, \ldots, x_i)\,.$$
\end{enumerate}
\end{definition}

We will also want the pessimistic estimators to be \emph{efficient},
namely each $\phi_i$ is efficiently computable, and \emph{useful},
which means $\phi_0 < 1$.  This last condition is because $\phi_0$ is
a bound on the initial probability of failure, which we need to be
strictly less than $1$.

\begin{theorem}[\cite{Raghavan88}]
\label{thm:derand}
If there exist efficient and useful pessimistic estimators $(\phi_0,
\ldots, \phi_k)$ for an event $\sigma$, then one can efficiently
compute a \emph{fixed} $\vec{x} \in [n]^k$ such that $\sigma(\vec{x})
= 1$.
\end{theorem}
\begin{proof}
We pick $x_1, \ldots, x_k$ one by one.  At step $0$ we have
$\phi_0 < 1$ since the estimators are useful.

At step $i$, we have $x_1, \ldots, x_i$ already fixed.  Enumerate over
$x_{i+1} \in [n]$ and choose the value such that $\phi_{i+1}(x_1,
\ldots, x_{i+1}) \leq \phi_i(x_1, \ldots, x_i) < 1$.  We are
guaranteed that
$$\Exp_{X_{i+1}} [ \phi_{i+1}(x_1, \ldots x_i, X_{i+1}) ] \leq
\phi_i(x_1, \ldots, x_i)$$
by \expref{Property}{property:sub} of \definitionref{def:estimators}, and so by averaging
there must exist a fixed $x_{i+1} \in [n]$ that is at most the
expectation on the LHS of the above inequality.  We can compute the
value of the estimator efficiently by hypothesis. 

Finally, we have after $k$ steps that $\phi_k(\vec{x}) < 1$ and by
\expref{Property}{property:decrease} we have that $\Pr[\sigma(\vec{x}) = 0] \leq \phi_k(\vec{x})
< 1$, and therefore $\sigma(\vec{x}) = 1$.

The algorithm runs through $k$ steps, and each step is efficient, so
the overall algorithm is efficient.
\end{proof}

We will find it useful to compose estimators, which is possible from
the following lemma.

\begin{lemma}
\label{lemma:compose}
Suppose $\sigma, \tau : [n]^k \dans \zo$ are events on $\vec{X}$,
which is distributed over $[n]^k$.  Suppose that $(\phi_0, \ldots,
\phi_k), (\psi_0, \ldots, \psi_k)$ are pessimistic estimators for
$\sigma, \tau$ respectively.  Then $(\phi_0 + \psi_0, \ldots, \phi_k +
\psi_k)$ are pessimistic estimators for the event $\sigma \cap \tau$.
\end{lemma}
\begin{proof}
We need to verify the properties of \definitionref{def:estimators}.
\begin{enumerate}
\item This is verified by a union bound:
\begin{align*}
\Pr[(\sigma \cap \tau) & (x_1, \ldots, x_i, X_{i+1}, \ldots, X_k) = 0] \\
& \leq \Pr[\sigma (x_1, \ldots, x_i, X_{i+1}, \ldots, X_k) = 0] +
\Pr[\tau (x_1, \ldots, x_i, X_{i+1}, \ldots, X_k) = 0] \\
& \leq (\phi_i + \psi_i)(x_1, \ldots, x_i)\,.
\end{align*}
\item This is immediate from linearity of expectation.
\end{enumerate}
\end{proof}


\section{Applying pessimistic estimators to the AW bound}

\label{sec:awestimators}

The method of pessimistic estimators extends to the AW Chernoff bound.
We will first describe pessimistic estimators for \thmref{thm:aw}
and then for \thmref{thm:aw2}.  They are essentially identical
except for the difference in distributions in the two settings, and
the proofs that the pessimistic estimators satisfy
\definitionref{def:estimators} rely mainly on \lemref{lemma:mid}.
In both cases, they allow us to efficiently and deterministically find
settings $x_1, \ldots, x_k$ such that bad event bounded by
\thmref{thm:aw} (resp. \thmref{thm:aw2}) does not occur.

\begin{theorem}
\label{thm:mest}
Let $f : [n] \dans [-I_d, I_d]$.  Let $X$ be distributed over $[n]$
with $\Exp_X [f(X)] = 0$, and let $X_1, \ldots, X_k$ be
i.i.d. copies of $X$.  Fix $1 > \gamma > 0$.  Let $t = \gamma / 2$.
Suppose that $\Exp[\exp( t f(X))]$ is efficiently computable.

Combining the notation of \secref{sec:aw} and
\secref{sec:condprob}, we let $\vec{X} = (X_1, \ldots, X_k)$ with
$X_i \in [n]$ and we let $\sigma : [n]^k \dans \zo$ be the event
$\sigma(\vec{x}) = 1$ if $\tfrac{1}{k}\sum_{i=1}^k f(x_i) \leq \gamma
I$ and $\sigma(\vec{x}) = 0$ otherwise. Then the following $(\phi_0,
\ldots, \phi_k), \phi_i : [n]^i \dans [0, 1]$ are \emph{efficient
pessimistic estimators} for $\sigma$:
\begin{align*}
\phi_0 = & d e^{-t \gamma k} \bigl\|\Exp[\exp(t f(X))]\bigr\|^k  \quad
(\text{which is at most } d  e^{-\gamma^2 k / 4})\,, \\
\phi_i(x_1, \ldots, x_i) = & d e^{-t \gamma k} \Tr\left(\exp\Bigl(t
  \sum_{j=1}^i f(x_j)\Bigr)\right) \cdot \bigl\|\Exp[\exp(t f(X))]\bigr\|^{k-i}\,.
\end{align*}
\end{theorem}
\begin{proof}
We verify the properties of \definitionref{def:estimators}.
\begin{enumerate}
\item From \inequalityref{eq:begin}:
\begin{align*}
\Pr\Bigl[\tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\leq \gamma I\Bigr] & \leq d e^{-
    t \gamma k} \Tr\left(\Exp\Bigl[\exp\bigl(t \sum_{j=1}^k f(X_j)\bigr)\Bigr]\right) \\
& \leq d e^{- t \gamma k} \Tr\left(\Exp\Bigl[\exp\bigl(t \sum_{j=1}^i f(X_j)\bigr)\Bigr]\right)
  \prod_{j=i+1}^k\bigl\|\Exp[\exp(t f(X_j))]\bigr\|\,.
\end{align*}
By fixing $X_j = x_j$ for all $j \leq i$, we derive that
\begin{align*}
\Pr\Bigl[\tfrac{1}{k} \sum_{i=1}^k f(X_i) \not\leq \gamma I \mid X_1 = x_1,
\ldots, X_i = x_i\Bigr] & \leq d e^{- t \gamma k} \Tr\left(\exp\Bigl(t \sum_{j=1}^i
f(x_j)\Bigr)\right) \cdot \bigl\|\Exp[\exp(t f(X))]\bigr\|^{k-i} \\
& = \phi_i(x_1,\ldots, x_i)\,.
\end{align*}

\item We use the following derivation, where the inequality follows
  from \lemref{lemma:mid}:
\begin{align*}
\Exp_{X_{i+1}} [ & \phi_{i+1}(x_1, \ldots, x_i, X_{i+1})] \\
& = d e^{-t \gamma k} 
\Tr\left(\Exp_{X_{i+1}}\Bigl[\exp\bigl(t \sum_{j=1}^i f(x_i) +
  t f(X_{i+1})\bigr)\Bigr]\right) \cdot \|\Exp(\exp(t f(X)))\|^{k-i-1} \\
& \leq d e^{-t \gamma k} 
\Tr\left(\exp\Bigl(t \sum_{j=1}^i f(x_i)\Bigr)\right) \cdot
\bigl\|\Exp(\exp(t f(X)))\bigr\|^{k-i} \\
& = \phi_i(x_1, \ldots, x_i)\,.
\end{align*}
\end{enumerate}

To see that the $\phi_i$ are efficiently computable, we will specify
the input to the algorithm as a function $f$ (which we assume is given
as a list of $d \times d$ matrices $f(1), \ldots, f(n)$) and $1^k$.
Thus we desire the algorithm to be computable in time $\poly(n, d, k)$.
We require multiplication, addition, trace, matrix exponential, and
norm computations.  The first three are obviously efficient; the last
two are efficient because eigenvalues of a $d \times d$ matrix can be
computed (and hence it can be diagonalized thus making the exponential
and norm computations trivial) 
in $O(d^3)$ numerical operations~\cite{GolubLoan}.  
On a machine with finite precision, we can truncate
the estimators to a sufficiently fine resolution so that the truncated
estimators behave essentially as the real-valued estimators do.
%%
%% Of course these are all numerical computations so
%% the running time will also depend on the accuracy; if we require an
%% error of $\pm \eps$ in the estimator means that, in the worst case,
%% each numerical operation will have a linear dependence on
%% $\tfrac{k}{\eps}$ (rather than $\log \tfrac{k}{\eps}$, since we are
%% working with exponentials).
\end{proof}

\thmref{thm:mest} gives us pessimistic estimators
$(\phi_0, \ldots, \phi_k)$ for $\sigma$, and the same
proof gives efficient pessimistic estimators $(\psi_0,
\ldots, \psi_k)$ for the event $\tau(\vec{x}) = 1$ iff
$\tfrac{1}{k} \sum_{i=1}^k f(x_i) \geq - \gamma I$ by
applying \thmref{thm:aw} to $-f$.  Combining these with
the $\phi_i$ gives us the following.

\begin{corollary}
\label{cor:mest}
Let $f : [n] \dans [-I_d, I_d]$.  Let $X$ be distributed over $[n]$
with $\Exp_X [f(X)] = 0$, and let $X_1, \ldots, X_k$ be
i.i.d. copies of $X$.  Fix $1 > \gamma > 0$ and fix $t = \gamma / 2$.
Suppose that $\Exp[\exp( t f(X))]$ and $\Exp[\exp( -t f(X))]$ are
efficiently computable.

Let $\eta : [n]^k \dans \zo$ be the event $\eta(\vec{x}) = 1$ if
$\|\tfrac{1}{k}\sum_{i=1}^k f(x_i)\| \leq \gamma$ and $\eta(\vec{x}) =
0$ otherwise. Then $(\phi_0 + \psi_0, \ldots, \phi_k + \psi_k)$ are
efficient pessimistic estimators for $\eta$.
\end{corollary}
\begin{proof}
Note that $\eta = \sigma \cap \tau$.  Efficiency is clear.  We can
apply \lemref{lemma:compose} to get that $(\phi_0 + \psi_0, \ldots,
\phi_k + \psi_k)$ is a pessimistic estimator for the event $\eta =
\sigma \cap \tau$.
\end{proof}

This allows us to derandomize \thmref{thm:aw} efficiently.  Notice
that in general the only property of $X$ that we need is to be able to
compute $\Exp[\exp(t f(X))]$ and $\Exp[\exp(-t f(X))]$.\footnote{In
  fact this is only necessary because we want a two-sided guarantee,
  \ie, $\tfrac{1}{k} \sum_{i=1}^k f(X_i) \leq \gamma I$ and
  $\tfrac{1}{k} \sum_{i=1}^k f(X_i) \geq - \gamma I$.  It is not
  necessary if we only require a one-sided guarantee, such as in the
  setting of \thmref{thm:mest2}, where we only want $\tfrac{1}{k}
  \sum_{i=1}^k f(X_i) \geq (1 - \gamma) \mu I$.  In this second
  setting, when picking $X_i$ to minimize $\phi_i$, notice that the
  quantity $\|\Exp[\exp(t f(X))]\|$ does not change with different
  choices of $X_i$, so the only part we need to compute is the trace
  part, which does depend on the choice of $X_i$.  Thus it suffices to
  compute the choice of $X_i$ that minimizes the trace part of the
  $\phi_i$.}  This is of course true when $X$ is uniform, or when we
can efficiently compute $\Pr[X = x]$ for each $x \in [n]$.  The actual
distribution is irrelevant, since we exhaustively search through the
entire space for the choice of each $X_i$.


\begin{theorem}
Let $f : [n] \dans [-I_d, I_d]$ be such that there exists a
distribution $X$ over $[n]$ such that $\Exp[f(X)] = 0$.  Then for $k =
O(\tfrac{1}{\gamma^2} \log d)$, we can efficiently and deterministically
find $\vec{x} \in [n]^k$ such that $\|\tfrac{1}{k} \sum_{i=1}^k f(x_i)\|
\leq \gamma$.
\end{theorem}
\begin{proof}
Use the efficient pessimistic estimators of \corref{cor:mest}.
Pick $k = O(\tfrac{1}{\gamma^2} \log d)$ such that $\phi_0 + \psi_0 <
1$ and so that the estimators are useful.  We may then apply
\thmref{thm:derand} to get the result.
\end{proof}

We can construct pessimistic estimators for \thmref{thm:aw2} in
the same way.

\begin{theorem}
\label{thm:mest2}
Let $f : [n] \dans [0, I_d]$.  Let $X$ be distributed over $[n]$, with
$M = \Exp_X [f(X)] \geq \mu I$ for some $\mu \in (0, 1)$.  Let $X_1,
\ldots, X_k$ be i.i.d. copies of $X$.  Fix $$t = \log \left( \frac{1 - (1 -
\gamma) \mu}{1 - \mu} \frac{1}{(1 - \gamma)}\right)\,.$$

Let $\vec{X} = (X_1, \ldots, X_k)$ with $X_i \in [n]$ and we let
$\sigma : [n]^k \dans \zo$ be the event $\sigma(\vec{x}) = 1$ if
$\tfrac{1}{k}\sum_{i=1}^k f(x_i) \geq (1-\gamma) \mu I$ and
$\sigma(\vec{x}) = 0$ otherwise. Then the following $(\phi_0, \ldots,
\phi_k), \phi_i : [n]^i \dans [0, 1]$ are efficient pessimistic
estimators for $\sigma$:
\begin{align*}
\phi_0 = & d e^{t k (1 - \gamma) \mu} \bigl\|\Exp[\exp(- tf(X))]\bigr\|^k \quad
(\text{which is at most }  d e^{-\gamma^2 \mu k / (2 \ ln 2)})\,, \\
\phi_i(x_1, \ldots, x_i) = & d e^{t k (1 - \gamma) \mu}
  \Tr\left(\exp\Bigl(- t \sum_{j=1}^i f(x_j)\Bigr)\right) \cdot \bigl\|\Exp[\exp(-
  tf(X))]\bigr\|^{k-i}\,.
\end{align*}
\end{theorem}
\begin{proof}
The proof follows exactly along the lines of \thmref{thm:mest}.
\end{proof}

\begin{theorem}
Let $f : [n] \dans [0, I_d]$ be such that there exists a distribution
$X$ over $[n]$ and a number $\mu \in (0, 1)$ such that $\Exp[f(X)]
\geq \mu I$.  Then for $k = O(\tfrac{1}{\gamma^2 \mu} \log d)$, we can
efficiently and deterministically find $\vec{x} \in [n]^k$ such that
$\tfrac{1}{k} \sum_{i=1}^k f(x_i) \geq (1 - \gamma) \mu I$.
\end{theorem}
\begin{proof}
Use the efficient pessimistic estimators of \thmref{thm:mest2},
and notice for our choice of $k$ that $\phi_0 < 1$ so they are
useful.  Then apply \thmref{thm:derand}.
\end{proof}

\section{$O(\log n)$ expanding generators for any group}
\label{sec:ar}

Our main application is a complete derandomization of the
Alon-Roichman~\cite{AR} theorem, which states that a certain kind of
expander graph may be constructed by random sampling (details below).
Expander graphs have a central role in theoretical computer science,
especially in but not limited to the study of derandomization.
Indeed, they have found a large number of applications in a variety of
areas such as deterministic amplification~\cite{CohenWi89, ImZu},
security amplification in cryptography~\cite{GoldreichImLeVeZu90},
hardness of approximation~\cite{AroraLuMoSuSz98, AFWZ}, extractor
construction (e.g. see surveys~\cite{NisanTa99, goldreich97sample,
Shaltiel02}), construction of efficient error-correcting codes~\cite{SpielmanThesis, bilu-hypergraph}, construction of $\eps$-biased
spaces~\cite{NaorNa93} and much more.  See~\cite{HLW06} for a
comprehensive survey.

We derandomize the proof of the Alon-Roichman theorem given 
by~\cite{RusLan} (see also~\cite{SchulLoh}) to give a deterministic and
efficient construction of the expanding generating set.  We show how
it implies an optimal solution to a problem of Shpilka and 
Wigderson~\cite{SW} (see also~\cite{Goldreich-Sudan}), significantly 
improving their results.


\subsection{Definitions}

Given an undirected $d$-regular graph $G = (V, E)$ on $n$
vertices, we define its normalized adjacency matrix
$A$ by setting $A_{ij} = e_{ij}/d$          
where $e_{ij}$ is the number of edges between vertices $i$
and $j$ (we allow self-loops and multiple edges).  
The matrix $A$ is real and symmetric.

We assume $G$ is connected.  
It is well known that the set of eigenvalues of $A$ is of the form 
$1 = \lambda_1(A) > \lambda_2(A) \geq \ldots \geq \lambda_n(A)$.  
(The strict separation between $\lambda_1(A)$ and $\lambda_2(A)$
follows from connectivity.)  The eigenvalues of $G$ are the eigenvalues
of $A$.  Note that $1$ is an eigenvalue of multiplicity 1, and with
corresponding eigenvector $u = [1/\sqrt{n}, \ldots, 1/\sqrt{n}]^T$.
% which we call the uniform vector.  Alternatively, the eigenvalue 1
% also corresponds to the uniform eigenspace, given by the matrix
% $J/n$, where $J$ is the all $1$'s matrix, which is the orthogonal
% projection onto the space spanned by the eigenvector $u$.
The orthogonal projection to the subspace 
spanned by the eigenvector $u$ is given by the matrix
$J/n$, where $J$ is the all $1$'s matrix.

\newcommand{\Cay}{\mathrm{Cay}}

The Cayley graph $\Cay(H; S)$ on a group $H$ with respect to the
generating multi-set $S \subset H$ is the graph whose vertex set is
$H$, and where $h$ and $h'$ are connected by an edge if there exists
$s \in S$ such that $h' = hs$ (allowing for multiple edges for
multiple elements in $S$).  We require $S$ to be symmetric, namely for
each $s \in S$, we also have $s^{-1} \in S$ (this is to make the graph
undirected).  Let $\lambda(\Cay(H; S))$ denote the second-largest
eigenvalue (in absolute value) of the normalized adjacency matrix of
the Cayley graph.

Our goal is to design an algorithm that, for a fixed $\gamma < 1$,
takes as input the multiplication table of a group $H$ of
order $n$ and
efficiently constructs a small generating set $S$ such that
$\lambda(\Cay(H; S)) < \gamma$.  This is given by the following
theorem.

\begin{theorem}
\label{thm:arderand}
Fix $\gamma < 1$.  Then there exists an algorithm running in time
$\poly(n)$ that,
given a group $H$ of order $n$, constructs a
symmetric set $S \subseteq H$ of
size $|S| = O(\tfrac{1}{\gamma^2} \log n)$ 
such that $\lambda(\Cay(H; S)) \leq \gamma$.
\end{theorem}

We prove this after presenting the randomized algorithm.

\subsection{A randomized algorithm}
\begin{theorem}[\cite{AR, RusLan, SchulLoh}]
\label{thm:ar}
Fix $0 < \gamma < 1$, and let $H$ be a group of  
order $n$.  Identify $H$
with $[n]$.  Let $X_1, \ldots, X_k$ be chosen randomly in $H$, where
$k = O(\tfrac{1}{\gamma^2} \log n)$.  We let the multi-set $S$ be $(X_1,
\ldots, X_k)$, and we have
$$\Pr_{S \subseteq H}[\lambda(\Cay(H; S \sqcup S^{-1})) > \gamma]
< 1$$
where $S \sqcup S^{-1}$ denotes the symmetric closure of $S$,
namely the number of occurrences of $s$ and $s^{-1}$ in $S
\sqcup S^{-1}$ equals the number of occurrences of $s$ in $S$.
\end{theorem}

To identify the notation in the following proof precisely with that
used in \secref{sec:awestimators}, we have that $S$ corresponds to
$\vec{X}$, $|S| = k$, and it will become clear that in this setting that $n
= d = |H|$.

\begin{proof}
Consider the $n \times n$ matrices $P_h$ for $h \in H$, where each
$P_h$ is the $n \times n$ permutation matrix of the action of $h$ by
right multiplication.  Consider now $\tfrac{1}{2} (P_h + P_{h^{-1}})$.
It is not hard to see that the normalized adjacency matrix $A$ of
$\Cay(H; S \sqcup S^{-1})$ is given by
$$A = \tfrac{1}{k} \sum_{i = 1}^k \tfrac{1}{2} (P_{X_i} + P_{X_i^{-1}})\,.$$

We wish to bound $\lambda(A)$.  We know that the largest eigenvalue is
$1$ and corresponds to $J/n$ where $J$ is the all 1 matrix.  Since we
want to analyze the second-largest eigenvalue, we consider
$$(I - J/n)A = \tfrac{1}{k} \sum_{i = 1}^k (I - J/n) \tfrac{1}{2}
(P_{X_i} + P_{X_i^{-1}})\,.$$
We let our matrix-valued function be $f(h) = (I - J/n) \tfrac{1}{2}
(P_h + P_{h{-1}})$, so that
$$\lambda(A) = \bigl\|(I - J/n)A\bigr\| = \Bigl\| \tfrac{1}{k} \sum_{i = 1}^k f(X_i)
\Bigr\|\,.$$
It is straightforward to verify that $f(h) \in \calM_n$, $\|f(h)\|
\leq 1$ and $\Exp_{h \in H}[f(h)] = 0$.

Thus we may apply \thmref{thm:aw} to get that
\begin{align}
\Pr[\lambda(A) > \gamma] & = \Pr\Bigl[\Bigl\|\tfrac{1}{k} \sum_{i=1}^k f(X_i)\Bigr\|
  > \gamma\Bigr] \\
\label{eq:ar}
& \leq 2n e^{-\gamma^2 |S| / 4}\nonumber
\end{align}
so picking $k = O(\tfrac{1}{\gamma^2} \log n)$ suffices to make this
probability less than $1$.
\end{proof}

\subsection{Derandomizing}
\begin{proof}[Proof of \thmref{thm:arderand}]
To derandomize and obtain \thmref{thm:arderand}, we apply
\corref{cor:mest} to obtain efficient pessimistic estimators for
the event $\sigma(S) = 1$ iff $\| \tfrac{1}{k} \sum_{i=1}^k f(X_i)\|
\leq \gamma$.  We fix $k = O(\tfrac{1}{\gamma^2} \log n)$ large enough
such that the probability of this event is non-zero (\ie, the
estimators we got are useful).  We then apply \thmref{thm:derand}
to greedily choose successive elements of $H$ to be put in $S$ in
order to make an expander.
\end{proof}

\subsection{Derandomized homomorphism testing}
\thmref{thm:arderand} answers a question about the derandomization of
affine  homomorphism testers posed by Shpilka and Wigderson~\cite{SW}.  
In this section we will use \thmref{thm:arderand} to prove
\corref{cor:homtest}.

An \emph{affine homomorphism} between two groups $H, H'$ is a map $f :
H \dans H'$ such that $f^{-1}(0) f$ is a homomorphism.  An 
$(\delta, \eta)$-test for affine homomorphisms is a tester that accepts 
any affine homomorphism surely and rejects with probability $1 - \delta$
any $f : H \dans H'$ which is $\eta$ far from being an affine
homomorphism.  Here distance is measured by the normalized Hamming
distance: $d(f, g) = \Pr[f(x) \neq g(x)]$, where the probability is
over $x$ chosen uniformly from $H$.

\begin{sloppypar}
Shpilka and Wigderson~\cite{SW} showed how to 
construct a tester $T_{H \times S}$ efficiently    
using an expander $\Cay(H; S)$ where $\lambda(\Cay(H; S)) <
\lambda$: simply pick a random element 
%% $x \drawnr H$ 
$x\in H$         
and a random element %% $y \drawnr S$
$y\in S$           
and check to see that $f(0)f(x)^{-1}f(xy) = f(y)$.  
It is clear this test 
accepts $f$ surely if $f$ is an affine
homomorphism.  \cite{SW} shows that if $12\delta < 1 - \lambda$ then
this rejects with probability $1-\delta$ any $f$ that is
$4\delta/(1-\lambda)$-far from being an affine homomorphism.
\end{sloppypar}

\begin{theorem}[\cite{SW}]
\label{thm:swtest}
\begin{sloppypar}
For all groups $H, H'$ and $S \subseteq H$ an expanding generating set
such that $\lambda(\Cay(H; S)) < \lambda$, we can construct a tester
$T_{H \times S}$ that surely accepts any affine homomorphism $f : H
\dans H'$ and rejects with probability at least $1 - \delta$ any $f :
H \dans H'$ which is $4\delta/(1 - \lambda)$ far from being an affine
homomorphism, given that ${12\delta}/({1 - \lambda}) < 1$.  That is,
$T_{H \times S}$ is a $(\delta, 4\delta/({1 - \lambda}))$-test for
affine homomorphisms.
\end{sloppypar}
\end{theorem}

In~\cite{SW} the deterministic construction of $S$ gave a set of size
$|H|^\eps$ for arbitrary $\eps > 0$.  The explicit construction given
in~\cite{SW} requires that $T_{H \times S}$ use $(1+\eps)\log |H|$
random bits and asks whether it is possible to improve this dependency
on randomness.  \thmref{thm:arderand} allows us indeed to
improve this dependency to the following.


\begin{corollary}
\label{cor:homtest}
Given an arbitrary group $H$, one can construct in time $|H|^{O(1)}$ 
an affine homomorphism tester    
for functions on $H$ which uses only 
$\log |H| + \log\log |H| +O(1)$ random bits.
\end{corollary}
\begin{proof}[Proof of \corref{cor:homtest}]
\thmref{thm:swtest} says we can construct a homomorphism
tester that only uses randomness to pick an element of $H$ and an
element of an expanding generating set of $H$.
\thmref{thm:arderand} implies this only requires $\log |H| +
\log \log |H| + O(1)$ random bits since we can deterministically
construct an expanding generating set of size $\log |H|$ in polynomial
time.
\end{proof}


\section{Covering SDP's}
\label{sec:sdp}

Linear programming (LP) was one of the first tools computer scientists
used to approximate $\NP$-hard problems.  As a natural relaxation of
integer programming (IP), linear programs give fractional solutions to
an IP, which may then be rounded to give provably good solutions to
the original IP.

More recently, a more general class of relaxations,
\emph{semidefinite programs} (SDP's), have been used by computer
scientists (e.g.~\cite{GW95, ARV04}) to give better approximation
guarantees to $\NP$-hard problems.  SDP's may be solved in polynomial
time (using e.g. the ellipsoid method or interior-point methods, see~\cite{Shor1, Shor2, NemYu, VanBoyd}), and again the solution may be
rounded to give a solution to the original IP.

In this section we will define a restricted class of integer SDP's and
show that our pessimistic estimators will give a good approximation
guarantee.

\subsection{Definition}
We define the notion of \emph{integer covering SDP's}, which are
generalizations of integer covering linear programs (see
e.g.~\cite{KolYoung05}).  These programs take the following form:
given $c \in [0,1]^n$ and $f : [n] \dans [0, I_d]$,\footnote{We
restrict ourself to this scale for simplicity.  Our results apply to
any bounded function with a constant loss in efficiency.} find
$y \in \N^n$ where
\begin{equation}
\label{eq:mainsdp}
\begin{split}
\text{minimize } & c^T y \\
\text{with feasibility constraint } & y_1 f(1) + \ldots + y_n f(n) \geq I
\end{split}
\end{equation}
where the feasibility inequality uses the p.s.d.\ ordering.  The
vector $c$ may be interpreted as a cost vector, and we wish to
minimize the cost of a solution $y \in \N^n$.  This is relaxed
into a covering SDP by allowing $y \in \R_+^n$ where $\R_+$
denotes the non-negative reals, which we would then like to round
to a solution $\hat{y} \in \N^n$ that is not too much more
costly.  We will let $\OPT$ denote the optimal value of the
\emph{relaxed} covering SDP.

Our main theorem is as follows:
\begin{theorem}
\label{thm:sdpderand}
  Suppose we have a program as in \equationref{eq:mainsdp} and suppose
  we have a feasible relaxed solution vector $y \in \R^n_+$.
  Then we can find in time $\poly(n, d)$ a feasible integer solution
  $\hat{y}$ such that
  $$c^T \hat{y} \leq O(\log d) \cdot  c^T y\,. $$
\end{theorem}
\begin{corollary}
Given an integer covering SDP with optimum $\OPT$, we can efficiently
find an integer solution with cost at most $O(\log d) \cdot \OPT$.
\end{corollary}

This is done by using a randomized rounding algorithm given implicitly
in~\cite{AW}, and then derandomizing using pessimistic estimators.

Also, note that this is a natural generalization of integer covering
linear programs of the following form: for a cost vector $c \in
\R^n_+$, a matrix $A \in \R^{d \times n}_+$
\begin{gather*}
\text{minimize } c^T y \\
\text{subject to feasibility constraints that for all $i \in [d]$: }
(A y)_i \geq 1\,.
\end{gather*}
This may be viewed as the special case of integer covering SDP's where
all the matrices are diagonal; each $f(i)$ is just the diagonal matrix
with $i$'th column of $A$ along the diagonal.  Integer covering LP's, in
turn, are a generalization of the very familiar set cover problem,
which are exactly the programs where the columns of $A$ are either $0$
or $1$.  In the language of set cover, the universe is $[n]$ and the
columns of $A$ are the indicator vectors for the sets we may use to
cover $[n]$.

Our approximation for integer covering SDP's will imply a new
approximation algorithm for all these covering problems with a
logarithmic approximation guarantee.  Thus in a sense our algorithm
gives optimal approximation factors (up to constants), since a
logarithmic approximation factor is optimal (up to constant factors)
assuming that $\mathbf{P} \neq \NP$, as shown by~\cite{feige98threshold}.  This connection is discussed in more detail
in \secref{sec:hcover}.

\subsection{A randomized rounding algorithm}

First suppose we have a solution to the SDP given by a vector $y
\in \R^n_+$, and let us define $Q = \sum_{j=}^n y_j$.  In the case
where $Q \geq n$, we can get a trivial deterministic rounding scheme
with approximation factor $2$ by always rounding up, since this will
increase the value of the program at most by an additive $n$.  Thus in
the following we consider only programs where $Q \leq n$.

Suppose we have a program as in \equationref{eq:mainsdp} and we have
solved it efficiently to obtain a solution $y$, where $c^T
y = \OPT$.  Let $X$ be distributed according to the distribution
over $[n]$ given by normalizing $y$, \ie,
$$\Pr[X = i] = y_i / Q\,.$$
Note that, because $y$ is a feasible solution, we have $\Exp_X
[f(X)] \geq \tfrac{1}{Q} I$.  It was implicitly shown in~\cite{AW} that
sampling $k = Q \cdot O(\log d)$ elements from $[n]$ according to the
distribution $X$ and taking $f(X_i)$ ($1 \leq i \leq k$) gives us a
feasible solution with approximation factor $O(\log d)$.  We state
this formally:

\newcommand{\valueofk}{Q \cdot 8 \ln 2d}
\newcommand{\approxmin}{c^T y \cdot 16 \ln 2d}

\begin{theorem}[\cite{AW}]
\label{thm:randround}
Suppose we sample $k = \valueofk$ times from $[n]$ according to $X$ in
order to get $X_1, \ldots, X_k$.  Furthermore, for each $1 \leq j \leq
n$, we define the random variables
$$\hat{Y}_j = |\{ i \mid X_i = j \}|\,,$$
the number of times that $j$ is sampled, and let $\hat{Y} =
(\hat{Y}_1, \ldots, \hat{Y}_n)$.\footnote{Notice that $\sum_{i=1}^k
  f(X_i) = \sum_{j=1}^n \hat{Y}_j f(j)$.}  Then, with non-zero
probability, we have that
$$f(X_1) + f(X_2) + \ldots + f(X_k) \geq I \qquad \text{ and }
  \qquad c^T \hat{Y} \leq \approxmin\,.$$
\end{theorem}
\begin{proof}
We will use a union bound to show that the probability that either
$$
\sum_j f(X_j) \not\geq I\quad \text{or}\quad c^T \hat{Y} > \approxmin
$$
 occurs
is strictly less than 1.
  
All expectations below are over the $X_i$ (since the $\hat{Y}_j$ are
totally determined by the $X_i$).
\begin{align}
\label{eq:awbound}
\Pr\Bigl[\sum_{j=1}^k f(X_j) \not\geq I\Bigr] & = \Pr\Bigl[ \tfrac{1}{k}
  \sum_{j=1}^k f(X_j) \not\geq \tfrac{1}{k} I\Bigr]\,. \\
\intertext{We know from the fact that $y$ is feasible that $\Exp[f(X)]
  \geq \tfrac{1}{Q} I$, and so for $k > 2Q$ we get:}
\Pr\Bigl[\sum_{j=1}^k f(X_j) \not\geq I\Bigr] & \leq \Pr\Bigl[\tfrac{1}{k}
  \sum_{j=1}^k f(X_j) \not\geq \tfrac{1}{2} \tfrac{1}{Q} I\Bigr]\,.\nonumber
\intertext{Invoking \thmref{thm:aw2}, we obtain}
\Pr\Bigl[\sum_{j=1}^k f(X_j) \not\geq I\Bigr] & \leq d e^{{- k}/{(8 Q)}}\,.\nonumber
\end{align}
Therefore if we take $k = \valueofk$ with probability greater than
${1}/{2}$ we have $\sum_j f(X_j) \geq I$.

For the second event it is easy to see that $c^T \hat{Y} =
\sum_{j=1}^k c_{X_j}$.  Furthermore, a simple calculation shows that
for each $j$, $\Exp[c_{X_j}] = c^T y / Q$.  Thus, by
Markov we have:
\begin{align}
\Pr[c^T \hat{Y} > \approxmin] & =
  \Pr\left[\sum_{j=1}^k c_{X_j} > \approxmin \right]\nonumber
  \\
\label{eq:toocostly}
& < \frac{\Exp\left[\sum_{j=1}^k c_{X_j} \right]}{\approxmin} \\
& = \frac{k \cdot c^T y / Q}{\approxmin}\,.\nonumber
\end{align}
Expanding $k = \valueofk$ shows that this last expression is at most
$1/2$.

Thus each bad event happens with probability less than $1/2$, and so
the probability that either bad event happens is strictly less than $1$.
\end{proof}


\subsection{Derandomizing}
Derandomizing is a simple proposition.  Given a program, first solve
it using a standard efficient technique (\cite{Shor1, Shor2, NemYu},
for a survey see~\cite{VanBoyd}), with solution $y$ and $Q =
\sum_{j=1}^n y_j$.  Let $k = \valueofk$.  In the proof of
\thmref{thm:randround} at \inequalityref{eq:awbound}, we can apply
\thmref{thm:mest2} to get pessimistic estimators $\phi_i$ for the
event $\sum_{j=1}^k f(X_j) \geq I$, which we call $\sigma$.  We only
need now a pessimistic estimator $(\psi_0, \ldots, \psi_k)$ for the
event of the solution not being too costly, which we call $\tau$.

We define $\psi_i : [n]^i \dans [0, 1]$ as follows:
$$\psi_i(x_1, \ldots, x_i) = \frac{\sum_{j=1}^i c_{x_j} + (k - i)
   \Exp[c_X]}{\approxmin}\,.$$

It is clear that the $\psi_i$ are efficiently computable.  They
satisfy the properties of \definitionref{def:estimators}.  This is
easy to see, since the $\psi_i$ are exactly the expressions given by a
Markov bound on the event $\tau$, and such expressions always satisfy
\definitionref{def:estimators}.  We write this out explicitly here
for completeness.

\begin{enumerate}
\item By an application of Markov (this is the same as in
  \inequalityref{eq:toocostly}), we see:
\begin{align*}
\Pr\left[ \sum_{j=1}^k c_{X_j}  > \approxmin \;\vrule\; X_1 = x_1, \ldots,
  X_i = x_i \right] & \leq \frac{\sum_{j=1}^i c_{x_j} + (k - i) \Exp[
  c_X]}{\approxmin} \\
& = \psi(x_1, \ldots, x_i)\,.
\end{align*}
\item For estimators based on Markov, we actually have equality for
  this property. 
\begin{align*}
\Exp_{X_{i+1}} [\psi_{i+1}(x_1, \ldots, x_i, X_{i+1})] & =
  \Exp_{X_{i+1}} \left[\frac{\sum_{j=1}^i c_{x_j} + c_{X_{i+1}} + (k -
  i - 1) \Exp[c_X]}{\approxmin} \right]\\ 
& = \frac{\sum_{j=1}^i c_{x_j} + (k - i) \Exp[c_{X_j}]}{\approxmin} \\ 
& = \psi_i(x_1, \ldots, x_i)\,.
\end{align*}
\end{enumerate}

\begin{theorem}
\label{thm:sdpest}
Since $\phi_0 + \psi_0 < 1$ because of the choice of $k = \valueofk$,
we may invoke \lemref{lemma:compose} to get that $(\phi_0 +
\psi_0, \ldots, \phi_k + \psi_k)$ are efficient and useful pessimistic
estimators for the event in \thmref{thm:randround}.
\end{theorem}

Finally we may prove \thmref{thm:sdpderand}.

\begin{proof}[Proof of \thmref{thm:sdpderand}]
By \thmref{thm:sdpest} we have pessimistic estimators for the
event in \thmref{thm:randround}, and so we may apply
\thmref{thm:derand}, which says we can efficiently and
deterministically find a suitable integer vector $\hat{y}$ that
satisfies \thmref{thm:sdpderand}.  The algorithm runs in time
$\poly(n, k, d)$, but since $k = \valueofk$ and we only consider $Q
\leq n$, this is $\poly(n, d)$.
\end{proof}

\subsection{Quantum Hypergraph Covers}
\label{sec:qhcover}
In this section we define hypergraphs and quantum hypergraphs and
discuss the cover problem for both.  The hypergraph cover problem is
just the classical set cover problem, and the quantum hypergraph cover
problem is a non-commutative generalization arising in quantum
information theory~\cite{AW}.  Our efficient and useful pessimistic
estimators for the integer covering SDP problem immediately give an
efficient deterministic algorithm to find a quantum hypergraph cover
that is optimal up to logarithmic factors.

\newcommand{\V}{\mathcal{V}}
\newcommand{\E}{\mathcal{E}}
\subsubsection{Hypergraphs}
\label{sec:hcover}

Here we will describe the hypergraph cover problem, which is just
another name for the classical set cover.  A hypergraph is a pair 
$(V,E)$ where $E \subseteq 2^V$, \ie, $E$ is a collection of subsets of
$V$.  Say $|V| = d$.  One often views an edge $e$ as a vector in
$\zo^d$, where the $i$'th entry is $1$ if vertex $i$ is in the edge
and $0$ otherwise.

It will actually be convenient for us to view $e \in E$ as $d \times d$ 
diagonal matrix with $1$ or $0$ at each diagonal entry to signify
whether that vertex is in the edge.  In this section we will denote
the matrix associated with $e$ as $f(e)$.  This representation will
naturally generalize to quantum hypergraphs.

A \emph{cover} of a hypergraph $\Gamma = (V, E)$ is a set
of edges $C$ such that $\bigcup_{e \in C} e = V$, \ie, each vertex is
in at least one edge.  Note that this definition of cover coincides
exactly with the definition of set cover.  The size of the smallest
cover is called the \emph{cover number} and is denoted $c(\Gamma)$.

Using the matrix representation of $E$, one sees that
$$\bigcup_{e \in C} e = V\ \ \ \  \equivaut\ \ \ \  \sum_{e \in C}
f(e) \geq I$$
where the second expression uses our usual ordering of matrices.

A \emph{fractional cover} is 
an assignment $w\,:\,E\to \R_+$ of non-negative weights to the edges  
such that $\sum_{e \in E} w(e) f(e) \geq I$. 
The \emph{fractional cover number} is defined as      
$$\tilde{c}(\Gamma) = \min_w \left\{\sum_{e \in E} 
w(e) \;\vrule\; \sum_{e \in E} w(e) f(e) \geq I  \right\}\,.$$

We know that the hypergraph cover problem is hard to approximate up to
a $\ln n$ factor~\cite{feige98threshold}.  From the definitions, it is
clear that this problem is a special case of our integer covering
SDP's.  In the next section we generalize to the non-commutative case.


\subsubsection{Quantum Hypergraphs}

Ahlswede and Winter~\cite{AW} define 
\emph{quantum hypergraphs} as generalizations of
hypergraphs.  Recall that we represented an edge of a hypergraph as a
$d \times d$ diagonal matrix with $1, 0$ along the diagonal.  So a
hypergraph is equivalent to a pair     
$(\V, \E)$ where $\V = \C^d$ and each $e\in \E$ 
is identified with a diagonal matrix $f(e)$  
whose diagonal entries are $0$ or $1$.   
We generalize this
to non-commutative ``edges'' by allowing $\E$ to contain other
operators, \ie, $f(e)$ can be any Hermitian operator (\ie, matrix) in
$[0, I]$.\footnote{A complex matrix $A$ is Hermitian (self-adjoint)
if $A = A^*$ where $^*$ denotes the conjugate transpose.   
Here we use the fact that all our previous results for real
symmetric matrices generalize to complex Hermitian matrices.}  %% LB

\begin{definition}
\label{def:qh}
A \emph{quantum hypergraph} is a pair $\Gamma = (\V, \E)$ 
where $\V$ is a $d$-dimensional Hilbert space and $\E$ is
a finite set such that each
$e \in \E$ is identified with a Hermitian operator $f(e) \in [0,I_d]$. 
\end{definition}
We define a \emph{cover} of a quantum hypergraph         
to be a finite subset $C \subseteq \E$ such that
$\sum_{e \in C} f(e) \geq I$.  The \emph{cover number}
$c(\Gamma)$ is the size of the smallest cover of $\Gamma$.

Likewise, we define a fractional cover to be
an assignment $w\,:\,E\to \R_+$ of non-negative weights to the edges  
such that $\sum_{e \in \E} w(e) f(e) \geq I$, 
and the fractional cover number as
$$\tilde{c}(\Gamma) = \min_w \left\{\sum_{e \in \E} w(e) \;\vrule\; 
\sum_{e\in \E} w(e) f(e) \geq I\right\}\,.$$
Note that this corresponds exactly to 
our previous definitions for
hypergraphs.  The problem of finding the fractional cover has
equivalent forms that are natural and interesting, which are discussed
at the end of this section.

It is important to note that the notion of ``vertex'' is lost because
the matrices $f(e) \in \calM_d$ are not necessarily diagonal in a
common basis.  However, it is again clear from the definitions that a
quantum hypergraph cover problem is just a special case of integer
covering SDP's (extended to complex matrices), so we may use
\thmref{thm:sdpderand} to give an efficient deterministic
approximation.  Thus the theorem below follows.

\begin{theorem}
\label{theorem:qhcover}
Suppose we are given
a quantum hypergraph $\Gamma = (\V, \E)$   with
fractional cover number $\tilde{c}(\Gamma)$, with
$\dim(\V)=d$
and $|\E| = n$.  Then we can find an integer cover of $\Gamma$ of size 
$k = \tilde{c}(\Gamma) \cdot O(\log d)$ in time $\poly(n, d)$.
\end{theorem}

\subsection{Other Applications}
Our integer covering SDP (and its extension to complex matrices) also
encompasses two other natural problems from quantum information
theory.  Given a function $f : [n] \dans [0, I_d]$,
one may want to find a probability distribution $X$ over $[n]$
that achieves the optimum of either of the following quantities:
\begin{enumerate}
\item $\min_X \lambda_1(\Exp_X [f(X)]) = \min_X \|\Exp_X[f(X)]\|$,
\item $\max_X \lambda_d(\Exp_X [f(X)])$.
\end{enumerate}
The former minimizes the norm of the expected value of the
distribution, which is also its largest eigenvalue, while the latter
may be viewed as maximizing the lowest energy state of a quantum
system, which is also its smallest eigenvalue.  The second can be
formulated as a covering SDP by using the cost vector $c = \mathbf{1}$
(the all 1's vector), and then normalizing the solution vector $y$ to be
a probability distribution.  The first can be formulated as the second
by considering the function $I - f$.

In both cases, our pessimistic estimators give an ``integral
solution'' that is worse by at most $O(\log d)$.  In this case, an
integral solution is actually a distribution with sparse support; we
sample from the solution distribution $X$ to get a distribution
$\hat{X}$ with support of size $O(\tfrac{1}{\gamma^2} \log d)$ such
that the corresponding objective is worse by at most a factor of
$O(\log d)$.

\section{Acknowledgments}
We thank Sanjeev Arora for suggesting a connection between
pessimistic estimators and Ahslwede-Winter's Chernoff-type bounds.  We
thank Satyen Kale for interesting discussions.

\appendix

\section{Error in~\cite{WX05}}
\label{app:problem}

Due to the error, the main claim of~\cite{WX05} remains open.
\begin{conjecture}
Let $f : [N] \dans [-I_d, I_d]$ be a matrix-valued function with
expectation $\Exp[f] = 0$.  Let $G$ be an expander graph on $[N]$ with
spectral gap $\eps$.  Let $Y_1, \ldots, Y_k$ be a random walk of
length $k$ on $G$  (for sufficiently large $k$).  Then it holds that
$$\Pr \Bigl[\Bigl\|\tfrac{1}{k} \sum_{i=1}^k f(Y_i) \Bigr\| > \gamma
  \Bigr] \leq d e^{-\Omega(\eps \gamma^2 k)}\,.$$
\end{conjecture}

The above statement gives a relatively strong bound; in its weakest
non-trivial form, the conjecture would be that
$$\Pr \Bigl[\Bigl\|\tfrac{1}{k} \sum_{i=1}^k f(Y_i) \Bigr\| > \gamma
  \Bigr] \leq \poly(d) e^{-\poly(\eps, \gamma) k}\,.$$


The error in~\cite{WX05} is in the application of the Golden-Thompson
inequality (\thmref{thm:gt}).  The following derivation, which
appears in the proof of Theorem 3.6 in the second column of page
401 of~\cite{WX05}\footnote{Top of page 12 in the ECCC version~\cite{WXECCC}}, is incorrect:
$$\Exp \left[\Tr \Bigl(\exp \bigl(t \sum_{i=1}^k f(Y_i) \bigr)
  \Bigr) \right] \leq \Exp \left[ \Tr \Bigl( \prod_{i=1}^k \exp( t
  f(Y_i)) \Bigr) \right]$$
where the $Y_i$ are the steps in a random expander walk and the
expectation is over all walks.  This is incorrect because the
Golden-Thompson inequality does not generalize to more than two terms,
\ie, the following does not hold in general for real symmetric
matrices $A, B, C$:
$$\Tr(\exp(A + B + C)) \leq \Tr(\exp(A) \exp(B) \exp(C))$$
and it is not hard to come up with counterexamples.

We have tried various techniques to bypass this problem, but we have
not discovered any method to get parameters that are sufficient for
our applications.  In the notation of~\cite{WX05}, it would suffice to
prove
$$\Tr\left( \Exp\Bigl[ \exp\Bigl(t \sum_{i=2}^k f(Y_i)\Bigr) 
\exp(t f(Y_1))\Bigr]\right) \leq
\|\tilde{A} \tilde{D}_t\| \cdot \Tr\left( \Exp\Bigl[ \exp\Bigl(t \sum_{i=2}^k
  f(Y_i)\Bigr)\Bigr]\right)$$
or even only
$$\Tr\left( \Exp\Bigl[ \exp\Bigl(t \sum_{i=1}^k f(Y_i)\Bigl)\Bigr]\right) 
\leq d \|\tilde{A} \tilde{D}_t \|^k\,.$$
We know from the proof of the \thmref{thm:main} that both of the
inequalities hold when the normalized adjacency matrix of the
graph is $A = J/n$, \ie, we sample from the complete graph
with self-loops, which
corresponds to independent sampling.  We do not know counterexamples
to either of these inequalities for sampling according to an expander
walk.  Thus, as far as we know, Theorem 3.6 of~\cite{WX05} may be
true as stated. 

\bibliographystyle{tocplain}
\bibliography{a003}


\begin{tocauthors} 
\begin{tocinfo}[avi] 
Avi Wigderson \tocabout{}\\ 
Professor\\
School of Mathematics\\
Institute for Advanced Study\\ 
avi\tocat{}ias\tocdot{}edu\\
\url{http://www.math.ias.edu/~avi}
\end{tocinfo} 
\begin{tocinfo}[david] 
David Xiao \tocabout{}\\ 
Student\\
Department of Computer Science\\
Princeton University\\ 
dxiao\tocat{}cs\tocdot{}princeton\tocdot{}edu\\
\url{http://www.cs.princeton.edu/~dxiao}
\end{tocinfo} 
\end{tocauthors} 

\begin{tocaboutauthors} 
\begin{tocabout}[avi] 
{\sc Avi Wigderson} was born in Haifa, Israel in 1956, and received his
\phd\ in 1983 at \href{http://www.cs.princeton.edu/}{Princeton University}
under Dick Lipton. He enjoys
and is fascinated with studying the power and limits of efficient
computation, and the remarkable impacts of this field on understanding
our world. Avi's other major source of fascination and joy are his
three kids, Eyal, Einat, and Yuval.
\end{tocabout} 
\begin{tocabout}[david]
{\sc David Xiao} hopes to receive his \phd\
soon from \href{http://www.cs.princeton.edu/}{Princeton University}
under the supervision of 
\href{http://www.cs.princeton.edu/~boaz/}{Boaz Barak} and Avi Wigderson. He 
is interested in complexity theory and cryptography, and in his spare
time he enjoys wandering through city streets and rocking out to
French house music.
\end{tocabout}

\end{tocaboutauthors} 

\end{document}


