\documentclass[11pt]{article}
\usepackage{fullpage,amssymb,graphicx}
\newcommand{\Q}[1]{\item {[{\em #1 pts\/}]}\ }
\newcommand{\fighere}[2]{\begin{figure}[h]\begin{center}%
\includegraphics[scale=#2]{#1.pdf}\end{center}%
\end{figure}\vspace{-\bigskipamount}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\Ex}{\mathbb{E}}
\newcommand{\eps}{\varepsilon}
\newcommand{\OO}{\widetilde{O}}
\newcommand{\down}[1]{\left\lfloor #1\right\rfloor}
\newcommand{\up}[1]{\left\lceil #1\right\rceil}
\newcommand{\IGNORE}[1]{}
\begin{document}
%\thispagestyle{empty}
\hfill CS 574, Spring 2021\par\hfill Timothy Chan
\bigskip
\begin{center}\large\bf Homework 3 (due March 31 Wednesday 5pm (CT))
\end{center}
\medskip\noindent
{\bf Instructions}:
You may work in groups of at most 3; submit one set of solutions per group.
Always acknowledge any discussions you have with other people and
any sources you have used (although most homework problems should be doable without using outside sources). In any case,
\emph{solutions must be written entirely in your own words}.
\begin{enumerate}
\Q{28} We are given a text string $A[1]A[2]\cdots A[n]\in \{0,1\}^*$ of length $n$ and a pattern string $B[1]B[2]\cdots B[m]\in \{0,1\}^*$ of length $m\le n$.
We want to find a location in the text that best matches the pattern.
More precisely, we want to compute
\[ \Delta\ = \min_{i\in\{0,\ldots,n-m\}} d(A[i+1]A[i+2]\cdots A[i+m], \, B[1]B[2]\cdots B[m]), \]
where for two strings $s$ and $t$ of the same length,
their ``distance'' $d(s,t)$ is defined as the number of positions at which the two strings differ.
In other words, $d(A[i+1]A[i+2]\cdots A[i+m], \, B[1]B[2]\cdots B[m])$ is equal to
$|\{j: A[i+j]\neq B[j]\}|$.
There are known algorithms for computing $\Delta$ in $O(n\log n)$ time. Here, we will investigate algorithms for \emph{approximating} $\Delta$ that only need to a read a \emph{sublinear}
number of characters of the input strings. (Here, we care about the number
of characters read, rather than actual run time.) Sublinear algorithms are especially attractive when the input size $n$ is very large.
\begin{enumerate}
\Q{14} First consider the simpler problem of approximating the distance
between two strings: given two strings $s$ and $t$
of length $n$, compute a value $\widetilde{d}$ such that
$|\widetilde{d} - d(s,t)| \le \eps n$. Describe a simple Monte Carlo
algorithm that solves this problem correctly w.h.p.
Bound the number of characters read (which should be small, and
is a function of $n$ and~$\eps$).
%\medskip
\Q{14} Next, consider the original problem.
Pick random indices $r_1,\ldots,r_\ell,s_1,\ldots,s_\ell\in \{0,\ldots,\sqrt{n}-1\}$ (you may assume that $\sqrt{n}$ is an integer).
Read the following characters:
\begin{itemize}
\item $A[u\sqrt{n}+r_k]$ for all $u\in\{0,\ldots,\sqrt{n}-1\}$ and $k\in\{1,\ldots,\ell\}$; and
\item $B[s_k\sqrt{n}+v]$ for all $v\in\{0,\ldots,\sqrt{n}-1\}$ and $k\in\{1,\ldots,\ell\}$.
\end{itemize}
For an appropriate (small) value of $\ell$,
show that after reading the above $O(\ell\sqrt{n})$ characters,
we have enough information to compute a value $\widetilde{\Delta}$ such that
$|\widetilde{\Delta}-\Delta|\le \eps n$ w.h.p.
\end{enumerate}
\bigskip
\Q{30}
Recall the problem of approximating the volume of the union of
$n$ boxes $B_1,\ldots,B_n$ in $\R^d$. Let $V^*$ denote the volume of the union. Let $r$ be a parameter to be determined.
Consider the following variant of the algorithm from class (based
on Karp and Luby's technique):
\newcommand{\VOL}{\mathop{\rm vol}}
\newcommand{\COUNT}{\mathop{\it count}}
\begin{quote}
\begin{tabbing}
19.M\= for\= for\=\+\kill
0.\' $\COUNT=0$,\ $\Lambda=\sum_{i=1}^n \VOL(B_i)$\\
1.\' repeat $r$ times \{\\
2.\'\> pick a random $i\in\{1,\ldots,n\}$ with probability $\VOL(B_i)/\Lambda$\\
3.\'\> pick a uniformly random point $z\in B_i$\\
4.\'\> repeat \{\\
5.\'\>\> increment $\COUNT$\\
6.\'\>\> pick a uniformly random $j\in \{1,\ldots,n\}$\\
7.\'\> \} until $z\in B_j$\\
\}
\end{tabbing}
\end{quote}
\begin{enumerate}
\Q{2} Let $H_i^{(k)}$ be the set of all points in $B_i$ that are in exactly $k$ of the input boxes. What is $\sum_{i=1}^n\sum_{k=1}^n \VOL(H_i^{(k)})/k$?
\Q{8} For each $\ell=1,\ldots,r$, let $X_\ell$ be the number of times
$\COUNT$ is incremented during iteration $\ell$ of the outer loop in lines 1--7.
Prove that $\Ex[X_\ell] = nV^*/\Lambda$ for every $\ell$.
\Q{4}
Prove that $X_\ell \le O(n\log n)$ for all $\ell$ w.h.p.
\Q{12} Assume that a factor-2 approximation $V_0$ of $V^*$ is given (with $V^*\le V_0\le 2V^*$).
Describe how to set the parameter $r$ so that a factor-$(1\pm \delta)$ approximation $\widetilde{V}$ of $V^*$
(with $(1-\delta)V^*\le \widetilde{V}\le (1+\delta)V^*$) for any given $\delta>0$ can be computed w.h.p.\ from the value $\COUNT$ found
by the above algorithm. Analyze the total expected running time, which
should be better than the quadratic bound from class.
\Q{4}
Describe how to remove the assumption that a factor-2 approximation is given.
\end{enumerate}
\bigskip
\Q{26}
Consider $n$ independent random variables $X_1,\ldots,X_n$, where
each $X_i$ is geometrically distributed with probability $1/2$,
i.e.,
$\Pr[X_i=\ell]=1/2^\ell$ for each $\ell=1,2,\ldots$\ \
\begin{enumerate}
\Q{13}
Prove that $\Pr[\sum_{i=1}^n X_i > 2.01n] \le e^{-\Theta(n)}$,
by modifying the proof of Chernoff's bound.
\Q{13}
Prove that
$\Pr[\sum_{i=1}^n X_i^2 > 6.01n] \le e^{-\Theta(n^\alpha)}$ for some constant $0 < \alpha < 1$. Aim for the best $\alpha$.
\emph{Note}: $\sum_{\ell=1}^\infty \ell^2/2^\ell=6$.
\emph{Hint}:
This part could be trickier (adapting Chernoff's proof doesn't immediately seem to
work). One (not necessarily best) approach is to apply Hoeffding's bound to $\sum_{i=1}^n \min\{X_i^2,b\}$ for some fixed value $b$\ldots
\end{enumerate}
\bigskip
\Q{16}
Consider a random binary string $s=a_1\cdots a_n\in\{0,1\}^*$,
where each $a_i$ is chosen from $\{0,1\}$ uniformly and independently.
Given an even number $k$, let $P$ be the number of length-$k$ palindromic substrings of $s$, i.e., $P=|\{i: \mbox{$a_{i+1}\cdots a_{i+k}$ is a palindrome}\}|$.
\begin{enumerate}
\Q{4} What is $\mu = \Ex[P]$ (as a function of $n$ and $k$)?
\Q{12} Bound $\Pr[|P-\mu|>t]$ (in terms of $n$, $k$, and $t$) by using Azuma's inequality
and the method of bounded differences.
\end{enumerate}
\end{enumerate}
\end{document}