diff --git a/static/src/gzip-classification.pdf b/static/src/gzip-classification.pdf
index 8bf0d2c..af2a857 100644
Binary files a/static/src/gzip-classification.pdf and b/static/src/gzip-classification.pdf differ
diff --git a/static/src/gzip-classification.tex b/static/src/gzip-classification.tex
index ca743d9..74282d8 100644
--- a/static/src/gzip-classification.tex
+++ b/static/src/gzip-classification.tex
@@ -2,9 +2,12 @@
 \usepackage{pythonhighlight}
+\hypersetup{colorlinks=true}
+
 % https://arxiv.org/abs/2212.09410
 % https://arxiv.org/abs/2309.10668
 % https://github.com/Futrell/ziplm
+% https://bellard.org/nncp/nncp.pdf
 % https://bellard.org/nncp/nncp_v2.1.pdf
 % https://prize.hutter1.net/ (blocked by AMD 🤷)
 % https://kenschutte.com/gzip-knn-paper/
@@ -18,12 +21,12 @@
 \frame{\titlepage}
 \begin{frame}
-    \frametitle{Task: text compression}
+    \frametitle{Task: text classification}
     \begin{itemize}
         \item Given some categories, classify strings into the categories
         \item Example from AG News dataset:
         \begin{itemize}
-            \item Categories: World, Sports, Business, Sci/Tech
+            \item 4 categories: World, Sports, Business, Sci/Tech
             \item Example string: "AMD Will Have An Edge Over Intel Through 2005. Piper Jaffray said Advanced Micro Devices (nyse: AMD - news - people ) should have an edge over Intel (nasdaq: INTC - news - people ) quot;throughout 2005 quot; but resumed coverage of both companies at quot;market perform quot; and both at price targets of \$25."
         \end{itemize}
         \item Usually solved using neural networks, i.e. BERT
@@ -52,24 +55,35 @@
 \begin{frame}
     \frametitle{Kolmogorov complexity}
     \begin{definition}
-        \textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, i.e. an ideal compressor
+        \textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, basically an ideal compressor
     \end{definition}
     \begin{definition}
         \textbf{Information distance} $E(x, y)$: length of shortest program that converts $x$ to $y$, equal to $K(xy)-K(x)$
     \end{definition}
+\end{frame}
+
+\begin{frame}[fragile]
+    \frametitle{$K(x)$ is uncomputable!}
     \begin{itemize}
-        \item $K(x)$ is generally \textbf{incomputable} though, similar to the halting problem
+        \item Undecidable, just like the halting problem
+        \item Analogy: "The smallest positive integer not definable in under eleven words."
+        \item
+        \begin{python}
+for x in all_strings:
+    if K(x) > len(this_program):
+        print(x)
+        break
+        \end{python}
     \end{itemize}
 \end{frame}
 \begin{frame}
-    \frametitle{Approximating $K$}
+    \frametitle{Instead, approximate $K$}
     \begin{itemize}
         \item Simply swap out $K$ for our gzip compressor $C$
         \item Normalize by length
     \end{itemize}
     \begin{definition}
-        \textbf{Normalized compression distance} \[NCD(x,y) = \frac{C(x,y)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
+        \textbf{Normalized compression distance} \[NCD(x,y) = \frac{C(xy)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
     \end{definition}
 \end{frame}
@@ -111,18 +125,85 @@ for (x1, _) in test_set:
     \end{itemize}
 \end{frame}
-\begin{frame}
+\begin{frame}[fragile]
     \frametitle{Results}
+    "Outperforms BERT"
+    \begin{tabular}{c|c|c}
+        Dataset & NN average & gzip \\
+        \hline
+        AGNews & 0.901 & \textbf{0.937} \\
+        DBpedia & 0.978 & 0.970 \\
+        YahooAnswers & 0.726 & 0.638 \\
+        20News & 0.656 & \textbf{0.685} \\
+        Ohsumed & 0.433 & \textbf{0.521} \\
+        R8 & 0.903 & \textbf{0.954} \\
+        R52 & 0.815 & \textbf{0.896} \\
+        \hline
+    \end{tabular}
 \end{frame}
 \begin{frame}
     \frametitle{Actually\dots}
+    \begin{itemize}
+        \item Experiments use $k=2$ with top-2 accuracy rather than randomly breaking ties! (sketch of the difference in the extra slides)
+        \item Contamination between training and test sets!
+        \item gzip is not great for text classification, but still an interesting idea
+    \end{itemize}
 \end{frame}
 \begin{frame}
     \frametitle{Connections between compression and ML}
     \begin{itemize}
-        \item Autoencoders
+        \item Autoencoders, dimensionality reduction
+        \item Compressors can generate text!
+        \item LLMs can do compression!
+        \item Both compressors and LLMs model the probability distribution of the next character given the current text
+    \end{itemize}
+\end{frame}
+
+\begin{frame}
+    \frametitle{ziplm: text generation using gzip}
+    \begin{itemize}
+        \item Idea: generate the character that makes the text compress best
+        \item Results with gzip: theudcanvas. ;cm,zumhmcyoetter toauuo long a one aay,;wvbu.mvns. x the dtls and enso.;k.like bla.njv
+    \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+    \frametitle{ziplm code}
+    \begin{python}
+# excerpt from the ziplm repo; __init__ (not shown) stores the training text,
+# the compressor, the vocabulary and a code-length-to-log-probability factor
+import numpy as np
+import scipy.special
+
+class ZipModel:
+    def logprobs(self, prefix="", temperature=1):
+        code_lengths = np.array([
+            len(self.compressor.compress("".join([self.training, prefix, v]).encode()))
+            for v in self.vocabulary
+        ])
+        # shorter compressed length -> higher log-probability
+        return scipy.special.log_softmax(-code_lengths*self.conversion*(1/temperature))
+    def sample(self, prefix="", temperature=1):
+        scores = self.logprobs(prefix, temperature=temperature)
+        i = np.random.choice(range(len(self.vocabulary)), p=np.exp(scores))
+        return self.vocabulary[i]
+    \end{python}
+\end{frame}
+
+\begin{frame}
+    \frametitle{Compression using LLMs}
+    \begin{itemize}
+        \item Idea: the LLM ranks candidate next tokens; store the index of the token the text actually uses (toy sketch in the extra slides)
+        \item Very predictable text (e.g. the digits of $\pi$) turns into 11111111111111111
+        \item Then run that through gzip
+        \item Amazing compression ratio, but slow
+    \end{itemize}
+\end{frame}
+
+\begin{frame}
+    \frametitle{References}
+    \begin{itemize}
+        \item \href{https://arxiv.org/abs/2212.09410}{Less is More: Parameter-Free Text Classification with Gzip}
+        \item \href{https://kenschutte.com/gzip-knn-paper/}{Bad numbers in the "gzip beats BERT" paper?}
+        \item \href{https://github.com/Futrell/ziplm}{ziplm GitHub}
+        \item \href{https://bellard.org/nncp/nncp.pdf}{Lossless Data Compression with Neural Networks}
+        \item \href{https://arxiv.org/abs/2309.10668}{Language Modeling Is Compression}
     \end{itemize}
 \end{frame}
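+
+% Extra slide: a toy sketch of the k=2 / top-2 scoring issue mentioned on the
+% "Actually..." slide. Not code from the paper or from the blog post; all names
+% below are made up for illustration.
+\begin{frame}[fragile]
+    \frametitle{Extra: top-2 accuracy vs.\ proper 2-NN (sketch)}
+    \begin{python}
+from collections import Counter
+import random
+
+# neighbours: for each test item, its neighbours' labels sorted by NCD (closest first)
+def top2_accuracy(neighbours, truth):
+    # correct if EITHER of the 2 nearest labels matches (the scoring the critique describes)
+    return sum(t in n[:2] for n, t in zip(neighbours, truth)) / len(truth)
+
+def knn_accuracy(neighbours, truth, k=2):
+    # proper kNN: majority vote over the k nearest labels, ties broken at random
+    preds = []
+    for n in neighbours:
+        counts = Counter(n[:k]).most_common()
+        best = [lab for lab, c in counts if c == counts[0][1]]
+        preds.append(random.choice(best))
+    return sum(p == t for p, t in zip(preds, truth)) / len(truth)
+
+# e.g. neighbours=[["Sports", "World"]], truth=["World"]:
+# top2_accuracy gives 1.0, knn_accuracy gives 0.5 in expectation
+    \end{python}
+\end{frame}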
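+
+% Extra slide: a toy sketch of the rank-then-gzip idea from the "Compression
+% using LLMs" slide, with a stand-in predictor instead of an LLM. Not code from
+% any of the referenced papers; real systems use an LLM and a proper entropy coder.
+\begin{frame}[fragile]
+    \frametitle{Extra: rank-based compression (toy sketch)}
+    \begin{python}
+import gzip
+
+def candidates(prev, alphabet):
+    # toy "model": guess the previous character repeats, then alphabet order
+    return [prev] + [c for c in alphabet if c != prev]
+
+def rank_encode(text, alphabet):          # assumes len(alphabet) <= 256
+    ranks, prev = [], alphabet[0]
+    for ch in text:
+        ranks.append(candidates(prev, alphabet).index(ch))
+        prev = ch
+    # predictable text -> mostly small ranks -> gzip compresses them well
+    return gzip.compress(bytes(ranks))
+
+def rank_decode(blob, alphabet):
+    out, prev = [], alphabet[0]
+    for r in gzip.decompress(blob):
+        prev = candidates(prev, alphabet)[r]
+        out.append(prev)
+    return "".join(out)
+
+# rank_decode(rank_encode("aaaaabbbbb", "ab"), "ab") == "aaaaabbbbb"
+    \end{python}
+\end{frame}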