First draft of gzip classification presentation
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent
dbc640213d
commit
66432b3158
2 changed files with 129 additions and 0 deletions
BIN
static/src/gzip-classification.pdf
Normal file
BIN
static/src/gzip-classification.pdf
Normal file
Binary file not shown.
129
static/src/gzip-classification.tex
Normal file
129
static/src/gzip-classification.tex
Normal file
|
@ -0,0 +1,129 @@
|
||||||
|
\documentclass{beamer}
|
||||||
|
|
||||||
|
\usepackage{pythonhighlight}
|
||||||
|
|
||||||
|
% https://arxiv.org/abs/2212.09410
|
||||||
|
% https://arxiv.org/abs/2309.10668
|
||||||
|
% https://github.com/Futrell/ziplm
|
||||||
|
% https://bellard.org/nncp/nncp_v2.1.pdf
|
||||||
|
% https://prize.hutter1.net/ (blocked by AMD 🤷)
|
||||||
|
% https://kenschutte.com/gzip-knn-paper/
|
||||||
|
% https://kenschutte.com/gzip-knn-paper2/
|
||||||
|
|
||||||
|
\title{Less is More: Parameter-Free Text Classification with Gzip}
|
||||||
|
\author{Anthony Wang}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
|
||||||
|
\frame{\titlepage}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Task: text classification}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Given some categories, classify strings into the categories
|
||||||
|
\item Example from AG News dataset:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Categories: World, Sports, Business, Sci/Tech
|
||||||
|
\item Example string: ``AMD Will Have An Edge Over Intel Through 2005. Piper Jaffray said Advanced Micro Devices (nyse: AMD - news - people ) should have an edge over Intel (nasdaq: INTC - news - people ) quot;throughout 2005 quot; but resumed coverage of both companies at quot;market perform quot; and both at price targets of \$25.''
|
||||||
|
\end{itemize}
|
||||||
|
\item Usually solved using neural networks, e.g.\ BERT
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{gzip}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Compression program using LZ77 and Huffman coding
|
||||||
|
\item Pretty fast and used everywhere
|
||||||
|
\item Doesn't require giant GPUs
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Intuition}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Compressors are good at capturing regularity
|
||||||
|
\item Objects from the same category share more regularity than objects from different categories
|
||||||
|
\item Compressor $C$, strings $x,y$ similar, different from $z$
|
||||||
|
\item Then $C(xy) < C(xz)$
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Kolmogorov complexity}
|
||||||
|
\begin{definition}
|
||||||
|
\textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, i.e.\ an ideal compressor
|
||||||
|
\end{definition}
|
||||||
|
\begin{definition}
|
||||||
|
\textbf{Information distance} $E(x, y)$: length of shortest program that converts $x$ to $y$, equal to $K(xy)-K(x)$
|
||||||
|
\end{definition}
|
||||||
|
\begin{itemize}
|
||||||
|
\item $K(x)$ is generally \textbf{incomputable} though, similar to the halting problem
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Approximating $K$}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Simply swap out $K$ for our gzip compressor $C$
|
||||||
|
\item Normalize by length
|
||||||
|
\end{itemize}
|
||||||
|
\begin{definition}
|
||||||
|
\textbf{Normalized compression distance} \[\mathit{NCD}(x,y) = \frac{C(xy)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
|
||||||
|
\end{definition}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Method for classification}
|
||||||
|
\begin{itemize}
|
||||||
|
\item $k$-nearest-neighbor among all strings in the training set, using $\mathit{NCD}$ as the distance
|
||||||
|
\item That's all!
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}[fragile]
|
||||||
|
\frametitle{Code}
|
||||||
|
\begin{python}
|
||||||
|
# k-NN text classification with gzip: for each test string, compute the
# normalized compression distance (NCD) to every training string, then
# predict the majority class among the k nearest neighbors.
from gzip import compress as C

import numpy as np

for (x1, _) in test_set:
    # Compressed length of the test string; computed once and reused
    # for every pair (x1, x2).
    Cx1 = len(C(x1.encode()))
    distance_from_x1 = []
    for (x2, _) in training_set:
        Cx2 = len(C(x2.encode()))
        # Concatenate with a space so gzip compresses one joint stream.
        x1x2 = " ".join([x1, x2])
        Cx1x2 = len(C(x1x2.encode()))
        # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
        ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
        distance_from_x1.append(ncd)
    # Indices of training strings, nearest first.
    sorted_idx = np.argsort(np.array(distance_from_x1))
    # Fix: training_set is a list of (text, label) tuples, so numpy
    # fancy indexing (training_set[sorted_idx[:k], 1]) raises TypeError,
    # and an ndarray has no .count method for the majority vote below.
    # Collect the top-k labels into a plain list instead.
    top_k_class = [training_set[i][1] for i in sorted_idx[:k]]
    predict_class = max(set(top_k_class), key=top_k_class.count)
|
||||||
|
\end{python}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Performance optimizations}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Can reuse $C(x)$ when computing $C(xy)$
|
||||||
|
\item Multithreading
|
||||||
|
\item Use C instead of Python
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Results}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Actually\dots}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}
|
||||||
|
\frametitle{Connections between compression and ML}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Autoencoders
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\end{document}
|
Loading…
Reference in a new issue