Finish presentation
Commit 00005f45d3 (parent 66432b3158)

@@ -2,9 +2,12 @@
\usepackage{pythonhighlight}
\hypersetup{colorlinks=true}

% https://arxiv.org/abs/2212.09410
% https://arxiv.org/abs/2309.10668
% https://github.com/Futrell/ziplm
% https://bellard.org/nncp/nncp.pdf
% https://bellard.org/nncp/nncp_v2.1.pdf
% https://prize.hutter1.net/ (blocked by AMD 🤷)
% https://kenschutte.com/gzip-knn-paper/

@@ -18,12 +21,12 @@
\frame{\titlepage}

\begin{frame}
\frametitle{Task: text classification}
\begin{itemize}
\item Given some categories, classify strings into them
\item Example from AG News dataset:
\begin{itemize}
\item 4 categories: World, Sports, Business, Sci/Tech
\item Example string: "AMD Will Have An Edge Over Intel Through 2005. Piper Jaffray said Advanced Micro Devices (nyse: AMD - news - people ) should have an edge over Intel (nasdaq: INTC - news - people ) quot;throughout 2005 quot; but resumed coverage of both companies at quot;market perform quot; and both at price targets of \$25."
\end{itemize}
\item Usually solved using neural networks, e.g. BERT

@@ -52,24 +55,35 @@
\begin{frame}
\frametitle{Kolmogorov complexity}
\begin{definition}
\textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, basically an ideal compressor
\end{definition}
\begin{definition}
\textbf{Information distance} $E(x, y)$: length of shortest program that converts $x$ to $y$, approximately $K(xy)-K(x)$
\end{definition}
\end{frame}

\begin{frame}[fragile]
\frametitle{$K(x)$ is uncomputable!}
\begin{itemize}
\item Equivalent to the halting problem
\item Analogy: "The smallest positive integer not definable in under eleven words."
\item If $K$ were computable, this program would print a string $x$ with $K(x)$ greater than its own length, a contradiction:
\begin{python}
for x in all_strings:
    if K(x) > len(this_program):
        print(x)
\end{python}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Instead, approximate $K$}
\begin{itemize}
\item Simply swap out $K$ for our gzip compressor $C$
\item Normalize by the larger compressed length, so longer strings don't automatically look farther apart
\end{itemize}
\begin{definition}
\textbf{Normalized compression distance} \[NCD(x,y) = \frac{C(xy)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
\end{definition}
\end{frame}
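
% Illustrative sketch (not from the paper): NCD computed with Python's gzip module.
\begin{frame}[fragile]
\frametitle{NCD in code (sketch)}
A minimal sketch of $NCD$ with gzip as $C$, using compressed length in bytes:
\begin{python}
import gzip

def C(s):
    # compressed length in bytes stands in for K(s)
    return len(gzip.compress(s.encode()))

def ncd(x, y):
    cx, cy = C(x), C(y)
    return (C(x + y) - min(cx, cy)) / max(cx, cy)
\end{python}
\end{frame}
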
@@ -111,18 +125,85 @@ for (x1, _) in test_set:
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Results}
"Outperforms BERT"
\begin{tabular}{c|c|c}
Dataset & NNs average & gzip \\
\hline
AGNews & 0.901 & \textbf{0.937} \\
DBpedia & 0.978 & 0.970 \\
YahooAnswers & 0.726 & 0.638 \\
20News & 0.656 & \textbf{0.685} \\
Ohsumed & 0.433 & \textbf{0.521} \\
R8 & 0.903 & \textbf{0.954} \\
R52 & 0.815 & \textbf{0.896} \\
\hline
\end{tabular}
\end{frame}

\begin{frame}
\frametitle{Actually\dots}
\begin{itemize}
\item Experiments use $k=2$ with top-2 accuracy rather than randomly breaking ties! (sketch on the next slide)
\item Contamination between training and test sets!
\item gzip is not great for text classification, but still an interesting idea
\end{itemize}
\end{frame}
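
% Sketch with hypothetical labels; not the paper's code.
\begin{frame}[fragile]
\frametitle{Sketch: top-2 scoring vs.\ tie-breaking}
A minimal sketch of the two scoring rules, given the labels of the two nearest neighbors:
\begin{python}
import random

def knn2_predict(labels):
    # proper k=2 prediction: break disagreements randomly
    if labels[0] == labels[1]:
        return labels[0]
    return random.choice(labels)

def top2_correct(labels, truth):
    # the paper's scoring: correct if either neighbor matches
    return truth in labels
\end{python}
When the two neighbors disagree, top-2 scoring counts a success whenever one of two guesses is right.
\end{frame}
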
\begin{frame}
\frametitle{Connections between compression and ML}
\begin{itemize}
\item Autoencoders, dimensionality reduction
\item Compressors can generate text!
\item LLMs can do compression!
\item Both compressors and LLMs model the probability distribution of the next character given the current text
\end{itemize}
\end{frame}
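
% Added note: the standard Shannon connection behind the last bullet.
\begin{frame}
\frametitle{Why? Code length $\approx$ probability}
An optimal code assigns $x$ a codeword of length $-\log_2 p(x)$ bits, so a compressor's output length implicitly defines a probability model:
\[ p(x) \approx 2^{-C(x)} \]
Shorter compressed length means higher probability; the next slides use this connection in both directions.
\end{frame}
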
\begin{frame}
\frametitle{ziplm: text generation using gzip}
\begin{itemize}
\item Idea: generate the character that makes the text compress best
\item Results with gzip: theudcanvas. ;cm,zumhmcyoetter toauuo long a one aay,;wvbu.mvns. x the dtls and enso.;k.like bla.njv
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{ziplm code}
\begin{python}
import numpy as np
import scipy.special

class ZipModel:
    # constructor (storing compressor, vocabulary, training text,
    # and a conversion factor) is omitted on this slide
    def logprobs(self, prefix="", temperature=1):
        # compressed length of training + prefix + candidate, per candidate
        code_lengths = np.array([
            len(self.compressor.compress("".join([self.training, prefix, v]).encode()))
            for v in self.vocabulary
        ])
        # shorter code length => higher log-probability
        return scipy.special.log_softmax(-code_lengths*self.conversion*(1/temperature))

    def sample(self, prefix="", temperature=1):
        scores = self.logprobs(prefix, temperature=temperature)
        i = np.random.choice(range(len(self.vocabulary)), p=np.exp(scores))
        return self.vocabulary[i]
\end{python}
\end{frame}
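
% Hypothetical usage sketch; ziplm's actual constructor may differ.
\begin{frame}[fragile]
\frametitle{ziplm usage (sketch)}
Assuming the elided constructor just stores the attributes used above:
\begin{python}
import gzip, string

model = ZipModel()
model.compressor = gzip                     # gzip.compress(bytes) -> bytes
model.vocabulary = list(string.printable)   # candidate next characters
model.training = open("corpus.txt").read()  # hypothetical training text
model.conversion = 1.0                      # scale factor for code lengths

text = ""
for _ in range(100):                        # sample 100 characters
    text += model.sample(prefix=text)
\end{python}
\end{frame}
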
\begin{frame}
\frametitle{Compression using LLMs}
\begin{itemize}
\item Idea: the LLM generates a ranked list of candidate next tokens; store the index of the token the text actually uses (sketch on the next slide)
\item Very predictable text (e.g. digits of $\pi$) turns into 11111111111111111
\item Then run that through gzip
\item Amazing compression ratio, but slow
\end{itemize}
\end{frame}
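
% Illustrative sketch; lm.rank_candidates is a hypothetical API, not from the paper.
\begin{frame}[fragile]
\frametitle{Sketch: rank-encode, then gzip}
A minimal sketch, assuming a model that returns candidate next tokens sorted by predicted probability:
\begin{python}
import gzip

def compress(tokens, lm):
    ranks = []
    for i, tok in enumerate(tokens):
        candidates = lm.rank_candidates(tokens[:i])  # hypothetical API
        ranks.append(candidates.index(tok))
    # predictable text yields mostly small ranks, which gzip crushes
    return gzip.compress(bytes(ranks))  # assumes all ranks < 256
\end{python}
Decompression runs the same model to invert the ranks.
\end{frame}
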
\begin{frame}
\frametitle{References}
\begin{itemize}
\item \href{https://arxiv.org/abs/2212.09410}{Less is More: Parameter-Free Text Classification with Gzip}
\item \href{https://kenschutte.com/gzip-knn-paper/}{Bad numbers in the "gzip beats BERT" paper?}
\item \href{https://github.com/Futrell/ziplm}{ziplm GitHub}
\item \href{https://bellard.org/nncp/nncp.pdf}{Lossless Data Compression with Neural Networks}
\item \href{https://arxiv.org/abs/2309.10668}{Language Modeling Is Compression}
\end{itemize}
\end{frame}