Finish presentation

Anthony Wang 2024-06-14 11:30:30 -05:00
parent 66432b3158
commit 00005f45d3
2 changed files with 89 additions and 8 deletions


@@ -2,9 +2,12 @@
\usepackage{pythonhighlight}
\hypersetup{colorlinks=true}
% https://arxiv.org/abs/2212.09410
% https://arxiv.org/abs/2309.10668
% https://github.com/Futrell/ziplm
% https://bellard.org/nncp/nncp.pdf
% https://bellard.org/nncp/nncp_v2.1.pdf
% https://prize.hutter1.net/ (blocked by AMD 🤷)
% https://kenschutte.com/gzip-knn-paper/
@@ -18,12 +21,12 @@
\frame{\titlepage}
\begin{frame}
-\frametitle{Task: text compression}
+\frametitle{Task: text classification}
\begin{itemize}
\item Given some categories, classify strings into the categories
\item Example from AG News dataset:
\begin{itemize}
-\item Categories: World, Sports, Business, Sci/Tech
+\item 4 categories: World, Sports, Business, Sci/Tech
\item Example string: "AMD Will Have An Edge Over Intel Through 2005. Piper Jaffray said Advanced Micro Devices (nyse: AMD - news - people ) should have an edge over Intel (nasdaq: INTC - news - people ) quot;throughout 2005 quot; but resumed coverage of both companies at quot;market perform quot; and both at price targets of \$25."
\end{itemize}
\item Usually solved using neural networks, e.g. BERT
@@ -52,24 +55,35 @@
\begin{frame}
\frametitle{Kolmogorov complexity}
\begin{definition}
-\textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, i.e. an ideal compressor
+\textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, basically an ideal compressor
\end{definition}
\begin{definition}
\textbf{Information distance} $E(x, y)$: length of shortest program that converts $x$ to $y$; up to logarithmic terms, $E(x,y) = K(xy)-\min(K(x),K(y))$
\end{definition}
\end{frame}
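\begin{frame}[fragile]
\frametitle{Intuition: a real compressor as a crude $K$}
Not in the original deck, just a sanity check: regular data has a short ``program'' (description), random data does not.
\begin{python}
import gzip, os

regular = b"0" * 10**6       # fully described by a tiny program
random_ = os.urandom(10**6)  # almost surely incompressible
print(len(gzip.compress(regular)))  # on the order of 1 KB
print(len(gzip.compress(random_)))  # about 1 MB (slightly larger, in fact)
\end{python}
\end{frame}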
\begin{frame}[fragile]
\frametitle{$K(x)$ is uncomputable!}
\begin{itemize}
-\item $K(x)$ is generally \textbf{incomputable} though, similar to the halting problem
+\item Equivalent to the halting problem
\item Analogy (Berry's paradox): ``The smallest positive integer not definable in under eleven words.''
\item The same contradiction as a program:
\begin{python}
# if K were computable, this short program would print
# a string more complex than its own length permits
for x in all_strings:
    if K(x) > len(this_program):
        print(x)
        break
\end{python}
\end{itemize}
\end{frame}
\begin{frame}
-\frametitle{Approximating $K$}
+\frametitle{Instead, approximate $K$}
\begin{itemize}
\item Simply swap out $K$ for our gzip compressor $C$
\item Normalize by length
\end{itemize}
\begin{definition}
-\textbf{Normalized compression distance} \[NCD(x,y) = \frac{C(x,y)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
+\textbf{Normalized compression distance} \[NCD(x,y) = \frac{C(xy)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
\end{definition}
\end{frame}
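\begin{frame}[fragile]
\frametitle{$NCD$ in code}
A minimal sketch of the definition above, with gzip standing in for $C$ (my code, not the paper's implementation):
\begin{python}
import gzip

def C(s):
    # compressed length in bytes, approximating K
    return len(gzip.compress(s))

def ncd(x, y):
    cx, cy = C(x), C(y)
    return (C(x + y) - min(cx, cy)) / max(cx, cy)

print(ncd(b"sports news", b"sports update"))  # smaller = more similar
\end{python}
\end{frame}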
@@ -111,18 +125,85 @@ for (x1, _) in test_set:
\end{itemize}
\end{frame}
-\begin{frame}
+\begin{frame}[fragile]
\frametitle{Results}
"Outperforms BERT"
\begin{tabular}{c|c|c}
Dataset & NNs average & gzip \\
\hline
AGNews & 0.901 & \textbf{0.937} \\
DBpedia & 0.978 & 0.970 \\
YahooAnswers & 0.726 & 0.638 \\
20News & 0.656 & \textbf{0.685} \\
Ohsumed & 0.433 & \textbf{0.521} \\
R8 & 0.903 & \textbf{0.954} \\
R52 & 0.815 & \textbf{0.896} \\
\hline
\end{tabular}
\end{frame}
\begin{frame}
\frametitle{Actually\dots}
\begin{itemize}
\item Experiments use $k=2$ with top-2 accuracy rather than randomly breaking ties!
\item Contamination between training and test sets!
\item gzip is not great for text classification, but still an interesting idea
\end{itemize}
\end{frame}
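\begin{frame}[fragile]
\frametitle{Top-2 accuracy vs.\ honest $k$NN}
A sketch of the discrepancy (function names are mine, not the paper's code):
\begin{python}
import random

def top2_hit(nearest_two_labels, gold):
    # what the paper effectively reports: a hit if
    # *either* of the two nearest neighbors matches
    return gold in nearest_two_labels

def knn2_hit(nearest_two_labels, gold):
    # honest k=2: a tie between two distinct labels
    # must be broken, e.g. uniformly at random
    return random.choice(nearest_two_labels) == gold
\end{python}
When the two neighbors disagree, top-2 always counts the hit that random tie-breaking gets only half the time.
\end{frame}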
\begin{frame}
\frametitle{Connections between compression and ML}
\begin{itemize}
-\item Autoencoders
+\item Autoencoders, dimensionality reduction
\item Compressors can generate text!
\item LLMs can do compression!
\item Both compressors and LLMs model the probability distribution of the next character given the current text
\end{itemize}
\end{frame}
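\begin{frame}
\frametitle{Why prediction $=$ compression}
The standard source-coding fact behind the previous slide: an optimal code assigns a symbol of probability $p$ a codeword of roughly $-\log_2 p$ bits, so
\[ \ell_C(x) \approx -\log_2 p(x) \iff p(x) \approx 2^{-\ell_C(x)} \]
Better prediction gives shorter codes, and code lengths can be read back as probabilities: exactly what ziplm does on the next slide.
\end{frame}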
\begin{frame}
\frametitle{ziplm: text generation using gzip}
\begin{itemize}
\item Idea: generate the character that makes the text compress best
\item Results with gzip: \texttt{theudcanvas. ;cm,zumhmcyoetter toauuo long a one aay,;wvbu.mvns. x the dtls and enso.;k.like bla.njv}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{ziplm code}
\begin{python}
import gzip
import numpy as np
import scipy.special

class ZipModel:
    # assumes __init__ (not shown on the slide) sets self.training,
    # self.vocabulary, self.compressor (e.g. the gzip module), and
    # self.conversion (scale from code length in bytes to nats)
    def logprobs(self, prefix="", temperature=1):
        # compressed length of corpus + prefix + candidate, per candidate:
        # the better v continues the text, the shorter its code
        code_lengths = np.array([
            len(self.compressor.compress("".join([self.training, prefix, v]).encode()))
            for v in self.vocabulary
        ])
        # shorter code = higher log-probability
        return scipy.special.log_softmax(-code_lengths*self.conversion*(1/temperature))

    def sample(self, prefix="", temperature=1):
        scores = self.logprobs(prefix, temperature=temperature)
        i = np.random.choice(range(len(self.vocabulary)), p=np.exp(scores))
        return self.vocabulary[i]
\end{python}
\end{frame}
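\begin{frame}[fragile]
\frametitle{Driving ziplm (sketch)}
The slide above omits the constructor; here is a minimal stand-in plus a generation loop (my assumptions, not the repo's exact API):
\begin{python}
import gzip, string
import numpy as np

class RunnableZipModel(ZipModel):
    # hypothetical constructor; the real ziplm signature may differ
    def __init__(self, vocabulary, training=""):
        self.vocabulary = vocabulary
        self.training = training
        self.compressor = gzip           # anything with .compress()
        self.conversion = 8 * np.log(2)  # bytes of code -> nats

model = RunnableZipModel(list(string.ascii_lowercase + " .,"),
                         training="the quick brown fox jumps over the lazy dog " * 20)
text = ""
for _ in range(50):
    text += model.sample(prefix=text)  # one character at a time
print(text)
\end{python}
\end{frame}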
\begin{frame}
\frametitle{Compression using LLMs}
\begin{itemize}
\item Idea: the LLM ranks candidate next tokens; store the rank of the token the text actually uses
\item Very predictable text (e.g. the digits of $\pi$) turns into a run like 11111111111111111
\item Then run that through gzip
\item Amazing compression ratio, but slow
\end{itemize}
\end{frame}
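\begin{frame}[fragile]
\frametitle{Rank coding (sketch)}
A toy version of the idea above; \texttt{model.ranked\_tokens} is a hypothetical call returning candidate tokens sorted from most to least probable:
\begin{python}
import gzip

def encode(tokens, model):
    ranks = []
    for i, tok in enumerate(tokens):
        # rank of the true token among the model's predictions
        ranks.append(model.ranked_tokens(tokens[:i]).index(tok))
    # predictable text => tiny ranks => runs that gzip squeezes well
    return gzip.compress(bytes(ranks))  # assumes every rank < 256

def decode(blob, model):
    tokens = []
    for rank in gzip.decompress(blob):
        tokens.append(model.ranked_tokens(tokens)[rank])
    return tokens
\end{python}
Decoding works because a deterministic model reproduces the same candidate lists from the already-decoded prefix.
\end{frame}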
\begin{frame}
\frametitle{References}
\begin{itemize}
\item \href{https://arxiv.org/abs/2212.09410}{Less is More: Parameter-Free Text Classification with Gzip}
\item \href{https://kenschutte.com/gzip-knn-paper/}{Bad numbers in the "gzip beats BERT" paper?}
\item \href{https://github.com/Futrell/ziplm}{ziplm GitHub}
\item \href{https://bellard.org/nncp/nncp.pdf}{Lossless Data Compression with Neural Networks}
\item \href{https://arxiv.org/abs/2309.10668}{Language Modeling Is Compression}
\end{itemize}
\end{frame}