From fd285c27f1401f80e7b9b129d3678581a98bee9d Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Thu, 25 May 2023 23:12:26 +0000 Subject: [PATCH 1/7] Begin Implementing Incremental Filter --- filters/pattern_incrementing.py | 42 +- filters/test_pattern_incrementing.py | 24 + requirements.txt | 3 +- .../kyle/taxonemy_analysis/eval_set_v2.ipynb | 1698 +++++++++++++++++ 4 files changed, 1765 insertions(+), 2 deletions(-) create mode 100644 filters/test_pattern_incrementing.py create mode 100644 working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb diff --git a/filters/pattern_incrementing.py b/filters/pattern_incrementing.py index 53711e9..648828c 100644 --- a/filters/pattern_incrementing.py +++ b/filters/pattern_incrementing.py @@ -1,2 +1,42 @@ -def incrementing_sequences_filter(text): +def incrementing_sequences_filter(text: str) -> bool: + """ + This sequence will classify a given text is an incrementing sequence or not. + + Args: + text (str): The current sequence to be classified. + + Returns: + bool: Whether the sequence is an incrementing sequence or not. + """ + # Check for incrementing sequences of only numbers + previous_entry = None + direction = None + for string_entry in text.split(): + string_entry = (string_entry[:-1] if string_entry[-1] == "." 
else string_entry) + if string_entry.isdigit(): + numerical_entry = float(string_entry) + if previous_entry is None: + previous_entry = numerical_entry + continue + elif direction is None: + direction = "positive" if numerical_entry > previous_entry else "negative" + previous_entry = numerical_entry + continue + else: + if direction == "positive": + if numerical_entry > previous_entry: + previous_entry = numerical_entry + continue + else: + return False + else: + if numerical_entry < previous_entry: + previous_entry = numerical_entry + continue + else: + return False + + # A non-numerican entry was found + return False + return True \ No newline at end of file diff --git a/filters/test_pattern_incrementing.py b/filters/test_pattern_incrementing.py new file mode 100644 index 0000000..acea6cb --- /dev/null +++ b/filters/test_pattern_incrementing.py @@ -0,0 +1,24 @@ +from .pattern_incrementing import incrementing_sequences_filter + +def test_pattern_incrementing(): + text = "128. 129. 130. 131. 132. 133." + assert incrementing_sequences_filter(text) == True + +def test_pattern_new_lines_incrementing(): + text = "128.\n129.\n130.\n131.\n132.\n133." + assert incrementing_sequences_filter(text) == True + +def test_pattern_list_incrementing(): + text = "- 128.\n- 129.\n- 130.\n- 131.\n- 132.\n- 133." 
+ assert incrementing_sequences_filter(text) == True + +def test_incrementing_nonnumerical_pattern(): + text = """![](edinbmedj75052-0047-b){#f5.123} + +![](edinbmedj75052-0049-a){#f6.125} + +![](edinbmedj75052-0049-b){#f7.125} + +![](edin""" + + assert incrementing_sequences_filter(text) == True \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4c7ccd4..805330d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pandas numpy +scikit-learn torch torchvision torchaudio @@ -9,4 +10,4 @@ datasets tqdm black pylint -scikit-learn \ No newline at end of file +pytest \ No newline at end of file diff --git a/working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb b/working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb new file mode 100644 index 0000000..e193ba4 --- /dev/null +++ b/working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb @@ -0,0 +1,1698 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexperplexitymemorizedis_codeshortened_textCategoryNote
0928833.687500TrueTrue}{-69pt}\\n \\begin{document}$u_{n}\\rightarrow u...codelatex
16858753.837891TrueFalsealesSite: All American Trannies\\n\\nFor Search ...nlNaN
29731522.884766TrueFalse18>::type T18;\\n typedef map<T0, T1, T2, T3, T...pattern-incrementingNaN
310169811.056641TrueTrue]{minimal}\\n \\usepackage{amsmath}\\n \\usepackag...codelatex
410893713.882812TrueTrue: 1,\\n\",\\n \"'col-md-push-6' : 1,\\n\",\\n \"'col-...pattern-incrementingNaN
........................
249525813513.392578FalseTrue2*y**2 + 6*y. Let z(g) = -3*g**2 - 7*g - 7. Le...code+nlmath
249625835343.597656FalseFalse039 ### ###',\\n '049 ### ###',\\n '050 ### ###'...pattern-incrementingNaN
249725846953.710938FalseTrue.1, -1?\\n-1\\nWhat is the second biggest value ...code+nlmath
249825861703.080078FalseTruepublic DbUpdateException()\\n {\\n }\\n\\n /// <su...codeNaN
249925930682.578125FalseTrueCLANG_WARN_BOOL_CONVERSION = YES;\\n CLANG_WARN...codeNaN
\n", + "

2500 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " index perplexity memorized is_code \n", + "0 92883 3.687500 True True \\\n", + "1 685875 3.837891 True False \n", + "2 973152 2.884766 True False \n", + "3 1016981 1.056641 True True \n", + "4 1089371 3.882812 True True \n", + "... ... ... ... ... \n", + "2495 2581351 3.392578 False True \n", + "2496 2583534 3.597656 False False \n", + "2497 2584695 3.710938 False True \n", + "2498 2586170 3.080078 False True \n", + "2499 2593068 2.578125 False True \n", + "\n", + " shortened_text Category \n", + "0 }{-69pt}\\n \\begin{document}$u_{n}\\rightarrow u... code \\\n", + "1 alesSite: All American Trannies\\n\\nFor Search ... nl \n", + "2 18>::type T18;\\n typedef map\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextokens__index_level_0__
0441[5584, 4196, 1228, 187, 1036, 4, 209, 21723, 2...441
1447[50262, 61, 2099, 92, 8861, 94, 187, 50262, 61...447
2792[475, 50272, 953, 24781, 778, 320, 908, 281, 1...792
31539[424, 380, 16101, 313, 433, 17889, 3104, 10, 2...1539
41705[3498, 2262, 2369, 40, 736, 13, 3956, 27, 21, ...1705
............
411443146431199[281, 320, 669, 8604, 60, 805, 431, 1019, 8402...2287199
411444146431278[588, 1705, 285, 8415, 634, 1895, 15, 30952, 3...2287278
411445146431294[15468, 13, 50275, 13743, 13, 50275, 15220, 13...2287294
411446146431588[27, 330, 14788, 10334, 14, 3429, 27, 577, 28,...2287588
411447146431592[1406, 485, 15, 23780, 300, 2473, 285, 12698, ...2287592
\n", + "

411448 rows × 3 columns

\n", + "" + ], + "text/plain": [ + " index tokens \n", + "0 441 [5584, 4196, 1228, 187, 1036, 4, 209, 21723, 2... \\\n", + "1 447 [50262, 61, 2099, 92, 8861, 94, 187, 50262, 61... \n", + "2 792 [475, 50272, 953, 24781, 778, 320, 908, 281, 1... \n", + "3 1539 [424, 380, 16101, 313, 433, 17889, 3104, 10, 2... \n", + "4 1705 [3498, 2262, 2369, 40, 736, 13, 3956, 27, 21, ... \n", + "... ... ... \n", + "411443 146431199 [281, 320, 669, 8604, 60, 805, 431, 1019, 8402... \n", + "411444 146431278 [588, 1705, 285, 8415, 634, 1895, 15, 30952, 3... \n", + "411445 146431294 [15468, 13, 50275, 13743, 13, 50275, 15220, 13... \n", + "411446 146431588 [27, 330, 14788, 10334, 14, 3429, 27, 577, 28,... \n", + "411447 146431592 [1406, 485, 15, 23780, 300, 2473, 285, 12698, ... \n", + "\n", + " __index_level_0__ \n", + "0 441 \n", + "1 447 \n", + "2 792 \n", + "3 1539 \n", + "4 1705 \n", + "... ... \n", + "411443 2287199 \n", + "411444 2287278 \n", + "411445 2287294 \n", + "411446 2287588 \n", + "411447 2287592 \n", + "\n", + "[411448 rows x 3 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythia_70m_memories = load_dataset(\"EleutherAI/pythia-memorized-evals\", split=\"deduped.70m\").to_pandas()\n", + "pythia_70m_memories" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexperplexitymemorizedis_codeshortened_textCategoryNotetokens__index_level_0__
0928833.687500TrueTrue}{-69pt}\\n \\begin{document}$u_{n}\\rightarrow u...codelatex[8699, 2090, 431, 94, 187, 50262, 61, 2043, 92...92883
16858753.837891TrueFalsealesSite: All American Trannies\\n\\nFor Search ...nlNaN[2339, 27327, 27, 1876, 2448, 1535, 1136, 447,...685875
29731522.884766TrueFalse18>::type T18;\\n typedef map<T0, T1, T2, T3, T...pattern-incrementingNaN[1093, 14157, 881, 308, 1093, 28, 187, 50266, ...973152
310169811.056641TrueTrue]{minimal}\\n \\usepackage{amsmath}\\n \\usepackag...codelatex[1019, 8402, 94, 187, 50262, 61, 2099, 92, 879...1016981
410893713.882812TrueTrue: 1,\\n\",\\n \"'col-md-push-6' : 1,\\n\",\\n \"'col-...pattern-incrementingNaN[8, 1163, 337, 1337, 79, 995, 187, 50274, 2789...1089371
..............................
248625558323.667969FalseTruedip)</pre>\\n</li>\\n</ul>\\n<a name=\"cornerRadiu...codeNaN[31665, 17266, 3456, 31, 187, 870, 965, 31, 18...267832
248825561542.320312FalseFalseLEASE COME TO MEXICO CITY PLEASE COME TO MEXIC...pattern-repeatingNaN[26084, 8610, 38, 5935, 353, 4237, 24218, 4589...268154
249125606554.386719FalseTrueNL_WABMON_4 = 131141\\n X_NL_WABMON_5 = 131142\\...pattern-incrementingNaN[19214, 64, 56, 2925, 22362, 64, 21, 50276, 30...272655
249325708524.101562FalseFalseWITH OCESAPLEASE COME MEXICO CITY WITH OCESAPL...pattern-repeatingNaN[9277, 27202, 1410, 2088, 26084, 8610, 38, 353...282852
249625835343.597656FalseFalse039 ### ###',\\n '049 ### ###',\\n '050 ### ###'...pattern-incrementingNaN[18832, 209, 4118, 209, 4118, 1383, 187, 50270...295534
\n", + "

1430 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " index perplexity memorized is_code \n", + "0 92883 3.687500 True True \\\n", + "1 685875 3.837891 True False \n", + "2 973152 2.884766 True False \n", + "3 1016981 1.056641 True True \n", + "4 1089371 3.882812 True True \n", + "... ... ... ... ... \n", + "2486 2555832 3.667969 False True \n", + "2488 2556154 2.320312 False False \n", + "2491 2560655 4.386719 False True \n", + "2493 2570852 4.101562 False False \n", + "2496 2583534 3.597656 False False \n", + "\n", + " shortened_text Category \n", + "0 }{-69pt}\\n \\begin{document}$u_{n}\\rightarrow u... code \\\n", + "1 alesSite: All American Trannies\\n\\nFor Search ... nl \n", + "2 18>::type T18;\\n typedef map\\n\\n\\n\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextokensis_memorized
018[15, 46525, 3439, 2526, 187, 14, 17, 15, 1036,...False
143[273, 22523, 18595, 275, 643, 3054, 2085, 3081...False
286[749, 10580, 273, 575, 5, 44, 64, 79, 5, 534, ...False
3110[12556, 187, 71, 437, 285, 45965, 13, 285, 253...False
4112[3847, 277, 2631, 449, 346, 1552, 310, 417, 82...False
............
4999995146431872[3117, 393, 6040, 416, 393, 5786, 393, 50, 5, ...False
4999996146431904[187, 6067, 1783, 2722, 326, 14108, 1638, 3400...False
4999997146431927[704, 39660, 1051, 187, 29, 56, 2711, 8537, 37...False
4999998146431960[14, 34552, 15390, 1253, 15280, 285, 1108, 447...False
4999999146431973[38630, 14716, 247, 15846, 8651, 5763, 15, 831...False
\n", + "

5000000 rows × 3 columns

\n", + "" + ], + "text/plain": [ + " index tokens \n", + "0 18 [15, 46525, 3439, 2526, 187, 14, 17, 15, 1036,... \\\n", + "1 43 [273, 22523, 18595, 275, 643, 3054, 2085, 3081... \n", + "2 86 [749, 10580, 273, 575, 5, 44, 64, 79, 5, 534, ... \n", + "3 110 [12556, 187, 71, 437, 285, 45965, 13, 285, 253... \n", + "4 112 [3847, 277, 2631, 449, 346, 1552, 310, 417, 82... \n", + "... ... ... \n", + "4999995 146431872 [3117, 393, 6040, 416, 393, 5786, 393, 50, 5, ... \n", + "4999996 146431904 [187, 6067, 1783, 2722, 326, 14108, 1638, 3400... \n", + "4999997 146431927 [704, 39660, 1051, 187, 29, 56, 2711, 8537, 37... \n", + "4999998 146431960 [14, 34552, 15390, 1253, 15280, 285, 1108, 447... \n", + "4999999 146431973 [38630, 14716, 247, 15846, 8651, 5763, 15, 831... \n", + "\n", + " is_memorized \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + "... ... \n", + "4999995 False \n", + "4999996 False \n", + "4999997 False \n", + "4999998 False \n", + "4999999 False \n", + "\n", + "[5000000 rows x 3 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_memories = load_dataset(\"EleutherAI/pile-deduped-pythia-random-sampled\")[\"train\"].to_pandas()\n", + "non_memories" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexperplexitymemorizedis_codeshortened_textCategoryNotetokensis_memorized
1219992302.210938TrueFalseArmenians\",0,\"\",0,\"\",0,0,0,0,0,0,0,0,0,0,0,0,0...codeNaN[37801, 2458, 995, 17, 937, 995, 17, 937, 995,...False
218419992302.210938FalseFalseArmenians\",0,\"\",0,\"\",0,0,0,0,0,0,0,0,0,0,0,0,0...pattern-repeatingNaN[37801, 2458, 995, 17, 937, 995, 17, 937, 995,...False
1728149764.007812TrueTrue/brand-5\\nhttps://m.52010000.cn/brand-6\\nhttps...codeNaN[16, 22374, 14, 22, 187, 3614, 1358, 78, 15, 2...False
2636162182.611328TrueTrueCA5 },\\n { 0x10CE6, 0x10CA6 },\\n { 0x10CE7, 0x...pattern-incrementingNaN[4280, 22, 3572, 187, 50274, 92, 470, 89, 740,...False
8996572332.617188TrueTrue#ERROR!codeNaN[568, 2437, 275, 389, 15, 29762, 15, 26318, 15...False
..............................
249525813513.392578FalseTrue2*y**2 + 6*y. Let z(g) = -3*g**2 - 7*g - 7. Le...code+nlmath[374, 11, 90, 424, 19, 559, 721, 11, 90, 15, 1...False
249625835343.597656FalseFalse039 ### ###',\\n '049 ### ###',\\n '050 ### ###'...pattern-incrementingNaN[18832, 209, 4118, 209, 4118, 1383, 187, 50270...False
249725846953.710938FalseTrue.1, -1?\\n-1\\nWhat is the second biggest value ...code+nlmath[15, 18, 13, 428, 18, 32, 187, 14, 18, 187, 12...False
249825861703.080078FalseTruepublic DbUpdateException()\\n {\\n }\\n\\n /// <su...codeNaN[187, 50270, 4387, 46688, 11241, 5330, 1082, 1...False
249925930682.578125FalseTrueCLANG_WARN_BOOL_CONVERSION = YES;\\n CLANG_WARN...codeNaN[3207, 14375, 64, 24798, 64, 30529, 64, 5707, ...False
\n", + "

1298 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " index perplexity memorized is_code \n", + "12 1999230 2.210938 True False \\\n", + "2184 1999230 2.210938 False False \n", + "17 2814976 4.007812 True True \n", + "26 3616218 2.611328 True True \n", + "89 9657233 2.617188 True True \n", + "... ... ... ... ... \n", + "2495 2581351 3.392578 False True \n", + "2496 2583534 3.597656 False False \n", + "2497 2584695 3.710938 False True \n", + "2498 2586170 3.080078 False True \n", + "2499 2593068 2.578125 False True \n", + "\n", + " shortened_text Category \n", + "12 Armenians\",0,\"\",0,\"\",0,0,0,0,0,0,0,0,0,0,0,0,0... code \\\n", + "2184 Armenians\",0,\"\",0,\"\",0,0,0,0,0,0,0,0,0,0,0,0,0... pattern-repeating \n", + "17 /brand-5\\nhttps://m.52010000.cn/brand-6\\nhttps... code \n", + "26 CA5 },\\n { 0x10CE6, 0x10CA6 },\\n { 0x10CE7, 0x... pattern-incrementing \n", + "89 #ERROR! code \n", + "... ... ... \n", + "2495 2*y**2 + 6*y. Let z(g) = -3*g**2 - 7*g - 7. Le... code+nl \n", + "2496 039 ### ###',\\n '049 ### ###',\\n '050 ### ###'... pattern-incrementing \n", + "2497 .1, -1?\\n-1\\nWhat is the second biggest value ... 
code+nl \n", + "2498 public DbUpdateException()\\n {\\n }\\n\\n /// \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexmemorizedperplexityis_codepromptsequenceCodeIncrementalRepetitiveHighly DuplicatedTemplatingNatural LanguageRandomOtherNotes
17201042625False4.777344True4\\nLet f be ((-1)/2)/(2/4). Let a = 6 - 5. Let...4\\nLet f be ((-1)/2)/(2/4). Let a = 6 - 5. Let...
1412378406False4.007812True2/2 + 3*r - 3. Let g(x) be the first derivativ...2/2 + 3*r - 3. Let g(x) be the first derivativ...
17941188414False1.437500Trueusepackage{amsmath}\\n\\usepackage{wasysym} \\n\\u...usepackage{amsmath}\\n\\usepackage{wasysym} \\n\\u...
19851572476False3.982422Truev - 2*v + 6 = 0. Let k be (3/v)/(2/(-8)). Let ...v - 2*v + 6 = 0. Let k be (3/v)/(2/(-8)). Let ...
51060937406True2.820312True\\n\\n![](amjdentsci80652-0039){#sp2.143}\\n\\n![]...\\n\\n![](amjdentsci80652-0039){#sp2.143}\\n\\n![]...
................................................
1055121637700True1.520508True=\"1.0\" encoding=\"UTF-8\"?>\\n<!DOCTYPE plist PUB...=\"1.0\" encoding=\"UTF-8\"?>\\n<!DOCTYPE plist PUB...
717827439True3.263672Trueper Team is already on the scene....<?xml vers...per Team is already on the scene....<?xml vers...
20111644096False3.005859False11.9 ± 2.0 11.5 ± 2.0 11.2 ± 2.2 \\<11.9 ± 2.0 11.5 ± 2.0 11.2 ± 2.2 \\...
1028118099130True2.289062Falseref 8, ref 9, ref 10, ref 11, ref 12, ref 13,...ref 8, ref 9, ref 10, ref 11, ref 12, ref 13,...
1026118045648True2.908203TrueISA as two detectors, so that the signal in ea...ISA as two detectors, so that the signal in ea...
\n", + "

2499 rows × 15 columns

\n", + "" + ], + "text/plain": [ + " index memorized perplexity is_code \n", + "1720 1042625 False 4.777344 True \\\n", + "1412 378406 False 4.007812 True \n", + "1794 1188414 False 1.437500 True \n", + "1985 1572476 False 3.982422 True \n", + "510 60937406 True 2.820312 True \n", + "... ... ... ... ... \n", + "1055 121637700 True 1.520508 True \n", + "71 7827439 True 3.263672 True \n", + "2011 1644096 False 3.005859 False \n", + "1028 118099130 True 2.289062 False \n", + "1026 118045648 True 2.908203 True \n", + "\n", + " prompt \n", + "1720 4\\nLet f be ((-1)/2)/(2/4). Let a = 6 - 5. Let... \\\n", + "1412 2/2 + 3*r - 3. Let g(x) be the first derivativ... \n", + "1794 usepackage{amsmath}\\n\\usepackage{wasysym} \\n\\u... \n", + "1985 v - 2*v + 6 = 0. Let k be (3/v)/(2/(-8)). Let ... \n", + "510 \\n\\n![](amjdentsci80652-0039){#sp2.143}\\n\\n![]... \n", + "... ... \n", + "1055 =\"1.0\" encoding=\"UTF-8\"?>\\n\\n\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexmemorizedperplexityis_codepromptsequenceCodeIncrementalRepetitiveHighly DuplicatedTemplatingNatural LanguageRandomOtherNotes
1497576734False3.447266True(-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte...(-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte...
38945993218True1.752930Truenot use this file except in compliance with t...not use this file except in compliance with t...
1095126795714True4.429688True(c) 2019 Wei Wang <onevcat@gmail.com>\\n//\\n//...(c) 2019 Wei Wang <onevcat@gmail.com>\\n//\\n//...
49658907866True1.357422Falsehttp://www.apache.org/licenses/LICENSE-2.0\\n\\n...http://www.apache.org/licenses/LICENSE-2.0\\n\\n...
1647911620False3.562500True=\"table-fn\"} ...=\"table-fn\"} ...
................................................
20831786406False1.972656True\\n \\usepackage{amssymb} \\n ...\\n \\usepackage{amssymb} \\n ...
1202139649808True1.101562True$\\documentclass[12pt]{minimal}\\n ...$\\documentclass[12pt]{minimal}\\n ...
42850719780True1.749023Truein compliance with the License.\\n// You may o...in compliance with the License.\\n// You may o...
20201659069False3.974609False?\\n3\\nWhat is the ninth root of 113001 to the ...?\\n3\\nWhat is the ninth root of 113001 to the ...
50660582215True1.504883Truegood judgment.//\\n// Generated by class-d...good judgment.//\\n// Generated by class-d...
\n", + "

100 rows × 15 columns

\n", + "" + ], + "text/plain": [ + " index memorized perplexity is_code \n", + "1497 576734 False 3.447266 True \\\n", + "389 45993218 True 1.752930 True \n", + "1095 126795714 True 4.429688 True \n", + "496 58907866 True 1.357422 False \n", + "1647 911620 False 3.562500 True \n", + "... ... ... ... ... \n", + "2083 1786406 False 1.972656 True \n", + "1202 139649808 True 1.101562 True \n", + "428 50719780 True 1.749023 True \n", + "2020 1659069 False 3.974609 False \n", + "506 60582215 True 1.504883 True \n", + "\n", + " prompt \n", + "1497 (-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte... \\\n", + "389 not use this file except in compliance with t... \n", + "1095 (c) 2019 Wei Wang \\n//\\n//... \n", + "496 http://www.apache.org/licenses/LICENSE-2.0\\n\\n... \n", + "1647 =\"table-fn\"} ... \n", + "... ... \n", + "2083 \\n \\usepackage{amssymb} \\n ... \n", + "1202 $\\documentclass[12pt]{minimal}\\n ... \n", + "428 in compliance with the License.\\n// You may o... \n", + "2020 ?\\n3\\nWhat is the ninth root of 113001 to the ... \n", + "506 good judgment.//\\n// Generated by class-d... \n", + "\n", + " sequence Code Incremental \n", + "1497 (-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte... \\\n", + "389 not use this file except in compliance with t... \n", + "1095 (c) 2019 Wei Wang \\n//\\n//... \n", + "496 http://www.apache.org/licenses/LICENSE-2.0\\n\\n... \n", + "1647 =\"table-fn\"} ... \n", + "... ... ... ... \n", + "2083 \\n \\usepackage{amssymb} \\n ... \n", + "1202 $\\documentclass[12pt]{minimal}\\n ... \n", + "428 in compliance with the License.\\n// You may o... \n", + "2020 ?\\n3\\nWhat is the ninth root of 113001 to the ... \n", + "506 good judgment.//\\n// Generated by class-d... \n", + "\n", + " Repetitive Highly Duplicated Templating Natural Language Random Other \n", + "1497 \\\n", + "389 \n", + "1095 \n", + "496 \n", + "1647 \n", + "... ... ... ... ... ... ... 
\n", + "2083 \n", + "1202 \n", + "428 \n", + "2020 \n", + "506 \n", + "\n", + " Notes \n", + "1497 \n", + "389 \n", + "1095 \n", + "496 \n", + "1647 \n", + "... ... \n", + "2083 \n", + "1202 \n", + "428 \n", + "2020 \n", + "506 \n", + "\n", + "[100 rows x 15 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_strat_downsample = pd.concat(\n", + " [combined_joined_eval_set[combined_joined_eval_set[\"memorized\"]].sample(50),\n", + " combined_joined_eval_set[~combined_joined_eval_set[\"memorized\"]].sample(50)]).sample(frac=1)\n", + "\n", + "combined_strat_downsample.to_csv(\"combined_strat_downsample.csv\", index=False)\n", + "combined_strat_downsample" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "memorization", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0aaa9829cd549f7d228f3d794968c7b378526ca1 Mon Sep 17 00:00:00 2001 From: Alvin Deng Date: Sun, 28 May 2023 19:44:18 -0700 Subject: [PATCH 2/7] add filters --- filters/highly_duplicated_filter.py | 57 ++++++++++++++++++++++++ filters/test_highly_duplicated_filter.py | 30 +++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 filters/highly_duplicated_filter.py create mode 100644 filters/test_highly_duplicated_filter.py diff --git a/filters/highly_duplicated_filter.py b/filters/highly_duplicated_filter.py new file mode 100644 index 0000000..677933d --- /dev/null +++ b/filters/highly_duplicated_filter.py @@ -0,0 +1,57 @@ +from collections import Counter +from typing import Callable, List + +import pandas as pd + +def _concat_token_indices(token_indices: List[int], delimiter: str = 
def _concat_token_indices(token_indices: List[int], delimiter: str = "_") -> str:
    """Concatenate a list of token indices into a single delimited string.

    Args:
        token_indices (List[int]): Token indices to concatenate.
        delimiter (str, optional): Separator placed between indices. Defaults to "_".

    Returns:
        str: The indices joined into one string, e.g. [1, 2, 3] -> "1_2_3".
    """
    return delimiter.join(str(index) for index in token_indices)


def generate_sequence_histogram(token_indices: pd.Series, delimiter: str = "_") -> "Counter[str]":
    """Build a frequency histogram of token-index sequences.

    Each sequence is flattened to a delimited string so that identical
    sequences map to the same histogram key.

    Args:
        token_indices (pd.Series): Series whose elements are lists of token indices.
        delimiter (str, optional): Separator used when flattening. Defaults to "_".

    Returns:
        Counter[str]: Flattened sequence string -> occurrence count.
            Note: ``Counter`` is generic over its *key* type only, so the
            correct annotation is ``Counter[str]`` (values are always int);
            ``Counter[str, int]`` is not a valid parameterization.
    """
    return Counter(
        token_indices.apply(lambda sequence: _concat_token_indices(sequence, delimiter=delimiter))
    )


def get_highly_duplicated_filter_func(
    histogram: "Counter[str]", frequency_threshold: int = 1, delimiter: str = "_"
) -> Callable[[List[int]], bool]:
    """Create a filter that flags token-index sequences seen more than a threshold.

    Args:
        histogram (Counter[str]): Histogram produced by ``generate_sequence_histogram``.
            The same ``delimiter`` must be used here and when the histogram was built,
            or lookups will silently miss.
        frequency_threshold (int, optional): A sequence is "highly duplicated" when its
            count is strictly greater than this value. Defaults to 1.
        delimiter (str, optional): Separator used when flattening. Defaults to "_".

    Returns:
        Callable[[List[int]], bool]: Predicate returning True for highly
        duplicated sequences. Unseen sequences have count 0 and return False.
    """

    def _highly_duplicated_filter_func(token_indices: List[int]) -> bool:
        """Return True when this exact sequence exceeds the frequency threshold."""
        token_string = _concat_token_indices(token_indices, delimiter=delimiter)
        # Counter returns 0 (not KeyError) for unseen keys, so no guard is needed.
        return histogram[token_string] > frequency_threshold

    return _highly_duplicated_filter_func
pylint -scikit-learn \ No newline at end of file +pytest \ No newline at end of file From cb2d6f9505aff4fdba1166ba98622db5d7516adc Mon Sep 17 00:00:00 2001 From: Alvin Deng Date: Sun, 28 May 2023 19:52:28 -0700 Subject: [PATCH 4/7] comments --- filters/highly_duplicated_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/filters/highly_duplicated_filter.py b/filters/highly_duplicated_filter.py index 677933d..812054b 100644 --- a/filters/highly_duplicated_filter.py +++ b/filters/highly_duplicated_filter.py @@ -31,7 +31,7 @@ def generate_token_string_histogram(token_series: pd.Series, delimiter: str = '_ def get_highly_duplicated_filter_func(histogram: Counter, frequency_threshold: int = 1, delimiter: str = '_') -> Callable[[List[int]], bool]: """ - Generates a filter function that checks if a list of token indices is highly duplicated based on a threshold. + Generates a filter function that checks if a list of token indices is highly duplicated. Args: histogram (Counter): Histogram of strings of token indices. @@ -39,7 +39,7 @@ def get_highly_duplicated_filter_func(histogram: Counter, frequency_threshold: i delimiter (str, optional): Delimiter to use for concatenation. Defaults to '_'. Returns: - Callable[[List[int]], bool]: Filter function that checks if a list of token indices is highly duplicated based on a threshold. + Callable[[List[int]], bool]: Filter function that checks if a list of token indices is highly duplicated. 
""" def _highly_duplicated_filter_func(token_indices: List[int]) -> bool: """ From f7f9e94f9005920fe9aef1aa7ddf64700a31fb92 Mon Sep 17 00:00:00 2001 From: Alvin Deng Date: Sun, 28 May 2023 19:58:20 -0700 Subject: [PATCH 5/7] update func names --- filters/highly_duplicated_filter.py | 12 ++++++------ filters/test_highly_duplicated_filter.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/filters/highly_duplicated_filter.py b/filters/highly_duplicated_filter.py index 812054b..27d899e 100644 --- a/filters/highly_duplicated_filter.py +++ b/filters/highly_duplicated_filter.py @@ -16,25 +16,25 @@ def _concat_token_indices(token_indices: List[int], delimiter: str = '_') -> str """ return delimiter.join([str(t) for t in token_indices]) -def generate_token_string_histogram(token_series: pd.Series, delimiter: str = '_') -> Counter: +def generate_sequence_histogram(token_indices: pd.Series, delimiter: str = '_') -> Counter[str, int]: """ Generates a histogram from a Pandas Series of token indices. The histogram is based on the concatenated strings of token indices. Args: - token_series (pd.Series): Series of token indices. + token_index_sequences (pd.Series): Pandas Series of token indices. delimiter (str, optional): Delimiter to use for concatenation. Defaults to '_'. Returns: - Counter: Histogram of strings of token indices. + Counter[str, int]: Histogram of strings of token indices. 
""" - return Counter(token_series.apply(lambda x: _concat_token_indices(x, delimiter=delimiter))) + return Counter(token_indices.apply(lambda x: _concat_token_indices(x, delimiter=delimiter))) -def get_highly_duplicated_filter_func(histogram: Counter, frequency_threshold: int = 1, delimiter: str = '_') -> Callable[[List[int]], bool]: +def get_highly_duplicated_filter_func(histogram: Counter[str, int], frequency_threshold: int = 1, delimiter: str = '_') -> Callable[[List[int]], bool]: """ Generates a filter function that checks if a list of token indices is highly duplicated. Args: - histogram (Counter): Histogram of strings of token indices. + histogram (Counter[str, int]): Histogram of strings of token indices. frequency_threshold (int, optional): Frequency threshold to use for filtering. Defaults to 1. delimiter (str, optional): Delimiter to use for concatenation. Defaults to '_'. diff --git a/filters/test_highly_duplicated_filter.py b/filters/test_highly_duplicated_filter.py index 6dd3b44..3358020 100644 --- a/filters/test_highly_duplicated_filter.py +++ b/filters/test_highly_duplicated_filter.py @@ -1,10 +1,10 @@ import pandas as pd -from .highly_duplicated_filter import get_highly_duplicated_filter_func, generate_token_string_histogram +from .highly_duplicated_filter import get_highly_duplicated_filter_func, generate_sequence_histogram def test_highly_duplicated_filter_on_seen_indices(): data = pd.Series([[1, 2, 3], [4, 5, 6], [4, 5, 6]]) - histogram = generate_token_string_histogram(data) + histogram = generate_sequence_histogram(data) threshold = 1 filter_func = get_highly_duplicated_filter_func(histogram, frequency_threshold=threshold) @@ -13,7 +13,7 @@ def test_highly_duplicated_filter_on_seen_indices(): def test_highly_duplicated_filter_on_unseen_indices(): data = pd.Series([[1, 2, 3], [4, 5, 6], [4, 5, 6]]) - histogram = generate_token_string_histogram(data) + histogram = generate_sequence_histogram(data) threshold = 1 filter_func = 
get_highly_duplicated_filter_func(histogram, frequency_threshold=threshold) @@ -22,7 +22,7 @@ def test_highly_duplicated_filter_on_unseen_indices(): def test_highly_duplicated_filter_on_infrequent_indices(): data = pd.Series([[1, 2, 3], [4, 5, 6], [4, 5, 6]]) - histogram = generate_token_string_histogram(data) + histogram = generate_sequence_histogram(data) threshold = 2 filter_func = get_highly_duplicated_filter_func(histogram, frequency_threshold=threshold) From e790f9a6c1fcce210a334946469277e248c842c4 Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Thu, 1 Jun 2023 20:43:03 +0000 Subject: [PATCH 6/7] Add additonal logic to evaluate --- evaluate.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/evaluate.py b/evaluate.py index bace801..66dd2ae 100644 --- a/evaluate.py +++ b/evaluate.py @@ -35,7 +35,7 @@ def log_mistakes_report(mistakes: pd.DataFrame, category: str, eval_timestamp: s mistakes.to_csv(f"{eval_directory}/mistakes_{eval_timestamp}_{category}.csv", index=False) -def evaluate_filter(category: str, filter_function: function, dataset: pd.DataFrame, eval_timestamp: str) -> dict: +def evaluate_filter(category: str, filter_function, dataset: pd.DataFrame, eval_timestamp: str) -> dict: """ Evaluate the classification performance of the provided filter @@ -48,7 +48,14 @@ def evaluate_filter(category: str, filter_function: function, dataset: pd.DataFr Returns: dict: The classification report of the filter """ - filter_judgments = dataset["shortened_text"].progress_apply(filter_function) + filter_judgments = [] + for i in tqdm(range(len(dataset))): + try: + filter_judgments.append(filter_function(dataset["shortened_text"][i])) + except: + filter_judgments.append(-1) + + # filter_judgments = dataset["shortened_text"].progress_apply(filter_function) filter_labels = dataset["Category"].progress_apply(lambda c: c == category) report_dict = classification_report(filter_labels, filter_judgments, output_dict=True) evaluation_log = { @@ 
-75,7 +82,7 @@ def evaluate(filters: dict): Args: filters (dict): The filters to evaluate. The key is the name of the category and value is the filter function. """ - dataset = pd.read_csv("datasets/eval/Pythia_70m_Deduped_Low_Perplexity_Labeling_Formatted.csv") + dataset = pd.read_csv("datasets/eval/Pythia_70m_Deduped_Low_Perplexity_Labeling_Formatted") eval_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") eval_results = [] for category, filter_function in filters.items(): From 6191d5edef2a66fdc7fb22ee8a00fe5932a578b2 Mon Sep 17 00:00:00 2001 From: Kyle1668 Date: Tue, 20 Jun 2023 17:59:54 +0000 Subject: [PATCH 7/7] Checkpoint --- filters/pattern_incrementing.py | 77 ++++++++++++++++++---------- filters/test_pattern_incrementing.py | 28 ++++++++-- 2 files changed, 75 insertions(+), 30 deletions(-) diff --git a/filters/pattern_incrementing.py b/filters/pattern_incrementing.py index 648828c..4fba1c6 100644 --- a/filters/pattern_incrementing.py +++ b/filters/pattern_incrementing.py @@ -1,3 +1,5 @@ +import re + def incrementing_sequences_filter(text: str) -> bool: """ This sequence will classify a given text is an incrementing sequence or not. @@ -8,35 +10,58 @@ def incrementing_sequences_filter(text: str) -> bool: Returns: bool: Whether the sequence is an incrementing sequence or not. """ - # Check for incrementing sequences of only numbers - previous_entry = None - direction = None - for string_entry in text.split(): - string_entry = (string_entry[:-1] if string_entry[-1] == "." 
def incrementing_sequences_filter(text: str) -> bool:
    """Classify whether ``text`` is a monotonically incrementing (or decrementing) sequence.

    Handles bare digit runs ("123456789"), numbered lists separated by
    newlines, commas, or whitespace ("128. 129. 130."), decimal-style items
    ("12.8. 12.9."), mixed alphanumeric outlines ("A.1 , A.2 , B.1"), and
    repeated templates whose embedded numbers advance (markdown figure refs).

    Fixes over the previous version:
    - ``split_text[-1][:-len(trailing_seperator)]`` erased the final item
      whenever the trailing separator was empty (``[:-0]`` == ``[:0]``).
    - Multi-item splits were never actually checked for monotonicity; the
      loop fell through and returned True for arbitrary separated text.
    - Items are now compared as (word, number) token tuples instead of
      digit-by-digit, so multi-digit steps compare correctly.

    Args:
        text (str): The sequence to classify.

    Returns:
        bool: True when the text forms a strictly increasing or strictly
        decreasing pattern; False otherwise (including empty/number-free text).
    """
    stripped = text.strip()
    if not stripped:
        return False

    # A bare run of digits: treat each digit as one entry, either direction.
    if stripped.isdigit():
        return _is_strictly_monotonic([int(char) for char in stripped])

    # Prefer one item per line; fall back to comma- then whitespace-separated
    # items. Fragments without any digit (blank lines, truncated tails such as
    # "![](edin") carry no sequence information and are skipped.
    items = [line for line in stripped.splitlines() if re.search(r"\d", line)]
    if len(items) < 2:
        for separator in (",", None):  # None -> split on any whitespace
            candidates = [part for part in stripped.split(separator) if re.search(r"\d", part)]
            if len(candidates) >= 2:
                items = candidates
                break
        else:
            # Fewer than two items under every splitting strategy: not a sequence.
            return False

    keys = [_sequence_key(item) for item in items]

    # Every item must decompose into the same shape of word/number tokens so
    # that the lexicographic comparison below is always int-int or str-str.
    first_shape = [type(token) for token in keys[0]]
    if any([type(token) for token in key] != first_shape for key in keys[1:]):
        return False

    return _is_strictly_monotonic(keys)


def _sequence_key(item: str) -> tuple:
    """Tokenize one item into a comparable tuple of words and integers.

    Digit runs become ints (so "130" > "29" numerically); letter runs stay as
    strings; punctuation and whitespace are dropped. "A.12" -> ("A", 12).
    """
    return tuple(
        int(token) if token.isdigit() else token
        for token in re.findall(r"[A-Za-z]+|\d+", item)
    )


def _is_strictly_monotonic(values) -> bool:
    """Return True when values strictly increase or strictly decrease throughout."""
    pairs = list(zip(values, values[1:]))
    return all(a < b for a, b in pairs) or all(a > b for a, b in pairs)