diff --git a/evaluate.py b/evaluate.py index bace801..66dd2ae 100644 --- a/evaluate.py +++ b/evaluate.py @@ -35,7 +35,7 @@ def log_mistakes_report(mistakes: pd.DataFrame, category: str, eval_timestamp: s mistakes.to_csv(f"{eval_directory}/mistakes_{eval_timestamp}_{category}.csv", index=False) -def evaluate_filter(category: str, filter_function: function, dataset: pd.DataFrame, eval_timestamp: str) -> dict: +def evaluate_filter(category: str, filter_function, dataset: pd.DataFrame, eval_timestamp: str) -> dict: """ Evaluate the classification performance of the provided filter @@ -48,7 +48,14 @@ def evaluate_filter(category: str, filter_function: function, dataset: pd.DataFr Returns: dict: The classification report of the filter """ - filter_judgments = dataset["shortened_text"].progress_apply(filter_function) + filter_judgments = [] + for i in tqdm(range(len(dataset))): + try: + filter_judgments.append(filter_function(dataset["shortened_text"][i])) + except: + filter_judgments.append(-1) + + # filter_judgments = dataset["shortened_text"].progress_apply(filter_function) filter_labels = dataset["Category"].progress_apply(lambda c: c == category) report_dict = classification_report(filter_labels, filter_judgments, output_dict=True) evaluation_log = { @@ -75,7 +82,7 @@ def evaluate(filters: dict): Args: filters (dict): The filters to evaluate. The key is the name of the category and value is the filter function. 
""" - dataset = pd.read_csv("datasets/eval/Pythia_70m_Deduped_Low_Perplexity_Labeling_Formatted.csv") + dataset = pd.read_csv("datasets/eval/Pythia_70m_Deduped_Low_Perplexity_Labeling_Formatted") eval_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") eval_results = [] for category, filter_function in filters.items(): diff --git a/filters/highly_duplicated_filter.py b/filters/highly_duplicated_filter.py new file mode 100644 index 0000000..27d899e --- /dev/null +++ b/filters/highly_duplicated_filter.py @@ -0,0 +1,57 @@ +from collections import Counter +from typing import Callable, List + +import pandas as pd + +def _concat_token_indices(token_indices: List[int], delimiter: str = '_') -> str: + """ + Concatenates a list of tokens into a single string. + + Args: + token_indices (List[int]): List of token indices to concatenate. + delimiter (str, optional): Delimiter to use for concatenation. Defaults to '_'. + + Returns: + str: Concatenated string of tokens indices. + """ + return delimiter.join([str(t) for t in token_indices]) + +def generate_sequence_histogram(token_indices: pd.Series, delimiter: str = '_') -> Counter[str, int]: + """ + Generates a histogram from a Pandas Series of token indices. The histogram is based on the concatenated strings of token indices. + + Args: + token_index_sequences (pd.Series): Pandas Series of token indices. + delimiter (str, optional): Delimiter to use for concatenation. Defaults to '_'. + + Returns: + Counter[str, int]: Histogram of strings of token indices. + """ + return Counter(token_indices.apply(lambda x: _concat_token_indices(x, delimiter=delimiter))) + +def get_highly_duplicated_filter_func(histogram: Counter[str, int], frequency_threshold: int = 1, delimiter: str = '_') -> Callable[[List[int]], bool]: + """ + Generates a filter function that checks if a list of token indices is highly duplicated. + + Args: + histogram (Counter[str, int]): Histogram of strings of token indices. 
+ frequency_threshold (int, optional): Frequency threshold to use for filtering. Defaults to 1. + delimiter (str, optional): Delimiter to use for concatenation. Defaults to '_'. + + Returns: + Callable[[List[int]], bool]: Filter function that checks if a list of token indices is highly duplicated. + """ + def _highly_duplicated_filter_func(token_indices: List[int]) -> bool: + """ + Checks if a list of token indices is highly duplicated. + + Args: + token_indices (List[int]): List of token indices to check. + + Returns: + bool: True if the list of token indices is highly duplicated, False otherwise. + """ + token_string = _concat_token_indices(token_indices, delimiter=delimiter) + return histogram[token_string] > frequency_threshold + + return _highly_duplicated_filter_func diff --git a/filters/pattern_incrementing.py b/filters/pattern_incrementing.py index 53711e9..4fba1c6 100644 --- a/filters/pattern_incrementing.py +++ b/filters/pattern_incrementing.py @@ -1,2 +1,67 @@ -def incrementing_sequences_filter(text): +import re + +def incrementing_sequences_filter(text: str) -> bool: + """ + This sequence will classify a given text is an incrementing sequence or not. + + Args: + text (str): The current sequence to be classified. + + Returns: + bool: Whether the sequence is an incrementing sequence or not. 
+ """ + # Split by seperators between text + possible_seperators = list(set(re.findall(r'(?<=\d)(\D+)(?=\d)', text))) + [" "] + ["\n"] + for seperator in possible_seperators: + # seperator = "" + # reading = None + # prev_char = None + # for index, character in enumerate(text): + # next_char = text[index + 1] if index + 1 < len(text) else "" + # if prev_char is None: + # prev_char = character + # if not character.isdigit() and not next_char.isdigit(): + # reading = True + # seperator += character + # if character.isdigit() and reading is True: + # break + + # prev_char = character + split_text = text.split(" " if seperator == "" else seperator) + + # trim the end if the final character(s) is a seperator + trailing_seperator = "" + for sep_index in range(len(seperator)): + if text.split(seperator)[-1][sep_index - 1:] == seperator[:sep_index + 1]: + trailing_seperator += seperator[:sep_index + 1] + else: + break + split_text[-1] = split_text[-1][:-len(trailing_seperator)] + + # Check if the sequence is just a list of digits + if len(split_text) == 1: + failed = False + prev_char = None + is_decrementing = None + for char in split_text[0]: + if char.isdigit(): + if prev_char is None and is_decrementing is None: + prev_char = char + elif is_decrementing is None: + is_decrementing = int(char) < int(prev_char) + prev_char = char + elif is_decrementing and (int(char) < int(prev_char)): + prev_char = char + elif not is_decrementing and (int(char) > int(prev_char)): + prev_char = char + else: + failed = True + break + else: + failed = True + break + if failed: + return False + + return True \ No newline at end of file diff --git a/filters/test_highly_duplicated_filter.py b/filters/test_highly_duplicated_filter.py new file mode 100644 index 0000000..3358020 --- /dev/null +++ b/filters/test_highly_duplicated_filter.py @@ -0,0 +1,30 @@ +import pandas as pd + +from .highly_duplicated_filter import get_highly_duplicated_filter_func, generate_sequence_histogram + +def 
# --- filters/test_highly_duplicated_filter.py ---

import pandas as pd

from .highly_duplicated_filter import get_highly_duplicated_filter_func, generate_sequence_histogram


def _fixture_filter(frequency_threshold):
    """Build a filter over the shared three-row fixture at the given threshold."""
    series = pd.Series([[1, 2, 3], [4, 5, 6], [4, 5, 6]])
    histogram = generate_sequence_histogram(series)
    return get_highly_duplicated_filter_func(histogram, frequency_threshold=frequency_threshold)


def test_highly_duplicated_filter_on_seen_indices():
    # [4, 5, 6] appears twice in the fixture, which exceeds a threshold of 1.
    assert _fixture_filter(1)([4, 5, 6]) == True


def test_highly_duplicated_filter_on_unseen_indices():
    # [7, 8, 9] never appears in the fixture.
    assert _fixture_filter(1)([7, 8, 9]) == False


def test_highly_duplicated_filter_on_infrequent_indices():
    # Two occurrences do not strictly exceed a threshold of 2.
    assert _fixture_filter(2)([4, 5, 6]) == False


# --- filters/test_pattern_incrementing.py ---

from .pattern_incrementing import incrementing_sequences_filter


def test_pattern_incrementing_no_space():
    assert incrementing_sequences_filter("123456789") == True


def test_pattern_incrementing_no_space_with_char():
    assert incrementing_sequences_filter("1A23456789") == False


def test_pattern_incrementing():
    assert incrementing_sequences_filter("12.8. 12.9. 13.0. 13.1. 13.2. 13.3.") == True


def test_pattern_new_lines_incrementing():
    assert incrementing_sequences_filter("128.\n129.\n130.\n131.\n132.\n133.") == True


def test_pattern_list_incrementing():
    assert incrementing_sequences_filter("- 128.\n- 129.\n- 130.\n- 131.\n- 132.\n- 133.") == True
+ assert incrementing_sequences_filter(text) == True + + +def test_incrementing_nonnumerical_pattern(): + text = """ +{#f5.123} + +{#f6.125} + +{#f7.125} + + == True + + +def test_incrementing_seminnumerical_pattern(): + text = "A.1 , A.2 , A.3 , A.4, B.1 , B.2, B.3, C.1" + assert incrementing_sequences_filter(text) == True diff --git a/requirements.txt b/requirements.txt index 4c7ccd4..805330d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pandas numpy +scikit-learn torch torchvision torchaudio @@ -9,4 +10,4 @@ datasets tqdm black pylint -scikit-learn \ No newline at end of file +pytest \ No newline at end of file diff --git a/working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb b/working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb new file mode 100644 index 0000000..e193ba4 --- /dev/null +++ b/working_dirs/kyle/taxonemy_analysis/eval_set_v2.ipynb @@ -0,0 +1,1698 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | index | \n", + "perplexity | \n", + "memorized | \n", + "is_code | \n", + "shortened_text | \n", + "Category | \n", + "Note | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "92883 | \n", + "3.687500 | \n", + "True | \n", + "True | \n", + "}{-69pt}\\n \\begin{document}$u_{n}\\rightarrow u... | \n", + "code | \n", + "latex | \n", + "
| 1 | \n", + "685875 | \n", + "3.837891 | \n", + "True | \n", + "False | \n", + "alesSite: All American Trannies\\n\\nFor Search ... | \n", + "nl | \n", + "NaN | \n", + "
| 2 | \n", + "973152 | \n", + "2.884766 | \n", + "True | \n", + "False | \n", + "18>::type T18;\\n typedef map<T0, T1, T2, T3, T... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "
| 3 | \n", + "1016981 | \n", + "1.056641 | \n", + "True | \n", + "True | \n", + "]{minimal}\\n \\usepackage{amsmath}\\n \\usepackag... | \n", + "code | \n", + "latex | \n", + "
| 4 | \n", + "1089371 | \n", + "3.882812 | \n", + "True | \n", + "True | \n", + ": 1,\\n\",\\n \"'col-md-push-6' : 1,\\n\",\\n \"'col-... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 2495 | \n", + "2581351 | \n", + "3.392578 | \n", + "False | \n", + "True | \n", + "2*y**2 + 6*y. Let z(g) = -3*g**2 - 7*g - 7. Le... | \n", + "code+nl | \n", + "math | \n", + "
| 2496 | \n", + "2583534 | \n", + "3.597656 | \n", + "False | \n", + "False | \n", + "039 ### ###',\\n '049 ### ###',\\n '050 ### ###'... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "
| 2497 | \n", + "2584695 | \n", + "3.710938 | \n", + "False | \n", + "True | \n", + ".1, -1?\\n-1\\nWhat is the second biggest value ... | \n", + "code+nl | \n", + "math | \n", + "
| 2498 | \n", + "2586170 | \n", + "3.080078 | \n", + "False | \n", + "True | \n", + "public DbUpdateException()\\n {\\n }\\n\\n /// <su... | \n", + "code | \n", + "NaN | \n", + "
| 2499 | \n", + "2593068 | \n", + "2.578125 | \n", + "False | \n", + "True | \n", + "CLANG_WARN_BOOL_CONVERSION = YES;\\n CLANG_WARN... | \n", + "code | \n", + "NaN | \n", + "
2500 rows × 7 columns
\n", + "| \n", + " | index | \n", + "tokens | \n", + "__index_level_0__ | \n", + "
|---|---|---|---|
| 0 | \n", + "441 | \n", + "[5584, 4196, 1228, 187, 1036, 4, 209, 21723, 2... | \n", + "441 | \n", + "
| 1 | \n", + "447 | \n", + "[50262, 61, 2099, 92, 8861, 94, 187, 50262, 61... | \n", + "447 | \n", + "
| 2 | \n", + "792 | \n", + "[475, 50272, 953, 24781, 778, 320, 908, 281, 1... | \n", + "792 | \n", + "
| 3 | \n", + "1539 | \n", + "[424, 380, 16101, 313, 433, 17889, 3104, 10, 2... | \n", + "1539 | \n", + "
| 4 | \n", + "1705 | \n", + "[3498, 2262, 2369, 40, 736, 13, 3956, 27, 21, ... | \n", + "1705 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 411443 | \n", + "146431199 | \n", + "[281, 320, 669, 8604, 60, 805, 431, 1019, 8402... | \n", + "2287199 | \n", + "
| 411444 | \n", + "146431278 | \n", + "[588, 1705, 285, 8415, 634, 1895, 15, 30952, 3... | \n", + "2287278 | \n", + "
| 411445 | \n", + "146431294 | \n", + "[15468, 13, 50275, 13743, 13, 50275, 15220, 13... | \n", + "2287294 | \n", + "
| 411446 | \n", + "146431588 | \n", + "[27, 330, 14788, 10334, 14, 3429, 27, 577, 28,... | \n", + "2287588 | \n", + "
| 411447 | \n", + "146431592 | \n", + "[1406, 485, 15, 23780, 300, 2473, 285, 12698, ... | \n", + "2287592 | \n", + "
411448 rows × 3 columns
\n", + "" + ], + "text/plain": [ + " index tokens \n", + "0 441 [5584, 4196, 1228, 187, 1036, 4, 209, 21723, 2... \\\n", + "1 447 [50262, 61, 2099, 92, 8861, 94, 187, 50262, 61... \n", + "2 792 [475, 50272, 953, 24781, 778, 320, 908, 281, 1... \n", + "3 1539 [424, 380, 16101, 313, 433, 17889, 3104, 10, 2... \n", + "4 1705 [3498, 2262, 2369, 40, 736, 13, 3956, 27, 21, ... \n", + "... ... ... \n", + "411443 146431199 [281, 320, 669, 8604, 60, 805, 431, 1019, 8402... \n", + "411444 146431278 [588, 1705, 285, 8415, 634, 1895, 15, 30952, 3... \n", + "411445 146431294 [15468, 13, 50275, 13743, 13, 50275, 15220, 13... \n", + "411446 146431588 [27, 330, 14788, 10334, 14, 3429, 27, 577, 28,... \n", + "411447 146431592 [1406, 485, 15, 23780, 300, 2473, 285, 12698, ... \n", + "\n", + " __index_level_0__ \n", + "0 441 \n", + "1 447 \n", + "2 792 \n", + "3 1539 \n", + "4 1705 \n", + "... ... \n", + "411443 2287199 \n", + "411444 2287278 \n", + "411445 2287294 \n", + "411446 2287588 \n", + "411447 2287592 \n", + "\n", + "[411448 rows x 3 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pythia_70m_memories = load_dataset(\"EleutherAI/pythia-memorized-evals\", split=\"deduped.70m\").to_pandas()\n", + "pythia_70m_memories" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "| \n", + " | index | \n", + "perplexity | \n", + "memorized | \n", + "is_code | \n", + "shortened_text | \n", + "Category | \n", + "Note | \n", + "tokens | \n", + "__index_level_0__ | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "92883 | \n", + "3.687500 | \n", + "True | \n", + "True | \n", + "}{-69pt}\\n \\begin{document}$u_{n}\\rightarrow u... | \n", + "code | \n", + "latex | \n", + "[8699, 2090, 431, 94, 187, 50262, 61, 2043, 92... | \n", + "92883 | \n", + "
| 1 | \n", + "685875 | \n", + "3.837891 | \n", + "True | \n", + "False | \n", + "alesSite: All American Trannies\\n\\nFor Search ... | \n", + "nl | \n", + "NaN | \n", + "[2339, 27327, 27, 1876, 2448, 1535, 1136, 447,... | \n", + "685875 | \n", + "
| 2 | \n", + "973152 | \n", + "2.884766 | \n", + "True | \n", + "False | \n", + "18>::type T18;\\n typedef map<T0, T1, T2, T3, T... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "[1093, 14157, 881, 308, 1093, 28, 187, 50266, ... | \n", + "973152 | \n", + "
| 3 | \n", + "1016981 | \n", + "1.056641 | \n", + "True | \n", + "True | \n", + "]{minimal}\\n \\usepackage{amsmath}\\n \\usepackag... | \n", + "code | \n", + "latex | \n", + "[1019, 8402, 94, 187, 50262, 61, 2099, 92, 879... | \n", + "1016981 | \n", + "
| 4 | \n", + "1089371 | \n", + "3.882812 | \n", + "True | \n", + "True | \n", + ": 1,\\n\",\\n \"'col-md-push-6' : 1,\\n\",\\n \"'col-... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "[8, 1163, 337, 1337, 79, 995, 187, 50274, 2789... | \n", + "1089371 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 2486 | \n", + "2555832 | \n", + "3.667969 | \n", + "False | \n", + "True | \n", + "dip)</pre>\\n</li>\\n</ul>\\n<a name=\"cornerRadiu... | \n", + "code | \n", + "NaN | \n", + "[31665, 17266, 3456, 31, 187, 870, 965, 31, 18... | \n", + "267832 | \n", + "
| 2488 | \n", + "2556154 | \n", + "2.320312 | \n", + "False | \n", + "False | \n", + "LEASE COME TO MEXICO CITY PLEASE COME TO MEXIC... | \n", + "pattern-repeating | \n", + "NaN | \n", + "[26084, 8610, 38, 5935, 353, 4237, 24218, 4589... | \n", + "268154 | \n", + "
| 2491 | \n", + "2560655 | \n", + "4.386719 | \n", + "False | \n", + "True | \n", + "NL_WABMON_4 = 131141\\n X_NL_WABMON_5 = 131142\\... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "[19214, 64, 56, 2925, 22362, 64, 21, 50276, 30... | \n", + "272655 | \n", + "
| 2493 | \n", + "2570852 | \n", + "4.101562 | \n", + "False | \n", + "False | \n", + "WITH OCESAPLEASE COME MEXICO CITY WITH OCESAPL... | \n", + "pattern-repeating | \n", + "NaN | \n", + "[9277, 27202, 1410, 2088, 26084, 8610, 38, 353... | \n", + "282852 | \n", + "
| 2496 | \n", + "2583534 | \n", + "3.597656 | \n", + "False | \n", + "False | \n", + "039 ### ###',\\n '049 ### ###',\\n '050 ### ###'... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "[18832, 209, 4118, 209, 4118, 1383, 187, 50270... | \n", + "295534 | \n", + "
1430 rows × 9 columns
\n", + "| \n", + " | index | \n", + "tokens | \n", + "is_memorized | \n", + "
|---|---|---|---|
| 0 | \n", + "18 | \n", + "[15, 46525, 3439, 2526, 187, 14, 17, 15, 1036,... | \n", + "False | \n", + "
| 1 | \n", + "43 | \n", + "[273, 22523, 18595, 275, 643, 3054, 2085, 3081... | \n", + "False | \n", + "
| 2 | \n", + "86 | \n", + "[749, 10580, 273, 575, 5, 44, 64, 79, 5, 534, ... | \n", + "False | \n", + "
| 3 | \n", + "110 | \n", + "[12556, 187, 71, 437, 285, 45965, 13, 285, 253... | \n", + "False | \n", + "
| 4 | \n", + "112 | \n", + "[3847, 277, 2631, 449, 346, 1552, 310, 417, 82... | \n", + "False | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 4999995 | \n", + "146431872 | \n", + "[3117, 393, 6040, 416, 393, 5786, 393, 50, 5, ... | \n", + "False | \n", + "
| 4999996 | \n", + "146431904 | \n", + "[187, 6067, 1783, 2722, 326, 14108, 1638, 3400... | \n", + "False | \n", + "
| 4999997 | \n", + "146431927 | \n", + "[704, 39660, 1051, 187, 29, 56, 2711, 8537, 37... | \n", + "False | \n", + "
| 4999998 | \n", + "146431960 | \n", + "[14, 34552, 15390, 1253, 15280, 285, 1108, 447... | \n", + "False | \n", + "
| 4999999 | \n", + "146431973 | \n", + "[38630, 14716, 247, 15846, 8651, 5763, 15, 831... | \n", + "False | \n", + "
5000000 rows × 3 columns
\n", + "" + ], + "text/plain": [ + " index tokens \n", + "0 18 [15, 46525, 3439, 2526, 187, 14, 17, 15, 1036,... \\\n", + "1 43 [273, 22523, 18595, 275, 643, 3054, 2085, 3081... \n", + "2 86 [749, 10580, 273, 575, 5, 44, 64, 79, 5, 534, ... \n", + "3 110 [12556, 187, 71, 437, 285, 45965, 13, 285, 253... \n", + "4 112 [3847, 277, 2631, 449, 346, 1552, 310, 417, 82... \n", + "... ... ... \n", + "4999995 146431872 [3117, 393, 6040, 416, 393, 5786, 393, 50, 5, ... \n", + "4999996 146431904 [187, 6067, 1783, 2722, 326, 14108, 1638, 3400... \n", + "4999997 146431927 [704, 39660, 1051, 187, 29, 56, 2711, 8537, 37... \n", + "4999998 146431960 [14, 34552, 15390, 1253, 15280, 285, 1108, 447... \n", + "4999999 146431973 [38630, 14716, 247, 15846, 8651, 5763, 15, 831... \n", + "\n", + " is_memorized \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + "... ... \n", + "4999995 False \n", + "4999996 False \n", + "4999997 False \n", + "4999998 False \n", + "4999999 False \n", + "\n", + "[5000000 rows x 3 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_memories = load_dataset(\"EleutherAI/pile-deduped-pythia-random-sampled\")[\"train\"].to_pandas()\n", + "non_memories" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "| \n", + " | index | \n", + "perplexity | \n", + "memorized | \n", + "is_code | \n", + "shortened_text | \n", + "Category | \n", + "Note | \n", + "tokens | \n", + "is_memorized | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| 12 | \n", + "1999230 | \n", + "2.210938 | \n", + "True | \n", + "False | \n", + "Armenians\",0,\"\",0,\"\",0,0,0,0,0,0,0,0,0,0,0,0,0... | \n", + "code | \n", + "NaN | \n", + "[37801, 2458, 995, 17, 937, 995, 17, 937, 995,... | \n", + "False | \n", + "
| 2184 | \n", + "1999230 | \n", + "2.210938 | \n", + "False | \n", + "False | \n", + "Armenians\",0,\"\",0,\"\",0,0,0,0,0,0,0,0,0,0,0,0,0... | \n", + "pattern-repeating | \n", + "NaN | \n", + "[37801, 2458, 995, 17, 937, 995, 17, 937, 995,... | \n", + "False | \n", + "
| 17 | \n", + "2814976 | \n", + "4.007812 | \n", + "True | \n", + "True | \n", + "/brand-5\\nhttps://m.52010000.cn/brand-6\\nhttps... | \n", + "code | \n", + "NaN | \n", + "[16, 22374, 14, 22, 187, 3614, 1358, 78, 15, 2... | \n", + "False | \n", + "
| 26 | \n", + "3616218 | \n", + "2.611328 | \n", + "True | \n", + "True | \n", + "CA5 },\\n { 0x10CE6, 0x10CA6 },\\n { 0x10CE7, 0x... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "[4280, 22, 3572, 187, 50274, 92, 470, 89, 740,... | \n", + "False | \n", + "
| 89 | \n", + "9657233 | \n", + "2.617188 | \n", + "True | \n", + "True | \n", + "#ERROR! | \n", + "code | \n", + "NaN | \n", + "[568, 2437, 275, 389, 15, 29762, 15, 26318, 15... | \n", + "False | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 2495 | \n", + "2581351 | \n", + "3.392578 | \n", + "False | \n", + "True | \n", + "2*y**2 + 6*y. Let z(g) = -3*g**2 - 7*g - 7. Le... | \n", + "code+nl | \n", + "math | \n", + "[374, 11, 90, 424, 19, 559, 721, 11, 90, 15, 1... | \n", + "False | \n", + "
| 2496 | \n", + "2583534 | \n", + "3.597656 | \n", + "False | \n", + "False | \n", + "039 ### ###',\\n '049 ### ###',\\n '050 ### ###'... | \n", + "pattern-incrementing | \n", + "NaN | \n", + "[18832, 209, 4118, 209, 4118, 1383, 187, 50270... | \n", + "False | \n", + "
| 2497 | \n", + "2584695 | \n", + "3.710938 | \n", + "False | \n", + "True | \n", + ".1, -1?\\n-1\\nWhat is the second biggest value ... | \n", + "code+nl | \n", + "math | \n", + "[15, 18, 13, 428, 18, 32, 187, 14, 18, 187, 12... | \n", + "False | \n", + "
| 2498 | \n", + "2586170 | \n", + "3.080078 | \n", + "False | \n", + "True | \n", + "public DbUpdateException()\\n {\\n }\\n\\n /// <su... | \n", + "code | \n", + "NaN | \n", + "[187, 50270, 4387, 46688, 11241, 5330, 1082, 1... | \n", + "False | \n", + "
| 2499 | \n", + "2593068 | \n", + "2.578125 | \n", + "False | \n", + "True | \n", + "CLANG_WARN_BOOL_CONVERSION = YES;\\n CLANG_WARN... | \n", + "code | \n", + "NaN | \n", + "[3207, 14375, 64, 24798, 64, 30529, 64, 5707, ... | \n", + "False | \n", + "
1298 rows × 9 columns
\n", + "| \n", + " | index | \n", + "memorized | \n", + "perplexity | \n", + "is_code | \n", + "prompt | \n", + "sequence | \n", + "Code | \n", + "Incremental | \n", + "Repetitive | \n", + "Highly Duplicated | \n", + "Templating | \n", + "Natural Language | \n", + "Random | \n", + "Other | \n", + "Notes | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1720 | \n", + "1042625 | \n", + "False | \n", + "4.777344 | \n", + "True | \n", + "4\\nLet f be ((-1)/2)/(2/4). Let a = 6 - 5. Let... | \n", + "4\\nLet f be ((-1)/2)/(2/4). Let a = 6 - 5. Let... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1412 | \n", + "378406 | \n", + "False | \n", + "4.007812 | \n", + "True | \n", + "2/2 + 3*r - 3. Let g(x) be the first derivativ... | \n", + "2/2 + 3*r - 3. Let g(x) be the first derivativ... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1794 | \n", + "1188414 | \n", + "False | \n", + "1.437500 | \n", + "True | \n", + "usepackage{amsmath}\\n\\usepackage{wasysym} \\n\\u... | \n", + "usepackage{amsmath}\\n\\usepackage{wasysym} \\n\\u... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1985 | \n", + "1572476 | \n", + "False | \n", + "3.982422 | \n", + "True | \n", + "v - 2*v + 6 = 0. Let k be (3/v)/(2/(-8)). Let ... | \n", + "v - 2*v + 6 = 0. Let k be (3/v)/(2/(-8)). Let ... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 510 | \n", + "60937406 | \n", + "True | \n", + "2.820312 | \n", + "True | \n", + "\\n\\n{#sp2.143}\\n\\n![]... | \n", + "\\n\\n{#sp2.143}\\n\\n![]... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 1055 | \n", + "121637700 | \n", + "True | \n", + "1.520508 | \n", + "True | \n", + "=\"1.0\" encoding=\"UTF-8\"?>\\n<!DOCTYPE plist PUB... | \n", + "=\"1.0\" encoding=\"UTF-8\"?>\\n<!DOCTYPE plist PUB... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 71 | \n", + "7827439 | \n", + "True | \n", + "3.263672 | \n", + "True | \n", + "per Team is already on the scene....<?xml vers... | \n", + "per Team is already on the scene....<?xml vers... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 2011 | \n", + "1644096 | \n", + "False | \n", + "3.005859 | \n", + "False | \n", + "11.9 ± 2.0 11.5 ± 2.0 11.2 ± 2.2 \\< | \n", + "11.9 ± 2.0 11.5 ± 2.0 11.2 ± 2.2 \\... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1028 | \n", + "118099130 | \n", + "True | \n", + "2.289062 | \n", + "False | \n", + "ref 8, ref 9, ref 10, ref 11, ref 12, ref 13,... | \n", + "ref 8, ref 9, ref 10, ref 11, ref 12, ref 13,... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1026 | \n", + "118045648 | \n", + "True | \n", + "2.908203 | \n", + "True | \n", + "ISA as two detectors, so that the signal in ea... | \n", + "ISA as two detectors, so that the signal in ea... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
2499 rows × 15 columns
\n", + "" + ], + "text/plain": [ + " index memorized perplexity is_code \n", + "1720 1042625 False 4.777344 True \\\n", + "1412 378406 False 4.007812 True \n", + "1794 1188414 False 1.437500 True \n", + "1985 1572476 False 3.982422 True \n", + "510 60937406 True 2.820312 True \n", + "... ... ... ... ... \n", + "1055 121637700 True 1.520508 True \n", + "71 7827439 True 3.263672 True \n", + "2011 1644096 False 3.005859 False \n", + "1028 118099130 True 2.289062 False \n", + "1026 118045648 True 2.908203 True \n", + "\n", + " prompt \n", + "1720 4\\nLet f be ((-1)/2)/(2/4). Let a = 6 - 5. Let... \\\n", + "1412 2/2 + 3*r - 3. Let g(x) be the first derivativ... \n", + "1794 usepackage{amsmath}\\n\\usepackage{wasysym} \\n\\u... \n", + "1985 v - 2*v + 6 = 0. Let k be (3/v)/(2/(-8)). Let ... \n", + "510 \\n\\n{#sp2.143}\\n\\n![]... \n", + "... ... \n", + "1055 =\"1.0\" encoding=\"UTF-8\"?>\\n\\n\n", + "\n", + "| \n", + " | index | \n", + "memorized | \n", + "perplexity | \n", + "is_code | \n", + "prompt | \n", + "sequence | \n", + "Code | \n", + "Incremental | \n", + "Repetitive | \n", + "Highly Duplicated | \n", + "Templating | \n", + "Natural Language | \n", + "Random | \n", + "Other | \n", + "Notes | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1497 | \n", + "576734 | \n", + "False | \n", + "3.447266 | \n", + "True | \n", + "(-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte... | \n", + "(-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 389 | \n", + "45993218 | \n", + "True | \n", + "1.752930 | \n", + "True | \n", + "not use this file except in compliance with t... | \n", + "not use this file except in compliance with t... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1095 | \n", + "126795714 | \n", + "True | \n", + "4.429688 | \n", + "True | \n", + "(c) 2019 Wei Wang <onevcat@gmail.com>\\n//\\n//... | \n", + "(c) 2019 Wei Wang <onevcat@gmail.com>\\n//\\n//... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 496 | \n", + "58907866 | \n", + "True | \n", + "1.357422 | \n", + "False | \n", + "http://www.apache.org/licenses/LICENSE-2.0\\n\\n... | \n", + "http://www.apache.org/licenses/LICENSE-2.0\\n\\n... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1647 | \n", + "911620 | \n", + "False | \n", + "3.562500 | \n", + "True | \n", + "=\"table-fn\"} ... | \n", + "=\"table-fn\"} ... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 2083 | \n", + "1786406 | \n", + "False | \n", + "1.972656 | \n", + "True | \n", + "\\n \\usepackage{amssymb} \\n ... | \n", + "\\n \\usepackage{amssymb} \\n ... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 1202 | \n", + "139649808 | \n", + "True | \n", + "1.101562 | \n", + "True | \n", + "$\\documentclass[12pt]{minimal}\\n ... | \n", + "$\\documentclass[12pt]{minimal}\\n ... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 428 | \n", + "50719780 | \n", + "True | \n", + "1.749023 | \n", + "True | \n", + "in compliance with the License.\\n// You may o... | \n", + "in compliance with the License.\\n// You may o... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 2020 | \n", + "1659069 | \n", + "False | \n", + "3.974609 | \n", + "False | \n", + "?\\n3\\nWhat is the ninth root of 113001 to the ... | \n", + "?\\n3\\nWhat is the ninth root of 113001 to the ... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
| 506 | \n", + "60582215 | \n", + "True | \n", + "1.504883 | \n", + "True | \n", + "good judgment.//\\n// Generated by class-d... | \n", + "good judgment.//\\n// Generated by class-d... | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
100 rows × 15 columns
\n", + "" + ], + "text/plain": [ + " index memorized perplexity is_code \n", + "1497 576734 False 3.447266 True \\\n", + "389 45993218 True 1.752930 True \n", + "1095 126795714 True 4.429688 True \n", + "496 58907866 True 1.357422 False \n", + "1647 911620 False 3.562500 True \n", + "... ... ... ... ... \n", + "2083 1786406 False 1.972656 True \n", + "1202 139649808 True 1.101562 True \n", + "428 50719780 True 1.749023 True \n", + "2020 1659069 False 3.974609 False \n", + "506 60582215 True 1.504883 True \n", + "\n", + " prompt \n", + "1497 (-1.0, -1.0, 0);\\nglTexCoord2f(0, 0);\\nglVerte... \\\n", + "389 not use this file except in compliance with t... \n", + "1095 (c) 2019 Wei Wang