From 2baf319344706ab38325558879a2d15385efda79 Mon Sep 17 00:00:00 2001 From: mobius Date: Mon, 17 Jul 2023 17:48:19 -0700 Subject: [PATCH 01/11] checkin a sandbox version --- sandbox/t10_state_loop.py | 197 ++++++++++++++++++++++++++++++++++++++ test/test_common.py | 36 ++++--- uniteai/common.py | 28 +++--- 3 files changed, 235 insertions(+), 26 deletions(-) create mode 100644 sandbox/t10_state_loop.py diff --git a/sandbox/t10_state_loop.py b/sandbox/t10_state_loop.py new file mode 100644 index 0000000..517d898 --- /dev/null +++ b/sandbox/t10_state_loop.py @@ -0,0 +1,197 @@ +''' + +Just getting the basic components of the State Loop Working + +''' + +import openai +import yaml +import os +import re + +with open(os.path.expanduser('~/.uniteai.yml'), 'r') as ymlfile: + cfg = yaml.safe_load(ymlfile) +openai.api_key = cfg['openai']['api_key'] + +COMPLETION_ENGINES = [ + "text-davinci-003", + "text-davinci-002", + "ada", + "babbage", + "curie", + "davinci", +] + +CHAT_ENGINES = [ + "gpt-3.5-turbo", + "gpt-4", +] + +ENGINE = 'gpt-3.5-turbo' +# ENGINE = 'gpt-4' + +def openai_autocomplete(engine, text, max_length): + ''' NON-Streaming responses from OpenAI's API.''' + if engine in COMPLETION_ENGINES: + response = openai.Completion.create( + engine=engine, + prompt=text, + max_tokens=max_length, + stream=False + ) + return response + elif engine in CHAT_ENGINES: + response = openai.ChatCompletion.create( + model=engine, + messages=[{"role": "user", "content": text}], + stream=False + ) + return response['choices'][0]['message']['content'] + + +def find_tag(tag: str, doc_lines: [str]): + ''' Find index of first element that contains `tag`. ''' + ix = 0 + for ix, line in enumerate(doc_lines): + match = re.search(tag, line) + if match: + return ix, match.start(), match.end() + return None + + +def find_block(start_tag, end_tag, doc): + '''Fine the indices of a start/end-tagged block.''' + if doc is None: + return None, None + doc_lines = doc.split('\n') + s = find_tag(start_tag, doc_lines) + e = find_tag(end_tag, doc_lines) + return s, e + + +def extract_block(start, end, doc): + '''Extract block of text between `start` and `end` tag.''' + if doc is None: + return None + doc_lines = doc.split('\n') + if start is None or end is None: + return None + if start[0] > end[0] or (start[0] == end[0] and start[2] > end[1]): + return None + if start[0] == end[0]: + return [doc_lines[start[0]][start[2]: end[1]]] + else: + block = [doc_lines[start[0]][start[2]:]] # portion of start line + block.extend(doc_lines[start[0]+1:end[0]]) # all of middle lines + block.append(doc_lines[end[0]][:end[1]]) # portion of end line + return '\n'.join(block) + + +def start_tag(x): + return f'<{x}_TAG>' + + +def end_tag(x): + return f'' + + +def get_block(tag, doc): + s1, s2 = find_block(start_tag(tag), end_tag(tag), doc) + return extract_block(s1, s2, doc) + + +STATE = 'STATE' +NEW_STATE = 'NEW_STATE' +REQUEST = 'REQUEST' +RESPONSE = 'RESPONSE' +UPDATES_NEEDED = 'UPDATES_NEEDED' + +state = ''' +players: + josh: + items: + location: + kirtley: + items: + location: + +quests: + +obstacles: + +enemies: +''' + +def get_response(request, + running_resp, + state, + prefix=None, + suffix=None): + nl = '\n\n' # can't do newlines inside f-exprs + prompt = f''' +{prefix + nl if prefix else ''}You must assume the role of a finite state machine, but using only natural language. + +You will be given state, and a request. + +You must return a response, and a new state. + +Please format your response like: + +{start_tag(RESPONSE)} +your response +{end_tag(RESPONSE)} + +{start_tag(UPDATES_NEEDED)} +updates that you'll need to apply to the new state +{end_tag(UPDATES_NEEDED)} + +{start_tag(NEW_STATE)} +the new state +{end_tag(NEW_STATE)} + +Here is the current state: + +{start_tag(STATE)} +{state} +{end_tag(STATE)} + +Here is a transcript of your responses so far: +{running_resp} + +Here is the current request: + {request}{nl + suffix if suffix else ''} +'''.strip() + + return openai_autocomplete(ENGINE, prompt, max_length=200) + +prefix = 'You will be a Dungeon Master, and you will keep notes via a natural language-based state machine. Keep notes on: items, players, quests, etc.' +suffix = 'Remember, keep responses brief, invent interesting quests and obstacles, and make sure the state is always accurate.' + +print('Welcome!') + +running_resp = '' +while True: + request = input('Your Command:') + x = get_response(request, + running_resp=running_resp, + state=state, + prefix=prefix, + suffix=suffix) + + # Try extracting new_state + new_state = get_block(NEW_STATE, x) + if new_state is None: + new_state = get_block(STATE, x) + + # Try extracting response + resp = get_block(RESPONSE, x) + if resp is None: + resp = '' + print(f'INVALID RESPONSE: \n{x}') + continue + + if new_state is not None and resp is not None: + state = new_state + print(f'STATE: {state}') + running_resp = f'{running_resp.strip()}\n\n{resp.strip()}' + print(f'RESPONSE: {running_resp}') diff --git a/test/test_common.py b/test/test_common.py index 3e114b9..bb06ff3 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -4,7 +4,7 @@ ''' -from uniteai.common import insert_text_at, find_pattern_in_document +from uniteai.common import insert_text_at, find_block, extract_block import pytest @@ -49,15 +49,27 @@ def test_insert_text_at(): assert str(e.value) == "Column number out of range" -def test_find_pattern_in_document(): - document = ''' -Hello, world! -Regex is fun. -I like programming in Python. -''' +def test_extract_block(): + doc = "This is the first document line.\nThe second line is start_tag and also contains some more text.\nThis is the third line between the tags.\nThis is the fourth line.\nThe fifth line is end_tag and also contains some more text.\nThis is the last document line." + start_tag = "start_tag" + end_tag = "end_tag" + + # Getting the start and end line and column tuples + start, end = find_block(start_tag, end_tag, doc) + + # Expecting three lines as output - the line containing start tag, the line between the tags and the line containing the end tag + expected_output = " and also contains some more text.\nThis is the third line between the tags.\nThis is the fourth line.\nThe fifth line is " + + assert extract_block(start, end, doc) == expected_output, "Test case 1 failed!" + + # Test case when no tag is there in document + start_tag = "no_tag" + end_tag = "no_tag" + start, end = find_block(start_tag, end_tag, doc) + + # Expecting None since no tag is there in document + expected_output = None + + assert extract_block(start, end, doc) == expected_output, "Test case 2 failed!" - assert find_pattern_in_document(document, "o") == [(1, 4, 5), (1, 8, 9), - (3, 9, 10), (3, 26, 27)] - assert find_pattern_in_document(document, "P...on") == [(3, 22, 28)] - assert find_pattern_in_document(document, "Java") == [] - assert find_pattern_in_document('', "o") == [] + print('All test cases passed!') diff --git a/uniteai/common.py b/uniteai/common.py index a76e579..5d28c1b 100644 --- a/uniteai/common.py +++ b/uniteai/common.py @@ -105,20 +105,20 @@ def find_block(start_tag, end_tag, doc): return s, e -def find_pattern_in_document( - document: str, - pattern: str) -> List[Tuple[int, int, int]]: - '''Return (line, start_col, end_col) for each match. Regex cannot span - newlines.''' - result = [] - compiled_pattern = re.compile(pattern) - - for line_number, line in enumerate(document.split('\n')): - for match in compiled_pattern.finditer(line): - start, end = match.span() - result.append((line_number, start, end)) - - return result +def extract_block(start, end, doc): + '''Extract block of text between `start` and `end` tag.''' + doc_lines = doc.split('\n') + if start is None or end is None: + return None + if start[0] > end[0] or (start[0] == end[0] and start[2] > end[1]): + return None + if start[0] == end[0]: + return [doc_lines[start[0]][start[2]: end[1]]] + else: + block = [doc_lines[start[0]][start[2]:]] # portion of start line + block.extend(doc_lines[start[0]+1:end[0]]) # all of middle lines + block.append(doc_lines[end[0]][:end[1]]) # portion of end line + return '\n'.join(block) ################################################## From 2596149d33debe3072127b04ea9de30a3e5151b7 Mon Sep 17 00:00:00 2001 From: mobius Date: Mon, 17 Jul 2023 23:47:08 -0700 Subject: [PATCH 02/11] document chat example --- sandbox/t11_document_chat.py | 217 +++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 sandbox/t11_document_chat.py diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py new file mode 100644 index 0000000..a7796c3 --- /dev/null +++ b/sandbox/t11_document_chat.py @@ -0,0 +1,217 @@ +''' + +Reading in and indexing documents. + +pip install pypdf +pip install InstructorEmbedding +pip install sentence-transformers + +''' + +import os +from InstructorEmbedding import INSTRUCTOR +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity +from pypdf import PdfReader +from tqdm import tqdm +from typing import List, Dict +import numpy as np +from tqdm import tqdm +from scipy.signal import savgol_filter + +WINDOW_SIZE = 300 +STRIDE = 100 +PERCENTILE = 80 + +pdf_paths = { + 'bitcoin': '~/Documents/misc/bitcoin.pdf', + # 'idaho': '~/Documents/misc/land use and development code.pdf' +} + + +################################################## +# PDF + +def read_pdf(path): + path = os.path.expanduser(path) + reader = PdfReader(path) + text = '' + for page in reader.pages: + text += '\n' + page.extract_text() + return text + +try: + pdf_cache +except: + pdf_cache = {} + + print('Loading PDF Text') + for k, path in pdf_paths.items(): + pdf_cache[k] = read_pdf(path) + +try: + embedding_cache +except: + embedding_cache = {} + + +################################################## +# Load Model + +try: + already_loaded +except: + model = INSTRUCTOR('hkunlp/instructor-base') + already_loaded = True + +query_instruction = 'Represent the Science question for retrieving supporting documents: ' +embed_instruction = 'Represent the Science document for retrieval: ' + +def embed(xs: List[str]): + ''' Build sentence embeddings for each sentence in `xs` ''' + return model.encode(xs) + +def similar_tokens(query: str, + pdf_key: str, + ) -> List[float]: + '''Compare a `query to a strided window over a `document`.''' + global embedding_cache + # Initialize a numpy array for storing similarities and overlaps + document = pdf_cache[pdf_key] + + similarities = np.zeros(len(document), dtype=float) + overlaps = np.zeros(len(document), dtype=float) + + query_e = embed([[query_instruction, query]]) + + if pdf_key in embedding_cache: + embedding_is_saved = True + embeddings = embedding_cache[pdf_key] + else: + embedding_is_saved = False + embeddings = [] + + # Loop through the document with given stride + for emb_i, doc_i in tqdm(enumerate(range(0, len(document) - WINDOW_SIZE + 1, STRIDE))): + # Extract the chunk from document + chunk = document[doc_i:doc_i+WINDOW_SIZE] + + # Similarity + if embedding_is_saved: + chunk_e = embeddings[emb_i] + else: + chunk_e = embed([[embed_instruction, chunk]]) + embeddings.append(chunk_e) + sim = cosine_similarity(query_e, chunk_e)[0][0] + + # Update the similarities and overlaps array + for j in range(doc_i, doc_i + WINDOW_SIZE): + similarities[j] += sim + overlaps[j] += 1 + + embedding_cache[pdf_key] = embeddings + # Average the similarities with the number of overlaps + similarities /= np.where(overlaps != 0, overlaps, 1) + + return similarities + +def find_spans(arr, threshold=0.5): + # Create an array that is 1 where arr is above threshold, and padded with 0s at the edges + is_over_threshold = np.concatenate(([0], np.greater(arr, threshold), [0])) + + # Find the indices of rising and falling edges + diffs = np.diff(is_over_threshold) + starts = np.where(diffs > 0)[0] + ends = np.where(diffs < 0)[0] + return list(zip(starts, ends - 1)) + +def tune_percentile(xs, percentile): + ''' 0-out all elements below percentile. Essentially, this will leave some + `1-percentile` percentage of the document highlighted. ''' + xs = np.copy(xs) # don't mutate original + p = np.percentile(xs, percentile) + xs[xs < p] *= 0 + return xs + +def find_similar(query, pdf_key): + ''' + query: a query that you want to find similar passages to + pdf_key: {bitcoin, idaho} + ''' + global embedding_cache + + # Embeddings + print('Calculating embeddings') + return similar_tokens(query, pdf_key) + +def segments(similarities, document, threshold=0.0): + out = '' + last_thresh = False # for finding edge + + text = '' + sims = [] + out = [] # [(text, sims), ...] + for sim, char in zip(similarities, document): + super_thresh = sim > threshold + # no longer a super_thresh run + if last_thresh and not super_thresh: + out.append((text, np.array(sims))) + text = '' + sims = [] + + # is a super_thresh run + if super_thresh: + text += char + sims.append(sim) + last_thresh = super_thresh + if len(text) > 0: + out.append((text, np.array(sims))) + + return out + +def rank(segments, rank_fn): + scores = [] + for text, sims in segments: + scores.append(rank_fn(sims)) + out = [] + for score, (text, sims) in sorted(zip(scores, segments)): + out.append(text) + return out + +# query = 'whats in it for participants to the blockchain?' +# query = 'how does this protect my anonymity?' +# query = 'im concerned my hdd isnt big enough' +query = 'who contributed to this paper?' +pdf_key = 'bitcoin' +document = pdf_cache[pdf_key] +similarities = find_similar(query, pdf_key) + +# remove outlier +last_edge = int(len(similarities) * 0.02) +similarities[-last_edge:] = similarities[-last_edge] + + +def denoise_similarities(similarities, window_size=2000, poly_order=2): + # Apply Savitzky-Golay filter to smooth out the scores + denoised_scores = savgol_filter(similarities, window_size, poly_order) + return denoised_scores + + +# Denoise salience scores +d_similarities = denoise_similarities(similarities) +d_similarities -= d_similarities.min() # normalize +d_similarities /= d_similarities.max() +d_similarities = tune_percentile(d_similarities, percentile=75) + +segs = segments(d_similarities, document) +ranked_segments = rank(segs, np.mean) + +import matplotlib.pyplot as plt +plt.plot(similarities) +plt.plot(d_similarities) +# plt.plot(tune_percentile(similarities)) +plt.show() + +for x in ranked_segments: + print('--------------------------------------------------') + print(x) From fafdc8771a8057503f7427591fbdc3e7a8bef612 Mon Sep 17 00:00:00 2001 From: mobius Date: Mon, 17 Jul 2023 23:54:00 -0700 Subject: [PATCH 03/11] touchups --- sandbox/t11_document_chat.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py index a7796c3..596acbe 100644 --- a/sandbox/t11_document_chat.py +++ b/sandbox/t11_document_chat.py @@ -25,7 +25,7 @@ pdf_paths = { 'bitcoin': '~/Documents/misc/bitcoin.pdf', - # 'idaho': '~/Documents/misc/land use and development code.pdf' + 'idaho': '~/Documents/misc/land use and development code.pdf' } @@ -67,10 +67,12 @@ def read_pdf(path): query_instruction = 'Represent the Science question for retrieving supporting documents: ' embed_instruction = 'Represent the Science document for retrieval: ' + def embed(xs: List[str]): ''' Build sentence embeddings for each sentence in `xs` ''' return model.encode(xs) + def similar_tokens(query: str, pdf_key: str, ) -> List[float]: @@ -92,7 +94,9 @@ def similar_tokens(query: str, embeddings = [] # Loop through the document with given stride - for emb_i, doc_i in tqdm(enumerate(range(0, len(document) - WINDOW_SIZE + 1, STRIDE))): + # listify offsets to help out tqdm + offsets = list(enumerate(range(0, len(document) - WINDOW_SIZE + 1, STRIDE))) + for emb_i, doc_i in tqdm(offsets): # Extract the chunk from document chunk = document[doc_i:doc_i+WINDOW_SIZE] @@ -115,7 +119,9 @@ def similar_tokens(query: str, return similarities + def find_spans(arr, threshold=0.5): + ''' ''' # Create an array that is 1 where arr is above threshold, and padded with 0s at the edges is_over_threshold = np.concatenate(([0], np.greater(arr, threshold), [0])) @@ -125,6 +131,7 @@ def find_spans(arr, threshold=0.5): ends = np.where(diffs < 0)[0] return list(zip(starts, ends - 1)) + def tune_percentile(xs, percentile): ''' 0-out all elements below percentile. Essentially, this will leave some `1-percentile` percentage of the document highlighted. ''' @@ -133,6 +140,7 @@ def tune_percentile(xs, percentile): xs[xs < p] *= 0 return xs + def find_similar(query, pdf_key): ''' query: a query that you want to find similar passages to @@ -144,6 +152,7 @@ def find_similar(query, pdf_key): print('Calculating embeddings') return similar_tokens(query, pdf_key) + def segments(similarities, document, threshold=0.0): out = '' last_thresh = False # for finding edge @@ -169,7 +178,9 @@ def segments(similarities, document, threshold=0.0): return out + def rank(segments, rank_fn): + '''Sort segments according to an aggregate function of their scores.''' scores = [] for text, sims in segments: scores.append(rank_fn(sims)) @@ -178,6 +189,13 @@ def rank(segments, rank_fn): out.append(text) return out + +def denoise_similarities(similarities, window_size=2000, poly_order=2): + ''' Apply Savitzky-Golay filter to smooth out the similarity scores. ''' + denoised_scores = savgol_filter(similarities, window_size, poly_order) + return denoised_scores + + # query = 'whats in it for participants to the blockchain?' # query = 'how does this protect my anonymity?' # query = 'im concerned my hdd isnt big enough' @@ -190,13 +208,6 @@ def rank(segments, rank_fn): last_edge = int(len(similarities) * 0.02) similarities[-last_edge:] = similarities[-last_edge] - -def denoise_similarities(similarities, window_size=2000, poly_order=2): - # Apply Savitzky-Golay filter to smooth out the scores - denoised_scores = savgol_filter(similarities, window_size, poly_order) - return denoised_scores - - # Denoise salience scores d_similarities = denoise_similarities(similarities) d_similarities -= d_similarities.min() # normalize From 08b195b9492c666ce5a5da0f8a85f66dc1795f89 Mon Sep 17 00:00:00 2001 From: mobius Date: Tue, 18 Jul 2023 11:24:50 -0700 Subject: [PATCH 04/11] checkin --- sandbox/t11_document_chat.py | 44 ++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py index 596acbe..64e796e 100644 --- a/sandbox/t11_document_chat.py +++ b/sandbox/t11_document_chat.py @@ -18,6 +18,7 @@ import numpy as np from tqdm import tqdm from scipy.signal import savgol_filter +import pickle WINDOW_SIZE = 300 STRIDE = 100 @@ -40,6 +41,24 @@ def read_pdf(path): text += '\n' + page.extract_text() return text +def get_file_name(path): + full_name = os.path.basename(path) + name, ext = os.path.splitext(full_name) + return name + +def load_pkl(pdf_key): + path = f'{pdf_key}.pkl' + if os.path.exists(path): + with open(path, 'rb') as f: + xs = pickle.load(f) + return xs + return None + +def save_pkl(pdf_key, xs): + with open(f'{pdf_key}.pkl', 'wb') as f: + pickle.dump(xs, f) + print(f'Saved: {pdf_key}') + try: pdf_cache except: @@ -53,6 +72,10 @@ def read_pdf(path): embedding_cache except: embedding_cache = {} + for k in pdf_paths.keys(): + emb = load_pkl(k) + if emb is not None: + embedding_cache[k] = emb ################################################## @@ -113,7 +136,10 @@ def similar_tokens(query: str, similarities[j] += sim overlaps[j] += 1 - embedding_cache[pdf_key] = embeddings + if not embedding_is_saved: + embedding_cache[pdf_key] = embeddings + save_pkl(pdf_key, embeddings) + # Average the similarities with the number of overlaps similarities /= np.where(overlaps != 0, overlaps, 1) @@ -192,20 +218,21 @@ def rank(segments, rank_fn): def denoise_similarities(similarities, window_size=2000, poly_order=2): ''' Apply Savitzky-Golay filter to smooth out the similarity scores. ''' - denoised_scores = savgol_filter(similarities, window_size, poly_order) - return denoised_scores + return savgol_filter(similarities, window_size, poly_order) # query = 'whats in it for participants to the blockchain?' # query = 'how does this protect my anonymity?' # query = 'im concerned my hdd isnt big enough' -query = 'who contributed to this paper?' -pdf_key = 'bitcoin' +# query = 'who contributed to this paper?' + +query = 'how close can my silver mine be to a farm?' +pdf_key = 'idaho' document = pdf_cache[pdf_key] similarities = find_similar(query, pdf_key) -# remove outlier -last_edge = int(len(similarities) * 0.02) +# remove outlier at end +last_edge = int(len(similarities) * 0.01) similarities[-last_edge:] = similarities[-last_edge] # Denoise salience scores @@ -215,12 +242,11 @@ def denoise_similarities(similarities, window_size=2000, poly_order=2): d_similarities = tune_percentile(d_similarities, percentile=75) segs = segments(d_similarities, document) -ranked_segments = rank(segs, np.mean) +ranked_segments = rank(segs, np.max) import matplotlib.pyplot as plt plt.plot(similarities) plt.plot(d_similarities) -# plt.plot(tune_percentile(similarities)) plt.show() for x in ranked_segments: From 838493068576a0231778d2ea74564234c1ed1a2b Mon Sep 17 00:00:00 2001 From: mobius Date: Tue, 18 Jul 2023 11:58:07 -0700 Subject: [PATCH 05/11] checkin light refactor --- sandbox/t11_document_chat.py | 168 +++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 76 deletions(-) diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py index 64e796e..2d7d94e 100644 --- a/sandbox/t11_document_chat.py +++ b/sandbox/t11_document_chat.py @@ -30,6 +30,44 @@ } +# cache = [ +# { +# 'bitcoin': '~/Documents/misc/bitcoin.pdf', +# 'window_size': 300, +# 'stride': 100, +# 'percentile': 80, +# 'pdf_text': None, +# 'embedding': None, +# }, +# { +# 'idaho': '~/Documents/misc/land use and development code.pdf', +# 'window_size': 300, +# 'stride': 100, +# 'percentile': 80, +# 'pdf_text': None, +# 'embedding': None, +# } +# ] + + +################################################## +# Load Model + +try: + already_loaded +except: + model = INSTRUCTOR('hkunlp/instructor-base') + already_loaded = True + +query_instruction = 'Represent the Science question for retrieving supporting documents: ' +embed_instruction = 'Represent the Science document for retrieval: ' + + +def embed(xs: List[str]): + ''' Build sentence embeddings for each sentence in `xs` ''' + return model.encode(xs) + + ################################################## # PDF @@ -41,11 +79,13 @@ def read_pdf(path): text += '\n' + page.extract_text() return text + def get_file_name(path): full_name = os.path.basename(path) name, ext = os.path.splitext(full_name) return name + def load_pkl(pdf_key): path = f'{pdf_key}.pkl' if os.path.exists(path): @@ -54,81 +94,52 @@ def load_pkl(pdf_key): return xs return None + def save_pkl(pdf_key, xs): - with open(f'{pdf_key}.pkl', 'wb') as f: + path = f'{pdf_key}.pkl' + with open(path, 'wb') as f: pickle.dump(xs, f) - print(f'Saved: {pdf_key}') - -try: - pdf_cache -except: - pdf_cache = {} + print(f'Saved: {path}') - print('Loading PDF Text') - for k, path in pdf_paths.items(): - pdf_cache[k] = read_pdf(path) -try: - embedding_cache -except: +def embed_documents(pdf_cache): + print('Preparing Embeddings') embedding_cache = {} - for k in pdf_paths.keys(): - emb = load_pkl(k) - if emb is not None: - embedding_cache[k] = emb - - -################################################## -# Load Model - -try: - already_loaded -except: - model = INSTRUCTOR('hkunlp/instructor-base') - already_loaded = True - -query_instruction = 'Represent the Science question for retrieving supporting documents: ' -embed_instruction = 'Represent the Science document for retrieval: ' - - -def embed(xs: List[str]): - ''' Build sentence embeddings for each sentence in `xs` ''' - return model.encode(xs) - - -def similar_tokens(query: str, - pdf_key: str, - ) -> List[float]: - '''Compare a `query to a strided window over a `document`.''' - global embedding_cache + for pdf_key, document in pdf_cache.items(): + # Try to load embeddings from disk + embeddings = load_pkl(pdf_key) + if embeddings is not None: + print(f'Loaded {pdf_key} embeddings') + embedding_cache[pdf_key] = embeddings + else: # If not found, then calculate + print(f'Preparing embeddings for {pdf_key}') + embeddings = [] + # Loop through the document with given stride + offsets = list(enumerate(range(0, len(document) - WINDOW_SIZE + 1, STRIDE))) + for emb_i, doc_i in tqdm(offsets): + # Extract the chunk from document + chunk = document[doc_i:doc_i+WINDOW_SIZE] + # Embed the chunk + chunk_e = embed([[embed_instruction, chunk]]) + embeddings.append(chunk_e) + + embedding_cache[pdf_key] = embeddings + save_pkl(pdf_key, embeddings) + return embedding_cache + + +def similar_tokens(query: str, embeddings: List[np.ndarray]) -> List[float]: + '''Compare a `query` to a strided window over a `document`.''' # Initialize a numpy array for storing similarities and overlaps - document = pdf_cache[pdf_key] - - similarities = np.zeros(len(document), dtype=float) - overlaps = np.zeros(len(document), dtype=float) + document_length = len(embeddings) * STRIDE + WINDOW_SIZE - 1 # Derive the document length from embeddings + similarities = np.zeros(document_length, dtype=float) + overlaps = np.zeros(document_length, dtype=float) query_e = embed([[query_instruction, query]]) - if pdf_key in embedding_cache: - embedding_is_saved = True - embeddings = embedding_cache[pdf_key] - else: - embedding_is_saved = False - embeddings = [] - # Loop through the document with given stride - # listify offsets to help out tqdm - offsets = list(enumerate(range(0, len(document) - WINDOW_SIZE + 1, STRIDE))) - for emb_i, doc_i in tqdm(offsets): - # Extract the chunk from document - chunk = document[doc_i:doc_i+WINDOW_SIZE] - - # Similarity - if embedding_is_saved: - chunk_e = embeddings[emb_i] - else: - chunk_e = embed([[embed_instruction, chunk]]) - embeddings.append(chunk_e) + offsets = list(range(0, document_length - WINDOW_SIZE + 1, STRIDE)) + for chunk_e, doc_i in tqdm(zip(embeddings, offsets)): sim = cosine_similarity(query_e, chunk_e)[0][0] # Update the similarities and overlaps array @@ -136,15 +147,23 @@ def similar_tokens(query: str, similarities[j] += sim overlaps[j] += 1 - if not embedding_is_saved: - embedding_cache[pdf_key] = embeddings - save_pkl(pdf_key, embeddings) - # Average the similarities with the number of overlaps similarities /= np.where(overlaps != 0, overlaps, 1) - return similarities +try: + pdf_cache +except: + pdf_cache = {} + + print('Loading PDF Text') + for k, path in pdf_paths.items(): + pdf_cache[k] = read_pdf(path) + +try: + embedding_cache +except: + embedding_cache = embed_documents(pdf_cache) def find_spans(arr, threshold=0.5): ''' ''' @@ -172,11 +191,8 @@ def find_similar(query, pdf_key): query: a query that you want to find similar passages to pdf_key: {bitcoin, idaho} ''' - global embedding_cache - - # Embeddings - print('Calculating embeddings') - return similar_tokens(query, pdf_key) + embeddings = embedding_cache[pdf_key] + return similar_tokens(query, embeddings) def segments(similarities, document, threshold=0.0): @@ -242,7 +258,7 @@ def denoise_similarities(similarities, window_size=2000, poly_order=2): d_similarities = tune_percentile(d_similarities, percentile=75) segs = segments(d_similarities, document) -ranked_segments = rank(segs, np.max) +ranked_segments = rank(segs, np.max)[-3:] import matplotlib.pyplot as plt plt.plot(similarities) From 9302eb7839d6c74cd2c01f1d1a1e38d692833614 Mon Sep 17 00:00:00 2001 From: mobius Date: Tue, 18 Jul 2023 11:58:37 -0700 Subject: [PATCH 06/11] don't track pickles --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 289adb5..f957775 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,8 @@ test.md test.txt *.log debug_transcription.wav +*.pkl # VSCode .vscode/ -node_modules/ \ No newline at end of file +node_modules/ From 698ab1c59963aac04318e9b28f07db58d174da10 Mon Sep 17 00:00:00 2001 From: mobius Date: Tue, 18 Jul 2023 13:00:40 -0700 Subject: [PATCH 07/11] significant refactor --- sandbox/t11_document_chat.py | 259 +++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 115 deletions(-) diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py index 2d7d94e..ce0b8e7 100644 --- a/sandbox/t11_document_chat.py +++ b/sandbox/t11_document_chat.py @@ -19,36 +19,21 @@ from tqdm import tqdm from scipy.signal import savgol_filter import pickle - -WINDOW_SIZE = 300 -STRIDE = 100 -PERCENTILE = 80 - -pdf_paths = { - 'bitcoin': '~/Documents/misc/bitcoin.pdf', - 'idaho': '~/Documents/misc/land use and development code.pdf' -} - - -# cache = [ -# { -# 'bitcoin': '~/Documents/misc/bitcoin.pdf', -# 'window_size': 300, -# 'stride': 100, -# 'percentile': 80, -# 'pdf_text': None, -# 'embedding': None, -# }, -# { -# 'idaho': '~/Documents/misc/land use and development code.pdf', -# 'window_size': 300, -# 'stride': 100, -# 'percentile': 80, -# 'pdf_text': None, -# 'embedding': None, -# } -# ] - +from dataclasses import dataclass + +@dataclass +class Meta: + name: str + path: str + window_size: int + stride: int + percentile: int + pdf_text: str + embeddings: List[np.ndarray] + query_instruction: str + embed_instruction: str + denoise_window_size: int + denoise_poly_order: int ################################################## # Load Model @@ -59,8 +44,6 @@ model = INSTRUCTOR('hkunlp/instructor-base') already_loaded = True -query_instruction = 'Represent the Science question for retrieving supporting documents: ' -embed_instruction = 'Represent the Science document for retrieval: ' def embed(xs: List[str]): @@ -71,13 +54,16 @@ def embed(xs: List[str]): ################################################## # PDF -def read_pdf(path): - path = os.path.expanduser(path) - reader = PdfReader(path) - text = '' - for page in reader.pages: - text += '\n' + page.extract_text() - return text +def load_pdf(meta: Meta): + ''' Mutate `meta` to include `pdf_text` ''' + if meta.pdf_text is None: + path = meta.path + path = os.path.expanduser(path) + reader = PdfReader(path) + text = '' + for page in reader.pages: + text += '\n' + page.extract_text() + meta.pdf_text = text def get_file_name(path): @@ -102,48 +88,58 @@ def save_pkl(pdf_key, xs): print(f'Saved: {path}') -def embed_documents(pdf_cache): - print('Preparing Embeddings') - embedding_cache = {} - for pdf_key, document in pdf_cache.items(): - # Try to load embeddings from disk - embeddings = load_pkl(pdf_key) - if embeddings is not None: - print(f'Loaded {pdf_key} embeddings') - embedding_cache[pdf_key] = embeddings - else: # If not found, then calculate - print(f'Preparing embeddings for {pdf_key}') - embeddings = [] - # Loop through the document with given stride - offsets = list(enumerate(range(0, len(document) - WINDOW_SIZE + 1, STRIDE))) - for emb_i, doc_i in tqdm(offsets): - # Extract the chunk from document - chunk = document[doc_i:doc_i+WINDOW_SIZE] - # Embed the chunk - chunk_e = embed([[embed_instruction, chunk]]) - embeddings.append(chunk_e) - - embedding_cache[pdf_key] = embeddings - save_pkl(pdf_key, embeddings) - return embedding_cache - - -def similar_tokens(query: str, embeddings: List[np.ndarray]) -> List[float]: +def load_embeddings(meta): + ''' Mutate `meta` to include embeddings. ''' + pdf_key = meta.name + document = meta.pdf_text + window_size = meta.window_size + stride = meta.stride + embed_instruction = meta.embed_instruction + + # Try to load embeddings from disk + embeddings = load_pkl(pdf_key) + if embeddings is not None: + print(f'Loaded {pdf_key} embeddings') + meta.embeddings = embeddings + else: # If not found, then calculate + print(f'Preparing embeddings for {pdf_key}') + embeddings = [] + # Loop through the document with given stride + offsets = list( + enumerate(range(0, len(document) - window_size + 1, stride))) + for emb_i, doc_i in tqdm(offsets): + # Extract the chunk from document + chunk = document[doc_i:doc_i+window_size] + # Embed the chunk + chunk_e = embed([[embed_instruction, chunk]]) + embeddings.append(chunk_e) + + meta.embedding = embeddings + save_pkl(pdf_key, embeddings) + + +def similar_tokens(query: str, meta: Meta) -> List[float]: '''Compare a `query` to a strided window over a `document`.''' + embeddings = meta.embeddings + pdf_key = meta.name + document = meta.pdf_text + window_size = meta.window_size + stride = meta.stride + query_instruction = meta.query_instruction # Initialize a numpy array for storing similarities and overlaps - document_length = len(embeddings) * STRIDE + WINDOW_SIZE - 1 # Derive the document length from embeddings + document_length = len(embeddings) * stride + window_size - 1 # Derive the document length from embeddings similarities = np.zeros(document_length, dtype=float) overlaps = np.zeros(document_length, dtype=float) query_e = embed([[query_instruction, query]]) # Loop through the document with given stride - offsets = list(range(0, document_length - WINDOW_SIZE + 1, STRIDE)) + offsets = list(range(0, document_length - window_size + 1, stride)) for chunk_e, doc_i in tqdm(zip(embeddings, offsets)): sim = cosine_similarity(query_e, chunk_e)[0][0] # Update the similarities and overlaps array - for j in range(doc_i, doc_i + WINDOW_SIZE): + for j in range(doc_i, doc_i + window_size): similarities[j] += sim overlaps[j] += 1 @@ -151,19 +147,6 @@ def similar_tokens(query: str, embeddings: List[np.ndarray]) -> List[float]: similarities /= np.where(overlaps != 0, overlaps, 1) return similarities -try: - pdf_cache -except: - pdf_cache = {} - - print('Loading PDF Text') - for k, path in pdf_paths.items(): - pdf_cache[k] = read_pdf(path) - -try: - embedding_cache -except: - embedding_cache = embed_documents(pdf_cache) def find_spans(arr, threshold=0.5): ''' ''' @@ -186,15 +169,6 @@ def tune_percentile(xs, percentile): return xs -def find_similar(query, pdf_key): - ''' - query: a query that you want to find similar passages to - pdf_key: {bitcoin, idaho} - ''' - embeddings = embedding_cache[pdf_key] - return similar_tokens(query, embeddings) - - def segments(similarities, document, threshold=0.0): out = '' last_thresh = False # for finding edge @@ -227,7 +201,7 @@ def rank(segments, rank_fn): for text, sims in segments: scores.append(rank_fn(sims)) out = [] - for score, (text, sims) in sorted(zip(scores, segments)): + for score, (text, sims) in reversed(sorted(zip(scores, segments))): out.append(text) return out @@ -237,34 +211,89 @@ def denoise_similarities(similarities, window_size=2000, poly_order=2): return savgol_filter(similarities, window_size, poly_order) +def top_segments(query, pdf_name, top_n): + meta = cache[pdf_name] + document = meta.pdf_text + denoise_window_size = meta.denoise_window_size + denoise_poly_order = meta.denoise_poly_order + percentile = meta.percentile + similarities = similar_tokens(query, meta) + + # remove outlier at end + last_edge = int(len(similarities) * 0.01) + similarities[-last_edge:] = similarities[-last_edge] + + # Denoise salience scores + similarities = tune_percentile(similarities, percentile) + d_similarities = denoise_similarities(similarities, denoise_window_size, denoise_poly_order) + d_similarities -= d_similarities.min() # normalize + d_similarities /= d_similarities.max() + d_similarities = tune_percentile(d_similarities, percentile) + + segs = segments(d_similarities, document) + ranked_segments = rank(segs, np.mean)[:top_n] + + import matplotlib.pyplot as plt + plt.plot(similarities) + plt.plot(d_similarities) + plt.show() + + return ranked_segments + + + +try: + cache_is_loaded +except: + print('Populating cache') + cache = { + 'bitcoin': Meta( + name='bitcoin', + path='~/Documents/misc/bitcoin.pdf', + window_size=300, + stride=100, + pdf_text=None, + embeddings=None, + query_instruction='Represent the Science question for retrieving supporting documents: ', + embed_instruction='Represent the Science document for retrieval: ', + denoise_window_size=2000, + denoise_poly_order=2, + percentile=80, + ), + 'idaho': Meta( + name='idaho', + path='~/Documents/misc/land use and development code.pdf', + window_size=300, + stride=100, + pdf_text=None, + embeddings=None, + query_instruction='Represent the wikipedia question for retrieving supporting documents: ', + embed_instruction='Represent the wikipedia document for retrieval: ', + denoise_window_size=5000, + denoise_poly_order=2, + percentile=80, + ) + } + for k, m in cache.items(): + load_pdf(m) + load_embeddings(m) + cache_is_loaded = True + +# pdf_name = 'bitcoin' # query = 'whats in it for participants to the blockchain?' # query = 'how does this protect my anonymity?' # query = 'im concerned my hdd isnt big enough' # query = 'who contributed to this paper?' -query = 'how close can my silver mine be to a farm?' -pdf_key = 'idaho' -document = pdf_cache[pdf_key] -similarities = find_similar(query, pdf_key) - -# remove outlier at end -last_edge = int(len(similarities) * 0.01) -similarities[-last_edge:] = similarities[-last_edge] - -# Denoise salience scores -d_similarities = denoise_similarities(similarities) -d_similarities -= d_similarities.min() # normalize -d_similarities /= d_similarities.max() -d_similarities = tune_percentile(d_similarities, percentile=75) - -segs = segments(d_similarities, document) -ranked_segments = rank(segs, np.max)[-3:] +pdf_name = 'idaho' +# query = 'how close can my silver mine be to a farm?' +# query = 'how do houses on the lake need to be addressed? marine addressing.' +# query = 'How can I rezone my property? Rezoning.' +# query = 'What signs can I put on my property?' +query = 'How does this document define "sign"?' -import matplotlib.pyplot as plt -plt.plot(similarities) -plt.plot(d_similarities) -plt.show() +ranked_segments = top_segments(query, pdf_name, top_n=3) -for x in ranked_segments: - print('--------------------------------------------------') +for x in reversed(ranked_segments): + print('\n\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n') print(x) From 5e6aa1e602b91f53378ab9344aafbaf0803d938f Mon Sep 17 00:00:00 2001 From: mobius Date: Tue, 18 Jul 2023 16:43:20 -0700 Subject: [PATCH 08/11] add colorized visualization in html --- sandbox/t11_document_chat.py | 226 ++++++++++++++++++++++++++++------- 1 file changed, 184 insertions(+), 42 deletions(-) diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py index ce0b8e7..aef1e6e 100644 --- a/sandbox/t11_document_chat.py +++ b/sandbox/t11_document_chat.py @@ -28,7 +28,7 @@ class Meta: window_size: int stride: int percentile: int - pdf_text: str + text: str embeddings: List[np.ndarray] query_instruction: str embed_instruction: str @@ -54,17 +54,21 @@ def embed(xs: List[str]): ################################################## # PDF -def load_pdf(meta: Meta): - ''' Mutate `meta` to include `pdf_text` ''' - if meta.pdf_text is None: +def load_doc(meta: Meta): + ''' Mutate `meta` to include `text` ''' + if meta.text is None: path = meta.path path = os.path.expanduser(path) - reader = PdfReader(path) - text = '' - for page in reader.pages: - text += '\n' + page.extract_text() - meta.pdf_text = text - + _, ext = os.path.splitext(path) + if ext == '.pdf': + reader = PdfReader(path) + text = '' + for page in reader.pages: + text += '\n' + page.extract_text() + meta.text = text + else: + with open(path, 'r') as f: + meta.text = f.read() def get_file_name(path): full_name = os.path.basename(path) @@ -91,7 +95,7 @@ def save_pkl(pdf_key, xs): def load_embeddings(meta): ''' Mutate `meta` to include embeddings. ''' pdf_key = meta.name - document = meta.pdf_text + document = meta.text window_size = meta.window_size stride = meta.stride embed_instruction = meta.embed_instruction @@ -114,7 +118,7 @@ def load_embeddings(meta): chunk_e = embed([[embed_instruction, chunk]]) embeddings.append(chunk_e) - meta.embedding = embeddings + meta.embeddings = embeddings save_pkl(pdf_key, embeddings) @@ -122,7 +126,7 @@ def similar_tokens(query: str, meta: Meta) -> List[float]: '''Compare a `query` to a strided window over a `document`.''' embeddings = meta.embeddings pdf_key = meta.name - document = meta.pdf_text + document = meta.text window_size = meta.window_size stride = meta.stride query_instruction = meta.query_instruction @@ -211,9 +215,9 @@ def denoise_similarities(similarities, window_size=2000, poly_order=2): return savgol_filter(similarities, window_size, poly_order) -def top_segments(query, pdf_name, top_n): - meta = cache[pdf_name] - document = meta.pdf_text +def top_segments(query, doc_name, top_n, visualize=False): + meta = cache[doc_name] + document = meta.text denoise_window_size = meta.denoise_window_size denoise_poly_order = meta.denoise_poly_order percentile = meta.percentile @@ -224,8 +228,10 @@ def top_segments(query, pdf_name, top_n): similarities[-last_edge:] = similarities[-last_edge] # Denoise salience scores - similarities = tune_percentile(similarities, percentile) - d_similarities = denoise_similarities(similarities, denoise_window_size, denoise_poly_order) + # similarities = tune_percentile(similarities, percentile) + d_similarities = denoise_similarities(similarities, + denoise_window_size, + denoise_poly_order) d_similarities -= d_similarities.min() # normalize d_similarities /= d_similarities.max() d_similarities = tune_percentile(d_similarities, percentile) @@ -233,14 +239,108 @@ def top_segments(query, pdf_name, top_n): segs = segments(d_similarities, document) ranked_segments = rank(segs, np.mean)[:top_n] - import matplotlib.pyplot as plt - plt.plot(similarities) - plt.plot(d_similarities) - plt.show() + if visualize: + import matplotlib.pyplot as plt + plt.plot(similarities) + plt.plot(d_similarities) + plt.show() + + return ranked_segments, d_similarities + + +################################################## +# Visualization + +import webbrowser +from html import escape +import os +from typing import List, Tuple +import numpy as np + + +def hex_to_rgb(hex_color: str): + """ Converts a hexadecimal color string to an RGB tuple. """ + hex_color = hex_color.lstrip('#') + return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) + + +def interpolate_color(similarity: float, color): + """ Scales an RGB color tuple by a similarity factor. """ + rgb_color = hex_to_rgb(color) + return tuple(similarity * channel for channel in rgb_color) + + +def add_color(a, b): + """ Adds two RGB color tuples element-wise. """ + return tuple(ai + bi for ai, bi in zip(a, b)) + + +def rgb_to_hex(rgb): + """ Converts an RGB color tuple to a hexadecimal color string. """ + r,g,b = rgb + rgb = (int(r), int(g), int(b)) + return '#%02x%02x%02x' % rgb + + +def colorize_text(text: str, queries_and_colors_and_similarity: List[Tuple[str, str, np.ndarray]]) -> str: + """ Colorizes text based on similarity scores for different queries. """ + # Initialize an empty list for the HTML parts + html_parts = [] - return ranked_segments + # Get the color names and similarity scores + color_names = [item[1] for item in queries_and_colors_and_similarity] + similarities_tr = zip(*[item[2] for item in queries_and_colors_and_similarity]) # transposed + # Loop through the text with the corresponding similarity + for similarities, char in zip(similarities_tr, text): + # Initialize color as black + color = (0, 0, 0) + for name, sim in zip(color_names, similarities): + color = add_color(color, interpolate_color(sim, name)) + hex_color = rgb_to_hex(color) + if char == '\n': + html_parts.append('
') + continue + char = escape(char) + html_parts.append(f'{char}') + + return ''.join(html_parts) + + +def create_colorful_html(queries_and_colors_and_similarity: List[Tuple[str, str, np.ndarray]], + document: str) -> None: + """ Creates an HTML page with colorized query and document text. """ + # Colorize the queries and document + queries_html = [colorize_text(query, [(query, color, np.ones(len(query)))]) + for query, color, self_similarities + in queries_and_colors_and_similarity] + document_html = colorize_text(document, queries_and_colors_and_similarity) + + # Combine the HTML strings for the queries and document + html = ''' + + + +''' + for i, query_html in enumerate(queries_html): + html += f'

QUERY {i+1}:

\n{query_html}\n

\n' + html += '
\n

\n

DOCUMENT:

\n' + document_html + + # Write the HTML string to a temporary file + with open('temp.html', 'w') as f: + f.write(html) + + # Open the HTML file in the default web browser + webbrowser.open('file://' + os.path.realpath('temp.html')) + + +################################################## +# Ranked Segments try: cache_is_loaded @@ -252,7 +352,7 @@ def top_segments(query, pdf_name, top_n): path='~/Documents/misc/bitcoin.pdf', window_size=300, stride=100, - pdf_text=None, + text=None, embeddings=None, query_instruction='Represent the Science question for retrieving supporting documents: ', embed_instruction='Represent the Science document for retrieval: ', @@ -265,35 +365,77 @@ def top_segments(query, pdf_name, top_n): path='~/Documents/misc/land use and development code.pdf', window_size=300, stride=100, - pdf_text=None, + text=None, embeddings=None, query_instruction='Represent the wikipedia question for retrieving supporting documents: ', embed_instruction='Represent the wikipedia document for retrieval: ', denoise_window_size=5000, denoise_poly_order=2, percentile=80, - ) + ), + '2001_positive': Meta( + name='2001_positive', + path='./2001_positive.md', + window_size=500, + stride=25, + text=None, + embeddings=None, + query_instruction='Represent the book review question for retrieving supporting documents: ', + embed_instruction='Represent the book review document for retrieval: ', + denoise_window_size=250, + denoise_poly_order=3, + percentile=80, + ), + '2001_negative': Meta( + name='2001_negative', + path='./2001_negative.md', + window_size=500, + stride=25, + text=None, + embeddings=None, + query_instruction='Represent the book review question for retrieving supporting documents: ', + embed_instruction='Represent the book review document for retrieval: ', + denoise_window_size=250, + denoise_poly_order=3, + percentile=80, + ), } for k, m in cache.items(): - load_pdf(m) + load_doc(m) load_embeddings(m) cache_is_loaded = True -# pdf_name = 'bitcoin' -# query = 'whats in it for participants to the blockchain?' -# query = 'how does this protect my anonymity?' -# query = 'im concerned my hdd isnt big enough' -# query = 'who contributed to this paper?' +# # doc_name = 'bitcoin' +# # query = 'whats in it for participants to the blockchain?' +# # query = 'how does this protect my anonymity?' +# # query = 'im concerned my hdd isnt big enough' +# # query = 'who contributed to this paper?' + +# doc_name = 'idaho' +# # query = 'how close can my silver mine be to a farm?' +# # query = 'how do houses on the lake need to be addressed? marine addressing.' +# # query = 'How can I rezone my property? Rezoning.' +# # query = 'What signs can I put on my property?' +# query = 'How does this document define "sign"?' -pdf_name = 'idaho' -# query = 'how close can my silver mine be to a farm?' -# query = 'how do houses on the lake need to be addressed? marine addressing.' -# query = 'How can I rezone my property? Rezoning.' -# query = 'What signs can I put on my property?' -query = 'How does this document define "sign"?' +# ranked_segments, sims = top_segments(query, doc_name, top_n=3) -ranked_segments = top_segments(query, pdf_name, top_n=3) +# for x in reversed(ranked_segments): +# print('\n\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n') +# print(x) -for x in reversed(ranked_segments): - print('\n\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n') - print(x) + +################################################## +# Colorize document + +doc_name = '2001_positive' +queries = [ + ('The reviewer loves the book.', '00cc00'), + ('The reviewer hates the book.', 'cc0000'), +] +query_tups = [] +for (q, c) in queries: + _, sims = top_segments(q, doc_name, top_n=0) + query_tups.append((q, c, sims)) + +create_colorful_html(query_tups, cache[doc_name].text) From 89826adfecc59f7b47440eac93e7ee451db04f68 Mon Sep 17 00:00:00 2001 From: mobius Date: Sat, 29 Jul 2023 18:36:31 -0700 Subject: [PATCH 09/11] checkin (non-working) --- uniteai/contrib/state_loop.py | 351 ++++++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 uniteai/contrib/state_loop.py diff --git a/uniteai/contrib/state_loop.py b/uniteai/contrib/state_loop.py new file mode 100644 index 0000000..8438393 --- /dev/null +++ b/uniteai/contrib/state_loop.py @@ -0,0 +1,351 @@ +''' + +A Language-model-based State Machine + +State + Action -> State' + Response + +''' + +import re +from lsprotocol.types import ( + CodeAction, + CodeActionKind, + CodeActionParams, + Command, + Range, + TextDocumentIdentifier, + WorkspaceEdit, +) +from concurrent.futures import ThreadPoolExecutor +import openai +from threading import Event +from thespian.actors import Actor +import argparse +import logging + +from uniteai.edit import init_block, cleanup_block, BlockJob +from uniteai.common import extract_range, find_block, mk_logger, get_nested +from uniteai.server import Server + + +################################################## +# + +def parse_state_and_response(string): + state_pattern = r"START_STATE:\n(.*?)\nEND_STATE:" + response_pattern = r"START_RESPONSE:\n(.*?)\nEND_RESPONSE:" + + state = re.search(state_pattern, string, re.DOTALL) + response = re.search(response_pattern, string, re.DOTALL) + + if state: + state = state.group(1) + else: + state = "" + + if response: + response = response.group(1) + else: + response = "" + + return state, response + +################################################## +# StateLoop + +STATE_START_TAG = ':STATE_START_TAG:' +STATE_END_TAG = ':STATE_END_TAG:' +NAME = 'state_loop' +log = mk_logger(NAME, logging.DEBUG) + +class StateLoopActor(Actor): + def __init__(self): + log.debug('ACTOR INIT') + self.is_running = False + self.executor = ThreadPoolExecutor(max_workers=3) + self.current_future = None + self.should_stop = Event() + self.tags = [START_TAG, END_TAG] + + def receiveMessage(self, msg, sender): + command = msg.get('command') + doc = msg.get('doc') + edits = msg.get('edits') + log.debug(f''' +%%%%%%%%%% +ACTOR RECV: {msg["command"]} +ACTOR STATE: +is_running: {self.is_running} +should_stop: {self.should_stop.is_set()} +current_future: {self.current_future} + +EDITS STATE: +job_thread alive: {edits.job_thread.is_alive() if edits and edits.job_thread else "NOT STARTED"} +%%%%%%%%%%''') + if command == 'start': + uri = msg.get('uri') + range = msg.get('range') + prompt = msg.get('prompt') + engine = msg.get('engine') + max_length = msg.get('max_length') + edits = msg.get('edits') + + # check if block already exists + start_ixs, end_ixs = find_block(START_TAG, + END_TAG, + doc) + + if not (start_ixs and end_ixs): + init_block(NAME, self.tags, uri, range, edits) + + self.start(uri, range, prompt, engine, max_length, edits) + + elif command == 'stop': + self.stop() + + def init_state_block(): + pass + + def + + + def start(self, uri, range, prompt, engine, max_length, edits): + if self.is_running: + log.info('WARN: ON_START_BUT_RUNNING') + return + log.debug('ACTOR START') + + self.is_running = True + self.should_stop.clear() + + def f(uri_, prompt_, engine_, max_length_, should_stop_, edits_): + ''' Compose the streaming fn with some cleanup. ''' + openai_stream_fn(uri_, prompt_, engine_, max_length_, + should_stop_, edits_) + + # Cleanup + log.debug('CLEANING UP') + cleanup_block(NAME, self.tags, uri_, edits_) + self.is_running = False + self.current_future = None + self.should_stop.clear() + + self.current_future = self.executor.submit( + f, uri, prompt, engine, max_length, self.should_stop, edits + ) + log.debug('START CAN RETURN') + + def stop(self): + log.debug('ACTOR STOP') + if not self.is_running: + log.info('WARN: ON_STOP_BUT_STOPPED') + + self.should_stop.set() + + if self.current_future: + self.current_future.result() # block, wait to finish + self.current_future = None + log.debug('FINALLY STOPPED') + + +################################################## +# StateLoop + +COMPLETION_ENGINES = [ + "text-davinci-003", + "text-davinci-002", + "ada", + "babbage", + "curie", + "davinci", +] + +CHAT_ENGINES = [ + "gpt-3.5-turbo", + "gpt-3.5-turbo-0613", + "gpt-4", +] + + +def openai_autocomplete(engine, text, max_length): + ''' Stream responses from StateLoop's API as a generator. ''' + if engine in COMPLETION_ENGINES: + response = openai.Completion.create( + engine=engine, + prompt=text, + max_tokens=max_length, + stream=True + ) + for message in response: + generated_text = message['choices'][0]['text'] + yield generated_text + elif engine in CHAT_ENGINES: + response = openai.ChatCompletion.create( + model=engine, + messages=[{"role": "user", "content": text}], + stream=True + ) + for message in response: + # different json structure than completion endpoint + delta = message['choices'][0]['delta'] + if 'content' in delta: + generated_text = delta['content'] + yield generated_text + +def openai_stream_fn(uri, prompt, engine, max_length, stop_event, edits): + log.debug(f'START: OPENAI_STREAM_FN, max_length={max_length}') + try: + # Stream the results to LSP Client + running_text = '' + for new_text in openai_autocomplete(engine, prompt, max_length): + # For breaking out early + if stop_event.is_set(): + log.debug('STREAM_FN received STOP EVENT') + break + log.debug(f'NEW: {new_text}') + # ignore empty strings + if len(new_text) == 0: + continue + + running_text += new_text + job = BlockJob( + uri=uri, + start_tag=START_TAG, + end_tag=END_TAG, + text=f'\n{running_text}\n', + strict=False, + ) + edits.add_job(NAME, job) + + # Streaming is done, and those added jobs were all non-strict. Let's + # make sure to have one final strict job. Streaming jobs are ok to be + # dropped, but we need to make sure it does finalize, eg before a + # strict delete-tags job is added. + job = BlockJob( + uri=uri, + start_tag=START_TAG, + end_tag=END_TAG, + text=f'\n{running_text}\n', + strict=True, + ) + edits.add_job(NAME, job) + log.debug('STREAM COMPLETE') + except Exception as e: + log.error(f'Error: StateLoop, {e}') + + +def code_action_gpt(engine, max_length, params: CodeActionParams): + '''Trigger a GPT Autocompletion response. A code action calls a command, + which is set up below to `tell` the actor to start streaming a response.''' + text_document = params.text_document + range = params.range + return CodeAction( + title='StateLoop GPT', + kind=CodeActionKind.Refactor, + command=Command( + title='StateLoop GPT', + command='command.openaiAutocompleteStream', + # Note: these arguments get jsonified, not passed as python objs + arguments=[text_document, range, engine, max_length] + ) + ) + + +def code_action_chat_gpt(engine, max_length, params: CodeActionParams): + '''Trigger a ChatGPT response. A code action calls a command, which is set + up below to `tell` the actor to start streaming a response. ''' + text_document = params.text_document + range = params.range + return CodeAction( + title='StateLoop ChatGPT', + kind=CodeActionKind.Refactor, + command=Command( + title='StateLoop ChatGPT', + command='command.openaiAutocompleteStream', + # Note: these arguments get jsonified, not passed as python objs + arguments=[text_document, range, engine, max_length] + ) + ) + + +################################################## +# Setup + +def configure(config_yaml): + parser = argparse.ArgumentParser() + parser.add_argument('--openai_completion_engine', default=get_nested(config_yaml, ['openai', 'completion_engine'])) + parser.add_argument('--openai_chat_engine', default=get_nested(config_yaml, ['openai', 'chat_engine'])) + parser.add_argument('--openai_max_length', default=get_nested(config_yaml, ['openai', 'max_length'])) + parser.add_argument('--openai_api_key', default=get_nested(config_yaml, ['openai', 'api_key'])) + + # bc this is only concerned with openai params, do not error if extra params + # are sent via cli. + args, _ = parser.parse_known_args() + return args + + + + +def initialize(config, server): + # Config + openai_chat_engine = config.openai_chat_engine + openai_completion_engine = config.openai_completion_engine + openai_max_length = config.openai_max_length + openai.api_key = config.openai_api_key # make library aware of api key + + # Actor + server.add_actor(NAME, StateLoopActor) + + # CodeActions + server.add_code_action( + lambda params: + code_action_gpt(openai_completion_engine, openai_max_length, params)) + server.add_code_action( + lambda params: + code_action_chat_gpt(openai_chat_engine, openai_max_length, params)) + + # Modify Server + @server.thread() + @server.command('command.openaiAutocompleteStream') + def openai_autocomplete_stream(ls: Server, args): + if len(args) != 4: + log.error(f'command.openaiAutocompleteStream: Wrong arguments, received: {args}') + text_document = ls.converter.structure(args[0], TextDocumentIdentifier) + range = ls.converter.structure(args[1], Range) + uri = text_document.uri + doc = ls.workspace.get_document(uri) + doc_source = doc.source + + # Determine engine, by checking for sentinel values to allow LSP client + # to defer arguments to server's configuration. + if args[2] == FROM_CONFIG_CHAT: + engine = openai_chat_engine + elif args[2] == FROM_CONFIG_COMPLETION: + engine = openai_completion_engine + else: + engine = args[2] + + # Max Length + if args[3] == FROM_CONFIG: + max_length = openai_max_length + else: + max_length = args[3] + + # Extract the highlighted region + prompt = extract_range(doc_source, range) + + # Send a message to start the stream + actor_args = { + 'command': 'start', + 'uri': uri, + 'range': range, + 'prompt': prompt, + 'engine': engine, + 'max_length': max_length, + 'edits': ls.edits, + 'doc': doc_source, + } + ls.tell_actor(NAME, actor_args) + + # Return null-edit immediately (the rest will stream) + return WorkspaceEdit() From 25fe88e7d01d174d45adc046a0478923acd2dc1c Mon Sep 17 00:00:00 2001 From: mobius Date: Sat, 29 Jul 2023 18:37:16 -0700 Subject: [PATCH 10/11] note --- uniteai/contrib/state_loop.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/uniteai/contrib/state_loop.py b/uniteai/contrib/state_loop.py index 8438393..7fa9ca6 100644 --- a/uniteai/contrib/state_loop.py +++ b/uniteai/contrib/state_loop.py @@ -1,5 +1,7 @@ ''' +TODO: As of now, this is little more than a copy-pasted example bot. + A Language-model-based State Machine State + Action -> State' + Response From 7bba0020528e904e3631191b5636b4b436e6e5c1 Mon Sep 17 00:00:00 2001 From: mobius Date: Sat, 29 Jul 2023 18:38:41 -0700 Subject: [PATCH 11/11] checkin --- sandbox/t11_document_chat.py | 48 +++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/sandbox/t11_document_chat.py b/sandbox/t11_document_chat.py index aef1e6e..b8a1244 100644 --- a/sandbox/t11_document_chat.py +++ b/sandbox/t11_document_chat.py @@ -405,11 +405,13 @@ def create_colorful_html(queries_and_colors_and_similarity: List[Tuple[str, str, load_embeddings(m) cache_is_loaded = True -# # doc_name = 'bitcoin' -# # query = 'whats in it for participants to the blockchain?' -# # query = 'how does this protect my anonymity?' -# # query = 'im concerned my hdd isnt big enough' -# # query = 'who contributed to this paper?' +doc_name = 'bitcoin' +# query = 'whats in it for participants to the blockchain?' +# query = 'how does this protect my anonymity?' +# query = 'im concerned my hdd isnt big enough' +# query = 'who contributed to this paper?' +# query = 'what is the transaction size limit?' +query = 'what game theory problem is it trying to solve?' # doc_name = 'idaho' # # query = 'how close can my silver mine be to a farm?' @@ -418,24 +420,24 @@ def create_colorful_html(queries_and_colors_and_similarity: List[Tuple[str, str, # # query = 'What signs can I put on my property?' # query = 'How does this document define "sign"?' -# ranked_segments, sims = top_segments(query, doc_name, top_n=3) +ranked_segments, sims = top_segments(query, doc_name, top_n=3, visualize=True) -# for x in reversed(ranked_segments): -# print('\n\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n') -# print(x) +for x in reversed(ranked_segments): + print('\n\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n\n') + print(x) -################################################## -# Colorize document - -doc_name = '2001_positive' -queries = [ - ('The reviewer loves the book.', '00cc00'), - ('The reviewer hates the book.', 'cc0000'), -] -query_tups = [] -for (q, c) in queries: - _, sims = top_segments(q, doc_name, top_n=0) - query_tups.append((q, c, sims)) - -create_colorful_html(query_tups, cache[doc_name].text) +# ################################################## +# # Colorize document + +# doc_name = '2001_positive' +# queries = [ +# ('The reviewer loves the book.', '00cc00'), +# ('The reviewer hates the book.', 'cc0000'), +# ] +# query_tups = [] +# for (q, c) in queries: +# _, sims = top_segments(q, doc_name, top_n=0) +# query_tups.append((q, c, sims)) + +# create_colorful_html(query_tups, cache[doc_name].text)