Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified backend/requirements.txt
Binary file not shown.
41 changes: 6 additions & 35 deletions backend/src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,12 @@
from docarray import DocumentArray
from docarray.document.generators import from_csv

from backend_config import papers_data_path, papers_data_url
from config import config, create_parser
from flows import index_flow, search_flow
from helpers import download_csv, log, maximise_csv_field_size_limit


# boolean args:
# https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse/36031646
def str2bool(v):
    """Convert a command-line style boolean token into a ``bool``.

    Intended for use as an argparse ``type=`` callable.

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string such
            as ``"yes"``/``"no"``, ``"true"``/``"false"``, ``"t"``/``"f"``,
            ``"y"``/``"n"``, ``"1"``/``"0"``. Matching is case-insensitive
            and ignores surrounding whitespace.

    Returns:
        The boolean the token stands for.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is not a recognised token.
    """
    if isinstance(v, bool):
        return v
    if not isinstance(v, str):
        # Without this guard a non-string value would hit ``.lower()`` and
        # surface as AttributeError instead of the argparse error callers expect.
        raise argparse.ArgumentTypeError("Boolean value expected.")

    token = v.strip().lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


def get_args(argv=None):
    """Parse the backend's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the original zero-argument call did.

    Returns:
        The parsed ``argparse.Namespace`` with ``index`` (bool) and
        ``n`` (int) attributes.
    """
    # Command line arguments definitions
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--index",
        dest="index",
        default=False,
        action="store_true",
        help="index the available documents",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="when `--index` is used, specifies the number of documents to index (0 indexes the full dataset)",
    )

    return parser.parse_args(argv)
papers_data_path = config["papers_data_path"].get()
papers_data_url = config["papers_data_url"].get()


def index(n):
Expand Down Expand Up @@ -70,12 +40,13 @@ def index(n):
indexer.index(papers, request_size=32)


args = get_args()
args = create_parser()
config.set_args(args)

if args.index:
index(args.n)

# running the search/finetuning flow as a service
# running the search/finetuning flow as a service
flow = search_flow()
flow.expose_endpoint("/finetune", summary="Finetune documents.", tags=["Finetuning"])

Expand Down
30 changes: 30 additions & 0 deletions backend/src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from argparse import ArgumentParser
from pathlib import Path

from confuse import Configuration

config = Configuration("backend")

file_abs_path = Path(__file__).parent.resolve()
config.set_file(file_abs_path / "config.yaml")


def create_parser(argv=None):
    """Build the backend argument parser and return the parsed arguments.

    Despite its name, this returns the parsed namespace rather than the
    parser object itself.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, matching the
            original zero-argument behaviour.

    Returns:
        The parsed ``argparse.Namespace`` with ``index`` (bool) and
        ``n`` (int) attributes.
    """
    parser = ArgumentParser()

    parser.add_argument(
        "--index",
        dest="index",
        default=False,
        action="store_true",
        help="index the available documents",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="when `--index` is used, specifies the number of documents to index (0 indexes the full dataset)",
    )

    return parser.parse_args(argv)
13 changes: 7 additions & 6 deletions backend/src/backend_config.py → backend/src/config.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@

# Hugging Face: https://huggingface.co/sentence-transformers/allenai-specter
embedding_model = "sentence-transformers/allenai-specter"
embedding_model : "sentence-transformers/allenai-specter"

# dataset link
papers_data_url = "http://www.lri.fr/owncloud/index.php/s/OO987IvsoKwWI3l/download"
papers_data_url : "http://www.lri.fr/owncloud/index.php/s/OO987IvsoKwWI3l/download"

# Number of search results to show.
top_k = 5
top_k : 5

# Papers CSV file path relative to root.
papers_data_path = "data/papers.csv"
papers_data_path : "data/papers.csv"

# Prints logs to command line if true.
print_logs = True
print_logs : True

# Search service port
search_port = 8020
search_port : 8020
10 changes: 5 additions & 5 deletions backend/src/executors.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import re
import os

from typing import Sequence, List, Tuple
from sentence_transformers import SentenceTransformer, InputExample, losses
from jina import DocumentArray, Executor, requests
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader
from jina import Executor, requests, Document, DocumentArray

from backend_config import top_k, embedding_model
from config import config
from helpers import log

embedding_model = config["embedding_model"].get()


def get_model_dir():
model_dir = f"./models/{embedding_model}"
Expand Down
6 changes: 3 additions & 3 deletions backend/src/flows.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from jina import Flow

from config import config
from executors import SpecterExecutor
from backend_config import top_k, search_port

# Using a standard indexer: https://hub.jina.ai/executor/zb38xlt4
indexer = "jinahub://SimpleIndexer"
Expand All @@ -21,14 +21,14 @@ def index_flow():

def search_flow():
flow = (
Flow(port_expose=search_port, protocol="http")
Flow(port_expose=config["search_port"].get(), protocol="http")
.add(uses=SpecterExecutor)
.add(
uses=indexer,
uses_with={
"match_args": {
"metric": "cosine",
"limit": top_k,
"limit": config["top_k"].get(),
},
},
)
Expand Down
19 changes: 16 additions & 3 deletions backend/src/helpers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
from argparse import ArgumentTypeError
from csv import field_size_limit
from sys import maxsize

import requests

from backend_config import print_logs
from config import config


def download_csv(url, fp):
Expand All @@ -15,15 +16,27 @@ def download_csv(url, fp):


def log(message):
    """Print ``message`` when the ``print_logs`` configuration entry is truthy."""
    # The flag is read from the shared ``config`` object on every call.
    if config["print_logs"].get():
        print(message)


def maximise_csv_field_size_limit(maxInt=maxsize):
    """Raise the ``csv`` module's field size limit as high as it will accept.

    Starts from ``maxInt`` and, each time ``field_size_limit`` rejects the
    value with ``OverflowError``, retries with a tenth of it until a value
    is accepted.

    Args:
        maxInt: First limit to try; defaults to ``sys.maxsize``.
    """
    while True:
        try:
            field_size_limit(maxInt)
            break
        except OverflowError:
            # Floor division stays in integer arithmetic. ``int(maxInt / 10)``
            # goes through a float, which loses precision and itself raises
            # OverflowError for very large ints.
            maxInt //= 10


# boolean args:
# https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse/36031646
def str2bool(v):
    """Convert a command-line style boolean token into a ``bool``.

    Intended for use as an argparse ``type=`` callable.

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string such
            as ``"yes"``/``"no"``, ``"true"``/``"false"``, ``"t"``/``"f"``,
            ``"y"``/``"n"``, ``"1"``/``"0"``. Matching is case-insensitive
            and ignores surrounding whitespace.

    Returns:
        The boolean the token stands for.

    Raises:
        ArgumentTypeError: If ``v`` is not a recognised token.
    """
    if isinstance(v, bool):
        return v
    if not isinstance(v, str):
        # Without this guard a non-string value would hit ``.lower()`` and
        # surface as AttributeError instead of the argparse error callers expect.
        raise ArgumentTypeError("Boolean value expected.")

    token = v.strip().lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise ArgumentTypeError("Boolean value expected.")