diff --git a/Dockerfile b/Dockerfile index 89fbc64..8edca5d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,45 +1,40 @@ -# Use Python 3.8 base image -FROM python:3.12 +# Build stage: installs the pinned Python dependencies into /opt/venv so the compiler toolchain never reaches the runtime image. +FROM python:3.12-slim@sha256:9e01bf1ae5db7649a236da7be1e94ffbbbdd7a93f867dd0d8d5720d9e1f89fab AS builder -# Install system dependencies (for spaCy and its dependencies) -RUN apt-get update && apt-get install -y \ +RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ python3-dev \ - libatlas-base-dev \ + libopenblas-dev \ gfortran \ curl \ git \ - wget + && rm -rf /var/lib/apt/lists/* -# Upgrade pip, setuptools, and wheel (to ensure we're using the latest version) -RUN pip install --upgrade pip setuptools wheel +WORKDIR /build -# Install spaCy (make sure the compatible version with Python 3.8 is installed) -RUN pip3 install spacy==3.7.5 # Change this to a specific compatible version, e.g. 2.2.4 +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" -# Download the necessary spaCy language model (en_core_web_sm) -RUN python -m spacy download en_core_web_sm +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt -# Install required Python dependencies -RUN pip3 install tqdm -RUN pip3 install Cython -RUN pip install xaif_eval==0.0.9 -RUN pip3 install markdown2 -RUN pip3 install flask-cors +FROM python:3.12-slim@sha256:9e01bf1ae5db7649a236da7be1e94ffbbbdd7a93f867dd0d8d5720d9e1f89fab AS runtime +WORKDIR /app +# Runtime stage needs only the OpenBLAS shared library; the -dev headers are a build-time concern. +RUN apt-get update && apt-get install -y --no-install-recommends \ + libopenblas0 \ + && rm -rf /var/lib/apt/lists/* -# Copy the application files into the container -COPY . /app +COPY --from=builder /opt/venv /opt/venv -# Set the working directory to /app -WORKDIR /app +ENV PATH="/opt/venv/bin:$PATH" +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 -# Install additional Python dependencies from requirements.txt -RUN pip install -r requirements.txt +COPY . 
/app -# Expose port 5008 for the Flask app EXPOSE 5005 -# Set the default command to run the application -CMD ["python", "./main.py"] +CMD ["gunicorn", "--bind", "0.0.0.0:5005", "--workers", "4", "main:app"] \ No newline at end of file diff --git a/README copy.md b/README copy.md deleted file mode 100644 index 8f10866..0000000 --- a/README copy.md +++ /dev/null @@ -1,33 +0,0 @@ -# Default Segmenter - -It provides a default implementation of Segmenter, an AMF component that segments arguments into propositions. It utilises simple regular expressions for text segmentation. - -## Endpoints - -### /segmenter-01 - -- **Methods**: GET, POST -- **Description**: - - **GET**: Returns information about the Segmenter component and its usage. - - **POST**: Expects a file upload containing text data to segment. Parses the input and returns the segmented output in xIAF format. - -## Usage - -- Use the `/segmenter-01` endpoint to interact with the Segmenter: - - For GET requests, visit the endpoint URL to get information about Segmenter usage. - - For POST requests, upload a file containing text data to segment and receive the segmented output. - -## Input Format - -The Segmenter accepts input in xIAF formats: - -- **xIAF**: It segements the texts in the L-nodes. - -## Output Format - -The processed output is returned in xIAF format, containing segmented nodes, edges, locutions, and keeps the rest as they are. - -## Notes - -- This app serves as a basic segmenter using regular expressions for text segmentation. -- It can be connected to other components in an argument mining pipeline for further analysis. diff --git a/README.md b/README.md index e2ca1b7..8f10866 100644 --- a/README.md +++ b/README.md @@ -1,150 +1,33 @@ +# Default Segmenter - -# Default BERT Textual Entailment Service Documentation - -## Introduction -This application provides an implementation of BART fine-tuned on NLI dataset for indetifying argument relations. 
It serves as a default AMF component designed for detecting argument relations between propositions. Entailemtnt relation is mapped to support relation, conflicts, and non-relations between propositions. -- It can be integrated into the argument mining pipeline alongside other AMF components for further analysis and processing. - - -## Brief Overview of the Architecture/Method -Brief overview of the architecture/method used. - -- **Dataset**: [Link to datasets](#) -- **Model ID**: [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) -- **Repository**: [GitHub repository](https://github.com/arg-tech/bert-te) -- **Paper**: [Link to published paper](https://arxiv.org/abs/1909.00161) +It provides a default implementation of Segmenter, an AMF component that segments arguments into propositions. It utilises simple regular expressions for text segmentation. ## Endpoints -### /bert-te - -#### Description -- **Methods**: `GET`, `POST` - - **GET**: Returns information about the BERT Textual Entailment Service and its usage. - - **POST**: Expects a file upload (`file` parameter) in the xAIF format. The Flask route processes the uploaded file identify argument relation between I-nodes and update the xAIF node to represent the argument relations and returns the updated xAIF as a json file. - -#### Details -- **URL**: `/bert-te` -- **Methods**: `GET`, `POST` -- **Input**: - - **GET**: No parameters. - - **POST**: Expects a file upload (`file` parameter) in the xAIF format. -- **Output**: - - **Response**: The inferred argument structure in xAIF json file format, containing nodes, edges, locutions, and other relevant information. - - **Example Response**: Example JSON response. - -## Input and Output Formats - -### Input Format -- **Text File**: xAIF format input ([xAIF format details](https://wiki.arg.tech/books/amf/page/xaif)). 
- -### Output Format -The inferred argument structure is returned in the xAIF format, containing nodes, edges, locutions, and other relevant information. In the xAIF: -- Argument units are specified as type "I" nodes. -- Argument relations are represented as "RA" type for support and "CA" type for attack relations. -- The relations between the "I" nodes and argument relation nodes are presented as edges. - -## Installation - -### Requirements for Installation -torch -numpy -transformers -xaif_eval==0.0.9 -amf-fast-inference==0.0.3 -markdown2 - - -### Installation Setup - -#### Using Docker Container - -To set up the BERT Textual Entailment Service using Docker, follow these steps: - -1. **Clone the Repository:** - ```sh - git clone https://github.com/arg-tech/bert-te.git - ``` - -2. **Navigate to the Project Root Directory:** - ```sh - cd bert-te - ``` - -3. **Make Required Changes:** - - Edit the `Dockerfile`, `main.py`, and `docker-compose.yml` files to specify the container name, port number, and other settings as needed. - -4. **Build and Run the Docker Container:** - ```sh - docker-compose up - ``` - -#### From Source Without Docker - -If you prefer to install without Docker: - -1. **Install Dependencies:** - - Ensure Python and necessary libraries are installed. - -2. **Configure and Run:** - - Configure the environment variables and settings in `main.py`. - - Run the application using Python: - ```sh - python main.py - ``` - +### /segmenter-01 +- **Methods**: GET, POST +- **Description**: + - **GET**: Returns information about the Segmenter component and its usage. + - **POST**: Expects a file upload containing text data to segment. Parses the input and returns the segmented output in xAIF format. 
## Usage -### Using Programming Interface - -#### Example Python Code Snippet - -```python -import requests -import json - -url = 'http://your-server-url/bert-te' -input_file_path = 'example_xAIF.json' - -with open(input_file_path, 'r', encoding='utf-8') as file: - files = {'file': (input_file_path, file, 'application/json')} - -response = requests.post(url, files=files) - -if response.status_code == 200: - output_file_path = 'output_xAIF.json' - with open(output_file_path, 'w', encoding='utf-8') as output_file: - json.dump(response.json(), output_file, ensure_ascii=False, indent=4) - print(f'Response saved to {output_file_path}') -else: - print(f'Failed to make a POST request. Status code: {response.status_code}') - print(response.text) - -``` - -### Using cURL - -- **Example Request**: - -```bash -curl -X POST \ - -F "file=@example_xAIF.json" \ - http://your-server-url/bert-te -``` - - +- Use the `/segmenter-01` endpoint to interact with the Segmenter: + - For GET requests, visit the endpoint URL to get information about Segmenter usage. + - For POST requests, upload a file containing text data to segment and receive the segmented output. -### Using Web Interface +## Input Format -The service can also be used to create a pipeline on our n8n interface. Simply create an HTTP node, configure the node including the URL of the service and the parameter name of the file (`file`). +The Segmenter accepts input in xAIF format: +- **xAIF**: It segments the text in the L-nodes. +## Output Format +The processed output is returned in xAIF format, containing the segmented nodes, edges, and locutions; everything else is kept as it is. -
- Image Description -
+## Notes +- This app serves as a basic segmenter using regular expressions for text segmentation. +- It can be connected to other components in an argument mining pipeline for further analysis. diff --git a/docker-compose.yml b/docker-compose.yml index 6187304..cda6f7d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,3 @@ -version: '3' -### - services: default_segmenter: container_name: default-segmenter diff --git a/requirements.txt b/requirements.txt index 5c2c669..fe29efe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,55 @@ -flask -flask_uploads -prometheus_flask_exporter - - +annotated-doc==0.0.4 +annotated-types==0.7.0 +blinker==1.9.0 +blis==0.7.11 +catalogue==2.0.10 +certifi==2026.1.4 +charset-normalizer==3.4.4 +click==8.3.1 +cloudpathlib==0.23.0 +confection==0.1.5 +cymem==2.0.13 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 +Flask==3.1.3 +gunicorn==25.1.0 +flask-cors==6.0.2 +Flask-Uploads==0.2.1 +idna==3.11 +itsdangerous==2.2.0 +Jinja2==3.1.6 +langcodes==3.5.1 +markdown-it-py==4.0.0 +markdown2==2.5.4 +MarkupSafe==3.0.3 +mdurl==0.1.2 +murmurhash==1.0.15 +numpy==1.26.4 +packaging==26.0 +preshed==3.0.12 +prometheus_client==0.24.1 +prometheus_flask_exporter==0.23.2 +pydantic==2.12.5 +pydantic_core==2.41.5 +Pygments==2.19.2 +requests==2.32.5 +rich==14.3.3 +setuptools==82.0.0 +shellingham==1.5.4 +smart_open==7.5.0 +spacy==3.7.5 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.5.2 +thinc==8.2.5 +tqdm==4.67.3 +typer==0.24.0 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.6.3 +wasabi==1.1.3 +weasel==0.4.3 +Werkzeug==3.1.6 +wheel==0.46.3 +wrapt==2.1.1 +xaif_eval==0.0.9 diff --git a/tests/api-requests/.gitignore b/tests/api-requests/.gitignore new file mode 100644 index 0000000..e19311f --- /dev/null +++ 
b/tests/api-requests/.gitignore @@ -0,0 +1,9 @@ +# Secrets +.env* + +# Dependencies +node_modules + +# OS files +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/tests/api-requests/Basic get info.yml b/tests/api-requests/Basic get info.yml new file mode 100644 index 0000000..bf23e39 --- /dev/null +++ b/tests/api-requests/Basic get info.yml @@ -0,0 +1,31 @@ +info: + name: Basic get info + type: http + seq: 1 + +http: + method: GET + url: "{{baseUrl}}/segmenter-01" + auth: inherit + +runtime: + scripts: + - type: tests + code: |- + const expectedPartialData = "Default Segmenter"; + const maxTimeToReturnInMilliseconds = 300; + + test("Response should match partial expected value", function () { + const receivedData = res.getBody(); + expect(receivedData).to.contain(expectedPartialData) + }); + + test(`Response time should be faster than ${maxTimeToReturnInMilliseconds} milliseconds`, function () { + expect(res.responseTime).to.be.below(maxTimeToReturnInMilliseconds); + }); + +settings: + encodeUrl: true + timeout: 0 + followRedirects: true + maxRedirects: 5 diff --git a/tests/api-requests/Basic post.yml b/tests/api-requests/Basic post.yml new file mode 100644 index 0000000..d642786 --- /dev/null +++ b/tests/api-requests/Basic post.yml @@ -0,0 +1,65 @@ +info: + name: Basic post + type: http + seq: 2 + +http: + method: POST + url: "{{baseUrl}}/segmenter-01" + body: + type: multipart-form + data: + - name: file + type: file + value: + - input files/basic_input.json + auth: inherit + +runtime: + scripts: + - type: tests + code: |- + const expectedData = { + "AIF": { + "edges": [], + "locutions": [ + { + "nodeID": "2", + "personID": null + }, + { + "nodeID": "3", + "personID": null + } + ], + "nodes": [ + { + "nodeID": "2", + "text": "The SNP is a big party.", + "type": "L" + }, + { + "nodeID": "3", + "text": "There are many disagreements within it.", + "type": "L" + } + ], + "participants": [] + } + }; + const maxTimeToReturnInMilliseconds = 300; + + 
test("Response should match expected value", function () { + const receivedData = res.getBody(); + expect(receivedData).to.deep.equal(expectedData) + }); + + test(`Response time should be faster than ${maxTimeToReturnInMilliseconds} milliseconds`, function () { + expect(res.responseTime).to.be.below(maxTimeToReturnInMilliseconds); + }); + +settings: + encodeUrl: true + timeout: 0 + followRedirects: true + maxRedirects: 5 diff --git a/tests/api-requests/environments/(1) local.yml b/tests/api-requests/environments/(1) local.yml new file mode 100644 index 0000000..fadbf72 --- /dev/null +++ b/tests/api-requests/environments/(1) local.yml @@ -0,0 +1,4 @@ +name: (1) local +variables: + - name: baseUrl + value: http://localhost:5005 diff --git a/tests/api-requests/environments/(2) staging.yml b/tests/api-requests/environments/(2) staging.yml new file mode 100644 index 0000000..452c2c2 --- /dev/null +++ b/tests/api-requests/environments/(2) staging.yml @@ -0,0 +1,4 @@ +name: (2) staging +variables: + - name: baseUrl + value: http://default-segmenter.amfws.staging.arg.tech diff --git a/tests/api-requests/environments/(3) production.yml b/tests/api-requests/environments/(3) production.yml new file mode 100644 index 0000000..e8eb31a --- /dev/null +++ b/tests/api-requests/environments/(3) production.yml @@ -0,0 +1,4 @@ +name: (3) production +variables: + - name: baseUrl + value: http://default-segmenter.amfws.arg.tech diff --git a/tests/api-requests/input files/basic_input.json b/tests/api-requests/input files/basic_input.json new file mode 100644 index 0000000..c7df594 --- /dev/null +++ b/tests/api-requests/input files/basic_input.json @@ -0,0 +1,15 @@ +{ + "AIF": { + "nodes": [ + { + "nodeID": "1", + "text": "The SNP is a big party. 
There are many disagreements within it.", + "type": "L" + } + ], + "edges": [], + "locutions": [], + "participants": [] + } +} + diff --git a/tests/api-requests/input files/folder.yml b/tests/api-requests/input files/folder.yml new file mode 100644 index 0000000..d5bd23a --- /dev/null +++ b/tests/api-requests/input files/folder.yml @@ -0,0 +1,7 @@ +info: + name: input files + type: folder + seq: 3 + +request: + auth: inherit diff --git a/tests/api-requests/opencollection.yml b/tests/api-requests/opencollection.yml new file mode 100644 index 0000000..189e735 --- /dev/null +++ b/tests/api-requests/opencollection.yml @@ -0,0 +1,10 @@ +opencollection: 1.0.0 + +info: + name: Default segmenter +bundled: false +extensions: + bruno: + ignore: + - node_modules + - .git