diff --git a/Dockerfile b/Dockerfile index 02af66d..2682f25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,45 @@ -#FROM python:3.7.2 -FROM python:3.8.2 +# Use Python 3.12 base image +FROM python:3.12 +# Install system dependencies (for spaCy and its dependencies) +RUN apt-get update && apt-get install -y \ + build-essential \ + python3-dev \ + libatlas-base-dev \ + gfortran \ + curl \ + git \ + wget -RUN pip install --upgrade pip +# Upgrade pip, setuptools, and wheel (to ensure we're using the latest version) +RUN pip install --upgrade pip setuptools wheel + +# Install spaCy (pinned to a version compatible with the Python 3.12 base image) +RUN pip3 install spacy==3.7.5 # Pinned version; change only to another release compatible with the base image's Python + +# Download the necessary spaCy language model (en_core_web_sm) +RUN python -m spacy download en_core_web_sm + +# Install required Python dependencies RUN pip3 install tqdm RUN pip3 install Cython -RUN pip3 install xaif_eval==0.0.9 +RUN pip install xaif_eval==0.0.9 RUN pip3 install markdown2 RUN pip3 install flask-cors +# Copy the application files into the container COPY . 
/app + +# Set the working directory to /app WORKDIR /app + +# Install additional Python dependencies from requirements.txt RUN pip install -r requirements.txt -EXPOSE 5005 -CMD python ./main.py \ No newline at end of file + +# Expose port 5008 for the Flask app +EXPOSE 5008 + +# Set the default command to run the application +CMD ["python", "./main.py"] diff --git a/main.py b/main.py index 5ef7dcc..56ce45b 100644 --- a/main.py +++ b/main.py @@ -47,4 +47,4 @@ def segmenter_defult(): if __name__ == "__main__": - app.run(host="0.0.0.0", port=int("5005"), debug=False) + app.run(host="0.0.0.0", port=int("5008"), debug=False) diff --git a/src/segmenter.py b/src/segmenter.py index c7655ca..531c25e 100644 --- a/src/segmenter.py +++ b/src/segmenter.py @@ -6,6 +6,10 @@ import re from flask import json import logging +import spacy + +# Load the pre-trained spaCy model for English +nlp = spacy.load("en_core_web_sm") from src.data import Data from xaif_eval import xaif from src.templates import SegmenterOutput @@ -19,10 +23,10 @@ def __init__(self,file_obj): self.file_obj.save(self.f_name) file = open(self.f_name,'r') - def get_segments(self, input_text): # regex - """Segmenter that avoids splitting within decimal numbers and abbreviations.""" - pattern = r'(?