Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 33 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,45 @@
#FROM python:3.7.2
FROM python:3.8.2
# Use the Python 3.12 base image
FROM python:3.12

# Install system dependencies (for spaCy and its dependencies)
RUN apt-get update && apt-get install -y \
build-essential \
python3-dev \
libatlas-base-dev \
gfortran \
curl \
git \
wget

RUN pip install --upgrade pip
# Upgrade pip, setuptools, and wheel (to ensure we're using the latest version)
RUN pip install --upgrade pip setuptools wheel

# Install spaCy (pin a version that is compatible with Python 3.12)
RUN pip3 install spacy==3.7.5 # Change this to a specific compatible version, e.g. 2.2.4

# Download the necessary spaCy language model (en_core_web_sm)
RUN python -m spacy download en_core_web_sm

# Install required Python dependencies
RUN pip3 install tqdm
RUN pip3 install Cython
RUN pip3 install xaif_eval==0.0.9
RUN pip install xaif_eval==0.0.9
RUN pip3 install markdown2
RUN pip3 install flask-cors



# Copy the application files into the container
COPY . /app

# Set the working directory to /app
WORKDIR /app

# Install additional Python dependencies from requirements.txt
RUN pip install -r requirements.txt
EXPOSE 5005
CMD python ./main.py

# Expose port 5008 for the Flask app
EXPOSE 5008

# Set the default command to run the application
CMD ["python", "./main.py"]
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ def segmenter_defult():


if __name__ == "__main__":
app.run(host="0.0.0.0", port=int("5005"), debug=False)
app.run(host="0.0.0.0", port=int("5008"), debug=False)
12 changes: 8 additions & 4 deletions src/segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import re
from flask import json
import logging
import spacy

# Load the pre-trained spaCy model for English
nlp = spacy.load("en_core_web_sm")
from src.data import Data
from xaif_eval import xaif
from src.templates import SegmenterOutput
Expand All @@ -19,10 +23,10 @@ def __init__(self,file_obj):
self.file_obj.save(self.f_name)
file = open(self.f_name,'r')

def get_segments(self, input_text):  # regex
    """Split ``input_text`` into segments at sentence-ending punctuation.

    A boundary is optional whitespace, one of '.', '!' or '?', and at least
    one whitespace character. The match may not begin right after a digit,
    and it may not be followed by a single word character plus a period.
    The matched punctuation and whitespace are not kept in the result.
    """
    boundary = re.compile(r'(?<!\d)\s*[.!?]\s+(?!\w\.)')
    return boundary.split(input_text)
def get_segments(self, input_text):
    """Return the sentences of ``input_text`` as a list of stripped strings.

    Sentence boundaries are taken from the module-level spaCy pipeline
    ``nlp``; each sentence's text has surrounding whitespace removed.
    """
    sentences = []
    for span in nlp(input_text).sents:
        # Trim leading/trailing whitespace from the sentence text.
        sentences.append(span.text.strip())
    return sentences

def is_valid_json(self):
''' check if the file is valid json
Expand Down