Informfully · runzeliuzh · Sep 21, 2025 · Jul 30, 2025 · Aug 12, 2025 · Aug 12, 2025
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -10,7 +10,6 @@ jobs:
       - image: cimg/python:3.10.2
         environment:
           LIMIT_NUMPY_VERSION: 2.0.0
-          LIMIT_SCIPY_VERSION: 1.13.1
     steps:
       - checkout
       - python/install-packages:
@@ -20,7 +19,7 @@ jobs:
           no_output_timeout: 30m
           command: |
             pip install --upgrade pip
-            pip install --only-binary=numpy,scipy "numpy<$LIMIT_NUMPY_VERSION" "scipy<=$LIMIT_SCIPY_VERSION" Cython pytest pytest-cov codecov
+            pip install --only-binary=numpy,scipy "numpy>$LIMIT_NUMPY_VERSION" Cython pytest pytest-cov codecov
             pip install -e .[tests]
       - run:
           name: Run tests

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -8,19 +8,18 @@ on:
     branches: [ master ]
   pull_request:
     branches: [ master ]
-
+  
 jobs:
   build:
     name: Building on ${{ matrix.os }}
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-latest, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-latest]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     env:
       LIMIT_NUMPY_VERSION: 2.0.0
-      LIMIT_SCIPY_VERSION: 1.13.1
     steps:
     - name: Get number of CPU cores
       uses: SimenB/github-actions-cpu-cores@v2
@@ -29,7 +28,7 @@ jobs:
     - uses: actions/checkout@v4
 
     - name: Setup Python ${{ matrix.python-version }}
-      if: ${{ ((matrix.os == 'macos-latest') && (matrix.python-version != '3.9')) }}
+      if: ${{ (matrix.os != 'macos-latest') || ((matrix.os == 'macos-latest') && (matrix.python-version != '3.9')) }}
       uses: actions/setup-python@v5
       id: pysetup
       with:
@@ -55,15 +54,20 @@ jobs:
         python${{ matrix.python-version }} -c "import sys; print(sys.version)"
         pip --version
 
+    - name: Display GLIBCXX versions
+      if: matrix.os == 'ubuntu-22.04'
+      run: | 
+        ls /lib/x86_64-linux-gnu/libstdc*
+        strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX        
+
     - name: Upgrade pip wheel setuptools
       run: python${{ matrix.python-version }} -m pip install wheel setuptools pip --upgrade
 
     - name: Install other dependencies
-      run: python${{ matrix.python-version }} -m pip install Cython pytest pytest-cov flake8
+      run: python${{ matrix.python-version }} -m pip install Cython pytest pytest-cov flake8 "numpy>${{ env.LIMIT_NUMPY_VERSION }}" scipy
 
-    - name: Install other dependencies
+    - name: Build extensions and install test dependencies
       run: |
-        python${{ matrix.python-version }} -m pip install Cython pytest pytest-cov flake8 "numpy<${{ env.LIMIT_NUMPY_VERSION }}" "scipy<=${{ env.LIMIT_SCIPY_VERSION }}"
         python${{ matrix.python-version }} setup.py build_ext -j${{ steps.cpu-cores.outputs.count }}
         python${{ matrix.python-version }} -m pip install -e .[tests]
 

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -14,7 +14,6 @@ on:
 
 env:
   LIMIT_NUMPY_VERSION: 2.0.0
-  LIMIT_SCIPY_VERSION: 1.13.1
 
 jobs:
   build-wheels:
@@ -23,7 +22,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-latest, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-latest]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@v4
@@ -54,12 +53,18 @@ jobs:
       run: | 
         python${{ matrix.python-version }} -c "import sys; print(sys.version)"
         pip --version
+
+    - name: Display GLIBCXX versions
+      if: matrix.os == 'ubuntu-22.04'
+      run: | 
+        ls /lib/x86_64-linux-gnu/libstdc*
+        strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX
 
     - name: Upgrade pip wheel setuptools
       run: python${{ matrix.python-version }} -m pip install wheel setuptools pip --upgrade
 
     - name: Install numpy, scipy
-      run: pip install "numpy<${{ env.LIMIT_NUMPY_VERSION }}" "scipy<=${{ env.LIMIT_SCIPY_VERSION }}"
+      run: pip install "numpy>${{ env.LIMIT_NUMPY_VERSION }}" scipy
 
     - name: Install other dependencies
       run: |
@@ -72,7 +77,7 @@ jobs:
       run: python${{ matrix.python-version }} setup.py bdist_wheel
 
     - name: Rename Linux wheels to supported platform of PyPI
-      if: matrix.os == 'ubuntu-latest'
+      if: matrix.os == 'ubuntu-22.04'
       run: for f in dist/*.whl; do mv "$f" "$(echo "$f" | sed s/linux/manylinux1/)"; done
 
     - name: Publish wheels to GitHub artifacts
@@ -83,7 +88,7 @@ jobs:
 
   publish-pypi:
     needs: [build-wheels]
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
     - uses: actions/checkout@v4
 
@@ -101,9 +106,14 @@ jobs:
     - name: Display Python version
       run: python -c "import sys; print(sys.version)"
 
-    - name: Install numpy
+    - name: Display GLIBCXX versions
+      run: | 
+        ls /lib/x86_64-linux-gnu/libstdc*
+        strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX
+
+    - name: Install numpy, scipy
       run: |
-        python -m pip install "numpy<${{ env.LIMIT_NUMPY_VERSION }}" "scipy<=${{ env.LIMIT_SCIPY_VERSION }}"
+        python -m pip install "numpy>${{ env.LIMIT_NUMPY_VERSION }}" scipy
         python -c "import numpy; print(numpy.__version__)"
 
     - name: Install other dependencies

diff --git a/assets/demo.png b/assets/demo.png
diff --git a/assets/feedback-dashboard.png b/assets/feedback-dashboard.png
diff --git a/assets/flow.jpg b/assets/flow.jpg
diff --git a/assets/recommendation-dashboard.png b/assets/recommendation-dashboard.png
diff --git a/cornac/__init__.py b/cornac/__init__.py
@@ -23,4 +23,4 @@
 # Also importable from root
 from .experiment import Experiment
 
-__version__ = '2.3.0'
+__version__ = "2.3.3"
diff --git a/cornac/augmentation/category.py b/cornac/augmentation/category.py
@@ -33,8 +33,7 @@ def load_model(model_name='facebook/bart-large-mnli', cache_dir= None):
 
     return model, tokenizer
 
-model, tokenizer = load_model()
-classifier = pipeline("zero-shot-classification", model=model,tokenizer=tokenizer)
+_classifier = None
 
 def get_category(row, **kwargs):
     """ Enhance the dataset with its category (e.g. news, sports, life)
@@ -49,17 +48,21 @@ def get_category(row, **kwargs):
     -------
     cat: string, corresponding category name for each news id row
     """
+    global _classifier
     candidate_labels = kwargs.get('candidate_labels')
     meta_data = kwargs.get('meta_data')
     threshold = kwargs.get('threshold', 0.5)
+    if candidate_labels and _classifier is None:
+        model, tokenizer = load_model()
+        _classifier = pipeline("zero-shot-classification", model=model,tokenizer=tokenizer)
 
     if candidate_labels:
          # Ensure row is a string (text)
         if not isinstance(row, str):
             raise TypeError(f"Expected row to be str (text), but got {type(row).__name__}")
         try:
             # run classifier
-            res = classifier(row, candidate_labels, multi_label=True)
+            res = _classifier(row, candidate_labels, multi_label=True)
 
             categories = res['labels']
             scores = res['scores']
@@ -83,4 +86,4 @@ def get_category(row, **kwargs):
             return -1
 
     # If no candidate labels and no metadata, return -1 (indicating no category found)
-    return -1  # -1 is the default return value in case of missing candidate_labels and meta_data
+    return -1 
diff --git a/cornac/augmentation/enrich_ne.py b/cornac/augmentation/enrich_ne.py
@@ -214,14 +214,14 @@ def lookup_and_update(lookup_dict: EfficientDict, alternative: str, all_alternat
                       wikidata: WikidataQuery, language_tags: List[str] = None):
     # Check if alternative is in lookup_dict
     lookup_result = lookup_dict.get(alternative.lower())
+    # If an earlier query returned nothing, return None directly
+    if lookup_result == '':
+        return None
     # If already enriched, update all alternatives and return stored value
-    if lookup_result:
+    elif lookup_result:
         for dict_key in all_alternatives:
             lookup_dict.add(dict_key.lower(), lookup_result)
         return lookup_result
-    # If earlier query get nothing, directly return None
-    elif lookup_result == '':
-        return None
 
     # If not queried before, query Wikidata
     if language_tags:
@@ -242,9 +242,6 @@ def get_person_data(wikidata: WikidataQuery, entity: Dict, lookup_person: Effici
     """
     Get person data from Wikidata.
     """
-    # print(entity['text'])
-    # print(lookup_person.main_dict)
-    # print(lookup_person.hash_table)
     info = {
         'key': entity['text'],
         'frequency': entity['frequency'],
@@ -269,9 +266,6 @@ def get_org_data(wikidata: WikidataQuery, entity: Dict, lookup_org: EfficientDic
     """
     Get organization data from Wikidata.
     """
-    # print(entity['text'])
-    # print(lookup_org.main_dict)
-    # print(lookup_org.hash_table)
     info = {
         'frequency': entity['frequency'],
         'alternative': entity['alternative']

diff --git a/cornac/augmentation/min_maj.py b/cornac/augmentation/min_maj.py
@@ -25,8 +25,6 @@ def get_min_maj_ratio(ne_list, **kwargs):
     # Check if ne_list is a valid iterable
     if not isinstance(ne_list, list):
         raise TypeError(f"Invalid input: Expected a list for 'ne_list', but received {type(ne_list).__name__}.")
-        # print("Error: ne_list is not a list. Received:", type(ne_list))
-        # return {}  # Return an empty dictionary if ne_list is not valid
 
     # Iterate through each entity in the named entity list
     for entity in ne_list:
@@ -61,13 +59,11 @@ def get_min_maj_ratio(ne_list, **kwargs):
                         for major_place_of_birth in major_place_of_births:
                             if (major_place_of_birth in entity_dict.get('place_of_birth', [])) or not entity_dict.get('place_of_birth'):
                                 place_of_birth_match = True
-
                         if ethnicity_match and place_of_birth_match:
                             count['ethnicity'][1] += entity_dict.get('frequency', 1)
-                            break
-
-                        count['ethnicity'][0] += entity_dict.get('frequency', 1)
-                        break
+                        else:
+                            count['ethnicity'][0] += entity_dict.get('frequency', 1)
+                        break 
 
                 if not loop_break:
                     count['ethnicity'][0] += entity_dict.get('frequency', 1)

diff --git a/cornac/augmentation/ner.py b/cornac/augmentation/ner.py
@@ -122,8 +122,7 @@ def set_ner_lang(lang='en'):
         return ner
 
     except Exception as e:
-        # print(f"An error occurred while loading the SpaCy model: {e}")
-        # return None
+
         raise RuntimeError(f"An unexpected error occurred while loading the SpaCy model '{model_name}': {e}") from e
 
 
@@ -229,8 +228,6 @@ def get_ner(text, ner_model=set_ner_lang(), **kwargs):
                                      'frequency': len(with_name),
                                      'label': label}))
     except Exception as e:
-        # print(f"An error occurred while getting Named Entities: {e}")
-        # ne_list = None
         raise RuntimeError(f"An error occurred while getting Named Entities: {e}")
 
 

diff --git a/cornac/augmentation/party.py b/cornac/augmentation/party.py
@@ -56,8 +56,7 @@ def get_party(ne_list, lang, lookup_parties):
     try:
         if not isinstance(ne_list, list):
             raise ValueError(f"Error: when extraing party, expected ne_list to be a list, but got {type(ne_list)} instead.")
-            # print("Error: ne_list is not a list. Received:", type(ne_list))
-            # return {}, {}
+
 
         for entity in ne_list:
             if isinstance(entity, dict):
@@ -93,9 +92,4 @@ def get_party(ne_list, lang, lookup_parties):
     except Exception as e:
         raise RuntimeError(f"Error in get_party function: {e}")
 
-    # except Exception as e:
-    #     # Log any errors during party extraction but ensure the pipeline continues
-    #     print(f"Error in get_party function: {e}")
-    #     return {}, lookup_parties  # Return empty parties in case of failure, but don't stop the pipeline
-
     return parties, lookup_parties
diff --git a/cornac/augmentation/readability.py b/cornac/augmentation/readability.py
@@ -177,30 +177,37 @@ def get_readability(text, lang='en'):
         (https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease).
 
     """
-    # print(f"Computing readability for language:{lang}")
-    try:
-        textstat.set_lang(lang)
-    except KeyError:  # Handle invalid language codes
-        if lang in new_langs.keys():
-            lang = 'en'  # Default to English
-            textstat.set_lang(lang)  # Set language to English
-        else:
-            # print(f"Language code '{lang}' not supported.")
-            # return None
-            raise ValueError(f"Invalid language code '{lang}' provided. Supported language codes are: {', '.join(new_langs.keys())}")
-
+
     if not isinstance(text, str):
         raise TypeError(f"Invalid input: Expected a string for 'text', but received {type(text).__name__}.")
+
+    # Extract language root (e.g., "en" from "en_US")
+    lang_root = lang.split("_")[0]
+
+    # Check if language is supported by either textstat or our custom configs
+    all_supported_langs = set(textstat_langs + list(new_langs.keys()))
+
+    if lang_root not in all_supported_langs:
+        raise ValueError(f"Invalid language code '{lang}' provided. Supported language codes are: {', '.join(sorted(all_supported_langs))}")
+
+    # Language is valid at this point; configure textstat with it when supported
+    if lang_root in textstat_langs:
+        textstat.set_lang(lang_root)  # Set to root language for textstat
+    else:
+        # For custom languages, we'll use our own calculations, so set to English as fallback
+        textstat.set_lang('en')
+
     try:
         if not text:
             return None  # Empty text
         # Check if the text contains any meaningful characters
         if not contains_meaningful_characters(text):
             return None
-        lang_root = lang.split("_")[0]
+        # lang_root is already computed above, before the language validation
         if lang_root in textstat_langs:
             readability = textstat.flesch_reading_ease(text)
         else:
+            # Use our custom formula
             flesch = (
                     get_lang_cfg(lang_root, "fre_base")
                     - float(
@@ -214,8 +221,6 @@ def get_readability(text, lang='en'):
             )
             readability = round(flesch, 2)
     except Exception as e:
-        # print(f"An error occurred while getting readability score: {e}")
-        # readability = None
         raise RuntimeError(f"An error occurred while calculating the readability score: {e}")
 
     return readability