Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ jobs:
- image: cimg/python:3.10.2
environment:
LIMIT_NUMPY_VERSION: 2.0.0
LIMIT_SCIPY_VERSION: 1.13.1
steps:
- checkout
- python/install-packages:
Expand All @@ -20,7 +19,7 @@ jobs:
no_output_timeout: 30m
command: |
pip install --upgrade pip
pip install --only-binary=numpy,scipy "numpy<$LIMIT_NUMPY_VERSION" "scipy<=$LIMIT_SCIPY_VERSION" Cython pytest pytest-cov codecov
pip install --only-binary=numpy,scipy "numpy>$LIMIT_NUMPY_VERSION" Cython pytest pytest-cov codecov
pip install -e .[tests]
- run:
name: Run tests
Expand Down
18 changes: 11 additions & 7 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,18 @@ on:
branches: [ master ]
pull_request:
branches: [ master ]

jobs:
build:
name: Building on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
os: [windows-latest, ubuntu-22.04, macos-latest]
python-version: ["3.9", "3.10", "3.11", "3.12"]
env:
LIMIT_NUMPY_VERSION: 2.0.0
LIMIT_SCIPY_VERSION: 1.13.1
steps:
- name: Get number of CPU cores
uses: SimenB/github-actions-cpu-cores@v2
Expand All @@ -29,7 +28,7 @@ jobs:
- uses: actions/checkout@v4

- name: Setup Python ${{ matrix.python-version }}
if: ${{ ((matrix.os == 'macos-latest') && (matrix.python-version != '3.9')) }}
if: ${{ (matrix.os != 'macos-latest') || ((matrix.os == 'macos-latest') && (matrix.python-version != '3.9')) }}
uses: actions/setup-python@v5
id: pysetup
with:
Expand All @@ -55,15 +54,20 @@ jobs:
python${{ matrix.python-version }} -c "import sys; print(sys.version)"
pip --version

- name: Display GLIBCXX versions
if: matrix.os == 'ubuntu-22.04'
run: |
ls /lib/x86_64-linux-gnu/libstdc*
strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX

- name: Upgrade pip wheel setuptools
run: python${{ matrix.python-version }} -m pip install wheel setuptools pip --upgrade

- name: Install other dependencies
run: python${{ matrix.python-version }} -m pip install Cython pytest pytest-cov flake8
run: python${{ matrix.python-version }} -m pip install Cython pytest pytest-cov flake8 "numpy>${{ env.LIMIT_NUMPY_VERSION }}" scipy

- name: Install other dependencies
- name: Build extensions and install test dependencies
run: |
python${{ matrix.python-version }} -m pip install Cython pytest pytest-cov flake8 "numpy<${{ env.LIMIT_NUMPY_VERSION }}" "scipy<=${{ env.LIMIT_SCIPY_VERSION }}"
python${{ matrix.python-version }} setup.py build_ext -j${{ steps.cpu-cores.outputs.count }}
python${{ matrix.python-version }} -m pip install -e .[tests]

Expand Down
24 changes: 17 additions & 7 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ on:

env:
LIMIT_NUMPY_VERSION: 2.0.0
LIMIT_SCIPY_VERSION: 1.13.1

jobs:
build-wheels:
Expand All @@ -23,7 +22,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
os: [windows-latest, ubuntu-22.04, macos-latest]
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -54,12 +53,18 @@ jobs:
run: |
python${{ matrix.python-version }} -c "import sys; print(sys.version)"
pip --version

- name: Display GLIBCXX versions
if: matrix.os == 'ubuntu-22.04'
run: |
ls /lib/x86_64-linux-gnu/libstdc*
strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX

- name: Upgrade pip wheel setuptools
run: python${{ matrix.python-version }} -m pip install wheel setuptools pip --upgrade

- name: Install numpy, scipy
run: pip install "numpy<${{ env.LIMIT_NUMPY_VERSION }}" "scipy<=${{ env.LIMIT_SCIPY_VERSION }}"
run: pip install "numpy>${{ env.LIMIT_NUMPY_VERSION }}" scipy

- name: Install other dependencies
run: |
Expand All @@ -72,7 +77,7 @@ jobs:
run: python${{ matrix.python-version }} setup.py bdist_wheel

- name: Rename Linux wheels to supported platform of PyPI
if: matrix.os == 'ubuntu-latest'
if: matrix.os == 'ubuntu-22.04'
run: for f in dist/*.whl; do mv "$f" "$(echo "$f" | sed s/linux/manylinux1/)"; done

- name: Publish wheels to GitHub artifacts
Expand All @@ -83,7 +88,7 @@ jobs:

publish-pypi:
needs: [build-wheels]
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4

Expand All @@ -101,9 +106,14 @@ jobs:
- name: Display Python version
run: python -c "import sys; print(sys.version)"

- name: Install numpy
- name: Display GLIBCXX versions
run: |
ls /lib/x86_64-linux-gnu/libstdc*
strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX

- name: Install numpy, scipy
run: |
python -m pip install "numpy<${{ env.LIMIT_NUMPY_VERSION }}" "scipy<=${{ env.LIMIT_SCIPY_VERSION }}"
python -m pip install "numpy>${{ env.LIMIT_NUMPY_VERSION }}" scipy
python -c "import numpy; print(numpy.__version__)"

- name: Install other dependencies
Expand Down
Binary file added assets/demo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/feedback-dashboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/flow.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/recommendation-dashboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion cornac/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
# Also importable from root
from .experiment import Experiment

__version__ = '2.3.0'
__version__ = "2.3.3"
11 changes: 7 additions & 4 deletions cornac/augmentation/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def load_model(model_name='facebook/bart-large-mnli', cache_dir= None):

return model, tokenizer

model, tokenizer = load_model()
classifier = pipeline("zero-shot-classification", model=model,tokenizer=tokenizer)
_classifier = None

def get_category(row, **kwargs):
""" Enhance the dataset with its category (e.g. news, sports, life)
Expand All @@ -49,17 +48,21 @@ def get_category(row, **kwargs):
-------
cat: string, corresponding category name for each news id row
"""
global _classifier
candidate_labels = kwargs.get('candidate_labels')
meta_data = kwargs.get('meta_data')
threshold = kwargs.get('threshold', 0.5)
if candidate_labels and _classifier is None:
model, tokenizer = load_model()
_classifier = pipeline("zero-shot-classification", model=model,tokenizer=tokenizer)

if candidate_labels:
# Ensure row is a string (text)
if not isinstance(row, str):
raise TypeError(f"Expected row to be str (text), but got {type(row).__name__}")
try:
# run classifier
res = classifier(row, candidate_labels, multi_label=True)
res = _classifier(row, candidate_labels, multi_label=True)

categories = res['labels']
scores = res['scores']
Expand All @@ -83,4 +86,4 @@ def get_category(row, **kwargs):
return -1

# If no candidate labels and no metadata, return -1 (indicating no category found)
return -1 # -1 is the default return value in case of missing candidate_labels and meta_data
return -1
14 changes: 4 additions & 10 deletions cornac/augmentation/enrich_ne.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,14 @@ def lookup_and_update(lookup_dict: EfficientDict, alternative: str, all_alternat
wikidata: WikidataQuery, language_tags: List[str] = None):
# Check if alternative is in lookup_dict
lookup_result = lookup_dict.get(alternative.lower())
# If earlier query get nothing, directly return None
if lookup_result == '':
return None
# If already enriched, update all alternatives and return stored value
if lookup_result:
elif lookup_result:
for dict_key in all_alternatives:
lookup_dict.add(dict_key.lower(), lookup_result)
return lookup_result
# If earlier query get nothing, directly return None
elif lookup_result == '':
return None

# If not queried before, query Wikidata
if language_tags:
Expand All @@ -242,9 +242,6 @@ def get_person_data(wikidata: WikidataQuery, entity: Dict, lookup_person: Effici
"""
Get person data from Wikidata.
"""
# print(entity['text'])
# print(lookup_person.main_dict)
# print(lookup_person.hash_table)
info = {
'key': entity['text'],
'frequency': entity['frequency'],
Expand All @@ -269,9 +266,6 @@ def get_org_data(wikidata: WikidataQuery, entity: Dict, lookup_org: EfficientDic
"""
Get organization data from Wikidata.
"""
# print(entity['text'])
# print(lookup_org.main_dict)
# print(lookup_org.hash_table)
info = {
'frequency': entity['frequency'],
'alternative': entity['alternative']
Expand Down
10 changes: 3 additions & 7 deletions cornac/augmentation/min_maj.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ def get_min_maj_ratio(ne_list, **kwargs):
# Check if ne_list is a valid iterable
if not isinstance(ne_list, list):
raise TypeError(f"Invalid input: Expected a list for 'ne_list', but received {type(ne_list).__name__}.")
# print("Error: ne_list is not a list. Received:", type(ne_list))
# return {} # Return an empty dictionary if ne_list is not valid

# Iterate through each entity in the named entity list
for entity in ne_list:
Expand Down Expand Up @@ -61,13 +59,11 @@ def get_min_maj_ratio(ne_list, **kwargs):
for major_place_of_birth in major_place_of_births:
if (major_place_of_birth in entity_dict.get('place_of_birth', [])) or not entity_dict.get('place_of_birth'):
place_of_birth_match = True

if ethnicity_match and place_of_birth_match:
count['ethnicity'][1] += entity_dict.get('frequency', 1)
break

count['ethnicity'][0] += entity_dict.get('frequency', 1)
break
else:
count['ethnicity'][0] += entity_dict.get('frequency', 1)
break

if not loop_break:
count['ethnicity'][0] += entity_dict.get('frequency', 1)
Expand Down
5 changes: 1 addition & 4 deletions cornac/augmentation/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,7 @@ def set_ner_lang(lang='en'):
return ner

except Exception as e:
# print(f"An error occurred while loading the SpaCy model: {e}")
# return None

raise RuntimeError(f"An unexpected error occurred while loading the SpaCy model '{model_name}': {e}") from e


Expand Down Expand Up @@ -229,8 +228,6 @@ def get_ner(text, ner_model=set_ner_lang(), **kwargs):
'frequency': len(with_name),
'label': label}))
except Exception as e:
# print(f"An error occurred while getting Named Entities: {e}")
# ne_list = None
raise RuntimeError(f"An error occurred while getting Named Entities: {e}")


Expand Down
8 changes: 1 addition & 7 deletions cornac/augmentation/party.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ def get_party(ne_list, lang, lookup_parties):
try:
if not isinstance(ne_list, list):
raise ValueError(f"Error: when extraing party, expected ne_list to be a list, but got {type(ne_list)} instead.")
# print("Error: ne_list is not a list. Received:", type(ne_list))
# return {}, {}


for entity in ne_list:
if isinstance(entity, dict):
Expand Down Expand Up @@ -93,9 +92,4 @@ def get_party(ne_list, lang, lookup_parties):
except Exception as e:
raise RuntimeError(f"Error in get_party function: {e}")

# except Exception as e:
# # Log any errors during party extraction but ensure the pipeline continues
# print(f"Error in get_party function: {e}")
# return {}, lookup_parties # Return empty parties in case of failure, but don't stop the pipeline

return parties, lookup_parties
35 changes: 20 additions & 15 deletions cornac/augmentation/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,30 +177,37 @@ def get_readability(text, lang='en'):
(https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease).

"""
# print(f"Computing readability for language:{lang}")
try:
textstat.set_lang(lang)
except KeyError: # Handle invalid language codes
if lang in new_langs.keys():
lang = 'en' # Default to English
textstat.set_lang(lang) # Set language to English
else:
# print(f"Language code '{lang}' not supported.")
# return None
raise ValueError(f"Invalid language code '{lang}' provided. Supported language codes are: {', '.join(new_langs.keys())}")


if not isinstance(text, str):
raise TypeError(f"Invalid input: Expected a string for 'text', but received {type(text).__name__}.")

# Extract language root (e.g., "en" from "en_US")
lang_root = lang.split("_")[0]

# Check if language is supported by either textstat or our custom configs
all_supported_langs = set(textstat_langs + list(new_langs.keys()))

if lang_root not in all_supported_langs:
raise ValueError(f"Invalid language code '{lang}' provided. Supported language codes are: {', '.join(sorted(all_supported_langs))}")

# Only set language if it's valid
if lang_root in textstat_langs:
textstat.set_lang(lang_root) # Set to root language for textstat
else:
# For custom languages, we'll use our own calculations, so set to English as fallback
textstat.set_lang('en')

try:
if not text:
return None # Empty text
# Check if the text contains any meaningful characters
if not contains_meaningful_characters(text):
return None
lang_root = lang.split("_")[0]
# lang_root = lang.split("_")[0]
if lang_root in textstat_langs:
readability = textstat.flesch_reading_ease(text)
else:
# Use our custom formula
flesch = (
get_lang_cfg(lang_root, "fre_base")
- float(
Expand All @@ -214,8 +221,6 @@ def get_readability(text, lang='en'):
)
readability = round(flesch, 2)
except Exception as e:
# print(f"An error occurred while getting readability score: {e}")
# readability = None
raise RuntimeError(f"An error occurred while calculating the readability score: {e}")

return readability
Loading
Loading