diff --git a/README.md b/README.md
index d4e3326..2a5ef74 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,38 @@
-# CQLEngine - a simple Corpus Query Language Processor
+# Corpus Query Language Engine
+## Presentation
+This repo hosts the code for a simple
+CQL processor. CQL is a language used for
+linguistic queries over large corpora.
-Work in progress
\ No newline at end of file
+## Installation
+
+```shell
+pip3 install corpus-query-language
+```
+
+## Usage
+
+Two main functions are implemented:
+- match, which checks whether a pattern occurs in a corpus (stops at the first match). Returns a boolean.
+- findall, which finds the positions of all matching tokens. Returns a list of (start, end) tuples.
+
+```python
+import sys
+import corpus_query_language as CQL
+
+query = "Some CQL query"
+corpus = CQL.utils.import_corpus("path/to/json/corpus.json")
+MyEngine = CQL.core.CQLEngine()
+MyEngine.findall(corpus, query)
+MyEngine.match(corpus, query)
+```
+
+## Implemented CQL functions
+
+- parsing of all annotation classes: `word`, `lemma`, `pos`, `morph`
+- combination of annotations: `[lemma='rey' & pos='NCMP000']`
+- one or zero annotations: `[lemma='rey']?` (partially implemented, may produce errors)
+- distance: `[lemma='rey'][]{,5}[lemma='santo']`
+- any regex in the annotation value: `[lemma='reye?s?']`
+- alternatives: `([lemma='rey']|[lemma='príncipe'])[]{,5}[lemma='santo']` (may produce errors)
\ No newline at end of file
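For reference, the corpus format implied by the docstrings and tests further down in the patch: a corpus is a flat list of token dictionaries with `word`, `lemma`, `pos` and `morph` keys, serialized as JSON. A minimal sketch (the tokens, the `AQ0MP00` tag and the commented results are illustrative, not taken from the repository's test data; only the `import_corpus`, `CQLEngine`, `findall` and `match` calls come from the code shown here):

```python
import json
import corpus_query_language as CQL

# Illustrative three-token corpus; the keys follow the annotation classes
# listed in the README, the values are made up.
corpus = [
    {"word": "reyes",  "lemma": "rey",   "pos": "NCMP000", "morph": None},
    {"word": "muy",    "lemma": "muy",   "pos": "RG",      "morph": None},
    {"word": "santos", "lemma": "santo", "pos": "AQ0MP00", "morph": None},
]

# import_corpus simply loads a JSON file, so the same list can live on disk.
with open("toy_corpus.json", "w") as f:
    json.dump(corpus, f)
corpus = CQL.utils.import_corpus("toy_corpus.json")

MyEngine = CQL.core.CQLEngine()
spans = MyEngine.findall(corpus, "[lemma='rey'][]{,5}[lemma='santo']")  # e.g. [(0, 2)]
found = MyEngine.match(corpus, "[lemma='rey' & pos='NCMP000']")         # True if a token matches
```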
diff --git a/pyproject.toml b/pyproject.toml
index 21449a0..bced933 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "corpus-query-language"
-version = "0.0.1"
+version = "0.0.5"
 authors = [
   { name="Matthias Gille Levenson", email="matthias.gille-levenson@ens-lyon.fr" },
 ]
@@ -21,6 +21,9 @@ dependencies = [
 requires = ["setuptools", "setuptools-scm"]
 build-backend = "setuptools.build_meta"
 
+[tool.setuptools.packages.find]
+where = ["src"]
+
 [project.urls]
 Homepage = "https://github.com/matgille/CQL"
 Issues = "https://github.com/matgille/CQL/issues"
\ No newline at end of file
diff --git a/CQL.py b/src/CQL.py
similarity index 75%
rename from CQL.py
rename to src/CQL.py
index adb1d24..a933b91 100644
--- a/CQL.py
+++ b/src/CQL.py
@@ -1,4 +1,4 @@
-# Python package project: CQL (Corpus Query Language) parser:
+# Python package project: CQL (Corpus Query Language) language:
 # - parsing of any kind of annotation: word, lemma, pos, morph
 # - combination of annotations: [lemma='rey' & pos='NCMP000']
 # - one or zero annotations [lemma='rey']?.
@@ -6,18 +6,17 @@
 # - any regex in the annotation value [lemma='reye?s?']
 # - alternatives: [lemma='rey']|[lemma='príncipe'][]{,5}[lemma='santo']
 import sys
-import CQLEngine.functions as functions
+import corpus_query_language as CQL
 
 
 # Takes a list of dicts with the annotations as input. Returns:
 # - a list of spans (search_all function)
 # - a boolean (match function)
-
 def main():
     query = sys.argv[1]
-    corpus = functions.import_corpus("tests/test_data/test_corpus.json")
-    MyEngine = functions.CQLEngine()
+    corpus = CQL.utils.import_corpus("../tests/test_data/test_corpus.json")
+    MyEngine = CQL.core.CQLEngine()
     MyEngine.findall(corpus, query)
     MyEngine.match(corpus, query)
diff --git a/src/corpus_query_language/__init__.py b/src/corpus_query_language/__init__.py
new file mode 100644
index 0000000..c76bd74
--- /dev/null
+++ b/src/corpus_query_language/__init__.py
@@ -0,0 +1,5 @@
+import corpus_query_language.core.core as core
+import corpus_query_language.engine.engine as engine
+import corpus_query_language.utils.utils as utils
+import corpus_query_language.language as language
+__all__ = ["core", "engine", "language", "utils"]
\ No newline at end of file
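The new top-level `__init__.py` re-exports the submodules, which is what the README's `CQL.utils.…` and `CQL.core.…` calls rely on. A quick sanity check of the installed layout (a sketch; it only uses names declared in `__init__.py` and in the modules added below):

```python
import corpus_query_language as CQL

# The four submodules listed in __all__ are reachable straight off the package.
print(CQL.__all__)              # ['core', 'engine', 'language', 'utils']
print(CQL.core.CQLEngine)       # engine class from src/corpus_query_language/core/core.py
print(CQL.engine.parse_corpus)  # matcher from src/corpus_query_language/engine/engine.py
print(CQL.utils.import_corpus)  # JSON loader from src/corpus_query_language/utils/utils.py
```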
+ """ + query_ast = utils.build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) + if verbose: + print(f"\n---\nResults for query {query}:") + print(f"Ast: {query_ast}") + print(f"Spans: {result}") + return result + + + def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool: + """ + This function checks whether a query matches some text, and returns True or False + :param query: a CQL query + :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) + :return: a boolean + """ + query_ast = utils.build_grammar(debug=debug, query=query) + result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) + if verbose: + print(f"\n---\nResults for query {query}:") + print(result) + return result diff --git a/src/corpus_query_language/engine/__init__.py b/src/corpus_query_language/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/CQLEngine/engine.py b/src/corpus_query_language/engine/engine.py similarity index 82% rename from src/CQLEngine/engine.py rename to src/corpus_query_language/engine/engine.py index 89574c1..1d51eaf 100644 --- a/src/CQLEngine/engine.py +++ b/src/corpus_query_language/engine/engine.py @@ -1,7 +1,15 @@ -import CQLEngine.functions as functions +import corpus_query_language.utils.utils as utils -def parse_corpus(ast, corpus, mode, debug): +def parse_corpus(ast, corpus: list[dict], mode:str, debug) -> bool | list[tuple[int, int]]: + """ + Main function for parsing a corpus given an AST. + :param ast: The Abstract Syntax Tree to be matched against the corpus. + :param corpus: The corpus as a list of dictionaries. + :param mode: The mode: match (stop at first match, return Bool) or find (search for all matches, returns list of tuples) + :param debug: Debug mode: print all information of matching process + :return: + """ match = False text_end = False tree_index = 0 @@ -23,7 +31,6 @@ def parse_corpus(ast, corpus, mode, debug): # Text-directed engine. while text_end == False: - # On teste si on est en bout de texte. if len(corpus) == text_index and tree_index != ast_length: if debug: @@ -65,7 +72,7 @@ def parse_corpus(ast, corpus, mode, debug): print(f"{operator} in list of analysis") print(len(corpus)) print(text_index) - if functions.simple_match(current_query, corpus[text_index]): + if utils.simple_match(current_query, corpus[text_index]): if debug: print("Found you a. Going forward on tree and text.") print(f"First match is {text_index}") @@ -84,7 +91,7 @@ def parse_corpus(ast, corpus, mode, debug): if debug: print(f"{operator} operator") if operator == "or": - if functions.alternative_match(current_query[1:], corpus[text_index]): + if utils.alternative_match(current_query[1:], corpus[text_index]): if debug: print("Found your alternative. 
diff --git a/src/corpus_query_language/engine/__init__.py b/src/corpus_query_language/engine/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/CQLEngine/engine.py b/src/corpus_query_language/engine/engine.py
similarity index 82%
rename from src/CQLEngine/engine.py
rename to src/corpus_query_language/engine/engine.py
index 89574c1..1d51eaf 100644
--- a/src/CQLEngine/engine.py
+++ b/src/corpus_query_language/engine/engine.py
@@ -1,7 +1,15 @@
-import CQLEngine.functions as functions
+import corpus_query_language.utils.utils as utils
 
 
-def parse_corpus(ast, corpus, mode, debug):
+def parse_corpus(ast, corpus: list[dict], mode:str, debug) -> bool | list[tuple[int, int]]:
+    """
+    Main function for parsing a corpus given an AST.
+    :param ast: the Abstract Syntax Tree to be matched against the corpus
+    :param corpus: the corpus as a list of dictionaries
+    :param mode: "match" (stop at the first match, returns a boolean) or "find" (search for all matches, returns a list of tuples)
+    :param debug: debug mode: print all information about the matching process
+    :return: a boolean in "match" mode, or a list of (start, end) tuples in "find" mode
+    """
     match = False
     text_end = False
     tree_index = 0
@@ -23,7 +31,6 @@ def parse_corpus(ast, corpus, mode, debug):
 
 
     # Text-directed engine.
    while text_end == False:
-        # On teste si on est en bout de texte.
        if len(corpus) == text_index and tree_index != ast_length:
            if debug:
@@ -65,7 +72,7 @@ def parse_corpus(ast, corpus, mode, debug):
                print(f"{operator} in list of analysis")
                print(len(corpus))
                print(text_index)
-            if functions.simple_match(current_query, corpus[text_index]):
+            if utils.simple_match(current_query, corpus[text_index]):
                if debug:
                    print("Found you a. Going forward on tree and text.")
                    print(f"First match is {text_index}")
@@ -84,7 +91,7 @@ def parse_corpus(ast, corpus, mode, debug):
            if debug:
                print(f"{operator} operator")
            if operator == "or":
-                if functions.alternative_match(current_query[1:], corpus[text_index]):
+                if utils.alternative_match(current_query[1:], corpus[text_index]):
                    if debug:
                        print("Found your alternative. Going forward on tree and text.")
                        print(f"First match is {text_index}")
@@ -107,7 +114,7 @@ def parse_corpus(ast, corpus, mode, debug):
                        print(f"\t{text_index}: Looking for {ast[tree_index + 1]} in position {text_index}")
                    if len(corpus) == text_index:
                        break
-                    if functions.simple_match(ast[tree_index + 1], corpus[text_index]):
+                    if utils.simple_match(ast[tree_index + 1], corpus[text_index]):
                        submatch = True
                        tree_index += 2
                        if debug:
@@ -126,7 +133,7 @@ def parse_corpus(ast, corpus, mode, debug):
            elif operator == "and":
                all_matches = []
                for item in current_query[1:]:
-                    if functions.simple_match(item, corpus[text_index]):
+                    if utils.simple_match(item, corpus[text_index]):
                        all_matches.append(True)
                    else:
                        all_matches.append(False)
@@ -143,7 +150,7 @@ def parse_corpus(ast, corpus, mode, debug):
                # Pour l'opérateur "0 ou 1", on vérifie que le token matche.
                # S'il ne matche pas, on passe à la requête suivante sans
                # incrémenter le texte
-                if functions.alternative_match(current_query[1:], corpus[text_index]):
+                if utils.alternative_match(current_query[1:], corpus[text_index]):
                    if debug:
                        print("Found your alternative. Going forward on tree and text.")
                        print(f"First match is {text_index}")
diff --git a/src/corpus_query_language/language/__init__.py b/src/corpus_query_language/language/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/CQLEngine/lexer.py b/src/corpus_query_language/language/lexer.py
similarity index 94%
rename from src/CQLEngine/lexer.py
rename to src/corpus_query_language/language/lexer.py
index 3490917..ad417d3 100644
--- a/src/CQLEngine/lexer.py
+++ b/src/corpus_query_language/language/lexer.py
@@ -2,6 +2,9 @@
 import copy
 
 class Lexer(object):
+    """
+    Lexer that is used to tokenize a query.
+    """
     tokens = (
         'RANGE',
         'DISTANCE',
@@ -81,7 +84,7 @@ def t_error(self, t):
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)
 
 
-    def tokenize(self, query, debug):
+    def tokenize(self, query:str, debug:bool=False):
        self.lexer = lex.lex(module=self)
        self.lexer.input(query)
diff --git a/src/CQLEngine/parser.py b/src/corpus_query_language/language/parser.py
similarity index 95%
rename from src/CQLEngine/parser.py
rename to src/corpus_query_language/language/parser.py
index 4c16277..9d7266e 100644
--- a/src/CQLEngine/parser.py
+++ b/src/corpus_query_language/language/parser.py
@@ -1,11 +1,14 @@
 import ply.yacc as yacc
-import CQLEngine.lexer as lexer
+import corpus_query_language.language.lexer as lexer
 
 
 # API functionnalities.
 
 
 class Parser(lexer.Lexer):
+    """
+    The parser. Builds the AST from the tokens produced by the lexer.
+    """
+
     tokens = lexer.Lexer.tokens
 
     def p_or_queries(self, p):
+ """ tokens = lexer.Lexer.tokens def p_or_queries(self, p): diff --git a/src/corpus_query_language/utils/__init__.py b/src/corpus_query_language/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/CQLEngine/functions.py b/src/corpus_query_language/utils/utils.py similarity index 53% rename from src/CQLEngine/functions.py rename to src/corpus_query_language/utils/utils.py index 4c01ad6..a73e4d9 100644 --- a/src/CQLEngine/functions.py +++ b/src/corpus_query_language/utils/utils.py @@ -1,43 +1,18 @@ import re import json -import CQLEngine.parser as parser -import CQLEngine.lexer as lexer -import CQLEngine.engine as engine +import corpus_query_language.language.parser as parser +import corpus_query_language.language.lexer as lexer -class CQLEngine(): - def findall(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> list[tuple[int, int]]: - """ - This function checks if a query matches some text, and returns the start and end span. - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a list of tuples with the start and end position. - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, mode="find", debug=debug) - if verbose: - print(f"\n---\nResults for query {query}:") - print(f"Ast: {query_ast}") - print(f"Spans: {result}") - return result - def match(self, corpus:list[dict], query:str, verbose:bool=True, debug:bool=False) -> bool: - """ - This function checks whether a query matches some text, and returns True or False - :param query: a CQL query - :param corpus: the annotated text as a list of dictionnaries containing the annotations (lemma, pos, morph, word) - :return: a boolean - """ - query_ast = build_grammar(debug=debug, query=query) - result = engine.parse_corpus(query_ast, corpus, mode="match", debug=debug) - if verbose: - print(f"\n---\nResults for query {query}:") - print(result) - return result - - -def build_grammar(debug, query): +def build_grammar(debug:bool, query:str) -> list: + """ + This function builds an Abstract Syntax Tree from a query + :param debug: outputs parsing information + :param query: the query to build the AST from + :return: the ast + """ MyLexer = lexer.Lexer() MyLexer.tokenize(query, debug=debug) MyParser = parser.Parser(MyLexer, debug=debug) @@ -103,7 +78,12 @@ def alternative_match(queries:list[tuple], text_token:dict) -> bool: -def import_corpus(path): +def import_corpus(path) -> list: + """ + Simple JSON file import to dict + :param path: Path to the JSON file + :return: the list of dicts + """ with open(path, "r") as f: corpus = json.load(f) return corpus \ No newline at end of file diff --git a/tests/tests.py b/tests/tests.py index 6899377..61537c5 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,7 +1,7 @@ import ast import sys sys.path.append('src/') -import CQLEngine.functions as functions +import corpus_query_language as CQL import unittest def import_test_queries(path): @@ -23,14 +23,14 @@ def test_simple_match(self): "pos": "NCMS000", "morph": None, "word": "asnos"} - self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong" + self.assertEqual(CQL.utils.simple_match(query, test_token), True, "Something is wrong" "with function `test_simple_match`") class TestQueries(unittest.TestCase): def test_findall_queries(self): - self.corpus = functions.import_corpus("tests/test_data/test_corpus.json") + self.corpus 
diff --git a/tests/tests.py b/tests/tests.py
index 6899377..61537c5 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,7 +1,7 @@
 import ast
 import sys
 sys.path.append('src/')
-import CQLEngine.functions as functions
+import corpus_query_language as CQL
 import unittest
 
 def import_test_queries(path):
@@ -23,14 +23,14 @@ def test_simple_match(self):
                       "pos": "NCMS000",
                       "morph": None,
                       "word": "asnos"}
-        self.assertEqual(functions.simple_match(query, test_token), True, "Something is wrong"
+        self.assertEqual(CQL.utils.simple_match(query, test_token), True, "Something is wrong "
                                                                           "with function `test_simple_match`")
 
 class TestQueries(unittest.TestCase):
     def test_findall_queries(self):
-        self.corpus = functions.import_corpus("tests/test_data/test_corpus.json")
+        self.corpus = CQL.utils.import_corpus("tests/test_data/test_corpus.json")
         self.queries = import_test_queries("tests/queries_findall.txt")
-        self.MyEngine = functions.CQLEngine()
+        self.MyEngine = CQL.core.CQLEngine()
         for query, GT in self.queries:
             GT = ast.literal_eval(GT)
             with self.subTest(query=query, GT=GT):
@@ -39,7 +39,7 @@ def test_findall_queries(self):
 
     def test_match_queries(self):
         self.queries = import_match_queries("tests/queries_match.txt")
-        self.MyEngine = functions.CQLEngine()
+        self.MyEngine = CQL.core.CQLEngine()
         for idx, (nodes, query, GT) in enumerate(self.queries):
             with self.subTest(query=query, GT=GT):
                 GT = True if GT == "True" else False
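The suite drives `CQLEngine` from query files with expected results (`queries_findall.txt`, `queries_match.txt`). A self-contained case in the same style, with an invented corpus and queries (only the `CQL.core.CQLEngine` API is taken from the code above; the expected outcomes are illustrative):

```python
import sys
import unittest

sys.path.append('src/')
import corpus_query_language as CQL


class TestToyQueries(unittest.TestCase):
    def test_and_and_distance_queries(self):
        # Invented three-token corpus following the project's annotation keys.
        corpus = [
            {"word": "reyes",  "lemma": "rey",   "pos": "NCMP000", "morph": None},
            {"word": "muy",    "lemma": "muy",   "pos": "RG",      "morph": None},
            {"word": "santos", "lemma": "santo", "pos": "AQ0MP00", "morph": None},
        ]
        MyEngine = CQL.core.CQLEngine()
        self.assertTrue(MyEngine.match(corpus, "[lemma='rey' & pos='NCMP000']"))
        self.assertIsInstance(MyEngine.findall(corpus, "[lemma='rey'][]{,5}[lemma='santo']"), list)


if __name__ == "__main__":
    unittest.main()
```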