From fe2ec58f945b2626a27a6acaaac813e0e97bcabd Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Sun, 28 Sep 2025 21:44:45 -0400 Subject: [PATCH 01/24] feat(simssadb): add script to export and class all tables from SQL database. --- simssa/src/export_all_tables.py | 127 ++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 simssa/src/export_all_tables.py diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py new file mode 100644 index 00000000..6f0bb6a4 --- /dev/null +++ b/simssa/src/export_all_tables.py @@ -0,0 +1,127 @@ +""" +Script to export all tables from a PostgreSQL database to structured CSV files. + +Tables are categorized into subdirectories based on predefined mappings. +Empty tables are skipped during the export process. +""" + +import psycopg2 +import csv +import re +import os +import logging + +# Database connection parameters +DB_PARAMS = { + "dbname": "simssadb", + "user": "myuser", + "password": "mypassword", + "host": "localhost", +} + +BASE_OUTPUT_DIR = os.path.abspath("./simssa/data/raw") + +# Table to directory mapping based on existing structure +TABLE_MAPPINGS = { + # Feature-related tables + "extracted_feature": "feature", + "feature": "feature", + "feature_file": "feature", + # Genre-related tables + "genre_as_in_style": "genre", + "genre_as_in_type": "genre", + "musical_work_genres_as_in_style": "genre", + "musical_work_genres_as_in_type": "genre", + # Instance-related tables + "files": "instance", + "source_instantiation": "instance", + "source_instantiation_sections": "instance", + # Musical work-related tables + "musical_work": "musical_work", + "part": "musical_work", + "section": "musical_work", + "geographic_area": "musical_work", + "instrument": "musical_work", + # Person-related tables + "contribution_musical_work": "person", + "person": "person", + # Source-related tables + "source": "source", +} + +# Configure logging 
+logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + + +def main(): + """ + Export all tables from the database to CSV files. + + Tables are placed in subdirectories based on their category. + Skips empty tables and logs the export process. + """ + + # Ensure base output directory exists + os.makedirs(BASE_OUTPUT_DIR, exist_ok=True) + + # Create all subdirectories + subdirs = set(TABLE_MAPPINGS.values()) | {"other"} + for subdir in subdirs: + os.makedirs(os.path.join(BASE_OUTPUT_DIR, subdir), exist_ok=True) + + # Connect to database + conn = psycopg2.connect(**DB_PARAMS) + cur = conn.cursor() + + try: + # Get all table names in public schema + cur.execute( + """ + SELECT tablename FROM pg_tables WHERE schemaname = 'public'; + """ + ) + table_names = [row[0] for row in cur.fetchall()] + + exported_count = 0 + + for table_name in table_names: + try: + # Get table directory (default to 'other' if unknown) + table_dir = TABLE_MAPPINGS.get(table_name, "other") + output_dir = os.path.join(BASE_OUTPUT_DIR, table_dir) + + # Execute query + cur.execute(f'SELECT * FROM "{table_name}"') + + # Skip empty tables + rows = cur.fetchall() + if len(rows) == 0: + logging.info("Skipped %s (empty table)", table_name) + continue + + # Write to CSV + csv_path = os.path.join(output_dir, f"{table_name}.csv") + with open(csv_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow( + [col[0] for col in cur.description] + ) # Write headers + writer.writerows(rows) + + exported_count += 1 + + except (psycopg2.Error, IOError) as e: + logging.error("Error exporting table %s: %s", table_name, e) + + logging.info("\nExport completed!") + logging.info("Total tables exported: %d", exported_count) + + finally: + cur.close() + conn.close() + + +if __name__ == "__main__": + main() From e31fdde676a1cf33a9a06401b9132dcec1ec6536 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Sun, 28 
Sep 2025 21:46:11 -0400 Subject: [PATCH 02/24] feat(simssadb): add script to merge and process CSV. This facilitates reconciliation and RDF conversion. --- simssa/src/merge.py | 221 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 simssa/src/merge.py diff --git a/simssa/src/merge.py b/simssa/src/merge.py new file mode 100644 index 00000000..2ddcf69f --- /dev/null +++ b/simssa/src/merge.py @@ -0,0 +1,221 @@ +import argparse +import logging +from pathlib import Path +import pandas as pd + +DEFAULT_INPUT_DIR = Path("simssa/data/raw") +DEFAULT_OUTPUT_DIR = Path("simssa/data/merged") + +# Configure logger +logger = logging.getLogger(__name__) +if not logger.hasHandlers(): + logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s") + + +def load_and_select(csv_path, usecols, rename_dict): + df = pd.read_csv(csv_path, usecols=usecols, dtype=str) + return df.rename(columns=rename_dict) + + +def merge_instance_data(input_dir): + source_inst_df = load_and_select( + input_dir / "instance" / "source_instantiation.csv", + usecols=["id", "source_id", "work_id"], + rename_dict={"id": "instance_id"}, + ) + + source_inst_section_df = load_and_select( + input_dir / "instance" / "source_instantiation_sections.csv", + usecols=["sourceinstantiation_id", "section_id"], + rename_dict={"sourceinstantiation_id": "instance_id"}, + ) + + files_df = load_and_select( + input_dir / "instance" / "files.csv", + usecols=["id", "file_format", "file", "instantiates_id"], + rename_dict={ + "id": "file_id", + "file": "file_name", + "instantiates_id": "instance_id", + }, + ) + + # Each instance may have multiple files + merged_df = pd.merge(source_inst_df, files_df, on="instance_id", how="left") + full_merged_df = pd.merge( + merged_df, source_inst_section_df, on="instance_id", how="left" + ) + + return full_merged_df + + +def merge_work_data(input_dir): + # Load and process musical_work.csv + work_df = load_and_select( + input_dir / 
"musical_work" / "musical_work.csv", + usecols=["id", "variant_titles", "sacred_or_secular"], + rename_dict={"id": "work_id", "variant_titles": "work_title"}, + ) + # Clean work_title by removing brackets and quotes + work_df["work_title"] = work_df["work_title"].str.replace( + r"[\[\]'\"']", "", regex=True + ) + work_df["sacred_or_secular"] = work_df["sacred_or_secular"].replace( + {"False": "Secular", "True": "Sacred"} + ) + + # Load and process section.csv + section_df = load_and_select( + input_dir / "musical_work" / "section.csv", + usecols=["id", "title", "musical_work_id"], + rename_dict={ + "id": "section_id", + "title": "section_title", + "musical_work_id": "work_id", + }, + ) + + # Left join work_df with section_df on work_id + merged_with_sections_df = pd.merge(work_df, section_df, on="work_id", how="left") + + # Load and process contribution_musical_works.csv + contribution_df = load_and_select( + input_dir / "person" / "contribution_musical_work.csv", + usecols=["role", "person_id", "contributed_to_work_id"], + rename_dict={"contributed_to_work_id": "work_id"}, + ) + + # The original table store the role of the contributor as either AUTHOR or COMPOSER + contribution_pivot = ( + contribution_df.pivot_table( + index="work_id", columns="role", values="person_id", aggfunc="first" + ) + .rename(columns={"AUTHOR": "author_id", "COMPOSER": "composer_id"}) + .reset_index() + ) + + # Merge the pivoted contribution data with the work-section DataFrame + merged_with_creator_df = pd.merge( + merged_with_sections_df, contribution_pivot, on="work_id", how="left" + ) + + # Processing genre data + # Load and process genre-work match table + genre_of_work_df = load_and_select( + input_dir / "genre" / "musical_work_genres_as_in_type.csv", + usecols=["musicalwork_id", "genreasintype_id"], + rename_dict={"musicalwork_id": "work_id", "genreasintype_id": "genre_id"}, + ) + + # Load and process genre id-name match table + genres_df = load_and_select( + input_dir / "genre" / 
"genre_as_in_type.csv", + usecols=["id", "name"], + rename_dict={"id": "genre_id", "name": "genre_name"}, + ) + + # Left join genres_df with work_genres_df on genre_id + merged_genres_df = pd.merge(genre_of_work_df, genres_df, on="genre_id", how="left") + + # Left join the result with the merged work-section-contribution DataFrame on work_id + final_merged_df = pd.merge( + merged_with_creator_df, merged_genres_df, on="work_id", how="left" + ) + + # genre_as_in_style.csv explicitly specify "Renaissance" for all works + final_merged_df["style"] = "Renaissance" + + return final_merged_df + + +def process_source_data(input_dir): + # Load and process source/source.csv + source_df = load_and_select( + input_dir / "source" / "source.csv", + usecols=["id", "title"], + rename_dict={"id": "source_id", "title": "source_title"}, + ) + # Remove double quotes from source_title + source_df["source_title"] = source_df["source_title"].str.replace( + '"', "", regex=False + ) + + return source_df + + +def process_person_data(input_dir): + # Load and process person/person.csv + person_df = load_and_select( + input_dir / "person" / "person.csv", + usecols=[ + "id", + "given_name", + "surname", + "birth_date_range_year_only", + "death_date_range_year_only", + "authority_control_url", + ], + rename_dict={"id": "person_id", "authority_control_url": "viaf_id"}, + ) + + # Combine given_name and surname into person_name + person_df["person_name"] = person_df["given_name"] + " " + person_df["surname"] + + # Extract the year and convert to xsd:date compatible format (YYYY-01-01) + person_df["birth_year"] = ( + person_df["birth_date_range_year_only"] + .str.extract(r"\[(\d{4})")[0] + .apply(lambda y: f"{y}-01-01" if pd.notnull(y) else None) + ) + + person_df["death_year"] = ( + person_df["death_date_range_year_only"] + .str.extract(r"\[(\d{4})")[0] + .apply(lambda y: f"{y}-01-01" if pd.notnull(y) else None) + ) + + # Drop unnecessary name and date_range columns + person_df = person_df[ + 
["person_id", "person_name", "birth_year", "death_year", "viaf_id"] + ] + + return person_df + + +def main(): + parser = argparse.ArgumentParser(description="SIMSSA CSV merge utilities") + parser.add_argument( + "-i", + "--input-dir", + default=DEFAULT_INPUT_DIR, + type=Path, + help="Input raw data directory (default: simssa/data/raw)", + ) + parser.add_argument( + "-o", + "--output-dir", + default=DEFAULT_OUTPUT_DIR, + type=Path, + help="Output directory for merged CSVs (default: simssa/data/merged)", + ) + args = parser.parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + + merge_instance_data(args.input_dir).to_csv( + args.output_dir / "instance.csv", index=False + ) + merge_work_data(args.input_dir).to_csv(args.output_dir / "work.csv", index=False) + process_source_data(args.input_dir).to_csv( + args.output_dir / "source.csv", index=False + ) + process_person_data(args.input_dir).to_csv( + args.output_dir / "person.csv", index=False + ) + + logger.info( + "All CSVs have been successfully processed and saved to %s", args.output_dir + ) + + +if __name__ == "__main__": + main() From 404945e3e6fa0bdc35a8ef2ac4926413c9318981 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:30:44 -0400 Subject: [PATCH 03/24] chore(simssadb): reformat and lint scripts --- simssa/src/flattening/SQL_query.py | 15 +++++++-------- simssa/src/flattening/restructure.py | 4 +--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/simssa/src/flattening/SQL_query.py b/simssa/src/flattening/SQL_query.py index ec86bce6..8b41af45 100644 --- a/simssa/src/flattening/SQL_query.py +++ b/simssa/src/flattening/SQL_query.py @@ -13,10 +13,10 @@ # Query main info from the database db_params = { - 'dbname': 'simssadb', - 'user': 'postgres', - 'password': 'postgres', - 'host': 'localhost' + "dbname": "simssadb", + "user": "postgres", + "password": "postgres", + "host": "localhost", } author_query = """ @@ -177,7 +177,6 @@ 
cur.execute(compact_files_query) - # # START FEATURE FLATTENING # # get distinct feature names: # cur.execute("SELECT DISTINCT feature FROM flattened_view") @@ -210,7 +209,7 @@ results = cur.fetchall() # Export data to CSV -with open('initial_flattened.csv', 'w') as f: +with open("initial_flattened.csv", "w") as f: writer = csv.writer(f) writer.writerow([col[0] for col in cur.description]) writer.writerows(results) @@ -225,11 +224,11 @@ results = cur.fetchall() # Export data to CSV -with open('files.csv', 'w') as f: +with open("files.csv", "w") as f: writer = csv.writer(f) writer.writerow([col[0] for col in cur.description]) writer.writerows(results) # Clean up cur.close() -conn.close() \ No newline at end of file +conn.close() diff --git a/simssa/src/flattening/restructure.py b/simssa/src/flattening/restructure.py index 5a2e054a..fcda7cfa 100644 --- a/simssa/src/flattening/restructure.py +++ b/simssa/src/flattening/restructure.py @@ -31,6 +31,4 @@ # df2 = df2.reset_index() - - -df2.to_csv('./final_flattened.csv', index=False) +df2.to_csv("./final_flattened.csv", index=False) From 384ef6c737c9f38492e1d5e9e9ad18a1648d21d2 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:32:00 -0400 Subject: [PATCH 04/24] doc(simssadb): create template README --- simssa/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/simssa/README.md b/simssa/README.md index ecb0e0c4..c2055059 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -35,3 +35,24 @@ In `generate_jsonld.py`: 4. 
The contexts used in the `compact.jsonld` file is imported from `context.jsonld` ### TODO: Make the RDF conversion convert to Turtle + +## Database Export Scripts + +### export_structured_tables.py + +A new script that exports all tables from the SimssaDB database to CSV files organized by category: + +- **Usage**: `python src/export_structured_tables.py` +- **Output**: Structured CSV files in `data/raw/` subdirectories +- **Categories**: + - `feature/`: extracted_feature, feature, feature_file + - `genre/`: genre_as_in_style, genre_as_in_type, musical_work_genres_* + - `instance/`: files, source_instantiation, source_instantiation_sections + - `musical_work/`: musical_work, part, section + - `person/`: contribution_musical_work, person + - `source/`: source + - `other/`: geographic_area, instrument, software, and any unknown tables + +This script automatically maps tables to appropriate directories based on their content type and puts unknown tables in the `other/` directory. + +``` From a38c427fa8a0d0e9efa90d9a9ead1cf7b5100998 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:33:11 -0400 Subject: [PATCH 05/24] feat(simssadb): create RDF conversion config --- shared/rdf_config/simssadb.toml | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 shared/rdf_config/simssadb.toml diff --git a/shared/rdf_config/simssadb.toml b/shared/rdf_config/simssadb.toml new file mode 100644 index 00000000..9721613e --- /dev/null +++ b/shared/rdf_config/simssadb.toml @@ -0,0 +1,59 @@ +[general] +name = "simssadb" +csv_folder = "../../simssa/data/reconciled" +rdf_output_folder = "../../simssa/data/rdf" +test_mode = false + +[namespaces] +rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +rdfs = "http://www.w3.org/2000/01/rdf-schema#" +xsd = "http://www.w3.org/2001/XMLSchema#" +wd = "http://www.wikidata.org/entity/" +wdt = "http://www.wikidata.org/prop/direct/" +sp = 
"https://db.simssa.ca/persons/" +sw = "https://db.simssa.ca/musicalworks/" +sse = "https://db.simssa.ca/sections/" +st = "https://db.simssa.ca/types/" +ss = "https://db.simssa.ca/sources/" +lms = "https://linkedmusic.ca/graphs/simssadb/" + +[instance] +PRIMARY_KEY = "instance_id" +instance_id = "" +source_id = {prefix = "ss"} +work_id = "" +file_id = "" +file_format = "" +file_name = "" +section_id = "" + +[person] +PRIMARY_KEY = "person_id" +person_name = "P2888" +person_name_original = "rdfs:label" +birth_year = "P569" +death_year = "P570" +viaf_id = "P214" + +[person.person_id] +type = "lms:Person" +prefix = "sp" + +[source] +PRIMARY_KEY = "source_id" +source_id = {prefix = "ss", type = "ls:Source"} +source_title = "rdfs:label" + +[work] +PRIMARY_KEY = "work_id" +work_id = "" +work_title = "" +work_title_original = "" +sacred_or_secular = "" +section_id = "" +section_title = "" +author_id = "" +composer_id = "" +genre_id = "" +genre_name = "" +style = "" From c1f75e3d12333c0cf61f0915e5cd2dcc61731a5f Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:33:37 -0400 Subject: [PATCH 06/24] doc(simssadb): save OpenRefine reconciliation history and export history json --- simssa/openrefine/export/export_person.json | 67 +++++++ simssa/openrefine/export/export_work.json | 89 +++++++++ simssa/openrefine/history/history_person.json | 67 +++++++ simssa/openrefine/history/history_work.json | 170 ++++++++++++++++++ 4 files changed, 393 insertions(+) create mode 100644 simssa/openrefine/export/export_person.json create mode 100644 simssa/openrefine/export/export_work.json create mode 100644 simssa/openrefine/history/history_person.json create mode 100644 simssa/openrefine/history/history_work.json diff --git a/simssa/openrefine/export/export_person.json b/simssa/openrefine/export/export_person.json new file mode 100644 index 00000000..dc79cc4e --- /dev/null +++ b/simssa/openrefine/export/export_person.json @@ -0,0 
+1,67 @@ +[ + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [ + { + "column": "viaf_id", + "propertyName": "VIAF cluster ID", + "propertyID": "P214" + } + ], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "person_name", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "person_name_original", + "columnInsertIndex": 2, + "description": "Create column person_name_original at index 2 based on column person_name using expression grel:value" + } +] \ No newline at end of file diff --git a/simssa/openrefine/export/export_work.json b/simssa/openrefine/export/export_work.json new file mode 100644 index 00000000..9d730418 --- /dev/null +++ b/simssa/openrefine/export/export_work.json @@ -0,0 +1,89 @@ +{ + "format": "csv", + "separator": ",", + "lineSeparator": "\n", + "encoding": "UTF-8", + "quoteAll": false, + "outputColumnHeaders": true, + "outputBlankRows": false, + "columns": [ 
+ { + "name": "person_id", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "person_name", + "reconSettings": { + "output": "entity-id", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "person_name_original", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "birth_year", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "death_year", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "viaf_id", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + } + ] +} \ No newline at end of file diff --git a/simssa/openrefine/history/history_person.json b/simssa/openrefine/history/history_person.json new file mode 100644 index 00000000..dc79cc4e --- /dev/null +++ b/simssa/openrefine/history/history_person.json @@ -0,0 +1,67 @@ +[ + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": 
"http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [ + { + "column": "viaf_id", + "propertyName": "VIAF cluster ID", + "propertyID": "P214" + } + ], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "person_name", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "person_name_original", + "columnInsertIndex": 2, + "description": "Create column person_name_original at index 2 based on column person_name using expression grel:value" + } +] \ No newline at end of file diff --git a/simssa/openrefine/history/history_work.json b/simssa/openrefine/history/history_work.json new file mode 100644 index 00000000..facb4404 --- /dev/null +++ b/simssa/openrefine/history/history_work.json @@ -0,0 +1,170 @@ +[ + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "work_title", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q105543609", + "name": "musical work/composition" + }, + "autoMatch": true, 
+ "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column work_title to type Q105543609" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "work_title: judgment", + "expression": "forNonBlank(cell.recon.judgment, v, v, if(isNonBlank(value), \"(unreconciled)\", \"(blank)\"))", + "columnName": "work_title", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "matched", + "l": "matched" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "range", + "name": "work_title: best candidate's score", + "expression": "cell.recon.best.score", + "columnName": "work_title", + "from": 99, + "to": 101, + "selectNumeric": true, + "selectNonNumeric": true, + "selectBlank": false, + "selectError": true + } + ], + "mode": "row-based" + }, + "baseColumnName": "work_title", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "work_title_original", + "columnInsertIndex": 2, + "description": "Create column work_title_original at index 2 based on column work_title using expression grel:value" + }, + { + "op": "core/column-removal", + "columnName": "work_title_original", + "description": "Remove column work_title_original" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "work_title", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "work_title_original", + "columnInsertIndex": 2, + "description": "Create column work_title_original at index 2 based on column work_title using expression grel:value" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "sacred_or_secular", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + 
"schemaSpace": "http://www.wikidata.org/prop/direct/", + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column sacred_or_secular to type null" + }, + { + "op": "core/recon-judge-similar-cells", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "sacred_or_secular", + "similarValue": "Secular", + "judgment": "matched", + "match": { + "id": "Q2707298", + "name": "secular music", + "types": [ + "" + ], + "score": 100 + }, + "shareNewTopics": false, + "description": "Match item secular music (Q2707298) for cells containing \"Secular\" in column sacred_or_secular" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "genre_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q188451", + "name": "music genre" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column genre_name to type Q188451" + }, + { + "op": "core/recon-judge-similar-cells", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "sacred_or_secular", + "similarValue": "Sacred", + "judgment": "matched", + "match": { + "id": "Q1065742", + "name": "religious music", + "types": [ + "" + ], + "score": 100 + }, + "shareNewTopics": false, + "description": "Match item religious music (Q1065742) for cells containing \"Sacred\" in column sacred_or_secular" + } +] \ No newline at end of file From 4d516443826092bb7d5aaf28a17988e1e5ea2c0a Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:15:44 -0400 Subject: [PATCH 07/24] feat(simssadb): Complete RDF configuration file --- shared/rdf_config/simssadb.toml | 50 
++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/shared/rdf_config/simssadb.toml b/shared/rdf_config/simssadb.toml index 9721613e..f0dd74a2 100644 --- a/shared/rdf_config/simssadb.toml +++ b/shared/rdf_config/simssadb.toml @@ -15,25 +15,28 @@ sw = "https://db.simssa.ca/musicalworks/" sse = "https://db.simssa.ca/sections/" st = "https://db.simssa.ca/types/" ss = "https://db.simssa.ca/sources/" +sf = "https://db.simssa.ca/files/" lms = "https://linkedmusic.ca/graphs/simssadb/" [instance] -PRIMARY_KEY = "instance_id" -instance_id = "" -source_id = {prefix = "ss"} -work_id = "" -file_id = "" +PRIMARY_KEY = "file_id" +work_id = {prefix = "sw", pred = "P6243"} # digital representation of (P6243) +section_id = {prefix = "sse", pred = "P6243"} # digital representation of (P6243) +file_id = {prefix = "sf", type = "lms:File"} file_format = "" -file_name = "" -section_id = "" +file_name = "rdfs:label" + +[instance.source_id] +prefix = "ss" +pred = "P144" # based on (P144) [person] PRIMARY_KEY = "person_id" -person_name = "P2888" +person_name = "P2888" # exact match (P2888) person_name_original = "rdfs:label" -birth_year = "P569" -death_year = "P570" -viaf_id = "P214" +birth_year = "P569" # date of birth (P569) +death_year = "P570" # date of death (P570) +viaf_id = "P214" # VIAF cluster ID (P214) [person.person_id] type = "lms:Person" @@ -41,19 +44,22 @@ prefix = "sp" [source] PRIMARY_KEY = "source_id" -source_id = {prefix = "ss", type = "ls:Source"} source_title = "rdfs:label" +[source.source_id] +prefix = "ss" +type = "lms:Source" + [work] PRIMARY_KEY = "work_id" -work_id = "" -work_title = "" -work_title_original = "" -sacred_or_secular = "" -section_id = "" -section_title = "" -author_id = "" -composer_id = "" -genre_id = "" -genre_name = "" +work_id = {prefix = "sw", type = "lms:Work"} +work_title = {if = "isinstance(obj, URIRef)", pred = "P2888"} # exact match (P2888) +work_title_original = "rdfs:label" +sacred_or_secular = 
"P136" # genre (P136) +section_id = {prefix = "sse", pred = "P527", type = "lms:Section"} # has part(s) (P527) +section_title = {subj = "section_id", pred = "rdfs:label"} +author_id = {prefix = "sp", pred = "P50"} # author (P50) +composer_id = {prefix = "sp", pred = "P86"} # composer (P86) +genre_id = {prefix = "st", type = "lms:GenreAsInType"} +genre_name = {subj = "genre_id", pred = "rdfs:label"} style = "" From f632d34ae92ce17be31cab16a67b280775ac5d42 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:16:14 -0400 Subject: [PATCH 08/24] refactor(simssadb): remove unneeded column from merged CSV. --- simssa/src/merge.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/simssa/src/merge.py b/simssa/src/merge.py index 2ddcf69f..8cff9197 100644 --- a/simssa/src/merge.py +++ b/simssa/src/merge.py @@ -41,11 +41,14 @@ def merge_instance_data(input_dir): ) # Each instance may have multiple files - merged_df = pd.merge(source_inst_df, files_df, on="instance_id", how="left") + merged_df = pd.merge(source_inst_df, source_inst_section_df, on="instance_id", how="left") full_merged_df = pd.merge( - merged_df, source_inst_section_df, on="instance_id", how="left" + merged_df, files_df, on="instance_id", how="left" ) + # Drop the instance_id column + full_merged_df = full_merged_df.drop(columns=["instance_id"]) + return full_merged_df From 1652450a04f68de64a927a0e31af4c00f6cf4b8e Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:29:57 -0400 Subject: [PATCH 09/24] chore(simssadb): update docstrings --- simssa/src/export_all_tables.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py index 6f0bb6a4..1ea4cb73 100644 --- a/simssa/src/export_all_tables.py +++ b/simssa/src/export_all_tables.py @@ -1,15 +1,16 @@ """ -Script to export all 
tables from a PostgreSQL database to structured CSV files. +Script to export all tables from the SimssaDB PostgreSQL database to CSV files. -Tables are categorized into subdirectories based on predefined mappings. +Make sure to set up the database following the guidelines in the README. + +CSV filesare categorized into subdirectories based on predefined mappings. Empty tables are skipped during the export process. """ -import psycopg2 import csv -import re import os import logging +import psycopg2 # Database connection parameters DB_PARAMS = { @@ -21,7 +22,8 @@ BASE_OUTPUT_DIR = os.path.abspath("./simssa/data/raw") -# Table to directory mapping based on existing structure +# Table to subdirectory mapping based on existing structure +# Example: "extracted_feature" CSV goes to "data/raw/feature" subdirectory TABLE_MAPPINGS = { # Feature-related tables "extracted_feature": "feature", @@ -59,8 +61,8 @@ def main(): """ Export all tables from the database to CSV files. - Tables are placed in subdirectories based on their category. - Skips empty tables and logs the export process. + Tables are placed in subdirectories based on existing mapping. + Skips empty tables. 
""" # Ensure base output directory exists @@ -88,9 +90,9 @@ def main(): for table_name in table_names: try: - # Get table directory (default to 'other' if unknown) - table_dir = TABLE_MAPPINGS.get(table_name, "other") - output_dir = os.path.join(BASE_OUTPUT_DIR, table_dir) + # Get table subdirectory (default to 'other' if unknown) + table_subdir = TABLE_MAPPINGS.get(table_name, "other") + output_dir = os.path.join(BASE_OUTPUT_DIR, table_subdir) # Execute query cur.execute(f'SELECT * FROM "{table_name}"') From 489b79134c73eba3164d384f9eae551f51dc3010 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:41:55 -0400 Subject: [PATCH 10/24] refactor(simssadb): add command argument options --- simssa/src/export_all_tables.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py index 1ea4cb73..7fd8cbaf 100644 --- a/simssa/src/export_all_tables.py +++ b/simssa/src/export_all_tables.py @@ -10,6 +10,7 @@ import csv import os import logging +import argparse import psycopg2 # Database connection parameters @@ -20,7 +21,7 @@ "host": "localhost", } -BASE_OUTPUT_DIR = os.path.abspath("./simssa/data/raw") +DEFAULT_OUTPUT_DIR = os.path.abspath("./simssa/data/raw") # Table to subdirectory mapping based on existing structure # Example: "extracted_feature" CSV goes to "data/raw/feature" subdirectory @@ -57,7 +58,7 @@ ) -def main(): +def main(base_output_dir): """ Export all tables from the database to CSV files. 
@@ -66,12 +67,12 @@ def main(): """ # Ensure base output directory exists - os.makedirs(BASE_OUTPUT_DIR, exist_ok=True) + os.makedirs(base_output_dir, exist_ok=True) # Create all subdirectories subdirs = set(TABLE_MAPPINGS.values()) | {"other"} for subdir in subdirs: - os.makedirs(os.path.join(BASE_OUTPUT_DIR, subdir), exist_ok=True) + os.makedirs(os.path.join(base_output_dir, subdir), exist_ok=True) # Connect to database conn = psycopg2.connect(**DB_PARAMS) @@ -92,7 +93,7 @@ def main(): try: # Get table subdirectory (default to 'other' if unknown) table_subdir = TABLE_MAPPINGS.get(table_name, "other") - output_dir = os.path.join(BASE_OUTPUT_DIR, table_subdir) + output_dir = os.path.join(base_output_dir, table_subdir) # Execute query cur.execute(f'SELECT * FROM "{table_name}"') @@ -126,4 +127,12 @@ def main(): if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Export all tables from the SimssaDB PostgreSQL database to CSV files.") + parser.add_argument( + "--output", + type=str, + default=DEFAULT_OUTPUT_DIR, + help="Base directory for output CSV files (default: ./simssa/data/raw)" + ) + args = parser.parse_args() + main(args.output) From f3329762b3f3ccd1ea5de3c183036c44d24d9268 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:53:15 -0400 Subject: [PATCH 11/24] chore(simssadb): update variable names and docstrings --- simssa/src/merge.py | 94 +++++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 34 deletions(-) diff --git a/simssa/src/merge.py b/simssa/src/merge.py index 8cff9197..4b0afe86 100644 --- a/simssa/src/merge.py +++ b/simssa/src/merge.py @@ -1,3 +1,14 @@ +""" +Merge and process raw SimssaDB CSV files. + +This script should be run before reconciliation. 
+Generates the following merged CSV files: +- instance.csv +- work.csv +- source.csv +- person.csv +""" + import argparse import logging from pathlib import Path @@ -12,25 +23,34 @@ logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s") -def load_and_select(csv_path, usecols, rename_dict): +def load_csv(csv_path, usecols, rename_dict): + """ + Load a CSV file, select specific columns, and rename them if needed. + + Args: + csv_path (Path): Path to the CSV file. + usecols (list): List of columns to select. + rename_dict (dict): Dictionary for renaming columns. + """ df = pd.read_csv(csv_path, usecols=usecols, dtype=str) return df.rename(columns=rename_dict) def merge_instance_data(input_dir): - source_inst_df = load_and_select( + """Merge instance-related CSV files into a single DataFrame.""" + source_inst_df = load_csv( input_dir / "instance" / "source_instantiation.csv", usecols=["id", "source_id", "work_id"], rename_dict={"id": "instance_id"}, ) - source_inst_section_df = load_and_select( + source_inst_section_df = load_csv( input_dir / "instance" / "source_instantiation_sections.csv", usecols=["sourceinstantiation_id", "section_id"], rename_dict={"sourceinstantiation_id": "instance_id"}, ) - files_df = load_and_select( + files_df = load_csv( input_dir / "instance" / "files.csv", usecols=["id", "file_format", "file", "instantiates_id"], rename_dict={ @@ -40,21 +60,24 @@ def merge_instance_data(input_dir): }, ) - # Each instance may have multiple files - merged_df = pd.merge(source_inst_df, source_inst_section_df, on="instance_id", how="left") - full_merged_df = pd.merge( - merged_df, files_df, on="instance_id", how="left" + + merged_df = pd.merge( + source_inst_df, source_inst_section_df, on="instance_id", how="left" ) + # Each instance may have multiple files + full_merged_df = pd.merge(merged_df, files_df, on="instance_id", how="left") - # Drop the instance_id column + # There is no webpage for instances. 
It is not necessary to keep the instance_id + # This CSV is mainly to link files to works, sections and sources full_merged_df = full_merged_df.drop(columns=["instance_id"]) return full_merged_df def merge_work_data(input_dir): + """Merge work-related CSV files into a single DataFrame.""" # Load and process musical_work.csv - work_df = load_and_select( + work_df = load_csv( input_dir / "musical_work" / "musical_work.csv", usecols=["id", "variant_titles", "sacred_or_secular"], rename_dict={"id": "work_id", "variant_titles": "work_title"}, @@ -63,12 +86,13 @@ def merge_work_data(input_dir): work_df["work_title"] = work_df["work_title"].str.replace( r"[\[\]'\"']", "", regex=True ) + # Map sacred_or_secular boolean to descriptive strings work_df["sacred_or_secular"] = work_df["sacred_or_secular"].replace( {"False": "Secular", "True": "Sacred"} ) # Load and process section.csv - section_df = load_and_select( + section_df = load_csv( input_dir / "musical_work" / "section.csv", usecols=["id", "title", "musical_work_id"], rename_dict={ @@ -78,18 +102,15 @@ def merge_work_data(input_dir): }, ) - # Left join work_df with section_df on work_id - merged_with_sections_df = pd.merge(work_df, section_df, on="work_id", how="left") - # Load and process contribution_musical_works.csv - contribution_df = load_and_select( + contribution_df = load_csv( input_dir / "person" / "contribution_musical_work.csv", usecols=["role", "person_id", "contributed_to_work_id"], rename_dict={"contributed_to_work_id": "work_id"}, ) # The original table store the role of the contributor as either AUTHOR or COMPOSER - contribution_pivot = ( + contribution_pivoted = ( contribution_df.pivot_table( index="work_id", columns="role", values="person_id", aggfunc="first" ) @@ -97,21 +118,24 @@ def merge_work_data(input_dir): .reset_index() ) + # Left join work_df with section_df on work_id + merged_with_sections_df = pd.merge(work_df, section_df, on="work_id", how="left") + # Merge the pivoted contribution data 
with the work-section DataFrame merged_with_creator_df = pd.merge( - merged_with_sections_df, contribution_pivot, on="work_id", how="left" + merged_with_sections_df, contribution_pivoted, on="work_id", how="left" ) # Processing genre data # Load and process genre-work match table - genre_of_work_df = load_and_select( + genre_of_work_df = load_csv( input_dir / "genre" / "musical_work_genres_as_in_type.csv", usecols=["musicalwork_id", "genreasintype_id"], rename_dict={"musicalwork_id": "work_id", "genreasintype_id": "genre_id"}, ) # Load and process genre id-name match table - genres_df = load_and_select( + genres_df = load_csv( input_dir / "genre" / "genre_as_in_type.csv", usecols=["id", "name"], rename_dict={"id": "genre_id", "name": "genre_name"}, @@ -131,9 +155,10 @@ def merge_work_data(input_dir): return final_merged_df -def process_source_data(input_dir): +def merge_source_data(input_dir): + """Merge source-related CSV files into a single DataFrame.""" # Load and process source/source.csv - source_df = load_and_select( + source_df = load_csv( input_dir / "source" / "source.csv", usecols=["id", "title"], rename_dict={"id": "source_id", "title": "source_title"}, @@ -146,9 +171,10 @@ def process_source_data(input_dir): return source_df -def process_person_data(input_dir): +def merge_person_data(input_dir): + """Merge person-related CSV files into a single DataFrame.""" # Load and process person/person.csv - person_df = load_and_select( + person_df = load_csv( input_dir / "person" / "person.csv", usecols=[ "id", @@ -189,34 +215,34 @@ def main(): parser = argparse.ArgumentParser(description="SIMSSA CSV merge utilities") parser.add_argument( "-i", - "--input-dir", + "--input", default=DEFAULT_INPUT_DIR, type=Path, help="Input raw data directory (default: simssa/data/raw)", ) parser.add_argument( "-o", - "--output-dir", + "--output", default=DEFAULT_OUTPUT_DIR, type=Path, help="Output directory for merged CSVs (default: simssa/data/merged)", ) args = 
parser.parse_args() - args.output_dir.mkdir(parents=True, exist_ok=True) + args.output.mkdir(parents=True, exist_ok=True) - merge_instance_data(args.input_dir).to_csv( - args.output_dir / "instance.csv", index=False + merge_instance_data(args.input).to_csv( + args.output / "instance.csv", index=False ) - merge_work_data(args.input_dir).to_csv(args.output_dir / "work.csv", index=False) - process_source_data(args.input_dir).to_csv( - args.output_dir / "source.csv", index=False + merge_work_data(args.input).to_csv(args.output / "work.csv", index=False) + merge_source_data(args.input).to_csv( + args.output / "source.csv", index=False ) - process_person_data(args.input_dir).to_csv( - args.output_dir / "person.csv", index=False + merge_person_data(args.input).to_csv( + args.output / "person.csv", index=False ) logger.info( - "All CSVs have been successfully processed and saved to %s", args.output_dir + "All CSVs have been successfully processed and saved to %s", args.output ) From 1fc900164d29520d22a1a031a0bc2b2a251febb2 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:56:18 -0400 Subject: [PATCH 12/24] chore(simssadb): do minor linting --- simssa/src/export_all_tables.py | 4 +++- simssa/src/merge.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py index 7fd8cbaf..835b205a 100644 --- a/simssa/src/export_all_tables.py +++ b/simssa/src/export_all_tables.py @@ -127,7 +127,9 @@ def main(base_output_dir): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Export all tables from the SimssaDB PostgreSQL database to CSV files.") + parser = argparse.ArgumentParser( + description="Export all tables from the SimssaDB PostgreSQL database to CSV files." 
+ ) parser.add_argument( "--output", type=str, diff --git a/simssa/src/merge.py b/simssa/src/merge.py index 4b0afe86..7a818a15 100644 --- a/simssa/src/merge.py +++ b/simssa/src/merge.py @@ -60,7 +60,7 @@ def merge_instance_data(input_dir): }, ) - + merged_df = pd.merge( source_inst_df, source_inst_section_df, on="instance_id", how="left" ) @@ -212,6 +212,7 @@ def merge_person_data(input_dir): def main(): + """Main function to parse arguments and run merging functions.""" parser = argparse.ArgumentParser(description="SIMSSA CSV merge utilities") parser.add_argument( "-i", From 1546fefdb8a485eed890114fb1ee474a11777509 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 14 Nov 2025 08:47:27 -0500 Subject: [PATCH 13/24] doc(simssa): update README to include an overview of SIMSSADB content and ingestion process --- simssa/README.md | 204 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 38 deletions(-) diff --git a/simssa/README.md b/simssa/README.md index c2055059..f2c45f16 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -1,58 +1,186 @@ -# SimssaDB flattening and json-ld structures +# Ingestion of SIMSSA DB -> Summary: +# 1. General Description -> 1. Upload SQL dump to local postgreSQL database -> 2. With output run `simssa/src/flattening/SQL_query.py` -> 3. Reconcile `initial_flattened.csv` with OpenRefine -> 4. Reconcile `files.csv` with OpenRefine -> 5. With output run `simssa/src/flattening/restructure.py` -> 6. With output run `simssa/src/jsonld/generate_jsonld.py` (which also takes `simssa/src/jsonld/context.jsonld` as the initial context) +You can read more about SIMSSA DB on the [official webpage](https://db.simssa.ca/about/). A graphic of the SIMSSA DB database model can be found [on Cory McKay's SourceForge page](https://jmir.sourceforge.net/cmckay/papers/mckay17database.pdf) -## 1. 
Extracting columns and feature flattening +The project is mainly maintained by [Cory McKay](https://jmir.sourceforge.net/cmckay/). According to Ich, it is unlikely for SIMSSA DB to see any future update. -After uploading the database dump to the local PostgreSQL database, we first select relevant columns and perform initial feature flattening with `psycopg` in `SQL_query.py` +# 2. Obtaining The Database Dump -When extracting the files, I found that since there often was more than one file per work, the SQL query would create rows where each data field was duplicated, except for the fields relating to the files, due to the behaviour of the `FULL OUTER JOIN` SQL command. -As such, I decided to instead create a second CSV file that would only contain the files, and there would be a field indicating the musical work that the file corresponded to, allowing us to merge that CSV file with the main CSV file during RDF conversion. -Furthermore, some files aren't linked to any musical works. I chose to simply ignore them when exporting the list of files because files without musical works aren't useful at all for the datalake. These files seem to be linked to musical works that aren't in the data dump currently being used. This will hopefully be fixed by [#263](https://github.com/DDMAL/linkedmusic-datalake/issues/263). +Dylan has obtained a PostgreSQL dump of SIMSSA DB, the dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki on how to set up your Arbutus account. -This produces 2 CSV files, `final_flattened.csv`, a flattening of all the tables into one CSV with `musical_work_id` as the primary key, and `files.csv`, containing the data about all files and the works they are linked to. +# 3. Export SQL Dump to CSV files +1. Install PostgreSQL, if it is not installed already. -## 2. Reconciliation with OpenRefine +2. 
Make sure that postgres is running using the following command: +```bash +sudo service postgresql status +``` +Start postgresql if it is not running: -OpenRefine reconciliation was performed on `initial_flattened.csv` and on `files.csv`. You can see the reconciled files `reconciled_wikiID.csv` and `reconciled_files_WikiID.csv`. You can use `simssa/openrefine/history/history_flattened.json` and `simssa/openrefine/history/history_files.json` to facilitate reconciliation and `simssa/openrefine/export/export_template_flattened.json` and `simssa/openrefine/export/export_template_files.json` to export to the desired csv format. +```bash +sudo service postgresql start +``` -## 3. Reconcile column names and generating json-ld +3. Start the postgres shell -Currently the json-ld is generated as follows: +```bash +sudo -u postgres psql +``` -In `generate_jsonld.py`: +4. Inside the shell, create a new user and database, and exit the shell: -1. Convert csv to json documents -2. Loop through each json document and edit each entry, creating the compact jsonld. Also parse the files csv to extract and files associated with each entry. -3. Generate the jsonld file at `compact.jsonld` -4. The contexts used in the `compact.jsonld` file is imported from `context.jsonld` +```bash +CREATE USER myuser WITH PASSWORD 'mypassword'; +CREATE DATABASE simssadb OWNER myuser; +GRANT ALL PRIVILEGES ON DATABASE simssadb TO myuser; +\q +``` -### TODO: Make the RDF conversion convert to Turtle +5. Load the SQL dump into your new database through the following command: -## Database Export Scripts +```bash +sudo -u postgres sh -c "gunzip -c <path-to-dump.gz> | psql -d simssadb" +``` + +When prompted, enter "mypassword" as the password. -### export_structured_tables.py +6. 
Grant read access of all loaded tables to "myuser" -A new script that exports all tables from the SimssaDB database to CSV files organized by category: +First, start the shell again: +```bash +sudo -u postgres psql -d simssadb +``` -- **Usage**: `python src/export_structured_tables.py` -- **Output**: Structured CSV files in `data/raw/` subdirectories -- **Categories**: - - `feature/`: extracted_feature, feature, feature_file - - `genre/`: genre_as_in_style, genre_as_in_type, musical_work_genres_* - - `instance/`: files, source_instantiation, source_instantiation_sections - - `musical_work/`: musical_work, part, section - - `person/`: contribution_musical_work, person - - `source/`: source - - `other/`: geographic_area, instrument, software, and any unknown tables +Then, run the following commands: +```bash +-- Grant SELECT on all existing tables +GRANT SELECT ON ALL TABLES IN SCHEMA public TO myuser; -This script automatically maps tables to appropriate directories based on their content type and puts unknown tables in the `other/` directory. +-- Grant SELECT on tables created in the future +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO myuser; +\q ``` + +7. Run `export_all_tables.py` + +Run the following command from the repository root directory: +```bash +python simssa/src/export_all_tables.py +``` + +All nonempty tables should be outputted as CSV files in the subdirectories of `simssa/data/raw` + + +# 4. Overview of The Raw Dataset + +After running `simssa/src/export_all_tables.py `, each nonempty table should be outputted as a CSV file in a subdirectory of `simssa/data/raw` + +`export_all_tables.py` groups the CSV files into the following subdirectories: +1. `feature`: CSV related to audio/musical features (e.g. most frequent pitch, rhythmic variability). +2. `genre`: CSV files related to musical genres, including both "genre-as-in-style" (e.g., Renaissance) and "genre-as-in-type" (e.g., Madrigal). +3. 
`instance`: CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. +4. `musical_work`: CSV files related to musical works, including their titles, sections, and associated metadata. Musical works (i.e. compositions) are the central entities of SIMSSA DB. +5. `person`: CSV files containing data about authors and composers, including their roles and contributions. +6. `source`: CSV files describing the origins of scores and their relationships to musical works and sections. + +Every other CSV file is placed in the `other` subdirectory: these do not seem to pertinent to the datalake. + + +## 4.1 Feature Subdirectory +Contains CSV related to audio/musical features (e.g. most frequent pitch, rythmic variability). These features were extracted from MIDI files. You can find an example of features list at `https://db.simssa.ca/files/2018` + +Contains the following CSVs: +- extracted_features.csv: list of musical/audio features +- feature_file.csv: location of files containing extracted features +- feature.csv: another list of musical/audio features + +Musical features are currently omitted from the RDF since it is very difficult/impractical to store them Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website. + +## 4.2 Genre Subdirectory +Contains CSV files related to musical genres, including both "genre-as-in-style" and "genre-as-in-type." + +Contains the following CSVs: +- genre_as_in_style.csv: "Renaissance" is the only genre_as_in_style in SIMSSA DB. +- genre_as_in_type.csv: Lists twelve different genre_as_in_type (e.g., Zibaldone, Madrigal). +- musical_work_genres_as_in_style.csv: Maps every musical work in SIMSSA DB to the genre "Renaissance." +- musical_work_genres_as_in_type.csv: Maps musical works to their genre_as_in_type. + +Musical genres are an important aspect of SIMSSA DB, particularly "genre-as-in-type," which provides more detailed classifications. 
These data are suitable for Linked Data representation. + +## 4.3 Instance Subdirectory +Contains CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. + +Contains the following CSVs: +- files.csv: Points to files containing sheet music or MIDI scores. +- source_instantiation.csv: Links instances to a musical work and to a source. +- source_instantiation_sections.csv: Links instances to a section of a musical work. An instance is either linked to the entire musical work or to a section of it. + +Instances are not stored as distinct entities in the datalake but are crucial for linking works, sources, and files in the raw dataset. + +## 4.4 Musical Work Subdirectory +Contains CSV files related to musical works, including their titles, sections, and associated metadata. + +Contains the following CSVs: +- geographic_area.csv: Only contains "Vienna." +- instruments.csv: Only contains "Voice." +- musical_works.csv: Links a musical work to its title and indicates whether it is sacred or secular. +- part.csv: Lists whenever a work has a part for voice. +- section.csv: Lists sections of the musical works (e.g., work 117 may have a "Sanctus (In nomine)" section). + +Among these, only `musical_works.csv` and `section.csv` are ingested into the datalake. The other files were not part of the final RDF since they contained so little data. + +## 4.5 Person Subdirectory +Contains CSV files related to authors and composers, including their roles and contributions. + +Contains the following CSVs: +- person.csv: Lists all composers/authors, with their birth and death years. +- contribution_musical_work.csv: Links people to compositions. The "role" column describes whether the person was an "AUTHOR" or a "COMPOSER." + +These files provide essential metadata about the creators of musical works and their contributions, making them suitable for Linked Data representation. + +# 5. 
Type of Entities in the RDF + +## 5.1 Persons +Prefix: `https://db.simssa.ca/persons/` + +Identifies people who are either authors or composers of musical works. Each person is linked to a VIAF ID in the raw dataset. + +## 5.2 Musical Works +Prefix: `https://db.simssa.ca/musicalworks/` + +Identifies individual musical works (i.e. compositions). Each composition is linked to: +1. An author and a composer +2. A genre +3. Symbolic music files (MIDI & PDF score) +4. Sections (e.g. a mass may have an Introit section) + +## 5.3 Sections +Prefix: `https://db.simssa.ca/sections/` + +This namespace refers to *sections* of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work. + +There can be a symbolic music file for a particular section instead of the whole composition. + + +## 5.4 Types +Prefix: `https://db.simssa.ca/types/` + + +This namespace contains controlled vocabulary terms and classification types used throughout the database—such as genre categories, musical form types, chant classifications, and descriptive typologies. These are reference entities used to annotate works, sections, or sources with normalized terms. + + +## 5.5 Sources +Prefix: `https://db.simssa.ca/sources/` + +Identifies the sources of the scores, i.e. where the scores originate from. Each source can be linked to one or more musical works and sections (see the `source` subdirectory description above). + + +## 5.6 Files +Prefix: `https://db.simssa.ca/files/` + +Identifies the symbolic music file (PDF or MIDI) attached to a work or a section. 
+ + From 79b9b1782cca7bc6cb2fd916751914e842e28b2d Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 14 Nov 2025 08:51:02 -0500 Subject: [PATCH 14/24] chore(simssa): delete outdated SQL to CSV scripts --- simssa/src/flattening/SQL_query.py | 234 --------------------------- simssa/src/flattening/restructure.py | 34 ---- 2 files changed, 268 deletions(-) delete mode 100644 simssa/src/flattening/SQL_query.py delete mode 100644 simssa/src/flattening/restructure.py diff --git a/simssa/src/flattening/SQL_query.py b/simssa/src/flattening/SQL_query.py deleted file mode 100644 index 8b41af45..00000000 --- a/simssa/src/flattening/SQL_query.py +++ /dev/null @@ -1,234 +0,0 @@ -import psycopg2 -import csv -import re - - -# steps: -# 1. upload the simssadb dump to a local postgreSQL database called 'simssadb'. -# 2. change db_params fields below as needed. -# 3. Run the scripts, generate the flattened.csv file. -# Each row associates with a different file_id. File-to-work flattening happens after this, in the script restructure.py (with pandas) -# flattened.csv however has all the associated features of each file flattened. -# 4. 
Reconcile the generated flattened.csv and run the script with restructure.py - -# Query main info from the database -db_params = { - "dbname": "simssadb", - "user": "postgres", - "password": "postgres", - "host": "localhost", -} - -author_query = """ - CREATE VIEW author AS - SELECT - musical_work.id AS musical_work_id, - contribution_musical_work.id AS contribution_id, - person.given_name AS given_name, - person.surname AS sur_name, - person.authority_control_url AS auth_URL - FROM - musical_work - FULL OUTER JOIN - contribution_musical_work ON contribution_musical_work.contributed_to_work_id = musical_work.id - FULL OUTER JOIN - person ON contribution_musical_work.person_id = person.id - WHERE contribution_musical_work.role = 'AUTHOR' -""" - -composer_query = """ - CREATE VIEW composer AS - SELECT - musical_work.id AS musical_work_id, - contribution_musical_work.id AS contribution_id, - person.given_name AS given_name, - person.surname AS sur_name, - person.authority_control_url AS auth_URL - FROM - musical_work - FULL OUTER JOIN - contribution_musical_work ON contribution_musical_work.contributed_to_work_id = musical_work.id - FULL OUTER JOIN - person ON contribution_musical_work.person_id = person.id - WHERE contribution_musical_work.role = 'COMPOSER' -""" - -# flattened_view_query_with_features = """ -# CREATE VIEW flattened_view AS -# SELECT -# musical_work.id AS musical_work_id, -# musical_work.variant_titles AS musical_work_variant_titles, -# musical_work.sacred_or_secular AS sacred_or_secular, -# author.given_name AS author_given_name, -# author.sur_name AS author_sur_name, -# author.auth_URL AS author_auth_URL, -# author.contribution_id AS author_contribution_id, -# composer.given_name AS composer_given_name, -# composer.sur_name AS composer_sur_name, -# composer.auth_URL AS composer_auth_URL, -# composer.contribution_id AS composer_contribution_id, -# genre_style.genre_style AS genre_style, -# genre_type.genre_type AS genre_type, -# 
source_instantiation.portion AS source_instantiation_portion, -# source.title AS source_title, -# source.source_type AS source_type, -# source.url AS source_url, -# source.id AS source_id, -# files.file_type AS file_type, -# files.file_format AS file_format, -# files.version AS file_version, -# 'https://db.simssa.ca/files/' || files.id AS url_to_file, -# extracted.value AS extracted_value, -# extracted.feature AS feature -# FROM -# musical_work -# FULL OUTER JOIN -# author ON author.musical_work_id = musical_work.id -# FULL OUTER JOIN -# composer ON composer.musical_work_id = musical_work.id -# FULL OUTER JOIN -# source_instantiation ON musical_work.id = source_instantiation.work_id -# FULL OUTER JOIN -# source ON source_instantiation.source_id = source.id -# FULL OUTER JOIN -# files ON files.instantiates_id = source_instantiation.id -# FULL OUTER JOIN -# (SELECT m.musicalwork_id AS mid, g.name AS genre_style FROM musical_work_genres_as_in_style m JOIN genre_as_in_style g -# ON m.genreasinstyle_id = g.id)genre_style -# ON genre_style.mid = musical_work.id -# FULL OUTER JOIN -# (SELECT m.musicalwork_id AS mid, g.name AS genre_type FROM musical_work_genres_as_in_type m JOIN genre_as_in_type g -# ON m.genreasintype_id = g.id)genre_type -# ON genre_type.mid = musical_work.id -# FULL OUTER JOIN -# (SELECT f.name AS feature, e.feature_of_id AS feature_of_id, e.value AS value FROM extracted_feature e JOIN feature f ON e.instance_of_feature_id = f.id)extracted -# ON files.id = extracted.feature_of_id -# WHERE musical_work.id IS NOT NULL -# """ - -compact_files_query = """ - CREATE VIEW compact_files AS - SELECT - musical_work.id AS musical_work_id, - files.id AS file_id, - files.file_type AS file_type, - files.file_format AS file_format, - files.version AS file_version, - 'https://db.simssa.ca/files/' || files.id AS url_to_file - FROM - musical_work - FULL OUTER JOIN - source_instantiation ON musical_work.id = source_instantiation.work_id - FULL OUTER JOIN - source ON 
source_instantiation.source_id = source.id - FULL OUTER JOIN - files ON files.instantiates_id = source_instantiation.id - WHERE musical_work.id IS NOT NULL AND files.id IS NOT NULL -""" - -flattened_view_query = """ - CREATE VIEW flattened_view AS - SELECT - musical_work.id AS musical_work_id, - musical_work.variant_titles AS musical_work_variant_titles, - musical_work.sacred_or_secular AS sacred_or_secular, - author.given_name AS author_given_name, - author.sur_name AS author_sur_name, - author.auth_URL AS author_auth_URL, - author.contribution_id AS author_contribution_id, - composer.given_name AS composer_given_name, - composer.sur_name AS composer_sur_name, - composer.auth_URL AS composer_auth_URL, - composer.contribution_id AS composer_contribution_id, - genre_style.genre_style AS genre_style, - genre_type.genre_type AS genre_type, - source_instantiation.portion AS source_instantiation_portion, - source.title AS source_title, - source.source_type AS source_type, - source.url AS source_url, - source.id AS source_id - FROM - musical_work - FULL OUTER JOIN - author ON author.musical_work_id = musical_work.id - FULL OUTER JOIN - composer ON composer.musical_work_id = musical_work.id - FULL OUTER JOIN - source_instantiation ON musical_work.id = source_instantiation.work_id - FULL OUTER JOIN - source ON source_instantiation.source_id = source.id - FULL OUTER JOIN - (SELECT m.musicalwork_id AS mid, g.name AS genre_style FROM musical_work_genres_as_in_style m JOIN genre_as_in_style g - ON m.genreasinstyle_id = g.id)genre_style - ON genre_style.mid = musical_work.id - FULL OUTER JOIN - (SELECT m.musicalwork_id AS mid, g.name AS genre_type FROM musical_work_genres_as_in_type m JOIN genre_as_in_type g - ON m.genreasintype_id = g.id)genre_type - ON genre_type.mid = musical_work.id - WHERE musical_work.id IS NOT NULL - """ - -# creating the initial flattened view -conn = psycopg2.connect(**db_params) -cur = conn.cursor() -cur.execute(author_query) 
-cur.execute(composer_query) -cur.execute(flattened_view_query) -cur.execute(compact_files_query) - - -# # START FEATURE FLATTENING -# # get distinct feature names: -# cur.execute("SELECT DISTINCT feature FROM flattened_view") -# feature_names = [row[0] for row in cur.fetchall()] - -# # for renaming feature names for column compatibility -# def sanitize_column_name(name): -# if name is None: -# return None -# return re.sub(' ', '_', name) - -# # Create feature columns -# feature_columns = ", ".join( -# f"MAX(CASE WHEN feature = '{name}' THEN extracted_value ELSE NULL END) AS \"{sanitize_column_name(name)}\"" -# for name in feature_names -# ) -# cur.execute("""SELECT column_name -# FROM information_schema.columns -# WHERE table_name = 'flattened_view'""") -# exclude_list=['extracted_value', 'feature'] -# flattened_column_names = ", ".join([row[0] for row in cur.fetchall() if row[0] not in exclude_list]) - -# return all columns -final_query_flattened_view = """ - SELECT * - FROM flattened_view -""" -# run -cur.execute(final_query_flattened_view) -results = cur.fetchall() - -# Export data to CSV -with open("initial_flattened.csv", "w") as f: - writer = csv.writer(f) - writer.writerow([col[0] for col in cur.description]) - writer.writerows(results) - -# also export the files to another csv -final_query_compact_files = """ - SELECT * - FROM compact_files -""" -# run -cur.execute(final_query_compact_files) -results = cur.fetchall() - -# Export data to CSV -with open("files.csv", "w") as f: - writer = csv.writer(f) - writer.writerow([col[0] for col in cur.description]) - writer.writerows(results) - -# Clean up -cur.close() -conn.close() diff --git a/simssa/src/flattening/restructure.py b/simssa/src/flattening/restructure.py deleted file mode 100644 index fcda7cfa..00000000 --- a/simssa/src/flattening/restructure.py +++ /dev/null @@ -1,34 +0,0 @@ -import csv -import pandas as pd - -# Note: This script is ran AFTER SQL_query.py script and AFTER reconciliation. 
-# This script takes in a reconciled version of csv and: -# 1. Filter for only the columns we're interested in -# 2. Merge columns so that each row corresponds to a musical_work_id instead of file_id as previously -# Each file is now belong to a musical work - - -df = pd.read_csv("./reconciled_WikiID.csv") -# get only the columns we're interested in -cols = ['musical_work_id','sacred_or_secular','source_id','author_contribution_id','composer_contribution_id', "author_viaf_id",'composer_viaf_id', "genre_style_@id","genre_style",'genre_type','genre_type_@id','source_instantiation_portion','source_title', 'source_type','source_url','musical_work_variant_titles',"author_name",'author_name_@id','composer_name','composer_name_@id'] -df2 = df[cols] - - -# df2['musical_work_id'] = df2['musical_work_id'].astype(int) - - -# # since the flattening process flattened the info related to files (file_formats, url_to_file, Last_Pitch), we merge on the other columns -# merge_on = ["musical_work_id",'musical_work_variant_titles','composer', 'genre_style','genre_type'] - -# df2['test_count'] = df2.groupby("musical_work_id").cumcount() + 1 -# df2 = df2.pivot(index=merge_on, columns='test_count', values=['file_format','url_to_file', 'Last_Pitch_Class']) - -# # Flatten the multi-index columns -# df2.columns = [f'{col[0]}_{col[1]}' for col in df2.columns] -# # now each columns represent a row, each row will have the - -# # Reset the index -# df2 = df2.reset_index() - - -df2.to_csv("./final_flattened.csv", index=False) From 298d6f5ad8b8ff39db687cba595ab436c8ec94a9 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 14 Nov 2025 09:06:17 -0500 Subject: [PATCH 15/24] chore(simssa): reformat documentation --- simssa/README.md | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/simssa/README.md b/simssa/README.md index f2c45f16..4abb64d7 100644 --- a/simssa/README.md +++ b/simssa/README.md 
@@ -11,12 +11,15 @@ The project is mainly maintained by [Cory McKay](https://jmir.sourceforge.net/cm Dylan has obtained a PostgreSQL dump of SIMSSA DB, the dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki on how to set up your Arbutus account. # 3. Export SQL Dump to CSV files + 1. Install PostgreSQL, if it is not installed already. 2. Make sure that postgres is running using the following command: + ```bash sudo service postgresql status ``` + Start postgresql if it is not running: ```bash @@ -49,11 +52,13 @@ When prompted, enter "mypassword" as the password. 6. Grant read access of all loaded tables to "myuser" First, start the shell again: + ```bash sudo -u postgres psql -d simssadb ``` Then, run the following commands: + ```bash -- Grant SELECT on all existing tables GRANT SELECT ON ALL TABLES IN SCHEMA public TO myuser; @@ -67,18 +72,19 @@ ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO myuser; 7. Run `export_all_tables.py` Run the following command from the repository root directory: + ```bash -python simssa/src/export_all_tables.py +python simssa/src/export_all_tables.py ``` All nonempty tables should be outputted as CSV files in the subdirectories of `simssa/data/raw` - -# 4. Overview of The Raw Dataset +# 4. Overview of The Raw Dataset After running `simssa/src/export_all_tables.py `, each nonempty table should be outputted as a CSV file in a subdirectory of `simssa/data/raw` `export_all_tables.py` groups the CSV files into the following subdirectories: + 1. `feature`: CSV related to audio/musical features (e.g. most frequent pitch, rhythmic variability). 2. `genre`: CSV files related to musical genres, including both "genre-as-in-style" (e.g., Renaissance) and "genre-as-in-type" (e.g., Madrigal). 3. 
`instance`: CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. @@ -88,11 +94,12 @@ After running `simssa/src/export_all_tables.py `, each nonempty table should be Every other CSV file is placed in the `other` subdirectory: these do not seem to pertinent to the datalake. - ## 4.1 Feature Subdirectory + Contains CSV related to audio/musical features (e.g. most frequent pitch, rythmic variability). These features were extracted from MIDI files. You can find an example of features list at `https://db.simssa.ca/files/2018` Contains the following CSVs: + - extracted_features.csv: list of musical/audio features - feature_file.csv: location of files containing extracted features - feature.csv: another list of musical/audio features @@ -100,9 +107,11 @@ Contains the following CSVs: Musical features are currently omitted from the RDF since it is very difficult/impractical to store them Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website. ## 4.2 Genre Subdirectory + Contains CSV files related to musical genres, including both "genre-as-in-style" and "genre-as-in-type." Contains the following CSVs: + - genre_as_in_style.csv: "Renaissance" is the only genre_as_in_style in SIMSSA DB. - genre_as_in_type.csv: Lists twelve different genre_as_in_type (e.g., Zibaldone, Madrigal). - musical_work_genres_as_in_style.csv: Maps every musical work in SIMSSA DB to the genre "Renaissance." @@ -111,9 +120,11 @@ Contains the following CSVs: Musical genres are an important aspect of SIMSSA DB, particularly "genre-as-in-type," which provides more detailed classifications. These data are suitable for Linked Data representation. ## 4.3 Instance Subdirectory + Contains CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. Contains the following CSVs: + - files.csv: Points to files containing sheet music or MIDI scores. 
- source_instantiation.csv: Links instances to a musical work and to a source. - source_instantiation_sections.csv: Links instances to a section of a musical work. An instance is either linked to the entire musical work or to a section of it. @@ -121,9 +132,11 @@ Contains the following CSVs: Instances are not stored as distinct entities in the datalake but are crucial for linking works, sources, and files in the raw dataset. ## 4.4 Musical Work Subdirectory + Contains CSV files related to musical works, including their titles, sections, and associated metadata. Contains the following CSVs: + - geographic_area.csv: Only contains "Vienna." - instruments.csv: Only contains "Voice." - musical_works.csv: Links a musical work to its title and indicates whether it is sacred or secular. @@ -133,9 +146,11 @@ Contains the following CSVs: Among these, only `musical_works.csv` and `section.csv` are ingested into the datalake. The other files were not part of the final RDF since they contained so little data. ## 4.5 Person Subdirectory + Contains CSV files related to authors and composers, including their roles and contributions. Contains the following CSVs: + - person.csv: Lists all composers/authors, with their birth and death years. - contribution_musical_work.csv: Links people to compositions. The "role" column describes whether the person was an "AUTHOR" or a "COMPOSER." @@ -144,43 +159,44 @@ These files provide essential metadata about the creators of musical works and t # 5. Type of Entities in the RDF ## 5.1 Persons + Prefix: `https://db.simssa.ca/persons/` Identifies people who are either author or composers of musical work. Each person is linked to a VIAF ID in the raw dataset. -## 5.2 Musical Works +## 5.2 Musical Works + Prefix: `https://db.simssa.ca/musicalworks/` Identifies individual musical works (i.e. compositions). Each composition is linked to: + 1. An author and a composer 2. A genre 3. Symbolic music files (MIDI & PDF score) 4. Sections (e.g. 
a mass may have an Introit section) -## 5.3 Sections +## 5.3 Sections + Prefix: `https://db.simssa.ca/sections/` -This namespace refers to *sections* of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work. +This namespace refers to _sections_ of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work. There can a symbolic music file for a particular section instead of the whole composition. +## 5.4 Types -## 5.4 Types Prefix: `https://db.simssa.ca/types/` - This namespace contains controlled vocabulary terms and classification types used throughout the database—such as genre categories, musical form types, chant classifications, and descriptive typologies. These are reference entities used to annotate works, sections, or sources with normalized terms. +## 5.5 Sources -## 5.5 Sources Prefix: `https://db.simssa.ca/sources/` Identifies the genre (i.e. genre-as-in-type, see discussion under [4.2 Genre Subdirectory](./database_content.md#42-genre-subdirectory)) of a musical work. For example, a musical work can have the genre "madrigal". +## 5.6 Files -## 5.6 Files Prefix: `https://db.simssa.ca/files/` -Identifies the symbolic music file (PDF or MIDI) attached to a work or a section. - - +Identifies the symbolic music file (PDF or MIDI) attached to a work or a section. 
From 28c919e877024a2e2080c652c84c030184238e6d Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 14 Nov 2025 09:21:40 -0500 Subject: [PATCH 16/24] chore(simssa): fix grammar mistake Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- simssa/src/export_all_tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py index 835b205a..e86ecbb4 100644 --- a/simssa/src/export_all_tables.py +++ b/simssa/src/export_all_tables.py @@ -3,7 +3,7 @@ Make sure to set up the database following the guidelines in the README. -CSV filesare categorized into subdirectories based on predefined mappings. +CSV files are categorized into subdirectories based on predefined mappings. Empty tables are skipped during the export process. """ From b714b40041e973b5064419a89186b4df6b4c1f81 Mon Sep 17 00:00:00 2001 From: SCN-MNG <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 14 Nov 2025 09:24:28 -0500 Subject: [PATCH 17/24] fix(simssa): address SQL injection vulnerability in table name handling --- simssa/src/export_all_tables.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py index 835b205a..cba8d181 100644 --- a/simssa/src/export_all_tables.py +++ b/simssa/src/export_all_tables.py @@ -12,6 +12,7 @@ import logging import argparse import psycopg2 +from psycopg2 import sql # Database connection parameters DB_PARAMS = { @@ -95,8 +96,9 @@ def main(base_output_dir): table_subdir = TABLE_MAPPINGS.get(table_name, "other") output_dir = os.path.join(base_output_dir, table_subdir) - # Execute query - cur.execute(f'SELECT * FROM "{table_name}"') + # Use psycopg2.sql.Identifier for safe table name quoting + query = sql.SQL("SELECT * FROM {}").format(sql.Identifier(table_name)) + cur.execute(query) # Skip empty tables rows = cur.fetchall() From 
9fe6b54b01113d0bed3e5ce42b0c5b484316dca8 Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 21 Nov 2025 08:56:47 -0500 Subject: [PATCH 18/24] doc(simssa): fix grammar error in README Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- simssa/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simssa/README.md b/simssa/README.md index 4abb64d7..4d76df91 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -96,7 +96,7 @@ Every other CSV file is placed in the `other` subdirectory: these do not seem to ## 4.1 Feature Subdirectory -Contains CSV related to audio/musical features (e.g. most frequent pitch, rythmic variability). These features were extracted from MIDI files. You can find an example of features list at `https://db.simssa.ca/files/2018` +Contains CSV related to audio/musical features (e.g. most frequent pitch, rhythmic variability). These features were extracted from MIDI files. You can find an example of features list at `https://db.simssa.ca/files/2018` Contains the following CSVs: From 0125ecd5696203ac71ab3b7324a919b570468314 Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Fri, 21 Nov 2025 08:57:16 -0500 Subject: [PATCH 19/24] doc(simssa): update path in README Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- simssa/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simssa/README.md b/simssa/README.md index 4d76df91..58151b27 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -193,7 +193,7 @@ This namespace contains controlled vocabulary terms and classification types use Prefix: `https://db.simssa.ca/sources/` -Identifies the genre (i.e. genre-as-in-type, see discussion under [4.2 Genre Subdirectory](./database_content.md#42-genre-subdirectory)) of a musical work. For example, a musical work can have the genre "madrigal". +Identifies the genre (i.e. 
genre-as-in-type, see discussion under [4.2 Genre Subdirectory](#42-genre-subdirectory)) of a musical work. For example, a musical work can have the genre "madrigal". ## 5.6 Files From 05e969ab989e21a994b56a677d1919f7e02a1e0d Mon Sep 17 00:00:00 2001 From: Liam Pond Date: Thu, 2 Apr 2026 13:41:51 -0400 Subject: [PATCH 20/24] Fix grammar and clarity issues in README Corrected grammatical errors and improved clarity in README. --- simssa/README.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/simssa/README.md b/simssa/README.md index 58151b27..4e8d9e7e 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -8,7 +8,7 @@ The project is mainly maintained by [Cory McKay](https://jmir.sourceforge.net/cm # 2. Obtaining The Database Dump -Dylan has obtained a PostgreSQL dump of SIMSSA DB, the dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki on how to set up your Arbutus account. +Dylan has obtained a PostgreSQL dump of the SIMSSA DB. The dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki on how to set up your Arbutus account. # 3. Export SQL Dump to CSV files @@ -49,7 +49,7 @@ sudo -u postgres sh -c "gunzip -c | psql -d simssadb" When prompted, enter "mypassword" as the password. -6. Grant read access of all loaded tables to "myuser" +6. Grant read access to all loaded tables to "myuser" First, start the shell again: @@ -77,26 +77,26 @@ Run the following command from the repository root directory: python simssa/src/export_all_tables.py ``` -All nonempty tables should be outputted as CSV files in the subdirectories of `simssa/data/raw` +All nonempty tables should be output as CSV files in the subdirectories of `simssa/data/raw` # 4. 
Overview of The Raw Dataset -After running `simssa/src/export_all_tables.py `, each nonempty table should be outputted as a CSV file in a subdirectory of `simssa/data/raw` +After running `simssa/src/export_all_tables.py `, each nonempty table should be output as a CSV file in a subdirectory of `simssa/data/raw` `export_all_tables.py` groups the CSV files into the following subdirectories: -1. `feature`: CSV related to audio/musical features (e.g. most frequent pitch, rhythmic variability). +1. `feature`: CSV related to audio/musical features (e.g., most frequent pitch, rhythmic variability). 2. `genre`: CSV files related to musical genres, including both "genre-as-in-style" (e.g., Renaissance) and "genre-as-in-type" (e.g., Madrigal). 3. `instance`: CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. -4. `musical_work`: CSV files related to musical works, including their titles, sections, and associated metadata. Musical works (i.e. compositions) are the central entities of SIMSSA DB. +4. `musical_work`: CSV files related to musical works, including their titles, sections, and associated metadata. Musical works (i.e., compositions) are the central entities of SIMSSA DB. 5. `person`: CSV files containing data about authors and composers, including their roles and contributions. 6. `source`: CSV files describing the origins of scores and their relationships to musical works and sections. -Every other CSV file is placed in the `other` subdirectory: these do not seem to pertinent to the datalake. +Every other CSV file is placed in the `other` subdirectory: these do not seem to be pertinent to the datalake. ## 4.1 Feature Subdirectory -Contains CSV related to audio/musical features (e.g. most frequent pitch, rhythmic variability). These features were extracted from MIDI files. 
You can find an example of features list at `https://db.simssa.ca/files/2018` +Contains CSV related to audio/musical features (e.g., most frequent pitch, rhythmic variability). These features were extracted from MIDI files. You can find a list of example features at `https://db.simssa.ca/files/2018` Contains the following CSVs: @@ -104,7 +104,7 @@ Contains the following CSVs: - feature_file.csv: location of files containing extracted features - feature.csv: another list of musical/audio features -Musical features are currently omitted from the RDF since it is very difficult/impractical to store them Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website. +Musical features are currently omitted from the RDF since it is very difficult/impractical to store them in Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website. ## 4.2 Genre Subdirectory @@ -129,7 +129,7 @@ Contains the following CSVs: - source_instantiation.csv: Links instances to a musical work and to a source. - source_instantiation_sections.csv: Links instances to a section of a musical work. An instance is either linked to the entire musical work or to a section of it. -Instances are not stored as distinct entities in the datalake but are crucial for linking works, sources, and files in the raw dataset. +Instances are not stored as distinct entities in the data lake but are crucial for linking works, sources, and files in the raw dataset. ## 4.4 Musical Work Subdirectory @@ -162,18 +162,18 @@ These files provide essential metadata about the creators of musical works and t Prefix: `https://db.simssa.ca/persons/` -Identifies people who are either author or composers of musical work. Each person is linked to a VIAF ID in the raw dataset. +Identifies people who are either authors or composers of musical works. Each person is linked to a VIAF ID in the raw dataset. 
## 5.2 Musical Works Prefix: `https://db.simssa.ca/musicalworks/` -Identifies individual musical works (i.e. compositions). Each composition is linked to: +Identifies individual musical works (i.e., compositions). Each composition is linked to: 1. An author and a composer 2. A genre 3. Symbolic music files (MIDI & PDF score) -4. Sections (e.g. a mass may have an Introit section) +4. Sections (e.g., a mass may have an Introit section) ## 5.3 Sections @@ -181,7 +181,7 @@ Prefix: `https://db.simssa.ca/sections/` This namespace refers to _sections_ of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work. -There can a symbolic music file for a particular section instead of the whole composition. +There can be a symbolic music file for a particular section instead of the whole composition. ## 5.4 Types @@ -193,7 +193,7 @@ This namespace contains controlled vocabulary terms and classification types use Prefix: `https://db.simssa.ca/sources/` -Identifies the genre (i.e. genre-as-in-type, see discussion under [4.2 Genre Subdirectory](#42-genre-subdirectory)) of a musical work. For example, a musical work can have the genre "madrigal". +Identifies the genre (i.e., genre-as-in-type, see discussion under [4.2 Genre Subdirectory](#42-genre-subdirectory)) of a musical work. For example, a musical work can have the genre "madrigal". 
## 5.6 Files From 39ddb0a54f3fdc768022a20521257de7e16fb7a9 Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:20:45 -0400 Subject: [PATCH 21/24] fix(simssaDB): merge.py now removes viaf_id prefix Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- simssa/src/merge.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/simssa/src/merge.py b/simssa/src/merge.py index 7a818a15..499602cb 100644 --- a/simssa/src/merge.py +++ b/simssa/src/merge.py @@ -187,6 +187,18 @@ def merge_person_data(input_dir): rename_dict={"id": "person_id", "authority_control_url": "viaf_id"}, ) + # Normalize VIAF identifiers: extract numeric ID from VIAF URLs + # e.g., "https://viaf.org/viaf/123456/" -> "123456" + person_df["viaf_id"] = ( + person_df["viaf_id"] + .astype("string") + .str.strip() + .str.replace( + r"^https?://viaf\.org/viaf/(\d+)/?.*$", + r"\1", + regex=True, + ) + ) # Combine given_name and surname into person_name person_df["person_name"] = person_df["given_name"] + " " + person_df["surname"] From 6cf29f282a43fbdf24f31a8cae7992e45f4f248b Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:25:56 -0400 Subject: [PATCH 22/24] fix(simssadb): Add missing predicate for music genre in SimssaDB rdf conversion config Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- shared/rdf_config/simssadb.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/rdf_config/simssadb.toml b/shared/rdf_config/simssadb.toml index f0dd74a2..bd78db7c 100644 --- a/shared/rdf_config/simssadb.toml +++ b/shared/rdf_config/simssadb.toml @@ -60,6 +60,6 @@ section_id = {prefix = "sse", pred = "P527", type = "lms:Section"} # has part(s section_title = {subj = "section_id", pred = "rdfs:label"} author_id = {prefix = "sp", pred = "P50"} # author (P50) composer_id = {prefix = "sp", pred = "P86"} # composer (P86) -genre_id = 
{prefix = "st", type = "lms:GenreAsInType"} +genre_id = {prefix = "st", pred = "P136", type = "lms:GenreAsInType"} # genre (P136) genre_name = {subj = "genre_id", pred = "rdfs:label"} style = "" From 9ce4b2335e85718e6382567980be7e766c7333f6 Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:26:41 -0400 Subject: [PATCH 23/24] fix(simssadb): add date datatype to rdf config. It makes the graph slightly more precise. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- shared/rdf_config/simssadb.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shared/rdf_config/simssadb.toml b/shared/rdf_config/simssadb.toml index bd78db7c..cc4b65d0 100644 --- a/shared/rdf_config/simssadb.toml +++ b/shared/rdf_config/simssadb.toml @@ -34,8 +34,8 @@ pred = "P144" # based on (P144) PRIMARY_KEY = "person_id" person_name = "P2888" # exact match (P2888) person_name_original = "rdfs:label" -birth_year = "P569" # date of birth (P569) -death_year = "P570" # date of death (P570) +birth_year = { pred = "P569", datatype = "xsd:date" } # date of birth (P569) +death_year = { pred = "P570", datatype = "xsd:date" } # date of death (P570) viaf_id = "P214" # VIAF cluster ID (P214) [person.person_id] From 04f08e82193d29c6e18315961a3a307b799dfab5 Mon Sep 17 00:00:00 2001 From: SC Meng <205478402+SCN-MNG@users.noreply.github.com> Date: Thu, 2 Apr 2026 19:12:49 -0400 Subject: [PATCH 24/24] doc(simssa): fix README error --- simssa/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/simssa/README.md b/simssa/README.md index 4e8d9e7e..652654ab 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -156,6 +156,10 @@ Contains the following CSVs: These files provide essential metadata about the creators of musical works and their contributions, making them suitable for Linked Data representation. 
+## 4.6 Source Subdirectory + +Contains the CSV file `source.csv`, which specifies information on a source (i.e., a book/anthology from which a musical work is taken). + # 5. Type of Entities in the RDF ## 5.1 Persons @@ -174,6 +178,7 @@ Identifies individual musical works (i.e., compositions). Each composition is li 2. A genre 3. Symbolic music files (MIDI & PDF score) 4. Sections (e.g., a mass may have an Introit section) +5. A source (a book or an anthology in which the work was found). ## 5.3 Sections @@ -187,13 +192,13 @@ There can be a symbolic music file for a particular section instead of the whole Prefix: `https://db.simssa.ca/types/` -This namespace contains controlled vocabulary terms and classification types used throughout the database—such as genre categories, musical form types, chant classifications, and descriptive typologies. These are reference entities used to annotate works, sections, or sources with normalized terms. +This namespace identifies the genre of the musical work ("genre-as-in-type"). ## 5.5 Sources Prefix: `https://db.simssa.ca/sources/` -Identifies the genre (i.e., genre-as-in-type, see discussion under [4.2 Genre Subdirectory](#42-genre-subdirectory)) of a musical work. For example, a musical work can have the genre "madrigal". +Identifies the book/anthology from which the chant was taken. ## 5.6 Files