diff --git a/shared/rdf_config/simssadb.toml b/shared/rdf_config/simssadb.toml new file mode 100644 index 000000000..cc4b65d0f --- /dev/null +++ b/shared/rdf_config/simssadb.toml @@ -0,0 +1,65 @@ +[general] +name = "simssadb" +csv_folder = "../../simssa/data/reconciled" +rdf_output_folder = "../../simssa/data/rdf" +test_mode = false + +[namespaces] +rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +rdfs = "http://www.w3.org/2000/01/rdf-schema#" +xsd = "http://www.w3.org/2001/XMLSchema#" +wd = "http://www.wikidata.org/entity/" +wdt = "http://www.wikidata.org/prop/direct/" +sp = "https://db.simssa.ca/persons/" +sw = "https://db.simssa.ca/musicalworks/" +sse = "https://db.simssa.ca/sections/" +st = "https://db.simssa.ca/types/" +ss = "https://db.simssa.ca/sources/" +sf = "https://db.simssa.ca/files/" +lms = "https://linkedmusic.ca/graphs/simssadb/" + +[instance] +PRIMARY_KEY = "file_id" +work_id = {prefix = "sw", pred = "P6243"} # digital representation of (P6243) +section_id = {prefix = "sse", pred = "P6243"} # digital representation of (P6243) +file_id = {prefix = "sf", type = "lms:File"} +file_format = "" +file_name = "rdfs:label" + +[instance.source_id] +prefix = "ss" +pred = "P144" # based on (P144) + +[person] +PRIMARY_KEY = "person_id" +person_name = "P2888" # exact match (P2888) +person_name_original = "rdfs:label" +birth_year = { pred = "P569", datatype = "xsd:date" } # date of birth (P569) +death_year = { pred = "P570", datatype = "xsd:date" } # date of death (P570) +viaf_id = "P214" # VIAF cluster ID (P214) + +[person.person_id] +type = "lms:Person" +prefix = "sp" + +[source] +PRIMARY_KEY = "source_id" +source_title = "rdfs:label" + +[source.source_id] +prefix = "ss" +type = "lms:Source" + +[work] +PRIMARY_KEY = "work_id" +work_id = {prefix = "sw", type = "lms:Work"} +work_title = {if = "isinstance(obj, URIRef)", pred = "P2888"} # exact match (P2888) +work_title_original = "rdfs:label" +sacred_or_secular = "P136" # genre (P136) +section_id = {prefix = 
"sse", pred = "P527", type = "lms:Section"} # has part(s) (P527) +section_title = {subj = "section_id", pred = "rdfs:label"} +author_id = {prefix = "sp", pred = "P50"} # author (P50) +composer_id = {prefix = "sp", pred = "P86"} # composer (P86) +genre_id = {prefix = "st", pred = "P136", type = "lms:GenreAsInType"} # genre (P136) +genre_name = {subj = "genre_id", pred = "rdfs:label"} +style = "" diff --git a/simssa/README.md b/simssa/README.md index ecb0e0c45..652654ab0 100644 --- a/simssa/README.md +++ b/simssa/README.md @@ -1,37 +1,207 @@ -# SimssaDB flattening and json-ld structures +# Ingestion of SIMSSA DB -> Summary: +# 1. General Description -> 1. Upload SQL dump to local postgreSQL database -> 2. With output run `simssa/src/flattening/SQL_query.py` -> 3. Reconcile `initial_flattened.csv` with OpenRefine -> 4. Reconcile `files.csv` with OpenRefine -> 5. With output run `simssa/src/flattening/restructure.py` -> 6. With output run `simssa/src/jsonld/generate_jsonld.py` (which also takes `simssa/src/jsonld/context.jsonld` as the initial context) +You can read more about SIMSSA DB on the [official webpage](https://db.simssa.ca/about/). A graphic of the SIMSSA DB database model can be found [on Cory McKay's SourceForge page](https://jmir.sourceforge.net/cmckay/papers/mckay17database.pdf) -## 1. Extracting columns and feature flattening +The project is mainly maintained by [Cory McKay](https://jmir.sourceforge.net/cmckay/). According to Ich, it is unlikely for SIMSSA DB to see any future update. -After uploading the database dump to the local PostgreSQL database, we first select relevant columns and perform initial feature flattening with `psycopg` in `SQL_query.py` +# 2. Obtaining The Database Dump -When extracting the files, I found that since there often was more than one file per work, the SQL query would create rows where each data field was duplicated, except for the fields relating to the files, due to the behaviour of the `FULL OUTER JOIN` SQL command. 
-As such, I decided to instead create a second CSV file that would only contain the files, and there would be a field indicating the musical work that the file corresponded to, allowing us to merge that CSV file with the main CSV file during RDF conversion. -Furthermore, some files aren't linked to any musical works. I chose to simply ignore them when exporting the list of files because files without musical works aren't useful at all for the datalake. These files seem to be linked to musical works that aren't in the data dump currently being used. This will hopefully be fixed by [#263](https://github.com/DDMAL/linkedmusic-datalake/issues/263). +Dylan has obtained a PostgreSQL dump of the SIMSSA DB. The dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki on how to set up your Arbutus account. -This produces 2 CSV files, `final_flattened.csv`, a flattening of all the tables into one CSV with `musical_work_id` as the primary key, and `files.csv`, containing the data about all files and the works they are linked to. +# 3. Export SQL Dump to CSV files -## 2. Reconciliation with OpenRefine +1. Install PostgreSQL, if it is not installed already. -OpenRefine reconciliation was performed on `initial_flattened.csv` and on `files.csv`. You can see the reconciled files `reconciled_wikiID.csv` and `reconciled_files_WikiID.csv`. You can use `simssa/openrefine/history/history_flattened.json` and `simssa/openrefine/history/history_files.json` to facilitate reconciliation and `simssa/openrefine/export/export_template_flattened.json` and `simssa/openrefine/export/export_template_files.json` to export to the desired csv format. +2. Make sure that postgres is running using the following command: -## 3. 
Reconcile column names and generating json-ld +```bash +sudo service postgresql status +``` -Currently the json-ld is generated as follows: +Start postgresql if it is not running: -In `generate_jsonld.py`: +```bash +sudo service postgresql start +``` -1. Convert csv to json documents -2. Loop through each json document and edit each entry, creating the compact jsonld. Also parse the files csv to extract and files associated with each entry. -3. Generate the jsonld file at `compact.jsonld` -4. The contexts used in the `compact.jsonld` file is imported from `context.jsonld` +3. Start the postgres shell -### TODO: Make the RDF conversion convert to Turtle +```bash +sudo -u postgres psql +``` + +4. Inside the shell, create a new user and database, and exit the shell: + +```bash +CREATE USER myuser WITH PASSWORD 'mypassword'; +CREATE DATABASE simssadb OWNER myuser; +GRANT ALL PRIVILEGES ON DATABASE simssadb TO myuser; +\q +``` + +5. Load the SQL dump into your new database through the following command: + +```bash +sudo -u postgres sh -c "gunzip -c | psql -d simssadb" +``` + +When prompted, enter "mypassword" as the password. + +6. Grant read access to all loaded tables to "myuser" + +First, start the shell again: + +```bash +sudo -u postgres psql -d simssadb +``` + +Then, run the following commands: + +```bash +-- Grant SELECT on all existing tables +GRANT SELECT ON ALL TABLES IN SCHEMA public TO myuser; + +-- Grant SELECT on tables created in the future +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO myuser; + +\q +``` + +7. Run `export_all_tables.py` + +Run the following command from the repository root directory: + +```bash +python simssa/src/export_all_tables.py +``` + +All nonempty tables should be output as CSV files in the subdirectories of `simssa/data/raw` + +# 4. 
Overview of The Raw Dataset + +After running `simssa/src/export_all_tables.py `, each nonempty table should be output as a CSV file in a subdirectory of `simssa/data/raw` + +`export_all_tables.py` groups the CSV files into the following subdirectories: + +1. `feature`: CSV related to audio/musical features (e.g., most frequent pitch, rhythmic variability). +2. `genre`: CSV files related to musical genres, including both "genre-as-in-style" (e.g., Renaissance) and "genre-as-in-type" (e.g., Madrigal). +3. `instance`: CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. +4. `musical_work`: CSV files related to musical works, including their titles, sections, and associated metadata. Musical works (i.e., compositions) are the central entities of SIMSSA DB. +5. `person`: CSV files containing data about authors and composers, including their roles and contributions. +6. `source`: CSV files describing the origins of scores and their relationships to musical works and sections. + +Every other CSV file is placed in the `other` subdirectory: these do not seem to be pertinent to the datalake. + +## 4.1 Feature Subdirectory + +Contains CSV related to audio/musical features (e.g., most frequent pitch, rhythmic variability). These features were extracted from MIDI files. You can find a list of example features at `https://db.simssa.ca/files/2018` + +Contains the following CSVs: + +- extracted_features.csv: list of musical/audio features +- feature_file.csv: location of files containing extracted features +- feature.csv: another list of musical/audio features + +Musical features are currently omitted from the RDF since it is very difficult/impractical to store them in Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website. + +## 4.2 Genre Subdirectory + +Contains CSV files related to musical genres, including both "genre-as-in-style" and "genre-as-in-type." 
+ +Contains the following CSVs: + +- genre_as_in_style.csv: "Renaissance" is the only genre_as_in_style in SIMSSA DB. +- genre_as_in_type.csv: Lists twelve different genre_as_in_type (e.g., Zibaldone, Madrigal). +- musical_work_genres_as_in_style.csv: Maps every musical work in SIMSSA DB to the genre "Renaissance." +- musical_work_genres_as_in_type.csv: Maps musical works to their genre_as_in_type. + +Musical genres are an important aspect of SIMSSA DB, particularly "genre-as-in-type," which provides more detailed classifications. These data are suitable for Linked Data representation. + +## 4.3 Instance Subdirectory + +Contains CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files. + +Contains the following CSVs: + +- files.csv: Points to files containing sheet music or MIDI scores. +- source_instantiation.csv: Links instances to a musical work and to a source. +- source_instantiation_sections.csv: Links instances to a section of a musical work. An instance is either linked to the entire musical work or to a section of it. + +Instances are not stored as distinct entities in the data lake but are crucial for linking works, sources, and files in the raw dataset. + +## 4.4 Musical Work Subdirectory + +Contains CSV files related to musical works, including their titles, sections, and associated metadata. + +Contains the following CSVs: + +- geographic_area.csv: Only contains "Vienna." +- instruments.csv: Only contains "Voice." +- musical_works.csv: Links a musical work to its title and indicates whether it is sacred or secular. +- part.csv: Lists whenever a work has a part for voice. +- section.csv: Lists sections of the musical works (e.g., work 117 may have a "Sanctus (In nomine)" section). + +Among these, only `musical_works.csv` and `section.csv` are ingested into the datalake. The other files were not part of the final RDF since they contained so little data. 
+
+## 4.5 Person Subdirectory
+
+Contains CSV files related to authors and composers, including their roles and contributions.
+
+Contains the following CSVs:
+
+- person.csv: Lists all composers/authors, with their birth and death years.
+- contribution_musical_work.csv: Links people to compositions. The "role" column describes whether the person was an "AUTHOR" or a "COMPOSER."
+
+These files provide essential metadata about the creators of musical works and their contributions, making them suitable for Linked Data representation.
+
+## 4.6 Source Subdirectory
+
+Contains the CSV file `source.csv`, which specifies information on a source (i.e. a book/anthology from which a musical work is taken).
+
+# 5. Type of Entities in the RDF
+
+## 5.1 Persons
+
+Prefix: `https://db.simssa.ca/persons/`
+
+Identifies people who are either authors or composers of musical works. Each person is linked to a VIAF ID in the raw dataset.
+
+## 5.2 Musical Works
+
+Prefix: `https://db.simssa.ca/musicalworks/`
+
+Identifies individual musical works (i.e., compositions). Each composition is linked to:
+
+1. An author and a composer
+2. A genre
+3. Symbolic music files (MIDI & PDF score)
+4. Sections (e.g., a mass may have an Introit section)
+5. A source (a book or an anthology in which the work was found).
+
+## 5.3 Sections
+
+Prefix: `https://db.simssa.ca/sections/`
+
+This namespace refers to _sections_ of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work.
+
+There can be a symbolic music file for a particular section instead of the whole composition.
+
+## 5.4 Types
+
+Prefix: `https://db.simssa.ca/types/`
+
+This namespace identifies the genre of the musical work ("genre-as-in-type").
+
+## 5.5 Sources
+
+Prefix: `https://db.simssa.ca/sources/`
+
+Identifies the book/anthology from which the musical work was taken.
+ +## 5.6 Files + +Prefix: `https://db.simssa.ca/files/` + +Identifies the symbolic music file (PDF or MIDI) attached to a work or a section. diff --git a/simssa/openrefine/export/export_person.json b/simssa/openrefine/export/export_person.json new file mode 100644 index 000000000..dc79cc4eb --- /dev/null +++ b/simssa/openrefine/export/export_person.json @@ -0,0 +1,67 @@ +[ + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [ + { + "column": "viaf_id", + "propertyName": "VIAF cluster ID", + "propertyID": "P214" + } + ], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "person_name", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "person_name_original", + "columnInsertIndex": 2, + "description": "Create column person_name_original at index 2 based on column person_name using expression grel:value" + } +] \ No newline at end of file diff --git 
a/simssa/openrefine/export/export_work.json b/simssa/openrefine/export/export_work.json new file mode 100644 index 000000000..9d730418c --- /dev/null +++ b/simssa/openrefine/export/export_work.json @@ -0,0 +1,89 @@ +{ + "format": "csv", + "separator": ",", + "lineSeparator": "\n", + "encoding": "UTF-8", + "quoteAll": false, + "outputColumnHeaders": true, + "outputBlankRows": false, + "columns": [ + { + "name": "person_id", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "person_name", + "reconSettings": { + "output": "entity-id", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "person_name_original", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "birth_year", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "death_year", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + }, + { + "name": "viaf_id", + "reconSettings": { + "output": "entity-name", + "blankUnmatchedCells": false, + "linkToEntityPages": true + }, + "dateSettings": { + "format": "iso-8601", + "useLocalTimeZone": false, + "omitTime": false + } + } + ] +} \ No newline at end of file diff --git a/simssa/openrefine/history/history_person.json 
b/simssa/openrefine/history/history_person.json new file mode 100644 index 000000000..dc79cc4eb --- /dev/null +++ b/simssa/openrefine/history/history_person.json @@ -0,0 +1,67 @@ +[ + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "person_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q5", + "name": "human" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [ + { + "column": "viaf_id", + "propertyName": "VIAF cluster ID", + "propertyID": "P214" + } + ], + "limit": 0 + }, + "description": "Reconcile cells in column person_name to type Q5" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "person_name", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "person_name_original", + "columnInsertIndex": 2, + "description": "Create column person_name_original at index 2 based on column person_name using expression grel:value" + } +] \ No newline at end of file diff --git a/simssa/openrefine/history/history_work.json b/simssa/openrefine/history/history_work.json new file mode 100644 index 000000000..facb4404c --- /dev/null +++ b/simssa/openrefine/history/history_work.json @@ -0,0 +1,170 @@ +[ + { 
+ "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "work_title", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q105543609", + "name": "musical work/composition" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column work_title to type Q105543609" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "work_title: judgment", + "expression": "forNonBlank(cell.recon.judgment, v, v, if(isNonBlank(value), \"(unreconciled)\", \"(blank)\"))", + "columnName": "work_title", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "matched", + "l": "matched" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "range", + "name": "work_title: best candidate's score", + "expression": "cell.recon.best.score", + "columnName": "work_title", + "from": 99, + "to": 101, + "selectNumeric": true, + "selectNonNumeric": true, + "selectBlank": false, + "selectError": true + } + ], + "mode": "row-based" + }, + "baseColumnName": "work_title", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "work_title_original", + "columnInsertIndex": 2, + "description": "Create column work_title_original at index 2 based on column work_title using expression grel:value" + }, + { + "op": "core/column-removal", + "columnName": "work_title_original", + "description": "Remove column work_title_original" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "work_title", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "work_title_original", + 
"columnInsertIndex": 2, + "description": "Create column work_title_original at index 2 based on column work_title using expression grel:value" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "sacred_or_secular", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column sacred_or_secular to type null" + }, + { + "op": "core/recon-judge-similar-cells", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "sacred_or_secular", + "similarValue": "Secular", + "judgment": "matched", + "match": { + "id": "Q2707298", + "name": "secular music", + "types": [ + "" + ], + "score": 100 + }, + "shareNewTopics": false, + "description": "Match item secular music (Q2707298) for cells containing \"Secular\" in column sacred_or_secular" + }, + { + "op": "core/recon", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "genre_name", + "config": { + "mode": "standard-service", + "service": "https://wikidata.reconci.link/en/api", + "identifierSpace": "http://www.wikidata.org/entity/", + "schemaSpace": "http://www.wikidata.org/prop/direct/", + "type": { + "id": "Q188451", + "name": "music genre" + }, + "autoMatch": true, + "batchSize": 10, + "columnDetails": [], + "limit": 0 + }, + "description": "Reconcile cells in column genre_name to type Q188451" + }, + { + "op": "core/recon-judge-similar-cells", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "sacred_or_secular", + "similarValue": "Sacred", + "judgment": "matched", + "match": { + "id": "Q1065742", + "name": "religious music", + "types": [ + "" + ], + "score": 100 + }, + "shareNewTopics": false, + "description": "Match 
item religious music (Q1065742) for cells containing \"Sacred\" in column sacred_or_secular" + } +] \ No newline at end of file diff --git a/simssa/src/export_all_tables.py b/simssa/src/export_all_tables.py new file mode 100644 index 000000000..7e3edd9da --- /dev/null +++ b/simssa/src/export_all_tables.py @@ -0,0 +1,142 @@ +""" +Script to export all tables from the SimssaDB PostgreSQL database to CSV files. + +Make sure to set up the database following the guidelines in the README. + +CSV files are categorized into subdirectories based on predefined mappings. +Empty tables are skipped during the export process. +""" + +import csv +import os +import logging +import argparse +import psycopg2 +from psycopg2 import sql + +# Database connection parameters +DB_PARAMS = { + "dbname": "simssadb", + "user": "myuser", + "password": "mypassword", + "host": "localhost", +} + +DEFAULT_OUTPUT_DIR = os.path.abspath("./simssa/data/raw") + +# Table to subdirectory mapping based on existing structure +# Example: "extracted_feature" CSV goes to "data/raw/feature" subdirectory +TABLE_MAPPINGS = { + # Feature-related tables + "extracted_feature": "feature", + "feature": "feature", + "feature_file": "feature", + # Genre-related tables + "genre_as_in_style": "genre", + "genre_as_in_type": "genre", + "musical_work_genres_as_in_style": "genre", + "musical_work_genres_as_in_type": "genre", + # Instance-related tables + "files": "instance", + "source_instantiation": "instance", + "source_instantiation_sections": "instance", + # Musical work-related tables + "musical_work": "musical_work", + "part": "musical_work", + "section": "musical_work", + "geographic_area": "musical_work", + "instrument": "musical_work", + # Person-related tables + "contribution_musical_work": "person", + "person": "person", + # Source-related tables + "source": "source", +} + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + + +def 
main(base_output_dir): + """ + Export all tables from the database to CSV files. + + Tables are placed in subdirectories based on existing mapping. + Skips empty tables. + """ + + # Ensure base output directory exists + os.makedirs(base_output_dir, exist_ok=True) + + # Create all subdirectories + subdirs = set(TABLE_MAPPINGS.values()) | {"other"} + for subdir in subdirs: + os.makedirs(os.path.join(base_output_dir, subdir), exist_ok=True) + + # Connect to database + conn = psycopg2.connect(**DB_PARAMS) + cur = conn.cursor() + + try: + # Get all table names in public schema + cur.execute( + """ + SELECT tablename FROM pg_tables WHERE schemaname = 'public'; + """ + ) + table_names = [row[0] for row in cur.fetchall()] + + exported_count = 0 + + for table_name in table_names: + try: + # Get table subdirectory (default to 'other' if unknown) + table_subdir = TABLE_MAPPINGS.get(table_name, "other") + output_dir = os.path.join(base_output_dir, table_subdir) + + # Use psycopg2.sql.Identifier for safe table name quoting + query = sql.SQL("SELECT * FROM {}").format(sql.Identifier(table_name)) + cur.execute(query) + + # Skip empty tables + rows = cur.fetchall() + if len(rows) == 0: + logging.info("Skipped %s (empty table)", table_name) + continue + + # Write to CSV + csv_path = os.path.join(output_dir, f"{table_name}.csv") + with open(csv_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow( + [col[0] for col in cur.description] + ) # Write headers + writer.writerows(rows) + + exported_count += 1 + + except (psycopg2.Error, IOError) as e: + logging.error("Error exporting table %s: %s", table_name, e) + + logging.info("\nExport completed!") + logging.info("Total tables exported: %d", exported_count) + + finally: + cur.close() + conn.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Export all tables from the SimssaDB PostgreSQL database to CSV files." 
+ ) + parser.add_argument( + "--output", + type=str, + default=DEFAULT_OUTPUT_DIR, + help="Base directory for output CSV files (default: ./simssa/data/raw)" + ) + args = parser.parse_args() + main(args.output) diff --git a/simssa/src/flattening/SQL_query.py b/simssa/src/flattening/SQL_query.py deleted file mode 100644 index ec86bce6c..000000000 --- a/simssa/src/flattening/SQL_query.py +++ /dev/null @@ -1,235 +0,0 @@ -import psycopg2 -import csv -import re - - -# steps: -# 1. upload the simssadb dump to a local postgreSQL database called 'simssadb'. -# 2. change db_params fields below as needed. -# 3. Run the scripts, generate the flattened.csv file. -# Each row associates with a different file_id. File-to-work flattening happens after this, in the script restructure.py (with pandas) -# flattened.csv however has all the associated features of each file flattened. -# 4. Reconcile the generated flattened.csv and run the script with restructure.py - -# Query main info from the database -db_params = { - 'dbname': 'simssadb', - 'user': 'postgres', - 'password': 'postgres', - 'host': 'localhost' -} - -author_query = """ - CREATE VIEW author AS - SELECT - musical_work.id AS musical_work_id, - contribution_musical_work.id AS contribution_id, - person.given_name AS given_name, - person.surname AS sur_name, - person.authority_control_url AS auth_URL - FROM - musical_work - FULL OUTER JOIN - contribution_musical_work ON contribution_musical_work.contributed_to_work_id = musical_work.id - FULL OUTER JOIN - person ON contribution_musical_work.person_id = person.id - WHERE contribution_musical_work.role = 'AUTHOR' -""" - -composer_query = """ - CREATE VIEW composer AS - SELECT - musical_work.id AS musical_work_id, - contribution_musical_work.id AS contribution_id, - person.given_name AS given_name, - person.surname AS sur_name, - person.authority_control_url AS auth_URL - FROM - musical_work - FULL OUTER JOIN - contribution_musical_work ON 
contribution_musical_work.contributed_to_work_id = musical_work.id - FULL OUTER JOIN - person ON contribution_musical_work.person_id = person.id - WHERE contribution_musical_work.role = 'COMPOSER' -""" - -# flattened_view_query_with_features = """ -# CREATE VIEW flattened_view AS -# SELECT -# musical_work.id AS musical_work_id, -# musical_work.variant_titles AS musical_work_variant_titles, -# musical_work.sacred_or_secular AS sacred_or_secular, -# author.given_name AS author_given_name, -# author.sur_name AS author_sur_name, -# author.auth_URL AS author_auth_URL, -# author.contribution_id AS author_contribution_id, -# composer.given_name AS composer_given_name, -# composer.sur_name AS composer_sur_name, -# composer.auth_URL AS composer_auth_URL, -# composer.contribution_id AS composer_contribution_id, -# genre_style.genre_style AS genre_style, -# genre_type.genre_type AS genre_type, -# source_instantiation.portion AS source_instantiation_portion, -# source.title AS source_title, -# source.source_type AS source_type, -# source.url AS source_url, -# source.id AS source_id, -# files.file_type AS file_type, -# files.file_format AS file_format, -# files.version AS file_version, -# 'https://db.simssa.ca/files/' || files.id AS url_to_file, -# extracted.value AS extracted_value, -# extracted.feature AS feature -# FROM -# musical_work -# FULL OUTER JOIN -# author ON author.musical_work_id = musical_work.id -# FULL OUTER JOIN -# composer ON composer.musical_work_id = musical_work.id -# FULL OUTER JOIN -# source_instantiation ON musical_work.id = source_instantiation.work_id -# FULL OUTER JOIN -# source ON source_instantiation.source_id = source.id -# FULL OUTER JOIN -# files ON files.instantiates_id = source_instantiation.id -# FULL OUTER JOIN -# (SELECT m.musicalwork_id AS mid, g.name AS genre_style FROM musical_work_genres_as_in_style m JOIN genre_as_in_style g -# ON m.genreasinstyle_id = g.id)genre_style -# ON genre_style.mid = musical_work.id -# FULL OUTER JOIN -# 
(SELECT m.musicalwork_id AS mid, g.name AS genre_type FROM musical_work_genres_as_in_type m JOIN genre_as_in_type g -# ON m.genreasintype_id = g.id)genre_type -# ON genre_type.mid = musical_work.id -# FULL OUTER JOIN -# (SELECT f.name AS feature, e.feature_of_id AS feature_of_id, e.value AS value FROM extracted_feature e JOIN feature f ON e.instance_of_feature_id = f.id)extracted -# ON files.id = extracted.feature_of_id -# WHERE musical_work.id IS NOT NULL -# """ - -compact_files_query = """ - CREATE VIEW compact_files AS - SELECT - musical_work.id AS musical_work_id, - files.id AS file_id, - files.file_type AS file_type, - files.file_format AS file_format, - files.version AS file_version, - 'https://db.simssa.ca/files/' || files.id AS url_to_file - FROM - musical_work - FULL OUTER JOIN - source_instantiation ON musical_work.id = source_instantiation.work_id - FULL OUTER JOIN - source ON source_instantiation.source_id = source.id - FULL OUTER JOIN - files ON files.instantiates_id = source_instantiation.id - WHERE musical_work.id IS NOT NULL AND files.id IS NOT NULL -""" - -flattened_view_query = """ - CREATE VIEW flattened_view AS - SELECT - musical_work.id AS musical_work_id, - musical_work.variant_titles AS musical_work_variant_titles, - musical_work.sacred_or_secular AS sacred_or_secular, - author.given_name AS author_given_name, - author.sur_name AS author_sur_name, - author.auth_URL AS author_auth_URL, - author.contribution_id AS author_contribution_id, - composer.given_name AS composer_given_name, - composer.sur_name AS composer_sur_name, - composer.auth_URL AS composer_auth_URL, - composer.contribution_id AS composer_contribution_id, - genre_style.genre_style AS genre_style, - genre_type.genre_type AS genre_type, - source_instantiation.portion AS source_instantiation_portion, - source.title AS source_title, - source.source_type AS source_type, - source.url AS source_url, - source.id AS source_id - FROM - musical_work - FULL OUTER JOIN - author ON 
author.musical_work_id = musical_work.id - FULL OUTER JOIN - composer ON composer.musical_work_id = musical_work.id - FULL OUTER JOIN - source_instantiation ON musical_work.id = source_instantiation.work_id - FULL OUTER JOIN - source ON source_instantiation.source_id = source.id - FULL OUTER JOIN - (SELECT m.musicalwork_id AS mid, g.name AS genre_style FROM musical_work_genres_as_in_style m JOIN genre_as_in_style g - ON m.genreasinstyle_id = g.id)genre_style - ON genre_style.mid = musical_work.id - FULL OUTER JOIN - (SELECT m.musicalwork_id AS mid, g.name AS genre_type FROM musical_work_genres_as_in_type m JOIN genre_as_in_type g - ON m.genreasintype_id = g.id)genre_type - ON genre_type.mid = musical_work.id - WHERE musical_work.id IS NOT NULL - """ - -# creating the initial flattened view -conn = psycopg2.connect(**db_params) -cur = conn.cursor() -cur.execute(author_query) -cur.execute(composer_query) -cur.execute(flattened_view_query) -cur.execute(compact_files_query) - - - -# # START FEATURE FLATTENING -# # get distinct feature names: -# cur.execute("SELECT DISTINCT feature FROM flattened_view") -# feature_names = [row[0] for row in cur.fetchall()] - -# # for renaming feature names for column compatibility -# def sanitize_column_name(name): -# if name is None: -# return None -# return re.sub(' ', '_', name) - -# # Create feature columns -# feature_columns = ", ".join( -# f"MAX(CASE WHEN feature = '{name}' THEN extracted_value ELSE NULL END) AS \"{sanitize_column_name(name)}\"" -# for name in feature_names -# ) -# cur.execute("""SELECT column_name -# FROM information_schema.columns -# WHERE table_name = 'flattened_view'""") -# exclude_list=['extracted_value', 'feature'] -# flattened_column_names = ", ".join([row[0] for row in cur.fetchall() if row[0] not in exclude_list]) - -# return all columns -final_query_flattened_view = """ - SELECT * - FROM flattened_view -""" -# run -cur.execute(final_query_flattened_view) -results = cur.fetchall() - -# Export data to CSV 
-with open('initial_flattened.csv', 'w') as f: - writer = csv.writer(f) - writer.writerow([col[0] for col in cur.description]) - writer.writerows(results) - -# also export the files to another csv -final_query_compact_files = """ - SELECT * - FROM compact_files -""" -# run -cur.execute(final_query_compact_files) -results = cur.fetchall() - -# Export data to CSV -with open('files.csv', 'w') as f: - writer = csv.writer(f) - writer.writerow([col[0] for col in cur.description]) - writer.writerows(results) - -# Clean up -cur.close() -conn.close() \ No newline at end of file diff --git a/simssa/src/flattening/restructure.py b/simssa/src/flattening/restructure.py deleted file mode 100644 index 5a2e054ae..000000000 --- a/simssa/src/flattening/restructure.py +++ /dev/null @@ -1,36 +0,0 @@ -import csv -import pandas as pd - -# Note: This script is ran AFTER SQL_query.py script and AFTER reconciliation. -# This script takes in a reconciled version of csv and: -# 1. Filter for only the columns we're interested in -# 2. 
Merge columns so that each row corresponds to a musical_work_id instead of file_id as previously -# Each file is now belong to a musical work - - -df = pd.read_csv("./reconciled_WikiID.csv") -# get only the columns we're interested in -cols = ['musical_work_id','sacred_or_secular','source_id','author_contribution_id','composer_contribution_id', "author_viaf_id",'composer_viaf_id', "genre_style_@id","genre_style",'genre_type','genre_type_@id','source_instantiation_portion','source_title', 'source_type','source_url','musical_work_variant_titles',"author_name",'author_name_@id','composer_name','composer_name_@id'] -df2 = df[cols] - - -# df2['musical_work_id'] = df2['musical_work_id'].astype(int) - - -# # since the flattening process flattened the info related to files (file_formats, url_to_file, Last_Pitch), we merge on the other columns -# merge_on = ["musical_work_id",'musical_work_variant_titles','composer', 'genre_style','genre_type'] - -# df2['test_count'] = df2.groupby("musical_work_id").cumcount() + 1 -# df2 = df2.pivot(index=merge_on, columns='test_count', values=['file_format','url_to_file', 'Last_Pitch_Class']) - -# # Flatten the multi-index columns -# df2.columns = [f'{col[0]}_{col[1]}' for col in df2.columns] -# # now each columns represent a row, each row will have the - -# # Reset the index -# df2 = df2.reset_index() - - - - -df2.to_csv('./final_flattened.csv', index=False) diff --git a/simssa/src/merge.py b/simssa/src/merge.py new file mode 100644 index 000000000..499602cbd --- /dev/null +++ b/simssa/src/merge.py @@ -0,0 +1,263 @@ +""" +Merge and process raw SimssaDB CSV files. + +This script should be run before reconciliation. 
"""
Merge and process raw SimssaDB CSV files.

This script should be run before reconciliation.

Generates the following merged CSV files:
- instance.csv
- work.csv
- source.csv
- person.csv
"""

import argparse
import logging
from pathlib import Path

import pandas as pd

DEFAULT_INPUT_DIR = Path("simssa/data/raw")
DEFAULT_OUTPUT_DIR = Path("simssa/data/merged")

# Configure module-level logger; only install a basic handler when the
# embedding application has not configured logging already.
logger = logging.getLogger(__name__)
if not logger.hasHandlers():
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")


def load_csv(csv_path, usecols, rename_dict):
    """
    Load a CSV file, select specific columns, and rename them if needed.

    Args:
        csv_path (Path): Path to the CSV file.
        usecols (list): Columns to select.
        rename_dict (dict): Mapping of original -> new column names.

    Returns:
        pd.DataFrame: The selected columns, renamed, all values read as str.
    """
    # dtype=str keeps IDs textual so later merges are not broken by
    # int/float coercion (e.g. "10" matching 10.0).
    df = pd.read_csv(csv_path, usecols=usecols, dtype=str)
    return df.rename(columns=rename_dict)


def merge_instance_data(input_dir):
    """Merge instance-related CSV files into a single DataFrame.

    Links files to works, sections and sources via source_instantiation.
    """
    source_inst_df = load_csv(
        input_dir / "instance" / "source_instantiation.csv",
        usecols=["id", "source_id", "work_id"],
        rename_dict={"id": "instance_id"},
    )

    source_inst_section_df = load_csv(
        input_dir / "instance" / "source_instantiation_sections.csv",
        usecols=["sourceinstantiation_id", "section_id"],
        rename_dict={"sourceinstantiation_id": "instance_id"},
    )

    files_df = load_csv(
        input_dir / "instance" / "files.csv",
        usecols=["id", "file_format", "file", "instantiates_id"],
        rename_dict={
            "id": "file_id",
            "file": "file_name",
            "instantiates_id": "instance_id",
        },
    )

    merged_df = pd.merge(
        source_inst_df, source_inst_section_df, on="instance_id", how="left"
    )
    # Each instance may have multiple files
    full_merged_df = pd.merge(merged_df, files_df, on="instance_id", how="left")

    # There is no webpage for instances, so it is not necessary to keep the
    # instance_id. This CSV mainly links files to works, sections and sources.
    full_merged_df = full_merged_df.drop(columns=["instance_id"])

    return full_merged_df


def merge_work_data(input_dir):
    """Merge work-related CSV files into a single DataFrame."""
    # Load and process musical_work.csv
    work_df = load_csv(
        input_dir / "musical_work" / "musical_work.csv",
        usecols=["id", "variant_titles", "sacred_or_secular"],
        rename_dict={"id": "work_id", "variant_titles": "work_title"},
    )
    # Clean work_title by removing brackets and quotes
    # (titles are stored like "['Ave Maria']").
    work_df["work_title"] = work_df["work_title"].str.replace(
        r"[\[\]'\"]", "", regex=True
    )
    # Map sacred_or_secular boolean to descriptive strings
    work_df["sacred_or_secular"] = work_df["sacred_or_secular"].replace(
        {"False": "Secular", "True": "Sacred"}
    )

    # Load and process section.csv
    section_df = load_csv(
        input_dir / "musical_work" / "section.csv",
        usecols=["id", "title", "musical_work_id"],
        rename_dict={
            "id": "section_id",
            "title": "section_title",
            "musical_work_id": "work_id",
        },
    )

    # Load and process contribution_musical_work.csv
    contribution_df = load_csv(
        input_dir / "person" / "contribution_musical_work.csv",
        usecols=["role", "person_id", "contributed_to_work_id"],
        rename_dict={"contributed_to_work_id": "work_id"},
    )

    # The original table stores the role of the contributor as either
    # AUTHOR or COMPOSER; pivot so each role becomes its own column.
    # NOTE: aggfunc="first" keeps only one contributor per role per work.
    contribution_pivoted = (
        contribution_df.pivot_table(
            index="work_id", columns="role", values="person_id", aggfunc="first"
        )
        .rename(columns={"AUTHOR": "author_id", "COMPOSER": "composer_id"})
        .reset_index()
    )

    # Left join work_df with section_df on work_id
    merged_with_sections_df = pd.merge(work_df, section_df, on="work_id", how="left")

    # Merge the pivoted contribution data with the work-section DataFrame
    merged_with_creator_df = pd.merge(
        merged_with_sections_df, contribution_pivoted, on="work_id", how="left"
    )

    # Processing genre data
    # Load and process genre-work match table
    genre_of_work_df = load_csv(
        input_dir / "genre" / "musical_work_genres_as_in_type.csv",
        usecols=["musicalwork_id", "genreasintype_id"],
        rename_dict={"musicalwork_id": "work_id", "genreasintype_id": "genre_id"},
    )

    # Load and process genre id-name match table
    genres_df = load_csv(
        input_dir / "genre" / "genre_as_in_type.csv",
        usecols=["id", "name"],
        rename_dict={"id": "genre_id", "name": "genre_name"},
    )

    # Left join genres_df with genre_of_work_df on genre_id
    merged_genres_df = pd.merge(genre_of_work_df, genres_df, on="genre_id", how="left")

    # Left join the result with the merged work-section-contribution DataFrame
    final_merged_df = pd.merge(
        merged_with_creator_df, merged_genres_df, on="work_id", how="left"
    )

    # genre_as_in_style.csv explicitly specifies "Renaissance" for all works
    final_merged_df["style"] = "Renaissance"

    return final_merged_df


def merge_source_data(input_dir):
    """Merge source-related CSV files into a single DataFrame."""
    # Load and process source/source.csv
    source_df = load_csv(
        input_dir / "source" / "source.csv",
        usecols=["id", "title"],
        rename_dict={"id": "source_id", "title": "source_title"},
    )
    # Remove double quotes from source_title
    source_df["source_title"] = source_df["source_title"].str.replace(
        '"', "", regex=False
    )

    return source_df


def merge_person_data(input_dir):
    """Merge person-related CSV files into a single DataFrame."""
    # Load and process person/person.csv
    person_df = load_csv(
        input_dir / "person" / "person.csv",
        usecols=[
            "id",
            "given_name",
            "surname",
            "birth_date_range_year_only",
            "death_date_range_year_only",
            "authority_control_url",
        ],
        rename_dict={"id": "person_id", "authority_control_url": "viaf_id"},
    )

    # Normalize VIAF identifiers: extract numeric ID from VIAF URLs
    # e.g., "https://viaf.org/viaf/123456/" -> "123456"
    person_df["viaf_id"] = (
        person_df["viaf_id"]
        .astype("string")
        .str.strip()
        .str.replace(
            r"^https?://viaf\.org/viaf/(\d+)/?.*$",
            r"\1",
            regex=True,
        )
    )

    # Combine given_name and surname into person_name. fillna("") keeps a
    # usable name when only one part is present (a plain "+" concatenation
    # would yield NaN for the whole name); strip removes the stray space.
    person_df["person_name"] = (
        person_df["given_name"].fillna("")
        + " "
        + person_df["surname"].fillna("")
    ).str.strip()

    # Extract the year and convert to xsd:date compatible format (YYYY-01-01)
    # Date ranges look like "[1450,1455)"; take the opening year.
    person_df["birth_year"] = (
        person_df["birth_date_range_year_only"]
        .str.extract(r"\[(\d{4})")[0]
        .apply(lambda y: f"{y}-01-01" if pd.notnull(y) else None)
    )

    person_df["death_year"] = (
        person_df["death_date_range_year_only"]
        .str.extract(r"\[(\d{4})")[0]
        .apply(lambda y: f"{y}-01-01" if pd.notnull(y) else None)
    )

    # Drop unnecessary name and date_range columns
    person_df = person_df[
        ["person_id", "person_name", "birth_year", "death_year", "viaf_id"]
    ]

    return person_df


def main():
    """Main function to parse arguments and run merging functions."""
    parser = argparse.ArgumentParser(description="SIMSSA CSV merge utilities")
    parser.add_argument(
        "-i",
        "--input",
        default=DEFAULT_INPUT_DIR,
        type=Path,
        help="Input raw data directory (default: simssa/data/raw)",
    )
    parser.add_argument(
        "-o",
        "--output",
        default=DEFAULT_OUTPUT_DIR,
        type=Path,
        help="Output directory for merged CSVs (default: simssa/data/merged)",
    )
    args = parser.parse_args()
    args.output.mkdir(parents=True, exist_ok=True)

    merge_instance_data(args.input).to_csv(args.output / "instance.csv", index=False)
    merge_work_data(args.input).to_csv(args.output / "work.csv", index=False)
    merge_source_data(args.input).to_csv(args.output / "source.csv", index=False)
    merge_person_data(args.input).to_csv(args.output / "person.csv", index=False)

    logger.info(
        "All CSVs have been successfully processed and saved to %s", args.output
    )


if __name__ == "__main__":
    main()