DDMAL · SCN-MNG · Sep 29, 2025 · Sep 29, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/shared/rdf_config/simssadb.toml b/shared/rdf_config/simssadb.toml
@@ -0,0 +1,65 @@
+[general]
+name = "simssadb"
+csv_folder = "../../simssa/data/reconciled"
+rdf_output_folder = "../../simssa/data/rdf"
+test_mode = false
+
+[namespaces]
+rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+rdfs = "http://www.w3.org/2000/01/rdf-schema#"
+xsd = "http://www.w3.org/2001/XMLSchema#"
+wd = "http://www.wikidata.org/entity/"
+wdt = "http://www.wikidata.org/prop/direct/"
+sp = "https://db.simssa.ca/persons/"
+sw = "https://db.simssa.ca/musicalworks/"
+sse = "https://db.simssa.ca/sections/"
+st = "https://db.simssa.ca/types/"
+ss = "https://db.simssa.ca/sources/"
+sf = "https://db.simssa.ca/files/"
+lms = "https://linkedmusic.ca/graphs/simssadb/"
+
+[instance]
+PRIMARY_KEY = "file_id"
+work_id = {prefix = "sw", pred = "P6243"}  # digital representation of (P6243)
+section_id = {prefix = "sse", pred = "P6243"}  # digital representation of (P6243)
+file_id = {prefix = "sf", type = "lms:File"}
+file_format = ""
+file_name = "rdfs:label"
+
+[instance.source_id]
+prefix = "ss"
+pred = "P144"  # based on (P144)
+
+[person]
+PRIMARY_KEY = "person_id"
+person_name = "P2888"  # exact match (P2888)
+person_name_original = "rdfs:label"
+birth_year = { pred = "P569", datatype = "xsd:date" }  # date of birth (P569)
+death_year = { pred = "P570", datatype = "xsd:date" }  # date of death (P570)
+viaf_id = "P214"  # VIAF cluster ID (P214)
+
+[person.person_id]
+type = "lms:Person"
+prefix = "sp"
+
+[source]
+PRIMARY_KEY = "source_id"
+source_title = "rdfs:label"
+
+[source.source_id]
+prefix = "ss"
+type = "lms:Source"
+
+[work]
+PRIMARY_KEY = "work_id"
+work_id = {prefix = "sw", type = "lms:Work"}
+work_title = {if = "isinstance(obj, URIRef)", pred = "P2888"}  # exact match (P2888)
+work_title_original = "rdfs:label"
+sacred_or_secular = "P136"  # genre (P136)
+section_id = {prefix = "sse", pred = "P527", type = "lms:Section"}  # has part(s) (P527)
+section_title = {subj = "section_id", pred = "rdfs:label"}
+author_id = {prefix = "sp", pred = "P50"}  # author (P50)
+composer_id = {prefix = "sp", pred = "P86"}  # composer (P86)
+genre_id = {prefix = "st", pred = "P136", type = "lms:GenreAsInType"}  # genre (P136)
+genre_name = {subj = "genre_id", pred = "rdfs:label"}
+style = ""
diff --git a/simssa/README.md b/simssa/README.md
@@ -1,37 +1,207 @@
-# SimssaDB flattening and json-ld structures
+# Ingestion of SIMSSA DB
 
-> Summary:
+# 1. General Description
 
-> 1. Upload SQL dump to local postgreSQL database
-> 2. With output run `simssa/src/flattening/SQL_query.py`
-> 3. Reconcile `initial_flattened.csv` with OpenRefine
-> 4. Reconcile `files.csv` with OpenRefine
-> 5. With output run `simssa/src/flattening/restructure.py`
-> 6. With output run `simssa/src/jsonld/generate_jsonld.py` (which also takes `simssa/src/jsonld/context.jsonld` as the initial context)
+You can read more about SIMSSA DB on the [official webpage](https://db.simssa.ca/about/). A graphic of the SIMSSA DB database model can be found [on Cory McKay's SourceForge page](https://jmir.sourceforge.net/cmckay/papers/mckay17database.pdf).
 
-## 1. Extracting columns and feature flattening
+The project is mainly maintained by [Cory McKay](https://jmir.sourceforge.net/cmckay/). According to Ich, SIMSSA DB is unlikely to receive any future updates.
 
-After uploading the database dump to the local PostgreSQL database, we first select relevant columns and perform initial feature flattening with `psycopg` in `SQL_query.py`
+# 2. Obtaining The Database Dump
 
-When extracting the files, I found that since there often was more than one file per work, the SQL query would create rows where each data field was duplicated, except for the fields relating to the files, due to the behaviour of the `FULL OUTER JOIN` SQL command.
-As such, I decided to instead create a second CSV file that would only contain the files, and there would be a field indicating the musical work that the file corresponded to, allowing us to merge that CSV file with the main CSV file during RDF conversion.
-Furthermore, some files aren't linked to any musical works. I chose to simply ignore them when exporting the list of files because files without musical works aren't useful at all for the datalake. These files seem to be linked to musical works that aren't in the data dump currently being used. This will hopefully be fixed by [#263](https://github.com/DDMAL/linkedmusic-datalake/issues/263).
+Dylan has obtained a PostgreSQL dump of the SIMSSA DB. The dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki on how to set up your Arbutus account.
 
-This produces 2 CSV files, `final_flattened.csv`, a flattening of all the tables into one CSV with `musical_work_id` as the primary key, and `files.csv`, containing the data about all files and the works they are linked to.
+# 3. Export SQL Dump to CSV files
 
-## 2. Reconciliation with OpenRefine
+1. Install PostgreSQL, if it is not installed already.
 
-OpenRefine reconciliation was performed on `initial_flattened.csv` and on `files.csv`. You can see the reconciled files `reconciled_wikiID.csv` and `reconciled_files_WikiID.csv`. You can use `simssa/openrefine/history/history_flattened.json` and `simssa/openrefine/history/history_files.json` to facilitate reconciliation and `simssa/openrefine/export/export_template_flattened.json` and `simssa/openrefine/export/export_template_files.json` to export to the desired csv format.
+2. Make sure that postgres is running using the following command:
 
-## 3. Reconcile column names and generating json-ld
+```bash
+sudo service postgresql status
+```
 
-Currently the json-ld is generated as follows:
+Start postgresql if it is not running:
 
-In `generate_jsonld.py`:
+```bash
+sudo service postgresql start
+```
 
-1. Convert csv to json documents
-2. Loop through each json document and edit each entry, creating the compact jsonld. Also parse the files csv to extract and files associated with each entry.
-3. Generate the jsonld file at `compact.jsonld`
-4. The contexts used in the `compact.jsonld` file is imported from `context.jsonld`
+3. Start the postgres shell:
 
-### TODO: Make the RDF conversion convert to Turtle
+```bash
+sudo -u postgres psql
+```
+
+4. Inside the shell, create a new user and database, and exit the shell:
+
+```sql
+CREATE USER myuser WITH PASSWORD 'mypassword';
+CREATE DATABASE simssadb OWNER myuser;
+GRANT ALL PRIVILEGES ON DATABASE simssadb TO myuser;
+\q
+```
+
+5. Load the SQL dump into your new database through the following command:
+
+```bash
+sudo -u postgres sh -c "gunzip -c <path/to/sql_gz/dump> | psql -d simssadb"
+```
+
+When prompted, enter "mypassword" as the password.
+
+6. Grant read access to all loaded tables to "myuser"
+
+First, start the shell again:
+
+```bash
+sudo -u postgres psql -d simssadb
+```
+
+Then, run the following commands:
+
+```sql
+-- Grant SELECT on all existing tables
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO myuser;
+
+-- Grant SELECT on tables created in the future
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO myuser;
+
+\q
+```
+
+7. Run `export_all_tables.py`
+
+Run the following command from the repository root directory:
+
+```bash
+python simssa/src/export_all_tables.py
+```
+
+All nonempty tables should be output as CSV files in the subdirectories of `simssa/data/raw`.
+
+# 4. Overview of The Raw Dataset
+
+After running `simssa/src/export_all_tables.py`, each nonempty table should be output as a CSV file in a subdirectory of `simssa/data/raw`.
+
+`export_all_tables.py` groups the CSV files into the following subdirectories:
+
+1. `feature`: CSV files related to audio/musical features (e.g., most frequent pitch, rhythmic variability).
+2. `genre`: CSV files related to musical genres, including both "genre-as-in-style" (e.g., Renaissance) and "genre-as-in-type" (e.g., Madrigal).
+3. `instance`: CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files.
+4. `musical_work`: CSV files related to musical works, including their titles, sections, and associated metadata. Musical works (i.e., compositions) are the central entities of SIMSSA DB.
+5. `person`: CSV files containing data about authors and composers, including their roles and contributions.
+6. `source`: CSV files describing the origins of scores and their relationships to musical works and sections.
+
+Every other CSV file is placed in the `other` subdirectory: these do not seem to be pertinent to the datalake.
+
+## 4.1 Feature Subdirectory
+
+Contains CSV files related to audio/musical features (e.g., most frequent pitch, rhythmic variability). These features were extracted from MIDI files. You can find a list of example features at `https://db.simssa.ca/files/2018`.
+
+Contains the following CSVs:
+
+- extracted_features.csv: list of musical/audio features
+- feature_file.csv: location of files containing extracted features
+- feature.csv: another list of musical/audio features
+
+Musical features are currently omitted from the RDF since it is very difficult/impractical to store them in Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website.
+
+## 4.2 Genre Subdirectory
+
+Contains CSV files related to musical genres, including both "genre-as-in-style" and "genre-as-in-type."
+
+Contains the following CSVs:
+
+- genre_as_in_style.csv: "Renaissance" is the only genre_as_in_style in SIMSSA DB.
+- genre_as_in_type.csv: Lists twelve different genre_as_in_type (e.g., Zibaldone, Madrigal).
+- musical_work_genres_as_in_style.csv: Maps every musical work in SIMSSA DB to the genre "Renaissance."
+- musical_work_genres_as_in_type.csv: Maps musical works to their genre_as_in_type.
+
+Musical genres are an important aspect of SIMSSA DB, particularly "genre-as-in-type," which provides more detailed classifications. These data are suitable for Linked Data representation.
+
+## 4.3 Instance Subdirectory
+
+Contains CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files.
+
+Contains the following CSVs:
+
+- files.csv: Points to files containing sheet music or MIDI scores.
+- source_instantiation.csv: Links instances to a musical work and to a source.
+- source_instantiation_sections.csv: Links instances to a section of a musical work. An instance is either linked to the entire musical work or to a section of it.
+
+Instances are not stored as distinct entities in the data lake but are crucial for linking works, sources, and files in the raw dataset.
+
+## 4.4 Musical Work Subdirectory
+
+Contains CSV files related to musical works, including their titles, sections, and associated metadata.
+
+Contains the following CSVs:
+
+- geographic_area.csv: Only contains "Vienna."
+- instruments.csv: Only contains "Voice."
+- musical_works.csv: Links a musical work to its title and indicates whether it is sacred or secular.
+- part.csv: Lists whenever a work has a part for voice.
+- section.csv: Lists sections of the musical works (e.g., work 117 may have a "Sanctus (In nomine)" section).
+
+Among these, only `musical_works.csv` and `section.csv` are ingested into the datalake. The other files were not part of the final RDF since they contained so little data.
+
+## 4.5 Person Subdirectory
+
+Contains CSV files related to authors and composers, including their roles and contributions.
+
+Contains the following CSVs:
+
+- person.csv: Lists all composers/authors, with their birth and death years.
+- contribution_musical_work.csv: Links people to compositions. The "role" column describes whether the person was an "AUTHOR" or a "COMPOSER."
+
+These files provide essential metadata about the creators of musical works and their contributions, making them suitable for Linked Data representation.
+
+## 4.6 Source Subdirectory
+
+Contains the CSV file `source.csv`, which specifies information on a source (i.e., a book/anthology from which a musical work is taken).
+
+# 5. Type of Entities in the RDF
+
+## 5.1 Persons
+
+Prefix: `https://db.simssa.ca/persons/`
+
+Identifies people who are either authors or composers of musical works. Each person is linked to a VIAF ID in the raw dataset.
+
+## 5.2 Musical Works
+
+Prefix: `https://db.simssa.ca/musicalworks/`
+
+Identifies individual musical works (i.e., compositions). Each composition is linked to:
+
+1. An author and a composer
+2. A genre
+3. Symbolic music files (MIDI & PDF score)
+4. Sections (e.g., a mass may have an Introit section)
+5. A source (a book or an anthology in which the work was found).
+
+## 5.3 Sections
+
+Prefix: `https://db.simssa.ca/sections/`
+
+This namespace refers to _sections_ of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work.
+
+There can be a symbolic music file for a particular section instead of the whole composition.
+
+## 5.4 Types
+
+Prefix: `https://db.simssa.ca/types/`
+
+This namespace identifies the genre of the musical work ("genre-as-in-type").
+
+## 5.5 Sources
+
+Prefix: `https://db.simssa.ca/sources/`
+
+Identifies the book/anthology from which the musical work was taken.
+
+## 5.6 Files
+
+Prefix: `https://db.simssa.ca/files/`
+
+Identifies the symbolic music file (PDF or MIDI) attached to a work or a section.
diff --git a/simssa/openrefine/export/export_person.json b/simssa/openrefine/export/export_person.json
@@ -0,0 +1,67 @@
+[
+  {
+    "op": "core/recon",
+    "engineConfig": {
+      "facets": [],
+      "mode": "row-based"
+    },
+    "columnName": "person_name",
+    "config": {
+      "mode": "standard-service",
+      "service": "https://wikidata.reconci.link/en/api",
+      "identifierSpace": "http://www.wikidata.org/entity/",
+      "schemaSpace": "http://www.wikidata.org/prop/direct/",
+      "type": {
+        "id": "Q5",
+        "name": "human"
+      },
+      "autoMatch": true,
+      "batchSize": 10,
+      "columnDetails": [],
+      "limit": 0
+    },
+    "description": "Reconcile cells in column person_name to type Q5"
+  },
+  {
+    "op": "core/recon",
+    "engineConfig": {
+      "facets": [],
+      "mode": "row-based"
+    },
+    "columnName": "person_name",
+    "config": {
+      "mode": "standard-service",
+      "service": "https://wikidata.reconci.link/en/api",
+      "identifierSpace": "http://www.wikidata.org/entity/",
+      "schemaSpace": "http://www.wikidata.org/prop/direct/",
+      "type": {
+        "id": "Q5",
+        "name": "human"
+      },
+      "autoMatch": true,
+      "batchSize": 10,
+      "columnDetails": [
+        {
+          "column": "viaf_id",
+          "propertyName": "VIAF cluster ID",
+          "propertyID": "P214"
+        }
+      ],
+      "limit": 0
+    },
+    "description": "Reconcile cells in column person_name to type Q5"
+  },
+  {
+    "op": "core/column-addition",
+    "engineConfig": {
+      "facets": [],
+      "mode": "row-based"
+    },
+    "baseColumnName": "person_name",
+    "expression": "grel:value",
+    "onError": "set-to-blank",
+    "newColumnName": "person_name_original",
+    "columnInsertIndex": 2,
+    "description": "Create column person_name_original at index 2 based on column person_name using expression grel:value"
+  }
+]