Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
fe2ec58
feat(simssadb): add script to export and class all tables from SQL da…
SCN-MNG Sep 29, 2025
e31fdde
feat(simssadb): add script to merge and process CSV. This facilitates…
SCN-MNG Sep 29, 2025
404945e
chore(simssadb): reformat and lint scripts
SCN-MNG Oct 22, 2025
384ef6c
doc(simssadb): create template README
SCN-MNG Oct 22, 2025
a38c427
feat(simssadb): create RDF conversion config
SCN-MNG Oct 22, 2025
c1f75e3
doc(simssadb): save OpenRefine reconciliation history and export hist…
SCN-MNG Oct 22, 2025
4d51644
feat(simssadb): Complete RDF configuration file
SCN-MNG Oct 22, 2025
f632d34
refactor(simssadb): remove unneeded column from merged CSV.
SCN-MNG Oct 22, 2025
1652450
chore(simssadb): update docstrings
SCN-MNG Oct 22, 2025
489b791
refactor(simssadb): add command argument options
SCN-MNG Oct 22, 2025
f332976
chore(simssadb): update variable names and docstrings
SCN-MNG Oct 22, 2025
1fc9001
chore(simssadb): do minor linting
SCN-MNG Oct 22, 2025
1546fef
doc(simssa): update README to include an overview of SIMSSADB content…
SCN-MNG Nov 14, 2025
79b9b17
chore(simssa): delete outdated SQL to CSV scripts
SCN-MNG Nov 14, 2025
298d6f5
chore(simssa): reformat documentation
SCN-MNG Nov 14, 2025
28c919e
chore(simssa): fix grammar mistake
SCN-MNG Nov 14, 2025
b714b40
fix(simssa): address SQL injection vulnerability in table name handling
SCN-MNG Nov 14, 2025
6bf2f42
Merge branch 'simssadb-ingestion' of github.com:DDMAL/linkedmusic-dat…
SCN-MNG Nov 14, 2025
9fe6b54
doc(simssa): fix grammar error in README
SCN-MNG Nov 21, 2025
0125ecd
doc(simssa): update path in README
SCN-MNG Nov 21, 2025
05e969a
Fix grammar and clarity issues in README
liampond Apr 2, 2026
39ddb0a
fix(simssaDB): merge.py now removes viaf_id prefix
SCN-MNG Apr 2, 2026
6cf29f2
fix(simssadb): Add missing predicate for music genre in SimssaDB rdf …
SCN-MNG Apr 2, 2026
9ce4b23
fix(simssadb): add date datatype to rdf config. It makes the graph sl…
SCN-MNG Apr 2, 2026
04f08e8
doc(simssa): fix README error
SCN-MNG Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions shared/rdf_config/simssadb.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Dataset-wide settings for the SIMSSA DB RDF conversion.
[general]
name = "simssadb"
# Input (reconciled CSV) and output (RDF) folders. Taken relative to this
# file's directory (shared/rdf_config/), both land under simssa/data/ at the
# repository root.
# NOTE(review): presumably that is how the converter resolves them — confirm.
csv_folder = "../../simssa/data/reconciled"
rdf_output_folder = "../../simssa/data/rdf"
# NOTE(review): effect of test_mode is defined by the converter, not here — confirm before enabling.
test_mode = false

# Prefix-to-IRI bindings for the prefixed names (e.g. "rdfs:label",
# "xsd:date", "lms:File") and the `prefix` keys used in this file.
# NOTE(review): bare property IDs such as "P136" presumably expand under
# `wdt` — confirm against the converter.
[namespaces]
rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
rdfs = "http://www.w3.org/2000/01/rdf-schema#"
xsd = "http://www.w3.org/2001/XMLSchema#"
wd = "http://www.wikidata.org/entity/"
wdt = "http://www.wikidata.org/prop/direct/"
# SIMSSA DB entity namespaces, one per kind of entity.
sp = "https://db.simssa.ca/persons/"
sw = "https://db.simssa.ca/musicalworks/"
sse = "https://db.simssa.ca/sections/"
st = "https://db.simssa.ca/types/"
ss = "https://db.simssa.ca/sources/"
sf = "https://db.simssa.ca/files/"
# Local namespace that holds the `lms:` types assigned in this file.
lms = "https://linkedmusic.ca/graphs/simssadb/"

# Symbolic music files, keyed by file, and the work, section and source each
# file is attached to.
# NOTE(review): keys other than PRIMARY_KEY appear to be CSV column names — confirm.
[instance]
PRIMARY_KEY = "file_id"
# A file is a digital representation of (P6243) a whole work or one section.
work_id = { prefix = "sw", pred = "P6243" }
section_id = { prefix = "sse", pred = "P6243" }
file_id = { prefix = "sf", type = "lms:File" }
# NOTE(review): empty mapping — presumably the column is not converted; confirm.
file_format = ""
file_name = "rdfs:label"
# based on (P144)
source_id = { prefix = "ss", pred = "P144" }

# Authors and composers.
[person]
PRIMARY_KEY = "person_id"
# person_name is the column reconciled against Wikidata in OpenRefine;
# person_name_original is a copy of that column used for the label.
person_name = "P2888" # exact match (P2888)
person_name_original = "rdfs:label"
# NOTE(review): these columns are named *_year, and a bare year is not a valid
# xsd:date lexical form (xsd:gYear would be) — confirm what the CSV holds and
# how the converter serialises it.
birth_year = { pred = "P569", datatype = "xsd:date" } # date of birth (P569)
death_year = { pred = "P570", datatype = "xsd:date" } # date of death (P570)
viaf_id = "P214" # VIAF cluster ID (P214)
person_id = { type = "lms:Person", prefix = "sp" }

# Sources: the books or anthologies from which musical works are taken.
[source]
PRIMARY_KEY = "source_id"
source_title = "rdfs:label"
source_id = { prefix = "ss", type = "lms:Source" }

# Musical works (compositions) with their sections, contributors and genres.
[work]
PRIMARY_KEY = "work_id"
work_id = { prefix = "sw", type = "lms:Work" }
# NOTE(review): `if` holds a Python expression; presumably the converter
# evaluates it so that P2888 is emitted only when the title was reconciled to
# a URI — confirm, and be aware this means config text is run as code.
work_title = { if = "isinstance(obj, URIRef)", pred = "P2888" } # exact match (P2888)
work_title_original = "rdfs:label"
sacred_or_secular = "P136" # genre (P136)
# Sections hang off the work via has part(s) (P527); the title column labels
# the section rather than the work.
section_id = { prefix = "sse", pred = "P527", type = "lms:Section" }
section_title = { subj = "section_id", pred = "rdfs:label" }
author_id = { prefix = "sp", pred = "P50" } # author (P50)
composer_id = { prefix = "sp", pred = "P86" } # composer (P86)
# "Genre-as-in-type" entities; the name column labels the genre, not the work.
genre_id = { prefix = "st", pred = "P136", type = "lms:GenreAsInType" } # genre (P136)
genre_name = { subj = "genre_id", pred = "rdfs:label" }
# NOTE(review): empty mapping — presumably the column is not converted; confirm.
style = ""
218 changes: 194 additions & 24 deletions simssa/README.md
Original file line number Diff line number Diff line change
@@ -1,37 +1,207 @@
# SimssaDB flattening and json-ld structures
# Ingestion of SIMSSA DB

> Summary:
# 1. General Description

> 1. Upload SQL dump to local postgreSQL database
> 2. With output run `simssa/src/flattening/SQL_query.py`
> 3. Reconcile `initial_flattened.csv` with OpenRefine
> 4. Reconcile `files.csv` with OpenRefine
> 5. With output run `simssa/src/flattening/restructure.py`
> 6. With output run `simssa/src/jsonld/generate_jsonld.py` (which also takes `simssa/src/jsonld/context.jsonld` as the initial context)
You can read more about SIMSSA DB on the [official webpage](https://db.simssa.ca/about/). A graphic of the SIMSSA DB database model can be found [on Cory McKay's SourceForge page](https://jmir.sourceforge.net/cmckay/papers/mckay17database.pdf).

## 1. Extracting columns and feature flattening
The project is mainly maintained by [Cory McKay](https://jmir.sourceforge.net/cmckay/). According to Ich, SIMSSA DB is unlikely to receive any future updates.
Comment thread
SCN-MNG marked this conversation as resolved.

After uploading the database dump to the local PostgreSQL database, we first select relevant columns and perform initial feature flattening with `psycopg` in `SQL_query.py`
# 2. Obtaining The Database Dump

When extracting the files, I found that since there often was more than one file per work, the SQL query would create rows where each data field was duplicated, except for the fields relating to the files, due to the behaviour of the `FULL OUTER JOIN` SQL command.
As such, I decided to instead create a second CSV file that would only contain the files, and there would be a field indicating the musical work that the file corresponded to, allowing us to merge that CSV file with the main CSV file during RDF conversion.
Furthermore, some files aren't linked to any musical works. I chose to simply ignore them when exporting the list of files because files without musical works aren't useful at all for the datalake. These files seem to be linked to musical works that aren't in the data dump currently being used. This will hopefully be fixed by [#263](https://github.com/DDMAL/linkedmusic-datalake/issues/263).
Dylan has obtained a PostgreSQL dump of the SIMSSA DB. The dump can be found on [Arbutus Object Storage](https://arbutus.cloud.computecanada.ca/auth/login/?next=/project/containers/container/virtuoso/misc). Please refer to the Internal SIMSSA Wiki for instructions on how to set up your Arbutus account.

This produces 2 CSV files, `final_flattened.csv`, a flattening of all the tables into one CSV with `musical_work_id` as the primary key, and `files.csv`, containing the data about all files and the works they are linked to.
# 3. Export SQL Dump to CSV files

## 2. Reconciliation with OpenRefine
1. Install PostgreSQL, if it is not installed already.

OpenRefine reconciliation was performed on `initial_flattened.csv` and on `files.csv`. You can see the reconciled files `reconciled_wikiID.csv` and `reconciled_files_WikiID.csv`. You can use `simssa/openrefine/history/history_flattened.json` and `simssa/openrefine/history/history_files.json` to facilitate reconciliation and `simssa/openrefine/export/export_template_flattened.json` and `simssa/openrefine/export/export_template_files.json` to export to the desired csv format.
2. Make sure that postgres is running using the following command:

## 3. Reconcile column names and generating json-ld
```bash
sudo service postgresql status
```

Currently the json-ld is generated as follows:
Start postgresql if it is not running:

In `generate_jsonld.py`:
```bash
sudo service postgresql start
```

1. Convert csv to json documents
2. Loop through each json document and edit each entry, creating the compact jsonld. Also parse the files csv to extract and files associated with each entry.
3. Generate the jsonld file at `compact.jsonld`
4. The contexts used in the `compact.jsonld` file is imported from `context.jsonld`
3. Start the postgres shell

### TODO: Make the RDF conversion convert to Turtle
```bash
sudo -u postgres psql
```

4. Inside the shell, create a new user and database, and exit the shell:

```sql
CREATE USER myuser WITH PASSWORD 'mypassword';
CREATE DATABASE simssadb OWNER myuser;
GRANT ALL PRIVILEGES ON DATABASE simssadb TO myuser;
\q
```

5. Load the SQL dump into your new database through the following command:

```bash
sudo -u postgres sh -c "gunzip -c <path/to/sql_gz/dump> | psql -d simssadb"
```

When prompted, enter "mypassword" as the password.

6. Grant read access to all loaded tables to "myuser"

First, start the shell again:

```bash
sudo -u postgres psql -d simssadb
```

Then, run the following commands:

```sql
-- Grant SELECT on all existing tables
GRANT SELECT ON ALL TABLES IN SCHEMA public TO myuser;

-- Grant SELECT on tables created in the future
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO myuser;

\q
```

7. Run `export_all_tables.py`

Run the following command from the repository root directory:

```bash
python simssa/src/export_all_tables.py
```

All nonempty tables should be output as CSV files in the subdirectories of `simssa/data/raw`.

# 4. Overview of The Raw Dataset

After running `simssa/src/export_all_tables.py`, each nonempty table should be output as a CSV file in a subdirectory of `simssa/data/raw`.

`export_all_tables.py` groups the CSV files into the following subdirectories:

1. `feature`: CSV files related to audio/musical features (e.g., most frequent pitch, rhythmic variability).
2. `genre`: CSV files related to musical genres, including both "genre-as-in-style" (e.g., Renaissance) and "genre-as-in-type" (e.g., Madrigal).
3. `instance`: CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files.
4. `musical_work`: CSV files related to musical works, including their titles, sections, and associated metadata. Musical works (i.e., compositions) are the central entities of SIMSSA DB.
5. `person`: CSV files containing data about authors and composers, including their roles and contributions.
6. `source`: CSV files describing the origins of scores and their relationships to musical works and sections.

Every other CSV file is placed in the `other` subdirectory: these do not seem to be pertinent to the datalake.

## 4.1 Feature Subdirectory

Contains CSV files related to audio/musical features (e.g., most frequent pitch, rhythmic variability). These features were extracted from MIDI files. You can find a list of example features at `https://db.simssa.ca/files/2018`

Contains the following CSVs:

- extracted_features.csv: list of musical/audio features
- feature_file.csv: location of files containing extracted features
- feature.csv: another list of musical/audio features

Musical features are currently omitted from the RDF since it is very difficult/impractical to store them in Linked Data form. Anyone interested in these data should be redirected to the SIMSSA DB website.

## 4.2 Genre Subdirectory

Contains CSV files related to musical genres, including both "genre-as-in-style" and "genre-as-in-type."

Contains the following CSVs:

- genre_as_in_style.csv: "Renaissance" is the only genre_as_in_style in SIMSSA DB.
- genre_as_in_type.csv: Lists twelve different genre_as_in_type (e.g., Zibaldone, Madrigal).
- musical_work_genres_as_in_style.csv: Maps every musical work in SIMSSA DB to the genre "Renaissance."
- musical_work_genres_as_in_type.csv: Maps musical works to their genre_as_in_type.

Musical genres are an important aspect of SIMSSA DB, particularly "genre-as-in-type," which provides more detailed classifications. These data are suitable for Linked Data representation.

## 4.3 Instance Subdirectory

Contains CSV files related to instances of musical works, which serve as intermediate links between works, sources, and files.

Contains the following CSVs:

- files.csv: Points to files containing sheet music or MIDI scores.
- source_instantiation.csv: Links instances to a musical work and to a source.
- source_instantiation_sections.csv: Links instances to a section of a musical work. An instance is either linked to the entire musical work or to a section of it.

Instances are not stored as distinct entities in the data lake but are crucial for linking works, sources, and files in the raw dataset.

## 4.4 Musical Work Subdirectory

Contains CSV files related to musical works, including their titles, sections, and associated metadata.

Contains the following CSVs:

- geographic_area.csv: Only contains "Vienna."
- instruments.csv: Only contains "Voice."
- musical_works.csv: Links a musical work to its title and indicates whether it is sacred or secular.
- part.csv: Lists whenever a work has a part for voice.
- section.csv: Lists sections of the musical works (e.g., work 117 may have a "Sanctus (In nomine)" section).

Among these, only `musical_works.csv` and `section.csv` are ingested into the datalake. The other files were not part of the final RDF since they contained so little data.
Comment thread
SCN-MNG marked this conversation as resolved.

## 4.5 Person Subdirectory

Contains CSV files related to authors and composers, including their roles and contributions.

Contains the following CSVs:

- person.csv: Lists all composers/authors, with their birth and death years.
- contribution_musical_work.csv: Links people to compositions. The "role" column describes whether the person was an "AUTHOR" or a "COMPOSER."

These files provide essential metadata about the creators of musical works and their contributions, making them suitable for Linked Data representation.

## 4.6 Source subdirectory

Contains the CSV file `source.csv`, which specifies information on a source (i.e., a book/anthology from which a musical work is taken).

# 5. Type of Entities in the RDF

## 5.1 Persons

Prefix: `https://db.simssa.ca/persons/`

Identifies people who are either authors or composers of musical works. Each person is linked to a VIAF ID in the raw dataset.

## 5.2 Musical Works

Prefix: `https://db.simssa.ca/musicalworks/`

Identifies individual musical works (i.e., compositions). Each composition is linked to:

1. An author and a composer
2. A genre
3. Symbolic music files (MIDI & PDF score)
4. Sections (e.g., a mass may have an Introit section)
5. A source (a book or an anthology in which the work was found).

## 5.3 Sections

Prefix: `https://db.simssa.ca/sections/`

This namespace refers to _sections_ of musical works. A “section” may correspond to a movement, chant segment, or logical division within a work.

There can be a symbolic music file for a particular section instead of the whole composition.

## 5.4 Types

Prefix: `https://db.simssa.ca/types/`

This namespace identifies the genre of the musical work ("genre-as-in-type").

## 5.5 Sources

Prefix: `https://db.simssa.ca/sources/`

Identifies the book/anthology from which the musical work was taken.

Comment thread
SCN-MNG marked this conversation as resolved.
## 5.6 Files

Prefix: `https://db.simssa.ca/files/`

Identifies the symbolic music file (PDF or MIDI) attached to a work or a section.
67 changes: 67 additions & 0 deletions simssa/openrefine/export/export_person.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
[
{
"op": "core/recon",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "person_name",
"config": {
"mode": "standard-service",
"service": "https://wikidata.reconci.link/en/api",
"identifierSpace": "http://www.wikidata.org/entity/",
"schemaSpace": "http://www.wikidata.org/prop/direct/",
"type": {
"id": "Q5",
"name": "human"
},
"autoMatch": true,
"batchSize": 10,
"columnDetails": [],
"limit": 0
},
"description": "Reconcile cells in column person_name to type Q5"
},
{
"op": "core/recon",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "person_name",
"config": {
"mode": "standard-service",
"service": "https://wikidata.reconci.link/en/api",
"identifierSpace": "http://www.wikidata.org/entity/",
"schemaSpace": "http://www.wikidata.org/prop/direct/",
"type": {
"id": "Q5",
"name": "human"
},
"autoMatch": true,
"batchSize": 10,
"columnDetails": [
{
"column": "viaf_id",
"propertyName": "VIAF cluster ID",
"propertyID": "P214"
}
],
"limit": 0
},
"description": "Reconcile cells in column person_name to type Q5"
},
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "person_name",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "person_name_original",
"columnInsertIndex": 2,
"description": "Create column person_name_original at index 2 based on column person_name using expression grel:value"
}
]
Comment thread
SCN-MNG marked this conversation as resolved.
Loading