Skip to content
73 changes: 73 additions & 0 deletions shared/rdf_config/wjazzd.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Dataset-level settings for the Weimar Jazz Database (wjazzd) RDF conversion.
[general]
name = "wjazzd"
# Input (reconciled CSV) and output (RDF) folders.
# NOTE(review): both paths are relative — presumably resolved against this
# config file's directory; confirm against the rdfconv script.
csv_folder = "../../wjazzd/data/reconciled"
rdf_output_folder = "../../wjazzd/data/rdf"
test_mode = false

# Prefix -> base IRI table. The mapping tables in this file refer to these
# prefixes through `prefix = "..."`, `type = "lmwj:..."`, `datatype = "xsd:..."`
# and `rdfs:label`.
[namespaces]
# W3C vocabularies
rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
rdfs = "http://www.w3.org/2000/01/rdf-schema#"
xsd = "http://www.w3.org/2001/XMLSchema#"
# Wikidata entities and direct properties
wd = "http://www.wikidata.org/entity/"
wdt = "http://www.wikidata.org/prop/direct/"
mb = "https://musicbrainz.org/recording/"
# Jazztube base IRIs used to build the URIs of wjazzd entities
jr = "http://mir.audiolabs.uni-erlangen.de/jazztube/wjazzd/records/"
jc = "http://mir.audiolabs.uni-erlangen.de/jazztube/wjazzd/compositions/"
js = "http://mir.audiolabs.uni-erlangen.de/jazztube/wjazzd/solos/solo/"
jt = "http://mir.audiolabs.uni-erlangen.de/jazztube/wjazzd/tracks/"
# NOTE(review): no mapping table in this file references the jtr prefix.
jtr = "http://mir.audiolabs.uni-erlangen.de/jazztube/wjazzd/transcriptions/"
# LinkedMusic namespace used for the entity types (lmwj:Composition, lmwj:Record, ...)
lmwj = "https://linkedmusic.ca/graphs/wjazzd/"

# Mapping for the composition_info table: each key names a CSV column and its
# value names the predicate (a Wikidata property id or a prefixed name).
# NOTE(review): columns mapped to "" are presumably not emitted as triples —
# confirm against the rdfconv documentation.
[composition_info]
PRIMARY_KEY = "compid"
# Rows are typed lmwj:Composition; ids are expanded with the jc prefix.
compid = {type = "lmwj:Composition", prefix = "jc"}
title = "rdfs:label"
composer = "P86" # composer (P86)
form = ""
template = "P144" # based on (P144)
tonalitytype = ""
genre = "P136" # genre (P136)

# Mapping for the record_info table. `recordid` is the primary key; it is
# typed lmwj:Record and expanded with the jr prefix.
[record_info]
PRIMARY_KEY = "recordid"
recordid = {type = "lmwj:Record", prefix = "jr"}
artist = "P175" # performer (P175)
recordtitle = "rdfs:label"
label = "P264" # record label (P264)
recordbib = ""
mbzid = ""
# Written as an inline table, like the other pred/datatype mappings in this file.
releasedate = {pred = "P577", datatype = "xsd:gYear"} # publication date (P577)

# Mapping for the solo_info table. `melid` is the primary key; it is typed
# lmwj:Solo and expanded with the js prefix.
# NOTE(review): columns mapped to "" are presumably not emitted as triples —
# confirm against the rdfconv documentation.
[solo_info]
PRIMARY_KEY = "melid"
melid = {type = "lmwj:Solo", prefix = "js"}
# Links to the other wjazzd entities; values are expanded with the given prefix.
trackid = {pred = "P361", prefix = "jt"} # part of (P361)
compid = {pred = "P2550", prefix = "jc"} # recording or performance of (P2550)
recordid = {prefix = "jr"}
performer = "P175" # performer (P175)
title = "rdfs:label"
titleaddon = ""
solopart = ""
instrument = "P870" # instrumentation (P870)
# The style column holds jazz styles reconciled as music genres, so it is
# stated with genre (P136); manufacturer (P176) does not describe a solo.
style = "P136" # genre (P136)
avgtempo = "P1725" # beats per minute (P1725)
tempoclass = ""
rhythmfeel = ""
key = "P826" # tonality (P826)
signature = "P3440" # time signature (P3440)
chorus_count = ""

# Mapping for the track_info table. `trackid` is the primary key; it is typed
# lmwj:Track and expanded with the jt prefix.
# NOTE(review): columns mapped to "" are presumably not emitted as triples —
# confirm against the rdfconv documentation.
[track_info]
PRIMARY_KEY = "trackid"
trackid = {type = "lmwj:Track", prefix = "jt"}
filename_track = "rdfs:label"
recordid = {pred = "P361", prefix = "jr"} # part of (P361)
lineup = "P175" # performer (P175)
# `instrument` is not an original column: the reconciliation procedure derives
# it from the parenthesised abbreviations in `lineup`.
instrument = "P870" # instrumentation (P870)
mbzid = "P4404" # MusicBrainz recording ID (P4404)
trackno = ""
recordingdate = {pred = "P10135", datatype = "xsd:date"} # recording date (P10135)
compid = {pred = "P2550", prefix = "jc"} # recording or performance of (P2550)
4 changes: 2 additions & 2 deletions shared/rdf_conversion/using_rdfconv_script.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ The steps below must be completed before running the general RDF conversion scri
- Run the following command to create a configuration file

```bash
python -m rdfconv.tomlgen <path to csv folder> --output <config output path>
python -m rdfconv.tomlgen --input <path to csv folder> --output <config output path>
```

In the case of The Session, the command looks like:

```bash
python -m rdfconv.tomlgen ../thesession/data/reconciled --output rdf_config/thesession.toml
python -m rdfconv.tomlgen --input ../thesession/data/reconciled --output rdf_config/thesession.toml
```

- A new TOML configuration will be created at your select output path.
Expand Down
1 change: 1 addition & 0 deletions wjazzd/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data
48 changes: 48 additions & 0 deletions wjazzd/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# About Weimar Jazz Database
The Weimar Jazz Database was created as part of the Jazzomat project. Much of the information can be found on the [official website](https://jazzomat.hfm-weimar.de/dbformat/dboverview.html).

For an even more in depth dive into the project, you can read chapters in the [following book](https://d1wqtxts1xzle7.cloudfront.net/55243585/inside_the_jazzomat_final_rev_oa4-libre.pdf?1512809734=&response-content-disposition=inline%3B+filename%3DInside_the_Jazzomat_New_Perspectives_for.pdf&Expires=1767825454&Signature=GuXygFuslUrc9TcEqJTsp-NZWtGMvTtDvm8-4uvCqWHFW5Fd2OXsNfHIwj6Y1PN4wGxoWO2ielG8fTfp2ZX9viXent09q7LTbipArwkMq0J~U6nfwg8DNakUtaG5i902N5Mc3Pq5jpjOFjFCt5yKVvOZxj0QV2Nap1c84YcV3aj1kZ7WPJY4iKRcGZwasLaWUqn0WJIEj3fne0DfZ5G~ygytq3ySiyJhH726cwSO4yRuocTuq80BXfMH1xoc6ZqzOcamy2~xwr3EOQw0oWt0ytvq7yr6J2hNBNhYRGmLT7ggOcPVZIrE0D5B3CStzZgA~dMWcBrWGva22c4Dz4WNaA__&Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA), starting on page 19.

The official abbreviation of `Weimar Jazz Database` is `WjazzD` (occasionally also written `WJD`). We use the lowercase string `wjazzd` as our naming convention.

[Jazztube](http://mir.audiolabs.uni-erlangen.de/jazztube/) is a related project by the team behind the Weimar Jazz Database, aiming to help visualize `WjazzD`. Since [Jazztube](http://mir.audiolabs.uni-erlangen.de/jazztube/) webpages are more informative, our URI will point to `http://mir.audiolabs.uni-erlangen.de/jazztube/` instead of `https://jazzomat.hfm-weimar.de/`.

# How to Obtain The Database
The database can be downloaded at the [official download page](https://jazzomat.hfm-weimar.de/download/download.html) in the form of a SQLite3 database.

# Ingestion Workflow
- Change Directory to Repository Root
- Obtain a copy of the Weimar Jazz Database SQLite file and store it at the path `./wjazzd/data/sql/wjazzd.db` (relative to the repository root)
- Install `sqlite3` if not done already
```bash
sudo apt install sqlite3 # Or 'brew install sqlite' on macOS
```
- Export all tables of SQLite file to CSV
```bash
# Dump every table of the SQLite database to ./wjazzd/data/raw/<table>.csv.
mkdir -p ./wjazzd/data/raw && \
for t in $(sqlite3 ./wjazzd/data/sql/wjazzd.db ".tables"); do
    echo "Exporting $t"
    # Quote the table name as an SQL identifier and quote the output path so
    # that unusual table names cannot break the query or the redirect.
    sqlite3 -header -csv ./wjazzd/data/sql/wjazzd.db "SELECT * FROM \"$t\";" \
        > "./wjazzd/data/raw/$t.csv"
done
```
- Copy relevant CSV to a separate `data/processed` folder (some CSVs, like `melody.csv`, are not worth being converted to Linked Data form)
```bash
mkdir -p wjazzd/data/processed && cp wjazzd/data/raw/{composition_info.csv,record_info.csv,solo_info.csv,track_info.csv,transcription_info.csv} wjazzd/data/processed/
```
- Reconcile processed CSV using OpenRefine: refer to [reconciliation guideline](./doc/reconciliation_procedures.md)
- After reconciliation, review `shared/rdf_config/wjazzd.toml` to make sure that it matches your reconciled CSV. For more information on how the General RDF Conversion script works, please consult [its documentation](../shared/rdf_conversion/using_rdfconv_script.md)
- After having reviewed the TOML file, run the general rdf conversion script using the following command from the `/shared` directory:
```bash
python -m rdfconv.convert rdf_config/wjazzd.toml
```

# Content of the Database
The [official database homepage](https://jazzomat.hfm-weimar.de/dbformat/dboverview.html) provides a comprehensive overview of each table and field in the database. Below is a quick overview of the entities that are ingested into the LinkedMusic Datalake.

## Ingested Entity Types
- solo: a section in a recorded song where a musician is soloing. A solo is part of a song
- track: a song. A track is part of a record (i.e. album) and contains one or more solos
- record: an album. A record contains tracks.
- composition: the jazz composition underlying a solo or a track. Both a solo and the track containing it are linked to the composition.

207 changes: 207 additions & 0 deletions wjazzd/doc/reconciliation_procedures.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# Weimar Jazz Database OpenRefine Reconciliation Procedures

This guide covers the steps to take to clean and reconcile the Weimar Jazz Database in OpenRefine. More specifically, the following CSV files will be reconciled before being converted to RDF:

- `solo_info.csv`
- `composition_info.csv`
- `record_info.csv`
- `track_info.csv`

The JSON files located in `wjazzd/openrefine` can be used to automatically apply the procedures detailed below.

## solo_info.csv

- The performer column must be reconciled to the type `human (Q5)`. Some musicians need to be manually selected amongst matching people with similar names
- The `instrument` column must be expanded using the dictionary below, before being reconciled:

```json
{
"as": "Alto Saxophone",
"bcl": "Bass Clarinet",
"bs": "Bass Saxophone",
"cl": "Clarinet",
"cor": "Cornet",
"g": "Guitar",
"p": "Piano",
"ss": "Soprano Saxophone",
"tb": "Trombone",
"tp": "Trumpet",
"ts": "Tenor Saxophone",
"ts-c": "C Melody Saxophone",
"vib": "Vibraphone"
}
```

- The `style` column should have `" jazz"` appended to each value, and then be reconciled
- The `key` column should be processed using the following Jython expression, then reconciled; strings like "C-dor" will be left unreconciled

```python
if value.endswith("maj"):
return value[:-3] + " major"
elif value.endswith("min"):
return value[:-3] + " minor"
else:
return value
```

- The `signature` column should be reconciled with time signatures

## composition_info.csv

- The ids in `compid` are unfortunately slightly misaligned with the ones in Jazztube, which the URI should reference. Run the following Python command to align the ids:
```python
num = int(value)
if num >= 156:
num += 1

# Step 2
if num >= 276:
num += 1

# Step 3
if num >= 281:
num += 1

return str(num)
```
- The column `genre` has two possible values: `"Original"` and `Great American Songbook`. The former should be deleted (it will not be stored); the latter should be reconciled to `Great American Songbook (Q1151397)`.
- In the column `template`, the value `blues` should be expanded to `twelve-bar blues`, the column should then be reconciled
- For the column `composer`, do the following steps:
1. Split multi-valued cell at `,` (e.g. `Parker, Gillespie` should be split in two)
2. Split multi-valued cell at `/` (e.g. `Carmichael/Parish` should be split in two)
3. Trim leading and trailing whitespace
4. Create a separate `jazz musician` column, filled entirely with the value `"jazz musician"`. Reconcile the `composer` column using `jazz musician` as the `occupation (P106)`: this should improve the accuracy of reconciliation. Delete the `jazz musician` column after reconciliation.
5. After the first reconciliation, you will do a second reconciliation for `composer` that have been unmatched. This time, you should create a column filled with the value `"composer"`
6. Repeat the same process for the profession `"songwriter"`

## record_info.csv

- The column `artist` must have its multi-valued cells split at `/`, and then reconciled
- The column `label` must have its multi-valued cells split at `/`, and then reconciled. This reconciliation requires slightly more manual verification.

## track_info.csv

- For the column `lineup` (and `instrument` column, which we will create from it), do the following steps:

1. Split multi-valued cell in the column `lineup` at `;` (e.g. `Art Pepper (as, cl); Charles Haden (b)` should be split in two)
2. Create a new column `instrument` based on the `lineup` column. Use the following GREL regex: `value.match(/.*\(([^)]+)\).*/)[0]`
3. Split multi-valued cell in the column `instrument` at `,` (e.g. `"as, cl"` should be split in two)
4. Delete the parenthesized instrument list from the `lineup` column (e.g. `Charles Haden (b)` becomes `Charles Haden`). Use the following GREL regex: `value.replace(/\s*\(.*\)\s*/, "")`
5. Trim whitespace for both columns
6. Expand the column `instrument` using the following dictionary

```python
# Expansion table for the instrument abbreviations found in the `lineup`
# column. Keys are case-sensitive: "b"/"B" and "Vc" appear exactly as written.
jazz_instruments = {
    # NOTE(review): "arr" presumably marks an arranger rather than an
    # instrument, hence the empty expansion — confirm.
    "arr": "",
    "as": "Alto Saxophone",
    "b": "Bass",
    "B": "Bass",
    "bc": "Bass Clarinet",
    "bcl": "Bass Clarinet",
    # "bgo" is the usual discography abbreviation for bongos.
    "bgo": "Bongos",
    "bjo": "Banjo",
    "bs": "Baritone Saxophone",
    "cga": "Congas",
    "cl": "Clarinet",
    "cn": "Conga",
    "cor": "Cornet",
    "dr": "Drums",
    "eb": "Electric Bass",
    "electric p": "Electric Piano",
    "fl": "Flute",
    "flgn": "Flugelhorn",
    "g": "Guitar",
    "git": "Guitar",
    "hca": "Harmonica",
    "key": "Keyboard",
    "p": "Piano",
    "p-tp": "Piccolo Trumpet",
    "perc": "Percussion",
    "rhodes": "Rhodes Piano (Electric Piano)",
    "ss": "Soprano Saxophone",
    "synth": "Synthesizer",
    "tb": "Trombone",
    "tp": "Trumpet",
    "trp": "Trumpet",
    "ts": "Tenor Saxophone",
    # Same expansion as in the solo_info.csv instrument dictionary.
    "ts-c": "C Melody Saxophone",
    "Vc": "Cello",
    "vcl": "Vocals",
    "vib": "Vibraphone",
    "voc": "Vocals",
}
```

7. Reconcile both the `instrument` and `lineup` column

- The `recordingdate` column has many entries that need to be cleaned up:
Comment thread
SCN-MNG marked this conversation as resolved.

1. Apply the following Jython command to clean up most of the badly formatted cells:

```python
import re

def extract_date(value):
    """Rewrite a value ending in a `D.M.YYYY` date as ISO `YYYY-MM-DD`.

    Day and month may have one or two digits and may be surrounded by
    whitespace around the dots. Only the trailing date is kept; anything
    before it is discarded. Values that do not end in such a date are
    returned unchanged.
    """
    found = re.search(r"(\d{1,2})\s*\.\s*(\d{1,2})\s*\.\s*(\d{4})$", value)
    if not found:
        return value

    day, month, year = found.groups()
    # Zero-pad single-digit day/month so the result is a valid xsd:date.
    return "-".join([year, month.zfill(2), day.zfill(2)])

return extract_date(value)
```

2. Apply the following Jython command to clean up a few remaining cells in the format of `January, 1999`

```python
import re

def extract_date(value):
    """Rewrite a value ending in `<month name>[,] <year>` as `YYYY-MM-01`.

    The month name is matched case-insensitively, and an optional comma
    between month and year is accepted (e.g. `January, 1999`). The day is
    always set to "01". Values that do not end in this pattern, or whose
    word before the year is not an English month name, are returned
    unchanged.
    """
    month_numbers = {
        "january": "01",
        "february": "02",
        "march": "03",
        "april": "04",
        "may": "05",
        "june": "06",
        "july": "07",
        "august": "08",
        "september": "09",
        "october": "10",
        "november": "11",
        "december": "12",
    }

    # ",?" lets the documented "January, 1999" form match.
    found = re.search(r"([A-Za-z]+)\s*,?\s*(\d{4})$", value)
    if not found:
        return value

    # An unknown word (e.g. "Spring 1960") leaves the cell untouched instead
    # of raising KeyError.
    month = month_numbers.get(found.group(1).lower())
    if month is None:
        return value

    return found.group(2) + "-" + month + "-01"

return extract_date(value)
```

3. Apply the following command to expand years (e.g. `1999`) to a date (e.g. `1999-01-01`)

```python
import re

def extract_date(value):
    """Expand a bare four-digit year (e.g. `1999`) to `YYYY-01-01`.

    Only values consisting of exactly four digits are expanded. Anything
    else, including dates that are already in `YYYY-MM-DD` form, is
    returned unchanged.
    """
    # The "$" anchor keeps full dates such as "1999-05-12" from being
    # truncated to their year; the capture group is what group(1) reads.
    found = re.match(r"(\d{4})$", value)

    if found:
        return found.group(1) + "-01-01"
    else:
        return value

return extract_date(value)
```

Loading