Source code for mirdata.datasets.jtd

"""Jazz Trio Database (JTD) Loader

.. admonition:: Dataset Info
    :class: dropdown

    The Jazz Trio Database (JTD) is a dataset comprising 1,294 multitrack jazz performances (about 45 hours total)
    annotated by an automated signal processing pipeline. All performances are commercial recordings of jazz piano
    trios, comprising acoustic piano, upright bass, and drum kit, and are broadly in the "straight-ahead" jazz style.

    Its purpose is to serve as a reference database for the design, evaluation, and implementation of various music
    information retrieval systems related to jazz and improvised music, including (but not limited to) onset detection,
    beat tracking, automatic music transcription, and performer identification.

    For every performance, the following audio files are included:

    1) the "raw" audio from the piano solo, typically including piano, bass, and drums (stereo, 44.1 kHz)
        - for some performances, individual audio files for the left and right stereo channels are also included
    2) unmixed piano audio obtained by applying a music source separation model to the "raw" audio
    3) unmixed bass audio
    4) unmixed drums audio

    For the "raw" audio, there are the following annotations:

    1) Beat timestamps for the start of each quarter note
    2) Downbeat annotations for the start of each bar

    For the three "unmixed" audio files, there are the following annotations:

    1) MIDI transcription (frame-level, currently piano only)
    2) Onset timestamps
    3) Beat-matched onsets

    To "match" onsets in the unmixed audio and beats in the "raw" audio, a window of -32nd/+16th note is applied to
    every beat timestamp, and the nearest onset from every unmixed audio file is taken as the "match". In cases where
    no onsets are contained inside the window, the beat is set to "missing" in the data, such that the number of
    beat-matched onsets is always the same as the number of beats.

    Finally, there are the following piece-level annotations:

    1) Tempo, in quarter-note beats-per-minute
    2) Time signature (either three or four quarter-note beats)
    3) Timestamps for the duration of the piano solo within the performance
    4) Metadata (e.g., recording year, performer names, album title)

    The JTD was created by researchers at the Centre for Music & Science, University of Cambridge, as part of Huw
    Cheston's PhD research, during the period 2023-2024.

    The audio data is not publicly available and access must be requested on Zenodo. The annotations and metadata are
    freely available. The database is made available for research and educational purposes under the MIT license
    (https://github.com/HuwCheston/Jazz-Trio-Database/blob/main/LICENSE).

    For more details, please visit our GitHub repository (https://github.com/HuwCheston/Jazz-Trio-Database/) or
    our TISMIR publication (https://doi.org/10.5334/tismir.186).

"""

import csv
import json
from typing import BinaryIO, Optional, TextIO, Tuple
from smart_open import open

import librosa
import numpy as np

from mirdata import download_utils, core, annotations, io

BIBTEX = """
@article{jazz-trio-database
    title = {Jazz Trio Database: Automated Annotation of Jazz Piano Trio Recordings Processed Using Audio Source Separation},
    url = {https://doi.org/10.5334/tismir.186},
    doi = {10.5334/tismir.186},
    publisher = {Transactions of the International Society for Music Information Retrieval},
    author = {Cheston, Huw and Schlichting, Joshua L and Cross, Ian and Harrison, Peter M C},
    year = {2024},
}
"""

INDEXES = {
    "default": "2.0",
    "test": "sample",
    "2.0": core.Index(
        filename="jtd_index_2.0.json",
        url="https://zenodo.org/records/14546790/files/jtd_index_2.0.json?download=1",
        checksum="fd31d02762fecadfd4615c3fdb41e225",
    ),
    "sample": core.Index(filename="jtd_index_2.0_sample.json"),
}

REMOTES = {
    "annotations": download_utils.RemoteFileMetadata(
        filename="annotation.zip",
        url="https://github.com/HuwCheston/Jazz-Trio-Database/releases/download/v02-zenodo/jazz-trio-database-v02.zip",
        checksum="43f543fb286c6222ae1f52bcf7561f37",
        destination_dir="annotations",
        unpack_directories=[
            "jazz-trio-database-v02"
        ],  # removes a redundant extra subdirectory
    )
}

DOWNLOAD_INFO = """
To download the audio for files for JTD, visit: https://zenodo.org/records/13828030 and request access.

After you've been granted access, press the "Download all" button on the Zenodo record.

This will create a new file named files-archive (with no extension). Rename the file to files-archive.zip and extract 
using any unzipping tool (7zip, WinRAR, the unarchiver) or the command line. This will give you a list of multi-part 
zip files in the form [processed.zip.001, processed.zip.002, ...] and [raw.zip.001, raw.zip.002, ...]. 

To extract these, use 7zip from the command line:

```
7z x processed.zip.001
7z x raw.zip.001
```

Note that the default `unzip` command on Linux can't handle these files, so you'll need to use 7zip. You may also be 
able to use a GUI tool like WinRAR, which was used to create the archive in the first place. 

These commands will extract the audio to the current folder. You'll then need to move the results to {0}/processed and 
{0}/raw, respectively, creating these folders if they don't already exist.

Combined with the annotation files (which can be obtained by calling `.download()` on the `mirdata.Dataset` instance 
you've just initialized), the end result should be a file structure that looks like:

```
{0}
├─ raw
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067.wav    # one to three audio files per performance
│  ├─ ...
├─ processed
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067_piano.wav     # always three audio files per performance
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067_bass.wav
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067_drums.wav
│  ├─ ...
├─ annotations
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067    # one folder per performance
│  │  ├─ bass_onsets.csv
│  │  ├─ beats.csv
│  │  ├─ ...
│  ├─ barronk-beautifullove-mrazgrileyb-2009-c87abfa6
│  ├─ ...
```

"""

LICENSE_INFO = """

The MIT License (MIT)
Copyright (c) 2023, Huw Cheston

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 
persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 
Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""


[docs] class Track(core.Track): """JTD track class Args: track_id (str): track id of the track Attributes: audio_path (str): path to audio file onsets_path (str): path to onsets file midi_path (str): path to MIDI file beats_path (str): path to beats file instrument (str): name of the instrument for this track, either "piano", "bass", or "drums" Properties: audio(tuple): audio signal and sample rate for the isolated instrument track of this performance musician (str): name of the musician playing the `instrument` on this track Cached Properties: beats (BeatData): beat times for this instrument onsets (EventData): onset and offset times midi (NoteData): midi pitches, onset, offset times, and velocities """ def __init__(self, track_id, data_home, dataset_name, index, metadata): super().__init__( track_id, data_home, dataset_name=dataset_name, index=index, metadata=lambda: json.load(open(self.get_path("metadata"), "r")), ) self.audio_path = self.get_path("audio") self.onsets_path = self.get_path("onsets") self.midi_path = self.get_path("midi") self.beats_path = self.get_path("beats") self.instrument = self.track_id.split("_")[1] @property def audio(self) -> Optional[Tuple[np.ndarray, float]]: """The source-separated audio for this instrument Returns: * np.ndarray - audio signal * float - sample rate """ return load_audio(self.audio_path) @core.cached_property def beats(self) -> Optional[annotations.BeatData]: """The times of onsets by this musician matched to the nearest quarter-note beat timestamp Returns: * annotations.BeatData - timestamp, beat number (1-indexed to bar) """ # This maps instrument names onto columns of the CSV file column_mapping = {"piano": 1, "bass": 2, "drums": 3} instrument = self.instrument return load_beats(self.beats_path, column_mapping[instrument]) @core.cached_property def midi(self) -> Optional[annotations.NoteData]: """The MIDI for this instrument Returns: * annotations.NoteData """ if self.midi_path is None: return None return io.load_notes_from_midi(self.midi_path) # returns None if no MIDI @property def musician(self) -> str: """The name of the musician playing on this track Returns: * str - name of musician """ # The `musicians` dictionary has a different mapping to the `instruments` one instruments_and_roles = { "piano": "pianist", "bass": "bassist", "drums": "drummer", } # This maps e.g. "piano" -> "pianist", "bass" -> "bassist current_role = instruments_and_roles[self.instrument] return self._track_metadata["musicians"][current_role] @core.cached_property def onsets(self) -> Optional[annotations.EventData]: """The onsets for this instrument Returns: * annotations.EventData """ return load_onsets(self.onsets_path)
[docs] class MultiTrack(core.MultiTrack): """JTD multitrack class Args: mtrack_id (str): multitrack id data_home (str): Local path where the dataset is stored. If `None`, looks for the data in the default directory, `~/mir_datasets/jtd` Attributes: album (str): The name of the album that this performance was taken from. audio (Tuple[np.ndarray, float]): The track's audio, center channel. audio_lchan (Tuple[np.ndarray, float]): The track's audio, left channel (if available). audio_rchan (Tuple[np.ndarray, float]): The track's audio, right channel (if available). bandleader (str): The full name of the bandleader who led the recording session. bass (Track): The associated bass track for this recording. drums (Track): The associated drums track for this recording. duration (float): The duration of the piano solo in seconds. jtd_300 (bool): Whether the track is contained in the smaller JTD-300 subset of 300 recordings. mtrack_id (str): track id musicbrainz_id (str): The MusicBrainz ID for the recording. name (str): The track's name. piano (Track): The associated piano track for this recording. start (int): The start of the piano solo relative to the full recording (in seconds). stop (int): The end of the piano solo relative to the full recording (in seconds). tempo (float): The average tempo of the track in beats per minute. time_signature (int): The time signature of the recording (3 or 4 quarter-note beats). tracks (dict): Dictionary of track IDs and `Track` instances year (int): The year the recording was made. Cached Properties: beats (annotations.BeatData): The times of quarter-note beats for the recording. """ def __init__( self, mtrack_id, data_home, dataset_name, index, track_class, metadata ): super().__init__( mtrack_id=mtrack_id, data_home=data_home, dataset_name=dataset_name, index=index, track_class=track_class, metadata=lambda: json.load(open(self.get_path("metadata"), "r")), ) self.audio_path = self.get_path("audio") self.audio_lchan_path = self.get_path("audio-lchan") self.audio_rchan_path = self.get_path("audio-rchan") self.beats_path = self.get_path("beats") @property def album(self) -> str: """The name of the album that this performance was taken from Returns: * str - name of the album """ return self._multitrack_metadata["album_name"] @property def audio(self) -> Optional[Tuple[np.ndarray, float]]: """The track's audio, center channel Returns: * np.ndarray - audio signal * float - sample rate """ return load_audio(self.audio_path) @property def audio_lchan(self) -> Optional[Tuple[np.ndarray, float]]: """The track's audio, left channel (not present for all tracks) Returns: * np.ndarray - audio signal * float - sample rate """ return load_audio(self.audio_lchan_path) @property def audio_rchan(self) -> Optional[Tuple[np.ndarray, float]]: """The track's audio, right channel (not present for all tracks) Returns: * np.ndarray - audio signal * float - sample rate """ return load_audio(self.audio_rchan_path) @property def bandleader(self) -> str: """The full name of the bandleader who led the recording session Returns: * str - name of the bandleader """ return self._multitrack_metadata["bandleader"] @property def bass(self) -> Track: """The associated bass track for this recording Returns: * Track """ return self.tracks[self.mtrack_id + "_bass"] @core.cached_property def beats(self) -> Optional[annotations.BeatData]: """The times of quarter-note beats for the recording Returns: * annotations.BeatData - timestamp, beat number (1-indexed to bar) """ return load_beats(self.beats_path, 0) @property def drums(self) -> Track: """The associated drums track for this recording Returns: * Track """ return self.tracks[self.mtrack_id + "_drums"] @property def duration(self) -> int: """The duration of the piano solo Returns: * float - solo duration (in seconds) """ start = self._multitrack_metadata["timestamps"]["start"] stop = self._multitrack_metadata["timestamps"]["end"] return timestamp_to_seconds(stop) - timestamp_to_seconds(start) @property def jtd_300(self) -> bool: """Whether the track is contained in the smaller JTD-300 subset of 300 recordings Returns: * bool - True if contained in JTD-300, otherwise false """ return self._multitrack_metadata["in_30_corpus"] @property def musicbrainz_id(self) -> str: """The MusicBrainz ID for the recording Returns: * str - musicbrainz ID """ return self._multitrack_metadata["mbz_id"] @property def name(self) -> str: """The track's name Returns: * str - track name """ return self._multitrack_metadata["track_name"] @property def piano(self) -> Track: """The associated piano track for this recording Returns: * Track """ return self.tracks[self.mtrack_id + "_piano"] @property def start(self) -> int: """The start of the piano solo relative to the full recording Returns: * int - start of performance, in seconds """ return timestamp_to_seconds(self._multitrack_metadata["timestamps"]["start"]) @property def stop(self) -> int: """The end of the piano solo relative to the full recording Returns: * int - end of performance, in seconds """ return timestamp_to_seconds(self._multitrack_metadata["timestamps"]["end"]) @property def tempo(self) -> float: """The average tempo of the track Returns: * float - the tempo, in beats-per-minute """ return float(self._multitrack_metadata["tempo"]) @property def time_signature(self) -> int: """The time signature of the recording, either 3 or 4 quarter-note beats Returns: * int - time signature """ return int(self._multitrack_metadata["time_signature"]) @property def track_audio_property(self): return "audio" @property def year(self) -> Optional[int]: """The year the recording was made Returns: * int - recording year """ return int(self._multitrack_metadata["recording_year"])
[docs] def timestamp_to_seconds(ts: str) -> int: """Coerces timestamp in form `%M:%S` or `%H:%M:S` to an integer""" # Timestamp is in format hours-minutes-seconds if ts.count(":") == 2: hours, minutes, seconds = map(int, ts.split(":")) return int((hours * 60 * 60) + (minutes * 60) + seconds) # Timestamp is in format minutes-seconds elif ts.count(":") == 1: minutes, seconds = map(int, ts.split(":")) return int((minutes * 60) + seconds) # Timestamp is in incorrect format else: raise ValueError("Timestamp must be in format %H:%M:%S or %M:%S")
[docs] @io.coerce_to_bytes_io def load_audio(fhandle: BinaryIO) -> Tuple[np.ndarray, float]: """Load a JTD audio file. Args: fhandle (str or file-like): path or file-like object pointing to an audio file Returns: * np.ndarray - the audio signal * float - The sample rate of the audio file """ return librosa.load(fhandle, sr=44100, mono=True)
[docs] @io.coerce_to_string_io def load_onsets(fhandle: TextIO) -> Optional[annotations.EventData]: """Load a JTD onset file. Args: fhandle (str or file-like): path or file-like object pointing to an onset csv file Returns: * annotations.EventData - the onset data """ reader: list = list(csv.reader(fhandle)) # Flatten list and evaluate items as floats reader = [float(x) for xs in reader for x in xs] intervals = [] annotation = [] # Iterate over successive onset times for line_num, (line1, line2) in enumerate(zip(reader, reader[1:])): # Creates a list of (onset1, onset2), (onset2, onset3), ... (i.e., onset and offset times) intervals.append([line1, line2]) # This is just the count of onsets, 0-indexed annotation.append(str(line_num)) # Needs to be an array to pass validation intervals_arr = np.array(intervals) return annotations.EventData(intervals_arr, "s", annotation, "open")
[docs] @io.coerce_to_string_io def load_beats(fhandle: TextIO, col_idx: int) -> Optional[annotations.BeatData]: """Load a JTD beat file. Args: fhandle (str or file-like): path or file-like object pointing to a beat csv file col_idx (int, optional): index of the column to use (0=overall, 1=piano, 2=bass, 3=drums), defaults to 0 Returns: * annotations.BeatData - the beat data """ reader = csv.reader(fhandle) reader.__next__() # The first line of the CSV is always a header so we can just skip it timestamps, positions = [], [] # Iterating over each line of the CSV file (i.e., each 'beat') for beat_number, beat, piano, bass, drums, metre in reader: # Get the required data from the row desired_data = [beat, piano, bass, drums][col_idx] # Coerce empty strings to NaN values if desired_data == "": desired_data_fmt = np.nan else: desired_data_fmt = float(desired_data) # Append everything to the list with the required datatypes timestamps.append(desired_data_fmt) positions.append(int(float(metre))) # coerce string to float and then to int return annotations.BeatData( np.array(timestamps), "s", np.array(positions), "bar_index" )
[docs] @core.docstring_inherit(core.Dataset) class Dataset(core.Dataset): """ The Jazz Trio Database. """ def __init__(self, data_home=None, version="default"): super().__init__( data_home, version, name="jtd", track_class=Track, multitrack_class=MultiTrack, bibtex=BIBTEX, indexes=INDEXES, remotes=REMOTES, download_info=DOWNLOAD_INFO, license_info=LICENSE_INFO, )