Source code for mirdata.datasets.jtd

"""Jazz Trio Database (JTD) Loader

.. admonition:: Dataset Info
    :class: dropdown

    The Jazz Trio Database (JTD) is a dataset comprising 1,294 multitrack jazz performances (about 45 hours total)
    annotated by an automated signal processing pipeline. All performances are commercial recordings of jazz piano
    trios, comprising acoustic piano, upright bass, and drum kit, and are broadly in the "straight-ahead" jazz style.

    Its purpose is to serve as a reference database for the design, evaluation, and implementation of various music
    information retrieval systems related to jazz and improvised music, including (but not limited to) onset detection,
    beat tracking, automatic music transcription, and performer identification.

    For every performance, the following audio files are included:

    1) the "raw" audio from the piano solo, typically including piano, bass, and drums (stereo, 44.1 kHz)
        - for some performances, individual audio files for the left and right stereo channels are also included
    2) unmixed piano audio obtained by applying a music source separation model to the "raw" audio
    3) unmixed bass audio
    4) unmixed drums audio

    For the "raw" audio, there are the following annotations:

    1) Beat timestamps for the start of each quarter note
    2) Downbeat annotations for the start of each bar

    For the three "unmixed" audio files, there are the following annotations:

    1) MIDI transcription (frame-level, currently piano only)
    2) Onset timestamps
    3) Beat-matched onsets

    To "match" onsets in the unmixed audio and beats in the "raw" audio, a window of -32nd/+16th note is applied to
    every beat timestamp, and the nearest onset from every unmixed audio file is taken as the "match". In cases where
    no onsets are contained inside the window, the beat is set to "missing" in the data, such that the number of
    beat-matched onsets is always the same as the number of beats.

    Finally, there are the following piece-level annotations:

    1) Tempo, in quarter-note beats-per-minute
    2) Time signature (either three or four quarter-note beats)
    3) Timestamps for the duration of the piano solo within the performance
    4) Metadata (e.g., recording year, performer names, album title)

    The JTD was created by researchers at the Centre for Music & Science, University of Cambridge, as part of Huw
    Cheston's PhD research, during the period 2023-2024.

    The audio data is not publicly available and access must be requested on Zenodo. The annotations and metadata are
    freely available. The database is made available for research and educational purposes under the MIT license
    (https://github.com/HuwCheston/Jazz-Trio-Database/blob/main/LICENSE).

    For more details, please visit our GitHub repository (https://github.com/HuwCheston/Jazz-Trio-Database/) or
    our TISMIR publication (https://doi.org/10.5334/tismir.186).

"""

import csv
import json
from typing import BinaryIO, Optional, TextIO, Tuple
from smart_open import open

import librosa
import numpy as np

from mirdata import download_utils, core, annotations, io

# BibTeX citation for the TISMIR article describing the dataset.
# The citation key must be followed by a comma for the entry to be valid BibTeX.
BIBTEX = """
@article{jazz-trio-database,
    title = {Jazz Trio Database: Automated Annotation of Jazz Piano Trio Recordings Processed Using Audio Source Separation},
    url = {https://doi.org/10.5334/tismir.186},
    doi = {10.5334/tismir.186},
    publisher = {Transactions of the International Society for Music Information Retrieval},
    author = {Cheston, Huw and Schlichting, Joshua L and Cross, Ian and Harrison, Peter M C},
    year = {2024},
}
"""

# Available dataset indexes, keyed by version name.
# The "default" and "test" entries hold the *name* of another key in this mapping
# ("2.0" and "sample" respectively) rather than an index object.
INDEXES = {
    "default": "2.0",
    "test": "sample",
    # Full index, fetched from Zenodo and verified against the checksum below
    "2.0": core.Index(
        filename="jtd_index_2.0.json",
        url="https://zenodo.org/records/14546790/files/jtd_index_2.0.json?download=1",
        checksum="fd31d02762fecadfd4615c3fdb41e225",
    ),
    # No URL or checksum is given for the sample index
    # NOTE(review): presumably this file ships with the package for testing -- confirm
    "sample": core.Index(filename="jtd_index_2.0_sample.json"),
}

# Remote resources that can be downloaded automatically. Only the annotations are
# listed here; the audio has to be requested separately (see DOWNLOAD_INFO).
REMOTES = {
    "annotations": download_utils.RemoteFileMetadata(
        filename="annotation.zip",
        url="https://github.com/HuwCheston/Jazz-Trio-Database/releases/download/v02-zenodo/jazz-trio-database-v02.zip",
        checksum="43f543fb286c6222ae1f52bcf7561f37",
        destination_dir="annotations",
        unpack_directories=[
            "jazz-trio-database-v02"
        ],  # removes a redundant extra subdirectory
    )
}

# Manual download instructions for the access-restricted audio files.
# `{0}` is a `str.format` placeholder for the dataset root directory.
# NOTE(review): presumably filled with `data_home` by mirdata's downloader -- confirm.
DOWNLOAD_INFO = """
To download the audio files for JTD, visit: https://zenodo.org/records/13828030 and request access.

After you've been granted access, press the "Download all" button on the Zenodo record.

This will create a new file named files-archive (with no extension). Rename the file to files-archive.zip and extract 
using any unzipping tool (7zip, WinRAR, the unarchiver) or the command line. This will give you a list of multi-part 
zip files in the form [processed.zip.001, processed.zip.002, ...] and [raw.zip.001, raw.zip.002, ...]. 

To extract these, use 7zip from the command line:

```
7z x processed.zip.001
7z x raw.zip.001
```

Note that the default `unzip` command on Linux can't handle these files, so you'll need to use 7zip. You may also be 
able to use a GUI tool like WinRAR, which was used to create the archive in the first place. 

These commands will extract the audio to the current folder. You'll then need to move the results to {0}/processed and 
{0}/raw, respectively, creating these folders if they don't already exist.

Combined with the annotation files (which can be obtained by calling `.download()` on the `mirdata.Dataset` instance 
you've just initialized), the end result should be a file structure that looks like:

```
{0}
├─ raw
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067.wav    # one to three audio files per performance
│  ├─ ...
├─ processed
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067_piano.wav     # always three audio files per performance
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067_bass.wav
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067_drums.wav
│  ├─ ...
├─ annotations
│  ├─ barronk-allgodschildren-drummondrrileyb-1990-8b77c067    # one folder per performance
│  │  ├─ bass_onsets.csv
│  │  ├─ beats.csv
│  │  ├─ ...
│  ├─ barronk-beautifullove-mrazgrileyb-2009-c87abfa6
│  ├─ ...
```

"""

# Full text of the MIT license under which the database is released.
# Passed to `core.Dataset` as `license_info` by the `Dataset` class in this module.
LICENSE_INFO = """

The MIT License (MIT)
Copyright (c) 2023, Huw Cheston

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 
persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 
Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""



[docs]
class Track(core.Track):
    """JTD track class

    A track is the source-separated audio of a single instrument (piano, bass, or
    drums) from one performance, together with its annotations.

    Args:
        track_id (str): track id of the track, in the form `<mtrack_id>_<instrument>`
        data_home (str): path where the dataset is stored
        dataset_name (str): name of the dataset
        index (dict): the dataset index
        metadata: accepted for interface compatibility but not used; the metadata is
            instead read lazily from the JSON file at this track's "metadata" path

    Attributes:
        audio_path (str): path to audio file
        onsets_path (str): path to onsets file
        midi_path (str): path to MIDI file
        beats_path (str): path to beats file
        instrument (str): name of the instrument for this track, either "piano", "bass", or "drums"

    Properties:
        audio(tuple): audio signal and sample rate for the isolated instrument track of this performance
        musician (str): name of the musician playing the `instrument` on this track

    Cached Properties:
        beats (BeatData): beat times for this instrument
        onsets (EventData): onset and offset times
        midi (NoteData): midi pitches, onset, offset times, and velocities

    """

    def __init__(self, track_id, data_home, dataset_name, index, metadata):
        def _read_metadata():
            # Use a context manager so the file handle is closed as soon as the
            # JSON has been parsed, instead of being left for the garbage collector
            with open(self.get_path("metadata"), "r") as fhandle:
                return json.load(fhandle)

        super().__init__(
            track_id,
            data_home,
            dataset_name=dataset_name,
            index=index,
            metadata=_read_metadata,
        )

        self.audio_path = self.get_path("audio")
        self.onsets_path = self.get_path("onsets")
        self.midi_path = self.get_path("midi")
        self.beats_path = self.get_path("beats")
        # The instrument is the suffix after the final underscore of the track id;
        # splitting from the right keeps this correct even if the multitrack part
        # of the id were itself to contain an underscore
        self.instrument = self.track_id.rsplit("_", 1)[1]

    @property
    def audio(self) -> Optional[Tuple[np.ndarray, float]]:
        """The source-separated audio for this instrument

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        return load_audio(self.audio_path)

    @core.cached_property
    def beats(self) -> Optional[annotations.BeatData]:
        """The times of onsets by this musician matched to the nearest quarter-note beat timestamp

        Returns:
            * annotations.BeatData - timestamp, beat number (1-indexed to bar)

        """
        # This maps instrument names onto the column index expected by `load_beats`
        column_mapping = {"piano": 1, "bass": 2, "drums": 3}
        return load_beats(self.beats_path, column_mapping[self.instrument])

    @core.cached_property
    def midi(self) -> Optional[annotations.NoteData]:
        """The MIDI for this instrument

        Returns:
            * annotations.NoteData - the transcription, or None when this track has no MIDI path

        """
        if self.midi_path is None:
            return None
        return io.load_notes_from_midi(self.midi_path)  # returns None if no MIDI

    @property
    def musician(self) -> str:
        """The name of the musician playing on this track

        Returns:
            * str - name of musician

        """
        # The `musicians` dictionary in the metadata is keyed by role, not by instrument
        instruments_and_roles = {
            "piano": "pianist",
            "bass": "bassist",
            "drums": "drummer",
        }
        # This maps e.g. "piano" -> "pianist", "bass" -> "bassist"
        current_role = instruments_and_roles[self.instrument]
        return self._track_metadata["musicians"][current_role]

    @core.cached_property
    def onsets(self) -> Optional[annotations.EventData]:
        """The onsets for this instrument

        Returns:
            * annotations.EventData

        """
        return load_onsets(self.onsets_path)




[docs]
class MultiTrack(core.MultiTrack):
    """JTD multitrack class

    A multitrack is one performance: the "raw" mixed audio, its beat annotations and
    piece-level metadata, plus the three per-instrument `Track` objects.

    Args:
        mtrack_id (str): multitrack id
        data_home (str): Local path where the dataset is stored.
            If `None`, looks for the data in the default directory, `~/mir_datasets/jtd`
        dataset_name (str): name of the dataset
        index (dict): the dataset index
        track_class: class used to build the per-instrument tracks
        metadata: accepted for interface compatibility but not used; the metadata is
            instead read lazily from the JSON file at this multitrack's "metadata" path

    Attributes:
        album (str): The name of the album that this performance was taken from.
        audio (Tuple[np.ndarray, float]): The track's audio, center channel.
        audio_lchan (Tuple[np.ndarray, float]): The track's audio, left channel (if available).
        audio_rchan (Tuple[np.ndarray, float]): The track's audio, right channel (if available).
        bandleader (str): The full name of the bandleader who led the recording session.
        bass (Track): The associated bass track for this recording.
        drums (Track): The associated drums track for this recording.
        duration (int): The duration of the piano solo in whole seconds.
        jtd_300 (bool): Whether the track is contained in the smaller JTD-300 subset of 300 recordings.
        mtrack_id (str): track id
        musicbrainz_id (str): The MusicBrainz ID for the recording.
        name (str): The track's name.
        piano (Track): The associated piano track for this recording.
        start (int): The start of the piano solo relative to the full recording (in seconds).
        stop (int): The end of the piano solo relative to the full recording (in seconds).
        tempo (float): The average tempo of the track in beats per minute.
        time_signature (int): The time signature of the recording (3 or 4 quarter-note beats).
        tracks (dict): Dictionary of track IDs and `Track` instances
        year (int): The year the recording was made.

    Cached Properties:
        beats (annotations.BeatData): The times of quarter-note beats for the recording.

    """

    def __init__(
        self, mtrack_id, data_home, dataset_name, index, track_class, metadata
    ):
        def _read_metadata():
            # Use a context manager so the file handle is closed as soon as the
            # JSON has been parsed, instead of being left for the garbage collector
            with open(self.get_path("metadata"), "r") as fhandle:
                return json.load(fhandle)

        super().__init__(
            mtrack_id=mtrack_id,
            data_home=data_home,
            dataset_name=dataset_name,
            index=index,
            track_class=track_class,
            metadata=_read_metadata,
        )

        self.audio_path = self.get_path("audio")
        self.audio_lchan_path = self.get_path("audio-lchan")
        self.audio_rchan_path = self.get_path("audio-rchan")
        self.beats_path = self.get_path("beats")

    @property
    def album(self) -> str:
        """The name of the album that this performance was taken from

        Returns:
            * str - name of the album

        """
        return self._multitrack_metadata["album_name"]

    @property
    def audio(self) -> Optional[Tuple[np.ndarray, float]]:
        """The track's audio, center channel

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        return load_audio(self.audio_path)

    @property
    def audio_lchan(self) -> Optional[Tuple[np.ndarray, float]]:
        """The track's audio, left channel (not present for all tracks)

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        return load_audio(self.audio_lchan_path)

    @property
    def audio_rchan(self) -> Optional[Tuple[np.ndarray, float]]:
        """The track's audio, right channel (not present for all tracks)

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        return load_audio(self.audio_rchan_path)

    @property
    def bandleader(self) -> str:
        """The full name of the bandleader who led the recording session

        Returns:
            * str - name of the bandleader

        """
        return self._multitrack_metadata["bandleader"]

    @property
    def bass(self) -> Track:
        """The associated bass track for this recording

        Returns:
            * Track

        """
        return self.tracks[self.mtrack_id + "_bass"]

    @core.cached_property
    def beats(self) -> Optional[annotations.BeatData]:
        """The times of quarter-note beats for the recording

        Returns:
            * annotations.BeatData - timestamp, beat number (1-indexed to bar)

        """
        # Column 0 of the beats file is the overall (mixed-audio) beat track
        return load_beats(self.beats_path, 0)

    @property
    def drums(self) -> Track:
        """The associated drums track for this recording

        Returns:
            * Track

        """
        return self.tracks[self.mtrack_id + "_drums"]

    @property
    def duration(self) -> int:
        """The duration of the piano solo

        Computed as the difference between the "end" and "start" timestamps in the
        metadata, each of which is converted to a whole number of seconds.

        Returns:
            * int - solo duration (in seconds)

        """
        start = self._multitrack_metadata["timestamps"]["start"]
        stop = self._multitrack_metadata["timestamps"]["end"]
        return timestamp_to_seconds(stop) - timestamp_to_seconds(start)

    @property
    def jtd_300(self) -> bool:
        """Whether the track is contained in the smaller JTD-300 subset of 300 recordings

        Returns:
            * bool - True if contained in JTD-300, otherwise false

        """
        return self._multitrack_metadata["in_30_corpus"]

    @property
    def musicbrainz_id(self) -> str:
        """The MusicBrainz ID for the recording

        Returns:
            * str - musicbrainz ID

        """
        return self._multitrack_metadata["mbz_id"]

    @property
    def name(self) -> str:
        """The track's name

        Returns:
            * str - track name

        """
        return self._multitrack_metadata["track_name"]

    @property
    def piano(self) -> Track:
        """The associated piano track for this recording

        Returns:
            * Track

        """
        return self.tracks[self.mtrack_id + "_piano"]

    @property
    def start(self) -> int:
        """The start of the piano solo relative to the full recording

        Returns:
            * int - start of performance, in seconds

        """
        return timestamp_to_seconds(self._multitrack_metadata["timestamps"]["start"])

    @property
    def stop(self) -> int:
        """The end of the piano solo relative to the full recording

        Returns:
            * int - end of performance, in seconds

        """
        return timestamp_to_seconds(self._multitrack_metadata["timestamps"]["end"])

    @property
    def tempo(self) -> float:
        """The average tempo of the track

        Returns:
            * float - the tempo, in beats-per-minute

        """
        return float(self._multitrack_metadata["tempo"])

    @property
    def time_signature(self) -> int:
        """The time signature of the recording, either 3 or 4 quarter-note beats

        Returns:
            * int - time signature

        """
        return int(self._multitrack_metadata["time_signature"])

    @property
    def track_audio_property(self):
        """The name of the `Track` attribute that holds each track's audio

        Returns:
            * str - always "audio"

        """
        return "audio"

    @property
    def year(self) -> Optional[int]:
        """The year the recording was made

        Returns:
            * int - recording year

        """
        return int(self._multitrack_metadata["recording_year"])




[docs]
def timestamp_to_seconds(ts: str) -> int:
    """Convert a `%M:%S` or `%H:%M:%S` timestamp string into a whole number of seconds

    Args:
        ts (str): timestamp with either one colon (minutes:seconds) or two colons
            (hours:minutes:seconds); every field must parse as an integer

    Returns:
        * int - total number of seconds

    Raises:
        ValueError: if the timestamp has neither one nor two colons, or if a field
            is not an integer

    """
    fields = ts.split(":")
    # Three fields: hours, minutes, seconds
    if len(fields) == 3:
        hrs, mins, secs = (int(field) for field in fields)
        return hrs * 3600 + mins * 60 + secs
    # Two fields: minutes, seconds
    if len(fields) == 2:
        mins, secs = (int(field) for field in fields)
        return mins * 60 + secs
    # Any other number of fields is not a supported format
    raise ValueError("Timestamp must be in format %H:%M:%S or %M:%S")




[docs]
@io.coerce_to_bytes_io
def load_audio(fhandle: BinaryIO) -> Tuple[np.ndarray, float]:
    """Load a JTD audio file.

    The file is read with `librosa.load`, requesting a sample rate of 44100 Hz and a
    mono signal.

    Args:
        fhandle (str or file-like): path or file-like object pointing to an audio file

    Returns:
        * np.ndarray - the audio signal
        * float - The sample rate of the audio file

    """
    signal, sample_rate = librosa.load(fhandle, sr=44100, mono=True)
    return signal, sample_rate




[docs]
@io.coerce_to_string_io
def load_onsets(fhandle: TextIO) -> Optional[annotations.EventData]:
    """Load a JTD onset file.

    Every cell of the CSV is read as one onset time. Each onset is paired with the
    onset that follows it to form an (onset, offset) interval, so a file with N
    onsets produces N - 1 events. Events are labelled with their 0-based position.

    Args:
        fhandle (str or file-like): path or file-like object pointing to an onset csv file

    Returns:
        * annotations.EventData - the onset data

    """
    # Flatten the CSV rows into one list of float onset times, in reading order
    onset_times = [float(cell) for row in csv.reader(fhandle) for cell in row]
    # Pair each onset with its successor: (onset1, onset2), (onset2, onset3), ...
    spans = [[begin, end] for begin, end in zip(onset_times, onset_times[1:])]
    # One label per interval: its running index, as a string
    labels = [str(idx) for idx in range(len(spans))]
    # The intervals need to be an array to pass validation
    return annotations.EventData(np.array(spans), "s", labels, "open")




[docs]
@io.coerce_to_string_io
def load_beats(fhandle: TextIO, col_idx: int = 0) -> Optional[annotations.BeatData]:
    """Load a JTD beat file.

    The file is a CSV with one header line followed by one row per beat. Each row
    must have exactly six columns, read here as: beat number, overall beat time,
    piano time, bass time, drums time, and metre (position within the bar).

    Args:
        fhandle (str or file-like): path or file-like object pointing to a beat csv file
        col_idx (int, optional): index of the column to use (0=overall, 1=piano, 2=bass, 3=drums), defaults to 0

    Returns:
        * annotations.BeatData - the beat data; beats with an empty cell in the
          chosen column get a timestamp of NaN, so there is always one entry per row

    Raises:
        ValueError: if a row does not have exactly six columns, or a value cannot be
            parsed as a number
        IndexError: if `col_idx` is outside the four timestamp columns

    """
    reader = csv.reader(fhandle)
    next(reader)  # The first line of the CSV is always a header so we can just skip it
    timestamps, positions = [], []
    # Iterating over each line of the CSV file (i.e., each 'beat')
    for _beat_number, beat, piano, bass, drums, metre in reader:
        # Get the required data from the row
        desired_data = (beat, piano, bass, drums)[col_idx]
        # Coerce empty strings (no matched onset for this beat) to NaN values
        timestamps.append(float(desired_data) if desired_data != "" else np.nan)
        positions.append(int(float(metre)))  # coerce string to float and then to int
    return annotations.BeatData(
        np.array(timestamps), "s", np.array(positions), "bar_index"
    )




[docs]
@core.docstring_inherit(core.Dataset)
class Dataset(core.Dataset):
    """
    The Jazz Trio Database.
    """

    def __init__(self, data_home=None, version="default"):
        """Set up the JTD dataset by handing this module's definitions to `core.Dataset`

        Args:
            data_home (str or None): passed through to `core.Dataset` unchanged
            version (str): passed through to `core.Dataset` unchanged; defaults to "default"

        """
        # Everything dataset-specific (track classes, citation, indexes, remotes,
        # download instructions, and license text) comes from this module
        super().__init__(
            data_home,
            version,
            name="jtd",
            track_class=Track,
            multitrack_class=MultiTrack,
            bibtex=BIBTEX,
            indexes=INDEXES,
            remotes=REMOTES,
            download_info=DOWNLOAD_INFO,
            license_info=LICENSE_INFO,
        )