Package `music_df`

This package features various functions for working with Pandas dataframes representing music scores. I developed it for my personal use during my dissertation.

With this library you can, among many other things:

read from a variety of formats, including MIDI, Humdrum, MusicXML, and others.
postprocess the scores in various ways, such as salami-slicing, quantizing, dedoubling and de-tremoloing them.
apply augmentations like transposition (which can be sensitive to pitch-spelling) or time-scaling (which adjusts the time signatures appropriately).
plot dataframes as piano-rolls or display them as scores, displaying annotations or coloring noteheads.

Among the more useful parts of the package worth highlighting:

music_df.read: read in a variety of music file formats to a dataframe
music_df.add_feature: infer/adjust features like barlines, time signatures, etc.
music_df.salami_slice: salami-slice a dataframe
music_df.augmentations: apply augmentations (e.g., transposition) to a dataframe
music_df.transpose: transpose the pitches of a dataframe

The documentation is currently a work in progress.

Sub-modules

music_df.add_analysis
music_df.add_feature: This module contains diverse functions for adding features to music dataframes.
music_df.align_dfs
music_df.augmentations
music_df.chord_df
music_df.constants
music_df.conversions: Functions for converting from ms3, music21, and symusic.
music_df.crop_df
music_df.dedouble: Provides a function to "dedouble" a music_df …
music_df.detremolo: Provides a function for detremoloing (removing rapid repeated notes from) a dataframe.
music_df.find: Provides a function for searching salami-slices.
music_df.harmony
music_df.humdrum_export: Functions for exporting music dataframes to Humdrum format …
music_df.join_repeated_notes: Not yet implemented.
music_df.keys: Provides functions for working with keys in music dataframes.
music_df.label_df
music_df.merge_dfs
music_df.merge_notes
music_df.midi_parser
music_df.plot
music_df.plot_piano_rolls
music_df.read: Provides a function, music_df.read, for reading a file into a music_df …
music_df.read_midi
music_df.read_rntxt
music_df.read_xml
music_df.salami_slice: This module contains functions for salami-slicing music dataframes …
music_df.script_helpers
music_df.scripts
music_df.show_scores
music_df.slice_df
music_df.split_df
music_df.split_notes: Functions for splitting notes.
music_df.sync_df
music_df.time: A few functions for working with temporal/rhythmic features of music dataframes.
music_df.transpose: Functions for transposing music either by pitch-class or along the circle of fifths.
music_df.utils
music_df.xml_parser

Functions

def chromatic_transpose(df: pandas.DataFrame, interval: int, inplace: bool = True, label: bool = False, metadata=True)

Expand source code

def chromatic_transpose(
    df: pd.DataFrame,
    interval: int,
    inplace: bool = True,
    label: bool = False,
    metadata=True,
):
    """
    Shift every value in the "pitch" column by a chromatic interval.

    Only the "pitch" column is touched: any other pitch-related columns (for
    instance ones that encode spelling or key signature) are left as they are.

    Args:
        df: a music_df with a "pitch" column
        interval: the amount added to every pitch
        inplace: if True (the default), `df` itself is modified and returned;
            otherwise a copy is transposed and `df` is left untouched
        label: if True, set a "transposed_by_n_semitones" column to `interval`
            on every row
        metadata: if True, add `interval` to the running total stored under
            the "chromatic_transpose" key of the frame's `attrs`

    Returns:
        The transposed music_df (`df` itself when `inplace` is True, a copy
        otherwise).
    """
    target = df.copy() if not inplace else df
    target.pitch += interval
    if label:
        target.loc[:, "transposed_by_n_semitones"] = interval
    if metadata:
        # Accumulate so that successive transpositions record their net shift.
        try:
            target.attrs["chromatic_transpose"] += interval
        except KeyError:
            target.attrs["chromatic_transpose"] = interval
    return target

Transpose the pitches of a music_df by a given chromatic interval.

Note that this will change the "pitch" column but not any other columns that may be pitch-related such as those that may indicate the spelling or key signature.

Args

df: a music_df
interval: the interval to transpose by
inplace: if True, will modify the music_df in place
label: if True, will add a "transposed_by_n_semitones" column to the music_df
metadata: if True, will add a "chromatic_transpose" attribute to the music_df

Returns

The music_df with its pitches transposed by the given interval: `df` itself when `inplace` is True (the default), otherwise a transposed copy.

def get_df_segment_indices(eligible_onsets: Sequence[int] | numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.int64]], eligible_releases: Sequence[int] | numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.int64]], target_len: int)

Expand source code

def get_df_segment_indices(
    eligible_onsets: t.Union[t.Sequence[int], npt.NDArray[np.int_]],
    eligible_releases: t.Union[t.Sequence[int], npt.NDArray[np.int_]],
    target_len: int,
) -> t.Iterator[t.Tuple[int, int]]:
    """
    Yield (start, end) index pairs that cut a dataframe into segments.

    Each yielded pair is meant for slicing: `start` is taken from
    `eligible_onsets` and `end` is one more than an item taken from
    `eligible_releases`, i.e. an exclusive bound. As the examples below show,
    consecutive segments may overlap.

    Args:
        eligible_onsets: indices at which a segment may start. NOTE(review):
            presumably sorted in ascending order -- confirm against callers.
        eligible_releases: indices at which a segment may end (inclusive).
            Its first and last items bound the iteration.
        target_len: the segment length that is aimed for.

    Yields:
        `(start_i, end_i)` tuples where `end_i` is exclusive.

    Raises:
        IndexError: on first iteration if `eligible_releases` is empty.
        ValueError: re-raised unchanged if `get_item_leq` raises it while
            looking for the next start.

    # >>> eligible_onsets = list(range(32))
    # >>> eligible_releases = list(range(32))
    # >>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
    # [(0, 8), (8, 16), (16, 24), (24, 32)]

    # >>> eligible_onsets = [i * 2 for i in range(16)]
    # >>> eligible_releases = [i * 2 + 1 for i in range(16)]
    # >>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
    # [(0, 8), (8, 16), (16, 24), (24, 32)]

    # >>> eligible_onsets = [0, 3, 7, 14]
    # >>> eligible_releases = [2, 3, 6, 12, 13, 17]
    # >>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
    # [(0, 7), (7, 14), (14, 18)]

    We aim for target_len, but there is no firm limit on how long a segment
    might be. We depend on eligible_onsets/eligible_releases to be fairly
    evenly distributed to avoid segments that are far too long (or short).
    >>> eligible_onsets = [0, 1, 14, 15]
    >>> eligible_releases = [2, 3, 17]
    >>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
    [(0, 4), (1, 18)]

    >>> eligible_onsets = [0, 1, 14, 15]
    >>> eligible_releases = [16, 17]
    >>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
    [(0, 17), (15, 18)]

    Releases before the first eligible onset are ignored.
    >>> eligible_onsets = [14, 15]
    >>> eligible_releases = [0, 1, 16, 17]
    >>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
    [(14, 18)]

    There shouldn't be any other circumstance in which onsets or releases are
    skipped.
    """
    # assumes df has a range index
    start_i = None
    # Start one below the first release so that the loop condition holds and
    #   the first `min_val` computed from `end_i + 1` is the first release.
    end_i = eligible_releases[0] - 1
    max_release_i = eligible_releases[-1]
    # Keep emitting segments until one of them ends on the last release.
    while end_i < max_release_i:
        if start_i is None:
            # The first segment always starts at the first eligible onset.
            start_i = eligible_onsets[0]
        else:
            try:
                # Later segments start at an onset picked relative to the
                #   previous (inclusive) end, and strictly after the previous
                #   start. NOTE(review): presumably `get_item_leq` returns the
                #   largest item <= its second argument that is >= `min_val`
                #   -- confirm against its definition.
                start_i = get_item_leq(eligible_onsets, end_i + 1, min_val=start_i + 1)
            except ValueError:  # pylint: disable=try-except-raise
                # We should never get here, I think this is a bug if we do
                raise
        # we calculate end_i *inclusively*, then add 1 to it to return
        #   an exclusive boundary appropriate for slicing in Python
        end_i = get_item_leq(
            eligible_releases,
            # We need to subtract 1 from target_len because we are
            #   calculating an inclusive boundary
            start_i + target_len - 1,
            # The `min_val` bound below requests an end after both the new
            #   start and the previous end.
            min_val=max(start_i + 1, end_i + 1),
        )
        yield start_i, end_i + 1

>>> eligible_onsets = list(range(32))

>>> eligible_releases = list(range(32))

>>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))

[(0, 8), (8, 16), (16, 24), (24, 32)]

>>> eligible_onsets = [i * 2 for i in range(16)]

>>> eligible_releases = [i * 2 + 1 for i in range(16)]

>>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))

[(0, 8), (8, 16), (16, 24), (24, 32)]

>>> eligible_onsets = [0, 3, 7, 14]

>>> eligible_releases = [2, 3, 6, 12, 13, 17]

>>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))

[(0, 7), (7, 14), (14, 18)]

We aim for target_len, but there is no firm limit on how long a segment might be. We depend on eligible_onsets/eligible_releases to be fairly evenly distributed to avoid segments that are far too long (or short).

>>> eligible_onsets = [0, 1, 14, 15]
>>> eligible_releases = [2, 3, 17]
>>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
[(0, 4), (1, 18)]

>>> eligible_onsets = [0, 1, 14, 15]
>>> eligible_releases = [16, 17]
>>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
[(0, 17), (15, 18)]

Releases before the first eligible onset are ignored.

>>> eligible_onsets = [14, 15]
>>> eligible_releases = [0, 1, 16, 17]
>>> list(get_df_segment_indices(eligible_onsets, eligible_releases, 8))
[(14, 18)]

There shouldn't be any other circumstance in which onsets or releases are skipped.

def get_eligible_onsets(df: pandas.DataFrame, keep_onsets_together: bool = True, notes_only: bool = False) ‑> numpy.ndarray[tuple[typing.Any, ...], numpy.dtype[numpy.int64]]

Expand source code

def get_eligible_onsets(
    df: pd.DataFrame,
    keep_onsets_together: bool = True,
    notes_only: bool = False,
) -> npt.NDArray[np.int_]:
    """
    Return the index labels of the rows at which an onset is "eligible".

    Despite the name, the result holds *indices into the dataframe*, not onset
    times. With `keep_onsets_together` (the default) only the first row of each
    distinct onset time is returned; otherwise every row is returned. With
    `notes_only`, rows whose "type" is not "note" are discarded first (this has
    no effect when the frame has no "type" column).

    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [0, 60, 64, 60, 64, 0, 60, 64, 60, 64, 0],
    ...         "onset": [0, 0, 0, 1, 1, 1.5, 1.5, 2.0, 3.0, 3.0, 5.0],
    ...         "release": [0, 1, 1, 1.5, 2.0, 0, 3.0, 3.0, 4.0, 4.5, 0],
    ...         "type": ["bar"] + ["note"] * 4 + ["bar"] + ["note"] * 4 + ["bar"],
    ...     }
    ... )
    >>> df
        pitch  onset  release  type
    0       0    0.0      0.0   bar
    1      60    0.0      1.0  note
    2      64    0.0      1.0  note
    3      60    1.0      1.5  note
    4      64    1.0      2.0  note
    5       0    1.5      0.0   bar
    6      60    1.5      3.0  note
    7      64    2.0      3.0  note
    8      60    3.0      4.0  note
    9      64    3.0      4.5  note
    10      0    5.0      0.0   bar
    >>> get_eligible_onsets(df)
    array([ 0,  3,  5,  7,  8, 10])
    >>> get_eligible_onsets(df, notes_only=True)
    array([1, 3, 6, 7, 8])
    >>> get_eligible_onsets(df, keep_onsets_together=False)
    array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
    >>> get_eligible_onsets(df, keep_onsets_together=False, notes_only=True)
    array([1, 2, 3, 4, 6, 7, 8, 9])
    """
    candidates = df
    if notes_only and "type" in candidates.columns:
        candidates = candidates[candidates["type"] == "note"]
    if keep_onsets_together:
        # np.unique reports the position of the first occurrence of each
        # distinct onset time; map those positions back to index labels.
        _, first_positions = np.unique(candidates.onset, return_index=True)
        return candidates.index[first_positions].to_numpy()
    return candidates.index.to_numpy()

This function should perhaps be renamed "get indices to eligible onsets".

>>> df = pd.DataFrame(
...     {
...         "pitch": [0, 60, 64, 60, 64, 0, 60, 64, 60, 64, 0],
...         "onset": [0, 0, 0, 1, 1, 1.5, 1.5, 2.0, 3.0, 3.0, 5.0],
...         "release": [0, 1, 1, 1.5, 2.0, 0, 3.0, 3.0, 4.0, 4.5, 0],
...         "type": ["bar"] + ["note"] * 4 + ["bar"] + ["note"] * 4 + ["bar"],
...     }
... )
>>> df
    pitch  onset  release  type
0       0    0.0      0.0   bar
1      60    0.0      1.0  note
2      64    0.0      1.0  note
3      60    1.0      1.5  note
4      64    1.0      2.0  note
5       0    1.5      0.0   bar
6      60    1.5      3.0  note
7      64    2.0      3.0  note
8      60    3.0      4.0  note
9      64    3.0      4.5  note
10      0    5.0      0.0   bar
>>> get_eligible_onsets(df)
array([ 0,  3,  5,  7,  8, 10])
>>> get_eligible_onsets(df, notes_only=True)
array([1, 3, 6, 7, 8])
>>> get_eligible_onsets(df, keep_onsets_together=False)
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
>>> get_eligible_onsets(df, keep_onsets_together=False, notes_only=True)
array([1, 2, 3, 4, 6, 7, 8, 9])

def get_eligible_releases(df: pandas.DataFrame, keep_releases_together: bool = True) ‑> pandas.Series

Expand source code

def get_eligible_releases(
    df: pd.DataFrame,
    keep_releases_together: bool = True,
) -> pd.Series:
    """
    Return the release times at which a segment may end, keyed by row index.

    The index of the returned series gives indices into the dataframe and the
    values are the corresponding release times. With `keep_releases_together`
    (the default) only one row is kept per distinct release time: the last one
    after a stable sort by pitch and then by release.

    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [0, 60, 64, 60, 64, 0, 60, 64, 60, 64, 0],
    ...         "onset": [0, 0, 0, 1, 1, 1.5, 1.5, 2.0, 3.0, 3.0, 5.0],
    ...         "release": [0, 1, 1, 1.5, 2.0, 0, 3.0, 3.0, 4.0, 4.5, 0],
    ...         "type": ["bar"] + ["note"] * 4 + ["bar"] + ["note"] * 4 + ["bar"],
    ...     }
    ... )
    >>> df
        pitch  onset  release  type
    0       0    0.0      0.0   bar
    1      60    0.0      1.0  note
    2      64    0.0      1.0  note
    3      60    1.0      1.5  note
    4      64    1.0      2.0  note
    5       0    1.5      0.0   bar
    6      60    1.5      3.0  note
    7      64    2.0      3.0  note
    8      60    3.0      4.0  note
    9      64    3.0      4.5  note
    10      0    5.0      0.0   bar

    Only notes have releases so get_eligible_releases() is always `note_only`.
    (Cf get_eligible_onsets().)

    >>> get_eligible_releases(df)
    2    1.0
    3    1.5
    4    2.0
    7    3.0
    8    4.0
    9    4.5
    Name: release, dtype: float64
    >>> get_eligible_releases(df, keep_releases_together=False)
    1    1.0
    2    1.0
    3    1.5
    4    2.0
    6    3.0
    7    3.0
    8    4.0
    9    4.5
    Name: release, dtype: float64
    """
    notes = df[df["type"] == "note"] if "type" in df.columns else df
    if not keep_releases_together:
        return notes.release

    # Two stable sorts: rows end up ordered by release, ties broken by pitch.
    ordered = notes.sort_values(by="pitch", kind="mergesort").sort_values(
        by="release", kind="mergesort"
    )
    # np.unique gives the *first* occurrence of each value, so search the
    # reversed release column and mirror the positions to get the *last* one.
    reversed_releases = np.flip(ordered.release.to_numpy())
    _, first_in_reversed = np.unique(reversed_releases, return_index=True)
    last_positions = (len(ordered) - 1) - first_in_reversed
    return ordered.iloc[last_positions]["release"]

Returns a series where the Index gives the indices into the dataframe and the values are the associated release times.

>>> df = pd.DataFrame(
...     {
...         "pitch": [0, 60, 64, 60, 64, 0, 60, 64, 60, 64, 0],
...         "onset": [0, 0, 0, 1, 1, 1.5, 1.5, 2.0, 3.0, 3.0, 5.0],
...         "release": [0, 1, 1, 1.5, 2.0, 0, 3.0, 3.0, 4.0, 4.5, 0],
...         "type": ["bar"] + ["note"] * 4 + ["bar"] + ["note"] * 4 + ["bar"],
...     }
... )
>>> df
    pitch  onset  release  type
0       0    0.0      0.0   bar
1      60    0.0      1.0  note
2      64    0.0      1.0  note
3      60    1.0      1.5  note
4      64    1.0      2.0  note
5       0    1.5      0.0   bar
6      60    1.5      3.0  note
7      64    2.0      3.0  note
8      60    3.0      4.0  note
9      64    3.0      4.5  note
10      0    5.0      0.0   bar

Only notes have releases so get_eligible_releases() is always note_only. (Cf get_eligible_onsets().)

>>> get_eligible_releases(df)
2    1.0
3    1.5
4    2.0
7    3.0
8    4.0
9    4.5
Name: release, dtype: float64
>>> get_eligible_releases(df, keep_releases_together=False)
1    1.0
2    1.0
3    1.5
4    2.0
6    3.0
7    3.0
8    4.0
9    4.5
Name: release, dtype: float64

def quantize_df(df, tpq: int = 4, ticks_out: bool = False, zero_dur_action: Literal['remove', 'drop', 'min_dur', 'preserve'] = 'min_dur') ‑> pandas.DataFrame

Expand source code

def quantize_df(
    df,
    tpq: int = 4,
    ticks_out: bool = False,
    zero_dur_action: ZeroDurAction = "min_dur",
) -> pd.DataFrame:
    """
    Round the "onset" and "release" columns of a dataframe to a grid.

    Times are multiplied by `tpq`, rounded to the nearest integer and, unless
    `ticks_out` is set, divided by `tpq` again. The input frame is not
    modified; the result is a copy that keeps the input's index, column order
    and `attrs`.

    Args:
        df: a dataframe with "onset" and "release" columns whose values can be
            converted with `float()`.
        tpq: grid resolution (presumably "ticks per quarter note").
        ticks_out: if True, return integer tick counts rather than times.
        zero_dur_action: what to do with rows whose rounded onset and release
            coincide: "min_dur" lengthens them by one tick, "preserve" leaves
            them alone, "remove"/"drop" deletes them (the index is not reset).

    Returns:
        The quantized dataframe.

    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [60, 61, 62, 63],
    ...         "onset": [-0.01, 1.01, 1.95, 2.9],
    ...         "release": [0.99, 2.03, 3.0, 3.97],
    ...     }
    ... )
    >>> df
       pitch  onset  release
    0     60  -0.01     0.99
    1     61   1.01     2.03
    2     62   1.95     3.00
    3     63   2.90     3.97

    There may be a negative zero in the output:
    >>> quantize_df(df, tpq=4)
       pitch  onset  release
    0     60   -0.0      1.0
    1     61    1.0      2.0
    2     62    2.0      3.0
    3     63    3.0      4.0

    >>> quantize_df(df, tpq=16)
       pitch   onset  release
    0     60 -0.0000      1.0
    1     61  1.0000      2.0
    2     62  1.9375      3.0
    3     63  2.8750      4.0

    >>> quantize_df(df, tpq=16, ticks_out=True)
       pitch  onset  release
    0     60      0       16
    1     61     16       32
    2     62     31       48
    3     63     46       64

    Note that by default, notes that would be rounded to have zero length
    are given the minimum length.
    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [60, 61, 62],
    ...         "onset": [0.0, 0.5, 1.0],
    ...         "release": [0.4, 1.0, 2.0],
    ...     }
    ... )
    >>> quantize_df(df, tpq=1)
       pitch  onset  release
    0     60    0.0      1.0
    1     61    0.0      1.0
    2     62    1.0      2.0

    To preserve zero-dur notes, pass `zero_dur_action="preserve"`:
    >>> quantize_df(df, tpq=1, zero_dur_action="preserve")
       pitch  onset  release
    0     60    0.0      0.0
    1     61    0.0      1.0
    2     62    1.0      2.0

    To remove zero-dur notes, pass `zero_dur_action="remove"` (NB the index is not
    reset):
    >>> quantize_df(df, tpq=1, zero_dur_action="remove")
       pitch  onset  release
    1     61    0.0      1.0
    2     62    1.0      2.0

    "drop" is an alias for "remove"
    >>> quantize_df(df, tpq=1, zero_dur_action="drop")
       pitch  onset  release
    1     61    0.0      1.0
    2     62    1.0      2.0
    """
    assert zero_dur_action in get_args(ZeroDurAction)

    # `apply(float)` rather than a dtype cast so that any value accepted by
    # `float()` can be handled element by element.
    onsets = np.rint(df.onset.apply(float).to_numpy() * tpq)
    releases = np.rint(df.release.apply(float).to_numpy() * tpq)
    if zero_dur_action == "min_dur":
        # Rows that collapsed to zero length get a duration of one tick.
        releases[releases == onsets] += 1
    if ticks_out:
        onsets = onsets.astype(int)
        releases = releases.astype(int)
    else:
        onsets /= tpq
        releases /= tpq

    # Copy the whole frame and overwrite the two time columns. Rebuilding the
    # frame from a dict of columns would discard `df.attrs` and, for a frame
    # holding only onset/release columns, replace the index with a RangeIndex.
    out = df.copy()
    out["onset"] = onsets
    out["release"] = releases
    if zero_dur_action in {"remove", "drop"}:
        out = out[out["onset"] != out["release"]]
    return out

>>> df = pd.DataFrame(
...     {
...         "pitch": [60, 61, 62, 63],
...         "onset": [-0.01, 1.01, 1.95, 2.9],
...         "release": [0.99, 2.03, 3.0, 3.97],
...     }
... )
>>> df
   pitch  onset  release
0     60  -0.01     0.99
1     61   1.01     2.03
2     62   1.95     3.00
3     63   2.90     3.97

There may be a negative zero in the output:

>>> quantize_df(df, tpq=4)
   pitch  onset  release
0     60   -0.0      1.0
1     61    1.0      2.0
2     62    2.0      3.0
3     63    3.0      4.0

>>> quantize_df(df, tpq=16)
   pitch   onset  release
0     60 -0.0000      1.0
1     61  1.0000      2.0
2     62  1.9375      3.0
3     63  2.8750      4.0

>>> quantize_df(df, tpq=16, ticks_out=True)
   pitch  onset  release
0     60      0       16
1     61     16       32
2     62     31       48
3     63     46       64

Note that by default, notes that would be rounded to have zero length are given the minimum length.

>>> df = pd.DataFrame(
...     {
...         "pitch": [60, 61, 62],
...         "onset": [0.0, 0.5, 1.0],
...         "release": [0.4, 1.0, 2.0],
...     }
... )
>>> quantize_df(df, tpq=1)
   pitch  onset  release
0     60    0.0      1.0
1     61    0.0      1.0
2     62    1.0      2.0

To preserve zero-dur notes, pass zero_dur_action="preserve":

>>> quantize_df(df, tpq=1, zero_dur_action="preserve")
   pitch  onset  release
0     60    0.0      0.0
1     61    0.0      1.0
2     62    1.0      2.0

To remove zero-dur notes, pass zero_dur_action="remove" (NB the index is not reset):

>>> quantize_df(df, tpq=1, zero_dur_action="remove")
   pitch  onset  release
1     61    0.0      1.0
2     62    1.0      2.0

"drop" is an alias for "remove"

>>> quantize_df(df, tpq=1, zero_dur_action="drop")
   pitch  onset  release
1     61    0.0      1.0
2     62    1.0      2.0

def read_csv(path: str, quantize_tpq: int | None = None, column_dtypes: dict | None = None, column_converters: Mapping | None = None) ‑> pandas.DataFrame | None

Expand source code

def read_csv(
    path: str,
    quantize_tpq: int | None = None,
    column_dtypes: dict | None = None,
    column_converters: Mapping | None = None,
) -> pd.DataFrame | None:
    """
    Read a music dataframe from a CSV file.

    The first CSV column is used as the index. After loading:

    - "release" is set to NaN on every row whose "type" is not "note";
    - if there is an "other" column, its values on "time_signature" rows are
      parsed with `ast.literal_eval`;
    - if there is a "color" column, missing values become empty strings;
    - if `quantize_tpq` is given, the frame is passed through `quantize_df`.

    Args:
        path: location of the CSV file.
        quantize_tpq: if not None, the `tpq` value handed to `quantize_df`.
        column_dtypes: per-column dtypes for `pd.read_csv`; defaults to
            `COLUMN_DTYPES`. The mapping passed in is not modified.
        column_converters: per-column converters for `pd.read_csv`; defaults
            to `COLUMN_CONVERTERS`.

    Returns:
        The dataframe, or None (after logging a warning) if `path` does not
        exist.
    """
    converters = COLUMN_CONVERTERS if column_converters is None else column_converters
    requested_dtypes = COLUMN_DTYPES if column_dtypes is None else column_dtypes

    # Work on a copy, and drop every column that has a converter so that no
    # column is given both a converter and a dtype.
    dtypes = copy(requested_dtypes)
    for converted_column in converters:
        dtypes.pop(converted_column, None)

    if not os.path.exists(path):
        LOGGER.warning(f"{path} does not appear to exist")
        return None

    df = pd.read_csv(path, converters=converters, index_col=0, dtype=dtypes)

    not_a_note = df.type != "note"
    df.loc[not_a_note, "release"] = float("nan")

    if "other" in df.columns:
        is_time_sig = df.type == "time_signature"
        parsed = df.loc[is_time_sig, "other"].map(ast.literal_eval)
        df.loc[is_time_sig, "other"] = parsed

    if "color" in df.columns:
        df.loc[df.color.isna(), "color"] = ""

    if quantize_tpq is None:
        return df
    return quantize_df(df, quantize_tpq)

def read_krn(krn_path: str, remove_graces: bool = True, no_final_barline: bool = True, ensure_initial_barline: bool = True, sort: bool = False, infer_tempo: bool = False, default_tempo: float | None = None, label_identifiers: str | None = None) ‑> pandas.DataFrame

Expand source code

def read_krn(
    krn_path: str,
    remove_graces: bool = True,
    no_final_barline: bool = True,
    ensure_initial_barline: bool = True,
    # TODO: (Malcolm 2023-12-28) why is sort False by default?
    sort: bool = False,
    infer_tempo: bool = False,
    default_tempo: float | None = None,
    label_identifiers: str | None = None,
) -> pd.DataFrame:
    """
    Read a Humdrum (kern) file into a dataframe via the external TOTABLE tool.

    The program named by `TOTABLE` is run on `krn_path` and its tab-separated
    standard output is parsed into a dataframe, which is then post-processed
    according to the flags below.

    Args:
        krn_path: path to the kern file; also stored in `attrs["score_name"]`.
        remove_graces: if True, drop note rows whose release is not later than
            their onset (presumably grace notes).
        no_final_barline: if True, drop the last row when it is a bar.
        ensure_initial_barline: if True, call `_insert_initial_barline` when
            the first row is not a bar.
        sort: if True, sort the result in place with `sort_df`.
        infer_tempo: if True, call `infer_bpm` on the file and, when a tempo is
            available, prepend a "tempo" row at onset 0.0.
        default_tempo: tempo used when `infer_bpm` returns None. It is only
            consulted when `infer_tempo` is True.
        label_identifiers: if not None, appended to the TOTABLE command line
            as an extra argument.

    Returns:
        The resulting dataframe.

    Raises:
        AssertionError: if `TOTABLE` is None or the onsets produced by TOTABLE
            are not monotonically increasing.
        subprocess.CalledProcessError: if TOTABLE exits with a non-zero status.
    """
    assert TOTABLE is not None, "TOTABLE environment variable undefined"
    totable_cmd = [TOTABLE, krn_path]
    if label_identifiers is not None:
        totable_cmd.append(label_identifiers)

    # check=True: a failing TOTABLE run raises instead of yielding empty output
    result = subprocess.run(
        totable_cmd, check=True, capture_output=True
    ).stdout.decode()

    df = pd.read_csv(io.StringIO(result), sep="\t")
    assert df["onset"].is_monotonic_increasing, (
        f"{krn_path}: onsets are not monotonicaly increasing"
    )

    df.attrs["score_name"] = krn_path
    if remove_graces:
        # Keep every non-note row, and only those notes with positive duration
        df = df[(df.type != "note") | (df.release > df.onset)].reset_index(drop=True)
    # Kern files often contain a final barline, which we don't generally need
    if no_final_barline and df.iloc[-1]["type"] == "bar":
        df = df.iloc[:-1]
    # On the other hand, we *do* want an initial barline (helps us calculate
    # whether the score starts with a pickup)
    if ensure_initial_barline and df.iloc[0]["type"] != "bar":
        df = _insert_initial_barline(df)
    # TOTABLE doesn't give bar releases, so we calculate them here
    # Each bar ends where the next bar starts; the last bar ends at the
    # release of the last note row.
    bar_releases = df.loc[df.type == "bar", "onset"].iloc[1:].to_list() + [
        df[df.type == "note"].iloc[-1]["release"]
    ]
    df.loc[df.type == "bar", "release"] = bar_releases
    if infer_tempo:
        # TODO: (Malcolm 2023-12-27) re-encode kern files to utf-8 so we don't need to
        #   set encoding here?
        bpm = infer_bpm(krn_path, encoding="cp1252")
        if bpm is None and default_tempo is not None:
            # TODO: (Malcolm 2023-12-27) set tempo heuristically?
            bpm = default_tempo
        if bpm is not None:
            # Prepend a single tempo row; ignore_index renumbers the rows.
            # NOTE(review): pd.concat may not carry `attrs` over when the
            #   frames' attrs differ -- confirm "score_name" survives this.
            df = pd.concat(
                [pd.DataFrame([{"type": "tempo", "onset": 0.0, "tempo": bpm}]), df],
                ignore_index=True,
            )

    if sort:
        # The return value is ignored; this relies on the in-place sort.
        sort_df(df, inplace=True)
    return df

def read_krn_via_xml(krn_path: str, expand_repeats: Literal['yes', 'no', 'drop', 'max2'] = 'yes') ‑> pandas.DataFrame

Expand source code

def read_krn_via_xml(
    krn_path: str, expand_repeats: RepeatOptions = "yes"
) -> pd.DataFrame:
    """
    Read a Humdrum (kern) file by converting it to XML first.

    The external `hum2xml` program is run on `krn_path`; its output is written
    to a temporary ".xml" file which is parsed with `xml_parse` and then
    deleted, whether or not parsing succeeds.

    Args:
        krn_path: path to the kern file.
        expand_repeats: forwarded unchanged to `xml_parse`.

    Returns:
        Whatever `xml_parse` returns for the converted file.

    Raises:
        subprocess.CalledProcessError: if `hum2xml` exits with a non-zero
            status.
    """
    result = subprocess.run(
        ["hum2xml", krn_path], check=True, capture_output=True
    ).stdout.decode()
    # mkstemp returns an *open* OS-level file descriptor along with the path.
    # Write through that descriptor so it gets closed; opening the path a
    # second time and dropping the descriptor would leak it on every call.
    fd, temp_path = tempfile.mkstemp(suffix=".xml")
    try:
        with os.fdopen(fd, "w") as outf:
            outf.write(result)
        return xml_parse(temp_path, expand_repeats=expand_repeats)
    finally:
        os.remove(temp_path)

def segment_df(df: pandas.DataFrame, target_len)

Expand source code

def segment_df(df: pd.DataFrame, target_len):
    """
    Lazily cut a dataframe into consecutive segments of about `target_len`.

    Segment boundaries come from `get_df_segment_indices`, fed with the
    eligible onset and release indices of `df`. `target_len` is handed to that
    function unchanged, so it appears to be measured in rows (index positions)
    rather than in notes.

    Yields:
        Slices `df[start:end]`, one per segment.
    """
    onset_indices = get_eligible_onsets(df)
    release_indices = get_eligible_releases(df).index.to_numpy()
    bounds = get_df_segment_indices(onset_indices, release_indices, target_len)
    yield from (df[lo:hi] for lo, hi in bounds)

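This function segments a dataframe into consecutive chunks of roughly `target_len` each. The length appears to be measured in rows (index positions) rather than in notes, because `target_len` is passed straight to `get_df_segment_indices`, which works on row indices.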

def sort_df(df: pandas.DataFrame, inplace: bool = False, ignore_index: bool = True)

Expand source code

def sort_df(df: pd.DataFrame, inplace: bool = False, ignore_index: bool = True):
    """
    Sort a music dataframe into a deterministic row order.

    The frame is sorted with a series of stable (mergesort) passes, least
    significant key first, so the final order is by "onset", then by the rank
    of "type" in `DF_TYPE_SORT_ORDER`, then by "type" itself, then by "pitch",
    then by "release". The "pitch" and "type" passes are skipped when the
    column is absent.

    Args:
        df: the dataframe to sort; needs "release" and "onset" columns.
        inplace: if True, `df` itself is sorted; otherwise a sorted copy is
            made and `df` is left untouched.
        ignore_index: forwarded to every `sort_values` call; if True the
            result is renumbered 0..n-1.

    Returns:
        The sorted dataframe (`df` itself when `inplace` is True).
    """
    shared = {"axis": 0, "ignore_index": ignore_index, "kind": "mergesort"}

    # NOTE(review): pandas hands the whole column to `key`, so the `is None`
    #   tests in the two lambdas below never match and both keys behave as the
    #   identity. They are kept as-is to preserve behaviour -- confirm intent.
    def release_key(x):
        return 0 if x is None else x

    def pitch_key(x):
        return 128 if x is None else x

    # Only the first pass may create a new frame; later passes sort it in place.
    if inplace:
        df.sort_values(by="release", inplace=True, key=release_key, **shared)
    else:
        df = df.sort_values(by="release", inplace=False, key=release_key, **shared)

    passes = []
    if "pitch" in df.columns:
        passes.append(("pitch", pitch_key))
    if "type" in df.columns:
        # Plain sort by type first so that types missing from the custom
        #   order below still come out in a reproducible order...
        passes.append(("type", None))
        # ...then the custom order:
        #   time signature
        #   bar
        #   all other
        #   note
        passes.append(("type", lambda x: x.map(DF_TYPE_SORT_ORDER)))
    passes.append(("onset", None))

    for column, key in passes:
        df.sort_values(by=column, inplace=True, key=key, **shared)
    return df

def split_musicdf(music_df: pandas.DataFrame, split_by_track: bool = True, split_by_channel: bool = True) ‑> dict[typing.Any, pandas.DataFrame]

Expand source code

def split_musicdf(
    music_df: pd.DataFrame, split_by_track: bool = True, split_by_channel: bool = True
) -> dict[Any, pd.DataFrame]:
    """
    Split a music dataframe into one dataframe per track and/or channel.

    Only requested columns that actually exist in `music_df` are used for
    grouping. When both "track" and "channel" are used the keys of the result
    are `(track, channel)` tuples; when a single column is used the keys are
    that column's values; when none of the requested columns exists the whole
    frame is returned under the key None.

    Grouping uses `DataFrame.groupby` with its defaults, so rows whose grouping
    value is missing do not appear in any group.

    Args:
        music_df: the dataframe to split.
        split_by_track: if True, group by the "track" column.
        split_by_channel: if True, group by the "channel" column.

    Returns:
        A dict mapping group keys to independent copies of the matching rows.

    Raises:
        ValueError: if both `split_by_track` and `split_by_channel` are False.
    """
    if not (split_by_track or split_by_channel):
        raise ValueError(
            "at least one of split_by_track and split_by_channel must be True"
        )

    requested = []
    if split_by_track:
        requested.append("track")
    if split_by_channel:
        requested.append("channel")
    # Group only by columns that exist, so a frame that has "channel" but no
    # "track" is split by channel instead of failing with a KeyError.
    present = [column for column in requested if column in music_df.columns]

    if not present:
        # Copy here too, so the result never aliases the caller's frame.
        return {None: music_df.copy()}

    # A single column is passed as a plain string so the keys stay scalars.
    grouping = present if len(present) > 1 else present[0]
    return {key: part.copy() for key, part in music_df.groupby(grouping)}