Module music_df.add_feature

This module contains diverse functions for adding features to music dataframes.

Functions

def add_bar_durs(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def add_bar_durs(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach a "bar_dur" column holding the length of every bar.

    Only rows whose type is "bar" receive a value; all other rows get NaN. A bar
    lasts from its own onset until the onset of the next bar. The final bar has no
    successor, so it lasts until the largest "release" value in the dataframe.
    The "release" of each bar row is then overwritten with onset + bar_dur.

    The dataframe is modified in place and also returned.

    Args:
        music_df: The dataframe to annotate. Must contain at least one bar (i.e.,
            one row with type == "bar").

    Raises:
        ValueError: If the dataframe contains no bar rows.
    """
    is_bar = music_df["type"] == "bar"
    if not is_bar.any():
        raise ValueError("Score must have at least one bar")

    bar_onsets = music_df.loc[is_bar, "onset"]

    # Read the latest release before the bar rows' "release" is rewritten below.
    final_bar_dur = music_df["release"].max() - bar_onsets.iloc[-1]

    # diff() gives onset[i] - onset[i - 1]; shifting up by one row lines each gap
    # up with the bar that opens it, leaving the last slot free for the final bar.
    durations = bar_onsets.diff().shift(-1).astype(float)
    durations.iloc[-1] = final_bar_dur

    music_df["bar_dur"] = float("nan")
    # Assign by position (numpy array) so no index alignment is involved.
    music_df.loc[is_bar, "bar_dur"] = durations.to_numpy()

    bar_rows = music_df.loc[is_bar]
    music_df.loc[is_bar, "release"] = bar_rows["onset"] + bar_rows["bar_dur"]
    return music_df

Add "bar_dur" column specifying the duration of each bar.

The "bar_dur" column will be NaN for non-bar rows. We also set the "release" column to the sum of the "onset" and "bar_dur" columns for bar rows.

Args

music_df
The dataframe to add the bar duration column to. Must have at least one bar (i.e., one row with type == "bar").
def add_default_midi_instrument(music_df: pandas.DataFrame, default_instrument: int = 0) ‑> pandas.DataFrame
Expand source code
def add_default_midi_instrument(
    music_df: pd.DataFrame,
    default_instrument: int = 0,
) -> pd.DataFrame:
    """
    Ensure every row has a "midi_instrument" value.

    If the column does not exist it is created and filled with
    `default_instrument`; if it exists, only its missing (NaN) entries are replaced.
    The dataframe is modified in place and also returned.

    Args:
        music_df: The dataframe to update.
        default_instrument: Value used wherever no instrument is present.
    """
    if "midi_instrument" in music_df.columns:
        # Keep existing assignments; only patch the gaps.
        instruments = music_df["midi_instrument"].fillna(value=default_instrument)
    else:
        instruments = default_instrument
    music_df["midi_instrument"] = instruments
    return music_df

Add default MIDI instrument where it is missing.

def add_default_time_sig(music_df: pandas.DataFrame,
default_time_signature: dict[str, int] | None = None,
keep_old_index: bool = False) ‑> pandas.DataFrame
Expand source code
def add_default_time_sig(
    music_df: pd.DataFrame,
    default_time_signature: dict[str, int] | None = None,
    keep_old_index: bool = False,
) -> pd.DataFrame:
    """
    Add default time signature to dataframes that lack them (or lack an initial one).

    >>> nan = float("nan")  # Alias to simplify below assignments

    No time signature at all:
    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [nan, 60, nan, 62],
    ...         "onset": [0, 0, 4, 4],
    ...         "release": [4, 4, 8, 5],
    ...         "type": ["bar", "note", "bar", "note"],
    ...         "other": [nan, nan, nan, nan],
    ...     }
    ... )
    >>> df
       pitch  onset  release  type  other
    0    NaN      0        4   bar    NaN
    1   60.0      0        4  note    NaN
    2    NaN      4        8   bar    NaN
    3   62.0      4        5  note    NaN
    >>> add_default_time_sig(df)
       pitch  onset  release            type                               other
    0    NaN      0      NaN  time_signature  {'numerator': 4, 'denominator': 4}
    1    NaN      0      4.0             bar                                 NaN
    2   60.0      0      4.0            note                                 NaN
    3    NaN      4      8.0             bar                                 NaN
    4   62.0      4      5.0            note                                 NaN

    Missing initial time signature:
    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [nan, 60, nan, nan, 62],
    ...         "onset": [0, 0, 4, 4, 4],
    ...         "release": [4, 4, nan, 7, 5],
    ...         "type": ["bar", "note", "time_signature", "bar", "note"],
    ...         "other": [nan, nan, {"numerator": 3, "denominator": 4}, nan, nan],
    ...     }
    ... )
    >>> df
       pitch  onset  release            type                               other
    0    NaN      0      4.0             bar                                 NaN
    1   60.0      0      4.0            note                                 NaN
    2    NaN      4      NaN  time_signature  {'numerator': 3, 'denominator': 4}
    3    NaN      4      7.0             bar                                 NaN
    4   62.0      4      5.0            note                                 NaN
    >>> add_default_time_sig(df)
       pitch  onset  release            type                               other
    0    NaN      0      NaN  time_signature  {'numerator': 4, 'denominator': 4}
    1    NaN      0      4.0             bar                                 NaN
    2   60.0      0      4.0            note                                 NaN
    3    NaN      4      NaN  time_signature  {'numerator': 3, 'denominator': 4}
    4    NaN      4      7.0             bar                                 NaN
    5   62.0      4      5.0            note                                 NaN

    No missing time signature:
    >>> df = pd.DataFrame(
    ...     {
    ...         "pitch": [nan, nan, 60, nan, nan, 62],
    ...         "onset": [0, 0, 0, 4, 4, 4],
    ...         "release": [nan, 4, 4, nan, 7, 5],
    ...         "type": [
    ...             "time_signature",
    ...             "bar",
    ...             "note",
    ...             "time_signature",
    ...             "bar",
    ...             "note",
    ...         ],
    ...         "other": [
    ...             {"numerator": 4, "denominator": 4},
    ...             nan,
    ...             nan,
    ...             {"numerator": 3, "denominator": 4},
    ...             nan,
    ...             nan,
    ...         ],
    ...     }
    ... )
    >>> df
       pitch  onset  release            type                               other
    0    NaN      0      NaN  time_signature  {'numerator': 4, 'denominator': 4}
    1    NaN      0      4.0             bar                                 NaN
    2   60.0      0      4.0            note                                 NaN
    3    NaN      4      NaN  time_signature  {'numerator': 3, 'denominator': 4}
    4    NaN      4      7.0             bar                                 NaN
    5   62.0      4      5.0            note                                 NaN
    >>> df.equals(add_default_time_sig(df))
    True
    """

    time_sig_mask = music_df.type == "time_signature"
    if time_sig_mask.any() and (
        music_df[time_sig_mask].index[0]
        <= music_df[music_df.type.isin({"note", "bar"})].index[0]
    ):
        return music_df
    column_order = music_df.columns
    if default_time_signature is None:
        default_time_signature = {"numerator": 4, "denominator": 4}

    time_sig_df = pd.DataFrame(
        {
            "onset": [0],
            "type": ["time_signature"],
            "other": [default_time_signature],
        }
    )

    if "ts_numerator" in music_df.columns:
        time_sig_df["ts_numerator"] = [default_time_signature["numerator"]]
    if "ts_denominator" in music_df.columns:
        time_sig_df["ts_denominator"] = [default_time_signature["denominator"]]

    # ensure indices are unique
    time_sig_df.index += max(music_df.index) + 1
    out_df = pd.concat([time_sig_df, music_df], axis=0)
    out_df = out_df[column_order]

    out_df = out_df.reset_index(drop=not keep_old_index)
    return out_df

Add default time signature to dataframes that lack them (or lack an initial one).

>>> nan = float("nan")  # Alias to simplify below assignments

No time signature at all:

>>> df = pd.DataFrame(
...     {
...         "pitch": [nan, 60, nan, 62],
...         "onset": [0, 0, 4, 4],
...         "release": [4, 4, 8, 5],
...         "type": ["bar", "note", "bar", "note"],
...         "other": [nan, nan, nan, nan],
...     }
... )
>>> df
   pitch  onset  release  type  other
0    NaN      0        4   bar    NaN
1   60.0      0        4  note    NaN
2    NaN      4        8   bar    NaN
3   62.0      4        5  note    NaN
>>> add_default_time_sig(df)
   pitch  onset  release            type                               other
0    NaN      0      NaN  time_signature  {'numerator': 4, 'denominator': 4}
1    NaN      0      4.0             bar                                 NaN
2   60.0      0      4.0            note                                 NaN
3    NaN      4      8.0             bar                                 NaN
4   62.0      4      5.0            note                                 NaN

Missing initial time signature:

>>> df = pd.DataFrame(
...     {
...         "pitch": [nan, 60, nan, nan, 62],
...         "onset": [0, 0, 4, 4, 4],
...         "release": [4, 4, nan, 7, 5],
...         "type": ["bar", "note", "time_signature", "bar", "note"],
...         "other": [nan, nan, {"numerator": 3, "denominator": 4}, nan, nan],
...     }
... )
>>> df
   pitch  onset  release            type                               other
0    NaN      0      4.0             bar                                 NaN
1   60.0      0      4.0            note                                 NaN
2    NaN      4      NaN  time_signature  {'numerator': 3, 'denominator': 4}
3    NaN      4      7.0             bar                                 NaN
4   62.0      4      5.0            note                                 NaN
>>> add_default_time_sig(df)
   pitch  onset  release            type                               other
0    NaN      0      NaN  time_signature  {'numerator': 4, 'denominator': 4}
1    NaN      0      4.0             bar                                 NaN
2   60.0      0      4.0            note                                 NaN
3    NaN      4      NaN  time_signature  {'numerator': 3, 'denominator': 4}
4    NaN      4      7.0             bar                                 NaN
5   62.0      4      5.0            note                                 NaN

No missing time signature:

>>> df = pd.DataFrame(
...     {
...         "pitch": [nan, nan, 60, nan, nan, 62],
...         "onset": [0, 0, 0, 4, 4, 4],
...         "release": [nan, 4, 4, nan, 7, 5],
...         "type": [
...             "time_signature",
...             "bar",
...             "note",
...             "time_signature",
...             "bar",
...             "note",
...         ],
...         "other": [
...             {"numerator": 4, "denominator": 4},
...             nan,
...             nan,
...             {"numerator": 3, "denominator": 4},
...             nan,
...             nan,
...         ],
...     }
... )
>>> df
   pitch  onset  release            type                               other
0    NaN      0      NaN  time_signature  {'numerator': 4, 'denominator': 4}
1    NaN      0      4.0             bar                                 NaN
2   60.0      0      4.0            note                                 NaN
3    NaN      4      NaN  time_signature  {'numerator': 3, 'denominator': 4}
4    NaN      4      7.0             bar                                 NaN
5   62.0      4      5.0            note                                 NaN
>>> df.equals(add_default_time_sig(df))
True
def add_default_velocity(music_df: pandas.DataFrame, default_velocity: int = 96) ‑> pandas.DataFrame
Expand source code
def add_default_velocity(
    music_df: pd.DataFrame, default_velocity: int = 96
) -> pd.DataFrame:
    """
    Ensure every row has a "velocity" value.

    An absent "velocity" column is created and filled with `default_velocity`; an
    existing one only has its missing (NaN) entries replaced. The dataframe is
    modified in place and also returned.

    Args:
        music_df: The dataframe to update.
        default_velocity: Value used wherever no velocity is present.
    """
    if "velocity" in music_df.columns:
        # Preserve velocities that are already set; fill only the gaps.
        music_df["velocity"] = music_df["velocity"].fillna(value=default_velocity)
        return music_df

    music_df["velocity"] = default_velocity
    return music_df

Add default velocity where it is missing.

def add_enharmonic_key_signature_from_key(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def add_enharmonic_key_signature_from_key(music_df: pd.DataFrame) -> pd.DataFrame:
    """Add "enh_key_signature" column: the key signature folded into -5..6.

    An "enharmonic" key signature is between -5 (5 flats) and 6 (6 sharps). For
    example, F# and Gb have different key signatures (6 and -6 respectively), but
    the same enharmonic key signature (6).

    If the dataframe has no "key_signature" column yet, it is first computed from
    the "key" column. The dataframe is modified in place and also returned.

    >>> csv_table = '''
    ... type,pitch,key
    ... bar,,
    ... note,60,B#
    ... note,69,b-
    ... note,68,F#
    ... note,68,Gb
    ... note,70,Cbb
    ... '''
    >>> df = pd.read_csv(io.StringIO(csv_table.strip()))
    >>> add_enharmonic_key_signature_from_key(df)
       type  pitch  key  key_signature  enh_key_signature
    0   bar    NaN  NaN            NaN                NaN
    1  note   60.0   B#           12.0                0.0
    2  note   69.0   b-            2.0                2.0
    3  note   68.0   F#            6.0                6.0
    4  note   68.0   Gb           -6.0                6.0
    5  note   70.0  Cbb          -14.0               -2.0
    """
    if "key_signature" not in music_df.columns:
        music_df = add_key_signature_from_key(music_df)

    # Python-style modulo maps every signature into 0..11 (negatives included);
    # values above 6 are then moved down an octave of fifths into -5..-1.
    wrapped = music_df["key_signature"] % 12
    music_df["enh_key_signature"] = wrapped.where(wrapped <= 6, wrapped - 12)
    return music_df

An "enharmonic" key signature is between -5 (5 flats) and 6 (6 sharps).

For example, F# and Gb have different key signatures (6 and -6 respectively), but the same enharmonic key signature (6).

>>> csv_table = '''
... type,pitch,key
... bar,,
... note,60,B#
... note,69,b-
... note,68,F#
... note,68,Gb
... note,70,Cbb
... '''
>>> df = pd.read_csv(io.StringIO(csv_table.strip()))
>>> add_enharmonic_key_signature_from_key(df)
   type  pitch  key  key_signature  enh_key_signature
0   bar    NaN  NaN            NaN                NaN
1  note   60.0   B#           12.0                0.0
2  note   69.0   b-            2.0                2.0
3  note   68.0   F#            6.0                6.0
4  note   68.0   Gb           -6.0                6.0
5  note   70.0  Cbb          -14.0               -2.0
def add_key_signature(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def add_key_signature(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add "key_signature" column specifying the key signature of each row.

    The dataframe must have either:
        - a "key_pc" column specifying the pitch-class of the key, and a "mode" column
          specifying the mode (M or m), or
        - a "key" column specifying the key.

    When both alternatives are available, the "key_pc"/"mode" pair takes
    precedence.

    Args:
        music_df: The dataframe to add the key signature column to.

    Raises:
        ValueError: If the dataframe has neither both "key_pc" and "mode" columns
            nor a "key" column.

    >>> csv_table = '''
    ... type,pitch,key_pc,mode
    ... bar,,
    ... note,60,0,M
    ... note,69,9,m
    ... note,68,9,M
    ... note,68,8,m
    ... note,70,3,m
    ... '''
    >>> df = pd.read_csv(io.StringIO(csv_table.strip()))
    >>> add_key_signature(df)
       type  pitch  key_pc mode  key_signature
    0   bar    NaN     NaN  NaN            NaN
    1  note   60.0     0.0    M            0.0
    2  note   69.0     9.0    m            0.0
    3  note   68.0     9.0    M            3.0
    4  note   68.0     8.0    m            5.0
    5  note   70.0     3.0    m            6.0

    >>> csv_table = '''
    ... type,pitch,key
    ... bar,,
    ... note,60,C
    ... note,69,a
    ... note,68,ab
    ... note,68,Ab
    ... note,70,Cbb
    ... '''
    >>> df = pd.read_csv(io.StringIO(csv_table.strip()))
    >>> add_key_signature(df)
       type  pitch  key  key_signature
    0   bar    NaN  NaN            NaN
    1  note   60.0    C            0.0
    2  note   69.0    a            0.0
    3  note   68.0   ab           -7.0
    4  note   68.0   Ab           -4.0
    5  note   70.0  Cbb          -14.0
    """
    if "mode" in music_df.columns and "key_pc" in music_df.columns:
        return add_key_signature_from_pc_and_mode(music_df)
    elif "key" in music_df.columns:
        return add_key_signature_from_key(music_df)
    else:
        # Tell the caller what is missing instead of raising a bare ValueError
        raise ValueError(
            'music_df must have either "key_pc" and "mode" columns or a "key" '
            f"column; found columns: {list(music_df.columns)}"
        )

Add "key_signature" column specifying the key signature of each row.

The dataframe must have either (a) a "key_pc" column specifying the pitch-class of the key together with a "mode" column specifying the mode (M or m), or (b) a "key" column specifying the key.

>>> csv_table = '''
... type,pitch,key_pc,mode
... bar,,
... note,60,0,M
... note,69,9,m
... note,68,9,M
... note,68,8,m
... note,70,3,m
... '''
>>> df = pd.read_csv(io.StringIO(csv_table.strip()))
>>> add_key_signature(df)
   type  pitch  key_pc mode  key_signature
0   bar    NaN     NaN  NaN            NaN
1  note   60.0     0.0    M            0.0
2  note   69.0     9.0    m            0.0
3  note   68.0     9.0    M            3.0
4  note   68.0     8.0    m            5.0
5  note   70.0     3.0    m            6.0
>>> csv_table = '''
... type,pitch,key
... bar,,
... note,60,C
... note,69,a
... note,68,ab
... note,68,Ab
... note,70,Cbb
... '''
>>> df = pd.read_csv(io.StringIO(csv_table.strip()))
>>> add_key_signature(df)
   type  pitch  key  key_signature
0   bar    NaN  NaN            NaN
1  note   60.0    C            0.0
2  note   69.0    a            0.0
3  note   68.0   ab           -7.0
4  note   68.0   Ab           -4.0
5  note   70.0  Cbb          -14.0
def add_key_signature_from_key(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def add_key_signature_from_key(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive a "key_signature" column from the "key" column.

    Sharps are positive, flats are negative. Rows whose "key" is missing get NaN.
    The dataframe is modified in place and also returned.

    >>> csv_table = '''
    ... type,pitch,key
    ... bar,,
    ... note,60,C
    ... note,69,a
    ... note,68,ab
    ... note,68,Ab
    ... note,70,Cbb
    ... '''
    >>> df = pd.read_csv(io.StringIO(csv_table.strip()))
    >>> add_key_signature_from_key(df)
       type  pitch  key  key_signature
    0   bar    NaN  NaN            NaN
    1  note   60.0    C            0.0
    2  note   69.0    a            0.0
    3  note   68.0   ab           -7.0
    4  note   68.0   Ab           -4.0
    5  note   70.0  Cbb          -14.0
    """
    has_key = music_df["key"].notna()
    # Only rows that actually name a key are converted; the rest stay NaN.
    signatures = music_df.loc[has_key, "key"].apply(_key_signature_from_key)

    music_df["key_signature"] = float("nan")
    music_df.loc[has_key, "key_signature"] = signatures
    return music_df

Add "key_signature" column specifying the key signature of each row.

Sharps are positive, flats are negative.

>>> csv_table = '''
... type,pitch,key
... bar,,
... note,60,C
... note,69,a
... note,68,ab
... note,68,Ab
... note,70,Cbb
... '''
>>> df = pd.read_csv(io.StringIO(csv_table.strip()))
>>> add_key_signature_from_key(df)
   type  pitch  key  key_signature
0   bar    NaN  NaN            NaN
1  note   60.0    C            0.0
2  note   69.0    a            0.0
3  note   68.0   ab           -7.0
4  note   68.0   Ab           -4.0
5  note   70.0  Cbb          -14.0
def add_key_signature_from_pc_and_mode(music_df: pandas.DataFrame,
added_col_name: str = 'key_signature',
pc_col_name: str = 'key_pc',
mode_col_name: str = 'mode') ‑> pandas.DataFrame
Expand source code
def add_key_signature_from_pc_and_mode(
    music_df: pd.DataFrame,
    added_col_name: str = "key_signature",
    pc_col_name: str = "key_pc",
    mode_col_name: str = "mode",
) -> pd.DataFrame:
    """
    Compute a key signature for each row from its key pitch-class and mode.

    Because pitch-classes are enharmonic, key signatures are enharmonic as well
    (between -5 and 6). In other words, both F# and Gb will be indicated by key_pc=6
    and mode="M", so they can't be distinguished from each other.

    The dataframe is modified in place and also returned.

    Args:
        music_df: The dataframe to add the column to.
        added_col_name: Name of the column that receives the key signatures.
        pc_col_name: Name of the column holding the key pitch-class.
        mode_col_name: Name of the column holding the mode.

    >>> csv_table = '''
    ... type,pitch,key_pc,mode
    ... bar,,
    ... note,60,0,M
    ... note,69,9,m
    ... note,68,9,M
    ... note,68,8,m
    ... note,70,3,m
    ... '''
    >>> df = pd.read_csv(io.StringIO(csv_table.strip()))
    >>> add_key_signature_from_pc_and_mode(df)
       type  pitch  key_pc mode  key_signature
    0   bar    NaN     NaN  NaN            NaN
    1  note   60.0     0.0    M            0.0
    2  note   69.0     9.0    m            0.0
    3  note   68.0     9.0    M            3.0
    4  note   68.0     8.0    m            5.0
    5  note   70.0     3.0    m            6.0
    """

    def signature_for_row(row):
        # Forward the configured column names to the row-level helper.
        return _key_signature_from_pc_and_mode(
            row, pc_col_name=pc_col_name, mode_col_name=mode_col_name
        )

    music_df[added_col_name] = music_df.apply(signature_for_row, axis=1)
    return music_df

Add a key signature column inferred from the key pitch-class and mode.

Because pitch-classes are enharmonic, key signatures are enharmonic as well (between -5 and 6). In other words, both F# and Gb will be indicated by key_pc=6 and mode="M", so they can't be distinguished from each other.

>>> csv_table = '''
... type,pitch,key_pc,mode
... bar,,
... note,60,0,M
... note,69,9,m
... note,68,9,M
... note,68,8,m
... note,70,3,m
... '''
>>> df = pd.read_csv(io.StringIO(csv_table.strip()))
>>> add_key_signature_from_pc_and_mode(df)
   type  pitch  key_pc mode  key_signature
0   bar    NaN     NaN  NaN            NaN
1  note   60.0     0.0    M            0.0
2  note   69.0     9.0    m            0.0
3  note   68.0     9.0    M            3.0
4  note   68.0     8.0    m            5.0
5  note   70.0     3.0    m            6.0
def add_scale_degrees(music_df: pandas.DataFrame)
Expand source code
def add_scale_degrees(music_df: pd.DataFrame):
    """
    Add "scale_degree" column giving each note's scale degree as a string.

    The degree is derived from the note's "spelling" relative to its "key", with
    any alteration written as a prefix of "#" or "b" characters. Rows that are not
    notes get the string "na". The dataframe is modified in place and also
    returned. See examples below.

    >>> df = pd.DataFrame(
    ...     {
    ...         # we omit all other columns
    ...         # (Malcolm 2023-12-22) Note that "bb" is not supported by music21 so
    ...         #   we handle it separately.
    ...         "type": ["bar"] + ["note"] * 9,
    ...         "spelling": [
    ...             float("nan"),
    ...             "Db",
    ...             "F",
    ...             "Gb",
    ...             "C",
    ...             "C#",
    ...             "C##",
    ...             "Fb",
    ...             "F--",
    ...             "Fbb",
    ...         ],
    ...         "key": ["na"] + ["Gb"] * 9,
    ...     }
    ... )
    >>> add_scale_degrees(df)
       type spelling key scale_degree
    0   bar      NaN  na           na
    1  note       Db  Gb            5
    2  note        F  Gb            7
    3  note       Gb  Gb            1
    4  note        C  Gb           #4
    5  note       C#  Gb          ##4
    6  note      C##  Gb         ###4
    7  note       Fb  Gb           b7
    8  note      F--  Gb          bb7
    9  note      Fbb  Gb          bb7

    There is an issue with some scale degrees turning up as floats when reading saved
    CSVs later. To avoid this, we replace NaN with "na" string.
    >>> df = pd.DataFrame(
    ...     {
    ...         # we omit all other columns
    ...         "type": ["bar"] + ["note"] * 2,
    ...         "spelling": [float("nan"), "C", "F"],
    ...         "key": ["na"] + ["C"] * 2,
    ...     }
    ... )
    >>> scale_degrees_df = add_scale_degrees(df)
    >>> scale_degrees_df
       type spelling key scale_degree
    0   bar      NaN  na           na
    1  note        C   C            1
    2  note        F   C            4
    >>> from io import StringIO
    >>> output = StringIO()
    >>> df.to_csv(output)
    >>> csv_str = output.getvalue()
    >>> input_ = StringIO(csv_str)
    >>> df2 = pd.read_csv(input_)
    >>> df2
       Unnamed: 0  type spelling key scale_degree
    0           0   bar      NaN  na           na
    1           1  note        C   C            1
    2           2  note        F   C            4

    Checking minor key behavior
    >>> df = pd.DataFrame(
    ...     {
    ...         # we omit all other columns
    ...         "type": ["bar"] + ["note"] * 10,
    ...         "spelling": [
    ...             float("nan"),
    ...             "E",
    ...             "Fb",
    ...             "F",
    ...             "F#",
    ...             "Gb",
    ...             "G",
    ...             "G#",
    ...             "Ab",
    ...             "G##",
    ...             "A",
    ...         ],
    ...         "key": ["na"] + ["a"] * 10,
    ...     }
    ... )
    >>> add_scale_degrees(df)
        type spelling key scale_degree
    0    bar      NaN  na           na
    1   note        E   a            5
    2   note       Fb   a           b6
    3   note        F   a            6
    4   note       F#   a           #6
    5   note       Gb   a           b7
    6   note        G   a            7
    7   note       G#   a           #7
    8   note       Ab   a           b1
    9   note      G##   a          ##7
    10  note        A   a            1
    """
    from music21.key import Key
    from music21.pitch import Pitch

    assert "spelling" in music_df.columns
    assert "key" in music_df.columns

    # (Malcolm 2023-12-22) we could save a little time caching keys globally
    key_cache = {}
    # (spelling, key) -> scale-degree string, computed once per distinct pair
    degree_lookup = {}

    for pair, _group in music_df.groupby(["spelling", "key"]):
        spelling, key = pair

        key_obj = key_cache.get(key)
        if key_obj is None:
            key_obj = Key(key)
            key_cache[key] = key_obj

        # music21 writes flats as "-". The first character is the note letter and
        # is left untouched so that a "b" there is never mistaken for a flat.
        m21_name = spelling[0] + spelling[1:].replace("b", "-")
        degree_number, accidental = key_obj.getScaleDegreeAndAccidentalFromPitch(
            Pitch(m21_name)
        )

        # Translate music21's "-" back to "b" for the alteration prefix.
        if accidental is None:
            prefix = ""
        else:
            prefix = accidental.modifier.replace("-", "b")
        degree_lookup[pair] = f"{prefix}{degree_number}"

    def lookup_row(row):
        return degree_lookup[(row.spelling, row.key)]

    is_note = music_df.type == "note"
    # Use the string "na" rather than NaN for non-note rows (see docstring)
    music_df["scale_degree"] = "na"
    music_df.loc[is_note, "scale_degree"] = music_df.loc[is_note].apply(
        lookup_row, axis=1, result_type=None
    )

    return music_df

Add "scale_degree" column specifying the scale degree of each note.

The scale degree is inferred from the note's spelling and key. See examples below.

>>> df = pd.DataFrame(
...     {
...         # we omit all other columns
...         # (Malcolm 2023-12-22) Note that "bb" is not supported by music21 so
...         #   we handle it separately.
...         "type": ["bar"] + ["note"] * 9,
...         "spelling": [
...             float("nan"),
...             "Db",
...             "F",
...             "Gb",
...             "C",
...             "C#",
...             "C##",
...             "Fb",
...             "F--",
...             "Fbb",
...         ],
...         "key": ["na"] + ["Gb"] * 9,
...     }
... )
>>> add_scale_degrees(df)
   type spelling key scale_degree
0   bar      NaN  na           na
1  note       Db  Gb            5
2  note        F  Gb            7
3  note       Gb  Gb            1
4  note        C  Gb           #4
5  note       C#  Gb          ##4
6  note      C##  Gb         ###4
7  note       Fb  Gb           b7
8  note      F--  Gb          bb7
9  note      Fbb  Gb          bb7

There is an issue with some scale degrees turning up as floats when reading saved CSVs later. To avoid this, we replace NaN with "na" string.

>>> df = pd.DataFrame(
...     {
...         # we omit all other columns
...         "type": ["bar"] + ["note"] * 2,
...         "spelling": [float("nan"), "C", "F"],
...         "key": ["na"] + ["C"] * 2,
...     }
... )
>>> scale_degrees_df = add_scale_degrees(df)
>>> scale_degrees_df
   type spelling key scale_degree
0   bar      NaN  na           na
1  note        C   C            1
2  note        F   C            4
>>> from io import StringIO
>>> output = StringIO()
>>> df.to_csv(output)
>>> csv_str = output.getvalue()
>>> input_ = StringIO(csv_str)
>>> df2 = pd.read_csv(input_)
>>> df2
   Unnamed: 0  type spelling key scale_degree
0           0   bar      NaN  na           na
1           1  note        C   C            1
2           2  note        F   C            4

Checking minor key behavior

>>> df = pd.DataFrame(
...     {
...         # we omit all other columns
...         "type": ["bar"] + ["note"] * 10,
...         "spelling": [
...             float("nan"),
...             "E",
...             "Fb",
...             "F",
...             "F#",
...             "Gb",
...             "G",
...             "G#",
...             "Ab",
...             "G##",
...             "A",
...         ],
...         "key": ["na"] + ["a"] * 10,
...     }
... )
>>> add_scale_degrees(df)
    type spelling key scale_degree
0    bar      NaN  na           na
1   note        E   a            5
2   note       Fb   a           b6
3   note        F   a            6
4   note       F#   a           #6
5   note       Gb   a           b7
6   note        G   a            7
7   note       G#   a           #7
8   note       Ab   a           b1
9   note      G##   a          ##7
10  note        A   a            1
def add_time_sig_dur(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def add_time_sig_dur(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add "time_sig_dur" column specifying the quarter duration of each time signature.

    The value is computed on "time_signature" rows and then forward-filled, so each
    following row carries the duration of the most recent time signature. Rows
    before the first time signature stay NaN. The dataframe is modified in place
    and also returned.
    """
    music_df["time_sig_dur"] = float("nan")

    is_time_sig = music_df["type"] == "time_signature"
    time_sig_durs = music_df[is_time_sig].apply(_time_sig_dur_from_row, axis=1)
    music_df.loc[is_time_sig, "time_sig_dur"] = time_sig_durs

    # Propagate each time signature's duration down to the rows it governs.
    music_df["time_sig_dur"] = music_df["time_sig_dur"].ffill()
    return music_df

Add "time_sig_dur" column specifying the quarter duration of each time signature.

def concatenate_features(df: pandas.DataFrame, features: Iterable[str]) ‑> pandas.DataFrame
Expand source code
def concatenate_features(df: pd.DataFrame, features: Iterable[str]) -> pd.DataFrame:
    """
    Create a new feature by concatenating the values of the given features.

    The new column is named by joining the feature names with "_". Each value is
    the string concatenation of the row's feature values. Rows where any of the
    features is NaN or the string "na" get "na" instead. The dataframe is modified
    in place and also returned.

    Args:
        df: The dataframe to add the concatenated column to.
        features: Names of the columns to concatenate, in order. Any iterable
            (list, tuple, generator, ...) is accepted.

    >>> csv_table = '''
    ... type,pitch,onset,release,foo,bar
    ... bar,,0.0,4.0,,
    ... note,60,0.0,0.5,a,1.0
    ... note,60,0.0,1.5,b,2.0
    ... note,60,1.0,2.0,c,3.0
    ... note,60,2.0,3.0,d,4.0
    ... bar,,4.0,8.0,,
    ... '''
    >>> df = pd.read_csv(io.StringIO(csv_table.strip()))
    >>> df
       type  pitch  onset  release  foo  bar
    0   bar    NaN    0.0      4.0  NaN  NaN
    1  note   60.0    0.0      0.5    a  1.0
    2  note   60.0    0.0      1.5    b  2.0
    3  note   60.0    1.0      2.0    c  3.0
    4  note   60.0    2.0      3.0    d  4.0
    5   bar    NaN    4.0      8.0  NaN  NaN
    >>> concatenate_features(df, ["foo", "bar"])
       type  pitch  onset  release  foo  bar foo_bar
    0   bar    NaN    0.0      4.0  NaN  NaN      na
    1  note   60.0    0.0      0.5    a  1.0    a1.0
    2  note   60.0    0.0      1.5    b  2.0    b2.0
    3  note   60.0    1.0      2.0    c  3.0    c3.0
    4  note   60.0    2.0      3.0    d  4.0    d4.0
    5   bar    NaN    4.0      8.0  NaN  NaN      na
    """
    # Materialize once: a generator would be exhausted by the join below, and a
    # tuple would be interpreted by df[...] as a single (multi-level) column key.
    features = list(features)
    concat_feature_name = "_".join(features)
    assert concat_feature_name not in df.columns
    selected = df[features]
    # Summing string columns row-wise concatenates them left to right
    df[concat_feature_name] = selected.astype(str).sum(axis=1)
    df.loc[
        ((selected.isna()) | (selected == "na")).any(axis=1),
        concat_feature_name,
    ] = "na"
    return df

Create a new feature by concatenating the values of the given features.

>>> csv_table = '''
... type,pitch,onset,release,foo,bar
... bar,,0.0,4.0,,
... note,60,0.0,0.5,a,1.0
... note,60,0.0,1.5,b,2.0
... note,60,1.0,2.0,c,3.0
... note,60,2.0,3.0,d,4.0
... bar,,4.0,8.0,,
... '''
>>> df = pd.read_csv(io.StringIO(csv_table.strip()))
>>> df
   type  pitch  onset  release  foo  bar
0   bar    NaN    0.0      4.0  NaN  NaN
1  note   60.0    0.0      0.5    a  1.0
2  note   60.0    0.0      1.5    b  2.0
3  note   60.0    1.0      2.0    c  3.0
4  note   60.0    2.0      3.0    d  4.0
5   bar    NaN    4.0      8.0  NaN  NaN
>>> concatenate_features(df, ["foo", "bar"])
   type  pitch  onset  release  foo  bar foo_bar
0   bar    NaN    0.0      4.0  NaN  NaN      na
1  note   60.0    0.0      0.5    a  1.0    a1.0
2  note   60.0    0.0      1.5    b  2.0    b2.0
3  note   60.0    1.0      2.0    c  3.0    c3.0
4  note   60.0    2.0      3.0    d  4.0    d4.0
5   bar    NaN    4.0      8.0  NaN  NaN      na
def decompose_scale_degrees(music_df: pandas.DataFrame, max_alteration: int = 2)
Expand source code
def decompose_scale_degrees(music_df: pd.DataFrame, max_alteration: int = 2):
    """
    Split the "scale_degree" column into a numeric step and an accidental prefix.

    Two string columns are written to the dataframe in place:
    "scale_degree_step" (the digits of the scale degree) and
    "scale_degree_alteration" (whatever precedes the digits). Rows whose type is
    not "note" hold "na" in both columns.

    For example,
       - the scale degree 5 has step 5 and alteration "_"
       - the scale degree #4 has step 4 and alteration "#"
       - the scale degree bb7 has step 7 and alteration "bb"

    Args:
        music_df: The dataframe whose "scale_degree" column is decomposed. Every
            note row must hold a string containing at least one digit.
        max_alteration: The largest number of accidental characters that is kept
            verbatim. Longer alterations are replaced by "x".

    Returns:
        The same dataframe object, with the two new columns.

    >>> df = pd.DataFrame(
    ...     {
    ...         # we omit all other columns
    ...         # (Malcolm 2023-12-22) Note that "bb" is not supported by music21 so
    ...         #   we handle it separately.
    ...         "type": ["bar"] + ["note"] * 9,
    ...         "spelling": [
    ...             float("nan"),
    ...             "Db",
    ...             "F",
    ...             "Gb",
    ...             "C",
    ...             "C#",
    ...             "C##",
    ...             "Fb",
    ...             "F--",
    ...             "Fbb",
    ...         ],
    ...         "key": ["na"] + ["Gb"] * 9,
    ...     }
    ... )
    >>> df = add_scale_degrees(df)
    >>> decompose_scale_degrees(df)
       type spelling key scale_degree scale_degree_step scale_degree_alteration
    0   bar      NaN  na           na                na                      na
    1  note       Db  Gb            5                 5                       _
    2  note        F  Gb            7                 7                       _
    3  note       Gb  Gb            1                 1                       _
    4  note        C  Gb           #4                 4                       #
    5  note       C#  Gb          ##4                 4                      ##
    6  note      C##  Gb         ###4                 4                       x
    7  note       Fb  Gb           b7                 7                       b
    8  note      F--  Gb          bb7                 7                      bb
    9  note      Fbb  Gb          bb7                 7                      bb
    """

    def _step_of(degree):
        # First run of digits anywhere in the scale degree
        return re.search(r"\d+", degree).group()  # type:ignore

    def _alteration_of(degree):
        # Leading run of non-digit characters (possibly empty)
        prefix = re.search(r"\D*", degree).group()  # type:ignore
        if len(prefix) > max_alteration:
            return "x"
        # An unaltered degree is written as "_" rather than the empty string
        return prefix or "_"

    # Non-note rows keep this placeholder in both columns
    music_df["scale_degree_step"] = "na"
    music_df["scale_degree_alteration"] = "na"

    note_rows = music_df.type == "note"
    note_degrees = music_df.loc[note_rows, "scale_degree"]

    music_df.loc[note_rows, "scale_degree_step"] = note_degrees.apply(_step_of)
    music_df.loc[note_rows, "scale_degree_alteration"] = note_degrees.apply(
        _alteration_of
    )
    return music_df

Decompose "scale_degree" into "scale_degree_step" and "scale_degree_alteration".

For example:

- the scale degree 5 has step 5 and alteration "_"
- the scale degree #4 has step 4 and alteration "#"
- the scale degree bb7 has step 7 and alteration "bb"

Args

music_df
The dataframe to decompose the scale degree column of.
max_alteration
The maximum number of accidentals to allow. If the alteration is greater than this, it is set to "x".
>>> df = pd.DataFrame(
...     {
...         # we omit all other columns
...         # (Malcolm 2023-12-22) Note that "bb" is not supported by music21 so
...         #   we handle it separately.
...         "type": ["bar"] + ["note"] * 9,
...         "spelling": [
...             float("nan"),
...             "Db",
...             "F",
...             "Gb",
...             "C",
...             "C#",
...             "C##",
...             "Fb",
...             "F--",
...             "Fbb",
...         ],
...         "key": ["na"] + ["Gb"] * 9,
...     }
... )
>>> df = add_scale_degrees(df)
>>> decompose_scale_degrees(df)
   type spelling key scale_degree scale_degree_step scale_degree_alteration
0   bar      NaN  na           na                na                      na
1  note       Db  Gb            5                 5                       _
2  note        F  Gb            7                 7                       _
3  note       Gb  Gb            1                 1                       _
4  note        C  Gb           #4                 4                       #
5  note       C#  Gb          ##4                 4                      ##
6  note      C##  Gb         ###4                 4                       x
7  note       Fb  Gb           b7                 7                       b
8  note      F--  Gb          bb7                 7                      bb
9  note      Fbb  Gb          bb7                 7                      bb
def explicit_instruments_to_program_changes(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def explicit_instruments_to_program_changes(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a "program_change" row wherever the instrument of a (track, channel) changes.

    This is a sort of inverse of make_instruments_explicit. Within each
    (track, channel) group, every row whose "midi_instrument" differs from the
    preceding row's value (which includes the first row of the group) yields one
    new row of type "program_change" at that row's onset, with the instrument
    stored under the "program" key of the "other" dict.

    Args:
        music_df: The dataframe to read. It is not modified.

    Returns:
        A new dataframe holding the original rows plus the program changes,
        passed through sort_df.

    Raises:
        ValueError: if the dataframe has no "midi_instrument" column.
    """
    if "midi_instrument" not in music_df.columns:
        raise ValueError("midi_instrument column not found")

    new_rows = []
    for (track, channel), group in music_df.groupby(["track", "channel"]):
        # We rely on the order of the dataframe being preserved by groupby, see
        #  https://stackoverflow.com/a/26465555/10155119
        instruments = group.midi_instrument
        starts_new_instrument = instruments != instruments.shift(1)
        new_rows.extend(
            pd.Series(
                {
                    "type": "program_change",
                    "track": track,
                    "channel": channel,
                    "onset": first_row.onset,
                    "other": {"program": first_row.midi_instrument},
                }
            )
            for _, first_row in group[starts_new_instrument].iterrows()
        )

    combined = pd.concat([music_df, pd.DataFrame(new_rows)])
    return sort_df(combined)

This is a sort of inverse of make_instruments_explicit.

def get_bar_relative_onset(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def get_bar_relative_onset(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add "bar_relative_onset" column specifying the offset of each row from the onset
    of the bar.

    For example, if a note has onset 4.5 and the preceding bar has onset 3, then the
    bar relative onset is 1.5.

    Rows that precede the first bar are measured from 0. No note may precede the
    first bar.

    Args:
        music_df: The dataframe to add the bar relative onset column to. Must have
            at least one bar (i.e., one row with type == "bar"). The column is
            added in place; no other column is added or removed.

    Returns:
        The same dataframe, with the "bar_relative_onset" column added.

    Raises:
        ValueError: if the dataframe contains no bar rows.
    """
    bar_mask = music_df.type == "bar"
    # `len(bar_mask)` would only count rows; what matters is whether any row is a bar
    if not bar_mask.any():
        raise ValueError("No bars found")

    # Onset of the most recent bar at or before each row (NaN before the first bar).
    # Kept as a local series so that no helper column leaks into the caller's frame.
    bar_onset = music_df.onset.where(bar_mask).ffill()

    # No notes should have null values
    assert not (music_df.type[bar_onset.isnull()] == "note").any()
    bar_onset = bar_onset.fillna(value=0)

    music_df["bar_relative_onset"] = music_df.onset - bar_onset
    return music_df

Add "bar_relative_onset" column specifying the offset of each row from the onset of the bar.

For example, if a note has onset 4.5 and the preceding bar has onset 3, then the bar relative onset is 1.5.

Args

music_df
The dataframe to add the bar relative onset column to.
def infer_barlines(music_df: pandas.DataFrame, keep_old_index: bool = False) ‑> pandas.DataFrame
Expand source code
def infer_barlines(
    music_df: pd.DataFrame, keep_old_index: bool = False
) -> pd.DataFrame:
    """
    Insert "bar" rows at the positions implied by the time signatures.

    Each time signature governs the span from its own onset up to the onset of the
    next time signature (or, for the last one, up to the maximum release in the
    dataframe). Within a span, bar onsets are spaced by the time signature
    duration, starting at the span start and excluding the span end. Each bar's
    release is the next bar's onset; the final bar is released at the maximum
    release.

    Args:
        music_df: The dataframe to add barlines to. It must contain at least one
            note and a time signature whose onset is not later than the first
            note's onset (a default time signature is not implemented).
        keep_old_index: If True, the pre-reset index is kept as a column by
            reset_index; otherwise it is discarded.

    Returns:
        A new dataframe with the bar rows added, sorted by sort_df.
    """
    time_sigs = [
        row for _, row in music_df[music_df.type == "time_signature"].iterrows()
    ]

    assert time_sigs and (
        time_sigs[0].onset <= music_df[music_df.type == "note"].iloc[0].onset
    ), (
        "There is no time signature before the first note; default time signature not yet implemented"
    )

    final_release = music_df.release.max()
    # Every span ends where the next time signature starts; the last one runs to
    # the end of the score
    span_ends = [later_sig.onset for later_sig in time_sigs[1:]] + [final_release]

    onsets_per_span = [
        np.arange(
            time_sig.onset,
            span_end,
            step=_time_sig_dur(_to_dict_if_necessary(time_sig.other)),
        )
        for time_sig, span_end in zip(time_sigs, span_ends)
    ]
    barline_onsets = np.concatenate(onsets_per_span)
    barline_releases = np.concatenate(
        [barline_onsets[1:], [final_release]]  # type:ignore
    )
    barlines = pd.DataFrame(
        {"onset": barline_onsets, "release": barline_releases}
    ).assign(type="bar")

    # Ensure that index values will be unique
    barlines.index += max(music_df.index) + 1

    merged = sort_df(pd.concat([music_df, barlines]), ignore_index=False)
    return merged.reset_index(drop=not keep_old_index)
def instruments_to_midi_instruments(music_df: pandas.DataFrame,
default_instrument: int = 0,
translation: dict[str, int] = {'acoustic grand piano': 0, 'bright acoustic piano': 1, 'electric grand piano': 2, 'honky-tonk piano': 3, 'electric piano 1': 4, 'electric piano 2': 5, 'harpsichord': 6, 'clavinet': 7, 'celesta': 8, 'glockenspiel': 9, 'music box': 10, 'vibraphone': 11, 'marimba': 12, 'xylophone': 13, 'tubular bells': 14, 'dulcimer': 15, 'drawbar organ': 16, 'percussive organ': 17, 'rock organ': 18, 'church organ': 19, 'reed organ': 20, 'accordion': 21, 'harmonica': 22, 'tango accordion': 23, 'acoustic guitar (nylon)': 24, 'acoustic guitar (steel)': 25, 'electric guitar (jazz)': 26, 'electric guitar (clean)': 27, 'electric guitar (muted)': 28, 'overdriven guitar': 29, 'distortion guitar': 30, 'guitar harmonics': 31, 'acoustic bass': 32, 'electric bass (finger)': 33, 'electric bass (pick)': 34, 'fretless bass': 35, 'slap bass 1': 36, 'slap bass 2': 37, 'synth bass 1': 38, 'synth bass 2': 39, 'violin': 40, 'viola': 41, 'cello': 42, 'contrabass': 43, 'bass': 43, 'tremolo strings': 44, 'pizzicato strings': 45, 'orchestral harp': 46, 'timpani': 47, 'string ensemble 1': 48, 'string ensemble 2': 49, 'synth strings 1': 50, 'synth strings 2': 51, 'choir aahs': 52, 'voice oohs': 53, 'synth voice': 54, 'orchestra hit': 55, 'trumpet': 56, 'trombone': 57, 'tuba': 58, 'muted trumpet': 59, 'french horn': 60, 'brass section': 61, 'synth brass 1': 62, 'synth brass 2': 63, 'soprano sax': 64, 'alto sax': 65, 'tenor sax': 66, 'baritone sax': 67, 'oboe': 68, 'english horn': 69, 'bassoon': 70, 'clarinet': 71, 'piccolo': 72, 'flute': 73, 'recorder': 74, 'pan flute': 75, 'blown bottle': 76, 'shakuhachi': 77, 'whistle': 78, 'ocarina': 79, 'lead 1 (square)': 80, 'lead 2 (sawtooth)': 81, 'lead 3 (calliope)': 82, 'lead 4 (chiff)': 83, 'lead 5 (charang)': 84, 'lead 6 (voice)': 85, 'lead 7 (fifths)': 86, 'lead 8 (bass + lead)': 87, 'pad 1 (new age)': 88, 'pad 2 (warm)': 89, 'pad 3 (polysynth)': 90, 'pad 4 (choir)': 91, 'pad 5 (bowed)': 92, 'pad 6 (metallic)': 93, 'pad 7 (halo)': 94, 
'pad 8 (sweep)': 95, 'fx 1 (rain)': 96, 'fx 2 (soundtrack)': 97, 'fx 3 (crystal)': 98, 'fx 4 (atmosphere)': 99, 'fx 5 (brightness)': 100, 'fx 6 (goblins)': 101, 'fx 7 (echoes)': 102, 'fx 8 (sci-fi)': 103, 'sitar': 104, 'banjo': 105, 'shamisen': 106, 'koto': 107, 'kalimba': 108, 'bag pipe': 109, 'fiddle': 110, 'shanai': 111, 'tinkle bell': 112, 'agogo': 113, 'steel drums': 114, 'woodblock': 115, 'taiko drum': 116, 'melodic tom': 117, 'synth drum': 118, 'reverse cymbal': 119, 'guitar fret noise': 120, 'breath noise': 121, 'seashore': 122, 'bird tweet': 123, 'telephone ring': 124, 'helicopter': 125, 'applause': 126, 'gunshot': 127},
raise_error_on_missing: bool = False) ‑> pandas.DataFrame
Expand source code
def instruments_to_midi_instruments(
    music_df: pd.DataFrame,
    default_instrument: int = 0,
    translation: dict[str, int] = NAME_TO_MIDI_INSTRUMENT,
    raise_error_on_missing: bool = False,
) -> pd.DataFrame:
    """
    Translate the string "instrument" column into a "midi_instrument" column of
    ints specifying General MIDI programs.

    (This function is not actually used anywhere, however.)

    Args:
        music_df: The dataframe to translate. It is copied, not modified.
        default_instrument: The program used for instrument names that have no
            entry in `translation`.
        translation: Mapping from instrument name to General MIDI program.
        raise_error_on_missing: If True, an instrument name without a translation
            is an error rather than falling back to `default_instrument`.

    Returns:
        A copy of the dataframe with the "midi_instrument" column added.

    Raises:
        ValueError: if `raise_error_on_missing` is set and some instrument name
            is absent from `translation`.
    """
    out_df = music_df.copy()

    if raise_error_on_missing:
        unknown = [
            name for name in out_df.instrument.unique() if name not in translation
        ]
        if unknown:
            raise ValueError(f"Missing instrument translations for {unknown}")

    def _program_for(name):
        return translation.get(name, default_instrument)

    out_df["midi_instrument"] = out_df.instrument.apply(_program_for)
    return out_df

This function sends an "instrument" column with string values to a "midi_instrument" column with ints specifying General MIDI programs. I am not actually using it anywhere, however.

def make_bar_explicit(music_df: pandas.DataFrame,
default_bar_number: int = -1,
initial_bar_number: int = 1) ‑> pandas.DataFrame
Expand source code
def make_bar_explicit(
    music_df: pd.DataFrame, default_bar_number: int = -1, initial_bar_number: int = 1
) -> pd.DataFrame:
    """
    Add "bar_number" column specifying the bar number of each row.

    Thus every row in the dataframe will have an explicit integer bar number.

    The actual bar numbering is performed by the number_bars function; the number
    of each bar is then carried forward onto the rows that follow it.

    Args:
        music_df: The dataframe to add the bar number column to. The dataframe must have
            at least one bar (i.e., one row with type == "bar").
        default_bar_number: The number to use for rows that precede the first bar.
        initial_bar_number: The number of the first bar. The convention in music is that
            the first full bar should be numbered 1. Note that this function isn't smart
            enough to distinguish pickup measures (normally numbered 0 in music
            notation).

    Returns:
        The dataframe with an integer "bar_number" column.

    Raises:
        ValueError: if the dataframe contains no bar rows.
    """
    bar_mask = music_df.type == "bar"
    # TODO: (Malcolm 2023-12-25) maybe I should use appears_to_have_pickup_measure to
    #   determine initial_bar_number?
    # `len(bar_mask)` would only count rows; what matters is whether any row is a bar
    if not bar_mask.any():
        raise ValueError("No bars found")

    music_df = number_bars(music_df, initial_bar_number)
    # The column is replaced by plain assignment on purpose: `df.loc[:, col] = ...`
    # writes into the existing float column, which would silently undo the int cast.
    music_df["bar_number"] = (
        music_df["bar_number"].ffill().fillna(value=default_bar_number).astype(int)
    )
    return music_df

Add "bar_number" column specifying the bar number of each row.

Thus every row in the dataframe will have an explicit bar number.

The actual bar numbering is performed by the number_bars function.

Args

music_df
The dataframe to add the bar number column to. The dataframe must have at least one bar (i.e., one row with type == "bar").
default_bar_number
The number to use for rows that precede the first bar.
initial_bar_number
The number of the first bar. The convention in music is that the first full bar should be numbered 1. Note that this function isn't smart enough to distinguish pickup measures (normally numbered 0 in music notation).
def make_instruments_explicit(music_df: pandas.DataFrame, default_instrument: int = 0) ‑> pandas.DataFrame
Expand source code
def make_instruments_explicit(
    music_df: pd.DataFrame, default_instrument: int = 0
) -> pd.DataFrame:
    """
    Add "midi_instrument" column specifying the MIDI instrument of each row.

    The program of every "program_change" row is read from its "other" field and
    carried forward, track by track, onto the rows that follow it. Rows still
    lacking an instrument afterwards are handled by add_default_midi_instrument.

    If the dataframe has no "track" column, only add_default_midi_instrument is
    applied.

    Args:
        music_df: The dataframe to add the MIDI instrument column to. When a
            "track" column is present, a "midi_instrument" column is also written
            to this dataframe as a by-product.
        default_instrument: The MIDI instrument to use if no program changes are
            present.

    Returns:
        A dataframe, ordered by index, with an integer "midi_instrument" column.
    """
    if "track" not in music_df.columns:
        return add_default_midi_instrument(music_df, default_instrument)

    is_program_change = music_df.type == "program_change"
    music_df["midi_instrument"] = float("nan")
    music_df.loc[is_program_change, "midi_instrument"] = [
        _to_dict_if_necessary(payload)["program"]
        for payload in music_df.other[is_program_change]
    ]

    # Propagate each program change forward within its own track only
    per_track = [
        track_df.assign(midi_instrument=track_df.midi_instrument.ffill())
        for _, track_df in music_df.groupby("track", dropna=False)
    ]

    # Undo the regrouping by track
    out_df = pd.concat(per_track).sort_index(axis=0)

    out_df = add_default_midi_instrument(out_df, default_instrument=default_instrument)
    out_df["midi_instrument"] = out_df.midi_instrument.astype(int)
    return out_df

Add "midi_instrument" column specifying the MIDI instrument of each row.

Args

music_df
The dataframe to add the MIDI instrument column to.
default_instrument
The MIDI instrument to use if no program changes are present.
def make_tempos_explicit(music_df: pandas.DataFrame, default_tempo: float) ‑> pandas.DataFrame
Expand source code
def make_tempos_explicit(music_df: pd.DataFrame, default_tempo: float) -> pd.DataFrame:
    """
    Give every row of the dataframe an explicit "tempo" value.

    If a "tempo" column already exists, only its gaps are filled. Otherwise the
    column is built from the tempo events in the dataframe: rows of type
    "set_tempo" (MIDI; the value is passed through tempo2bpm) and rows of type
    "tempo" (BPM values from musicxml etc.), both read from the "other" field.

    Each tempo is carried forward to the following rows. Rows that precede the
    first tempo, or all rows when there is no tempo at all, receive the default
    tempo.

    Args:
        music_df: The dataframe to add the tempo column to. Modified in place.
        default_tempo: The tempo to use where no tempo is in effect.

    Returns:
        The same dataframe, whose "tempo" column has no missing values.
    """
    if "tempo" not in music_df.columns:
        # No tempo column yet, so we collect the tempo events
        music_df["tempo"] = float("nan")

        # First handle midi tempi
        midi_rows = music_df.type == "set_tempo"
        music_df.loc[midi_rows, "tempo"] = [
            tempo2bpm(_to_dict_if_necessary(payload)["tempo"])
            for payload in music_df[midi_rows].other
        ]

        # Next handle BPM tempi from musicxml etc
        bpm_rows = music_df.type == "tempo"
        music_df.loc[bpm_rows, "tempo"] = [
            _to_dict_if_necessary(payload)["tempo"]
            for payload in music_df[bpm_rows].other
        ]

    # Whether the column was pre-existing or just built, make sure that it doesn't
    #   have any nans in it
    music_df["tempo"] = music_df.tempo.ffill().fillna(value=default_tempo)
    return music_df

Add "tempo" column to the dataframe.

Thus every row in the dataframe will have an explicit tempo.

If there are no tempo events in the dataframe, the tempo is set to the default tempo. The default tempo is also used for any rows that precede the first tempo.

Args

music_df
The dataframe to add the tempo column to.
default_tempo
The tempo to use if no tempo events are present.
def make_time_signatures_explicit(music_df: pandas.DataFrame,
default_time_signature: dict[str, int] | None = None) ‑> pandas.DataFrame
Expand source code
def make_time_signatures_explicit(
    music_df: pd.DataFrame, default_time_signature: dict[str, int] | None = None
) -> pd.DataFrame:
    """
    Add "ts_numerator" and "ts_denominator" columns to the dataframe.

    Thus every row in the dataframe will have an explicit time signature.

    If the dataframe already has "ts_numerator" and "ts_denominator" columns, it is
    returned unchanged.

    Args:
        music_df: The dataframe to add the time signature columns to.
        default_time_signature: The time signature to use if no time signature is
            present in the dataframe. If not provided, 4/4 is assumed.
    """
    if "ts_numerator" in music_df.columns and "ts_denominator" in music_df.columns:
        # There appears to be nothing to be done
        return music_df

    music_df = music_df.copy()

    if default_time_signature is None:
        default_time_signature = {"numerator": 4, "denominator": 4}

    time_sig_mask = music_df.type == "time_signature"
    time_sigs = music_df[music_df.type == "time_signature"]
    music_df["ts_numerator"] = float("nan")
    music_df["ts_denominator"] = float("nan")

    # krn music_df function seems to return time sigs as string representations of
    # dicts, whereas midi function returns them as dicts. Probably the latter behavior
    #   should be enforced everywhere.
    music_df.loc[time_sig_mask, "ts_numerator"] = [
        _to_dict_if_necessary(d)["numerator"] for _, d in time_sigs.other.items()
    ]
    music_df.loc[time_sig_mask, "ts_denominator"] = [
        _to_dict_if_necessary(d)["denominator"] for _, d in time_sigs.other.items()
    ]

    music_df["ts_numerator"] = music_df.ts_numerator.ffill()
    music_df["ts_denominator"] = music_df.ts_denominator.ffill()

    music_df["ts_numerator"] = music_df.ts_numerator.fillna(
        value=default_time_signature["numerator"]
    )
    music_df["ts_denominator"] = music_df.ts_denominator.fillna(
        value=default_time_signature["denominator"]
    )
    music_df["ts_numerator"] = music_df.ts_numerator.astype(int)
    music_df["ts_denominator"] = music_df.ts_denominator.astype(int)
    return music_df

Add "ts_numerator" and "ts_denominator" columns to the dataframe.

Thus every row in the dataframe will have an explicit time signature.

If the dataframe already has "ts_numerator" and "ts_denominator" columns, it is returned unchanged.

Args

music_df
The dataframe to add the time signature columns to.
default_time_signature
The time signature to use if no time signature is present in the dataframe. If not provided, 4/4 is assumed.
def number_bars(music_df: pandas.DataFrame, initial_bar_number: int = 1) ‑> pandas.DataFrame
Expand source code
def number_bars(music_df: pd.DataFrame, initial_bar_number: int = 1) -> pd.DataFrame:
    """
    Number the bar rows consecutively in a new "bar_number" column.

    Bar rows are numbered in the order in which they appear in the dataframe. All
    other rows hold NaN in the new column.

    Args:
        music_df: The dataframe to add the bar number column to. The dataframe must have
            at least one bar (i.e., one row with type == "bar"). Modified in place.
        initial_bar_number: The number of the first bar. The convention in music is that
            the first full bar should be numbered 1. Note that this function isn't smart
            enough to distinguish pickup measures (normally numbered 0 in music
            notation).

    Returns:
        The same dataframe, with the "bar_number" column added.

    Raises:
        ValueError: if the dataframe contains no bar rows.
    """
    is_bar = music_df.type == "bar"
    n_bars = is_bar.sum()
    if n_bars == 0:
        raise ValueError("No bars found")

    music_df["bar_number"] = float("nan")
    music_df.loc[is_bar, "bar_number"] = np.arange(
        initial_bar_number, initial_bar_number + n_bars
    )
    return music_df

Add "bar_number" column specifying the number of each bar.

Args

music_df
The dataframe to add the bar number column to. The dataframe must have at least one bar (i.e., one row with type == "bar").
initial_bar_number
The number of the first bar. The convention in music is that the first full bar should be numbered 1. Note that this function isn't smart enough to distinguish pickup measures (normally numbered 0 in music notation).
def simplify_time_sigs(music_df: pandas.DataFrame,
simplify_func: Callable[[int, int], tuple[int, int]] = <function _time_signature_reduce>) ‑> pandas.DataFrame
Expand source code
def simplify_time_sigs(
    music_df: pd.DataFrame,
    simplify_func: Callable[[int, int], tuple[int, int]] = _time_signature_reduce,
) -> pd.DataFrame:
    """
    Rewrite every time signature in the dataframe through `simplify_func`.

    Args:
        music_df: The dataframe whose "time_signature" rows are rewritten. The
            "other" field of those rows is replaced in place.
        simplify_func: Called with the numerator and denominator of each time
            signature; returns the (numerator, denominator) pair to store instead.

    Returns:
        The same dataframe.
    """

    def _simplified(payload):
        time_sig = _to_dict_if_necessary(payload)
        numerator, denominator = simplify_func(
            time_sig["numerator"], time_sig["denominator"]
        )
        time_sig["numerator"] = numerator
        time_sig["denominator"] = denominator
        return time_sig

    is_time_sig = music_df.type == "time_signature"
    music_df.loc[is_time_sig, "other"] = music_df[is_time_sig].other.apply(
        _simplified
    )
    return music_df
def split_long_bars(music_df: pandas.DataFrame) ‑> pandas.DataFrame
Expand source code
def split_long_bars(music_df: pd.DataFrame) -> pd.DataFrame:
    """
    Split "long" bars (bars whose actual duration exceeds the time signature duration).

    Each long bar is cut into consecutive bars that are one time-signature duration
    apart. The original bar row is shortened to one time-signature duration, and
    the last bar that is added keeps the release of the original long bar.

    Note: the result is sorted (via sort_df) only when at least one bar was split.

    Args:
        music_df: The dataframe to process. Must already have "ts_numerator" and
            "ts_denominator" columns (see make_time_signatures_explicit).

    Returns:
        A dataframe restricted to the columns that `music_df` had on entry; the
        helper columns used for the comparison are not returned.
    """
    assert (
        "ts_numerator" in music_df.columns and "ts_denominator" in music_df.columns
    ), "call make_time_signatures_explicit(music_df) first"

    # Remember the incoming columns so the helper columns can be dropped at the end
    orig_cols = music_df.columns

    # Presumably adds the "time_sig_dur" column that is compared against below
    music_df = add_time_sig_dur(music_df)

    # Adds "bar_dur" (NaN on non-bar rows) and sets "release" on bar rows
    music_df = add_bar_durs(music_df)

    # Comparisons against NaN are False, so only bar rows can be flagged here
    long_bars = music_df["bar_dur"] > music_df["time_sig_dur"]
    if long_bars.any():
        added_bars = []
        for i, long_bar in music_df[long_bars].iterrows():
            # Release of the original long bar; the final piece must end here
            last_release = long_bar.release
            assert not isnan(last_release)
            remaining_dur = long_bar.bar_dur - long_bar.time_sig_dur
            onset = long_bar.onset
            prev_bar = long_bar

            # We need to modify the release of the long measure in place
            # (`long_bar` is the row Series yielded by iterrows, so the dataframe
            # itself is updated through .loc)
            music_df.loc[i, "release"] = onset + long_bar.time_sig_dur  # type:ignore

            new_bar = None
            while remaining_dur > 0:
                # Each added bar starts one time-signature duration after the previous
                onset += long_bar.time_sig_dur
                new_bar = long_bar.copy()
                new_bar.onset = onset
                added_bars.append(new_bar)
                # The previous piece ends where this one begins
                prev_bar.release = onset
                prev_bar = new_bar
                remaining_dur -= long_bar.time_sig_dur

            # A long bar always has remaining duration > 0, so the loop ran at least once
            assert new_bar is not None
            # The last piece takes over the release of the original long bar
            new_bar.release = last_release
        music_df = pd.concat([music_df, pd.DataFrame(added_bars)])
        music_df = sort_df(music_df)
    return music_df[orig_cols]

Split "long" bars (bars whose actual duration exceeds the time signature duration).

Note: the result is sorted before it is returned whenever at least one bar is split.