Source code for metator.figures

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Figures foor metaTOR output.

General utility functions to plot some figures to describe metaTOR output.

Core functions to plot the firgures are:
    - build_matrix_network
    - figures_bins_distribution
    - figures_bins_size_distribution
    - figure_camembert_quality
    - figures_mags_GC_boxplots
    - figures_mags_HiC_cov_boxplots
    - figures_mags_SG_cov_boxplots
    - generates_frags_network
    - network_heatmap
    - plot_figures
    - reindex_df
"""


import hicstuff.hicstuff as hcs
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from metator.log import logger
from os.path import join


[docs]def build_matrix_network(pairs_files, info_contigs, N, bin_size):
    """Build a binned matrix from the pairs alignement files and the final
    binning information in the contig data file.

    Parameters:
    -----------
    pairs_files : list of str
        List of path of the alignment ".pairs" file.
    info_contigs: dict
        Dictionnary with frags id of the contigs.
    N : int
        Size of the final bin matrix.
    bin_size : int
        Size of the bin to plot the heatmap.

    Returns:
    --------
    numpy.array:
        Binned dense matrix of the network.
    """
    # Initiate the matix
    matrix = np.zeros((N, N))

    # Iterates on the input pairs file
    for pairs_file in pairs_files:
        with open(pairs_file, "r") as input_pairs:
            for pairs_line in input_pairs:
                # Ignore header lines.
                if pairs_line.startswith("#"):
                    continue
                # Split the line on the tabulation and check if both contigs
                # are in the bin.
                pairs = pairs_line.split("\t")
                contig1, contig2 = pairs[1], pairs[3]
                pos1, pos2 = pairs[2], pairs[4]
                try:
                    frag1 = (
                        info_contigs[contig1]["start"]
                        + (int(pos1) + info_contigs[contig1]["init"])
                        // bin_size
                    )
                    frag2 = (
                        info_contigs[contig2]["start"]
                        + (int(pos2) + info_contigs[contig2]["init"])
                        // bin_size
                    )
                    if frag1 < frag2:
                        matrix[frag1, frag2] += 1
                    elif frag1 > frag2:
                        matrix[frag2, frag1] += 1
                except KeyError:
                    continue
    matrix_norm = hcs.normalize_dense(
        matrix + matrix.T, norm="SCN", iterations=100
    )
    return matrix_norm


[docs]def figures_bins_distribution(bin_summary, out_file):
    """Function to plot the distribution of the estimated completion and
    contamination of the final bins generated by metaTOR. The bins are ordered
    by decreasing completion and contamination values superior to 105% are set
    to 105%.

    Parameters:
    -----------
    bin_summary : pandas.core.frame.DataFrame
        Table with the informations about the final bins.
    out_file : str
        Path where to save the figure.
    """
    # Transform values as float
    bin_summary["Weighted completeness"] = bin_summary[
        "Weighted completeness"
    ].apply(float)
    bin_summary["Weighted redundancy"] = bin_summary[
        "Weighted redundancy"
    ].apply(float)
    # Sort the values by decreasing completion.
    bin_summary = bin_summary.sort_values(
        by="Weighted completeness", ascending=False
    )
    # Put the contamination values bigger than 105 to 105.
    mask = bin_summary["Weighted redundancy"] >= 105
    bin_summary.loc[mask, "Weighted redundancy"] = 105
    # Plot the contamination and the completion.
    _fig, ax = plt.subplots()
    ax.tick_params(axis="both", which="both", length=0)
    plt.box(on=None)
    plt.scatter(
        y=bin_summary["Weighted completeness"] * 100,
        x=range(len(bin_summary)),
        color="r",
        label="Completion",
        s=2.5,
    )
    plt.scatter(
        y=bin_summary["Weighted redundancy"] * 100 - 100,
        x=range(len(bin_summary)),
        color="k",
        label="Contamination",
        s=2.5,
    )
    plt.axhline(y=5, color="k", linestyle=(0, (5, 10)))
    plt.axhline(y=10, color="k", linestyle=(0, (5, 10)))
    plt.axhline(y=50, color="k", linestyle=(0, (5, 10)))
    plt.axhline(y=70, color="k", linestyle=(0, (5, 10)))
    plt.axhline(y=90, color="k", linestyle=(0, (5, 10)))
    plt.axhline(y=100, color="k", linestyle=(0, (5, 10)))
    plt.yticks([5, 10, 50, 70, 90, 100])
    plt.xticks(np.arange(0, len(bin_summary), 50))
    plt.xlabel("Rank index")
    plt.ylabel("Completness/Contamination (%)")
    plt.legend(
        bbox_to_anchor=(0.0, -0.25, 1.0, 1.0),
        loc="lower center",
        ncol=2,
        borderaxespad=0.0,
    )
    # Save the file
    plt.savefig(out_file, dpi=200, bbox_inches="tight")
    plt.close()


[docs]def figures_bins_size_distribution(
    bin_summary, total_size, threshold, out_file
):
    """Function to plot a camembert of the fraction of size corresponding
    to each quality of bins. We defined 6 categories:
        - High quality MAGs (HQ MAGs): >= 90% completion ; < 5% contamination
        - Medium quality MAGs (MQ MAGs): >= 70% completion ; < 10% contamination
        - Low quality MAGs (LQ MAGs): >= 50% completion ; < 10% contamination
        - Contaminated bins: >= 50% completion ; >= 10% contamination
        - Bins superior to size threshold but with less 50% completion.
        - Unbinned contigs

    Parameters:
    -----------
    bin_summary : pandas.core.frame.DataFrame
        Table with the informations about the final bins.
    total_size : int
        Size of the whole assembly.
    threshold : int
        Minimun size of the bins considered.
    out_file : str
        Path where to save the figure.
    """
    # Add a qualitive quality column in bin_summary with the quality of the MAGs
    bin_summary["MAG_quality"] = "ND"
    # Build a small table with the sum for each quality category.
    mags_summary = pd.DataFrame(
        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [None, 0]],
        columns=["bins", "size"],
    )
    for i in range(len(bin_summary)):
        completness = float(bin_summary.loc[i, "Weighted completeness"])
        contamination = float(bin_summary.loc[i, "Weighted redundancy"])
        size = int(bin_summary.loc[i, "Length"])
        if completness >= 0.5:
            if contamination > 1.1:
                mags_summary.loc[3, "bins"] += 1
                mags_summary.loc[3, "size"] += size
                bin_summary.loc[i, "MAG_quality"] = "Contaminated"
            else:
                if completness >= 0.9 and contamination <= 1.05:
                    mags_summary.loc[0, "bins"] += 1
                    mags_summary.loc[0, "size"] += size
                    bin_summary.loc[i, "MAG_quality"] = "HQ"
                elif completness >= 0.7:
                    mags_summary.loc[1, "bins"] += 1
                    mags_summary.loc[1, "size"] += size
                    bin_summary.loc[i, "MAG_quality"] = "MQ"
                else:
                    mags_summary.loc[2, "bins"] += 1
                    mags_summary.loc[2, "size"] += size
                    bin_summary.loc[i, "MAG_quality"] = "LQ"
        else:
            mags_summary.loc[4, "bins"] += 1
            mags_summary.loc[4, "size"] += size
            bin_summary.loc[i, "MAG_quality"] = "Other"
    mags_summary.loc[5, "size"] = total_size - sum(mags_summary["size"])
    # Plot the camembert of the size ratio.
    labels = [
        "HQ MAGs: {0} - {1}Mb".format(
            int(mags_summary.loc[0, "bins"]),
            round(mags_summary.loc[0, "size"] / 1000000, 2),
        ),
        "MQ MAGs: {0} - {1}Mb".format(
            int(mags_summary.loc[1, "bins"]),
            round(mags_summary.loc[1, "size"] / 1000000, 2),
        ),
        "LQ MAGs: {0} - {1}Mb".format(
            int(mags_summary.loc[2, "bins"]),
            round(mags_summary.loc[2, "size"] / 1000000, 2),
        ),
        "Contaminated bins: {0} - {1}Mb".format(
            int(mags_summary.loc[3, "bins"]),
            round(mags_summary.loc[3, "size"] / 1000000, 2),
        ),
        "Others bins: {0} - {1}Mb".format(
            int(mags_summary.loc[4, "bins"]),
            round(mags_summary.loc[4, "size"] / 1000000, 2),
        ),
        "Others contigs - {0}Mb".format(
            round(mags_summary.loc[5, "size"] / 1000000, 2),
        ),
    ]
    plt.pie(
        mags_summary["size"],
        colors=[
            "#103b6f",
            "#6666ff",
            "#ccccff",
            "#ed3139",
            "#c0bcbc",
            "#eaeded",
        ],
    )
    plt.legend(labels, bbox_to_anchor=(0.9, 0.0, 1.0, 1.0), loc="upper right")
    plt.text(
        -1.5,
        -1.2,
        "Total size of the assembly: {0}Mb".format(
            round(total_size / 1000000, 2)
        ),
        fontdict=None,
    )
    plt.text(
        -1.5,
        -1.35,
        "Percentage of MAGs: {0}%".format(
            round(sum(mags_summary["size"][0:3]) / total_size * 100, 2)
        ),
        fontdict=None,
    )
    plt.text(
        -1.5,
        -1.5,
        "Bins threshold: {0}kb".format(
            round(threshold / 1000, 2),
        ),
        fontdict=None,
    )
    plt.title("Size proportion of bins depending on their quality")
    # Save the file
    plt.savefig(out_file, dpi=200, bbox_inches="tight")
    plt.close()
    return bin_summary


[docs]def figure_camembert_quality(
    out_file,
    prefix,
    n_religated,
    n_loops,
    n_weirds,
    n_informative_intra,
    n_informative_inter,
    n_intra_mags,
    n_inter_mags,
    uncut_thr,
    loop_thr,
):
    """Functon to plot nice hicstuff camembert from metator high quality
    contigs.
    """
    plt.figure(2, figsize=(6, 6))
    # The slices will be ordered and plotted counter-clockwise.
    total = n_inter_mags + n_intra_mags
    fracs = [
        n_religated,
        n_loops,
        n_weirds,
        n_informative_intra,
        n_informative_inter,
        n_inter_mags,
    ]
    # Format labels to include event names and proportion
    labels = list(
        map(
            lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total),
            [
                ("Religated", n_religated),
                ("Loops", n_loops),
                ("Weirds", n_weirds),
                ("Informative intracontigs", n_informative_intra),
                ("Informative intercontigs", n_informative_inter),
                ("Noise inter", n_inter_mags),
            ],
        )
    )
    colors = ["#D55E00", "#E69F00", "#999999", "#103b6f", "#6096fd", "#cc0000"]
    patches, _ = plt.pie(fracs, colors=colors, startangle=90)
    plt.legend(
        patches,
        labels,
        loc="upper left",
        bbox_to_anchor=(-0.1, 1.0),
    )
    if prefix:
        plt.title(
            "Distribution of library events in {}".format(prefix),
            bbox={"facecolor": "1.0", "pad": 5},
        )
    plt.text(
        0.3,
        1.15,
        "Threshold Uncuts = " + str(uncut_thr),
        fontdict=None,
    )
    plt.text(
        0.3,
        1.05,
        "Threshold Loops = " + str(loop_thr),
        fontdict=None,
    )
    plt.text(
        -1.5,
        -1.2,
        f"Total number of reads in the estimation: {total / 1_000_000:.2f} millions reads",
        fontdict=None,
    )
    noise = 100 * n_inter_mags / (n_intra_mags + n_inter_mags)
    plt.text(
        -1.5,
        -1.3,
        f"Estimated noise signal = {noise:.2f}%",
        fontdict=None,
    )
    informative = (
        100
        * (n_informative_intra + n_informative_inter)
        / (n_intra_mags + n_inter_mags)
    )
    plt.text(
        -1.5,
        -1.4,
        f"Informative reads = {informative:.2f}%",
        fontdict=None,
    )
    plt.savefig(out_file)
    plt.close()


[docs]def figures_mags_GC_boxplots(contigs_data, out_file):
    """Function to plot barplots GC content of the contigs for each bins. The
    count are weighted by the size of the contigs.

    Parameters:
    -----------
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs, the ubinned contigs should
        have removed and a column size_weight should have been add to weight
        with the size.
    out_file : str
        Path where to write the figure.
    """
    # Sort by decreasing GC.
    contigs_data = contigs_data.sort_values(by="GC-content", ascending=False)
    # Build the palette
    my_pal = {
        "HQ": "#103b6f",
        "MQ": "#6666ff",
        "LQ": "#ccccff",
        "Contaminated": "#ed3139",
        "Other": "#c0bcbc",
    }
    # Plot the figure.
    _fig, ax = plt.subplots(figsize=(20, 10))
    boxplot = sns.boxplot(
        y="GC_content",
        x="Final_bin",
        data=reindex_df(contigs_data, "size_weight"),
        palette=my_pal,
        hue="MAG_quality",
        hue_order=["HQ", "MQ", "LQ", "Contaminated", "Other"],
        flierprops=dict(
            markerfacecolor="0.5",
            markersize=1.0,
            marker="o",
            markeredgewidth=0.0,
        ),
        linewidth=1.25,
        width=1.0,
        dodge=False,
    )
    ax.set(xlabel="", ylabel="GC content distribution", xticklabels=[])
    labels = [
        "HQ MAGs (Completion > 90, Contamination < 5)",
        "MQ MAGs (Completion > 70, Contamination < 10)",
        "LQ MAGs (Completion > 50, Contamination < 10)",
        "Contaminated (Completion > 50, Contamination < 10)",
        "Other (Completion < 50)",
    ]
    handles, _ = boxplot.get_legend_handles_labels()
    boxplot.legend(handles, labels)
    # Save the file.
    plt.savefig(out_file, dpi=200, bbox_inches="tight")
    plt.close()


[docs]def figures_mags_HiC_cov_boxplots(contigs_data, out_file):
    """Function to plot barplots HiC_coverage coverage of the contigs for each
    bins. The count are weighted by the size of the contigs.

    Parameters:
    -----------
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs, the ubinned contigs should
        have removed and a column size_weight should have been add to weight
        with the size.
    out_file : str
        Path where to write the figure.
    """
    # Compute the HiC coverage.
    contigs_data["HiC_cov"] = 100 * contigs_data["Hit"] / contigs_data["Size"]
    # Sort by decreasing HiC coverage.
    contigs_data = contigs_data.sort_values(by="HiC_cov", ascending=False)
    # Build the palette
    my_pal = {
        "HQ": "#103b6f",
        "MQ": "#6666ff",
        "LQ": "#ccccff",
        "Contaminated": "#ed3139",
        "Other": "#c0bcbc",
    }
    # Plot the figure.
    _fig, ax = plt.subplots(figsize=(20, 10))
    boxplot = sns.boxplot(
        y="HiC_cov",
        x="Final_bin",
        data=reindex_df(contigs_data, "size_weight"),
        palette=my_pal,
        hue="MAG_quality",
        hue_order=["HQ", "MQ", "LQ", "Contaminated", "Other"],
        flierprops=dict(
            markerfacecolor="0.5",
            markersize=1.0,
            marker="o",
            markeredgewidth=0.0,
        ),
        linewidth=1.25,
        width=1.0,
        dodge=False,
    )
    plt.yscale("log")
    ax.set(xlabel="", ylabel="HiC coverage distribution", xticklabels=[])
    labels = [
        "HQ MAGs (Completion > 90, Contamination < 5)",
        "MQ MAGs (Completion > 70, Contamination < 10)",
        "LQ MAGs (Completion > 50, Contamination < 10)",
        "Contaminated (Completion > 50, Contamination < 10)",
        "Other (Completion < 50)",
    ]
    handles, _ = boxplot.get_legend_handles_labels()
    boxplot.legend(handles, labels)
    # Save the file.
    plt.savefig(out_file, dpi=200, bbox_inches="tight")
    plt.close()


[docs]def figures_mags_SG_cov_boxplots(contigs_data, out_file):
    """Function to plot barplots assembly coverage of the contigs for each bins.
    The count are weighted by the size of the contigs.

    Parameters:
    -----------
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs, the ubinned contigs should
        have removed and a column size_weight should have been add to weight
        with the size.
    out_file : str
        Path where to write the figure.
    """
    # Sort by decreasing Shotgun coverage.
    contigs_data = contigs_data.sort_values(by="SG_Coverage", ascending=False)
    # Build the palette
    my_pal = {
        "HQ": "#103b6f",
        "MQ": "#6666ff",
        "LQ": "#ccccff",
        "Contaminated": "#ed3139",
        "Other": "#c0bcbc",
    }
    # Plot the figure.
    _fig, ax = plt.subplots(figsize=(20, 10))
    boxplot = sns.boxplot(
        y="Shotgun_coverage",
        x="Final_bin",
        data=reindex_df(contigs_data, "size_weight"),
        palette=my_pal,
        hue="MAG_quality",
        hue_order=["HQ", "MQ", "LQ", "Contaminated", "Other"],
        flierprops=dict(
            markerfacecolor="0.5",
            markersize=1.0,
            marker="o",
            markeredgewidth=0.0,
        ),
        linewidth=1.25,
        width=1.0,
        dodge=False,
    )
    plt.yscale("log")
    ax.set(xlabel="", ylabel="Assembly coverage distribution", xticklabels=[])
    labels = [
        "HQ MAGs (Completion > 90, Contamination < 5)",
        "MQ MAGs (Completion > 70, Contamination < 10)",
        "LQ MAGs (Completion > 50, Contamination < 10)",
        "Contaminated (Completion > 50, Contamination < 10)",
        "Other (Completion < 50)",
    ]
    handles, _ = boxplot.get_legend_handles_labels()
    boxplot.legend(handles, labels)
    # Save the file.
    plt.savefig(out_file, dpi=200, bbox_inches="tight")
    plt.close()


[docs]def generates_frags_network(contig_data, bin_summary, bin_size):
    """Generates info fragments for all contigs and start position all bins.

    Parameters:
    -----------
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs.
    bin_summary : dict
        Dictionnary with the informations of the final bins kept by MetaTOR.
    bin_size : int
        Size of the bin to plot the heatmap.

    Returns:
    --------
    dict:
        Dictionnary with frags id of the contigs.
    list of int:
        List of the start positions of the mags
    int:
        Size of the final binned matrix
    """

    # Define start frag id for each MAGs. The rest column is the size still
    # available from the previous contigs.
    frag_id = 0
    mag_starts = []
    for bin_name in bin_summary:
        bin_summary[bin_name]["start"] = frag_id
        mag_starts.append(frag_id)
        bin_summary[bin_name]["current"] = frag_id
        bin_summary[bin_name]["rest"] = 0
        frag_id += (int(bin_summary[bin_name]["Length"]) // bin_size) + 1

    # Define start frag id for each bins
    info_contigs = dict()
    for i in contig_data.index:
        # Only add the contig if it's in a final bin.
        if contig_data.loc[i, "Final_bin"] != "ND":
            # Call bin name dict to have the status of start frag id and rest.
            bin_name = contig_data.loc[i, "Final_bin"]
            size = contig_data.loc[i, "Size"]
            info_contigs[contig_data.loc[i, "Name"]] = {
                "start": bin_summary[bin_name]["current"],
                "init": bin_summary[bin_name]["rest"],
            }
            # Update rest and start position.
            bin_summary[bin_name]["current"] += (
                size + bin_summary[bin_name]["rest"]
            ) // bin_size
            bin_summary[bin_name]["rest"] = (
                size + bin_summary[bin_name]["rest"]
            ) % bin_size

    return info_contigs, mag_starts, frag_id


[docs]def network_heatmap(matrix, out_file=None, mag_starts=None):
    """Plot the heatmap of the network, with the contig ordering by their bin
    attribution.

    Parameters:
    -----------
    matrix : numpy.array
        Binned dense matrix of the network.
    out_file : str
        Name of the file to save the plot.
    mag_starts : list of int
        List of bin positions where to draw mags starts as dotted lines.
    """

    # Plot the heatmap.
    vmax = np.percentile(matrix, 99.5)
    im_kwargs = {
        "vmin": 0,
        "vmax": vmax,
        "cmap": "viridis",
        "interpolation": "none",
    }
    li_kwargs = {"ls": ":", "alpha": 0.8, "c": "white", "linewidth": 0.1}
    plt.figure()
    plt.imshow(matrix, **im_kwargs)
    plt.colorbar()
    plt.axis("off")

    # Add line between MAGs.
    if mag_starts is not None:
        for pos in mag_starts:
            plt.axvline(pos, **li_kwargs)
            plt.axhline(pos, **li_kwargs)

    # Save or show the file.
    if out_file:
        plt.savefig(out_file, bbox_inches="tight", pad_inches=0.0, dpi=2000)
        plt.close()
    else:
        plt.show()


[docs]def plot_figures(out_dir, contigs_data, bin_summary, threshold):
    """Function to generates all figures.

    Parameters:
    -----------
    out_dir : str
        Path of the output directory of MetaTOR.
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs.
    bin_summary : dict
        Dictionnary with the informations of the final bins kept by MetaTOR.
    threshold : int
        Minimun size of the bins considered.
    """
    # Create plot directory
    plot_dir = join(out_dir, "plot")
    os.makedirs(plot_dir, exist_ok=True)

    # Test if Shotgun coverage have been given
    if contigs_data.loc[contigs_data.index[0], "Shotgun_coverage"] == "-":
        SG_cov = False
    else:
        SG_cov = True

    # Create outfiles
    outfile_bins_distribution = join(plot_dir, "bins_distribution.png")
    outfile_bins_size_distribution = join(
        plot_dir, "bins_size_distribution.png"
    )
    outfile_MAGs_GC = join(plot_dir, "MAGs_GC_distribution.png")
    outfile_MAGs_HiC_cov = join(plot_dir, "MAGs_HiC_cov_distribution.png")
    if SG_cov:
        outfile_MAGs_SG_cov = join(plot_dir, "MAGs_SG_cov_distribution.png")

    # Look for pairs file in out_dir
    pairs_files = []
    list_files = os.listdir(out_dir)
    for file in list_files:
        if re.search("\.pairs$", file):
            pairs_files.append(join(out_dir, file))

    # Plot heatmap with a binning of 50kb.
    if len(pairs_files) > 0:
        heatmap_file = join(plot_dir, "network_heatmap.png")
        bin_size = 50000
        info_contigs, mag_starts, N = generates_frags_network(
            contigs_data, bin_summary, bin_size
        )
        matrix = build_matrix_network(pairs_files, info_contigs, N, bin_size)
        network_heatmap(matrix, heatmap_file, mag_starts)
    else:
        logger.warning(
            "No pairs alignment files found in %s, no heatmap will be generated.",
            out_dir,
        )

    # Transform dictionnary to pandas DataFrame.
    bin_summary = pd.DataFrame.from_dict(bin_summary, orient="index")
    bin_summary["Bin"] = bin_summary.index
    bin_summary.index = range(len(bin_summary))
    # Compute size of the assembly.
    total_size = sum(contigs_data["Size"])

    # Plot bin distribution
    figures_bins_distribution(bin_summary, outfile_bins_distribution)
    bin_summary = figures_bins_size_distribution(
        bin_summary, total_size, threshold, outfile_bins_size_distribution
    )

    # Remove unbinned contigs.
    contigs_data = contigs_data[contigs_data["Final_bin"] != "ND"]
    # Create a column of size divided by the minimum size of the contigs.
    min_size = min(contigs_data["Size"])
    contigs_data["size_weight"] = (contigs_data["Size"] / min_size).apply(int)
    # Merge the bin_summary and the contigs data file.
    contigs_data = pd.merge(
        contigs_data, bin_summary, left_on="Final_bin", right_on="Bin"
    )

    # Plots the distribution of GC and coverage inside the bins.
    figures_mags_GC_boxplots(contigs_data, outfile_MAGs_GC)
    figures_mags_HiC_cov_boxplots(contigs_data, outfile_MAGs_HiC_cov)
    if SG_cov:
        figures_mags_SG_cov_boxplots(contigs_data, outfile_MAGs_SG_cov)


[docs]def reindex_df(dataframe, weight_col):
    """Expand the dataframe to prepare for resampling result is 1 row per count
    per sample.

    Parameters:
    -----------
    dataframe : pandas.core.frame.DataFrame
        Table to expand.
    weight_col : str
        Name of the weighted column to use.

    Returns:
    --------
    pandas.core.frame.DataFrame:
        Table expanded with new index.
    """
    dataframe = dataframe.reindex(dataframe.index.repeat(dataframe[weight_col]))
    dataframe.reset_index(drop=True, inplace=True)
    return dataframe