Source code for metator.figures

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Figures foor metaTOR output.

General utility functions to plot some figures to describe metaTOR output.

Core functions to plot the firgures are:
    - build_matrix_network
    - figures_bins_distribution
    - figures_bins_size_distribution
    - figure_camembert_quality
    - figures_mags_GC_boxplots
    - figures_mags_HiC_cov_boxplots
    - figures_mags_SG_cov_boxplots
    - generates_frags_network
    - network_heatmap
    - plot_figures
    - reindex_df
"""


import hicstuff.hicstuff as hcs
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from metator.log import logger
from os.path import join


[docs]def build_matrix_network(pairs_files, info_contigs, N, bin_size): """Build a binned matrix from the pairs alignement files and the final binning information in the contig data file. Parameters: ----------- pairs_files : list of str List of path of the alignment ".pairs" file. info_contigs: dict Dictionnary with frags id of the contigs. N : int Size of the final bin matrix. bin_size : int Size of the bin to plot the heatmap. Returns: -------- numpy.array: Binned dense matrix of the network. """ # Initiate the matix matrix = np.zeros((N, N)) # Iterates on the input pairs file for pairs_file in pairs_files: with open(pairs_file, "r") as input_pairs: for pairs_line in input_pairs: # Ignore header lines. if pairs_line.startswith("#"): continue # Split the line on the tabulation and check if both contigs # are in the bin. pairs = pairs_line.split("\t") contig1, contig2 = pairs[1], pairs[3] pos1, pos2 = pairs[2], pairs[4] try: frag1 = ( info_contigs[contig1]["start"] + (int(pos1) + info_contigs[contig1]["init"]) // bin_size ) frag2 = ( info_contigs[contig2]["start"] + (int(pos2) + info_contigs[contig2]["init"]) // bin_size ) if frag1 < frag2: matrix[frag1, frag2] += 1 elif frag1 > frag2: matrix[frag2, frag1] += 1 except KeyError: continue matrix_norm = hcs.normalize_dense( matrix + matrix.T, norm="SCN", iterations=100 ) return matrix_norm
[docs]def figures_bins_distribution(bin_summary, out_file): """Function to plot the distribution of the estimated completion and contamination of the final bins generated by metaTOR. The bins are ordered by decreasing completion and contamination values superior to 105% are set to 105%. Parameters: ----------- bin_summary : pandas.core.frame.DataFrame Table with the informations about the final bins. out_file : str Path where to save the figure. """ # Transform values as float bin_summary["Weighted completeness"] = bin_summary[ "Weighted completeness" ].apply(float) bin_summary["Weighted redundancy"] = bin_summary[ "Weighted redundancy" ].apply(float) # Sort the values by decreasing completion. bin_summary = bin_summary.sort_values( by="Weighted completeness", ascending=False ) # Put the contamination values bigger than 105 to 105. mask = bin_summary["Weighted redundancy"] >= 105 bin_summary.loc[mask, "Weighted redundancy"] = 105 # Plot the contamination and the completion. _fig, ax = plt.subplots() ax.tick_params(axis="both", which="both", length=0) plt.box(on=None) plt.scatter( y=bin_summary["Weighted completeness"] * 100, x=range(len(bin_summary)), color="r", label="Completion", s=2.5, ) plt.scatter( y=bin_summary["Weighted redundancy"] * 100 - 100, x=range(len(bin_summary)), color="k", label="Contamination", s=2.5, ) plt.axhline(y=5, color="k", linestyle=(0, (5, 10))) plt.axhline(y=10, color="k", linestyle=(0, (5, 10))) plt.axhline(y=50, color="k", linestyle=(0, (5, 10))) plt.axhline(y=70, color="k", linestyle=(0, (5, 10))) plt.axhline(y=90, color="k", linestyle=(0, (5, 10))) plt.axhline(y=100, color="k", linestyle=(0, (5, 10))) plt.yticks([5, 10, 50, 70, 90, 100]) plt.xticks(np.arange(0, len(bin_summary), 50)) plt.xlabel("Rank index") plt.ylabel("Completness/Contamination (%)") plt.legend( bbox_to_anchor=(0.0, -0.25, 1.0, 1.0), loc="lower center", ncol=2, borderaxespad=0.0, ) # Save the file plt.savefig(out_file, dpi=200, bbox_inches="tight") plt.close()
[docs]def figures_bins_size_distribution( bin_summary, total_size, threshold, out_file ): """Function to plot a camembert of the fraction of size corresponding to each quality of bins. We defined 6 categories: - High quality MAGs (HQ MAGs): >= 90% completion ; < 5% contamination - Medium quality MAGs (MQ MAGs): >= 70% completion ; < 10% contamination - Low quality MAGs (LQ MAGs): >= 50% completion ; < 10% contamination - Contaminated bins: >= 50% completion ; >= 10% contamination - Bins superior to size threshold but with less 50% completion. - Unbinned contigs Parameters: ----------- bin_summary : pandas.core.frame.DataFrame Table with the informations about the final bins. total_size : int Size of the whole assembly. threshold : int Minimun size of the bins considered. out_file : str Path where to save the figure. """ # Add a qualitive quality column in bin_summary with the quality of the MAGs bin_summary["MAG_quality"] = "ND" # Build a small table with the sum for each quality category. mags_summary = pd.DataFrame( [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [None, 0]], columns=["bins", "size"], ) for i in range(len(bin_summary)): completness = float(bin_summary.loc[i, "Weighted completeness"]) contamination = float(bin_summary.loc[i, "Weighted redundancy"]) size = int(bin_summary.loc[i, "Length"]) if completness >= 0.5: if contamination > 1.1: mags_summary.loc[3, "bins"] += 1 mags_summary.loc[3, "size"] += size bin_summary.loc[i, "MAG_quality"] = "Contaminated" else: if completness >= 0.9 and contamination <= 1.05: mags_summary.loc[0, "bins"] += 1 mags_summary.loc[0, "size"] += size bin_summary.loc[i, "MAG_quality"] = "HQ" elif completness >= 0.7: mags_summary.loc[1, "bins"] += 1 mags_summary.loc[1, "size"] += size bin_summary.loc[i, "MAG_quality"] = "MQ" else: mags_summary.loc[2, "bins"] += 1 mags_summary.loc[2, "size"] += size bin_summary.loc[i, "MAG_quality"] = "LQ" else: mags_summary.loc[4, "bins"] += 1 mags_summary.loc[4, "size"] += size bin_summary.loc[i, "MAG_quality"] = "Other" mags_summary.loc[5, "size"] = total_size - sum(mags_summary["size"]) # Plot the camembert of the size ratio. labels = [ "HQ MAGs: {0} - {1}Mb".format( int(mags_summary.loc[0, "bins"]), round(mags_summary.loc[0, "size"] / 1000000, 2), ), "MQ MAGs: {0} - {1}Mb".format( int(mags_summary.loc[1, "bins"]), round(mags_summary.loc[1, "size"] / 1000000, 2), ), "LQ MAGs: {0} - {1}Mb".format( int(mags_summary.loc[2, "bins"]), round(mags_summary.loc[2, "size"] / 1000000, 2), ), "Contaminated bins: {0} - {1}Mb".format( int(mags_summary.loc[3, "bins"]), round(mags_summary.loc[3, "size"] / 1000000, 2), ), "Others bins: {0} - {1}Mb".format( int(mags_summary.loc[4, "bins"]), round(mags_summary.loc[4, "size"] / 1000000, 2), ), "Others contigs - {0}Mb".format( round(mags_summary.loc[5, "size"] / 1000000, 2), ), ] plt.pie( mags_summary["size"], colors=[ "#103b6f", "#6666ff", "#ccccff", "#ed3139", "#c0bcbc", "#eaeded", ], ) plt.legend(labels, bbox_to_anchor=(0.9, 0.0, 1.0, 1.0), loc="upper right") plt.text( -1.5, -1.2, "Total size of the assembly: {0}Mb".format( round(total_size / 1000000, 2) ), fontdict=None, ) plt.text( -1.5, -1.35, "Percentage of MAGs: {0}%".format( round(sum(mags_summary["size"][0:3]) / total_size * 100, 2) ), fontdict=None, ) plt.text( -1.5, -1.5, "Bins threshold: {0}kb".format( round(threshold / 1000, 2), ), fontdict=None, ) plt.title("Size proportion of bins depending on their quality") # Save the file plt.savefig(out_file, dpi=200, bbox_inches="tight") plt.close() return bin_summary
[docs]def figure_camembert_quality( out_file, prefix, n_religated, n_loops, n_weirds, n_informative_intra, n_informative_inter, n_intra_mags, n_inter_mags, uncut_thr, loop_thr, ): """Functon to plot nice hicstuff camembert from metator high quality contigs. """ plt.figure(2, figsize=(6, 6)) # The slices will be ordered and plotted counter-clockwise. total = n_inter_mags + n_intra_mags fracs = [ n_religated, n_loops, n_weirds, n_informative_intra, n_informative_inter, n_inter_mags, ] # Format labels to include event names and proportion labels = list( map( lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total), [ ("Religated", n_religated), ("Loops", n_loops), ("Weirds", n_weirds), ("Informative intracontigs", n_informative_intra), ("Informative intercontigs", n_informative_inter), ("Noise inter", n_inter_mags), ], ) ) colors = ["#D55E00", "#E69F00", "#999999", "#103b6f", "#6096fd", "#cc0000"] patches, _ = plt.pie(fracs, colors=colors, startangle=90) plt.legend( patches, labels, loc="upper left", bbox_to_anchor=(-0.1, 1.0), ) if prefix: plt.title( "Distribution of library events in {}".format(prefix), bbox={"facecolor": "1.0", "pad": 5}, ) plt.text( 0.3, 1.15, "Threshold Uncuts = " + str(uncut_thr), fontdict=None, ) plt.text( 0.3, 1.05, "Threshold Loops = " + str(loop_thr), fontdict=None, ) plt.text( -1.5, -1.2, f"Total number of reads in the estimation: {total / 1_000_000:.2f} millions reads", fontdict=None, ) noise = 100 * n_inter_mags / (n_intra_mags + n_inter_mags) plt.text( -1.5, -1.3, f"Estimated noise signal = {noise:.2f}%", fontdict=None, ) informative = ( 100 * (n_informative_intra + n_informative_inter) / (n_intra_mags + n_inter_mags) ) plt.text( -1.5, -1.4, f"Informative reads = {informative:.2f}%", fontdict=None, ) plt.savefig(out_file) plt.close()
[docs]def figures_mags_GC_boxplots(contigs_data, out_file): """Function to plot barplots GC content of the contigs for each bins. The count are weighted by the size of the contigs. Parameters: ----------- contigs_data : pandas.DataFrame Table with all the data from the contigs, the ubinned contigs should have removed and a column size_weight should have been add to weight with the size. out_file : str Path where to write the figure. """ # Sort by decreasing GC. contigs_data = contigs_data.sort_values(by="GC-content", ascending=False) # Build the palette my_pal = { "HQ": "#103b6f", "MQ": "#6666ff", "LQ": "#ccccff", "Contaminated": "#ed3139", "Other": "#c0bcbc", } # Plot the figure. _fig, ax = plt.subplots(figsize=(20, 10)) boxplot = sns.boxplot( y="GC_content", x="Final_bin", data=reindex_df(contigs_data, "size_weight"), palette=my_pal, hue="MAG_quality", hue_order=["HQ", "MQ", "LQ", "Contaminated", "Other"], flierprops=dict( markerfacecolor="0.5", markersize=1.0, marker="o", markeredgewidth=0.0, ), linewidth=1.25, width=1.0, dodge=False, ) ax.set(xlabel="", ylabel="GC content distribution", xticklabels=[]) labels = [ "HQ MAGs (Completion > 90, Contamination < 5)", "MQ MAGs (Completion > 70, Contamination < 10)", "LQ MAGs (Completion > 50, Contamination < 10)", "Contaminated (Completion > 50, Contamination < 10)", "Other (Completion < 50)", ] handles, _ = boxplot.get_legend_handles_labels() boxplot.legend(handles, labels) # Save the file. plt.savefig(out_file, dpi=200, bbox_inches="tight") plt.close()
[docs]def figures_mags_HiC_cov_boxplots(contigs_data, out_file): """Function to plot barplots HiC_coverage coverage of the contigs for each bins. The count are weighted by the size of the contigs. Parameters: ----------- contigs_data : pandas.DataFrame Table with all the data from the contigs, the ubinned contigs should have removed and a column size_weight should have been add to weight with the size. out_file : str Path where to write the figure. """ # Compute the HiC coverage. contigs_data["HiC_cov"] = 100 * contigs_data["Hit"] / contigs_data["Size"] # Sort by decreasing HiC coverage. contigs_data = contigs_data.sort_values(by="HiC_cov", ascending=False) # Build the palette my_pal = { "HQ": "#103b6f", "MQ": "#6666ff", "LQ": "#ccccff", "Contaminated": "#ed3139", "Other": "#c0bcbc", } # Plot the figure. _fig, ax = plt.subplots(figsize=(20, 10)) boxplot = sns.boxplot( y="HiC_cov", x="Final_bin", data=reindex_df(contigs_data, "size_weight"), palette=my_pal, hue="MAG_quality", hue_order=["HQ", "MQ", "LQ", "Contaminated", "Other"], flierprops=dict( markerfacecolor="0.5", markersize=1.0, marker="o", markeredgewidth=0.0, ), linewidth=1.25, width=1.0, dodge=False, ) plt.yscale("log") ax.set(xlabel="", ylabel="HiC coverage distribution", xticklabels=[]) labels = [ "HQ MAGs (Completion > 90, Contamination < 5)", "MQ MAGs (Completion > 70, Contamination < 10)", "LQ MAGs (Completion > 50, Contamination < 10)", "Contaminated (Completion > 50, Contamination < 10)", "Other (Completion < 50)", ] handles, _ = boxplot.get_legend_handles_labels() boxplot.legend(handles, labels) # Save the file. plt.savefig(out_file, dpi=200, bbox_inches="tight") plt.close()
[docs]def figures_mags_SG_cov_boxplots(contigs_data, out_file): """Function to plot barplots assembly coverage of the contigs for each bins. The count are weighted by the size of the contigs. Parameters: ----------- contigs_data : pandas.DataFrame Table with all the data from the contigs, the ubinned contigs should have removed and a column size_weight should have been add to weight with the size. out_file : str Path where to write the figure. """ # Sort by decreasing Shotgun coverage. contigs_data = contigs_data.sort_values(by="SG_Coverage", ascending=False) # Build the palette my_pal = { "HQ": "#103b6f", "MQ": "#6666ff", "LQ": "#ccccff", "Contaminated": "#ed3139", "Other": "#c0bcbc", } # Plot the figure. _fig, ax = plt.subplots(figsize=(20, 10)) boxplot = sns.boxplot( y="Shotgun_coverage", x="Final_bin", data=reindex_df(contigs_data, "size_weight"), palette=my_pal, hue="MAG_quality", hue_order=["HQ", "MQ", "LQ", "Contaminated", "Other"], flierprops=dict( markerfacecolor="0.5", markersize=1.0, marker="o", markeredgewidth=0.0, ), linewidth=1.25, width=1.0, dodge=False, ) plt.yscale("log") ax.set(xlabel="", ylabel="Assembly coverage distribution", xticklabels=[]) labels = [ "HQ MAGs (Completion > 90, Contamination < 5)", "MQ MAGs (Completion > 70, Contamination < 10)", "LQ MAGs (Completion > 50, Contamination < 10)", "Contaminated (Completion > 50, Contamination < 10)", "Other (Completion < 50)", ] handles, _ = boxplot.get_legend_handles_labels() boxplot.legend(handles, labels) # Save the file. plt.savefig(out_file, dpi=200, bbox_inches="tight") plt.close()
[docs]def generates_frags_network(contig_data, bin_summary, bin_size): """Generates info fragments for all contigs and start position all bins. Parameters: ----------- contigs_data : pandas.DataFrame Table with all the data from the contigs. bin_summary : dict Dictionnary with the informations of the final bins kept by MetaTOR. bin_size : int Size of the bin to plot the heatmap. Returns: -------- dict: Dictionnary with frags id of the contigs. list of int: List of the start positions of the mags int: Size of the final binned matrix """ # Define start frag id for each MAGs. The rest column is the size still # available from the previous contigs. frag_id = 0 mag_starts = [] for bin_name in bin_summary: bin_summary[bin_name]["start"] = frag_id mag_starts.append(frag_id) bin_summary[bin_name]["current"] = frag_id bin_summary[bin_name]["rest"] = 0 frag_id += (int(bin_summary[bin_name]["Length"]) // bin_size) + 1 # Define start frag id for each bins info_contigs = dict() for i in contig_data.index: # Only add the contig if it's in a final bin. if contig_data.loc[i, "Final_bin"] != "ND": # Call bin name dict to have the status of start frag id and rest. bin_name = contig_data.loc[i, "Final_bin"] size = contig_data.loc[i, "Size"] info_contigs[contig_data.loc[i, "Name"]] = { "start": bin_summary[bin_name]["current"], "init": bin_summary[bin_name]["rest"], } # Update rest and start position. bin_summary[bin_name]["current"] += ( size + bin_summary[bin_name]["rest"] ) // bin_size bin_summary[bin_name]["rest"] = ( size + bin_summary[bin_name]["rest"] ) % bin_size return info_contigs, mag_starts, frag_id
[docs]def network_heatmap(matrix, out_file=None, mag_starts=None): """Plot the heatmap of the network, with the contig ordering by their bin attribution. Parameters: ----------- matrix : numpy.array Binned dense matrix of the network. out_file : str Name of the file to save the plot. mag_starts : list of int List of bin positions where to draw mags starts as dotted lines. """ # Plot the heatmap. vmax = np.percentile(matrix, 99.5) im_kwargs = { "vmin": 0, "vmax": vmax, "cmap": "viridis", "interpolation": "none", } li_kwargs = {"ls": ":", "alpha": 0.8, "c": "white", "linewidth": 0.1} plt.figure() plt.imshow(matrix, **im_kwargs) plt.colorbar() plt.axis("off") # Add line between MAGs. if mag_starts is not None: for pos in mag_starts: plt.axvline(pos, **li_kwargs) plt.axhline(pos, **li_kwargs) # Save or show the file. if out_file: plt.savefig(out_file, bbox_inches="tight", pad_inches=0.0, dpi=2000) plt.close() else: plt.show()
[docs]def plot_figures(out_dir, contigs_data, bin_summary, threshold): """Function to generates all figures. Parameters: ----------- out_dir : str Path of the output directory of MetaTOR. contigs_data : pandas.DataFrame Table with all the data from the contigs. bin_summary : dict Dictionnary with the informations of the final bins kept by MetaTOR. threshold : int Minimun size of the bins considered. """ # Create plot directory plot_dir = join(out_dir, "plot") os.makedirs(plot_dir, exist_ok=True) # Test if Shotgun coverage have been given if contigs_data.loc[contigs_data.index[0], "Shotgun_coverage"] == "-": SG_cov = False else: SG_cov = True # Create outfiles outfile_bins_distribution = join(plot_dir, "bins_distribution.png") outfile_bins_size_distribution = join( plot_dir, "bins_size_distribution.png" ) outfile_MAGs_GC = join(plot_dir, "MAGs_GC_distribution.png") outfile_MAGs_HiC_cov = join(plot_dir, "MAGs_HiC_cov_distribution.png") if SG_cov: outfile_MAGs_SG_cov = join(plot_dir, "MAGs_SG_cov_distribution.png") # Look for pairs file in out_dir pairs_files = [] list_files = os.listdir(out_dir) for file in list_files: if re.search("\.pairs$", file): pairs_files.append(join(out_dir, file)) # Plot heatmap with a binning of 50kb. if len(pairs_files) > 0: heatmap_file = join(plot_dir, "network_heatmap.png") bin_size = 50000 info_contigs, mag_starts, N = generates_frags_network( contigs_data, bin_summary, bin_size ) matrix = build_matrix_network(pairs_files, info_contigs, N, bin_size) network_heatmap(matrix, heatmap_file, mag_starts) else: logger.warning( "No pairs alignment files found in %s, no heatmap will be generated.", out_dir, ) # Transform dictionnary to pandas DataFrame. bin_summary = pd.DataFrame.from_dict(bin_summary, orient="index") bin_summary["Bin"] = bin_summary.index bin_summary.index = range(len(bin_summary)) # Compute size of the assembly. total_size = sum(contigs_data["Size"]) # Plot bin distribution figures_bins_distribution(bin_summary, outfile_bins_distribution) bin_summary = figures_bins_size_distribution( bin_summary, total_size, threshold, outfile_bins_size_distribution ) # Remove unbinned contigs. contigs_data = contigs_data[contigs_data["Final_bin"] != "ND"] # Create a column of size divided by the minimum size of the contigs. min_size = min(contigs_data["Size"]) contigs_data["size_weight"] = (contigs_data["Size"] / min_size).apply(int) # Merge the bin_summary and the contigs data file. contigs_data = pd.merge( contigs_data, bin_summary, left_on="Final_bin", right_on="Bin" ) # Plots the distribution of GC and coverage inside the bins. figures_mags_GC_boxplots(contigs_data, outfile_MAGs_GC) figures_mags_HiC_cov_boxplots(contigs_data, outfile_MAGs_HiC_cov) if SG_cov: figures_mags_SG_cov_boxplots(contigs_data, outfile_MAGs_SG_cov)
[docs]def reindex_df(dataframe, weight_col): """Expand the dataframe to prepare for resampling result is 1 row per count per sample. Parameters: ----------- dataframe : pandas.core.frame.DataFrame Table to expand. weight_col : str Name of the weighted column to use. Returns: -------- pandas.core.frame.DataFrame: Table expanded with new index. """ dataframe = dataframe.reindex(dataframe.index.repeat(dataframe[weight_col])) dataframe.reset_index(drop=True, inplace=True) return dataframe