Source code for metator.validation

#!/usr/bin/env python3
# coding: utf-8

"""Validates bins using miComplete and make a recursive partition to try to
decontaminate them.

General functions to validate bins completion using miComplete and make 
recursive iterations of Louvain or Leiden to try to partition contaminated bins.
Only bins with more than 50% completion and 5% contamination are subject to the
recursive step. If the recursive step gave worst results than the first 
(decrease of the completion with no decrease of the contamination), it will keep
the original bin.


Functions in this module:
    - correct_final_bin
    - get_bin_coverage
    - give_results_info
    - merge_micomplete
    - micomplete_compare_bins
    - micomplete_quality
    - recursive_clustering
    - recursive_clustering_worker
    - recursive_decontamination
    - update_contigs_data_recursive
    - write_bins_contigs

CheckM deprecated functions:
    - checkm
    - checkm_compare_bins
"""

import logging
import metator.io as mio
import metator.figures as mtf
import metator.partition as mtp
import micomplete
import multiprocessing
import networkx as nx
import numpy as np
import os
import pandas as pd
import pyfastx
import shutil
import subprocess as sp
from functools import partial
from metator.log import logger
from os.path import join
from scipy import sparse
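
# The typical entry point of this module is `recursive_decontamination`, which
# chains the miComplete quality check, the recursive clustering and the final
# bin export. A minimal usage sketch is given below; the paths are hypothetical
# and the argument values only illustrate the call shape, not values enforced
# or recommended by MetaTOR:
#
#   clustering_matrix_file = recursive_decontamination(
#       algorithm="louvain",
#       assembly="assembly.fa",
#       cluster_matrix=False,
#       contig_data_file="contig_data_partition.txt",
#       final_fasta_dir="final_bin",
#       input_fasta_dir="overlapping_bin",
#       iterations=10,
#       network_file="network.txt",
#       outdir="out",
#       overlapping_parameter=90,
#       recursive_fasta_dir="recursive_bin",
#       resolution_parameter=1.0,
#       size=500000,
#       temp_directory="tmp",
#       threads=8,
#       prefix="metator",
#   )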


def correct_final_bin(contigs_data, final_fasta_dir, bin_summary):
    """Correct the final bin assignment of the contigs based on the sequences
    actually written in the final fasta files.

    Parameters:
    -----------
    contigs_data : pandas.core.frame.DataFrame
        Dataframe with the contigs information.
    final_fasta_dir : str
        Path to the directory containing the final fasta bins.
    bin_summary : dict
        Dictionary with the information of the final bins kept by MetaTOR.

    Returns:
    --------
    pandas.core.frame.DataFrame
        Dataframe with the contigs information.
    """
    contigs_data = contigs_data.set_index("Name", drop=False)
    for bin_name in bin_summary:
        fasta_file = join(final_fasta_dir, f"{bin_name}.fa")
        fasta = pyfastx.Fasta(fasta_file)
        for seq in fasta:
            contigs_data.loc[seq.name, "Final_bin"] = bin_name
    contigs_data = contigs_data.reset_index(drop=True)
    return contigs_data

def get_bin_coverage(bin_summary, contigs_data):
    """Function to compute the coverage of each bin.

    Parameters:
    -----------
    bin_summary : dict
        Dictionary with the information of the final bins kept by MetaTOR.
    contigs_data : pandas.core.frame.DataFrame
        Dataframe with the contigs information.

    Returns:
    --------
    dict:
        Dictionary with the information of the final bins kept by MetaTOR with
        the coverage.
    """
    # Compute HiC_coverage
    total_hic_hit = 0
    total_sg_hit = 0
    for i in contigs_data.index:
        bin_name = contigs_data.loc[i, "Final_bin"]
        if contigs_data.loc[contigs_data.index[0], "Shotgun_coverage"] != "-":
            total_sg_hit += (
                contigs_data.loc[i, "Size"]
                * contigs_data.loc[i, "Shotgun_coverage"]
            )
        total_hic_hit += contigs_data.loc[i, "Hit"]
        if bin_name != "ND":
            try:
                bin_summary[bin_name]["HiC_abundance"] += (
                    100 * contigs_data.loc[i, "Hit"]
                )
            except KeyError:
                bin_summary[bin_name]["HiC_abundance"] = (
                    100 * contigs_data.loc[i, "Hit"]
                )
            # If no depth files were given do not compute the Shotgun coverage.
            if (
                contigs_data.loc[contigs_data.index[0], "Shotgun_coverage"]
                != "-"
            ):
                try:
                    bin_summary[bin_name]["SG_abundance"] += (
                        contigs_data.loc[i, "Size"]
                        * contigs_data.loc[i, "Shotgun_coverage"]
                        * 100
                    )
                except KeyError:
                    bin_summary[bin_name]["SG_abundance"] = (
                        contigs_data.loc[i, "Size"]
                        * contigs_data.loc[i, "Shotgun_coverage"]
                        * 100
                    )
    for bin_name in bin_summary:
        # Divide the HiC abundance by two as the hits are counted twice.
        bin_summary[bin_name]["HiC_abundance"] /= 2 * total_hic_hit
        if contigs_data.loc[contigs_data.index[0], "Shotgun_coverage"] != "-":
            bin_summary[bin_name]["SG_abundance"] /= total_sg_hit
    return bin_summary

def give_results_info(bin_summary):
    """Function to log general information about the binning results.

    Parameters:
    -----------
    bin_summary : dict
        Dictionary with the summary results of the kept bins.
    """
    # Defined categories of the bins
    HQ = 0  # Completeness >= 90 and contamination <= 5
    total_size_HQ = 0
    MQ = 0  # Completeness >= 70 and contamination <= 10
    total_size_MQ = 0
    LQ = 0  # Completeness >= 50 and contamination <= 10
    total_size_LQ = 0
    conta_bins = 0  # Completeness >= 50 and contamination > 10
    total_size_conta_bins = 0
    others = 0  # Not determined bins.
    total_size_others = 0

    # Class each bin in a category
    for bin_name in bin_summary:
        completness = float(bin_summary[bin_name]["Weighted completeness"])
        contamination = float(bin_summary[bin_name]["Weighted redundancy"])
        size = int(bin_summary[bin_name]["Length"])
        if completness >= 0.5:
            if ((contamination - 1) / completness) > 0.1:
                conta_bins += 1
                total_size_conta_bins += size
            else:
                if (
                    completness >= 0.9
                    and ((contamination - 1) / completness) <= 0.05
                ):
                    HQ += 1
                    total_size_HQ += size
                elif completness >= 0.7:
                    MQ += 1
                    total_size_MQ += size
                else:
                    LQ += 1
                    total_size_LQ += size
        else:
            others += 1
            total_size_others += size
    total = HQ + MQ + LQ + conta_bins + others
    total_size = (
        total_size_HQ
        + total_size_MQ
        + total_size_LQ
        + total_size_conta_bins
        + total_size_others
    )

    # Return info in the logger:
    logger.info(
        "{0} bins have been kept after the recursive iterations.".format(total)
    )
    logger.info("Total size of the extracted bins: {0}".format(total_size))
    logger.info("HQ MAGs: {0}\tTotal Size: {1}".format(HQ, total_size_HQ))
    logger.info("MQ MAGs: {0}\tTotal Size: {1}".format(MQ, total_size_MQ))
    logger.info("LQ MAGs: {0}\tTotal Size: {1}".format(LQ, total_size_LQ))
    logger.info(
        "Contaminated potential MAGs: {0}\tTotal Size: {1}".format(
            conta_bins, total_size_conta_bins
        )
    )
    logger.info(
        "Other bins: {0}\tTotal Size: {1}".format(others, total_size_others)
    )

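# Worked example of the categorisation above (made-up values): a bin with
# "Weighted completeness" 0.95 and "Weighted redundancy" 1.03 has a
# contamination ratio of (1.03 - 1) / 0.95, about 0.032 (~3.2% contamination),
# so it is counted as an HQ MAG; with a redundancy of 1.12 the ratio would be
# about 0.126 (> 0.1) and the bin would be counted as a contaminated
# potential MAG.
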
def merge_micomplete(out_bact105, out_arch131, outfile):
    """Function to merge the bacterial and archaeal outputs of miComplete into
    one file.

    Parameters:
    -----------
    out_bact105 : str
        Output of miComplete using the 105 bacterial markers.
    out_arch131 : str
        Output of miComplete using the 131 archaeal markers.
    outfile : str
        Final merged output.
    """
    # Read both files.
    bact105 = pd.read_csv(out_bact105, sep="\t", comment="#", index_col=0).loc[
        :, :"CDs"
    ]
    arch131 = pd.read_csv(out_arch131, sep="\t", comment="#", index_col=0).loc[
        :, :"CDs"
    ]
    # Write header
    with open(outfile, "w") as out:
        out.write("## miComplete\n")
        out.write(f"## v{micomplete.__version__}\n")
        out.write("## Weights: Bact105 and Arch131\n")
        out.write(
            "Name\t{0}\tMarkers\n".format("\t".join(list(arch131.columns)))
        )
        for bin_id in bact105.index:
            if (
                bact105.loc[bin_id, "Weighted completeness"]
                >= arch131.loc[bin_id, "Weighted completeness"]
            ):
                out.write(
                    "{0}\t{1}\tBacteria\n".format(
                        bin_id, "\t".join(map(str, list(bact105.loc[bin_id])))
                    )
                )
            else:
                out.write(
                    "{0}\t{1}\tArchaea\n".format(
                        bin_id, "\t".join(map(str, list(arch131.loc[bin_id])))
                    )
                )

def micomplete_compare_bins(
    recursive_micomplete_file,
    bin_summary,
    parent_dict,
    step,
):
    """Compare the completeness and contamination of the bins from the first
    step and from the recursive step. If the recursive step decreases the
    completeness of one bin without decreasing its contamination, the bin from
    before the recursive step is kept. Moreover, if the completeness goes below
    50% with the recursion, the recursive bins are not kept. The goal is to
    keep complete bins, even if they are contaminated, so that users can
    manually curate them.

    Parameters:
    -----------
    recursive_micomplete_file : str
        Path to the miComplete summary from the recursive step.
    bin_summary : dict
        Dictionary containing information about the bins.
    parent_dict : dict
        Dictionary with recursive bin_id as key and parent bin as value.
    step : int
        Recursive step.

    Returns:
    --------
    dict:
        Dictionary with the information of the final bins kept by MetaTOR.
    boolean:
        True if at least one recursive bin has been kept.
    """
    # Setup contamination
    contamination = False

    # Load the miComplete summary
    micomplete_recursive_summary = pd.read_csv(
        recursive_micomplete_file,
        sep="\t",
        comment="#",
        index_col=0,
    ).iloc[:, :13]

    bin_summary_tab = pd.DataFrame.from_dict(bin_summary, orient="index")

    # Create new columns for recursive values
    bin_summary_tab["max_rec_completness"] = np.nan
    bin_summary_tab["rec_id"] = np.nan

    # Retrieve the maximum completeness of the recursive bins.
    for recursive_bin in micomplete_recursive_summary.index:
        parent_bin = parent_dict[recursive_bin]
        bin_summary_tab.loc[parent_bin, "max_rec_completness"] = max(
            float(
                micomplete_recursive_summary.loc[
                    recursive_bin, "Weighted completeness"
                ]
            ),
            bin_summary_tab.loc[parent_bin, "max_rec_completness"],
        )
        try:
            bin_summary_tab.loc[parent_bin, "rec_id"].append(recursive_bin)
        except AttributeError:
            bin_summary_tab.loc[parent_bin, "rec_id"] = [recursive_bin]

    # If some recursive bins have not lost too much completeness, write their
    # information; otherwise keep the original bins.
    for overlapping_bin in bin_summary_tab.index:
        if np.isnan(
            bin_summary_tab.loc[overlapping_bin, "max_rec_completness"]
        ):
            bin_summary[overlapping_bin]["recursive"] = False
        else:
            max_rec = bin_summary_tab.loc[
                overlapping_bin, "max_rec_completness"
            ]
            over = float(
                bin_summary_tab.loc[overlapping_bin, "Weighted completeness"]
            )
            if (max_rec - 0.4) > ((over - 0.4) / 1.6):
                rec_ids = bin_summary_tab.loc[overlapping_bin, "rec_id"]
                # Case of only one bin.
                if isinstance(rec_ids, str):
                    rec_ids = [rec_ids]
                for rec_id in rec_ids:
                    bin_summary[rec_id] = micomplete_recursive_summary.loc[
                        rec_id, :"CDs"
                    ].to_dict()
                    bin_summary[rec_id]["recursive"] = True
                    bin_summary[rec_id]["step"] = step
                    bin_summary[rec_id]["parent"] = overlapping_bin
                # If at least one bin comes from the recursive step, set
                # contamination to True to continue the while loop.
                contamination = True
                bin_summary.pop(overlapping_bin, "None")
            else:
                bin_summary[overlapping_bin]["recursive"] = False

    return bin_summary, contamination

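# Worked example of the acceptance rule above (made-up values): for a parent
# bin with "Weighted completeness" over = 0.9, its recursive bins are kept only
# if the best recursive completeness max_rec satisfies
# (max_rec - 0.4) > (0.9 - 0.4) / 1.6 = 0.3125, i.e. max_rec greater than about
# 0.71; below that, the original (contaminated but more complete) bin is kept
# so that it can still be curated manually.
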
def micomplete_quality(fasta_dir, outfile, threads):
    """Function to evaluate fasta bins using miComplete. Write the bins quality
    summary in the outfile.

    Parameters:
    -----------
    fasta_dir : str
        Path to the input fasta of the bins to evaluate.
    outfile : str
        Path to the file where the results of miComplete will be written.
    threads : int
        Number of threads to use for miComplete.
    """
    # Prepare the input table for miComplete.
    list_fasta = filter(
        lambda x: ".fa" in x,
        [join(fasta_dir, path) for path in os.listdir(fasta_dir)],
    )
    tmp_seq_tab = join(fasta_dir, "micomplete_seq.tsv")
    with open(tmp_seq_tab, "w") as tab:
        for fasta in list_fasta:
            tab.write(f"{fasta}\tfna\n")

    # Create two temporary outputs for archaea and bacteria.
    out_bact105 = join(fasta_dir, "micomplete_bact105.tsv")
    out_arch131 = join(fasta_dir, "micomplete_arch131.tsv")

    # Launch miComplete using subprocess for bacteria.
    cmd = f"miComplete {tmp_seq_tab} --hmms Bact105 --weights Bact105 --threads {threads} --outfile {out_bact105}"
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Launch miComplete using subprocess for archaea.
    cmd = f"miComplete {tmp_seq_tab} --hmms Arch131 --weights Arch131 --threads {threads} --outfile {out_arch131}"
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Merge bacteria and archaea
    merge_micomplete(out_bact105, out_arch131, outfile)

    # Remove miComplete temporary files.
    for path in filter(lambda x: ".fa" in x, os.listdir(fasta_dir)):
        name = path.split(".")[0]
        os.remove(f"{name}_prodigal.faa")
        os.remove(f"{name}.tblout")
    os.remove(tmp_seq_tab)
    os.remove(out_bact105)
    os.remove(out_arch131)
    try:
        os.remove("miComplete.log")
    except FileNotFoundError:
        pass

def recursive_clustering(
    assembly,
    iterations,
    overlapping_parameter,
    resolution_parameter,
    outdir,
    recursive_fasta_dir,
    algorithm,
    tmpdir,
    bin_summary,
    contigs_data,
    network,
    cluster_matrix,
    size,
    threads,
    prefix,
):
    """Function to run recursive iterations on contaminated bins in order to
    try to improve the quality of the bins using the Louvain or Leiden
    algorithm.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file used as assembly.
    iterations : int
        Number of iterations to use for the recursive iterations of Louvain or
        Leiden.
    overlapping_parameter : float
        Hamming distance threshold to consider two bins as the same bin.
    resolution_parameter : float
        Resolution parameter of the Leiden algorithm.
    outdir : str
        Path to the output directory.
    recursive_fasta_dir : str
        Path to the directory where to write the decontaminated fasta.
    algorithm : str
        Algorithm to use, either louvain, leiden or spinglass.
    tmpdir : str
        Path to the temporary directory.
    bin_summary : dict
        Dictionary containing information about the bins.
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs.
    network : networkx.classes.graph.Graph
        MetaTOR full network.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    size : int
        Size threshold in base pairs of the bins.
    threads : int
        Number of threads to use.
    prefix : str
        Sample prefix to use.

    Returns:
    --------
    boolean:
        True if at least one new recursive bin has been generated.
    pandas.DataFrame:
        Updated table with all the data from the contigs.
    str:
        Path to the saved clustering matrix of the recursive step, or None if
        it was not built.
    dict:
        Dictionary with recursive bin_id as key and parent bin as value.
    """
    # Create temporary folders.
    tmpdir_binning = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_binning, exist_ok=True)

    # Default no contamination
    contamination = False

    # Create an empty matrix
    N = len(contigs_data.ID)
    clustering_matrix = sparse.coo_matrix((N + 1, N + 1), dtype=np.float32)

    # Set iterations to one if the spinglass partition is used.
    if algorithm == "spinglass":
        iterations = 1
        logger.error("Spinglass is no longer maintained.")
        raise ValueError

    # Check which bin_ids to decontaminate.
    bin_ids = []
    for bin_id in bin_summary:
        try:
            completness = float(bin_summary[bin_id]["Weighted completeness"])
            conta = float(bin_summary[bin_id]["Weighted redundancy"])
            recursive = bin_summary[bin_id]["recursive"]
            if recursive:
                if completness >= 0.33:
                    if (conta - 1) / completness >= 0.05:
                        bin_ids.append(bin_id)
        except KeyError:
            continue

    # Iterate on the miComplete summary to find contaminated bins.
    # pool = multiprocessing.Pool(processes=threads)
    output_partitions = list(
        map(
            partial(
                recursive_clustering_worker,
                bin_summary=bin_summary,
                tmpdir=tmpdir,
                network=network,
                algorithm=algorithm,
                iterations=iterations,
                resolution_parameter=resolution_parameter,
                contigs_data=contigs_data,
            ),
            bin_ids,
        )
    )

    parent_dict = dict()
    for i, bin_id in enumerate(bin_ids):
        output_partition = output_partitions[i]

        # Detect core bins
        (
            recursive_core_bins,
            recursive_bins_iterations,
        ) = mtp.detect_core_bins(output_partition, iterations)

        # Compute the Hamming distance between core bins.
        hamming_distance = mtp.get_hamming_distance(
            recursive_bins_iterations,
            threads,
        )

        # Define overlapping bins according to the threshold
        recursive_bins = mtp.defined_overlapping_bins(
            overlapping_parameter,
            hamming_distance,
            recursive_core_bins,
        )

        # Update bin data and generate fasta
        (
            contamination,
            contigs_data,
            parent_dict,
        ) = update_contigs_data_recursive(
            bin_id,
            contigs_data,
            recursive_bins,
            assembly,
            recursive_fasta_dir,
            tmpdir_binning,
            size,
            contamination,
            parent_dict,
            prefix,
        )

        # Build the clustering matrix of the subnetwork and add it.
        if cluster_matrix > 0:
            clustering_matrix += mtp.build_clustering_matrix(
                recursive_core_bins, hamming_distance, cluster_matrix
            )

        logger.info("Recursive step for {0} is done.".format(bin_id))

    # Save the clustering matrix
    if cluster_matrix:
        clustering_matrix_file = join(outdir, "clustering_matrix_recursive")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return contamination, contigs_data, clustering_matrix_file, parent_dict

def recursive_clustering_worker(
    bin_id,
    bin_summary,
    tmpdir,
    network,
    algorithm,
    iterations,
    resolution_parameter,
    contigs_data,
):
    """Worker to partition one bin if it is contaminated.

    Parameters:
    -----------
    bin_id : str
        Name of the bin to partition.
    bin_summary : dict
        Dictionary with the output of miComplete as values and the bin ids as
        keys.
    tmpdir : str
        Path to the temporary directory.
    network : networkx.classes.graph.Graph
        Network of contigs from the HiC libraries.
    algorithm : str
        Algorithm to use, either louvain, leiden or spinglass.
    iterations : int
        Number of iterations to use for the recursive iterations of Louvain or
        Leiden.
    resolution_parameter : float
        Resolution parameter of the Leiden algorithm.
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs.
    """
    # Create temporary folders.
    tmpdir_subnetwork = join(tmpdir, "recursive_bins", bin_id)
    os.makedirs(tmpdir_subnetwork, exist_ok=True)
    tmpdir_clustering = join(tmpdir, "recursive_clustering", bin_id)
    os.makedirs(tmpdir_clustering, exist_ok=True)

    logger.info("Bin in progress: {0}".format(bin_id))
    subnetwork_file = join(tmpdir_subnetwork, "subnetwork_" + bin_id + ".txt")
    over_bin_id = str(bin_id.split("_")[-2])
    rec_bin_id = str(bin_id.split("_")[-1])

    # Extract contigs
    mask = (contigs_data["Overlapping_bin_ID"] == over_bin_id) & (
        contigs_data["Recursive_bin_ID"].apply(str) == rec_bin_id
    )
    list_contigs = list(contigs_data.loc[mask, "ID"])

    # Extract subnetwork
    subnetwork = network.subgraph(list_contigs)

    # Write the new subnetwork
    nx.write_edgelist(
        subnetwork, subnetwork_file, delimiter="\t", data=["weight"]
    )

    # Compute the spin prediction from the completeness/contamination values.
    spin = max(
        2,
        int(1 + float(bin_summary[bin_id]["Weighted redundancy"])),
    )

    # Partition the subnetwork.
    output_partition = mtp.algo_partition(
        algorithm,
        subnetwork_file,
        subnetwork,
        iterations,
        resolution_parameter,
        tmpdir_clustering,
        spin,
    )
    logger.info("Output partition done for {0}.".format(bin_id))
    return output_partition

def recursive_decontamination(
    algorithm,
    assembly,
    cluster_matrix,
    contig_data_file,
    final_fasta_dir,
    input_fasta_dir,
    iterations,
    network_file,
    outdir,
    overlapping_parameter,
    recursive_fasta_dir,
    resolution_parameter,
    size,
    temp_directory,
    threads,
    prefix,
):
    """Function to validate the bins and run the recursive decontamination
    using the Louvain or Leiden algorithm.

    Parameters:
    -----------
    algorithm : str
        Algorithm to use to recursively partition the network. Either leiden,
        louvain or spinglass.
    assembly : str
        Path to the assembly file used for the partition.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    contig_data_file : str
        Path to the contig data table to update.
    final_fasta_dir : str
        Path where to write the final fasta of the decontaminated bins.
    input_fasta_dir : str
        Path to the directory where the fasta bins from the partition are.
    iterations : int
        Number of iterations to use for the recursive partition.
    network_file : str
        Path to the network file.
    outdir : str
        Path to the output directory where to write the output files.
    overlapping_parameter : int
        Hamming distance threshold in percentage used to consider two bins as
        one in the recursive partition.
    recursive_fasta_dir : str
        Path where to write the fasta of the decontaminated bins.
    resolution_parameter : float
        Resolution parameter to use if the Leiden algorithm is chosen. It will
        be a factor of the cost function used. A resolution parameter of 1 is
        equivalent to the modularity function used in Louvain. The higher this
        parameter, the smaller the output bins.
    size : int
        Threshold size in base pairs of the output bins.
    temp_directory : str
        Path to the directory used to write temporary files.
    threads : int
        Number of threads to use.
    prefix : str
        Sample prefix to use.

    Returns:
    --------
    str:
        Path to the saved clustering matrix of the recursive steps, or None if
        it was not built.
    """
    # Create folders in the temporary directory
    tmpdir_recursive_clustering = join(temp_directory, "recursive_clustering")
    os.makedirs(tmpdir_recursive_clustering, exist_ok=True)

    # Define the miComplete output file path
    overlapping_micomplete_file = join(
        outdir, "overlapping_micomplete_results.txt"
    )

    logger.info("Launch miComplete quality check.")

    # Launch miComplete
    micomplete_quality(
        input_fasta_dir,
        overlapping_micomplete_file,
        threads,
    )

    # Load network:
    network = nx.read_edgelist(
        network_file, nodetype=int, data=(("weight", float),)
    )

    # Load contigs data:
    contigs_data = pd.read_csv(
        contig_data_file,
        sep="\t",
        header=0,
        converters={"Overlapping_bin_ID": str},
    )
    contigs_data["index"] = contigs_data["ID"] - 1
    contigs_data = contigs_data.set_index("index")

    # Add new columns for the recursive information.
    contigs_data["Recursive_bin_ID"] = f"{0:05d}"
    contigs_data["Recursive_bin_contigs"] = "-"
    contigs_data["Recursive_bin_size"] = "-"
    contigs_data["Final_bin"] = "ND"

    # Load the miComplete result:
    bin_summary = mio.micomplete_results_to_dict(overlapping_micomplete_file)

    # Add columns about the recursive step.
    for bin_id in bin_summary:
        bin_summary[bin_id]["recursive"] = True
        bin_summary[bin_id]["step"] = 0
        bin_summary[bin_id]["parent"] = None

    # Recursively remove contamination.
    contamination = True
    step = 1
    logger.info("Start the recursive decontamination step:")
    while contamination:
        # Create fasta dir.
        recursive_fasta_dir_step = join(recursive_fasta_dir, f"step_{step}")
        os.makedirs(recursive_fasta_dir_step, exist_ok=True)

        # Stop reporting info-level logs
        logger.setLevel(logging.WARNING)

        # Iterate Louvain or Leiden on the contaminated and complete bins.
        (
            contamination,
            contigs_data,
            clustering_matrix_file,
            parent_dict,
        ) = recursive_clustering(
            assembly,
            iterations,
            overlapping_parameter,
            resolution_parameter,
            outdir,
            recursive_fasta_dir_step,
            algorithm,
            tmpdir_recursive_clustering,
            bin_summary,
            contigs_data,
            network,
            cluster_matrix,
            size,
            threads,
            prefix,
        )

        # Put back the info log
        logger.setLevel(logging.INFO)

        # Recursive iterations of Louvain or Leiden on the contaminated bins.
        # Save the bin information if the new bins have the same quality,
        # otherwise keep the original bin information.
        if contamination:
            # Run miComplete on the recursive bins.
            recursive_micomplete_file = join(
                outdir, f"recursive_micomplete_step_{step}.txt"
            )
            micomplete_quality(
                recursive_fasta_dir_step,
                recursive_micomplete_file,
                threads,
            )

            # Compare
            bin_summary, contamination = micomplete_compare_bins(
                recursive_micomplete_file,
                bin_summary,
                parent_dict,
                step,
            )

        # Keep the overlapping bin information
        else:
            logger.info(
                f"No more contaminated bins have been found after {step} steps."
            )

        # Increase count
        logger.info(f"Recursive step {step} done.")
        step += 1
        if step == 10:
            contamination = False

    # Create the fasta directory and copy the final bins.
    for bin_name in bin_summary:
        dst = join(final_fasta_dir, bin_name + ".fa")
        step = bin_summary[bin_name]["step"]
        if step == 0:
            src = join(input_fasta_dir, bin_name + ".fa")
        else:
            src = join(recursive_fasta_dir, f"step_{step}", f"{bin_name}.fa")
        shutil.copyfile(src, dst)

    # Report some values on the efficiency of the binning.
    give_results_info(bin_summary)

    # Write the relevant bins/contigs information for anvi'o.
    binning_file = join(outdir, "binning.txt")
    contigs_data = write_bins_contigs(
        bin_summary, contigs_data, binning_file, prefix
    )

    # Compute the abundance of the MAGs.
    bin_summary = get_bin_coverage(bin_summary, contigs_data)

    # Save the bin information in the final file
    bin_summary_file = join(outdir, "bin_summary.txt")
    mio.write_bin_summary(bin_summary, bin_summary_file)

    # Correct the final bin values in the contigs data
    contigs_data = correct_final_bin(contigs_data, final_fasta_dir, bin_summary)

    # Write the new file
    contig_data_file_final = join(outdir, "contig_data_final.txt")
    contigs_data.to_csv(
        contig_data_file_final, sep="\t", header=True, index=True
    )

    # Plot some figures of the contigs distribution inside the bins:
    mtf.plot_figures(outdir, contigs_data, bin_summary, size)

    return clustering_matrix_file

def update_contigs_data_recursive(
    bin_id,
    contigs_data,
    recursive_bins,
    assembly,
    outdir,
    tmpdir,
    size,
    contamination,
    parent_dict,
    prefix,
):
    """Update the data of the bins according to the recursive step and generate
    their fasta.

    Parameters:
    -----------
    bin_id : str
        Name of the parental bin.
    contigs_data : pandas.DataFrame
        Table with all the data from the contigs.
    recursive_bins : dict
        Dictionary which has as keys the values of the recursive iterations
        from Louvain or Leiden separated by a semicolon and as values the list
        of the ids of the contigs.
    assembly : str
        Path to the fasta file.
    outdir : str
        Path to the output directory where to write the new fasta.
    tmpdir : str
        Path to the directory where to write the temporary contigs list.
    size : int
        Size threshold to generate the fasta.
    contamination : boolean
        True if one bin has already been generated, False otherwise.
    parent_dict : dict
        Dictionary with recursive bin_id as key and parent bin as value.
    prefix : str
        Sample prefix to use.

    Returns:
    --------
    boolean:
        True if one bin has already been generated, False otherwise.
    pandas.DataFrame:
        Updated table with all the data from the contigs.
    dict:
        Dictionary with recursive bin_id as key and parent bin as value.
    """
    # Add recursive bin information
    rec_id = 1
    # Extract the last recursive ID.
    over_id = contigs_data.loc[recursive_bins[1][0] - 1, "Overlapping_bin_ID"]
    max_rec_id = max(
        map(
            int,
            contigs_data[contigs_data.Overlapping_bin_ID == over_id][
                "Recursive_bin_ID"
            ],
        )
    )
    for i in recursive_bins:
        # Extract contigs of the bin
        recursive_bin = [id - 1 for id in recursive_bins[i]]
        recursive_bin_data = contigs_data.loc[recursive_bin]
        recursive_bin_contigs_number = len(recursive_bin)
        recursive_bin_length = sum(recursive_bin_data.Size)

        if recursive_bin_length > size:
            # If a new bin is generated, change the boolean value to True
            contamination = True
            rec_final_id = max_rec_id + rec_id

            # Write the new information
            contigs_data.loc[
                recursive_bin, "Recursive_bin_ID"
            ] = f"{rec_final_id:05d}"
            contigs_data.loc[
                recursive_bin, "Recursive_bin_contigs"
            ] = recursive_bin_contigs_number
            contigs_data.loc[
                recursive_bin, "Recursive_bin_size"
            ] = recursive_bin_length

            # Define the name of the recursive bin
            oc_id = int(
                contigs_data.loc[recursive_bin[0], "Overlapping_bin_ID"]
            )
            output_file = join(
                outdir, f"{prefix}_{oc_id:05d}_{rec_final_id:05d}.fa"
            )

            # Update bin_summary:
            parent_dict[f"{prefix}_{oc_id:05d}_{rec_final_id:05d}"] = bin_id

            # Retrieve the names of the contigs
            list_contigs = list(contigs_data.loc[recursive_bin, "Name"])

            # Generate the fasta
            contigs_file = join(
                tmpdir, f"{prefix}_{oc_id:05d}_{rec_final_id:05d}.txt"
            )
            with open(contigs_file, "w") as f:
                for contig in list_contigs:
                    f.write("{0}\n".format(contig))
            cmd = "pyfastx extract {0} -l {1} > {2}".format(
                assembly, contigs_file, output_file
            )
            process = sp.Popen(cmd, shell=True)
            process.communicate()

            # Add one to the recursive id
            rec_id += 1
    return contamination, contigs_data, parent_dict

def write_bins_contigs(bin_summary, contigs_data, outfile, prefix):
    """Function to write a table with the nodes kept in the bins and their bin
    id. The file is adapted to be imported in anvi'o.

    Parameters:
    -----------
    bin_summary : dict
        Dictionary with the information of the final bins kept by MetaTOR.
    contigs_data : pandas.core.frame.DataFrame
        Dataframe with the contigs information.
    outfile : str
        Path where to write the output file.
    prefix : str
        Sample prefix to use.

    Returns:
    --------
    pandas.core.frame.DataFrame
        Dataframe with the contigs information and the final bin information.
    """
    # Create a dictionary with the ids of the bins
    list_bin_id = dict()
    for bin_name in bin_summary:
        over_id = bin_name.split("_")[-2]
        rec_id = bin_name.split("_")[-1]
        try:
            list_bin_id[over_id].append(rec_id)
        except KeyError:
            list_bin_id[over_id] = [rec_id]

    # Write the contig ids with their bin ids in the table file
    with open(outfile, "w") as f:
        for i in contigs_data.index:
            over_id = str(contigs_data.loc[i, "Overlapping_bin_ID"])
            rec_id = str(contigs_data.loc[i, "Recursive_bin_ID"])
            try:
                rec_ids = list_bin_id[over_id]
                binned = False
                # Case of a recursive bin
                if rec_id in rec_ids:
                    binned = True
                # Case where the recursive bins were not kept.
                elif rec_ids == ["0"]:
                    rec_id = "0"
                    binned = True
                if binned:
                    final_bin = f"{prefix}_{over_id}_{rec_id}"
                    contigs_data.loc[i, "Final_bin"] = final_bin
                    f.write(
                        "{0}\t{1}\n".format(
                            contigs_data.loc[i, "Name"], final_bin
                        )
                    )
            except KeyError:
                pass
    return contigs_data

#############################################
# Deprecated functions for checkM validation.
#############################################

def checkm(fasta_dir, outfile, taxonomy_file, tmpdir, threads):
    """Function to evaluate fasta bins using CheckM. Write the CheckM results
    summary in the outfile and the taxonomy results in the taxonomy file.

    Parameters:
    -----------
    fasta_dir : str
        Path to the input fasta of the bins to evaluate.
    outfile : str
        Path to the file where the results of CheckM will be written.
    taxonomy_file : str
        Path to the file where the CheckM taxonomy results will be written.
    tmpdir : str
        Path to the temporary directory where CheckM intermediary files will be
        written.
    threads : int
        Number of threads to use for CheckM.
    """
    logger.info("Start CheckM validation.")

    # Build the CheckM tree
    cmd = "checkm tree -q -t {0} -x fa {1} {2}".format(
        threads, fasta_dir, tmpdir
    )
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Build the taxonomy values of the bins
    cmd = "checkm tree_qa {0} -q -o 1 -f {1}".format(tmpdir, taxonomy_file)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Build the lineage marker set
    markers_set = join(tmpdir, "markers.txt")
    cmd = "checkm lineage_set -q {0} {1}".format(tmpdir, markers_set)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Compute the analysis
    cmd = "checkm analyze -q -x fa -t {0} {1} {2} {3}".format(
        threads, markers_set, fasta_dir, tmpdir
    )
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Write the summary file
    cmd = "checkm qa -q {0} {1} -o 2 > {2}".format(markers_set, tmpdir, outfile)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

def checkm_compare_bins(
    overlapping_checkm_file,
    overlapping_taxonomy_file,
    recursive_checkm_file,
    recursive_taxonomy_file,
    prefix,
):
    """Compare the completeness and contamination of the bins from the first
    step and from the recursive step. If the recursive step decreases the
    completeness of one bin without decreasing its contamination, the bin from
    before the recursive step is kept. Moreover, if the completeness goes below
    50% with the recursion, the recursive bins are not kept.

    Parameters:
    -----------
    overlapping_checkm_file : str
        Path to the CheckM summary from the overlapping step.
    overlapping_taxonomy_file : str
        Path to the overlapping CheckM taxonomy results file.
    recursive_checkm_file : str
        Path to the CheckM summary from the recursive step.
    recursive_taxonomy_file : str
        Path to the recursive CheckM taxonomy results file.
    prefix : str
        Sample prefix to use.

    Returns:
    --------
    dict:
        Dictionary with the information of the final bins kept by MetaTOR.
    """
    # Load the CheckM summaries
    checkm_summary_overlapping = mio.read_results_checkm(
        overlapping_checkm_file, overlapping_taxonomy_file
    )
    checkm_summary_recursive = mio.read_results_checkm(
        recursive_checkm_file, recursive_taxonomy_file
    )

    # Prepare a dictionary for the final summary.
    checkm_summary = dict()

    # Retrieve the maximum completeness of the recursive bins.
    for recursive_bin in checkm_summary_recursive:
        overlapping_bin = "_".join(
            [f"{prefix}", recursive_bin.split("_")[-2], "0"]
        )
        try:
            checkm_summary_overlapping[overlapping_bin][
                "max_rec_completness"
            ] = max(
                float(checkm_summary_recursive[recursive_bin]["completness"]),
                checkm_summary_overlapping[overlapping_bin][
                    "max_rec_completness"
                ],
            )
            checkm_summary_overlapping[overlapping_bin]["rec_id"].append(
                recursive_bin
            )
        except KeyError:
            checkm_summary_overlapping[overlapping_bin][
                "max_rec_completness"
            ] = float(checkm_summary_recursive[recursive_bin]["completness"])
            checkm_summary_overlapping[overlapping_bin]["rec_id"] = [
                recursive_bin
            ]

    # If some recursive bins have not lost too much completeness, write their
    # information; otherwise keep the original bins.
    for overlapping_bin in checkm_summary_overlapping:
        try:
            max_rec = checkm_summary_overlapping[overlapping_bin][
                "max_rec_completness"
            ]
            over = float(
                checkm_summary_overlapping[overlapping_bin]["completness"]
            )
            if (max_rec > (over / 2)) & (max_rec > 50):
                for rec_id in checkm_summary_overlapping[overlapping_bin][
                    "rec_id"
                ]:
                    checkm_summary[rec_id] = checkm_summary_recursive[rec_id]
            else:
                checkm_summary_overlapping[overlapping_bin].pop(
                    "max_rec_completness"
                )
                checkm_summary_overlapping[overlapping_bin].pop("rec_id")
                checkm_summary[overlapping_bin] = checkm_summary_overlapping[
                    overlapping_bin
                ]
        except KeyError:
            checkm_summary[overlapping_bin] = checkm_summary_overlapping[
                overlapping_bin
            ]
    return checkm_summary