Source code for muffin.load

"""
Functions to create a dataset from arrays or bam files.
"""
import anndata as ad
import numpy as np
import pandas as pd
import pyranges as pr
from .utils import common
import warnings
import muffin


[docs]
def dataset_from_arrays(count_table, row_names=None, col_names=None, input_count_table=None,
                        random_loc_count_table=None, random_loc_input_table=None):
    """
    Import count matrices from python array.

    Parameters
    ----------
    count_table : integer ndarray
        (sample, genomic feature) RAW, NON NORMALIZED COUNT data. To save time and memory
        you can use an int32 format.
    row_annot : None or dict
        If set to None, will give an integer id to each row (1->n).
        If set to a dict, expects a dict of the form {annotation_name:annotations},
        where annotations are ordered according to the count table.
        By default None.
    col_annot : None or dict
        If set to None, will give an integer id to each column (1->n).
        If set to a dict, expects a dict of the form {annotation_name:annotations},
        where annotations are ordered according to the count table.
        By default None.
    input_count_table : integer or float ndarray or None, optional
        For sequencing data with input only !
        (sample, genomic feature) Raw, non normalized input counts (int) or 
        input with custom normalization. If your input counts are already normalized,
        you have to set input_already_normalized to True.
        To save time and memory you can use a 32bits format.
        By default None
    random_loc_count_table : integer ndarray, optional
        For sequencing data with input only ! Mandatory if your input count table
        is not already normalized.
        (sample, n_random_intervals) Raw, non normalized counts (int) sampled
        at random genomic intervals (preferably at least 5,000). 
        To save time and memory you can use an int32 format.
        By default None
    random_loc_input_table : integer ndarray, optional
        For sequencing data with input only ! Mandatory if your input count table
        is not already normalized.
        (sample, n_random_intervals) Raw, non normalized counts (int) sampled
        at random genomic intervals (preferably at least 5,000). 
        To save time and memory you can use an int32 format.
        By default None

    Returns
    -------
    dataset : AnnData
        Annotated data matrix
    """
    dataset = ad.AnnData(count_table)
    if row_names is None:
        dataset.obs_names = np.arange(len(count_table)).astype(str)
    else:
        dataset.obs_names = row_names.astype(str)
    if col_names is None:
        dataset.var_names = np.arange(count_table.shape[1]).astype(str)
    else:
        dataset.var_names = col_names.astype(str)
    if input_count_table is not None:
        dataset.layers["input"] = input_count_table
        dataset.uns["counts_random"] = random_loc_count_table
        dataset.uns["input_random"] = random_loc_input_table
    return dataset



[docs]
def dataset_from_bam(bam_paths, genomic_regions=None, genomic_regions_path=None, row_names=None, col_names=None,
                    featureCounts_params=None, isBEDannotation=True, input_bam_paths=None, 
                    n_random_input=10000, chromsizes=None):
    """
    Initialize count matrices using BAM files and query random regions.

    Parameters
    ----------
    bam_paths : str array-like 
        Paths to BAM files.
    genomic_regions: pandas Dataframe
        Genomic annotation in bed format
    genomic_regions_path : str
        Path to a bed or SAF genome annotation file
    row_annot : None or dict, optional
        If set to None, will give file name to each row (1->n).
        If set to a dict, expects a dict of the form {annotation_name:annotations},
        where annotations are ordered according to the BAM file order.
        By default None.
    col_annot : None or dict, optional
        If set to None, will give use the given genomic regions as annotations.
        If set to a dict, expects a dict of the form {annotation_name:annotations},
        where annotations are ordered according to the genomic_regions_path.
        By default None.
    featureCounts_params : None or dict, optional
        Dictionary of keyword arguments to be passed to featureCounts, see featureCounts
        R package documentation, by default None
    isBEDannotation : bool, optional
        If set to true assumes the genomic_regions_path is in .bed format with strand
        information otherwise, it assumes a SAF file (see featureCounts doc), by default True.
    input_bam_paths : str, optional
        For sequencing data with input only !
        Paths to input BAM files, path order has to match the signal bam_paths.
        By default None.
    n_random_input : int, optional
        For sequencing data with input only !
        Number of random regions to sample to estimate centering and scaling factors.
        By default 10000.
    chromsizes : dict or None, optional
        Mandatory for sequencing data with input !
        Set of chromosome that will be used to generate random sampling locations.
        Dict with format : {"chr1":3000000, "chr2":456455412}.

    Returns
    -------
    dataset
        _description_
    """        
    try:
        import rpy2.robjects as ro
        from rpy2.robjects import numpy2ri, pandas2ri
        from rpy2.robjects.conversion import localconverter
        from rpy2.robjects.packages import importr
        subread = importr("Rsubread")
    except:
        raise ImportError("""R or rsubread are not installed !\n
                          On unix systems, use :\n
                          conda install -c bioconda biocondutor-rsubread.\n
                          On windows, install R then manually install rsubread within R.
                          conda install -c conda-forge r""")
    common.createDir(muffin.params["temp_dir"])
    # Append random regions to bed
    if input_bam_paths is not None:
        if chromsizes is None:
            raise ValueError("Missing chromosome size dictionary !")
        np.random.seed(42)
        randomRegions = pr.random(n=n_random_input, length=500, chromsizes=chromsizes, strand=False, int64=True).as_df()
        randomRegions.columns = ["Chr", "Start", "End"]
        randomRegions.insert(0, "GeneID", ["r"+str(i) for i in range(n_random_input)])
        randomRegions.insert(3, "Strand", ["." for i in range(n_random_input)])
    # Convert bed file to SAF
    if isBEDannotation:
        if genomic_regions is None:
            bedFile = pd.read_csv(genomic_regions_path, sep="\t", header=None)
        else:
            bedFile = genomic_regions
        ncols = bedFile.shape[1]
        if ncols >= 6:
            bedFile = bedFile.iloc[:, [0,1,2,5]]
            bedFile.insert(0, "GeneID", ["r"+str(i) for i in range(len(bedFile))])
            bedFile.columns = ["GeneID", "Chr", "Start", "End", "Strand"]
        else:
            bedFile = bedFile.iloc[:, [0,1,2]]
            bedFile.columns = ["Chr", "Start", "End"]
            bedFile.insert(0, "GeneID", ["r"+str(i) for i in range(len(bedFile))])
            bedFile.insert(3, "Strand", ["." for i in range(len(bedFile))])
            bedFile.columns = ["GeneID", "Chr", "Start", "End", "Strand"]
        if col_names is None:
            col_names = np.arange(len(bedFile))
        bedFile.loc[:, "GeneID"] = col_names
        if input_bam_paths is not None:
            bedFile = pd.concat([bedFile, randomRegions])
        bedFile.to_csv(muffin.params["temp_dir"] + "/ann.saf", sep="\t", index=None)
        genomic_regions_path = muffin.params["temp_dir"] + "/ann.saf"
    else:
        if genomic_regions is not None:
            genomic_regions.to_csv(muffin.params["temp_dir"] + "/ann.saf", sep="\t", index=None)
            genomic_regions_path = muffin.params["temp_dir"] + "/ann.saf"
    paramsDict = {"files":ro.vectors.StrVector(bam_paths)}
    # Count reads over genomic regions
    paramsDict["annot.ext"] = genomic_regions_path
    if featureCounts_params is not None:
        paramsDict = dict(paramsDict, **featureCounts_params)
    # Sanitize arguments for rpy2
    paramsDictRpy2 = {}
    for k in paramsDict:
        paramsDictRpy2[k.replace(".", "_")] = paramsDict[k]
    with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
        res = subread.featureCounts(**paramsDictRpy2)
        mapping_stats = pd.DataFrame(res["stat"]).set_index("Status")
        if input_bam_paths is None:
            dataset = ad.AnnData(res["counts"].T.astype(np.int32))
        else:
            dataset = ad.AnnData(res["counts"].T.astype(np.int32)[:, :-n_random_input])
            dataset.uns["counts_random"] = res["counts"].T.astype(np.int32)[:, -n_random_input:]
        dataset.uns["tot_mapped_counts"] = mapping_stats.sum(axis=0).values
    # Same but for input files
    if input_bam_paths is not None:
        # Take care of the case where some files may share the same input
        # Which causes an issue with featureCounts
        unique_paths = list(set(input_bam_paths))
        paramsDict = {"files":ro.vectors.StrVector(unique_paths)}
        if genomic_regions_path is not None:
            paramsDict["annot.ext"] = genomic_regions_path
        if featureCounts_params is not None:
            paramsDict = dict(paramsDict, **featureCounts_params)
        # Sanitize arguments for rpy2
        paramsDictRpy2 = {}
        for k in paramsDict:
            paramsDictRpy2[k.replace(".", "_")] = paramsDict[k]
        with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
            res = subread.featureCounts(**paramsDictRpy2)
            if len(unique_paths) != len(input_bam_paths):
                warnings.warn("Multiple files share the same input (this can be intended)")
                unique_counts = res["counts"].T.astype(np.int32)
                dataset.layers["input"] = np.zeros((len(input_bam_paths), unique_counts.shape[1]-n_random_input), dtype="int32")
                dataset.uns["input_random"] = np.zeros((len(input_bam_paths), n_random_input), dtype="int32")
                for i, val in enumerate(input_bam_paths):
                    index = unique_paths.index(val)
                    dataset.layers["input"][i,:] = unique_counts[index,:-n_random_input]
                    dataset.uns["input_random"][i,:] = unique_counts[index, -n_random_input:]
            else:
                dataset.layers["input"] = res["counts"].T.astype(np.int32)[:, :-n_random_input]
                dataset.uns["input_random"] = res["counts"].T.astype(np.int32)[:, -n_random_input:]
                mapping_stats_input = pd.DataFrame(res["stat"]).set_index("Status")
                dataset.uns["tot_mapped_input"] = mapping_stats_input.sum(axis=0).values
                fracMapped = mapping_stats.loc["Assigned"]/dataset.uns["tot_mapped_counts"]
                fracMappedInput = mapping_stats_input.loc["Assigned"]/mapping_stats_input.sum(axis=0)
                dataset.obs["est_signal_to_noise"] = fracMapped.values/fracMappedInput.values
    # Annotate
    if row_names is None:
        dataset.obs_names = np.array([f.split("/")[-1] for f in bam_paths])
    else:        
        dataset.obs_names = row_names.astype(str)
    if col_names is None:
        dataset.var_names = np.arange(dataset.X.shape[1]).astype(str)
    else:
        dataset.var_names = col_names.astype(str)
    if input_bam_paths is not None:
        dataset.var[["Chromosome", "Start", "End", "Strand"]] = bedFile.values[:-n_random_input, 1:]
    else:
        dataset.var[["Chromosome", "Start", "End", "Strand"]] = bedFile.values[:, 1:]
    dataset.var[["Start", "End"]] = dataset.var[["Start", "End"]].astype("int")
    dataset.var["Chromosome"] = dataset.var["Chromosome"].astype("category")
    return dataset



[docs]
def set_design_matrix(dataset, design="intercept"):
    """
    Set design matrix for statistical modelling. If you do not want to remove
    any confounding factors, set it to a column vector of ones. E.g. :
    np.ones((model.dataset.layers["input"].shape[0],1))

    Parameters
    ----------
    design : ndarray or "intercept"
        Design matrix. If set to intercept will automatically set a column
        vector of ones.

    """
    if type(design) is str:
        if design == "intercept":
            design = np.ones((len(dataset),1))
    dataset.obsm["design"] = design



[docs]
def set_size_factors(dataset, values):
    """
    Load custom size factors for the model.

    Parameters
    ----------
    values : float ndarray
        Size factors, e.g. np.sum(model.matrices["counts"], axis=1).
        They will be standardized to unit mean to ensure numerical
        stability when fitting models.
    """
    dataset.obs["size_factors"] = (values.astype("float64") / np.mean(values.astype("float64"))).astype("float32")



[docs]
def set_normalization_factors(dataset, normalization_factors):
    """
    Load normalization factors from a numpy array

    Parameters
    ----------
    dataset : _type_
        _description_
    normalization_factors : float ndarray (observations x variables)
        Normalization factors per observation, per variable
    """    
    dataset.obs["normalization_factors"] = normalization_factors