"""
Functions to create a dataset from arrays or bam files.
"""
import anndata as ad
import numpy as np
import pandas as pd
import pyranges as pr
from .utils import common
import warnings
import muffin
[docs]
def dataset_from_arrays(count_table, row_names=None, col_names=None, input_count_table=None,
random_loc_count_table=None, random_loc_input_table=None):
"""
Import count matrices from python array.
Parameters
----------
count_table : integer ndarray
(sample, genomic feature) RAW, NON NORMALIZED COUNT data. To save time and memory
you can use an int32 format.
row_annot : None or dict
If set to None, will give an integer id to each row (1->n).
If set to a dict, expects a dict of the form {annotation_name:annotations},
where annotations are ordered according to the count table.
By default None.
col_annot : None or dict
If set to None, will give an integer id to each column (1->n).
If set to a dict, expects a dict of the form {annotation_name:annotations},
where annotations are ordered according to the count table.
By default None.
input_count_table : integer or float ndarray or None, optional
For sequencing data with input only !
(sample, genomic feature) Raw, non normalized input counts (int) or
input with custom normalization. If your input counts are already normalized,
you have to set input_already_normalized to True.
To save time and memory you can use a 32bits format.
By default None
random_loc_count_table : integer ndarray, optional
For sequencing data with input only ! Mandatory if your input count table
is not already normalized.
(sample, n_random_intervals) Raw, non normalized counts (int) sampled
at random genomic intervals (preferably at least 5,000).
To save time and memory you can use an int32 format.
By default None
random_loc_input_table : integer ndarray, optional
For sequencing data with input only ! Mandatory if your input count table
is not already normalized.
(sample, n_random_intervals) Raw, non normalized counts (int) sampled
at random genomic intervals (preferably at least 5,000).
To save time and memory you can use an int32 format.
By default None
Returns
-------
dataset : AnnData
Annotated data matrix
"""
dataset = ad.AnnData(count_table)
if row_names is None:
dataset.obs_names = np.arange(len(count_table)).astype(str)
else:
dataset.obs_names = row_names.astype(str)
if col_names is None:
dataset.var_names = np.arange(count_table.shape[1]).astype(str)
else:
dataset.var_names = col_names.astype(str)
if input_count_table is not None:
dataset.layers["input"] = input_count_table
dataset.uns["counts_random"] = random_loc_count_table
dataset.uns["input_random"] = random_loc_input_table
return dataset
[docs]
def dataset_from_bam(bam_paths, genomic_regions=None, genomic_regions_path=None, row_names=None, col_names=None,
featureCounts_params=None, isBEDannotation=True, input_bam_paths=None,
n_random_input=10000, chromsizes=None):
"""
Initialize count matrices using BAM files and query random regions.
Parameters
----------
bam_paths : str array-like
Paths to BAM files.
genomic_regions: pandas Dataframe
Genomic annotation in bed format
genomic_regions_path : str
Path to a bed or SAF genome annotation file
row_annot : None or dict, optional
If set to None, will give file name to each row (1->n).
If set to a dict, expects a dict of the form {annotation_name:annotations},
where annotations are ordered according to the BAM file order.
By default None.
col_annot : None or dict, optional
If set to None, will give use the given genomic regions as annotations.
If set to a dict, expects a dict of the form {annotation_name:annotations},
where annotations are ordered according to the genomic_regions_path.
By default None.
featureCounts_params : None or dict, optional
Dictionary of keyword arguments to be passed to featureCounts, see featureCounts
R package documentation, by default None
isBEDannotation : bool, optional
If set to true assumes the genomic_regions_path is in .bed format with strand
information otherwise, it assumes a SAF file (see featureCounts doc), by default True.
input_bam_paths : str, optional
For sequencing data with input only !
Paths to input BAM files, path order has to match the signal bam_paths.
By default None.
n_random_input : int, optional
For sequencing data with input only !
Number of random regions to sample to estimate centering and scaling factors.
By default 10000.
chromsizes : dict or None, optional
Mandatory for sequencing data with input !
Set of chromosome that will be used to generate random sampling locations.
Dict with format : {"chr1":3000000, "chr2":456455412}.
Returns
-------
dataset
_description_
"""
try:
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
subread = importr("Rsubread")
except:
raise ImportError("""R or rsubread are not installed !\n
On unix systems, use :\n
conda install -c bioconda biocondutor-rsubread.\n
On windows, install R then manually install rsubread within R.
conda install -c conda-forge r""")
common.createDir(muffin.params["temp_dir"])
# Append random regions to bed
if input_bam_paths is not None:
if chromsizes is None:
raise ValueError("Missing chromosome size dictionary !")
np.random.seed(42)
randomRegions = pr.random(n=n_random_input, length=500, chromsizes=chromsizes, strand=False, int64=True).as_df()
randomRegions.columns = ["Chr", "Start", "End"]
randomRegions.insert(0, "GeneID", ["r"+str(i) for i in range(n_random_input)])
randomRegions.insert(3, "Strand", ["." for i in range(n_random_input)])
# Convert bed file to SAF
if isBEDannotation:
if genomic_regions is None:
bedFile = pd.read_csv(genomic_regions_path, sep="\t", header=None)
else:
bedFile = genomic_regions
ncols = bedFile.shape[1]
if ncols >= 6:
bedFile = bedFile.iloc[:, [0,1,2,5]]
bedFile.insert(0, "GeneID", ["r"+str(i) for i in range(len(bedFile))])
bedFile.columns = ["GeneID", "Chr", "Start", "End", "Strand"]
else:
bedFile = bedFile.iloc[:, [0,1,2]]
bedFile.columns = ["Chr", "Start", "End"]
bedFile.insert(0, "GeneID", ["r"+str(i) for i in range(len(bedFile))])
bedFile.insert(3, "Strand", ["." for i in range(len(bedFile))])
bedFile.columns = ["GeneID", "Chr", "Start", "End", "Strand"]
if col_names is None:
col_names = np.arange(len(bedFile))
bedFile.loc[:, "GeneID"] = col_names
if input_bam_paths is not None:
bedFile = pd.concat([bedFile, randomRegions])
bedFile.to_csv(muffin.params["temp_dir"] + "/ann.saf", sep="\t", index=None)
genomic_regions_path = muffin.params["temp_dir"] + "/ann.saf"
else:
if genomic_regions is not None:
genomic_regions.to_csv(muffin.params["temp_dir"] + "/ann.saf", sep="\t", index=None)
genomic_regions_path = muffin.params["temp_dir"] + "/ann.saf"
paramsDict = {"files":ro.vectors.StrVector(bam_paths)}
# Count reads over genomic regions
paramsDict["annot.ext"] = genomic_regions_path
if featureCounts_params is not None:
paramsDict = dict(paramsDict, **featureCounts_params)
# Sanitize arguments for rpy2
paramsDictRpy2 = {}
for k in paramsDict:
paramsDictRpy2[k.replace(".", "_")] = paramsDict[k]
with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
res = subread.featureCounts(**paramsDictRpy2)
mapping_stats = pd.DataFrame(res["stat"]).set_index("Status")
if input_bam_paths is None:
dataset = ad.AnnData(res["counts"].T.astype(np.int32))
else:
dataset = ad.AnnData(res["counts"].T.astype(np.int32)[:, :-n_random_input])
dataset.uns["counts_random"] = res["counts"].T.astype(np.int32)[:, -n_random_input:]
dataset.uns["tot_mapped_counts"] = mapping_stats.sum(axis=0).values
# Same but for input files
if input_bam_paths is not None:
# Take care of the case where some files may share the same input
# Which causes an issue with featureCounts
unique_paths = list(set(input_bam_paths))
paramsDict = {"files":ro.vectors.StrVector(unique_paths)}
if genomic_regions_path is not None:
paramsDict["annot.ext"] = genomic_regions_path
if featureCounts_params is not None:
paramsDict = dict(paramsDict, **featureCounts_params)
# Sanitize arguments for rpy2
paramsDictRpy2 = {}
for k in paramsDict:
paramsDictRpy2[k.replace(".", "_")] = paramsDict[k]
with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
res = subread.featureCounts(**paramsDictRpy2)
if len(unique_paths) != len(input_bam_paths):
warnings.warn("Multiple files share the same input (this can be intended)")
unique_counts = res["counts"].T.astype(np.int32)
dataset.layers["input"] = np.zeros((len(input_bam_paths), unique_counts.shape[1]-n_random_input), dtype="int32")
dataset.uns["input_random"] = np.zeros((len(input_bam_paths), n_random_input), dtype="int32")
for i, val in enumerate(input_bam_paths):
index = unique_paths.index(val)
dataset.layers["input"][i,:] = unique_counts[index,:-n_random_input]
dataset.uns["input_random"][i,:] = unique_counts[index, -n_random_input:]
else:
dataset.layers["input"] = res["counts"].T.astype(np.int32)[:, :-n_random_input]
dataset.uns["input_random"] = res["counts"].T.astype(np.int32)[:, -n_random_input:]
mapping_stats_input = pd.DataFrame(res["stat"]).set_index("Status")
dataset.uns["tot_mapped_input"] = mapping_stats_input.sum(axis=0).values
fracMapped = mapping_stats.loc["Assigned"]/dataset.uns["tot_mapped_counts"]
fracMappedInput = mapping_stats_input.loc["Assigned"]/mapping_stats_input.sum(axis=0)
dataset.obs["est_signal_to_noise"] = fracMapped.values/fracMappedInput.values
# Annotate
if row_names is None:
dataset.obs_names = np.array([f.split("/")[-1] for f in bam_paths])
else:
dataset.obs_names = row_names.astype(str)
if col_names is None:
dataset.var_names = np.arange(dataset.X.shape[1]).astype(str)
else:
dataset.var_names = col_names.astype(str)
if input_bam_paths is not None:
dataset.var[["Chromosome", "Start", "End", "Strand"]] = bedFile.values[:-n_random_input, 1:]
else:
dataset.var[["Chromosome", "Start", "End", "Strand"]] = bedFile.values[:, 1:]
dataset.var[["Start", "End"]] = dataset.var[["Start", "End"]].astype("int")
dataset.var["Chromosome"] = dataset.var["Chromosome"].astype("category")
return dataset
[docs]
def set_design_matrix(dataset, design="intercept"):
"""
Set design matrix for statistical modelling. If you do not want to remove
any confounding factors, set it to a column vector of ones. E.g. :
np.ones((model.dataset.layers["input"].shape[0],1))
Parameters
----------
design : ndarray or "intercept"
Design matrix. If set to intercept will automatically set a column
vector of ones.
"""
if type(design) is str:
if design == "intercept":
design = np.ones((len(dataset),1))
dataset.obsm["design"] = design
[docs]
def set_size_factors(dataset, values):
"""
Load custom size factors for the model.
Parameters
----------
values : float ndarray
Size factors, e.g. np.sum(model.matrices["counts"], axis=1).
They will be standardized to unit mean to ensure numerical
stability when fitting models.
"""
dataset.obs["size_factors"] = (values.astype("float64") / np.mean(values.astype("float64"))).astype("float32")
[docs]
def set_normalization_factors(dataset, normalization_factors):
"""
Load normalization factors from a numpy array
Parameters
----------
dataset : _type_
_description_
normalization_factors : float ndarray (observations x variables)
Normalization factors per observation, per variable
"""
dataset.obs["normalization_factors"] = normalization_factors