Source code for muffin.great

"""
Genomic Regions Enrichment Analysis
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pyranges as pr
from joblib import Parallel, delayed
from joblib.externals.loky import get_reusable_executor
from statsmodels.stats.multitest import fdrcorrection
from scipy.sparse import csr_array
import warnings
import muffin
from .utils import cluster, common, overlap_utils, stats

try:
    maxCores = len(os.sched_getaffinity(0))
except AttributeError:
    import multiprocessing
    maxCores = multiprocessing.cpu_count()

class regLogicGREAT:
    """
    :meta private:
    """
    def __init__(self, upstream, downstream, distal):
        self.upstream = upstream
        self.downstream = downstream
        self.distal = distal

    def __call__(self, txDF, chrInfo):
        # Infered regulatory domain logic
        copyTx = txDF.copy()
        copyTx["Start"] = (txDF["Start"] - self.upstream).where(txDF["Strand"] == "+", 
                                    txDF["End"] - self.downstream)
        copyTx["End"] = (txDF["Start"] + self.downstream).where(txDF["Strand"] == "+", 
                                    txDF["End"] + self.upstream)
        copyTx.sort_values(["Chromosome", "Start"], inplace=True)
        try:
            copyTx["Chromosome"].cat.remove_unused_categories(inplace=True)
        except:
            pass
        gb = copyTx.groupby("Chromosome")
        copyTx["Chromosome"] = copyTx["Chromosome"]
        perChr = dict([(x,gb.get_group(x)) for x in gb.groups])
        for c in perChr:
            inIdx = copyTx["Chromosome"] == c
            nextReg = np.roll(copyTx["Start"][inIdx], -1)
            try:
                nextReg[-1] = chrInfo.loc[c].values[0]
            except:
                print(f"Warning: chromosome '{c}' in gtf but not in size file, skipping all genes within this chromosome.")
                copyTx = copyTx[np.logical_not(inIdx)]
                continue
            previousReg = np.roll(copyTx["End"][inIdx], 1)
            previousReg[0] = 0
            extMin = np.maximum(copyTx["Start"][inIdx] - self.distal, previousReg)
            extMax = np.minimum(copyTx["End"][inIdx] + self.distal, nextReg)
            extMin = np.minimum(copyTx["Start"][inIdx], extMin)
            extMax = np.maximum(copyTx["End"][inIdx], extMax)
            copyTx.loc[copyTx["Chromosome"] == c, "Start"] = np.clip(extMin, 0, chrInfo.loc[c].values[0])
            copyTx.loc[copyTx["Chromosome"] == c, "End"] = np.clip(extMax, 0, chrInfo.loc[c].values[0])
        return copyTx



[docs]
class pyGREAT:
    """
        Genomic regions GSEA tool.

        Parameters
        ----------
        geneFile : str
            Path to a GTF file.
        chrFile : str
            Path to a chrInfo file. (chrom-tab-Chrom size-line return)
        gmtFile : str or list of str, optional
            Path to gmt file or list of paths to gmt files that will get
            concatenated.
        regulatory_bed : str, optional
            Path to a bed file of the form "chr start end gene", that will
            override muffin's default regulatory domains.
        distal : int, optional
            Size of inferred distal regulatory regions, by default 1000000
        upstream : int, optional
            Size of inferred upstream regulatory regions, by default 5000
        downstream : int, optional
            Size of inferred downstream regulatory regions, by default 1000
        gtfGeneCol : str, optional
            Name of the gene name/id column in the GTF file, by default
            "gene_name"
        gene_biotype : str, optional
            Type of gene to keep e.g. "protein_coding", by default "all"
    """
    def __init__(self, geneFile, chrFile, gmtFile=None, regulatory_bed=None,
                 distal=1000000, upstream=5000, downstream=1000, 
                 gtfGeneCol = "gene_name", gene_biotype="all"):
        self.chrInfo = pd.read_csv(chrFile, sep="\t", index_col=0, header=None)
        self.gtfGeneCol = gtfGeneCol
        # Read gtf file
        self.transcripts = pr.read_gtf(geneFile, as_df=True)
        self.transcripts = self.transcripts[self.transcripts["Feature"] == "gene"]
        if not gene_biotype == "all":
            self.transcripts = self.transcripts[self.transcripts["gene_type"] == gene_biotype]
        self.transcripts = self.transcripts[["Chromosome", "Start", "End", self.gtfGeneCol, "Strand"]]
        tss = self.transcripts["Start"].where(self.transcripts["Strand"] == "+", 
                                    self.transcripts["End"])
        self.TSSs = self.transcripts.copy()
        self.TSSs["Start"] = tss
        self.TSSs["End"] = tss+1
        # Apply infered regulatory logic
        if regulatory_bed is None:
            self.distal = distal
            self.upstream = upstream
            self.downstream = downstream
            self.geneRegulatory = regLogicGREAT(upstream, downstream, distal)(self.transcripts, self.chrInfo)
            self.geneRegulatory.drop("Strand", axis=1, inplace=True)
        else:
            self.geneRegulatory = pd.read_csv(regulatory_bed, header=None, sep="\t", usecols=[0,1,2,3])
            self.geneRegulatory.columns = ["Chromosome", "Start", "End", "gene_name"]
        self.geneRegulatory.index = self.geneRegulatory["gene_name"]
        self.geneRegulatory = self.geneRegulatory[~self.geneRegulatory.index.duplicated(False)]
        self.validGenes = self.geneRegulatory["gene_name"]
        self.geneRegulatory = self.geneRegulatory.loc[self.validGenes]
        # Parse GMT file Setup gene-GO matrix
        if gmtFile is not None:
            if type(gmtFile) is str:
                gmtFile = [gmtFile]
            self.all_associations = list()
            self.goMap = dict()
            self.allGenes = set()
            for gmtF in gmtFile:
                with open(gmtF) as f:
                    for l in f:
                        vals = l.rstrip("\n").split("\t")
                        for g in vals[2:]:
                            self.all_associations.append((vals[0], g))
                        self.allGenes |= set(vals[2:])
                        self.goMap[vals[0]] = vals[1]
            # Transform gene set - gene associations to a sparse binary matrix (for
        # clustering and glm)
            self.allGenes = list(self.allGenes)
            self.allTerms = list(self.goMap.keys())
            self.invGoMat = {v: k for k, v in self.goMap.items()}
            goFa, gos = pd.factorize(np.array(self.all_associations)[:,0])
            geneFa, genes = pd.factorize(np.array(self.all_associations)[:,1])
            non_annotated_genes = np.setdiff1d(self.validGenes.values, genes)
            genes = np.concatenate([genes, non_annotated_genes])
            data = np.ones_like(goFa, dtype="bool")
            self.mat = pd.DataFrame.sparse.from_spmatrix(csr_array((data, (geneFa, goFa)), shape=(len(genes), len(gos))).T, 
                                                        columns=genes, index=gos)




[docs]
    def get_nearest_gene(self, query, max_dist=np.inf):
        """
        Get the nearest TSS for each row. If it can't be retrieved will be
        named "None" (str).

        Parameters
        ----------
        query : pandas Dataframe or PyRanges
            Set of genomic regions.
        
        max_dist : integer or float
            Maximum distance for association, default np.inf

        """
        if not isinstance(query, pd.DataFrame):
            query = query.as_df()
        gr = query.copy()
        gr["Names"] = np.arange(len(gr))
        gr = pr.PyRanges(gr[["Chromosome", "Start", "End", "Names"]])
        gr2 = pr.PyRanges(self.TSSs[["Chromosome", "Start", "End", "gene_name"]])
        r = gr.nearest(gr2, strandedness=False)
        chroms_missing = set(gr.chromosomes).difference(gr2.chromosomes)
        if len(chroms_missing) > 0:
            missing_gr = pr.concat([gr[c] for c in chroms_missing]).as_df()
            missing_gr["gene_name"] = "None"
            missing_gr = pr.PyRanges(missing_gr)
            results = pr.concat([r,  missing_gr], strand=False).as_df()
        else:
            results = r.as_df()
        results.index=results["Names"]
        results = results.loc[np.arange(len(gr))]
        results["gene_name"].where(results["Distance"] < max_dist, "None", inplace=True)
        return results

    

[docs]
    def label_by_nearest_gene(self, query):
        """
        Label each row by its nearest gene TSS, avoiding duplicates. E.g. NADK_1,
        MYC_1, MYC_2, None_1, None_2

        Parameters
        ----------
        query : pandas Dataframe or PyRanges
            Set of genomic regions.

        Returns
        -------
        labels: ndarray
            Each corresponding label.
        """
        names = self.get_nearest_gene(query, max_dist=self.distal)
        groups = names.groupby('gene_name')
        # add a count for each duplicate value within each group
        names['count'] = groups.cumcount() + 1
        # create new column with unique names for each duplicate value
        names['col1_new'] = names['gene_name'] + '__' + names['count'].astype(str)
        # drop the count column and the original column with duplicate values
        names = names.drop(['gene_name', 'count'], axis=1)
        return names["col1_new"].values



[docs]
    def find_enriched_genes(self, query, background=None):
        """
        Find enriched terms in genes near query.

        Parameters
        ----------
        query: pandas dataframe in bed-like format or PyRanges
            Set of genomic regions to compute enrichment on.
        background: None, pandas dataframe in bed-like format, or PyRanges
        (default: None)
            If set to None considers the whole genome as the possible locations
            of the query. Otherwise it supposes the query is a subset of these
            background regions.
        
        Returns
        -------
        results: pandas dataframe
        """
        from rpy2.robjects.packages import importr
        rs = importr("stats")
        regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
        refCounts = overlap_utils.countOverlapPerCategory(regPR, 
                                                          overlap_utils.dfToPrWorkaround(background, useSummit=False))
        allCats = np.array(list(refCounts.keys()))
        pvals = np.zeros(len(allCats))
        fc = np.zeros(len(allCats))
        M = len(background)
        # Then for the query
        obsCounts = overlap_utils.countOverlapPerCategory(regPR, 
                                                          overlap_utils.dfToPrWorkaround(query, useSummit=False))
        N = len(query)
        # Find hypergeometric enrichment
        k = pd.Series(np.zeros(len(allCats), dtype="int"), allCats)
        isFound = np.isin(allCats, obsCounts.index, assume_unique=True)
        k[allCats[isFound]] = obsCounts
        n = pd.Series(np.zeros(len(allCats), dtype="int"), allCats)
        n[allCats] = refCounts
        # Scipy hyper 
        pvals = np.array(rs.phyper(k.values-1,n.values,M-n.values,N, lower_tail=False))
        pvals = np.nan_to_num(pvals, nan=1.0)
        fc = (k/np.maximum(N, 1e-7))/np.maximum(n/np.maximum(M, 1e-7), 1e-7)
        qvals = fdrcorrection(pvals)[1]
        pvals = pd.Series(pvals)
        pvals.index = allCats
        qvals = pd.Series(qvals)
        qvals.index = allCats
        fc = pd.Series(fc)
        fc.index = allCats
        geneEnriched = pvals, fc, qvals, k, n
        geneEnriched = pd.DataFrame(geneEnriched, 
                            index=["P-value", "FC", "FDR", "Query hits", "Background hits"]).T
        return geneEnriched.sort_values("P-value")

        
    

[docs]
    def find_genes_for_geneset(self, term, enrichedGeneTab, alpha=0.05):
        """
        Find genes enriched for a particular geneset

        Parameters
        ----------
        term : str
            Name of the GSEA term.
        enrichedGeneTab : pandas dataframe
            Results of findEnrichedGenes.
        alpha : float, optional
            Adjusted P-value threshold, by default 0.05

        Returns
        -------
        Enriched: pandas Dataframe
            Enriched genes
        """
        sigGenes = enrichedGeneTab[enrichedGeneTab["FDR"] < alpha].index
        genesInTerm = self.mat.columns[self.mat.loc[term] > 0.5]
        enrichedInTerm = sigGenes.intersection(genesInTerm)
        return enrichedGeneTab.loc[enrichedInTerm]





[docs]
    def find_enriched(self, query, background=None, min_genes=3, max_genes=1000, cores=-1):
        """
        Find enriched terms in genes near query.

        Parameters
        ----------
        query: pandas dataframe in bed-like format or PyRanges
            Set of genomic regions to compute enrichment on. If a pandas
            dataframe, assume the first three columns are chromosome, start,
            end.
        background: None, pandas dataframe in bed-like format, or PyRanges
        (default: None)
            If set to None considers the whole genome as the possible,
            equiprobable locations of the query. Otherwise it supposes the query
            is a subset of these background regions. (Note that it does not
            explicitly check for it).
        min_genes: int, (default 3)
            Minimum number of intersected genes for a geneset.
        max_genes: int, (default 1000)
            Maximum number of genes for a geneset.
        cores: int, (default -1)
            Max number of cores to used for parallelized computations. Default
            uses all available cores (-1).
        
        Returns
        -------
        results: pandas dataframe
        """
        # First compute intersections count for each gene And expected
        # intersection count for each gene
        regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
        if background is not None:
            intersectBg = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(background, useSummit=False))
        else:
            genomeSize = np.sum(self.chrInfo).values[0]
            intersectBg = (self.geneRegulatory["End"]-self.geneRegulatory["Start"])/genomeSize
            intersectBg = np.maximum(intersectBg, 1/genomeSize)
            intersectBg.index = self.geneRegulatory["gene_name"]
            intersectBg = intersectBg.groupby(intersectBg.index).sum()
        intersectQuery = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(query, useSummit=False))
        queryCounts = intersectBg.copy() * 0
        queryCounts.loc[intersectQuery.index] = intersectQuery
        obsMatrix = self.mat.loc[:, queryCounts.index]
        if background is not None:
            expected = intersectBg.loc[obsMatrix.columns]
            observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
            endog = observed.copy()
            expected *= len(query)/len(background)
        else: 
            expected = intersectBg.loc[obsMatrix.columns]*len(query)
            observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
            endog = observed.copy()
        # Trim GOs under cutoffs
        trimmed = obsMatrix.loc[:, intersectQuery.index].values.sum(axis=1) >= min_genes
        trimmed = trimmed & (obsMatrix.values.sum(axis=1) <= max_genes)
        # Setup parallel computation settings
        if cores == -1:
            cores = maxCores      
        # I don't think i'm doing anything wrong but pandas spams warnings
        warnings.filterwarnings('ignore', category=FutureWarning)
        maxBatch = len(obsMatrix.loc[trimmed])
        maxBatch = min(int(0.25*maxBatch/cores)+1,256)        
        hitsPerGO = np.sum(csr_array(obsMatrix.sparse.to_coo()) * observed.values.ravel(), axis=1)[trimmed]
        
        # Fit a Negative Binomial GLM for each annotation, and evaluate wald
        # test p-value for each gene annotation
        with Parallel(n_jobs=cores, batch_size=maxBatch, verbose=1, max_nbytes=None, mmap_mode=None) as pool:
            results = pool(delayed(stats.fitNBinomModel)(hasAnnot, endog, expected, gos, queryCounts.index) for gos, hasAnnot in obsMatrix[trimmed].iterrows())
        warnings.resetwarnings()
        # Manually kill workers afterwards or they'll just stack up with
        # multiple runs
        get_reusable_executor().shutdown(wait=False, kill_workers=True)
        # Format results
        results = pd.DataFrame(results)
        if results.shape[1] > 0:
            results.set_index(0, inplace=True)
            results.columns = ["P(Beta > 0)", "Beta"]
        else :
            results = pd.DataFrame(None, columns=["P(Beta > 0)", "Beta"])
        results["Total hits"] = hitsPerGO
        results.dropna(inplace=True)
        results["P(Beta > 0)"] = np.maximum(results["P(Beta > 0)"], 1e-320)
        qvals = results["P(Beta > 0)"].copy()
        qvals.loc[:] = fdrcorrection(qvals.values)[1]
        results["BH corrected p-value"] = qvals
        results["-log10(qval)"] = -np.log10(qvals)
        results["-log10(pval)"] = -np.log10(results["P(Beta > 0)"])
        results["FC"] = np.exp(results["Beta"])
        results["Name"] = [self.goMap[i] for i in results.index]
        results.sort_values(by="P(Beta > 0)", inplace=True)
        return results

    
    def __poisson_glm__(self, query, background=None, min_genes=3, max_genes=1000, cores=-1, yes_really=False):
        """
        :meta private:
        DO NOT USE
        """
        if not yes_really:
            raise PermissionError("Do not use this")
        # First compute intersections count for each gene And expected
        # intersection count for each gene
        regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
        if background is not None:
            intersectBg = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(background, useSummit=False))
        else:
            genomeSize = np.sum(self.chrInfo).values[0]
            intersectBg = (self.geneRegulatory["End"]-self.geneRegulatory["Start"])/genomeSize
            intersectBg = np.maximum(intersectBg, 1/genomeSize)
            intersectBg.index = self.geneRegulatory["gene_name"]
            intersectBg = intersectBg.groupby(intersectBg.index).sum()
        intersectQuery = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(query, useSummit=False))
        queryCounts = intersectBg.copy() * 0
        queryCounts.loc[intersectQuery.index] = intersectQuery
        obsMatrix = self.mat.loc[:, queryCounts.index]
        if background is not None:
            expected = intersectBg.loc[obsMatrix.columns]
            observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
            endog = observed.copy()
            expected *= len(query)/len(background)
        else: 
            expected = intersectBg.loc[obsMatrix.columns]*len(query)
            observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
            endog = observed.copy()
        # Trim GOs under cutoffs
        trimmed = obsMatrix.loc[:, intersectQuery.index].values.sum(axis=1) >= min_genes
        trimmed = trimmed & (obsMatrix.values.sum(axis=1) <= max_genes)
        # Setup parallel computation settings
        if cores == -1:
            cores = maxCores      
        # I don't think i'm doing anything wrong but pandas spams warnings
        warnings.filterwarnings('ignore', category=FutureWarning)
        maxBatch = len(obsMatrix.loc[trimmed])
        maxBatch = min(int(0.25*maxBatch/cores)+1,256)        
        hitsPerGO = np.sum(csr_array(obsMatrix.sparse.to_coo()) * observed.values.ravel(), axis=1)[trimmed]
        
        # Fit a Negative Binomial GLM for each annotation, and evaluate wald
        # test p-value for each gene annotation
        with Parallel(n_jobs=cores, batch_size=maxBatch, verbose=1, max_nbytes=None, mmap_mode=None) as pool:
            results = pool(delayed(stats.fitPoissonModel)(hasAnnot, endog, expected, gos, queryCounts.index) for gos, hasAnnot in obsMatrix[trimmed].iterrows())
        warnings.resetwarnings()
        # Manually kill workers afterwards or they'll just stack up with
        # multiple runs
        get_reusable_executor().shutdown(wait=False, kill_workers=True)
        # Format results
        results = pd.DataFrame(results)
        if results.shape[1] > 0:
            results.set_index(0, inplace=True)
            results.columns = ["P(Beta > 0)", "Beta"]
        else :
            results = pd.DataFrame(None, columns=["P(Beta > 0)", "Beta"])
        results["Total hits"] = hitsPerGO
        results.dropna(inplace=True)
        results["P(Beta > 0)"] = np.maximum(results["P(Beta > 0)"], 1e-320)
        qvals = results["P(Beta > 0)"].copy()
        qvals.loc[:] = fdrcorrection(qvals.values)[1]
        results["BH corrected p-value"] = qvals
        results["-log10(qval)"] = -np.log10(qvals)
        results["-log10(pval)"] = -np.log10(results["P(Beta > 0)"])
        results["FC"] = np.exp(results["Beta"])
        results["Name"] = [self.goMap[i] for i in results.index]
        results.sort_values(by="P(Beta > 0)", inplace=True)
        return results

    def __binomial_glm__(self, query, background=None, min_genes=3, max_genes=1000, cores=-1, yes_really=False):
        """
        :meta private:
        DO NOT USE
        """
        if not yes_really:
            raise PermissionError("Do not use this")
        # First compute intersections count for each gene And expected
        # intersection count for each gene
        regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
        if background is not None:
            intersectBg = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(background, useSummit=False))
        else:
            raise KeyError("Nope")
            genomeSize = np.sum(self.chrInfo).values[0]
            intersectBg = (self.geneRegulatory["End"]-self.geneRegulatory["Start"])/genomeSize
            intersectBg = np.maximum(intersectBg, 1/genomeSize)
            intersectBg.index = self.geneRegulatory["gene_name"]
            intersectBg = intersectBg.groupby(intersectBg.index).sum()
        intersectQuery = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(query, useSummit=False))
        queryCounts = intersectBg.copy() * 0
        queryCounts.loc[intersectQuery.index] = intersectQuery
        obsMatrix = self.mat.loc[:, queryCounts.index]
        if background is not None:
            expected = intersectBg.loc[obsMatrix.columns]
            observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
            endog = observed.copy()
            # expected *= len(query)/len(background)
        else: 
            expected = intersectBg.loc[obsMatrix.columns]*len(query)
            observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
            endog = observed.copy()
        # Trim GOs under cutoffs
        trimmed = obsMatrix.loc[:, intersectQuery.index].values.sum(axis=1) >= min_genes
        trimmed = trimmed & (obsMatrix.values.sum(axis=1) <= max_genes)
        # Setup parallel computation settings
        if cores == -1:
            cores = maxCores      
        # I don't think i'm doing anything wrong but pandas spams warnings
        warnings.filterwarnings('ignore', category=FutureWarning)
        maxBatch = len(obsMatrix.loc[trimmed])
        maxBatch = min(int(0.25*maxBatch/cores)+1,256)        
        hitsPerGO = np.sum(csr_array(obsMatrix.sparse.to_coo()) * observed.values.ravel(), axis=1)[trimmed]
        print(observed)
        # Fit a Negative Binomial GLM for each annotation, and evaluate wald
        # test p-value for each gene annotation
        with Parallel(n_jobs=cores, batch_size=maxBatch, verbose=1, max_nbytes=None, mmap_mode=None) as pool:
            results = pool(delayed(stats.fitBinomialModel)(hasAnnot, endog, expected, gos, queryCounts.index) for gos, hasAnnot in obsMatrix[trimmed].iterrows())
        warnings.resetwarnings()
        # Manually kill workers afterwards or they'll just stack up with
        # multiple runs
        get_reusable_executor().shutdown(wait=False, kill_workers=True)
        # Format results
        results = pd.DataFrame(results)
        if results.shape[1] > 0:
            results.set_index(0, inplace=True)
            results.columns = ["P(Beta > 0)", "Beta"]
        else :
            results = pd.DataFrame(None, columns=["P(Beta > 0)", "Beta"])
        results["Total hits"] = hitsPerGO
        results.dropna(inplace=True)
        results["P(Beta > 0)"] = np.maximum(results["P(Beta > 0)"], 1e-320)
        qvals = results["P(Beta > 0)"].copy()
        qvals.loc[:] = fdrcorrection(qvals.values)[1]
        results["BH corrected p-value"] = qvals
        results["-log10(qval)"] = -np.log10(qvals)
        results["-log10(pval)"] = -np.log10(results["P(Beta > 0)"])
        results["FC"] = np.exp(results["Beta"])
        results["Name"] = [self.goMap[i] for i in results.index]
        results.sort_values(by="P(Beta > 0)", inplace=True)
        return results



[docs]
    def barplot_enrich(self, enrichDF, title="", by="P(Beta > 0)", alpha=0.05, topK=10, savePath=None):
        """
        Draw Enrichment barplots

        Parameters
        ----------
        enrichDF: pandas dataframe or tuple of pandas dataframes
            The result of the findEnriched method.
            
        savePath: string (optional)
            If set to None, does not save the figure.
        """
        fig, ax = plt.subplots()
        newDF = enrichDF.copy()
        newDF.index = [self.goMap[i].capitalize() for i in newDF.index]
        selected = (newDF["BH corrected p-value"] < alpha)
        ordered = -np.log10(newDF[by][selected]).sort_values(ascending=True)[:topK]
        terms = ordered.index
        t = [common.capTxtLen(term, 50) for term in terms]
        ax.tick_params(axis="x", labelsize=8)
        ax.tick_params(length=3, width=1.2)
        ax.barh(range(len(terms)), np.minimum(ordered[::-1],324.0))
        ax.set_yticks(range(len(terms)))
        ax.set_yticklabels(t[::-1], fontsize=5)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.set_xlabel("-log10(Corrected P-value)", fontsize=8)
        ax.set_title(title, fontsize=10)
        ax.grid(False)
        return fig, ax



[docs]
    def cluster_treemap(self, enrichDF, alpha=0.05, score="-log10(qval)", 
                       metric="yule", resolution=1.0, output=None, topK=9):
        """
        Plot a treemap of clustered gene set terms. Clustering is done according
        to similarities between annotated genes of a gene set.

        Parameters
        ----------
        enrichDF : dataframe
            Output of "findEnriched" method.
        alpha : float, optional
            FDR cutoff, by default 0.05
        score : str, optional
            Which column of the result dataframe to use to identify lead term,
            by default "-log10(qval)"
        metric : str, optional
            Similarity measure of gene annotation between terms, by default
            "yule"
        resolution : float, optional
            Influences the size of the clusters (larger = more clusters), by
            default 1.0
        output : str or None, optional
            Path to save figure, by default None
        topK : int or None, optional
            Number of terms to display, if set to None displays all, by default
            9
        Returns
        -------
        filtered: pandas dataframe
        """
        sig = enrichDF[enrichDF["BH corrected p-value"] < alpha]
        if len(sig) == 0:
            print("WARNING : No statistically significant enrichment, returning an empty dataframe and no plot is drawn.")
            return None
        # Remove unused columns and keep enriched rows
        warnings.filterwarnings('ignore', category=FutureWarning)
        nz = csr_array(self.mat.loc[sig.index].sparse.to_coo()).sum(axis=0) >= 1
        simplifiedMat = self.mat.loc[sig.index, nz].sparse.to_dense().values
        warnings.resetwarnings()
        clusters = cluster.graphClustering(simplifiedMat, 
                                           metric, k=int(np.sqrt(len(sig))), r=resolution, snn=True, 
                                           approx=False, restarts=10)
        sig.loc[:,"Cluster"] = clusters
        representatives = pd.Series(dict([(i, sig["Name"][sig[score][sig["Cluster"] == i].idxmax()]) for i in np.unique(sig["Cluster"])]))
        representatives = pd.Series([common.customwrap(r, 30) for r in representatives.values])
        sig.loc[:,"Representative"] = representatives[sig["Cluster"]].values
        long_wrap = [common.customwrap(self.goMap[i], 30) for i in sig.index]
        short_wrap = [common.customwrap(self.goMap[i], 15, 5) for i in sig.index]
        sig.loc[:, "Name"] = long_wrap
        duplicate = sig["Representative"] == sig["Name"]
        sig.loc[:, "Name"] = np.where(duplicate, sig.loc[:, "Name"].values, short_wrap)
        sig.loc[duplicate, "Representative"] = ""
        if topK is not None:
            filtered = []
            for r in representatives.unique():
                filtered.append(sig[(sig["Representative"]==r)|(sig["Name"]==r)].sort_values(score, ascending=False).iloc[:topK])
            filtered = pd.concat(filtered)
        else:
            filtered = sig
        fig = px.treemap(names=filtered["Name"], parents=filtered["Representative"], values=np.sqrt(np.sqrt(filtered[score])),
                        width=800, height=800)
        fig.update_layout(margin = dict(t=2, l=2, r=2, b=2),
                        font_size=20, font_family="Arial Black")
        if output is not None:
            fig.write_image(output)
            fig.write_html(output+".html")
        fig.show()
        return sig[duplicate]

    

[docs]
    def gene_activity_matrix(self, dataset, extend=1000):
        """
        Creates a gene-activity matrix, useful to quantify activity (e.g. ATAC or
        ChIP) for each gene. Sums the counts of peaks located over gene bodies.

        Parameters
        ----------
        dataset : anndata
            The .var fields need to contain the "Chromosome", "Start" and "End"
            fields.

        extend : int
            How much to extend gene body in bp.

        Returns
        -------
        gene_activity : anndata
            observations-genes activity matrix. Will also copy obs,obsm,obsp and uns
            fields of the original dataset.
        """
        dataset.var["Id"] = np.arange(dataset.shape[1])
        pairs = pr.PyRanges(self.geneRegulatory).join(pr.PyRanges(dataset.var[["Chromosome", "Start", "End", "Id"]]), 
                                                      slack=extend).as_df()[["gene_name", "Id"]]

        unique_genes = pairs["gene_name"].unique()
        pairs["gene_name"], names = pd.factorize(pairs["gene_name"])
        new_array = np.zeros((len(dataset), len(unique_genes)), dtype="int32")

        indices = list(zip(pairs["gene_name"], pairs["Id"]))
        for ig, ip in indices:
            new_array[:, ig] += dataset.X[:, ip]

        gene_activity = muffin.load.dataset_from_arrays(new_array, 
                                                        row_names=dataset.obs_names, 
                                                        col_names=names)
        gene_activity.obs = dataset.obs
        gene_activity.obsm = dataset.obsm
        gene_activity.obsp = dataset.obsp
        gene_activity.uns = dataset.uns
        return gene_activity

    

[docs]
    def tss_activity_matrix(self, dataset, extend=1000):
        """
        Creates a gene tss-activity matrix, useful to quantify activity (e.g. ATAC or
        ChIP) for each gene. Sums the counts of peaks located over TSS.

        Parameters
        ----------
        dataset : anndata
            The .var fields need to contain the "Chromosome", "Start" and "End"
            fields.
        
        extend : int
            How much to extend TSSs in bp.

        Returns
        -------
        gene_activity : anndata
            observations-genes activity matrix. Will also copy obs,obsm,obsp and uns
            fields of the original dataset.
        """
        dataset.var["Id"] = np.arange(dataset.shape[1])
        pairs = pr.PyRanges(self.TSSs).join(pr.PyRanges(dataset.var[["Chromosome", "Start", "End", "Id"]]), 
                                            slack=extend).as_df()[["gene_name", "Id"]]

        unique_genes = pairs["gene_name"].unique()
        pairs["gene_name"], names = pd.factorize(pairs["gene_name"])
        new_array = np.zeros((len(dataset), len(unique_genes)), dtype="int32")

        indices = list(zip(pairs["gene_name"], pairs["Id"]))
        for ig, ip in indices:
            new_array[:, ig] += dataset.X[:, ip]

        gene_activity = muffin.load.dataset_from_arrays(new_array, 
                                                        row_names=dataset.obs_names, 
                                                        col_names=names)
        gene_activity.obs = dataset.obs
        gene_activity.obsm = dataset.obsm
        gene_activity.obsp = dataset.obsp
        gene_activity.uns = dataset.uns
        return gene_activity