"""
Genomic Regions Enrichment Analysis
"""
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pyranges as pr
from joblib import Parallel, delayed
from joblib.externals.loky import get_reusable_executor
from statsmodels.stats.multitest import fdrcorrection
from scipy.sparse import csr_array
import warnings
import muffin
from .utils import cluster, common, overlap_utils, stats
try:
maxCores = len(os.sched_getaffinity(0))
except AttributeError:
import multiprocessing
maxCores = multiprocessing.cpu_count()
class regLogicGREAT:
"""
:meta private:
"""
def __init__(self, upstream, downstream, distal):
self.upstream = upstream
self.downstream = downstream
self.distal = distal
def __call__(self, txDF, chrInfo):
# Infered regulatory domain logic
copyTx = txDF.copy()
copyTx["Start"] = (txDF["Start"] - self.upstream).where(txDF["Strand"] == "+",
txDF["End"] - self.downstream)
copyTx["End"] = (txDF["Start"] + self.downstream).where(txDF["Strand"] == "+",
txDF["End"] + self.upstream)
copyTx.sort_values(["Chromosome", "Start"], inplace=True)
try:
copyTx["Chromosome"].cat.remove_unused_categories(inplace=True)
except:
pass
gb = copyTx.groupby("Chromosome")
copyTx["Chromosome"] = copyTx["Chromosome"]
perChr = dict([(x,gb.get_group(x)) for x in gb.groups])
for c in perChr:
inIdx = copyTx["Chromosome"] == c
nextReg = np.roll(copyTx["Start"][inIdx], -1)
try:
nextReg[-1] = chrInfo.loc[c].values[0]
except:
print(f"Warning: chromosome '{c}' in gtf but not in size file, skipping all genes within this chromosome.")
copyTx = copyTx[np.logical_not(inIdx)]
continue
previousReg = np.roll(copyTx["End"][inIdx], 1)
previousReg[0] = 0
extMin = np.maximum(copyTx["Start"][inIdx] - self.distal, previousReg)
extMax = np.minimum(copyTx["End"][inIdx] + self.distal, nextReg)
extMin = np.minimum(copyTx["Start"][inIdx], extMin)
extMax = np.maximum(copyTx["End"][inIdx], extMax)
copyTx.loc[copyTx["Chromosome"] == c, "Start"] = np.clip(extMin, 0, chrInfo.loc[c].values[0])
copyTx.loc[copyTx["Chromosome"] == c, "End"] = np.clip(extMax, 0, chrInfo.loc[c].values[0])
return copyTx
[docs]
class pyGREAT:
"""
Genomic regions GSEA tool.
Parameters
----------
geneFile : str
Path to a GTF file.
chrFile : str
Path to a chrInfo file. (chrom-tab-Chrom size-line return)
gmtFile : str or list of str, optional
Path to gmt file or list of paths to gmt files that will get
concatenated.
regulatory_bed : str, optional
Path to a bed file of the form "chr start end gene", that will
override muffin's default regulatory domains.
distal : int, optional
Size of inferred distal regulatory regions, by default 1000000
upstream : int, optional
Size of inferred upstream regulatory regions, by default 5000
downstream : int, optional
Size of inferred downstream regulatory regions, by default 1000
gtfGeneCol : str, optional
Name of the gene name/id column in the GTF file, by default
"gene_name"
gene_biotype : str, optional
Type of gene to keep e.g. "protein_coding", by default "all"
"""
def __init__(self, geneFile, chrFile, gmtFile=None, regulatory_bed=None,
distal=1000000, upstream=5000, downstream=1000,
gtfGeneCol = "gene_name", gene_biotype="all"):
self.chrInfo = pd.read_csv(chrFile, sep="\t", index_col=0, header=None)
self.gtfGeneCol = gtfGeneCol
# Read gtf file
self.transcripts = pr.read_gtf(geneFile, as_df=True)
self.transcripts = self.transcripts[self.transcripts["Feature"] == "gene"]
if not gene_biotype == "all":
self.transcripts = self.transcripts[self.transcripts["gene_type"] == gene_biotype]
self.transcripts = self.transcripts[["Chromosome", "Start", "End", self.gtfGeneCol, "Strand"]]
tss = self.transcripts["Start"].where(self.transcripts["Strand"] == "+",
self.transcripts["End"])
self.TSSs = self.transcripts.copy()
self.TSSs["Start"] = tss
self.TSSs["End"] = tss+1
# Apply infered regulatory logic
if regulatory_bed is None:
self.distal = distal
self.upstream = upstream
self.downstream = downstream
self.geneRegulatory = regLogicGREAT(upstream, downstream, distal)(self.transcripts, self.chrInfo)
self.geneRegulatory.drop("Strand", axis=1, inplace=True)
else:
self.geneRegulatory = pd.read_csv(regulatory_bed, header=None, sep="\t", usecols=[0,1,2,3])
self.geneRegulatory.columns = ["Chromosome", "Start", "End", "gene_name"]
self.geneRegulatory.index = self.geneRegulatory["gene_name"]
self.geneRegulatory = self.geneRegulatory[~self.geneRegulatory.index.duplicated(False)]
self.validGenes = self.geneRegulatory["gene_name"]
self.geneRegulatory = self.geneRegulatory.loc[self.validGenes]
# Parse GMT file Setup gene-GO matrix
if gmtFile is not None:
if type(gmtFile) is str:
gmtFile = [gmtFile]
self.all_associations = list()
self.goMap = dict()
self.allGenes = set()
for gmtF in gmtFile:
with open(gmtF) as f:
for l in f:
vals = l.rstrip("\n").split("\t")
for g in vals[2:]:
self.all_associations.append((vals[0], g))
self.allGenes |= set(vals[2:])
self.goMap[vals[0]] = vals[1]
# Transform gene set - gene associations to a sparse binary matrix (for
# clustering and glm)
self.allGenes = list(self.allGenes)
self.allTerms = list(self.goMap.keys())
self.invGoMat = {v: k for k, v in self.goMap.items()}
goFa, gos = pd.factorize(np.array(self.all_associations)[:,0])
geneFa, genes = pd.factorize(np.array(self.all_associations)[:,1])
non_annotated_genes = np.setdiff1d(self.validGenes.values, genes)
genes = np.concatenate([genes, non_annotated_genes])
data = np.ones_like(goFa, dtype="bool")
self.mat = pd.DataFrame.sparse.from_spmatrix(csr_array((data, (geneFa, goFa)), shape=(len(genes), len(gos))).T,
columns=genes, index=gos)
[docs]
def get_nearest_gene(self, query, max_dist=np.inf):
"""
Get the nearest TSS for each row. If it can't be retrieved will be
named "None" (str).
Parameters
----------
query : pandas Dataframe or PyRanges
Set of genomic regions.
max_dist : integer or float
Maximum distance for association, default np.inf
"""
if not isinstance(query, pd.DataFrame):
query = query.as_df()
gr = query.copy()
gr["Names"] = np.arange(len(gr))
gr = pr.PyRanges(gr[["Chromosome", "Start", "End", "Names"]])
gr2 = pr.PyRanges(self.TSSs[["Chromosome", "Start", "End", "gene_name"]])
r = gr.nearest(gr2, strandedness=False)
chroms_missing = set(gr.chromosomes).difference(gr2.chromosomes)
if len(chroms_missing) > 0:
missing_gr = pr.concat([gr[c] for c in chroms_missing]).as_df()
missing_gr["gene_name"] = "None"
missing_gr = pr.PyRanges(missing_gr)
results = pr.concat([r, missing_gr], strand=False).as_df()
else:
results = r.as_df()
results.index=results["Names"]
results = results.loc[np.arange(len(gr))]
results["gene_name"].where(results["Distance"] < max_dist, "None", inplace=True)
return results
[docs]
def label_by_nearest_gene(self, query):
"""
Label each row by its nearest gene TSS, avoiding duplicates. E.g. NADK_1,
MYC_1, MYC_2, None_1, None_2
Parameters
----------
query : pandas Dataframe or PyRanges
Set of genomic regions.
Returns
-------
labels: ndarray
Each corresponding label.
"""
names = self.get_nearest_gene(query, max_dist=self.distal)
groups = names.groupby('gene_name')
# add a count for each duplicate value within each group
names['count'] = groups.cumcount() + 1
# create new column with unique names for each duplicate value
names['col1_new'] = names['gene_name'] + '__' + names['count'].astype(str)
# drop the count column and the original column with duplicate values
names = names.drop(['gene_name', 'count'], axis=1)
return names["col1_new"].values
[docs]
def find_enriched_genes(self, query, background=None):
"""
Find enriched terms in genes near query.
Parameters
----------
query: pandas dataframe in bed-like format or PyRanges
Set of genomic regions to compute enrichment on.
background: None, pandas dataframe in bed-like format, or PyRanges
(default: None)
If set to None considers the whole genome as the possible locations
of the query. Otherwise it supposes the query is a subset of these
background regions.
Returns
-------
results: pandas dataframe
"""
from rpy2.robjects.packages import importr
rs = importr("stats")
regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
refCounts = overlap_utils.countOverlapPerCategory(regPR,
overlap_utils.dfToPrWorkaround(background, useSummit=False))
allCats = np.array(list(refCounts.keys()))
pvals = np.zeros(len(allCats))
fc = np.zeros(len(allCats))
M = len(background)
# Then for the query
obsCounts = overlap_utils.countOverlapPerCategory(regPR,
overlap_utils.dfToPrWorkaround(query, useSummit=False))
N = len(query)
# Find hypergeometric enrichment
k = pd.Series(np.zeros(len(allCats), dtype="int"), allCats)
isFound = np.isin(allCats, obsCounts.index, assume_unique=True)
k[allCats[isFound]] = obsCounts
n = pd.Series(np.zeros(len(allCats), dtype="int"), allCats)
n[allCats] = refCounts
# Scipy hyper
pvals = np.array(rs.phyper(k.values-1,n.values,M-n.values,N, lower_tail=False))
pvals = np.nan_to_num(pvals, nan=1.0)
fc = (k/np.maximum(N, 1e-7))/np.maximum(n/np.maximum(M, 1e-7), 1e-7)
qvals = fdrcorrection(pvals)[1]
pvals = pd.Series(pvals)
pvals.index = allCats
qvals = pd.Series(qvals)
qvals.index = allCats
fc = pd.Series(fc)
fc.index = allCats
geneEnriched = pvals, fc, qvals, k, n
geneEnriched = pd.DataFrame(geneEnriched,
index=["P-value", "FC", "FDR", "Query hits", "Background hits"]).T
return geneEnriched.sort_values("P-value")
[docs]
def find_genes_for_geneset(self, term, enrichedGeneTab, alpha=0.05):
"""
Find genes enriched for a particular geneset
Parameters
----------
term : str
Name of the GSEA term.
enrichedGeneTab : pandas dataframe
Results of findEnrichedGenes.
alpha : float, optional
Adjusted P-value threshold, by default 0.05
Returns
-------
Enriched: pandas Dataframe
Enriched genes
"""
sigGenes = enrichedGeneTab[enrichedGeneTab["FDR"] < alpha].index
genesInTerm = self.mat.columns[self.mat.loc[term] > 0.5]
enrichedInTerm = sigGenes.intersection(genesInTerm)
return enrichedGeneTab.loc[enrichedInTerm]
[docs]
def find_enriched(self, query, background=None, min_genes=3, max_genes=1000, cores=-1):
"""
Find enriched terms in genes near query.
Parameters
----------
query: pandas dataframe in bed-like format or PyRanges
Set of genomic regions to compute enrichment on. If a pandas
dataframe, assume the first three columns are chromosome, start,
end.
background: None, pandas dataframe in bed-like format, or PyRanges
(default: None)
If set to None considers the whole genome as the possible,
equiprobable locations of the query. Otherwise it supposes the query
is a subset of these background regions. (Note that it does not
explicitly check for it).
min_genes: int, (default 3)
Minimum number of intersected genes for a geneset.
max_genes: int, (default 1000)
Maximum number of genes for a geneset.
cores: int, (default -1)
Max number of cores to used for parallelized computations. Default
uses all available cores (-1).
Returns
-------
results: pandas dataframe
"""
# First compute intersections count for each gene And expected
# intersection count for each gene
regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
if background is not None:
intersectBg = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(background, useSummit=False))
else:
genomeSize = np.sum(self.chrInfo).values[0]
intersectBg = (self.geneRegulatory["End"]-self.geneRegulatory["Start"])/genomeSize
intersectBg = np.maximum(intersectBg, 1/genomeSize)
intersectBg.index = self.geneRegulatory["gene_name"]
intersectBg = intersectBg.groupby(intersectBg.index).sum()
intersectQuery = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(query, useSummit=False))
queryCounts = intersectBg.copy() * 0
queryCounts.loc[intersectQuery.index] = intersectQuery
obsMatrix = self.mat.loc[:, queryCounts.index]
if background is not None:
expected = intersectBg.loc[obsMatrix.columns]
observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
endog = observed.copy()
expected *= len(query)/len(background)
else:
expected = intersectBg.loc[obsMatrix.columns]*len(query)
observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
endog = observed.copy()
# Trim GOs under cutoffs
trimmed = obsMatrix.loc[:, intersectQuery.index].values.sum(axis=1) >= min_genes
trimmed = trimmed & (obsMatrix.values.sum(axis=1) <= max_genes)
# Setup parallel computation settings
if cores == -1:
cores = maxCores
# I don't think i'm doing anything wrong but pandas spams warnings
warnings.filterwarnings('ignore', category=FutureWarning)
maxBatch = len(obsMatrix.loc[trimmed])
maxBatch = min(int(0.25*maxBatch/cores)+1,256)
hitsPerGO = np.sum(csr_array(obsMatrix.sparse.to_coo()) * observed.values.ravel(), axis=1)[trimmed]
# Fit a Negative Binomial GLM for each annotation, and evaluate wald
# test p-value for each gene annotation
with Parallel(n_jobs=cores, batch_size=maxBatch, verbose=1, max_nbytes=None, mmap_mode=None) as pool:
results = pool(delayed(stats.fitNBinomModel)(hasAnnot, endog, expected, gos, queryCounts.index) for gos, hasAnnot in obsMatrix[trimmed].iterrows())
warnings.resetwarnings()
# Manually kill workers afterwards or they'll just stack up with
# multiple runs
get_reusable_executor().shutdown(wait=False, kill_workers=True)
# Format results
results = pd.DataFrame(results)
if results.shape[1] > 0:
results.set_index(0, inplace=True)
results.columns = ["P(Beta > 0)", "Beta"]
else :
results = pd.DataFrame(None, columns=["P(Beta > 0)", "Beta"])
results["Total hits"] = hitsPerGO
results.dropna(inplace=True)
results["P(Beta > 0)"] = np.maximum(results["P(Beta > 0)"], 1e-320)
qvals = results["P(Beta > 0)"].copy()
qvals.loc[:] = fdrcorrection(qvals.values)[1]
results["BH corrected p-value"] = qvals
results["-log10(qval)"] = -np.log10(qvals)
results["-log10(pval)"] = -np.log10(results["P(Beta > 0)"])
results["FC"] = np.exp(results["Beta"])
results["Name"] = [self.goMap[i] for i in results.index]
results.sort_values(by="P(Beta > 0)", inplace=True)
return results
def __poisson_glm__(self, query, background=None, min_genes=3, max_genes=1000, cores=-1, yes_really=False):
"""
:meta private:
DO NOT USE
"""
if not yes_really:
raise PermissionError("Do not use this")
# First compute intersections count for each gene And expected
# intersection count for each gene
regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
if background is not None:
intersectBg = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(background, useSummit=False))
else:
genomeSize = np.sum(self.chrInfo).values[0]
intersectBg = (self.geneRegulatory["End"]-self.geneRegulatory["Start"])/genomeSize
intersectBg = np.maximum(intersectBg, 1/genomeSize)
intersectBg.index = self.geneRegulatory["gene_name"]
intersectBg = intersectBg.groupby(intersectBg.index).sum()
intersectQuery = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(query, useSummit=False))
queryCounts = intersectBg.copy() * 0
queryCounts.loc[intersectQuery.index] = intersectQuery
obsMatrix = self.mat.loc[:, queryCounts.index]
if background is not None:
expected = intersectBg.loc[obsMatrix.columns]
observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
endog = observed.copy()
expected *= len(query)/len(background)
else:
expected = intersectBg.loc[obsMatrix.columns]*len(query)
observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
endog = observed.copy()
# Trim GOs under cutoffs
trimmed = obsMatrix.loc[:, intersectQuery.index].values.sum(axis=1) >= min_genes
trimmed = trimmed & (obsMatrix.values.sum(axis=1) <= max_genes)
# Setup parallel computation settings
if cores == -1:
cores = maxCores
# I don't think i'm doing anything wrong but pandas spams warnings
warnings.filterwarnings('ignore', category=FutureWarning)
maxBatch = len(obsMatrix.loc[trimmed])
maxBatch = min(int(0.25*maxBatch/cores)+1,256)
hitsPerGO = np.sum(csr_array(obsMatrix.sparse.to_coo()) * observed.values.ravel(), axis=1)[trimmed]
# Fit a Negative Binomial GLM for each annotation, and evaluate wald
# test p-value for each gene annotation
with Parallel(n_jobs=cores, batch_size=maxBatch, verbose=1, max_nbytes=None, mmap_mode=None) as pool:
results = pool(delayed(stats.fitPoissonModel)(hasAnnot, endog, expected, gos, queryCounts.index) for gos, hasAnnot in obsMatrix[trimmed].iterrows())
warnings.resetwarnings()
# Manually kill workers afterwards or they'll just stack up with
# multiple runs
get_reusable_executor().shutdown(wait=False, kill_workers=True)
# Format results
results = pd.DataFrame(results)
if results.shape[1] > 0:
results.set_index(0, inplace=True)
results.columns = ["P(Beta > 0)", "Beta"]
else :
results = pd.DataFrame(None, columns=["P(Beta > 0)", "Beta"])
results["Total hits"] = hitsPerGO
results.dropna(inplace=True)
results["P(Beta > 0)"] = np.maximum(results["P(Beta > 0)"], 1e-320)
qvals = results["P(Beta > 0)"].copy()
qvals.loc[:] = fdrcorrection(qvals.values)[1]
results["BH corrected p-value"] = qvals
results["-log10(qval)"] = -np.log10(qvals)
results["-log10(pval)"] = -np.log10(results["P(Beta > 0)"])
results["FC"] = np.exp(results["Beta"])
results["Name"] = [self.goMap[i] for i in results.index]
results.sort_values(by="P(Beta > 0)", inplace=True)
return results
def __binomial_glm__(self, query, background=None, min_genes=3, max_genes=1000, cores=-1, yes_really=False):
"""
:meta private:
DO NOT USE
"""
if not yes_really:
raise PermissionError("Do not use this")
# First compute intersections count for each gene And expected
# intersection count for each gene
regPR = pr.PyRanges(self.geneRegulatory.rename({self.gtfGeneCol:"Name"}, axis=1))
if background is not None:
intersectBg = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(background, useSummit=False))
else:
raise KeyError("Nope")
genomeSize = np.sum(self.chrInfo).values[0]
intersectBg = (self.geneRegulatory["End"]-self.geneRegulatory["Start"])/genomeSize
intersectBg = np.maximum(intersectBg, 1/genomeSize)
intersectBg.index = self.geneRegulatory["gene_name"]
intersectBg = intersectBg.groupby(intersectBg.index).sum()
intersectQuery = overlap_utils.countOverlapPerCategory(regPR, overlap_utils.dfToPrWorkaround(query, useSummit=False))
queryCounts = intersectBg.copy() * 0
queryCounts.loc[intersectQuery.index] = intersectQuery
obsMatrix = self.mat.loc[:, queryCounts.index]
if background is not None:
expected = intersectBg.loc[obsMatrix.columns]
observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
endog = observed.copy()
# expected *= len(query)/len(background)
else:
expected = intersectBg.loc[obsMatrix.columns]*len(query)
observed = pd.DataFrame(queryCounts.loc[queryCounts.index])
endog = observed.copy()
# Trim GOs under cutoffs
trimmed = obsMatrix.loc[:, intersectQuery.index].values.sum(axis=1) >= min_genes
trimmed = trimmed & (obsMatrix.values.sum(axis=1) <= max_genes)
# Setup parallel computation settings
if cores == -1:
cores = maxCores
# I don't think i'm doing anything wrong but pandas spams warnings
warnings.filterwarnings('ignore', category=FutureWarning)
maxBatch = len(obsMatrix.loc[trimmed])
maxBatch = min(int(0.25*maxBatch/cores)+1,256)
hitsPerGO = np.sum(csr_array(obsMatrix.sparse.to_coo()) * observed.values.ravel(), axis=1)[trimmed]
print(observed)
# Fit a Negative Binomial GLM for each annotation, and evaluate wald
# test p-value for each gene annotation
with Parallel(n_jobs=cores, batch_size=maxBatch, verbose=1, max_nbytes=None, mmap_mode=None) as pool:
results = pool(delayed(stats.fitBinomialModel)(hasAnnot, endog, expected, gos, queryCounts.index) for gos, hasAnnot in obsMatrix[trimmed].iterrows())
warnings.resetwarnings()
# Manually kill workers afterwards or they'll just stack up with
# multiple runs
get_reusable_executor().shutdown(wait=False, kill_workers=True)
# Format results
results = pd.DataFrame(results)
if results.shape[1] > 0:
results.set_index(0, inplace=True)
results.columns = ["P(Beta > 0)", "Beta"]
else :
results = pd.DataFrame(None, columns=["P(Beta > 0)", "Beta"])
results["Total hits"] = hitsPerGO
results.dropna(inplace=True)
results["P(Beta > 0)"] = np.maximum(results["P(Beta > 0)"], 1e-320)
qvals = results["P(Beta > 0)"].copy()
qvals.loc[:] = fdrcorrection(qvals.values)[1]
results["BH corrected p-value"] = qvals
results["-log10(qval)"] = -np.log10(qvals)
results["-log10(pval)"] = -np.log10(results["P(Beta > 0)"])
results["FC"] = np.exp(results["Beta"])
results["Name"] = [self.goMap[i] for i in results.index]
results.sort_values(by="P(Beta > 0)", inplace=True)
return results
[docs]
def barplot_enrich(self, enrichDF, title="", by="P(Beta > 0)", alpha=0.05, topK=10, savePath=None):
"""
Draw Enrichment barplots
Parameters
----------
enrichDF: pandas dataframe or tuple of pandas dataframes
The result of the findEnriched method.
savePath: string (optional)
If set to None, does not save the figure.
"""
fig, ax = plt.subplots()
newDF = enrichDF.copy()
newDF.index = [self.goMap[i].capitalize() for i in newDF.index]
selected = (newDF["BH corrected p-value"] < alpha)
ordered = -np.log10(newDF[by][selected]).sort_values(ascending=True)[:topK]
terms = ordered.index
t = [common.capTxtLen(term, 50) for term in terms]
ax.tick_params(axis="x", labelsize=8)
ax.tick_params(length=3, width=1.2)
ax.barh(range(len(terms)), np.minimum(ordered[::-1],324.0))
ax.set_yticks(range(len(terms)))
ax.set_yticklabels(t[::-1], fontsize=5)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_xlabel("-log10(Corrected P-value)", fontsize=8)
ax.set_title(title, fontsize=10)
ax.grid(False)
return fig, ax
[docs]
def cluster_treemap(self, enrichDF, alpha=0.05, score="-log10(qval)",
metric="yule", resolution=1.0, output=None, topK=9):
"""
Plot a treemap of clustered gene set terms. Clustering is done according
to similarities between annotated genes of a gene set.
Parameters
----------
enrichDF : dataframe
Output of "findEnriched" method.
alpha : float, optional
FDR cutoff, by default 0.05
score : str, optional
Which column of the result dataframe to use to identify lead term,
by default "-log10(qval)"
metric : str, optional
Similarity measure of gene annotation between terms, by default
"yule"
resolution : float, optional
Influences the size of the clusters (larger = more clusters), by
default 1.0
output : str or None, optional
Path to save figure, by default None
topK : int or None, optional
Number of terms to display, if set to None displays all, by default
9
Returns
-------
filtered: pandas dataframe
"""
sig = enrichDF[enrichDF["BH corrected p-value"] < alpha]
if len(sig) == 0:
print("WARNING : No statistically significant enrichment, returning an empty dataframe and no plot is drawn.")
return None
# Remove unused columns and keep enriched rows
warnings.filterwarnings('ignore', category=FutureWarning)
nz = csr_array(self.mat.loc[sig.index].sparse.to_coo()).sum(axis=0) >= 1
simplifiedMat = self.mat.loc[sig.index, nz].sparse.to_dense().values
warnings.resetwarnings()
clusters = cluster.graphClustering(simplifiedMat,
metric, k=int(np.sqrt(len(sig))), r=resolution, snn=True,
approx=False, restarts=10)
sig.loc[:,"Cluster"] = clusters
representatives = pd.Series(dict([(i, sig["Name"][sig[score][sig["Cluster"] == i].idxmax()]) for i in np.unique(sig["Cluster"])]))
representatives = pd.Series([common.customwrap(r, 30) for r in representatives.values])
sig.loc[:,"Representative"] = representatives[sig["Cluster"]].values
long_wrap = [common.customwrap(self.goMap[i], 30) for i in sig.index]
short_wrap = [common.customwrap(self.goMap[i], 15, 5) for i in sig.index]
sig.loc[:, "Name"] = long_wrap
duplicate = sig["Representative"] == sig["Name"]
sig.loc[:, "Name"] = np.where(duplicate, sig.loc[:, "Name"].values, short_wrap)
sig.loc[duplicate, "Representative"] = ""
if topK is not None:
filtered = []
for r in representatives.unique():
filtered.append(sig[(sig["Representative"]==r)|(sig["Name"]==r)].sort_values(score, ascending=False).iloc[:topK])
filtered = pd.concat(filtered)
else:
filtered = sig
fig = px.treemap(names=filtered["Name"], parents=filtered["Representative"], values=np.sqrt(np.sqrt(filtered[score])),
width=800, height=800)
fig.update_layout(margin = dict(t=2, l=2, r=2, b=2),
font_size=20, font_family="Arial Black")
if output is not None:
fig.write_image(output)
fig.write_html(output+".html")
fig.show()
return sig[duplicate]
[docs]
def gene_activity_matrix(self, dataset, extend=1000):
"""
Creates a gene-activity matrix, useful to quantify activity (e.g. ATAC or
ChIP) for each gene. Sums the counts of peaks located over gene bodies.
Parameters
----------
dataset : anndata
The .var fields need to contain the "Chromosome", "Start" and "End"
fields.
extend : int
How much to extend gene body in bp.
Returns
-------
gene_activity : anndata
observations-genes activity matrix. Will also copy obs,obsm,obsp and uns
fields of the original dataset.
"""
dataset.var["Id"] = np.arange(dataset.shape[1])
pairs = pr.PyRanges(self.geneRegulatory).join(pr.PyRanges(dataset.var[["Chromosome", "Start", "End", "Id"]]),
slack=extend).as_df()[["gene_name", "Id"]]
unique_genes = pairs["gene_name"].unique()
pairs["gene_name"], names = pd.factorize(pairs["gene_name"])
new_array = np.zeros((len(dataset), len(unique_genes)), dtype="int32")
indices = list(zip(pairs["gene_name"], pairs["Id"]))
for ig, ip in indices:
new_array[:, ig] += dataset.X[:, ip]
gene_activity = muffin.load.dataset_from_arrays(new_array,
row_names=dataset.obs_names,
col_names=names)
gene_activity.obs = dataset.obs
gene_activity.obsm = dataset.obsm
gene_activity.obsp = dataset.obsp
gene_activity.uns = dataset.uns
return gene_activity
[docs]
def tss_activity_matrix(self, dataset, extend=1000):
"""
Creates a gene tss-activity matrix, useful to quantify activity (e.g. ATAC or
ChIP) for each gene. Sums the counts of peaks located over TSS.
Parameters
----------
dataset : anndata
The .var fields need to contain the "Chromosome", "Start" and "End"
fields.
extend : int
How much to extend TSSs in bp.
Returns
-------
gene_activity : anndata
observations-genes activity matrix. Will also copy obs,obsm,obsp and uns
fields of the original dataset.
"""
dataset.var["Id"] = np.arange(dataset.shape[1])
pairs = pr.PyRanges(self.TSSs).join(pr.PyRanges(dataset.var[["Chromosome", "Start", "End", "Id"]]),
slack=extend).as_df()[["gene_name", "Id"]]
unique_genes = pairs["gene_name"].unique()
pairs["gene_name"], names = pd.factorize(pairs["gene_name"])
new_array = np.zeros((len(dataset), len(unique_genes)), dtype="int32")
indices = list(zip(pairs["gene_name"], pairs["Id"]))
for ig, ip in indices:
new_array[:, ig] += dataset.X[:, ip]
gene_activity = muffin.load.dataset_from_arrays(new_array,
row_names=dataset.obs_names,
col_names=names)
gene_activity.obs = dataset.obs
gene_activity.obsm = dataset.obsm
gene_activity.obsp = dataset.obsp
gene_activity.uns = dataset.uns
return gene_activity