Source code for muffin.peakMerge

'''
Generate consensus peaks.
'''
import numpy as np
import pandas as pd
from scipy.signal import argrelextrema, oaconvolve
from scipy.signal.windows import gaussian
from .utils import stats

def merge_peaks(beds, chrom_sizes, fileFormat="narrowPeak", inferCenter=False,
                forceUnstranded=False, sigma="auto", perPeakDensity=False,
                minOverlap=2, output_bedgraph=None):
    """
    Read peak-called files and generate consensus peaks.

    Parameters
    ----------
    beds: list of pandas dataframes
        Each dataframe should be formatted in bed format. The summit is
        assumed to be the 7th column (0-based index 6, =thickStart) for the
        bed format, and the 10th column (0-based index 9) for the narrowPeak
        format.
    chrom_sizes: str or dict
        Path to a tab-separated (chromosome, length) annotation file. If a
        dict, must be of the form {"ChrName": length}.
    fileFormat: "bed" or "narrowPeak"
        Format of the files being read. The bed format assumes the max signal
        position to be at the 6th column (0-based) in absolute coordinates.
        The narrowPeak format assumes the max signal position to be at the
        9th column (0-based), relative to the start position.
    inferCenter: boolean (optional, default False)
        If set to True, uses the position halfway between the start and end
        positions. Enable this only if the summit position is missing. Can
        also be suitable for broad peaks, as the summit position can be
        unreliable.
    forceUnstranded: boolean (optional, default False)
        If set to True, assumes all peaks are not strand-specific, even if
        strand-specific information was found.
    sigma: float or "auto" (optional, default "auto")
        Size of the gaussian filter (lower values = more separation). Only
        effective if perPeakDensity is set to False. "auto" automatically
        selects the filter width as (average peak size)/4.
    perPeakDensity: boolean (optional, default False)
        Recommended for broad peaks. If set to False, performs a gaussian
        filter along the genome (faster), assuming all peaks have roughly the
        same size. If set to True, creates the density curve per peak, based
        on each peak's individual size. This is much slower than the filter
        method, but may be useful if peaks are expected to have very
        different sizes. Can also be faster when the number of peaks is
        small.
    minOverlap: integer (optional, default 2)
        Minimum number of peaks required at a consensus. 2 indicates that a
        peak must be replicated at least once.
    output_bedgraph: str or None (optional, default None)
        If set, writes the smoothed peak density of each strand to a
        fixedStep wiggle file, using this value as the output path prefix.

    Returns
    -------
    pandas.DataFrame
        One row per consensus peak, with columns: chromosome, start, end,
        consensus id, number of merged peaks, strand, summit and summit + 1.
    """
    alltabs = []
    if type(beds) is not list:
        beds = [beds]
    for tab in beds:
        fmt = fileFormat
        if fmt not in ["bed", "narrowPeak"]:
            raise TypeError(f"Unknown file format: {fmt}")
        if fmt == "bed":
            # Read bed format
            if inferCenter:
                usedCols = [0, 1, 2, 5]
            else:
                usedCols = [0, 1, 2, 5, 6]
            tab = tab.iloc[:, usedCols].copy()
            tab[5000] = 1   # Placeholder column, renamed just below
            tab.columns = np.arange(len(tab.columns))
            tab[0] = tab[0].astype("str", copy=False)
            tab[3].fillna(value=".", inplace=True)
            if inferCenter:
                tab[5] = tab[4]
                tab[4] = ((tab[1] + tab[2]) * 0.5).astype(int)
            tab[5] = [1] * len(tab)
            alltabs.append(tab)
        elif fmt == "narrowPeak":
            # Read narrowPeak format
            if inferCenter:
                usedCols = [0, 1, 2, 5]
            else:
                usedCols = [0, 1, 2, 5, 9]
            tab = tab.iloc[:, usedCols].copy()
            tab[5000] = 1   # Placeholder column, renamed just below
            tab.columns = np.arange(len(tab.columns))
            tab[0] = tab[0].astype("str", copy=False)
            tab[3].fillna(value=".", inplace=True)
            if inferCenter:
                tab[5] = tab[4]
                tab[4] = ((tab[1] + tab[2]) * 0.5).astype(int, copy=False)
            else:
                # The narrowPeak summit is relative to the start position
                tab[4] = (tab[1] + tab[4]).astype(int, copy=False)
            alltabs.append(tab)
    if type(chrom_sizes) is str:
        chrom_sizes = pd.read_csv(chrom_sizes, sep="\t", header=None,
                                  index_col=0).iloc[:, 0].to_dict()
    # Concatenate files
    df = pd.concat(alltabs)
    numElements = len(df)
    avgPeakSize = np.median(df[2] - df[1])
    # Check strandedness
    if forceUnstranded:
        df[3] = "."
        strandCount = 1
    else:
        # Check that there are only stranded or only non-stranded elements
        strandValues = np.unique(df[3])
        strandCount = len(strandValues)
        if strandCount > 2:
            raise ValueError("More than two strand directions !")
        elif strandCount == 2 and "." in strandValues:
            raise ValueError("Unstranded and stranded values !")
    # Split per strand
    df = dict([(k, x) for k, x in df.groupby(3)])
    ########### Peak separation step ###########
    # Compute sigma if automatic setting
    if sigma == "auto":
        sigma = avgPeakSize / 4
    else:
        sigma = float(sigma)
    if perPeakDensity:
        # In per-peak mode, sigma is a fraction of each peak's size
        sigma = 0.25
    windowSize = int(8 * sigma) + 1
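    # The kernel spans roughly +/- 4 sigma around each summit. As a worked
    # example (hypothetical numbers): with sigma="auto" and a median peak
    # size of 200 bp, sigma = 200/4 = 50 and windowSize = int(8*50)+1 = 401.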
    sepPerStrand = {}
    sepIdxPerStrand = {}
    # Iterate over each strand
    consensuses = []
    j = 0
    for s in df.keys():
        # Split peaks per chromosome
        df[s].sort_values(by=[0, 4], inplace=True)
        posPerChr = dict([(k, x.values[:, [1, 2, 4]].astype(int))
                          for k, x in df[s].groupby(0)])
        infoPerChr = dict([(k, x.values) for k, x in df[s].groupby(0)])
        # Iterate over all chromosomes
        sepPerStrand[s] = {}
        sepIdxPerStrand[s] = {}
        if output_bedgraph is not None:
            f_bedgraph = open(output_bedgraph + f"{s}.wig", "w")
        for chrName in posPerChr.keys():
            # Place peak summits on the genomic array
            try:
                currentLen = chrom_sizes[str(chrName)]
            except KeyError:
                print(f"Warning: chromosome {str(chrName)} is not in the genome annotation and will be removed")
                continue
            array = np.zeros(currentLen, dtype="float32")
            peakIdx = posPerChr[chrName]
            np.add.at(array, peakIdx[:, 2], 1)
            if not perPeakDensity:
                # Smooth the summit density with a fixed-width gaussian
                smoothed = oaconvolve(array, gaussian(windowSize, sigma), "same")
                separators = argrelextrema(smoothed, np.less_equal)[0]   # Get local minima
            else:
                # Build the density curve peak by peak, scaling the kernel
                # to each peak's individual size
                smoothed = np.zeros(currentLen, dtype="float32")
                for i in range(len(peakIdx)):
                    peakSigma = (peakIdx[i, 1] - peakIdx[i, 0]) * sigma
                    windowSize = int(8 * peakSigma) + 1
                    center = int((peakIdx[i, 1] + peakIdx[i, 0]) * 0.5)
                    start = max(center - int(windowSize / 2), 0)
                    end = min(center + int(windowSize / 2) + 1, currentLen)
                    window = gaussian(end - start, peakSigma)
                    smoothed[start:end] += window / window.sum()
                # Split consensuses
                separators = argrelextrema(smoothed, np.less_equal)[0]   # Get local minima
            if output_bedgraph:
                sampling_interval = 5
                f_bedgraph.write(f"fixedStep chrom={chrName} start=1 step={sampling_interval}\n")
                positions = np.arange(0, len(smoothed), sampling_interval)[:-1]
                to_write = np.around(smoothed[positions + int(1 + sampling_interval / 2)], 3)
                to_write = "\n".join(to_write.astype(str))
                f_bedgraph.write(to_write + "\n")
            # Remove consecutive separators (due to the less-equal comparison)
            separators = separators[np.where(np.ediff1d(separators) != 1)[0] + 1]
            # Add start and end points
            separators = np.insert(separators, [0, len(separators)], [0, currentLen])
            # Assign peaks to separators
            # Not the most optimized, but fast enough
            separators[-1] += 1
            smallest_bin = np.digitize(peakIdx[:, 0], separators)
            largest_bin = np.digitize(peakIdx[:, 1], separators)
            bin_to_segments = dict()
            for seg_id, (smallest, largest) in enumerate(zip(smallest_bin, largest_bin)):
                seg_start, seg_end = peakIdx[seg_id, :2]
                seg_length = seg_end - seg_start
                for bin_id in range(smallest, largest + 1):
                    bin_start = separators[bin_id - 1]   # np.digitize's output is 1-indexed
                    bin_end = separators[bin_id]
                    bin_length = bin_end - bin_start
                    overlap_start = max(bin_start, seg_start)
                    overlap_end = min(bin_end, seg_end)
                    overlap_length = max(0, overlap_end - overlap_start)
                    # Assign the peak to the bin if the overlap covers more
                    # than half of the bin or more than half of the peak
                    if overlap_length / bin_length > 0.5 or overlap_length / seg_length > 0.5:
                        if bin_id in bin_to_segments:
                            bin_to_segments[bin_id].append(seg_id)
                        else:
                            bin_to_segments[bin_id] = [seg_id]
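            # Each bin that collected enough peaks becomes a consensus. As an
            # illustration (hypothetical numbers): with separators
            # [0, 406, 1001] and two peaks (100-300 and 120-310) assigned to
            # bin 1, the consensus spans max(100, 0) = 100 to
            # min(310, 406) = 310, centered on the mean of the summits that
            # fall inside the bin.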
            # Format consensus peaks
            for k in bin_to_segments.keys():
                currentConsensus = infoPerChr[chrName][bin_to_segments[k]]
                # Exclude consensuses supported by too few peaks
                if len(currentConsensus) < minOverlap:
                    continue
                currentSep = separators[k - 1:k + 1]
                # Set up consensus coordinates
                consensusStart = max(np.min(currentConsensus[:, 1]), currentSep[0])
                consensusEnd = min(np.max(currentConsensus[:, 2]), currentSep[1])
                # Discard abnormally small consensus peaks
                if consensusEnd - consensusStart < avgPeakSize * 0.125:
                    continue
                # Use the mean position of the summits that fall between the
                # separators; otherwise fall back to the consensus midpoint
                inSep = (currentConsensus[:, 4] > currentSep[0]) & (currentConsensus[:, 4] < currentSep[1])
                if inSep.sum() >= 1:
                    consensusCenter = int(np.mean(currentConsensus[inSep, 4]))
                else:
                    consensusCenter = int(consensusStart * 0.5 + consensusEnd * 0.5)
                # Number of peaks supporting the consensus
                meanScore = len(currentConsensus)
                # Add the consensus to the genomic locations
                data = [chrName, consensusStart, consensusEnd, j,
                        meanScore, s, consensusCenter, consensusCenter + 1]
                consensuses.append(data)
                j += 1
        if output_bedgraph:
            f_bedgraph.close()
    return pd.DataFrame(consensuses)
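
A minimal usage sketch (the toy replicate dataframes, coordinates, and
chromosome sizes below are hypothetical, not part of the module):

    import pandas as pd
    from muffin.peakMerge import merge_peaks

    # Two hypothetical narrowPeak replicates (10 columns); column 5 is the
    # strand and column 9 the summit offset relative to the start position.
    cols = ["chrom", "start", "end", "name", "score", "strand",
            "signalValue", "pValue", "qValue", "summit"]
    rep1 = pd.DataFrame([["chr1", 100, 300, "p1", 0, ".", 5.0, -1, -1, 100]],
                        columns=cols)
    rep2 = pd.DataFrame([["chr1", 120, 310, "p2", 0, ".", 4.0, -1, -1, 90]],
                        columns=cols)

    consensuses = merge_peaks([rep1, rep2], chrom_sizes={"chr1": 1000},
                              fileFormat="narrowPeak", minOverlap=2)
    # One row per consensus: chrom, start, end, id, number of merged peaks,
    # strand, summit, summit + 1
    print(consensuses)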