Source code for muffin.peakMerge

'''
Generate consensus peaks.
'''
import numpy as np
import pandas as pd
from scipy.signal import argrelextrema, oaconvolve
from scipy.signal.windows import gaussian
from .utils import stats

def merge_peaks(beds, chrom_sizes, fileFormat="narrowPeak", inferCenter=False,
                forceUnstranded=False, sigma="auto", perPeakDensity=False,
                minOverlap=2, output_bedgraph=None):
    """
    Read peak-called files and generate consensus peaks.

    Parameters
    ----------
    beds: list of pandas dataframes
        Each dataframe should be formatted in bed format. The summit is
        assumed to be the 7th column (0-based index 6, =thickStart) for the
        bed format, and the 10th column (0-based index 9) for the narrowPeak
        format.
    chrom_sizes: str or dict
        Path to a tab-separated (chromosome, length) annotation file. If a
        dict, must be of the form {"ChrName": length}.
    fileFormat: "bed" or "narrowPeak"
        Format of the files being read. The bed format assumes the max signal
        position to be at the 6th column (0-based) in absolute coordinates.
        The narrowPeak format assumes the max signal position to be at the
        9th column (0-based), relative to the start position.
    inferCenter: boolean (optional, default False)
        If set to True, uses the position halfway between the start and end
        positions. Enable this only if the summit position is missing. Can
        also be suitable for broad peaks, as the summit position can be
        unreliable.
    forceUnstranded: boolean (optional, default False)
        If set to True, assumes all peaks are not strand-specific, even if
        strand-specific information was found.
    sigma: float or "auto" (optional, default "auto")
        Size of the gaussian filter (lower values = more separation). Only
        effective if perPeakDensity is set to False. "auto" automatically
        selects the filter width as (average peak size)/4.
    perPeakDensity: boolean (optional, default False)
        Recommended for broad peaks. If set to False, performs a gaussian
        filter along the genome (faster), assuming all peaks have roughly the
        same size. If set to True, creates the density curve per peak, based
        on each peak's individual size. This is much slower than the filter
        method, but may be useful if peaks are expected to have very
        different sizes. Can also be faster when the number of peaks is
        small.
    minOverlap: integer (optional, default 2)
        Minimum number of peaks required at a consensus. 2 indicates that a
        peak must be replicated at least once.
    output_bedgraph: str or None (optional, default None)
        If set, writes the smoothed peak density of each strand to a
        fixedStep wiggle file, using this value as the output path prefix.

    Returns
    -------
    pandas.DataFrame
        One row per consensus peak, with columns: chromosome, start, end,
        consensus id, number of merged peaks, strand, summit and summit + 1.
    """
    alltabs = []
    if type(beds) is not list:
        beds = [beds]
    for tab in beds:
        fmt = fileFormat
        if fmt not in ["bed", "narrowPeak"]:
            raise TypeError(f"Unknown file format: {fmt}")
        if fmt == "bed":
            # Read bed format
            if inferCenter:
                usedCols = [0, 1, 2, 5]
            else:
                usedCols = [0, 1, 2, 5, 6]
            tab = tab.iloc[:, usedCols].copy()
            tab[5000] = 1   # Placeholder column, renamed just below
            tab.columns = np.arange(len(tab.columns))
            tab[0] = tab[0].astype("str", copy=False)
            tab[3].fillna(value=".", inplace=True)
            if inferCenter:
                tab[5] = tab[4]
                tab[4] = ((tab[1] + tab[2]) * 0.5).astype(int)
            tab[5] = [1] * len(tab)
            alltabs.append(tab)
        elif fmt == "narrowPeak":
            # Read narrowPeak format
            if inferCenter:
                usedCols = [0, 1, 2, 5]
            else:
                usedCols = [0, 1, 2, 5, 9]
            tab = tab.iloc[:, usedCols].copy()
            tab[5000] = 1   # Placeholder column, renamed just below
            tab.columns = np.arange(len(tab.columns))
            tab[0] = tab[0].astype("str", copy=False)
            tab[3].fillna(value=".", inplace=True)
            if inferCenter:
                tab[5] = tab[4]
                tab[4] = ((tab[1] + tab[2]) * 0.5).astype(int, copy=False)
            else:
                # The narrowPeak summit is relative to the start position
                tab[4] = (tab[1] + tab[4]).astype(int, copy=False)
            alltabs.append(tab)
    if type(chrom_sizes) is str:
        chrom_sizes = pd.read_csv(chrom_sizes, sep="\t", header=None,
                                  index_col=0).iloc[:, 0].to_dict()
    # Concatenate files
    df = pd.concat(alltabs)
    numElements = len(df)
    avgPeakSize = np.median(df[2] - df[1])
    # Check strandedness
    if forceUnstranded:
        df[3] = "."
        strandCount = 1
    else:
        # Check that there are only stranded or only non-stranded elements
        strandValues = np.unique(df[3])
        strandCount = len(strandValues)
        if strandCount > 2:
            raise ValueError("More than two strand directions !")
        elif strandCount == 2 and "." in strandValues:
            raise ValueError("Unstranded and stranded values !")
    # Split per strand
    df = dict([(k, x) for k, x in df.groupby(3)])
    ########### Peak separation step ###########
    # Compute sigma if automatic setting
    if sigma == "auto":
        sigma = avgPeakSize / 4
    else:
        sigma = float(sigma)
    if perPeakDensity:
        # In per-peak mode, sigma is a fraction of each peak's size
        sigma = 0.25
    windowSize = int(8 * sigma) + 1
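    # The kernel spans roughly +/- 4 sigma around each summit. As a worked
    # example (hypothetical numbers): with sigma="auto" and a median peak
    # size of 200 bp, sigma = 200/4 = 50 and windowSize = int(8*50)+1 = 401.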
    sepPerStrand = {}
    sepIdxPerStrand = {}
    # Iterate over each strand
    consensuses = []
    j = 0
    for s in df.keys():
        # Split peaks per chromosome
        df[s].sort_values(by=[0, 4], inplace=True)
        posPerChr = dict([(k, x.values[:, [1, 2, 4]].astype(int))
                          for k, x in df[s].groupby(0)])
        infoPerChr = dict([(k, x.values) for k, x in df[s].groupby(0)])
        # Iterate over all chromosomes
        sepPerStrand[s] = {}
        sepIdxPerStrand[s] = {}
        if output_bedgraph is not None:
            f_bedgraph = open(output_bedgraph + f"{s}.wig", "w")
        for chrName in posPerChr.keys():
            # Place peak summits on the genomic array
            try:
                currentLen = chrom_sizes[str(chrName)]
            except KeyError:
                print(f"Warning: chromosome {str(chrName)} is not in the genome annotation and will be removed")
                continue
            array = np.zeros(currentLen, dtype="float32")
            peakIdx = posPerChr[chrName]
            np.add.at(array, peakIdx[:, 2], 1)
            if not perPeakDensity:
                # Smooth the summit density with a fixed-width gaussian
                smoothed = oaconvolve(array, gaussian(windowSize, sigma), "same")
                separators = argrelextrema(smoothed, np.less_equal)[0]   # Get local minima
            else:
                # Build the density curve peak by peak, scaling the kernel
                # to each peak's individual size
                smoothed = np.zeros(currentLen, dtype="float32")
                for i in range(len(peakIdx)):
                    peakSigma = (peakIdx[i, 1] - peakIdx[i, 0]) * sigma
                    windowSize = int(8 * peakSigma) + 1
                    center = int((peakIdx[i, 1] + peakIdx[i, 0]) * 0.5)
                    start = max(center - int(windowSize / 2), 0)
                    end = min(center + int(windowSize / 2) + 1, currentLen)
                    window = gaussian(end - start, peakSigma)
                    smoothed[start:end] += window / window.sum()
                # Split consensuses
                separators = argrelextrema(smoothed, np.less_equal)[0]   # Get local minima
            if output_bedgraph:
                sampling_interval = 5
                f_bedgraph.write(f"fixedStep chrom={chrName} start=1 step={sampling_interval}\n")
                positions = np.arange(0, len(smoothed), sampling_interval)[:-1]
                to_write = np.around(smoothed[positions + int(1 + sampling_interval / 2)], 3)
                to_write = "\n".join(to_write.astype(str))
                f_bedgraph.write(to_write + "\n")
            # Remove consecutive separators (due to the less-equal comparison)
            separators = separators[np.where(np.ediff1d(separators) != 1)[0] + 1]
            # Add start and end points
            separators = np.insert(separators, [0, len(separators)], [0, currentLen])
            # Assign peaks to separators
            # Not the most optimized, but fast enough
            separators[-1] += 1
            smallest_bin = np.digitize(peakIdx[:, 0], separators)
            largest_bin = np.digitize(peakIdx[:, 1], separators)
            bin_to_segments = dict()
            for seg_id, (smallest, largest) in enumerate(zip(smallest_bin, largest_bin)):
                seg_start, seg_end = peakIdx[seg_id, :2]
                seg_length = seg_end - seg_start
                for bin_id in range(smallest, largest + 1):
                    bin_start = separators[bin_id - 1]   # np.digitize's output is 1-indexed
                    bin_end = separators[bin_id]
                    bin_length = bin_end - bin_start
                    overlap_start = max(bin_start, seg_start)
                    overlap_end = min(bin_end, seg_end)
                    overlap_length = max(0, overlap_end - overlap_start)
                    # Assign the peak to the bin if the overlap covers more
                    # than half of the bin or more than half of the peak
                    if overlap_length / bin_length > 0.5 or overlap_length / seg_length > 0.5:
                        if bin_id in bin_to_segments:
                            bin_to_segments[bin_id].append(seg_id)
                        else:
                            bin_to_segments[bin_id] = [seg_id]
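            # Each bin that collected enough peaks becomes a consensus. As an
            # illustration (hypothetical numbers): with separators
            # [0, 406, 1001] and two peaks (100-300 and 120-310) assigned to
            # bin 1, the consensus spans max(100, 0) = 100 to
            # min(310, 406) = 310, centered on the mean of the summits that
            # fall inside the bin.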
            # Format consensus peaks
            for k in bin_to_segments.keys():
                currentConsensus = infoPerChr[chrName][bin_to_segments[k]]
                # Exclude consensuses supported by too few peaks
                if len(currentConsensus) < minOverlap:
                    continue
                currentSep = separators[k - 1:k + 1]
                # Set up consensus coordinates
                consensusStart = max(np.min(currentConsensus[:, 1]), currentSep[0])
                consensusEnd = min(np.max(currentConsensus[:, 2]), currentSep[1])
                # Discard abnormally small consensus peaks
                if consensusEnd - consensusStart < avgPeakSize * 0.125:
                    continue
                # Use the mean position of the summits that fall between the
                # separators; otherwise fall back to the consensus midpoint
                inSep = (currentConsensus[:, 4] > currentSep[0]) & (currentConsensus[:, 4] < currentSep[1])
                if inSep.sum() >= 1:
                    consensusCenter = int(np.mean(currentConsensus[inSep, 4]))
                else:
                    consensusCenter = int(consensusStart * 0.5 + consensusEnd * 0.5)
                # Number of peaks supporting the consensus
                meanScore = len(currentConsensus)
                # Add the consensus to the genomic locations
                data = [chrName, consensusStart, consensusEnd, j,
                        meanScore, s, consensusCenter, consensusCenter + 1]
                consensuses.append(data)
                j += 1
        if output_bedgraph:
            f_bedgraph.close()
    return pd.DataFrame(consensuses)
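
A minimal usage sketch (the toy replicate dataframes, coordinates, and
chromosome sizes below are hypothetical, not part of the module):

    import pandas as pd
    from muffin.peakMerge import merge_peaks

    # Two hypothetical narrowPeak replicates (10 columns); column 5 is the
    # strand and column 9 the summit offset relative to the start position.
    cols = ["chrom", "start", "end", "name", "score", "strand",
            "signalValue", "pValue", "qValue", "summit"]
    rep1 = pd.DataFrame([["chr1", 100, 300, "p1", 0, ".", 5.0, -1, -1, 100]],
                        columns=cols)
    rep2 = pd.DataFrame([["chr1", 120, 310, "p2", 0, ".", 4.0, -1, -1, 90]],
                        columns=cols)

    consensuses = merge_peaks([rep1, rep2], chrom_sizes={"chr1": 1000},
                              fileFormat="narrowPeak", minOverlap=2)
    # One row per consensus: chrom, start, end, id, number of merged peaks,
    # strand, summit, summit + 1
    print(consensuses)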