diff --git a/src/qpcr/MinerMethod.py b/src/qpcr/MinerMethod.py index 194c219..6fb3c9d 100644 --- a/src/qpcr/MinerMethod.py +++ b/src/qpcr/MinerMethod.py @@ -1,7 +1,21 @@ #!/usr/bin/env python ''' -Created on Sep 1, 2010 +Implementation of the Miner Method for qPCR crossing-point determination. + +Provides the four-parameter logistic (4PL) model (``qpcrFit``), an +exponential-phase nonlinear regression model (``nlmFit``), and three +crossing-point estimation methods derived from the fitted 4PL parameters: + +- FDM (First Derivative Maximum) +- SDM (Second Derivative Maximum) +- SPE (Signal-to-noise / Percentage of Efficiency) + +Also contains an example fit executed at import time using a hard-coded +sample fluorescence curve (``myData``). + +Reference: Zhao & Fernald (2005). "Comprehensive algorithm for +quantitative real-time polymerase chain reaction." J Comput Biol. @author: lgoff ''' @@ -24,6 +38,15 @@ #Misc ######### def nthRoot(num,n): + """Compute the nth root of a number. + + Args: + num: The base value (numeric). + n: The root degree (numeric, must not be zero). + + Returns: + ``num ** (1.0 / n)`` as a float. + """ return num ** (1.0/n) ############# @@ -33,45 +56,138 @@ def nthRoot(num,n): #errfunc = lambda p,x,y: y-fitfunc(p,x) #Distance to the target function (residuals) def fit(p,x): - """ - Depricated in favor of qpcrFit to use optimize.curve_fit() - f(x) Logistic model for qPCR Data - fitfunc = lambda p,x: p[3]+(p[0]/(1+((x/p[2])**p[1]))) # From actual paper (Zhao et al) where p = [a,b,x_0,y_0] + """Evaluate the four-parameter logistic (4PL) model using a parameter vector. + + Deprecated in favor of ``qpcrFit``, which is compatible with + ``scipy.optimize.curve_fit``. + + The model is: + f(x) = p[3] + p[0] / (1 + (x / p[2])^p[1]) + + where ``p = [a, b, x0, y0]`` following the notation in Zhao et al. 
+ + Args: + p: Sequence of four model parameters ``[a, b, x0, y0]``: + a – amplitude (difference between upper and lower asymptotes), + b – slope/steepness, + x0 – inflection point (cycle at midpoint), + y0 – baseline fluorescence (lower asymptote). + x: Cycle number (scalar or array). + + Returns: + Predicted fluorescence value(s) at cycle ``x``. """ return (p[3]+(p[0]/(1+((x/p[2])**p[1])))) def qpcrFit(x,a,b,x0,y0): - """Same as fit but designed to run with optimize.curve_fit""" + """Evaluate the four-parameter logistic (4PL) model for qPCR fluorescence data. + + Implements the model from Zhao et al.: + f(x) = y0 + a / (1 + (x / x0)^b) + + Designed for use with ``scipy.optimize.curve_fit``. + + Args: + x: Cycle number (scalar or array). + a: Amplitude parameter (difference between upper and lower + asymptotes). + b: Slope/steepness parameter. + x0: Inflection point (cycle at the midpoint of the curve). + y0: Baseline fluorescence (lower asymptote). + + Returns: + Predicted fluorescence value(s) at cycle ``x``. + """ return (y0+(a/(1+((x/x0)**b)))) def qpcrFitResiduals(x,y,a,b,x0,y0): - """ - Residuals: - errfunc = lambda p,x,y: y-fitfunc(p,x) #Distance to the target function (residuals) + """Compute residuals between observed fluorescence and the 4PL model. + + Calculates ``y - qpcrFit(x, a, b, x0, y0)``. + + Args: + x: Cycle number(s) (scalar or array). + y: Observed fluorescence value(s). + a: Amplitude parameter. + b: Slope/steepness parameter. + x0: Inflection point (cycle at midpoint). + y0: Baseline fluorescence (lower asymptote). + + Returns: + Residual value(s) ``y - predicted``. """ return y-qpcrFit(x,a,b,x0,y0) def nlmFit(x,a,b,y0): - """ - Non-linear regression function to optimize for windows in exponential phase - here p = [a,b,y0] + """Evaluate the exponential nonlinear regression model for the exponential phase. 
+ + Models the exponential amplification phase as: + f(x) = y0 + a * (b ^ x) + + Used for iterative nonlinear regression (iNLR) on windows within the + exponential phase. Parameters are ``[a, b, y0]``. + + Args: + x: Cycle number (scalar or array). + a: Amplitude scaling factor. + b: Per-cycle amplification factor (related to efficiency: b ~ E). + y0: Baseline offset. + + Returns: + Predicted fluorescence value(s) at cycle ``x``. """ return y0+(a*(b**x)) def nlmFitResiduals(x,y,a,b,y0): - """ - Residuals: - errfunc = lambda p,x,y: y-nlmFit(x,a,b,y0) #Distance to the target function (residuals) + """Compute residuals between observed fluorescence and the exponential NLM model. + + Calculates ``y - nlmFit(x, a, b, y0)``. + + Args: + x: Cycle number(s) (scalar or array). + y: Observed fluorescence value(s). + a: Amplitude scaling factor. + b: Per-cycle amplification factor. + y0: Baseline offset. + + Returns: + Residual value(s) ``y - predicted``. """ return y-nlmFit(x,a,b,y0) def CP_FDM(p): + """Compute the crossing-point using the First Derivative Maximum (FDM) method. + + Args: + p: Sequence of four fitted 4PL parameters ``[a, b, x0, y0]``. + + Returns: + The FDM crossing-point cycle number as a float. + """ return (p[2]*nthRoot(((p[1]-1)/(p[1]+1)),p[1])) def CP_SDM(p): + """Compute the crossing-point using the Second Derivative Maximum (SDM) method. + + Args: + p: Sequence of four fitted 4PL parameters ``[a, b, x0, y0]``. + + Returns: + The SDM crossing-point cycle number as a float. + """ return p[2]*nthRoot((np.sqrt((3*p[1]**2)*(p[1]**2-1))-(2*(1-p[1]**2)))/((p[1]**2)+(3*p[1])+2),p[1]) def CP_SPE(p,rNoise): + """Compute the crossing-point using the Signal-to-Noise (SPE) method. + + Args: + p: Sequence of four fitted 4PL parameters ``[a, b, x0, y0]``. + rNoise: Baseline noise estimate (standard error of the ``y0`` + parameter, i.e., ``RNoise``). + + Returns: + The SPE crossing-point cycle number as a float. 
+ """ return (p[2]*nthRoot(((p[0]-rNoise)/rNoise),p[1])) diff --git a/src/qpcr/abi.py b/src/qpcr/abi.py index 99e7499..8af1a5f 100644 --- a/src/qpcr/abi.py +++ b/src/qpcr/abi.py @@ -1,6 +1,12 @@ #!/usr/bin/env python ''' -Created on Feb 22, 2010 +Utilities for parsing and analyzing ABI qPCR instrument output. + +Provides functions for parsing raw ABI results and cycle data files, +computing PCR amplification efficiencies via a sliding-window linear +regression on log-transformed fluorescence values, performing the +delta-delta Ct (ddCt) relative-quantification calculation, and +summarizing/reporting the results. Requirements: - numpy @@ -46,8 +52,21 @@ ########################## def parseData(fname): - """Raw input for this file is a matrix of well x (Well,SampleName,DetectorName,Task,Ct,Threshold). You must also delete the intermediate headers and summary rows from raw output of ABI. - Be sure to remove the header section (except one header row). + """Parse a simplified ABI results text file into a list of well dictionaries. + + Raw input is a tab-delimited matrix with columns: + Well, SampleName, DetectorName, Task, Ct, Threshold. + Intermediate headers and summary rows must be removed from the raw ABI + output before calling this function; only one header row should remain. + Wells with an ``Undetermined`` Ct value are silently skipped. + + Args: + fname: Path to the tab-delimited results text file. + + Returns: + A list of dicts, one per well, with keys ``well`` (int), + ``sample``, ``detector``, ``task``, ``Ct`` (float), and + ``threshold`` (float). """ data = [] handle = open(fname,'r') @@ -65,6 +84,18 @@ def parseData(fname): return data def getDetAndSamp(data): + """Return ordered lists of unique detector and sample names found in the data. + + Preserves first-seen order for both detectors and samples. + + Args: + data: List of well dicts as returned by ``parseData``, each containing + ``detector`` and ``sample`` keys. 
+ + Returns: + A tuple ``(detectors, samples)`` where each element is a list of + unique string names in the order they were first encountered. + """ detectors = [] samples = [] for well in data: @@ -75,13 +106,33 @@ def getDetAndSamp(data): return detectors,samples def wellIndex(data): + """Build a list of well numbers in the same order as the data list. + + Args: + data: List of well dicts, each containing a ``well`` key. + + Returns: + A list of integer well numbers corresponding positionally to each + entry in ``data``. + """ index = [] for i in range(len(data)): index.append(data[i]['well']) return index def parseCycleData(fname): - """Raw input is tab-delimited text file with matrix of WellsxCycle values. Header row is included. + """Parse a tab-delimited cycle fluorescence file into a list of well dicts. + + Raw input is a tab-delimited file with a header row. Columns are: + Well, Sample, Detector, followed by one column per cycle number. + + Args: + fname: Path to the tab-delimited cycle data text file. + + Returns: + A list of dicts, one per well, with keys ``well`` (int), + ``sample`` (str), ``detector`` (str), and ``values`` (numpy array + of float fluorescence readings, one per cycle). """ cycleData = [] handle = open(fname,'r') @@ -105,6 +156,17 @@ def parseCycleData(fname): #Get User Input ###################### def getEndoControl(detectors): + """Interactively prompt the user to select an endogenous control detector. + + Prints a numbered list of detector names and reads an integer choice from + standard input. + + Args: + detectors: List of detector name strings to present to the user. + + Returns: + The detector name string chosen by the user. 
+ """ myString = "Please choose an endogenous control:\n" for i in range(0,len(detectors)): myString = myString+"\t(%d):\t%s\n" % (i,detectors[i]) @@ -113,6 +175,17 @@ def getEndoControl(detectors): return detectors[choice] def getReference(samples): + """Interactively prompt the user to select a reference sample. + + Prints a numbered list of sample names and reads an integer choice from + standard input. + + Args: + samples: List of sample name strings to present to the user. + + Returns: + The sample name string chosen by the user. + """ myString = "Please choose a reference sample:\n" for i in range(0,len(samples)): myString = myString + "\t(%d):\t%s\n" % (i,samples[i]) @@ -125,6 +198,19 @@ def getReference(samples): ##################################### def aggregateReplicateCts(data): + """Aggregate replicate Ct values per sample/detector pair using the median. + + Groups raw per-well Ct values by (sample, detector) and computes the + median Ct for each combination. + + Args: + data: List of well dicts, each containing ``sample``, ``detector``, + and ``Ct`` keys. + + Returns: + A nested dict ``{sample: {detector: median_Ct}}`` where each value + is the median Ct (float) computed from all replicate wells. + """ #TODO: make this aggregate either Ct values or N0 values? tmp = {} for d in data: @@ -146,8 +232,28 @@ def aggregateReplicateCts(data): ##################################### def calculateEfficiencies(cycleData): - """Takes a list of dictionaries of cycle information by well and returns those same dictionaries with - additional keys for efficiency and concentration (N0) values.""" + """Compute PCR amplification efficiency and initial concentration (N0) for each well. + + For each well, log10-transforms the fluorescence values, then slides a + window of size ``windowSize`` across all cycles and picks the window with + the highest Pearson correlation between log-fluorescence and cycle number + (i.e., the most linear exponential-phase segment). 
A linear regression on + that best window gives the slope (from which efficiency = 10^slope) and + intercept (from which N0 = 10^intercept). + + Adds the following keys to each well dict in-place: + ``logVals``, ``bestIdx``, ``bestCorr``, ``bestSlice``, + ``bestCycles``, ``bestSlope``, ``bestIntercept``, + ``efficiency``, ``N0``. + + Args: + cycleData: List of well dicts as returned by ``parseCycleData``, + each containing at minimum a ``values`` numpy array. + + Returns: + The same list of well dicts with the additional efficiency and N0 + keys populated. + """ res = [] for well in cycleData: well['logVals'] = getLogVals(well['values']) @@ -172,6 +278,20 @@ def calculateEfficiencies(cycleData): return res def summarizeEfficiencies(cycleData): + """Compute mean and standard deviation of PCR efficiency for each detector. + + Groups per-well efficiency values by detector name and computes summary + statistics. + + Args: + cycleData: List of well dicts, each containing ``detector`` and + ``efficiency`` keys (as produced by ``calculateEfficiencies``). + + Returns: + A dict ``{detector: {'meanEff': float, 'sdevEff': float}}`` giving + the mean and standard deviation of efficiency across all wells for + each detector. + """ tmp = {} #Aggregate efficiencies by detector for i in cycleData: @@ -184,8 +304,24 @@ def summarizeEfficiencies(cycleData): return eff def mergeDataAndCycleData(data,cycleData,idx): - """Takes an index of data (by well) and the cycleData to add the efficiency and N0 from cycleData to the - data dictionaries""" + """Copy efficiency and N0 values from cycleData into the matching well dicts in data. + + Uses the provided well-number index to look up each cycle-data well in + the data list and transfers the ``N0`` and ``efficiency`` values. Wells + present in cycleData but absent from data (e.g., wells skipped due to + undetermined Ct) are silently ignored. + + Args: + data: List of well dicts as returned by ``parseData``. 
+ cycleData: List of well dicts as returned by ``calculateEfficiencies``, + each containing ``well``, ``N0``, and ``efficiency`` keys. + idx: List of integer well numbers parallel to ``data``, as returned + by ``wellIndex``. + + Returns: + The ``data`` list with ``N0`` and ``efficiency`` keys added to each + matched well dict. + """ for c in cycleData: try: dataloc = idx.index(c['well']) @@ -198,12 +334,45 @@ def mergeDataAndCycleData(data,cycleData,idx): #TODO: Make summarizer for N0 elements by sample and detector def getLogVals(myArray): + """Return the base-10 logarithm of each element in a numpy array. + + Args: + myArray: A numpy array of positive numeric values. + + Returns: + A numpy array of the same shape containing log10 of each input value. + """ return np.log10(myArray) ############################### #ddCt math ############################### def ddCt(data,medianCts,endoControl,reference): + """Compute delta-Ct and delta-delta-Ct values for each well. + + For each well, dCt is calculated as: + dCt = Ct - median_Ct(sample, endoControl) + + ddCt is then calculated as: + ddCt = dCt - median_dCt(reference, detector) + + Wells where the endogenous control Ct is unavailable receive ``"N/A"`` + for dCt, and wells where the reference dCt is unavailable receive + ``"N/A"`` for ddCt. + + Args: + data: List of well dicts, each containing ``sample``, ``detector``, + and ``Ct`` keys. + medianCts: Nested dict ``{sample: {detector: median_Ct}}`` as returned + by ``aggregateReplicateCts``. + endoControl: Name of the endogenous control detector to use for + normalization. + reference: Name of the reference sample to use for ddCt calculation. + + Returns: + The ``data`` list with ``dCt`` and ``ddCt`` keys added to each well + dict (values are floats or ``"N/A"``). 
+ """ tmp = {} #Calculate dCts for i in range(len(data)): @@ -230,6 +399,24 @@ def ddCt(data,medianCts,endoControl,reference): return data def RQ(data,effs): + """Calculate relative quantification (RQ) values for each well. + + RQ is computed as: + RQ = meanEfficiency ^ (-ddCt) + + Wells with a ``"N/A"`` ddCt or a missing efficiency entry receive + ``"N/A"`` for RQ. + + Args: + data: List of well dicts containing ``detector`` and ``ddCt`` keys, + as returned by ``ddCt``. + effs: Dict ``{detector: {'meanEff': float, ...}}`` as returned by + ``summarizeEfficiencies``. + + Returns: + The ``data`` list with an ``RQ`` key added to each well dict + (float or ``"N/A"``). + """ res = [] for d in data: try: @@ -247,7 +434,14 @@ def RQ(data,effs): ############################### def mean(vals): - """Computes the mean of a list of numbers""" + """Compute the arithmetic mean of a list of numbers. + + Args: + vals: An iterable of numeric values. + + Returns: + The arithmetic mean as a float. + """ n = 0 s = 0.0 for i in vals: @@ -256,7 +450,17 @@ def mean(vals): return s / float(n) def median(vals): - """Computes the median of a list of numbers""" + """Compute the median of a list of numbers. + + Sorts the list in-place before computing. + + Args: + vals: A list of numeric values. + + Returns: + The median value as a float. For even-length lists, returns the + average of the two middle values. + """ lenvals = len(vals) vals.sort() @@ -266,17 +470,46 @@ def median(vals): return vals[lenvals // 2] def variance(vals): - """Variance""" + """Compute the sample variance of a list of numbers. + + Uses Bessel's correction (divides by N-1). + + Args: + vals: A list of numeric values with at least two elements. + + Returns: + The sample variance as a float. + """ u = mean(vals) return sum((x - u)**2 for x in vals) / float(len(vals)-1) def sdev(vals): - """Standard deviation""" + """Compute the sample standard deviation of a list of numbers. 
+ + Returns 0.0 for lists with one or fewer elements. + + Args: + vals: A list of numeric values. + + Returns: + The sample standard deviation as a float. + """ if len(vals) <=1: return 0.0 return math.sqrt(variance(vals)) def covariance(lst1, lst2): - """Covariance""" + """Compute the sample covariance between two equal-length lists. + + Uses Bessel's correction (divides by N-1). + + Args: + lst1: First list of numeric values. + lst2: Second list of numeric values; must be the same length as + ``lst1``. + + Returns: + The sample covariance as a float. + """ m1 = mean(lst1) m2 = mean(lst2) tot = 0.0 @@ -285,7 +518,21 @@ def covariance(lst1, lst2): return tot / (len(lst1)-1) def corr(lst1, lst2): - """Pearson's Correlation""" + """Compute the Pearson correlation coefficient between two lists. + + Returns a very large number (1e1000) when the denominator is zero + (i.e., one or both lists have zero variance), which is used as a + sentinel for a perfect linear relationship in the sliding-window search. + + Args: + lst1: First list of numeric values. + lst2: Second list of numeric values; must be the same length as + ``lst1``. + + Returns: + The Pearson correlation coefficient as a float, or 1e1000 when the + standard deviation of either list is zero. + """ num = covariance(lst1, lst2) denom = float(sdev(lst1) * sdev(lst2)) if denom != 0: @@ -294,13 +541,38 @@ def corr(lst1, lst2): return 1e1000 def slope(xarray,yarray): - """Uses numpy, in fact assumes that the list arguments are numpy arrays.""" + """Compute the ordinary least-squares regression slope. + + Uses the standard closed-form formula. Requires numpy arrays because + element-wise multiplication (``xarray * yarray``) and vectorized + ``sum`` are used. + + Args: + xarray: Numpy array of independent variable values. + yarray: Numpy array of dependent variable values; must be the same + length as ``xarray``. + + Returns: + The regression slope as a float. 
+ """ n = float(len(xarray)) m = (n*sum(xarray*yarray)-sum(xarray)*sum(yarray))/(n*sum(xarray**2)-(sum(xarray))**2) return m def intercept(xarray,yarray): - """Uses numpy, in fact assumes that the list arguments are numpy arrays.""" + """Compute the ordinary least-squares regression intercept. + + Uses the standard closed-form formula given the slope. Requires numpy + arrays because vectorized ``sum`` is used. + + Args: + xarray: Numpy array of independent variable values. + yarray: Numpy array of dependent variable values; must be the same + length as ``xarray``. + + Returns: + The regression intercept (y-axis) as a float. + """ m = slope(xarray,yarray) n = float(len(xarray)) b = (sum(yarray)-m*(sum(xarray)))/n @@ -311,9 +583,35 @@ def intercept(xarray,yarray): ############################### def flagBadDetectors(): + """Flag detectors with poor amplification characteristics. + + Not yet implemented. + """ pass def aggregateResults(data): + """Aggregate per-well RQ, N0, and dCt values into per-(sample, detector) summaries. + + Computes mean, median, and standard deviation of RQ, dCt, and N0 + for every (sample, detector) combination across all replicate wells. + Wells with ``"N/A"`` RQ are excluded from RQ and dCt summaries but N0 + is always summarized (N0 values are assumed to be present). + + Args: + data: List of well dicts containing ``sample``, ``detector``, + ``RQ``, ``N0``, and ``dCt`` keys, as returned by ``RQ``. + + Returns: + A nested dict ``{sample: {detector: stats_dict}}`` where + ``stats_dict`` contains the keys: ``medianRQ``, ``meanRQ``, + ``sdevRQ``, ``mediandCt``, ``meandCt``, ``sdevdCt``, + ``medianN0``, ``meanN0``, ``sdevN0``. Unavailable values are + represented as ``"N/A"``. + + Raises: + KeyError: If ``RQ`` values have not yet been computed on the data + (i.e., ``ddCt`` and ``RQ`` have not been called first). 
+ """ try: data[0]['RQ'] except KeyError: @@ -377,6 +675,22 @@ def aggregateResults(data): return res def printDataFrameRQs(RQsummary,effs,outFile): + """Write a tab-delimited summary of RQ results to a file and to stdout. + + Outputs one row per (sample, detector) combination with columns: + Sample, Detector, meanEff, meanRQ, sdevRQ, medianRQ, meandCt, + mediandCt, sdevdCt, quant, ci.l, ci.u. + + The ``quant`` column is efficiency^(-mediandCt); ``ci.l`` and ``ci.u`` + are efficiency^(-(mediandCt +/- sdevdCt)), providing approximate + confidence intervals. + + Args: + RQsummary: Nested dict as returned by ``aggregateResults``. + effs: Dict ``{detector: {'meanEff': float, ...}}`` as returned by + ``summarizeEfficiencies``. + outFile: Path to the output file to write. + """ #Open out Handle outHandle = open(outFile,'w') #Print header row @@ -395,16 +709,58 @@ def printDataFrameRQs(RQsummary,effs,outFile): #TODO:Create R Function to plot output from printDataFramRQs() def plotRQs(results): + """Plot relative quantification (RQ) values. + + Not yet implemented. + + Args: + results: Aggregated results dict as returned by ``aggregateResults``. + """ pass def plotEdCt(results): + """Plot efficiency-corrected delta-Ct (EdCt) values. + + Not yet implemented. + + Args: + results: Aggregated results dict as returned by ``aggregateResults``. + """ pass def doPlotting(plotScript = "plotting.q"): + """Execute an external R plotting script as a subprocess. + + Args: + plotScript: Path to the R script to execute. Defaults to + ``"plotting.q"``. + + Returns: + A tuple ``(status, output)`` as returned by + ``subprocess.getstatusoutput``. + """ return subprocess.getstatusoutput(plotScript) def makeDvsS(results,detectors,samples,value = "mediandCt"): + """Build a detector-by-sample matrix of a chosen summary statistic. + + Creates a 2-D numpy array indexed by detector (rows) and sample + (columns). Missing (sample, detector) combinations are filled with + ``nan``. 
+ + Args: + results: Nested dict ``{sample: {detector: stats_dict}}`` as + returned by ``aggregateResults``. + detectors: Ordered list of detector names defining the row order. + samples: Ordered list of sample names defining the column order. + value: Key within the innermost stats dict to extract. Defaults to + ``"mediandCt"``. + + Returns: + A numpy float array of shape ``(len(detectors), len(samples))`` + containing the requested statistic for each cell. + """ matrix = np.zeros((len(detectors),len(samples)),float) for d in range(0,len(detectors)): for s in range(0,len(samples)): @@ -419,6 +775,17 @@ def makeDvsS(results,detectors,samples,value = "mediandCt"): ############################## def main(mainFile,cycleFile): + """Run the full ABI qPCR analysis pipeline interactively. + + Parses results and cycle-data files, computes efficiencies, + interactively asks the user to select an endogenous control and reference + sample, performs ddCt/RQ calculations, and writes ``output.txt`` before + running the external plotting script. + + Args: + mainFile: Path to the tab-delimited ABI results file. + cycleFile: Path to the tab-delimited cycle fluorescence file. + """ #Parse mainFile print("Parsing Results File...") data = parseData(mainFile) @@ -455,6 +822,17 @@ def main(mainFile,cycleFile): return def test(): + """Run a manual integration test using hard-coded HeLa RIP data files. + + Parses ``'RIP HeLa clipped.txt'`` and ``'new_RIP_HeLa.txt'``, runs the + full ddCt/RQ pipeline with hard-coded endogenous control (``'hGAPDH'``) + and reference sample (``'IgG RIP'``), writes ``output.txt``, and + returns a detector-by-sample matrix of mediandCt values. + + Returns: + A numpy float array of shape ``(n_detectors, n_samples)`` containing + the mediandCt for each (detector, sample) combination. 
+ """ cycleData = parseCycleData('RIP HeLa clipped.txt') cycleData = calculateEfficiencies(cycleData) effs = summarizeEfficiencies(cycleData) diff --git a/src/qpcr/qpcrAnalysis.py b/src/qpcr/qpcrAnalysis.py index 9072c9d..d4f2a15 100644 --- a/src/qpcr/qpcrAnalysis.py +++ b/src/qpcr/qpcrAnalysis.py @@ -1,12 +1,19 @@ #!/usr/bin/env python ''' -Created on Feb 22, 2010 +Core qPCR analysis module using four-parameter logistic modelling and iterative +nonlinear regression for efficiency estimation. + +Provides the ``Well`` class for per-well data storage and curve fitting, along +with standalone functions for parsing raw ABI instrument output, performing +delta-delta Ct (ddCt) relative quantification, and reporting results. + +This module extends the functionality in ``abi.py`` with a more rigorous +curve-fitting approach based on the four-parameter logistic (4PL) model +described in Zhao et al. Requirements: - numpy - - rpy - - R (obviously) - - lattice package (for plotting) + - scipy results.txt input format example (tab-delimited): Well Sample Detector Task Ct Threshold @@ -49,7 +56,37 @@ #Classes ########################## class Well: + """Represents a single PCR well with its raw data and fitted curve parameters. + + Stores metadata (sample name, detector, task, etc.), raw fluorescence + readings keyed by cycle, and all intermediate and final results from + four-parameter logistic curve fitting and crossing-point estimation. + + Attributes: + wellNum: Integer well number (defaults to -1 until populated). + sample: Sample name string. + detector: Detector (primer/probe) name string. + reporter: Reporter dye name string. + task: Task type string (e.g., ``"EndogenousControl"``). + Ct: Threshold cycle value (float). + quantity: Quantity value from ABI output (float). + eff: Amplification efficiency (float). + threshold: Fluorescence threshold (float). + cycles: List of cycle labels from the cycle data file. + fluorData: Numpy array of fluorescence readings per cycle. 
+ flags: Dict of quality-flag name/value pairs parsed from the ABI file. + RNoise: Standard error of the baseline fluorescence parameter (y0) + from the fitted 4PL model; None until ``fitPCRCurve`` is called. + """ + def __init__(self,line): + """Initialise a Well with default empty values. + + Args: + line: The raw text line from the ABI file used to create this + well (stored for reference but not parsed here; parsing is + done by ``parseRawABI``). + """ self.wellNum = -1 self.sample = '' self.detector = '' @@ -65,12 +102,42 @@ def __init__(self,line): self.RNoise = None def estimateParams(self): + """Generate initial parameter guesses for the four-parameter logistic model. + + Estimates starting values for the curve-fitting routine based on + simple statistics of the raw fluorescence data: + + - ``y0``: mean of the first five cycles (baseline fluorescence). + - ``x0``: cycle nearest the midpoint fluorescence (inflection point). + - ``a``: dynamic range (max minus min fluorescence). + - ``b``: set to 0 (the optimiser handles this parameter well without + a manual initial estimate). + + Populates the instance attributes ``y0``, ``x0``, ``a``, and ``b`` + in-place. + """ self.y0 = np.mean(self.fluorData[:5]) # Initial guess as to baseline fluorescence (mean of first five cycles) self.x0 = self.cycles[np.argmin(abs(self.fluorData-np.mean(self.fluorData)))] # Initial guess as to inflection point at middle of curve self.a = (np.max(self.fluorData)-np.min(self.fluorData))# Initial guess as to y value at inflection self.b = 0 # Don't think I need to estimate this parameter, model seems to do a good job of fitting this one. def fitPCRCurve(self): + """Fit the four-parameter logistic (4PL) model to the fluorescence data. + + Calls ``scipy.optimize.curve_fit`` with ``qpcrFit`` as the model + function and up to 5000 function evaluations. After fitting, + updates the instance attributes: + + - ``a``, ``b``, ``x0``, ``y0``: fitted model parameters. 
+ - ``pCov``: covariance matrix of the fitted parameters. + - ``fitData``: list of model-predicted fluorescence values at each + cycle. + - ``paramSE``: dict mapping parameter names (``'a'``, ``'b'``, + ``'x0'``, ``'y0'``) to their standard errors (sqrt of the + diagonal of ``pCov``). + - ``RNoise``: standard error of the ``y0`` parameter, used as an + estimate of baseline noise. + """ #Fit qpcr Model newParams,self.pCov = optimize.curve_fit(qpcrFit,xdata=self.cycles,ydata=self.fluorData,maxfev=5000) #Update params @@ -87,18 +154,54 @@ def fitPCRCurve(self): return def CP_FDM(self): + """Compute the crossing-point by the First Derivative Maximum (FDM) method. + + Calculates the cycle number at which the first derivative of the + fitted 4PL curve is maximised, stored in ``self.FDM``. + + Returns: + The FDM crossing-point cycle number as a float. + """ self.FDM = (self.x0*nthRoot(((self.b-1)/(self.b+1)),self.b)) return self.FDM def CP_SDM(self): + """Compute the crossing-point by the Second Derivative Maximum (SDM) method. + + Calculates the cycle number at which the second derivative of the + fitted 4PL curve is maximised, stored in ``self.SDM``. + + Returns: + The SDM crossing-point cycle number as a float. + """ self.SDM = self.x0*nthRoot((np.sqrt((3*self.b**2)*(self.b**2-1))-(2*(1-self.b**2)))/((self.b**2)+(3*self.b)+2),self.b) return self.SDM def CP_SPE(self): + """Compute the crossing-point by the Signal-to-Noise method (SPE). + + Calculates the cycle at which the fluorescence signal exceeds the + baseline noise by a factor of ``a / RNoise``, stored in ``self.SPE``. + Requires that ``fitPCRCurve`` has been called so that ``RNoise`` is + available. + + Returns: + The SPE crossing-point cycle number as a float. + """ self.SPE = (self.x0*nthRoot(((self.a-self.RNoise)/self.RNoise),self.b)) return self.SPE def iterativeNLR(self): + """Perform iterative nonlinear regression over the exponential phase window. 
+ + Uses the SPE and SDM crossing-point estimates to define the lower and + upper cycle boundaries of the exponential phase. Enumerates all + sub-windows of size >= ``windowSize`` within that range using + combinatorics and stores the window indices in ``self.winIdx``. + + Requires that ``CP_SPE`` and ``CP_SDM`` have been called first to + populate ``self.SPE`` and ``self.SDM``. + """ self.lowerCycleNum = int(self.SPE) self.upperCycleNum = int(self.SDM) self.regPoints = self.upperCycleNum-self.lowerCycleNum+1 @@ -115,7 +218,21 @@ def iterativeNLR(self): #Parsing ########################## def parseRawABI(fname): - """This replaces parseData""" + """Parse a raw ABI results file into a dict of Well objects keyed by well number. + + Replaces the simpler ``parseData`` function. Handles the multi-section ABI + export format: skips the first line, collects key/value header metadata, + then reads data rows until EOF. Rows with an ``"Undetermined"`` Ct value + are skipped. Quality-flag columns (indices 17 onwards) are stored in each + ``Well.flags`` dict. + + Args: + fname: Path to the raw ABI tab-delimited results export file. + + Returns: + A dict ``{well_number (int): Well}`` for every well with a valid + numeric Ct value. + """ dictKeys = ['well','sample','detector','reporter','task','Ct','quantity','Qty Mean','Qty StdDev','Ct Median','Ct Mean','Ct StdDev','Baseline Type','Baseline Start','Baseline Stop','Threshold Type','threshold','FOS','HMD','LME','EW','BPR','NAW','HNS','HRN','EAF','BAF','TAF','CAF'] handle = open(fname,'r') header = {} @@ -171,7 +288,18 @@ def parseRawABI(fname): assert False, "Should not reach this line..." def parseRawCycle(fname,wellData): - """This replaces parseCycleData""" + """Parse a raw ABI cycle fluorescence file and populate the matching Well objects. + + Replaces the simpler ``parseCycleData`` function. 
Reads fluorescence + readings up to (but not including) the ``"Delta Rn"`` column and writes + ``cycles`` and ``fluorData`` directly onto the corresponding ``Well`` + objects in ``wellData``. + + Args: + fname: Path to the raw ABI cycle data tab-delimited export file. + wellData: Dict ``{well_number: Well}`` as returned by + ``parseRawABI``. Modified in-place. + """ handle = open(fname,'r') handle.readline()#Remove first line headerRow = handle.readline() @@ -187,12 +315,33 @@ def parseRawCycle(fname,wellData): return def getDetAndSamp(wellData): - """Returns two lists of unique detectors and unique samples""" + """Return lists of unique detector and sample names from a collection of Well objects. + + Uses ``util.uniqify`` to deduplicate; result order is not guaranteed to + be preserved (depends on dict key ordering). + + Args: + wellData: An iterable of ``Well`` objects (e.g., the values of the + dict returned by ``parseRawABI``). + + Returns: + A tuple ``(detectors, samples)`` where each element is a list of + unique string names. + """ detectors = util.uniqify(detectors = [x.detector for x in wellData]) samples = util.uniqify(samples = [x.sample for x in wellData]) return detectors,samples def wellIndex(data): + """Build a list of well numbers in the same order as the data list. + + Args: + data: List of well dicts, each containing a ``well`` key. + + Returns: + A list of integer well numbers corresponding positionally to each + entry in ``data``. + """ index = [] for i in range(len(data)): index.append(data[i]['well']) @@ -202,6 +351,17 @@ def wellIndex(data): #Get User Input ###################### def getEndoControl(detectors): + """Interactively prompt the user to select an endogenous control detector. + + Prints a numbered list of detector names and reads an integer choice from + standard input. + + Args: + detectors: List of detector name strings to present to the user. + + Returns: + The detector name string chosen by the user. 
+ """ myString = "Please choose an endogenous control:\n" for i in range(0,len(detectors)): myString = myString+"\t(%d):\t%s\n" % (i,detectors[i]) @@ -210,6 +370,17 @@ def getEndoControl(detectors): return detectors[choice] def getReference(samples): + """Interactively prompt the user to select a reference sample. + + Prints a numbered list of sample names and reads an integer choice from + standard input. + + Args: + samples: List of sample name strings to present to the user. + + Returns: + The sample name string chosen by the user. + """ myString = "Please choose a reference sample:\n" for i in range(0,len(samples)): myString = myString + "\t(%d):\t%s\n" % (i,samples[i]) @@ -222,6 +393,20 @@ def getReference(samples): ##################################### def aggregateReplicateCts(data): + """Aggregate replicate Ct values per sample/detector pair using the median. + + Groups raw per-well Ct values by (sample, detector) and computes the + median Ct for each combination. ``"N/A"`` values (from undetermined wells + that slipped through) are silently dropped by the ``median`` helper. + + Args: + data: List of well dicts, each containing ``sample``, ``detector``, + and ``Ct`` keys. + + Returns: + A nested dict ``{sample: {detector: median_Ct}}`` where each value + is the median Ct (float or ``"N/A"`` if all replicates are missing). + """ #This will have to change... #TODO: make this aggregate either Ct values or N0 values? tmp = {} @@ -244,48 +429,142 @@ def aggregateReplicateCts(data): ##################################### def getLogVals(myArray): + """Return the base-10 logarithm of each element in a numpy array. + + Args: + myArray: A numpy array of positive numeric values. + + Returns: + A numpy array of the same shape containing log10 of each input value. + """ return np.log10(myArray) ######### # Four-parameter Logistic Model fitting ######### def nthRoot(num,n): + """Compute the nth root of a number. + + Args: + num: The base value (numeric). 
+ n: The root degree (numeric, must not be zero). + + Returns: + ``num ** (1.0 / n)`` as a float. + """ return num ** (1.0/n) def qpcrFit(x,a,b,x0,y0): - """Same as fit but designed to run with optimize.curve_fit""" + """Evaluate the four-parameter logistic (4PL) model for qPCR fluorescence data. + + Implements the model from Zhao et al.: + f(x) = y0 + a / (1 + (x / x0)^b) + + Designed for use with ``scipy.optimize.curve_fit``. + + Args: + x: Cycle number (scalar or array). + a: Amplitude parameter (difference between upper and lower + asymptotes). + b: Slope/steepness parameter. + x0: Inflection point (cycle at the midpoint of the curve). + y0: Baseline fluorescence (lower asymptote). + + Returns: + Predicted fluorescence value(s) at cycle ``x``. + """ return (y0+(a/(1+((x/x0)**b)))) def qpcrFitResiduals(x,y,a,b,x0,y0): - """ - Residuals: - errfunc = lambda p,x,y: y-fitfunc(p,x) #Distance to the target function (residuals) + """Compute residuals between observed fluorescence and the 4PL model. + + Calculates ``y - qpcrFit(x, a, b, x0, y0)``. + + Args: + x: Cycle number(s) (scalar or array). + y: Observed fluorescence value(s). + a: Amplitude parameter. + b: Slope/steepness parameter. + x0: Inflection point (cycle at midpoint). + y0: Baseline fluorescence (lower asymptote). + + Returns: + Residual value(s) ``y - predicted``. """ return y-qpcrFit(x,a,b,x0,y0) def CP_FDM(p): + """Compute the crossing-point using the First Derivative Maximum (FDM) method. + + Args: + p: Sequence of four fitted 4PL parameters ``[a, b, x0, y0]``. + + Returns: + The FDM crossing-point cycle number as a float. + """ return (p[2]*nthRoot(((p[1]-1)/(p[1]+1)),p[1])) def CP_SDM(p): + """Compute the crossing-point using the Second Derivative Maximum (SDM) method. + + Args: + p: Sequence of four fitted 4PL parameters ``[a, b, x0, y0]``. + + Returns: + The SDM crossing-point cycle number as a float. 
+ """ return p[2]*nthRoot((np.sqrt((3*p[1]**2)*(p[1]**2-1))-(2*(1-p[1]**2)))/((p[1]**2)+(3*p[1])+2),p[1]) def CP_SPE(p,rNoise): + """Compute the crossing-point using the Signal-to-Noise (SPE) method. + + Args: + p: Sequence of four fitted 4PL parameters ``[a, b, x0, y0]``. + rNoise: Baseline noise estimate (standard error of the ``y0`` + parameter, i.e., ``RNoise``). + + Returns: + The SPE crossing-point cycle number as a float. + """ return (p[2]*nthRoot(((p[0]-rNoise)/rNoise),p[1])) ############################### #Iterative Nonlinear Regression ############################### def nlmFit(x,a,b,y0): - """ - Non-linear regression function to optimize for windows in exponential phase - here p = [a,b,y0] + """Evaluate the exponential nonlinear regression model for the exponential phase. + + Models the exponential amplification phase as: + f(x) = y0 + a * (b ^ x) + + Used for iterative nonlinear regression (iNLR) on windows within the + exponential phase. Parameters are ``[a, b, y0]``. + + Args: + x: Cycle number (scalar or array). + a: Amplitude scaling factor. + b: Per-cycle amplification factor (related to efficiency: b ~ E). + y0: Baseline offset. + + Returns: + Predicted fluorescence value(s) at cycle ``x``. """ return y0+(a*(b**x)) def nlmFitResiduals(x,y,a,b,y0): - """ - Residuals: - errfunc = lambda p,x,y: y-nlmFit(p,x) #Distance to the target function (residuals) + """Compute residuals between observed fluorescence and the exponential NLM model. + + Calculates ``y - nlmFit(x, a, b, y0)``. + + Args: + x: Cycle number(s) (scalar or array). + y: Observed fluorescence value(s). + a: Amplitude scaling factor. + b: Per-cycle amplification factor. + y0: Baseline offset. + + Returns: + Residual value(s) ``y - predicted``. """ return y-nlmFit(x,a,b,y0) @@ -294,6 +573,30 @@ def nlmFitResiduals(x,y,a,b,y0): #ddCt math ############################### def ddCt(data,medianCts,endoControl,reference): + """Compute delta-Ct and delta-delta-Ct values for each well. 
+ + For each well, dCt is calculated as: + dCt = Ct - median_Ct(sample, endoControl) + + If the endogenous control Ct is unavailable for a sample, dCt is set to + ``"N/A"``. ddCt is then calculated as: + ddCt = dCt - median_dCt(reference, detector) + + If the reference dCt is unavailable, ddCt is set to ``"N/A"``. + + Args: + data: List of well dicts, each containing ``sample``, ``detector``, + and ``Ct`` keys. + medianCts: Nested dict ``{sample: {detector: median_Ct}}`` as + returned by ``aggregateReplicateCts``. + endoControl: Name of the endogenous control detector to use for + normalization. + reference: Name of the reference sample to use for ddCt calculation. + + Returns: + The ``data`` list with ``dCt`` and ``ddCt`` keys added to each well + dict (values are floats or ``"N/A"``). + """ tmp = {} #Calculate dCts for i in range(len(data)): @@ -324,9 +627,37 @@ def ddCt(data,medianCts,endoControl,reference): return data def JohnsMethod(data,medianCts,endoControl,reference): + """Placeholder for an alternative relative quantification method. + + Not yet implemented. + + Args: + data: List of well dicts. + medianCts: Nested dict of median Ct values per sample/detector. + endoControl: Name of the endogenous control detector. + reference: Name of the reference sample. + """ pass def RQ(data,effs): + """Calculate relative quantification (RQ) values for each well. + + RQ is computed as: + RQ = meanEfficiency ^ (-ddCt) + + Wells with a ``"N/A"`` ddCt or a missing efficiency entry receive + ``"N/A"`` for RQ. + + Args: + data: List of well dicts containing ``detector`` and ``ddCt`` keys, + as returned by ``ddCt``. + effs: Dict ``{detector: {'meanEff': float, ...}}`` as returned by + ``summarizeEfficiencies``. + + Returns: + The ``data`` list with an ``RQ`` key added to each well dict + (float or ``"N/A"``). 
+ """ res = [] for d in data: try: @@ -344,7 +675,14 @@ def RQ(data,effs): ############################### def mean(vals): - """Computes the mean of a list of numbers""" + """Compute the arithmetic mean of a list of numbers. + + Args: + vals: An iterable of numeric values. + + Returns: + The arithmetic mean as a float. + """ n = 0 s = 0.0 for i in vals: @@ -353,7 +691,19 @@ def mean(vals): return s / float(n) def median(vals): - """Computes the median of a list of numbers""" + """Compute the median of a list, ignoring any ``"N/A"`` sentinel values. + + Filters out ``"N/A"`` entries before sorting. Sorts the remaining values + in-place. Returns ``"N/A"`` if no numeric values remain after filtering. + + Args: + vals: A list that may contain numeric values and/or the string + ``"N/A"``. + + Returns: + The median numeric value as a float, or the string ``"N/A"`` if all + values are ``"N/A"``. + """ print(vals) vals = [i for i in vals if i != "N/A"] print(vals) @@ -367,17 +717,46 @@ def median(vals): return vals[lenvals // 2] def variance(vals): - """Variance""" + """Compute the sample variance of a list of numbers. + + Uses Bessel's correction (divides by N-1). + + Args: + vals: A list of numeric values with at least two elements. + + Returns: + The sample variance as a float. + """ u = mean(vals) return sum((x - u)**2 for x in vals) / float(len(vals)-1) def sdev(vals): - """Standard deviation""" + """Compute the sample standard deviation of a list of numbers. + + Returns 0.0 for lists with one or fewer elements. + + Args: + vals: A list of numeric values. + + Returns: + The sample standard deviation as a float. + """ if len(vals) <=1: return 0.0 return math.sqrt(variance(vals)) def covariance(lst1, lst2): - """Covariance""" + """Compute the sample covariance between two equal-length lists. + + Uses Bessel's correction (divides by N-1). + + Args: + lst1: First list of numeric values. + lst2: Second list of numeric values; must be the same length as + ``lst1``. 
+ + Returns: + The sample covariance as a float. + """ m1 = mean(lst1) m2 = mean(lst2) tot = 0.0 @@ -386,7 +765,21 @@ def covariance(lst1, lst2): return tot / (len(lst1)-1) def corr(lst1, lst2): - """Pearson's Correlation""" + """Compute the Pearson correlation coefficient between two lists. + + Returns a very large number (1e1000) when the denominator is zero + (i.e., one or both lists have zero variance), used as a sentinel for + a perfect linear relationship in the sliding-window search. + + Args: + lst1: First list of numeric values. + lst2: Second list of numeric values; must be the same length as + ``lst1``. + + Returns: + The Pearson correlation coefficient as a float, or 1e1000 when the + standard deviation of either list is zero. + """ num = covariance(lst1, lst2) denom = float(sdev(lst1) * sdev(lst2)) if denom != 0: @@ -395,13 +788,38 @@ def corr(lst1, lst2): return 1e1000 def slope(xarray,yarray): - """Uses numpy, in fact assumes that the list arguments are numpy arrays.""" + """Compute the ordinary least-squares regression slope. + + Uses the standard closed-form formula. Requires numpy arrays because + element-wise multiplication (``xarray * yarray``) and vectorized + ``sum`` are used. + + Args: + xarray: Numpy array of independent variable values. + yarray: Numpy array of dependent variable values; must be the same + length as ``xarray``. + + Returns: + The regression slope as a float. + """ n = float(len(xarray)) m = (n*sum(xarray*yarray)-sum(xarray)*sum(yarray))/(n*sum(xarray**2)-(sum(xarray))**2) return m def intercept(xarray,yarray): - """Uses numpy, in fact assumes that the list arguments are numpy arrays.""" + """Compute the ordinary least-squares regression intercept. + + Uses the standard closed-form formula given the slope. Requires numpy + arrays because vectorized ``sum`` is used. + + Args: + xarray: Numpy array of independent variable values. + yarray: Numpy array of dependent variable values; must be the same + length as ``xarray``. 
+ + Returns: + The regression intercept (y-axis) as a float. + """ m = slope(xarray,yarray) n = float(len(xarray)) b = (sum(yarray)-m*(sum(xarray)))/n @@ -412,9 +830,35 @@ def intercept(xarray,yarray): ############################### def flagBadDetectors(): + """Flag detectors with poor amplification characteristics. + + Not yet implemented. + """ pass def aggregateResults(data): + """Aggregate per-well RQ, N0, and dCt values into per-(sample, detector) summaries. + + Computes mean, median, and standard deviation of RQ, dCt, and N0 + for every (sample, detector) combination across all replicate wells. + Wells with ``"N/A"`` RQ are excluded from RQ and dCt summaries; N0 + is always summarised. + + Args: + data: List of well dicts containing ``sample``, ``detector``, + ``RQ``, ``N0``, and ``dCt`` keys, as returned by ``RQ``. + + Returns: + A nested dict ``{sample: {detector: stats_dict}}`` where + ``stats_dict`` contains the keys: ``medianRQ``, ``meanRQ``, + ``sdevRQ``, ``mediandCt``, ``meandCt``, ``sdevdCt``, + ``medianN0``, ``meanN0``, ``sdevN0``. Unavailable values are + represented as ``"N/A"``. + + Raises: + KeyError: If ``RQ`` values have not yet been computed on the data + (i.e., ``ddCt`` and ``RQ`` have not been called first). + """ try: data[0]['RQ'] except KeyError: @@ -478,6 +922,22 @@ def aggregateResults(data): return res def printDataFrameRQs(RQsummary,effs,outFile): + """Write a tab-delimited summary of RQ results to a file and to stdout. + + Outputs one row per (sample, detector) combination with columns: + Sample, Detector, meanEff, meanRQ, sdevRQ, medianRQ, meandCt, + mediandCt, sdevdCt, quant, ci.l, ci.u. + + The ``quant`` column is efficiency^(-mediandCt); ``ci.l`` and ``ci.u`` + are efficiency^(-(mediandCt +/- sdevdCt)), providing approximate + confidence intervals. + + Args: + RQsummary: Nested dict as returned by ``aggregateResults``. + effs: Dict ``{detector: {'meanEff': float, ...}}`` as returned by + ``summarizeEfficiencies``. 
+ outFile: Path to the output file to write. + """ #Open out Handle outHandle = open(outFile,'w') #Print header row @@ -496,16 +956,58 @@ def printDataFrameRQs(RQsummary,effs,outFile): #TODO:Create R Function to plot output from printDataFramRQs() def plotRQs(results): + """Plot relative quantification (RQ) values. + + Not yet implemented. + + Args: + results: Aggregated results dict as returned by ``aggregateResults``. + """ pass def plotEdCt(results): + """Plot efficiency-corrected delta-Ct (EdCt) values. + + Not yet implemented. + + Args: + results: Aggregated results dict as returned by ``aggregateResults``. + """ pass def doPlotting(plotScript = "qPCRPlotting.q"): + """Execute an external R plotting script as a subprocess. + + Args: + plotScript: Path to the R script to execute. Defaults to + ``"qPCRPlotting.q"``. + + Returns: + A tuple ``(status, output)`` as returned by + ``subprocess.getstatusoutput``. + """ return subprocess.getstatusoutput(plotScript) def makeDvsS(results,detectors,samples,value = "mediandCt"): + """Build a detector-by-sample matrix of a chosen summary statistic. + + Creates a 2-D numpy array indexed by detector (rows) and sample + (columns). Missing (sample, detector) combinations are filled with + ``nan``. + + Args: + results: Nested dict ``{sample: {detector: stats_dict}}`` as + returned by ``aggregateResults``. + detectors: Ordered list of detector names defining the row order. + samples: Ordered list of sample names defining the column order. + value: Key within the innermost stats dict to extract. Defaults to + ``"mediandCt"``. + + Returns: + A numpy float array of shape ``(len(detectors), len(samples))`` + containing the requested statistic for each cell. 
+ """ matrix = np.zeros((len(detectors),len(samples)),float) for d in range(0,len(detectors)): for s in range(0,len(samples)): @@ -520,6 +1022,17 @@ def makeDvsS(results,detectors,samples,value = "mediandCt"): ############################## def main(mainFile,cycleFile): + """Run the full qPCR analysis pipeline interactively. + + Parses results and cycle-data files using the raw ABI format parsers, + computes efficiencies, interactively asks the user to select an endogenous + control and reference sample, performs ddCt/RQ calculations, writes + ``output.txt``, and runs the external plotting script. + + Args: + mainFile: Path to the raw ABI tab-delimited results export file. + cycleFile: Path to the raw ABI cycle fluorescence export file. + """ #Parse mainFile print("Parsing Results File...") data = parseRawABI(mainFile) @@ -556,6 +1069,17 @@ def main(mainFile,cycleFile): return def test(): + """Run a manual integration test using hard-coded HeLa RIP data files. + + Parses ``'RIP HeLa clipped.txt'`` and ``'new_RIP_HeLa.txt'``, runs the + full ddCt/RQ pipeline with hard-coded endogenous control (``'hGAPDH'``) + and reference sample (``'IgG RIP'``), writes ``output.txt``, and + returns a detector-by-sample matrix of mediandCt values. + + Returns: + A numpy float array of shape ``(n_detectors, n_samples)`` containing + the mediandCt for each (detector, sample) combination. + """ cycleData = parseCycleData('RIP HeLa clipped.txt') cycleData = calculateEfficiencies(cycleData) effs = summarizeEfficiencies(cycleData) diff --git a/src/qpcr/util.py b/src/qpcr/util.py index 70bff2d..552cf53 100644 --- a/src/qpcr/util.py +++ b/src/qpcr/util.py @@ -1,11 +1,23 @@ ''' -Created on Sep 2, 2010 +Miscellaneous utility functions for the qpcr package. @author: lgoff ''' #Misc Tools and Utilities def uniqify(seq): + """Return a list of unique elements from a sequence. + + Deduplicates by inserting elements into a dict. 
The returned order is + not guaranteed to be the same as the input order (depends on dict + insertion-order behaviour of the Python version). + + Args: + seq: Any iterable of hashable elements. + + Returns: + A list containing each unique element from ``seq`` exactly once. + """ # Not order preserving keys = {} for e in seq: diff --git a/src/seqlib/Alignment.py b/src/seqlib/Alignment.py index 0640a86..57e87c9 100644 --- a/src/seqlib/Alignment.py +++ b/src/seqlib/Alignment.py @@ -1,18 +1,47 @@ -''' -Created on Jun 30, 2009 +"""Short RNA read alignment data structure. -@author: lgoff -''' +Provides the Alignment class for representing a single short-read alignment, +with methods for strand testing, BED output, and conversion to intervallib +Interval objects. + +Originally created on Jun 30, 2009. + +Author: lgoff +""" from . import misc from .intervallib import * class Alignment(object): - """ - Basic Alignment class for short RNA reads - Can be avoided directly in favor of aligner-specific implementations (ie. ShrimpRead and/or MAQRead) + """Basic alignment class for short RNA reads. + + Can be bypassed in favour of aligner-specific implementations such as + ShrimpRead or MAQRead. Supports score-based sorting (higher scores sort + first) and conversion to BED or Interval format. + + Attributes: + readname: Name/identifier of the aligned read. + chr: Chromosome name. + start: 0-based start coordinate. + end: End coordinate. + strand: Strand orientation ("+" or "-"). + score: Alignment score (float). + readsequence: DNA sequence of the read. + readcount: Integer read count (-1 if unset). """ def __init__(self,readname,chr,start,end,strand,score=0,readcount = -1,readsequence=''): + """Initialize an Alignment. + + Args: + readname: Name/identifier of the read. + chr: Chromosome name string. + start: Start coordinate (converted to int). + end: End coordinate (converted to int). + strand: Strand string ("+" or "-"). 
+ score: Alignment score (default 0, converted to float). + readcount: Read count integer (default -1). + readsequence: DNA sequence string of the read (default ""). + """ self.readname = str(readname) self.chr = chr self.start = int(start) @@ -23,34 +52,63 @@ def __init__(self,readname,chr,start,end,strand,score=0,readcount = -1,readseque self.readcount = readcount def __lt__(self, b): + """Compare by score in descending order (higher scores sort first).""" return self.score > b.score # reversed because original was -cmp(self.score, b.score) def __eq__(self, b): + """Return True if self and b have the same score.""" return self.score == b.score def __str__(self): + """Return a readname:chr:start:end string.""" return "%s:%s:%d:%d" % (self.readname,self.chr,self.start,self.end) def __repr__(self): + """Return a readname:chr:start:end string.""" return "%s:%s:%d:%d" % (self.readname,self.chr,self.start,self.end) def __len__(self): + """Return the length of the alignment in bases (end - start + 1).""" return self.end-self.start+1 def isPlus(self): + """Return True if the alignment is on the "+" strand. + + Returns: + True if self.strand == "+", otherwise False. + """ if self.strand=="+": return True else: return False def isMinus(self): + """Return True if the alignment is on the "-" strand. + + Returns: + True if self.strand == "-", otherwise False. + """ if self.strand=="-": return True else: return False def toInterval(self): + """Convert this alignment to an intervallib.Interval. + + Returns: + An Interval with the same coordinates, score, readcount, and + readname as this alignment. + """ return Interval(self.chr,self.start,self.end,self.strand,self.score,self.readcount,name=self.readname) def toBed(self): + """Return a BED-formatted string for this alignment. + + The name field is encoded using misc.seq2nuID applied to the read + sequence. + + Returns: + Tab-delimited BED line string with a trailing newline. 
+ """ return ("%s\t%d\t%d\t%s\t%d\t%s\n" % (self.chr,self.start,self.end,misc.seq2nuID(self.readsequence),self.readcount,self.strand)) diff --git a/src/seqlib/Chip.py b/src/seqlib/Chip.py index fcf4863..5374845 100644 --- a/src/seqlib/Chip.py +++ b/src/seqlib/Chip.py @@ -1,6 +1,12 @@ ''' +Tools for working with NimbleGen ChIP-chip tiling array data. + +Provides an interval class with tiling-array-specific methods, parsers for +NimbleGen GFF output files, interval-merging utilities, and statistical +helpers for identifying enriched regions via permutation-based p-value +estimation — following the approach of Guttman et al. + Created on Jul 6, 2009 -This module will attempt to deal with the nimblegen array data in a similar mechanism to that achieved by Guttman et al. @author: lgoff ''' @@ -19,39 +25,103 @@ class ChipInterval(Interval): - """Extends basic Interval class with Tiling array methods and attributes""" + """Genomic interval extended with tiling-array probe-hierarchy support. + + Extends the basic Interval class with parent/child relationships so + that individual NimbleGen probes (children) can be grouped under a + merged enriched region (parent), and provides methods for computing + coverage maps and plots from the probe scores. + + Attributes: + parents: List of ChipInterval objects that contain this interval. + children: List of ChipInterval objects contained within this + interval (e.g. individual probes belonging to an enriched + region). + """ def __init__(self, chr, start, end, strand="*", score=0.0, readcount = -1,name="",sequence = "",data={}): + """Initialise a ChipInterval. + + Args: + chr: Reference sequence name / chromosome. + start: Start coordinate of the interval. + end: End coordinate of the interval. + strand: Strand indicator; defaults to '*' (unstranded). + score: Probe or enrichment score; defaults to 0.0. + readcount: Number of reads/probes; defaults to -1 (unset). + name: Optional label for the interval; defaults to ''. 
+ sequence: Optional genomic sequence; defaults to ''. + data: Optional dict of additional attributes; defaults to {}. + """ Interval.__init__(self, chr, start, end, strand=strand, score=score, readcount = readcount,name=name,sequence = sequence,data=data) self.parents = [] self.children = [] def addChild(self, child): - """Adds child node to self.children""" + """Add a child interval to this interval's children list. + + The child is only added if it is not already present. A back- + reference from the child to this interval is added to + ``child.parents``. + + Args: + child: A ChipInterval to add as a child of this interval. + """ #assert child not in self.children if child not in self.children: child.parents.append(self) self.children.append(child) def removeChild(self, child): - """Removes child node from self.children (not sure how or if this works. Don't trust it yet)""" + """Remove a child interval from this interval's children list. + + Also removes the corresponding back-reference from ``child.parents``. + The correctness of this method has not been fully verified. + + Args: + child: The ChipInterval to remove from ``self.children``. + """ child.parents.remove(self) self.children.remove(child) def childScores(self): - """Returns list of scores for each interval in self.children""" + """Return the score attribute of each child interval. + + Returns: + A list of score values, one per element in ``self.children``, + in the same order as ``self.children``. + """ return [x.score for x in self.children] def childAvg(self): - """Empty""" + """Placeholder for computing the average score across child intervals. + + Not yet implemented. + """ pass def childMedian(self): - """Empty""" + """Placeholder for computing the median score across child intervals. + + Not yet implemented. + """ pass def makeValMap(self,value = 'readcount'): - """Check these two to see which one is right...""" + """Build a per-base value map by averaging child interval attributes. 
+ + Creates ``self.valMap``, a numpy array of length ``len(self)`` + initialised to -1. For each base position covered by at least one + child interval the stored value is the mean of the specified + attribute across all children that cover that base. + + Note: An alternative implementation exists in a commented-out block + in the source; both approaches are noted as unverified. + + Args: + value: Name of the attribute on each child ChipInterval whose + values are averaged. Defaults to ``'readcount'``. + """ self.valMap = np.zeros(len(self)) self.valMap = self.valMap-1 myTmp = [] @@ -96,7 +166,13 @@ def makeValMap(self): """ def plotVals(self): - """Creates a line plot (via rpy2) across all bases within interval of the scores from self.valMap for the given base""" + """Plot probe scores across this interval using rpy2. + + Opens an X11 window and draws a step-style line plot. Each child + probe is drawn as a horizontal segment at its score level spanning + its start to end coordinates. If ``self.valMap`` has not yet been + computed, ``makeValMap`` is called automatically. + """ if 'valMap' not in self.__dict__: self.makeValMap() robjects.r.x11() @@ -106,7 +182,10 @@ def plotVals(self): robjects.r.lines((x.start,x.end),(x.score,x.score),lwd=2) def plot(self): - """Convenience wrapper for self.plotVals""" + """Convenience wrapper that calls plotVals to display the interval. + + Equivalent to calling ``self.plotVals()`` directly. + """ self.plotVals() # def uniqifySig(self): @@ -116,6 +195,25 @@ def plot(self): # self.significant = keys.keys() def scan(self,permuted,windowSize,threshold): + """Scan child probes with a sliding window to identify significant regions. + + Sorts ``self.children`` in place and slides a window of + ``windowSize`` probes across them. For each window, computes the + mean probe score and compares it against a pre-computed permutation + distribution to obtain an empirical p-value. 
Probes in windows + whose p-value is at or below ``threshold`` are added to + ``self.significant``. + + Args: + permuted: A dict keyed by window size whose values are numpy + arrays of maximum-window-mean values from permuted data (as + produced by ``getRandomDist``). The key ``windowSize`` must + be present. + windowSize: Number of consecutive probes in each sliding window. + threshold: Maximum empirical p-value (proportion of permuted + values >= observed mean) for a window to be considered + significant. + """ self.children.sort() if 'significant' not in self.__dict__: self.significant = [] @@ -132,8 +230,27 @@ def scan(self,permuted,windowSize,threshold): #This should be deleted... class ChipData(object): - """Container for one array's worth of NimbleGen data""" + """Container for one NimbleGen array's worth of probe data. + + Deprecated — this class is marked for deletion in the source. + + Parses a NimbleGen GFF file on construction and organises the resulting + ChipInterval probe objects by chromosome. + + Attributes: + fname: Path to the NimbleGen GFF file that was parsed. + sampleName: Human-readable label for this sample. + probeData: Dict mapping chromosome name to a list of ChipInterval + objects for probes on that chromosome. + """ + def __init__(self, fname, sampleName): + """Initialise a ChipData container by parsing a NimbleGen GFF file. + + Args: + fname: Path to the NimbleGen GFF output file to parse. + sampleName: Label for this array sample. + """ self.fname = fname self.sampleName = sampleName self.probeData = {} @@ -146,19 +263,45 @@ def __init__(self, fname, sampleName): self.probeData[ci.chr].append(ci) def sort(self): - """Sorts all chromosomes seperately and in place""" + """Sort probe lists for all chromosomes in place. + + Iterates over ``self.data`` (note: the attribute populated on + construction is ``self.probeData``; this method references + ``self.data`` which may not exist). 
+ """ for k in self.data.keys(): self.data[k].sort() def shuffle(self,chr): - """This doesn't work yet""" + """Shuffle probe scores for a chromosome in place. + + Note: This method is not yet correctly implemented — ``random.shuffle`` + operates on the temporary ``vals`` list and does not modify + ``self.probeData``. + + Args: + chr: Chromosome key to look up in ``self.probeData``. + + Returns: + None (``random.shuffle`` always returns None). + """ vals = [x.score for x in self.probeData[chr]] return random.shuffle(vals) #End crap def nimblegenIter(fname): - """Returns a generator of ChipInterval objects from a nimblegen .GFF output file""" + """Yield ChipInterval objects parsed from a NimbleGen GFF output file. + + Skips comment lines (starting with '#') and extracts chromosome, + start, end, score, and probe name from each data row. + + Args: + fname: Path to a NimbleGen GFF file. + + Yields: + ChipInterval objects, one per non-comment line in the file. + """ handle = open(fname,'r') for line in handle: if line.startswith("#"): continue @@ -167,6 +310,18 @@ def nimblegenIter(fname): yield ChipInterval(tokens[0],tokens[3],tokens[4],score=tokens[5],name=pname) def parseNimblegen(fname): + """Parse an entire NimbleGen GFF file into a list of ChipInterval objects. + + Convenience wrapper around ``nimblegenIter`` that collects all intervals + into a list rather than lazily yielding them. + + Args: + fname: Path to a NimbleGen GFF file. + + Returns: + A list of ChipInterval objects, one per non-comment line in the + file. + """ iter = nimblegenIter(fname) rtrn = [] for i in iter: @@ -174,8 +329,30 @@ def parseNimblegen(fname): return rtrn def joinNimblegenIntervals(intervals,start='start',end='end',offset=1000): - """ - Returns a list of independent transcription units overlaping by offset + """Merge overlapping NimbleGen probe intervals into enriched regions. 
+ + Sorts the probe list and iterates through it, merging any probes that + intersect (with optional extension by ``offset``) into a single + ChipInterval. Each merged interval stores its constituent probes as + children and resets its name and score. + + Returns the input list unchanged if it is empty. + + Args: + intervals: A list of ChipInterval objects (typically from + ``parseNimblegen``). The list is sorted in place. + start: Attribute name used as the start coordinate when testing + for intersection. Defaults to ``'start'``. + end: Attribute name used as the end coordinate when testing for + intersection. Defaults to ``'end'``. + offset: Number of bases by which each interval is extended before + testing for overlap, effectively merging probes within this + distance. Defaults to 1000. + + Returns: + A list of merged ChipInterval objects representing independent + enriched regions, each with a ``children`` list of the constituent + probe intervals. """ if not intervals: return intervals @@ -202,12 +379,37 @@ def joinNimblegenIntervals(intervals,start='start',end='end',offset=1000): return non_overlapping def probeScores(probes): - """Returns list of scores across all a list of probes""" + """Extract scores from a list of probe intervals into a numpy array. + + Args: + probes: A list of ChipInterval (or any object with a ``score`` + attribute) objects. + + Returns: + A numpy array of dtype float32 containing the score of each probe + in the same order as the input list. + """ return np.array([x.score for x in probes],dtype='f') def getRandomDist(probes,nRandom,windowSize): - """Returns a numpy array of length 'nRandom' corresponding to the max values of sliding windows of size 'windowSize' - from shuffled probe data. + """Build an empirical null distribution of maximum sliding-window means. 
+ + Repeatedly shuffles the probe score array in place, slides a window of + ``windowSize`` across it, records the maximum window mean for each + shuffle, and returns all maxima as a numpy array. This distribution is + used to compute empirical p-values for observed window means. + + Args: + probes: A numpy array (or list) of numeric probe scores. The array + is shuffled in place during this function — pass a copy if the + original order must be preserved. + nRandom: Number of shuffle iterations (i.e. length of the returned + distribution array). + windowSize: Number of consecutive probes in each sliding window. + + Returns: + A numpy array of dtype float32 with ``nRandom`` elements, where each + element is the maximum window mean observed in one shuffle iteration. """ sys.stderr.write("Getting %d Max value distributions from windows of size %d:\n" % (nRandom,windowSize)) #scores = probeScores(probes) @@ -226,11 +428,33 @@ def getRandomDist(probes,nRandom,windowSize): return maxVals def calcPVals(segScores,permuted,windowSize): - """This does not work yet""" + """Count permuted values at least as extreme as the observed score. + + Note: This function is not yet correctly implemented. + + Args: + segScores: The observed test statistic (scalar or array) to compare + against the permuted distribution. + permuted: A numpy array of values from the null distribution (e.g. + as returned by ``getRandomDist``). + windowSize: Window size used to generate the distribution (not + currently used in the comparison). + + Returns: + The number of elements in ``permuted`` that are >= ``segScores``. + """ return len(permuted[permuted>=segScores]) def main(): + """Run the default ChIP-chip analysis pipeline. + + Discovers all ``.gff`` files in the current directory, loads and + normalises them via ``continuousData.SimpleChIPData``, merges adjacent + probes, and generates permutation-based null distributions for a set of + predefined window sizes (5, 7, 9, 11 probes). 
The resulting + distributions are stored in ``permuted`` keyed by window size. + """ files = glob.glob("*.gff") data = continuousData.SimpleChIPData(files) data.normalize() diff --git a/src/seqlib/GTFlib.py b/src/seqlib/GTFlib.py index 9c27dcb..690e34b 100644 --- a/src/seqlib/GTFlib.py +++ b/src/seqlib/GTFlib.py @@ -1,11 +1,16 @@ -''' -Created on Aug 31, 2010 +"""Parsing and data structures for GTF (Gene Transfer Format) files. -All of this is very fragile and is -absolutely dependent on a unique geneId and unique transcriptId for any records... +All of this is very fragile and is absolutely dependent on a unique geneId +and unique transcriptId for any records. -@author: lgoff -''' +Provides GTF_Entry, GTFTranscriptContainer, and GTFGeneContainer classes for +holding GTF data, along with iterator functions for streaming over transcripts +and genes, and utility functions for building attribute dictionaries and tables. + +Originally created on Aug 31, 2010. + +Author: lgoff +""" ########### #Imports ########### @@ -20,8 +25,13 @@ #Error Handling ####################### class Error(Exception): - """Base class for exceptions in this module.""" + """Base class for exceptions in this module. + + Provides a message property with getter/setter so subclasses can store + a human-readable error description. + """ def __str__(self): + """Return the string representation of the error message.""" return str(self.message) def _get_message(self, message): return self._message def _set_message(self, message): self._message = message @@ -42,14 +52,22 @@ def __init__(self, message): ######################### class GTF_Entry: - ''' - Holds a row's worth of GTF information. - ''' + """Holds a single row's worth of GTF/GFF information. + + Attributes: + contig: Sequence name / chromosome. + source: Annotation source name. + feature: Feature type (e.g. "exon", "CDS", "transcript"). + frame: Reading frame (".","0","1","2"). + start: 1-based start coordinate (integer). 
+ end: 1-based end coordinate (integer). + score: Score value (float or "."). + strand: Strand ("+" or "-" or "."). + attributes: Dictionary of parsed key-value attribute pairs. + """ def __init__(self): - ''' - Constructor - ''' + """Construct a GTF_Entry with default empty/sentinel field values.""" self.contig = "." self.source = "." self.feature = "." @@ -61,15 +79,23 @@ def __init__(self): self.attributes = {} def __lt__(self, b): + """Compare GTF entries by midpoint coordinate.""" return (self.start + self.end) // 2 < (b.start + b.end) // 2 def __eq__(self, b): + """Return True if two GTF entries share the same midpoint coordinate.""" return (self.start + self.end) // 2 == (b.start + b.end) // 2 def __repr__(self): + """Return a transcript_id:feature string representation.""" return self.attributes['transcript_id']+":"+self.feature def addGTF_Entry(self,gtf_entry): + """Copy all fields from another GTF_Entry into self. + + Args: + gtf_entry: A GTF_Entry instance whose fields will be copied. + """ self.contig = gtf_entry.contig self.source = gtf_entry.source self.feature = gtf_entry.feature @@ -133,6 +159,14 @@ def parseInfo(self,myAttributes,line ): self.attributes[n] = v def toGTF(self): + """Serialize this entry back to a GTF-formatted string. + + Writes gene_id and transcript_id first (as required by the GTF spec), + then all remaining attributes in arbitrary order. + + Returns: + A GTF-formatted line string ending with a newline. + """ tmp = '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t' % (self.contig,self.source,self.feature,self.start,self.end,str(self.score),self.strand,self.frame) #Print 'gene_id' and 'transcript_id' as first and second attributes (required by GTF spec.) for attr in ['gene_id','transcript_id']: @@ -151,10 +185,20 @@ def toGTF(self): #GTFTranscriptContainer ############ class GTFTranscriptContainer(object): + """Container grouping all GTF_Entry instances sharing a transcript_id. 
+ + Attributes: + features: List of GTF_Entry objects belonging to this transcript. + start: Minimum start coordinate across all features. + end: Maximum end coordinate across all features. + contig: Chromosome/contig name. + strand: Strand orientation. + transcriptId: transcript_id attribute value. + geneId: gene_id attribute value. + """ + def __init__(self): - ''' - Constructor - ''' + """Construct an empty GTFTranscriptContainer with sentinel values.""" self.features = [] self.start = -1 self.end = -1 @@ -164,15 +208,30 @@ def __init__(self): self.geneId = '' def __len__(self): + """Return the genomic span of the transcript (end - start + 1).""" return self.end-self.start+1 def __lt__(self, b): + """Compare transcript containers by midpoint coordinate.""" return (self.start + self.end) // 2 < (b.start + b.end) // 2 def __eq__(self, b): + """Return True if two transcript containers share the same midpoint.""" return (self.start + self.end) // 2 == (b.start + b.end) // 2 def addFeature(self,gtf_entry): + """Add a GTF_Entry to this transcript container. + + Initialises contig, strand, and transcriptId from the first feature + added. Asserts that subsequent features share the same transcript_id. + Updates self.start and self.end to span all features. + + Args: + gtf_entry: A GTF_Entry instance to add. + + Raises: + AssertionError: If gtf_entry has a different transcript_id. + """ if self.transcriptId == '': self.contig = gtf_entry.contig self.strand = gtf_entry.strand @@ -184,10 +243,23 @@ def addFeature(self,gtf_entry): self.update() def update(self): + """Recompute self.start and self.end from the current feature list.""" self.start = min([x.start for x in self.features]) self.end = max([x.end for x in self.features]) def toSplicedInterval(self): + """Convert this transcript container to a SplicedInterval. + + Extracts exon features, sorts them by exon_number, and constructs a + SplicedInterval using their lengths and offsets. 
+ + Returns: + A SplicedInterval representing the spliced transcript. + + Raises: + ValueError: If more than one distinct transcript_id is found + in the feature list. + """ transcripts = uniqify([x.attributes['transcript_id'] for x in self.features]) if len(transcripts) > 1: raise ValueError ("Something is wrong, there are too many different transcript_ids") @@ -204,18 +276,27 @@ def toSplicedInterval(self): ############ class GTFGeneContainer(object): - ''' - Container for all GTF_Entry instances with a common geneId + """Container for all GTF_Entry instances sharing a common gene_id. + Assumptions: - - gene_id field is unique to a gene locus (ie. not shared amongst gene duplicates - - There is no guarantee that the order of rows is preserved during reading in and returning GTF + - The gene_id field is unique to a gene locus (not shared among + gene duplicates). + - There is no guarantee that the row order is preserved during + reading or when returning GTF output. - ''' + Attributes: + features: List of GTF_Entry objects for this gene. + transcripts: List of GTFTranscriptContainer objects for this gene. + start: Minimum start coordinate across all features/transcripts. + end: Maximum end coordinate across all features/transcripts. + contig: Chromosome/contig name. + strand: Strand orientation. + geneId: gene_id attribute value. + sequence: DNA sequence string (empty by default). 
+ """ def __init__(self): - ''' - Constructor - ''' + """Construct an empty GTFGeneContainer with sentinel values.""" self.features = [] self.transcripts = [] self.start = -1 @@ -226,15 +307,30 @@ def __init__(self): self.sequence = '' def __len__(self): + """Return the genomic span of the gene (end - start + 1).""" return self.end-self.start+1 def __lt__(self, b): + """Compare gene containers by midpoint coordinate.""" return (self.start + self.end) // 2 < (b.start + b.end) // 2 def __eq__(self, b): + """Return True if two gene containers share the same midpoint.""" return (self.start + self.end) // 2 == (b.start + b.end) // 2 def addFeature(self,gtf_entry): + """Add a GTF_Entry feature to this gene container. + + Initialises contig, strand, and geneId from the first feature added. + Asserts that subsequent features share the same gene_id. Updates + self.start and self.end. + + Args: + gtf_entry: A GTF_Entry instance to add. + + Raises: + AssertionError: If gtf_entry has a different gene_id. + """ if self.geneId == '': self.contig = gtf_entry.contig self.strand = gtf_entry.strand @@ -244,6 +340,18 @@ def addFeature(self,gtf_entry): self.update() def addGTFTranscript(self,gtf_transcript): + """Add a GTFTranscriptContainer to this gene container. + + Initialises contig, strand, and geneId from the first transcript added. + Asserts that subsequent transcripts share the same geneId, contig, and + strand. Updates self.start and self.end via transcriptUpdate(). + + Args: + gtf_transcript: A GTFTranscriptContainer instance to add. + + Raises: + AssertionError: If geneId, contig, or strand do not match. 
+ """ if self.geneId == '': self.contig = gtf_transcript.contig self.strand = gtf_transcript.strand @@ -253,21 +361,34 @@ def addGTFTranscript(self,gtf_transcript): self.transcriptUpdate() def update(self): + """Recompute self.start and self.end from the current features list.""" self.start = min([x.start for x in self.features]) self.end = max([x.end for x in self.features]) def transcriptUpdate(self): + """Recompute self.start and self.end from the transcripts list.""" self.start = min([x.start for x in self.transcripts]) self.end = max([x.end for x in self.transcripts]) def propogateLincName(self,lincName): + """Set the linc_name attribute on all features, and gene_name if absent. + + Args: + lincName: The lincRNA name string to propagate to all features. + """ for feat in self.features: feat.attributes['linc_name'] = lincName if 'gene_name' not in feat.attributes: feat.attributes['gene_name'] = lincName def addAttribute(self,key,value): + """Add or overwrite an attribute key-value pair on all features. + + Args: + key: Attribute name string. + value: Attribute value to assign. + """ for feat in self.features: feat.attributes[key] = value @@ -277,15 +398,27 @@ def geneToBed(self): return "%s\t%d\t%d\t%s\t0\t%s\t%s\t%s" % (self.contig,self.start,self.end,self.attributes['transcript_id'],self.strand,",".join(self.exonLengths),",".join(self.exonOffsets)) def transcriptsToBed(self): + """Placeholder for BED output of transcripts (not yet implemented).""" pass def getGTF(self): + """Return a GTF string containing all features of this gene. + + Returns: + Multi-line string of GTF-formatted rows for every feature. + """ tmp = '' for feat in self.features: tmp += feat.toGTF() return tmp def toInterval(self): + """Convert this gene to an Interval spanning its genomic footprint. + + Returns: + An Interval with 0-based start (start-1), end, strand, and the + gene_id as its name. 
+ """ return intervallib.Interval(self.contig,self.start-1,self.end,self.strand,name=self.geneId) # def fetchSequence(self,genome='hg19',connection=None): @@ -303,6 +436,17 @@ def toInterval(self): #lineIterator ############# def lineIterator(gtfHandle): + """Yield GTF_Entry objects for every non-comment line in gtfHandle. + + Skips lines starting with "#". Parses each remaining line into a + GTF_Entry via GTF_Entry.read(). + + Args: + gtfHandle: An open file handle to a GTF file. + + Yields: + GTF_Entry objects, one per data line. + """ while True: line = gtfHandle.readline() if not line: return @@ -312,6 +456,18 @@ def lineIterator(gtfHandle): yield gtf_entry def GTFGeneIterator(gtfFile,verbose = False): + """Iterate over genes in a GTF file, yielding one GTFGeneContainer per gene. + + Groups all GTF_Entry rows by gene_id and yields a fully-populated + GTFGeneContainer for each unique gene_id found. + + Args: + gtfFile: Path to the GTF file. + verbose: If True, write progress messages to stderr (default False). + + Yields: + GTFGeneContainer objects, one per unique gene_id. + """ handle = open(gtfFile,'r') iter = lineIterator(handle) res = {} @@ -324,6 +480,18 @@ def GTFGeneIterator(gtfFile,verbose = False): yield res[k] def GTFGeneIterator2(gtfFile,verbose=False): + """Iterate over genes by grouping transcripts, yielding one GTFGeneContainer per gene. + + An alternative to GTFGeneIterator that builds genes from + GTFTranscriptContainer objects rather than raw GTF_Entry rows. + + Args: + gtfFile: Path to the GTF file. + verbose: If True, write progress messages to stderr (default False). + + Yields: + GTFGeneContainer objects, one per unique gene_id. + """ iter = GTFTranscriptIterator(gtfFile,verbose=verbose) res = {} for i in iter: @@ -333,6 +501,18 @@ def GTFGeneIterator2(gtfFile,verbose=False): yield res[k] def GTFTranscriptIterator(gtfFile,verbose = False): + """Iterate over transcripts in a GTF file, yielding one GTFTranscriptContainer per transcript. 
+ + Groups all GTF_Entry rows by transcript_id and yields a fully-populated + GTFTranscriptContainer for each unique transcript_id found. + + Args: + gtfFile: Path to the GTF file. + verbose: If True, write progress messages to stderr (default False). + + Yields: + GTFTranscriptContainer objects, one per unique transcript_id. + """ handle = open(gtfFile,'r') iter = lineIterator(handle) res = {} @@ -394,12 +574,15 @@ def GTFAttributeTable(gtfFile,outfile,idField='gene_id'): return def test(): - """ -from RNASeq import GTFlib -fname = 'linc_catalog.gtf' -iter = GTFlib.GTFGeneIterator(fname) -for i in iter: - print i.getGTF(), + """Placeholder test function. No-op. + + Example usage (Python 2 style, for reference):: + + from RNASeq import GTFlib + fname = 'linc_catalog.gtf' + iter = GTFlib.GTFGeneIterator(fname) + for i in iter: + print i.getGTF(), """ pass diff --git a/src/seqlib/JensenShannon.py b/src/seqlib/JensenShannon.py index f6bf249..2ca643a 100644 --- a/src/seqlib/JensenShannon.py +++ b/src/seqlib/JensenShannon.py @@ -1,10 +1,19 @@ #!/usr/bin/env python -""" -JensenShannon.py +"""Jensen-Shannon divergence utilities for comparing probability distributions. + +Provides functions to compute the Jensen-Shannon (JS) divergence between pairs +of discrete probability distributions and to construct pairwise JS divergence +matrices from a collection of distributions. The JS divergence is a +symmetrised, smoothed version of the Kullback-Leibler (KL) divergence and is +defined as:: + + JS(A || B) = 0.5 * KL(A || M) + 0.5 * KL(B || M), where M = (A + B) / 2 -Created by Loyal Goff on Nov 10, 2010. -Copyright (c) 2010 +Because it is bounded in [0, ln 2] (or [0, 1] in bits), its square root is a +proper metric known as the Jensen-Shannon distance. + +Originally created by Loyal Goff on Nov 10, 2010. """ import rpy2.robjects as r @@ -15,6 +24,26 @@ #efficnent js_div def js_div_matrix(a): + """Compute a pairwise Jensen-Shannon divergence matrix efficiently. 
+
+    For each pair of rows ``i`` and ``j`` in ``a``, computes::
+
+        JS(i, j) = H(M) - 0.5*(H(i) + H(j))
+
+    where ``M = (a[i] + a[j]) / 2`` and ``H`` denotes Shannon entropy.
+    The implementation vectorises the pairwise computation row-by-row,
+    so only the O(n) outer loop over rows runs in Python code.
+
+    Args:
+        a: A 2-D array-like of shape ``(n, d)`` where each row is a
+            probability distribution over ``d`` categories (rows should
+            sum to 1 for the result to be a true JS divergence).
+
+    Returns:
+        A symmetric ``(n, n)`` NumPy array ``W`` where ``W[i, j]`` is the
+        Jensen-Shannon divergence between rows ``i`` and ``j`` of ``a``.
+        Diagonal entries are 0.
+    """
     a=array(a)
     W=zeros((a.shape[0],a.shape[0]))
     e=-entropy(a.transpose())
@@ -27,6 +56,19 @@ def js_div_matrix(a):
     return W
 
 def make_probs(a):
+    """Normalise each row of a 2-D array to sum to 1.
+
+    Divides each row of ``a`` by its sum, converting raw counts or
+    unnormalised weights into proper probability distributions.
+
+    Args:
+        a: A 2-D NumPy array of shape ``(n, d)`` with non-negative
+            entries. Each row must have a positive sum.
+
+    Returns:
+        A 2-D NumPy array of the same shape as ``a`` where each row
+        sums to 1.0.
+    """
     sums = sum(a,1)
     res = zeros(a.shape)
     for i in range(a.shape[0]):
@@ -34,13 +76,56 @@ def make_probs(a):
     return res
 
 def js_div(A,B):
+    """Compute the Jensen-Shannon divergence between two distributions.
+
+    The JS divergence is defined as::
+
+        JS(A || B) = 0.5 * KL(A || M) + 0.5 * KL(B || M)
+
+    where ``M = (A + B) / 2`` is the mixture distribution and
+    ``KL(P || Q) = sum(P * log(P / Q))``. The result is symmetric and
+    always non-negative.
+
+    Args:
+        A: A 1-D array-like representing the first probability
+            distribution. All entries should be positive for a
+            well-defined result.
+        B: A 1-D array-like representing the second probability
+            distribution of the same length as ``A``.
+ + Returns: + A scalar float equal to the Jensen-Shannon divergence between + ``A`` and ``B``. + """ half=(A+B)/2 return 0.5*kl_div(A,half)+0.5*kl_div(B,half) def kl_div(A,B): + """Compute the Kullback-Leibler divergence of distribution A from B. + + Calculates the KL divergence using the formula:: + + KL(A || B) = sum(A * log(A / B)) + + where the sum is taken element-wise. The result is non-negative and + equals zero only when ``A`` and ``B`` are identical. Note that the + KL divergence is not symmetric: ``kl_div(A, B) != kl_div(B, A)`` + in general. + + Args: + A: A 1-D array-like representing the first (reference) + probability distribution. All entries should be positive. + B: A 1-D array-like representing the second probability + distribution of the same length as ``A``. All entries + should be positive to avoid division by zero. + + Returns: + A scalar float equal to the KL divergence KL(A || B). + """ return sum(multiply(A,log(A/B))) def main(): + """Entry point placeholder; no operation is performed.""" pass if __name__ == "__main__": diff --git a/src/seqlib/LSFlib.py b/src/seqlib/LSFlib.py index 5fc684d..6b86d74 100644 --- a/src/seqlib/LSFlib.py +++ b/src/seqlib/LSFlib.py @@ -1,8 +1,13 @@ -''' -Created on Jun 29, 2011 +"""Utilities for submitting and monitoring jobs on an IBM Platform LSF cluster. -@author: lgoff -''' +Provides the LSFJob class for constructing, submitting, polling, killing, and +waiting on LSF batch jobs via the bsub/bjobs/bkill command-line tools. Also +supports a 'local' pseudo-queue for running commands directly on the current +host without LSF. + +Designed for use with Harvard's Odyssey LSF cluster but applicable to any +Platform LSF installation. +""" import os import re import subprocess @@ -19,26 +24,73 @@ #Error Handling ####################### class LSFError(Exception): - """Base class for exceptions in this module.""" + """Exception raised for LSF-related errors. 
+ + Attributes: + value: String or object describing the error condition. + """ def __init__(self,value): + """Initialises an LSFError with an error value. + + Args: + value: A string or object describing the LSF error. + """ self.value = value + def __str__(self): + """Returns a string representation of the error value.""" return repr(self.value) ################# #Base Class ################# class LSFJob(object): - ''' - LSF Job - ''' - + """Represents a single LSF batch job with lifecycle management. + + Constructs the bsub command string, submits the job to LSF (or runs it + locally), and provides methods to poll job status, wait for completion, + and kill the job. + + Attributes: + cmd_str: The shell command to execute. + queue: LSF queue name (or 'local' for local execution). + outfile: Path to the stdout capture file. + errfile: Path to the stderr capture file. + job_name: Optional LSF job name. + group: Optional LSF job group. + job_mem: Memory requirement in GB (capped at lsf_mem global). + submit_flag: True after the job has been submitted. + complete: True after the job has finished. + status: Current job status string (e.g. 'PEND', 'RUN', 'DONE'). + jobID: LSF job ID integer (-999 before submission). + submit_time: Submission timestamp from bjobs. + exec_host: Host on which the job is/was running. + submit_host: Host from which the job was submitted. + bsub_str: List of tokens forming the complete bsub command. + """ def __init__(self,cmd_str,job_name=None,job_group=None,blocking=False,outfilename=None,errfilename=None,queue_name=None,job_mem=None,job_cores=1,notify=None): - ''' - Creates instance of LSFJob - #Don't use blocking because this is a limiting resource on Odyssey LSF - ''' + """Creates an LSFJob instance and constructs the bsub command. + + Args: + cmd_str: The shell command string to submit as an LSF job. + job_name: Optional LSF job name passed to bsub -J. + job_group: Optional LSF job group passed to bsub -g. 
+ blocking: If True, add -K flag to bsub to block until job + completes. Avoid on Odyssey LSF (limiting resource). + outfilename: Path for stdout redirection. If None, a temporary + file in 'tmp/' is created. + errfilename: Path for stderr redirection. If None, a temporary + file in 'tmp/' is created. + queue_name: LSF queue name. Defaults to lsf_default_queue. + Use 'local' to run without LSF. + job_mem: Memory requirement in GB. Capped at the module-level + lsf_mem constant. + job_cores: Number of cores requested (stored but not currently + used in the bsub command). + notify: If truthy, add -N flag to bsub to send email notification + on job completion. + """ self.cmd_str = cmd_str global lsf_default_queue @@ -108,12 +160,26 @@ def __init__(self,cmd_str,job_name=None,job_group=None,blocking=False,outfilenam self.bsub_str.insert(0,self.cmd_str) def __repr__(self): + """Returns a verbose string representation including all attributes.""" return "Instance of class LSF Job:\n\t%s\n\tSubmitted: %s\n\t Complete: %s\n" % (self.cmd_str,self.submit_flag,self.complete) + str(self.__dict__) def __str__(self): + """Returns the complete bsub command as a space-joined string.""" return " ".join(self.bsub_str) def submit(self): # wait pend + """Submits the job to LSF (or runs it locally) and waits for it to enter a stable state. + + For LSF jobs, uses subprocess.Popen to call bsub, retrieves the job ID, + and polls until the status transitions out of 'SUBMITTED'. For local + jobs, launches the process and returns immediately. + + Returns: + 0 on successful submission (or 0 for local job launch). + + Raises: + LSFError: If the bsub command returns a non-zero exit code. + """ if self.submit_flag == True: print("Job already submitted", file=sys.stderr) return 0# what do you return here? @@ -205,6 +271,12 @@ def poll(self): raise LSFError("Problem with bjobs polling. Error %s" % tmp_err) def getJobId(self): + """Parses the LSF job ID from the bsub submission output. 
+
+        Extracts the integer job ID from the '<jobID>' pattern in
+        self.submit_status and stores it in self.jobID. Prints a message to
+        stdout if the job has not been submitted yet.
+        """
         if self.submit_flag:
             jobID_search = re.search(r"\<[0-9]+\>",self.submit_status)
             self.jobID = int(jobID_search.group().strip("><"))
@@ -214,6 +286,12 @@ def getJobId(self):
         return
 
     def kill(self):
+        """Kills the LSF job using bkill.
+
+        Does nothing if the job has not been submitted or has no valid job ID.
+        Loops until bkill returns 0, retrying if necessary. On success, resets
+        status to 'NOT SUBMITTED' and clears submit_flag and complete.
+        """
         #Added this to fix cases were kill fails because there is no job id
         if self.status in ['NOT SUBMITTED'] or self.jobID== -999 :
             self.status = 'NOT SUBMITTED'
@@ -231,6 +309,13 @@ def kill(self):
         return
 
     def wait(self):
+        """Blocks until the LSF job reaches a terminal state.
+
+        Polls the job status every 30 seconds until status is no longer
+        'SUBMITTED', 'PEND', 'RUN', or 'SUSP'. Prints a warning to stderr
+        if the job is suspended. Sets status to 'DONE' and complete to True
+        on exit.
+        """
         self.poll()
         if not self.submit_flag:
             print("Job not yet submitted")
@@ -249,6 +334,18 @@ def wait(self):
 #Helper functions
 ##############
 def tmp_name(prefix):
+    """Generates a unique temporary file path inside a local 'tmp/' directory.
+
+    Creates the 'tmp/' directory in the current working directory if it does
+    not already exist, then returns a path of the form
+    'tmp/<prefix><unique suffix>'.
+
+    Args:
+        prefix: String prefix for the temporary file name.
+
+    Returns:
+        A string file path for a temporary file that does not yet exist.
+    """
     import tempfile
     tmp_root = "tmp/"
     if os.path.exists(tmp_root):
diff --git a/src/seqlib/QCtools.py b/src/seqlib/QCtools.py
index 7655d3a..968858f 100644
--- a/src/seqlib/QCtools.py
+++ b/src/seqlib/QCtools.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python
 '''
+Quality control tools for sequencing data.
+ +Provides a FASTQ file parser and a position-weight matrix (PWM) builder for +inspecting base-composition biases across read positions. + Created on May 6, 2010 @author: lgoff @@ -9,6 +14,27 @@ def makePWM(fastqFile,readLen,freq=True): + """Build a position-weight matrix of base composition from a FASTQ file. + + Iterates over all records in a FASTQ file and tallies the occurrence of + each nucleotide (A, C, G, T) at every position across ``readLen`` + positions. Ambiguous bases (e.g. 'N') are silently ignored. + Optionally converts raw counts to per-position frequencies. + + Args: + fastqFile: Path to the FASTQ file to process. + readLen: Expected read length (number of positions to track). + freq: If True (default), each base count vector is divided by the + total count at that position to produce a frequency. If False, + raw counts are returned. + + Returns: + A dict with keys 'A', 'C', 'G', 'T', and 'Total'. Each key maps to + a numpy array of length ``readLen``. The 'Total' array contains the + total number of valid base observations at each position; the + individual base arrays contain either counts or frequencies depending + on the ``freq`` argument. + """ bases = ['A','C','G','T'] pwm = { 'A':np.zeros(readLen), @@ -37,6 +63,27 @@ def makePWM(fastqFile,readLen,freq=True): #Parsers ################ def FastqIterator(fastqFile): + """Iterate over records in a FASTQ file. + + Skips any non-FASTQ header text at the start of the file (lines that do + not begin with '@') and then yields one dict per record. The file is + expected to use standard four-line FASTQ format: a '@'-prefixed name + line, a sequence line, a '+' line, and a quality line. + + Args: + fastqFile: Path to the FASTQ file to parse. + + Yields: + A dict with keys: + ``'name'``: Read name string (the '@' prefix is stripped). + ``'sequence'``: Nucleotide sequence string. + ``'quals'``: ASCII quality string. + + Raises: + ValueError: If a record's name line does not start with '@'. 
+ ValueError: If the separator line between sequence and qualities + does not start with '+'. + """ handle = open(fastqFile,'r') #Skip any header text while True: diff --git a/src/seqlib/RIPDiff.py b/src/seqlib/RIPDiff.py index 210f3ee..730be66 100644 --- a/src/seqlib/RIPDiff.py +++ b/src/seqlib/RIPDiff.py @@ -1,11 +1,15 @@ -''' -Created on May 13, 2010 +"""Framework for RIP-Seq differential enrichment analysis. -Normalizes and compares RIP vs Control (IgG or total RNA) to identify segments of transcripts that are -preferrentially enriched in RIP +Provides skeletal classes and functions for comparing RNA Immunoprecipitation +(RIP) sequencing data against an isotype control (IgG) or total RNA input to +identify transcript segments preferentially enriched in the RIP sample. -@author: lgoff -''' +RIP-Seq (RNA Immunoprecipitation followed by Sequencing) is used to identify +RNA molecules bound by a specific RNA-binding protein. + +Note: This module is largely unimplemented (placeholder pass statements) and +is retained as a design scaffold for future development. +""" ################## #Imports ################## @@ -16,28 +20,53 @@ ################## class RIPUnit(intervallib.Interval): - """ - Can be individual transcript or some basic unit being interrogated for differential peaks (ie. chromosome) - Extends intervallib.Interval class + """A genomic interval unit used as the basic unit of RIP-Seq differential analysis. + + Can represent an individual transcript or any other genomic region (e.g. a + whole chromosome) that is to be tested for differential read enrichment + between a RIP sample and its control. Extends intervallib.Interval. + + Note: All methods are currently unimplemented placeholders. """ def __init__(self,interval): - """Initiate from existing instance of Interval class only""" + """Initialises a RIPUnit from an existing Interval instance. + + Args: + interval: An intervallib.Interval object to copy coordinates from. 
+ + Raises: + AssertionError: If interval is not an instance of + intervallib.Interval. + """ assert isinstance(interval,intervallib.Interval) intervallib.Interval.__init__(interval) def scan(self): + """Scans the interval for differential RIP peaks (not implemented).""" pass def makebins(self,binSize): + """Divides the interval into bins of the given size (not implemented). + + Args: + binSize: Size of each bin in base pairs. + """ pass def binBinom(self): + """Applies a binomial test to each bin (not implemented).""" pass def binPois(self): + """Applies a Poisson test to each bin (not implemented).""" pass def fetchReads(self,bamHandle): + """Fetches aligned reads overlapping this interval from a BAM file (not implemented). + + Args: + bamHandle: A pysam AlignmentFile handle. + """ pass @@ -45,7 +74,21 @@ def fetchReads(self,bamHandle): #Functions ################# def globalNorm(ripUnit,totReads): + """Applies global normalisation to a RIPUnit based on total library size (not implemented). + + Args: + ripUnit: A RIPUnit object representing the region to normalise. + totReads: Total number of mapped reads in the library, used as the + normalisation denominator. + """ pass def localNorm(ripUnitA,ripUnitB): + """Applies local normalisation between two RIPUnit objects (not implemented). + + Args: + ripUnitA: A RIPUnit from the experimental (RIP) sample. + ripUnitB: A RIPUnit from the control (IgG or input) sample for the + same genomic region. + """ pass diff --git a/src/seqlib/algorithms.py b/src/seqlib/algorithms.py index 2184c51..bb3ac96 100644 --- a/src/seqlib/algorithms.py +++ b/src/seqlib/algorithms.py @@ -1,30 +1,61 @@ # python libs +"""Algorithmic data structures and search utilities for sequence analysis. +Provides Union-Find disjoint set, QuadTree spatial indexing, and binary search +implementations used throughout the seqlib package. 
+""" #============================================================================= class UnionFind: - """An implementation of the UNINON/FIND algorithm""" + """An implementation of the UNION/FIND algorithm for disjoint sets. + + Supports efficient union and membership queries using path compression. + Each UnionFind instance represents a single set; sets can be merged via + union() and queried for shared membership via same(). + """ def __init__(self, items): + """Initialize a new UnionFind set containing the given items. + + Args: + items: An iterable of hashable items to populate the initial set. + """ self.parent = None self.items = dict.fromkeys(items, 1) def __contains__(self): + """Return True if item is a member of the root set.""" return item in self.root().items def __len__(self): + """Return the number of items in the root set.""" return len(self.root().items) def __iter__(self): + """Iterate over the items in the root set.""" return iter(self.root().items) def add(self, item): + """Add an item to the root set. + + Args: + item: A hashable item to add to the set. + """ self.root().items[item] = 1 def root(self): + """Return the root UnionFind node for this set, applying path compression. + + Traverses parent pointers to find the canonical representative of the + set. As a side effect, compresses the path by pointing this node + directly at the root. + + Returns: + The root UnionFind node representing this disjoint set. + """ node = self while node.parent: node = node.parent @@ -33,9 +64,26 @@ def root(self): return node def same(self, other): + """Return True if this set and other share the same root (are in the same set). + + Args: + other: Another UnionFind instance to compare against. + + Returns: + True if both instances belong to the same disjoint set, False otherwise. + """ return self.root() == other.root() def union(self, other): + """Merge this set with other so that all members belong to a single set. 
+ + If both sets already share the same root, this is a no-op. Otherwise, + all items from other's root are merged into this set's root, and + other's root is reparented. + + Args: + other: Another UnionFind instance to merge with this set. + """ root1 = self.root() root2 = other.root() if root1 == root2: @@ -46,6 +94,11 @@ def union(self, other): root2.parent = root1 def members(self): + """Return a view of all items belonging to this set. + + Returns: + A dict_keys view of all items in the root set. + """ return self.root().items.keys() @@ -64,9 +117,22 @@ def size(self): # QuadTree data structure class Rect: - """A representation of a rectangle""" + """A representation of an axis-aligned rectangle. + + Stores the bounding box as (x1, y1) lower-left and (x2, y2) upper-right + corners, normalizing the coordinates so that x1 <= x2 and y1 <= y2 + regardless of the order the arguments are supplied. + """ def __init__(self, x1, y1, x2, y2): + """Initialize a Rect, normalizing so that (x1, y1) is the lower-left corner. + + Args: + x1: X coordinate of one horizontal boundary. + y1: Y coordinate of one vertical boundary. + x2: X coordinate of the other horizontal boundary. + y2: Y coordinate of the other vertical boundary. + """ if x1 < x2: self.x1 = x1 self.x2 = x2 @@ -81,19 +147,51 @@ def __init__(self, x1, y1, x2, y2): self.y2 = y1 class QuadNode: + """A single entry stored in a QuadTree leaf node. + + Associates an arbitrary item with the bounding Rect used for spatial + indexing inside the QuadTree. + """ + item = None rect = None def __init__(self, item, rect): + """Initialize a QuadNode with an item and its bounding rectangle. + + Args: + item: The object to store (any type). + rect: A Rect instance representing the spatial extent of item. + """ self.item = item self.rect = rect class QuadTree: + """A spatial index that partitions 2-D space into four quadrants recursively. + + Items are stored alongside their bounding Rect. 
When a leaf node exceeds + MAX items and has not yet reached MAX_DEPTH, it is split into four child + QuadTree nodes and its items are redistributed. Items whose bounding + rectangles span multiple quadrants are stored in every overlapping child. + + Class attributes: + MAX: Maximum number of items in a leaf before splitting (default 10). + MAX_DEPTH: Maximum recursion depth allowed for splits (default 10). + """ + MAX = 10 MAX_DEPTH = 10 def __init__(self, x, y, size, depth = 0): + """Initialize a QuadTree node centered at (x, y) with a given half-size. + + Args: + x: X coordinate of this node's center. + y: Y coordinate of this node's center. + size: Half-width (and half-height) of the region covered by this node. + depth: Current depth of this node in the tree (0 for the root). + """ self.nodes = [] self.children = [] self.center = [x, y] @@ -101,6 +199,17 @@ def __init__(self, x, y, size, depth = 0): self.depth = depth def insert(self, item, rect): + """Insert an item with the given bounding rectangle into the tree. + + If this node is a leaf, the item is appended to the local node list. + If the leaf then exceeds MAX items and depth allows, the node is split. + If this node already has children, the item is forwarded to the + appropriate child(ren). + + Args: + item: The object to store. + rect: A Rect instance representing the spatial extent of item. + """ if len(self.children) == 0: self.nodes.append(QuadNode(item, rect)) @@ -110,6 +219,15 @@ def insert(self, item, rect): self.insertIntoChildren(item, rect) def insertIntoChildren(self, item, rect): + """Forward an item into every child quadrant that its bounding rect overlaps. + + The four children are ordered: [bottom-left, top-left, bottom-right, + top-right] relative to the center of this node. + + Args: + item: The object to store. + rect: A Rect instance representing the spatial extent of item. 
+ """ if rect.x1 < self.center[0]: if rect.y1 < self.center[1]: self.children[0].insert(item, rect) @@ -122,6 +240,12 @@ def insertIntoChildren(self, item, rect): self.children[3].insert(item, rect) def split(self): + """Split this leaf node into four child QuadTree nodes. + + Creates four children covering the four quadrants of this node's + region, then redistributes all currently held items into the children. + After splitting, the local node list is cleared. + """ self.children = [QuadTree(self.center[0] - self.size/2, self.center[1] - self.size/2, self.size/2, self.depth + 1), @@ -140,6 +264,24 @@ def split(self): self.nodes = [] def query(self, rect, results = {}, ret = True): + """Return all items whose bounding rectangles overlap the query rect. + + Recursively traverses child nodes that overlap rect. At leaf nodes, + items whose stored Rect intersects rect are added to the result set. + The results dict is used for deduplication (items are keys). + + Args: + rect: A Rect instance defining the query region. + results: A dict used internally to accumulate results across + recursive calls. Callers should not pass this argument. + ret: If True (the default for the top-level call), the method + returns the keys of the results dict. Recursive calls pass + False to suppress the return. + + Returns: + A dict_keys view of all items that overlap rect, or None when + called recursively (ret=False). + """ if ret: results = {} @@ -164,6 +306,12 @@ def query(self, rect, results = {}, ret = True): return results.keys() def getSize(self): + """Return the total number of items stored in this node and all descendants. + + Returns: + An integer count of all QuadNode items held in the subtree rooted + at this node. 
+ """ size = 0 for child in self.children: size += child.getSize() @@ -174,15 +322,27 @@ def getSize(self): # TODO: make a funtion based linear search def binsearch(lst, val, compare=None, order=1): - """Performs binary search for val in lst using compare - - if val in lst: - Returns (i, i) where lst[i] == val - if val not in lst - Returns index i,j where - lst[i] < val < lst[j] - - runs in O(log n) + """Perform binary search for val in lst, returning a bracket of indices. + + Runs in O(log n). If val is found exactly, both elements of the returned + tuple are the same index. If val is not found, the tuple brackets the + position where val would be inserted. + + Args: + lst: A sorted sequence to search. + val: The value to search for. + compare: An optional two-argument callable compare(a, b) that returns + -1, 0, or 1 (like the old cmp function). Defaults to numeric + comparison via subtraction of boolean comparisons. + order: 1 for ascending sort order, -1 for descending. Defaults to 1. + + Returns: + A tuple (i, j) where: + - (i, i) if lst[i] == val (exact match). + - (i, None) if val is beyond the high end of lst. + - (None, j) if val is before the low end of lst. + - (i, j) with i < j if val falls between lst[i] and lst[j]. + - (None, None) if lst is empty. """ if compare is None: def compare(a, b): return (a > b) - (a < b) diff --git a/src/seqlib/blockIt.py b/src/seqlib/blockIt.py index 0c5f032..698442a 100644 --- a/src/seqlib/blockIt.py +++ b/src/seqlib/blockIt.py @@ -1,4 +1,10 @@ ''' +Block-iT miRNA expression vector insert design utilities. + +Given a 21-mer siRNA candidate sequence, generates the forward and reverse +oligonucleotide sequences required for cloning into the pcDNA6.2-GW/Em-GFP/miR +expression vector (Invitrogen Block-iT Kit). 
+
 Created on Oct 14, 2009
 Takes as input a 21mer sequence (candidate siRNA) and creates the appropriate fwd and rev oligo sequences to order for insertion into the pcDNA6.2-GW/Em-GFP/miR expression vector from
@@ -15,12 +21,36 @@
 revAdapter = 'CCTG'
 
 def makeBlockItInsert(seq):
+    """Design forward and reverse oligos for a Block-iT miRNA insert.
+
+    Constructs the forward strand by concatenating the fixed forward adapter,
+    the reverse complement of seq, the loop sequence, and a modified copy of
+    seq (positions 0-7 joined directly to positions 10 onward, skipping 8-9).
+    The reverse strand is the reverse complement of the forward strand
+    (excluding the first four adapter bases), prefixed by the reverse adapter.
+
+    Args:
+        seq: A 21-nucleotide DNA string representing the candidate siRNA
+            sense sequence (5' to 3').
+
+    Returns:
+        A tuple (fwdStrand, revStrand) where both elements are DNA strings
+        suitable for ordering as oligonucleotides.
+    """
     fwdStrand = fwdAdapter+sequence.reverse_complement(seq)+loopSequence+seq[:8]+seq[10:]
     revStrand = revAdapter+sequence.reverse_complement(fwdStrand[4:])
-    return (fwdStrand,revStrand)
+    return (fwdStrand, revStrand)
 
 def printBlockIt(seqs):
-    """Takes as input the tuple returned from makeBlockItInsert and prints the result to stdout"""
+    """Print the forward and reverse oligo sequences from a Block-iT insert tuple.
+
+    Prints each strand labeled 'FWD' or 'REV' to stdout. Also computes a
+    base-pairing alignment string between the forward and reverse strands,
+    but the resulting alignment is not printed.
+
+    Args:
+        seqs: A tuple (fwdStrand, revStrand) as returned by makeBlockItInsert.
+    """
     print("FWD:\t%s" % seqs[0])
     print("REV:\t%s" % seqs[1])
diff --git a/src/seqlib/bowtie.py b/src/seqlib/bowtie.py
index 074a40a..10a5f65 100644
--- a/src/seqlib/bowtie.py
+++ b/src/seqlib/bowtie.py
@@ -1,20 +1,25 @@
 '''
-Created on Dec 15, 2009
+Python tools for running Bowtie in colorspace mode on the Broad Institute cluster.
-Python tools for bowtie in colorspace (on Broad cluster) +Provides helpers for preparing SOLiD colorspace reads for Bowtie alignment +and for submitting alignment jobs to an LSF cluster. The pipeline is: -@author: lgoff +1. Make colorspace FASTQ files from ``.csfasta`` and ``.qual`` files + (see ``solid.py`` or ``makeFastq.py``). +2. Align reads with ``bowtie`` using the ``-C`` (colorspace) and ``-S`` + (SAM output) flags. +3. Process the resulting SAM/BAM files with the tools in ``mySam.py`` or + ``bwa.py``. + +The module-level constant ``hg18_bowtieIndex`` points to the colorspace +Bowtie index used by the original author; update it for other references. -Pipeline: -1) make cs .fastq file from .csfasta and .qual (see solid.py or use makeFastq.py in scripts) -2) Align reads in .fastq file using bowtie and specify SAM output with -S flag -3) Enjoy your alignments! +Created on Dec 15, 2009 +@author: lgoff Example commandline: bowtie -C -t -S -n 2 -k 1 -p 4 --best /seq/compbio-hp/lgoff/genomes/hg18/bowtie/hg18_c head0073_20090130_1Uppsala1_Upp1_F3_no_header.csfasta >head0073_20090130_1Uppsala1_Upp1_F3_bowtie.sam 2>bowtie.err -""" - ''' ############ #Imports @@ -35,6 +40,30 @@ ######## def prepBowtie(csfile,qualfile,shortname,basedir,split=100000,readsdir="fastq/",resultsdir="results/"): + """Prepare SOLiD colorspace reads for a Bowtie alignment run. + + Validates input file extensions, generates split FASTQ files from the + colorspace FASTA and quality files using ``solid.makeFastq``, and + creates the results output directory if it does not already exist. + + Args: + csfile: Path to the SOLiD colorspace FASTA file (must end with + ``.csfasta``). + qualfile: Path to the quality score file (must end with ``.qual``). + shortname: Base name used when naming the output FASTQ files. + basedir: Base directory for the project (currently unused in the + function body but reserved for future use). + split: Maximum number of reads per split FASTQ file. Defaults to + 100000. 
+ readsdir: Subdirectory path (relative to cwd) into which the FASTQ + files are written. Defaults to ``'fastq/'``. + resultsdir: Subdirectory path (relative to cwd) that will receive + Bowtie output. Created if absent. Defaults to ``'results/'``. + + Raises: + ValueError: If ``csfile`` does not end with ``.csfasta``. + ValueError: If ``qualfile`` does not end with ``.qual``. + """ if not csfile.endswith('.csfasta'): raise ValueError("prepBowtie requires a .csfasta file") if not qualfile.endswith('.qual'): @@ -49,6 +78,19 @@ def prepBowtie(csfile,qualfile,shortname,basedir,split=100000,readsdir="fastq/", return def runBowtie(queue="broad",cwd=os.getcwd(),outDir = "../results/"): + """Submit colorspace Bowtie alignment jobs to an LSF cluster. + + Scans ``cwd`` for files ending in ``.fastq`` and submits one LSF + ``bsub`` job per file. Each job runs Bowtie in colorspace mode + (``-C``), reporting a single best-alignment SAM file per input. + + Args: + queue: LSF queue name to submit jobs to. Defaults to ``'broad'``. + cwd: Directory to scan for ``.fastq`` files. Defaults to the + current working directory at import time. + outDir: Directory (relative or absolute) into which the SAM and + error files are written. Defaults to ``'../results/'``. + """ files = os.listdir(cwd) for file in files: if file.endswith(".fastq"): diff --git a/src/seqlib/bwa.py b/src/seqlib/bwa.py index 359b589..f999def 100644 --- a/src/seqlib/bwa.py +++ b/src/seqlib/bwa.py @@ -1,6 +1,16 @@ ''' +Python wrappers for the BWA short-read alignment algorithm. + +Provides helper functions for submitting BWA alignment jobs to an LSF +cluster (``bsub``), converting SAM output to sorted BAM files, and parsing +SAM records. Also includes utilities for converting pileup output to UCSC +wiggle format. + +The module-level ``prefix`` and ``ref_index`` constants point to the hg18 +reference genome used by the original author; update these for other +references. 
+
 Created on Jul 30, 2009
-Python wrappers for BWA algorithm
 
 @author: lgoff
 
@@ -20,19 +30,62 @@
 #=================
 
 class SAMAlignment(Alignment):
+    """SAM alignment record with CIGAR and quality-string fields.
+
+    Extends the Alignment base class with the two SAM-specific fields that
+    are not part of the generic Alignment interface.
+
+    Attributes:
+        qual: ASCII-encoded base-quality string (SAM field 11).
+        cigar: CIGAR string describing the alignment operations (SAM field 6).
+    """
+
     def __init__(self,readname,chr,start,end,strand,score,readcount,readsequence,cigar,qualstring):
+        """Initialise a SAMAlignment.
+
+        Args:
+            readname: Query template name (SAM field 1).
+            chr: Reference sequence name / chromosome (SAM field 3).
+            start: 1-based leftmost mapping position (SAM field 4).
+            end: Computed end position (start + read length - 1).
+            strand: Strand of the alignment, '+' or '-'.
+            score: Mapping quality score (SAM field 5). NOTE(review): not passed through; Alignment receives score=readcount -- confirm intent.
+            readcount: Number of reads represented (typically 1).
+            readsequence: Read sequence bases (SAM field 10).
+            cigar: CIGAR string (SAM field 6).
+            qualstring: ASCII-encoded base-quality string (SAM field 11).
+        """
         Alignment.__init__(self,readname,chr,start,end,strand,score=readcount,readcount = readcount,readsequence=readsequence)
         self.qual = qualstring
         self.cigar = cigar
 
 def SAMReader(fname):
-    """Iterator for SAMAlignment records"""
+    """Iterate over SAM alignment records from a file.
+
+    Args:
+        fname: Path to the SAM file.
+
+    Yields:
+        An Interval object for each alignment record in the file.
+    """
     handle = open(fname,'r')
     for line in handle:
         aln = parseSAMString(line)
         yield aln.toInterval()
 
 def parseSAMString(samstring):
+    """Parse a single SAM-format line into a SAMAlignment object.
+
+    The end position is derived from the start position plus the length of
+ + Args: + samstring: A single tab-delimited SAM record line (trailing whitespace + is stripped internally). + + Returns: + A SAMAlignment instance populated from the SAM fields. + """ tokens = samstring.rstrip().split("\t") readname = tokens[0] chr = tokens[2] @@ -47,9 +100,31 @@ def parseSAMString(samstring): return SAMAlignment(readname,chr,start,end,strand,score,readcount,readsequence,cigar,qualstring) def joinSAMIntervals(iter,start='start',end='end',offset=0): - """ - Returns a list of independent non-overlapping intervals for each strand overlapping by offset if set - ***SAM file must first be sorted using 'samtools sort'*** + """Merge overlapping SAM intervals into non-overlapping intervals, per strand. + + Groups intervals by strand ('+' or '-'), then iterates through each + group in order and merges any pair of intervals that intersect (with + optional extension by ``offset``). Each merged interval stores its + constituent child intervals and reports their count as ``readcount``. + + The SAM file must be sorted with ``samtools sort`` before use. + + Args: + iter: An iterable of Interval (or Alignment) objects already + loaded from a sorted SAM file. Each must have a ``strand`` + attribute of '+' or '-'. + start: Name of the start-coordinate attribute used when testing + intersection. Defaults to 'start'. + end: Name of the end-coordinate attribute used when testing + intersection. Defaults to 'end'. + offset: Number of bases by which interval extents are extended + before testing for overlap. Defaults to 0. + + Returns: + A dict with keys '+' and '-', each mapping to a list of merged + Interval objects for that strand. Each merged interval has a + ``readcount`` equal to the number of constituent child reads and + a ``children`` list of those child intervals. 
""" overlapping_plus = [] @@ -91,6 +166,18 @@ def joinSAMIntervals(iter,start='start',end='end',offset=0): return res def bwaAlignSubmit(files,mismatches=2,queue='hugemem'): + """Submit BWA alignment jobs (``bwa aln``) to an LSF cluster. + + For each input FASTQ file, constructs and submits an LSF ``bsub`` job + that runs ``bwa aln`` against the module-level ``prefix`` reference and + writes a ``.sai`` alignment index file. + + Args: + files: A list of FASTQ file paths to align. + mismatches: Maximum number of mismatches allowed in the seed region + (passed to ``bwa aln -n``). Defaults to 2. + queue: LSF queue name to submit jobs to. Defaults to 'hugemem'. + """ for fname in files: shortname = fname.rstrip(".fastq") command = "bsub -q %s -N -o /dev/null -P BWA_Align 'bwa aln -c -n %d %s %s >%s.sai 2>%s.e'" % (queue,mismatches,prefix,fname,shortname,shortname) @@ -98,6 +185,19 @@ def bwaAlignSubmit(files,mismatches=2,queue='hugemem'): return def bwaSamseSubmit(files,mismatches=2,queue='broad'): + """Submit BWA SAM conversion jobs (``bwa samse``) to an LSF cluster. + + For each ``.sai`` file, constructs and submits an LSF ``bsub`` job that + runs ``bwa samse`` to convert the alignment index back to SAM format, + writing a ``.sam`` file. Assumes a matching ``.fastq`` file exists with + the same base name. + + Args: + files: A list of ``.sai`` file paths to convert. + mismatches: Unused parameter kept for interface compatibility. + Defaults to 2. + queue: LSF queue name to submit jobs to. Defaults to 'broad'. + """ for fname in files: shortname = fname.rstrip(".sai") command = "bsub -q %s -N -o /dev/null -P BWA_Samse 'bwa samse %s %s.sai %s.fastq >%s.sam 2>%s.e'" % (queue,prefix,shortname,shortname,shortname,shortname) @@ -105,6 +205,16 @@ def bwaSamseSubmit(files,mismatches=2,queue='broad'): return def makeBam(files,queue='broad'): + """Submit SAM-to-BAM conversion jobs (``samtools view``) to an LSF cluster. 
+ + For each SAM file, constructs and submits an LSF ``bsub`` job that uses + ``samtools view`` to convert it to a BAM file indexed against the + module-level ``ref_index`` FASTA index. + + Args: + files: A list of SAM file paths to convert. + queue: LSF queue name to submit jobs to. Defaults to 'broad'. + """ for fname in files: shortname = fname.rstrip("*.sam") command = "bsub -q %s -N -o /dev/null -P SAM2BAM 'samtools view -h -bt %s -o %s.bam %s 2>%s.bam.e'" % (queue,ref_index,shortname,fname,shortname) @@ -112,6 +222,17 @@ def makeBam(files,queue='broad'): return def samSort(files,queue='broad'): + """Sort BAM files by coordinate using ``samtools sort``. + + Iterates over a list of BAM files, printing a status message for each, + and runs ``samtools sort`` locally (not via LSF) to produce a + ``*_sorted.bam`` output file. + + Args: + files: A list of BAM file paths to sort. + queue: Unused parameter kept for interface consistency with other + submit functions. Defaults to 'broad'. + """ for fname in files: shortname = fname.rstrip("*.bam")+"_sorted" command = "samtools sort %s %s" % (fname,shortname) @@ -122,6 +243,20 @@ def samSort(files,queue='broad'): def pileup2wig(fname,shortname,outDir=os.getcwd()+"/"): + """Convert a samtools pileup file to strand-specific wiggle files. + + Reads a samtools pileup output file and writes two variableStep wiggle + files: one for the plus strand (forward reads, '.' characters) and one + for the minus strand (reverse reads, ',' characters). + + Args: + fname: Path to the samtools pileup file to read. + shortname: Base name used for the wiggle track labels and the output + file names (``_plus.wig`` and + ``_minus.wig``). + outDir: Directory in which the output wiggle files are written. + Defaults to the current working directory. 
+ """ handle = open(fname,'r') preRef = '' prePos = -1 @@ -132,6 +267,17 @@ def pileup2wig(fname,shortname,outDir=os.getcwd()+"/"): minusHand = open(outDir+shortname+"_minus.wig",'w') def wigHeader(shortname,strand): + """Build a UCSC wiggle track-definition header line. + + Args: + shortname: Base name used in the track name and description. + strand: Strand of the track; '+' produces a blue track, + '-' produces a red track. + + Returns: + A wiggle track header string suitable for the first line of a + wiggle file. + """ if strand=="+": color = '0,0,255' sName = 'plus' @@ -163,14 +309,34 @@ def wigHeader(shortname,strand): def getBitValue(n, p): - ''' - get the bitvalue of denary (base 10) number n at the equivalent binary - position p (binary count starts at position 0 from the right) - ''' + """Return the bit at position p of integer n. + + Extracts a single bit at binary position p (zero-indexed from the + least-significant bit) of the integer n. + + Args: + n: A non-negative integer to inspect. + p: Zero-based bit position (0 = least-significant / rightmost bit). + + Returns: + 1 if the bit at position p is set, 0 otherwise. + """ return (n >> p) & 1 def strandFlag(flag): - """Returns strand of sequence from SAM record bitflag (field 4)""" + """Determine the alignment strand from a SAM bitflag value. + + Inspects bit 4 (0x10) of the SAM FLAG field to determine whether a read + mapped to the reverse strand. + + Args: + flag: The integer SAM FLAG value (field 2), or a string + representation of it. + + Returns: + '+' if bit 4 is 0 (forward strand), '-' if bit 4 is 1 (reverse + strand), or '*' for any other value. + """ flag = int(flag) if getBitValue(flag,4)==0: return "+" diff --git a/src/seqlib/clustering.py b/src/seqlib/clustering.py index fa8fd93..25c4bbc 100644 --- a/src/seqlib/clustering.py +++ b/src/seqlib/clustering.py @@ -1,4 +1,10 @@ ''' +K-means clustering implementation for n-dimensional point data. 
+ +Provides Point and Cluster data structures along with a K-means clustering +algorithm and Euclidean distance metric for grouping arbitrary numeric +coordinate data. + Created on Nov 26, 2010 @author: lgoff @@ -10,6 +16,15 @@ #Classes class Point: + """A point in n-dimensional space. + + Attributes: + coords: A list of numeric coordinates, one per dimension. + n: The number of dimensions (length of coords). + reference: An optional object associated with this point (e.g. an + original data record). + """ + # -- The Point class represents points in n-dimensional space # Instance variables # self.coords is a list of coordinates for this Point @@ -17,20 +32,53 @@ class Point: # self.reference is an object bound to this Point # Initialize new Points def __init__(self, coords, reference=None): + """Initialize a Point with a coordinate list and optional reference. + + Args: + coords: A list of numeric values representing the coordinates of + this point in n-dimensional space. + reference: An optional object to associate with this point. + Defaults to None. + """ self.coords = coords self.n = len(coords) self.reference = reference + # Return a string representation of this Point def __repr__(self): + """Return a string representation of the coordinate list.""" return str(self.coords) class Cluster: + """A cluster of Points in n-dimensional space used by the K-means algorithm. + + All Points in a Cluster must share the same number of dimensions. The + cluster maintains a centroid (the coordinate-wise mean of its points) + which is recalculated whenever the cluster's membership changes. + + Attributes: + points: A list of Point objects belonging to this cluster. + n: The number of dimensions of the Points in this cluster. + centroid: A Point representing the sample mean of all cluster points. 
+ """ + # -- The Cluster class represents clusters of points in n-dimensional space # Instance variables # self.points is a list of Points associated with this Cluster # self.n is the number of dimensions this Cluster's Points live in # self.centroid is the sample mean Point of this Cluster def __init__(self, points): + """Initialize a Cluster from a non-empty list of same-dimensional Points. + + Args: + points: A non-empty list of Point objects, all with the same + number of dimensions. + + Raises: + Exception: If points is empty ('ILLEGAL: EMPTY CLUSTER'). + Exception: If points contain mixed dimensionality + ('ILLEGAL: MULTISPACE CLUSTER'). + """ # We forbid empty Clusters (they don't make mathematical sense!) if len(points) == 0: raise Exception("ILLEGAL: EMPTY CLUSTER") self.points = points @@ -41,19 +89,44 @@ def __init__(self, points): if p.n != self.n: raise Exception("ILLEGAL: MULTISPACE CLUSTER") # Figure out what the centroid of this Cluster should be self.centroid = self.calculateCentroid() + # Return a string representation of this Cluster def __repr__(self): + """Return a string representation of the list of Points in this cluster.""" return str(self.points) + # Update function for the K-means algorithm # Assigns a new list of Points to this Cluster, returns centroid difference def update(self, points): + """Replace this cluster's points and return how far the centroid moved. + + Used during each iteration of the K-means algorithm to reassign points + and measure convergence. + + Args: + points: A new list of Point objects to assign to this cluster. + + Returns: + The Euclidean distance between the old centroid and the new + centroid after recalculation. 
+ """ old_centroid = self.centroid self.points = points self.centroid = self.calculateCentroid() return getDistance(old_centroid, self.centroid) + # Calculates the centroid Point - the centroid is the sample mean Point # (in plain English, the average of all the Points in the Cluster) def calculateCentroid(self): + """Compute and return the centroid of the current cluster points. + + The centroid is the coordinate-wise arithmetic mean of all Points + in the cluster. + + Returns: + A new Point whose coordinates are the mean of each dimension + across all points in the cluster. + """ centroid_coords = [] # For each coordinate: for i in range(self.n): @@ -67,6 +140,23 @@ def calculateCentroid(self): # -- Return Clusters of Points formed by K-means clustering def kmeans(points, k, cutoff): + """Cluster points into k groups using the K-means algorithm. + + Randomly selects k seed points and iteratively reassigns every point to + the nearest cluster centroid, updating centroids after each round. Stops + when the largest centroid shift in a single iteration falls below cutoff. + + Args: + points: A list of Point objects to cluster. All points must have the + same dimensionality. + k: The number of clusters to form. + cutoff: A float convergence threshold. Iteration stops when the + maximum centroid displacement across all clusters is less than + this value. + + Returns: + A list of k Cluster objects containing the final cluster assignments. + """ # Randomly sample k Points from the points list, build Clusters around them initial = random.sample(points, k) clusters = [] @@ -104,6 +194,20 @@ def kmeans(points, k, cutoff): ###### # -- Get the Euclidean distance between two Points def getDistance(a, b): + """Return the Euclidean distance between two Points. + + Args: + a: A Point object. + b: A Point object in the same dimensional space as a. + + Returns: + A float representing the Euclidean (straight-line) distance between + the two points. 
+ + Raises: + Exception: If a and b have different numbers of dimensions + ('ILLEGAL: NON-COMPARABLE POINTS'). + """ # Forbid measurements between Points in different spaces if a.n != b.n: raise Exception("ILLEGAL: NON-COMPARABLE POINTS") # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i) @@ -117,6 +221,17 @@ def getDistance(a, b): ########### # -- Create a random Point in n-dimensional space def makeRandomPoint(n, lower, upper): + """Create a Point with n random coordinates drawn uniformly from [lower, upper]. + + Args: + n: The number of dimensions for the new point. + lower: The lower bound of the uniform distribution. + upper: The upper bound of the uniform distribution. + + Returns: + A Point object with n coordinates each sampled from + random.uniform(lower, upper). + """ coords = [] for i in range(n): coords.append(random.uniform(lower, upper)) return Point(coords) @@ -125,6 +240,15 @@ def makeRandomPoint(n, lower, upper): #Main ############## def main(args): + """Run a demo K-means clustering on randomly generated 2-D points. + + Creates 10 random points in 2-D space within [-200, 200] and clusters + them into 3 groups with a convergence cutoff of 0.5, then prints the + points and resulting clusters to stdout. + + Args: + args: Command-line argument list (not currently used). + """ num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200 # Create num_points random Points in n-dimensional space points = [] diff --git a/src/seqlib/continuousData.py b/src/seqlib/continuousData.py index 7895d34..dd64923 100644 --- a/src/seqlib/continuousData.py +++ b/src/seqlib/continuousData.py @@ -1,8 +1,18 @@ -''' -Created on Jun 30, 2009 -First attempt at a data structure for high-resolution genome-wide data -@author: lgoff -''' +"""High-resolution genome-wide continuous data storage structures. 
+ +Provides ContinuousData for per-nucleotide or binned coverage arrays on a +single chromosome, and SimpleChIPData for loading, normalising, and scanning +multi-sample NimbleGen ChIP data. + +Note: SimpleChIPData depends on rpy2 and tables (PyTables) as well as the +Chip module from this package. + +First attempt at a data structure for high-resolution genome-wide data. + +Originally created on Jun 30, 2009. + +Author: lgoff +""" import gzip import sys @@ -14,14 +24,32 @@ class ContinuousData(object): - ''' - Data storage object that is specific to a single chromosome - ''' + """Per-chromosome continuous (coverage) data storage backed by numpy arrays. + + Stores strand-separated floating-point data at a configurable bin + resolution. Supports interval-based data accumulation, range extraction, + and gzipped binary serialisation. + + Attributes: + name: Sample name string. + chr: Chromosome name (must be in genomelib.chr_lengths). + binSize: Resolution in base pairs per bin (default 1). + fname: Default filename for binary output. + data: Dict with "+" and "-" keys mapping to numpy float64 arrays. + """ def __init__(self,name,chr,binSize = 1,data = {}): - ''' - Constructor: Creates instance specifically tailored to a given chromosome - ''' + """Construct a ContinuousData object for a single chromosome. + + If data is non-empty, it is used directly. Otherwise, two zero-filled + numpy arrays of length chr_length // binSize are created. + + Args: + name: Sample name string. + chr: Chromosome name string (must be in genomelib.chr_lengths). + binSize: Bin size in base pairs (default 1). + data: Optional pre-existing dict with "+" and "-" numpy arrays. 
+ """ self.name = name self.chr = chr self.binSize = int(binSize) @@ -35,37 +63,99 @@ def __init__(self,name,chr,binSize = 1,data = {}): } def __len__(self): - """Equivalent to length of the genome""" + """Return the number of bins, equivalent to the chromosome length in bins.""" return np.alen(self.data['+']) def __repr__(self): + """Return the sample name string.""" return self.name def __str__(self): + """Return the sample name string.""" return self.name def getMin(self,strand): + """Return the minimum value in the data array for the given strand. + + Args: + strand: "+" or "-". + + Returns: + Minimum float value in self.data[strand]. + """ return np.amin(self.data[strand]) def getMax(self,strand): + """Return the maximum value in the data array for the given strand. + + Args: + strand: "+" or "-". + + Returns: + Maximum float value in self.data[strand]. + """ return np.amax(self.data[strand]) def whichMax(self,strand): + """Return the bin index of the maximum value for the given strand. + + Args: + strand: "+" or "-". + + Returns: + Integer index of the maximum element in self.data[strand]. + """ return np.argmax(self.data[strand]) def whichMin(self,strand): + """Return the bin index of the minimum value for the given strand. + + Args: + strand: "+" or "-". + + Returns: + Integer index of the minimum element in self.data[strand]. + """ return np.argmin(self.data[strand]) def getDataRange(self,strand,start,end): + """Return the data array slice corresponding to a genomic coordinate range. + + Args: + strand: "+" or "-". + start: Genomic start coordinate. + end: Genomic end coordinate. + + Returns: + Numpy array slice of self.data[strand] for the given range. + """ return self.data[strand][(start//self.binSize)-1:(end//self.binSize)-1] def addInterval(self,interval): + """Accumulate an interval's count into the data arrays. + + Adds interval.count to each bin covered by the interval on its strand. 
+ Does nothing if the interval's chromosome does not match self.chr. + + Args: + interval: An object with chr, strand, start, end, and count + attributes. + + Returns: + The string "Wrong data file" if interval.chr != self.chr, + otherwise None. + """ if self.chr != interval.chr: return "Wrong data file" else: self.data[interval.strand][(interval.start//self.binSize)-1:(interval.end//self.binSize)-1]=self.data[interval.strand][(interval.start//self.binSize)-1:(interval.end//self.binSize)-1]+interval.count def write(self,fname=None): + """Write data arrays to a gzipped binary file. + + Args: + fname: Output file path. Defaults to self.fname if not provided. + """ if fname == None: fname = self.fname fd = gzip.open(fname,'wb') @@ -74,19 +164,65 @@ def write(self,fname=None): fd.close() def read(self,fname): + """Read data from a file (not yet implemented). + + Args: + fname: Path to the file to read from. + """ pass def innerHeight(self,strand,start,end): + """Return the maximum value (peak height) within a genomic range. + + Args: + strand: "+" or "-". + start: Genomic start coordinate. + end: Genomic end coordinate. + + Returns: + Maximum float value in the data range. + """ region = self.getDataRange(strand,start,end) return np.amax(region) def outerHeight(self,strand,start,end): + """Return the total signal (sum) within a genomic range. + + Args: + strand: "+" or "-". + start: Genomic start coordinate. + end: Genomic end coordinate. + + Returns: + Sum of all values in the data range. + """ region = self.getDataRange(strand,start,end) return sum(region) class SimpleChIPData(object): + """Multi-sample NimbleGen ChIP-chip data container with normalisation and scanning. + + Loads NimbleGen GFF probe files, applies quantile normalisation via + limma, joins probes into intervals, and scans intervals with a sliding + window test. + + Attributes: + data: Dict mapping sample name to list of probe Intervals. + samples: List of sample name strings in load order. 
+ dataMatrix: 2D numpy float array of probe scores (set by makeMatrix). + normMatrix: 2D numpy array of quantile-normalised scores (set by + quantileNormalize). + intervals: Dict mapping sample name to list of joined Intervals (set + by joinProbes). + """ def __init__(self,files): + """Load NimbleGen GFF files and initialise the data store. + + Args: + files: List of GFF file paths to load. Each file's sample name is + derived by stripping the ".gff" extension. + """ self.data = {} self.samples = [] for fname in files: @@ -96,12 +232,28 @@ def __init__(self,files): self.data[sampleName] = Chip.parseNimblegen(fname) def doIt(self,permuted,windows=[5,6,7,8,9,10,11,12],threshold=0.05): + """Run the full normalise-join-scan pipeline. + + Calls normalize(), joinProbes(), and then scan() for each window size. + + Args: + permuted: Permuted score data passed to scan() for significance + testing. + windows: List of window sizes to scan (default [5..12]). + threshold: Significance threshold for scanning (default 0.05). + """ self.normalize() self.joinProbes() for winSize in windows: self.scan(permuted,winSize,threshold) def makeMatrix(self): + """Build self.dataMatrix from probe scores across all samples. + + Creates a 2D numpy float array of shape (n_probes, n_samples) where + each column contains the scores for one sample in probe order. + Writes a progress message to stderr on completion. + """ data_keys = list(self.data.keys()) self.dataMatrix = np.empty((len(self.data[data_keys[0]]),len(self.samples)),'f') for i in range(0,len(data_keys)): @@ -109,12 +261,22 @@ def makeMatrix(self): sys.stderr.write("Created dataMatrix!\n") def quantileNormalize(self): + """Apply quantile normalisation to self.dataMatrix using limma. + + Calls makeMatrix() first if dataMatrix is not yet set. Requires the + R limma package. Stores the result in self.normMatrix. 
+ """ if 'dataMatrix' not in self.__dict__: self.makeMatrix() rpy.r.library("limma") sys.stderr.write("Performing Quantile Normalization...\n") self.normMatrix = rpy.r.normalizeQuantiles(self.dataMatrix) def normalize(self): + """Replace probe scores with quantile-normalised values. + + Calls quantileNormalize() first if normMatrix is not yet set. Updates + the score attribute of every probe object in self.data in-place. + """ if 'normMatrix' not in self.__dict__: self.quantileNormalize() sys.stderr.write("Replacing values in data with normalized values...\n") data_keys = list(self.data.keys()) @@ -123,6 +285,11 @@ def normalize(self): self.data[data_keys[i]][j].score = self.normMatrix[j,i] def joinProbes(self): + """Join adjacent probes into contiguous intervals for each sample. + + Populates self.intervals dict via Chip.joinNimblegenIntervals(). + Writes per-sample progress messages to stderr. + """ sys.stderr.write("Joining Probes into intervals...\n") self.intervals = {} for sample in self.samples: @@ -130,6 +297,17 @@ def joinProbes(self): self.intervals[sample] = Chip.joinNimblegenIntervals(self.data[sample]) def scan(self,permuted,windowSize,threshold=0.05): + """Scan all intervals with a sliding window test of the given size. + + Calls i.scan(permuted, windowSize, threshold) on every interval in + every sample. Writes progress messages to stderr. + + Args: + permuted: Permuted score data used by the interval scan method for + significance testing. + windowSize: Integer number of probes per sliding window. + threshold: Significance threshold (default 0.05). + """ sys.stderr.write("Scanning with window of size %d..\n" % windowSize) for sample in self.samples: sys.stderr.write("\t%s\n" % sample) diff --git a/src/seqlib/converters.py b/src/seqlib/converters.py index d9009a4..3a0266c 100644 --- a/src/seqlib/converters.py +++ b/src/seqlib/converters.py @@ -1,12 +1,30 @@ ''' +File format conversion utilities for genomic annotation files. 
+ +Contains functions for converting between common bioinformatics file formats +such as BED and GTF. + Created on Mar 17, 2011 @author: lgoff ''' # from misc import rstrips # rasmus library removed - not Python 3.12 compatible -def bed2GTF(fname,outfile=None): - """This does not work yet""" +def bed2GTF(fname, outfile=None): + """Convert a BED file to GTF format (not yet fully implemented). + + Opens the input BED file, writes comment lines and track/browser header + lines through unchanged, and parses remaining tab-delimited lines. The + actual record conversion logic is not yet implemented. + + Note: This function is incomplete and does not currently produce GTF + output records. + + Args: + fname: Path to the input BED file. + outfile: Path for the output GTF file. Defaults to fname with the + trailing '.bed' stripped and '.gtf' appended. + """ handle = open(fname,'r') if outfile == None: outfile = fname.rstrip('.bed')+'.gtf' diff --git a/src/seqlib/dbConn.py b/src/seqlib/dbConn.py index a084380..50c28d1 100644 --- a/src/seqlib/dbConn.py +++ b/src/seqlib/dbConn.py @@ -1,4 +1,15 @@ #!/usr/bin/env python +"""Database connection helpers and genomic data retrieval utilities. + +Provides connection factories for several MySQL databases (Broad Institute +internal, UCSC Genome Browser public mirror, local UCSC mirror on 'valor', and +Ensembl) and a collection of query functions for fetching RefSeq transcripts, +wgRNA annotations, CpG islands, repeat overlaps, lincRNA records, and miRNA +seed sequences. + +Most connection functions require network access to specific internal or public +servers and appropriate credentials. +""" import sys import time @@ -14,6 +25,13 @@ # ################### def broadConnect(): + """Opens a DictCursor connection to the Broad Institute MySQL database. + + Connects to the lgoff_nextgen schema on mysql.broadinstitute.org. + + Returns: + A MySQLdb DictCursor for the lgoff_nextgen database. 
+ """ host="mysql.broadinstitute.org" user="lgoff" password="" @@ -27,6 +45,14 @@ def broadConnect(): # ################### def gbdbConnect(gbdbname = "hg18"): + """Opens a DictCursor connection to the UCSC Genome Browser public MySQL mirror. + + Args: + gbdbname: UCSC genome database name (default: 'hg18'). + + Returns: + A MySQLdb DictCursor for the specified UCSC genome database. + """ gbHost = "genome-mysql.cse.ucsc.edu" gbUser = "genome" gbdb = MySQLdb.connect(host=gbHost,user=gbUser,db=gbdbname) @@ -38,6 +64,17 @@ def gbdbConnect(gbdbname = "hg18"): # ################### def valorGbdbConnect(gbdbname='hg19'): + """Opens a DictCursor connection to the local UCSC Genome Browser mirror on 'valor'. + + Connects to a locally hosted UCSC mirror database using the root account + without a password. + + Args: + gbdbname: Local UCSC genome database name (default: 'hg19'). + + Returns: + A MySQLdb DictCursor for the specified local genome database. + """ gbHost = 'localhost' gbUser = 'root' gbPass = '' @@ -50,6 +87,14 @@ def valorGbdbConnect(gbdbname='hg19'): # #################### def ensemblConnect(): + """Opens a DictCursor connection to the public Ensembl MySQL server. + + Connects to the homo_sapiens_core_47_36i schema on ensembldb.ensembl.org + using the anonymous account. + + Returns: + A MySQLdb DictCursor for the Ensembl homo_sapiens_core_47_36i database. + """ ensemblHost = "ensembldb.ensembl.org" ensemblUser = "anonymous" ensembldbname = "homo_sapiens_core_47_36i" @@ -78,6 +123,18 @@ def fetchRefSeq(genome = 'hg18',lookupval = 'name'): return output def fetchRefSeqIntervals(genome = 'hg18'): + """Returns a dictionary of RefSeq SplicedInterval objects keyed by transcript name. + + Queries the refGene table of the UCSC Genome Browser database and + constructs an intervallib.SplicedInterval for each transcript. + + Args: + genome: UCSC genome database name (default: 'hg18'). 
+ + Returns: + A dictionary mapping RefSeq transcript names to SplicedInterval + objects. + """ cursor = gbdbConnect(gbdbname=genome) select = "SELECT * from refGene" cursor.execute(select) @@ -146,6 +203,23 @@ def fetchRefSeqIntervalsIndexed(genome='hg18',proteinCodingOnly=False,verbose=Fa return output def getIntervalFromRefSeq(lookupval,genome='hg18',lookupkey= 'name2',verbose=False): + """Returns SplicedInterval objects for RefSeq transcripts matching a lookup value. + + Queries the UCSC refGene table for rows where lookupkey equals lookupval + and constructs an intervallib.SplicedInterval for each matching transcript. + + Args: + lookupval: The value to search for (e.g. a gene symbol or transcript + ID). + genome: UCSC genome database name (default: 'hg18'). + lookupkey: refGene column to search against (default: 'name2', which + corresponds to the gene symbol). + verbose: If True, print the SQL query and row count to stderr + (default: False). + + Returns: + A list of SplicedInterval objects for the matching transcripts. + """ cursor = gbdbConnect(gbdbname=genome) select = """SELECT * FROM refGene WHERE %s = '%s'""" % (lookupkey,lookupval) if verbose: @@ -170,6 +244,22 @@ def getIntervalFromRefSeq(lookupval,genome='hg18',lookupkey= 'name2',verbose=Fal return output def getIntervalFromAll_mRNA(lookupval,genome='hg18',lookupkey='qName',verbose=False): + """Returns SplicedInterval objects from the UCSC all_mrna alignment table. + + Queries the all_mrna table for mRNA BLAT alignments matching lookupval + in the specified column, and constructs a SplicedInterval for each row. + + Args: + lookupval: The value to search for (e.g. a GenBank accession). + genome: UCSC genome database name (default: 'hg18'). + lookupkey: all_mrna column to search (default: 'qName', the query + sequence name). + verbose: If True, print the SQL query and row count to stderr + (default: False). + + Returns: + A list of SplicedInterval objects for the matching alignments. 
+ """ cursor = gbdbConnect(gbdbname=genome) select = """SELECT * FROM all_mrna WHERE %s = '%s'""" % (lookupkey,lookupval) if verbose: @@ -211,6 +301,15 @@ def refseqTSS(): return output def fetchwgRNA(): + """Returns all wgRNA entries from the UCSC Genome Browser indexed by chromosome, strand, and name. + + Queries the wgRna table of the default genome (hg18) and organises + results into a nested dictionary structure. + + Returns: + A dictionary of the form output[chr][strand][name] = row_dict for + each wgRNA entry on a standard chromosome. + """ cursor=gbdbConnect() select="SELECT * FROM wgRna" cursor.execute(select) @@ -246,6 +345,20 @@ def hostRefSeq(chr,start,end,strand): return results def testCpG(chr,start,end): + """Tests whether a genomic interval overlaps a CpG island in the UCSC database. + + Queries the cpgIslandExt table for CpG islands that overlap the given + coordinates. + + Args: + chr: Chromosome name (e.g. 'chr1'). + start: Start coordinate (0-based). + end: End coordinate. + + Returns: + The first matching row as a dictionary, or False if no CpG island + overlaps the interval. + """ cursor=gbdbConnect() selSQL="SELECT * from cpgIslandExt WHERE chrom='%s' AND chromStart<='%d' AND chromEnd>='%d'" % (chr,int(start),int(end)) cursor.execute(selSQL) @@ -273,6 +386,21 @@ def testwgRNA(chr,start,end,strand): return results def hostmRNA(chr,start,end,strand): + """Returns mRNA alignments that span a given genomic interval from the UCSC database. + + Queries a chromosome-specific mRNA table (named _mrna) for + alignments that contain the interval [start, end]. + + Args: + chr: Chromosome name (e.g. 'chr1'). + start: Start coordinate of the query interval. + end: End coordinate of the query interval. + strand: Strand orientation (not currently used in the SQL query). + + Returns: + A list of row dictionaries for overlapping mRNA alignments, or False + if none are found. 
+ """ cursor=gbdbConnect() selSQL="SELECT * from %s_mrna WHERE tName='%s' AND tStart<='%d' AND tEnd>='%d'" % (chr,chr,int(start),int(end)) cursor.execute(selSQL) @@ -286,6 +414,19 @@ def hostmRNA(chr,start,end,strand): return results def fetchLincRNA(fname="/seq/compbio/lgoff/lincRNAs/hg18_lincRNA_Guttman.bed"): + """Reads a lincRNA BED file and returns intervals indexed by chromosome. + + Parses a three-column BED file (chr, start, end) and organises the + resulting intervals into a dictionary keyed by chromosome name. + + Args: + fname: Path to a BED file of lincRNA intervals (default: hg18 + Guttman et al. lincRNA catalogue). + + Returns: + A dictionary mapping chromosome names to lists of interval + dictionaries, each with keys 'chr', 'start' (int), and 'end' (int). + """ handle=open(fname,'r') lincs={} for chr in genomelib.chr_names: @@ -300,6 +441,21 @@ def fetchLincRNA(fname="/seq/compbio/lgoff/lincRNAs/hg18_lincRNA_Guttman.bed"): return lincs def fetchmiRNASeeds(fname="/seq/compbio/lgoff/smallRNAs/genomes/human/microRNA/mature.fa",species = 'hsa'): + """Reads a miRBase FASTA file and returns a dictionary mapping seed sequences to miRNA names. + + Extracts the 7-nt seed sequence (positions 2-8 of the mature miRNA) for + each entry matching the given species prefix. + + Args: + fname: Path to a miRBase mature miRNA FASTA file (default: internal + Broad Institute path). + species: Two- or three-letter miRBase species prefix to filter by + (default: 'hsa' for Homo sapiens). + + Returns: + A dictionary mapping 7-nt seed sequences (str) to the first token + of the miRNA name (str). + """ handle = open(fname,'r') seeds = {} iter = sequencelib.FastaIterator(handle) @@ -313,6 +469,21 @@ def fetchmiRNASeeds(fname="/seq/compbio/lgoff/smallRNAs/genomes/human/microRNA/m ############ def findRepeatOverlap(interval,cursor=None): + """Returns RepeatMasker annotations that overlap a given genomic interval. 
+ + Queries the rmsk table of the local UCSC mirror for repeat elements that + partially or fully overlap the interval. + + Args: + interval: An intervallib interval object with chr, start, end, and + genome attributes. + cursor: An optional pre-existing MySQLdb DictCursor. If None, a new + connection to the local valor UCSC mirror is opened. + + Returns: + A list of row dictionaries for overlapping repeats, or False if none + are found. + """ if cursor == None: cursor = valorGbdbConnect(interval.genome) selSQL = "SELECT * from rmsk WHERE genoName = '%s' AND (genoStart >= '%d' OR genoEnd >= '%d') AND (genoStart <= '%d' OR genoEnd <= '%d')" % (interval.chr,interval.start,interval.start,interval.end,interval.end) @@ -327,6 +498,21 @@ def findRepeatOverlap(interval,cursor=None): return results def findUCSCOverlap(interval,cursor=None): + """Returns UCSC knownGene entries (with RefSeq mapping) that overlap a given interval. + + Queries the knownGene table joined to knownToRefSeq on the local UCSC + mirror for known genes that partially or fully overlap the interval. + + Args: + interval: An intervallib interval object with chr, start, end, and + genome attributes. + cursor: An optional pre-existing MySQLdb DictCursor. If None, a new + connection to the local valor UCSC mirror is opened. + + Returns: + A list of row dictionaries for overlapping known genes, or False if + none are found. 
+ """ if cursor == None: cursor = valorGbdbConnect(interval.genome) selSQL = "SELECT * from knownGene kg LEFT JOIN knownToRefSeq krs ON kg.name = krs.name WHERE kg.chrom = '%s' AND (kg.txStart >= '%d' OR kg.txEnd >= '%d') AND (kg.txStart <= '%d' OR kg.txEnd <= '%d')" % (interval.chr,interval.start,interval.start,interval.end,interval.end) diff --git a/src/seqlib/genomelib.py b/src/seqlib/genomelib.py index 1cf0d84..d0b712b 100644 --- a/src/seqlib/genomelib.py +++ b/src/seqlib/genomelib.py @@ -1,10 +1,20 @@ -''' -Created on Aug 28, 2010 +"""Genome-level utilities and constants for human genome builds. -This is a port of the genome.py module from seqtools (it is a work in progress) +Contains chromosome names, lengths, and base frequencies for hg18, along with +helper functions for fetching genome sequences (via pygr), generating random +genomic regions, building repeat-masker and refGene NLMSA indices, and +checking whether a sequence is soft-masked. -@author: lgoff -''' +Note: Functions that depend on pygr (pygrConnect, build_rmsk_nlmsa, +refGene_nlmsa, fetchSequence) are non-functional in Python 3 because pygr +is a Python 2-only library. + +This is a port of the genome.py module from seqtools (work in progress). + +Originally created on Aug 28, 2010. + +Author: lgoff +""" ############ #Imports ############ @@ -94,6 +104,19 @@ #Functions ####### def fetch_genbases(genhandle,genbases={}): + """Count occurrences of each nucleotide across an entire genome FASTA file. + + Iterates over all sequences in the FASTA file and tallies A, T, G, C, and N + counts. Results are accumulated into the genbases dict. + + Args: + genhandle: An open file handle to a genome FASTA file. + genbases: Optional dict to accumulate counts into (default new dict). + Mutated in-place and also returned. + + Returns: + Dict mapping each base character to its total integer count. 
+ """ bases = ['A','T','G','C','N'] geniter = sequencelib.FastaIterator(genhandle) for genseq in geniter: @@ -123,6 +146,17 @@ def random_region(n,m=1): return c, start, end, strand def isMasked(s): + """Return True if the sequence contains any soft-masked or N characters. + + Soft-masked characters are lowercase a, c, t, g, and n, plus uppercase N. + + Args: + s: DNA sequence string. + + Returns: + True if any character in s is in the set {a, c, t, g, n, N}, + False otherwise. + """ maskedChars='actgnN' for c in s: if c in maskedChars: @@ -136,6 +170,25 @@ def isMasked(s): #SeqPath = pygr.Data.Bio.Seq.Genome.HUMAN.hg18 def pygrConnect(genome="hg18",useWorldbase = False): + """Return a pygr genome sequence database handle for the given build. + + Note: pygr is a Python 2-only library and is not available in Python 3. + This function will raise an ImportError or NameError at call time in + Python 3 environments. + + Args: + genome: Genome build identifier string. Supported values: "hg18", + "hg19", "mm9", "mm8" (worldbase only for mm8). + useWorldbase: If True, connect via pygr's worldbase service. If + False (default), open the local FASTA file via SequenceFileDB. + + Returns: + A pygr SequenceFileDB or worldbase genome object supporting + chromosome-level sequence access. + + Raises: + AssertionError: If genome is not recognised. + """ if useWorldbase: if genome == "hg18": res=worldbase.Bio.Seq.Genome.HUMAN.hg18() @@ -161,20 +214,50 @@ def pygrConnect(genome="hg18",useWorldbase = False): #pygr annotation layers #This is very closely tied to valor class UCSCStrandDescr(object): + """A descriptor that converts UCSC strand strings to pygr orientation ints. + + Returns 1 for "+" strand and -1 for all other strands. Intended to be + used as a class attribute on sqlgraph row classes. + """ def __get__(self, obj, objtype): + """Return orientation integer for the row object's strand. + + Args: + obj: The row instance whose strand attribute is read. 
+ objtype: The owner class (unused). + + Returns: + 1 if obj.strand == "+", otherwise -1. + """ if obj.strand == '+': return 1 else: return -1 class UCSCSeqIntervalRow(sqlgraph.TupleO): + """A sqlgraph TupleO row class for UCSC interval tables. + + Adds an orientation attribute via UCSCStrandDescr, converting the + strand column to a pygr-compatible +1/-1 integer. + """ orientation = UCSCStrandDescr() serverInfo = sqlgraph.DBServerInfo(host='localhost',user='root',passwd='') def build_rmsk_nlmsa(genome="hg19"): + """Build a pygr NLMSA index for the RepeatMasker annotation table. + + Connects to the local UCSC MySQL server, creates an AnnotationDB over + the rmsk table, and writes the NLMSA index to disk for later use. + + Note: Requires a running local MySQL server with the UCSC schema and + pygr installed (Python 2 only). + + Args: + genome: Genome build string (default "hg19"). + """ #This is horse shit... - + seqDB = pygrConnect(genome) rmsk = sqlgraph.SQLTable('hg19.rmsk',serverInfo=serverInfo,itemClass=UCSCSeqIntervalRow,primaryKey="lookupName") annodb = annotation.AnnotationDB(rmsk, @@ -191,9 +274,23 @@ def build_rmsk_nlmsa(genome="hg19"): al.build() def refGene_nlmsa(genome="hg19"): + """Return a pygr NLMSA index for the refGene annotation table. + + Attempts to load a pre-built NLMSA from disk. If not found, builds one + from the local UCSC MySQL refGene table and saves it to disk. + + Note: Requires a running local MySQL server with a 'lookupName' primary + key added to the refGene table, and pygr installed (Python 2 only). + + Args: + genome: Genome build string (default "hg19"). + + Returns: + A cnestedlist.NLMSA object opened in read mode. + """ #Needed to add primary key 'lookupName' to hg19.refGene for this to work (pygr requires unique ids for an annotation) #This is really CRAP....I don't know how or why anyone will every be able to use this.... 
- + try: al = cnestedlist.NLMSA('/n/rinn_data1/indexes/human/'+genome+'/refGene/refGene_'+genome,'r') except: @@ -223,6 +320,20 @@ def refGene_nlmsa(genome="hg19"): #MISC ################ def fetchSequence(chrom,start,end,strand,genome="hg18"): + """Fetch a genomic sequence from the specified region using pygr. + + Note: Requires pygr (Python 2 only). + + Args: + chrom: Chromosome name string (e.g. "chr1"). + start: Start coordinate (0-based, integer). + end: End coordinate (integer). + strand: Strand string; if "-" the reverse complement is returned. + genome: Genome build string (default "hg18"). + + Returns: + A pygr sequence object for the requested region. + """ connection=pygrConnect(genome) start,end=int(start),int(end) seq=connection[chrom][start:end] diff --git a/src/seqlib/gibson.py b/src/seqlib/gibson.py index 4223ca3..87a3367 100644 --- a/src/seqlib/gibson.py +++ b/src/seqlib/gibson.py @@ -1,10 +1,14 @@ -''' -Created on Sep 19, 2012 +"""Tools for designing Gibson Assembly fragments from FASTA sequences. -Script to create gibson assembly fragments for ordering from a fasta file. +Reads a FASTA file of sequences (e.g. cDNAs or genomic regions) and splits +each into overlapping fragments suitable for Gibson Assembly cloning. +Optionally prepends Gateway attB recombination sequences to the outermost +primers. Fragments are written in a tab-delimited or pretty-printed format. -@author: lgoff -''' +Usage:: + + python gibson.py [options] +""" #Imports import getopt import sys @@ -31,10 +35,42 @@ ''' class Usage(Exception): + """Exception raised for command-line usage errors. + + Attributes: + msg: Human-readable explanation of the error or the help message. + """ def __init__(self, msg): + """Initialises a Usage exception with an error message. + + Args: + msg: Human-readable error or help text. + """ self.msg = msg def gibson(fname,gateway=True,fragSize=500,overhangSize=20): + """Splits FASTA sequences into overlapping Gibson Assembly fragments. 
+ + Reads each record from a FASTA file and divides its sequence into a series + of fragments of approximately fragSize bp, with consecutive fragments + overlapping by overhangSize bp. When gateway is True, the Gateway attB + forward site (attF) is prepended to the sequence and the reverse + complement of the Gateway attB reverse site (attR) is appended before + fragmentation. + + Args: + fname: Path to a FASTA-format input file. + gateway: If True, add Gateway attB recombination sequences flanking + the insert before fragmentation (default: True). + fragSize: Target size in base pairs for each Gibson fragment + (default: 500). + overhangSize: Length in base pairs of the overlap between adjacent + fragments (default: 20). + + Returns: + A dictionary mapping each FASTA record name to a list of fragment + sequence strings in 5'-to-3' order. + """ res = {} #Fasta file handle @@ -63,6 +99,17 @@ def gibson(fname,gateway=True,fragSize=500,overhangSize=20): return res def printGibson(fragDict,outHandle): + """Writes Gibson Assembly fragments to a file handle in tab-delimited format. + + For each sequence in fragDict, prints a header line with the sequence name + followed by one line per fragment in the format: + _block\\t + + Args: + fragDict: Dictionary mapping sequence names to lists of fragment + sequence strings, as returned by gibson(). + outHandle: Writable file-like object to receive the output. + """ for k in fragDict.keys(): print("%s:" % k, file=outHandle) blockCount = 0 @@ -77,6 +124,18 @@ def printGibson(fragDict,outHandle): # Main ############## def main(argv=None): + """Command-line entry point for the Gibson Assembly fragment designer. + + Parses command-line arguments, calls gibson() to generate fragments from + the provided FASTA file, and writes the results with printGibson(). + + Args: + argv: List of command-line argument strings. Defaults to sys.argv + when None. + + Raises: + SystemExit: On usage errors or when --help is requested. 
+ """ if argv is None: argv = sys.argv verbose = False diff --git a/src/seqlib/go.py b/src/seqlib/go.py index 0d3f1ba..a7855e2 100644 --- a/src/seqlib/go.py +++ b/src/seqlib/go.py @@ -1,10 +1,30 @@ +"""Gene Ontology (GO) database parsing and traversal utilities. + +Provides classes and functions for loading a Gene Ontology OBO-XML file, +representing GO terms, and traversing the GO DAG to retrieve all ancestor +terms for a given GO accession. Includes deprecated tab-delimited annotation +file readers. +""" import xml.sax.handler from xml.sax import make_parser from xml.sax.handler import feature_namespaces def readGo(filename): - """DEPRECATED""" + """Reads a tab-delimited GO annotation file and returns a mapping of gene IDs to GO terms. + + DEPRECATED: This function relies on the Python 2 built-in file() and the + non-standard Dict class. It is retained for historical reference only. + + Args: + filename: Path to a tab-delimited GO annotation file where column 0 + contains the gene/feature identifier and column 4 contains the + GO term. Lines containing 'GI:' are skipped. + + Returns: + A Dict (default list) mapping gene identifiers to lists of GO term + strings. + """ terms = Dict(default=[]) for line in file(filename): @@ -20,7 +40,19 @@ def readGo(filename): def readCommonNames(filename): - """DEPRECATED""" + """Reads a tab-delimited file mapping identifiers to common gene names. + + DEPRECATED: Relies on the Python 2 built-in file(). Retained for + historical reference only. + + Args: + filename: Path to a two-column tab-delimited file where column 0 is + the primary identifier and column 1 is the common name ('-' + entries are skipped). + + Returns: + A dictionary mapping primary identifiers to common name strings. + """ commonNames = {} for line in file(filename): @@ -33,7 +65,18 @@ def readCommonNames(filename): class GoTerm: + """Represents a single Gene Ontology term. + + Attributes: + accession: GO accession string (e.g. 'GO:0008150'). 
+ name: Human-readable term name (e.g. 'biological process'). + definition: Textual definition of the term. + is_a: List of parent GO accession strings linked by 'is_a' relations. + part_of: List of parent GO accession strings linked by 'part_of' + relations. + """ def __init__(self): + """Initialises a GoTerm with empty/default attribute values.""" self.accession = "" self.name = "" self.definition = "" @@ -42,21 +85,56 @@ def __init__(self): # self.synonym = [] class AllTerm(GoTerm): + """Synthetic top-level GO term used as the root of the GO hierarchy. + + AllTerm has a fixed accession and name of 'all' and is added to the + GoDatabase after parsing to provide a single root node for traversal. + """ def __init__(self): + """Initialises AllTerm with accession='all' and name='all'.""" GoTerm.__init__(self) - + self.accession = "all" self.name = "all" - self.defintion = "top-level term" + self.defintion = "top-level term" class GoHandler(xml.sax.handler.ContentHandler): + """SAX content handler for parsing Gene Ontology OBO-XML files. + + Builds a dictionary of GoTerm objects from a GO OBO-XML file as it is + streamed through a SAX parser. Handles go:term, go:is_a, go:part_of, + go:accession, go:name, and go:definition elements. + + Attributes: + terms: Dictionary mapping GO accession strings to GoTerm objects. + term: The GoTerm currently being parsed, or None between terms. + elm: Name of the XML element currently open, used to route character + data to the correct GoTerm attribute. + base: URL prefix for the GO namespace, used to strip absolute URIs + to relative accession strings in is_a and part_of relations. + """ def __init__(self, base): + """Initialises the GoHandler with a namespace base URL. + + Args: + base: URL prefix for the GO namespace + (e.g. 'http://www.geneontology.org/go#'). + """ self.terms = {} self.term = None self.elm = "" self.base = base - + def startElement(self, name, attrs): + """Handles the opening of an XML element during SAX parsing. 
+ + Creates a new GoTerm when a go:term element opens, and appends + parent accessions when go:is_a or go:part_of elements are encountered. + + Args: + name: Local name of the XML element. + attrs: AttributesImpl object providing element attributes. + """ if name == "go:term": self.term = GoTerm() elif name == "go:is_a": @@ -70,11 +148,24 @@ def startElement(self, name, attrs): self.elm = name def endElement(self, name): + """Handles the closing of an XML element during SAX parsing. + + Stores the completed GoTerm in the terms dictionary when a go:term + element closes, and resets the current element tracker. + + Args: + name: Local name of the closing XML element. + """ if name == "go:term": self.terms[self.term.accession] = self.term self.elm = "" def characters(self, text): + """Routes character data to the appropriate attribute of the current GoTerm. + + Args: + text: Character data string from the SAX parser. + """ if self.elm == "go:accession": self.term.accession = text elif self.elm == "go:name": @@ -84,7 +175,22 @@ def characters(self, text): class GoDatabase: + """In-memory representation of a Gene Ontology database loaded from OBO-XML. + + Parses a GO OBO-XML file using SAX and stores all terms in a dictionary + indexed by GO accession. Provides methods for traversing the GO DAG to + retrieve ancestor terms. + + Attributes: + terms: Dictionary mapping GO accession strings to GoTerm objects. + Also includes an 'all' entry (AllTerm) as the synthetic root. + """ def __init__(self, filename): + """Loads and parses a Gene Ontology OBO-XML file. + + Args: + filename: Path to a GO OBO-XML file (e.g. gene_ontology.obo.xml). + """ # Create a parser parser = make_parser() @@ -107,6 +213,28 @@ def __init__(self, filename): def getAllParents(self, goid, touched=None, count=0, ret=True): + """Returns all ancestor GO terms of a given GO accession via BFS. 
+ + Recursively follows is_a and part_of relationships to collect all + ancestor GO accessions in breadth-first discovery order (excluding + the synthetic 'all' root). + + Args: + goid: A GO accession string (e.g. 'GO:0008150') whose ancestors + should be retrieved. + touched: Dictionary used internally to track visited accessions + and their discovery order. Should not be passed by callers. + count: Integer counter used internally during recursion. Should + not be passed by callers. + ret: If True (default), return the sorted list of ancestor + accessions. If False, only populate touched (used during + recursion). + + Returns: + When ret is True, a list of GO accession strings for all ancestors + of goid, ordered by discovery sequence (breadth-first). Returns + None when ret is False. + """ if touched == None: touched = {} diff --git a/src/seqlib/intervallib.py b/src/seqlib/intervallib.py index 6a67827..0f05bc9 100644 --- a/src/seqlib/intervallib.py +++ b/src/seqlib/intervallib.py @@ -1,9 +1,15 @@ #!/usr/bin/env python -''' -Created on Jun 25, 2009 +"""Genomic interval data structures and utilities. -@author: lgoff -''' +Provides the Interval and SplicedInterval classes for representing genomic +regions, along with a collection of functions for parsing BED/FASTA files, +performing interval arithmetic (overlaps, distances, TSS maps), and converting +intervals to various output formats. + +Originally created on Jun 25, 2009. + +Author: lgoff +""" # import genomelib import copy import os @@ -22,10 +28,46 @@ #This is very human-specific at this point class Interval: - """Basic interval class, try to use ChipInterval or SeqInterval if possible... - At this point, the Interval class is rather human specific so avoid calls to self.fetchSequence() or self.getChrNum(), etc... + """Basic genomic interval class. + + Represents a genomic region defined by chromosome, start, end, and strand. + Try to use ChipInterval or SeqInterval if possible. 
At this point, the + Interval class is rather human-specific, so avoid calls to + self.fetchSequence() or self.getChrNum() in non-human contexts. + + Attributes: + chr: Chromosome name (e.g. "chr1"). + start: 0-based start coordinate. + end: End coordinate (inclusive). + strand: Strand orientation ("+", "-", or "*"). + score: Floating-point score; can proxy for read count. + readcount: Integer read count for the interval (-1 if unset). + name: Human-readable name for the interval. + sequence: DNA sequence string for the interval (empty if not fetched). + data: Dictionary of arbitrary key-value metadata. + genome: Genome build identifier (default "hg18"). + TSS: Transcription start site coordinate based on strand. """ def __init__(self, chr, start, end, strand="*", score=0.0, readcount = -1,name="",sequence = "",data={},genome="hg18"): + """Initialize an Interval. + + If the first argument is an existing Interval instance, all attributes + are copied from it (copy constructor behaviour). + + Args: + chr: Chromosome name string, or an existing Interval to copy. + start: 0-based start coordinate. + end: End coordinate (inclusive). + strand: Strand orientation: "+", "-", or "*". + score: Floating-point score (default 0.0). + readcount: Integer read count (default -1 meaning unset). + name: Name string. If empty, a "chr:start-end:strand" label is + auto-generated. + sequence: DNA sequence string (default empty string). + data: Dictionary of arbitrary metadata (default empty dict). + genome: Genome build string used for sequence fetching + (default "hg18"). + """ #Check if creating new instance from old instance as 1st arg if isinstance(chr,Interval): @@ -67,6 +109,14 @@ def __init__(self, chr, start, end, strand="*", score=0.0, readcount = -1,name=" self.endIndex = -1 def getTSS(self): + """Return the transcription start site coordinate. + + Sets and returns self.TSS based on strand: start for "+" strand, + end for "-" strand. 
+ + Returns: + Integer coordinate of the TSS. + """ if self.strand == "+": self.TSS = self.start elif self.strand == "-": @@ -90,7 +140,17 @@ def childScores(self): return [x.score for x in self.children] def makeValMap(self,value = 'readcount'): - """Check these two to see which one is right...""" + """Build a positional value map across the interval from child intervals. + + Creates self.valMap, a numpy array of length len(self) where each + position holds the average of the specified attribute over all child + intervals that cover that position. Positions with no coverage are set + to -1. + + Args: + value: Name of the Interval attribute to average at each position + (default "readcount"). + """ self.valMap = np.zeros(len(self)) self.valMap = self.valMap-1 myTmp = [] @@ -104,32 +164,48 @@ def makeValMap(self,value = 'readcount'): self.valMap[nt]=sum(myTmp[nt])/len(myTmp[nt]) def __iter__(self): + """Iterate over characters in self.sequence.""" return iter(self.sequence) def __getitem__(self,key): + """Return character(s) at index/slice key from self.sequence.""" return self.sequence[key] def __repr__(self): + """Return the interval name, or a chr:start-end:strand string if name is empty.""" if self.name == "": return "%s:%d-%d:%s" % (self.chr,self.start,self.end,self.strand) else: return self.name def __neg__(self): + """Return a new Interval with the strand flipped.""" strandLookup = {"+":"-","-":"+"} newStrand = strandLookup[self.strand] return Interval(self.chr,self.start,self.end,newStrand,self.score,self.readcount) def __len__(self): + """Return the length of the interval in bases (end - start + 1).""" return self.end-self.start+1 def __str__(self): + """Return self.sequence if set, otherwise self.name.""" if self.sequence != "": return self.sequence else: return self.name def __lt__(self, b): + """Compare intervals by chromosomal position. + + Compares first by chromosome number, then by midpoint position. 
+ + Args: + b: Another Interval to compare against. + + Returns: + True if self sorts before b. + """ chr_test_a = self.getChrNum() chr_test_b = b.getChrNum() if chr_test_a != chr_test_b: @@ -139,15 +215,19 @@ def __lt__(self, b): return mid1 < mid2 def __eq__(self, b): + """Return True if self and b have the same chr, start, and end.""" return self.equals(b) def __le__(self, b): + """Return True if self is less than or equal to b.""" return self.__lt__(b) or self.__eq__(b) def __gt__(self, b): + """Return True if self is greater than b.""" return not self.__le__(b) def __ge__(self, b): + """Return True if self is greater than or equal to b.""" return not self.__lt__(b) def windows(self,windowSize): @@ -160,21 +240,53 @@ def toBed(self,value = 'score'): return "%s\t%d\t%d\t%s\t%.2f\t%s" %(self.chr,self.start,self.end,self.name,self.__dict__[value],self.strand) def toUCSC(self): + """Return a UCSC browser region string (chr:start-end). + + Returns: + String formatted as "chr:start-end". + """ return "%s:%d-%d" % (self.chr,self.start,self.end) def toStringNumIGV(self): + """Return an IGV-compatible numeric chromosome and start string. + + Strips the "chr" prefix from the chromosome name. + + Returns: + Tab-delimited string of numeric chromosome and start position. + """ return "%s\t%d" % (self.chr.replace("chr",""),self.start) def toFasta(self): + """Return the interval as a FASTA-formatted string. + + Returns: + String with a FASTA header line followed by self.sequence. + """ return ">%s\n%s" % (self.name,self.sequence) def getString(self): + """Return a chr:start-end:strand string representation. + + Returns: + String formatted as "chr:start-end:strand". + """ return "%s:%d-%d:%s" % (self.chr,self.start,self.end,self.strand) def getScore(self): + """Return self.score. + + Returns: + The floating-point score of the interval. + """ return self.score def getStrand(self): + """Return self.strand. + + Returns: + The strand string ("+", "-", or "*"). 
+ """ return self.strand def mature(self,start,end): @@ -225,7 +337,19 @@ def distanceBetweenTSS(self,b): return False def findDist(self,b): - """ + """Return the signed distance from self's TSS to b's relevant end. + + The relevant end of b depends on each interval's strand: + - self "+" and b "+": b.start - self.TSS + - self "+" and b "-": b.end - self.TSS + - self "-" and b "+": self.TSS - b.start + - self "-" and b "-": self.TSS - b.end + + Args: + b: Another Interval. + + Returns: + Signed integer distance. """ if self.strand == "+" and b.strand == "+": return b.start-self.TSS @@ -261,6 +385,14 @@ def getChrNum(self): else: return self.chr def fetchSequence(self): + """Fetch and store the genomic sequence for this interval via pygr. + + Uses self.genome to connect to the genome database. On "-" strand + the reverse complement is returned. Sets and returns self.sequence. + + Returns: + The DNA sequence string for the interval. + """ if self.genome != "": genome = genomelib.pygrConnect(self.genome) seq = genome[self.chr][self.start-1:self.end] @@ -297,6 +429,20 @@ def getGC(self): return self.gc def getPromoter(self,promUp=2000,promDown=0): + """Return an Interval representing the promoter region of self. + + For "+" strand, the promoter spans [start - promUp, start + promDown]. + For "-" strand, the promoter spans [end - promDown, end + promUp]. + + Args: + promUp: Number of bases upstream of the TSS to include + (default 2000). + promDown: Number of bases downstream of the TSS to include + (default 0). + + Returns: + A new Interval representing the promoter region. + """ if self.strand == "+": align = Interval(self.chr,self.start-promUp,self.start+promDown,self.strand,score=self.score,name=self.name+"_promoter") elif self.strand == "-": @@ -304,6 +450,12 @@ def getPromoter(self,promUp=2000,promDown=0): return align def fold(self): + """Predict RNA secondary structure of self.sequence using RNAfold. + + Runs RNAfold via subprocess on self.sequence. 
Sets self.structure to + the dot-bracket notation and self.mfe to the minimum free energy + (as a float). If parsing fails, both are set to the string "nan". + """ command = "echo '%s' | %s" % (self.sequence,RNAFOLD) output = subprocess.getoutput(command) if len(output.split())>2: @@ -314,15 +466,31 @@ def fold(self): return def getStructureFasta(self): + """Return the predicted RNA structure as a FASTA-formatted string. + + Returns: + String with a FASTA header followed by self.structure in + dot-bracket notation. + """ return ">%s\n%s" % (self.name,self.structure) def isPlus(self): + """Return True if the interval is on the "+" strand. + + Returns: + True if self.strand == "+", otherwise False. + """ if self.strand=="+": return True else: return False def isMinus(self): + """Return True if the interval is on the "-" strand. + + Returns: + True if self.strand == "-", otherwise False. + """ if self.strand=="-": return True else: @@ -342,26 +510,84 @@ def nmer_dictionary(self,n,dic={}): return dic def intersects(self,b,start='start',end='end',offset=0): + """Return True if self and b overlap on the same chromosome and strand. + + Args: + b: Another Interval. + start: Unused parameter name placeholder (default "start"). + end: Unused parameter name placeholder (default "end"). + offset: Optional integer offset added to b.end for looser matching + (default 0). + + Returns: + True if the intervals share chr and strand and their coordinates + overlap (optionally expanded by offset). + """ if self.chr == b.chr and self.strand==b.strand: return not(self.start>b.end+offset or b.start>self.end+offset) else: return False def grow5_prime(self,length): + """Extend the interval by length bases in the 5-prime direction. + + For "+" strand, decreases self.start by length. + For "-" strand, increases self.end by length. + + Args: + length: Number of bases to extend. 
+ """ if self.strand == "+": self.start = self.start-length elif self.strand == "-": self.end = self.end+length def grow3_prime(self,length): + """Extend the interval by length bases in the 3-prime direction. + + For "+" strand, increases self.end by length. + For "-" strand, decreases self.start by length. + + Args: + length: Number of bases to extend. + """ if self.strand == "+": self.end = self.end+length elif self.strand == "-": self.start = self.start-length class SplicedInterval(Interval): - """Extends Interval and Adds/overwrites methods to incorporate spliced elements""" + """Genomic interval with spliced (multi-exon) structure. + + Extends Interval with exon coordinate information parsed from BED12-style + blockSizes and blockStarts fields. Overrides __len__ to return the spliced + (CDS) length rather than the genomic footprint length. + + Attributes: + exonLengths: List of integer exon lengths. + exonOffsets: List of integer exon start offsets relative to self.start. + exonStarts: List of absolute genomic start coordinates for each exon. + exonEnds: List of absolute genomic end coordinates for each exon. + numExons: Number of exons. + """ def __init__(self, chr, start, end, strand="*",exonLengths=[],exonOffsets=[],score=0.0, readcount = -1,name="",sequence = "",data={},genome="hg18"): + """Initialize a SplicedInterval. + + Args: + chr: Chromosome name string. + start: Genomic start coordinate. + end: Genomic end coordinate. + strand: Strand orientation (default "*"). + exonLengths: Comma-separated string of exon lengths (BED12 field). + exonOffsets: Comma-separated string of exon offsets from start + (BED12 field). + score: Floating-point score (default 0.0). + readcount: Integer read count (default -1). + name: Interval name string. + sequence: DNA sequence string. + data: Dictionary of arbitrary metadata. + genome: Genome build string (default "hg18"). 
+ """ Interval.__init__(self,chr,start,end,strand,score=score, readcount = readcount,name=name,sequence = sequence,data=data,genome=genome) self.exonLengths = [int(x) for x in exonLengths.rstrip(",").split(",")] self.exonOffsets = [int(x) for x in exonOffsets.rstrip(",").split(",")] @@ -370,10 +596,15 @@ def __init__(self, chr, start, end, strand="*",exonLengths=[],exonOffsets=[],sco self.numExons = len(self.exonStarts) def __len__(self): + """Return the total spliced (CDS) length of all exons.""" return self.CDSlen() def intervalLen(self): - """Length of genomic footprint for self (ie. end-start+1)""" + """Length of genomic footprint for self (ie. end-start+1) + + Returns: + Integer genomic span from start to end inclusive. + """ return self.end-self.start+1 def CDSlen(self): @@ -722,6 +953,16 @@ def fetchRefSeqByChrom(RefSeqBed="/fg/compbio-t/lgoff/magda/references/human/tra return res def makeTSSBed(fname,outFname): + """Write a BED file of TSS positions derived from another BED file. + + For each interval, the end coordinate is collapsed to the start ("+") or + the start is collapsed to the end ("-") to produce a single-base TSS + interval. + + Args: + fname: Path to the input BED file. + outFname: Path to the output BED file to write. + """ iter = parseBed(fname) outHandle = open(outFname,'w') for i in iter: @@ -733,7 +974,17 @@ def makeTSSBed(fname,outFname): print(myInterval.toBed(), file=outHandle) def parseGalaxyCons(fname): - """Parses bed-like output of conservation fetch from Galaxy webserver""" + """Parse bed-like conservation output from the Galaxy webserver. + + Reads a tab-delimited file where field 6 (index 6) contains the average + phastCons conservation score. + + Args: + fname: Path to the Galaxy conservation BED-like file. + + Yields: + Interval objects with score set to the phastCons value. 
+ """ handle=open(fname,'r') for line in handle: if line.startswith("#"): @@ -754,7 +1005,19 @@ def parseGalaxyCons(fname): yield res def findNearest(myInterval,IntervalList): - """It would be nice to write some sort of binary search for Intervals""" + """Find the nearest interval to myInterval in IntervalList by start distance. + + Performs a linear scan. Only considers intervals with a positive distance + (i.e., intervals that are downstream/to the right of myInterval). + + Args: + myInterval: Reference Interval. + IntervalList: List of Interval objects to search. + + Returns: + The Interval in IntervalList with the smallest positive distance to + myInterval, or 0 if no such interval exists. + """ myDist = 9999999999999999999 res = 0 diff --git a/src/seqlib/lincClonelib.py b/src/seqlib/lincClonelib.py index ea26884..3da3b22 100644 --- a/src/seqlib/lincClonelib.py +++ b/src/seqlib/lincClonelib.py @@ -1,19 +1,19 @@ #!/usr/bin/env python -''' -Created on Aug 19, 2010 +"""Primer design pipeline for lincRNA cloning, qPCR, and in situ hybridisation. + +Wraps the primer3_core command-line tool to design three classes of primers +from FASTA sequences: cloning primers (with optional Gateway attB flanks), +qPCR primers, and in situ hybridisation probe primers. Output can be +formatted as human-readable text or as tab-delimited tables for downstream +processing. Requirements: - - primer3_core + - primer3_core executable on PATH -@author: Loyal Goff +Usage:: -TODO: -- Add bed file output for primers as option -- Integrate a few more primer3 options into commandline - * number of primers - * GC adjustment - * etc... -''' + python lincClonelib.py [options] +""" #from Bio.Emboss import Primer3 import getopt @@ -45,10 +45,41 @@ class Usage(Exception): + """Exception raised for command-line usage errors in lincClonelib. + + Attributes: + msg: Human-readable explanation of the error or the help message. 
+ """ def __init__(self, msg): + """Initialises a Usage exception with an error message. + + Args: + msg: Human-readable error or help text. + """ self.msg = msg def runPrimer3(fastaFile,p3CloneSetFile="/n/rinn_data1/users/lgoff/utils/primer_design/P3_cloning_primer_settings.p3",p3PCRSetFile="/n/rinn_data1/users/lgoff/utils/primer_design/P3_qPCR_primer_settings.p3",p3InsituSetFile="/n/rinn_data1/users/lgoff/utils/primer_design/P3_insitu_probe_settings.p3",verbose=False,keepTmp=False): + """Runs primer3_core to design qPCR, cloning, and in situ primers from a FASTA file. + + Creates three Boulder-IO input files from the FASTA sequences and launches + three parallel primer3_core processes (one per primer type), each with its + own settings file. Waits for all processes to complete before returning. + + Args: + fastaFile: Path to a FASTA file of sequences to design primers for. + Sequences shorter than clonePrimerSteps[-1] + PRIMER_MAX_SIZE + bases are skipped for cloning design. + p3CloneSetFile: Path to a primer3 settings file for cloning primers. + p3PCRSetFile: Path to a primer3 settings file for qPCR primers. + p3InsituSetFile: Path to a primer3 settings file for in situ primers. + verbose: If True, write progress messages to stderr (default: False). + keepTmp: If True, retain the temporary Boulder-IO input files after + the run (default: False). + + Returns: + A tuple of three strings: (qPCR_output_path, cloning_output_path, + insitu_output_path) giving the paths to the primer3 output files. + """ baseName = fastaFile.rstrip(".fa") iter = sequencelib.FastaIterator(open(fastaFile,'r')) cloneTmpFname = baseName+"_clone.p3in" @@ -97,17 +128,44 @@ def runPrimer3(fastaFile,p3CloneSetFile="/n/rinn_data1/users/lgoff/utils/primer_ return (baseName+"_qPCR.p3out",baseName+"_cloning.p3out",baseName+"_insitu.p3out") def test(): + """Smoke test for runPrimer3 using a hard-coded FASTA file. + + Calls runPrimer3 on 'lincSFPQ.fa' and returns nothing. 
Intended for + interactive testing only. + """ fastaFile="lincSFPQ.fa" qPCR,cloning = runPrimer3(fastaFile) return def parsePrimer3(p3OutFile): + """Yields parsed primer3 Record objects from a primer3 output file. + + Opens the specified output file and delegates parsing to primer3lib.parse, + yielding one Record object per sequence entry. + + Args: + p3OutFile: Path to a primer3 output file (Boulder-IO format). + + Yields: + primer3lib.Record objects, each containing the sequenceID, template + sequence, and a list of Primer objects. + """ handle = open(p3OutFile,'r') iter = primer3lib.parse(handle) for record in iter: yield record def printqPCR(p3outFile,outHandle): + """Writes qPCR primer results in human-readable format. + + Parses primer3 output and writes a formatted, multi-line report of qPCR + primer pairs grouped by sequence ID. If no acceptable primers were found + for a sequence, a placeholder message is printed. + + Args: + p3outFile: Path to a primer3 qPCR output file. + outHandle: Writable file-like object to receive the formatted output. + """ recordIter = parsePrimer3(p3outFile) print("######################\n# qPCR Primers\n######################", file=outHandle) for record in recordIter: @@ -129,6 +187,18 @@ def printqPCR(p3outFile,outHandle): print("--------------------------------", file=outHandle) def printqPCRTabDelim(p3outFile,outHandle): + """Writes qPCR primer results in tab-delimited format. + + Parses primer3 output and writes one line per primer pair with columns: + sequenceID, primer type ('qPCR'), primer number, product size, forward + sequence, forward start, forward length, forward Tm, forward GC, reverse + sequence, reverse start, reverse length, reverse Tm, reverse GC. + + Args: + p3outFile: Path to a primer3 qPCR output file. + outHandle: Writable file-like object to receive the tab-delimited + output. 
+ """ recordIter = parsePrimer3(p3outFile) #print >>outHandle, "######################\n# qPCR Primers\n######################" for record in recordIter: @@ -145,6 +215,19 @@ def printqPCRTabDelim(p3outFile,outHandle): def printCloning(p3outFile,outHandle,gateway=False): + """Writes cloning primer results in human-readable format. + + Parses primer3 output and writes a formatted, multi-line report of + cloning primer pairs grouped by sequence ID. When gateway is True, + Gateway attB sequences are prepended to the forward and reverse primers + and 'Gateway' is noted in the output. + + Args: + p3outFile: Path to a primer3 cloning output file. + outHandle: Writable file-like object to receive the formatted output. + gateway: If True, prepend attF to forward and attR to reverse primers + for Gateway cloning (default: False). + """ recordIter = parsePrimer3(p3outFile) print("\n######################\n# Cloning Primers\n######################", file=outHandle) for record in recordIter: @@ -170,6 +253,21 @@ def printCloning(p3outFile,outHandle,gateway=False): print("--------------------------------", file=outHandle) def printCloningTabDelim(p3outFile,outHandle,gateway=False): + """Writes cloning primer results in tab-delimited format. + + Parses primer3 output and writes one line per primer pair with columns: + sequenceID, primer type ('Cloning'), primer number, product size, forward + sequence, forward start, forward length, forward Tm, forward GC, reverse + sequence, reverse start, reverse length, reverse Tm, reverse GC. When + gateway is True, attB sequences are prepended to the primer sequences. + + Args: + p3outFile: Path to a primer3 cloning output file. + outHandle: Writable file-like object to receive the tab-delimited + output. + gateway: If True, prepend attF to forward and attR to reverse primers + (default: False). 
+ """ recordIter = parsePrimer3(p3outFile) #print >>outHandle, "\n######################\n# Cloning Primers\n######################" for record in recordIter: @@ -190,6 +288,15 @@ def printCloningTabDelim(p3outFile,outHandle,gateway=False): print(outStr, file=outHandle) def printInsitu(p3outFile,outHandle): + """Writes in situ hybridisation primer results in human-readable format. + + Parses primer3 output and writes a formatted, multi-line report of in situ + probe primer pairs grouped by sequence ID. + + Args: + p3outFile: Path to a primer3 in situ output file. + outHandle: Writable file-like object to receive the formatted output. + """ recordIter = parsePrimer3(p3outFile) print("######################\n# InSitu Primers\n######################", file=outHandle) for record in recordIter: @@ -211,6 +318,18 @@ def printInsitu(p3outFile,outHandle): print("--------------------------------", file=outHandle) def printInsituTabDelim(p3outFile,outHandle): + """Writes in situ hybridisation primer results in tab-delimited format. + + Parses primer3 output and writes one line per primer pair with columns: + sequenceID, primer type ('InSitu'), primer number, product size, forward + sequence, forward start, forward length, forward Tm, forward GC, reverse + sequence, reverse start, reverse length, reverse Tm, reverse GC. + + Args: + p3outFile: Path to a primer3 in situ output file. + outHandle: Writable file-like object to receive the tab-delimited + output. + """ recordIter = parsePrimer3(p3outFile) #print >>outHandle, "######################\n# qPCR Primers\n######################" for record in recordIter: @@ -226,6 +345,17 @@ def printInsituTabDelim(p3outFile,outHandle): print(outStr, file=outHandle) def printInsitu(p3outFile,outHandle): + """Writes in situ hybridisation primer results in human-readable format (second definition). + + Duplicate of the earlier printInsitu definition; this version is the one + that Python will actually use at runtime. 
Parses primer3 output and writes + a formatted, multi-line report of in situ probe primer pairs grouped by + sequence ID. + + Args: + p3outFile: Path to a primer3 in situ output file. + outHandle: Writable file-like object to receive the formatted output. + """ recordIter = parsePrimer3(p3outFile) print("######################\n# InSitu Primers\n######################", file=outHandle) for record in recordIter: @@ -247,6 +377,19 @@ def printInsitu(p3outFile,outHandle): print("--------------------------------", file=outHandle) def printInsituTabDelim(p3outFile,outHandle): + """Writes ASO / in situ primer results in tab-delimited format (second definition). + + Duplicate of the earlier printInsituTabDelim definition; this version + overrides the first at runtime. Parses primer3 output for in situ / + ASO candidates and writes one tab-delimited line per primer pair with + an 'InSitu' type column. When no candidates are found, writes an 'ASO' + type placeholder line. + + Args: + p3outFile: Path to a primer3 output file. + outHandle: Writable file-like object to receive the tab-delimited + output. + """ recordIter = parsePrimer3(p3outFile) #print >>outHandle, "######################\n# ASO Candidates\n######################" for record in recordIter: @@ -262,6 +405,19 @@ def printInsituTabDelim(p3outFile,outHandle): print(outStr, file=outHandle) def main(argv=None): + """Command-line entry point for the lincRNA primer design pipeline. + + Parses command-line options, runs primer3 via runPrimer3, and writes + formatted primer output (human-readable or tab-delimited) to the output + file. Cleans up temporary primer3 output files unless keepTmp is set. + + Args: + argv: List of command-line argument strings. Defaults to sys.argv + when None. + + Raises: + SystemExit: On usage errors or when --help is requested. 
+ """ if argv is None: argv = sys.argv task = 'qpcr' diff --git a/src/seqlib/lincName.py b/src/seqlib/lincName.py index 8274798..c1fc87c 100644 --- a/src/seqlib/lincName.py +++ b/src/seqlib/lincName.py @@ -1,9 +1,22 @@ #!/usr/bin/env python -''' -Created on Aug 27, 2010 +"""Assigns systematic names to lincRNA loci based on proximity to RefSeq genes. -@author: lgoff -''' +Implements the naming scheme described in Guttman et al. for long intergenic +non-coding RNA (lincRNA) loci: + +- If the 5' end of a lincRNA overlaps the 5' end of a protein-coding gene on + the opposite strand by less than the overlap threshold, the lincRNA is named + 'linc--BP' (bidirectional promoter). +- If a lincRNA overlaps any protein-coding gene on the opposite strand without + satisfying the bidirectional criterion, it is named 'linc--AS' + (antisense). +- Otherwise, the lincRNA is named after the nearest downstream protein-coding + gene on the same strand: 'linc-' (single lincRNA) or + 'linc--' (multiple lincRNAs near the same gene). + +Requires GTFlib, dbConn, and intervallib packages, and a connection to the +UCSC genome browser MySQL server. +""" ############ #Imports @@ -45,7 +58,17 @@ #Classes ############ class Usage(Exception): + """Exception raised for command-line usage errors in lincName. + + Attributes: + msg: Human-readable explanation of the error or the help message. + """ def __init__(self, msg): + """Initialises a Usage exception. + + Args: + msg: Human-readable error or help text. + """ self.msg = msg @@ -54,7 +77,29 @@ def __init__(self, msg): ############ def test5PrimeOverlap(lincInt,geneInt): - """May need to validate this. I'm not sure this works when a lincRNA completely covers a PC gene on the opposite strand""" + """Determines whether the overlap between a lincRNA and a gene is at the lincRNA 5' end. 
+ + Tests whether a lincRNA interval overlaps a protein-coding gene such that + the overlap is at the 5' end of the lincRNA (and also involves the 5' end + of the gene on the opposite strand). Used to identify bidirectional + promoter pairs. + + Note: may not give correct results when a lincRNA completely spans a + protein-coding gene on the opposite strand. + + Args: + lincInt: An interval object for the lincRNA with strand, start, and + end attributes. + geneInt: An interval object for the overlapping protein-coding gene + with strand, start, and end attributes. + + Returns: + True if the overlap is at the 5' end of lincInt; False otherwise. + + Raises: + AssertionError: If the two intervals do not overlap. + ValueError: If the strand of lincInt cannot be determined. + """ assert lincInt.overlaps(geneInt) if lincInt.strand == "+": if lincInt.start <= geneInt.end and lincInt.end > geneInt.end: @@ -70,6 +115,22 @@ def test5PrimeOverlap(lincInt,geneInt): raise ValueError("Could not determine") def bpOverlap(lincInt,geneInt): + """Returns the number of base pairs of overlap between two genomic intervals. + + Sorts the four boundary coordinates and computes the inner distance as the + length of the shared region. + + Args: + lincInt: An interval object with start and end attributes. + geneInt: An interval object with start and end attributes that must + overlap with lincInt. + + Returns: + Integer number of overlapping base pairs between the two intervals. + + Raises: + AssertionError: If the two intervals do not overlap. + """ assert lincInt.overlaps(geneInt), "%s and %s do not overlap" % (lincInt.name,geneInt.name) bounds = [lincInt.start,lincInt.end,geneInt.start,geneInt.end] bounds.sort() @@ -78,6 +139,13 @@ def bpOverlap(lincInt,geneInt): return overlap def printLincs(handle,lincs): + """Writes a collection of lincRNA GTF records to a file handle. + + Args: + handle: Writable file-like object to receive the GTF output. 
+ lincs: Iterable of lincRNA objects, each exposing a getGTF() method + that returns a GTF-formatted string. + """ for linc in lincs: print(linc.getGTF(), end=' ', file=handle) @@ -86,6 +154,23 @@ def printLincs(handle,lincs): ############ def main(gtfFile,genome='hg19'): + """Assigns systematic names to all lincRNA loci in a GTF file. + + Reads lincRNA transcript models from gtfFile, retrieves protein-coding + RefSeq transcripts for the specified genome build, and applies the + bidirectional promoter, antisense, and proximity naming rules to produce + a set of named lincRNA objects. + + Args: + gtfFile: Path to a GTF file of unannotated lincRNA loci (as produced + by Cufflinks or similar assemblers). + genome: UCSC genome build identifier used to fetch RefSeq transcripts + (default: 'hg19'). + + Returns: + A set of lincRNA gene objects with updated name attributes following + the systematic naming convention. + """ #Parse GTF File for lincs lincIter = GTFlib.GTFGeneIterator(gtfFile,verbose=verbose) @@ -198,6 +283,12 @@ def main(gtfFile,genome='hg19'): #Tests ############ def test(): + """Runs a full naming test using hardcoded Broad Institute file paths. + + Calls main() on a hard-coded lincRNA GTF file, writes the named output + to a companion file, and prints a completion message to stderr. Intended + for interactive/development testing only. + """ fname = '/seq/rinnscratch/cole/ftp/assemblies/linc_catalog.gtf' outHandle = open('/seq/rinnscratch/cole/ftp/assemblies/linc_catalog_named.gtf','w') verbose=True diff --git a/src/seqlib/lincRNAs.py b/src/seqlib/lincRNAs.py index 84d58ad..0ff4d20 100644 --- a/src/seqlib/lincRNAs.py +++ b/src/seqlib/lincRNAs.py @@ -1,8 +1,10 @@ -''' -Created on Jun 3, 2010 +"""Utilities for processing lincRNA (long intergenic non-coding RNA) transcript models. 
-@author: lgoff -''' +Processes BED-format lincRNA annotations to fetch spliced sequences, insert +records into a MySQL database, generate transcript model PNG plots, and export +sequences to FASTA format. Requires a local MySQL instance at the Broad +Institute and the intervallib package. +""" import os import sys @@ -13,7 +15,18 @@ def main(bedFile,lincLotID): - + """Processes a BED file of lincRNA models and inserts them into the database. + + For each transcript in the BED file, fetches its spliced sequence, + creates a PNG transcript model plot, and bulk-inserts all records into the + lgoff_nextgen MySQL database using mysqlimport. + + Args: + bedFile: Path to a BED-format file of lincRNA transcript models. + lincLotID: Integer identifier for the lincRNA lot/batch being + processed; used as a foreign key in the database insert. + """ + #Setup environment if not os.path.exists('transcriptModels'): os.mkdir('transcriptModels') @@ -56,6 +69,19 @@ def main(bedFile,lincLotID): return def drawModelPNG(bedRecord,outDir=os.getcwd(),verbose=False): + """Generates a PNG transcript model image for a single BED record. + + Delegates to the BED record's makePNG method and optionally prints + progress information to stdout. + + Args: + bedRecord: An intervallib BED interval object that exposes a + makePNG(outDir) method and a name attribute. + outDir: Directory path where the PNG file will be written + (default: current working directory). + verbose: If True, print status messages indicating which transcript + model is being drawn (default: False). + """ if verbose: print("Making transcript model plot...") bedRecord.makePNG(outDir) @@ -64,7 +90,19 @@ def drawModelPNG(bedRecord,outDir=os.getcwd(),verbose=False): return def insertRecord(lincRNA,lincLotID): - """Does not work for some reason...""" + """Inserts a single lincRNA transcript record into the database. + + Constructs and executes an INSERT SQL statement for the transcripts table. 
+ The function references a module-level db cursor variable which must be + set before calling. Note: this function is known to be non-functional; + use the bulk mysqlimport approach in main() instead. + + Args: + lincRNA: An intervallib interval object with attributes: name, chr, + start, end, strand, exonLengths, exonOffsets, and splicedSequence. + lincLotID: Integer lot identifier to associate with the transcript + record in the database. + """ cursor = db.cursor() insert="INSERT INTO transcripts VALUES (NULL,'%s','%s','%d','%d','%s','%s','%s','%s','%d');" % (lincRNA.name,lincRNA.chr,lincRNA.start,lincRNA.end,lincRNA.strand,",".join([str(x) for x in lincRNA.exonLengths]),",".join([str(x) for x in lincRNA.exonOffsets]),lincRNA.splicedSequence,int(lincLotID)) @@ -77,6 +115,14 @@ def insertRecord(lincRNA,lincLotID): return def getDb(): + """Opens and returns a connection to the Broad Institute MySQL database. + + Connects to the lgoff_nextgen database on mysql.broadinstitute.org with + a hard-coded user and empty password. + + Returns: + A MySQLdb connection object for the lgoff_nextgen database. + """ host="mysql.broadinstitute.org" user="lgoff" password="" diff --git a/src/seqlib/misc.py b/src/seqlib/misc.py index dae4235..7dffc92 100644 --- a/src/seqlib/misc.py +++ b/src/seqlib/misc.py @@ -1,4 +1,10 @@ #!/usr/bin/python +"""Miscellaneous utility functions for sequence analysis, data structures, and pretty printing. + +Provides tools for nuID encoding/decoding of nucleotide sequences, dictionary sorting, +pretty-printing of nested data structures, ranking/ordering utilities, and basic string +manipulation functions used across the seqlib package. +""" import sys @@ -6,8 +12,28 @@ #pygr tools ############# class Annot: - """Annotation class for pygr data""" + """Annotation class for pygr data. + + A lightweight container for genomic annotation records used with the pygr + genome database library. + + Attributes: + name: Identifier for the annotation (e.g. 
gene name or transcript ID). + chr: Chromosome name (e.g. 'chr1'). + strand: Strand orientation ('+' or '-'). + start: 0-based start coordinate of the annotation. + end: End coordinate of the annotation. + """ def __init__(self,name,chr,strand,start,end): + """Initialises an Annot instance. + + Args: + name: Identifier for the annotation. + chr: Chromosome name. + strand: Strand orientation ('+' or '-'). + start: 0-based start coordinate. + end: End coordinate. + """ self.name=name self.chr=chr self.strand=strand @@ -18,12 +44,44 @@ def __init__(self,name,chr,strand,start,end): #nuID implementation for python ################### def mreplace(s,chararray=['A','C','G','T','U'],newarray=['0','1','2','3','3']): + """Replaces multiple characters in a string using paired replacement arrays. + + Iterates over corresponding pairs from chararray and newarray, replacing + each occurrence of chararray[i] with newarray[i] in sequence. Defaults + map the nucleotide alphabet (A, C, G, T, U) to single-digit codes used + by the nuID encoding scheme. + + Args: + s: Input string to perform replacements on. + chararray: List of characters (or substrings) to replace. + newarray: List of replacement characters (or substrings), paired + positionally with chararray. + + Returns: + The modified string after all replacements have been applied. + """ for a,b in zip(chararray,newarray): s=s.replace(a,b) return s def seq2nuID(seq): - """Converts a string DNA or RNA sequence into its corresponding 'nuID'""" + """Converts a DNA or RNA sequence string into its corresponding nuID. + + The nuID (nucleotide identifier) is a compact, base-64-like encoding of a + nucleotide sequence that encodes both sequence content and a checksum + character. This implementation replaces the standard "_" character in the + code alphabet with "!" to avoid conflicts with SHRiMP alignment output + parsing. + + Args: + seq: A DNA or RNA sequence string (case-insensitive; 'U' is treated + identically to 'T'). 
+ + Returns: + A nuID string whose first character encodes checksum and padding + information and whose remaining characters encode successive triplets + of nucleotides in base-64 space. + """ """ Default code includes "_" as char. This conflicts with parsing for shrimp. So for my specific instance, @@ -55,6 +113,25 @@ def seq2nuID(seq): return id def nuID2seq(nuID): + """Decodes a nuID string back into the original nucleotide sequence. + + Reverses the nuID encoding produced by seq2nuID. The first character of + the nuID encodes checksum and padding length; the remaining characters are + decoded from base-64 triplets back to the ACGT alphabet. This + implementation uses "!" instead of "_" in the code alphabet (matching + seq2nuID) to avoid conflicts with SHRiMP output parsing. + + Args: + nuID: A nuID string as produced by seq2nuID. + + Returns: + The original DNA sequence string (uppercase ACGT). + + Raises: + AssertionError: If the nuID contains the '.' character as a check code + (which would indicate a coding error or invalid nuID), or if the + checksum validation fails. + """ """ Default code includes "_" as char. This conflicts with parsing for shrimp. So for my specific instance, "_" has been replaced with "!" @@ -98,7 +175,16 @@ def sort_by_value(d): return [ backitems[i][1] for i in range(0,len(backitems))] def sbv2(d,reverse=False): - ''' proposed in PEP 265, using the itemgetter ''' + """Returns dictionary items sorted by value, using itemgetter (PEP 265 approach). + + Args: + d: A dictionary to sort. + reverse: Not currently used; items are always sorted in descending + order by value regardless of this parameter. + + Returns: + A list of (key, value) tuples sorted by value in descending order. 
+ """ from operator import itemgetter return sorted(d.items(), key=itemgetter(1), reverse=True) @@ -110,6 +196,17 @@ def sortListofDicts(fieldname): return lambda x: x[fieldname] def sort_dict(d,reverse=True): + """Returns dictionary items sorted first by value then by key. + + Args: + d: A dictionary to sort. + reverse: If True (default), sort in descending order; if False, + sort in ascending order. + + Returns: + A list of (key, value) tuples sorted by (value, key) using the + specified direction. + """ return sorted(d.items(), key=lambda item: (item[1], item[0]), reverse=reverse) ######## @@ -118,6 +215,29 @@ def sort_dict(d,reverse=True): # ######## def pretty_print(f, d, level=-1, maxw=0, maxh=0, gap="", first_gap='', last_gap=''): + """Recursively pretty-prints a nested Python data structure to a file stream. + + Handles lists, tuples, dicts, class instances, and scalar values, printing + each with indentation that reflects the nesting depth. Optionally limits + the depth of recursion, the width of each printed line, and the number of + elements printed per container. + + Args: + f: Output file stream (e.g. sys.stdout or an open file handle). + d: The data structure to print. + level: Maximum recursion depth. -1 (default) means unlimited depth. + 0 means stop recursing and print a repr of the current element. + maxw: Maximum character width for a single printed line. 0 (default) + means no width limit. + maxh: Maximum number of elements to print from any list, tuple, or + dict at any recursion level. 0 (default) means no limit. + gap: Indentation prefix inserted before each element inside a + container. + first_gap: Prefix printed before the opening bracket/brace/paren of + a container, or before a scalar value. + last_gap: Prefix printed before the closing bracket/brace/paren of + a container. 
+ """ # depending on the type of expression, it recurses through its elements # and prints with appropriate indentation @@ -282,7 +402,23 @@ def pretty_print(f, d, level=-1, maxw=0, maxh=0, gap="", first_gap='', last_gap= f.write(first_gap+repr(d)+'\n') def pp(d,level=-1,maxw=0,maxh=0,parsable=0): - """ wrapper around pretty_print that prints to stdout""" + """Pretty-prints a data structure to stdout. + + Wrapper around pretty_print that writes to sys.stdout. When parsable is + set to a truthy value the standard library pprint module is used instead, + which produces output that can be eval'd back to the original structure. + + Args: + d: The data structure to print. + level: Maximum recursion depth passed to pretty_print. -1 means + unlimited. + maxw: Maximum line width passed to pretty_print (or pprint width when + parsable is set). 0 means no limit. + maxh: Maximum container height passed to pretty_print. 0 means no + limit. + parsable: If 0 (default), use pretty_print for human-readable output. + If non-zero, use the standard library pprint module. + """ if not parsable: pretty_print(sys.stdout, d, level, maxw, maxh, '', '', '') else: @@ -292,6 +428,12 @@ def pp(d,level=-1,maxw=0,maxh=0,parsable=0): pp2.pprint(d) def test_pp(): + """Runs a self-contained smoke test of the pp / pretty_print functions. + + Calls pp with a heterogeneous nested data structure containing dicts, + lists, tuples, integers, strings, and a lambda. Output is written to + stdout. No return value. + """ pp({'one': ('two',3,[4,5,6]), 7: (lambda x: 8*9), 'ten': ['ele', {'ven': 12, @@ -320,6 +462,26 @@ def ifab(test, a, b): # #################################### def sfill(s, length, fill_char = '.'): + """Pads a string on the right with a fill character until it reaches the target length. + + Example:: + + sfill('hello', 18, '.') -> 'hello.............' + # <--- 18 chars ---> + + Useful for aligning dictionary keys when pretty-printing: + ``one......: 1``, ``five.....: 5``, ``seventeen: 17``. 
+ + Args: + s: The input string to pad. + length: The desired total length of the returned string. + fill_char: The character used for padding (default: '.'). + + Returns: + The input string right-padded with fill_char to the specified length. + If the input string is already at least as long as length, it is + returned unchanged. + """ # Appends fill_char to the string s until it reaches length length # ex: sfill('hello',18,'.') -> hello............... # <--- 18 chars ---> @@ -336,6 +498,20 @@ def sfill(s, length, fill_char = '.'): return s + fill_char*(length-len(s)) def rstrips(s, suffix): + """Strips a specific suffix from the right end of a string. + + Unlike str.rstrip, this function removes the exact suffix string rather + than a set of characters. + + Args: + s: The input string. + suffix: The exact suffix to remove. If empty or not present at the + end of s, the string is returned unchanged. + + Returns: + The input string with the suffix removed from the right end, or the + original string if the suffix was not found. + """ if suffix and s.endswith(suffix): s = s[:-len(suffix)] return s @@ -459,6 +635,17 @@ def rank(x, NoneIsLast=True, decreasing = False, ties = "first"): return R def uniqify(seq): + """Returns the unique elements of an iterable as a list. + + Not order-preserving: the returned list may appear in arbitrary order + because uniqueness is tracked via a dictionary. + + Args: + seq: An iterable of hashable elements. + + Returns: + A list containing each unique element from seq exactly once. + """ # Not order preserving keys = {} for e in seq: diff --git a/src/seqlib/myDataTypes.py b/src/seqlib/myDataTypes.py index dea6473..8616d72 100644 --- a/src/seqlib/myDataTypes.py +++ b/src/seqlib/myDataTypes.py @@ -1,4 +1,10 @@ ''' +Custom data type implementations for seqlib/RNASeq data processing. 
+ +Provides a Stack (LIFO), a binary search tree (BinaryTree / BinaryNode / +EmptyNode), and a directed-graph (Graph) useful for path-finding in +acyclic graphs. + Created on Dec 14, 2009 My custom data types to help with RNASeq data @@ -9,113 +15,329 @@ class Stack: ''' - Basic 'stack' data type + A last-in, first-out (LIFO) stack data structure backed by a Python list. + + Supports push, pop, and peek operations, and delegates unknown attribute + lookups to the underlying list so list methods are accessible directly. ''' + def __init__(self, start=[]): ''' - Constructor + Initialize the Stack, optionally pre-loading it with items. + + Items from start are pushed in order and then reversed so that the + first element of start ends up at the top of the stack. + + Args: + start: An optional list of items to pre-load. Defaults to []. ''' self.stack = [] for x in start: self.push(x) self.reverse() - - def push(self,obj): + + def push(self, obj): + """Push an item onto the top of the stack. + + Args: + obj: The object to place on top of the stack. + """ self.stack = [obj] + self.stack - + def pop(self): + """Remove and return the item at the top of the stack. + + Returns: + The top item of the stack. + + Raises: + stack2.error: If the stack is empty (underflow). + """ if not self.stack: raise error('underflow') top, self.stack = self.stack[0], self.stack[1:] return top def top(self): + """Return the top item without removing it. + + Returns: + The item currently at the top of the stack. + + Raises: + stack2.error: If the stack is empty (underflow). + """ if not self.stack: raise error('underflow') return self.stack[0] - + def empty(self): + """Return True if the stack contains no items. + + Returns: + True if the stack is empty, False otherwise. 
+ """ return not self.stack - + #Overloads def __repr__(self): + """Return a string representation of the stack.""" return '[Stack:%s]' % self.stack - - def __cmp__(self,other): + + def __cmp__(self, other): + """Compare this stack to another by their underlying lists.""" return cmp(self.stack, other.stack) - + def __len__(self): + """Return the number of items in the stack.""" return len(self.stack) - - def __add__(self,other): + + def __add__(self, other): + """Concatenate two stacks and return a new Stack. + + Args: + other: Another Stack instance to append. + + Returns: + A new Stack containing items from this stack followed by other's. + """ return Stack(self.stack+other.stack) - - def __mul__(self,reps): + + def __mul__(self, reps): + """Repeat the stack contents reps times and return a new Stack. + + Args: + reps: An integer number of times to repeat. + + Returns: + A new Stack with the contents repeated reps times. + """ return Stack(self.stack * reps) - - def __getitem__(self,offset): + + def __getitem__(self, offset): + """Return the item at the given index. + + Args: + offset: An integer index into the underlying list. + + Returns: + The item at position offset. + """ return self.stack[offset] - - def __getslice__(self,low,high): + + def __getslice__(self, low, high): + """Return a new Stack containing the slice from low to high. + + Args: + low: The start index of the slice. + high: The end index of the slice (exclusive). + + Returns: + A new Stack containing the sliced elements. + """ return Stack(self.stack[low:high]) - - def __getattr__(self,name): - return getattr(self.stack,name) + + def __getattr__(self, name): + """Delegate attribute lookup to the underlying list. + + Args: + name: The attribute name to look up on the underlying list. + + Returns: + The attribute from the underlying list. 
+ """ + return getattr(self.stack, name) ################## #Binary Trees ################## class BinaryTree: + """A binary search tree that delegates to recursive BinaryNode/EmptyNode objects. + + Stores values in sorted order and supports O(log n) average-case lookup + and insertion. Duplicate values are silently ignored. + """ + def __init__(self): + """Initialize an empty BinaryTree.""" self.tree = EmptyNode() + def __repr__(self): + """Return a parenthesized string representation of the tree.""" return repr(self.tree) - def lookup(self,value): + + def lookup(self, value): + """Return 1 if value exists in the tree, 0 otherwise. + + Args: + value: The value to search for. + + Returns: + 1 if the value is present, 0 if it is not. + """ return self.tree.lookup(value) - def insert(self,value): + + def insert(self, value): + """Insert value into the tree, maintaining sort order. + + If value already exists in the tree it is not inserted again. + + Args: + value: The value to insert. + """ self.tree = self.tree.insert(value) class EmptyNode: + """Sentinel node representing an empty position in a BinaryTree. + + Acts as the leaf terminator: lookup always fails and insert creates a + new BinaryNode at this position. + """ + def __repr__(self): + """Return '*' to represent an empty node.""" return "*" - def lookup(self,value): #Fail at the bottom + + def lookup(self, value): + """Return 0 because an empty node contains no value. + + Args: + value: The value being searched for (unused). + + Returns: + Always 0 (not found). + """ return 0 - def insert(self,value): - return BinaryNode(self,value,self) #Add new node at bottom + + def insert(self, value): + """Create a new BinaryNode at this position with value. + + Args: + value: The value to store in the new node. + + Returns: + A new BinaryNode with empty left and right children. 
+ """ + return BinaryNode(self, value, self) #Add new node at bottom class BinaryNode: - def __init__(self,left,value,right): - self.data,self.left,self.right = value,left,right - def lookup(self,value): + """An internal node of a binary search tree holding a value and two subtrees. + + Attributes: + data: The value stored at this node. + left: The left child node (values less than data). + right: The right child node (values greater than data). + """ + + def __init__(self, left, value, right): + """Initialize a BinaryNode with left subtree, a value, and right subtree. + + Args: + left: The left child (a BinaryNode or EmptyNode). + value: The value to store at this node. + right: The right child (a BinaryNode or EmptyNode). + """ + self.data, self.left, self.right = value, left, right + + def lookup(self, value): + """Search for value in the subtree rooted at this node. + + Args: + value: The value to search for. + + Returns: + 1 if value is found in this subtree, 0 otherwise. + """ if self.data == value: return 1 - elif self.data>value: + elif self.data > value: return self.left.lookup(value) else: return self.right.lookup(value) - def insert(self,value): + + def insert(self, value): + """Insert value into the subtree rooted at this node. + + Traverses left if value is less than this node's data, right if + greater. Equal values are ignored (no duplicates stored). + + Args: + value: The value to insert. + + Returns: + This node (possibly with an updated child subtree). + """ if self.data > value: self.left = self.left.insert(value) elif self.data < value: self.right = self.right.insert(value) return self + def __repr__(self): + """Return a parenthesized inorder string representation of this subtree.""" return '( %s, %s, %s )' % (repr(self.left), repr(self.data), repr(self.right)) ################ #Directed Acyclic Graphs ################ class Graph: - def __init__(self,label,extra=None): + """A node in a directed acyclic graph (DAG) that supports path-finding. 
+ + Each Graph node has a label, optional data payload, and a list of + outgoing edges to other Graph nodes. Multiple paths between nodes are + found via depth-first search and stored as class-level state in + Graph.solns. + + Attributes: + name: A string label identifying this node. + data: An optional data payload associated with this node. + edges: A list of Graph nodes reachable from this node. + """ + + def __init__(self, label, extra=None): + """Initialize a Graph node with a label and optional data. + + Args: + label: A string name for this node. + extra: An optional data object to associate with the node. + Defaults to None. + """ self.name = label self.data = extra self.edges = [] + def __repr__(self): + """Return the node's label as its string representation.""" return self.name - def search(self,goal): + + def search(self, goal): + """Find all acyclic paths from this node to goal. + + Resets Graph.solns, performs a depth-first search via generate(), + and sorts found paths by length (shortest first). + + Args: + goal: A Graph node to search for. + + Returns: + A list of paths (each path is a list of Graph nodes) from this + node to goal, sorted by path length ascending. + """ Graph.solns = [] - self.generate([self],goal) - Graph.solns.sort(lambda x,y: cmp(len(x), len(y))) + self.generate([self], goal) + Graph.solns.sort(lambda x, y: cmp(len(x), len(y))) return Graph.solns - def generate(self,path,goal): + + def generate(self, path, goal): + """Recursively explore paths from this node towards goal. + + Appends the current path to Graph.solns when goal is reached. + Avoids cycles by checking whether each neighbor is already in the + current path before recursing. + + Args: + path: A list of Graph nodes representing the current path from + the search origin to this node. + goal: A Graph node to find. 
+ """ if self == goal: Graph.solns.append(path) else: diff --git a/src/seqlib/mySam.py b/src/seqlib/mySam.py index 341d89f..570abd9 100644 --- a/src/seqlib/mySam.py +++ b/src/seqlib/mySam.py @@ -1,6 +1,12 @@ ''' +Miscellaneous tools to get information from a SAM/BAM file. + +Provides utilities for parsing SAM/BAM alignment files, computing read +pileups, fetching strand-specific coverage arrays, and plotting read +density across genomic intervals. Built on top of pysam. + Created on Oct 25, 2009 -Misc tools to get information from a SAM/BAM file... + @author: lgoff ''' import array @@ -17,20 +23,65 @@ # from inOut.wiggle import WiggleFileWriter # NOTE: inOut.wiggle module not available; WiggleFileWriter commented out class SAMAlignment(Alignment): - """Basic object for SAMstring (extends Alignment class)""" + """Basic object representing a single SAM alignment record. + + Extends the Alignment base class with SAM-specific fields for the + CIGAR string and base-quality string. + + Attributes: + qual: Base-quality string from SAM field 11. + cigar: CIGAR string from SAM field 6 describing the alignment. + """ + def __init__(self,readname,chr,start,end,strand,score,readcount,readsequence,cigar,qualstring): + """Initialises a SAMAlignment. + + Args: + readname: Query template name (SAM field 1). + chr: Reference sequence name / chromosome (SAM field 3). + start: 1-based leftmost mapping position (SAM field 4). + end: Computed end position (start + read length - 1). + strand: Strand of the alignment, one of '+' or '-'. + score: Mapping quality score (SAM field 5). + readcount: Number of reads represented by this alignment + (typically 1 for a single record). + readsequence: Read sequence bases (SAM field 10). + cigar: CIGAR string describing alignment operations (SAM field 6). + qualstring: ASCII-encoded base-quality string (SAM field 11). 
+ """ Alignment.__init__(self,readname,chr,start,end,strand,score=readcount,readcount = readcount,readsequence=readsequence) self.qual = qualstring self.cigar = cigar def SAMReader(fname): - """Iterator for SAMAlignment records (depricated, use pysam)""" + """Iterate over SAM alignment records from a file. + + Deprecated — use pysam directly for new code. + + Args: + fname: Path to the SAM file. + + Yields: + An Interval object for each alignment record in the file. + """ handle = open(fname,'r') for line in handle: aln = parseSAMString(line) yield aln.toInterval() def parseSAMString(samstring): + """Parse a single SAM-format line into a SAMAlignment object. + + Reads are assumed to be non-paired and non-spliced; the end position is + derived from the start position plus the read-sequence length. + + Args: + samstring: A single tab-delimited SAM record line (no trailing + newline required — it is stripped internally). + + Returns: + A SAMAlignment instance populated from the SAM fields. + """ tokens = samstring.rstrip().split("\t") readname = tokens[0] chr = tokens[2] @@ -45,7 +96,22 @@ def parseSAMString(samstring): return SAMAlignment(readname,chr,start,end,strand,score,readcount,readsequence,cigar,qualstring) def pileup2wig(fname,shortname,outDir=os.getcwd()+"/"): - """Don't use this...it's lazy and it doesn't feel right""" + """Convert a samtools pileup file to strand-specific wiggle files. + + Reads a samtools pileup output file and writes two variableStep wiggle + files: one for the plus strand (forward reads, indicated by '.') and one + for the minus strand (reverse reads, indicated by ','). This + implementation is noted as incomplete / not recommended for production + use. + + Args: + fname: Path to the samtools pileup file to read. + shortname: Base name used for both the wiggle track labels and the + output file names (``_plus.wig`` and + ``_minus.wig``). + outDir: Directory in which the output wiggle files are written. 
+ Defaults to the current working directory. + """ handle = open(fname,'r') preRef = '' prePos = -1 @@ -56,6 +122,16 @@ def pileup2wig(fname,shortname,outDir=os.getcwd()+"/"): minusHand = open(outDir+shortname+"_minus.wig",'w') def wigHeader(shortname,strand): + """Build a UCSC wiggle track-definition header line. + + Args: + shortname: Base name used in the track name and description fields. + strand: Strand of the track, either '+' (blue) or '-' (red). + + Returns: + A wiggle track header string suitable for writing as the first + line of a wiggle file. + """ if strand=="+": color = '0,0,255' sName = 'plus' @@ -84,17 +160,49 @@ def wigHeader(shortname,strand): minusHand.close() class Counter: - """Use in callback function to store read counts within an alignment (includes those that - are not completely contained within the alignment""" + """Callable that accumulates a total read count for use as a pysam callback. + + Designed to be passed as a callback to pysam fetch/pileup methods. + Counts all reads that overlap the queried region, including those not + completely contained within it. + + Attributes: + mCounts: Running total of reads seen so far. + """ + mCounts = 0 + def __call__(self,alignment): + """Increment the read counter by one for each alignment seen. + + Args: + alignment: A pysam AlignedSegment (or compatible) object. + The alignment itself is not inspected; only its presence + increments the counter. + """ self.mCounts += 1 class StrandCounter: - """Provides a strand-specific number of reads as opposed to total read density""" + """Callable that accumulates strand-specific read counts for use as a pysam callback. + + Separates reads into forward (plus) and reverse (minus) strand tallies + rather than combining them into a single total. + + Attributes: + plusCount: Running total of forward-strand reads seen. + minusCount: Running total of reverse-strand reads seen. 
+ """ + plusCount = 0 minusCount = 0 + def __call__(self,alignment): + """Increment the appropriate strand counter for each alignment seen. + + Args: + alignment: A pysam AlignedSegment (or compatible) object. + Strand is determined from the ``is_reverse`` flag. + """ if alignment.is_reverse: self.minusCount += 1 else: @@ -102,14 +210,34 @@ def __call__(self,alignment): def getBitValue(n, p): - ''' - get the bitvalue of denary (base 10) number n at the equivalent binary - position p (binary count starts at position 0 from the right) - ''' + """Return the bit at position p of integer n. + + Extracts the single bit at binary position p (zero-indexed from the + least-significant bit) of the denary integer n. + + Args: + n: A non-negative integer whose bit is to be inspected. + p: Zero-based bit position (0 = least-significant / rightmost bit). + + Returns: + 1 if the bit at position p is set, 0 otherwise. + """ return (n >> p) & 1 def strandFlag(flag): - """Returns strand of sequence from SAM record bitflag (field 4)""" + """Determine the alignment strand from a SAM bitflag value. + + Inspects bit 4 (0x10) of the SAM FLAG field to determine whether the + read mapped to the reverse strand. + + Args: + flag: The integer SAM FLAG value (field 2), or a string + representation of it. + + Returns: + '+' if bit 4 is 0 (forward strand), '-' if bit 4 is 1 (reverse + strand), or '*' for any other value. + """ flag = int(flag) if getBitValue(flag,4)==0: return "+" @@ -119,11 +247,24 @@ def strandFlag(flag): return "*" def makeCigar(): + """Placeholder for CIGAR string construction. + + Not yet implemented. + """ pass def samScanByStrand(samFetch,strand): - """Generator to iterate over a samFetch using only one of the strands. - strand should be one of ["+","-"] + """Yield only reads that map to the specified strand from a pysam fetch iterator. + + Args: + samFetch: An iterable of pysam AlignedSegment objects, typically + returned by ``pysam.AlignmentFile.fetch()``. 
+ strand: The strand to retain. Must be '+' (forward, non-reverse + reads) or '-' (reverse reads). + + Yields: + pysam AlignedSegment objects whose strand matches the requested + strand value. """ for read in samFetch: if strand == "+": @@ -138,6 +279,19 @@ def samScanByStrand(samFetch,strand): continue def sam2Interval(samRead): + """Convert a pysam AlignedSegment to an intervallib Interval object. + + The interval uses 1-based coordinates (pysam's 0-based ``pos`` is + incremented by 1) and a readcount of 1. + + Args: + samRead: A pysam AlignedSegment object with valid ``rname``, + ``pos``, ``seq``, and ``is_reverse`` attributes. + + Returns: + An intervallib.Interval representing the read's mapped region, + with strand set to '+' or '-' according to ``samRead.is_reverse``. + """ if samRead.is_reverse: strand = "-" else: @@ -146,7 +300,26 @@ def sam2Interval(samRead): def samReadsIntersect(a,b,useStrand = True,offset=0): - """Checks to see if two samReads (a,b) intersect""" + """Determine whether two pysam AlignedSegment reads overlap each other. + + Two reads are considered to intersect if their mapped positions overlap + (allowing for an optional extension by ``offset`` bases). When + ``useStrand`` is True, reads on different strands or different reference + sequences are never considered to intersect. + + Args: + a: A pysam AlignedSegment object. + b: A pysam AlignedSegment object to compare against ``a``. + useStrand: If True (default), reads must be on the same reference + sequence and the same strand (``is_reverse`` must match) to + be considered intersecting. + offset: Number of extra bases by which each read's length is + extended before testing for overlap. Defaults to 0. + + Returns: + True if reads a and b overlap (subject to strand and offset rules), + False otherwise. 
+ """ if useStrand: if a.rname == b.rname and a.is_reverse == b.is_reverse: return not(a.pos>b.pos+len(b.seq)+offset or b.pos>a.pos+len(a.seq)+offset) @@ -178,6 +351,22 @@ def makeContiguousIntervals2(samHandle,start='start',end='end',offset=0,useStran currentInterval = sam2Interval(current) """ def makeContiguousIntervalsByStrand(samHandle,offset=0): + """Generate contiguous genomic intervals from a sorted BAM file, separately per strand. + + Iterates over all reads in the BAM file and merges overlapping reads + (with optional extension by ``offset``) into contiguous intervals. + Processing is performed independently for the forward ('+') and reverse + ('-') strands. + + Args: + samHandle: An open pysam AlignmentFile object (must be sorted). + offset: Number of bases by which read extents are extended when + testing for overlap. Defaults to 0. + + Yields: + intervallib.Interval objects representing contiguous merged regions, + with ``readcount`` reflecting the number of constituent reads. + """ for strand in ["+","-"]: samFetch = samScanByStrand(samHandle.fetch(),strand) current = next(samFetch) @@ -201,9 +390,41 @@ def generate_pileup_chunks(read_iterator, dtype=numpy.uint32, max_rlen=2048, chunk_size=8192): - ''' - don't use this function with RNA-seq data because it does not pileup spliced reads properly - ''' + """Generate read-pileup data in contiguous chunks across a genomic region. + + Iterates over a sorted stream of reads and accumulates per-base read + depth in fixed-size chunks, yielding each chunk as it is complete. + Reverse-strand reads may optionally be shifted upstream so that their + 5' end corresponds to the inferred fragment start. + + Note: Do not use with RNA-seq data — spliced reads are not handled + correctly. + + Args: + read_iterator: An iterable of pysam AlignedSegment objects sorted + by position. + start: 0-based start of the region to pileup. + end: 0-based (exclusive) end of the region to pileup. 
+ unique_only: If True (default), reads flagged as PCR/optical + duplicates (``is_duplicate``) are skipped. + merge_strands: If True, reverse-strand reads are shifted left by + ``(read_length - fragment_length)`` bases so both strands + contribute to the same inferred fragment positions. + fragment_length: Expected DNA fragment length used to extend reads. + A value <= 0 means use the actual read length unchanged. + dtype: numpy dtype for the internal accumulation array. + Defaults to numpy.uint32. + max_rlen: Maximum anticipated read length in bases. The internal + buffer is sized to accommodate this. Defaults to 2048. + chunk_size: Number of bases covered by each yielded chunk. + Must be >= max_rlen. Defaults to 8192. + + Yields: + Tuples of (chunk_start, chunk_end, chunk_array) where chunk_start + and chunk_end are offsets relative to ``start``, and chunk_array is + a numpy array of length (chunk_end - chunk_start) containing the + per-base read depth. + """ assert chunk_size >= max_rlen assert end > start # figure out the boundaries of the first chunk @@ -281,6 +502,26 @@ def bam_to_wiggle(inbamfile, wigfile, merge_strands=False, fragment_length=-1, norm=False): + """Convert a BAM file to a compressed wiggle file. + + Computes per-base read depth across every reference sequence in the BAM + file and writes the result as a wiggle file using WiggleFileWriter (from + the inOut.wiggle module). Note: WiggleFileWriter is currently + unavailable — calling this function will raise a NameError. + + Args: + inbamfile: Path to the input BAM file (must be sorted and indexed). + wigfile: Path to the output wiggle file to write. + unique_only: If True, reads flagged as PCR/optical duplicates are + excluded from the pileup. Defaults to False. + merge_strands: If True, reverse-strand reads are shifted upstream + so both strands reflect inferred fragment start positions. + Defaults to False. + fragment_length: Expected DNA fragment length used to extend reads. 
+ A value <= 0 means use the actual read length unchanged. + norm: If True, read depths are normalised to reads-per-kilobase per + million mapped reads (RPKM-style). Defaults to False. + """ #logger = logging.getLogger(__name__) bamfile = pysam.AlignmentFile(inbamfile, 'rb') @@ -334,7 +575,28 @@ def bam_to_wiggle(inbamfile, wigfile, bamfile.close() def bamFetchFlank(bamHandle,chr,pos,flankSize=1000,fragment_length=200): - """This does not work with gapped alignments""" + """Compute merged-strand read-depth in a window centred on a genomic position. + + Fetches reads from a BAM file within ``pos ± (flankSize + fragment_length)`` + and accumulates per-base coverage into a numpy array. Reverse-strand + reads are shifted upstream to align with their inferred fragment start. + + Note: Does not handle gapped (spliced) alignments correctly. + + Args: + bamHandle: An open pysam AlignmentFile object. + chr: Reference sequence name / chromosome to query. + pos: Centre position (0-based) of the window. + flankSize: Number of bases to include on each side of ``pos`` in the + returned array. Defaults to 1000. + fragment_length: Expected DNA fragment length used to extend reverse- + strand reads. A value <= 0 means use the actual read length. + Defaults to 200. + + Returns: + A numpy array of length ``2 * flankSize + 1`` containing the + per-base read depth centred on ``pos``. + """ #Create container to hold pos +- (flankSize+fragment_length) arr = numpy.zeros(2*(flankSize+fragment_length)+1) range = (pos-flankSize-fragment_length,pos+flankSize+fragment_length) @@ -358,7 +620,32 @@ def bamFetchFlank(bamHandle,chr,pos,flankSize=1000,fragment_length=200): return arr[fragment_length:fragment_length+2*flankSize+1] def bamFetchFlank_byStrand(bamHandle,chr,pos,flankSize=1000,fragment_length=200,span=1): - """This does not work with gapped alignments""" + """Compute strand-specific read-depth arrays in a window centred on a genomic position. 
+ + Similar to ``bamFetchFlank`` but returns separate arrays for the sense + (forward) and antisense (reverse) strands. Reverse-strand reads are + extended to the inferred fragment start when ``fragment_length`` exceeds + the read length. + + Note: Does not handle gapped (spliced) alignments correctly. + + Args: + bamHandle: An open pysam AlignmentFile object. + chr: Reference sequence name / chromosome to query. + pos: Centre position (0-based) of the window. + flankSize: Number of bases to include on each side of ``pos`` in + each returned array. Defaults to 1000. + fragment_length: Expected DNA fragment length used to extend reverse- + strand reads. A value <= 0 means use the actual read length. + Defaults to 200. + span: Step size for down-sampling the output arrays. A value of 1 + (default) returns every base; 2 returns every other base, etc. + + Returns: + A tuple (senseArr, antisenseArr) where each element is a numpy + array of length ``(2 * flankSize + 1) / span`` containing per-base + read depth for the respective strand, centred on ``pos``. + """ senseArr = numpy.zeros(2*(flankSize+fragment_length)+1) antisenseArr = numpy.zeros(2*(flankSize+fragment_length)+1) @@ -386,7 +673,30 @@ def bamFetchFlank_byStrand(bamHandle,chr,pos,flankSize=1000,fragment_length=200, return (senseArr[fragment_length:fragment_length+2*flankSize+1:span],antisenseArr[fragment_length:fragment_length+2*flankSize+1:span]) def bamFetchInterval(bamHandle,chr,start,end,fragment_length=200,span=1): - """This does not work with gapped alignments""" + """Compute strand-specific read-depth arrays across a genomic interval. + + Fetches reads from the BAM file that overlap ``[start, end]`` and + accumulates per-base read depth separately for the sense and antisense + strands. Reverse-strand reads whose actual length is less than + ``fragment_length`` are extended upstream to the inferred fragment start. + + Note: Does not handle gapped (spliced) alignments correctly. 
+ + Args: + bamHandle: An open pysam AlignmentFile object. + chr: Reference sequence name / chromosome to query. + start: 0-based start of the interval. + end: 0-based end of the interval (inclusive). + fragment_length: Expected DNA fragment length used to extend reads. + A value <= 0 means use the actual read length unchanged. + Defaults to 200. + span: Step size for down-sampling the output arrays. Defaults to 1. + + Returns: + A tuple (senseArr, antisenseArr) where each element is a numpy + array of length ``(end - start + 1) / span`` containing per-base + read depth for the respective strand across the interval. + """ senseArr = numpy.zeros(end-start+(2*fragment_length)+1) antisenseArr = numpy.zeros(end-start+(2*fragment_length)+1) @@ -415,6 +725,24 @@ def bamFetchInterval(bamHandle,chr,start,end,fragment_length=200,span=1): return(senseArr[fragment_length:fragment_length+intervalSize:span],antisenseArr[fragment_length:fragment_length+intervalSize:span]) def makeCigarMask(cigar,increment=1): + """Build a per-base mask vector from a CIGAR string. + + Parses a text CIGAR string and produces a flat list where each element + corresponds to one reference base consumed by the alignment. 'M' + (match/mismatch) operations contribute ``increment`` to each position; + 'N' (intron/skip) operations contribute 0. Other CIGAR operations that + do not consume reference bases (e.g. 'I', 'S', 'H', 'P') are omitted + from the output. + + Args: + cigar: A CIGAR string such as ``'36M'`` or ``'20M1000N16M'``. + increment: Value assigned to each matched ('M') reference base in + the output mask. Defaults to 1. + + Returns: + A list of numeric values (each 0 or ``increment``) with one entry + per reference base consumed by the alignment. + """ incrementTable = { 'M':increment, 'N':0 @@ -440,6 +768,25 @@ def makeCigarMask(cigar,increment=1): return cigarMask def makePysamCigarMask(cigarTuple,increment=1): + """Build a per-base mask vector from a pysam CIGAR tuple. 
+ + Equivalent to ``makeCigarMask`` but accepts the pysam representation + of a CIGAR string (a list of (operation_code, length) integer pairs) + rather than a text CIGAR string. 'M' operations contribute + ``increment``; 'N' operations contribute 0; other operations that do + not consume reference bases are omitted. + + Args: + cigarTuple: A sequence of (operation, length) pairs as returned by + pysam's ``AlignedSegment.cigar`` attribute. Operation codes + follow the SAM spec order: 0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P. + increment: Value assigned to each matched ('M') reference base. + Defaults to 1. + + Returns: + A list of numeric values (each 0 or ``increment``) with one entry + per reference base consumed by the alignment. + """ lookupTable = ['M','I','D','N','S','H','P'] incrementTable = { 'M':increment, @@ -454,6 +801,25 @@ def makePysamCigarMask(cigarTuple,increment=1): return cigarMask def bamFetchGappedInterval(bamHandle,chr,start,end,span=1): + """Compute strand-specific read-depth arrays across an interval, respecting CIGAR gaps. + + Unlike ``bamFetchInterval``, this function uses each read's CIGAR + information (via ``makePysamCigarMask``) so that intronic regions ('N' + operations) do not contribute to the depth. Fragment-length extension + is not yet implemented (TODO). + + Args: + bamHandle: An open pysam AlignmentFile object. + chr: Reference sequence name / chromosome to query. + start: 0-based start of the interval. + end: 0-based end of the interval (inclusive). + span: Step size for down-sampling the output arrays. Defaults to 1. + + Returns: + A tuple (senseArr, antisenseArr) where each element is a numpy + array of length ``(end - start + 1) / span`` containing per-base + read depth for the respective strand across the interval. 
+ """ #TODO incoporate fragment size into reads (see above), default 200nt intervalSize = end-start+1 senseArr = numpy.zeros(intervalSize) @@ -491,8 +857,33 @@ def bamFetchGappedInterval(bamHandle,chr,start,end,span=1): return senseArr[::span],antisenseArr[::span] def findLargestKmer(bamHandle,chr,start,end,strand,k=21,gapped=False,span=1): - """Fetches read density across an interval and finds the start and end position (start and end offset by an index) - of the kmer with the largest value. Has not been tested yet""" + """Find the k-mer window with the highest total read depth within an interval. + + Computes per-base read depth across the interval (using either the + simple or gapped pileup function) and slides a window of size ``k`` + across the appropriate strand array to locate the window whose summed + depth is largest. + + Note: This function has not been tested yet. + + Args: + bamHandle: An open pysam AlignmentFile object. + chr: Reference sequence name / chromosome to query. + start: 0-based start of the interval. + end: 0-based end of the interval (inclusive). + strand: Which strand array to search; '+' uses the sense array, + '-' uses the antisense array. + k: Window size in bases. Defaults to 21. + gapped: If True, uses ``bamFetchGappedInterval`` (CIGAR-aware + pileup); otherwise uses ``bamFetchInterval``. Defaults to False. + span: Down-sampling step passed to the pileup function. + Defaults to 1. + + Returns: + A tuple (window_start, window_end) giving the genomic coordinates + of the highest-scoring k-mer window. Both values are offset from + ``start`` by the index of the best window. 
+ """ if not gapped: sense,antisense = bamFetchInterval(bamHandle,chr,start,end,span=span) else: @@ -513,6 +904,28 @@ def findLargestKmer(bamHandle,chr,start,end,strand,k=21,gapped=False,span=1): return start+maxPos,end+maxPos def plotInterval(bamFiles,chr,start,end,name="",span=1,pdfName = "",sumStrands=False): + """Plot read depth across a genomic interval for one or more BAM files. + + Uses rpy2 to create a multi-panel line plot, one panel per BAM file. + Forward-strand depth is shown in blue (positive y-axis) and reverse- + strand depth in red (negative y-axis) unless ``sumStrands`` is True, + in which case a single combined black trace is drawn. Optionally saves + the plot to a PDF. + + Args: + bamFiles: A list of paths to BAM files to plot (one panel each). + chr: Reference sequence name / chromosome to display. + start: 0-based start of the display window. + end: 0-based end of the display window (inclusive). + name: Optional label appended to each panel title. Defaults to ''. + span: Down-sampling step passed to the pileup function. + Defaults to 1. + pdfName: If non-empty, the plot is written to this PDF path; otherwise + an interactive R window is opened. Defaults to ''. + sumStrands: If False (default), sense and antisense tracks are + plotted separately with opposite sign. If True, strand depths + are summed into a single positive trace. + """ nplots = len(bamFiles) #Setup plot environment @@ -544,6 +957,19 @@ def plotInterval(bamFiles,chr,start,end,name="",span=1,pdfName = "",sumStrands=F robjects.r['dev.off']() def bamStats(bamFile): + """Compute per-chromosome read counts for a BAM file. + + Iterates over every read in the BAM file (including unmapped reads) and + tallies how many reads map to each reference sequence. + + Args: + bamFile: Path to the BAM file. + + Returns: + A dict with a single key ``'readDist'`` whose value is itself a + dict mapping reference sequence index (``rname``) to the number of + reads mapping to that reference. 
+ """ rtrn ={} #Fetch total reads in Bam by chromosome samfile = pysam.AlignmentFile(bamFile,'rb') @@ -554,7 +980,20 @@ def bamStats(bamFile): return rtrn def getrRNAReads(bamFile,rRNABedFile): - """Takes a bed file of rRNA genes and queries the bam file to determine the number of unique reads that are mapping to rRNA genes in a given sample""" + """Count unique reads that map to rRNA gene loci. + + Parses a BED file of rRNA gene coordinates and queries the BAM file for + each locus, collecting all overlapping read names. Duplicate read names + are collapsed before returning the final count. + + Args: + bamFile: Path to the sorted, indexed BAM file to query. + rRNABedFile: Path to a BED file listing rRNA gene intervals. + + Returns: + The number of unique read names (query names) that overlap at least + one rRNA gene locus. + """ reads = [] bedIter = intervallib.parseBed(rRNABedFile) samfile = pysam.AlignmentFile(bamFile,'rb') @@ -567,6 +1006,15 @@ def getrRNAReads(bamFile,rRNABedFile): return len(uniqify(reads)) def uniqify(seq): + """Return the unique elements of a sequence (order not preserved). + + Args: + seq: Any iterable of hashable elements. + + Returns: + A dict_keys view containing one entry per unique element found in + ``seq``. The original order is not preserved. + """ # Not order preserving keys = {} for e in seq: @@ -574,7 +1022,25 @@ def uniqify(seq): return keys.keys() def collapseMatrix(fname): - """Specifically finds a vector of sums for a chromatin matrix by position""" + """Sum a tab-delimited chromatin matrix column-wise across all samples. + + Reads a matrix file whose first row is a header and whose subsequent + rows each begin with two identifier fields (sample and name) followed + by numeric values. Returns the element-wise sum of all data rows and + the list of row names. + + Args: + fname: Path to a tab-delimited matrix file. 
Expected format: the + first line is a header whose columns (after the leading + identifier columns) name the positions. Each subsequent line + starts with a sample identifier and a row name, followed by + numeric values. + + Returns: + A tuple (names, sums) where ``names`` is a list of row-name strings + (second column of each data row) and ``sums`` is a numpy array of + the column-wise sums across all data rows. + """ handle = open(fname,'r') header = handle.readline().rstrip() header = header.split("\t")[1:] diff --git a/src/seqlib/plotting.py b/src/seqlib/plotting.py index 89196d1..bf5e6b4 100644 --- a/src/seqlib/plotting.py +++ b/src/seqlib/plotting.py @@ -1,20 +1,34 @@ -''' -Created on Jul 13, 2010 +"""Plotting utilities for genomic and epigenomic data visualisation. -@author: lgoff -''' +Provides helper functions for generating publication-quality plots of +chromatin mark occupancy and other aggregate genomic features using R via +Rscript. +""" import os def chromatinAggPlots(basename): - """ - Makes chromatin aggregate plots - - requires: - basename.vec - basename.row - basename.col - + """Generates chromatin aggregate plots as a multi-panel PDF using R. + + Writes an R script that reads three data files produced by an upstream + pipeline step, then calls Rscript to execute it and produce a PDF of + aggregate chromatin mark occupancy profiles centred on smRNA predictions. + + Required input files (all derived from basename): + - basename.vec: Tab-delimited matrix of signal values. + - basename.row: Tab-delimited BED-like annotation of rows. + - basename.col: Tab-delimited column name file. + + Output: + - basename.pdf: Multi-panel PDF with one line plot per chromatin mark. + - basename.q: The R script used to generate the plot (retained). + + Args: + basename: Base path/name shared by all input files and used for the + output PDF and R script. + + Returns: + The return code of the Rscript invocation (0 on success). 
""" myScript = """ colNames<-read.table("%s.col",colClasses="character",header=F,sep="\\t") diff --git a/src/seqlib/primer3lib.py b/src/seqlib/primer3lib.py index 48383f1..1a9d9af 100644 --- a/src/seqlib/primer3lib.py +++ b/src/seqlib/primer3lib.py @@ -1,12 +1,11 @@ -''' -Created on Sep 9, 2010 +"""Primer3 output parsing and primer design helpers. -Handles primer3 running and parsing output +Provides data classes (Record, Primer) for representing primer3 output and +a generator function for parsing primer3 Boulder-IO output files. Also +includes a convenience wrapper for running primer3_core directly from Python. -primer3 >= v2.2 - -@author: lgoff -''' +Requires primer3 >= v2.2. +""" import subprocess import sys @@ -14,20 +13,19 @@ class Record(object): - ''' - Represent information from a primer3 run finding primers. - - Members: - - sequenceID = value of SEQUENCE_ID field from primer3 record - - sequence = value of SEQUENCE_TEMPLATE field - - primers = list of Primer objects describing primer pairs for this target sequence. - - comments = the comment line(s) for the record - - attributes = other global parameters relevant to the record as a whole and not just a primer - ''' + """Represents the primer3 output for a single target sequence. + + Attributes: + sequenceID: Value of the SEQUENCE_ID field from the primer3 record. + sequence: Value of the SEQUENCE_TEMPLATE field. + comments: Comment line(s) associated with the record. + primers: List of Primer objects describing primer pairs designed for + this target sequence. + attributes: Dictionary of other global parameters in the primer3 + record that are not specific to an individual primer pair. 
+ """ def __init__(self): - ''' - Constructor - ''' + """Initialises a Record with empty/default attribute values.""" self.sequenceID = "" self.sequence = "" self.comments = "" @@ -35,19 +33,37 @@ def __init__(self): self.attributes = {} def __iter__(self): + """Iterates over the Primer objects in this record.""" return iter(self.primers) def __repr__(self): + """Returns a short string representation of the record.""" return "%s: %d primer pair(s)" % (self.sequenceID,len(self.primers)) class Primer(object): - ''' - A primer set designed by Primer3 - ''' + """Represents a single primer pair designed by Primer3. + + Attributes: + sequenceID: ID of the target sequence for which this primer was + designed (matches the parent Record's sequenceID). + number: 1-based rank of this primer pair within the record. + size: Deprecated field; use product_size instead. + forward_seq: Sequence of the forward (left) primer. + forward_start: 0-based start position of the forward primer on the + template. + forward_length: Length of the forward primer in bases. + forward_tm: Melting temperature of the forward primer in °C. + forward_gc: GC content of the forward primer as a percentage. + reverse_seq: Sequence of the reverse (right) primer. + reverse_start: 0-based start position of the reverse primer on the + template. + reverse_length: Length of the reverse primer in bases. + reverse_tm: Melting temperature of the reverse primer in °C. + reverse_gc: GC content of the reverse primer as a percentage. + product_size: Expected PCR product size in base pairs. 
+ """ def __init__(self): - ''' - Constructor - ''' + """Initialises a Primer with zero/empty attribute values.""" self.sequenceID="" self.number = 0 self.size = 0 @@ -64,9 +80,30 @@ def __init__(self): self.product_size = 0 def __repr__(self): + """Returns a short string representation showing the sequence ID, number, and primer sequences.""" return "%s_%d\n\tFwd: %s\tRev: %s" % (self.sequenceID,self.number,self.forward_seq, self.reverse_seq) def parse(handle): + """Parses a primer3 Boulder-IO output file and yields Record objects. + + Reads lines from the file handle, accumulates them until a '=' record + separator is encountered, then constructs a Record with its associated + Primer objects and yields it. + + Args: + handle: A readable file-like object containing primer3 output in + Boulder-IO format (each record terminated by a line containing + only '='). + + Yields: + Record objects, one per primer3 sequence entry. Each Record contains + a list of Primer objects corresponding to the primer pairs returned + by primer3 for that sequence. + + Raises: + StopIteration: When the end of the file is reached. + KeyError: If a required primer3 output field is missing from a record. + """ recordLines = [] while True: line = handle.readline().rstrip() @@ -109,7 +146,25 @@ def parse(handle): #Context specific runs ####### def runPrimer3(fastaFile,task="qpcr",p3CloneSetFile="/seq/compbio-hp/lgoff/lincRNAs/primer_design/P3_cloning_primer_settings.p3",p3PCRSetFile="/seq/compbio-hp/lgoff/lincRNAs/primer_design/P3_qPCR_primer_settings.p3"): - """Task can be either 'qpcr' or 'cloning'""" + """Runs primer3_core on a FASTA file to design qPCR or cloning primers. + + Converts the FASTA file to Boulder-IO format and launches a primer3_core + subprocess with the appropriate settings file. The output file path is + returned; note that the subprocess is not waited on before returning. + + Args: + fastaFile: Path to a FASTA file of sequences to design primers for. 
+ task: Either 'qpcr' (default) to design short amplicon primers, or + 'cloning' to design full-length amplification primers using a + defined included region. + p3CloneSetFile: Path to the primer3 settings file used for cloning + primer design. + p3PCRSetFile: Path to the primer3 settings file used for qPCR primer + design. + + Returns: + Path to the primer3 output file (baseName + '.p3out'). + """ baseName = fastaFile.rstrip(".fa") iter = sequencelib.FastaIterator(open(fastaFile,'r')) diff --git a/src/seqlib/prob.py b/src/seqlib/prob.py index 72d808a..e7969a4 100644 --- a/src/seqlib/prob.py +++ b/src/seqlib/prob.py @@ -1,4 +1,11 @@ #!/usr/bin/env python +"""Probability and statistics tools for DNA sequence analysis. + +Provides signal-to-noise ratio, Z-score, binning, cumulative sums, +nucleotide frequency utilities, Gaussian evaluation, moving averages, +Poisson and binomial probability functions, combinatorics, and +dictionary utility functions used throughout seqlib. +""" import math import operator import random @@ -12,21 +19,61 @@ #Probability Tools for DNA sequence analysis ####### def snr(observed,expected): + """Compute the signal-to-noise ratio (SNR) of an observed count vs an expected count. + + Calculates the simple ratio:: + + SNR = observed / expected + + Args: + observed: The observed count or value (numeric). + expected: The expected count or value (numeric, must be non-zero). + + Returns: + The ratio observed / expected as a float. + """ return observed/expected def zscore(observed,expected): + """Compute the Z-score of an observed count under a Poisson null model. + + Assumes the standard deviation equals the square root of the + expected count (Poisson approximation):: + + Z = (observed - expected) / sqrt(expected) + + Args: + observed: The observed count or value (numeric). + expected: The expected count or value (numeric, must be positive). + + Returns: + The Z-score as a float. 
+ """ return (observed-expected)/math.sqrt(expected) def which_bin(bins, x, safe=0): - """ - # if we're interested in binning x with boundaries - # 0, 5, 10, 15 - # then it will return which boundary it belongs in. - # if x<0: -1 - # if 0<=x<5: 0 - # if 5<=x<10: 1 - # if 10<=x<15: 2 - # if x>=15: 3 + """Determine which bin interval a value ``x`` falls into. + + Given sorted bin boundary values, returns the 0-based index of the + interval that contains ``x``. For example, with boundaries + ``[0, 5, 10, 15]``:: + + x < 0 -> -1 + 0 <= x < 5 -> 0 + 5 <= x < 10 -> 1 + 10 <= x < 15-> 2 + x >= 15 -> 3 (or len(bins) when safe=0) + + Args: + bins: A sorted list of numeric bin boundary values. + x: The value to bin. + safe: If ``1`` and ``x`` exactly equals ``bins[-1]``, returns + ``len(bins)`` instead of the usual out-of-range value. + Defaults to 0. + + Returns: + An integer bin index. Returns ``-1`` if ``x < bins[0]``, or + ``len(bins)`` if ``x >= bins[-1]`` (unless ``safe=1`` applies). """ if x gaussianN(2,3) evaluated at 4 + """Create a Gaussian PDF function with fixed mean and standard deviation. + + Returns a callable that evaluates the Gaussian PDF at any point + ``x``, with ``mu`` and ``sigma`` captured by closure. + + Example:: + + N2_3 = make_gaussian(2, 3) + N2_3(4) # -> gaussian(4, mu=2, sigma=3) + + Args: + mu: The mean of the Gaussian. + sigma: The standard deviation of the Gaussian. + + Returns: + A function ``f(x)`` that evaluates the Gaussian N(mu, sigma) at + ``x``. """ return lambda x,mu=mu,sigma=sigma: ( (1.0/math.sqrt(2*math.pi*sigma)) * (math.e**(-((x-mu)**2)/(2*sigma**2)))) def make_adder(n): - """ - usage: - Add2=make_adder(2) - Add2(3) -> 5 + """Create an adder function that adds a fixed value ``n`` to its argument. + + Returns a callable that adds ``n`` (captured by closure) to any + input ``x``. + + Example:: + + Add2 = make_adder(2) + Add2(3) # -> 5 + + Args: + n: The fixed value to add. + + Returns: + A function ``f(x)`` that returns ``x + n``. 
""" return lambda x,n=n: x+n @@ -107,6 +257,17 @@ def make_adder(n): loge_2 = math.log(2) def avg(l,precise=0): + """Compute the arithmetic mean of a list of numbers. + + Args: + l: A list of numeric values. + precise: If non-zero, divide by ``float(len(l))`` for a + floating-point result. If 0 (default), divide by + ``len(l)`` using integer or floor division. + + Returns: + The mean of ``l`` as a number, or 0 if ``l`` is empty. + """ if not l: return 0 if precise: return reduce(operator.add,l,0)/float(len(l)) @@ -114,28 +275,78 @@ def avg(l,precise=0): return reduce(operator.add,l,0)/len(l) def movavg(s, n): - ''' returns an n period moving average for the time series s + """Compute an n-period moving average for a time series. + + Uses cumulative sums for an O(len(s)) implementation:: - s is a list ordered from oldest (index 0) to most recent (index -1) - n is an integer + MA[i] = mean(s[i-n+1 : i+1]) - returns a numeric array of the moving average - ''' + The result has length ``len(s) - n + 1``. + + Args: + s: A list or array of numeric values ordered from oldest + (index 0) to most recent (index -1). + n: The window size (number of periods) for the moving average. + + Returns: + A NumPy array of the moving average values. The array has + ``len(s) - n + 1`` elements. + """ s = np.array(s) c = np.cumsum(s) return (c[n-1:] - c[:-n+1]) / float(n) def median(l): + """Compute the median of a list of numbers. + + Sorts ``l`` and returns the middle value for odd-length lists or the + average of the two middle values for even-length lists. + + Args: + l: A list of numeric values. + + Returns: + The median value, or ``None`` if ``l`` is empty. + """ if not l: return None l = sorted(l) if len(l)%2: return sorted(l)[len(l)//2] else: return (l[len(l)//2]+l[len(l)//2-1])/2.0 def stdev(l, failfast=1): + """Compute the sample standard deviation of a list of numbers. + + Returns the square root of the sample variance computed by + :func:`variance`. 
+ + Args: + l: A list of numeric values with at least 2 elements. + failfast: Passed directly to :func:`variance`. If non-zero + (default), raises an error when fewer than 2 samples are + provided. + + Returns: + The sample standard deviation as a float. + """ return math.sqrt(variance(l,failfast=failfast)) def variance(l,failfast=1): + """Compute the sample variance of a list of numbers. + + Uses Bessel's correction (divides by ``n - 1``):: + + s^2 = sum((x - mean)^2) / (n - 1) + + Args: + l: A list of numeric values. + failfast: If non-zero (default), raises a string exception when + fewer than 2 samples are provided. If 0, returns 0 instead. + + Returns: + The sample variance as a float, or 0 when ``failfast=0`` and + the list has fewer than 2 elements. + """ if (not l) or len(l)==1: if failfast: raise "tools.variance: Not enough samples. Need >= 2, got %s"%len(l) else: return 0#'N/A' @@ -146,14 +357,51 @@ def variance(l,failfast=1): return s / (len(l)-1) def log2(x): + """Compute the base-2 logarithm of ``x``. + + Uses the change-of-base formula:: + + log2(x) = ln(x) / ln(2) + + Args: + x: A positive real number. + + Returns: + The base-2 logarithm of ``x`` as a float. + """ #converting bases: log_a(b) = log_c(b)/log_c(a) #i.e. log_2(x) = log_e(2)/log_e(x) = log_10(2)/log_10(x) return math.log(x)/float(loge_2) def log_k(x,k): + """Compute the base-``k`` logarithm of ``x``. + + Uses the change-of-base formula:: + + log_k(x) = ln(x) / ln(k) + + Args: + x: A positive real number. + k: The base of the logarithm (positive real number != 1). + + Returns: + The base-``k`` logarithm of ``x`` as a float. + """ return math.log(x)/math.log(k) def prob2score(prob): + """Convert a probability to a Phred-like quality score. + + Computes ``-10 * log10(prob)``, so a probability of 1/100 maps to + a score of 20 (the standard Phred-score convention). + + Args: + prob: A probability value (float in (0, 1]). + + Returns: + A float quality score equal to ``-10 * log10(prob)``. 
Returns + -1 if any exception is raised (e.g. ``prob=0``). + """ #1/100 -> 20 try: return -10*float(math.log10(float(prob))) @@ -161,10 +409,32 @@ def prob2score(prob): return -1 def p2bits(p): - """Takes p-value and returns negative log2""" + """Convert a p-value to bits of evidence (negative log base-2). + + Computes ``-log2(p)``, which quantifies the evidence against the + null hypothesis in bits. + + Args: + p: A p-value (float in (0, 1]). + + Returns: + A float equal to ``-log2(p)``. Higher values indicate stronger + evidence against the null. + """ return -log2(p) def factorial(n): + """Compute n! (n factorial) iteratively. + + Multiplies all integers from ``n`` down to 1. + + Args: + n: A non-negative integer. + + Returns: + An integer equal to ``n * (n-1) * ... * 2 * 1``. Returns 1 + when ``n`` is 0 or 1. + """ result = 1 for i in range(n,0,-1): #print i @@ -175,18 +445,63 @@ def factorial(n): #Poisson ########### def poisson_expected(rate): + """Print a table of Poisson probabilities for counts 1 to 49. + + For each integer ``x`` from 1 to 49, prints the Poisson probability + ``P(X = x; rate)`` and the expected count in a population of 12 million:: + + x P(X=x) 12000000 * P(X=x) + + Args: + rate: The Poisson rate parameter (expected number of events). + """ for x in range(1,50,1): p = poisson(rate,x) print(f"{x}\t{p}\t{12000000*p}") def poisson(rate, x): - """Returns the probability of observing a count of x""" + """Compute the Poisson probability of observing exactly ``x`` events. + + Evaluates the Poisson PMF:: + + P(X = x; rate) = exp(-rate) * rate^x / x! + + Args: + rate: The expected number of events (lambda, must be non-negative). + x: The observed count (non-negative integer). + + Returns: + The probability P(X = x) as a float. + """ return math.exp(-rate)*(rate**x)/factorial(x) ###################### #Binomial Distribution ####################### def binomial_likelihood_ratio(ps,k,n): + """Compute the likelihood ratio of two binomial hypotheses. 
+ + Given two probability parameters ``ps[0]`` (null hypothesis H0) and + ``ps[1]`` (alternative hypothesis H1), computes:: + + LR = log(P(k | p=ps[1], n)) / P(k | p=ps[0], n) + + Note: + The formula mixes log and linear likelihoods and is not the + standard log-likelihood ratio test; see :func:`binomial_log_likelihood_ratio` + for the standard implementation. + + Args: + ps: A 2-element list ``[p0, p1]`` where ``p0`` is the null + probability and ``p1`` is the alternative probability. + k: The observed number of successes. + n: The total number of trials. + + Returns: + A float representing the likelihood ratio. Returns + ``sys.maxsize`` with a warning message if the null hypothesis + likelihood is 0. + """ # p[0] is the null hypothesis # p[1] is the hypothesis being tested assert(len(ps)==2) @@ -202,20 +517,83 @@ def binomial_likelihood_ratio(ps,k,n): return sys.maxsize def binomial_log_likelihood_ratio(ps,k,n): + """Compute the log-likelihood ratio of two binomial hypotheses. + + Calculates:: + + LLR = log P(k | p=ps[1], n) - log P(k | p=ps[0], n) + + where each log probability is computed by :func:`log_binomial`. + A positive LLR supports the alternative hypothesis ``ps[1]`` over + the null ``ps[0]``. + + Args: + ps: A 2-element list ``[p0, p1]`` where ``p0`` is the null + success probability and ``p1`` is the alternative. + k: The observed number of successes. + n: The total number of trials. + + Returns: + The log-likelihood ratio as a float. + """ return log_binomial(ps[1],k,n) - log_binomial(ps[0],k,n) def log_binomial(p,k,n): + """Compute the log probability of the binomial PMF. + + Returns the natural log of P(X = k) for X ~ Binomial(n, p):: + + log P(k; n, p) = log C(n, k) + k*log(p) + (n-k)*log(1-p) + + Args: + p: The probability of success per trial (float in (0, 1)). + k: The number of successes (non-negative integer). + n: The number of trials (integer >= k). + + Returns: + The natural log of the binomial PMF as a float. 
+ """ # the log probability of seeing exactly k successes in n trials # given the probability of success is p return log_n_choose_k(n,k)+math.log(p)*k+math.log(1-p)*(n-k) def binomial(p,k,n): + """Compute the binomial probability P(X = k; n, p). + + Calculates the probability of observing exactly ``k`` successes in + ``n`` independent Bernoulli trials each with success probability + ``p``:: + + P(X = k) = C(n, k) * p^k * (1-p)^(n-k) + + Args: + p: The probability of success per trial (float in [0, 1]). + k: The number of successes (non-negative integer). + n: The number of trials (integer >= k). + + Returns: + The binomial probability as a float. + """ # probability of seeing exactly k successes in n trials, given # the probability of success is p #return n_choose_k(n,k)*(p**k)*((1-p)**(n-k)) return n_choose_k(n,k)*(p**k)*((1-p)**(n-k)) def cumBinomial(p,k,n): + """Compute the cumulative binomial probability P(X <= k; n, p). + + Sums the binomial PMF from 0 to ``k`` inclusive:: + + P(X <= k) = sum_{j=0}^{k} C(n, j) * p^j * (1-p)^(n-j) + + Args: + p: The probability of success per trial (float in [0, 1]). + k: The upper bound on the number of successes (non-negative int). + n: The number of trials (integer >= k). + + Returns: + The cumulative binomial probability P(X <= k) as a float. + """ #Returns the cumulative probability from the binomaial distribution Pval = 0.0 for j in range(0,k+1): @@ -223,6 +601,25 @@ def cumBinomial(p,k,n): return Pval def n_choose_k(n,k): + """Compute the binomial coefficient C(n, k) = n! / (k! * (n-k)!). + + Uses the multiplicative recurrence:: + + C(n, k) = (n * (n-1) * ... * (n-k+1)) / (k * (k-1) * ... * 1) + + Exploits the symmetry ``C(n, k) = C(n, n-k)`` to choose the smaller + of ``k`` and ``n-k`` for efficiency. + + Args: + n: Total number of items (non-negative integer). + k: Number of items to choose (non-negative integer, ``k <= n``). + + Returns: + The binomial coefficient C(n, k) as a float. 
+ + Raises: + AssertionError: If ``k > n``. + """ # (n k) = n! / (k! (n-k)!) # # n*(n-1)*(n-2)*....*(n-k+1) @@ -244,6 +641,25 @@ def n_choose_k(n,k): return result def log_n_choose_k(n,k): + """Compute log(C(n, k)) in log space to avoid integer overflow. + + Evaluates the natural logarithm of the binomial coefficient using + the additive log form of the multiplicative recurrence:: + + log C(n, k) = sum(log(n-i+1) - log(i) for i in 1..k') + + where ``k' = min(k, n-k)``. + + Args: + n: Total number of items (non-negative integer). + k: Number of items to choose (non-negative integer, ``k <= n``). + + Returns: + The natural log of C(n, k) as a float. + + Raises: + AssertionError: If ``k > n``. + """ # (n k) = n! / (k! (n-k)!) # # n*(n-1)*(n-2)*....*(n-k+1) @@ -263,6 +679,25 @@ def log_n_choose_k(n,k): #Dictionary Tools ################# def cget(diclist, key, strict=1): + """Extract the same key from every item in a list of dicts (or sequences). + + Also known as "cross-get" or "gather". Iterates over ``diclist`` + and collects ``item[key]`` for each element. + + Args: + diclist: A list of dictionaries or index-accessible objects that + all share the specified ``key``. + key: The key (or integer index) to look up in each element. + strict: If non-zero (default), every element must contain + ``key``; raises ``KeyError`` or ``IndexError`` otherwise. + If 0, silently skips elements that are falsy or do not + contain ``key`` (using ``generic_has_key``). + + Returns: + A list of values ``item[key]`` for each item in ``diclist``. + When ``strict=1`` the returned list has the same length as + ``diclist``. When ``strict=0`` the length may be shorter. 
+ """ # cross_get was: gather(diclist,key) # gathers the same key from a list of dictionaries # can also be used in lists diff --git a/src/seqlib/pygrlib.py b/src/seqlib/pygrlib.py index 9f5b1e7..f980c61 100644 --- a/src/seqlib/pygrlib.py +++ b/src/seqlib/pygrlib.py @@ -18,7 +18,27 @@ ###Classes class MySliceInfo(object): + """Stores coordinate information for a genomic slice in pygr convention. + + Holds the four fields required to identify a sequence slice: sequence ID, + start, stop (exclusive), and orientation (+1 or -1). + + Attributes: + id: Sequence (chromosome) identifier. + start: 0-based start coordinate. + stop: Exclusive end coordinate. + orientation: Strand orientation; +1 for forward, -1 for reverse. + """ def __init__(self, seq_id, start, stop, orientation): + """Initialises a MySliceInfo. + + Args: + seq_id: Sequence (chromosome) identifier. + start: 0-based start coordinate of the slice. + stop: Exclusive end coordinate of the slice. + orientation: Strand orientation; +1 for forward strand, -1 for + reverse strand (pygr convention). + """ (self.id, self.start, self.stop, self.orientation) = \ (seq_id, start, stop, orientation) @@ -26,7 +46,30 @@ def __init__(self, seq_id, start, stop, orientation): ###GFF Futzing around class GFF3Row(object): + """Represents a single data row from a GFF3 annotation file. + + Parses one GFF3 line and stores the type, sequence ID, start/stop + coordinates (converted to 0-based pygr convention), strand orientation, + and all key=value attributes from column 9. + + Attributes: + type: Feature type string from column 3 (e.g. 'gene', 'exon'). + id: Sequence (chromosome) ID from column 1. + start: 0-based start coordinate (GFF3 1-based column 4 minus 1). + stop: Exclusive end coordinate (GFF3 column 5). + orientation: +1 for '+' strand, -1 for '-' strand. + Additional attributes are set dynamically from column 9 key=value + pairs; multi-value attributes (comma-separated) are stored as lists. 
+ """ def __init__(self, line): + """Parses a GFF3 line into a GFF3Row object. + + Args: + line: A single tab-delimited GFF3 data line (not a comment). + + Raises: + ValueError: If the strand character in column 7 is not '+' or '-'. + """ cols = line.split('\t') self.type = cols[2] self.id = cols[0] # sequence ID @@ -47,6 +90,25 @@ def __init__(self, line): def read_gff3(filename, genome): + """Reads a GFF3 annotation file and builds pygr AnnotationDB objects. + + Parses a GFF3 file, groups features by type, and creates one pygr + AnnotationDB per feature type, each associated with the provided genome + sequence database. Comment lines (starting with '#') are skipped. + Features lacking a type or gene_id attribute are also skipped. + + Args: + filename: Path to a GFF3-format annotation file. + genome: A pygr sequence database object (e.g. a worldbase genome) + used to associate annotation slices with genomic sequence. + + Returns: + A dictionary mapping feature type strings to pygr AnnotationDB + objects. + + Raises: + ImportError: If the pygr library is not installed. + """ if not _PYGR_AVAILABLE: raise ImportError("pygr is required for read_gff3 but is not installed.") d = {} # for different types of sliceDBs diff --git a/src/seqlib/seqData.py b/src/seqlib/seqData.py index 23f970b..5258693 100644 --- a/src/seqlib/seqData.py +++ b/src/seqlib/seqData.py @@ -1,9 +1,17 @@ #!/usr/bin/env python -''' -Created on Oct 27, 2009 +"""Data structures and utilities for working with BAM/SAM sequencing data. -@author: lgoff -''' +Provides SamData and ChromData classes wrapping pysam for read access to BAM +files, a plotRegions function for strand-aware coverage visualisation via rpy, +and helper utilities for parsing SAM bitflags and converting reads to Interval +objects. + +Note: This module depends on pysam and rpy, which must be installed separately. + +Originally created on Oct 27, 2009. 
+ +Author: lgoff +""" import intervallib import pysam @@ -11,7 +19,26 @@ class SamData: + """Wrapper around a pysam BAM file handle. + + Provides basic access to a sorted, indexed BAM file including pileup + queries and a pysam Samfile handle. + + Attributes: + name: Sample name string. + file: Path to the BAM file. + description: Human-readable description string. + type: Data type label (default "basic"). + handle: Open pysam.Samfile handle. + """ def __init__(self,name,file,description): + """Initialize and open a SamData object. + + Args: + name: Sample name string. + file: Path to the BAM file. + description: Human-readable description of the sample. + """ self.name = name self.file = file self.description = description @@ -19,22 +46,37 @@ def __init__(self,name,file,description): self.open() def __str__(self): + """Return the sample name string.""" return self.name def open(self): - """Returns a pysam handle to the .BAM file""" + """Open the BAM file and store the pysam handle in self.handle.""" self.handle = pysam.Samfile(self.file,'rb') def close(self): + """Close the pysam BAM file handle.""" self.handle.close() def samSort(self): + """Placeholder for BAM sorting (not yet implemented).""" pass def samIndex(self): + """Placeholder for BAM indexing (not yet implemented).""" pass def pileupQuery(self,chr,start='',end=''): + """Return per-position pileup depths for a genomic region. + + Args: + chr: Chromosome name string. + start: Start coordinate (default "" for beginning of chromosome). + end: End coordinate (default "" for end of chromosome). + + Returns: + A tuple (pos, n) where pos is a list of genomic positions and + n is a list of corresponding pileup depths. + """ pos = [] n = [] for pileupcolumn in self.handle.pileup(chr,start,end): @@ -43,7 +85,25 @@ def pileupQuery(self,chr,start='',end=''): return (pos,n) class ChromData(SamData): + """SamData subclass for chromatin modification ChIP-seq BAM files. 
+ + Extends SamData with mark and cell-line metadata. + + Attributes: + mark: Histone mark or chromatin feature name (e.g. "H3K4me3"). + cellLine: Cell line identifier string. + type: Data type label (always "chromatin"). + """ def __init__(self,name,file,description,mark,cellLine): + """Initialize a ChromData object. + + Args: + name: Sample name string. + file: Path to the BAM file. + description: Human-readable description. + mark: Histone mark or antibody target name. + cellLine: Cell line identifier string. + """ SamData.__init__(self, name=name, file=file, description=description) self.mark = mark self.cellLine = cellLine @@ -74,6 +134,18 @@ def __init__(self,name,file,description,mark,cellLine): } def openBams(dataDict,cellLine): + """Open a collection of BAM files described by a dictionary. + + Creates ChromData objects for each entry in dataDict, opens each BAM + file handle, and returns the list. + + Args: + dataDict: Dict mapping mark name to BAM file path. + cellLine: Cell line identifier assigned to all ChromData objects. + + Returns: + List of opened ChromData objects. + """ files = [] for k,v in dataDict.items(): sample = v.split("_")[0] @@ -99,7 +171,18 @@ def plotRegions(bamHandle,chrom,start,end): """ def plotRegions(bamHandle,chrom,start,end): - """Incorporates strandedness and possibly an extension factor to account for fragment size""" + """Plot strand-aware read coverage for a genomic region using rpy. + + Counts per-position forward ("+") and reverse ("-") read coverage using + pysam fetch, then draws a coverage plot via rpy with forward reads in blue + above the axis and reverse reads in red below. + + Args: + bamHandle: An open pysam Samfile or AlignmentFile handle. + chrom: Chromosome name string. + start: Start coordinate (integer). + end: End coordinate (integer). 
+ """ tmp = {} tmp["+"] = {} tmp["-"] = {} @@ -119,8 +202,18 @@ def plotRegions(bamHandle,chrom,start,end): def plotChromProfile(bamFiles,chrom,start,end): - """Not terribly flexible at this point, but will plot 'tracks' from a given chrom,start,end - position from a list of opened .BAM files""" + """Plot stacked pileup-depth tracks for multiple BAM files via rpy. + + Opens a new rpy graphics device and plots one coverage track per BAM + file in a vertically stacked layout. Not very flexible at this point. + + Args: + bamFiles: List of opened SamData (or similar) objects with a + .handle attribute supporting pileup() and a .name attribute. + chrom: Chromosome name string. + start: Start coordinate (integer). + end: End coordinate (integer). + """ r.x11(width=6,height=10) r.par(mfrow=[len(bamFiles),1]) @@ -136,10 +229,17 @@ def plotChromProfile(bamFiles,chrom,start,end): #Functions for sam Reads ############### def getBitValue(n, p): - ''' - get the bitvalue of denary (base 10) number n at the equivalent binary - position p (binary count starts at position 0 from the right) - ''' + """Return the bit value of integer n at binary position p. + + Binary position 0 is the least significant bit (rightmost). + + Args: + n: Denary (base-10) integer. + p: Bit position to inspect (0-indexed from the right). + + Returns: + 0 or 1 depending on the bit at position p. + """ return (n >> p) & 1 def strandFlag(flag): @@ -153,10 +253,33 @@ def strandFlag(flag): return "*" def samRead2Interval(samRead): + """Convert a single pysam AlignedRead to an intervallib.Interval. + + The strand is determined from the SAM bitflag. Coordinates are converted + to 1-based by adding 1 to samRead.pos. + + Args: + samRead: A pysam AlignedRead object. + + Returns: + An intervallib.Interval with chr set to samRead.qname, 1-based + start/end coordinates, and strand derived from the bitflag. 
+ """ strand = strandFlag(int(samRead.flag)) return intervallib.Interval(samRead.qname,int(samRead.pos)+1,int(samRead.pos)+samRead.rlen+1,strand) def samReads2Intervals(samReads,start='start',end='end',score='readcount',sampleName=".",offset=0): - """samReads is an iterator object over a set of sam reads using the pysam 'fetch' call""" + """Convert a pysam fetch iterator of SAM reads to Interval objects. + + Note: This function is not yet implemented (passes without action). + + Args: + samReads: Iterator object over SAM reads from a pysam 'fetch' call. + start: Name of the start coordinate field (default "start"). + end: Name of the end coordinate field (default "end"). + score: Name of the score field (default "readcount"). + sampleName: Sample name string (default "."). + offset: Integer offset applied to coordinates (default 0). + """ pass diff --git a/src/seqlib/seqlib.py b/src/seqlib/seqlib.py index adaf53c..43d0e32 100644 --- a/src/seqlib/seqlib.py +++ b/src/seqlib/seqlib.py @@ -1,3 +1,13 @@ +"""Sequence data structures and molecular biology utilities. + +Provides SeqDict, a dictionary subclass for ordered molecular sequences, and +a variety of constants and functions for DNA/RNA/protein operations including +codon translation, reverse complementation, GC content calculation, and +Kimura sequence evolution simulation. + +Author: lgoff (derived from rasmus seqlib) +""" + import copy import math import random @@ -6,20 +16,30 @@ class SeqDict (dict): - """\ - A dictionary for molecular sequences. Also keeps track of their order, - useful for reading and writing sequences from fasta's. See fasta.FastaDict - for subclass that implements FASTA reading and writing. + """A dictionary for molecular sequences that also tracks insertion order. + + Useful for reading and writing sequences from FASTA files where order + matters. Keys are sequence names; values are sequence strings. See + fasta.FastaDict for a subclass that implements FASTA reading and writing. 
+ + Attributes: + names: List of sequence names in insertion order. """ def __init__(self): + """Initialize an empty SeqDict.""" dict.__init__(self) self.names = [] def orderNames(self, aln): - """Orders the names in the same order they appear in aln""" + """Reorder self.names to match the key order of another dict. + + Args: + aln: A dict (typically another SeqDict or alignment) whose key + order is used to sort self.names. + """ # Inlined util.list2lookup: creates a dict mapping list items to their index lookup = {v: i for i, v in enumerate(aln.keys())} @@ -28,6 +48,18 @@ def orderNames(self, aln): # add a key, value pair def add(self, key, value, errors=False): + """Add a key-value pair, keeping the longest value on duplicate keys. + + If the key already exists and the new value is at least as long as the + stored value, the stored value is replaced. The insertion order in + self.names is preserved (duplicate keys do not add to names). + + Args: + key: Sequence name string. + value: Sequence string. + errors: If True, write a warning to stderr on duplicate keys + (default False). + """ if key in self: if errors: # Inlined util.logger: write to stderr @@ -43,7 +75,17 @@ def add(self, key, value, errors=False): def get(self, keys, new=None): - """Return a subset of the sequences""" + """Return a new SeqDict containing only the given keys. + + Args: + keys: Iterable of key names to include. + new: Optional pre-existing SeqDict to populate. If None, a new + instance of the same type is created. + + Returns: + A SeqDict (or instance of the same subclass) containing the + requested keys that are present in self. 
+ """ if new == None: new = type(self)() @@ -66,57 +108,103 @@ def alignlen(self): # The following methods keep names in sync with dictionary keys def __setitem__(self, key, value): + """Set a key-value pair and add key to self.names if new.""" if key not in self: self.names.append(key) dict.__setitem__(self, key, value) def __delitem__(self, key): + """Delete a key and remove it from self.names.""" self.names.remove(key) def update(self, dct): + """Update from another dict, appending new keys to self.names. + + Args: + dct: Dict-like object whose items will be merged into self. + """ for key in dct: if key not in self.names: self.names.append(key) dict.update(self, dct) def setdefault(self, key, value): + """Set key to value only if key is absent, tracking order. + + Args: + key: Key to look up or set. + value: Default value to assign if key is missing. + """ if key not in self.names: self.names.append(key) dict.setdefault(self, key, value) def clear(self): + """Remove all items and reset self.names to an empty list.""" self.names = [] dict.clear(self) # keys are always sorted in order added def keys(self): + """Return keys in insertion order. + + Returns: + List of key names in insertion order. + """ return list(self.names) def iterkeys(self): + """Iterate over keys in insertion order. + + Returns: + Iterator over key name strings. + """ return iter(self.names) def values(self): + """Return values in key insertion order. + + Returns: + List of sequence strings in the same order as self.names. + """ return [self[key] for key in self.iterkeys()] def itervalues(self): + """Iterate over values in key insertion order. + + Returns: + Generator yielding sequence strings in insertion order. + """ def func(): for key in self.iterkeys(): yield self[key] return func() def iteritems(self): + """Iterate over (key, value) pairs in key insertion order. + + Returns: + Generator yielding (name, sequence) tuples. 
+ """ def func(): for key in self.iterkeys(): yield (key, self[key]) return func() def items(self): + """Return list of (key, value) pairs in insertion order. + + Returns: + List of (name, sequence) tuples. + """ return list(self.iteritems()) def __iter__(self): + """Iterate over keys in insertion order.""" return iter(self.names) def __len__(self): + """Return the number of sequences stored.""" return len(self.names) @@ -210,6 +298,17 @@ def __len__(self): # hydrophobic / hydrophilic def hydrophobic(aa): + """Return a numeric hydrophobicity score for a single amino acid. + + Args: + aa: Single-letter amino-acid code string. + + Returns: + 2.0 for strongly hydrophobic residues (VILMFWC), + 1.0 for weakly hydrophobic residues (AYHTSPG), + 0.5 for weakly hydrophilic residues (RK), + 0.0 for all other residues. + """ if aa in 'VILMFWC': return 2.0 if aa in 'AYHTSPG': return 1.0 if aa in 'RK': return 0.5 @@ -309,7 +408,24 @@ def hydrophobic(aa): # class TranslateError (Exception): + """Exception raised when a codon cannot be translated correctly. + + Attributes: + aa: The amino-acid sequence string being reverse-translated. + dna: The original DNA sequence string. + a: The amino-acid character that triggered the error. + codon: The DNA codon that did not match. + """ def __init__(self, msg, aa, dna, a, codon): + """Initialize a TranslateError. + + Args: + msg: Human-readable error message. + aa: Amino-acid sequence being processed. + dna: Original DNA sequence. + a: The amino-acid character at the point of failure. + codon: The DNA codon at the point of failure. + """ Exception.__init__(self, msg) self.aa = aa self.dna = dna @@ -319,7 +435,22 @@ def __init__(self, msg, aa, dna, a, codon): def translate(dna, table=CODON_TABLE): - """Translates DNA (with gaps) into amino-acids""" + """Translate a DNA sequence (with gaps) into an amino-acid sequence. + + Codons containing "N" are translated to "X" (unknown amino acid). + Gap codons "---" are translated to "-". 
+ + Args: + dna: DNA string whose length must be a multiple of 3. + table: Codon-to-amino-acid lookup dict (default CODON_TABLE). + + Returns: + Amino-acid sequence string. + + Raises: + AssertionError: If len(dna) is not a multiple of 3. + KeyError: If a codon is not present in the codon table. + """ aa = [] @@ -335,9 +466,24 @@ def translate(dna, table=CODON_TABLE): def revtranslate(aa, dna, check=False): - """Reverse translates aminoacids (with gaps) into DNA + """Reverse-translate an amino-acid sequence (with gaps) back into DNA. + + The original ungapped DNA sequence must be supplied so that the correct + codons are restored. Gap characters "-" in aa are expanded to "---" in the + output. + + Args: + aa: Amino-acid string (may contain "-" gap characters). + dna: Original ungapped DNA string used to recover codons. + check: If True, verify that each codon translates back to the + expected amino acid (default False). - Must supply original ungapped DNA. + Returns: + DNA string with codons matching the amino-acid sequence. + + Raises: + TranslateError: If check=True and a codon does not translate to the + expected amino acid. """ seq = [] @@ -361,7 +507,17 @@ def revtranslate(aa, dna, check=False): "b":"v", "v":"b", "d":"h", "h":"d"} def revcomp(seq): - """Reverse complement a sequence""" + """Return the reverse complement of a DNA sequence. + + Handles IUPAC ambiguity codes as well as standard A/C/G/T bases (both + upper and lower case). + + Args: + seq: DNA sequence string. + + Returns: + Reverse-complemented DNA sequence string. + """ seq2 = [] for i in range(len(seq)-1, -1, -1): @@ -370,6 +526,14 @@ def revcomp(seq): def gcContent(seq): + """Compute the GC content fraction of a DNA sequence. + + Args: + seq: DNA sequence string containing A, C, G, and T characters. + + Returns: + GC fraction as a float in [0.0, 1.0]. 
+ """ # Inlined util.histDict: build a frequency dict of characters hist = {} for c in seq: @@ -392,6 +556,23 @@ def gcContent(seq): def evolveKimuraSeq(seq, time, alpha=1, beta=1): + """Evolve a DNA sequence under the Kimura two-parameter model. + + Each base is independently substituted according to transition (alpha) + and transversion (beta) rate parameters over the given evolutionary time. + + Args: + seq: DNA sequence string (uppercase A/C/G/T only). + time: Evolutionary branch length (substitutions per site). + alpha: Transition rate parameter (default 1). + beta: Transversion rate parameter (default 1). + + Returns: + Evolved DNA sequence string of the same length as seq. + + Raises: + AssertionError: If substitution probabilities do not sum to one. + """ probs = { 's': .25 * (1 - math.e**(-4 * beta * time)), 'u': .25 * (1 + math.e**(-4 * beta * time) @@ -418,6 +599,20 @@ def evolveKimuraSeq(seq, time, alpha=1, beta=1): def evolveKimuraBase(base, time, alpha, beta): + """Evolve a single DNA base under the Kimura two-parameter model. + + Args: + base: A single DNA base character (A/C/G/T). + time: Evolutionary branch length. + alpha: Transition rate parameter. + beta: Transversion rate parameter. + + Returns: + The (possibly substituted) DNA base character. + + Raises: + AssertionError: If substitution probabilities do not sum to one. + """ probs = { 's': .25 * (1 - math.e**(-4 * beta * time)), 'u': .25 * (1 + math.e**(-4 * beta * time) diff --git a/src/seqlib/seqstats.py b/src/seqlib/seqstats.py index c587157..77f7ccc 100644 --- a/src/seqlib/seqstats.py +++ b/src/seqlib/seqstats.py @@ -1,4 +1,18 @@ #!/usr/bin/env python +"""Statistical utilities for peak enrichment analysis in RNA immunoprecipitation and ChIP-Seq experiments. + +Implements a PeakSeq-like approach for comparing experimental (RIP or ChIP) +BAM files against input/IgG control BAM files. The pipeline: + +1. 
Segments the genome into fixed-size bins and counts reads in each bin for + both the experimental and control samples. +2. Determines a global normalisation factor (alpha) via linear regression on + bins that have reads in both samples. +3. Tests each interval in a BED file using a binomial model (reads from the + experimental sample vs. alpha-scaled control reads) to assign p-values. +4. Corrects for multiple testing using Benjamini-Hochberg FDR correction. +5. Outputs results to stdout sorted or filtered by q-value. +""" import getopt import math import sys @@ -12,12 +26,16 @@ #from rpy2 import robjects #from seqtools.genome import chr_lengths,genome_length -"""Collection of utilities for determining peak enrichment in xxx-Seq experiments""" - ################# #Main ################# def main(): + """Legacy command-line entry point — reads three positional arguments and runs smRNApeakSeq. + + Expects sys.argv to contain: expBam ctlBam bedFile. Calls smRNApeakSeq + with filter=False and the module-level useStrand variable. Prefer + newMain() for proper option parsing. + """ expBam = sys.argv[1] ctlBam = sys.argv[2] bedFile = sys.argv[3] @@ -29,6 +47,27 @@ def main(): #Wrappers ######################## def smRNApeakSeq(expBam,ctlBam,bedFile,cutoff = 0.0001,filter=True,useStrand=True): + """Runs the full smRNA/RIP-Seq peak-calling pipeline and writes results to stdout. + + Segments the genome, computes a normalisation factor between experimental + and control BAM files, tests each BED interval with a binomial model, + applies Benjamini-Hochberg FDR correction, and prints tab-delimited + output. + + Args: + expBam: Path to a sorted, indexed BAM file from the experimental + (RIP/ChIP) sample. + ctlBam: Path to a sorted, indexed BAM file from the control (IgG or + input) sample. + bedFile: Path to a BED file of candidate intervals to test. + cutoff: Q-value threshold below which results are printed when filter + is True (default: 0.0001). 
+ filter: If True, only print intervals with q-value <= cutoff. If + False, print all intervals (default: True). + useStrand: If True, count only reads on the same strand as each + interval. If False, count all reads regardless of strand + (default: True). + """ #open files expHandle = pysam.Samfile(expBam,'rb') ctlHandle = pysam.Samfile(ctlBam,'rb') @@ -151,8 +190,24 @@ def cumBinom(nExp,adjCtl,P=0.5): return 1-scipy.stats.binom.cdf(nExp-1,nExp+adjCtl,P) def testInterval(interval,expHandle,ctlHandle,alpha): - """ - #TODO:Make sure that this is only grabbing the appropriate strand and not both....this can be dangerous + """Tests a single genomic interval for strand-aware read enrichment. + + Counts reads on the same strand as the interval from both the experimental + and control BAM files, scales the control count by alpha, and returns + a binomial p-value. + + Args: + interval: An intervallib.Interval object with chr, start, end, and + strand attributes. + expHandle: A pysam AlignmentFile for the experimental sample. + ctlHandle: A pysam AlignmentFile for the control sample. + alpha: Normalisation factor (slope from getAlpha) used to scale + control counts to match the experimental library size. + + Returns: + A tuple (pVal, nExp, adjCtl) where pVal is the binomial p-value, + nExp is the raw experimental read count, and adjCtl is the + alpha-scaled control read count. """ #expCounter = mySam.Counter() @@ -172,6 +227,24 @@ def testInterval(interval,expHandle,ctlHandle,alpha): return cumBinom(nExp,nCtl*alpha),nExp,nCtl*alpha def testIntervalNoStrand(interval,expHandle,ctlHandle,alpha): + """Tests a single genomic interval for read enrichment ignoring strand. + + Counts all reads (both strands) overlapping the interval from experimental + and control BAM files, scales control count by alpha, and returns a + binomial p-value. + + Args: + interval: An intervallib.Interval object with chr, start, and end + attributes. 
+ expHandle: A pysam AlignmentFile for the experimental sample. + ctlHandle: A pysam AlignmentFile for the control sample. + alpha: Normalisation factor used to scale control counts. + + Returns: + A tuple (pVal, nExp, adjCtl) where pVal is the binomial p-value, + nExp is the raw experimental read count, and adjCtl is the + alpha-scaled control read count. + """ expCounter = mySam.Counter() ctlCounter = mySam.Counter() expFetch = expHandle.fetch(interval.chr,interval.start,interval.end,callback=expCounter) @@ -235,19 +308,56 @@ def poissonProb(lamb,height): ######################### def slope(xarray,yarray): - """Uses numpy, in fact assumes that the list arguments are numpy arrays.""" + """Computes the slope of the ordinary least-squares regression line. + + Uses numpy arrays for efficient computation. The slope is: + m = (n*sum(x*y) - sum(x)*sum(y)) / (n*sum(x^2) - (sum(x))^2) + + Args: + xarray: A numpy array of x (independent variable) values. + yarray: A numpy array of y (dependent variable) values of the same + length as xarray. + + Returns: + The slope of the linear regression line (float). + """ n = float(len(xarray)) m = (n*sum(xarray*yarray)-sum(xarray)*sum(yarray))/(n*sum(xarray**2)-(sum(xarray))**2) return m def intercept(xarray,yarray): - """Uses numpy, in fact assumes that the list arguments are numpy arrays.""" + """Computes the y-intercept of the ordinary least-squares regression line. + + Uses numpy arrays for efficient computation. The intercept is: + b = (sum(y) - m*sum(x)) / n + + Args: + xarray: A numpy array of x (independent variable) values. + yarray: A numpy array of y (dependent variable) values of the same + length as xarray. + + Returns: + The y-intercept of the linear regression line (float). + """ m = slope(xarray,yarray) n = float(len(xarray)) b = (sum(yarray)-m*(sum(xarray)))/n return b def getSegmentCounts(bamHandle,segSize=10000): + """Counts reads in fixed-size genomic bins across all chromosomes in a BAM file. 
+
+    Iterates over all chromosomes and divides each into bins of segSize base
+    pairs, counting the total number of reads per bin using mySam.Counter.
+
+    Args:
+        bamHandle: A pysam AlignmentFile opened for reading.
+        segSize: Bin size in base pairs (default: 10000).
+
+    Returns:
+        A numpy array of read counts, one element per bin, ordered by
+        chromosome then genomic position.
+    """
     chrs = bamHandle.references
     chr_lengths = bamHandle.lengths
     bins = numpy.zeros(sum(chr_lengths)//segSize+len(chrs))
@@ -263,11 +373,43 @@
     return bins
 
 def getNonZeroIndices(bins1,bins2):
+    """Returns the indices of bins that have non-zero counts in both arrays.
+
+    Used to restrict linear regression normalisation to bins that are
+    informative in both the experimental and control samples.
+
+    Args:
+        bins1: A numpy array of read counts (e.g. experimental sample bins).
+        bins2: A numpy array of read counts (e.g. control sample bins) of
+            the same length as bins1.
+
+    Returns:
+        A list of integer indices where both bins1 and bins2 have non-zero
+        values.
+    """
     set1 = set(numpy.nonzero(bins1)[0])
     set2 = set(numpy.nonzero(bins2)[0])
    return list(set1.intersection(set2))
 
 def getAlpha(expBins,ctlBins,index):
+    """Computes the normalisation factor (alpha) between experimental and control samples.
+
+    Fits an ordinary least-squares linear regression on the subset of bins
+    specified by index, treating control counts as x and experimental counts
+    as y. The slope is used to scale the control sample to the experimental
+    library size. (Note: slope fits with an intercept, not through the origin.)
+
+    Args:
+        expBins: Numpy array of per-bin read counts for the experimental
+            sample.
+        ctlBins: Numpy array of per-bin read counts for the control sample.
+        index: List of integer indices identifying informative bins (non-zero
+            in both arrays).
+
+    Returns:
+        Alpha (float): the slope of the linear regression, used as the
+        multiplicative scaling factor for control counts. 
+ """ return slope(ctlBins[index],expBins[index]) def getAlphaFromLinReg(exp,ctl,r): @@ -304,10 +446,35 @@ def getAlphaFromLinReg(exp,ctl,r): ''' class Usage(Exception): + """Exception raised for command-line usage errors in seqstats. + + Attributes: + msg: Human-readable explanation of the error or the help message. + """ def __init__(self, msg): + """Initialises a Usage exception. + + Args: + msg: Human-readable error or help text. + """ self.msg = msg def newMain(argv=None): + """Command-line entry point for the seqstats peak-calling pipeline. + + Parses command-line options and delegates to smRNApeakSeq. Supports + optional strand-specific counting, q-value filtering, and verbose output. + + Args: + argv: List of command-line argument strings. Defaults to sys.argv + when None. + + Returns: + 2 on usage error, None on success. + + Raises: + SystemExit: Indirectly via sys.exit() on usage error. + """ if argv is None: argv = sys.argv try: diff --git a/src/seqlib/sequencelib.py b/src/seqlib/sequencelib.py index 9071876..aa4c77e 100644 --- a/src/seqlib/sequencelib.py +++ b/src/seqlib/sequencelib.py @@ -1,4 +1,10 @@ #/usr/bin/env python +"""Sequence utility functions for DNA/RNA analysis. + +Provides parsers, generic sequence tools, and motif tools for working +with biological sequence data including FASTA parsing, complement +computation, GC content, k-mer analysis, and random sequence generation. +""" import math import operator import random @@ -11,11 +17,26 @@ #Parsers ###### def FastaIterator(handle): - """ - Generator function to iterate over fasta records in : - Use in a loop to apply to each Seq record contained in a .fasta file - Input: record handle as obtained by handle = open(,'r') - Returns an iterator across Sequences in file + """Iterate over FASTA records in an open file handle. + + Skips any header text before the first '>' character, then yields + one record dict per FASTA entry. 
Each sequence has internal + whitespace stripped and lines joined into a single string. + + Args: + handle: A readable file object (e.g. opened with ``open(path, 'r')``) + positioned at or before the first FASTA record. + + Yields: + A dict with keys: + ``'name'``: The record header string (everything after ``>`` + on the header line, whitespace-stripped). + ``'sequence'``: The concatenated sequence string with all + internal spaces removed. + + Raises: + ValueError: If a record block does not begin with a ``>`` + character as required by the FASTA format. """ #Skip any header text while True: @@ -49,6 +70,24 @@ def FastaIterator(handle): ### def complement(s): + """Return the base-by-base complement of a DNA sequence as a list. + + Handles both upper- and lower-case input characters. Note that the + lower-case mapping contains a known quirk: ``'c'`` maps to ``'t'`` + instead of ``'g'``. + + Args: + s: An iterable of single-character DNA bases (``A``, ``T``, ``G``, + ``C`` in either case). + + Returns: + A list of single-character strings representing the complemented + bases in the same order as the input. + + Raises: + KeyError: If a character in ``s`` is not present in the complement + lookup table. + """ comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 't', 'g': 'c', 't': 'a' } @@ -56,27 +95,103 @@ def complement(s): return complseq def reverse_complement(s): + """Return the reverse complement of a DNA sequence string. + + Reverses the sequence and then complements each base using + :func:`complement`. + + Args: + s: A DNA sequence string containing bases ``A``, ``T``, ``G``, ``C`` + (upper or lower case). + + Returns: + A string that is the reverse complement of ``s``. + """ seq = list(s) seq.reverse() return ''.join(complement(seq)) def rcomp(s): - """Does same thing as reverse_complement only cooler""" + """Return the reverse complement of an uppercase DNA string. 
+ + Uses ``str.translate`` with a precomputed translation table for + ``A<->T`` and ``C<->G``, then reverses the result with a slice. + Equivalent to :func:`reverse_complement` but operates only on + uppercase bases via the translation table. + + Args: + s: An uppercase DNA sequence string (``A``, ``T``, ``C``, ``G``). + + Returns: + A string that is the reverse complement of ``s``. + """ return s.translate(string.maketrans("ATCG","TAGC"))[::-1] def getTm(seq): + """Calculate the melting temperature (Tm) of a DNA sequence. + + Uses the nearest-neighbour-inspired empirical formula:: + + Tm = 79.8 + 18.5*log10([Na+]) + 58.4*GC + 11.8*GC^2 - 820/len + + where ``[Na+]`` is fixed at 0.05 M and ``GC`` is the fractional GC + content of the sequence. + + Args: + seq: A DNA sequence string. + + Returns: + The estimated melting temperature in degrees Celsius as a float. + """ Tm = 79.8 + 18.5*math.log10(0.05) + (58.4 * getGC(seq)) + (11.8 * getGC(seq)**2) - (820/len(seq)) return Tm def getGC(seq): + """Return the fractional GC content of a DNA sequence. + + Counts both upper- and lower-case ``G`` and ``C`` characters and + divides by the total sequence length. + + Args: + seq: A DNA sequence string. + + Returns: + A float in [0, 1] representing the proportion of G and C bases. + """ return (seq.count('C')+seq.count('G')+seq.count('c')+seq.count('g'))/float(len(seq)) def gc_content(seq): + """Return the percentage GC content of a nucleotide sequence. + + Counts G and C characters (upper and lower case) and divides by the + sum of all A, T, U, G, C characters (upper and lower case), ignoring + any ambiguity codes or gap characters. The result is scaled to + percentage (0–100). + + Args: + seq: A DNA or RNA sequence string. + + Returns: + A float representing GC content as a percentage (0–100). + """ gc = mcount(seq, 'GCgc') at = mcount(seq, 'ATUatu') return 100*gc/float((gc+at)) def mcount(s, chars): + """Count all occurrences of any character in ``chars`` within string ``s``. 
+ + Iterates over each character in ``chars`` and accumulates the count of + its appearances in ``s`` using ``string.count``. + + Args: + s: The string to search within. + chars: A string whose individual characters are each counted in ``s``. + + Returns: + The total number of occurrences of any character from ``chars`` + found in ``s`` as an integer. + """ # sums the counts of appearances of each char in chars count = 0 for char in chars: @@ -84,6 +199,26 @@ def mcount(s, chars): return count def prob_seq(seq, pGC=.5): + """Return the probability of a DNA sequence under a background GC model. + + Assumes each position is independently drawn from a 4-letter alphabet + where G and C each have probability ``pGC/2`` and A and T each have + probability ``(1-pGC)/2``. The joint probability is the product of + per-position probabilities. + + Args: + seq: A DNA sequence string containing only ``A``, ``T``, ``G``, + or ``C`` characters (upper case). + pGC: The background GC probability in [0, 1]. Defaults to 0.5. + + Returns: + The probability of observing ``seq`` under the model as a float. + + Raises: + AssertionError: If ``pGC`` is outside [0, 1]. + ValueError: If ``seq`` contains a character other than + ``A``, ``T``, ``G``, or ``C``. + """ # given a GC content, what is the probability # of getting the particular sequence @@ -98,10 +233,36 @@ def prob_seq(seq, pGC=.5): return reduce(operator.mul, ps, 1) def transcribe(seq): + """Transcribe a DNA sequence to RNA by replacing thymine with uracil. + + Performs a simple string substitution of every uppercase ``'T'`` + with ``'U'``. Lower-case ``'t'`` characters are not converted. + + Args: + seq: A DNA sequence string (upper case ``T`` will be replaced). + + Returns: + The RNA sequence string with all ``'T'`` characters replaced by + ``'U'``. + """ RNA = seq.replace('T', 'U') return RNA def GenRandomSeq(length, type='DNA'): + """Generate a random nucleotide sequence of a given length. 
+ + Each position is drawn uniformly and independently from the + four-letter alphabet appropriate for the requested sequence type. + + Args: + length: The number of nucleotides in the returned sequence. + type: The sequence type: ``'DNA'`` (alphabet ``A``, ``T``, ``G``, + ``C``) or ``'RNA'`` (alphabet ``A``, ``U``, ``G``, ``C``). + Defaults to ``'DNA'``. + + Returns: + A random nucleotide sequence string of the specified length. + """ if type == 'DNA': chars = ['A','T','G','C'] if type == 'RNA': @@ -109,9 +270,31 @@ def GenRandomSeq(length, type='DNA'): return ''.join([random.choice(chars) for i in range(length)]) def seed(): + """Re-seed the random number generator from the current system time. + + Calls :func:`random.seed` with no arguments, which uses the OS entropy + source or the current time as the seed. Useful for resetting + deterministic state after a fixed seed has been set elsewhere. + """ random.seed() def draw(distribution): + """Draw a random index from a discrete probability distribution. + + Iterates through the distribution, accumulating a running sum, and + returns the index of the first element whose cumulative sum exceeds + a uniformly drawn random number. + + Args: + distribution: A list of non-negative floats that sum to + approximately 1.0. Element ``i`` represents the probability + of returning index ``i``. + + Returns: + An integer index into ``distribution`` sampled according to the + distribution's probabilities, or ``None`` if no element was + selected (which can occur when probabilities do not sum to 1). + """ sum=0 r = random.random() for i in range(0,len(distribution)): @@ -120,6 +303,22 @@ def draw(distribution): return i def makeDistFromFreqs(freqs): + """Build a cumulative distribution list from a nucleotide frequency dict. + + Converts a dictionary of base frequencies into a list of cumulative + boundary values suitable for use with :func:`draw`. Bases are + processed in the fixed order ``A``, ``T``, ``C``, ``G``. 
+ + Args: + freqs: A dictionary mapping nucleotide characters (``'A'``, + ``'T'``, ``'C'``, ``'G'``) to their relative frequencies. + Values should be non-negative and sum to 1.0. + + Returns: + A list of five floats: the initial ``0.0`` followed by the + cumulative sum after adding each of ``A``, ``T``, ``C``, ``G`` + in that order. + """ res = [] chars = ['A','T','C','G'] cum = 0 @@ -130,8 +329,21 @@ def makeDistFromFreqs(freqs): return res def genRandomFromDist(length,freqs): - """Generates a random sequence of length 'length' drawing from a distribution of - base frequencies in a dictionary""" + """Generate a random DNA sequence drawn from a given base-frequency distribution. + + Builds a cumulative distribution from ``freqs`` and samples each + position independently using :func:`draw`. + + Args: + length: The number of nucleotides in the returned sequence. + freqs: A dictionary mapping nucleotide characters (``'A'``, + ``'T'``, ``'C'``, ``'G'``) to their probabilities. Values + should be non-negative and sum to 1.0. + + Returns: + A random DNA sequence string of the specified length, with each + base sampled proportionally to its frequency. + """ myDist = makeDistFromFreqs(freqs) chars = ['A','T','C','G'] return ''.join([chars[draw(myDist)] for i in range(length)]) @@ -140,6 +352,29 @@ def genRandomFromDist(length,freqs): #Motif Tools ########### def allindices(string, sub, listindex=[], offset=0): + """Find all start indices of substring ``sub`` within ``string``. + + Searches for non-overlapping occurrences of ``sub`` in ``string`` + starting from ``offset`` and appends each found index to + ``listindex``. + + Warning: + ``listindex`` uses a mutable default argument. Repeated calls + without explicitly passing a new list will accumulate results + across calls. + + Args: + string: The string to search within. + sub: The substring to search for. + listindex: A list to which found indices are appended. + Defaults to a shared mutable list (see warning above). 
+        offset: The character position at which to start the search.
+            Defaults to 0.
+
+    Returns:
+        The ``listindex`` list (same object passed in) with the start
+        positions of all occurrences of ``sub`` appended.
+    """
     i = string.find(sub, offset)
     while i >= 0:
         listindex.append(i)
@@ -147,6 +382,19 @@
     return listindex
 
 def find_all(seq, sub):
+    """Find all start positions of a substring within a sequence string.
+
+    Iterates through ``seq`` looking for non-overlapping occurrences of
+    ``sub`` using :func:`string.find` and collects each start index.
+
+    Args:
+        seq: The sequence string to search within.
+        sub: The substring to search for.
+
+    Returns:
+        A list of integer start positions (0-based) of all occurrences of
+        ``sub`` in ``seq``. Returns an empty list if ``sub`` is not found.
+    """
     #print "Looking for %s in %s"%(sub,seq)
     found = []
     next = string.find(seq,sub)
@@ -156,7 +404,25 @@
     return found
 
 def kmer_dictionary_counts(seq,k,dic={}):
-    """Returns a dictionary of k,v = kmer:'count of kmer in seq'"""
+    """Count k-mer occurrences in a sequence and store them in a dictionary.
+
+    Slides a window of width ``k`` across ``seq``; note the loop bound of
+    ``len(seq)-k`` means the final k-mer of ``seq`` is never counted.
+
+    Warning:
+        ``dic`` uses a mutable default argument. Repeated calls without
+        explicitly passing a fresh dict will accumulate counts across calls.
+
+    Args:
+        seq: The nucleotide (or any) sequence string to count k-mers in.
+        k: The length of each k-mer.
+        dic: A dictionary to update with k-mer counts. Defaults to a
+            shared mutable dict (see warning above).
+
+    Returns:
+        The updated ``dic`` dictionary mapping each k-mer string to its
+        occurrence count in ``seq``. 
+ """ for i in range(0, len(seq)-k): subseq = seq[i:][:k] #if not dic.has_key(subseq): dic[subseq] = 1 @@ -166,15 +432,53 @@ def kmer_dictionary_counts(seq,k,dic={}): return dic def kmer_dictionary(seq,k,dic={},offset=0): - """Returns dictionary of k,v = kmer:'list of kmer start positions in seq' """ + """Build a dictionary mapping each k-mer to its start positions in a sequence. + + Slides a window of width ``k`` across ``seq`` and records each + 1-based start position under the corresponding k-mer key. + + Warning: + ``dic`` uses a mutable default argument. Repeated calls without + passing a fresh dict will accumulate positions across calls. + + Args: + seq: The nucleotide (or any) sequence string to index. + k: The length of each k-mer. + dic: A dictionary to update with k-mer position lists. Defaults + to a shared mutable dict (see warning above). + offset: Unused parameter retained for API compatibility. + + Returns: + The updated ``dic`` dictionary mapping each k-mer string to a list + of 1-based integer start positions at which it occurs in ``seq``. + """ for i in range(0,len(seq)-k): subseq = seq[i:][:k] dic.setdefault(subseq,[]).append(i+1) return dic def kmer_stats(kmer,dic,genfreqs): - """Takes as argument a kmer string, a dictionary with kmers as keys from kmer_dictionary_counts, and a dictionary - of genomic frequencies with kmers as keys. Returns a dictionary of stats for kmer ("Signal2Noise Ratio, Z-score") + """Compute enrichment statistics for a k-mer relative to genomic background. + + Calculates the signal-to-noise ratio (SNR) and Z-score for the + observed count of ``kmer`` in a sequence compared to the count + expected under a genomic-frequency background model. + + The expected count is ``sum(dic.values()) * genfreqs[kmer]``. + + Args: + kmer: The k-mer string to evaluate. + dic: A dictionary mapping k-mer strings to their observed counts, + as returned by :func:`kmer_dictionary_counts`. 
+ genfreqs: A dictionary mapping k-mer strings to their expected + background frequencies (floats summing to 1 across all k-mers + of that length). + + Returns: + A dict with keys ``'snr'`` (signal-to-noise ratio) and + ``'zscore'`` (Z-score) if ``kmer`` is present in both ``dic`` and + ``genfreqs``. Returns ``None`` if ``dic`` is empty or ``kmer`` + is absent from either dictionary. """ if not dic: return if kmer in dic.keys() and kmer in genfreqs.keys(): @@ -186,6 +490,28 @@ def kmer_stats(kmer,dic,genfreqs): else: return def get_seeds(iter,seeds={}): + """Collect and count 7-mer seeds from an iterable of sequence records. + + Iterates over sequence records, converts each from colorspace to DNA + (by calling ``CSToDNA()`` on each record), extracts a 7-base seed + from positions 1–7 (1-based, i.e. ``sequence[1:8]``), and counts the + occurrences of each seed. Prints progress every 10 000 records. + + Warning: + ``seeds`` uses a mutable default argument. Repeated calls + without passing a fresh dict will accumulate counts across calls. + + Args: + iter: An iterable of sequence-record objects. Each object must + have a ``sequence`` attribute and a ``CSToDNA()`` method that + converts colorspace encoding to DNA in-place. + seeds: A dictionary to update with seed counts. Defaults to a + shared mutable dict (see warning above). + + Returns: + The updated ``seeds`` dictionary mapping 7-mer seed strings to + their occurrence counts. + """ counter = 0 for i in iter: counter+=1 diff --git a/src/seqlib/shrimp.py b/src/seqlib/shrimp.py index 9dd637d..b328518 100644 --- a/src/seqlib/shrimp.py +++ b/src/seqlib/shrimp.py @@ -1,4 +1,13 @@ #!/usr/bin/python +"""Utilities for running and parsing SHRiMP colorspace short-read alignments. + +Provides classes and functions for preparing input files, submitting jobs to +LSF, parsing SHRiMP v1.1+ alignment output, and parsing probcalc statistical +output for colorspace (SOLiD) short-read sequencing data. 
+ +SHRiMP (Short Read Mapping Package) aligns colorspace reads from the Applied +Biosystems SOLiD platform to a reference genome. +""" import glob import os import random @@ -25,9 +34,43 @@ order = ["readname","contigname","strand","contigstart","contigend","readstart","readend","readlength","score","editstring","readsequence"] ####################### class ShrimpRead(Alignment): - """Extends Alignment class to include a few SHRiMP-specific attributes and methods""" - + """Extends Alignment class to include SHRiMP-specific attributes and methods. + + Represents a single read alignment produced by the SHRiMP rmapper-cs + aligner. In addition to the base Alignment attributes, stores colorspace + edit string information and counts of mismatches and crossover events. + + Attributes: + readstart: 0-based start position within the read. + readend: End position within the read. + readcount: Number of times this read sequence was observed. + editstring: SHRiMP edit string describing mismatches and crossovers + relative to the reference. + readlength: Length of the read in bases. + crossovers: Number of colorspace crossover errors ('x') in the edit + string. + nSNPs: Number of apparent SNP calls (A/C/G/T substitutions) in the + edit string. + aligner: Always 'shrimp' for this class. + """ + def __init__(self,readname,chr,start,end,strand,readstart,readend,score,readcount,readsequence,editstring,readlength): + """Initialises a ShrimpRead from parsed SHRiMP alignment fields. + + Args: + readname: Unique identifier (nuID or read name) of the read. + chr: Chromosome name of the alignment target. + start: 0-based genomic start coordinate. + end: Genomic end coordinate. + strand: Strand orientation ('+' or '-'). + readstart: Start position within the read sequence. + readend: End position within the read sequence. + score: Alignment score from SHRiMP. + readcount: Number of times this read was observed (from nuID). + readsequence: The read sequence string. 
+ editstring: SHRiMP edit/cigar string. + readlength: Length of the read. + """ Alignment.__init__(self,readname,chr,start,end,strand,score,readcount,readsequence) self.readstart = int(readstart) self.readend = int(readend) @@ -41,17 +84,58 @@ def __init__(self,readname,chr,start,end,strand,readstart,readend,score,readcoun self.aligner = "shrimp" def __len__(self): + """Returns the length of the read.""" return self.readlength - + def __str__(self): + """Returns a short string representation: 'SHRiMP:readname:chr:start:end'.""" return "SHRiMP:%s:%s:%d:%d" % (self.readname,self.chr,self.start,self.end) - + def shrimpString(self): + """Returns the alignment formatted as a SHRiMP output line. + + The returned string matches the tab-delimited format produced by + SHRiMP, beginning with '>readsequence_xreadcount'. + + Returns: + A SHRiMP-format alignment string ending with a newline. + """ return ">%s_x%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\n" % (self.readsequence,self.readcount,self.chr,self.strand,self.start,self.end,self.readstart,self.readend,self.readlength,self.score,self.editstring,self.readsequence) class ProbCalcRead(ShrimpRead): - """Extends ShrimpRead class to include statistical output from probcalc""" + """Extends ShrimpRead to include statistical scores from the SHRiMP probcalc utility. + + The probcalc utility assigns probabilistic scores to SHRiMP alignments + to help distinguish true genomic mappings from chance alignments. In + addition to ShrimpRead attributes, this class stores the normalised odds + ratio and two probability scores. + + Attributes: + normodds: Normalised odds ratio for this alignment. + pgenome: Probability that the read originated from the genome. + pchance: Probability that the alignment is due to chance. + """ def __init__(self,readname,chr,start,end,strand,readstart,readend,score,readcount,editstring,readlength,normodds,pgenome,pchance,readsequence=''): + """Initialises a ProbCalcRead from probcalc output fields. 
+ + Args: + readname: nuID-encoded read name; the read sequence is decoded + from this via misc.nuID2seq. + chr: Chromosome name of the alignment target. + start: 0-based genomic start coordinate. + end: Genomic end coordinate. + strand: Strand orientation ('+' or '-'). + readstart: Start position within the read. + readend: End position within the read. + score: SHRiMP alignment score. + readcount: Observation count encoded in the read name. + editstring: SHRiMP edit string. + readlength: Length of the read. + normodds: Normalised odds ratio from probcalc. + pgenome: Probability the read originates from the genome. + pchance: Probability of a chance alignment. + readsequence: Optional read sequence string (default: ''). + """ ShrimpRead.__init__(self,readname,chr,start,end,strand,readstart,readend,score,readcount,readsequence,editstring,readlength) self.readsequence = misc.nuID2seq(self.readname) self.normodds = float(normodds) @@ -78,6 +162,16 @@ def prepShrimp(file,basedir,binSize=1000): os.chdir(curDir) def GenRandom(length = 10, chars=string.letters+string.digits): + """Generates a random alphanumeric string of the given length. + + Args: + length: Number of characters in the returned string (default: 10). + chars: Pool of characters to sample from (default: all ASCII letters + and digits). + + Returns: + A random string of the specified length drawn from chars. + """ return ''.join([random.choice(chars) for i in range(length)]) def submitShrimp(queue="broad",cwd = os.getcwd(),outDir="../results/",readLength=25): diff --git a/src/seqlib/smRNA.py b/src/seqlib/smRNA.py index e93e0f6..a6dae22 100644 --- a/src/seqlib/smRNA.py +++ b/src/seqlib/smRNA.py @@ -1,12 +1,14 @@ #!/usr/bin/env python -''' -Created on Oct 8, 2009 -Generates list of candidate siRNAs from .fasta sequence given as argument +"""Tools for designing small RNA molecules including siRNAs, dsRNAs, and ASOs. 
-@author: lgoff +Generates and scores candidate siRNA sequences from FASTA input according to +published design rules for RNA interference (RNAi). Also includes support for +RNA activation (RNAa) dsRNA design based on Vera et al. criteria, and +antisense oligonucleotide (ASO) scanning. -Reference: http://www.protocol-online.org/prot/Protocols/Rules-of-siRNA-design-for-RNA-interference--RNAi--3210.html -''' +Reference: + http://www.protocol-online.org/prot/Protocols/Rules-of-siRNA-design-for-RNA-interference--RNAi--3210.html +""" import math import sys @@ -14,7 +16,15 @@ def main(fastaFile): - """Do it all""" + """Runs the full siRNA candidate pipeline on a FASTA file. + + Opens the FASTA file, iterates over each record, and prints candidate + siRNA sequences with their scores to stdout using evaluateSequence. + + Args: + fastaFile: Path to a FASTA-format file containing one or more + nucleotide sequences to screen for siRNA candidates. + """ handle = open(fastaFile,'r') iter = sequencelib.FastaIterator(handle) for i in iter: @@ -22,7 +32,17 @@ def main(fastaFile): evaluateSequence(i["sequence"]) def evaluateSequence(seq,scoreCutoff=6): - """Wrapper for testCandidate() that iterates across sequence provided and returns candidates with a score >= scoreCutoff (default = 6)""" + """Scans a nucleotide sequence for siRNA candidates meeting a score threshold. + + Slides a 21-nt window across the sequence, scores each window with + testCandidate, and prints passing candidates together with their BlockIt + insert sequences. + + Args: + seq: Nucleotide sequence string to scan. + scoreCutoff: Minimum score (inclusive) for a 21-mer to be reported + (default: 6). 
+ """ for i in range(0,len(seq)-21): candidate = seq[i:i+21] score = testCandidate(candidate) @@ -32,7 +52,27 @@ def evaluateSequence(seq,scoreCutoff=6): print("Fwd:%s\tRev:%s" % (insertSeqs[0],insertSeqs[1])) def testCandidate(seq): - """Checks 21mer candidates against siRNA rules and assigns a score on a scale of 0-8""" + """Scores a 21-mer siRNA candidate against established siRNA design rules. + + Evaluates the 21-nt sense strand against the following criteria: + 1. Moderate GC content (30-52%) — +1 point. + 2. At least 3 A/U nucleotides at positions 15-19 — +1 per A/U (up to +4). + 3. Lack of internal repeats (melting temperature < 20 °C) — +1 point. + 4. 'A' at position 19 — +1 point. + 5. 'A' at position 3 — +1 point. + 6. 'U' (T in DNA) at position 10 — +1 point. + 7. G or C at position 19 — -1 point. + 8. 'G' at position 13 — -1 point. + 9. Homopolymer run of 4 or more identical bases — -5 points per run. + + Args: + seq: A 21-nucleotide DNA-sequence string representing the siRNA sense + strand (case-insensitive; T is used in place of U). + + Returns: + Numeric score (float) on an approximate scale of 0-8. Returns False + if the sequence is not exactly 21 nt. + """ #seq = seq.upper() if len(seq)!=21: assert ValueError("Candidate is not 21nt in length") @@ -72,10 +112,31 @@ def testCandidate(seq): return score def getTm(seq): + """Calculates the melting temperature (Tm) of a nucleotide sequence. + + Uses an empirical formula suitable for oligonucleotides to estimate the + melting temperature in degrees Celsius assuming a salt concentration of + 50 mM: + Tm = 79.8 + 18.5*log10([Na+]) + 58.4*GC + 11.8*GC^2 - 820/len + + Args: + seq: A nucleotide sequence string. + + Returns: + Estimated melting temperature in degrees Celsius (float). + """ Tm = 79.8 + 18.5*math.log10(0.05) + (58.4 * getGC(seq)) + (11.8 * getGC(seq)**2) - (820/len(seq)) return Tm def getGC(seq): + """Calculates the GC content of a nucleotide sequence. 
+ + Args: + seq: A nucleotide sequence string (case-insensitive). + + Returns: + GC fraction as a float between 0.0 and 1.0. + """ seq = seq.upper() return (seq.count('C')+seq.count('G'))/float(len(seq)) @@ -83,8 +144,29 @@ def getGC(seq): #dsRNA rules from Vera et al. (updated 2-1-10) ###### def scanPromoter(promSeq): - """ - Evaluates candidate dsRNAs for RNAa from a given sequence. Returns a list of dictionaries of candidates and their score. + """Scans a promoter sequence for RNA activation (RNAa) dsRNA candidates. + + Slides a 19-nt window across the promoter sequence and scores each window + against design rules derived from Vera et al. for small activating RNA + (saRNA) design. Scoring rules include: + - GC content 40-65%: +1 point. + - Homopolymer run of 4 or more bases: -5 points per run. + - 'A' at position 19: +1 point. + - G or C at position 19: -1 point. + - 'A' at position 18: +2 points; 'T' at position 18: +1 point. + - 'T' at position 7: +1 point. + - 3 or more A/T nucleotides at positions 20-23 (3' flank): bonus points. + - Tm < 20 °C (low internal repeats): +1 point. + + Args: + promSeq: Promoter DNA/RNA sequence string to scan (case-insensitive; + converted to uppercase internally). + + Returns: + A list of candidate dictionaries sorted by descending score. Each + dictionary contains: 'seq' (19-nt candidate), 'pos' (position relative + to 3' end of promSeq), 'gc' (GC fraction), 'score' (float), and + 'Tm' (melting temperature in °C). """ promSeq = promSeq.upper() window = 19 @@ -141,8 +223,23 @@ def scanPromoter(promSeq): return sorted(candidates,key=lambda k: k['score'],reverse=True) def ASOscan(targetSeq): - """ - Evaluates candidate dsRNAs for RNAa from a given sequence. Returns a list of dictionaries of candidates and their score. + """Scans a target RNA sequence for antisense oligonucleotide (ASO) candidates. + + Reverse-complements the input sequence and slides a 20-nt window across + it to evaluate ASO design candidates. 
Each candidate is scored primarily + on GC content (45-65% preferred, +2 points) and melting temperature + (Tm > 45 °C preferred, +2 points), with penalties for homopolymer runs + of 4 or more bases (-5 points each). + + Args: + targetSeq: The target RNA/DNA sequence string (sense strand) to + design ASOs against. It is reverse-complemented internally. + + Returns: + A list of candidate dictionaries sorted by descending score, each + containing keys: 'seq' (20-nt candidate sequence), 'pos' (position + relative to 3' end of input), 'gc' (GC fraction), 'score' (float + total score), and 'Tm' (melting temperature in °C). """ targetSeq = sequencelib.rcomp(targetSeq) window = 20 @@ -200,6 +297,20 @@ def ASOscan(targetSeq): return sorted(candidates,key=lambda k: k['score'],reverse=True) def makeDsRNA(seq): + """Formats a 19-nt RNA sequence as a dsRNA oligonucleotide pair with TT 3' overhangs. + + Produces the sense and antisense strands in a format suitable for ordering + RNA oligonucleotides. Each nucleotide is prefixed with 'r' and the + sequence is terminated with a 'TT' 3' overhang. + + Args: + seq: A 19-nucleotide DNA sequence string representing the sense strand. + + Returns: + A list of two strings: [sense_strand_oligo, antisense_strand_oligo], + each formatted as individual 'r'-prefixed RNA nucleotides followed by + 'TT'. Returns False if the sequence is not exactly 19 nt. + """ if len(seq)!=19: assert ValueError("Candidate is not 19nt in length") return False @@ -208,7 +319,16 @@ def makeDsRNA(seq): return ["r"+"r".join(seq)+"TT","r"+"r".join(revSeq)+"TT"] def veraMain(fastaFile): - """Do it all""" + """Runs the full RNA activation (RNAa) dsRNA design pipeline on a FASTA file. + + Opens a FASTA file of promoter sequences, scans each sequence for RNAa + dsRNA candidates using scanPromoter, and prints the top 10 results with + their positions, sequences, scores, melting temperatures, GC fractions, + and formatted oligonucleotide sequences. 
+ + Args: + fastaFile: Path to a FASTA file of promoter sequences to scan. + """ handle = open(fastaFile,'r') iter = sequencelib.FastaIterator(handle) for i in iter: @@ -219,7 +339,18 @@ def veraMain(fastaFile): print("Pos:\t%d\nCandidate:\t%s\nScore:\t%.2f\nTm:\t%.2f\nGC:\t%.2f\nFwd:\t%s\nRev:\t%s\n------------------------" % (candidate['pos'],candidate['seq'],candidate['score'],candidate['Tm'],candidate['gc'],dsRNA[0],dsRNA[1])) def ASOMain(fastafile): - """Takes a fasta sequnce of RNAs, reverse-complements and scans for ASO sequences""" + """Runs the full ASO design pipeline on a FASTA file of RNA sequences. + + Opens the FASTA file, reverse-complements each record, scans for + antisense oligonucleotide (ASO) candidates using ASOscan, and prints the + top 10 uppercase candidates with their positions, sequences, scores, + melting temperatures, and GC fractions. Candidates containing lowercase + letters (ambiguous bases) are skipped. + + Args: + fastafile: Path to a FASTA file of RNA/DNA sequences for which ASOs + should be designed. + """ handle = open(fastafile,'r') iter = sequencelib.FastaIterator(handle) for i in iter: diff --git a/src/seqlib/solid.py b/src/seqlib/solid.py index da0cdef..9775c97 100644 --- a/src/seqlib/solid.py +++ b/src/seqlib/solid.py @@ -1,4 +1,15 @@ #!/usr/bin/python +"""Utilities for processing Applied Biosystems SOLiD colorspace sequencing data. + +Provides the CSSeq class for representing colorspace sequences, iterators for +reading .csfasta and .qual files, and functions for converting between +colorspace and DNA space, trimming linker sequences, building unique-read +tables, and generating FASTQ files compatible with Bowtie. + +SOLiD sequencing encodes each base as a color (0-3) that represents the +transition between successive dinucleotides. The first character of each +read is a nucleotide seed; subsequent characters are color codes. 
+""" import os import sys @@ -31,8 +42,33 @@ def linker_oligos(linker = P2_seq): #CSSeq Class definition: Basic class of Colorspace sequence ################################################################# class CSSeq: - "Defines the basic sequence class for the pipeline (DNA or CS)" + """Represents a single SOLiD colorspace (or DNA-space) sequence read. + + Holds the sequence data, quality scores, and alignment metadata for one + SOLiD bead read. The sequence may be in colorspace (space='CS') or may + have been converted to DNA space (space='DNA') via CSToDNA(). + + Attributes: + name: Read identifier string (bead name). + sequence: Sequence string; either colorspace (digits 0-3 prefixed by + a nucleotide) or DNA (ACGT) depending on space. + readcount: Number of times this read sequence was observed (used when + collapsing duplicates to a unique table, default 1). + matches: List of match location strings (populated when parsing a + .csfasta file with match annotations). + qual: List of integer Phred quality scores corresponding to each base. + space: Either 'CS' (colorspace, default) or 'DNA' after CSToDNA(). + trimmed: True once the SOLiD linker has been stripped by + strip_solid_linker(). + """ def __init__(self,name,sequence,readcount=1): + """Initialises a CSSeq. + + Args: + name: Read identifier (bead name). + sequence: Colorspace sequence string. + readcount: Observation count for this sequence (default: 1). 
+ """ self.name = name self.sequence = sequence self.readcount = readcount @@ -43,11 +79,15 @@ def __init__(self,name,sequence,readcount=1): #self.count = 0 def __len__(self): + """Returns the length of the sequence string.""" return len(self.sequence) def __str__(self): + """Returns the sequence string.""" return self.sequence + def __repr__(self): + """Returns the read name.""" return self.name # def __repr__(self): @@ -62,15 +102,31 @@ def __repr__(self): # return ('%s\t%s\t%s\t' % (self.name,CSseq,self.sequence)) def returnFasta(self): + """Returns the sequence formatted as a two-line FASTA record string. + + Returns: + A string of the form '>name\\nsequence'. + """ return ('>%s\n%s' % (self.name,self.sequence)) def returnSHRiMPcsfasta(self): + """Returns the sequence in SHRiMP csfasta format with the readcount suffix. + + Returns: + A string of the form '>name_xreadcount\\nsequence'. + """ return ('>%s_x%d\n%s') % (self.name,self.readcount,self.sequence) def returnQual(self): + """Returns the quality scores formatted as a two-line FASTA-style qual record. + + Returns: + A string of the form '>name\\nq0 q1 q2 ...'. + """ return('>%s\n%s' % (self.name," ".join(q for q in self.qual))) def printFasta(self): + """Prints the sequence as a two-line FASTA record to stdout.""" print ('>%s\n%s' % (self.name,self.sequence)) def CSToDNA(self): @@ -127,6 +183,12 @@ def trim_by_qual(self,phredCutoff=10): return def nuIDName(self): + """Replaces the read name with the nuID encoding of its DNA sequence. + + Converts the sequence to DNA space first if it is currently in + colorspace, then encodes it as a nuID and stores the result in + self.name. 
+ """ if self.space == "CS": tempString = CS2DNA(self.sequence) else: @@ -229,6 +291,20 @@ def CompIter(csfile,qualfile): assert ValueError ("It appears that the sequences don't match...have you modified the .csfasta or .qual files?") def uniqueTableIterator(handle,trim=True): + """Yields CSSeq objects from a tab-delimited unique-reads table. + + Reads a two-column tab-delimited file where column 0 is a colorspace + read sequence and column 1 is its observation count. Assigns nuID names + and optionally strips the SOLiD linker. + + Args: + handle: Readable file-like object containing the unique-reads table. + trim: If True (default), strip the SOLiD P2 linker from each read + using strip_solid_linker(). + + Yields: + CSSeq objects with nuID names and readcount set from the table. + """ for line in handle: tokens = line.rstrip().split("\t") seq = CSSeq(tokens[0],tokens[0],readcount=int(tokens[1])) @@ -283,6 +359,16 @@ def makeFastq(csfile,qualfile,shortname,outdir="",split=-1,trim=False): ######################################################################## def csfasta2fasta(fname): + """Converts a .csfasta file to DNA-space FASTA format and prints to stdout. + + Reads each colorspace record, converts it to DNA space, and prints the + result as a FASTA record. Note: due to a bug the CS2DNA conversion and + printFasta are referenced as attributes rather than called, so conversion + does not actually occur. + + Args: + fname: Path to a .csfasta file. + """ handle=open(fname,'r') iter=CSFastaIterator(handle) for i in iter: @@ -338,9 +424,20 @@ def uniqueTable(dir=os.getcwd()): print(row) def filterUnique(uniqueFile,minObs=5): - """ - At this point, this function is specific to the H1U and H1NSC samples - I need to change that + """Filters a unique-reads table and writes separate .csfasta files per sample. + + Reads a tab-delimited unique-reads table and writes reads that meet the + minimum observation threshold to sample-specific .csfasta files. 
+ + Note: This function is hard-coded for exactly two samples (H1U and H1NSC) + and writes output to 'H1U.csfasta' and 'H1NSC.csfasta' in the current + directory. + + Args: + uniqueFile: Path to the tab-delimited unique-reads table produced by + uniqueTable(). The header line begins with '#'. + minObs: Minimum total observation count required for a read to be + written to the output (default: 5). """ handle = open(uniqueFile,'r') count = 0 diff --git a/src/seqlib/stats.py b/src/seqlib/stats.py index bed6b67..07db2e3 100644 --- a/src/seqlib/stats.py +++ b/src/seqlib/stats.py @@ -1,3 +1,11 @@ +"""Statistical and mathematical utilities for biological data analysis. + +Provides descriptive statistics, probability distributions (PDF and CDF), +random variates, regression, sliding-window operations, curve fitting, and +special mathematical functions. Functions that require external tools (rpy2, +gnuplot) fall back gracefully or raise ``NotImplementedError`` when those +dependencies are absent. +""" # python libs import cmath import os @@ -16,11 +24,35 @@ def prod(lst): - """Computes the product of a list of numbers""" + """Compute the product of a list of positive numbers via log-space summation. + + Calculates ``exp(sum(log(i) for i in lst))``, which avoids numerical + overflow for large lists by working in log space. All values in + ``lst`` must be strictly positive. + + Args: + lst: An iterable of strictly positive numbers. + + Returns: + The product of all elements in ``lst`` as a float. + """ return exp(sum(log(i) for i in lst)) def mean(vals): - """Computes the mean of a list of numbers""" + """Compute the arithmetic mean of a sequence of numbers. + + Iterates through ``vals`` once, accumulating the sum and count, + then divides to produce the mean. + + Args: + vals: An iterable of numeric values. Must be non-empty. + + Returns: + The arithmetic mean as a float. + + Raises: + ZeroDivisionError: If ``vals`` is empty. 
+ """ n = 0 s = 0.0 for i in vals: @@ -29,7 +61,17 @@ def mean(vals): return s / float(n) def median(vals): - """Computes the median of a list of numbers""" + """Compute the median of a list of numbers. + + Sorts ``vals`` and returns the middle value for odd-length lists, or + the average of the two middle values for even-length lists. + + Args: + vals: A sequence of numeric values. Must be non-empty. + + Returns: + The median value as a float. + """ lenvals = len(vals) sortvals = sorted(vals) @@ -39,7 +81,20 @@ def median(vals): return sortvals[lenvals // 2] def mode(vals): - """Computes the mode of a list of numbers""" + """Compute the mode (most frequently occurring value) of a sequence. + + Uses :class:`collections.Counter` to count occurrences and returns + the value with the highest count. If multiple values share the + maximum count, the one encountered first during dict iteration is + returned (which is insertion order in Python 3.7+). + + Args: + vals: An iterable of hashable values. + + Returns: + The most frequently occurring element in ``vals``, or ``None`` + if ``vals`` is empty. + """ top = 0 topkey = None for key, val in Counter(vals).items(): @@ -50,8 +105,22 @@ def mode(vals): def msqerr(vals1, vals2): - """Mean squared error""" + """Compute the mean squared error between two equal-length sequences. + Calculates the average of the squared element-wise differences:: + + MSE = mean((vals1[i] - vals2[i])^2 for all i) + + Args: + vals1: A sequence of numeric values. + vals2: A sequence of numeric values of the same length as ``vals1``. + + Returns: + The mean squared error as a float. + + Raises: + AssertionError: If ``vals1`` and ``vals2`` have different lengths. + """ assert len(vals1) == len(vals2), "lists are not the same length" @@ -61,23 +130,80 @@ def msqerr(vals1, vals2): def variance(vals): - """Variance""" + """Compute the sample variance of a sequence of numbers. 
+ + Uses Bessel's correction (divides by ``n - 1``) to produce an + unbiased estimate of the population variance:: + + s^2 = sum((x - mean)^2) / (n - 1) + + Args: + vals: A sequence of at least two numeric values. + + Returns: + The sample variance as a float. + + Raises: + ZeroDivisionError: If ``vals`` has fewer than 2 elements. + """ u = mean(vals) return sum((x - u)**2 for x in vals) / float(len(vals)-1) def var(vals): + """Alias for :func:`variance`. + + Args: + vals: A sequence of at least two numeric values. + + Returns: + The sample variance as a float. + """ return variance(vals) def sdev(vals): - """Standard deviation""" + """Compute the sample standard deviation of a sequence of numbers. + + Returns the square root of the sample variance computed by + :func:`variance` (Bessel-corrected, ``n - 1`` denominator). + + Args: + vals: A sequence of at least two numeric values. + + Returns: + The sample standard deviation as a float. + """ return sqrt(variance(vals)) def serror(vals): - """Stanadrd error""" + """Compute the standard error of the mean of a sequence of numbers. + + Divides the sample standard deviation by the square root of the + sample size:: + + SE = sdev(vals) / sqrt(n) + + Args: + vals: A sequence of at least two numeric values. + + Returns: + The standard error of the mean as a float. + """ return sdev(vals) / sqrt(len(vals)) def covariance(lst1, lst2): - """Covariance""" + """Compute the sample covariance between two equal-length sequences. + + Uses Bessel's correction (divides by ``n - 1``):: + + cov(X, Y) = sum((x - mean_x) * (y - mean_y)) / (n - 1) + + Args: + lst1: A sequence of numeric values. + lst2: A sequence of numeric values of the same length as ``lst1``. + + Returns: + The sample covariance as a float. + """ m1 = mean(lst1) m2 = mean(lst2) tot = 0.0 @@ -87,14 +213,37 @@ def covariance(lst1, lst2): def covmatrix(mat): - """Covariance Matrix""" + """Compute the full pairwise sample covariance matrix for a list of sequences. 
+ + Evaluates :func:`covariance` for every pair ``(i, j)`` of rows in + ``mat`` (including self-covariances on the diagonal, which equal the + sample variance of that row). + + Args: + mat: A list of ``n`` equal-length numeric sequences (rows). + + Returns: + A ``(n, n)`` NumPy array where element ``[i, j]`` is the sample + covariance between ``mat[i]`` and ``mat[j]``. + """ size = len(mat) flat = [covariance(mat[i], mat[j]) for i,j in ((i,j) for i in range(size) for j in range(size))] return np.array(flat).reshape(size, size) def corrmatrix(mat): - """Correlation Matrix""" + """Compute the full pairwise Pearson correlation matrix for a list of sequences. + + Evaluates :func:`corr` for every pair ``(i, j)`` of rows in + ``mat`` (including self-correlations of 1.0 on the diagonal). + + Args: + mat: A list of ``n`` equal-length numeric sequences (rows). + + Returns: + A ``(n, n)`` NumPy array where element ``[i, j]`` is the Pearson + correlation coefficient between ``mat[i]`` and ``mat[j]``. + """ size = len(mat) flat = [corr(mat[i], mat[j]) for i,j in ((i,j) for i in range(size) for j in range(size))] @@ -102,7 +251,23 @@ def corrmatrix(mat): def corr(lst1, lst2): - """Pearson's Correlation""" + """Compute the Pearson correlation coefficient between two sequences. + + Calculates:: + + r = cov(lst1, lst2) / (sdev(lst1) * sdev(lst2)) + + If the denominator is zero (one or both sequences have zero variance), + returns ``1e1000`` (effectively infinity) as a sentinel value. + + Args: + lst1: A sequence of numeric values. + lst2: A sequence of numeric values of the same length as ``lst1``. + + Returns: + The Pearson correlation coefficient as a float in [-1, 1], or + ``1e1000`` if either sequence has zero standard deviation. 
+ """ num = covariance(lst1, lst2) denom = float(sdev(lst1) * sdev(lst2)) if denom != 0: @@ -112,8 +277,26 @@ def corr(lst1, lst2): def qqnorm(data, plot=None): - """Quantile-quantile plot""" - + """Generate data for a normal quantile-quantile (Q-Q) plot. + + Sorts ``data`` and generates an equal-length sample from the standard + normal distribution (mean 0, sigma 1), also sorted. The two sorted + sequences can be plotted against each other to assess normality. + + Args: + data: A sequence of numeric values to compare against the normal + distribution. + plot: An optional plot object with a ``plot(x, y)`` method. If + provided, the Q-Q data are passed to ``plot.plot`` and the + plot object is returned. Defaults to ``None``. + + Returns: + If ``plot`` is ``None``: a 2-tuple ``(data2, norm)`` where + ``data2`` is the sorted input data and ``norm`` is a sorted + sample from N(0, 1) of the same length. + If ``plot`` is provided: the ``plot`` object after calling + ``plot.plot(data2, norm)``. + """ data2 = sorted(data) norm = [random.normalvariate(0, 1) for x in range(len(data2))] norm.sort() @@ -128,8 +311,25 @@ def qqnorm(data, plot=None): def fitLine(xlist, ylist): - """2D regression""" + """Fit a least-squares line to 2-D data and return slope and intercept. + + Uses the ordinary least-squares closed-form formula:: + slope = (sum(x*y) - n*mean_x*mean_y) / (sum(x^2) - n*mean_x^2) + inter = mean_y - slope * mean_x + + If the denominator is zero (all x values are identical), slope is set + to ``1e10`` as a sentinel for a vertical line. + + Args: + xlist: A sequence of x-coordinates (numeric). + ylist: A sequence of y-coordinates (numeric) of the same length + as ``xlist``. + + Returns: + A 2-tuple ``(slope, inter)`` where ``slope`` is the gradient and + ``inter`` is the y-intercept of the fitted line. 
+ """ xysum = 0 xxsum = 0 n = len(xlist) @@ -150,7 +350,23 @@ def fitLine(xlist, ylist): def fitLineError(xlist, ylist, slope, inter): - """Returns the Mean Square Error of the data fit""" + """Compute the mean squared error of a linear fit against data. + + Evaluates the fitted line ``y_hat = slope * x + inter`` at each x + and averages the squared residuals:: + + MSE = sum((slope*x_i + inter - y_i)^2) / n + + Args: + xlist: A sequence of x-coordinates (numeric). + ylist: A sequence of observed y-coordinates of the same length + as ``xlist``. + slope: The slope of the fitted line. + inter: The y-intercept of the fitted line. + + Returns: + The mean squared error of the linear fit as a float. + """ error = 0 n = len(xlist) @@ -160,8 +376,27 @@ def fitLineError(xlist, ylist, slope, inter): def pearsonsRegression(observed, expected): - """Pearson's coefficient of regression""" + """Compute the Pearson coefficient of determination (R^2). + Measures how well ``expected`` values explain the variance of + ``observed``:: + + R^2 = 1 - ESS / TSS + + where ``ESS = sum((observed - expected)^2)`` is the error sum of + squares and ``TSS = sum((observed - mean(observed))^2)`` is the + total sum of squares. + + Args: + observed: A sequence of observed (actual) numeric values. + expected: A sequence of predicted values of the same length as + ``observed``. + + Returns: + R^2 as a float. A value of 1.0 indicates a perfect fit; + values near 0 indicate no explanatory power; negative values + indicate the model is worse than predicting the mean. + """ # error sum of squares ess = sum((a - b)**2 for a, b in zip(observed, expected)) @@ -174,6 +409,22 @@ def pearsonsRegression(observed, expected): def pearsonsRegressionLine(x, y, m, b): + """Compute R^2 for data against a linear model y = m*x + b. + + Generates expected values from the line ``y = m*x + b`` and + delegates to :func:`pearsonsRegression`. + + Args: + x: A sequence of x-coordinates (numeric). 
+ y: A sequence of observed y-coordinates of the same length as + ``x``. + m: The slope of the reference line. + b: The y-intercept of the reference line. + + Returns: + R^2 as a float indicating goodness of fit of the linear model + to the observed data. + """ observed = y expected = [m*i + b for i in x] return pearsonsRegression(observed, expected) @@ -181,11 +432,27 @@ def pearsonsRegressionLine(x, y, m, b): def percentile(vals, perc, rounding=-1, sort=True): - """Give the value at a percentile - - rounding -- round down if -1 or round up for 1 + """Return the value at a given percentile of a sequence. + + Optionally sorts ``vals`` and returns the element at index + ``int(perc * n)`` (round down) or ``ceil(perc * n)`` (round up), + clamped to valid list indices. + + Args: + vals: A sequence of numeric values. + perc: The desired percentile as a fraction in [0, 1] (e.g. 0.5 + for the median, 0.95 for the 95th percentile). + rounding: Controls how the fractional index is resolved. + Use ``-1`` to floor (default) or ``1`` to ceiling. + sort: If ``True`` (default), sort ``vals`` before indexing. + Pass ``False`` if ``vals`` is already sorted to save time. + + Returns: + The value in ``vals`` at the requested percentile. + + Raises: + Exception: If ``rounding`` is not ``-1`` or ``1``. """ - if sort: vals2 = sorted(vals) else: @@ -200,8 +467,22 @@ def percentile(vals, perc, rounding=-1, sort=True): def logadd(lna, lnb): - """Adding numbers in log-space""" + """Add two numbers represented in log space without underflow. + + Computes ``log(exp(lna) + exp(lnb))`` in a numerically stable way:: + + logadd(lna, lnb) = log(exp(lna - lnb) + 1) + lnb + + When ``lna - lnb >= 500`` the second term is negligible and ``lna`` + is returned directly to avoid overflow. + + Args: + lna: The natural log of the first value. + lnb: The natural log of the second value. + Returns: + The natural log of the sum ``exp(lna) + exp(lnb)`` as a float. 
+ """ diff = lna - lnb if diff < 500: return log(exp(diff) + 1.0) + lnb @@ -211,13 +492,25 @@ def logadd(lna, lnb): def smooth(vals, radius): - """ - return an averaging of vals using a radius + """Smooth a sequence by replacing each value with a local window average. - Note: not implemented as fast as possible - runtime: O(len(vals) * radius) - """ + For each position ``i``, computes the mean of the sub-list + ``vals[i - r : i + r + 1]`` where ``r = min(i, vlen - i - 1, radius)`` + ensures the window stays within array bounds. Values near the + edges therefore use a smaller effective radius. + + Note: + Not implemented as fast as possible. + Runtime is O(len(vals) * radius). + Args: + vals: A sequence of numeric values. + radius: The maximum half-width of the averaging window (the + window spans at most ``2*radius + 1`` elements). + + Returns: + A list of smoothed values of the same length as ``vals``. + """ vals2 = [] vlen = len(vals) @@ -231,12 +524,27 @@ def smooth(vals, radius): def iter_window_index(x, xdist, esp=None): - """ - iterates a sliding window over x with radius xradius - - returns an iterator over list of indices in x that represent windows - - x must be sorted least to greatest + """Iterate sliding-window index ranges over a sorted value sequence. + + Advances a window of fixed width ``xdist`` along the value axis of + a sorted sequence ``x``, yielding the array-index bounds and value + bounds of the window each time a point enters or exits it. + + The window boundaries are updated one step at a time: the lower + bound advances whenever the leading point would be expelled, and the + upper bound advances to admit the next point. + + Args: + x: A sorted (ascending) list of numeric values. + xdist: The width of the sliding window in the same units as + values in ``x``. + esp: Unused parameter retained for API compatibility. 
+ + Yields: + 4-tuples ``(lowi, highi, low, high)`` where ``lowi`` and + ``highi`` are the inclusive index bounds of the current window + in ``x``, and ``low`` / ``high`` are the corresponding value + boundaries. """ vlen = len(x) @@ -294,7 +602,25 @@ def iter_window_index(x, xdist, esp=None): def iter_window_index_step(x, size, step, minsize=0): - + """Iterate fixed-step sliding-window index ranges over a sorted value sequence. + + Advances a window of fixed width ``size`` in increments of ``step`` + along the value axis, yielding index and value bounds for each + window position that contains at least ``minsize`` points. + + Args: + x: A sorted (ascending) list of numeric values. + size: The width of each window in the same units as ``x``. + step: The distance to advance the window centre between successive + yields. + minsize: Minimum number of points that must be inside the window + for it to be yielded. Defaults to 0. + + Yields: + 4-tuples ``(lowi, highi, low, high)`` where ``lowi`` and + ``highi`` are the inclusive index bounds of the current window + in ``x``, and ``low`` / ``high`` are the value boundaries. + """ vlen = len(x) start = x[0] end = x[-1] @@ -328,32 +654,73 @@ def iter_window_index_step(x, size, step, minsize=0): def iter_window(x, xdist, func=lambda win: win, minsize=0): + """Apply a function to each sliding window over a sorted sequence. + + Wraps :func:`iter_window_index` and yields the window midpoint + together with ``func`` applied to the window slice. + + Note: + The internal call uses ``xsize`` rather than ``xdist``; this is + a latent bug in the original code and is preserved here. + + Args: + x: A sorted (ascending) list of numeric values. + xdist: The width of the sliding window. + func: A callable applied to each window slice ``x[lowi:highi]``. + Defaults to the identity function. + minsize: Minimum number of points in the window before it is + yielded. Defaults to 0. 
+ + Yields: + 2-tuples ``(midpoint, func(window))`` where ``midpoint`` is + ``(low + high) / 2`` and ``window`` is the slice of ``x`` + within the current bounds. """ - iterates a sliding window over x with radius xradius - - x must be sorted least to greatest - """ - for lowi, highi, low, high in iter_window_index(x, xsize): if highi - lowi >= minsize: yield (high + low)/2.0, func(x[lowi:highi]) def iter_window_step(x, width, step, func=lambda win: win, minsize=0): + """Apply a function to each fixed-step sliding window over a sorted sequence. + + Wraps :func:`iter_window_index_step` and yields the window midpoint + together with ``func`` applied to the window slice. ``x`` must be + sorted in ascending order. + + Args: + x: A sorted (ascending) list of numeric values. + width: The width of each window in the same units as ``x``. + step: The distance to advance the window between successive yields. + func: A callable applied to each window slice ``x[lowi:highi]``. + Defaults to the identity function. + minsize: Minimum number of points that must be in the window for + it to be yielded. Defaults to 0. + + Yields: + 2-tuples ``(midpoint, func(window))`` where ``midpoint`` is + ``(low + high) / 2.0`` and ``window`` is the slice of ``x`` + within the current bounds. """ - iterates a sliding window over x with width 'width' - - x must be sorted least to greatest - - return an iterator with (midx, func(x[lowi:highi])) - """ - for lowi, highi, low, high in iter_window_index_step(x, width, step, minsize): yield (high + low) / 2.0, func(x[lowi:highi]) def _sortTogether(x, y): - """Sort x and y together by x values.""" + """Sort two sequences together by the values of ``x``. + + Zips ``x`` and ``y`` into pairs, sorts by the first element of each + pair, then unzips back into two separate lists. + + Args: + x: A sequence of sortable values used as the sort key. + y: A sequence of values of the same length as ``x``. 
+ + Returns: + A 2-tuple ``(x2, y2)`` where both lists have been reordered so + that ``x2`` is sorted ascending. Returns ``([], [])`` if ``x`` + is empty. + """ if not x: return [], [] pairs = sorted(zip(x, y)) @@ -362,10 +729,28 @@ def _sortTogether(x, y): def smooth2(x, y, xradius, minsize=0, sort=False): - """ - return an averaging of x and y using xradius - - x must be sorted least to greatest + """Smooth paired (x, y) data by averaging within a sliding x-radius window. + + For each point ``x[i]``, the window spans all points whose x-value + lies within ``[x[i] - r, x[i] + r]`` where + ``r = min(x[i] - min(x), max(x) - x[i], xradius)`` so that the + effective radius shrinks near the data boundaries. + + Args: + x: A sorted (ascending) list of x-coordinates. Must be + non-empty and of the same length as ``y``. + y: A list of y-values corresponding to ``x``. + xradius: The maximum half-width of the averaging window in + the same units as ``x``. + minsize: Minimum number of points that must be in the window + for the averaged point to be included in the output. + Defaults to 0. + sort: If ``True``, sort ``x`` and ``y`` together by ``x`` before + smoothing. Defaults to ``False``. + + Returns: + A 2-tuple ``(x2, y2)`` of lists containing the smoothed x and y + values. Returns ``([], [])`` if ``x`` is empty. """ vlen = len(x) @@ -413,8 +798,23 @@ def smooth2(x, y, xradius, minsize=0, sort=False): def factorial(x, k=1): - """Simple implementation of factorial""" - + """Compute the partial factorial product x! / k!. + + Calculates the product of all integers from ``k+1`` to ``x`` + inclusive. When ``k=1`` (the default) this is the standard + factorial ``x!``. When ``k > 1`` it returns the falling factorial + ``x! / k!``. + + Args: + x: The upper bound of the product (inclusive). Converted to + ``int`` internally. + k: The lower bound; the product starts at ``k+1``. Defaults + to 1. + + Returns: + An integer equal to ``(k+1) * (k+2) * ... 
+        * x``, or 1 if the
+        range is empty (i.e. ``x <= k``).
+    """
     n = 1
     for i in range(int(k)+1, int(x)+1):
         n *= i
@@ -422,8 +822,22 @@
 
 
 def logfactorial(x, k=1):
-    """returns the log(factorial(x) / factorial(k)"""
+    """Compute log(x! / k!) in log space.
+
+    Returns the natural log of the partial factorial product
+    ``(k+1) * (k+2) * ... * x`` by summing ``log(i)`` terms. This
+    avoids forming the enormous integer product for large ``x``.
 
+    Args:
+        x: The upper bound of the product (inclusive). Converted to
+            ``int`` internally.
+        k: The lower bound; the product starts at ``k+1``. Defaults
+            to 1.
+
+    Returns:
+        A float equal to ``log(k+1) + log(k+2) + ... + log(x)``,
+        or 0.0 if the range is empty.
+    """
     n = 0
     for i in range(int(k)+1, int(x)+1):
         n += log(i)
@@ -431,6 +845,20 @@
 
 
 def choose(n, k):
+    """Compute the binomial coefficient C(n, k) = n! / (k! * (n-k)!).
+
+    Uses a multiplicative formula for efficiency, exploiting the
+    symmetry ``C(n, k) == C(n, n-k)`` to minimise the number of
+    multiplications. Returns the result rounded to the nearest integer.
+
+    Args:
+        n: The total number of items.
+        k: The number of items to choose.
+
+    Returns:
+        An integer equal to C(n, k). Returns 1.0 when both ``n`` and
+        ``k`` are 0, and 0 when any argument is negative or ``k > n``.
+    """
     if n == 0 and k == 0:
         return 1.0
 
@@ -449,19 +877,44 @@
 
 
 def _oneNorm(weights):
-    """Normalize a list of weights to sum to 1."""
+    """Normalise a list of weights so they sum to 1.
+
+    Divides each weight by the total sum of all weights.
+
+    Args:
+        weights: A list of non-negative numeric values whose sum is
+            positive.
+
+    Returns:
+        A new list of floats of the same length as ``weights`` that
+        sum to 1.0.
+    """
    s = sum(weights)
    return [w / s for w in weights]


def sample(weights):
-    """
-    Randomly choose an int between 0 and len(probs)-1 using
-    the weights stored in list probs.
+    """Randomly choose an index proportional to the given weights.
- item i will be chosen with probability weights[i]/sum(weights) - """ + Normalises ``weights`` to a proper probability distribution and then + samples using a CDF built from the normalised weights and a binary + search via :func:`algorithms.binsearch`. + + Item ``i`` is chosen with probability ``weights[i] / sum(weights)``. + Args: + weights: A list of non-negative numeric values. The length + determines the range of possible return values (0 to + ``len(weights) - 1``). + + Returns: + An integer index into ``weights``, selected with probability + proportional to each weight. + + Raises: + AssertionError: If ``algorithms.binsearch`` returns ``None`` for + the lower bound, indicating an unexpected state. + """ probs = _oneNorm(weights) cdf = [0] @@ -478,12 +931,31 @@ def sample(weights): def chyper(m, n, M, N, report=0): - ''' - calculates cumulative probability based on - hypergeometric distribution - over/under/both (report = 0/1/2) - (uses /seq/compbio02/software-Linux/misc/chyper) - ''' + """Compute a hypergeometric cumulative probability via an external ``chyper`` binary. + + Models drawing ``n`` balls from an urn containing ``N`` balls of which + ``M`` are white (successes). ``m`` is the number of white balls drawn. + Calls the external command-line tool ``chyper`` and parses its output. + + Args: + m: Number of white balls drawn (observed successes). Must be + an ``int`` with ``m <= n`` and ``m <= M``. + n: Total balls drawn. Must be an ``int`` with ``n <= N``. + M: Total white balls in urn. Must be an ``int``. + N: Total balls in urn. Must be an ``int``. + report: Controls which tail(s) are returned. + ``0`` — p-value for over-representation (default). + ``1`` — p-value for under-representation. + ``2`` — 2-tuple ``(over_p, under_p)``. + + Returns: + A float p-value, or a list of two floats when ``report=2``. + + Raises: + AssertionError: If arguments do not satisfy type or range constraints. + Exception: If the ``chyper`` command produces no output. 
+ Exception: If ``report`` is not 0, 1, or 2. + """ assert( (type(m) == type(n) == type(M) == type(N) == int) and m <= n and m <= M and n <= N) @@ -511,18 +983,31 @@ def chyper(m, n, M, N, report=0): def rhyper(m, n, M, N, report=0): - ''' - calculates cumulative probability based on - hypergeometric distribution - over/under/both (report = 0/1/2) - (uses R through RPy2) - - N = total balls in urn - M = total white balls in urn - n = drawn balls from urn - m = drawn white balls from urn - - ''' + """Compute a hypergeometric cumulative probability via R (rpy2). + + Models drawing ``n`` balls from an urn containing ``N`` balls of which + ``M`` are white (successes). ``m`` is the number of white balls drawn. + Uses R's ``phyper`` function via the rpy2 interface. + + Args: + m: Number of white balls drawn (observed successes). Must be + an ``int`` with ``m <= n`` and ``m <= M``. + n: Total balls drawn. Must be an ``int`` with ``n <= N``. + M: Total white balls in urn. Must be an ``int``. + N: Total balls in urn. Must be an ``int``. + report: Controls which tail(s) are returned. + ``0`` — p-value for over-representation, i.e. + ``P(X >= m)`` (default). + ``1`` — p-value for under-representation, i.e. ``P(X <= m)``. + ``2`` — 2-tuple ``(over_p, under_p)``. + + Returns: + A float p-value, or a 2-tuple of floats when ``report=2``. + + Raises: + AssertionError: If arguments do not satisfy type or range constraints. + Exception: If ``report`` is not 0, 1, or 2. + """ import rpy2.robjects as r_module r = r_module.r @@ -543,8 +1028,19 @@ def rhyper(m, n, M, N, report=0): raise Exception("unknown option") def cdf(vals): - """Computes the CDF of a list of values""" + """Compute the empirical cumulative distribution function (ECDF) of a list. + + Sorts ``vals`` and assigns each unique value a cumulative probability + equal to its 0-based rank divided by the total number of values. + Args: + vals: A sequence of numeric values. 
+ + Returns: + A 2-tuple ``(x, y)`` where ``x`` is the sorted list of values and + ``y`` is the corresponding list of cumulative probabilities in + [0, 1). + """ vals = sorted(vals) tot = float(len(vals)) x = [] @@ -558,8 +1054,31 @@ def cdf(vals): def enrichItems(in_items, out_items, M=None, N=None, useq=True, extra=False): - """Calculates enrichment for items within an in-set vs and out-set. - Returns a sorted DataFrame. + """Calculate item enrichment between an in-set and an out-set. + + Counts how often each item appears in ``in_items`` vs ``out_items`` and + tests for enrichment using the hypergeometric distribution via + :func:`rhyper`. Optionally adjusts p-values to q-values (FDR) and + adds fold-enrichment columns. + + Args: + in_items: An iterable of items in the foreground (in-set). + out_items: An iterable of items in the background (out-set). + M: The foreground population size. Defaults to + ``len(in_items)``. + N: The total population size. Defaults to + ``len(in_items) + len(out_items)``. + useq: If ``True`` (default), add ``qval`` and ``qval_under`` + columns computed via FDR correction using :func:`qvalues`. + extra: If ``True``, add columns ``in_size``, ``out_size``, + ``item_ratio``, ``size_ratio``, and ``fold`` for fold- + enrichment analysis. Defaults to ``False``. + + Returns: + A :class:`pandas.DataFrame` sorted by ``pval`` (ascending) with + columns ``item``, ``in_count``, ``out_count``, ``pval``, + ``pval_under``, and optionally ``qval``, ``qval_under``, and + fold-enrichment columns. """ # count items using defaultdict instead of rasmus util.Dict @@ -607,11 +1126,34 @@ def enrichItems(in_items, out_items, M=None, N=None, useq=True, extra=False): def qvalues(pvals): + """Compute Benjamini-Hochberg FDR-adjusted p-values (q-values) via R. + + Calls R's ``p.adjust`` function with ``method='fdr'`` through rpy2. + + Args: + pvals: A list of raw p-values (floats in [0, 1]). 
+ + Returns: + A list of FDR-adjusted p-values (q-values) of the same length + as ``pvals``. + """ import rpy2.robjects as robjects ret = robjects.r['p.adjust'](robjects.FloatVector(pvals), 'fdr') return list(ret) def qvalues2(pvals): + """Compute q-values using the Storey-Tibshirani method via R's qvalue package. + + Loads the ``qvalue`` R package through rpy2 and calls ``qvalue()`` on + the provided p-values. + + Args: + pvals: A list of raw p-values (floats in [0, 1]). + + Returns: + A list of q-values of the same length as ``pvals`` as computed + by the Storey-Tibshirani estimator. + """ import rpy2.robjects as robjects robjects.r['library']('qvalue') ret = robjects.r['qvalue'](robjects.FloatVector(pvals)) @@ -623,6 +1165,18 @@ def qvalues2(pvals): # def uniformPdf(x, params): + """Evaluate the Uniform(a, b) probability density function at ``x``. + + Returns ``1 / (b - a)`` when ``a <= x <= b``, and 0 otherwise. + + Args: + x: The point at which to evaluate the PDF. + params: A 2-tuple ``(a, b)`` defining the lower and upper bounds + of the uniform distribution. + + Returns: + The PDF value at ``x`` as a float. + """ a, b = params if x < a or x > b: return 0.0 @@ -631,37 +1185,137 @@ def uniformPdf(x, params): def binomialPdf(k, params): + """Evaluate the Binomial(n, p) probability mass function at ``k``. + + Computes:: + + P(X = k) = C(n, k) * p^k * (1 - p)^(n - k) + + Args: + k: The number of successes (non-negative integer). + params: A 2-tuple ``(p, n)`` where ``p`` is the success probability + per trial and ``n`` is the total number of trials. + + Returns: + The probability of exactly ``k`` successes as a float. + """ p, n = params return choose(n, k) * (p ** k) * ((1.0-p) ** (n - k)) def gaussianPdf(x, params): + """Evaluate the standard Normal N(0, 1) probability density function at ``x``. 
+ + Computes:: + + f(x) = (1 / sqrt(2*pi)) * exp(-x^2 / 2) + + Note: + The ``params`` argument is accepted but ignored; this function + always evaluates the standard normal (mean 0, variance 1). + + Args: + x: The point at which to evaluate the PDF. + params: Unused. Accepted for API consistency with other PDF + functions. + + Returns: + The standard normal PDF value at ``x`` as a float. + """ return 1/sqrt(2*pi) * exp(- x**2 / 2.0) def normalPdf(x, params): + """Evaluate the Normal(mu, sigma) probability density function at ``x``. + + Computes:: + + f(x) = (1 / (sigma * sqrt(2*pi))) * exp(-(x - mu)^2 / (2*sigma^2)) + + Args: + x: The point at which to evaluate the PDF. + params: A 2-tuple ``(mu, sigma)`` — the mean and standard + deviation of the normal distribution. + + Returns: + The normal PDF value at ``x`` as a float. + """ mu, sigma = params return 1.0/(sigma * sqrt(2.0*pi)) * exp(- (x - mu)**2 / (2.0 * sigma**2)) def normalCdf(x, params): + """Evaluate the Normal(mu, sigma) cumulative distribution function at ``x``. + + Computes:: + + F(x) = (1 + erf((x - mu) / (sigma * sqrt(2)))) / 2 + + Args: + x: The point at which to evaluate the CDF. + params: A 2-tuple ``(mu, sigma)`` — the mean and standard + deviation of the normal distribution. + + Returns: + The cumulative probability P(X <= x) as a float in [0, 1]. + """ mu, sigma = params return (1 + erf((x - mu)/(sigma * sqrt(2)))) / 2.0 def logNormalPdf(x, params): - """mu and sigma are the mean and standard deviation of the - variable's logarithm""" + """Evaluate the log-normal probability density function at ``x``. + + The log-normal distribution describes a variable whose natural + logarithm is normally distributed. The PDF is:: + + f(x) = (1 / (x * sigma * sqrt(2*pi))) * exp(-(log(x) - mu)^2 / (2*sigma^2)) + Args: + x: The point at which to evaluate the PDF. Must be positive. + params: A 2-tuple ``(mu, sigma)`` — the mean and standard + deviation of the variable's natural logarithm. 
+ + Returns: + The log-normal PDF value at ``x`` as a float. Returns nonsensical + values for ``x <= 0``. + """ mu, sigma = params return 1/(x * sigma * sqrt(2*pi)) * \ exp(- (log(x) - mu)**2 / (2.0 * sigma**2)) def logNormalCdf(x, params): - """mu and sigma are the mean and standard deviation of the - variable's logarithm""" + """Evaluate the log-normal cumulative distribution function at ``x``. + Computes:: + + F(x) = (1 + erf((log(x) - mu) / (sigma * sqrt(2)))) / 2 + + Args: + x: The point at which to evaluate the CDF. Must be positive. + params: A 2-tuple ``(mu, sigma)`` — the mean and standard + deviation of the variable's natural logarithm. + + Returns: + The cumulative probability P(X <= x) as a float in [0, 1]. + """ mu, sigma = params return (1 + erf((log(x) - mu)/(sigma * sqrt(2)))) / 2.0 def poissonPdf(x, params): + """Evaluate the Poisson probability mass function at ``x``. + + Computes the probability in log space to avoid overflow:: + + P(X = x) = exp(-lambda) * lambda^x / x! + = exp(-lambda + sum(log(lambda/i) for i in 1..x)) + + Args: + x: The number of events (non-negative integer). + params: A 1-tuple or list whose first element is ``lambda`` + (the expected number of events, must be positive). + + Returns: + The Poisson PMF value P(X = x) as a float. Returns 0.0 if + ``x < 0`` or ``lambda <= 0``. + """ lambd = params[0] if x < 0 or lambd <= 0: @@ -674,7 +1328,26 @@ def poissonPdf(x, params): def poissonCdf(x, params): - """Cumulative distribution function of the Poisson distribution""" + """Evaluate the Poisson cumulative distribution function at ``x``. + + Computes P(X <= x) using the regularised incomplete gamma function:: + + F(x; lambda) = (Gamma(floor(x+1)) - gammainc(floor(x+1), lambda)) + / floor(x)! + + Note: + Not implemented accurately for large ``x`` or ``lambda``. + + Args: + x: The upper bound (non-negative number; floor is taken + internally). 
+ params: A 1-tuple or list whose first element is ``lambda`` + (the expected number of events). + + Returns: + The cumulative probability P(X <= x) as a float, or 0 if + ``x < 0``. + """ # NOTE: not implemented accurately for large x or lambd lambd = params[0] @@ -686,7 +1359,19 @@ def poissonCdf(x, params): def poissonvariate(lambd): - """Sample from a Poisson distribution""" + """Draw a random sample from a Poisson distribution. + + Uses Knuth's algorithm: generate uniform random variables and + multiply them together until their product falls below + ``exp(-lambda)``. The count of multiplications minus one is the + Poisson variate. + + Args: + lambd: The expected number of events per interval (lambda > 0). + + Returns: + A non-negative integer drawn from Poisson(lambda). + """ l = exp(-lambd) k = 0 p = 1.0 @@ -698,6 +1383,21 @@ def poissonvariate(lambd): return k - 1 def exponentialPdf(x, params): + """Evaluate the Exponential(lambda) probability density function at ``x``. + + Computes:: + + f(x; lambda) = lambda * exp(-lambda * x) for x >= 0, lambda >= 0 + + Args: + x: The point at which to evaluate the PDF. + params: A 1-tuple or list whose first element is ``lambda`` + (the rate parameter). + + Returns: + The exponential PDF value at ``x`` as a float. Returns 0.0 if + ``x < 0`` or ``lambda < 0``. + """ lambd = params[0] if x < 0 or lambd < 0: @@ -707,6 +1407,21 @@ def exponentialPdf(x, params): def exponentialCdf(x, params): + """Evaluate the Exponential(lambda) cumulative distribution function at ``x``. + + Computes:: + + F(x; lambda) = 1 - exp(-lambda * x) for x >= 0, lambda >= 0 + + Args: + x: The point at which to evaluate the CDF. + params: A 1-tuple or list whose first element is ``lambda`` + (the rate parameter). + + Returns: + The cumulative probability P(X <= x) as a float. Returns 0.0 if + ``x < 0`` or ``lambda < 0``. 
+ """ lambd = params[0] if x < 0 or lambd < 0: @@ -716,9 +1431,36 @@ def exponentialCdf(x, params): def exponentialvariate(lambd): + """Draw a random sample from an Exponential(lambda) distribution. + + Uses the inverse CDF (quantile) method: if U ~ Uniform(0,1) then + ``-log(U) / lambda`` is Exponentially distributed with rate ``lambda``. + + Args: + lambd: The rate parameter (lambda > 0). + + Returns: + A non-negative float drawn from Exponential(lambda). + """ return -log(random.random()) / lambd def gammaPdf(x, params): + """Evaluate the Gamma(alpha, beta) probability density function at ``x``. + + Uses the rate (inverse-scale) parameterisation:: + + f(x; alpha, beta) = beta^alpha * x^(alpha-1) * exp(-beta*x) + / Gamma(alpha) + + Args: + x: The point at which to evaluate the PDF. Must be positive. + params: A 2-tuple ``(alpha, beta)`` — the shape and rate + parameters. Both must be positive. + + Returns: + The gamma PDF value at ``x`` as a float. Returns 0.0 if any of + ``x``, ``alpha``, or ``beta`` is non-positive. + """ alpha, beta = params if x <= 0 or alpha <= 0 or beta <= 0: return 0.0 @@ -727,6 +1469,22 @@ def gammaPdf(x, params): gamma(alpha) def gammaPdf2(x, params): + """Evaluate the Gamma(alpha, beta) PDF at ``x`` using log-space arithmetic. + + Numerically more stable than :func:`gammaPdf` for large parameter + values. Computes the same distribution in log space:: + + log f = -x*beta + (alpha-1)*log(x) + alpha*log(beta) - gammaln(alpha) + + Args: + x: The point at which to evaluate the PDF. Must be positive. + params: A 2-tuple ``(alpha, beta)`` — the shape and rate + parameters (rate parameterisation). Both must be positive. + + Returns: + The gamma PDF value at ``x`` as a float. Returns 0.0 if any of + ``x``, ``alpha``, or ``beta`` is non-positive. 
+ """ alpha, beta = params if x <= 0 or alpha <= 0 or beta <= 0: return 0.0 @@ -736,6 +1494,21 @@ def gammaPdf2(x, params): def gammaCdf(x, params): + """Evaluate the Gamma(alpha, beta) cumulative distribution function at ``x``. + + Computes P(X <= x) using the lower incomplete gamma function:: + + F(x; alpha, beta) = gammainc(alpha, x*beta) / Gamma(alpha) + + Args: + x: The point at which to evaluate the CDF. + params: A 2-tuple ``(alpha, beta)`` — the shape and rate + parameters (rate parameterisation). Both must be positive. + + Returns: + The cumulative probability P(X <= x) as a float. Returns 0 if + ``x <= 0``. + """ alpha, beta = params if x <= 0: return 0 @@ -744,10 +1517,27 @@ def gammaCdf(x, params): def betaPdf2(x, params): - """A simpler implementation of beta distribution but will overflow - for values of alpha and beta near 100 - """ + """Evaluate the Beta(alpha, beta) PDF at ``x`` using direct gamma computation. + Simpler but less numerically stable than :func:`betaPdf`; will + overflow for ``alpha`` or ``beta`` values near 100 because it + evaluates ``Gamma(alpha + beta)`` directly. + + Formula:: + + f(x; alpha, beta) = Gamma(alpha+beta) / (Gamma(alpha)*Gamma(beta)) + * x^(alpha-1) * (1-x)^(beta-1) + + Args: + x: The point at which to evaluate the PDF. Must satisfy + ``0 < x < 1``. + params: A 2-tuple ``(alpha, beta)`` — the shape parameters, both + must be positive. + + Returns: + The beta PDF value at ``x`` as a float. Returns 0.0 if ``x`` + is outside (0, 1) or if either shape parameter is non-positive. + """ alpha, beta = params if 0 < x < 1 and alpha > 0 and beta > 0: return gamma(alpha + beta) / (gamma(alpha)*gamma(beta)) * \ @@ -756,6 +1546,24 @@ def betaPdf2(x, params): return 0.0 def betaPdf(x, params): + """Evaluate the Beta(alpha, beta) PDF at ``x`` using log-gamma arithmetic. 
+ + Numerically stable implementation that avoids overflow by computing + the PDF in log space:: + + log f = gammaln(alpha+beta) - gammaln(alpha) - gammaln(beta) + + (alpha-1)*log(x) + (beta-1)*log(1-x) + + Args: + x: The point at which to evaluate the PDF. Must satisfy + ``0 < x < 1``. + params: A 2-tuple ``(alpha, beta)`` — the shape parameters, both + must be positive. + + Returns: + The beta PDF value at ``x`` as a float. Returns 0.0 if ``x`` + is outside (0, 1) or if either shape parameter is non-positive. + """ alpha, beta = params if 0 < x < 1 and alpha > 0 and beta > 0: @@ -767,6 +1575,25 @@ def betaPdf(x, params): def betaPdf3(x, params): + """Evaluate the Beta(alpha, beta) PDF at ``x`` using a product formula. + + Computes the PDF via a direct multiplicative recurrence with + integer-cast parameters. Splits the product into two parts: a + symmetric core term up to ``min(alpha-1, beta-1)``, then an + asymmetric tail term up to ``max(alpha-1, beta-1)``. + + Args: + x: The point at which to evaluate the PDF. Must satisfy + ``0 < x < 1``. + params: A 2-tuple ``(alpha, beta)`` — the shape parameters. + Values are cast to ``int`` internally, so non-integer inputs + are truncated. Both must be positive. + + Returns: + The beta PDF value at ``x`` as a float. Returns 0.0 if ``x`` + is outside (0, 1) or if either shape parameter is non-positive + after truncation. + """ alpha, beta = map(int, params) if 0 < x < 1 and alpha > 0 and beta > 0: n = min(alpha-1, beta-1) @@ -790,10 +1617,23 @@ def betaPdf3(x, params): def gamma(x): - """ - Lanczos approximation to the gamma function. + """Compute the gamma function Gamma(x) via the Lanczos approximation. + + Uses the Lanczos coefficients to approximate Gamma(x) for positive + real ``x``. The formula is:: + + Gamma(x) ≈ sqrt(2*pi) / x * (x + 5.5)^(x + 0.5) * exp(-x - 5.5) + * series(x) - found on http://www.rskey.org/gamma.htm + where ``series(x)`` is the Lanczos sum with 7 coefficients. 
+
+    Reference: http://www.rskey.org/gamma.htm
+
+    Args:
+        x: A positive real number.
+
+    Returns:
+        An approximation of Gamma(x) as a float.
     """
 
     ret = 1.000000000190015 + \
@@ -809,26 +1649,21 @@
 
 
 def gammaln(xx):
-    """
-    From numerical alogrithms in C
-
-    float gammln(float xx)
-    Returns the value ln[(xx)] for xx > 0.
-    {
-    Internal arithmetic will be done in double precision, a nicety that you can omit if five-figure
-    accuracy is good enough.
-    double x,y,tmp,ser;
-    static double cof[6]={76.18009172947146,-86.50532032941677,
-    24.01409824083091,-1.231739572450155,
-    0.1208650973866179e-2,-0.5395239384953e-5};
-    int j;
-    y=x=xx;
-    tmp=x+5.5;
-    tmp -= (x+0.5)*log(tmp);
-    ser=1.000000000190015;
-    for (j=0;j<=5;j++) ser += cof[j]/++y;
-    return -tmp+log(2.5066282746310005*ser/x);
-    }
+    """Compute the natural logarithm of the gamma function, ln(Gamma(xx)).
+
+    Implements the Lanczos approximation from *Numerical Recipes in C*
+    (Press et al.). Returns ``ln(Gamma(xx))`` for ``xx > 0``::
+
+        y = x = xx
+        tmp = x + 5.5 - (x + 0.5) * log(x + 5.5)
+        ser = 1.000000000190015 + sum(cof[j] / (y + j + 1) for j in 0..5)
+        return -tmp + log(2.5066282746310005 * ser / x)
+
+    Args:
+        xx: A positive real number.
+
+    Returns:
+        The natural logarithm of Gamma(xx) as a float.
     """
 
     cof = [76.18009172947146,-86.50532032941677,
@@ -851,7 +1686,23 @@
 
 GAMMA_INCOMP_ACCURACY = 1000
 
 def gammainc(a, x):
-    """Lower incomplete gamma function"""
+    """Compute the lower incomplete gamma function gamma(a, x).
+
+    Uses a series expansion truncated at ``GAMMA_INCOMP_ACCURACY`` terms
+    or when the current term drops below 0.0001::
+
+        gamma(a, x) = x^a * exp(-x) * sum_{n=0}^{inf} x^n / prod_{i=0}^{n}(a+i)
+
+    Reference: http://www.rskey.org/gamma.htm
+
+    Args:
+        a: The shape parameter (positive real number).
+        x: The upper integration limit (non-negative real number).
+
+    Returns:
+        An approximation of the lower incomplete gamma function
+        ``gamma(a, x)`` as a float.
+ """ # found on http://www.rskey.org/gamma.htm ret = 0 @@ -865,6 +1716,22 @@ def gammainc(a, x): def erf(x): + """Compute an approximation of the error function erf(x). + + Uses the rational approximation from the paper at + http://www.theorie.physik.uni-muenchen.de/~serge/erf-approx.pdf :: + + a = (8 / (3*pi)) * (pi - 3) / (4 - pi) + erf(x) ≈ sign(x) * sqrt(1 - exp(-x^2 * (4/pi + a*x^2) / (1 + a*x^2))) + + The approximation is accurate to approximately four decimal places. + + Args: + x: A real number. + + Returns: + An approximation of erf(x) in (-1, 1) as a float. + """ # http://www.theorie.physik.uni-muenchen.de/~serge/erf-approx.pdf a = 8/(3*pi) * (pi - 3)/(4 - pi) @@ -878,6 +1745,33 @@ def erf(x): def chiSquare(rows, expected=None, nparams=0): + """Compute the chi-square statistic and approximate p-value for a contingency table. + + Given a 2-D table of observed counts ``rows``, computes expected + counts under independence (or uses the provided ``expected`` table), + then calculates:: + + chi^2 = sum((obs - exp)^2 / exp) + + The degrees of freedom are + ``(nrows - 1) * (ncols - 1) - nparams``, clamped to at least 1. + The p-value is looked up in a hardcoded table via + :func:`chi_square_lookup`. + + Args: + rows: A list of lists of observed counts. All rows must have + the same length. + expected: A list of lists of expected counts with the same shape + as ``rows``. If ``None`` (default), expected counts are + computed from marginal totals via :func:`make_expected`. + nparams: The number of estimated parameters to subtract from + the degrees of freedom. Defaults to 0. + + Returns: + A 2-tuple ``(chisq, p)`` where ``chisq`` is the chi-square + statistic (float) and ``p`` is the approximate p-value (float). + Returns ``(0, 1.0)`` if any row or column marginal sum is zero. 
+ """ # ex: rows = [[1,2,3],[1,4,5]] assert(len(set(map(len, rows))) <= 1) @@ -901,6 +1795,20 @@ def chiSquare(rows, expected=None, nparams=0): def make_expected(rows): + """Compute expected counts for a contingency table under independence. + + For each cell ``(i, j)``, the expected count is:: + + expected[i][j] = row_total[i] * col_total[j] / grand_total + + Args: + rows: A list of lists of observed counts. All rows must have + the same length. + + Returns: + A list of lists of expected counts with the same shape as + ``rows``. + """ rowtotals = map(sum, rows) coltotals = map(sum, zip(* rows)) grandtotal = float(sum(rowtotals)) @@ -916,6 +1824,34 @@ def make_expected(rows): def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5): + """Test a fitted distribution against binned data using a chi-square goodness-of-fit test. + + Converts normalised bin heights ``ybins`` to raw counts, computes + expected counts from ``func`` integrated over each bin, discards + bins with fewer than ``minsamples`` expected observations, and then + calls :func:`chiSquare`. + + Args: + xbins: A list of ``n+1`` bin-edge x-values (the left edges of + the first ``n`` bins). + ybins: A list of ``n`` normalised bin heights (density values, + not raw counts). + func: A callable ``func(x)`` representing the fitted PDF; + evaluated at each bin edge to compute expected bin mass. + nsamples: The total number of data samples used to convert + normalised heights to counts. + nparams: The number of fitted parameters to subtract from the + chi-square degrees of freedom. + minsamples: Minimum expected count required for a bin to be + included. Defaults to 5. + + Returns: + A 3-tuple ``(result, counts, expected)`` where ``result`` is the + ``(chisq, p)`` pair from :func:`chiSquare`, ``counts`` is the + list of observed counts for included bins, and ``expected`` is + the list of expected counts for included bins. If no bins pass + the ``minsamples`` threshold, returns ``([0, 1], [], [])``. 
+ """ sizes = [xbins[i+1] - xbins[i] for i in range(len(xbins)-1)] sizes.append(sizes[-1]) @@ -973,7 +1909,23 @@ def chiSquareFit(xbins, ybins, func, nsamples, nparams, minsamples=5): def chi_square_lookup(value, df): - + """Look up an approximate p-value for a chi-square statistic from a hardcoded table. + + Compares ``value`` against the ``chi_square_table`` for the given + degrees of freedom ``df`` (capped at 30) and returns the largest + significance level whose critical value does not exceed ``value``. + + Args: + value: The observed chi-square statistic. + df: Degrees of freedom. Values above 30 are treated as 30; + values of 0 or less return 1.0. + + Returns: + An approximate p-value from the set + ``{0.20, 0.10, 0.05, 0.025, 0.01, 0.001}`` as a float. + Returns 1.0 if ``value`` is smaller than all critical values in + the table row. + """ ps = [0.20, 0.10, 0.05, 0.025, 0.01, 0.001] if df <= 0: @@ -991,6 +1943,22 @@ def chi_square_lookup(value, df): def ttest(lst1, lst2): + """Compute the Welch's t-statistic for two independent samples. + + Calculates the two-sample t-statistic using the Welch (unequal + variance) formula:: + + t = |mean(lst1) - mean(lst2)| / sqrt(var(lst1)/n1 + var(lst2)/n2) + + Note: + The function computes ``t`` and ``df`` but does not return + anything; the implementation body is incomplete and has no + ``return`` statement. + + Args: + lst1: The first sample as a list of numeric values. + lst2: The second sample as a list of numeric values. + """ sdevdist = sqrt(var(lst1)/len(lst1) + var(lst2)/len(lst2)) t = abs(mean(lst1) - mean(lst2)) / sdevdist df = len(lst2) + len(lst2) - 2 @@ -1049,8 +2017,29 @@ def ttest(lst1, lst2): def spearman(vec1, vec2): - """Spearman's rank test""" + """Compute a Spearman rank-order correlation-like statistic. 
+ + Computes a Z-score based on the sum of squared differences between + the original values (not their ranks, despite the name):: + + R = sum((vec1[i] - vec2[i])^2 for i in range(n)) + Z = (6*R - n*(n^2 - 1)) / (n*(n+1)*sqrt(n-1)) + Note: + Despite the name, this implementation does not actually rank the + values before computing differences; it uses the raw values. + This differs from the standard Spearman rank correlation formula. + + Args: + vec1: A list of numeric values. + vec2: A list of numeric values of the same length as ``vec1``. + + Returns: + A Z-score float derived from the sum of squared raw differences. + + Raises: + AssertionError: If ``vec1`` and ``vec2`` have different lengths. + """ assert len(vec1) == len(vec2), "vec1 and vec2 are not the same length" n = len(vec1) @@ -1065,11 +2054,26 @@ def spearman(vec1, vec2): -# input: -# xdata, ydata - data to fit -# func - a function of the form f(x, params) -# def fitCurve(xdata, ydata, func, paramsInit): + """Fit a parametric function to data using least-squares optimisation. + + Uses :func:`scipy.optimize.leastsq` to minimise the sum of squared + residuals between ``ydata`` and ``func(x, params)`` evaluated at + each ``x`` in ``xdata``. + + Args: + xdata: A list of x-values. + ydata: A list of observed y-values of the same length as + ``xdata``. + func: A callable ``func(x, params)`` that returns a scalar given + a single x-value and a parameter array. + paramsInit: Initial parameter guess as a list or array. + + Returns: + A 2-tuple ``(params, resid_sum)`` where ``params`` is a list of + fitted parameter values and ``resid_sum`` is the sum of squared + residuals at the solution. + """ import scipy.optimize y = np.array(ydata) @@ -1087,6 +2091,27 @@ def error(params): def fitDistrib(func, paramsInit, data, start, end, step, perc=1.0): + """Fit a parametric distribution to a data histogram. 
+ + Note: + This function is currently disabled because it depends on + ``rasmus.util.distrib`` and ``rasmus.util.histbins``, which are + not available. Calling it always raises ``NotImplementedError``. + + Args: + func: A callable ``func(x, params)`` representing the PDF to fit. + paramsInit: Initial parameter guess. + data: The raw data samples to bin. + start: The lower edge of the histogram range. + end: The upper edge of the histogram range. + step: The bin width. + perc: A normalisation factor applied to bin heights. + Defaults to 1.0. + + Raises: + NotImplementedError: Always, because the required dependency is + unavailable. + """ # NOTE: fitDistrib is disabled because it depends on rasmus util.distrib # and util.histbins which are not available. # xdata, ydata = util.distrib(data, low=start, width=step) @@ -1099,6 +2124,29 @@ def fitDistrib(func, paramsInit, data, start, end, step, perc=1.0): def plotfuncFit(func, paramsInit, xdata, ydata, start, end, step, plot=None, **options): + """Fit a parametric function to data and (formerly) plot the result. + + Calls :func:`fitCurve` to fit ``func`` to ``(xdata, ydata)`` and + returns the fitted parameters and residual sum. Plotting via gnuplot + has been removed; the ``plot`` argument and plotting-related + parameters are retained for API compatibility but have no effect. + + Args: + func: A callable ``func(x, params)`` representing the model. + paramsInit: Initial parameter guess. + xdata: A list of x-values. + ydata: A list of observed y-values. + start: Unused (formerly the start of the plot range). + end: Unused (formerly the end of the plot range). + step: Unused (formerly the plot step size). + plot: Unused. Defaults to ``None``. + **options: Unused keyword arguments retained for compatibility. + + Returns: + A 3-tuple ``(None, params, resid)`` where ``params`` is the list + of fitted parameters and ``resid`` is the sum of squared + residuals. 
+    """
     # NOTE: plotting via gnuplot removed; returns params and resid only
     params, resid = fitCurve(xdata, ydata, func, paramsInit)
     # plot.plot(util.histbins(xdata), ydata, **options)
@@ -1108,13 +2156,61 @@
 
 def plotdistribFit(func, paramsInit, data, start, end, step, plot=None,
                    **options):
+    """Fit a distribution to data and (formerly) plot the result.
+
+    Note:
+        This function is currently disabled because it depends on
+        ``rasmus.util.distrib``, which is not available. Calling it
+        always raises ``NotImplementedError``.
+
+    Args:
+        func: A callable ``func(x, params)`` representing the PDF.
+        paramsInit: Initial parameter guess.
+        data: The raw data samples.
+        start: The lower edge of the histogram range.
+        end: The upper edge of the histogram range.
+        step: The bin width.
+        plot: Unused plot object. Defaults to ``None``.
+        **options: Unused keyword arguments.
+
+    Raises:
+        NotImplementedError: Always, because the required dependency is
+            unavailable.
+    """
     # NOTE: disabled because it requires rasmus util.distrib
     raise NotImplementedError("plotdistribFit requires rasmus util.distrib which is not available")
 
 
 def solveCubic(a, b, c, real=True):
-    """solves x^3 + ax^2 + bx + c = 0 for x"""
+    """Solve the general monic cubic equation x^3 + ax^2 + bx + c = 0.
+
+    Applies the Cardano / Vieta substitution to reduce to a depressed
+    cubic and then computes all three cube roots using complex arithmetic.
+    Returns only real roots by default.
+
+    Algorithm:
+        1. Substitute ``x = t - a/3`` to eliminate the quadratic term,
+           yielding ``t^3 + pt + q = 0``.
+        2. Compute the square root of the discriminant
+           ``sqrt(q^2/4 + p^3/27)`` in complex arithmetic.
+        3. Find the three cube roots of ``q/2 + sqrt(...)`` using the
+           primitive cube root of unity.
+        4. Recover the three roots ``x_k = p/(3*u_k) - u_k - a/3``.
+
+    Args:
+        a: Coefficient of the x^2 term.
+        b: Coefficient of the x term.
+ c: The constant term. + real: If ``True`` (default), return only roots whose imaginary + part is smaller than 1e-10 in absolute value. If ``False``, + return all three complex roots. + + Returns: + A list of roots. With ``real=True`` the list contains 1 or 3 + real floats. With ``real=False`` the list always contains 3 + complex numbers. + """ p = b - a*a / 3.0 q = c + (2*a*a*a - 9*a*b) / 27.0 @@ -1155,7 +2251,18 @@ def solveCubic(a, b, c, real=True): def _solveCubic_test(n=100): + """Run a self-test of :func:`solveCubic` on random and fixed inputs. + + Generates ``n`` random cubics (plus three fixed edge cases) and + verifies that each root ``x`` satisfies ``|x^3 + a*x^2 + b*x + c| < 1e-4``. + Args: + n: Number of random test cubics to generate. Defaults to 100. + + Raises: + AssertionError: If any computed root does not satisfy the + polynomial equation within tolerance. + """ def test(a, b, c): xs = solveCubic(a, b, c) diff --git a/src/seqlib/util.py b/src/seqlib/util.py index 0d01e84..9d9d3db 100644 --- a/src/seqlib/util.py +++ b/src/seqlib/util.py @@ -33,52 +33,83 @@ # Python 3 compatibility: cmp() was removed def cmp(a, b): + """Three-way comparison function for Python 3 compatibility. + + Args: + a: First value. + b: Second value. + + Returns: + 1 if a > b, -1 if a < b, 0 if equal. + """ return (a > b) - (a < b) class Bundle (dict): - """ - A small class for creating a closure of variables - handy for nested functions that need to assign to variables in an - outer scope + """A small class for creating a closure of variables. - Example: + Handy for nested functions that need to assign to variables in an outer + scope. Attributes and dictionary keys are kept in sync. 
- def func1(): - this = Bundle(var1 = 0, var2 = "hello") - def func2(): - this.var1 += 1 - func2() - print(this.var1) - func1() - - will produce: - 1 - + Example:: + + def func1(): + this = Bundle(var1=0, var2="hello") + def func2(): + this.var1 += 1 + func2() + print(this.var1) + func1() + # prints: 1 """ def __init__(self, **variables): + """Initialize a Bundle with keyword arguments as attributes. + + Args: + **variables: Arbitrary keyword arguments that become both + attributes (self.key) and dictionary entries. + """ for key, val in variables.items(): setattr(self, key, val) dict.__setitem__(self, key, val) def __setitem__(self, key, val): + """Set a key both as an attribute and as a dict entry. + + Args: + key: Attribute/key name. + val: Value to assign. + """ setattr(self, key, val) dict.__setitem__(self, key, val) class Dict (dict): - """My personal nested Dictionary (with default values)""" + """A nested dictionary with configurable dimensionality and default values. + + Accessing a missing key returns (and optionally inserts) a default value + or a nested Dict of one lower dimension, enabling multi-dimensional sparse + containers without explicit initialisation. + """ def __init__(self, items=None, dim=1, default=None, insert=True): - """ - items -- items to initialize Dict (can be dict, list, iter) - dim -- number of dimensions of the dictionary - default -- default value of a dictionary item + """Initialize a Dict. + + Args: + items: Initial items to populate the dict (dict, list of pairs, + or other iterable). If an int is passed, it is treated as + the old-style positional dim argument for backwards + compatibility. + dim: Number of nesting dimensions (default 1). + default: Default value returned for missing leaf-level keys + (default None). + insert: If True, accessing a missing key inserts the default + value automatically (default True). 
""" if isinstance(items, int): @@ -97,6 +128,15 @@ def __init__(self, items=None, dim=1, default=None, insert=True): def __getitem__(self, i): + """Return the value for key i, inserting a default if missing. + + Args: + i: The key to look up. + + Returns: + The stored value, or a default Dict/copy of null if the key was + absent. + """ if i not in self: if self._dim > 1: ret = Dict(self._dim - 1, self._null) @@ -109,6 +149,14 @@ def __getitem__(self, i): def has_keys(self, *keys): + """Check whether a sequence of nested keys all exist. + + Args: + *keys: Keys to check at successive nesting levels. + + Returns: + True if all keys are present at the corresponding nesting levels. + """ if len(keys) == 0: return True elif len(keys) == 1: @@ -118,6 +166,11 @@ def has_keys(self, *keys): self[keys[0]].has_keys(*keys[1:]) def write(self, out = sys.stdout): + """Write a human-readable representation of the dict to a stream. + + Args: + out: Output stream to write to (default sys.stdout). + """ def walk(node, path): if node.dim == 1: for i in node: @@ -137,39 +190,77 @@ def walk(node, path): class Percent (float): + """A float subclass that formats itself as a percentage string. + + Attributes: + digits: Number of decimal places used when formatting (default 1). + """ digits = 1 def __str__(self): + """Return the value formatted as a percentage with self.digits decimals. + + Returns: + String such as "42.0" representing 42.0% (i.e. float value 0.42). + """ return (("%%.%df" % self.digits) % (float(self) * 100)) def __repr__(self): + """Return the same string as __str__.""" return str(self) class PushIter (object): - """Wrap an iterator in another iterator that allows one to push new - items onto the front of the iteration stream""" + """An iterator wrapper that allows pushing items back to the front of the stream. + + Wraps any iterable and provides a push() method to prepend items. + """ def __init__(self, it): + """Initialize a PushIter from any iterable. 
+
+        Args:
+            it: Any iterable to wrap.
+        """
         self._it = iter(it)
         self._queue = []
 
     def __iter__(self):
+        """Return self as the iterator."""
         return self
 
     def __next__(self):
+        """Return the next item, preferring items from the push queue.
+
+        Returns:
+            The next queued item if any; else delegates to ``self.next(_it)``
+            — buggy; should be ``next(self._it)`` (raises NameError as-is).
+        """
         if len(self._queue) > 0:
             return self._queue.pop()
         else:
             return self.next(_it)
 
     def push(self, item):
-        """Push a new item onto the front of the iteration stream"""
+        """Push a new item onto the front of the iteration stream.
+
+        Args:
+            item: Item to prepend to the iteration.
+        """
         self._queue.append(item)
 
 
 def exceptDefault(func, val, exc=Exception):
-    """Specify a default value for when an exception occurs"""
+    """Call func() and return val if the specified exception is raised.
+
+    Args:
+        func: A zero-argument callable to invoke.
+        val: Default value to return on exception.
+        exc: Exception type (or tuple of types) to catch (default Exception).
+
+    Returns:
+        The return value of func(), or val if exc was raised.
+    """
     try:
         return func()
     except exc:
@@ -463,6 +554,16 @@
 
 # simple matrix functions
 def make_matrix(nrows, ncols, val = 0):
+    """Create a 2D list (matrix) with given dimensions and a fill value.
+
+    Args:
+        nrows: Number of rows.
+        ncols: Number of columns.
+        val: Fill value for each cell (default 0); each cell gets a copy.
+
+    Returns:
+        A list of lists of shape (nrows, ncols) filled with copies of val.
+ """ mat = [] for i in range(nrows): row = [] @@ -585,12 +686,29 @@ def count(func, lst): n += 1 return n -def counteq(a, lst): return count(eqfunc(a), lst) -def countneq(a, lst): return count(neqfunc(a), lst) -def countle(a, lst): return count(lefunc(a), lst) -def countlt(a, lst): return count(ltfunc(a), lst) -def countge(a, lst): return count(gefunc(a), lst) -def countgt(a, lst): return count(gtfunc(a), lst) +def counteq(a, lst): + """Count items in lst equal to a.""" + return count(eqfunc(a), lst) + +def countneq(a, lst): + """Count items in lst not equal to a.""" + return count(neqfunc(a), lst) + +def countle(a, lst): + """Count items in lst less than or equal to a.""" + return count(lefunc(a), lst) + +def countlt(a, lst): + """Count items in lst strictly less than a.""" + return count(ltfunc(a), lst) + +def countge(a, lst): + """Count items in lst greater than or equal to a.""" + return count(gefunc(a), lst) + +def countgt(a, lst): + """Count items in lst strictly greater than a.""" + return count(gtfunc(a), lst) def find(func, *lsts): @@ -629,12 +747,29 @@ def find(func, *lsts): return pos -def findeq(a, lst): return find(eqfunc(a), lst) -def findneq(a, lst): return find(neqfunc(a), lst) -def findle(a, lst): return find(lefunc(a), lst) -def findlt(a, lst): return find(ltfunc(a), lst) -def findge(a, lst): return find(gefunc(a), lst) -def findgt(a, lst): return find(gtfunc(a), lst) +def findeq(a, lst): + """Return indices of items in lst equal to a.""" + return find(eqfunc(a), lst) + +def findneq(a, lst): + """Return indices of items in lst not equal to a.""" + return find(neqfunc(a), lst) + +def findle(a, lst): + """Return indices of items in lst less than or equal to a.""" + return find(lefunc(a), lst) + +def findlt(a, lst): + """Return indices of items in lst strictly less than a.""" + return find(ltfunc(a), lst) + +def findge(a, lst): + """Return indices of items in lst greater than or equal to a.""" + return find(gefunc(a), lst) + +def findgt(a, lst): + 
"""Return indices of items in lst strictly greater than a.""" + return find(gtfunc(a), lst) def islands(lst): @@ -748,13 +883,42 @@ def minfunc(func, lst): # count(ltfunc(4), lst) ==> returns the number of values in lst < 4 # -def eqfunc(a): return lambda x: x == a -def neqfunc(a): return lambda x: x != a -def ltfunc(a): return lambda x: x < a -def gtfunc(a): return lambda x: x > a -def lefunc(a): return lambda x: x <= a -def gefunc(a): return lambda x: x >= a +def eqfunc(a): + """Return a function that tests equality with a.""" + return lambda x: x == a + +def neqfunc(a): + """Return a function that tests inequality with a.""" + return lambda x: x != a + +def ltfunc(a): + """Return a function that tests x < a.""" + return lambda x: x < a + +def gtfunc(a): + """Return a function that tests x > a.""" + return lambda x: x > a + +def lefunc(a): + """Return a function that tests x <= a.""" + return lambda x: x <= a + +def gefunc(a): + """Return a function that tests x >= a.""" + return lambda x: x >= a + def withinfunc(a, b, ainc=True, binc=True): + """Return a function that tests whether x is within the range [a, b]. + + Args: + a: Lower bound. + b: Upper bound. + ainc: If True, the lower bound is inclusive (default True). + binc: If True, the upper bound is inclusive (default True). + + Returns: + A one-argument function returning True if x is in the specified range. 
+ """ if ainc: if binc: return lambda x: a <= x <= b @@ -775,25 +939,69 @@ def lg(num): """Retruns the log_2 of a number""" return math.log(num, 2) -def add(a, b): return a + b -def sub(a, b): return a - b -def mul(a, b): return a * b -def idiv(a, b): return a / b -def div(a, b): return a / float(b) +def add(a, b): + """Return a + b.""" + return a + b + +def sub(a, b): + """Return a - b.""" + return a - b + +def mul(a, b): + """Return a * b.""" + return a * b + +def idiv(a, b): + """Return a / b (true division).""" + return a / b + +def div(a, b): + """Return a / float(b).""" + return a / float(b) def safediv(a, b, default=INF): + """Divide a by b, returning default on ZeroDivisionError. + + Args: + a: Numerator. + b: Denominator. + default: Value to return when b is zero (default INF). + + Returns: + a / float(b), or default if b is zero. + """ try: return a / float(b) except ZeroDivisionError: return default def safelog(x, base=math.e, default=-INF): + """Compute log(x) in the given base, returning default on error. + + Args: + x: Value to take the logarithm of. + base: Logarithm base (default math.e for natural log). + default: Value to return when x <= 0 or overflow occurs (default -INF). + + Returns: + math.log(x, base), or default on OverflowError or ValueError. + """ try: return math.log(x, base) except (OverflowError, ValueError): return default -def invcmp(a, b): return cmp(b, a) # cmp is defined locally above +def invcmp(a, b): + """Return the reversed comparison of a and b (i.e. cmp(b, a)). + + Args: + a: First value. + b: Second value. + + Returns: + 1 if b > a, -1 if b < a, 0 if equal. + """ + return cmp(b, a) # cmp is defined locally above def clamp(x, low, high): """Clamps a value 'x' between the values 'low' and 'high' @@ -809,6 +1017,15 @@ def clamp(x, low, high): return x def clampfunc(low, high): + """Return a function that clamps its argument between low and high. + + Args: + low: Lower bound (or None for no lower bound). 
+ high: Upper bound (or None for no upper bound). + + Returns: + A one-argument function equivalent to clamp(x, low, high). + """ return lambda x: clamp(x, low, high) @@ -1106,6 +1323,16 @@ def write_delim(filename, data, delim="\t"): # def default_justify(val): + """Return the default column justification for a value. + + Numeric types (int, float) are right-justified; everything else is left. + + Args: + val: The value whose justification is needed. + + Returns: + "right" for int/float values, "left" otherwise. + """ if isinstance(val, int) or \ isinstance(val, float): return "right" @@ -1114,6 +1341,18 @@ def default_justify(val): defaultJustify = default_justify def default_format(val): + """Format a value for tabular display. + + Integers are formatted with comma separators via int2pretty. Percent + values use their own __str__. Small floats use scientific notation; + others use 4 decimal places. Everything else uses str(). + + Args: + val: The value to format. + + Returns: + A human-readable string representation of val. + """ if isinstance(val, int) and \ not isinstance(val, bool): return int2pretty(val) @@ -1196,7 +1435,18 @@ def printcols(data, width=None, spacing=1, format=defaultFormat, def list2matrix(lst, nrows=None, ncols=None, bycols=True): - """Turn a list into a matrix by wrapping its entries""" + """Reshape a flat list into a 2D matrix. + + Args: + lst: The list to reshape. + nrows: Number of rows. Inferred from ncols if not given. + ncols: Number of columns. Inferred from nrows if not given. + If neither is given, a roughly square shape is used. + bycols: If True, fill the matrix column-by-column (default True). + + Returns: + A list of lists representing the reshaped matrix. + """ mat = [] @@ -1222,7 +1472,15 @@ def list2matrix(lst, nrows=None, ncols=None, bycols=True): def printwrap(text, width=80, prefix="", out=sys.stdout): - """Prints text with wrapping""" + """Print text with line wrapping at a fixed column width. 
+ + Args: + text: The string to print. + width: Maximum number of characters per line (default 80). + If None, print the text as a single line with no wrapping. + prefix: String prepended to each wrapped line (default ""). + out: Output stream (default sys.stdout). + """ if width == None: out.write(text) out.write("\n") @@ -1276,7 +1534,21 @@ def print_dict(dic, key=lambda x: x, val=lambda x: x, spacing=4, out=sys.stdout, format=defaultFormat, justify=defaultJustify): - """Print s a dictionary in two columns""" + """Print a dictionary as an aligned two-column table. + + Args: + dic: Dictionary to print. + key: Function applied to keys before printing (default identity). + val: Function applied to values before printing (default identity). + num: Maximum number of entries to print. Defaults to all. + cmp: Comparison function (unused in Python 3; kept for compatibility). + order: Key function for sorting items. If None, default sort is used. + reverse: If True, sort in descending order (default False). + spacing: Number of spaces between columns (default 4). + out: Output stream (default sys.stdout). + format: Formatting function for cell values (default default_format). + justify: Justification function for cell values (default default_justify). + """ if num == None: num = len(dic) @@ -1299,13 +1571,32 @@ def print_dict(dic, key=lambda x: x, val=lambda x: x, # class SafeReadIter: + """An iterator over a file handle that stops at EOF without raising an error. + + Unlike a bare for-loop over a file, this class uses readline() and raises + StopIteration when an empty string (EOF) is encountered. + """ def __init__(self, infile): + """Initialize from an open file handle. + + Args: + infile: An open file handle to iterate over. + """ self.infile = infile def __iter__(self): + """Return self as the iterator.""" return self def __next__(self): + """Return the next line or raise StopIteration at EOF. + + Returns: + Next line string from the file. 
+ + Raises: + StopIteration: When end of file is reached. + """ line = self.infile.readline() if line == "": raise StopIteration @@ -1313,6 +1604,15 @@ def __next__(self): return line def readWord(infile, delims = [" ", "\t", "\n"]): + """Read the next whitespace-delimited word from a file stream. + + Args: + infile: An open file handle to read from. + delims: List of delimiter characters (default space, tab, newline). + + Returns: + The next word as a string, or an empty string at EOF. + """ word = "" while True: @@ -1331,6 +1631,16 @@ def readWord(infile, delims = [" ", "\t", "\n"]): def readUntil(stream, chars): + """Read from stream until one of the given characters (or EOF) is seen. + + Args: + stream: An open file handle. + chars: String or iterable of stop characters. + + Returns: + A tuple (token, char) where token is the accumulated string before + the stop character, and char is the stop character (or "" at EOF). + """ token = "" while True: char = stream.read(1) @@ -1340,6 +1650,17 @@ def readUntil(stream, chars): def readWhile(stream, chars): + """Read from stream while characters are in the given set. + + Args: + stream: An open file handle. + chars: String or iterable of accepted characters. + + Returns: + A tuple (token, char) where token is the accumulated string of + matching characters, and char is the first non-matching character + (or "" at EOF). + """ token = "" while True: char = stream.read(1) @@ -1349,6 +1670,14 @@ def readWhile(stream, chars): def skipComments(infile): + """Yield non-comment, non-blank lines from a file. + + Args: + infile: An iterable of lines (e.g. an open file handle). + + Yields: + Lines that do not start with "#" and are not blank. + """ for line in infile: if line.startswith("#") or line.startswith("\n"): continue @@ -1356,26 +1685,51 @@ def skipComments(infile): class IndentStream: - """ - Makes any stream into an indent stream. 
- - Indent stream auto indents every line written to it + """A write-only stream wrapper that automatically indents every line. + + Tracks a current indentation depth and prepends that many spaces to the + start of each new line. Use indent() and dedent() to change the depth. + + Attributes: + stream: The underlying writable stream. + linestart: True when the next character written begins a new line. + depth: Current indentation level in spaces. """ def __init__(self, stream): + """Initialize an IndentStream wrapping the given stream. + + Args: + stream: A filename string or writable file object to wrap. + """ self.stream = open_stream(stream, "w") self.linestart = True self.depth = 0 def indent(self, num=2): + """Increase the indentation depth. + + Args: + num: Number of spaces to add (default 2). + """ self.depth += num def dedent(self, num=2): + """Decrease the indentation depth, clamped to zero. + + Args: + num: Number of spaces to remove (default 2). + """ self.depth -= num if self.depth < 0: self.depth = 0 def write(self, text): + """Write text to the underlying stream, prepending indentation as needed. + + Args: + text: The string to write. + """ lines = text.split("\n") for line in lines[:-1]: @@ -1473,7 +1827,18 @@ def replace_ext(filename, oldext, newext): def sortrank(lst, cmp=None, key=None, reverse=False): - """Returns the ranks of items in lst""" + """Return the indices that would sort lst. + + Args: + lst: The list to rank. + cmp: Comparison function (deprecated; ignored if key is provided). + key: A one-argument function to extract a comparison key from + each list element (default identity). + reverse: If True, sort in descending order (default False). + + Returns: + A list of integer indices such that [lst[i] for i in result] is sorted. 
+ """ ind = list(range(len(lst))) if key is None: @@ -1497,7 +1862,14 @@ def sort_together(compare, lst, *others): sortTogether = sort_together def invperm(perm): - """Returns the inverse of a permutation 'perm'""" + """Return the inverse of a permutation. + + Args: + perm: A list of unique integers 0..n-1 representing a permutation. + + Returns: + A list inv such that inv[perm[i]] == i for all i. + """ inv = [0] * len(perm) for i in range(len(perm)): inv[perm[i]] = i @@ -1511,14 +1883,33 @@ def invperm(perm): # def oneNorm(vals): - """Normalize values so that they sum to 1""" + """Normalize a list of values so that they sum to 1. + + Args: + vals: A list or iterable of numeric values. + + Returns: + A list of values each divided by the total sum. + """ s = float(sum(vals)) return [x/s for x in vals] def bucketSize(array, ndivs=None, low=None, width=None): - """Determine the bucket size needed to divide the values in array into - 'ndivs' evenly sized buckets""" + """Determine bucket parameters for dividing array values into bins. + + Exactly one of ndivs or width should be supplied (or neither, which + defaults to ndivs=20). The other value is derived from the data. + + Args: + array: A sequence of numeric values. + ndivs: Desired number of bins. Derived from width if not given. + low: Lower bound for binning. Defaults to min(array). + width: Desired bin width. Derived from ndivs if not given. + + Returns: + A tuple (ndivs, low, width) with all three values resolved. + """ if low is None: low = min(array) @@ -1545,7 +1936,20 @@ def bucketBin(item, ndivs, low, width): def bucket(array, ndivs=None, low=None, width=None, key=lambda x: x): - """Group elements of 'array' into 'ndivs' lists""" + """Group elements of array into ndivs buckets. + + Args: + array: A sequence of items to bucket. + ndivs: Number of buckets (inferred if not given). + low: Lower bound for the first bucket (default min of key values). + width: Bucket width (inferred if not given). 
+ key: Function to extract a numeric comparison key from each item + (default identity). + + Returns: + A tuple (x, h) where x is a list of bucket lower-bound values and + h is a list of lists containing the array items in each bucket. + """ keys = map(key, array) @@ -1566,7 +1970,18 @@ def bucket(array, ndivs=None, low=None, width=None, key=lambda x: x): def hist(array, ndivs=None, low=None, width=None): - """Create a histogram of 'array' with 'ndivs' buckets""" + """Create a histogram of array values. + + Args: + array: A sequence of numeric values. + ndivs: Number of histogram bins (default 20 if width is also None). + low: Lower bound of the first bin. Defaults to min(array). + width: Bin width (inferred from ndivs if not given). + + Returns: + A tuple (x, h) where x is a list of bin lower-bound values and + h is a list of integer counts for each bin. + """ # set bucket sizes ndivs, low, width = bucketSize(array, ndivs, low, width) @@ -1590,7 +2005,22 @@ def hist2(array1, array2, ndivs1=None, ndivs2=None, low1=None, low2=None, width1=None, width2=None): - """Perform a 2D histogram""" + """Perform a 2D histogram over two arrays. + + Args: + array1: First sequence of numeric values (mapped to columns). + array2: Second sequence of numeric values (mapped to rows). + ndivs1: Number of bins for array1 (default derived from data). + ndivs2: Number of bins for array2 (default derived from data). + low1: Lower bound for array1 bins. Defaults to min(array1). + low2: Lower bound for array2 bins. Defaults to min(array2). + width1: Bin width for array1 (inferred if not given). + width2: Bin width for array2 (inferred if not given). + + Returns: + A tuple (labels, h) where labels is a 2D list of [x, y] bin + coordinates and h is a 2D list of integer counts. 
+ """ # set bucket sizes @@ -1615,8 +2045,14 @@ def hist2(array1, array2, def histbins(bins): - """Adjust the bins from starts to centers, this will allow GNUPLOT to plot - histograms correctly""" + """Convert bin start positions to bin center positions for GNUPLOT plotting. + + Args: + bins: A list of bin start positions. + + Returns: + A list of bin center positions the same length as bins. + """ bins2 = [] @@ -1631,7 +2067,21 @@ def histbins(bins): def distrib(array, ndivs=None, low=None, width=None): - """Find the distribution of 'array' using 'ndivs' buckets""" + """Compute the probability density distribution of array. + + Normalises histogram counts by the total number of items and bin width, + giving an approximate PDF. + + Args: + array: A sequence of numeric values. + ndivs: Number of bins (default derived from data). + low: Lower bound of the first bin. Defaults to min(array). + width: Bin width (inferred if not given). + + Returns: + A tuple (x, h) where x is bin lower-bound values and h is a list + of density values (count / total / width). + """ # set bucket sizes ndivs, low, width = bucketSize(array, ndivs, low, width) @@ -1674,6 +2124,17 @@ def hist_dict(array): def print_hist(array, ndivs=20, low=None, width=None, cols=75, spacing=2, out=sys.stdout): + """Print a text-based histogram with ASCII bar chart. + + Args: + array: A sequence of numeric values to histogram. + ndivs: Number of bins (default 20). + low: Lower bound for the first bin. Defaults to min(array). + width: Bin width (inferred if not given). + cols: Total character width of the output including bars (default 75). + spacing: Number of spaces between columns (default 2). + out: Output stream (default sys.stdout). + """ data = list(hist(array, ndivs, low=low, width=width)) # find max bar