Changes from all commits — 51 commits
67aa030
Fixed bug causing type mismatch when using --csq on a VDS with existi…
lfrancioli Nov 17, 2016
550c415
Added gtj, gtk, gtIndex to expr. (#1102)
cseed Nov 17, 2016
01803da
Added keep=True to filter methods. (#1104)
cseed Nov 17, 2016
3b45e75
Updated python export_plink. (#1109)
cseed Nov 18, 2016
d623bb3
Added statistical functions useful for meta-analysis, genomic control…
jbloom22 Nov 21, 2016
60a197e
changed nSmaller to nLess for histograms (#1112)
jbloom22 Nov 21, 2016
969032a
Fix python methods, expose samples_to_pandas, integrate logging (#1085)
tpoterba Nov 21, 2016
36de318
experiment with parser
jigold Oct 19, 2016
09ae742
try to refactor ExportAggregate
jigold Oct 19, 2016
99d2987
test
jigold Oct 20, 2016
e567c49
trying out things
jigold Oct 21, 2016
78b67cb
working wout tests; needs improvements
jigold Oct 21, 2016
02d2d4b
working tests
jigold Oct 25, 2016
bf37d53
Added hidden command for creating a key-table
jigold Oct 25, 2016
1d792dd
removed exportaggregator
jigold Oct 25, 2016
8fe6718
removed explicit casting to IndexedSeq
jigold Oct 25, 2016
34443f3
export and kt in state
jigold Oct 25, 2016
3ed00b3
made command for clear kt hidden
jigold Oct 25, 2016
60378da
started filterExpr
jigold Nov 1, 2016
488a2a6
filter works with key table being a pair rdd
jigold Nov 1, 2016
1fb7408
added import, export, filter
jigold Nov 2, 2016
3dc5ffb
started writing join
jigold Nov 2, 2016
2e256c8
implemented interface
jigold Nov 2, 2016
ee4f2db
got aggregateByKey working
jigold Nov 3, 2016
40403d0
added to funct registry and annotate insert values working
jigold Nov 3, 2016
219f36d
annotate working with only adding values
jigold Nov 3, 2016
85b75f6
import, export, aggByKey, filter, annotate done
jigold Nov 3, 2016
5e8cb00
started join
jigold Nov 4, 2016
1e8fbd3
50% done with join
jigold Nov 4, 2016
ece4fd0
join almost done
jigold Nov 4, 2016
3ae0e33
started working on aggregate
jigold Nov 4, 2016
79b1e02
Added most key table operations to pyhail
jigold Nov 8, 2016
9239956
removed commands from tests
jigold Nov 8, 2016
4832394
removed key table commands from scala
jigold Nov 8, 2016
3bcf900
lots of debug statements for aggregate; finished join/forall/exists t…
jigold Nov 10, 2016
fb9cadf
started getting docs to work
jigold Nov 13, 2016
2980da3
done with python bindings
jigold Nov 14, 2016
8bdcb24
tried aggregator with rows
jigold Nov 14, 2016
d0b45bc
fixed formatting of arguments
jigold Nov 14, 2016
2927c5a
reformatted code
jigold Nov 14, 2016
026631a
Fixed aggregation serialization sharing bug.
cseed Nov 16, 2016
2b3b6d2
removed debugging statements
jigold Nov 17, 2016
6acfd20
rebased with master
jigold Nov 17, 2016
8b2f0b8
added python tests
jigold Nov 17, 2016
3cfb11b
Fixed additional failures.
cseed Nov 20, 2016
75594fa
Minor edits.
cseed Nov 20, 2016
3211b7a
Fixed python imports.
cseed Nov 21, 2016
260a856
Finalize rebase.
cseed Nov 21, 2016
9417222
Addressed comments.
cseed Nov 21, 2016
f520b46
Addressed comments.
cseed Nov 22, 2016
4007b89
Fixed test failures.
cseed Nov 22, 2016
3 changes: 3 additions & 0 deletions docs/faq/ExpressionLanguage.md
@@ -1 +1,4 @@
## <a class="jumptarget" name="exprlang"></a> Expression Language



18 changes: 14 additions & 4 deletions docs/reference/HailExpressionLanguage.md
@@ -58,6 +58,12 @@ Several Hail commands provide the ability to perform a broad array of computatio
- pcoin(p) -- returns `true` with probability `p`. `p` should be between 0.0 and 1.0
- runif(min, max) -- returns a random draw from a uniform distribution on \[`min`, `max`). `min` should be less than or equal to `max`
- rnorm(mean, sd) -- returns a random draw from a normal distribution with mean `mean` and standard deviation `sd`. `sd` should be non-negative

- Statistics
- pnorm(x) -- Returns left-tail probability p for which p = Prob($Z$ < x) with $Z$ a standard normal random variable
- qnorm(p) -- Returns left-quantile x for which p = Prob($Z$ < x) with $Z$ a standard normal random variable. `p` must satisfy `0 < p < 1`. Inverse of `pnorm`
- pchisq1tail(x) -- Returns right-tail probability p for which p = Prob($Z^2$ > x) with $Z^2$ a chi-squared random variable with one degree of freedom. `x` must be positive
- qchisq1tail(p) -- Returns right-quantile x for which p = Prob($Z^2$ > x) with $Z^2$ a chi-squared RV with one degree of freedom. `p` must satisfy `0 < p <= 1`. Inverse of `pchisq1tail`
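As a quick sanity check on the identities these four functions document (a pure-Python sketch using the standard library, not Hail's implementation): for one degree of freedom, `Prob(Z^2 > x)` equals `Prob(|Z| > sqrt(x))`, so `pchisq1tail` and `qchisq1tail` reduce to `pnorm`/`qnorm`.

```python
# Illustration of the statistical identities behind pnorm, qnorm,
# pchisq1tail, and qchisq1tail (stdlib sketch, not Hail's implementation).
from statistics import NormalDist

_std = NormalDist()  # standard normal random variable Z

def pnorm(x):
    """Left-tail probability p = Prob(Z < x)."""
    return _std.cdf(x)

def qnorm(p):
    """Left quantile x with p = Prob(Z < x); inverse of pnorm. Needs 0 < p < 1."""
    return _std.inv_cdf(p)

def pchisq1tail(x):
    """Right-tail probability Prob(Z^2 > x), chi-squared with 1 df.
    Z^2 > x iff |Z| > sqrt(x), so this is 2 * (1 - pnorm(sqrt(x)))."""
    return 2.0 * (1.0 - _std.cdf(x ** 0.5))

def qchisq1tail(p):
    """Right quantile; inverse of pchisq1tail. Needs 0 < p <= 1."""
    return qnorm(1.0 - p / 2.0) ** 2
```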

- Array Operations:
- constructor: `[element1, element2, ...]` -- Create a new array from elements of the same type.
@@ -139,6 +145,10 @@ Several Hail commands provide the ability to perform a broad array of computatio

- range: `range(end)` or `range(start, end)`. This function will produce an `Array[Int]`. `range(3)` produces `[0, 1, 2]`. `range(-2, 2)` produces `[-2, -1, 0, 1]`.

- `gtj(i)` and `gtk(i)`. Convert from genotype index (triangular numbers) to `j/k` pairs.

- `gtIndex(j, k)`. Convert from `j/k` pair to genotype index (triangular numbers).
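The triangular-number encoding these helpers refer to maps an allele pair `(j, k)` with `j <= k` to the index `k*(k+1)/2 + j`; a pure-Python sketch of both directions (illustration only, not Hail's implementation):

```python
# Triangular genotype-index encoding: (j, k) <-> index i.
from math import isqrt  # integer square root, Python 3.8+

def gt_index(j, k):
    """Genotype index for allele pair (j, k) with j <= k: k*(k+1)/2 + j."""
    assert 0 <= j <= k
    return k * (k + 1) // 2 + j

def gt_k(i):
    """Larger allele index k: the largest k with k*(k+1)/2 <= i."""
    return (isqrt(8 * i + 1) - 1) // 2

def gt_j(i):
    """Smaller allele index j: the remainder after removing k's triangle."""
    k = gt_k(i)
    return i - k * (k + 1) // 2
```

For biallelic sites this gives the familiar ordering `(0,0) -> 0`, `(0,1) -> 1`, `(1,1) -> 2`.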

**Note:**

- All variables and values are case sensitive
@@ -355,15 +365,15 @@ The resulting array is sorted by count in descending order (the most common elem
<numeric aggregable>.hist( start, end, bins )
```

This aggregator is used to compute density distributions of numeric parameters. The start, end, and bins params are no-scope parameters, which means that while computations like `100 / 4` are acceptable, variable references like `global.nBins` are not.
This aggregator is used to compute frequency distributions of numeric parameters. The start, end, and bins params are no-scope parameters, which means that while computations like `100 / 4` are acceptable, variable references like `global.nBins` are not.

The result of a `hist` invocation is a struct:

```
Struct {
binEdges: Array[Double],
binFrequencies: Array[Long],
nSmaller: Long,
nLess: Long,
nGreater: Long
}
```
@@ -374,7 +384,7 @@ Important properties:
- (bins + 1) breakpoints are generated from the range `(start to end by binsize)`
- `binEdges` stores an array of bin cutoffs. Each bin is left-inclusive, right-exclusive except the last bin, which includes the maximum value. This means that if there are N total bins, there will be N + 1 elements in binEdges. For the invocation `hist(0, 3, 3)`, `binEdges` would be `[0, 1, 2, 3]` where the bins are `[0, 1)`, `[1, 2)`, `[2, 3]`.
- `binFrequencies` stores the number of elements in the aggregable that fall in each bin. It contains one element for each bin.
- Elements greater than the max bin or smaller than the min bin will be tracked separately by `nSmaller` and `nGreater`
- Elements greater than the max bin or less than the min bin will be tracked separately by `nLess` and `nGreater`
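The binning rules above can be sketched in a few lines of pure Python (illustration of the documented semantics only, not Hail's implementation):

```python
# Minimal sketch of the hist aggregator's semantics.
def hist(values, start, end, bins):
    binsize = (end - start) / bins
    bin_edges = [start + i * binsize for i in range(bins + 1)]
    bin_frequencies = [0] * bins
    n_less = n_greater = 0
    for x in values:
        if x < start:
            n_less += 1                       # below the minimum bin
        elif x > end:
            n_greater += 1                    # above the maximum bin
        elif x == end:
            bin_frequencies[-1] += 1          # last bin is right-inclusive
        else:
            bin_frequencies[int((x - start) / binsize)] += 1
    return {'binEdges': bin_edges, 'binFrequencies': bin_frequencies,
            'nLess': n_less, 'nGreater': n_greater}
```

For `hist(0, 3, 3)` this yields the edges `[0, 1, 2, 3]` and bins `[0, 1)`, `[1, 2)`, `[2, 3]` described above.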

**Examples:**

@@ -388,7 +398,7 @@ Or, extend the above to compute a global gq histogram:

```
annotatevariants expr -c 'va.gqHist = gs.map(g => g.gq).hist(0, 100, 20)'
annotateglobal expr -c 'global.gqDensity = variants.map(v => va.gqHist.densities).sum()'
annotateglobal expr -c 'global.gqHist = variants.map(v => va.gqHist.binFrequencies).sum()'
```

### Collect
5 changes: 4 additions & 1 deletion python/pyhail/__init__.py
@@ -1,4 +1,7 @@
from pyhail.context import HailContext
from pyhail.dataset import VariantDataset
from pyhail.keytable import KeyTable
from pyhail.utils import TextTableConfig
from pyhail.type import Type

__all__ = ["HailContext", "VariantDataset"]
__all__ = ["HailContext", "VariantDataset", "KeyTable", "TextTableConfig", "Type"]
105 changes: 92 additions & 13 deletions python/pyhail/context.py
@@ -1,16 +1,47 @@
import pyspark

from pyhail.dataset import VariantDataset
from pyhail.java import jarray, scala_object
from pyhail.java import jarray, scala_object, scala_package_object
from pyhail.keytable import KeyTable
from pyhail.utils import TextTableConfig
from py4j.protocol import Py4JJavaError

class FatalError(Exception):
""":class:`.FatalError` is an error thrown by Hail method failures"""

def __init__(self, message, java_exception):
self.msg = message
self.java_exception = java_exception
super(FatalError)

def __str__(self):
return self.msg

class HailContext(object):
""":class:`.HailContext` is the main entrypoint for PyHail
functionality.

:param SparkContext sc: The pyspark context.

:param str log: Log file.

:param bool quiet: Don't write log file.

:param bool append: Append to existing log file.

:param long block_size: Minimum size of file splits in MB.

:param str parquet_compression: Parquet compression codec.

:param int branching_factor: Branching factor to use in tree aggregate.

:param str tmp_dir: Temporary directory for file merging.
"""

def __init__(self, sc):
def __init__(self, sc=None, log='hail.log', quiet=False, append=False,
block_size=1, parquet_compression='uncompressed',
branching_factor=50, tmp_dir='/tmp'):

self.sc = sc

self.gateway = sc._gateway
@@ -23,26 +54,37 @@ def __init__(self, sc):

self.sql_context = pyspark.sql.SQLContext(sc, self.jsql_context)

self.jsc.hadoopConfiguration().set(
'io.compression.codecs',
'org.apache.hadoop.io.compress.DefaultCodec,org.broadinstitute.hail.io.compress.BGzipCodec,org.apache.hadoop.io.compress.GzipCodec')
scala_package_object(self.jvm.org.broadinstitute.hail.driver).configure(
self.jsc,
log,
quiet,
append,
parquet_compression,
block_size,
branching_factor,
tmp_dir)

logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

def _jstate(self, jvds):
return self.jvm.org.broadinstitute.hail.driver.State(
self.jsc, self.jsql_context, jvds, scala_object(self.jvm.scala.collection.immutable, 'Map').empty())

def _raise_py4j_exception(self, e):
msg = scala_package_object(self.jvm.org.broadinstitute.hail.utils).getMinimalMessage(e.java_exception)
raise FatalError(msg, e.java_exception)

def run_command(self, vds, pargs):
jargs = jarray(self.gateway, self.jvm.java.lang.String, pargs)
t = self.jvm.org.broadinstitute.hail.driver.ToplevelCommands.lookup(jargs)
cmd = t._1()
cmd_args = t._2()
jstate = self._jstate(vds.jvds if vds != None else None)
result = cmd.run(jstate,
cmd_args)

try:
result = cmd.run(jstate, cmd_args)
except Py4JJavaError as e:
self._raise_py4j_exception(e)

return VariantDataset(self, result.vds())

def grep(self, regex, path, max_count=100):
@@ -74,7 +116,7 @@ def import_annotations_table(self, path, variant_expr, code=None, npartitions=No
# text table options
types=None, missing="NA", delimiter="\\t", comment=None,
header=True, impute=False):
"""Import variants and variant annotaitons from a delimited text file
"""Import variants and variant annotations from a delimited text file
(text table) as a sites-only VariantDataset.

:param path: The files to import.
@@ -235,7 +277,43 @@ def import_gen(self, path, tolerance=0.2, sample_file=None, npartitions=None, ch

return self.run_command(None, pargs)

def import_plink(self, bed, bim, fam, npartitions=None, delimiter='\\\\s+', missing="NA", quantpheno=False):
def import_keytable(self, path, key_names, npartitions=None, config=None):
"""Import delimited text file (text table) as KeyTable.

:param path: files to import.
:type path: str or list of str

:param key_names: The name(s) of fields to be considered keys
:type key_names: str or list of str

:param npartitions: Number of partitions.
:type npartitions: int or None

:param config: Configuration options for importing text files
:type config: :class:`.TextTableConfig` or None

:rtype: :class:`.KeyTable`
"""
path_args = []
if isinstance(path, str):
path_args.append(path)
else:
for p in path:
path_args.append(p)

if not isinstance(key_names, str):
key_names = ','.join(key_names)

if not npartitions:
npartitions = self.sc.defaultMinPartitions

if not config:
config = TextTableConfig()

return KeyTable(self, self.jvm.org.broadinstitute.hail.keytable.KeyTable.importTextTable(
self.jsc, jarray(self.gateway, self.jvm.java.lang.String, path_args), key_names, npartitions, config.to_java(self)))

def import_plink(self, bed, bim, fam, npartitions=None, delimiter='\\\\s+', missing='NA', quantpheno=False):
"""
Import PLINK binary file (.bed, .bim, .fam) as VariantDataset

@@ -427,7 +505,8 @@ def balding_nichols_model(self, populations, samples, variants, npartitions,
:rtype: :class:`.VariantDataset`
"""

pargs = ['baldingnichols', '-k', str(populations), '-n', str(samples), '-m', str(variants), '--npartitions', str(npartitions),
pargs = ['baldingnichols', '-k', str(populations), '-n', str(samples), '-m', str(variants), '--npartitions',
str(npartitions),
'--root', root]
if population_dist:
pargs.append('-d')